Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Stress userfaultfd syscall.
0004  *
0005  *  Copyright (C) 2015  Red Hat, Inc.
0006  *
0007  * This test allocates two virtual areas and bounces the physical
0008  * memory across the two virtual areas (from area_src to area_dst)
0009  * using userfaultfd.
0010  *
0011  * There are three threads running per CPU:
0012  *
0013  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
0014  *    page of the area_dst (while the physical page may still be in
0015  *    area_src), and increments a per-page counter in the same page,
0016  *    and checks its value against a verification region.
0017  *
0018  * 2) another per-CPU thread handles the userfaults generated by
0019  *    thread 1 above. userfaultfd blocking reads or poll() modes are
0020  *    exercised interleaved.
0021  *
0022  * 3) one last per-CPU thread transfers the memory in the background
0023  *    at maximum bandwidth (if not already transferred by thread
 *    2). Each cpu thread takes care of transferring a portion of the
0025  *    area.
0026  *
0027  * When all threads of type 3 completed the transfer, one bounce is
0028  * complete. area_src and area_dst are then swapped. All threads are
0029  * respawned and so the bounce is immediately restarted in the
0030  * opposite direction.
0031  *
0032  * per-CPU threads 1 by triggering userfaults inside
0033  * pthread_mutex_lock will also verify the atomicity of the memory
0034  * transfer (UFFDIO_COPY).
0035  */
0036 
0037 #define _GNU_SOURCE
0038 #include <stdio.h>
0039 #include <errno.h>
0040 #include <unistd.h>
0041 #include <stdlib.h>
0042 #include <sys/types.h>
0043 #include <sys/stat.h>
0044 #include <fcntl.h>
0045 #include <time.h>
0046 #include <signal.h>
0047 #include <poll.h>
0048 #include <string.h>
0049 #include <linux/mman.h>
0050 #include <sys/mman.h>
0051 #include <sys/syscall.h>
0052 #include <sys/ioctl.h>
0053 #include <sys/wait.h>
0054 #include <pthread.h>
0055 #include <linux/userfaultfd.h>
0056 #include <setjmp.h>
0057 #include <stdbool.h>
0058 #include <assert.h>
0059 #include <inttypes.h>
0060 #include <stdint.h>
0061 #include <sys/random.h>
0062 
0063 #include "../kselftest.h"
0064 
0065 #ifdef __NR_userfaultfd
0066 
/*
 * Test geometry.  Each test area spans nr_pages * page_size bytes and the
 * background threads each handle nr_pages_per_cpu pages of it.
 */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/*
 * Flag bits tested against "bounces" by the worker threads:
 *  - BOUNCE_RANDOM:       locking threads pick pages with getrandom()
 *                         instead of walking them sequentially.
 *  - BOUNCE_RACINGFAULTS: when clear, each sequential locking thread starts
 *                         at its own per-CPU offset.
 *  - BOUNCE_VERIFY:       not used in this part of the file.
 *  - BOUNCE_POLL:         fault handlers use poll() rather than plain read().
 */
#define BOUNCE_RANDOM       (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
#define BOUNCE_VERIFY       (1<<2)
#define BOUNCE_POLL     (1<<3)
static int bounces;

/* Kind of memory backing the test areas; stored in test_type. */
#define TEST_ANON   1
#define TEST_HUGETLB    2
#define TEST_SHMEM  3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
/* One-shot request: the next successful retry-enabled copy repeats itself. */
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = true;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* True when the hugetlb areas are MAP_SHARED mappings of huge_fd. */
static bool map_shared;
/* File descriptor mapped by the shmem allocator. */
static int shm_fd;
/* File descriptor mapped by the shared hugetlb allocator. */
static int huge_fd;
/* Expected per-page counter values, nr_pages entries. */
static unsigned long long *count_verify;
/* The userfaultfd descriptor, or -1 when none is open. */
static int uffd = -1;
/*
 * uffd_flags: result of fcntl(F_GETFD) on uffd.
 * finished:   set by stress() to make the locking threads exit.
 * pipefd:     nr_cpus pipe pairs used to stop the poll threads.
 */
static int uffd_flags, finished, *pipefd;
/* The two test areas and, for shared mappings, their second mappings. */
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
/* Thread attributes handed to every pthread_create() in stress(). */
pthread_attr_t attr;
0098 
/*
 * Userfaultfd test statistics.
 *
 * One entry is handed to each fault-handling thread, which bumps the
 * counter matching the kind of fault it resolved.
 */
struct uffd_stats {
    int cpu;                        /* index of the owning CPU slot */
    unsigned long missing_faults;   /* missing faults resolved by a copy */
    unsigned long wp_faults;        /* write-protect faults resolved */
    unsigned long minor_faults;     /* minor faults resolved by CONTINUE */
};
0106 
/*
 * pthread_mutex_t starts at page offset 0.
 *
 * Evaluates to a pointer to the mutex stored at the very beginning of page
 * ___nr of ___area.
 */
#define area_mutex(___area, ___nr)                  \
    ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 *
 * The address is computed by skipping the mutex at the start of page ___nr
 * and rounding up to the next multiple of sizeof(unsigned long long).
 * Evaluates to a volatile pointer to that per-page counter.
 */
#define area_count(___area, ___nr)                  \
    ((volatile unsigned long long *) ((unsigned long)       \
                 ((___area) + (___nr)*page_size +   \
                  sizeof(pthread_mutex_t) +     \
                  sizeof(unsigned long long) - 1) & \
                 ~(unsigned long)(sizeof(unsigned long long) \
                          -  1)))
0121 
/*
 * Exchange the values of two lvalues of the same type.
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define swap(a, b) \
    do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
0124 
/* Example command lines printed by usage() after the synopsis. */
const char *examples =
    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
    "./userfaultfd anon 100 99999\n\n"
    "# Run share memory test on 1GiB region with 99 bounces:\n"
    "./userfaultfd shmem 1000 99\n\n"
    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
    "./userfaultfd hugetlb 256 50\n\n"
    "# Run the same hugetlb test but using shared file:\n"
    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
    "# 10MiB-~6GiB 999 bounces anonymous test, "
    "continue forever unless an error triggers\n"
    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
0137 
0138 static void usage(void)
0139 {
0140     fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
0141         "[hugetlbfs_file]\n\n");
0142     fprintf(stderr, "Supported <test type>: anon, hugetlb, "
0143         "hugetlb_shared, shmem\n\n");
0144     fprintf(stderr, "Examples:\n\n");
0145     fprintf(stderr, "%s", examples);
0146     exit(1);
0147 }
0148 
/*
 * Print "ERROR: <formatted message> (errno=N, line=L)" to stderr.
 *
 * errno is sampled before any output so the fprintf() calls cannot disturb
 * the reported value.  The local that holds it has a deliberately unusual
 * name: the format arguments are expanded inside this block, so a common
 * name such as "ret" would silently capture a caller's variable of the same
 * name and print errno in its place.
 *
 * fmt must be a string literal because it is concatenated with "ERROR: ".
 */
#define _err(fmt, ...)                      \
    do {                            \
        int _err_saved_errno = errno;           \
        fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
        fprintf(stderr, " (errno=%d, line=%d)\n",   \
            _err_saved_errno, __LINE__);        \
    } while (0)
0156 
/*
 * Report a fatal error through _err() and terminate the process with exit
 * status 1.  Takes the same arguments as _err(); never returns.
 */
#define err(fmt, ...)               \
    do {                    \
        _err(fmt, ##__VA_ARGS__);   \
        exit(1);            \
    } while (0)
0162 
0163 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
0164                  unsigned long n_cpus)
0165 {
0166     int i;
0167 
0168     for (i = 0; i < n_cpus; i++) {
0169         uffd_stats[i].cpu = i;
0170         uffd_stats[i].missing_faults = 0;
0171         uffd_stats[i].wp_faults = 0;
0172         uffd_stats[i].minor_faults = 0;
0173     }
0174 }
0175 
0176 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
0177 {
0178     int i;
0179     unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
0180 
0181     for (i = 0; i < n_cpus; i++) {
0182         miss_total += stats[i].missing_faults;
0183         wp_total += stats[i].wp_faults;
0184         minor_total += stats[i].minor_faults;
0185     }
0186 
0187     printf("userfaults: ");
0188     if (miss_total) {
0189         printf("%llu missing (", miss_total);
0190         for (i = 0; i < n_cpus; i++)
0191             printf("%lu+", stats[i].missing_faults);
0192         printf("\b) ");
0193     }
0194     if (wp_total) {
0195         printf("%llu wp (", wp_total);
0196         for (i = 0; i < n_cpus; i++)
0197             printf("%lu+", stats[i].wp_faults);
0198         printf("\b) ");
0199     }
0200     if (minor_total) {
0201         printf("%llu minor (", minor_total);
0202         for (i = 0; i < n_cpus; i++)
0203             printf("%lu+", stats[i].minor_faults);
0204         printf("\b)");
0205     }
0206     printf("\n");
0207 }
0208 
0209 static void anon_release_pages(char *rel_area)
0210 {
0211     if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
0212         err("madvise(MADV_DONTNEED) failed");
0213 }
0214 
0215 static void anon_allocate_area(void **alloc_area)
0216 {
0217     *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
0218                MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
0219     if (*alloc_area == MAP_FAILED)
0220         err("mmap of anonymous memory failed");
0221 }
0222 
/*
 * alias_mapping hook for memory types without an alias mapping:
 * leaves *start untouched.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
    (void)start;
    (void)len;
    (void)offset;
}
0226 
0227 static void hugetlb_release_pages(char *rel_area)
0228 {
0229     if (!map_shared) {
0230         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
0231             err("madvise(MADV_DONTNEED) failed");
0232     } else {
0233         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
0234             err("madvise(MADV_REMOVE) failed");
0235     }
0236 }
0237 
0238 static void hugetlb_allocate_area(void **alloc_area)
0239 {
0240     void *area_alias = NULL;
0241     char **alloc_area_alias;
0242 
0243     if (!map_shared)
0244         *alloc_area = mmap(NULL,
0245             nr_pages * page_size,
0246             PROT_READ | PROT_WRITE,
0247             MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
0248                 (*alloc_area == area_src ? 0 : MAP_NORESERVE),
0249             -1,
0250             0);
0251     else
0252         *alloc_area = mmap(NULL,
0253             nr_pages * page_size,
0254             PROT_READ | PROT_WRITE,
0255             MAP_SHARED |
0256                 (*alloc_area == area_src ? 0 : MAP_NORESERVE),
0257             huge_fd,
0258             *alloc_area == area_src ? 0 : nr_pages * page_size);
0259     if (*alloc_area == MAP_FAILED)
0260         err("mmap of hugetlbfs file failed");
0261 
0262     if (map_shared) {
0263         area_alias = mmap(NULL,
0264             nr_pages * page_size,
0265             PROT_READ | PROT_WRITE,
0266             MAP_SHARED,
0267             huge_fd,
0268             *alloc_area == area_src ? 0 : nr_pages * page_size);
0269         if (area_alias == MAP_FAILED)
0270             err("mmap of hugetlb file alias failed");
0271     }
0272 
0273     if (*alloc_area == area_src) {
0274         alloc_area_alias = &area_src_alias;
0275     } else {
0276         alloc_area_alias = &area_dst_alias;
0277     }
0278     if (area_alias)
0279         *alloc_area_alias = area_alias;
0280 }
0281 
0282 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
0283 {
0284     if (!map_shared)
0285         return;
0286 
0287     *start = (unsigned long) area_dst_alias + offset;
0288 }
0289 
0290 static void shmem_release_pages(char *rel_area)
0291 {
0292     if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
0293         err("madvise(MADV_REMOVE) failed");
0294 }
0295 
0296 static void shmem_allocate_area(void **alloc_area)
0297 {
0298     void *area_alias = NULL;
0299     bool is_src = alloc_area == (void **)&area_src;
0300     unsigned long offset = is_src ? 0 : nr_pages * page_size;
0301 
0302     *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
0303                MAP_SHARED, shm_fd, offset);
0304     if (*alloc_area == MAP_FAILED)
0305         err("mmap of memfd failed");
0306 
0307     area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
0308               MAP_SHARED, shm_fd, offset);
0309     if (area_alias == MAP_FAILED)
0310         err("mmap of memfd alias failed");
0311 
0312     if (is_src)
0313         area_src_alias = area_alias;
0314     else
0315         area_dst_alias = area_alias;
0316 }
0317 
0318 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
0319 {
0320     *start = (unsigned long)area_dst_alias + offset;
0321 }
0322 
/* Per-memory-type hooks used to set up and tear down the test areas. */
struct uffd_test_ops {
    /* Map one area and store its address in *alloc_area. */
    void (*allocate_area)(void **alloc_area);
    /* Drop all pages backing rel_area. */
    void (*release_pages)(char *rel_area);
    /* Optionally redirect *start to the alias mapping at offset. */
    void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
0328 
/* Hooks for anonymous memory; there is no alias mapping. */
static struct uffd_test_ops anon_uffd_test_ops = {
    .allocate_area  = anon_allocate_area,
    .release_pages  = anon_release_pages,
    .alias_mapping = noop_alias_mapping,
};
0334 
/* Hooks for shmem memory mapped from shm_fd. */
static struct uffd_test_ops shmem_uffd_test_ops = {
    .allocate_area  = shmem_allocate_area,
    .release_pages  = shmem_release_pages,
    .alias_mapping = shmem_alias_mapping,
};
0340 
/* Hooks for hugetlb memory, private or shared depending on map_shared. */
static struct uffd_test_ops hugetlb_uffd_test_ops = {
    .allocate_area  = hugetlb_allocate_area,
    .release_pages  = hugetlb_release_pages,
    .alias_mapping = hugetlb_alias_mapping,
};
0346 
/* Hook table in use for the current test. */
static struct uffd_test_ops *uffd_test_ops;
0348 
0349 static inline uint64_t uffd_minor_feature(void)
0350 {
0351     if (test_type == TEST_HUGETLB && map_shared)
0352         return UFFD_FEATURE_MINOR_HUGETLBFS;
0353     else if (test_type == TEST_SHMEM)
0354         return UFFD_FEATURE_MINOR_SHMEM;
0355     else
0356         return 0;
0357 }
0358 
0359 static uint64_t get_expected_ioctls(uint64_t mode)
0360 {
0361     uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
0362 
0363     if (test_type == TEST_HUGETLB)
0364         ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
0365 
0366     if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
0367         ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
0368 
0369     if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
0370         ioctls &= ~(1 << _UFFDIO_CONTINUE);
0371 
0372     return ioctls;
0373 }
0374 
/*
 * Verify that every ioctl expected for the given registration mode is set
 * in ioctls.  Extra bits in ioctls are ignored.  Exits through err() when
 * an expected bit is missing.
 */
static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
{
    const uint64_t wanted = get_expected_ioctls(mode);
    const uint64_t present = ioctls & wanted;

    if (present == wanted)
        return;

    err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
        wanted, present);
}
0385 
0386 static void userfaultfd_open(uint64_t *features)
0387 {
0388     struct uffdio_api uffdio_api;
0389 
0390     uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
0391     if (uffd < 0)
0392         err("userfaultfd syscall not available in this kernel");
0393     uffd_flags = fcntl(uffd, F_GETFD, NULL);
0394 
0395     uffdio_api.api = UFFD_API;
0396     uffdio_api.features = *features;
0397     if (ioctl(uffd, UFFDIO_API, &uffdio_api))
0398         err("UFFDIO_API failed.\nPlease make sure to "
0399             "run with either root or ptrace capability.");
0400     if (uffdio_api.api != UFFD_API)
0401         err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
0402 
0403     *features = uffdio_api.features;
0404 }
0405 
0406 static inline void munmap_area(void **area)
0407 {
0408     if (*area)
0409         if (munmap(*area, nr_pages * page_size))
0410             err("munmap");
0411 
0412     *area = NULL;
0413 }
0414 
0415 static void uffd_test_ctx_clear(void)
0416 {
0417     size_t i;
0418 
0419     if (pipefd) {
0420         for (i = 0; i < nr_cpus * 2; ++i) {
0421             if (close(pipefd[i]))
0422                 err("close pipefd");
0423         }
0424         free(pipefd);
0425         pipefd = NULL;
0426     }
0427 
0428     if (count_verify) {
0429         free(count_verify);
0430         count_verify = NULL;
0431     }
0432 
0433     if (uffd != -1) {
0434         if (close(uffd))
0435             err("close uffd");
0436         uffd = -1;
0437     }
0438 
0439     munmap_area((void **)&area_src);
0440     munmap_area((void **)&area_src_alias);
0441     munmap_area((void **)&area_dst);
0442     munmap_area((void **)&area_dst_alias);
0443 }
0444 
0445 static void uffd_test_ctx_init(uint64_t features)
0446 {
0447     unsigned long nr, cpu;
0448 
0449     uffd_test_ctx_clear();
0450 
0451     uffd_test_ops->allocate_area((void **)&area_src);
0452     uffd_test_ops->allocate_area((void **)&area_dst);
0453 
0454     userfaultfd_open(&features);
0455 
0456     count_verify = malloc(nr_pages * sizeof(unsigned long long));
0457     if (!count_verify)
0458         err("count_verify");
0459 
0460     for (nr = 0; nr < nr_pages; nr++) {
0461         *area_mutex(area_src, nr) =
0462             (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
0463         count_verify[nr] = *area_count(area_src, nr) = 1;
0464         /*
0465          * In the transition between 255 to 256, powerpc will
0466          * read out of order in my_bcmp and see both bytes as
0467          * zero, so leave a placeholder below always non-zero
0468          * after the count, to avoid my_bcmp to trigger false
0469          * positives.
0470          */
0471         *(area_count(area_src, nr) + 1) = 1;
0472     }
0473 
0474     /*
0475      * After initialization of area_src, we must explicitly release pages
0476      * for area_dst to make sure it's fully empty.  Otherwise we could have
0477      * some area_dst pages be errornously initialized with zero pages,
0478      * hence we could hit memory corruption later in the test.
0479      *
0480      * One example is when THP is globally enabled, above allocate_area()
0481      * calls could have the two areas merged into a single VMA (as they
0482      * will have the same VMA flags so they're mergeable).  When we
0483      * initialize the area_src above, it's possible that some part of
0484      * area_dst could have been faulted in via one huge THP that will be
0485      * shared between area_src and area_dst.  It could cause some of the
0486      * area_dst won't be trapped by missing userfaults.
0487      *
0488      * This release_pages() will guarantee even if that happened, we'll
0489      * proactively split the thp and drop any accidentally initialized
0490      * pages within area_dst.
0491      */
0492     uffd_test_ops->release_pages(area_dst);
0493 
0494     pipefd = malloc(sizeof(int) * nr_cpus * 2);
0495     if (!pipefd)
0496         err("pipefd");
0497     for (cpu = 0; cpu < nr_cpus; cpu++)
0498         if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
0499             err("pipe");
0500 }
0501 
/*
 * Compare the first n bytes of two buffers, front to back.
 * Returns 0 when they are all equal and 1 at the first difference.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
    const char *lhs = str1;
    const char *rhs = str2;
    size_t remaining = n;

    while (remaining-- > 0) {
        if (*lhs++ != *rhs++)
            return 1;
    }
    return 0;
}
0510 
/*
 * Set or clear userfaultfd write-protection on [start, start + len).
 *
 * @ufd:   userfaultfd the range is registered with
 * @start: first byte of the range
 * @len:   length of the range in bytes
 * @wp:    true to write-protect the range, false to remove the protection
 *
 * Exits through err() if the ioctl fails; the message names the operation
 * that was actually attempted.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
    struct uffdio_writeprotect prms;

    prms.range.start = start;
    prms.range.len = len;
    /* A mode of 0 removes the protection without requesting DONTWAKE. */
    prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

    if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
        err("%s WP failed: address=0x%"PRIx64,
            wp ? "set" : "clear", (uint64_t)start);
}
0524 
/*
 * Resolve a minor fault on [start, start + len) with UFFDIO_CONTINUE, then
 * issue the same request again and require it to fail with -EEXIST in
 * req.mapped.  Exits through err() if either step misbehaves.
 */
static void continue_range(int ufd, __u64 start, __u64 len)
{
    struct uffdio_continue req = {
        .range = { .start = start, .len = len },
        .mode = 0,
    };
    int ret;

    if (ioctl(ufd, UFFDIO_CONTINUE, &req) != 0)
        err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
            (uint64_t)start);

    /*
     * The kernel's error path for continue differs subtly from copy and
     * zeropage, which makes it a likely place for bugs.  Provoke -EEXIST
     * deliberately to check that doing so does not cause a BUG.
     */
    req.mapped = 0;
    ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
    if (!(ret < 0 && req.mapped == -EEXIST))
        err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
            ret, (int64_t) req.mapped);
}
0549 
0550 static void *locking_thread(void *arg)
0551 {
0552     unsigned long cpu = (unsigned long) arg;
0553     unsigned long page_nr;
0554     unsigned long long count;
0555 
0556     if (!(bounces & BOUNCE_RANDOM)) {
0557         page_nr = -bounces;
0558         if (!(bounces & BOUNCE_RACINGFAULTS))
0559             page_nr += cpu * nr_pages_per_cpu;
0560     }
0561 
0562     while (!finished) {
0563         if (bounces & BOUNCE_RANDOM) {
0564             if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
0565                 err("getrandom failed");
0566         } else
0567             page_nr += 1;
0568         page_nr %= nr_pages;
0569         pthread_mutex_lock(area_mutex(area_dst, page_nr));
0570         count = *area_count(area_dst, page_nr);
0571         if (count != count_verify[page_nr])
0572             err("page_nr %lu memory corruption %llu %llu",
0573                 page_nr, count, count_verify[page_nr]);
0574         count++;
0575         *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
0576         pthread_mutex_unlock(area_mutex(area_dst, page_nr));
0577     }
0578 
0579     return NULL;
0580 }
0581 
0582 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
0583                 unsigned long offset)
0584 {
0585     uffd_test_ops->alias_mapping(&uffdio_copy->dst,
0586                      uffdio_copy->len,
0587                      offset);
0588     if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
0589         /* real retval in ufdio_copy.copy */
0590         if (uffdio_copy->copy != -EEXIST)
0591             err("UFFDIO_COPY retry error: %"PRId64,
0592                 (int64_t)uffdio_copy->copy);
0593     } else {
0594         err("UFFDIO_COPY retry unexpected: %"PRId64,
0595             (int64_t)uffdio_copy->copy);
0596     }
0597 }
0598 
/*
 * Wake any threads blocked on faults inside [addr, addr + len) with
 * UFFDIO_WAKE.  Prints a message and exits with status 1 on failure.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
    struct uffdio_range wake_req = { .start = addr, .len = len };

    if (ioctl(ufd, UFFDIO_WAKE, &wake_req) == 0)
        return;

    fprintf(stderr, "error waking %lu\n", addr);
    exit(1);
}
0610 
0611 static int __copy_page(int ufd, unsigned long offset, bool retry)
0612 {
0613     struct uffdio_copy uffdio_copy;
0614 
0615     if (offset >= nr_pages * page_size)
0616         err("unexpected offset %lu\n", offset);
0617     uffdio_copy.dst = (unsigned long) area_dst + offset;
0618     uffdio_copy.src = (unsigned long) area_src + offset;
0619     uffdio_copy.len = page_size;
0620     if (test_uffdio_wp)
0621         uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
0622     else
0623         uffdio_copy.mode = 0;
0624     uffdio_copy.copy = 0;
0625     if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
0626         /* real retval in ufdio_copy.copy */
0627         if (uffdio_copy.copy != -EEXIST)
0628             err("UFFDIO_COPY error: %"PRId64,
0629                 (int64_t)uffdio_copy.copy);
0630         wake_range(ufd, uffdio_copy.dst, page_size);
0631     } else if (uffdio_copy.copy != page_size) {
0632         err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
0633     } else {
0634         if (test_uffdio_copy_eexist && retry) {
0635             test_uffdio_copy_eexist = false;
0636             retry_copy_page(ufd, &uffdio_copy, offset);
0637         }
0638         return 1;
0639     }
0640     return 0;
0641 }
0642 
/*
 * Copy the page at offset into area_dst, allowing the one-shot duplicate
 * copy.  Returns the result of __copy_page().
 */
static int copy_page_retry(int ufd, unsigned long offset)
{
    return __copy_page(ufd, offset, true);
}
0647 
/*
 * Copy the page at offset into area_dst without the duplicate-copy check.
 * Returns the result of __copy_page().
 */
static int copy_page(int ufd, unsigned long offset)
{
    return __copy_page(ufd, offset, false);
}
0652 
/*
 * Read one userfaultfd message from ufd into *msg.
 *
 * @ufd: descriptor to read from.  The read goes to this descriptor rather
 *       than the global uffd, so the parameter is honoured.
 * @msg: destination for the message
 *
 * Returns 0 when a complete message was read, and 1 when the read should
 * simply be retried (EAGAIN or EINTR).  Exits through err() on a short
 * read or any other error.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
    /* read() returns ssize_t; keep the full width for the comparison. */
    ssize_t nread = read(ufd, msg, sizeof(*msg));

    if (nread == (ssize_t)sizeof(*msg))
        return 0;

    if (nread >= 0)
        err("short read");

    if (errno == EAGAIN || errno == EINTR)
        return 1;

    err("blocking read error");
    return 0;   /* not reached: err() exits */
}
0669 
0670 static void uffd_handle_page_fault(struct uffd_msg *msg,
0671                    struct uffd_stats *stats)
0672 {
0673     unsigned long offset;
0674 
0675     if (msg->event != UFFD_EVENT_PAGEFAULT)
0676         err("unexpected msg event %u", msg->event);
0677 
0678     if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
0679         /* Write protect page faults */
0680         wp_range(uffd, msg->arg.pagefault.address, page_size, false);
0681         stats->wp_faults++;
0682     } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
0683         uint8_t *area;
0684         int b;
0685 
0686         /*
0687          * Minor page faults
0688          *
0689          * To prove we can modify the original range for testing
0690          * purposes, we're going to bit flip this range before
0691          * continuing.
0692          *
0693          * Note that this requires all minor page fault tests operate on
0694          * area_dst (non-UFFD-registered) and area_dst_alias
0695          * (UFFD-registered).
0696          */
0697 
0698         area = (uint8_t *)(area_dst +
0699                    ((char *)msg->arg.pagefault.address -
0700                     area_dst_alias));
0701         for (b = 0; b < page_size; ++b)
0702             area[b] = ~area[b];
0703         continue_range(uffd, msg->arg.pagefault.address, page_size);
0704         stats->minor_faults++;
0705     } else {
0706         /* Missing page faults */
0707         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
0708             err("unexpected write fault");
0709 
0710         offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
0711         offset &= ~(page_size-1);
0712 
0713         if (copy_page(uffd, offset))
0714             stats->missing_faults++;
0715     }
0716 }
0717 
0718 static void *uffd_poll_thread(void *arg)
0719 {
0720     struct uffd_stats *stats = (struct uffd_stats *)arg;
0721     unsigned long cpu = stats->cpu;
0722     struct pollfd pollfd[2];
0723     struct uffd_msg msg;
0724     struct uffdio_register uffd_reg;
0725     int ret;
0726     char tmp_chr;
0727 
0728     pollfd[0].fd = uffd;
0729     pollfd[0].events = POLLIN;
0730     pollfd[1].fd = pipefd[cpu*2];
0731     pollfd[1].events = POLLIN;
0732 
0733     for (;;) {
0734         ret = poll(pollfd, 2, -1);
0735         if (ret <= 0) {
0736             if (errno == EINTR || errno == EAGAIN)
0737                 continue;
0738             err("poll error: %d", ret);
0739         }
0740         if (pollfd[1].revents & POLLIN) {
0741             if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
0742                 err("read pipefd error");
0743             break;
0744         }
0745         if (!(pollfd[0].revents & POLLIN))
0746             err("pollfd[0].revents %d", pollfd[0].revents);
0747         if (uffd_read_msg(uffd, &msg))
0748             continue;
0749         switch (msg.event) {
0750         default:
0751             err("unexpected msg event %u\n", msg.event);
0752             break;
0753         case UFFD_EVENT_PAGEFAULT:
0754             uffd_handle_page_fault(&msg, stats);
0755             break;
0756         case UFFD_EVENT_FORK:
0757             close(uffd);
0758             uffd = msg.arg.fork.ufd;
0759             pollfd[0].fd = uffd;
0760             break;
0761         case UFFD_EVENT_REMOVE:
0762             uffd_reg.range.start = msg.arg.remove.start;
0763             uffd_reg.range.len = msg.arg.remove.end -
0764                 msg.arg.remove.start;
0765             if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
0766                 err("remove failure");
0767             break;
0768         case UFFD_EVENT_REMAP:
0769             area_dst = (char *)(unsigned long)msg.arg.remap.to;
0770             break;
0771         }
0772     }
0773 
0774     return NULL;
0775 }
0776 
/*
 * Start-up handshake for the read-based fault handlers: each
 * uffd_read_thread() unlocks it on entry and stress() locks it right after
 * creating the thread.
 */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
0778 
0779 static void *uffd_read_thread(void *arg)
0780 {
0781     struct uffd_stats *stats = (struct uffd_stats *)arg;
0782     struct uffd_msg msg;
0783 
0784     pthread_mutex_unlock(&uffd_read_mutex);
0785     /* from here cancellation is ok */
0786 
0787     for (;;) {
0788         if (uffd_read_msg(uffd, &msg))
0789             continue;
0790         uffd_handle_page_fault(&msg, stats);
0791     }
0792 
0793     return NULL;
0794 }
0795 
0796 static void *background_thread(void *arg)
0797 {
0798     unsigned long cpu = (unsigned long) arg;
0799     unsigned long page_nr, start_nr, mid_nr, end_nr;
0800 
0801     start_nr = cpu * nr_pages_per_cpu;
0802     end_nr = (cpu+1) * nr_pages_per_cpu;
0803     mid_nr = (start_nr + end_nr) / 2;
0804 
0805     /* Copy the first half of the pages */
0806     for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
0807         copy_page_retry(uffd, page_nr * page_size);
0808 
0809     /*
0810      * If we need to test uffd-wp, set it up now.  Then we'll have
0811      * at least the first half of the pages mapped already which
0812      * can be write-protected for testing
0813      */
0814     if (test_uffdio_wp)
0815         wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
0816             nr_pages_per_cpu * page_size, true);
0817 
0818     /*
0819      * Continue the 2nd half of the page copying, handling write
0820      * protection faults if any
0821      */
0822     for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
0823         copy_page_retry(uffd, page_nr * page_size);
0824 
0825     return NULL;
0826 }
0827 
0828 static int stress(struct uffd_stats *uffd_stats)
0829 {
0830     unsigned long cpu;
0831     pthread_t locking_threads[nr_cpus];
0832     pthread_t uffd_threads[nr_cpus];
0833     pthread_t background_threads[nr_cpus];
0834 
0835     finished = 0;
0836     for (cpu = 0; cpu < nr_cpus; cpu++) {
0837         if (pthread_create(&locking_threads[cpu], &attr,
0838                    locking_thread, (void *)cpu))
0839             return 1;
0840         if (bounces & BOUNCE_POLL) {
0841             if (pthread_create(&uffd_threads[cpu], &attr,
0842                        uffd_poll_thread,
0843                        (void *)&uffd_stats[cpu]))
0844                 return 1;
0845         } else {
0846             if (pthread_create(&uffd_threads[cpu], &attr,
0847                        uffd_read_thread,
0848                        (void *)&uffd_stats[cpu]))
0849                 return 1;
0850             pthread_mutex_lock(&uffd_read_mutex);
0851         }
0852         if (pthread_create(&background_threads[cpu], &attr,
0853                    background_thread, (void *)cpu))
0854             return 1;
0855     }
0856     for (cpu = 0; cpu < nr_cpus; cpu++)
0857         if (pthread_join(background_threads[cpu], NULL))
0858             return 1;
0859 
0860     /*
0861      * Be strict and immediately zap area_src, the whole area has
0862      * been transferred already by the background treads. The
0863      * area_src could then be faulted in a racy way by still
0864      * running uffdio_threads reading zeropages after we zapped
0865      * area_src (but they're guaranteed to get -EEXIST from
0866      * UFFDIO_COPY without writing zero pages into area_dst
0867      * because the background threads already completed).
0868      */
0869     uffd_test_ops->release_pages(area_src);
0870 
0871     finished = 1;
0872     for (cpu = 0; cpu < nr_cpus; cpu++)
0873         if (pthread_join(locking_threads[cpu], NULL))
0874             return 1;
0875 
0876     for (cpu = 0; cpu < nr_cpus; cpu++) {
0877         char c;
0878         if (bounces & BOUNCE_POLL) {
0879             if (write(pipefd[cpu*2+1], &c, 1) != 1)
0880                 err("pipefd write error");
0881             if (pthread_join(uffd_threads[cpu],
0882                      (void *)&uffd_stats[cpu]))
0883                 return 1;
0884         } else {
0885             if (pthread_cancel(uffd_threads[cpu]))
0886                 return 1;
0887             if (pthread_join(uffd_threads[cpu], NULL))
0888                 return 1;
0889         }
0890     }
0891 
0892     return 0;
0893 }
0894 
/*
 * jbuf is the jump buffer for SIGBUS recovery.  sigbuf points to the buffer
 * sighndl() should jump to; while it is NULL a SIGBUS aborts the process.
 */
sigjmp_buf jbuf, *sigbuf;
0896 
0897 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
0898 {
0899     if (sig == SIGBUS) {
0900         if (sigbuf)
0901             siglongjmp(*sigbuf, 1);
0902         abort();
0903     }
0904 }
0905 
0906 /*
0907  * For non-cooperative userfaultfd test we fork() a process that will
0908  * generate pagefaults, will mremap the area monitored by the
0909  * userfaultfd and at last this process will release the monitored
0910  * area.
0911  * For the anonymous and shared memory the area is divided into two
0912  * parts, the first part is accessed before mremap, and the second
0913  * part is accessed after mremap. Since hugetlbfs does not support
0914  * mremap, the entire monitored area is accessed in a single pass for
0915  * HUGETLB_TEST.
0916  * The release of the pages currently generates event for shmem and
0917  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
0918  * for hugetlb.
0919  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
0920  * monitored area, generate pagefaults and test that signal is delivered.
0921  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
0922  * test robustness use case - we release monitored area, fork a process
0923  * that will generate pagefaults and verify signal is generated.
0924  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
0925  * feature. Using monitor thread, verify no userfault events are generated.
0926  */
0927 static int faulting_process(int signal_test)
0928 {
0929     unsigned long nr;
0930     unsigned long long count;
0931     unsigned long split_nr_pages;
0932     unsigned long lastnr;
0933     struct sigaction act;
0934     volatile unsigned long signalled = 0;
0935 
0936     split_nr_pages = (nr_pages + 1) / 2;
0937 
0938     if (signal_test) {
0939         sigbuf = &jbuf;
0940         memset(&act, 0, sizeof(act));
0941         act.sa_sigaction = sighndl;
0942         act.sa_flags = SA_SIGINFO;
0943         if (sigaction(SIGBUS, &act, 0))
0944             err("sigaction");
0945         lastnr = (unsigned long)-1;
0946     }
0947 
0948     for (nr = 0; nr < split_nr_pages; nr++) {
0949         volatile int steps = 1;
0950         unsigned long offset = nr * page_size;
0951 
0952         if (signal_test) {
0953             if (sigsetjmp(*sigbuf, 1) != 0) {
0954                 if (steps == 1 && nr == lastnr)
0955                     err("Signal repeated");
0956 
0957                 lastnr = nr;
0958                 if (signal_test == 1) {
0959                     if (steps == 1) {
0960                         /* This is a MISSING request */
0961                         steps++;
0962                         if (copy_page(uffd, offset))
0963                             signalled++;
0964                     } else {
0965                         /* This is a WP request */
0966                         assert(steps == 2);
0967                         wp_range(uffd,
0968                              (__u64)area_dst +
0969                              offset,
0970                              page_size, false);
0971                     }
0972                 } else {
0973                     signalled++;
0974                     continue;
0975                 }
0976             }
0977         }
0978 
0979         count = *area_count(area_dst, nr);
0980         if (count != count_verify[nr])
0981             err("nr %lu memory corruption %llu %llu\n",
0982                 nr, count, count_verify[nr]);
0983         /*
0984          * Trigger write protection if there is by writing
0985          * the same value back.
0986          */
0987         *area_count(area_dst, nr) = count;
0988     }
0989 
0990     if (signal_test)
0991         return signalled != split_nr_pages;
0992 
0993     area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
0994               MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
0995     if (area_dst == MAP_FAILED)
0996         err("mremap");
0997     /* Reset area_src since we just clobbered it */
0998     area_src = NULL;
0999 
1000     for (; nr < nr_pages; nr++) {
1001         count = *area_count(area_dst, nr);
1002         if (count != count_verify[nr]) {
1003             err("nr %lu memory corruption %llu %llu\n",
1004                 nr, count, count_verify[nr]);
1005         }
1006         /*
1007          * Trigger write protection if there is by writing
1008          * the same value back.
1009          */
1010         *area_count(area_dst, nr) = count;
1011     }
1012 
1013     uffd_test_ops->release_pages(area_dst);
1014 
1015     for (nr = 0; nr < nr_pages; nr++)
1016         if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
1017             err("nr %lu is not zero", nr);
1018 
1019     return 0;
1020 }
1021 
1022 static void retry_uffdio_zeropage(int ufd,
1023                   struct uffdio_zeropage *uffdio_zeropage,
1024                   unsigned long offset)
1025 {
1026     uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1027                      uffdio_zeropage->range.len,
1028                      offset);
1029     if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1030         if (uffdio_zeropage->zeropage != -EEXIST)
1031             err("UFFDIO_ZEROPAGE error: %"PRId64,
1032                 (int64_t)uffdio_zeropage->zeropage);
1033     } else {
1034         err("UFFDIO_ZEROPAGE error: %"PRId64,
1035             (int64_t)uffdio_zeropage->zeropage);
1036     }
1037 }
1038 
1039 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1040 {
1041     struct uffdio_zeropage uffdio_zeropage;
1042     int ret;
1043     bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
1044     __s64 res;
1045 
1046     if (offset >= nr_pages * page_size)
1047         err("unexpected offset %lu", offset);
1048     uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1049     uffdio_zeropage.range.len = page_size;
1050     uffdio_zeropage.mode = 0;
1051     ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1052     res = uffdio_zeropage.zeropage;
1053     if (ret) {
1054         /* real retval in ufdio_zeropage.zeropage */
1055         if (has_zeropage)
1056             err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1057         else if (res != -EINVAL)
1058             err("UFFDIO_ZEROPAGE not -EINVAL");
1059     } else if (has_zeropage) {
1060         if (res != page_size) {
1061             err("UFFDIO_ZEROPAGE unexpected size");
1062         } else {
1063             if (test_uffdio_zeropage_eexist && retry) {
1064                 test_uffdio_zeropage_eexist = false;
1065                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1066                               offset);
1067             }
1068             return 1;
1069         }
1070     } else
1071         err("UFFDIO_ZEROPAGE succeeded");
1072 
1073     return 0;
1074 }
1075 
/*
 * UFFDIO_ZEROPAGE on the page at @offset without the -EEXIST retry.
 * Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	const bool retry = false;

	return __uffdio_zeropage(ufd, offset, retry);
}
1080 
1081 /* exercise UFFDIO_ZEROPAGE */
1082 static int userfaultfd_zeropage_test(void)
1083 {
1084     struct uffdio_register uffdio_register;
1085 
1086     printf("testing UFFDIO_ZEROPAGE: ");
1087     fflush(stdout);
1088 
1089     uffd_test_ctx_init(0);
1090 
1091     uffdio_register.range.start = (unsigned long) area_dst;
1092     uffdio_register.range.len = nr_pages * page_size;
1093     uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1094     if (test_uffdio_wp)
1095         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1096     if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1097         err("register failure");
1098 
1099     assert_expected_ioctls_present(
1100         uffdio_register.mode, uffdio_register.ioctls);
1101 
1102     if (uffdio_zeropage(uffd, 0))
1103         if (my_bcmp(area_dst, zeropage, page_size))
1104             err("zeropage is not zero");
1105 
1106     printf("done.\n");
1107     return 0;
1108 }
1109 
1110 static int userfaultfd_events_test(void)
1111 {
1112     struct uffdio_register uffdio_register;
1113     pthread_t uffd_mon;
1114     int err, features;
1115     pid_t pid;
1116     char c;
1117     struct uffd_stats stats = { 0 };
1118 
1119     printf("testing events (fork, remap, remove): ");
1120     fflush(stdout);
1121 
1122     features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1123         UFFD_FEATURE_EVENT_REMOVE;
1124     uffd_test_ctx_init(features);
1125 
1126     fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1127 
1128     uffdio_register.range.start = (unsigned long) area_dst;
1129     uffdio_register.range.len = nr_pages * page_size;
1130     uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1131     if (test_uffdio_wp)
1132         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1133     if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1134         err("register failure");
1135 
1136     assert_expected_ioctls_present(
1137         uffdio_register.mode, uffdio_register.ioctls);
1138 
1139     if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1140         err("uffd_poll_thread create");
1141 
1142     pid = fork();
1143     if (pid < 0)
1144         err("fork");
1145 
1146     if (!pid)
1147         exit(faulting_process(0));
1148 
1149     waitpid(pid, &err, 0);
1150     if (err)
1151         err("faulting process failed");
1152     if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1153         err("pipe write");
1154     if (pthread_join(uffd_mon, NULL))
1155         return 1;
1156 
1157     uffd_stats_report(&stats, 1);
1158 
1159     return stats.missing_faults != nr_pages;
1160 }
1161 
1162 static int userfaultfd_sig_test(void)
1163 {
1164     struct uffdio_register uffdio_register;
1165     unsigned long userfaults;
1166     pthread_t uffd_mon;
1167     int err, features;
1168     pid_t pid;
1169     char c;
1170     struct uffd_stats stats = { 0 };
1171 
1172     printf("testing signal delivery: ");
1173     fflush(stdout);
1174 
1175     features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1176     uffd_test_ctx_init(features);
1177 
1178     fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1179 
1180     uffdio_register.range.start = (unsigned long) area_dst;
1181     uffdio_register.range.len = nr_pages * page_size;
1182     uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1183     if (test_uffdio_wp)
1184         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1185     if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1186         err("register failure");
1187 
1188     assert_expected_ioctls_present(
1189         uffdio_register.mode, uffdio_register.ioctls);
1190 
1191     if (faulting_process(1))
1192         err("faulting process failed");
1193 
1194     uffd_test_ops->release_pages(area_dst);
1195 
1196     if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1197         err("uffd_poll_thread create");
1198 
1199     pid = fork();
1200     if (pid < 0)
1201         err("fork");
1202 
1203     if (!pid)
1204         exit(faulting_process(2));
1205 
1206     waitpid(pid, &err, 0);
1207     if (err)
1208         err("faulting process failed");
1209     if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1210         err("pipe write");
1211     if (pthread_join(uffd_mon, (void **)&userfaults))
1212         return 1;
1213 
1214     printf("done.\n");
1215     if (userfaults)
1216         err("Signal test failed, userfaults: %ld", userfaults);
1217 
1218     return userfaults != 0;
1219 }
1220 
1221 static int userfaultfd_minor_test(void)
1222 {
1223     struct uffdio_register uffdio_register;
1224     unsigned long p;
1225     pthread_t uffd_mon;
1226     uint8_t expected_byte;
1227     void *expected_page;
1228     char c;
1229     struct uffd_stats stats = { 0 };
1230 
1231     if (!test_uffdio_minor)
1232         return 0;
1233 
1234     printf("testing minor faults: ");
1235     fflush(stdout);
1236 
1237     uffd_test_ctx_init(uffd_minor_feature());
1238 
1239     uffdio_register.range.start = (unsigned long)area_dst_alias;
1240     uffdio_register.range.len = nr_pages * page_size;
1241     uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1242     if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1243         err("register failure");
1244 
1245     assert_expected_ioctls_present(
1246         uffdio_register.mode, uffdio_register.ioctls);
1247 
1248     /*
1249      * After registering with UFFD, populate the non-UFFD-registered side of
1250      * the shared mapping. This should *not* trigger any UFFD minor faults.
1251      */
1252     for (p = 0; p < nr_pages; ++p) {
1253         memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1254                page_size);
1255     }
1256 
1257     if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1258         err("uffd_poll_thread create");
1259 
1260     /*
1261      * Read each of the pages back using the UFFD-registered mapping. We
1262      * expect that the first time we touch a page, it will result in a minor
1263      * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1264      * page's contents, and then issuing a CONTINUE ioctl.
1265      */
1266 
1267     if (posix_memalign(&expected_page, page_size, page_size))
1268         err("out of memory");
1269 
1270     for (p = 0; p < nr_pages; ++p) {
1271         expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1272         memset(expected_page, expected_byte, page_size);
1273         if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1274                 page_size))
1275             err("unexpected page contents after minor fault");
1276     }
1277 
1278     if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1279         err("pipe write");
1280     if (pthread_join(uffd_mon, NULL))
1281         return 1;
1282 
1283     uffd_stats_report(&stats, 1);
1284 
1285     return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1286 }
1287 
/* Single-bit 64-bit mask with bit @nr set. */
#define BIT_ULL(nr)		(1ULL << (nr))

/*
 * Flag bits tested against the 64-bit entries read from the pagemap
 * file, listed from the highest bit down.
 */
#define PM_PRESENT		BIT_ULL(63)
#define PM_SWAP			BIT_ULL(62)
#define PM_FILE			BIT_ULL(61)
#define PM_UFFD_WP		BIT_ULL(57)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_SOFT_DIRTY		BIT_ULL(55)
1295 
/*
 * Open this process's pagemap file read-only and return the descriptor.
 * A failed open() is reported through err().
 */
static int pagemap_open(void)
{
	int pagemap_fd;

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		err("open pagemap");

	return pagemap_fd;
}
1305 
/*
 * Read the 64-bit pagemap entry describing @vaddr from the pagemap
 * descriptor @fd.
 *
 * The pagemap file holds one 64-bit entry per virtual page, so the file
 * offset is the virtual page number times the entry size. The page
 * number is derived from the system base page size instead of assuming
 * 4 KiB pages.
 *
 * Returns the raw entry; a short or failed read is reported through
 * err().
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	uint64_t value;
	ssize_t ret;
	long base_pgsize = sysconf(_SC_PAGESIZE);

	if (base_pgsize <= 0)
		err("sysconf(_SC_PAGESIZE) failed");

	ret = pread(fd, &value, sizeof(uint64_t),
		    ((uint64_t)vaddr / (uint64_t)base_pgsize) *
		    sizeof(uint64_t));
	if (ret != (ssize_t)sizeof(uint64_t))
		err("pread() on pagemap failed");

	return value;
}
1318 
/*
 * Check that the uffd-wp bit of pagemap entry @value matches the
 * expected boolean @wp, reporting a mismatch through err().
 *
 * Kept as a macro so that __LINE__ in err() refers to the caller.
 * Both arguments are parenthesized and @value is evaluated only once.
 */
#define  pagemap_check_wp(value, wp) do {				\
		uint64_t pm_entry_ = (value);				\
		if (!!(pm_entry_ & PM_UFFD_WP) != !!(wp))		\
			err("pagemap uffd-wp bit error: 0x%"PRIx64,	\
			    pm_entry_);					\
	} while (0)
1324 
1325 static int pagemap_test_fork(bool present)
1326 {
1327     pid_t child = fork();
1328     uint64_t value;
1329     int fd, result;
1330 
1331     if (!child) {
1332         /* Open the pagemap fd of the child itself */
1333         fd = pagemap_open();
1334         value = pagemap_read_vaddr(fd, area_dst);
1335         /*
1336          * After fork() uffd-wp bit should be gone as long as we're
1337          * without UFFD_FEATURE_EVENT_FORK
1338          */
1339         pagemap_check_wp(value, false);
1340         /* Succeed */
1341         exit(0);
1342     }
1343     waitpid(child, &result, 0);
1344     return result;
1345 }
1346 
1347 static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1348 {
1349     struct uffdio_register uffdio_register;
1350     int pagemap_fd;
1351     uint64_t value;
1352 
1353     /* Pagemap tests uffd-wp only */
1354     if (!test_uffdio_wp)
1355         return;
1356 
1357     /* Not enough memory to test this page size */
1358     if (test_pgsize > nr_pages * page_size)
1359         return;
1360 
1361     printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1362     /* Flush so it doesn't flush twice in parent/child later */
1363     fflush(stdout);
1364 
1365     uffd_test_ctx_init(0);
1366 
1367     if (test_pgsize > page_size) {
1368         /* This is a thp test */
1369         if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1370             err("madvise(MADV_HUGEPAGE) failed");
1371     } else if (test_pgsize == page_size) {
1372         /* This is normal page test; force no thp */
1373         if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1374             err("madvise(MADV_NOHUGEPAGE) failed");
1375     }
1376 
1377     uffdio_register.range.start = (unsigned long) area_dst;
1378     uffdio_register.range.len = nr_pages * page_size;
1379     uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1380     if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1381         err("register failed");
1382 
1383     pagemap_fd = pagemap_open();
1384 
1385     /* Touch the page */
1386     *area_dst = 1;
1387     wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1388     value = pagemap_read_vaddr(pagemap_fd, area_dst);
1389     pagemap_check_wp(value, true);
1390     /* Make sure uffd-wp bit dropped when fork */
1391     if (pagemap_test_fork(true))
1392         err("Detected stall uffd-wp bit in child");
1393 
1394     /* Exclusive required or PAGEOUT won't work */
1395     if (!(value & PM_MMAP_EXCLUSIVE))
1396         err("multiple mapping detected: 0x%"PRIx64, value);
1397 
1398     if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1399         err("madvise(MADV_PAGEOUT) failed");
1400 
1401     /* Uffd-wp should persist even swapped out */
1402     value = pagemap_read_vaddr(pagemap_fd, area_dst);
1403     pagemap_check_wp(value, true);
1404     /* Make sure uffd-wp bit dropped when fork */
1405     if (pagemap_test_fork(false))
1406         err("Detected stall uffd-wp bit in child");
1407 
1408     /* Unprotect; this tests swap pte modifications */
1409     wp_range(uffd, (uint64_t)area_dst, page_size, false);
1410     value = pagemap_read_vaddr(pagemap_fd, area_dst);
1411     pagemap_check_wp(value, false);
1412 
1413     /* Fault in the page from disk */
1414     *area_dst = 2;
1415     value = pagemap_read_vaddr(pagemap_fd, area_dst);
1416     pagemap_check_wp(value, false);
1417 
1418     close(pagemap_fd);
1419     printf("done\n");
1420 }
1421 
1422 static int userfaultfd_stress(void)
1423 {
1424     void *area;
1425     unsigned long nr;
1426     struct uffdio_register uffdio_register;
1427     struct uffd_stats uffd_stats[nr_cpus];
1428 
1429     uffd_test_ctx_init(0);
1430 
1431     if (posix_memalign(&area, page_size, page_size))
1432         err("out of memory");
1433     zeropage = area;
1434     bzero(zeropage, page_size);
1435 
1436     pthread_mutex_lock(&uffd_read_mutex);
1437 
1438     pthread_attr_init(&attr);
1439     pthread_attr_setstacksize(&attr, 16*1024*1024);
1440 
1441     while (bounces--) {
1442         printf("bounces: %d, mode:", bounces);
1443         if (bounces & BOUNCE_RANDOM)
1444             printf(" rnd");
1445         if (bounces & BOUNCE_RACINGFAULTS)
1446             printf(" racing");
1447         if (bounces & BOUNCE_VERIFY)
1448             printf(" ver");
1449         if (bounces & BOUNCE_POLL)
1450             printf(" poll");
1451         else
1452             printf(" read");
1453         printf(", ");
1454         fflush(stdout);
1455 
1456         if (bounces & BOUNCE_POLL)
1457             fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1458         else
1459             fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1460 
1461         /* register */
1462         uffdio_register.range.start = (unsigned long) area_dst;
1463         uffdio_register.range.len = nr_pages * page_size;
1464         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1465         if (test_uffdio_wp)
1466             uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1467         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1468             err("register failure");
1469         assert_expected_ioctls_present(
1470             uffdio_register.mode, uffdio_register.ioctls);
1471 
1472         if (area_dst_alias) {
1473             uffdio_register.range.start = (unsigned long)
1474                 area_dst_alias;
1475             if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1476                 err("register failure alias");
1477         }
1478 
1479         /*
1480          * The madvise done previously isn't enough: some
1481          * uffd_thread could have read userfaults (one of
1482          * those already resolved by the background thread)
1483          * and it may be in the process of calling
1484          * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1485          * area_src and it would map a zero page in it (of
1486          * course such a UFFDIO_COPY is perfectly safe as it'd
1487          * return -EEXIST). The problem comes at the next
1488          * bounce though: that racing UFFDIO_COPY would
1489          * generate zeropages in the area_src, so invalidating
1490          * the previous MADV_DONTNEED. Without this additional
1491          * MADV_DONTNEED those zeropages leftovers in the
1492          * area_src would lead to -EEXIST failure during the
1493          * next bounce, effectively leaving a zeropage in the
1494          * area_dst.
1495          *
1496          * Try to comment this out madvise to see the memory
1497          * corruption being caught pretty quick.
1498          *
1499          * khugepaged is also inhibited to collapse THP after
1500          * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1501          * required to MADV_DONTNEED here.
1502          */
1503         uffd_test_ops->release_pages(area_dst);
1504 
1505         uffd_stats_reset(uffd_stats, nr_cpus);
1506 
1507         /* bounce pass */
1508         if (stress(uffd_stats))
1509             return 1;
1510 
1511         /* Clear all the write protections if there is any */
1512         if (test_uffdio_wp)
1513             wp_range(uffd, (unsigned long)area_dst,
1514                  nr_pages * page_size, false);
1515 
1516         /* unregister */
1517         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1518             err("unregister failure");
1519         if (area_dst_alias) {
1520             uffdio_register.range.start = (unsigned long) area_dst;
1521             if (ioctl(uffd, UFFDIO_UNREGISTER,
1522                   &uffdio_register.range))
1523                 err("unregister failure alias");
1524         }
1525 
1526         /* verification */
1527         if (bounces & BOUNCE_VERIFY)
1528             for (nr = 0; nr < nr_pages; nr++)
1529                 if (*area_count(area_dst, nr) != count_verify[nr])
1530                     err("error area_count %llu %llu %lu\n",
1531                         *area_count(area_src, nr),
1532                         count_verify[nr], nr);
1533 
1534         /* prepare next bounce */
1535         swap(area_src, area_dst);
1536 
1537         swap(area_src_alias, area_dst_alias);
1538 
1539         uffd_stats_report(uffd_stats, nr_cpus);
1540     }
1541 
1542     if (test_type == TEST_ANON) {
1543         /*
1544          * shmem/hugetlb won't be able to run since they have different
1545          * behavior on fork() (file-backed memory normally drops ptes
1546          * directly when fork), meanwhile the pagemap test will verify
1547          * pgtable entry of fork()ed child.
1548          */
1549         userfaultfd_pagemap_test(page_size);
1550         /*
1551          * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1552          * currently the only one that supports uffd-wp
1553          */
1554         userfaultfd_pagemap_test(page_size * 512);
1555     }
1556 
1557     return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1558         || userfaultfd_events_test() || userfaultfd_minor_test();
1559 }
1560 
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return its value
 * converted from kB to bytes. Returns 0 when the file cannot be opened
 * or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo = fopen("/proc/meminfo", "r");
	unsigned long size_kb = 0;
	unsigned long bytes = 0;
	char *buf = NULL;
	size_t cap = 0;

	if (meminfo == NULL)
		return 0;

	while (getline(&buf, &cap, meminfo) > 0) {
		if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) != 1)
			continue;

		/* kB -> bytes */
		bytes = size_kb << 10;
		break;
	}

	free(buf);
	fclose(meminfo);
	return bytes;
}
1584 
1585 static void set_test_type(const char *type)
1586 {
1587     uint64_t features = UFFD_API_FEATURES;
1588 
1589     if (!strcmp(type, "anon")) {
1590         test_type = TEST_ANON;
1591         uffd_test_ops = &anon_uffd_test_ops;
1592     } else if (!strcmp(type, "hugetlb")) {
1593         test_type = TEST_HUGETLB;
1594         uffd_test_ops = &hugetlb_uffd_test_ops;
1595     } else if (!strcmp(type, "hugetlb_shared")) {
1596         map_shared = true;
1597         test_type = TEST_HUGETLB;
1598         uffd_test_ops = &hugetlb_uffd_test_ops;
1599         /* Minor faults require shared hugetlb; only enable here. */
1600         test_uffdio_minor = true;
1601     } else if (!strcmp(type, "shmem")) {
1602         map_shared = true;
1603         test_type = TEST_SHMEM;
1604         uffd_test_ops = &shmem_uffd_test_ops;
1605         test_uffdio_minor = true;
1606     } else {
1607         err("Unknown test type: %s", type);
1608     }
1609 
1610     if (test_type == TEST_HUGETLB)
1611         page_size = default_huge_page_size();
1612     else
1613         page_size = sysconf(_SC_PAGE_SIZE);
1614 
1615     if (!page_size)
1616         err("Unable to determine page size");
1617     if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1618         > page_size)
1619         err("Impossible to run this test");
1620 
1621     /*
1622      * Whether we can test certain features depends not just on test type,
1623      * but also on whether or not this particular kernel supports the
1624      * feature.
1625      */
1626 
1627     userfaultfd_open(&features);
1628 
1629     test_uffdio_wp = test_uffdio_wp &&
1630         (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
1631     test_uffdio_minor = test_uffdio_minor &&
1632         (features & uffd_minor_feature());
1633 
1634     close(uffd);
1635     uffd = -1;
1636 }
1637 
1638 static void sigalrm(int sig)
1639 {
1640     if (sig != SIGALRM)
1641         abort();
1642     test_uffdio_copy_eexist = true;
1643     test_uffdio_zeropage_eexist = true;
1644     alarm(ALARM_INTERVAL_SECS);
1645 }
1646 
1647 int main(int argc, char **argv)
1648 {
1649     if (argc < 4)
1650         usage();
1651 
1652     if (signal(SIGALRM, sigalrm) == SIG_ERR)
1653         err("failed to arm SIGALRM");
1654     alarm(ALARM_INTERVAL_SECS);
1655 
1656     set_test_type(argv[1]);
1657 
1658     nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1659     nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1660         nr_cpus;
1661     if (!nr_pages_per_cpu) {
1662         _err("invalid MiB");
1663         usage();
1664     }
1665 
1666     bounces = atoi(argv[3]);
1667     if (bounces <= 0) {
1668         _err("invalid bounces");
1669         usage();
1670     }
1671     nr_pages = nr_pages_per_cpu * nr_cpus;
1672 
1673     if (test_type == TEST_HUGETLB && map_shared) {
1674         if (argc < 5)
1675             usage();
1676         huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1677         if (huge_fd < 0)
1678             err("Open of %s failed", argv[4]);
1679         if (ftruncate(huge_fd, 0))
1680             err("ftruncate %s to size 0 failed", argv[4]);
1681     } else if (test_type == TEST_SHMEM) {
1682         shm_fd = memfd_create(argv[0], 0);
1683         if (shm_fd < 0)
1684             err("memfd_create");
1685         if (ftruncate(shm_fd, nr_pages * page_size * 2))
1686             err("ftruncate");
1687         if (fallocate(shm_fd,
1688                   FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1689                   nr_pages * page_size * 2))
1690             err("fallocate");
1691     }
1692     printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1693            nr_pages, nr_pages_per_cpu);
1694     return userfaultfd_stress();
1695 }
1696 
1697 #else /* __NR_userfaultfd */
1698 
1699 #warning "missing __NR_userfaultfd definition"
1700 
1701 int main(void)
1702 {
1703     printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1704     return KSFT_SKIP;
1705 }
1706 
1707 #endif /* __NR_userfaultfd */