Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
0004  *
0005  * There are examples in here of:
0006  *  * how to set protection keys on memory
0007  *  * how to set/clear bits in pkey registers (the rights register)
0008  *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
0009  *    information from the siginfo
0010  *
0011  * Things to add:
0012  *  make sure KSM and KSM COW breaking works
0013  *  prefault pages in at malloc, or not
0014  *  protect MPX bounds tables with protection keys?
0015  *  make sure VMA splitting/merging is working correctly
0016  *  OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
0017  *  look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
0018  *  do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
0019  *
0020  * Compile like this:
0021  *  gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
0022  *  gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
0023  */
0024 #define _GNU_SOURCE
0025 #define __SANE_USERSPACE_TYPES__
0026 #include <errno.h>
0027 #include <linux/futex.h>
0028 #include <time.h>
0029 #include <sys/time.h>
0030 #include <sys/syscall.h>
0031 #include <string.h>
0032 #include <stdio.h>
0033 #include <stdint.h>
0034 #include <stdbool.h>
0035 #include <signal.h>
0036 #include <assert.h>
0037 #include <stdlib.h>
0038 #include <ucontext.h>
0039 #include <sys/mman.h>
0040 #include <sys/types.h>
0041 #include <sys/wait.h>
0042 #include <sys/stat.h>
0043 #include <fcntl.h>
0044 #include <unistd.h>
0045 #include <sys/ptrace.h>
0046 #include <setjmp.h>
0047 
0048 #include "pkey-helpers.h"
0049 
0050 int iteration_nr = 1;
0051 int test_nr;
0052 
0053 u64 shadow_pkey_reg;
0054 int dprint_in_signal;
0055 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
0056 
/*
 * Write the string @str into the existing file @file.
 *
 * @str:  NUL-terminated payload; written without the terminator.
 * @file: path of the file to open read-write (it is not created).
 *
 * Failing to open the file, or writing fewer bytes than strlen(@str),
 * is fatal: a diagnostic is printed to stderr and the process exits.
 */
void cat_into_file(char *str, char *file)
{
    size_t len = strlen(str);
    ssize_t ret;
    int fd;

    /* log first so that the debug print can not disturb errno below */
    dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);

    fd = open(file, O_RDWR);
    /*
     * these need to be raw because they are called under
     * pkey_assert()
     */
    if (fd < 0) {
        /* name the file that could not be opened, not the payload */
        fprintf(stderr, "error opening '%s'\n", file);
        perror("error: ");
        exit(__LINE__);
    }

    ret = write(fd, str, len);
    /* a negative return is an error; a short write is treated the same */
    if (ret < 0 || (size_t)ret != len) {
        perror("write to file failed");
        fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
        exit(__LINE__);
    }
    close(fd);
}
0081 
0082 #if CONTROL_TRACING > 0
/* Ensures the "not root" warning is printed only once. */
static int warned_tracing;

/*
 * Report whether tracing may be controlled.
 *
 * Returns 1 when the effective uid is 0.  Otherwise returns 0 and, on
 * the first such call only, prints a warning to stderr.
 */
int tracing_root_ok(void)
{
    if (geteuid() == 0)
        return 1;

    if (!warned_tracing)
        fprintf(stderr, "WARNING: not run as root, "
                "can not do tracing control\n");
    warned_tracing = 1;
    return 0;
}
0095 #endif
0096 
/*
 * Configure and start ftrace for this process.
 *
 * Does nothing unless CONTROL_TRACING > 0 and tracing_root_ok() says
 * the caller may control tracing.
 */
void tracing_on(void)
{
#if CONTROL_TRACING > 0
#define TRACEDIR "/sys/kernel/debug/tracing"
    char pid_buf[32];

    if (!tracing_root_ok())
        return;

    sprintf(pid_buf, "%d", getpid());

    /* switch tracing off and write a newline into the trace file */
    cat_into_file("0", TRACEDIR "/tracing_on");
    cat_into_file("\n", TRACEDIR "/trace");

    /* hard-wired choice of tracer; the else arm is kept for reference */
    if (1) {
        cat_into_file("function_graph", TRACEDIR "/current_tracer");
        cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
    } else {
        cat_into_file("nop", TRACEDIR "/current_tracer");
    }

    /* restrict tracing to our own pid, then switch it back on */
    cat_into_file(pid_buf, TRACEDIR "/set_ftrace_pid");
    cat_into_file("1", TRACEDIR "/tracing_on");
    dprintf1("enabled tracing\n");
#endif
}
0120 
/*
 * Stop ftrace.  Does nothing unless CONTROL_TRACING > 0 and
 * tracing_root_ok() allows it.
 */
void tracing_off(void)
{
#if CONTROL_TRACING > 0
    if (tracing_root_ok())
        cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
#endif
}
0129 
/*
 * Housekeeping to run before the test aborts: announce ourselves on
 * stderr, stop tracing and, when SLEEP_ON_ABORT is defined, sleep for
 * that many seconds.
 */
void abort_hooks(void)
{
    fprintf(stderr, "running %s()...\n", __func__);
    tracing_off();
#if defined(SLEEP_ON_ABORT)
    sleep(SLEEP_ON_ABORT);
#endif
}
0138 
0139 /*
0140  * This attempts to have roughly a page of instructions followed by a few
0141  * instructions that do a write, and another page of instructions.  That
0142  * way, we are pretty sure that the write is in the second page of
0143  * instructions and has at least a page of padding behind it.
0144  *
0145  * *That* lets us be sure to madvise() away the write instruction, which
0146  * will then fault, which makes sure that the fault code handles
0147  * execute-only memory properly.
0148  */
0149 #ifdef __powerpc64__
0150 /* This way, both 4K and 64K alignment are maintained */
0151 __attribute__((__aligned__(65536)))
0152 #else
0153 __attribute__((__aligned__(PAGE_SIZE)))
0154 #endif
0155 void lots_o_noops_around_write(int *write_to_me)
0156 {
0157     dprintf3("running %s()\n", __func__);
0158     __page_o_noops();
0159     /* Assume this happens in the second page of instructions: */
0160     *write_to_me = __LINE__;
0161     /* pad out by another page: */
0162     __page_o_noops();
0163     dprintf3("%s() done\n", __func__);
0164 }
0165 
0166 void dump_mem(void *dumpme, int len_bytes)
0167 {
0168     char *c = (void *)dumpme;
0169     int i;
0170 
0171     for (i = 0; i < len_bytes; i += sizeof(u64)) {
0172         u64 *ptr = (u64 *)(c + i);
0173         dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
0174     }
0175 }
0176 
0177 static u32 hw_pkey_get(int pkey, unsigned long flags)
0178 {
0179     u64 pkey_reg = __read_pkey_reg();
0180 
0181     dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
0182             __func__, pkey, flags, 0, 0);
0183     dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
0184 
0185     return (u32) get_pkey_bits(pkey_reg, pkey);
0186 }
0187 
0188 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
0189 {
0190     u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
0191     u64 old_pkey_reg = __read_pkey_reg();
0192     u64 new_pkey_reg;
0193 
0194     /* make sure that 'rights' only contains the bits we expect: */
0195     assert(!(rights & ~mask));
0196 
0197     /* modify bits accordingly in old pkey_reg and assign it */
0198     new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
0199 
0200     __write_pkey_reg(new_pkey_reg);
0201 
0202     dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
0203         " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
0204         __func__, pkey, rights, flags, 0, __read_pkey_reg(),
0205         old_pkey_reg);
0206     return 0;
0207 }
0208 
0209 void pkey_disable_set(int pkey, int flags)
0210 {
0211     unsigned long syscall_flags = 0;
0212     int ret;
0213     int pkey_rights;
0214     u64 orig_pkey_reg = read_pkey_reg();
0215 
0216     dprintf1("START->%s(%d, 0x%x)\n", __func__,
0217         pkey, flags);
0218     pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
0219 
0220     pkey_rights = hw_pkey_get(pkey, syscall_flags);
0221 
0222     dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
0223             pkey, pkey, pkey_rights);
0224 
0225     pkey_assert(pkey_rights >= 0);
0226 
0227     pkey_rights |= flags;
0228 
0229     ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
0230     assert(!ret);
0231     /* pkey_reg and flags have the same format */
0232     shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
0233     dprintf1("%s(%d) shadow: 0x%016llx\n",
0234         __func__, pkey, shadow_pkey_reg);
0235 
0236     pkey_assert(ret >= 0);
0237 
0238     pkey_rights = hw_pkey_get(pkey, syscall_flags);
0239     dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
0240             pkey, pkey, pkey_rights);
0241 
0242     dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
0243         __func__, pkey, read_pkey_reg());
0244     if (flags)
0245         pkey_assert(read_pkey_reg() >= orig_pkey_reg);
0246     dprintf1("END<---%s(%d, 0x%x)\n", __func__,
0247         pkey, flags);
0248 }
0249 
0250 void pkey_disable_clear(int pkey, int flags)
0251 {
0252     unsigned long syscall_flags = 0;
0253     int ret;
0254     int pkey_rights = hw_pkey_get(pkey, syscall_flags);
0255     u64 orig_pkey_reg = read_pkey_reg();
0256 
0257     pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
0258 
0259     dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
0260             pkey, pkey, pkey_rights);
0261     pkey_assert(pkey_rights >= 0);
0262 
0263     pkey_rights &= ~flags;
0264 
0265     ret = hw_pkey_set(pkey, pkey_rights, 0);
0266     shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
0267     pkey_assert(ret >= 0);
0268 
0269     pkey_rights = hw_pkey_get(pkey, syscall_flags);
0270     dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
0271             pkey, pkey, pkey_rights);
0272 
0273     dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
0274             pkey, read_pkey_reg());
0275     if (flags)
0276         assert(read_pkey_reg() <= orig_pkey_reg);
0277 }
0278 
0279 void pkey_write_allow(int pkey)
0280 {
0281     pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
0282 }
0283 void pkey_write_deny(int pkey)
0284 {
0285     pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
0286 }
0287 void pkey_access_allow(int pkey)
0288 {
0289     pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
0290 }
0291 void pkey_access_deny(int pkey)
0292 {
0293     pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
0294 }
0295 
/* Fallback values for si_codes that older headers may not provide. */

/* Failed address bound checks: */
#ifndef SEGV_BNDERR
# define SEGV_BNDERR        3
#endif

#ifndef SEGV_PKUERR
# define SEGV_PKUERR        4
#endif

/*
 * Map a SIGSEGV si_code to its symbolic name.  Codes other than the
 * four handled here yield "UNKNOWN".  The returned string is a literal
 * and must not be modified or freed.
 */
static char *si_code_str(int si_code)
{
    switch (si_code) {
    case SEGV_MAPERR:
        return "SEGV_MAPERR";
    case SEGV_ACCERR:
        return "SEGV_ACCERR";
    case SEGV_BNDERR:
        return "SEGV_BNDERR";
    case SEGV_PKUERR:
        return "SEGV_PKUERR";
    default:
        return "UNKNOWN";
    }
}
0317 
0318 int pkey_faults;
0319 int last_si_pkey = -1;
0320 void signal_handler(int signum, siginfo_t *si, void *vucontext)
0321 {
0322     ucontext_t *uctxt = vucontext;
0323     int trapno;
0324     unsigned long ip;
0325     char *fpregs;
0326 #if defined(__i386__) || defined(__x86_64__) /* arch */
0327     u32 *pkey_reg_ptr;
0328     int pkey_reg_offset;
0329 #endif /* arch */
0330     u64 siginfo_pkey;
0331     u32 *si_pkey_ptr;
0332 
0333     dprint_in_signal = 1;
0334     dprintf1(">>>>===============SIGSEGV============================\n");
0335     dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
0336             __func__, __LINE__,
0337             __read_pkey_reg(), shadow_pkey_reg);
0338 
0339     trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
0340     ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
0341     fpregs = (char *) uctxt->uc_mcontext.fpregs;
0342 
0343     dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
0344             __func__, trapno, ip, si_code_str(si->si_code),
0345             si->si_code);
0346 
0347 #if defined(__i386__) || defined(__x86_64__) /* arch */
0348 #ifdef __i386__
0349     /*
0350      * 32-bit has some extra padding so that userspace can tell whether
0351      * the XSTATE header is present in addition to the "legacy" FPU
0352      * state.  We just assume that it is here.
0353      */
0354     fpregs += 0x70;
0355 #endif /* i386 */
0356     pkey_reg_offset = pkey_reg_xstate_offset();
0357     pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
0358 
0359     /*
0360      * If we got a PKEY fault, we *HAVE* to have at least one bit set in
0361      * here.
0362      */
0363     dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
0364     if (DEBUG_LEVEL > 4)
0365         dump_mem(pkey_reg_ptr - 128, 256);
0366     pkey_assert(*pkey_reg_ptr);
0367 #endif /* arch */
0368 
0369     dprintf1("siginfo: %p\n", si);
0370     dprintf1(" fpregs: %p\n", fpregs);
0371 
0372     if ((si->si_code == SEGV_MAPERR) ||
0373         (si->si_code == SEGV_ACCERR) ||
0374         (si->si_code == SEGV_BNDERR)) {
0375         printf("non-PK si_code, exiting...\n");
0376         exit(4);
0377     }
0378 
0379     si_pkey_ptr = siginfo_get_pkey_ptr(si);
0380     dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
0381     dump_mem((u8 *)si_pkey_ptr - 8, 24);
0382     siginfo_pkey = *si_pkey_ptr;
0383     pkey_assert(siginfo_pkey < NR_PKEYS);
0384     last_si_pkey = siginfo_pkey;
0385 
0386     /*
0387      * need __read_pkey_reg() version so we do not do shadow_pkey_reg
0388      * checking
0389      */
0390     dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
0391             __read_pkey_reg());
0392     dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
0393 #if defined(__i386__) || defined(__x86_64__) /* arch */
0394     dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
0395     *(u64 *)pkey_reg_ptr = 0x00000000;
0396     dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
0397 #elif defined(__powerpc64__) /* arch */
0398     /* restore access and let the faulting instruction continue */
0399     pkey_access_allow(siginfo_pkey);
0400 #endif /* arch */
0401     pkey_faults++;
0402     dprintf1("<<<<==================================================\n");
0403     dprint_in_signal = 0;
0404 }
0405 
/*
 * Block in waitpid(-1, ...) once and hand back its return value: the
 * pid of the child that changed state, or -1 on error.  The child's
 * status is discarded.  Despite the name, only one waitpid() call is
 * made.
 */
int wait_all_children(void)
{
    int wstatus;
    pid_t reaped = waitpid(-1, &wstatus, 0);

    return reaped;
}
0411 
0412 void sig_chld(int x)
0413 {
0414     dprint_in_signal = 1;
0415     dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
0416     dprint_in_signal = 0;
0417 }
0418 
0419 void setup_sigsegv_handler(void)
0420 {
0421     int r, rs;
0422     struct sigaction newact;
0423     struct sigaction oldact;
0424 
0425     /* #PF is mapped to sigsegv */
0426     int signum  = SIGSEGV;
0427 
0428     newact.sa_handler = 0;
0429     newact.sa_sigaction = signal_handler;
0430 
0431     /*sigset_t - signals to block while in the handler */
0432     /* get the old signal mask. */
0433     rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
0434     pkey_assert(rs == 0);
0435 
0436     /* call sa_sigaction, not sa_handler*/
0437     newact.sa_flags = SA_SIGINFO;
0438 
0439     newact.sa_restorer = 0;  /* void(*)(), obsolete */
0440     r = sigaction(signum, &newact, &oldact);
0441     r = sigaction(SIGALRM, &newact, &oldact);
0442     pkey_assert(r == 0);
0443 }
0444 
0445 void setup_handlers(void)
0446 {
0447     signal(SIGCHLD, &sig_chld);
0448     setup_sigsegv_handler();
0449 }
0450 
/*
 * Fork a child that does nothing but sleep in 30 second steps forever.
 *
 * Returns the child's pid in the parent; the child never returns.
 * A failed fork() trips pkey_assert().
 */
pid_t fork_lazy_child(void)
{
    pid_t child = fork();

    pkey_assert(child >= 0);
    dprintf3("[%d] fork() ret: %d\n", getpid(), child);

    if (child == 0) {
        /* in the child */
        for (;;) {
            dprintf1("child sleeping...\n");
            sleep(30);
        }
    }
    return child;
}
0468 
0469 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
0470         unsigned long pkey)
0471 {
0472     int sret;
0473 
0474     dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
0475             ptr, size, orig_prot, pkey);
0476 
0477     errno = 0;
0478     sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
0479     if (errno) {
0480         dprintf2("SYS_mprotect_key sret: %d\n", sret);
0481         dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
0482         dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
0483         if (DEBUG_LEVEL >= 2)
0484             perror("SYS_mprotect_pkey");
0485     }
0486     return sret;
0487 }
0488 
/*
 * Thin wrapper around the SYS_pkey_alloc system call; returns its raw
 * result.  The errno value in the debug line is printed whether or not
 * the call failed.
 */
int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
{
    int new_key;

    new_key = syscall(SYS_pkey_alloc, flags, init_val);
    dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
            __func__, flags, init_val, new_key, errno);

    return new_key;
}
0496 
0497 int alloc_pkey(void)
0498 {
0499     int ret;
0500     unsigned long init_val = 0x0;
0501 
0502     dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
0503             __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
0504     ret = sys_pkey_alloc(0, init_val);
0505     /*
0506      * pkey_alloc() sets PKEY register, so we need to reflect it in
0507      * shadow_pkey_reg:
0508      */
0509     dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0510             " shadow: 0x%016llx\n",
0511             __func__, __LINE__, ret, __read_pkey_reg(),
0512             shadow_pkey_reg);
0513     if (ret > 0) {
0514         /* clear both the bits: */
0515         shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
0516                         ~PKEY_MASK);
0517         dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0518                 " shadow: 0x%016llx\n",
0519                 __func__,
0520                 __LINE__, ret, __read_pkey_reg(),
0521                 shadow_pkey_reg);
0522         /*
0523          * move the new state in from init_val
0524          * (remember, we cheated and init_val == pkey_reg format)
0525          */
0526         shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
0527                         init_val);
0528     }
0529     dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0530             " shadow: 0x%016llx\n",
0531             __func__, __LINE__, ret, __read_pkey_reg(),
0532             shadow_pkey_reg);
0533     dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
0534     /* for shadow checking: */
0535     read_pkey_reg();
0536     dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0537          " shadow: 0x%016llx\n",
0538         __func__, __LINE__, ret, __read_pkey_reg(),
0539         shadow_pkey_reg);
0540     return ret;
0541 }
0542 
/* Thin wrapper around the SYS_pkey_free system call; returns its raw result. */
int sys_pkey_free(unsigned long pkey)
{
    int rc;

    rc = syscall(SYS_pkey_free, pkey);
    dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, rc);

    return rc;
}
0549 
0550 /*
0551  * I had a bug where pkey bits could be set by mprotect() but
0552  * not cleared.  This ensures we get lots of random bit sets
0553  * and clears on the vma and pte pkey bits.
0554  */
0555 int alloc_random_pkey(void)
0556 {
0557     int max_nr_pkey_allocs;
0558     int ret;
0559     int i;
0560     int alloced_pkeys[NR_PKEYS];
0561     int nr_alloced = 0;
0562     int random_index;
0563     memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
0564 
0565     /* allocate every possible key and make a note of which ones we got */
0566     max_nr_pkey_allocs = NR_PKEYS;
0567     for (i = 0; i < max_nr_pkey_allocs; i++) {
0568         int new_pkey = alloc_pkey();
0569         if (new_pkey < 0)
0570             break;
0571         alloced_pkeys[nr_alloced++] = new_pkey;
0572     }
0573 
0574     pkey_assert(nr_alloced > 0);
0575     /* select a random one out of the allocated ones */
0576     random_index = rand() % nr_alloced;
0577     ret = alloced_pkeys[random_index];
0578     /* now zero it out so we don't free it next */
0579     alloced_pkeys[random_index] = 0;
0580 
0581     /* go through the allocated ones that we did not want and free them */
0582     for (i = 0; i < nr_alloced; i++) {
0583         int free_ret;
0584         if (!alloced_pkeys[i])
0585             continue;
0586         free_ret = sys_pkey_free(alloced_pkeys[i]);
0587         pkey_assert(!free_ret);
0588     }
0589     dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0590              " shadow: 0x%016llx\n", __func__,
0591             __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
0592     return ret;
0593 }
0594 
0595 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
0596         unsigned long pkey)
0597 {
0598     int nr_iterations = random() % 100;
0599     int ret;
0600 
0601     while (0) {
0602         int rpkey = alloc_random_pkey();
0603         ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
0604         dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
0605                 ptr, size, orig_prot, pkey, ret);
0606         if (nr_iterations-- < 0)
0607             break;
0608 
0609         dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0610             " shadow: 0x%016llx\n",
0611             __func__, __LINE__, ret, __read_pkey_reg(),
0612             shadow_pkey_reg);
0613         sys_pkey_free(rpkey);
0614         dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0615             " shadow: 0x%016llx\n",
0616             __func__, __LINE__, ret, __read_pkey_reg(),
0617             shadow_pkey_reg);
0618     }
0619     pkey_assert(pkey < NR_PKEYS);
0620 
0621     ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
0622     dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
0623             ptr, size, orig_prot, pkey, ret);
0624     pkey_assert(!ret);
0625     dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
0626             " shadow: 0x%016llx\n", __func__,
0627             __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
0628     return ret;
0629 }
0630 
/*
 * Bookkeeping for the mappings handed out by the malloc_pkey_*()
 * helpers.  A slot whose ->ptr is NULL is free.
 */
struct pkey_malloc_record {
    void *ptr;      /* start of the mapping, NULL when the slot is free */
    long size;      /* length of the mapping in bytes */
    int prot;       /* protection the mapping was requested with */
};
/* growable array of records */
struct pkey_malloc_record *pkey_malloc_records;
/* the record filled in most recently by record_pkey_malloc() */
struct pkey_malloc_record *pkey_last_malloc_record;
/* number of records currently in use */
long nr_pkey_malloc_records;
/*
 * Number of slots allocated in pkey_malloc_records[].  Kept apart from
 * the in-use count above so that releasing a record can neither shrink
 * the searchable range nor make the array look smaller than it is.
 */
static long pkey_malloc_records_capacity;

/*
 * Remember a new mapping [@ptr, @ptr + @size) made with @prot.
 *
 * Reuses a free slot when there is one, otherwise grows the array.
 * The chosen slot becomes pkey_last_malloc_record.  An allocation
 * failure trips pkey_assert().
 */
void record_pkey_malloc(void *ptr, long size, int prot)
{
    long i;
    struct pkey_malloc_record *rec = NULL;

    /* find a free record */
    for (i = 0; i < pkey_malloc_records_capacity; i++) {
        if (!pkey_malloc_records[i].ptr) {
            rec = &pkey_malloc_records[i];
            break;
        }
    }
    if (!rec) {
        /* every record is full */
        size_t old_nr_records = pkey_malloc_records_capacity;
        size_t new_nr_records = old_nr_records * 2 + 1;
        size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
        dprintf2("new_nr_records: %zd\n", new_nr_records);
        dprintf2("new_size: %zd\n", new_size);
        pkey_malloc_records = realloc(pkey_malloc_records, new_size);
        pkey_assert(pkey_malloc_records != NULL);
        rec = &pkey_malloc_records[old_nr_records];
        /*
         * realloc() does not initialize memory, so zero it from
         * the first new record all the way to the end.
         */
        memset(rec, 0, (new_nr_records - old_nr_records) * sizeof(*rec));
        pkey_malloc_records_capacity = new_nr_records;
    }
    dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
        (int)(rec - pkey_malloc_records), rec, ptr, size);
    rec->ptr = ptr;
    rec->size = size;
    rec->prot = prot;
    pkey_last_malloc_record = rec;
    nr_pkey_malloc_records++;
}

/*
 * Unmap and forget the recorded mapping that contains @ptr.
 *
 * @ptr may point anywhere inside the mapping, not just at its start.
 * A failing munmap(), or a @ptr that belongs to no live record, trips
 * pkey_assert().
 */
void free_pkey_malloc(void *ptr)
{
    long i;
    int ret;
    dprintf3("%s(%p)\n", __func__, ptr);
    for (i = 0; i < pkey_malloc_records_capacity; i++) {
        struct pkey_malloc_record *rec = &pkey_malloc_records[i];
        char *start = rec->ptr;

        dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
                ptr, i, rec, rec->ptr, rec->size);
        /* free slots keep a stale size; never match against them */
        if (!start)
            continue;
        if (((char *)ptr <  start) ||
            ((char *)ptr >= start + rec->size))
            continue;

        dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
                ptr, i, rec, rec->ptr, rec->size);
        nr_pkey_malloc_records--;
        ret = munmap(rec->ptr, rec->size);
        dprintf3("munmap ret: %d\n", ret);
        pkey_assert(!ret);
        dprintf3("clearing rec->ptr, rec: %p\n", rec);
        rec->ptr = NULL;
        dprintf3("done clearing rec->ptr, rec: %p\n", rec);
        return;
    }
    pkey_assert(false);
}
0702 
0703 
0704 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
0705 {
0706     void *ptr;
0707     int ret;
0708 
0709     read_pkey_reg();
0710     dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
0711             size, prot, pkey);
0712     pkey_assert(pkey < NR_PKEYS);
0713     ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
0714     pkey_assert(ptr != (void *)-1);
0715     ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
0716     pkey_assert(!ret);
0717     record_pkey_malloc(ptr, size, prot);
0718     read_pkey_reg();
0719 
0720     dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
0721     return ptr;
0722 }
0723 
0724 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
0725 {
0726     int ret;
0727     void *ptr;
0728 
0729     dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
0730             size, prot, pkey);
0731     /*
0732      * Guarantee we can fit at least one huge page in the resulting
0733      * allocation by allocating space for 2:
0734      */
0735     size = ALIGN_UP(size, HPAGE_SIZE * 2);
0736     ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
0737     pkey_assert(ptr != (void *)-1);
0738     record_pkey_malloc(ptr, size, prot);
0739     mprotect_pkey(ptr, size, prot, pkey);
0740 
0741     dprintf1("unaligned ptr: %p\n", ptr);
0742     ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
0743     dprintf1("  aligned ptr: %p\n", ptr);
0744     ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
0745     dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
0746     ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
0747     dprintf1("MADV_WILLNEED ret: %d\n", ret);
0748     memset(ptr, 0, HPAGE_SIZE);
0749 
0750     dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
0751     return ptr;
0752 }
0753 
0754 int hugetlb_setup_ok;
0755 #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
0756 #define GET_NR_HUGE_PAGES 10
0757 void setup_hugetlbfs(void)
0758 {
0759     int err;
0760     int fd;
0761     char buf[256];
0762     long hpagesz_kb;
0763     long hpagesz_mb;
0764 
0765     if (geteuid() != 0) {
0766         fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
0767         return;
0768     }
0769 
0770     cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
0771 
0772     /*
0773      * Now go make sure that we got the pages and that they
0774      * are PMD-level pages. Someone might have made PUD-level
0775      * pages the default.
0776      */
0777     hpagesz_kb = HPAGE_SIZE / 1024;
0778     hpagesz_mb = hpagesz_kb / 1024;
0779     sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
0780     fd = open(buf, O_RDONLY);
0781     if (fd < 0) {
0782         fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
0783             hpagesz_mb, strerror(errno));
0784         return;
0785     }
0786 
0787     /* -1 to guarantee leaving the trailing \0 */
0788     err = read(fd, buf, sizeof(buf)-1);
0789     close(fd);
0790     if (err <= 0) {
0791         fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
0792             hpagesz_mb, strerror(errno));
0793         return;
0794     }
0795 
0796     if (atoi(buf) != GET_NR_HUGE_PAGES) {
0797         fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
0798             hpagesz_mb, buf, GET_NR_HUGE_PAGES);
0799         return;
0800     }
0801 
0802     hugetlb_setup_ok = 1;
0803 }
0804 
0805 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
0806 {
0807     void *ptr;
0808     int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
0809 
0810     if (!hugetlb_setup_ok)
0811         return PTR_ERR_ENOTSUP;
0812 
0813     dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
0814     size = ALIGN_UP(size, HPAGE_SIZE * 2);
0815     pkey_assert(pkey < NR_PKEYS);
0816     ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
0817     pkey_assert(ptr != (void *)-1);
0818     mprotect_pkey(ptr, size, prot, pkey);
0819 
0820     record_pkey_malloc(ptr, size, prot);
0821 
0822     dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
0823     return ptr;
0824 }
0825 
0826 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
0827 {
0828     void *ptr;
0829     int fd;
0830 
0831     dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
0832             size, prot, pkey);
0833     pkey_assert(pkey < NR_PKEYS);
0834     fd = open("/dax/foo", O_RDWR);
0835     pkey_assert(fd >= 0);
0836 
0837     ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
0838     pkey_assert(ptr != (void *)-1);
0839 
0840     mprotect_pkey(ptr, size, prot, pkey);
0841 
0842     record_pkey_malloc(ptr, size, prot);
0843 
0844     dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
0845     close(fd);
0846     return ptr;
0847 }
0848 
0849 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
0850 
0851     malloc_pkey_with_mprotect,
0852     malloc_pkey_with_mprotect_subpage,
0853     malloc_pkey_anon_huge,
0854     malloc_pkey_hugetlb
0855 /* can not do direct with the pkey_mprotect() API:
0856     malloc_pkey_mmap_direct,
0857     malloc_pkey_mmap_dax,
0858 */
0859 };
0860 
0861 void *malloc_pkey(long size, int prot, u16 pkey)
0862 {
0863     void *ret;
0864     static int malloc_type;
0865     int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
0866 
0867     pkey_assert(pkey < NR_PKEYS);
0868 
0869     while (1) {
0870         pkey_assert(malloc_type < nr_malloc_types);
0871 
0872         ret = pkey_malloc[malloc_type](size, prot, pkey);
0873         pkey_assert(ret != (void *)-1);
0874 
0875         malloc_type++;
0876         if (malloc_type >= nr_malloc_types)
0877             malloc_type = (random()%nr_malloc_types);
0878 
0879         /* try again if the malloc_type we tried is unsupported */
0880         if (ret == PTR_ERR_ENOTSUP)
0881             continue;
0882 
0883         break;
0884     }
0885 
0886     dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
0887             size, prot, pkey, ret);
0888     return ret;
0889 }
0890 
0891 int last_pkey_faults;
0892 #define UNKNOWN_PKEY -2
0893 void expected_pkey_fault(int pkey)
0894 {
0895     dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
0896             __func__, last_pkey_faults, pkey_faults);
0897     dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
0898     pkey_assert(last_pkey_faults + 1 == pkey_faults);
0899 
0900        /*
0901     * For exec-only memory, we do not know the pkey in
0902     * advance, so skip this check.
0903     */
0904     if (pkey != UNKNOWN_PKEY)
0905         pkey_assert(last_si_pkey == pkey);
0906 
0907 #if defined(__i386__) || defined(__x86_64__) /* arch */
0908     /*
0909      * The signal handler shold have cleared out PKEY register to let the
0910      * test program continue.  We now have to restore it.
0911      */
0912     if (__read_pkey_reg() != 0)
0913 #else /* arch */
0914     if (__read_pkey_reg() != shadow_pkey_reg)
0915 #endif /* arch */
0916         pkey_assert(0);
0917 
0918     __write_pkey_reg(shadow_pkey_reg);
0919     dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
0920                "nuked it\n", __func__, shadow_pkey_reg);
0921     last_pkey_faults = pkey_faults;
0922     last_si_pkey = -1;
0923 }
0924 
/*
 * Assert that no pkey fault has happened since the last check.  When
 * one has, @msg is logged through dprintf0() before the assertion
 * fires.
 */
#define do_not_expect_pkey_fault(msg)                           \
do {                                                            \
    if (pkey_faults != last_pkey_faults)                        \
        dprintf0("unexpected PKey fault: %s\n", msg);           \
    pkey_assert(last_pkey_faults == pkey_faults);               \
} while (0)
0930 
/*
 * File descriptors opened by the tests, released by close_test_fds().
 * Note that only element 0 starts out as -1; the rest start as 0.
 */
int test_fds[10] = { -1 };
/* number of entries of test_fds[] currently in use */
int nr_test_fds;

/*
 * Append @fd to test_fds[].  A negative @fd or a full table trips
 * pkey_assert().
 */
void __save_test_fd(int fd)
{
    pkey_assert(fd >= 0);
    pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));

    test_fds[nr_test_fds++] = fd;
}
0940 
/*
 * Open /etc/passwd read-only, register the descriptor with
 * __save_test_fd() and return it.
 */
int get_test_read_fd(void)
{
    int fd;

    fd = open("/etc/passwd", O_RDONLY);
    __save_test_fd(fd);

    return fd;
}
0947 
0948 void close_test_fds(void)
0949 {
0950     int i;
0951 
0952     for (i = 0; i < nr_test_fds; i++) {
0953         if (test_fds[i] < 0)
0954             continue;
0955         close(test_fds[i]);
0956         test_fds[i] = -1;
0957     }
0958     nr_test_fds = 0;
0959 }
0960 
#define barrier() __asm__ __volatile__("": : :"memory")
/*
 * Load and return the int that 'ptr' points at.  The function is kept
 * out of line and preceded by a compiler barrier so that GCC does not
 * optimize the memory access away.
 */
__attribute__((noinline)) int read_ptr(int *ptr)
{
    int value;

    barrier();
    value = *ptr;

    return value;
}
0970 
0971 void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
0972 {
0973     int i, err;
0974     int max_nr_pkey_allocs;
0975     int alloced_pkeys[NR_PKEYS];
0976     int nr_alloced = 0;
0977     long size;
0978 
0979     pkey_assert(pkey_last_malloc_record);
0980     size = pkey_last_malloc_record->size;
0981     /*
0982      * This is a bit of a hack.  But mprotect() requires
0983      * huge-page-aligned sizes when operating on hugetlbfs.
0984      * So, make sure that we use something that's a multiple
0985      * of a huge page when we can.
0986      */
0987     if (size >= HPAGE_SIZE)
0988         size = HPAGE_SIZE;
0989 
0990     /* allocate every possible key and make sure key-0 never got allocated */
0991     max_nr_pkey_allocs = NR_PKEYS;
0992     for (i = 0; i < max_nr_pkey_allocs; i++) {
0993         int new_pkey = alloc_pkey();
0994         pkey_assert(new_pkey != 0);
0995 
0996         if (new_pkey < 0)
0997             break;
0998         alloced_pkeys[nr_alloced++] = new_pkey;
0999     }
1000     /* free all the allocated keys */
1001     for (i = 0; i < nr_alloced; i++) {
1002         int free_ret;
1003 
1004         if (!alloced_pkeys[i])
1005             continue;
1006         free_ret = sys_pkey_free(alloced_pkeys[i]);
1007         pkey_assert(!free_ret);
1008     }
1009 
1010     /* attach key-0 in various modes */
1011     err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
1012     pkey_assert(!err);
1013     err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
1014     pkey_assert(!err);
1015     err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
1016     pkey_assert(!err);
1017     err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
1018     pkey_assert(!err);
1019     err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
1020     pkey_assert(!err);
1021 }
1022 
1023 void test_read_of_write_disabled_region(int *ptr, u16 pkey)
1024 {
1025     int ptr_contents;
1026 
1027     dprintf1("disabling write access to PKEY[1], doing read\n");
1028     pkey_write_deny(pkey);
1029     ptr_contents = read_ptr(ptr);
1030     dprintf1("*ptr: %d\n", ptr_contents);
1031     dprintf1("\n");
1032 }
1033 void test_read_of_access_disabled_region(int *ptr, u16 pkey)
1034 {
1035     int ptr_contents;
1036 
1037     dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
1038     read_pkey_reg();
1039     pkey_access_deny(pkey);
1040     ptr_contents = read_ptr(ptr);
1041     dprintf1("*ptr: %d\n", ptr_contents);
1042     expected_pkey_fault(pkey);
1043 }
1044 
1045 void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
1046         u16 pkey)
1047 {
1048     int ptr_contents;
1049 
1050     dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
1051                 pkey, ptr);
1052     ptr_contents = read_ptr(ptr);
1053     dprintf1("reading ptr before disabling the read : %d\n",
1054             ptr_contents);
1055     read_pkey_reg();
1056     pkey_access_deny(pkey);
1057     ptr_contents = read_ptr(ptr);
1058     dprintf1("*ptr: %d\n", ptr_contents);
1059     expected_pkey_fault(pkey);
1060 }
1061 
1062 void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
1063         u16 pkey)
1064 {
1065     *ptr = __LINE__;
1066     dprintf1("disabling write access; after accessing the page, "
1067         "to PKEY[%02d], doing write\n", pkey);
1068     pkey_write_deny(pkey);
1069     *ptr = __LINE__;
1070     expected_pkey_fault(pkey);
1071 }
1072 
1073 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
1074 {
1075     dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
1076     pkey_write_deny(pkey);
1077     *ptr = __LINE__;
1078     expected_pkey_fault(pkey);
1079 }
1080 void test_write_of_access_disabled_region(int *ptr, u16 pkey)
1081 {
1082     dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
1083     pkey_access_deny(pkey);
1084     *ptr = __LINE__;
1085     expected_pkey_fault(pkey);
1086 }
1087 
1088 void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
1089             u16 pkey)
1090 {
1091     *ptr = __LINE__;
1092     dprintf1("disabling access; after accessing the page, "
1093         " to PKEY[%02d], doing write\n", pkey);
1094     pkey_access_deny(pkey);
1095     *ptr = __LINE__;
1096     expected_pkey_fault(pkey);
1097 }
1098 
1099 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
1100 {
1101     int ret;
1102     int test_fd = get_test_read_fd();
1103 
1104     dprintf1("disabling access to PKEY[%02d], "
1105          "having kernel read() to buffer\n", pkey);
1106     pkey_access_deny(pkey);
1107     ret = read(test_fd, ptr, 1);
1108     dprintf1("read ret: %d\n", ret);
1109     pkey_assert(ret);
1110 }
1111 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
1112 {
1113     int ret;
1114     int test_fd = get_test_read_fd();
1115 
1116     pkey_write_deny(pkey);
1117     ret = read(test_fd, ptr, 100);
1118     dprintf1("read ret: %d\n", ret);
1119     if (ret < 0 && (DEBUG_LEVEL > 0))
1120         perror("verbose read result (OK for this to be bad)");
1121     pkey_assert(ret);
1122 }
1123 
1124 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
1125 {
1126     int pipe_ret, vmsplice_ret;
1127     struct iovec iov;
1128     int pipe_fds[2];
1129 
1130     pipe_ret = pipe(pipe_fds);
1131 
1132     pkey_assert(pipe_ret == 0);
1133     dprintf1("disabling access to PKEY[%02d], "
1134          "having kernel vmsplice from buffer\n", pkey);
1135     pkey_access_deny(pkey);
1136     iov.iov_base = ptr;
1137     iov.iov_len = PAGE_SIZE;
1138     vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
1139     dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
1140     pkey_assert(vmsplice_ret == -1);
1141 
1142     close(pipe_fds[0]);
1143     close(pipe_fds[1]);
1144 }
1145 
1146 void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
1147 {
1148     int ignored = 0xdada;
1149     int futex_ret;
1150     int some_int = __LINE__;
1151 
1152     dprintf1("disabling write to PKEY[%02d], "
1153          "doing futex gunk in buffer\n", pkey);
1154     *ptr = some_int;
1155     pkey_write_deny(pkey);
1156     futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
1157             &ignored, ignored);
1158     if (DEBUG_LEVEL > 0)
1159         perror("futex");
1160     dprintf1("futex() ret: %d\n", futex_ret);
1161 }
1162 
1163 /* Assumes that all pkeys other than 'pkey' are unallocated */
1164 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
1165 {
1166     int err;
1167     int i;
1168 
1169     /* Note: 0 is the default pkey, so don't mess with it */
1170     for (i = 1; i < NR_PKEYS; i++) {
1171         if (pkey == i)
1172             continue;
1173 
1174         dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
1175         err = sys_pkey_free(i);
1176         pkey_assert(err);
1177 
1178         err = sys_pkey_free(i);
1179         pkey_assert(err);
1180 
1181         err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
1182         pkey_assert(err);
1183     }
1184 }
1185 
1186 /* Assumes that all pkeys other than 'pkey' are unallocated */
1187 void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
1188 {
1189     int err;
1190     int bad_pkey = NR_PKEYS+99;
1191 
1192     /* pass a known-invalid pkey in: */
1193     err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
1194     pkey_assert(err);
1195 }
1196 
/*
 * fork() and let only the child return to the caller; the parent
 * exits with status 0.
 */
void become_child(void)
{
    pid_t forkret = fork();

    pkey_assert(forkret >= 0);
    dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);

    /* Non-zero return value: this is the parent, which is done. */
    if (forkret)
        exit(0);

    /* in the child: fall through and return */
}
1211 
1212 /* Assumes that all pkeys other than 'pkey' are unallocated */
1213 void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1214 {
1215     int err;
1216     int allocated_pkeys[NR_PKEYS] = {0};
1217     int nr_allocated_pkeys = 0;
1218     int i;
1219 
1220     for (i = 0; i < NR_PKEYS*3; i++) {
1221         int new_pkey;
1222         dprintf1("%s() alloc loop: %d\n", __func__, i);
1223         new_pkey = alloc_pkey();
1224         dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
1225                 " shadow: 0x%016llx\n",
1226                 __func__, __LINE__, err, __read_pkey_reg(),
1227                 shadow_pkey_reg);
1228         read_pkey_reg(); /* for shadow checking */
1229         dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
1230         if ((new_pkey == -1) && (errno == ENOSPC)) {
1231             dprintf2("%s() failed to allocate pkey after %d tries\n",
1232                 __func__, nr_allocated_pkeys);
1233         } else {
1234             /*
1235              * Ensure the number of successes never
1236              * exceeds the number of keys supported
1237              * in the hardware.
1238              */
1239             pkey_assert(nr_allocated_pkeys < NR_PKEYS);
1240             allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
1241         }
1242 
1243         /*
1244          * Make sure that allocation state is properly
1245          * preserved across fork().
1246          */
1247         if (i == NR_PKEYS*2)
1248             become_child();
1249     }
1250 
1251     dprintf3("%s()::%d\n", __func__, __LINE__);
1252 
1253     /*
1254      * On x86:
1255      * There are 16 pkeys supported in hardware.  Three are
1256      * allocated by the time we get here:
1257      *   1. The default key (0)
1258      *   2. One possibly consumed by an execute-only mapping.
1259      *   3. One allocated by the test code and passed in via
1260      *      'pkey' to this function.
1261      * Ensure that we can allocate at least another 13 (16-3).
1262      *
1263      * On powerpc:
1264      * There are either 5, 28, 29 or 32 pkeys supported in
1265      * hardware depending on the page size (4K or 64K) and
1266      * platform (powernv or powervm). Four are allocated by
1267      * the time we get here. These include pkey-0, pkey-1,
1268      * exec-only pkey and the one allocated by the test code.
1269      * Ensure that we can allocate the remaining.
1270      */
1271     pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
1272 
1273     for (i = 0; i < nr_allocated_pkeys; i++) {
1274         err = sys_pkey_free(allocated_pkeys[i]);
1275         pkey_assert(!err);
1276         read_pkey_reg(); /* for shadow checking */
1277     }
1278 }
1279 
1280 void arch_force_pkey_reg_init(void)
1281 {
1282 #if defined(__i386__) || defined(__x86_64__) /* arch */
1283     u64 *buf;
1284 
1285     /*
1286      * All keys should be allocated and set to allow reads and
1287      * writes, so the register should be all 0.  If not, just
1288      * skip the test.
1289      */
1290     if (read_pkey_reg())
1291         return;
1292 
1293     /*
1294      * Just allocate an absurd about of memory rather than
1295      * doing the XSAVE size enumeration dance.
1296      */
1297     buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
1298 
1299     /* These __builtins require compiling with -mxsave */
1300 
1301     /* XSAVE to build a valid buffer: */
1302     __builtin_ia32_xsave(buf, XSTATE_PKEY);
1303     /* Clear XSTATE_BV[PKRU]: */
1304     buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
1305     /* XRSTOR will likely get PKRU back to the init state: */
1306     __builtin_ia32_xrstor(buf, XSTATE_PKEY);
1307 
1308     munmap(buf, 1*MB);
1309 #endif
1310 }
1311 
1312 
1313 /*
1314  * This is mostly useless on ppc for now.  But it will not
1315  * hurt anything and should give some better coverage as
1316  * a long-running test that continually checks the pkey
1317  * register.
1318  */
1319 void test_pkey_init_state(int *ptr, u16 pkey)
1320 {
1321     int err;
1322     int allocated_pkeys[NR_PKEYS] = {0};
1323     int nr_allocated_pkeys = 0;
1324     int i;
1325 
1326     for (i = 0; i < NR_PKEYS; i++) {
1327         int new_pkey = alloc_pkey();
1328 
1329         if (new_pkey < 0)
1330             continue;
1331         allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
1332     }
1333 
1334     dprintf3("%s()::%d\n", __func__, __LINE__);
1335 
1336     arch_force_pkey_reg_init();
1337 
1338     /*
1339      * Loop for a bit, hoping to get exercise the kernel
1340      * context switch code.
1341      */
1342     for (i = 0; i < 1000000; i++)
1343         read_pkey_reg();
1344 
1345     for (i = 0; i < nr_allocated_pkeys; i++) {
1346         err = sys_pkey_free(allocated_pkeys[i]);
1347         pkey_assert(!err);
1348         read_pkey_reg(); /* for shadow checking */
1349     }
1350 }
1351 
1352 /*
1353  * pkey 0 is special.  It is allocated by default, so you do not
1354  * have to call pkey_alloc() to use it first.  Make sure that it
1355  * is usable.
1356  */
1357 void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
1358 {
1359     long size;
1360     int prot;
1361 
1362     assert(pkey_last_malloc_record);
1363     size = pkey_last_malloc_record->size;
1364     /*
1365      * This is a bit of a hack.  But mprotect() requires
1366      * huge-page-aligned sizes when operating on hugetlbfs.
1367      * So, make sure that we use something that's a multiple
1368      * of a huge page when we can.
1369      */
1370     if (size >= HPAGE_SIZE)
1371         size = HPAGE_SIZE;
1372     prot = pkey_last_malloc_record->prot;
1373 
1374     /* Use pkey 0 */
1375     mprotect_pkey(ptr, size, prot, 0);
1376 
1377     /* Make sure that we can set it back to the original pkey. */
1378     mprotect_pkey(ptr, size, prot, pkey);
1379 }
1380 
1381 void test_ptrace_of_child(int *ptr, u16 pkey)
1382 {
1383     __attribute__((__unused__)) int peek_result;
1384     pid_t child_pid;
1385     void *ignored = 0;
1386     long ret;
1387     int status;
1388     /*
1389      * This is the "control" for our little expermient.  Make sure
1390      * we can always access it when ptracing.
1391      */
1392     int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
1393     int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
1394 
1395     /*
1396      * Fork a child which is an exact copy of this process, of course.
1397      * That means we can do all of our tests via ptrace() and then plain
1398      * memory access and ensure they work differently.
1399      */
1400     child_pid = fork_lazy_child();
1401     dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
1402 
1403     ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
1404     if (ret)
1405         perror("attach");
1406     dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
1407     pkey_assert(ret != -1);
1408     ret = waitpid(child_pid, &status, WUNTRACED);
1409     if ((ret != child_pid) || !(WIFSTOPPED(status))) {
1410         fprintf(stderr, "weird waitpid result %ld stat %x\n",
1411                 ret, status);
1412         pkey_assert(0);
1413     }
1414     dprintf2("waitpid ret: %ld\n", ret);
1415     dprintf2("waitpid status: %d\n", status);
1416 
1417     pkey_access_deny(pkey);
1418     pkey_write_deny(pkey);
1419 
1420     /* Write access, untested for now:
1421     ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
1422     pkey_assert(ret != -1);
1423     dprintf1("poke at %p: %ld\n", peek_at, ret);
1424     */
1425 
1426     /*
1427      * Try to access the pkey-protected "ptr" via ptrace:
1428      */
1429     ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
1430     /* expect it to work, without an error: */
1431     pkey_assert(ret != -1);
1432     /* Now access from the current task, and expect an exception: */
1433     peek_result = read_ptr(ptr);
1434     expected_pkey_fault(pkey);
1435 
1436     /*
1437      * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
1438      */
1439     ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
1440     /* expect it to work, without an error: */
1441     pkey_assert(ret != -1);
1442     /* Now access from the current task, and expect NO exception: */
1443     peek_result = read_ptr(plain_ptr);
1444     do_not_expect_pkey_fault("read plain pointer after ptrace");
1445 
1446     ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
1447     pkey_assert(ret != -1);
1448 
1449     ret = kill(child_pid, SIGKILL);
1450     pkey_assert(ret != -1);
1451 
1452     wait(&status);
1453 
1454     free(plain_ptr_unaligned);
1455 }
1456 
1457 void *get_pointer_to_instructions(void)
1458 {
1459     void *p1;
1460 
1461     p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
1462     dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
1463     /* lots_o_noops_around_write should be page-aligned already */
1464     assert(p1 == &lots_o_noops_around_write);
1465 
1466     /* Point 'p1' at the *second* page of the function: */
1467     p1 += PAGE_SIZE;
1468 
1469     /*
1470      * Try to ensure we fault this in on next touch to ensure
1471      * we get an instruction fault as opposed to a data one
1472      */
1473     madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1474 
1475     return p1;
1476 }
1477 
1478 void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1479 {
1480     void *p1;
1481     int scratch;
1482     int ptr_contents;
1483     int ret;
1484 
1485     p1 = get_pointer_to_instructions();
1486     lots_o_noops_around_write(&scratch);
1487     ptr_contents = read_ptr(p1);
1488     dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1489 
1490     ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
1491     pkey_assert(!ret);
1492     pkey_access_deny(pkey);
1493 
1494     dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
1495 
1496     /*
1497      * Make sure this is an *instruction* fault
1498      */
1499     madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1500     lots_o_noops_around_write(&scratch);
1501     do_not_expect_pkey_fault("executing on PROT_EXEC memory");
1502     expect_fault_on_read_execonly_key(p1, pkey);
1503 }
1504 
1505 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
1506 {
1507     void *p1;
1508     int scratch;
1509     int ptr_contents;
1510     int ret;
1511 
1512     dprintf1("%s() start\n", __func__);
1513 
1514     p1 = get_pointer_to_instructions();
1515     lots_o_noops_around_write(&scratch);
1516     ptr_contents = read_ptr(p1);
1517     dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1518 
1519     /* Use a *normal* mprotect(), not mprotect_pkey(): */
1520     ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
1521     pkey_assert(!ret);
1522 
1523     /*
1524      * Reset the shadow, assuming that the above mprotect()
1525      * correctly changed PKRU, but to an unknown value since
1526      * the actual allocated pkey is unknown.
1527      */
1528     shadow_pkey_reg = __read_pkey_reg();
1529 
1530     dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
1531 
1532     /* Make sure this is an *instruction* fault */
1533     madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1534     lots_o_noops_around_write(&scratch);
1535     do_not_expect_pkey_fault("executing on PROT_EXEC memory");
1536     expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
1537 
1538     /*
1539      * Put the memory back to non-PROT_EXEC.  Should clear the
1540      * exec-only pkey off the VMA and allow it to be readable
1541      * again.  Go to PROT_NONE first to check for a kernel bug
1542      * that did not clear the pkey when doing PROT_NONE.
1543      */
1544     ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
1545     pkey_assert(!ret);
1546 
1547     ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
1548     pkey_assert(!ret);
1549     ptr_contents = read_ptr(p1);
1550     do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
1551 }
1552 
1553 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
1554 {
1555     int size = PAGE_SIZE;
1556     int sret;
1557 
1558     if (cpu_has_pkeys()) {
1559         dprintf1("SKIP: %s: no CPU support\n", __func__);
1560         return;
1561     }
1562 
1563     sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
1564     pkey_assert(sret < 0);
1565 }
1566 
1567 void (*pkey_tests[])(int *ptr, u16 pkey) = {
1568     test_read_of_write_disabled_region,
1569     test_read_of_access_disabled_region,
1570     test_read_of_access_disabled_region_with_page_already_mapped,
1571     test_write_of_write_disabled_region,
1572     test_write_of_write_disabled_region_with_page_already_mapped,
1573     test_write_of_access_disabled_region,
1574     test_write_of_access_disabled_region_with_page_already_mapped,
1575     test_kernel_write_of_access_disabled_region,
1576     test_kernel_write_of_write_disabled_region,
1577     test_kernel_gup_of_access_disabled_region,
1578     test_kernel_gup_write_to_write_disabled_region,
1579     test_executing_on_unreadable_memory,
1580     test_implicit_mprotect_exec_only_memory,
1581     test_mprotect_with_pkey_0,
1582     test_ptrace_of_child,
1583     test_pkey_init_state,
1584     test_pkey_syscalls_on_non_allocated_pkey,
1585     test_pkey_syscalls_bad_args,
1586     test_pkey_alloc_exhaust,
1587     test_pkey_alloc_free_attach_pkey0,
1588 };
1589 
1590 void run_tests_once(void)
1591 {
1592     int *ptr;
1593     int prot = PROT_READ|PROT_WRITE;
1594 
1595     for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
1596         int pkey;
1597         int orig_pkey_faults = pkey_faults;
1598 
1599         dprintf1("======================\n");
1600         dprintf1("test %d preparing...\n", test_nr);
1601 
1602         tracing_on();
1603         pkey = alloc_random_pkey();
1604         dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
1605         ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
1606         dprintf1("test %d starting...\n", test_nr);
1607         pkey_tests[test_nr](ptr, pkey);
1608         dprintf1("freeing test memory: %p\n", ptr);
1609         free_pkey_malloc(ptr);
1610         sys_pkey_free(pkey);
1611 
1612         dprintf1("pkey_faults: %d\n", pkey_faults);
1613         dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
1614 
1615         tracing_off();
1616         close_test_fds();
1617 
1618         printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
1619         dprintf1("======================\n\n");
1620     }
1621     iteration_nr++;
1622 }
1623 
1624 void pkey_setup_shadow(void)
1625 {
1626     shadow_pkey_reg = __read_pkey_reg();
1627 }
1628 
1629 int main(void)
1630 {
1631     int nr_iterations = 22;
1632     int pkeys_supported = is_pkeys_supported();
1633 
1634     srand((unsigned int)time(NULL));
1635 
1636     setup_handlers();
1637 
1638     printf("has pkeys: %d\n", pkeys_supported);
1639 
1640     if (!pkeys_supported) {
1641         int size = PAGE_SIZE;
1642         int *ptr;
1643 
1644         printf("running PKEY tests for unsupported CPU/OS\n");
1645 
1646         ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
1647         assert(ptr != (void *)-1);
1648         test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
1649         exit(0);
1650     }
1651 
1652     pkey_setup_shadow();
1653     printf("startup pkey_reg: %016llx\n", read_pkey_reg());
1654     setup_hugetlbfs();
1655 
1656     while (nr_iterations-- > 0)
1657         run_tests_once();
1658 
1659     printf("done (all tests OK)\n");
1660     return 0;
1661 }