/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */
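
/*
 * A minimal userspace sketch (not part of this file) of installing a
 * mode-2 filter via prctl(2), assuming the uapi <linux/filter.h> and
 * <linux/seccomp.h> headers.  This trivial program allows every syscall:
 *
 *     #include <linux/filter.h>
 *     #include <linux/seccomp.h>
 *     #include <sys/prctl.h>
 *
 *     struct sock_filter insns[] = {
 *         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *     };
 *     struct sock_fprog prog = {
 *         .len    = sizeof(insns) / sizeof(insns[0]),
 *         .filter = insns,
 *     };
 *
 *     prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *     prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 */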

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
    atomic_t usage;
    struct seccomp_filter *prev;
    struct bpf_prog *prog;
};
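
/*
 * A sketch of the tree described above: if a task running filter A forks,
 * and parent and child each attach a filter of their own, the two tasks
 * see distinct singly-linked lists that share the ancestor node:
 *
 *     parent: seccomp.filter -> B -> A -> NULL
 *     child:  seccomp.filter -> C -> A -> NULL
 *
 * A's @usage count is held through both paths, so it outlives either task.
 */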

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
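/*
 * With the 8-byte struct sock_filter, this works out to
 * (1 << 18) / 8 == 32768 classic BPF instructions per path.
 */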

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
    struct task_struct *task = current;
    struct pt_regs *regs = task_pt_regs(task);
    unsigned long args[6];

    sd->nr = syscall_get_nr(task, regs);
    sd->arch = syscall_get_arch();
    syscall_get_arguments(task, regs, 0, 6, args);
    sd->args[0] = args[0];
    sd->args[1] = args[1];
    sd->args[2] = args[2];
    sd->args[3] = args[3];
    sd->args[4] = args[4];
    sd->args[5] = args[5];
    sd->instruction_pointer = KSTK_EIP(task);
}
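
/*
 * Filters see the result as a flat, read-only struct seccomp_data buffer.
 * For example, the classic BPF load below (a userspace sketch) fetches
 * the syscall number, which sits at offset 0:
 *
 *     BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *              offsetof(struct seccomp_data, nr));
 */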

/**
 *  seccomp_check_filter - verify seccomp filter code
 *  @filter: filter to verify
 *  @flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
    int pc;
    for (pc = 0; pc < flen; pc++) {
        struct sock_filter *ftest = &filter[pc];
        u16 code = ftest->code;
        u32 k = ftest->k;

        switch (code) {
        case BPF_LD | BPF_W | BPF_ABS:
            ftest->code = BPF_LDX | BPF_W | BPF_ABS;
            /* 32-bit aligned and not out of bounds. */
            if (k >= sizeof(struct seccomp_data) || k & 3)
                return -EINVAL;
            continue;
        case BPF_LD | BPF_W | BPF_LEN:
            ftest->code = BPF_LD | BPF_IMM;
            ftest->k = sizeof(struct seccomp_data);
            continue;
        case BPF_LDX | BPF_W | BPF_LEN:
            ftest->code = BPF_LDX | BPF_IMM;
            ftest->k = sizeof(struct seccomp_data);
            continue;
        /* Explicitly include allowed calls. */
        case BPF_RET | BPF_K:
        case BPF_RET | BPF_A:
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU | BPF_ADD | BPF_X:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU | BPF_SUB | BPF_X:
        case BPF_ALU | BPF_MUL | BPF_K:
        case BPF_ALU | BPF_MUL | BPF_X:
        case BPF_ALU | BPF_DIV | BPF_K:
        case BPF_ALU | BPF_DIV | BPF_X:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU | BPF_AND | BPF_X:
        case BPF_ALU | BPF_OR | BPF_K:
        case BPF_ALU | BPF_OR | BPF_X:
        case BPF_ALU | BPF_XOR | BPF_K:
        case BPF_ALU | BPF_XOR | BPF_X:
        case BPF_ALU | BPF_LSH | BPF_K:
        case BPF_ALU | BPF_LSH | BPF_X:
        case BPF_ALU | BPF_RSH | BPF_K:
        case BPF_ALU | BPF_RSH | BPF_X:
        case BPF_ALU | BPF_NEG:
        case BPF_LD | BPF_IMM:
        case BPF_LDX | BPF_IMM:
        case BPF_MISC | BPF_TAX:
        case BPF_MISC | BPF_TXA:
        case BPF_LD | BPF_MEM:
        case BPF_LDX | BPF_MEM:
        case BPF_ST:
        case BPF_STX:
        case BPF_JMP | BPF_JA:
        case BPF_JMP | BPF_JEQ | BPF_K:
        case BPF_JMP | BPF_JEQ | BPF_X:
        case BPF_JMP | BPF_JGE | BPF_K:
        case BPF_JMP | BPF_JGE | BPF_X:
        case BPF_JMP | BPF_JGT | BPF_K:
        case BPF_JMP | BPF_JGT | BPF_X:
        case BPF_JMP | BPF_JSET | BPF_K:
        case BPF_JMP | BPF_JSET | BPF_X:
            continue;
        default:
            return -EINVAL;
        }
    }
    return 0;
}
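
/*
 * For instance, a filter instruction such as
 *
 *     BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch))
 *
 * passes the check (the offset is 32-bit aligned and in bounds) and is
 * rewritten in place so the BPF runtime services it from the
 * seccomp_data buffer rather than from a struct sk_buff.
 */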

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
    struct seccomp_data sd_local;
    u32 ret = SECCOMP_RET_ALLOW;
    /* Make sure cross-thread synced filter points somewhere sane. */
    struct seccomp_filter *f =
            lockless_dereference(current->seccomp.filter);

    /* Ensure unexpected behavior doesn't result in failing open. */
    if (unlikely(WARN_ON(f == NULL)))
        return SECCOMP_RET_KILL;

    if (!sd) {
        populate_seccomp_data(&sd_local);
        sd = &sd_local;
    }

    /*
     * All filters in the list are evaluated and the lowest BPF return
     * value always takes priority (ignoring the DATA).
     */
    for (; f; f = f->prev) {
        u32 cur_ret = BPF_PROG_RUN(f->prog, sd);

        if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
            ret = cur_ret;
    }
    return ret;
}
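
/*
 * A worked example of the precedence rule: if one stacked filter returns
 * SECCOMP_RET_ALLOW (0x7fff0000) and another returns
 * SECCOMP_RET_ERRNO | EPERM (0x00050001), the ERRNO action wins because
 * its action value (0x00050000) is numerically lower; SECCOMP_RET_KILL
 * (0x00000000) always takes priority over everything else.
 */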
#endif /* CONFIG_SECCOMP_FILTER */

static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
    assert_spin_locked(&current->sighand->siglock);

    if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
        return false;

    return true;
}

static inline void seccomp_assign_mode(struct task_struct *task,
                       unsigned long seccomp_mode)
{
    assert_spin_locked(&task->sighand->siglock);

    task->seccomp.mode = seccomp_mode;
    /*
     * Make sure TIF_SECCOMP cannot be set before the mode (and
     * filter) is set.
     */
    smp_mb__before_atomic();
    set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
               struct seccomp_filter *child)
{
    /* NULL is the root ancestor. */
    if (parent == NULL)
        return 1;
    for (; child; child = child->prev)
        if (child == parent)
            return 1;
    return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, a negative error code on failure, or the pid of
 * a thread that was either not in the correct seccomp mode or did not
 * have an ancestral seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
    struct task_struct *thread, *caller;

    BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
    assert_spin_locked(&current->sighand->siglock);

    /* Validate all threads being eligible for synchronization. */
    caller = current;
    for_each_thread(caller, thread) {
        pid_t failed;

        /* Skip current, since it is initiating the sync. */
        if (thread == caller)
            continue;

        if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
            (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
             is_ancestor(thread->seccomp.filter,
                 caller->seccomp.filter)))
            continue;

        /* Return the first thread that cannot be synchronized. */
        failed = task_pid_vnr(thread);
        /* If the pid cannot be resolved, then return -ESRCH */
        if (unlikely(WARN_ON(failed == 0)))
            failed = -ESRCH;
        return failed;
    }

    return 0;
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 */
static inline void seccomp_sync_threads(void)
{
    struct task_struct *thread, *caller;

    BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
    assert_spin_locked(&current->sighand->siglock);

    /* Synchronize all threads. */
    caller = current;
    for_each_thread(caller, thread) {
        /* Skip current, since it needs no changes. */
        if (thread == caller)
            continue;

        /* Get a task reference for the new leaf node. */
        get_seccomp_filter(caller);
        /*
         * Drop the task reference to the shared ancestor since
         * current's path will hold a reference.  (This also
         * allows a put before the assignment.)
         */
        put_seccomp_filter(thread);
        smp_store_release(&thread->seccomp.filter,
                  caller->seccomp.filter);

        /*
         * Don't let an unprivileged task work around
         * the no_new_privs restriction by creating
         * a thread that sets it up, enters seccomp,
         * then dies.
         */
        if (task_no_new_privs(caller))
            task_set_no_new_privs(thread);

        /*
         * Opt the other thread into seccomp if needed.
         * As threads are considered to be trust-realm
         * equivalent (see ptrace_may_access), it is safe to
         * allow one thread to transition the other.
         */
        if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
            seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
    }
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
    struct seccomp_filter *sfilter;
    int ret;
    const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

    if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
        return ERR_PTR(-EINVAL);

    BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

    /*
     * Installing a seccomp filter requires that the task has
     * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
     * This avoids scenarios where unprivileged tasks can affect the
     * behavior of privileged children.
     */
    if (!task_no_new_privs(current) &&
        security_capable_noaudit(current_cred(), current_user_ns(),
                     CAP_SYS_ADMIN) != 0)
        return ERR_PTR(-EACCES);

    /* Allocate a new seccomp_filter */
    sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
    if (!sfilter)
        return ERR_PTR(-ENOMEM);

    ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
                    seccomp_check_filter, save_orig);
    if (ret < 0) {
        kfree(sfilter);
        return ERR_PTR(ret);
    }

    atomic_set(&sfilter->usage, 1);

    return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success, or an ERR_PTR on failure.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
    struct sock_fprog fprog;
    struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
    if (in_compat_syscall()) {
        struct compat_sock_fprog fprog32;
        if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
            goto out;
        fprog.len = fprog32.len;
        fprog.filter = compat_ptr(fprog32.filter);
    } else /* falls through to the if below. */
#endif
    if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
        goto out;
    filter = seccomp_prepare_filter(&fprog);
out:
    return filter;
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock.
 *
 * Returns 0 on success or a negative error code on failure.
 */
static long seccomp_attach_filter(unsigned int flags,
                  struct seccomp_filter *filter)
{
    unsigned long total_insns;
    struct seccomp_filter *walker;

    assert_spin_locked(&current->sighand->siglock);

    /* Validate resulting filter length. */
    total_insns = filter->prog->len;
    for (walker = current->seccomp.filter; walker; walker = walker->prev)
        total_insns += walker->prog->len + 4;  /* 4 instr penalty */
    if (total_insns > MAX_INSNS_PER_PATH)
        return -ENOMEM;

    /* If thread sync has been requested, check that it is possible. */
    if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
        int ret;

        ret = seccomp_can_sync_threads();
        if (ret)
            return ret;
    }

    /*
     * If there is an existing filter, make it the prev and don't drop its
     * task reference.
     */
    filter->prev = current->seccomp.filter;
    current->seccomp.filter = filter;

    /* Now that the new filter is in place, synchronize to all threads. */
    if (flags & SECCOMP_FILTER_FLAG_TSYNC)
        seccomp_sync_threads();

    return 0;
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
    struct seccomp_filter *orig = tsk->seccomp.filter;
    if (!orig)
        return;
    /* Reference count is bounded by the number of total processes. */
    atomic_inc(&orig->usage);
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
    if (filter) {
        bpf_prog_destroy(filter->prog);
        kfree(filter);
    }
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
    struct seccomp_filter *orig = tsk->seccomp.filter;
    /* Clean up single-reference branches iteratively. */
    while (orig && atomic_dec_and_test(&orig->usage)) {
        struct seccomp_filter *freeme = orig;
        orig = orig->prev;
        seccomp_filter_free(freeme);
    }
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
    struct siginfo info;
    memset(&info, 0, sizeof(info));
    info.si_signo = SIGSYS;
    info.si_code = SYS_SECCOMP;
    info.si_call_addr = (void __user *)KSTK_EIP(current);
    info.si_errno = reason;
    info.si_arch = syscall_get_arch();
    info.si_syscall = syscall;
    force_sig_info(SIGSYS, &info, current);
}
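
/*
 * A userspace sketch of catching the resulting SIGSYS with an SA_SIGINFO
 * handler; si_syscall, si_errno and si_call_addr are the fields filled in
 * above (si_syscall assumes a sufficiently recent libc):
 *
 *     static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
 *     {
 *         int nr = info->si_syscall;      // the denied syscall
 *         int reason = info->si_errno;    // the filter's 16 bits of data
 *     }
 *
 *     struct sigaction act = {
 *         .sa_sigaction = sigsys_handler,
 *         .sa_flags = SA_SIGINFO,
 *     };
 *     sigaction(SIGSYS, &act, NULL);
 */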
#endif  /* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
    __NR_seccomp_read, __NR_seccomp_write,
    __NR_seccomp_exit, __NR_seccomp_sigreturn,
    0, /* null terminated */
};
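
/*
 * Userspace enters strict mode with a single call (sketch):
 *
 *     prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *
 * Any syscall outside the list above then kills the task with SIGKILL.
 */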

static void __secure_computing_strict(int this_syscall)
{
    const int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
    if (in_compat_syscall())
        syscall_whitelist = get_compat_mode1_syscalls();
#endif
    do {
        if (*syscall_whitelist == this_syscall)
            return;
    } while (*++syscall_whitelist);

#ifdef SECCOMP_DEBUG
    dump_stack();
#endif
    audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
    do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
    int mode = current->seccomp.mode;

    if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
        unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
        return;

    if (mode == SECCOMP_MODE_DISABLED)
        return;
    else if (mode == SECCOMP_MODE_STRICT)
        __secure_computing_strict(this_syscall);
    else
        BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                const bool recheck_after_trace)
{
    u32 filter_ret, action;
    int data;

    /*
     * Make sure that any changes to mode from another thread have
     * been seen after TIF_SECCOMP was seen.
     */
    rmb();

    filter_ret = seccomp_run_filters(sd);
    data = filter_ret & SECCOMP_RET_DATA;
    action = filter_ret & SECCOMP_RET_ACTION;

    switch (action) {
    case SECCOMP_RET_ERRNO:
        /* Set low-order bits as an errno, capped at MAX_ERRNO. */
        if (data > MAX_ERRNO)
            data = MAX_ERRNO;
        syscall_set_return_value(current, task_pt_regs(current),
                     -data, 0);
        goto skip;

    case SECCOMP_RET_TRAP:
        /* Show the handler the original registers. */
        syscall_rollback(current, task_pt_regs(current));
        /* Let the filter pass back 16 bits of data. */
        seccomp_send_sigsys(this_syscall, data);
        goto skip;

    case SECCOMP_RET_TRACE:
        /* We've been put in this state by the ptracer already. */
        if (recheck_after_trace)
            return 0;

        /* ENOSYS these calls if there is no tracer attached. */
        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
            syscall_set_return_value(current,
                         task_pt_regs(current),
                         -ENOSYS, 0);
            goto skip;
        }

        /* Allow the BPF to provide the event message */
        ptrace_event(PTRACE_EVENT_SECCOMP, data);
        /*
         * The delivery of a fatal signal during event
         * notification may silently skip tracer notification,
         * which could leave us with a potentially unmodified
         * syscall that the tracer would have liked to have
         * changed. Since the process is about to die, we just
         * force the syscall to be skipped and let the signal
         * kill the process and correctly handle any tracer exit
         * notifications.
         */
        if (fatal_signal_pending(current))
            goto skip;
        /* Check if the tracer forced the syscall to be skipped. */
        this_syscall = syscall_get_nr(current, task_pt_regs(current));
        if (this_syscall < 0)
            goto skip;

        /*
         * Recheck the syscall, since it may have changed. This
         * intentionally uses a NULL struct seccomp_data to force
         * a reload of all registers. This does not goto skip since
         * a skip would have already been reported.
         */
        if (__seccomp_filter(this_syscall, NULL, true))
            return -1;

        return 0;

    case SECCOMP_RET_ALLOW:
        return 0;

    case SECCOMP_RET_KILL:
    default:
        audit_seccomp(this_syscall, SIGSYS, action);
        do_exit(SIGSYS);
    }

    unreachable();

skip:
    audit_seccomp(this_syscall, 0, action);
    return -1;
}
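
/*
 * A rough sketch of the tracer side of SECCOMP_RET_TRACE: the tracer
 * opts in with PTRACE_O_TRACESECCOMP and, at the resulting event stop,
 * reads the filter's 16 bits of data from the event message:
 *
 *     ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESECCOMP);
 *     ...
 *     unsigned long msg;
 *     ptrace(PTRACE_GETEVENTMSG, pid, 0, &msg);  // data passed above
 */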
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                const bool recheck_after_trace)
{
    BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
    int mode = current->seccomp.mode;
    int this_syscall;

    if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
        unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
        return 0;

    this_syscall = sd ? sd->nr :
        syscall_get_nr(current, task_pt_regs(current));

    switch (mode) {
    case SECCOMP_MODE_STRICT:
        __secure_computing_strict(this_syscall);  /* may call do_exit */
        return 0;
    case SECCOMP_MODE_FILTER:
        return __seccomp_filter(this_syscall, sd, false);
    default:
        BUG();
    }
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
    return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
    const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
    long ret = -EINVAL;

    spin_lock_irq(&current->sighand->siglock);

    if (!seccomp_may_assign_mode(seccomp_mode))
        goto out;

#ifdef TIF_NOTSC
    disable_TSC();
#endif
    seccomp_assign_mode(current, seccomp_mode);
    ret = 0;

out:
    spin_unlock_irq(&current->sighand->siglock);

    return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_filter(unsigned int flags,
                    const char __user *filter)
{
    const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
    struct seccomp_filter *prepared = NULL;
    long ret = -EINVAL;

    /* Validate flags. */
    if (flags & ~SECCOMP_FILTER_FLAG_MASK)
        return -EINVAL;

    /* Prepare the new filter before holding any locks. */
    prepared = seccomp_prepare_user_filter(filter);
    if (IS_ERR(prepared))
        return PTR_ERR(prepared);

    /*
     * Make sure we cannot change seccomp or nnp state via TSYNC
     * while another thread is in the middle of calling exec.
     */
    if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
        mutex_lock_killable(&current->signal->cred_guard_mutex))
        goto out_free;

    spin_lock_irq(&current->sighand->siglock);

    if (!seccomp_may_assign_mode(seccomp_mode))
        goto out;

    ret = seccomp_attach_filter(flags, prepared);
    if (ret)
        goto out;
    /* Do not free the successfully attached filter. */
    prepared = NULL;

    seccomp_assign_mode(current, seccomp_mode);
out:
    spin_unlock_irq(&current->sighand->siglock);
    if (flags & SECCOMP_FILTER_FLAG_TSYNC)
        mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
    seccomp_filter_free(prepared);
    return ret;
}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
                       const char __user *filter)
{
    return -EINVAL;
}
#endif

/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
               const char __user *uargs)
{
    switch (op) {
    case SECCOMP_SET_MODE_STRICT:
        if (flags != 0 || uargs != NULL)
            return -EINVAL;
        return seccomp_set_mode_strict();
    case SECCOMP_SET_MODE_FILTER:
        return seccomp_set_mode_filter(flags, uargs);
    default:
        return -EINVAL;
    }
}

SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
             const char __user *, uargs)
{
    return do_seccomp(op, flags, uargs);
}
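
/*
 * A userspace sketch of calling this syscall directly with thread sync;
 * glibc of this era has no wrapper, so syscall(2) is used, and prog is a
 * struct sock_fprog as in the example at the top of this file:
 *
 *     #include <sys/syscall.h>
 *
 *     syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *             SECCOMP_FILTER_FLAG_TSYNC, &prog);
 *
 * On a failed sync this returns the pid of the first thread that could
 * not be switched (see seccomp_can_sync_threads() above).
 */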

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
    unsigned int op;
    char __user *uargs;

    switch (seccomp_mode) {
    case SECCOMP_MODE_STRICT:
        op = SECCOMP_SET_MODE_STRICT;
        /*
         * Setting strict mode through prctl always ignores the filter
         * argument, so make sure it is always NULL here to pass the
         * internal check in do_seccomp().
         */
        uargs = NULL;
        break;
    case SECCOMP_MODE_FILTER:
        op = SECCOMP_SET_MODE_FILTER;
        uargs = filter;
        break;
    default:
        return -EINVAL;
    }

    /* prctl interface doesn't have flags, so they are always zero. */
    return do_seccomp(op, 0, uargs);
}

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
            void __user *data)
{
    struct seccomp_filter *filter;
    struct sock_fprog_kern *fprog;
    long ret;
    unsigned long count = 0;

    if (!capable(CAP_SYS_ADMIN) ||
        current->seccomp.mode != SECCOMP_MODE_DISABLED) {
        return -EACCES;
    }

    spin_lock_irq(&task->sighand->siglock);
    if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
        ret = -EINVAL;
        goto out;
    }

    filter = task->seccomp.filter;
    while (filter) {
        filter = filter->prev;
        count++;
    }

    if (filter_off >= count) {
        ret = -ENOENT;
        goto out;
    }
    count -= filter_off;

    filter = task->seccomp.filter;
    while (filter && count > 1) {
        filter = filter->prev;
        count--;
    }

    if (WARN_ON(count != 1 || !filter)) {
        /* The filter tree shouldn't shrink while we're using it. */
        ret = -ENOENT;
        goto out;
    }

    fprog = filter->prog->orig_prog;
    if (!fprog) {
        /* This must be a new non-cBPF filter, since we save
         * every cBPF filter's orig_prog above when
         * CONFIG_CHECKPOINT_RESTORE is enabled.
         */
        ret = -EMEDIUMTYPE;
        goto out;
    }

    ret = fprog->len;
    if (!data)
        goto out;

    get_seccomp_filter(task);
    spin_unlock_irq(&task->sighand->siglock);

    if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
        ret = -EFAULT;

    put_seccomp_filter(task);
    return ret;

out:
    spin_unlock_irq(&task->sighand->siglock);
    return ret;
}
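
/*
 * This is reached via the PTRACE_SECCOMP_GET_FILTER request.  A rough
 * dumper sketch: call once with a NULL buffer to learn the length of
 * filter 0, then again with storage for that many instructions:
 *
 *     long cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);
 *     struct sock_filter *insns = calloc(cnt, sizeof(*insns));
 *     ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, insns);
 */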
#endif