the-tree/security/commoncap.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /* Common capabilities, needed by capability.o.
0003  */
0004
0005 #include <linux/capability.h>
0006 #include <linux/audit.h>
0007 #include <linux/init.h>
0008 #include <linux/kernel.h>
0009 #include <linux/lsm_hooks.h>
0010 #include <linux/file.h>
0011 #include <linux/mm.h>
0012 #include <linux/mman.h>
0013 #include <linux/pagemap.h>
0014 #include <linux/swap.h>
0015 #include <linux/skbuff.h>
0016 #include <linux/netlink.h>
0017 #include <linux/ptrace.h>
0018 #include <linux/xattr.h>
0019 #include <linux/hugetlb.h>
0020 #include <linux/mount.h>
0021 #include <linux/sched.h>
0022 #include <linux/prctl.h>
0023 #include <linux/securebits.h>
0024 #include <linux/user_namespace.h>
0025 #include <linux/binfmts.h>
0026 #include <linux/personality.h>
0027 #include <linux/mnt_idmapping.h>
0028
0029 /*
0030  * If a non-root user executes a setuid-root binary in
0031  * !secure(SECURE_NOROOT) mode, then we raise capabilities.
0032  * However if fE is also set, then the intent is for only
0033  * the file capabilities to be applied, and the setuid-root
0034  * bit is left on either to change the uid (plausible) or
0035  * to get full privilege on a kernel without file capabilities
0036  * support.  So in that case we do not raise capabilities.
0037  *
0038  * Warn if that happens, once per boot.
0039  */
0040 static void warn_setuid_and_fcaps_mixed(const char *fname)
0041 {
0042     static int warned;
0043     if (!warned) {
0044         printk(KERN_INFO "warning: `%s' has both setuid-root and"
0045             " effective capabilities. Therefore not raising all"
0046             " capabilities.\n", fname);
0047         warned = 1;
0048     }
0049 }
0050
0051 /**
0052  * cap_capable - Determine whether a task has a particular effective capability
0053  * @cred: The credentials to use
0054  * @targ_ns:  The user namespace in which we need the capability
0055  * @cap: The capability to check for
0056  * @opts: Bitmask of options defined in include/linux/security.h
0057  *
0058  * Determine whether the nominated task has the specified capability amongst
0059  * its effective set, returning 0 if it does, -ve if it does not.
0060  *
0061  * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
0062  * and has_capability() functions.  That is, it has the reverse semantics:
0063  * cap_has_capability() returns 0 when a task has a capability, but the
0064  * kernel's capable() and has_capability() returns 1 for this case.
0065  */
0066 int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
0067         int cap, unsigned int opts)
0068 {
0069     struct user_namespace *ns = targ_ns;
0070
0071     /* See if cred has the capability in the target user namespace
0072      * by examining the target user namespace and all of the target
0073      * user namespace's parents.
0074      */
0075     for (;;) {
0076         /* Do we have the necessary capabilities? */
0077         if (ns == cred->user_ns)
0078             return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
0079
0080         /*
0081          * If we're already at a lower level than we're looking for,
0082          * we're done searching.
0083          */
0084         if (ns->level <= cred->user_ns->level)
0085             return -EPERM;
0086
0087         /*
0088          * The owner of the user namespace in the parent of the
0089          * user namespace has all caps.
0090          */
0091         if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
0092             return 0;
0093
0094         /*
0095          * If you have a capability in a parent user ns, then you have
0096          * it over all children user namespaces as well.
0097          */
0098         ns = ns->parent;
0099     }
0100
0101     /* We never get here */
0102 }
0103
0104 /**
0105  * cap_settime - Determine whether the current process may set the system clock
0106  * @ts: The time to set
0107  * @tz: The timezone to set
0108  *
0109  * Determine whether the current process may set the system clock and timezone
0110  * information, returning 0 if permission granted, -ve if denied.
0111  */
0112 int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
0113 {
0114     if (!capable(CAP_SYS_TIME))
0115         return -EPERM;
0116     return 0;
0117 }
0118
0119 /**
0120  * cap_ptrace_access_check - Determine whether the current process may access
0121  *             another
0122  * @child: The process to be accessed
0123  * @mode: The mode of attachment.
0124  *
0125  * If we are in the same or an ancestor user_ns and have all the target
0126  * task's capabilities, then ptrace access is allowed.
0127  * If we have the ptrace capability to the target user_ns, then ptrace
0128  * access is allowed.
0129  * Else denied.
0130  *
0131  * Determine whether a process may access another, returning 0 if permission
0132  * granted, -ve if denied.
0133  */
0134 int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
0135 {
0136     int ret = 0;
0137     const struct cred *cred, *child_cred;
0138     const kernel_cap_t *caller_caps;
0139
0140     rcu_read_lock();
0141     cred = current_cred();
0142     child_cred = __task_cred(child);
0143     if (mode & PTRACE_MODE_FSCREDS)
0144         caller_caps = &cred->cap_effective;
0145     else
0146         caller_caps = &cred->cap_permitted;
0147     if (cred->user_ns == child_cred->user_ns &&
0148         cap_issubset(child_cred->cap_permitted, *caller_caps))
0149         goto out;
0150     if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
0151         goto out;
0152     ret = -EPERM;
0153 out:
0154     rcu_read_unlock();
0155     return ret;
0156 }
0157
0158 /**
0159  * cap_ptrace_traceme - Determine whether another process may trace the current
0160  * @parent: The task proposed to be the tracer
0161  *
0162  * If parent is in the same or an ancestor user_ns and has all current's
0163  * capabilities, then ptrace access is allowed.
0164  * If parent has the ptrace capability to current's user_ns, then ptrace
0165  * access is allowed.
0166  * Else denied.
0167  *
0168  * Determine whether the nominated task is permitted to trace the current
0169  * process, returning 0 if permission is granted, -ve if denied.
0170  */
0171 int cap_ptrace_traceme(struct task_struct *parent)
0172 {
0173     int ret = 0;
0174     const struct cred *cred, *child_cred;
0175
0176     rcu_read_lock();
0177     cred = __task_cred(parent);
0178     child_cred = current_cred();
0179     if (cred->user_ns == child_cred->user_ns &&
0180         cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
0181         goto out;
0182     if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
0183         goto out;
0184     ret = -EPERM;
0185 out:
0186     rcu_read_unlock();
0187     return ret;
0188 }
0189
0190 /**
0191  * cap_capget - Retrieve a task's capability sets
0192  * @target: The task from which to retrieve the capability sets
0193  * @effective: The place to record the effective set
0194  * @inheritable: The place to record the inheritable set
0195  * @permitted: The place to record the permitted set
0196  *
0197  * This function retrieves the capabilities of the nominated task and returns
0198  * them to the caller.
0199  */
0200 int cap_capget(struct task_struct *target, kernel_cap_t *effective,
0201            kernel_cap_t *inheritable, kernel_cap_t *permitted)
0202 {
0203     const struct cred *cred;
0204
0205     /* Derived from kernel/capability.c:sys_capget. */
0206     rcu_read_lock();
0207     cred = __task_cred(target);
0208     *effective   = cred->cap_effective;
0209     *inheritable = cred->cap_inheritable;
0210     *permitted   = cred->cap_permitted;
0211     rcu_read_unlock();
0212     return 0;
0213 }
0214
0215 /*
0216  * Determine whether the inheritable capabilities are limited to the old
0217  * permitted set.  Returns 1 if they are limited, 0 if they are not.
0218  */
0219 static inline int cap_inh_is_capped(void)
0220 {
0221     /* they are so limited unless the current task has the CAP_SETPCAP
0222      * capability
0223      */
0224     if (cap_capable(current_cred(), current_cred()->user_ns,
0225             CAP_SETPCAP, CAP_OPT_NONE) == 0)
0226         return 0;
0227     return 1;
0228 }
0229
0230 /**
0231  * cap_capset - Validate and apply proposed changes to current's capabilities
0232  * @new: The proposed new credentials; alterations should be made here
0233  * @old: The current task's current credentials
0234  * @effective: A pointer to the proposed new effective capabilities set
0235  * @inheritable: A pointer to the proposed new inheritable capabilities set
0236  * @permitted: A pointer to the proposed new permitted capabilities set
0237  *
0238  * This function validates and applies a proposed mass change to the current
0239  * process's capability sets.  The changes are made to the proposed new
0240  * credentials, and assuming no error, will be committed by the caller of LSM.
0241  */
0242 int cap_capset(struct cred *new,
0243            const struct cred *old,
0244            const kernel_cap_t *effective,
0245            const kernel_cap_t *inheritable,
0246            const kernel_cap_t *permitted)
0247 {
0248     if (cap_inh_is_capped() &&
0249         !cap_issubset(*inheritable,
0250               cap_combine(old->cap_inheritable,
0251                       old->cap_permitted)))
0252         /* incapable of using this inheritable set */
0253         return -EPERM;
0254
0255     if (!cap_issubset(*inheritable,
0256               cap_combine(old->cap_inheritable,
0257                       old->cap_bset)))
0258         /* no new pI capabilities outside bounding set */
0259         return -EPERM;
0260
0261     /* verify restrictions on target's new Permitted set */
0262     if (!cap_issubset(*permitted, old->cap_permitted))
0263         return -EPERM;
0264
0265     /* verify the _new_Effective_ is a subset of the _new_Permitted_ */
0266     if (!cap_issubset(*effective, *permitted))
0267         return -EPERM;
0268
0269     new->cap_effective   = *effective;
0270     new->cap_inheritable = *inheritable;
0271     new->cap_permitted   = *permitted;
0272
0273     /*
0274      * Mask off ambient bits that are no longer both permitted and
0275      * inheritable.
0276      */
0277     new->cap_ambient = cap_intersect(new->cap_ambient,
0278                      cap_intersect(*permitted,
0279                                *inheritable));
0280     if (WARN_ON(!cap_ambient_invariant_ok(new)))
0281         return -EINVAL;
0282     return 0;
0283 }
0284
0285 /**
0286  * cap_inode_need_killpriv - Determine if inode change affects privileges
0287  * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
0288  *
0289  * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
0290  * affects the security markings on that inode, and if it is, should
0291  * inode_killpriv() be invoked or the change rejected.
0292  *
0293  * Return: 1 if security.capability has a value, meaning inode_killpriv()
0294  * is required, 0 otherwise, meaning inode_killpriv() is not required.
0295  */
0296 int cap_inode_need_killpriv(struct dentry *dentry)
0297 {
0298     struct inode *inode = d_backing_inode(dentry);
0299     int error;
0300
0301     error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
0302     return error > 0;
0303 }
0304
0305 /**
0306  * cap_inode_killpriv - Erase the security markings on an inode
0307  *
0308  * @mnt_userns: user namespace of the mount the inode was found from
0309  * @dentry: The inode/dentry to alter
0310  *
0311  * Erase the privilege-enhancing security markings on an inode.
0312  *
0313  * If the inode has been found through an idmapped mount the user namespace of
0314  * the vfsmount must be passed through @mnt_userns. This function will then
0315  * take care to map the inode according to @mnt_userns before checking
0316  * permissions. On non-idmapped mounts or if permission checking is to be
0317  * performed on the raw inode simply passs init_user_ns.
0318  *
0319  * Return: 0 if successful, -ve on error.
0320  */
0321 int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
0322 {
0323     int error;
0324
0325     error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
0326     if (error == -EOPNOTSUPP)
0327         error = 0;
0328     return error;
0329 }
0330
0331 static bool rootid_owns_currentns(kuid_t kroot)
0332 {
0333     struct user_namespace *ns;
0334
0335     if (!uid_valid(kroot))
0336         return false;
0337
0338     for (ns = current_user_ns(); ; ns = ns->parent) {
0339         if (from_kuid(ns, kroot) == 0)
0340             return true;
0341         if (ns == &init_user_ns)
0342             break;
0343     }
0344
0345     return false;
0346 }
0347
0348 static __u32 sansflags(__u32 m)
0349 {
0350     return m & ~VFS_CAP_FLAGS_EFFECTIVE;
0351 }
0352
0353 static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
0354 {
0355     if (size != XATTR_CAPS_SZ_2)
0356         return false;
0357     return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
0358 }
0359
0360 static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
0361 {
0362     if (size != XATTR_CAPS_SZ_3)
0363         return false;
0364     return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
0365 }
0366
0367 /*
0368  * getsecurity: We are called for security.* before any attempt to read the
0369  * xattr from the inode itself.
0370  *
0371  * This gives us a chance to read the on-disk value and convert it.  If we
0372  * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
0373  *
0374  * Note we are not called by vfs_getxattr_alloc(), but that is only called
0375  * by the integrity subsystem, which really wants the unconverted values -
0376  * so that's good.
0377  */
0378 int cap_inode_getsecurity(struct user_namespace *mnt_userns,
0379               struct inode *inode, const char *name, void **buffer,
0380               bool alloc)
0381 {
0382     int size, ret;
0383     kuid_t kroot;
0384     u32 nsmagic, magic;
0385     uid_t root, mappedroot;
0386     char *tmpbuf = NULL;
0387     struct vfs_cap_data *cap;
0388     struct vfs_ns_cap_data *nscap = NULL;
0389     struct dentry *dentry;
0390     struct user_namespace *fs_ns;
0391
0392     if (strcmp(name, "capability") != 0)
0393         return -EOPNOTSUPP;
0394
0395     dentry = d_find_any_alias(inode);
0396     if (!dentry)
0397         return -EINVAL;
0398
0399     size = sizeof(struct vfs_ns_cap_data);
0400     ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
0401                       &tmpbuf, size, GFP_NOFS);
0402     dput(dentry);
0403
0404     if (ret < 0 || !tmpbuf)
0405         return ret;
0406
0407     fs_ns = inode->i_sb->s_user_ns;
0408     cap = (struct vfs_cap_data *) tmpbuf;
0409     if (is_v2header((size_t) ret, cap)) {
0410         root = 0;
0411     } else if (is_v3header((size_t) ret, cap)) {
0412         nscap = (struct vfs_ns_cap_data *) tmpbuf;
0413         root = le32_to_cpu(nscap->rootid);
0414     } else {
0415         size = -EINVAL;
0416         goto out_free;
0417     }
0418
0419     kroot = make_kuid(fs_ns, root);
0420
0421     /* If this is an idmapped mount shift the kuid. */
0422     kroot = mapped_kuid_fs(mnt_userns, fs_ns, kroot);
0423
0424     /* If the root kuid maps to a valid uid in current ns, then return
0425      * this as a nscap. */
0426     mappedroot = from_kuid(current_user_ns(), kroot);
0427     if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
0428         size = sizeof(struct vfs_ns_cap_data);
0429         if (alloc) {
0430             if (!nscap) {
0431                 /* v2 -> v3 conversion */
0432                 nscap = kzalloc(size, GFP_ATOMIC);
0433                 if (!nscap) {
0434                     size = -ENOMEM;
0435                     goto out_free;
0436                 }
0437                 nsmagic = VFS_CAP_REVISION_3;
0438                 magic = le32_to_cpu(cap->magic_etc);
0439                 if (magic & VFS_CAP_FLAGS_EFFECTIVE)
0440                     nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
0441                 memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
0442                 nscap->magic_etc = cpu_to_le32(nsmagic);
0443             } else {
0444                 /* use allocated v3 buffer */
0445                 tmpbuf = NULL;
0446             }
0447             nscap->rootid = cpu_to_le32(mappedroot);
0448             *buffer = nscap;
0449         }
0450         goto out_free;
0451     }
0452
0453     if (!rootid_owns_currentns(kroot)) {
0454         size = -EOVERFLOW;
0455         goto out_free;
0456     }
0457
0458     /* This comes from a parent namespace.  Return as a v2 capability */
0459     size = sizeof(struct vfs_cap_data);
0460     if (alloc) {
0461         if (nscap) {
0462             /* v3 -> v2 conversion */
0463             cap = kzalloc(size, GFP_ATOMIC);
0464             if (!cap) {
0465                 size = -ENOMEM;
0466                 goto out_free;
0467             }
0468             magic = VFS_CAP_REVISION_2;
0469             nsmagic = le32_to_cpu(nscap->magic_etc);
0470             if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
0471                 magic |= VFS_CAP_FLAGS_EFFECTIVE;
0472             memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
0473             cap->magic_etc = cpu_to_le32(magic);
0474         } else {
0475             /* use unconverted v2 */
0476             tmpbuf = NULL;
0477         }
0478         *buffer = cap;
0479     }
0480 out_free:
0481     kfree(tmpbuf);
0482     return size;
0483 }
0484
0485 /**
0486  * rootid_from_xattr - translate root uid of vfs caps
0487  *
0488  * @value:  vfs caps value which may be modified by this function
0489  * @size:   size of @ivalue
0490  * @task_ns:    user namespace of the caller
0491  * @mnt_userns: user namespace of the mount the inode was found from
0492  * @fs_userns:  user namespace of the filesystem
0493  *
0494  * If the inode has been found through an idmapped mount the user namespace of
0495  * the vfsmount must be passed through @mnt_userns. This function will then
0496  * take care to map the inode according to @mnt_userns before checking
0497  * permissions. On non-idmapped mounts or if permission checking is to be
0498  * performed on the raw inode simply passs init_user_ns.
0499  */
0500 static kuid_t rootid_from_xattr(const void *value, size_t size,
0501                 struct user_namespace *task_ns,
0502                 struct user_namespace *mnt_userns,
0503                 struct user_namespace *fs_userns)
0504 {
0505     const struct vfs_ns_cap_data *nscap = value;
0506     kuid_t rootkid;
0507     uid_t rootid = 0;
0508
0509     if (size == XATTR_CAPS_SZ_3)
0510         rootid = le32_to_cpu(nscap->rootid);
0511
0512     rootkid = make_kuid(task_ns, rootid);
0513     return mapped_kuid_user(mnt_userns, fs_userns, rootkid);
0514 }
0515
/* True when @cap/@size form either a valid v2 or a valid v3 header. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
    if (is_v2header(size, cap))
        return true;

    return is_v3header(size, cap);
}
0520
0521 /**
0522  * cap_convert_nscap - check vfs caps
0523  *
0524  * @mnt_userns: user namespace of the mount the inode was found from
0525  * @dentry: used to retrieve inode to check permissions on
0526  * @ivalue: vfs caps value which may be modified by this function
0527  * @size:   size of @ivalue
0528  *
0529  * User requested a write of security.capability.  If needed, update the
0530  * xattr to change from v2 to v3, or to fixup the v3 rootid.
0531  *
0532  * If the inode has been found through an idmapped mount the user namespace of
0533  * the vfsmount must be passed through @mnt_userns. This function will then
0534  * take care to map the inode according to @mnt_userns before checking
0535  * permissions. On non-idmapped mounts or if permission checking is to be
0536  * performed on the raw inode simply passs init_user_ns.
0537  *
0538  * Return: On success, return the new size; on error, return < 0.
0539  */
0540 int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
0541               const void **ivalue, size_t size)
0542 {
0543     struct vfs_ns_cap_data *nscap;
0544     uid_t nsrootid;
0545     const struct vfs_cap_data *cap = *ivalue;
0546     __u32 magic, nsmagic;
0547     struct inode *inode = d_backing_inode(dentry);
0548     struct user_namespace *task_ns = current_user_ns(),
0549         *fs_ns = inode->i_sb->s_user_ns;
0550     kuid_t rootid;
0551     size_t newsize;
0552
0553     if (!*ivalue)
0554         return -EINVAL;
0555     if (!validheader(size, cap))
0556         return -EINVAL;
0557     if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
0558         return -EPERM;
0559     if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns))
0560         if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
0561             /* user is privileged, just write the v2 */
0562             return size;
0563
0564     rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns, fs_ns);
0565     if (!uid_valid(rootid))
0566         return -EINVAL;
0567
0568     nsrootid = from_kuid(fs_ns, rootid);
0569     if (nsrootid == -1)
0570         return -EINVAL;
0571
0572     newsize = sizeof(struct vfs_ns_cap_data);
0573     nscap = kmalloc(newsize, GFP_ATOMIC);
0574     if (!nscap)
0575         return -ENOMEM;
0576     nscap->rootid = cpu_to_le32(nsrootid);
0577     nsmagic = VFS_CAP_REVISION_3;
0578     magic = le32_to_cpu(cap->magic_etc);
0579     if (magic & VFS_CAP_FLAGS_EFFECTIVE)
0580         nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
0581     nscap->magic_etc = cpu_to_le32(nsmagic);
0582     memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
0583
0584     *ivalue = nscap;
0585     return newsize;
0586 }
0587
0588 /*
0589  * Calculate the new process capability sets from the capability sets attached
0590  * to a file.
0591  */
0592 static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
0593                       struct linux_binprm *bprm,
0594                       bool *effective,
0595                       bool *has_fcap)
0596 {
0597     struct cred *new = bprm->cred;
0598     unsigned i;
0599     int ret = 0;
0600
0601     if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
0602         *effective = true;
0603
0604     if (caps->magic_etc & VFS_CAP_REVISION_MASK)
0605         *has_fcap = true;
0606
0607     CAP_FOR_EACH_U32(i) {
0608         __u32 permitted = caps->permitted.cap[i];
0609         __u32 inheritable = caps->inheritable.cap[i];
0610
0611         /*
0612          * pP' = (X & fP) | (pI & fI)
0613          * The addition of pA' is handled later.
0614          */
0615         new->cap_permitted.cap[i] =
0616             (new->cap_bset.cap[i] & permitted) |
0617             (new->cap_inheritable.cap[i] & inheritable);
0618
0619         if (permitted & ~new->cap_permitted.cap[i])
0620             /* insufficient to execute correctly */
0621             ret = -EPERM;
0622     }
0623
0624     /*
0625      * For legacy apps, with no internal support for recognizing they
0626      * do not have enough capabilities, we return an error if they are
0627      * missing some "forced" (aka file-permitted) capabilities.
0628      */
0629     return *effective ? ret : 0;
0630 }
0631
/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @mnt_userns: user namespace of the mount the inode was found from
 * @dentry: dentry from which @inode is retrieved
 * @cpu_caps:   vfs capabilities; zeroed on entry to this function and
 *              filled in CPU byte order on success
 *
 * Extract the on-exec-apply capability sets for an executable file.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then
 * take care to map the inode according to @mnt_userns before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass init_user_ns.
 *
 * Return: 0 on success; -ENODATA if there is no inode, no usable xattr, or
 * the xattr's root uid does not own the current user namespace (or one of
 * its ancestors); -EINVAL if the xattr is too short, has an unknown
 * revision, or its size does not match its revision; any other negative
 * value reported by __vfs_getxattr().
 */
int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
               const struct dentry *dentry,
               struct cpu_vfs_cap_data *cpu_caps)
{
    struct inode *inode = d_backing_inode(dentry);
    __u32 magic_etc;
    unsigned tocopy, i;
    int size;
    /* One stack buffer, viewed either as the v3 layout or the v1/v2 layout. */
    struct vfs_ns_cap_data data, *nscaps = &data;
    struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
    kuid_t rootkuid;
    struct user_namespace *fs_ns;

    /* Start from an all-zero result so early returns leave no stale data. */
    memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

    if (!inode)
        return -ENODATA;

    fs_ns = inode->i_sb->s_user_ns;
    size = __vfs_getxattr((struct dentry *)dentry, inode,
                  XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
    if (size == -ENODATA || size == -EOPNOTSUPP)
        /* no data, that's ok */
        return -ENODATA;

    if (size < 0)
        return size;

    /* Need at least the magic word before it can be inspected. */
    if (size < sizeof(magic_etc))
        return -EINVAL;

    cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

    /* Default root: uid 0 of the filesystem's namespace (v1 and v2). */
    rootkuid = make_kuid(fs_ns, 0);
    /* Each revision must come with exactly its own xattr size. */
    switch (magic_etc & VFS_CAP_REVISION_MASK) {
    case VFS_CAP_REVISION_1:
        if (size != XATTR_CAPS_SZ_1)
            return -EINVAL;
        tocopy = VFS_CAP_U32_1;
        break;
    case VFS_CAP_REVISION_2:
        if (size != XATTR_CAPS_SZ_2)
            return -EINVAL;
        tocopy = VFS_CAP_U32_2;
        break;
    case VFS_CAP_REVISION_3:
        if (size != XATTR_CAPS_SZ_3)
            return -EINVAL;
        tocopy = VFS_CAP_U32_3;
        /* rootid is only read after the size check above has passed. */
        rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
        break;

    default:
        return -EINVAL;
    }
    /* Limit the caps to the mounter of the filesystem
     * or the more limited uid specified in the xattr.
     */
    rootkuid = mapped_kuid_fs(mnt_userns, fs_ns, rootkuid);
    if (!rootid_owns_currentns(rootkuid))
        return -ENODATA;

    /* Copy only the words this revision provides; the rest stay zero. */
    CAP_FOR_EACH_U32(i) {
        if (i >= tocopy)
            break;
        cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
        cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
    }

    /* NOTE(review): presumably drops bits beyond the last defined capability - confirm. */
    cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
    cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

    cpu_caps->rootid = rootkuid;

    return 0;
}
0723
0724 /*
0725  * Attempt to get the on-exec apply capability sets for an executable file from
0726  * its xattrs and, if present, apply them to the proposed credentials being
0727  * constructed by execve().
0728  */
0729 static int get_file_caps(struct linux_binprm *bprm, struct file *file,
0730              bool *effective, bool *has_fcap)
0731 {
0732     int rc = 0;
0733     struct cpu_vfs_cap_data vcaps;
0734
0735     cap_clear(bprm->cred->cap_permitted);
0736
0737     if (!file_caps_enabled)
0738         return 0;
0739
0740     if (!mnt_may_suid(file->f_path.mnt))
0741         return 0;
0742
0743     /*
0744      * This check is redundant with mnt_may_suid() but is kept to make
0745      * explicit that capability bits are limited to s_user_ns and its
0746      * descendants.
0747      */
0748     if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
0749         return 0;
0750
0751     rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
0752                     file->f_path.dentry, &vcaps);
0753     if (rc < 0) {
0754         if (rc == -EINVAL)
0755             printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
0756                     bprm->filename);
0757         else if (rc == -ENODATA)
0758             rc = 0;
0759         goto out;
0760     }
0761
0762     rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
0763
0764 out:
0765     if (rc)
0766         cap_clear(bprm->cred->cap_permitted);
0767
0768     return rc;
0769 }
0770
0771 static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
0772
0773 static inline bool __is_real(kuid_t uid, struct cred *cred)
0774 { return uid_eq(cred->uid, uid); }
0775
0776 static inline bool __is_eff(kuid_t uid, struct cred *cred)
0777 { return uid_eq(cred->euid, uid); }
0778
0779 static inline bool __is_suid(kuid_t uid, struct cred *cred)
0780 { return !__is_real(uid, cred) && __is_eff(uid, cred); }
0781
0782 /*
0783  * handle_privileged_root - Handle case of privileged root
0784  * @bprm: The execution parameters, including the proposed creds
0785  * @has_fcap: Are any file capabilities set?
0786  * @effective: Do we have effective root privilege?
0787  * @root_uid: This namespace' root UID WRT initial USER namespace
0788  *
0789  * Handle the case where root is privileged and hasn't been neutered by
0790  * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
0791  * set UID root and nothing is changed.  If we are root, cap_permitted is
0792  * updated.  If we have become set UID root, the effective bit is set.
0793  */
0794 static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
0795                    bool *effective, kuid_t root_uid)
0796 {
0797     const struct cred *old = current_cred();
0798     struct cred *new = bprm->cred;
0799
0800     if (!root_privileged())
0801         return;
0802     /*
0803      * If the legacy file capability is set, then don't set privs
0804      * for a setuid root binary run by a non-root user.  Do set it
0805      * for a root user just to cause least surprise to an admin.
0806      */
0807     if (has_fcap && __is_suid(root_uid, new)) {
0808         warn_setuid_and_fcaps_mixed(bprm->filename);
0809         return;
0810     }
0811     /*
0812      * To support inheritance of root-permissions and suid-root
0813      * executables under compatibility mode, we override the
0814      * capability sets for the file.
0815      */
0816     if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
0817         /* pP' = (cap_bset & ~0) | (pI & ~0) */
0818         new->cap_permitted = cap_combine(old->cap_bset,
0819                          old->cap_inheritable);
0820     }
0821     /*
0822      * If only the real uid is 0, we do not set the effective bit.
0823      */
0824     if (__is_eff(root_uid, new))
0825         *effective = true;
0826 }
0827
/*
 * Capability-set comparison helpers.  @field, @target and @source in the
 * token-pasted positions name a cred capability set without its "cap_"
 * prefix (e.g. permitted, effective, ambient).
 *
 * __cap_gained(field, target, source): target's set has a bit that
 *     source's set lacks.
 * __cap_grew(target, source, cred): within one cred, the "target" set has
 *     a bit that the "source" set lacks.
 * __cap_full(field, cred): cred's set contains every bit of CAP_FULL_SET.
 *
 * Each expansion and each pointer argument is parenthesized so the macros
 * behave as a single expression wherever they are used.
 */
#define __cap_gained(field, target, source) \
    (!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
    (!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
    (cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
0834
0835 static inline bool __is_setuid(struct cred *new, const struct cred *old)
0836 { return !uid_eq(new->euid, old->uid); }
0837
0838 static inline bool __is_setgid(struct cred *new, const struct cred *old)
0839 { return !gid_eq(new->egid, old->gid); }
0840
0841 /*
0842  * 1) Audit candidate if current->cap_effective is set
0843  *
0844  * We do not bother to audit if 3 things are true:
0845  *   1) cap_effective has all caps
0846  *   2) we became root *OR* are were already root
0847  *   3) root is supposed to have all caps (SECURE_NOROOT)
0848  * Since this is just a normal root execing a process.
0849  *
0850  * Number 1 above might fail if you don't have a full bset, but I think
0851  * that is interesting information to audit.
0852  *
0853  * A number of other conditions require logging:
0854  * 2) something prevented setuid root getting all caps
0855  * 3) non-setuid root gets fcaps
0856  * 4) non-setuid root gets ambient
0857  */
0858 static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
0859                      kuid_t root, bool has_fcap)
0860 {
0861     bool ret = false;
0862
0863     if ((__cap_grew(effective, ambient, new) &&
0864          !(__cap_full(effective, new) &&
0865            (__is_eff(root, new) || __is_real(root, new)) &&
0866            root_privileged())) ||
0867         (root_privileged() &&
0868          __is_suid(root, new) &&
0869          !__cap_full(effective, new)) ||
0870         (!__is_setuid(new, old) &&
0871          ((has_fcap &&
0872            __cap_gained(permitted, new, old)) ||
0873           __cap_gained(ambient, new, old))))
0874
0875         ret = true;
0876
0877     return ret;
0878 }
0879
0880 /**
0881  * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
0882  * @bprm: The execution parameters, including the proposed creds
0883  * @file: The file to pull the credentials from
0884  *
0885  * Set up the proposed credentials for a new execution context being
0886  * constructed by execve().  The proposed creds in @bprm->cred is altered,
0887  * which won't take effect immediately.
0888  *
0889  * Return: 0 if successful, -ve on error.
0890  */
0891 int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
0892 {
0893     /* Process setpcap binaries and capabilities for uid 0 */
0894     const struct cred *old = current_cred();
0895     struct cred *new = bprm->cred;
0896     bool effective = false, has_fcap = false, is_setid;
0897     int ret;
0898     kuid_t root_uid;
0899
0900     if (WARN_ON(!cap_ambient_invariant_ok(old)))
0901         return -EPERM;
0902
0903     ret = get_file_caps(bprm, file, &effective, &has_fcap);
0904     if (ret < 0)
0905         return ret;
0906
0907     root_uid = make_kuid(new->user_ns, 0);
0908
0909     handle_privileged_root(bprm, has_fcap, &effective, root_uid);
0910
0911     /* if we have fs caps, clear dangerous personality flags */
0912     if (__cap_gained(permitted, new, old))
0913         bprm->per_clear |= PER_CLEAR_ON_SETID;
0914
0915     /* Don't let someone trace a set[ug]id/setpcap binary with the revised
0916      * credentials unless they have the appropriate permit.
0917      *
0918      * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
0919      */
0920     is_setid = __is_setuid(new, old) || __is_setgid(new, old);
0921
0922     if ((is_setid || __cap_gained(permitted, new, old)) &&
0923         ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
0924          !ptracer_capable(current, new->user_ns))) {
0925         /* downgrade; they get no more than they had, and maybe less */
0926         if (!ns_capable(new->user_ns, CAP_SETUID) ||
0927             (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
0928             new->euid = new->uid;
0929             new->egid = new->gid;
0930         }
0931         new->cap_permitted = cap_intersect(new->cap_permitted,
0932                            old->cap_permitted);
0933     }
0934
0935     new->suid = new->fsuid = new->euid;
0936     new->sgid = new->fsgid = new->egid;
0937
0938     /* File caps or setid cancels ambient. */
0939     if (has_fcap || is_setid)
0940         cap_clear(new->cap_ambient);
0941
0942     /*
0943      * Now that we've computed pA', update pP' to give:
0944      *   pP' = (X & fP) | (pI & fI) | pA'
0945      */
0946     new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
0947
0948     /*
0949      * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
0950      * this is the same as pE' = (fE ? pP' : 0) | pA'.
0951      */
0952     if (effective)
0953         new->cap_effective = new->cap_permitted;
0954     else
0955         new->cap_effective = new->cap_ambient;
0956
0957     if (WARN_ON(!cap_ambient_invariant_ok(new)))
0958         return -EPERM;
0959
0960     if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
0961         ret = audit_log_bprm_fcaps(bprm, new, old);
0962         if (ret < 0)
0963             return ret;
0964     }
0965
0966     new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
0967
0968     if (WARN_ON(!cap_ambient_invariant_ok(new)))
0969         return -EPERM;
0970
0971     /* Check for privilege-elevated exec. */
0972     if (is_setid ||
0973         (!__is_real(root_uid, new) &&
0974          (effective ||
0975           __cap_grew(permitted, ambient, new))))
0976         bprm->secureexec = 1;
0977
0978     return 0;
0979 }
0980
0981 /**
0982  * cap_inode_setxattr - Determine whether an xattr may be altered
0983  * @dentry: The inode/dentry being altered
0984  * @name: The name of the xattr to be changed
0985  * @value: The value that the xattr will be changed to
0986  * @size: The size of value
0987  * @flags: The replacement flag
0988  *
0989  * Determine whether an xattr may be altered or set on an inode, returning 0 if
0990  * permission is granted, -ve if denied.
0991  *
0992  * This is used to make sure security xattrs don't get updated or set by those
0993  * who aren't privileged to do so.
0994  */
0995 int cap_inode_setxattr(struct dentry *dentry, const char *name,
0996                const void *value, size_t size, int flags)
0997 {
0998     struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
0999
1000     /* Ignore non-security xattrs */
1001     if (strncmp(name, XATTR_SECURITY_PREFIX,
1002             XATTR_SECURITY_PREFIX_LEN) != 0)
1003         return 0;
1004
1005     /*
1006      * For XATTR_NAME_CAPS the check will be done in
1007      * cap_convert_nscap(), called by setxattr()
1008      */
1009     if (strcmp(name, XATTR_NAME_CAPS) == 0)
1010         return 0;
1011
1012     if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1013         return -EPERM;
1014     return 0;
1015 }
1016
1017 /**
1018  * cap_inode_removexattr - Determine whether an xattr may be removed
1019  *
1020  * @mnt_userns: User namespace of the mount the inode was found from
1021  * @dentry: The inode/dentry being altered
1022  * @name:   The name of the xattr to be changed
1023  *
1024  * Determine whether an xattr may be removed from an inode, returning 0 if
1025  * permission is granted, -ve if denied.
1026  *
1027  * If the inode has been found through an idmapped mount the user namespace of
1028  * the vfsmount must be passed through @mnt_userns. This function will then
1029  * take care to map the inode according to @mnt_userns before checking
1030  * permissions. On non-idmapped mounts or if permission checking is to be
1031  * performed on the raw inode simply passs init_user_ns.
1032  *
1033  * This is used to make sure security xattrs don't get removed by those who
1034  * aren't privileged to remove them.
1035  */
1036 int cap_inode_removexattr(struct user_namespace *mnt_userns,
1037               struct dentry *dentry, const char *name)
1038 {
1039     struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
1040
1041     /* Ignore non-security xattrs */
1042     if (strncmp(name, XATTR_SECURITY_PREFIX,
1043             XATTR_SECURITY_PREFIX_LEN) != 0)
1044         return 0;
1045
1046     if (strcmp(name, XATTR_NAME_CAPS) == 0) {
1047         /* security.capability gets namespaced */
1048         struct inode *inode = d_backing_inode(dentry);
1049         if (!inode)
1050             return -EINVAL;
1051         if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
1052             return -EPERM;
1053         return 0;
1054     }
1055
1056     if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1057         return -EPERM;
1058     return 0;
1059 }
1060
1061 /*
1062  * cap_emulate_setxuid() fixes the effective / permitted capabilities of
1063  * a process after a call to setuid, setreuid, or setresuid.
1064  *
1065  *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1066  *  {r,e,s}uid != 0, the permitted and effective capabilities are
1067  *  cleared.
1068  *
1069  *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1070  *  capabilities of the process are cleared.
1071  *
1072  *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1073  *  capabilities are set to the permitted capabilities.
1074  *
1075  *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1076  *  never happen.
1077  *
1078  *  -astor
1079  *
1080  * cevans - New behaviour, Oct '99
1081  * A process may, via prctl(), elect to keep its capabilities when it
1082  * calls setuid() and switches away from uid==0. Both permitted and
1083  * effective sets will be retained.
1084  * Without this change, it was impossible for a daemon to drop only some
1085  * of its privilege. The call to setuid(!=0) would drop all privileges!
1086  * Keeping uid 0 is not an option because uid 0 owns too many vital
1087  * files..
1088  * Thanks to Olaf Kirch and Peter Benie for spotting this.
1089  */
1090 static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
1091 {
1092     kuid_t root_uid = make_kuid(old->user_ns, 0);
1093
1094     if ((uid_eq(old->uid, root_uid) ||
1095          uid_eq(old->euid, root_uid) ||
1096          uid_eq(old->suid, root_uid)) &&
1097         (!uid_eq(new->uid, root_uid) &&
1098          !uid_eq(new->euid, root_uid) &&
1099          !uid_eq(new->suid, root_uid))) {
1100         if (!issecure(SECURE_KEEP_CAPS)) {
1101             cap_clear(new->cap_permitted);
1102             cap_clear(new->cap_effective);
1103         }
1104
1105         /*
1106          * Pre-ambient programs expect setresuid to nonroot followed
1107          * by exec to drop capabilities.  We should make sure that
1108          * this remains the case.
1109          */
1110         cap_clear(new->cap_ambient);
1111     }
1112     if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
1113         cap_clear(new->cap_effective);
1114     if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
1115         new->cap_effective = new->cap_permitted;
1116 }
1117
1118 /**
1119  * cap_task_fix_setuid - Fix up the results of setuid() call
1120  * @new: The proposed credentials
1121  * @old: The current task's current credentials
1122  * @flags: Indications of what has changed
1123  *
1124  * Fix up the results of setuid() call before the credential changes are
1125  * actually applied.
1126  *
1127  * Return: 0 to grant the changes, -ve to deny them.
1128  */
1129 int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
1130 {
1131     switch (flags) {
1132     case LSM_SETID_RE:
1133     case LSM_SETID_ID:
1134     case LSM_SETID_RES:
1135         /* juggle the capabilities to follow [RES]UID changes unless
1136          * otherwise suppressed */
1137         if (!issecure(SECURE_NO_SETUID_FIXUP))
1138             cap_emulate_setxuid(new, old);
1139         break;
1140
1141     case LSM_SETID_FS:
1142         /* juggle the capabilties to follow FSUID changes, unless
1143          * otherwise suppressed
1144          *
1145          * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1146          *          if not, we might be a bit too harsh here.
1147          */
1148         if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1149             kuid_t root_uid = make_kuid(old->user_ns, 0);
1150             if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
1151                 new->cap_effective =
1152                     cap_drop_fs_set(new->cap_effective);
1153
1154             if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
1155                 new->cap_effective =
1156                     cap_raise_fs_set(new->cap_effective,
1157                              new->cap_permitted);
1158         }
1159         break;
1160
1161     default:
1162         return -EINVAL;
1163     }
1164
1165     return 0;
1166 }
1167
1168 /*
1169  * Rationale: code calling task_setscheduler, task_setioprio, and
1170  * task_setnice, assumes that
1171  *   . if capable(cap_sys_nice), then those actions should be allowed
1172  *   . if not capable(cap_sys_nice), but acting on your own processes,
1173  *      then those actions should be allowed
1174  * This is insufficient now since you can call code without suid, but
1175  * yet with increased caps.
1176  * So we check for increased caps on the target process.
1177  */
1178 static int cap_safe_nice(struct task_struct *p)
1179 {
1180     int is_subset, ret = 0;
1181
1182     rcu_read_lock();
1183     is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1184                  current_cred()->cap_permitted);
1185     if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1186         ret = -EPERM;
1187     rcu_read_unlock();
1188
1189     return ret;
1190 }
1191
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
    int rc = cap_safe_nice(p);

    return rc;
}
1205
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Determine if the requested I/O priority change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
    int rc = cap_safe_nice(p);

    return rc;
}
1220
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
    int rc = cap_safe_nice(p);

    return rc;
}
1235
1236 /*
1237  * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
1238  * the current task's bounding set.  Returns 0 on success, -ve on error.
1239  */
1240 static int cap_prctl_drop(unsigned long cap)
1241 {
1242     struct cred *new;
1243
1244     if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1245         return -EPERM;
1246     if (!cap_valid(cap))
1247         return -EINVAL;
1248
1249     new = prepare_creds();
1250     if (!new)
1251         return -ENOMEM;
1252     cap_lower(new->cap_bset, cap);
1253     return commit_creds(new);
1254 }
1255
1256 /**
1257  * cap_task_prctl - Implement process control functions for this security module
1258  * @option: The process control function requested
1259  * @arg2: The argument data for this function
1260  * @arg3: The argument data for this function
1261  * @arg4: The argument data for this function
1262  * @arg5: The argument data for this function
1263  *
1264  * Allow process control functions (sys_prctl()) to alter capabilities; may
1265  * also deny access to other functions not otherwise implemented here.
1266  *
1267  * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
1268  * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
1269  * modules will consider performing the function.
1270  */
1271 int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1272            unsigned long arg4, unsigned long arg5)
1273 {
1274     const struct cred *old = current_cred();
1275     struct cred *new;
1276
1277     switch (option) {
1278     case PR_CAPBSET_READ:
1279         if (!cap_valid(arg2))
1280             return -EINVAL;
1281         return !!cap_raised(old->cap_bset, arg2);
1282
1283     case PR_CAPBSET_DROP:
1284         return cap_prctl_drop(arg2);
1285
1286     /*
1287      * The next four prctl's remain to assist with transitioning a
1288      * system from legacy UID=0 based privilege (when filesystem
1289      * capabilities are not in use) to a system using filesystem
1290      * capabilities only - as the POSIX.1e draft intended.
1291      *
1292      * Note:
1293      *
1294      *  PR_SET_SECUREBITS =
1295      *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1296      *    | issecure_mask(SECURE_NOROOT)
1297      *    | issecure_mask(SECURE_NOROOT_LOCKED)
1298      *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
1299      *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1300      *
1301      * will ensure that the current process and all of its
1302      * children will be locked into a pure
1303      * capability-based-privilege environment.
1304      */
1305     case PR_SET_SECUREBITS:
1306         if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
1307              & (old->securebits ^ arg2))            /*[1]*/
1308             || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))   /*[2]*/
1309             || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))   /*[3]*/
1310             || (cap_capable(current_cred(),
1311                     current_cred()->user_ns,
1312                     CAP_SETPCAP,
1313                     CAP_OPT_NONE) != 0)         /*[4]*/
1314             /*
1315              * [1] no changing of bits that are locked
1316              * [2] no unlocking of locks
1317              * [3] no setting of unsupported bits
1318              * [4] doing anything requires privilege (go read about
1319              *     the "sendmail capabilities bug")
1320              */
1321             )
1322             /* cannot change a locked bit */
1323             return -EPERM;
1324
1325         new = prepare_creds();
1326         if (!new)
1327             return -ENOMEM;
1328         new->securebits = arg2;
1329         return commit_creds(new);
1330
1331     case PR_GET_SECUREBITS:
1332         return old->securebits;
1333
1334     case PR_GET_KEEPCAPS:
1335         return !!issecure(SECURE_KEEP_CAPS);
1336
1337     case PR_SET_KEEPCAPS:
1338         if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
1339             return -EINVAL;
1340         if (issecure(SECURE_KEEP_CAPS_LOCKED))
1341             return -EPERM;
1342
1343         new = prepare_creds();
1344         if (!new)
1345             return -ENOMEM;
1346         if (arg2)
1347             new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
1348         else
1349             new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
1350         return commit_creds(new);
1351
1352     case PR_CAP_AMBIENT:
1353         if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
1354             if (arg3 | arg4 | arg5)
1355                 return -EINVAL;
1356
1357             new = prepare_creds();
1358             if (!new)
1359                 return -ENOMEM;
1360             cap_clear(new->cap_ambient);
1361             return commit_creds(new);
1362         }
1363
1364         if (((!cap_valid(arg3)) | arg4 | arg5))
1365             return -EINVAL;
1366
1367         if (arg2 == PR_CAP_AMBIENT_IS_SET) {
1368             return !!cap_raised(current_cred()->cap_ambient, arg3);
1369         } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
1370                arg2 != PR_CAP_AMBIENT_LOWER) {
1371             return -EINVAL;
1372         } else {
1373             if (arg2 == PR_CAP_AMBIENT_RAISE &&
1374                 (!cap_raised(current_cred()->cap_permitted, arg3) ||
1375                  !cap_raised(current_cred()->cap_inheritable,
1376                      arg3) ||
1377                  issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
1378                 return -EPERM;
1379
1380             new = prepare_creds();
1381             if (!new)
1382                 return -ENOMEM;
1383             if (arg2 == PR_CAP_AMBIENT_RAISE)
1384                 cap_raise(new->cap_ambient, arg3);
1385             else
1386                 cap_lower(new->cap_ambient, arg3);
1387             return commit_creds(new);
1388         }
1389
1390     default:
1391         /* No functionality available - continue with default */
1392         return -ENOSYS;
1393     }
1394 }
1395
1396 /**
1397  * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1398  * @mm: The VM space in which the new mapping is to be made
1399  * @pages: The size of the mapping
1400  *
1401  * Determine whether the allocation of a new virtual mapping by the current
1402  * task is permitted.
1403  *
1404  * Return: 1 if permission is granted, 0 if not.
1405  */
1406 int cap_vm_enough_memory(struct mm_struct *mm, long pages)
1407 {
1408     int cap_sys_admin = 0;
1409
1410     if (cap_capable(current_cred(), &init_user_ns,
1411                 CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
1412         cap_sys_admin = 1;
1413
1414     return cap_sys_admin;
1415 }
1416
1417 /**
1418  * cap_mmap_addr - check if able to map given addr
1419  * @addr: address attempting to be mapped
1420  *
1421  * If the process is attempting to map memory below dac_mmap_min_addr they need
1422  * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
1423  * capability security module.
1424  *
1425  * Return: 0 if this mapping should be allowed or -EPERM if not.
1426  */
1427 int cap_mmap_addr(unsigned long addr)
1428 {
1429     int ret = 0;
1430
1431     if (addr < dac_mmap_min_addr) {
1432         ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
1433                   CAP_OPT_NONE);
1434         /* set PF_SUPERPRIV if it turns out we allow the low mmap */
1435         if (ret == 0)
1436             current->flags |= PF_SUPERPRIV;
1437     }
1438     return ret;
1439 }
1440
/*
 * cap_mmap_file - mmap_file hook of the capability module.
 *
 * None of the arguments are examined; the mapping is always allowed.
 * Returns 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
          unsigned long prot, unsigned long flags)
{
    const int allowed = 0;

    return allowed;
}
1446
1447 #ifdef CONFIG_SECURITY
1448
1449 static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
1450     LSM_HOOK_INIT(capable, cap_capable),
1451     LSM_HOOK_INIT(settime, cap_settime),
1452     LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
1453     LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
1454     LSM_HOOK_INIT(capget, cap_capget),
1455     LSM_HOOK_INIT(capset, cap_capset),
1456     LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
1457     LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
1458     LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
1459     LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
1460     LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
1461     LSM_HOOK_INIT(mmap_file, cap_mmap_file),
1462     LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
1463     LSM_HOOK_INIT(task_prctl, cap_task_prctl),
1464     LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
1465     LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
1466     LSM_HOOK_INIT(task_setnice, cap_task_setnice),
1467     LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
1468 };
1469
1470 static int __init capability_init(void)
1471 {
1472     security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
1473                 "capability");
1474     return 0;
1475 }
1476
1477 DEFINE_LSM(capability) = {
1478     .name = "capability",
1479     .order = LSM_ORDER_FIRST,
1480     .init = capability_init,
1481 };
1482
1483 #endif /* CONFIG_SECURITY */