Back to home page

LXR

 
 

    


0001 /*
0002  *  linux/kernel/sys.c
0003  *
0004  *  Copyright (C) 1991, 1992  Linus Torvalds
0005  */
0006 
0007 #include <linux/export.h>
0008 #include <linux/mm.h>
0009 #include <linux/utsname.h>
0010 #include <linux/mman.h>
0011 #include <linux/reboot.h>
0012 #include <linux/prctl.h>
0013 #include <linux/highuid.h>
0014 #include <linux/fs.h>
0015 #include <linux/kmod.h>
0016 #include <linux/perf_event.h>
0017 #include <linux/resource.h>
0018 #include <linux/kernel.h>
0019 #include <linux/workqueue.h>
0020 #include <linux/capability.h>
0021 #include <linux/device.h>
0022 #include <linux/key.h>
0023 #include <linux/times.h>
0024 #include <linux/posix-timers.h>
0025 #include <linux/security.h>
0026 #include <linux/dcookies.h>
0027 #include <linux/suspend.h>
0028 #include <linux/tty.h>
0029 #include <linux/signal.h>
0030 #include <linux/cn_proc.h>
0031 #include <linux/getcpu.h>
0032 #include <linux/task_io_accounting_ops.h>
0033 #include <linux/seccomp.h>
0034 #include <linux/cpu.h>
0035 #include <linux/personality.h>
0036 #include <linux/ptrace.h>
0037 #include <linux/fs_struct.h>
0038 #include <linux/file.h>
0039 #include <linux/mount.h>
0040 #include <linux/gfp.h>
0041 #include <linux/syscore_ops.h>
0042 #include <linux/version.h>
0043 #include <linux/ctype.h>
0044 
0045 #include <linux/compat.h>
0046 #include <linux/syscalls.h>
0047 #include <linux/kprobes.h>
0048 #include <linux/user_namespace.h>
0049 #include <linux/binfmts.h>
0050 
0051 #include <linux/sched.h>
0052 #include <linux/rcupdate.h>
0053 #include <linux/uidgid.h>
0054 #include <linux/cred.h>
0055 
0056 #include <linux/kmsg_dump.h>
0057 /* Move somewhere else to avoid recompiling? */
0058 #include <generated/utsrelease.h>
0059 
0060 #include <linux/uaccess.h>
0061 #include <asm/io.h>
0062 #include <asm/unistd.h>
0063 
/*
 * Fallback definitions for control macros that the headers included above
 * may leave undefined.  Each macro is only defined here when no earlier
 * definition exists; the fallback ignores its arguments and evaluates to
 * -EINVAL.
 */
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a, b)          (-EINVAL)
#endif
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b)          (-EINVAL)
#endif

#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a, b)            (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a, b)            (-EINVAL)
#endif

#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a, b)            (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a, b)            (-EINVAL)
#endif

#ifndef GET_ENDIAN
# define GET_ENDIAN(a, b)               (-EINVAL)
#endif
#ifndef SET_ENDIAN
# define SET_ENDIAN(a, b)               (-EINVAL)
#endif

#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a)                 (-EINVAL)
#endif
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a)                 (-EINVAL)
#endif

#ifndef MPX_ENABLE_MANAGEMENT
# define MPX_ENABLE_MANAGEMENT()        (-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
# define MPX_DISABLE_MANAGEMENT()       (-EINVAL)
#endif

#ifndef GET_FP_MODE
# define GET_FP_MODE(a)                 (-EINVAL)
#endif
#ifndef SET_FP_MODE
# define SET_FP_MODE(a, b)              (-EINVAL)
#endif
0106 
0107 /*
0108  * this is where the system-wide overflow UID and GID are defined, for
0109  * architectures that now have 32-bit UID/GID but didn't in the past
0110  */
0111 
0112 int overflowuid = DEFAULT_OVERFLOWUID;
0113 int overflowgid = DEFAULT_OVERFLOWGID;
0114 
0115 EXPORT_SYMBOL(overflowuid);
0116 EXPORT_SYMBOL(overflowgid);
0117 
0118 /*
0119  * the same as above, but for filesystems which can only store a 16-bit
0120  * UID and GID. as such, this is needed on all architectures
0121  */
0122 
0123 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
0124 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
0125 
0126 EXPORT_SYMBOL(fs_overflowuid);
0127 EXPORT_SYMBOL(fs_overflowgid);
0128 
0129 /*
0130  * Returns true if current's euid is same as p's uid or euid,
0131  * or has CAP_SYS_NICE to p's user_ns.
0132  *
0133  * Called with rcu_read_lock, creds are safe
0134  */
0135 static bool set_one_prio_perm(struct task_struct *p)
0136 {
0137     const struct cred *cred = current_cred(), *pcred = __task_cred(p);
0138 
0139     if (uid_eq(pcred->uid,  cred->euid) ||
0140         uid_eq(pcred->euid, cred->euid))
0141         return true;
0142     if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
0143         return true;
0144     return false;
0145 }
0146 
0147 /*
0148  * set the priority of a task
0149  * - the caller must hold the RCU read lock
0150  */
0151 static int set_one_prio(struct task_struct *p, int niceval, int error)
0152 {
0153     int no_nice;
0154 
0155     if (!set_one_prio_perm(p)) {
0156         error = -EPERM;
0157         goto out;
0158     }
0159     if (niceval < task_nice(p) && !can_nice(p, niceval)) {
0160         error = -EACCES;
0161         goto out;
0162     }
0163     no_nice = security_task_setnice(p, niceval);
0164     if (no_nice) {
0165         error = no_nice;
0166         goto out;
0167     }
0168     if (error == -ESRCH)
0169         error = 0;
0170     set_user_nice(p, niceval);
0171 out:
0172     return error;
0173 }
0174 
0175 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
0176 {
0177     struct task_struct *g, *p;
0178     struct user_struct *user;
0179     const struct cred *cred = current_cred();
0180     int error = -EINVAL;
0181     struct pid *pgrp;
0182     kuid_t uid;
0183 
0184     if (which > PRIO_USER || which < PRIO_PROCESS)
0185         goto out;
0186 
0187     /* normalize: avoid signed division (rounding problems) */
0188     error = -ESRCH;
0189     if (niceval < MIN_NICE)
0190         niceval = MIN_NICE;
0191     if (niceval > MAX_NICE)
0192         niceval = MAX_NICE;
0193 
0194     rcu_read_lock();
0195     read_lock(&tasklist_lock);
0196     switch (which) {
0197     case PRIO_PROCESS:
0198         if (who)
0199             p = find_task_by_vpid(who);
0200         else
0201             p = current;
0202         if (p)
0203             error = set_one_prio(p, niceval, error);
0204         break;
0205     case PRIO_PGRP:
0206         if (who)
0207             pgrp = find_vpid(who);
0208         else
0209             pgrp = task_pgrp(current);
0210         do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
0211             error = set_one_prio(p, niceval, error);
0212         } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
0213         break;
0214     case PRIO_USER:
0215         uid = make_kuid(cred->user_ns, who);
0216         user = cred->user;
0217         if (!who)
0218             uid = cred->uid;
0219         else if (!uid_eq(uid, cred->uid)) {
0220             user = find_user(uid);
0221             if (!user)
0222                 goto out_unlock;    /* No processes for this user */
0223         }
0224         do_each_thread(g, p) {
0225             if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
0226                 error = set_one_prio(p, niceval, error);
0227         } while_each_thread(g, p);
0228         if (!uid_eq(uid, cred->uid))
0229             free_uid(user);     /* For find_user() */
0230         break;
0231     }
0232 out_unlock:
0233     read_unlock(&tasklist_lock);
0234     rcu_read_unlock();
0235 out:
0236     return error;
0237 }
0238 
0239 /*
0240  * Ugh. To avoid negative return values, "getpriority()" will
0241  * not return the normal nice-value, but a negated value that
0242  * has been offset by 20 (ie it returns 40..1 instead of -20..19)
0243  * to stay compatible.
0244  */
0245 SYSCALL_DEFINE2(getpriority, int, which, int, who)
0246 {
0247     struct task_struct *g, *p;
0248     struct user_struct *user;
0249     const struct cred *cred = current_cred();
0250     long niceval, retval = -ESRCH;
0251     struct pid *pgrp;
0252     kuid_t uid;
0253 
0254     if (which > PRIO_USER || which < PRIO_PROCESS)
0255         return -EINVAL;
0256 
0257     rcu_read_lock();
0258     read_lock(&tasklist_lock);
0259     switch (which) {
0260     case PRIO_PROCESS:
0261         if (who)
0262             p = find_task_by_vpid(who);
0263         else
0264             p = current;
0265         if (p) {
0266             niceval = nice_to_rlimit(task_nice(p));
0267             if (niceval > retval)
0268                 retval = niceval;
0269         }
0270         break;
0271     case PRIO_PGRP:
0272         if (who)
0273             pgrp = find_vpid(who);
0274         else
0275             pgrp = task_pgrp(current);
0276         do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
0277             niceval = nice_to_rlimit(task_nice(p));
0278             if (niceval > retval)
0279                 retval = niceval;
0280         } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
0281         break;
0282     case PRIO_USER:
0283         uid = make_kuid(cred->user_ns, who);
0284         user = cred->user;
0285         if (!who)
0286             uid = cred->uid;
0287         else if (!uid_eq(uid, cred->uid)) {
0288             user = find_user(uid);
0289             if (!user)
0290                 goto out_unlock;    /* No processes for this user */
0291         }
0292         do_each_thread(g, p) {
0293             if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
0294                 niceval = nice_to_rlimit(task_nice(p));
0295                 if (niceval > retval)
0296                     retval = niceval;
0297             }
0298         } while_each_thread(g, p);
0299         if (!uid_eq(uid, cred->uid))
0300             free_uid(user);     /* for find_user() */
0301         break;
0302     }
0303 out_unlock:
0304     read_unlock(&tasklist_lock);
0305     rcu_read_unlock();
0306 
0307     return retval;
0308 }
0309 
/*
 * Unprivileged users may change the real gid to the effective gid
 * or vice versa.  (BSD-style)
 *
 * If you set the real gid at all, or set the effective gid to a value not
 * equal to the real gid, then the saved gid is set to the new effective gid.
 *
 * This makes it possible for a setgid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setregid() will be
 * 100% compatible with BSD.  A program which uses just setgid() will be
 * 100% compatible with POSIX with saved IDs.
 *
 * SMP: There are no races, the GIDs are checked only by filesystem
 *      operations (as far as semantic preservation is concerned).
 */
0328 #ifdef CONFIG_MULTIUSER
0329 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
0330 {
0331     struct user_namespace *ns = current_user_ns();
0332     const struct cred *old;
0333     struct cred *new;
0334     int retval;
0335     kgid_t krgid, kegid;
0336 
0337     krgid = make_kgid(ns, rgid);
0338     kegid = make_kgid(ns, egid);
0339 
0340     if ((rgid != (gid_t) -1) && !gid_valid(krgid))
0341         return -EINVAL;
0342     if ((egid != (gid_t) -1) && !gid_valid(kegid))
0343         return -EINVAL;
0344 
0345     new = prepare_creds();
0346     if (!new)
0347         return -ENOMEM;
0348     old = current_cred();
0349 
0350     retval = -EPERM;
0351     if (rgid != (gid_t) -1) {
0352         if (gid_eq(old->gid, krgid) ||
0353             gid_eq(old->egid, krgid) ||
0354             ns_capable(old->user_ns, CAP_SETGID))
0355             new->gid = krgid;
0356         else
0357             goto error;
0358     }
0359     if (egid != (gid_t) -1) {
0360         if (gid_eq(old->gid, kegid) ||
0361             gid_eq(old->egid, kegid) ||
0362             gid_eq(old->sgid, kegid) ||
0363             ns_capable(old->user_ns, CAP_SETGID))
0364             new->egid = kegid;
0365         else
0366             goto error;
0367     }
0368 
0369     if (rgid != (gid_t) -1 ||
0370         (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
0371         new->sgid = new->egid;
0372     new->fsgid = new->egid;
0373 
0374     return commit_creds(new);
0375 
0376 error:
0377     abort_creds(new);
0378     return retval;
0379 }
0380 
0381 /*
0382  * setgid() is implemented like SysV w/ SAVED_IDS
0383  *
0384  * SMP: Same implicit races as above.
0385  */
0386 SYSCALL_DEFINE1(setgid, gid_t, gid)
0387 {
0388     struct user_namespace *ns = current_user_ns();
0389     const struct cred *old;
0390     struct cred *new;
0391     int retval;
0392     kgid_t kgid;
0393 
0394     kgid = make_kgid(ns, gid);
0395     if (!gid_valid(kgid))
0396         return -EINVAL;
0397 
0398     new = prepare_creds();
0399     if (!new)
0400         return -ENOMEM;
0401     old = current_cred();
0402 
0403     retval = -EPERM;
0404     if (ns_capable(old->user_ns, CAP_SETGID))
0405         new->gid = new->egid = new->sgid = new->fsgid = kgid;
0406     else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
0407         new->egid = new->fsgid = kgid;
0408     else
0409         goto error;
0410 
0411     return commit_creds(new);
0412 
0413 error:
0414     abort_creds(new);
0415     return retval;
0416 }
0417 
0418 /*
0419  * change the user struct in a credentials set to match the new UID
0420  */
0421 static int set_user(struct cred *new)
0422 {
0423     struct user_struct *new_user;
0424 
0425     new_user = alloc_uid(new->uid);
0426     if (!new_user)
0427         return -EAGAIN;
0428 
0429     /*
0430      * We don't fail in case of NPROC limit excess here because too many
0431      * poorly written programs don't check set*uid() return code, assuming
0432      * it never fails if called by root.  We may still enforce NPROC limit
0433      * for programs doing set*uid()+execve() by harmlessly deferring the
0434      * failure to the execve() stage.
0435      */
0436     if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
0437             new_user != INIT_USER)
0438         current->flags |= PF_NPROC_EXCEEDED;
0439     else
0440         current->flags &= ~PF_NPROC_EXCEEDED;
0441 
0442     free_uid(new->user);
0443     new->user = new_user;
0444     return 0;
0445 }
0446 
0447 /*
0448  * Unprivileged users may change the real uid to the effective uid
0449  * or vice versa.  (BSD-style)
0450  *
0451  * If you set the real uid at all, or set the effective uid to a value not
0452  * equal to the real uid, then the saved uid is set to the new effective uid.
0453  *
0454  * This makes it possible for a setuid program to completely drop its
0455  * privileges, which is often a useful assertion to make when you are doing
0456  * a security audit over a program.
0457  *
0458  * The general idea is that a program which uses just setreuid() will be
0459  * 100% compatible with BSD.  A program which uses just setuid() will be
0460  * 100% compatible with POSIX with saved IDs.
0461  */
0462 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
0463 {
0464     struct user_namespace *ns = current_user_ns();
0465     const struct cred *old;
0466     struct cred *new;
0467     int retval;
0468     kuid_t kruid, keuid;
0469 
0470     kruid = make_kuid(ns, ruid);
0471     keuid = make_kuid(ns, euid);
0472 
0473     if ((ruid != (uid_t) -1) && !uid_valid(kruid))
0474         return -EINVAL;
0475     if ((euid != (uid_t) -1) && !uid_valid(keuid))
0476         return -EINVAL;
0477 
0478     new = prepare_creds();
0479     if (!new)
0480         return -ENOMEM;
0481     old = current_cred();
0482 
0483     retval = -EPERM;
0484     if (ruid != (uid_t) -1) {
0485         new->uid = kruid;
0486         if (!uid_eq(old->uid, kruid) &&
0487             !uid_eq(old->euid, kruid) &&
0488             !ns_capable(old->user_ns, CAP_SETUID))
0489             goto error;
0490     }
0491 
0492     if (euid != (uid_t) -1) {
0493         new->euid = keuid;
0494         if (!uid_eq(old->uid, keuid) &&
0495             !uid_eq(old->euid, keuid) &&
0496             !uid_eq(old->suid, keuid) &&
0497             !ns_capable(old->user_ns, CAP_SETUID))
0498             goto error;
0499     }
0500 
0501     if (!uid_eq(new->uid, old->uid)) {
0502         retval = set_user(new);
0503         if (retval < 0)
0504             goto error;
0505     }
0506     if (ruid != (uid_t) -1 ||
0507         (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
0508         new->suid = new->euid;
0509     new->fsuid = new->euid;
0510 
0511     retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
0512     if (retval < 0)
0513         goto error;
0514 
0515     return commit_creds(new);
0516 
0517 error:
0518     abort_creds(new);
0519     return retval;
0520 }
0521 
0522 /*
0523  * setuid() is implemented like SysV with SAVED_IDS
0524  *
0525  * Note that SAVED_ID's is deficient in that a setuid root program
0526  * like sendmail, for example, cannot set its uid to be a normal
0527  * user and then switch back, because if you're root, setuid() sets
0528  * the saved uid too.  If you don't like this, blame the bright people
0529  * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
0530  * will allow a root program to temporarily drop privileges and be able to
0531  * regain them by swapping the real and effective uid.
0532  */
0533 SYSCALL_DEFINE1(setuid, uid_t, uid)
0534 {
0535     struct user_namespace *ns = current_user_ns();
0536     const struct cred *old;
0537     struct cred *new;
0538     int retval;
0539     kuid_t kuid;
0540 
0541     kuid = make_kuid(ns, uid);
0542     if (!uid_valid(kuid))
0543         return -EINVAL;
0544 
0545     new = prepare_creds();
0546     if (!new)
0547         return -ENOMEM;
0548     old = current_cred();
0549 
0550     retval = -EPERM;
0551     if (ns_capable(old->user_ns, CAP_SETUID)) {
0552         new->suid = new->uid = kuid;
0553         if (!uid_eq(kuid, old->uid)) {
0554             retval = set_user(new);
0555             if (retval < 0)
0556                 goto error;
0557         }
0558     } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
0559         goto error;
0560     }
0561 
0562     new->fsuid = new->euid = kuid;
0563 
0564     retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
0565     if (retval < 0)
0566         goto error;
0567 
0568     return commit_creds(new);
0569 
0570 error:
0571     abort_creds(new);
0572     return retval;
0573 }
0574 
0575 
0576 /*
0577  * This function implements a generic ability to update ruid, euid,
0578  * and suid.  This allows you to implement the 4.4 compatible seteuid().
0579  */
0580 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
0581 {
0582     struct user_namespace *ns = current_user_ns();
0583     const struct cred *old;
0584     struct cred *new;
0585     int retval;
0586     kuid_t kruid, keuid, ksuid;
0587 
0588     kruid = make_kuid(ns, ruid);
0589     keuid = make_kuid(ns, euid);
0590     ksuid = make_kuid(ns, suid);
0591 
0592     if ((ruid != (uid_t) -1) && !uid_valid(kruid))
0593         return -EINVAL;
0594 
0595     if ((euid != (uid_t) -1) && !uid_valid(keuid))
0596         return -EINVAL;
0597 
0598     if ((suid != (uid_t) -1) && !uid_valid(ksuid))
0599         return -EINVAL;
0600 
0601     new = prepare_creds();
0602     if (!new)
0603         return -ENOMEM;
0604 
0605     old = current_cred();
0606 
0607     retval = -EPERM;
0608     if (!ns_capable(old->user_ns, CAP_SETUID)) {
0609         if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
0610             !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
0611             goto error;
0612         if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
0613             !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
0614             goto error;
0615         if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
0616             !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
0617             goto error;
0618     }
0619 
0620     if (ruid != (uid_t) -1) {
0621         new->uid = kruid;
0622         if (!uid_eq(kruid, old->uid)) {
0623             retval = set_user(new);
0624             if (retval < 0)
0625                 goto error;
0626         }
0627     }
0628     if (euid != (uid_t) -1)
0629         new->euid = keuid;
0630     if (suid != (uid_t) -1)
0631         new->suid = ksuid;
0632     new->fsuid = new->euid;
0633 
0634     retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
0635     if (retval < 0)
0636         goto error;
0637 
0638     return commit_creds(new);
0639 
0640 error:
0641     abort_creds(new);
0642     return retval;
0643 }
0644 
0645 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
0646 {
0647     const struct cred *cred = current_cred();
0648     int retval;
0649     uid_t ruid, euid, suid;
0650 
0651     ruid = from_kuid_munged(cred->user_ns, cred->uid);
0652     euid = from_kuid_munged(cred->user_ns, cred->euid);
0653     suid = from_kuid_munged(cred->user_ns, cred->suid);
0654 
0655     retval = put_user(ruid, ruidp);
0656     if (!retval) {
0657         retval = put_user(euid, euidp);
0658         if (!retval)
0659             return put_user(suid, suidp);
0660     }
0661     return retval;
0662 }
0663 
0664 /*
0665  * Same as above, but for rgid, egid, sgid.
0666  */
0667 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
0668 {
0669     struct user_namespace *ns = current_user_ns();
0670     const struct cred *old;
0671     struct cred *new;
0672     int retval;
0673     kgid_t krgid, kegid, ksgid;
0674 
0675     krgid = make_kgid(ns, rgid);
0676     kegid = make_kgid(ns, egid);
0677     ksgid = make_kgid(ns, sgid);
0678 
0679     if ((rgid != (gid_t) -1) && !gid_valid(krgid))
0680         return -EINVAL;
0681     if ((egid != (gid_t) -1) && !gid_valid(kegid))
0682         return -EINVAL;
0683     if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
0684         return -EINVAL;
0685 
0686     new = prepare_creds();
0687     if (!new)
0688         return -ENOMEM;
0689     old = current_cred();
0690 
0691     retval = -EPERM;
0692     if (!ns_capable(old->user_ns, CAP_SETGID)) {
0693         if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
0694             !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
0695             goto error;
0696         if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
0697             !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
0698             goto error;
0699         if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
0700             !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
0701             goto error;
0702     }
0703 
0704     if (rgid != (gid_t) -1)
0705         new->gid = krgid;
0706     if (egid != (gid_t) -1)
0707         new->egid = kegid;
0708     if (sgid != (gid_t) -1)
0709         new->sgid = ksgid;
0710     new->fsgid = new->egid;
0711 
0712     return commit_creds(new);
0713 
0714 error:
0715     abort_creds(new);
0716     return retval;
0717 }
0718 
0719 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
0720 {
0721     const struct cred *cred = current_cred();
0722     int retval;
0723     gid_t rgid, egid, sgid;
0724 
0725     rgid = from_kgid_munged(cred->user_ns, cred->gid);
0726     egid = from_kgid_munged(cred->user_ns, cred->egid);
0727     sgid = from_kgid_munged(cred->user_ns, cred->sgid);
0728 
0729     retval = put_user(rgid, rgidp);
0730     if (!retval) {
0731         retval = put_user(egid, egidp);
0732         if (!retval)
0733             retval = put_user(sgid, sgidp);
0734     }
0735 
0736     return retval;
0737 }
0738 
0739 
0740 /*
0741  * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
0742  * is used for "access()" and for the NFS daemon (letting nfsd stay at
0743  * whatever uid it wants to). It normally shadows "euid", except when
0744  * explicitly set by setfsuid() or for access..
0745  */
0746 SYSCALL_DEFINE1(setfsuid, uid_t, uid)
0747 {
0748     const struct cred *old;
0749     struct cred *new;
0750     uid_t old_fsuid;
0751     kuid_t kuid;
0752 
0753     old = current_cred();
0754     old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
0755 
0756     kuid = make_kuid(old->user_ns, uid);
0757     if (!uid_valid(kuid))
0758         return old_fsuid;
0759 
0760     new = prepare_creds();
0761     if (!new)
0762         return old_fsuid;
0763 
0764     if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
0765         uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
0766         ns_capable(old->user_ns, CAP_SETUID)) {
0767         if (!uid_eq(kuid, old->fsuid)) {
0768             new->fsuid = kuid;
0769             if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
0770                 goto change_okay;
0771         }
0772     }
0773 
0774     abort_creds(new);
0775     return old_fsuid;
0776 
0777 change_okay:
0778     commit_creds(new);
0779     return old_fsuid;
0780 }
0781 
0782 /*
0783  * Samma på svenska..
0784  */
0785 SYSCALL_DEFINE1(setfsgid, gid_t, gid)
0786 {
0787     const struct cred *old;
0788     struct cred *new;
0789     gid_t old_fsgid;
0790     kgid_t kgid;
0791 
0792     old = current_cred();
0793     old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
0794 
0795     kgid = make_kgid(old->user_ns, gid);
0796     if (!gid_valid(kgid))
0797         return old_fsgid;
0798 
0799     new = prepare_creds();
0800     if (!new)
0801         return old_fsgid;
0802 
0803     if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
0804         gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
0805         ns_capable(old->user_ns, CAP_SETGID)) {
0806         if (!gid_eq(kgid, old->fsgid)) {
0807             new->fsgid = kgid;
0808             goto change_okay;
0809         }
0810     }
0811 
0812     abort_creds(new);
0813     return old_fsgid;
0814 
0815 change_okay:
0816     commit_creds(new);
0817     return old_fsgid;
0818 }
0819 #endif /* CONFIG_MULTIUSER */
0820 
0821 /**
0822  * sys_getpid - return the thread group id of the current process
0823  *
0824  * Note, despite the name, this returns the tgid not the pid.  The tgid and
0825  * the pid are identical unless CLONE_THREAD was specified on clone() in
0826  * which case the tgid is the same in all threads of the same group.
0827  *
0828  * This is SMP safe as current->tgid does not change.
0829  */
0830 SYSCALL_DEFINE0(getpid)
0831 {
0832     return task_tgid_vnr(current);
0833 }
0834 
0835 /* Thread ID - the internal kernel "pid" */
0836 SYSCALL_DEFINE0(gettid)
0837 {
0838     return task_pid_vnr(current);
0839 }
0840 
0841 /*
0842  * Accessing ->real_parent is not SMP-safe, it could
0843  * change from under us. However, we can use a stale
0844  * value of ->real_parent under rcu_read_lock(), see
0845  * release_task()->call_rcu(delayed_put_task_struct).
0846  */
0847 SYSCALL_DEFINE0(getppid)
0848 {
0849     int pid;
0850 
0851     rcu_read_lock();
0852     pid = task_tgid_vnr(rcu_dereference(current->real_parent));
0853     rcu_read_unlock();
0854 
0855     return pid;
0856 }
0857 
0858 SYSCALL_DEFINE0(getuid)
0859 {
0860     /* Only we change this so SMP safe */
0861     return from_kuid_munged(current_user_ns(), current_uid());
0862 }
0863 
0864 SYSCALL_DEFINE0(geteuid)
0865 {
0866     /* Only we change this so SMP safe */
0867     return from_kuid_munged(current_user_ns(), current_euid());
0868 }
0869 
0870 SYSCALL_DEFINE0(getgid)
0871 {
0872     /* Only we change this so SMP safe */
0873     return from_kgid_munged(current_user_ns(), current_gid());
0874 }
0875 
0876 SYSCALL_DEFINE0(getegid)
0877 {
0878     /* Only we change this so SMP safe */
0879     return from_kgid_munged(current_user_ns(), current_egid());
0880 }
0881 
0882 void do_sys_times(struct tms *tms)
0883 {
0884     cputime_t tgutime, tgstime, cutime, cstime;
0885 
0886     thread_group_cputime_adjusted(current, &tgutime, &tgstime);
0887     cutime = current->signal->cutime;
0888     cstime = current->signal->cstime;
0889     tms->tms_utime = cputime_to_clock_t(tgutime);
0890     tms->tms_stime = cputime_to_clock_t(tgstime);
0891     tms->tms_cutime = cputime_to_clock_t(cutime);
0892     tms->tms_cstime = cputime_to_clock_t(cstime);
0893 }
0894 
0895 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
0896 {
0897     if (tbuf) {
0898         struct tms tmp;
0899 
0900         do_sys_times(&tmp);
0901         if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
0902             return -EFAULT;
0903     }
0904     force_successful_syscall_return();
0905     return (long) jiffies_64_to_clock_t(get_jiffies_64());
0906 }
0907 
0908 /*
0909  * This needs some heavy checking ...
0910  * I just haven't the stomach for it. I also don't fully
0911  * understand sessions/pgrp etc. Let somebody who does explain it.
0912  *
0913  * OK, I think I have the protection semantics right.... this is really
0914  * only important on a multi-user system anyway, to make sure one user
0915  * can't send a signal to a process owned by another.  -TYT, 12/12/91
0916  *
0917  * !PF_FORKNOEXEC check to conform completely to POSIX.
0918  */
0919 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
0920 {
0921     struct task_struct *p;
0922     struct task_struct *group_leader = current->group_leader;
0923     struct pid *pgrp;
0924     int err;
0925 
0926     if (!pid)
0927         pid = task_pid_vnr(group_leader);
0928     if (!pgid)
0929         pgid = pid;
0930     if (pgid < 0)
0931         return -EINVAL;
0932     rcu_read_lock();
0933 
0934     /* From this point forward we keep holding onto the tasklist lock
0935      * so that our parent does not change from under us. -DaveM
0936      */
0937     write_lock_irq(&tasklist_lock);
0938 
0939     err = -ESRCH;
0940     p = find_task_by_vpid(pid);
0941     if (!p)
0942         goto out;
0943 
0944     err = -EINVAL;
0945     if (!thread_group_leader(p))
0946         goto out;
0947 
0948     if (same_thread_group(p->real_parent, group_leader)) {
0949         err = -EPERM;
0950         if (task_session(p) != task_session(group_leader))
0951             goto out;
0952         err = -EACCES;
0953         if (!(p->flags & PF_FORKNOEXEC))
0954             goto out;
0955     } else {
0956         err = -ESRCH;
0957         if (p != group_leader)
0958             goto out;
0959     }
0960 
0961     err = -EPERM;
0962     if (p->signal->leader)
0963         goto out;
0964 
0965     pgrp = task_pid(p);
0966     if (pgid != pid) {
0967         struct task_struct *g;
0968 
0969         pgrp = find_vpid(pgid);
0970         g = pid_task(pgrp, PIDTYPE_PGID);
0971         if (!g || task_session(g) != task_session(group_leader))
0972             goto out;
0973     }
0974 
0975     err = security_task_setpgid(p, pgid);
0976     if (err)
0977         goto out;
0978 
0979     if (task_pgrp(p) != pgrp)
0980         change_pid(p, PIDTYPE_PGID, pgrp);
0981 
0982     err = 0;
0983 out:
0984     /* All paths lead to here, thus we are safe. -DaveM */
0985     write_unlock_irq(&tasklist_lock);
0986     rcu_read_unlock();
0987     return err;
0988 }
0989 
0990 SYSCALL_DEFINE1(getpgid, pid_t, pid)
0991 {
0992     struct task_struct *p;
0993     struct pid *grp;
0994     int retval;
0995 
0996     rcu_read_lock();
0997     if (!pid)
0998         grp = task_pgrp(current);
0999     else {
1000         retval = -ESRCH;
1001         p = find_task_by_vpid(pid);
1002         if (!p)
1003             goto out;
1004         grp = task_pgrp(p);
1005         if (!grp)
1006             goto out;
1007 
1008         retval = security_task_getpgid(p);
1009         if (retval)
1010             goto out;
1011     }
1012     retval = pid_vnr(grp);
1013 out:
1014     rcu_read_unlock();
1015     return retval;
1016 }
1017 
#ifdef __ARCH_WANT_SYS_GETPGRP

/* getpgrp() is getpgid() applied to the caller (pid 0). */
SYSCALL_DEFINE0(getpgrp)
{
    long pgid = sys_getpgid(0);

    return pgid;
}

#endif
1026 
1027 SYSCALL_DEFINE1(getsid, pid_t, pid)
1028 {
1029     struct task_struct *p;
1030     struct pid *sid;
1031     int retval;
1032 
1033     rcu_read_lock();
1034     if (!pid)
1035         sid = task_session(current);
1036     else {
1037         retval = -ESRCH;
1038         p = find_task_by_vpid(pid);
1039         if (!p)
1040             goto out;
1041         sid = task_session(p);
1042         if (!sid)
1043             goto out;
1044 
1045         retval = security_task_getsid(p);
1046         if (retval)
1047             goto out;
1048     }
1049     retval = pid_vnr(sid);
1050 out:
1051     rcu_read_unlock();
1052     return retval;
1053 }
1054 
1055 static void set_special_pids(struct pid *pid)
1056 {
1057     struct task_struct *curr = current->group_leader;
1058 
1059     if (task_session(curr) != pid)
1060         change_pid(curr, PIDTYPE_SID, pid);
1061 
1062     if (task_pgrp(curr) != pid)
1063         change_pid(curr, PIDTYPE_PGID, pid);
1064 }
1065 
1066 SYSCALL_DEFINE0(setsid)
1067 {
1068     struct task_struct *group_leader = current->group_leader;
1069     struct pid *sid = task_pid(group_leader);
1070     pid_t session = pid_vnr(sid);
1071     int err = -EPERM;
1072 
1073     write_lock_irq(&tasklist_lock);
1074     /* Fail if I am already a session leader */
1075     if (group_leader->signal->leader)
1076         goto out;
1077 
1078     /* Fail if a process group id already exists that equals the
1079      * proposed session id.
1080      */
1081     if (pid_task(sid, PIDTYPE_PGID))
1082         goto out;
1083 
1084     group_leader->signal->leader = 1;
1085     set_special_pids(sid);
1086 
1087     proc_clear_tty(group_leader);
1088 
1089     err = session;
1090 out:
1091     write_unlock_irq(&tasklist_lock);
1092     if (err > 0) {
1093         proc_sid_connector(group_leader);
1094         sched_autogroup_create_attach(group_leader);
1095     }
1096     return err;
1097 }
1098 
/*
 * uts_sem is taken for reading by the uname/gethostname paths in this file
 * and for writing by sethostname/setdomainname.
 */
DECLARE_RWSEM(uts_sem);

/*
 * override_architecture(name) - patch the machine string for 32-bit
 * personalities.
 *
 * When COMPAT_UTS_MACHINE is defined and the current personality is
 * PER_LINUX32, COMPAT_UTS_MACHINE is copied over the user-space
 * @name->machine field.  The expression is non-zero when that copy
 * failed and 0 otherwise; without COMPAT_UTS_MACHINE it is always 0.
 *
 * The argument is parenthesized so that any pointer expression can be
 * passed safely.
 */
#ifdef COMPAT_UTS_MACHINE
#define override_architecture(name) \
    (personality(current->personality) == PER_LINUX32 && \
     copy_to_user((name)->machine, COMPAT_UTS_MACHINE, \
              sizeof(COMPAT_UTS_MACHINE)))
#else
#define override_architecture(name) 0
#endif
1109 
1110 /*
1111  * Work around broken programs that cannot handle "Linux 3.0".
1112  * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1113  * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
1114  */
1115 static int override_release(char __user *release, size_t len)
1116 {
1117     int ret = 0;
1118 
1119     if (current->personality & UNAME26) {
1120         const char *rest = UTS_RELEASE;
1121         char buf[65] = { 0 };
1122         int ndots = 0;
1123         unsigned v;
1124         size_t copy;
1125 
1126         while (*rest) {
1127             if (*rest == '.' && ++ndots >= 3)
1128                 break;
1129             if (!isdigit(*rest) && *rest != '.')
1130                 break;
1131             rest++;
1132         }
1133         v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
1134         copy = clamp_t(size_t, len, 1, sizeof(buf));
1135         copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1136         ret = copy_to_user(release, buf, copy + 1);
1137     }
1138     return ret;
1139 }
1140 
1141 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1142 {
1143     int errno = 0;
1144 
1145     down_read(&uts_sem);
1146     if (copy_to_user(name, utsname(), sizeof *name))
1147         errno = -EFAULT;
1148     up_read(&uts_sem);
1149 
1150     if (!errno && override_release(name->release, sizeof(name->release)))
1151         errno = -EFAULT;
1152     if (!errno && override_architecture(name))
1153         errno = -EFAULT;
1154     return errno;
1155 }
1156 
1157 #ifdef __ARCH_WANT_SYS_OLD_UNAME
1158 /*
1159  * Old cruft
1160  */
1161 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1162 {
1163     int error = 0;
1164 
1165     if (!name)
1166         return -EFAULT;
1167 
1168     down_read(&uts_sem);
1169     if (copy_to_user(name, utsname(), sizeof(*name)))
1170         error = -EFAULT;
1171     up_read(&uts_sem);
1172 
1173     if (!error && override_release(name->release, sizeof(name->release)))
1174         error = -EFAULT;
1175     if (!error && override_architecture(name))
1176         error = -EFAULT;
1177     return error;
1178 }
1179 
1180 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1181 {
1182     int error;
1183 
1184     if (!name)
1185         return -EFAULT;
1186     if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1187         return -EFAULT;
1188 
1189     down_read(&uts_sem);
1190     error = __copy_to_user(&name->sysname, &utsname()->sysname,
1191                    __OLD_UTS_LEN);
1192     error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1193     error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1194                 __OLD_UTS_LEN);
1195     error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1196     error |= __copy_to_user(&name->release, &utsname()->release,
1197                 __OLD_UTS_LEN);
1198     error |= __put_user(0, name->release + __OLD_UTS_LEN);
1199     error |= __copy_to_user(&name->version, &utsname()->version,
1200                 __OLD_UTS_LEN);
1201     error |= __put_user(0, name->version + __OLD_UTS_LEN);
1202     error |= __copy_to_user(&name->machine, &utsname()->machine,
1203                 __OLD_UTS_LEN);
1204     error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1205     up_read(&uts_sem);
1206 
1207     if (!error && override_architecture(name))
1208         error = -EFAULT;
1209     if (!error && override_release(name->release, sizeof(name->release)))
1210         error = -EFAULT;
1211     return error ? -EFAULT : 0;
1212 }
1213 #endif
1214 
1215 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1216 {
1217     int errno;
1218     char tmp[__NEW_UTS_LEN];
1219 
1220     if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1221         return -EPERM;
1222 
1223     if (len < 0 || len > __NEW_UTS_LEN)
1224         return -EINVAL;
1225     down_write(&uts_sem);
1226     errno = -EFAULT;
1227     if (!copy_from_user(tmp, name, len)) {
1228         struct new_utsname *u = utsname();
1229 
1230         memcpy(u->nodename, tmp, len);
1231         memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1232         errno = 0;
1233         uts_proc_notify(UTS_PROC_HOSTNAME);
1234     }
1235     up_write(&uts_sem);
1236     return errno;
1237 }
1238 
1239 #ifdef __ARCH_WANT_SYS_GETHOSTNAME
1240 
1241 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
1242 {
1243     int i, errno;
1244     struct new_utsname *u;
1245 
1246     if (len < 0)
1247         return -EINVAL;
1248     down_read(&uts_sem);
1249     u = utsname();
1250     i = 1 + strlen(u->nodename);
1251     if (i > len)
1252         i = len;
1253     errno = 0;
1254     if (copy_to_user(name, u->nodename, i))
1255         errno = -EFAULT;
1256     up_read(&uts_sem);
1257     return errno;
1258 }
1259 
1260 #endif
1261 
1262 /*
1263  * Only setdomainname; getdomainname can be implemented by calling
1264  * uname()
1265  */
1266 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1267 {
1268     int errno;
1269     char tmp[__NEW_UTS_LEN];
1270 
1271     if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1272         return -EPERM;
1273     if (len < 0 || len > __NEW_UTS_LEN)
1274         return -EINVAL;
1275 
1276     down_write(&uts_sem);
1277     errno = -EFAULT;
1278     if (!copy_from_user(tmp, name, len)) {
1279         struct new_utsname *u = utsname();
1280 
1281         memcpy(u->domainname, tmp, len);
1282         memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1283         errno = 0;
1284         uts_proc_notify(UTS_PROC_DOMAINNAME);
1285     }
1286     up_write(&uts_sem);
1287     return errno;
1288 }
1289 
1290 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1291 {
1292     struct rlimit value;
1293     int ret;
1294 
1295     ret = do_prlimit(current, resource, NULL, &value);
1296     if (!ret)
1297         ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1298 
1299     return ret;
1300 }
1301 
1302 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1303 
1304 /*
1305  *  Back compatibility for getrlimit. Needed for some apps.
1306  */
1307 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1308         struct rlimit __user *, rlim)
1309 {
1310     struct rlimit x;
1311     if (resource >= RLIM_NLIMITS)
1312         return -EINVAL;
1313 
1314     task_lock(current->group_leader);
1315     x = current->signal->rlim[resource];
1316     task_unlock(current->group_leader);
1317     if (x.rlim_cur > 0x7FFFFFFF)
1318         x.rlim_cur = 0x7FFFFFFF;
1319     if (x.rlim_max > 0x7FFFFFFF)
1320         x.rlim_max = 0x7FFFFFFF;
1321     return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1322 }
1323 
1324 #endif
1325 
1326 static inline bool rlim64_is_infinity(__u64 rlim64)
1327 {
1328 #if BITS_PER_LONG < 64
1329     return rlim64 >= ULONG_MAX;
1330 #else
1331     return rlim64 == RLIM64_INFINITY;
1332 #endif
1333 }
1334 
1335 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1336 {
1337     if (rlim->rlim_cur == RLIM_INFINITY)
1338         rlim64->rlim_cur = RLIM64_INFINITY;
1339     else
1340         rlim64->rlim_cur = rlim->rlim_cur;
1341     if (rlim->rlim_max == RLIM_INFINITY)
1342         rlim64->rlim_max = RLIM64_INFINITY;
1343     else
1344         rlim64->rlim_max = rlim->rlim_max;
1345 }
1346 
1347 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1348 {
1349     if (rlim64_is_infinity(rlim64->rlim_cur))
1350         rlim->rlim_cur = RLIM_INFINITY;
1351     else
1352         rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1353     if (rlim64_is_infinity(rlim64->rlim_max))
1354         rlim->rlim_max = RLIM_INFINITY;
1355     else
1356         rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1357 }
1358 
1359 /* make sure you are allowed to change @tsk limits before calling this */
1360 int do_prlimit(struct task_struct *tsk, unsigned int resource,
1361         struct rlimit *new_rlim, struct rlimit *old_rlim)
1362 {
1363     struct rlimit *rlim;
1364     int retval = 0;
1365 
1366     if (resource >= RLIM_NLIMITS)
1367         return -EINVAL;
1368     if (new_rlim) {
1369         if (new_rlim->rlim_cur > new_rlim->rlim_max)
1370             return -EINVAL;
1371         if (resource == RLIMIT_NOFILE &&
1372                 new_rlim->rlim_max > sysctl_nr_open)
1373             return -EPERM;
1374     }
1375 
1376     /* protect tsk->signal and tsk->sighand from disappearing */
1377     read_lock(&tasklist_lock);
1378     if (!tsk->sighand) {
1379         retval = -ESRCH;
1380         goto out;
1381     }
1382 
1383     rlim = tsk->signal->rlim + resource;
1384     task_lock(tsk->group_leader);
1385     if (new_rlim) {
1386         /* Keep the capable check against init_user_ns until
1387            cgroups can contain all limits */
1388         if (new_rlim->rlim_max > rlim->rlim_max &&
1389                 !capable(CAP_SYS_RESOURCE))
1390             retval = -EPERM;
1391         if (!retval)
1392             retval = security_task_setrlimit(tsk->group_leader,
1393                     resource, new_rlim);
1394         if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1395             /*
1396              * The caller is asking for an immediate RLIMIT_CPU
1397              * expiry.  But we use the zero value to mean "it was
1398              * never set".  So let's cheat and make it one second
1399              * instead
1400              */
1401             new_rlim->rlim_cur = 1;
1402         }
1403     }
1404     if (!retval) {
1405         if (old_rlim)
1406             *old_rlim = *rlim;
1407         if (new_rlim)
1408             *rlim = *new_rlim;
1409     }
1410     task_unlock(tsk->group_leader);
1411 
1412     /*
1413      * RLIMIT_CPU handling.   Note that the kernel fails to return an error
1414      * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
1415      * very long-standing error, and fixing it now risks breakage of
1416      * applications, so we live with it
1417      */
1418      if (!retval && new_rlim && resource == RLIMIT_CPU &&
1419          new_rlim->rlim_cur != RLIM_INFINITY &&
1420          IS_ENABLED(CONFIG_POSIX_TIMERS))
1421         update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1422 out:
1423     read_unlock(&tasklist_lock);
1424     return retval;
1425 }
1426 
1427 /* rcu lock must be held */
1428 static int check_prlimit_permission(struct task_struct *task)
1429 {
1430     const struct cred *cred = current_cred(), *tcred;
1431 
1432     if (current == task)
1433         return 0;
1434 
1435     tcred = __task_cred(task);
1436     if (uid_eq(cred->uid, tcred->euid) &&
1437         uid_eq(cred->uid, tcred->suid) &&
1438         uid_eq(cred->uid, tcred->uid)  &&
1439         gid_eq(cred->gid, tcred->egid) &&
1440         gid_eq(cred->gid, tcred->sgid) &&
1441         gid_eq(cred->gid, tcred->gid))
1442         return 0;
1443     if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1444         return 0;
1445 
1446     return -EPERM;
1447 }
1448 
1449 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1450         const struct rlimit64 __user *, new_rlim,
1451         struct rlimit64 __user *, old_rlim)
1452 {
1453     struct rlimit64 old64, new64;
1454     struct rlimit old, new;
1455     struct task_struct *tsk;
1456     int ret;
1457 
1458     if (new_rlim) {
1459         if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1460             return -EFAULT;
1461         rlim64_to_rlim(&new64, &new);
1462     }
1463 
1464     rcu_read_lock();
1465     tsk = pid ? find_task_by_vpid(pid) : current;
1466     if (!tsk) {
1467         rcu_read_unlock();
1468         return -ESRCH;
1469     }
1470     ret = check_prlimit_permission(tsk);
1471     if (ret) {
1472         rcu_read_unlock();
1473         return ret;
1474     }
1475     get_task_struct(tsk);
1476     rcu_read_unlock();
1477 
1478     ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1479             old_rlim ? &old : NULL);
1480 
1481     if (!ret && old_rlim) {
1482         rlim_to_rlim64(&old, &old64);
1483         if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1484             ret = -EFAULT;
1485     }
1486 
1487     put_task_struct(tsk);
1488     return ret;
1489 }
1490 
1491 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1492 {
1493     struct rlimit new_rlim;
1494 
1495     if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1496         return -EFAULT;
1497     return do_prlimit(current, resource, &new_rlim, NULL);
1498 }
1499 
1500 /*
1501  * It would make sense to put struct rusage in the task_struct,
1502  * except that would make the task_struct be *really big*.  After
1503  * task_struct gets moved into malloc'ed memory, it would
1504  * make sense to do this.  It will make moving the rest of the information
1505  * a lot simpler!  (Which we're not doing right now because we're not
1506  * measuring them yet).
1507  *
1508  * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1509  * races with threads incrementing their own counters.  But since word
1510  * reads are atomic, we either get new values or old values and we don't
1511  * care which for the sums.  We always take the siglock to protect reading
1512  * the c* fields from p->signal from races with exit.c updating those
1513  * fields when reaping, so a sample either gets all the additions of a
1514  * given child after it's reaped, or none so this sample is before reaping.
1515  *
1516  * Locking:
1517  * We need to take the siglock for CHILDREN, SELF and BOTH
1518  * for the cases current multithreaded, non-current single threaded and
1519  * non-current multithreaded.  Thread traversal is now safe with
1520  * the siglock held.
1521  * Strictly speaking, we do not need to take the siglock if we are current and
1522  * single threaded,  as no one else can take our signal_struct away, no one
1523  * else can  reap the  children to update signal->c* counters, and no one else
1524  * can race with the signal-> fields. If we do not take any lock, the
1525  * signal-> fields could be read out of order while another thread was just
1526  * exiting. So we should  place a read memory barrier when we avoid the lock.
1527  * On the writer side,  write memory barrier is implied in  __exit_signal
1528  * as __exit_signal releases  the siglock spinlock after updating the signal->
1529  * fields. But we don't do this yet to keep things simple.
1530  *
1531  */
1532 
1533 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1534 {
1535     r->ru_nvcsw += t->nvcsw;
1536     r->ru_nivcsw += t->nivcsw;
1537     r->ru_minflt += t->min_flt;
1538     r->ru_majflt += t->maj_flt;
1539     r->ru_inblock += task_io_get_inblock(t);
1540     r->ru_oublock += task_io_get_oublock(t);
1541 }
1542 
1543 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1544 {
1545     struct task_struct *t;
1546     unsigned long flags;
1547     cputime_t tgutime, tgstime, utime, stime;
1548     unsigned long maxrss = 0;
1549 
1550     memset((char *)r, 0, sizeof (*r));
1551     utime = stime = 0;
1552 
1553     if (who == RUSAGE_THREAD) {
1554         task_cputime_adjusted(current, &utime, &stime);
1555         accumulate_thread_rusage(p, r);
1556         maxrss = p->signal->maxrss;
1557         goto out;
1558     }
1559 
1560     if (!lock_task_sighand(p, &flags))
1561         return;
1562 
1563     switch (who) {
1564     case RUSAGE_BOTH:
1565     case RUSAGE_CHILDREN:
1566         utime = p->signal->cutime;
1567         stime = p->signal->cstime;
1568         r->ru_nvcsw = p->signal->cnvcsw;
1569         r->ru_nivcsw = p->signal->cnivcsw;
1570         r->ru_minflt = p->signal->cmin_flt;
1571         r->ru_majflt = p->signal->cmaj_flt;
1572         r->ru_inblock = p->signal->cinblock;
1573         r->ru_oublock = p->signal->coublock;
1574         maxrss = p->signal->cmaxrss;
1575 
1576         if (who == RUSAGE_CHILDREN)
1577             break;
1578 
1579     case RUSAGE_SELF:
1580         thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1581         utime += tgutime;
1582         stime += tgstime;
1583         r->ru_nvcsw += p->signal->nvcsw;
1584         r->ru_nivcsw += p->signal->nivcsw;
1585         r->ru_minflt += p->signal->min_flt;
1586         r->ru_majflt += p->signal->maj_flt;
1587         r->ru_inblock += p->signal->inblock;
1588         r->ru_oublock += p->signal->oublock;
1589         if (maxrss < p->signal->maxrss)
1590             maxrss = p->signal->maxrss;
1591         t = p;
1592         do {
1593             accumulate_thread_rusage(t, r);
1594         } while_each_thread(p, t);
1595         break;
1596 
1597     default:
1598         BUG();
1599     }
1600     unlock_task_sighand(p, &flags);
1601 
1602 out:
1603     cputime_to_timeval(utime, &r->ru_utime);
1604     cputime_to_timeval(stime, &r->ru_stime);
1605 
1606     if (who != RUSAGE_CHILDREN) {
1607         struct mm_struct *mm = get_task_mm(p);
1608 
1609         if (mm) {
1610             setmax_mm_hiwater_rss(&maxrss, mm);
1611             mmput(mm);
1612         }
1613     }
1614     r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1615 }
1616 
1617 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1618 {
1619     struct rusage r;
1620 
1621     k_getrusage(p, who, &r);
1622     return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1623 }
1624 
1625 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1626 {
1627     if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1628         who != RUSAGE_THREAD)
1629         return -EINVAL;
1630     return getrusage(current, who, ru);
1631 }
1632 
1633 #ifdef CONFIG_COMPAT
1634 COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1635 {
1636     struct rusage r;
1637 
1638     if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1639         who != RUSAGE_THREAD)
1640         return -EINVAL;
1641 
1642     k_getrusage(current, who, &r);
1643     return put_compat_rusage(&r, ru);
1644 }
1645 #endif
1646 
1647 SYSCALL_DEFINE1(umask, int, mask)
1648 {
1649     mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1650     return mask;
1651 }
1652 
1653 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1654 {
1655     struct fd exe;
1656     struct file *old_exe, *exe_file;
1657     struct inode *inode;
1658     int err;
1659 
1660     exe = fdget(fd);
1661     if (!exe.file)
1662         return -EBADF;
1663 
1664     inode = file_inode(exe.file);
1665 
1666     /*
1667      * Because the original mm->exe_file points to executable file, make
1668      * sure that this one is executable as well, to avoid breaking an
1669      * overall picture.
1670      */
1671     err = -EACCES;
1672     if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
1673         goto exit;
1674 
1675     err = inode_permission(inode, MAY_EXEC);
1676     if (err)
1677         goto exit;
1678 
1679     /*
1680      * Forbid mm->exe_file change if old file still mapped.
1681      */
1682     exe_file = get_mm_exe_file(mm);
1683     err = -EBUSY;
1684     if (exe_file) {
1685         struct vm_area_struct *vma;
1686 
1687         down_read(&mm->mmap_sem);
1688         for (vma = mm->mmap; vma; vma = vma->vm_next) {
1689             if (!vma->vm_file)
1690                 continue;
1691             if (path_equal(&vma->vm_file->f_path,
1692                        &exe_file->f_path))
1693                 goto exit_err;
1694         }
1695 
1696         up_read(&mm->mmap_sem);
1697         fput(exe_file);
1698     }
1699 
1700     err = 0;
1701     /* set the new file, lockless */
1702     get_file(exe.file);
1703     old_exe = xchg(&mm->exe_file, exe.file);
1704     if (old_exe)
1705         fput(old_exe);
1706 exit:
1707     fdput(exe);
1708     return err;
1709 exit_err:
1710     up_read(&mm->mmap_sem);
1711     fput(exe_file);
1712     goto exit;
1713 }
1714 
1715 /*
1716  * WARNING: we don't require any capability here so be very careful
1717  * in what is allowed for modification from userspace.
1718  */
1719 static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1720 {
1721     unsigned long mmap_max_addr = TASK_SIZE;
1722     struct mm_struct *mm = current->mm;
1723     int error = -EINVAL, i;
1724 
1725     static const unsigned char offsets[] = {
1726         offsetof(struct prctl_mm_map, start_code),
1727         offsetof(struct prctl_mm_map, end_code),
1728         offsetof(struct prctl_mm_map, start_data),
1729         offsetof(struct prctl_mm_map, end_data),
1730         offsetof(struct prctl_mm_map, start_brk),
1731         offsetof(struct prctl_mm_map, brk),
1732         offsetof(struct prctl_mm_map, start_stack),
1733         offsetof(struct prctl_mm_map, arg_start),
1734         offsetof(struct prctl_mm_map, arg_end),
1735         offsetof(struct prctl_mm_map, env_start),
1736         offsetof(struct prctl_mm_map, env_end),
1737     };
1738 
1739     /*
1740      * Make sure the members are not somewhere outside
1741      * of allowed address space.
1742      */
1743     for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1744         u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1745 
1746         if ((unsigned long)val >= mmap_max_addr ||
1747             (unsigned long)val < mmap_min_addr)
1748             goto out;
1749     }
1750 
1751     /*
1752      * Make sure the pairs are ordered.
1753      */
1754 #define __prctl_check_order(__m1, __op, __m2)               \
1755     ((unsigned long)prctl_map->__m1 __op                \
1756      (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1757     error  = __prctl_check_order(start_code, <, end_code);
1758     error |= __prctl_check_order(start_data, <, end_data);
1759     error |= __prctl_check_order(start_brk, <=, brk);
1760     error |= __prctl_check_order(arg_start, <=, arg_end);
1761     error |= __prctl_check_order(env_start, <=, env_end);
1762     if (error)
1763         goto out;
1764 #undef __prctl_check_order
1765 
1766     error = -EINVAL;
1767 
1768     /*
1769      * @brk should be after @end_data in traditional maps.
1770      */
1771     if (prctl_map->start_brk <= prctl_map->end_data ||
1772         prctl_map->brk <= prctl_map->end_data)
1773         goto out;
1774 
1775     /*
1776      * Neither we should allow to override limits if they set.
1777      */
1778     if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1779                   prctl_map->start_brk, prctl_map->end_data,
1780                   prctl_map->start_data))
1781             goto out;
1782 
1783     /*
1784      * Someone is trying to cheat the auxv vector.
1785      */
1786     if (prctl_map->auxv_size) {
1787         if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1788             goto out;
1789     }
1790 
1791     /*
1792      * Finally, make sure the caller has the rights to
1793      * change /proc/pid/exe link: only local root should
1794      * be allowed to.
1795      */
1796     if (prctl_map->exe_fd != (u32)-1) {
1797         struct user_namespace *ns = current_user_ns();
1798         const struct cred *cred = current_cred();
1799 
1800         if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1801             !gid_eq(cred->gid, make_kgid(ns, 0)))
1802             goto out;
1803     }
1804 
1805     error = 0;
1806 out:
1807     return error;
1808 }
1809 
1810 #ifdef CONFIG_CHECKPOINT_RESTORE
1811 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1812 {
1813     struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1814     unsigned long user_auxv[AT_VECTOR_SIZE];
1815     struct mm_struct *mm = current->mm;
1816     int error;
1817 
1818     BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1819     BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1820 
1821     if (opt == PR_SET_MM_MAP_SIZE)
1822         return put_user((unsigned int)sizeof(prctl_map),
1823                 (unsigned int __user *)addr);
1824 
1825     if (data_size != sizeof(prctl_map))
1826         return -EINVAL;
1827 
1828     if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1829         return -EFAULT;
1830 
1831     error = validate_prctl_map(&prctl_map);
1832     if (error)
1833         return error;
1834 
1835     if (prctl_map.auxv_size) {
1836         memset(user_auxv, 0, sizeof(user_auxv));
1837         if (copy_from_user(user_auxv,
1838                    (const void __user *)prctl_map.auxv,
1839                    prctl_map.auxv_size))
1840             return -EFAULT;
1841 
1842         /* Last entry must be AT_NULL as specification requires */
1843         user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1844         user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1845     }
1846 
1847     if (prctl_map.exe_fd != (u32)-1) {
1848         error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
1849         if (error)
1850             return error;
1851     }
1852 
1853     down_write(&mm->mmap_sem);
1854 
1855     /*
1856      * We don't validate if these members are pointing to
1857      * real present VMAs because application may have correspond
1858      * VMAs already unmapped and kernel uses these members for statistics
1859      * output in procfs mostly, except
1860      *
1861      *  - @start_brk/@brk which are used in do_brk but kernel lookups
1862      *    for VMAs when updating these memvers so anything wrong written
1863      *    here cause kernel to swear at userspace program but won't lead
1864      *    to any problem in kernel itself
1865      */
1866 
1867     mm->start_code  = prctl_map.start_code;
1868     mm->end_code    = prctl_map.end_code;
1869     mm->start_data  = prctl_map.start_data;
1870     mm->end_data    = prctl_map.end_data;
1871     mm->start_brk   = prctl_map.start_brk;
1872     mm->brk     = prctl_map.brk;
1873     mm->start_stack = prctl_map.start_stack;
1874     mm->arg_start   = prctl_map.arg_start;
1875     mm->arg_end = prctl_map.arg_end;
1876     mm->env_start   = prctl_map.env_start;
1877     mm->env_end = prctl_map.env_end;
1878 
1879     /*
1880      * Note this update of @saved_auxv is lockless thus
1881      * if someone reads this member in procfs while we're
1882      * updating -- it may get partly updated results. It's
1883      * known and acceptable trade off: we leave it as is to
1884      * not introduce additional locks here making the kernel
1885      * more complex.
1886      */
1887     if (prctl_map.auxv_size)
1888         memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1889 
1890     up_write(&mm->mmap_sem);
1891     return 0;
1892 }
1893 #endif /* CONFIG_CHECKPOINT_RESTORE */
1894 
1895 static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
1896               unsigned long len)
1897 {
1898     /*
1899      * This doesn't move the auxiliary vector itself since it's pinned to
1900      * mm_struct, but it permits filling the vector with new values.  It's
1901      * up to the caller to provide sane values here, otherwise userspace
1902      * tools which use this vector might be unhappy.
1903      */
1904     unsigned long user_auxv[AT_VECTOR_SIZE];
1905 
1906     if (len > sizeof(user_auxv))
1907         return -EINVAL;
1908 
1909     if (copy_from_user(user_auxv, (const void __user *)addr, len))
1910         return -EFAULT;
1911 
1912     /* Make sure the last entry is always AT_NULL */
1913     user_auxv[AT_VECTOR_SIZE - 2] = 0;
1914     user_auxv[AT_VECTOR_SIZE - 1] = 0;
1915 
1916     BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1917 
1918     task_lock(current);
1919     memcpy(mm->saved_auxv, user_auxv, len);
1920     task_unlock(current);
1921 
1922     return 0;
1923 }
1924 
1925 static int prctl_set_mm(int opt, unsigned long addr,
1926             unsigned long arg4, unsigned long arg5)
1927 {
1928     struct mm_struct *mm = current->mm;
1929     struct prctl_mm_map prctl_map;
1930     struct vm_area_struct *vma;
1931     int error;
1932 
1933     if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1934                   opt != PR_SET_MM_MAP &&
1935                   opt != PR_SET_MM_MAP_SIZE)))
1936         return -EINVAL;
1937 
1938 #ifdef CONFIG_CHECKPOINT_RESTORE
1939     if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1940         return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1941 #endif
1942 
1943     if (!capable(CAP_SYS_RESOURCE))
1944         return -EPERM;
1945 
1946     if (opt == PR_SET_MM_EXE_FILE)
1947         return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1948 
1949     if (opt == PR_SET_MM_AUXV)
1950         return prctl_set_auxv(mm, addr, arg4);
1951 
1952     if (addr >= TASK_SIZE || addr < mmap_min_addr)
1953         return -EINVAL;
1954 
1955     error = -EINVAL;
1956 
1957     down_write(&mm->mmap_sem);
1958     vma = find_vma(mm, addr);
1959 
1960     prctl_map.start_code    = mm->start_code;
1961     prctl_map.end_code  = mm->end_code;
1962     prctl_map.start_data    = mm->start_data;
1963     prctl_map.end_data  = mm->end_data;
1964     prctl_map.start_brk = mm->start_brk;
1965     prctl_map.brk       = mm->brk;
1966     prctl_map.start_stack   = mm->start_stack;
1967     prctl_map.arg_start = mm->arg_start;
1968     prctl_map.arg_end   = mm->arg_end;
1969     prctl_map.env_start = mm->env_start;
1970     prctl_map.env_end   = mm->env_end;
1971     prctl_map.auxv      = NULL;
1972     prctl_map.auxv_size = 0;
1973     prctl_map.exe_fd    = -1;
1974 
1975     switch (opt) {
1976     case PR_SET_MM_START_CODE:
1977         prctl_map.start_code = addr;
1978         break;
1979     case PR_SET_MM_END_CODE:
1980         prctl_map.end_code = addr;
1981         break;
1982     case PR_SET_MM_START_DATA:
1983         prctl_map.start_data = addr;
1984         break;
1985     case PR_SET_MM_END_DATA:
1986         prctl_map.end_data = addr;
1987         break;
1988     case PR_SET_MM_START_STACK:
1989         prctl_map.start_stack = addr;
1990         break;
1991     case PR_SET_MM_START_BRK:
1992         prctl_map.start_brk = addr;
1993         break;
1994     case PR_SET_MM_BRK:
1995         prctl_map.brk = addr;
1996         break;
1997     case PR_SET_MM_ARG_START:
1998         prctl_map.arg_start = addr;
1999         break;
2000     case PR_SET_MM_ARG_END:
2001         prctl_map.arg_end = addr;
2002         break;
2003     case PR_SET_MM_ENV_START:
2004         prctl_map.env_start = addr;
2005         break;
2006     case PR_SET_MM_ENV_END:
2007         prctl_map.env_end = addr;
2008         break;
2009     default:
2010         goto out;
2011     }
2012 
2013     error = validate_prctl_map(&prctl_map);
2014     if (error)
2015         goto out;
2016 
2017     switch (opt) {
2018     /*
2019      * If command line arguments and environment
2020      * are placed somewhere else on stack, we can
2021      * set them up here, ARG_START/END to setup
2022      * command line arguments and ENV_START/END
2023      * for environment.
2024      */
2025     case PR_SET_MM_START_STACK:
2026     case PR_SET_MM_ARG_START:
2027     case PR_SET_MM_ARG_END:
2028     case PR_SET_MM_ENV_START:
2029     case PR_SET_MM_ENV_END:
2030         if (!vma) {
2031             error = -EFAULT;
2032             goto out;
2033         }
2034     }
2035 
2036     mm->start_code  = prctl_map.start_code;
2037     mm->end_code    = prctl_map.end_code;
2038     mm->start_data  = prctl_map.start_data;
2039     mm->end_data    = prctl_map.end_data;
2040     mm->start_brk   = prctl_map.start_brk;
2041     mm->brk     = prctl_map.brk;
2042     mm->start_stack = prctl_map.start_stack;
2043     mm->arg_start   = prctl_map.arg_start;
2044     mm->arg_end = prctl_map.arg_end;
2045     mm->env_start   = prctl_map.env_start;
2046     mm->env_end = prctl_map.env_end;
2047 
2048     error = 0;
2049 out:
2050     up_write(&mm->mmap_sem);
2051     return error;
2052 }
2053 
2054 #ifdef CONFIG_CHECKPOINT_RESTORE
2055 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2056 {
2057     return put_user(me->clear_child_tid, tid_addr);
2058 }
2059 #else
2060 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2061 {
2062     return -EINVAL;
2063 }
2064 #endif
2065 
2066 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067         unsigned long, arg4, unsigned long, arg5)
2068 {
2069     struct task_struct *me = current;
2070     unsigned char comm[sizeof(me->comm)];
2071     long error;
2072 
2073     error = security_task_prctl(option, arg2, arg3, arg4, arg5);
2074     if (error != -ENOSYS)
2075         return error;
2076 
2077     error = 0;
2078     switch (option) {
2079     case PR_SET_PDEATHSIG:
2080         if (!valid_signal(arg2)) {
2081             error = -EINVAL;
2082             break;
2083         }
2084         me->pdeath_signal = arg2;
2085         break;
2086     case PR_GET_PDEATHSIG:
2087         error = put_user(me->pdeath_signal, (int __user *)arg2);
2088         break;
2089     case PR_GET_DUMPABLE:
2090         error = get_dumpable(me->mm);
2091         break;
2092     case PR_SET_DUMPABLE:
2093         if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2094             error = -EINVAL;
2095             break;
2096         }
2097         set_dumpable(me->mm, arg2);
2098         break;
2099 
2100     case PR_SET_UNALIGN:
2101         error = SET_UNALIGN_CTL(me, arg2);
2102         break;
2103     case PR_GET_UNALIGN:
2104         error = GET_UNALIGN_CTL(me, arg2);
2105         break;
2106     case PR_SET_FPEMU:
2107         error = SET_FPEMU_CTL(me, arg2);
2108         break;
2109     case PR_GET_FPEMU:
2110         error = GET_FPEMU_CTL(me, arg2);
2111         break;
2112     case PR_SET_FPEXC:
2113         error = SET_FPEXC_CTL(me, arg2);
2114         break;
2115     case PR_GET_FPEXC:
2116         error = GET_FPEXC_CTL(me, arg2);
2117         break;
2118     case PR_GET_TIMING:
2119         error = PR_TIMING_STATISTICAL;
2120         break;
2121     case PR_SET_TIMING:
2122         if (arg2 != PR_TIMING_STATISTICAL)
2123             error = -EINVAL;
2124         break;
2125     case PR_SET_NAME:
2126         comm[sizeof(me->comm) - 1] = 0;
2127         if (strncpy_from_user(comm, (char __user *)arg2,
2128                       sizeof(me->comm) - 1) < 0)
2129             return -EFAULT;
2130         set_task_comm(me, comm);
2131         proc_comm_connector(me);
2132         break;
2133     case PR_GET_NAME:
2134         get_task_comm(comm, me);
2135         if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2136             return -EFAULT;
2137         break;
2138     case PR_GET_ENDIAN:
2139         error = GET_ENDIAN(me, arg2);
2140         break;
2141     case PR_SET_ENDIAN:
2142         error = SET_ENDIAN(me, arg2);
2143         break;
2144     case PR_GET_SECCOMP:
2145         error = prctl_get_seccomp();
2146         break;
2147     case PR_SET_SECCOMP:
2148         error = prctl_set_seccomp(arg2, (char __user *)arg3);
2149         break;
2150     case PR_GET_TSC:
2151         error = GET_TSC_CTL(arg2);
2152         break;
2153     case PR_SET_TSC:
2154         error = SET_TSC_CTL(arg2);
2155         break;
2156     case PR_TASK_PERF_EVENTS_DISABLE:
2157         error = perf_event_task_disable();
2158         break;
2159     case PR_TASK_PERF_EVENTS_ENABLE:
2160         error = perf_event_task_enable();
2161         break;
2162     case PR_GET_TIMERSLACK:
2163         if (current->timer_slack_ns > ULONG_MAX)
2164             error = ULONG_MAX;
2165         else
2166             error = current->timer_slack_ns;
2167         break;
2168     case PR_SET_TIMERSLACK:
2169         if (arg2 <= 0)
2170             current->timer_slack_ns =
2171                     current->default_timer_slack_ns;
2172         else
2173             current->timer_slack_ns = arg2;
2174         break;
2175     case PR_MCE_KILL:
2176         if (arg4 | arg5)
2177             return -EINVAL;
2178         switch (arg2) {
2179         case PR_MCE_KILL_CLEAR:
2180             if (arg3 != 0)
2181                 return -EINVAL;
2182             current->flags &= ~PF_MCE_PROCESS;
2183             break;
2184         case PR_MCE_KILL_SET:
2185             current->flags |= PF_MCE_PROCESS;
2186             if (arg3 == PR_MCE_KILL_EARLY)
2187                 current->flags |= PF_MCE_EARLY;
2188             else if (arg3 == PR_MCE_KILL_LATE)
2189                 current->flags &= ~PF_MCE_EARLY;
2190             else if (arg3 == PR_MCE_KILL_DEFAULT)
2191                 current->flags &=
2192                         ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2193             else
2194                 return -EINVAL;
2195             break;
2196         default:
2197             return -EINVAL;
2198         }
2199         break;
2200     case PR_MCE_KILL_GET:
2201         if (arg2 | arg3 | arg4 | arg5)
2202             return -EINVAL;
2203         if (current->flags & PF_MCE_PROCESS)
2204             error = (current->flags & PF_MCE_EARLY) ?
2205                 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2206         else
2207             error = PR_MCE_KILL_DEFAULT;
2208         break;
2209     case PR_SET_MM:
2210         error = prctl_set_mm(arg2, arg3, arg4, arg5);
2211         break;
2212     case PR_GET_TID_ADDRESS:
2213         error = prctl_get_tid_address(me, (int __user **)arg2);
2214         break;
2215     case PR_SET_CHILD_SUBREAPER:
2216         me->signal->is_child_subreaper = !!arg2;
2217         break;
2218     case PR_GET_CHILD_SUBREAPER:
2219         error = put_user(me->signal->is_child_subreaper,
2220                  (int __user *)arg2);
2221         break;
2222     case PR_SET_NO_NEW_PRIVS:
2223         if (arg2 != 1 || arg3 || arg4 || arg5)
2224             return -EINVAL;
2225 
2226         task_set_no_new_privs(current);
2227         break;
2228     case PR_GET_NO_NEW_PRIVS:
2229         if (arg2 || arg3 || arg4 || arg5)
2230             return -EINVAL;
2231         return task_no_new_privs(current) ? 1 : 0;
2232     case PR_GET_THP_DISABLE:
2233         if (arg2 || arg3 || arg4 || arg5)
2234             return -EINVAL;
2235         error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
2236         break;
2237     case PR_SET_THP_DISABLE:
2238         if (arg3 || arg4 || arg5)
2239             return -EINVAL;
2240         if (down_write_killable(&me->mm->mmap_sem))
2241             return -EINTR;
2242         if (arg2)
2243             me->mm->def_flags |= VM_NOHUGEPAGE;
2244         else
2245             me->mm->def_flags &= ~VM_NOHUGEPAGE;
2246         up_write(&me->mm->mmap_sem);
2247         break;
2248     case PR_MPX_ENABLE_MANAGEMENT:
2249         if (arg2 || arg3 || arg4 || arg5)
2250             return -EINVAL;
2251         error = MPX_ENABLE_MANAGEMENT();
2252         break;
2253     case PR_MPX_DISABLE_MANAGEMENT:
2254         if (arg2 || arg3 || arg4 || arg5)
2255             return -EINVAL;
2256         error = MPX_DISABLE_MANAGEMENT();
2257         break;
2258     case PR_SET_FP_MODE:
2259         error = SET_FP_MODE(me, arg2);
2260         break;
2261     case PR_GET_FP_MODE:
2262         error = GET_FP_MODE(me);
2263         break;
2264     default:
2265         error = -EINVAL;
2266         break;
2267     }
2268     return error;
2269 }
2270 
2271 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2272         struct getcpu_cache __user *, unused)
2273 {
2274     int err = 0;
2275     int cpu = raw_smp_processor_id();
2276 
2277     if (cpup)
2278         err |= put_user(cpu, cpup);
2279     if (nodep)
2280         err |= put_user(cpu_to_node(cpu), nodep);
2281     return err ? -EFAULT : 0;
2282 }
2283 
2284 /**
2285  * do_sysinfo - fill in sysinfo struct
2286  * @info: pointer to buffer to fill
2287  */
2288 static int do_sysinfo(struct sysinfo *info)
2289 {
2290     unsigned long mem_total, sav_total;
2291     unsigned int mem_unit, bitcount;
2292     struct timespec tp;
2293 
2294     memset(info, 0, sizeof(struct sysinfo));
2295 
2296     get_monotonic_boottime(&tp);
2297     info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2298 
2299     get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2300 
2301     info->procs = nr_threads;
2302 
2303     si_meminfo(info);
2304     si_swapinfo(info);
2305 
2306     /*
2307      * If the sum of all the available memory (i.e. ram + swap)
2308      * is less than can be stored in a 32 bit unsigned long then
2309      * we can be binary compatible with 2.2.x kernels.  If not,
2310      * well, in that case 2.2.x was broken anyways...
2311      *
2312      *  -Erik Andersen <andersee@debian.org>
2313      */
2314 
2315     mem_total = info->totalram + info->totalswap;
2316     if (mem_total < info->totalram || mem_total < info->totalswap)
2317         goto out;
2318     bitcount = 0;
2319     mem_unit = info->mem_unit;
2320     while (mem_unit > 1) {
2321         bitcount++;
2322         mem_unit >>= 1;
2323         sav_total = mem_total;
2324         mem_total <<= 1;
2325         if (mem_total < sav_total)
2326             goto out;
2327     }
2328 
2329     /*
2330      * If mem_total did not overflow, multiply all memory values by
2331      * info->mem_unit and set it to 1.  This leaves things compatible
2332      * with 2.2.x, and also retains compatibility with earlier 2.4.x
2333      * kernels...
2334      */
2335 
2336     info->mem_unit = 1;
2337     info->totalram <<= bitcount;
2338     info->freeram <<= bitcount;
2339     info->sharedram <<= bitcount;
2340     info->bufferram <<= bitcount;
2341     info->totalswap <<= bitcount;
2342     info->freeswap <<= bitcount;
2343     info->totalhigh <<= bitcount;
2344     info->freehigh <<= bitcount;
2345 
2346 out:
2347     return 0;
2348 }
2349 
2350 SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2351 {
2352     struct sysinfo val;
2353 
2354     do_sysinfo(&val);
2355 
2356     if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2357         return -EFAULT;
2358 
2359     return 0;
2360 }
2361 
#ifdef CONFIG_COMPAT
/*
 * Fixed-width layout of the sysinfo result handed to compat callers.
 * The compat syscall below stores every member except pad and _f.
 */
struct compat_sysinfo {
    s32 uptime;
    u32 loads[3];
    u32 totalram;
    u32 freeram;
    u32 sharedram;
    u32 bufferram;
    u32 totalswap;
    u32 freeswap;
    u16 procs;
    u16 pad;
    u32 totalhigh;
    u32 freehigh;
    u32 mem_unit;
    char _f[20-2*sizeof(u32)-sizeof(int)];
};

/*
 * Compat sysinfo - fill a struct sysinfo and store it member by member
 * into the caller's struct compat_sysinfo.
 * @info: user buffer, checked with access_ok() before any store
 *
 * Returns 0 on success; -EFAULT if @info fails access_ok() or any
 * individual store fails (stores stop at the first failure).
 */
COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
    struct sysinfo ks;
    int shift;

    do_sysinfo(&ks);

    /*
     * If totalram or totalswap does not fit in 32 bits, raise mem_unit
     * by doubling until it reaches PAGE_SIZE and shrink every memory
     * value by the same number of bits.
     */
    if (upper_32_bits(ks.totalram) || upper_32_bits(ks.totalswap)) {
        for (shift = 0; ks.mem_unit < PAGE_SIZE; shift++)
            ks.mem_unit <<= 1;

        ks.totalram >>= shift;
        ks.freeram >>= shift;
        ks.sharedram >>= shift;
        ks.bufferram >>= shift;
        ks.totalswap >>= shift;
        ks.freeswap >>= shift;
        ks.totalhigh >>= shift;
        ks.freehigh >>= shift;
    }

    if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)))
        return -EFAULT;

    if (__put_user(ks.uptime, &info->uptime) ||
        __put_user(ks.loads[0], &info->loads[0]) ||
        __put_user(ks.loads[1], &info->loads[1]) ||
        __put_user(ks.loads[2], &info->loads[2]) ||
        __put_user(ks.totalram, &info->totalram) ||
        __put_user(ks.freeram, &info->freeram) ||
        __put_user(ks.sharedram, &info->sharedram) ||
        __put_user(ks.bufferram, &info->bufferram) ||
        __put_user(ks.totalswap, &info->totalswap) ||
        __put_user(ks.freeswap, &info->freeswap) ||
        __put_user(ks.procs, &info->procs) ||
        __put_user(ks.totalhigh, &info->totalhigh) ||
        __put_user(ks.freehigh, &info->freehigh) ||
        __put_user(ks.mem_unit, &info->mem_unit))
        return -EFAULT;

    return 0;
}
#endif /* CONFIG_COMPAT */