Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  *  Copyright (C) 2006 IBM Corporation
0004  *
0005  *  Author: Serge Hallyn <serue@us.ibm.com>
0006  *
0007  *  Jun 2006 - namespaces support
0008  *             OpenVZ, SWsoft Inc.
0009  *             Pavel Emelianov <xemul@openvz.org>
0010  */
0011 
0012 #include <linux/slab.h>
0013 #include <linux/export.h>
0014 #include <linux/nsproxy.h>
0015 #include <linux/init_task.h>
0016 #include <linux/mnt_namespace.h>
0017 #include <linux/utsname.h>
0018 #include <linux/pid_namespace.h>
0019 #include <net/net_namespace.h>
0020 #include <linux/ipc_namespace.h>
0021 #include <linux/time_namespace.h>
0022 #include <linux/fs_struct.h>
0023 #include <linux/proc_fs.h>
0024 #include <linux/proc_ns.h>
0025 #include <linux/file.h>
0026 #include <linux/syscalls.h>
0027 #include <linux/cgroup.h>
0028 #include <linux/perf_event.h>
0029 
0030 static struct kmem_cache *nsproxy_cachep;
0031 
0032 struct nsproxy init_nsproxy = {
0033     .count          = ATOMIC_INIT(1),
0034     .uts_ns         = &init_uts_ns,
0035 #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
0036     .ipc_ns         = &init_ipc_ns,
0037 #endif
0038     .mnt_ns         = NULL,
0039     .pid_ns_for_children    = &init_pid_ns,
0040 #ifdef CONFIG_NET
0041     .net_ns         = &init_net,
0042 #endif
0043 #ifdef CONFIG_CGROUPS
0044     .cgroup_ns      = &init_cgroup_ns,
0045 #endif
0046 #ifdef CONFIG_TIME_NS
0047     .time_ns        = &init_time_ns,
0048     .time_ns_for_children   = &init_time_ns,
0049 #endif
0050 };
0051 
0052 static inline struct nsproxy *create_nsproxy(void)
0053 {
0054     struct nsproxy *nsproxy;
0055 
0056     nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
0057     if (nsproxy)
0058         atomic_set(&nsproxy->count, 1);
0059     return nsproxy;
0060 }
0061 
0062 /*
0063  * Create new nsproxy and all of its the associated namespaces.
0064  * Return the newly created nsproxy.  Do not attach this to the task,
0065  * leave it to the caller to do proper locking and attach it to task.
0066  */
0067 static struct nsproxy *create_new_namespaces(unsigned long flags,
0068     struct task_struct *tsk, struct user_namespace *user_ns,
0069     struct fs_struct *new_fs)
0070 {
0071     struct nsproxy *new_nsp;
0072     int err;
0073 
0074     new_nsp = create_nsproxy();
0075     if (!new_nsp)
0076         return ERR_PTR(-ENOMEM);
0077 
0078     new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
0079     if (IS_ERR(new_nsp->mnt_ns)) {
0080         err = PTR_ERR(new_nsp->mnt_ns);
0081         goto out_ns;
0082     }
0083 
0084     new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
0085     if (IS_ERR(new_nsp->uts_ns)) {
0086         err = PTR_ERR(new_nsp->uts_ns);
0087         goto out_uts;
0088     }
0089 
0090     new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
0091     if (IS_ERR(new_nsp->ipc_ns)) {
0092         err = PTR_ERR(new_nsp->ipc_ns);
0093         goto out_ipc;
0094     }
0095 
0096     new_nsp->pid_ns_for_children =
0097         copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
0098     if (IS_ERR(new_nsp->pid_ns_for_children)) {
0099         err = PTR_ERR(new_nsp->pid_ns_for_children);
0100         goto out_pid;
0101     }
0102 
0103     new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
0104                         tsk->nsproxy->cgroup_ns);
0105     if (IS_ERR(new_nsp->cgroup_ns)) {
0106         err = PTR_ERR(new_nsp->cgroup_ns);
0107         goto out_cgroup;
0108     }
0109 
0110     new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
0111     if (IS_ERR(new_nsp->net_ns)) {
0112         err = PTR_ERR(new_nsp->net_ns);
0113         goto out_net;
0114     }
0115 
0116     new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
0117                     tsk->nsproxy->time_ns_for_children);
0118     if (IS_ERR(new_nsp->time_ns_for_children)) {
0119         err = PTR_ERR(new_nsp->time_ns_for_children);
0120         goto out_time;
0121     }
0122     new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
0123 
0124     return new_nsp;
0125 
0126 out_time:
0127     put_net(new_nsp->net_ns);
0128 out_net:
0129     put_cgroup_ns(new_nsp->cgroup_ns);
0130 out_cgroup:
0131     if (new_nsp->pid_ns_for_children)
0132         put_pid_ns(new_nsp->pid_ns_for_children);
0133 out_pid:
0134     if (new_nsp->ipc_ns)
0135         put_ipc_ns(new_nsp->ipc_ns);
0136 out_ipc:
0137     if (new_nsp->uts_ns)
0138         put_uts_ns(new_nsp->uts_ns);
0139 out_uts:
0140     if (new_nsp->mnt_ns)
0141         put_mnt_ns(new_nsp->mnt_ns);
0142 out_ns:
0143     kmem_cache_free(nsproxy_cachep, new_nsp);
0144     return ERR_PTR(err);
0145 }
0146 
0147 /*
0148  * called from clone.  This now handles copy for nsproxy and all
0149  * namespaces therein.
0150  */
0151 int copy_namespaces(unsigned long flags, struct task_struct *tsk)
0152 {
0153     struct nsproxy *old_ns = tsk->nsproxy;
0154     struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
0155     struct nsproxy *new_ns;
0156 
0157     if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
0158                   CLONE_NEWPID | CLONE_NEWNET |
0159                   CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
0160         if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
0161             get_nsproxy(old_ns);
0162             return 0;
0163         }
0164     } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
0165         return -EPERM;
0166 
0167     /*
0168      * CLONE_NEWIPC must detach from the undolist: after switching
0169      * to a new ipc namespace, the semaphore arrays from the old
0170      * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
0171      * means share undolist with parent, so we must forbid using
0172      * it along with CLONE_NEWIPC.
0173      */
0174     if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
0175         (CLONE_NEWIPC | CLONE_SYSVSEM))
0176         return -EINVAL;
0177 
0178     new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
0179     if (IS_ERR(new_ns))
0180         return  PTR_ERR(new_ns);
0181 
0182     timens_on_fork(new_ns, tsk);
0183 
0184     tsk->nsproxy = new_ns;
0185     return 0;
0186 }
0187 
0188 void free_nsproxy(struct nsproxy *ns)
0189 {
0190     if (ns->mnt_ns)
0191         put_mnt_ns(ns->mnt_ns);
0192     if (ns->uts_ns)
0193         put_uts_ns(ns->uts_ns);
0194     if (ns->ipc_ns)
0195         put_ipc_ns(ns->ipc_ns);
0196     if (ns->pid_ns_for_children)
0197         put_pid_ns(ns->pid_ns_for_children);
0198     if (ns->time_ns)
0199         put_time_ns(ns->time_ns);
0200     if (ns->time_ns_for_children)
0201         put_time_ns(ns->time_ns_for_children);
0202     put_cgroup_ns(ns->cgroup_ns);
0203     put_net(ns->net_ns);
0204     kmem_cache_free(nsproxy_cachep, ns);
0205 }
0206 
0207 /*
0208  * Called from unshare. Unshare all the namespaces part of nsproxy.
0209  * On success, returns the new nsproxy.
0210  */
0211 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
0212     struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
0213 {
0214     struct user_namespace *user_ns;
0215     int err = 0;
0216 
0217     if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
0218                    CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
0219                    CLONE_NEWTIME)))
0220         return 0;
0221 
0222     user_ns = new_cred ? new_cred->user_ns : current_user_ns();
0223     if (!ns_capable(user_ns, CAP_SYS_ADMIN))
0224         return -EPERM;
0225 
0226     *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
0227                      new_fs ? new_fs : current->fs);
0228     if (IS_ERR(*new_nsp)) {
0229         err = PTR_ERR(*new_nsp);
0230         goto out;
0231     }
0232 
0233 out:
0234     return err;
0235 }
0236 
0237 void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
0238 {
0239     struct nsproxy *ns;
0240 
0241     might_sleep();
0242 
0243     task_lock(p);
0244     ns = p->nsproxy;
0245     p->nsproxy = new;
0246     task_unlock(p);
0247 
0248     if (ns)
0249         put_nsproxy(ns);
0250 }
0251 
0252 void exit_task_namespaces(struct task_struct *p)
0253 {
0254     switch_task_namespaces(p, NULL);
0255 }
0256 
/*
 * Validate the namespace flags passed to setns() alongside a pidfd.
 *
 * @flags must be non-zero and may only contain CLONE_NEW* bits for
 * namespace types this kernel was built with.  The mount namespace flag
 * is always accepted; every other type is accepted only when its config
 * option is enabled.
 *
 * Returns 0 if @flags is acceptable, -EINVAL otherwise.
 */
static int check_setns_flags(unsigned long flags)
{
    unsigned long supported = CLONE_NEWNS;

#ifdef CONFIG_USER_NS
    supported |= CLONE_NEWUSER;
#endif
#ifdef CONFIG_PID_NS
    supported |= CLONE_NEWPID;
#endif
#ifdef CONFIG_UTS_NS
    supported |= CLONE_NEWUTS;
#endif
#ifdef CONFIG_IPC_NS
    supported |= CLONE_NEWIPC;
#endif
#ifdef CONFIG_CGROUPS
    supported |= CLONE_NEWCGROUP;
#endif
#ifdef CONFIG_NET_NS
    supported |= CLONE_NEWNET;
#endif
#ifdef CONFIG_TIME_NS
    supported |= CLONE_NEWTIME;
#endif

    /* Reject an empty request as well as any bit we cannot honour. */
    if (flags == 0)
        return -EINVAL;
    if (flags & ~supported)
        return -EINVAL;

    return 0;
}
0295 
0296 static void put_nsset(struct nsset *nsset)
0297 {
0298     unsigned flags = nsset->flags;
0299 
0300     if (flags & CLONE_NEWUSER)
0301         put_cred(nsset_cred(nsset));
0302     /*
0303      * We only created a temporary copy if we attached to more than just
0304      * the mount namespace.
0305      */
0306     if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
0307         free_fs_struct(nsset->fs);
0308     if (nsset->nsproxy)
0309         free_nsproxy(nsset->nsproxy);
0310 }
0311 
0312 static int prepare_nsset(unsigned flags, struct nsset *nsset)
0313 {
0314     struct task_struct *me = current;
0315 
0316     nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
0317     if (IS_ERR(nsset->nsproxy))
0318         return PTR_ERR(nsset->nsproxy);
0319 
0320     if (flags & CLONE_NEWUSER)
0321         nsset->cred = prepare_creds();
0322     else
0323         nsset->cred = current_cred();
0324     if (!nsset->cred)
0325         goto out;
0326 
0327     /* Only create a temporary copy of fs_struct if we really need to. */
0328     if (flags == CLONE_NEWNS) {
0329         nsset->fs = me->fs;
0330     } else if (flags & CLONE_NEWNS) {
0331         nsset->fs = copy_fs_struct(me->fs);
0332         if (!nsset->fs)
0333             goto out;
0334     }
0335 
0336     nsset->flags = flags;
0337     return 0;
0338 
0339 out:
0340     put_nsset(nsset);
0341     return -ENOMEM;
0342 }
0343 
0344 static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
0345 {
0346     return ns->ops->install(nsset, ns);
0347 }
0348 
0349 /*
0350  * This is the inverse operation to unshare().
0351  * Ordering is equivalent to the standard ordering used everywhere else
0352  * during unshare and process creation. The switch to the new set of
0353  * namespaces occurs at the point of no return after installation of
0354  * all requested namespaces was successful in commit_nsset().
0355  */
0356 static int validate_nsset(struct nsset *nsset, struct pid *pid)
0357 {
0358     int ret = 0;
0359     unsigned flags = nsset->flags;
0360     struct user_namespace *user_ns = NULL;
0361     struct pid_namespace *pid_ns = NULL;
0362     struct nsproxy *nsp;
0363     struct task_struct *tsk;
0364 
0365     /* Take a "snapshot" of the target task's namespaces. */
0366     rcu_read_lock();
0367     tsk = pid_task(pid, PIDTYPE_PID);
0368     if (!tsk) {
0369         rcu_read_unlock();
0370         return -ESRCH;
0371     }
0372 
0373     if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
0374         rcu_read_unlock();
0375         return -EPERM;
0376     }
0377 
0378     task_lock(tsk);
0379     nsp = tsk->nsproxy;
0380     if (nsp)
0381         get_nsproxy(nsp);
0382     task_unlock(tsk);
0383     if (!nsp) {
0384         rcu_read_unlock();
0385         return -ESRCH;
0386     }
0387 
0388 #ifdef CONFIG_PID_NS
0389     if (flags & CLONE_NEWPID) {
0390         pid_ns = task_active_pid_ns(tsk);
0391         if (unlikely(!pid_ns)) {
0392             rcu_read_unlock();
0393             ret = -ESRCH;
0394             goto out;
0395         }
0396         get_pid_ns(pid_ns);
0397     }
0398 #endif
0399 
0400 #ifdef CONFIG_USER_NS
0401     if (flags & CLONE_NEWUSER)
0402         user_ns = get_user_ns(__task_cred(tsk)->user_ns);
0403 #endif
0404     rcu_read_unlock();
0405 
0406     /*
0407      * Install requested namespaces. The caller will have
0408      * verified earlier that the requested namespaces are
0409      * supported on this kernel. We don't report errors here
0410      * if a namespace is requested that isn't supported.
0411      */
0412 #ifdef CONFIG_USER_NS
0413     if (flags & CLONE_NEWUSER) {
0414         ret = validate_ns(nsset, &user_ns->ns);
0415         if (ret)
0416             goto out;
0417     }
0418 #endif
0419 
0420     if (flags & CLONE_NEWNS) {
0421         ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
0422         if (ret)
0423             goto out;
0424     }
0425 
0426 #ifdef CONFIG_UTS_NS
0427     if (flags & CLONE_NEWUTS) {
0428         ret = validate_ns(nsset, &nsp->uts_ns->ns);
0429         if (ret)
0430             goto out;
0431     }
0432 #endif
0433 
0434 #ifdef CONFIG_IPC_NS
0435     if (flags & CLONE_NEWIPC) {
0436         ret = validate_ns(nsset, &nsp->ipc_ns->ns);
0437         if (ret)
0438             goto out;
0439     }
0440 #endif
0441 
0442 #ifdef CONFIG_PID_NS
0443     if (flags & CLONE_NEWPID) {
0444         ret = validate_ns(nsset, &pid_ns->ns);
0445         if (ret)
0446             goto out;
0447     }
0448 #endif
0449 
0450 #ifdef CONFIG_CGROUPS
0451     if (flags & CLONE_NEWCGROUP) {
0452         ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
0453         if (ret)
0454             goto out;
0455     }
0456 #endif
0457 
0458 #ifdef CONFIG_NET_NS
0459     if (flags & CLONE_NEWNET) {
0460         ret = validate_ns(nsset, &nsp->net_ns->ns);
0461         if (ret)
0462             goto out;
0463     }
0464 #endif
0465 
0466 #ifdef CONFIG_TIME_NS
0467     if (flags & CLONE_NEWTIME) {
0468         ret = validate_ns(nsset, &nsp->time_ns->ns);
0469         if (ret)
0470             goto out;
0471     }
0472 #endif
0473 
0474 out:
0475     if (pid_ns)
0476         put_pid_ns(pid_ns);
0477     if (nsp)
0478         put_nsproxy(nsp);
0479     put_user_ns(user_ns);
0480 
0481     return ret;
0482 }
0483 
0484 /*
0485  * This is the point of no return. There are just a few namespaces
0486  * that do some actual work here and it's sufficiently minimal that
0487  * a separate ns_common operation seems unnecessary for now.
0488  * Unshare is doing the same thing. If we'll end up needing to do
0489  * more in a given namespace or a helper here is ultimately not
0490  * exported anymore a simple commit handler for each namespace
0491  * should be added to ns_common.
0492  */
0493 static void commit_nsset(struct nsset *nsset)
0494 {
0495     unsigned flags = nsset->flags;
0496     struct task_struct *me = current;
0497 
0498 #ifdef CONFIG_USER_NS
0499     if (flags & CLONE_NEWUSER) {
0500         /* transfer ownership */
0501         commit_creds(nsset_cred(nsset));
0502         nsset->cred = NULL;
0503     }
0504 #endif
0505 
0506     /* We only need to commit if we have used a temporary fs_struct. */
0507     if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
0508         set_fs_root(me->fs, &nsset->fs->root);
0509         set_fs_pwd(me->fs, &nsset->fs->pwd);
0510     }
0511 
0512 #ifdef CONFIG_IPC_NS
0513     if (flags & CLONE_NEWIPC)
0514         exit_sem(me);
0515 #endif
0516 
0517 #ifdef CONFIG_TIME_NS
0518     if (flags & CLONE_NEWTIME)
0519         timens_commit(me, nsset->nsproxy->time_ns);
0520 #endif
0521 
0522     /* transfer ownership */
0523     switch_task_namespaces(me, nsset->nsproxy);
0524     nsset->nsproxy = NULL;
0525 }
0526 
0527 SYSCALL_DEFINE2(setns, int, fd, int, flags)
0528 {
0529     struct file *file;
0530     struct ns_common *ns = NULL;
0531     struct nsset nsset = {};
0532     int err = 0;
0533 
0534     file = fget(fd);
0535     if (!file)
0536         return -EBADF;
0537 
0538     if (proc_ns_file(file)) {
0539         ns = get_proc_ns(file_inode(file));
0540         if (flags && (ns->ops->type != flags))
0541             err = -EINVAL;
0542         flags = ns->ops->type;
0543     } else if (!IS_ERR(pidfd_pid(file))) {
0544         err = check_setns_flags(flags);
0545     } else {
0546         err = -EINVAL;
0547     }
0548     if (err)
0549         goto out;
0550 
0551     err = prepare_nsset(flags, &nsset);
0552     if (err)
0553         goto out;
0554 
0555     if (proc_ns_file(file))
0556         err = validate_ns(&nsset, ns);
0557     else
0558         err = validate_nsset(&nsset, file->private_data);
0559     if (!err) {
0560         commit_nsset(&nsset);
0561         perf_event_namespaces(current);
0562     }
0563     put_nsset(&nsset);
0564 out:
0565     fput(file);
0566     return err;
0567 }
0568 
0569 int __init nsproxy_cache_init(void)
0570 {
0571     nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
0572     return 0;
0573 }