0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012 #include <linux/pid.h>
0013 #include <linux/pid_namespace.h>
0014 #include <linux/user_namespace.h>
0015 #include <linux/syscalls.h>
0016 #include <linux/cred.h>
0017 #include <linux/err.h>
0018 #include <linux/acct.h>
0019 #include <linux/slab.h>
0020 #include <linux/proc_ns.h>
0021 #include <linux/reboot.h>
0022 #include <linux/export.h>
0023 #include <linux/sched/task.h>
0024 #include <linux/sched/signal.h>
0025 #include <linux/idr.h>
0026
/* Serializes the lazy creation of pid_cache[] entries in create_pid_cachep(). */
static DEFINE_MUTEX(pid_caches_mutex);
/* Slab cache for struct pid_namespace objects; created in pid_namespaces_init(). */
static struct kmem_cache *pid_ns_cachep;

/*
 * Slab caches for struct pid, one per namespace nesting level.
 * create_pid_cachep() stores the cache for level N in slot N - 1.
 */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
0031
0032
0033
0034
0035
0036
0037 static struct kmem_cache *create_pid_cachep(unsigned int level)
0038 {
0039
0040 struct kmem_cache **pkc = &pid_cache[level - 1];
0041 struct kmem_cache *kc;
0042 char name[4 + 10 + 1];
0043 unsigned int len;
0044
0045 kc = READ_ONCE(*pkc);
0046 if (kc)
0047 return kc;
0048
0049 snprintf(name, sizeof(name), "pid_%u", level + 1);
0050 len = sizeof(struct pid) + level * sizeof(struct upid);
0051 mutex_lock(&pid_caches_mutex);
0052
0053 if (!*pkc)
0054 *pkc = kmem_cache_create(name, len, 0,
0055 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
0056 mutex_unlock(&pid_caches_mutex);
0057
0058 return READ_ONCE(*pkc);
0059 }
0060
0061 static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
0062 {
0063 return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
0064 }
0065
/*
 * Drop the UCOUNT_PID_NAMESPACES charge held in @ucounts; the
 * counterpart of inc_pid_namespaces().
 */
static void dec_pid_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
0070
0071 static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
0072 struct pid_namespace *parent_pid_ns)
0073 {
0074 struct pid_namespace *ns;
0075 unsigned int level = parent_pid_ns->level + 1;
0076 struct ucounts *ucounts;
0077 int err;
0078
0079 err = -EINVAL;
0080 if (!in_userns(parent_pid_ns->user_ns, user_ns))
0081 goto out;
0082
0083 err = -ENOSPC;
0084 if (level > MAX_PID_NS_LEVEL)
0085 goto out;
0086 ucounts = inc_pid_namespaces(user_ns);
0087 if (!ucounts)
0088 goto out;
0089
0090 err = -ENOMEM;
0091 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
0092 if (ns == NULL)
0093 goto out_dec;
0094
0095 idr_init(&ns->idr);
0096
0097 ns->pid_cachep = create_pid_cachep(level);
0098 if (ns->pid_cachep == NULL)
0099 goto out_free_idr;
0100
0101 err = ns_alloc_inum(&ns->ns);
0102 if (err)
0103 goto out_free_idr;
0104 ns->ns.ops = &pidns_operations;
0105
0106 refcount_set(&ns->ns.count, 1);
0107 ns->level = level;
0108 ns->parent = get_pid_ns(parent_pid_ns);
0109 ns->user_ns = get_user_ns(user_ns);
0110 ns->ucounts = ucounts;
0111 ns->pid_allocated = PIDNS_ADDING;
0112
0113 return ns;
0114
0115 out_free_idr:
0116 idr_destroy(&ns->idr);
0117 kmem_cache_free(pid_ns_cachep, ns);
0118 out_dec:
0119 dec_pid_namespaces(ucounts);
0120 out:
0121 return ERR_PTR(err);
0122 }
0123
0124 static void delayed_free_pidns(struct rcu_head *p)
0125 {
0126 struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
0127
0128 dec_pid_namespaces(ns->ucounts);
0129 put_user_ns(ns->user_ns);
0130
0131 kmem_cache_free(pid_ns_cachep, ns);
0132 }
0133
0134 static void destroy_pid_namespace(struct pid_namespace *ns)
0135 {
0136 ns_free_inum(&ns->ns);
0137
0138 idr_destroy(&ns->idr);
0139 call_rcu(&ns->rcu, delayed_free_pidns);
0140 }
0141
0142 struct pid_namespace *copy_pid_ns(unsigned long flags,
0143 struct user_namespace *user_ns, struct pid_namespace *old_ns)
0144 {
0145 if (!(flags & CLONE_NEWPID))
0146 return get_pid_ns(old_ns);
0147 if (task_active_pid_ns(current) != old_ns)
0148 return ERR_PTR(-EINVAL);
0149 return create_pid_namespace(user_ns, old_ns);
0150 }
0151
0152 void put_pid_ns(struct pid_namespace *ns)
0153 {
0154 struct pid_namespace *parent;
0155
0156 while (ns != &init_pid_ns) {
0157 parent = ns->parent;
0158 if (!refcount_dec_and_test(&ns->ns.count))
0159 break;
0160 destroy_pid_namespace(ns);
0161 ns = parent;
0162 }
0163 }
0164 EXPORT_SYMBOL_GPL(put_pid_ns);
0165
/*
 * Kill the other tasks that have a pid in @pid_ns and wait until the
 * namespace has drained. Runs in the context of the calling task.
 *
 * NOTE(review): presumably invoked while the namespace's init (child
 * reaper) is exiting -- confirm against the caller.
 */
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
	int nr;
	int rc;
	struct task_struct *task, *me = current;
	/*
	 * Value pid_allocated must reach before the final wait loop ends:
	 * 1 when the caller is a thread-group leader, 2 otherwise.
	 */
	int init_pids = thread_group_leader(me) ? 1 : 2;
	struct pid *pid;

	/* Judging by the helper's name, stops further pid allocation in this namespace. */
	disable_pid_allocation(pid_ns);

	/*
	 * Set the caller's SIGCHLD disposition to SIG_IGN, under siglock.
	 * NOTE(review): presumably so that exiting children are reaped
	 * automatically instead of lingering as zombies -- confirm.
	 */
	spin_lock_irq(&me->sighand->siglock);
	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
	spin_unlock_irq(&me->sighand->siglock);

	/*
	 * Walk the namespace's idr starting at id 2 and send SIGKILL to
	 * every task found that does not already have a fatal signal
	 * pending. The walk is done under rcu_read_lock() and a read lock
	 * on tasklist_lock.
	 * NOTE(review): starting at 2 presumably skips pid 1, i.e. the
	 * namespace init itself -- confirm.
	 */
	rcu_read_lock();
	read_lock(&tasklist_lock);
	nr = 2;
	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
		task = pid_task(pid, PIDTYPE_PID);
		if (task && !__fatal_signal_pending(task))
			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
	}
	read_unlock(&tasklist_lock);
	rcu_read_unlock();

	/*
	 * Keep waiting on any child until kernel_wait4() reports -ECHILD.
	 * TIF_SIGPENDING is cleared before every attempt.
	 * NOTE(review): presumably so a pending signal cannot make the
	 * wait bail out early on each iteration -- confirm.
	 */
	do {
		clear_thread_flag(TIF_SIGPENDING);
		rc = kernel_wait4(-1, NULL, __WALL, NULL);
	} while (rc != -ECHILD);

	/*
	 * Sleep (TASK_INTERRUPTIBLE) until pid_allocated has dropped to
	 * init_pids. The task state is set before the condition is tested,
	 * and schedule() is only called when the condition is not yet met.
	 * NOTE(review): whoever decrements pid_allocated is expected to
	 * wake this task; that code is not visible here.
	 */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (pid_ns->pid_allocated == init_pids)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/* If reboot_pid_ns() recorded a signal, use it as the group exit code. */
	if (pid_ns->reboot)
		current->signal->group_exit_code = pid_ns->reboot;

	/* Helper defined elsewhere; presumably ends process accounting for the namespace. */
	acct_exit_ns(pid_ns);
	return;
}
0257
0258 #ifdef CONFIG_CHECKPOINT_RESTORE
0259 static int pid_ns_ctl_handler(struct ctl_table *table, int write,
0260 void *buffer, size_t *lenp, loff_t *ppos)
0261 {
0262 struct pid_namespace *pid_ns = task_active_pid_ns(current);
0263 struct ctl_table tmp = *table;
0264 int ret, next;
0265
0266 if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
0267 return -EPERM;
0268
0269
0270
0271
0272
0273
0274
0275 next = idr_get_cursor(&pid_ns->idr) - 1;
0276
0277 tmp.data = &next;
0278 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
0279 if (!ret && write)
0280 idr_set_cursor(&pid_ns->idr, next + 1);
0281
0282 return ret;
0283 }
0284
/* Defined elsewhere; used below as the upper bound for ns_last_pid. */
extern int pid_max;
/*
 * sysctl table providing "ns_last_pid", registered under kern_path in
 * pid_namespaces_init().
 */
static struct ctl_table pid_ns_ctl_table[] = {
	{
		.procname = "ns_last_pid",
		.maxlen = sizeof(int),
		/* World-writable mode; pid_ns_ctl_handler() does the capability check on write. */
		.mode = 0666,
		.proc_handler = pid_ns_ctl_handler,
		/* Bounds handed to proc_dointvec_minmax(): 0 .. pid_max. */
		.extra1 = SYSCTL_ZERO,
		.extra2 = &pid_max,
	},
	{ }	/* empty terminating entry */
};
/* Directory path ("kernel") under which the table above is registered. */
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
0298 #endif
0299
0300 int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
0301 {
0302 if (pid_ns == &init_pid_ns)
0303 return 0;
0304
0305 switch (cmd) {
0306 case LINUX_REBOOT_CMD_RESTART2:
0307 case LINUX_REBOOT_CMD_RESTART:
0308 pid_ns->reboot = SIGHUP;
0309 break;
0310
0311 case LINUX_REBOOT_CMD_POWER_OFF:
0312 case LINUX_REBOOT_CMD_HALT:
0313 pid_ns->reboot = SIGINT;
0314 break;
0315 default:
0316 return -EINVAL;
0317 }
0318
0319 read_lock(&tasklist_lock);
0320 send_sig(SIGKILL, pid_ns->child_reaper, 1);
0321 read_unlock(&tasklist_lock);
0322
0323 do_exit(0);
0324
0325
0326 return 0;
0327 }
0328
0329 static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
0330 {
0331 return container_of(ns, struct pid_namespace, ns);
0332 }
0333
0334 static struct ns_common *pidns_get(struct task_struct *task)
0335 {
0336 struct pid_namespace *ns;
0337
0338 rcu_read_lock();
0339 ns = task_active_pid_ns(task);
0340 if (ns)
0341 get_pid_ns(ns);
0342 rcu_read_unlock();
0343
0344 return ns ? &ns->ns : NULL;
0345 }
0346
0347 static struct ns_common *pidns_for_children_get(struct task_struct *task)
0348 {
0349 struct pid_namespace *ns = NULL;
0350
0351 task_lock(task);
0352 if (task->nsproxy) {
0353 ns = task->nsproxy->pid_ns_for_children;
0354 get_pid_ns(ns);
0355 }
0356 task_unlock(task);
0357
0358 if (ns) {
0359 read_lock(&tasklist_lock);
0360 if (!ns->child_reaper) {
0361 put_pid_ns(ns);
0362 ns = NULL;
0363 }
0364 read_unlock(&tasklist_lock);
0365 }
0366
0367 return ns ? &ns->ns : NULL;
0368 }
0369
/* proc_ns "put" operation: drop one reference on the namespace embedding @ns. */
static void pidns_put(struct ns_common *ns)
{
	struct pid_namespace *pid_ns = to_pid_ns(ns);

	put_pid_ns(pid_ns);
}
0374
0375 static int pidns_install(struct nsset *nsset, struct ns_common *ns)
0376 {
0377 struct nsproxy *nsproxy = nsset->nsproxy;
0378 struct pid_namespace *active = task_active_pid_ns(current);
0379 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
0380
0381 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
0382 !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
0383 return -EPERM;
0384
0385
0386
0387
0388
0389
0390
0391
0392
0393 if (new->level < active->level)
0394 return -EINVAL;
0395
0396 ancestor = new;
0397 while (ancestor->level > active->level)
0398 ancestor = ancestor->parent;
0399 if (ancestor != active)
0400 return -EINVAL;
0401
0402 put_pid_ns(nsproxy->pid_ns_for_children);
0403 nsproxy->pid_ns_for_children = get_pid_ns(new);
0404 return 0;
0405 }
0406
0407 static struct ns_common *pidns_get_parent(struct ns_common *ns)
0408 {
0409 struct pid_namespace *active = task_active_pid_ns(current);
0410 struct pid_namespace *pid_ns, *p;
0411
0412
0413 pid_ns = p = to_pid_ns(ns)->parent;
0414 for (;;) {
0415 if (!p)
0416 return ERR_PTR(-EPERM);
0417 if (p == active)
0418 break;
0419 p = p->parent;
0420 }
0421
0422 return &get_pid_ns(pid_ns)->ns;
0423 }
0424
0425 static struct user_namespace *pidns_owner(struct ns_common *ns)
0426 {
0427 return to_pid_ns(ns)->user_ns;
0428 }
0429
/*
 * Namespace operations for the "pid" namespace entry. The .get hook
 * resolves a task's active PID namespace (pidns_get()).
 * create_pid_namespace() installs this table as ns.ops of every
 * namespace it creates.
 */
const struct proc_ns_operations pidns_operations = {
	.name = "pid",
	.type = CLONE_NEWPID,
	.get = pidns_get,
	.put = pidns_put,
	.install = pidns_install,
	.owner = pidns_owner,
	.get_parent = pidns_get_parent,
};
0439
/*
 * Namespace operations for the "pid_for_children" entry. Identical to
 * pidns_operations except for the name, the added .real_ns_name of
 * "pid", and the .get hook, which resolves the task's
 * nsproxy->pid_ns_for_children (pidns_for_children_get()).
 */
const struct proc_ns_operations pidns_for_children_operations = {
	.name = "pid_for_children",
	.real_ns_name = "pid",
	.type = CLONE_NEWPID,
	.get = pidns_for_children_get,
	.put = pidns_put,
	.install = pidns_install,
	.owner = pidns_owner,
	.get_parent = pidns_get_parent,
};
0450
/*
 * Boot-time initialisation: create the slab cache for struct
 * pid_namespace and, when CONFIG_CHECKPOINT_RESTORE is enabled,
 * register the ns_last_pid sysctl under "kernel". Always returns 0.
 */
static __init int pid_namespaces_init(void)
{
	/* The result is not checked; SLAB_PANIC makes cache-creation failure fatal. */
	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);

#ifdef CONFIG_CHECKPOINT_RESTORE
	register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
	return 0;
}

/* Run pid_namespaces_init() as an initcall. */
__initcall(pid_namespaces_init);