#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

struct pid init_struct_pid = {
        .count = REFCOUNT_INIT(1),
        .tasks = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .level = 0,
        .numbers = { {
                .nr = 0,
                .ns = &init_pid_ns,
        }, }
};

int pid_max = PID_MAX_DEFAULT;

#define RESERVED_PIDS 300

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

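/*
 * init_pid_ns is the PID namespace of the initial task and the root of the
 * PID namespace hierarchy: every other PID namespace is a descendant of it.
 */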
struct pid_namespace init_pid_ns = {
        .ns.count = REFCOUNT_INIT(2),
        .idr = IDR_INIT(init_pid_ns.idr),
        .pid_allocated = PIDNS_ADDING,
        .level = 0,
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

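/*
 * Interrupts must be disabled while pidmap_lock is held: free_pid() can be
 * reached from contexts that already hold write_lock_irq(&tasklist_lock),
 * and an interrupt handler taking read_lock(&tasklist_lock) on a CPU that
 * is spinning on pidmap_lock could otherwise deadlock. Hence the *_irq and
 * *_irqsave lock variants used throughout this file.
 */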
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

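/*
 * put_pid() - drop a reference on a struct pid, freeing it (and dropping the
 * reference it holds on its namespace) once the last reference is gone.
 */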
void put_pid(struct pid *pid)
{
        struct pid_namespace *ns;

        if (!pid)
                return;

        ns = pid->numbers[pid->level].ns;
        if (refcount_dec_and_test(&pid->count)) {
                kmem_cache_free(ns->pid_cachep, pid);
                put_pid_ns(ns);
        }
}
EXPORT_SYMBOL_GPL(put_pid);

static void delayed_put_pid(struct rcu_head *rhp)
{
        struct pid *pid = container_of(rhp, struct pid, rcu);
        put_pid(pid);
}

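/*
 * free_pid() - remove @pid from the IDR of every namespace it belongs to and
 * drop the allocation reference after an RCU grace period, since lockless
 * lookups (find_pid_ns() and friends) may still be walking the old entry.
 * Callers may hold write_lock_irq(&tasklist_lock).
 */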
void free_pid(struct pid *pid)
{
        int i;
        unsigned long flags;

        spin_lock_irqsave(&pidmap_lock, flags);
        for (i = 0; i <= pid->level; i++) {
                struct upid *upid = pid->numbers + i;
                struct pid_namespace *ns = upid->ns;
                switch (--ns->pid_allocated) {
                case 2:
                case 1:
                        /*
                         * When all that is left in the pid namespace
                         * is the reaper, wake it up: it may be sleeping
                         * in zap_pid_ns_processes().
                         */
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* Handle a fork failure of the first process */
                        WARN_ON(ns->child_reaper);
                        ns->pid_allocated = 0;
                        break;
                }

                idr_remove(&ns->idr, upid->nr);
        }
        spin_unlock_irqrestore(&pidmap_lock, flags);

        call_rcu(&pid->rcu, delayed_put_pid);
}

struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

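        /*
         * set_tid_size is the number of entries in the set_tid[] array of
         * requested PIDs, ordered from the most nested active PID namespace
         * outwards. A caller does not have to supply a PID for every level,
         * but it can never ask for more levels than ns->level + 1.
         */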
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        for (i = ns->level; i >= 0; i--) {
                int tid = 0;

                if (set_tid_size) {
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists in that namespace yet.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock_irq(&pidmap_lock);

                if (tid) {
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS.
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a NULL pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock_irq(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice, especially when the pid
         * namespace's child reaper has already exited and no new processes
         * may be created, but it is the error that has long been exposed to
         * userspace and documented for PID namespaces, so keep it.
         */
        retval = -ENOMEM;

        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock_irq(&pidmap_lock);

        return pid;

out_unlock:
        spin_unlock_irq(&pidmap_lock);
        put_pid_ns(ns);

out_free:
        spin_lock_irq(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock_irq(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock_irq(&pidmap_lock);
        ns->pid_allocated &= ~PIDNS_ADDING;
        spin_unlock_irq(&pidmap_lock);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
        return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        return (type == PIDTYPE_PID) ?
                &task->thread_pid :
                &task->signal->pids[type];
}

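/*
 * attach_pid() must be called with the tasklist_lock write-held.
 */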
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *pid = *task_pid_ptr(task, type);
        hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}

static void __change_pid(struct task_struct *task, enum pid_type type,
                         struct pid *new)
{
        struct pid **pid_ptr = task_pid_ptr(task, type);
        struct pid *pid;
        int tmp;

        pid = *pid_ptr;

        hlist_del_rcu(&task->pid_links[type]);
        *pid_ptr = new;

        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
                if (pid_has_task(pid, tmp))
                        return;

        free_pid(pid);
}

void detach_pid(struct task_struct *task, enum pid_type type)
{
        __change_pid(task, type, NULL);
}

void change_pid(struct task_struct *task, enum pid_type type,
                struct pid *pid)
{
        __change_pid(task, type, pid);
        attach_pid(task, type);
}

void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *pid1 = left->thread_pid;
        struct pid *pid2 = right->thread_pid;
        struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
        struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];

        /* Swap the single entry tid lists */
        hlists_swap_heads_rcu(head1, head2);

        /* Swap the per task_struct pid */
        rcu_assign_pointer(left->thread_pid, pid2);
        rcu_assign_pointer(right->thread_pid, pid1);

        /* Swap the cached value */
        WRITE_ONCE(left->pid, pid_nr(pid2));
        WRITE_ONCE(right->pid, pid_nr(pid1));
}

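/*
 * transfer_pid is an optimization of attach_pid(new), detach_pid(old).
 */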
void transfer_pid(struct task_struct *old, struct task_struct *new,
                  enum pid_type type)
{
        if (type == PIDTYPE_PID)
                new->thread_pid = old->thread_pid;
        hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}

struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result = NULL;
        if (pid) {
                struct hlist_node *first;
                first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                              lockdep_tasklist_lock_is_held());
                if (first)
                        result = hlist_entry(first, struct task_struct, pid_links[(type)]);
        }
        return result;
}
EXPORT_SYMBOL(pid_task);

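/*
 * Must be called under rcu_read_lock().
 */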
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct task_struct *find_task_by_vpid(pid_t vnr)
{
        return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *task;

        rcu_read_lock();
        task = find_task_by_vpid(nr);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        return task;
}

struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *pid;
        rcu_read_lock();
        pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
        rcu_read_unlock();
        return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result;
        rcu_read_lock();
        result = pid_task(pid, type);
        if (result)
                get_task_struct(result);
        rcu_read_unlock();
        return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

struct pid *find_get_pid(pid_t nr)
{
        struct pid *pid;

        rcu_read_lock();
        pid = get_pid(find_vpid(nr));
        rcu_read_unlock();

        return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *upid;
        pid_t nr = 0;

        if (pid && ns->level <= pid->level) {
                upid = &pid->numbers[ns->level];
                if (upid->ns == ns)
                        nr = upid->nr;
        }
        return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

pid_t pid_vnr(struct pid *pid)
{
        return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                       struct pid_namespace *ns)
{
        pid_t nr = 0;

        rcu_read_lock();
        if (!ns)
                ns = task_active_pid_ns(current);
        nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

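/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */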
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        return idr_get_next(&ns->idr, &nr);
}

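/*
 * pidfd_get_pid() - look up the struct pid behind a pidfd, take a reference
 * on it and report the pidfd's file flags through @flags. The caller is
 * responsible for dropping the reference with put_pid().
 */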
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        struct fd f;
        struct pid *pid;

        f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(f.file);
        if (!IS_ERR(pid)) {
                get_pid(pid);
                *flags = f.file->f_flags;
        }

        fdput(f);
        return pid;
}

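/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: flags associated with this pidfd
 *
 * Return the thread-group leader task associated with @pidfd. The function
 * takes a reference on the returned task. The caller is responsible for
 * releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *         On error, a negative errno number will be returned.
 */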
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
        unsigned int f_flags;
        struct pid *pid;
        struct task_struct *task;

        pid = pidfd_get_pid(pidfd, &f_flags);
        if (IS_ERR(pid))
                return ERR_CAST(pid);

        task = get_pid_task(pid, PIDTYPE_TGID);
        put_pid(pid);
        if (!task)
                return ERR_PTR(-ESRCH);

        *flags = f_flags;
        return task;
}

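/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 * The pid must refer to a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */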
int pidfd_create(struct pid *pid, unsigned int flags)
{
        int fd;

        if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
                return -EINVAL;

        if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
                return -EINVAL;

        fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
                              flags | O_RDWR | O_CLOEXEC);
        if (fd < 0)
                put_pid(pid);

        return fd;
}

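/**
 * pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the thread-group leader identified by @pid in the caller's pid namespace.
 * The only flag currently accepted is PIDFD_NONBLOCK.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */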
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        int fd;
        struct pid *p;

        if (flags & ~PIDFD_NONBLOCK)
                return -EINVAL;

        if (pid <= 0)
                return -EINVAL;

        p = find_get_pid(pid);
        if (!p)
                return -ESRCH;

        fd = pidfd_create(p, flags);

        put_pid(p);
        return fd;
}

void __init pid_idr_init(void)
{
        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* bump default and minimum pid_max based on number of cpus */
        pid_max = min(pid_max_max, max_t(int, pid_max,
                      PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
        pid_max_min = max_t(int, pid_max_min,
                            PIDS_PER_CPU_MIN * num_possible_cpus());
        pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        init_pid_ns.pid_cachep = KMEM_CACHE(pid,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}

static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *file;
        int ret;

        ret = down_read_killable(&task->signal->exec_update_lock);
        if (ret)
                return ERR_PTR(ret);

        if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                file = fget_task(task, fd);
        else
                file = ERR_PTR(-EPERM);

        up_read(&task->signal->exec_update_lock);

        return file ?: ERR_PTR(-EBADF);
}

static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *task;
        struct file *file;
        int ret;

        task = get_pid_task(pid, PIDTYPE_PID);
        if (!task)
                return -ESRCH;

        file = __pidfd_fget(task, fd);
        put_task_struct(task);
        if (IS_ERR(file))
                return PTR_ERR(file);

        ret = receive_fd(file, O_CLOEXEC);
        fput(file);

        return ret;
}

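/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd: the pidfd file descriptor of the process
 * @fd:    the file descriptor number to get
 * @flags: flags on how to get the fd (reserved)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned.
 */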
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct pid *pid;
        struct fd f;
        int ret;

        /* flags is currently unused - make sure it's unset */
        if (flags)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        pid = pidfd_pid(f.file);
        if (IS_ERR(pid))
                ret = PTR_ERR(pid);
        else
                ret = pidfd_getfd(pid, fd);

        fdput(f);
        return ret;
}