// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a real pain. See 'mm/memory.c': 'copy_page_range()'
 */

0015 #include <linux/anon_inodes.h>
0016 #include <linux/slab.h>
0017 #include <linux/sched/autogroup.h>
0018 #include <linux/sched/mm.h>
0019 #include <linux/sched/coredump.h>
0020 #include <linux/sched/user.h>
0021 #include <linux/sched/numa_balancing.h>
0022 #include <linux/sched/stat.h>
0023 #include <linux/sched/task.h>
0024 #include <linux/sched/task_stack.h>
0025 #include <linux/sched/cputime.h>
0026 #include <linux/seq_file.h>
0027 #include <linux/rtmutex.h>
0028 #include <linux/init.h>
0029 #include <linux/unistd.h>
0030 #include <linux/module.h>
0031 #include <linux/vmalloc.h>
0032 #include <linux/completion.h>
0033 #include <linux/personality.h>
0034 #include <linux/mempolicy.h>
0035 #include <linux/sem.h>
0036 #include <linux/file.h>
0037 #include <linux/fdtable.h>
0038 #include <linux/iocontext.h>
0039 #include <linux/key.h>
0040 #include <linux/binfmts.h>
0041 #include <linux/mman.h>
0042 #include <linux/mmu_notifier.h>
0043 #include <linux/fs.h>
0044 #include <linux/mm.h>
0045 #include <linux/mm_inline.h>
0046 #include <linux/vmacache.h>
0047 #include <linux/nsproxy.h>
0048 #include <linux/capability.h>
0049 #include <linux/cpu.h>
0050 #include <linux/cgroup.h>
0051 #include <linux/security.h>
0052 #include <linux/hugetlb.h>
0053 #include <linux/seccomp.h>
0054 #include <linux/swap.h>
0055 #include <linux/syscalls.h>
0056 #include <linux/jiffies.h>
0057 #include <linux/futex.h>
0058 #include <linux/compat.h>
0059 #include <linux/kthread.h>
0060 #include <linux/task_io_accounting_ops.h>
0061 #include <linux/rcupdate.h>
0062 #include <linux/ptrace.h>
0063 #include <linux/mount.h>
0064 #include <linux/audit.h>
0065 #include <linux/memcontrol.h>
0066 #include <linux/ftrace.h>
0067 #include <linux/proc_fs.h>
0068 #include <linux/profile.h>
0069 #include <linux/rmap.h>
0070 #include <linux/ksm.h>
0071 #include <linux/acct.h>
0072 #include <linux/userfaultfd_k.h>
0073 #include <linux/tsacct_kern.h>
0074 #include <linux/cn_proc.h>
0075 #include <linux/freezer.h>
0076 #include <linux/delayacct.h>
0077 #include <linux/taskstats_kern.h>
0078 #include <linux/random.h>
0079 #include <linux/tty.h>
0080 #include <linux/fs_struct.h>
0081 #include <linux/magic.h>
0082 #include <linux/perf_event.h>
0083 #include <linux/posix-timers.h>
0084 #include <linux/user-return-notifier.h>
0085 #include <linux/oom.h>
0086 #include <linux/khugepaged.h>
0087 #include <linux/signalfd.h>
0088 #include <linux/uprobes.h>
0089 #include <linux/aio.h>
0090 #include <linux/compiler.h>
0091 #include <linux/sysctl.h>
0092 #include <linux/kcov.h>
0093 #include <linux/livepatch.h>
0094 #include <linux/thread_info.h>
0095 #include <linux/stackleak.h>
0096 #include <linux/kasan.h>
0097 #include <linux/scs.h>
0098 #include <linux/io_uring.h>
0099 #include <linux/bpf.h>
0100 #include <linux/sched/mm.h>
0101
0102 #include <asm/pgalloc.h>
0103 #include <linux/uaccess.h>
0104 #include <asm/mmu_context.h>
0105 #include <asm/cacheflush.h>
0106 #include <asm/tlbflush.h>
0107
0108 #include <trace/events/sched.h>
0109
0110 #define CREATE_TRACE_POINTS
0111 #include <trace/events/task.h>
0112
/*
 * Minimum number of threads to boot the kernel
 */
0116 #define MIN_THREADS 20
0117
/*
 * Maximum number of threads
 */
0121 #define MAX_THREADS FUTEX_TID_MASK
0122
/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
0126 unsigned long total_forks;
0127 int nr_threads;
0128
0129 static int max_threads;
0130
0131 #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
0132
0133 static const char * const resident_page_types[] = {
0134 NAMED_ARRAY_INDEX(MM_FILEPAGES),
0135 NAMED_ARRAY_INDEX(MM_ANONPAGES),
0136 NAMED_ARRAY_INDEX(MM_SWAPENTS),
0137 NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
0138 };
0139
0140 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
0141
0142 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);
0143
0144 #ifdef CONFIG_PROVE_RCU
0145 int lockdep_tasklist_lock_is_held(void)
0146 {
0147 return lockdep_is_held(&tasklist_lock);
0148 }
0149 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
0150 #endif
0151
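/*
 * Approximate number of processes in the system: sums the per-CPU
 * process_counts, which are only incremented for thread group leaders.
 */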
0152 int nr_processes(void)
0153 {
0154 int cpu;
0155 int total = 0;
0156
0157 for_each_possible_cpu(cpu)
0158 total += per_cpu(process_counts, cpu);
0159
0160 return total;
0161 }
0162
0163 void __weak arch_release_task_struct(struct task_struct *tsk)
0164 {
0165 }
0166
0167 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
0168 static struct kmem_cache *task_struct_cachep;
0169
0170 static inline struct task_struct *alloc_task_struct_node(int node)
0171 {
0172 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
0173 }
0174
0175 static inline void free_task_struct(struct task_struct *tsk)
0176 {
0177 kmem_cache_free(task_struct_cachep, tsk);
0178 }
0179 #endif
0180
0181 #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
0182
0183
0184
0185
0186
0187 # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
0188
0189 # ifdef CONFIG_VMAP_STACK
0190
0191
0192
0193
0194 #define NR_CACHED_STACKS 2
0195 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
0196
0197 struct vm_stack {
0198 struct rcu_head rcu;
0199 struct vm_struct *stack_vm_area;
0200 };
0201
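/*
 * Try to park a no-longer-used vmap'ed stack in the per-CPU cache so it
 * can be reused by a later fork; returns true if a free cache slot was found.
 */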
0202 static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
0203 {
0204 unsigned int i;
0205
0206 for (i = 0; i < NR_CACHED_STACKS; i++) {
0207 if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
0208 continue;
0209 return true;
0210 }
0211 return false;
0212 }
0213
0214 static void thread_stack_free_rcu(struct rcu_head *rh)
0215 {
0216 struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
0217
0218 if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
0219 return;
0220
0221 vfree(vm_stack);
0222 }
0223
0224 static void thread_stack_delayed_free(struct task_struct *tsk)
0225 {
0226 struct vm_stack *vm_stack = tsk->stack;
0227
0228 vm_stack->stack_vm_area = tsk->stack_vm_area;
0229 call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
0230 }
0231
0232 static int free_vm_stack_cache(unsigned int cpu)
0233 {
0234 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
0235 int i;
0236
0237 for (i = 0; i < NR_CACHED_STACKS; i++) {
0238 struct vm_struct *vm_stack = cached_vm_stacks[i];
0239
0240 if (!vm_stack)
0241 continue;
0242
0243 vfree(vm_stack->addr);
0244 cached_vm_stacks[i] = NULL;
0245 }
0246
0247 return 0;
0248 }
0249
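/* Charge every page of a vmap'ed stack to the allocating task's memory cgroup. */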
0250 static int memcg_charge_kernel_stack(struct vm_struct *vm)
0251 {
0252 int i;
0253 int ret;
0254
0255 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
0256 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
0257
0258 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
0259 ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
0260 if (ret)
0261 goto err;
0262 }
0263 return 0;
0264 err:
/*
 * If memcg_kmem_charge_page() failed for a page, that page's memory
 * cgroup pointer is NULL and memcg_kmem_uncharge_page() below will
 * simply ignore it, so uncharging every page here is safe.
 */
0270 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
0271 memcg_kmem_uncharge_page(vm->pages[i], 0);
0272 return ret;
0273 }
0274
0275 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0276 {
0277 struct vm_struct *vm;
0278 void *stack;
0279 int i;
0280
0281 for (i = 0; i < NR_CACHED_STACKS; i++) {
0282 struct vm_struct *s;
0283
0284 s = this_cpu_xchg(cached_stacks[i], NULL);
0285
0286 if (!s)
0287 continue;
0288
0289
0290 kasan_unpoison_range(s->addr, THREAD_SIZE);
0291
0292 stack = kasan_reset_tag(s->addr);
0293
0294
0295 memset(stack, 0, THREAD_SIZE);
0296
0297 if (memcg_charge_kernel_stack(s)) {
0298 vfree(s->addr);
0299 return -ENOMEM;
0300 }
0301
0302 tsk->stack_vm_area = s;
0303 tsk->stack = stack;
0304 return 0;
0305 }
0306
/*
 * No cached stack was available: allocate a fresh vmalloc'ed stack.
 * __GFP_ACCOUNT is masked out because cached stacks may later be reused
 * by tasks in a different memcg, so the stack pages are charged
 * explicitly via memcg_charge_kernel_stack() instead.
 */
0312 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
0313 VMALLOC_START, VMALLOC_END,
0314 THREADINFO_GFP & ~__GFP_ACCOUNT,
0315 PAGE_KERNEL,
0316 0, node, __builtin_return_address(0));
0317 if (!stack)
0318 return -ENOMEM;
0319
0320 vm = find_vm_area(stack);
0321 if (memcg_charge_kernel_stack(vm)) {
0322 vfree(stack);
0323 return -ENOMEM;
0324 }
0325
0326
0327
0328
0329
0330 tsk->stack_vm_area = vm;
0331 stack = kasan_reset_tag(stack);
0332 tsk->stack = stack;
0333 return 0;
0334 }
0335
0336 static void free_thread_stack(struct task_struct *tsk)
0337 {
0338 if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
0339 thread_stack_delayed_free(tsk);
0340
0341 tsk->stack = NULL;
0342 tsk->stack_vm_area = NULL;
0343 }
0344
0345 # else
0346
0347 static void thread_stack_free_rcu(struct rcu_head *rh)
0348 {
0349 __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
0350 }
0351
0352 static void thread_stack_delayed_free(struct task_struct *tsk)
0353 {
0354 struct rcu_head *rh = tsk->stack;
0355
0356 call_rcu(rh, thread_stack_free_rcu);
0357 }
0358
0359 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0360 {
0361 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
0362 THREAD_SIZE_ORDER);
0363
0364 if (likely(page)) {
0365 tsk->stack = kasan_reset_tag(page_address(page));
0366 return 0;
0367 }
0368 return -ENOMEM;
0369 }
0370
0371 static void free_thread_stack(struct task_struct *tsk)
0372 {
0373 thread_stack_delayed_free(tsk);
0374 tsk->stack = NULL;
0375 }
0376
0377 # endif
0378 # else
0379
0380 static struct kmem_cache *thread_stack_cache;
0381
0382 static void thread_stack_free_rcu(struct rcu_head *rh)
0383 {
0384 kmem_cache_free(thread_stack_cache, rh);
0385 }
0386
0387 static void thread_stack_delayed_free(struct task_struct *tsk)
0388 {
0389 struct rcu_head *rh = tsk->stack;
0390
0391 call_rcu(rh, thread_stack_free_rcu);
0392 }
0393
0394 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0395 {
0396 unsigned long *stack;
0397 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
0398 stack = kasan_reset_tag(stack);
0399 tsk->stack = stack;
0400 return stack ? 0 : -ENOMEM;
0401 }
0402
0403 static void free_thread_stack(struct task_struct *tsk)
0404 {
0405 thread_stack_delayed_free(tsk);
0406 tsk->stack = NULL;
0407 }
0408
0409 void thread_stack_cache_init(void)
0410 {
0411 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
0412 THREAD_SIZE, THREAD_SIZE, 0, 0,
0413 THREAD_SIZE, NULL);
0414 BUG_ON(thread_stack_cache == NULL);
0415 }
0416
0417 # endif
0418 #else
0419
0420 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0421 {
0422 unsigned long *stack;
0423
0424 stack = arch_alloc_thread_stack_node(tsk, node);
0425 tsk->stack = stack;
0426 return stack ? 0 : -ENOMEM;
0427 }
0428
0429 static void free_thread_stack(struct task_struct *tsk)
0430 {
0431 arch_free_thread_stack(tsk);
0432 tsk->stack = NULL;
0433 }
0434
0435 #endif
0436
0437
0438 static struct kmem_cache *signal_cachep;
0439
0440
0441 struct kmem_cache *sighand_cachep;
0442
0443
0444 struct kmem_cache *files_cachep;
0445
0446
0447 struct kmem_cache *fs_cachep;
0448
0449
0450 static struct kmem_cache *vm_area_cachep;
0451
0452
0453 static struct kmem_cache *mm_cachep;
0454
0455 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
0456 {
0457 struct vm_area_struct *vma;
0458
0459 vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
0460 if (vma)
0461 vma_init(vma, mm);
0462 return vma;
0463 }
0464
0465 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
0466 {
0467 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
0468
0469 if (new) {
0470 ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
0471 ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
0472
0473
0474
0475
0476 *new = data_race(*orig);
0477 INIT_LIST_HEAD(&new->anon_vma_chain);
0478 new->vm_next = new->vm_prev = NULL;
0479 dup_anon_vma_name(orig, new);
0480 }
0481 return new;
0482 }
0483
0484 void vm_area_free(struct vm_area_struct *vma)
0485 {
0486 free_anon_vma_name(vma);
0487 kmem_cache_free(vm_area_cachep, vma);
0488 }
0489
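/*
 * Account the kernel stack pages in the NR_KERNEL_STACK_KB vmstat counter;
 * @account is +1 when a stack is assigned to a task and -1 when it is released.
 */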
0490 static void account_kernel_stack(struct task_struct *tsk, int account)
0491 {
0492 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
0493 struct vm_struct *vm = task_stack_vm_area(tsk);
0494 int i;
0495
0496 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
0497 mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
0498 account * (PAGE_SIZE / 1024));
0499 } else {
0500 void *stack = task_stack_page(tsk);
0501
0502
0503 mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
0504 account * (THREAD_SIZE / 1024));
0505 }
0506 }
0507
0508 void exit_task_stack_account(struct task_struct *tsk)
0509 {
0510 account_kernel_stack(tsk, -1);
0511
0512 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
0513 struct vm_struct *vm;
0514 int i;
0515
0516 vm = task_stack_vm_area(tsk);
0517 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
0518 memcg_kmem_uncharge_page(vm->pages[i], 0);
0519 }
0520 }
0521
0522 static void release_task_stack(struct task_struct *tsk)
0523 {
0524 if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
0525 return;
0526
0527 free_thread_stack(tsk);
0528 }
0529
0530 #ifdef CONFIG_THREAD_INFO_IN_TASK
0531 void put_task_stack(struct task_struct *tsk)
0532 {
0533 if (refcount_dec_and_test(&tsk->stack_refcount))
0534 release_task_stack(tsk);
0535 }
0536 #endif
0537
0538 void free_task(struct task_struct *tsk)
0539 {
0540 release_user_cpus_ptr(tsk);
0541 scs_release(tsk);
0542
0543 #ifndef CONFIG_THREAD_INFO_IN_TASK
/*
 * The task is finally done with both the stack and thread_info,
 * so free both.
 */
0548 release_task_stack(tsk);
0549 #else
/*
 * If the task had a separate stack allocation, it should be gone
 * by now.
 */
0554 WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
0555 #endif
0556 rt_mutex_debug_task_free(tsk);
0557 ftrace_graph_exit_task(tsk);
0558 arch_release_task_struct(tsk);
0559 if (tsk->flags & PF_KTHREAD)
0560 free_kthread_struct(tsk);
0561 free_task_struct(tsk);
0562 }
0563 EXPORT_SYMBOL(free_task);
0564
0565 static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
0566 {
0567 struct file *exe_file;
0568
0569 exe_file = get_mm_exe_file(oldmm);
0570 RCU_INIT_POINTER(mm->exe_file, exe_file);
0571
/*
 * We depend on the oldmm having properly denied write access to the
 * exe_file already.
 */
0575 if (exe_file && deny_write_access(exe_file))
0576 pr_warn_once("deny_write_access() failed in %s\n", __func__);
0577 }
0578
0579 #ifdef CONFIG_MMU
0580 static __latent_entropy int dup_mmap(struct mm_struct *mm,
0581 struct mm_struct *oldmm)
0582 {
0583 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
0584 struct rb_node **rb_link, *rb_parent;
0585 int retval;
0586 unsigned long charge;
0587 LIST_HEAD(uf);
0588
0589 uprobe_start_dup_mmap();
0590 if (mmap_write_lock_killable(oldmm)) {
0591 retval = -EINTR;
0592 goto fail_uprobe_end;
0593 }
0594 flush_cache_dup_mm(oldmm);
0595 uprobe_dup_mmap(oldmm, mm);
/*
 * Not linked in yet - no deadlock potential:
 */
0599 mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
0600
0601
0602 dup_mm_exe_file(mm, oldmm);
0603
0604 mm->total_vm = oldmm->total_vm;
0605 mm->data_vm = oldmm->data_vm;
0606 mm->exec_vm = oldmm->exec_vm;
0607 mm->stack_vm = oldmm->stack_vm;
0608
0609 rb_link = &mm->mm_rb.rb_node;
0610 rb_parent = NULL;
0611 pprev = &mm->mmap;
0612 retval = ksm_fork(mm, oldmm);
0613 if (retval)
0614 goto out;
0615 khugepaged_fork(mm, oldmm);
0616
0617 prev = NULL;
0618 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
0619 struct file *file;
0620
0621 if (mpnt->vm_flags & VM_DONTCOPY) {
0622 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
0623 continue;
0624 }
0625 charge = 0;
0626
/*
 * Don't duplicate many vmas if we've been oom-killed (for
 * example)
 */
0630 if (fatal_signal_pending(current)) {
0631 retval = -EINTR;
0632 goto out;
0633 }
0634 if (mpnt->vm_flags & VM_ACCOUNT) {
0635 unsigned long len = vma_pages(mpnt);
0636
0637 if (security_vm_enough_memory_mm(oldmm, len))
0638 goto fail_nomem;
0639 charge = len;
0640 }
0641 tmp = vm_area_dup(mpnt);
0642 if (!tmp)
0643 goto fail_nomem;
0644 retval = vma_dup_policy(mpnt, tmp);
0645 if (retval)
0646 goto fail_nomem_policy;
0647 tmp->vm_mm = mm;
0648 retval = dup_userfaultfd(tmp, &uf);
0649 if (retval)
0650 goto fail_nomem_anon_vma_fork;
0651 if (tmp->vm_flags & VM_WIPEONFORK) {
/*
 * VM_WIPEONFORK gets a clean slate in the child.
 * Don't prepare anon_vma until fault since we don't
 * copy page for current vma.
 */
0657 tmp->anon_vma = NULL;
0658 } else if (anon_vma_fork(tmp, mpnt))
0659 goto fail_nomem_anon_vma_fork;
0660 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
0661 file = tmp->vm_file;
0662 if (file) {
0663 struct address_space *mapping = file->f_mapping;
0664
0665 get_file(file);
0666 i_mmap_lock_write(mapping);
0667 if (tmp->vm_flags & VM_SHARED)
0668 mapping_allow_writable(mapping);
0669 flush_dcache_mmap_lock(mapping);
0670
0671 vma_interval_tree_insert_after(tmp, mpnt,
0672 &mapping->i_mmap);
0673 flush_dcache_mmap_unlock(mapping);
0674 i_mmap_unlock_write(mapping);
0675 }
0676
/*
 * Clear hugetlb-related page reserves for children. This only
 * affects MAP_PRIVATE mappings. Faults generated by the child
 * are not guaranteed to succeed, even if read-only.
 */
0682 if (is_vm_hugetlb_page(tmp))
0683 reset_vma_resv_huge_pages(tmp);
0684
/*
 * Link in the new vma and copy the page table entries.
 */
0688 *pprev = tmp;
0689 pprev = &tmp->vm_next;
0690 tmp->vm_prev = prev;
0691 prev = tmp;
0692
0693 __vma_link_rb(mm, tmp, rb_link, rb_parent);
0694 rb_link = &tmp->vm_rb.rb_right;
0695 rb_parent = &tmp->vm_rb;
0696
0697 mm->map_count++;
0698 if (!(tmp->vm_flags & VM_WIPEONFORK))
0699 retval = copy_page_range(tmp, mpnt);
0700
0701 if (tmp->vm_ops && tmp->vm_ops->open)
0702 tmp->vm_ops->open(tmp);
0703
0704 if (retval)
0705 goto out;
0706 }
0707
0708 retval = arch_dup_mmap(oldmm, mm);
0709 out:
0710 mmap_write_unlock(mm);
0711 flush_tlb_mm(oldmm);
0712 mmap_write_unlock(oldmm);
0713 dup_userfaultfd_complete(&uf);
0714 fail_uprobe_end:
0715 uprobe_end_dup_mmap();
0716 return retval;
0717 fail_nomem_anon_vma_fork:
0718 mpol_put(vma_policy(tmp));
0719 fail_nomem_policy:
0720 vm_area_free(tmp);
0721 fail_nomem:
0722 retval = -ENOMEM;
0723 vm_unacct_memory(charge);
0724 goto out;
0725 }
0726
0727 static inline int mm_alloc_pgd(struct mm_struct *mm)
0728 {
0729 mm->pgd = pgd_alloc(mm);
0730 if (unlikely(!mm->pgd))
0731 return -ENOMEM;
0732 return 0;
0733 }
0734
0735 static inline void mm_free_pgd(struct mm_struct *mm)
0736 {
0737 pgd_free(mm, mm->pgd);
0738 }
0739 #else
0740 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
0741 {
0742 mmap_write_lock(oldmm);
0743 dup_mm_exe_file(mm, oldmm);
0744 mmap_write_unlock(oldmm);
0745 return 0;
0746 }
0747 #define mm_alloc_pgd(mm) (0)
0748 #define mm_free_pgd(mm)
0749 #endif
0750
0751 static void check_mm(struct mm_struct *mm)
0752 {
0753 int i;
0754
0755 BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
0756 "Please make sure 'struct resident_page_types[]' is updated as well");
0757
0758 for (i = 0; i < NR_MM_COUNTERS; i++) {
0759 long x = atomic_long_read(&mm->rss_stat.count[i]);
0760
0761 if (unlikely(x))
0762 pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
0763 mm, resident_page_types[i], x);
0764 }
0765
0766 if (mm_pgtables_bytes(mm))
0767 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
0768 mm_pgtables_bytes(mm));
0769
0770 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
0771 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
0772 #endif
0773 }
0774
0775 #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
0776 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
0777
0778
/*
 * Called when the last reference to the mm is dropped: either by a lazy
 * thread or by mmput(). Free the page directory and the mm.
 */
0783 void __mmdrop(struct mm_struct *mm)
0784 {
0785 BUG_ON(mm == &init_mm);
0786 WARN_ON_ONCE(mm == current->mm);
0787 WARN_ON_ONCE(mm == current->active_mm);
0788 mm_free_pgd(mm);
0789 destroy_context(mm);
0790 mmu_notifier_subscriptions_destroy(mm);
0791 check_mm(mm);
0792 put_user_ns(mm->user_ns);
0793 mm_pasid_drop(mm);
0794 free_mm(mm);
0795 }
0796 EXPORT_SYMBOL_GPL(__mmdrop);
0797
0798 static void mmdrop_async_fn(struct work_struct *work)
0799 {
0800 struct mm_struct *mm;
0801
0802 mm = container_of(work, struct mm_struct, async_put_work);
0803 __mmdrop(mm);
0804 }
0805
0806 static void mmdrop_async(struct mm_struct *mm)
0807 {
0808 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
0809 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
0810 schedule_work(&mm->async_put_work);
0811 }
0812 }
0813
0814 static inline void free_signal_struct(struct signal_struct *sig)
0815 {
0816 taskstats_tgid_free(sig);
0817 sched_autogroup_exit(sig);
0818
/*
 * __mmdrop is not safe to call from softirq context on x86 due to
 * pgd_dtor so postpone it to the async context
 */
0822 if (sig->oom_mm)
0823 mmdrop_async(sig->oom_mm);
0824 kmem_cache_free(signal_cachep, sig);
0825 }
0826
0827 static inline void put_signal_struct(struct signal_struct *sig)
0828 {
0829 if (refcount_dec_and_test(&sig->sigcnt))
0830 free_signal_struct(sig);
0831 }
0832
0833 void __put_task_struct(struct task_struct *tsk)
0834 {
0835 WARN_ON(!tsk->exit_state);
0836 WARN_ON(refcount_read(&tsk->usage));
0837 WARN_ON(tsk == current);
0838
0839 io_uring_free(tsk);
0840 cgroup_free(tsk);
0841 task_numa_free(tsk, true);
0842 security_task_free(tsk);
0843 bpf_task_storage_free(tsk);
0844 exit_creds(tsk);
0845 delayacct_tsk_free(tsk);
0846 put_signal_struct(tsk->signal);
0847 sched_core_free(tsk);
0848 free_task(tsk);
0849 }
0850 EXPORT_SYMBOL_GPL(__put_task_struct);
0851
0852 void __init __weak arch_task_cache_init(void) { }
0853
0854
0855
0856
0857 static void set_max_threads(unsigned int max_threads_suggested)
0858 {
0859 u64 threads;
0860 unsigned long nr_pages = totalram_pages();
0861
/*
 * The number of threads shall be limited such that the thread
 * structures may only consume a small part of the available memory.
 */
0866 if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
0867 threads = MAX_THREADS;
0868 else
0869 threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
0870 (u64) THREAD_SIZE * 8UL);
0871
0872 if (threads > max_threads_suggested)
0873 threads = max_threads_suggested;
0874
0875 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
0876 }
0877
0878 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
0879
0880 int arch_task_struct_size __read_mostly;
0881 #endif
0882
0883 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
0884 static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
0885 {
0886
0887 arch_thread_struct_whitelist(offset, size);
0888
0889
0890
0891
0892
0893 if (unlikely(*size == 0))
0894 *offset = 0;
0895 else
0896 *offset += offsetof(struct task_struct, thread);
0897 }
0898 #endif
0899
0900 void __init fork_init(void)
0901 {
0902 int i;
0903 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
0904 #ifndef ARCH_MIN_TASKALIGN
0905 #define ARCH_MIN_TASKALIGN 0
0906 #endif
0907 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
0908 unsigned long useroffset, usersize;
0909
0910
0911 task_struct_whitelist(&useroffset, &usersize);
0912 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
0913 arch_task_struct_size, align,
0914 SLAB_PANIC|SLAB_ACCOUNT,
0915 useroffset, usersize, NULL);
0916 #endif
0917
0918
0919 arch_task_cache_init();
0920
0921 set_max_threads(MAX_THREADS);
0922
0923 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
0924 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
0925 init_task.signal->rlim[RLIMIT_SIGPENDING] =
0926 init_task.signal->rlim[RLIMIT_NPROC];
0927
0928 for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
0929 init_user_ns.ucount_max[i] = max_threads/2;
0930
0931 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
0932 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
0933 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
0934 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
0935
0936 #ifdef CONFIG_VMAP_STACK
0937 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
0938 NULL, free_vm_stack_cache);
0939 #endif
0940
0941 scs_init();
0942
0943 lockdep_init_task(&init_task);
0944 uprobes_init();
0945 }
0946
0947 int __weak arch_dup_task_struct(struct task_struct *dst,
0948 struct task_struct *src)
0949 {
0950 *dst = *src;
0951 return 0;
0952 }
0953
0954 void set_task_stack_end_magic(struct task_struct *tsk)
0955 {
0956 unsigned long *stackend;
0957
0958 stackend = end_of_stack(tsk);
0959 *stackend = STACK_END_MAGIC;
0960 }
0961
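/*
 * Allocate a new task_struct (and its kernel stack) and initialize it as a
 * copy of @orig; returns NULL on allocation failure.
 */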
0962 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
0963 {
0964 struct task_struct *tsk;
0965 int err;
0966
0967 if (node == NUMA_NO_NODE)
0968 node = tsk_fork_get_node(orig);
0969 tsk = alloc_task_struct_node(node);
0970 if (!tsk)
0971 return NULL;
0972
0973 err = arch_dup_task_struct(tsk, orig);
0974 if (err)
0975 goto free_tsk;
0976
0977 err = alloc_thread_stack_node(tsk, node);
0978 if (err)
0979 goto free_tsk;
0980
0981 #ifdef CONFIG_THREAD_INFO_IN_TASK
0982 refcount_set(&tsk->stack_refcount, 1);
0983 #endif
0984 account_kernel_stack(tsk, 1);
0985
0986 err = scs_prepare(tsk, node);
0987 if (err)
0988 goto free_stack;
0989
0990 #ifdef CONFIG_SECCOMP
/*
 * We must handle setting up seccomp filters once we're under
 * the sighand lock in case orig has changed between now and
 * then. Until then, filter must be NULL to avoid messing up
 * the usage counts on the error path calling free_task.
 */
0997 tsk->seccomp.filter = NULL;
0998 #endif
0999
1000 setup_thread_stack(tsk, orig);
1001 clear_user_return_notifier(tsk);
1002 clear_tsk_need_resched(tsk);
1003 set_task_stack_end_magic(tsk);
1004 clear_syscall_work_syscall_user_dispatch(tsk);
1005
1006 #ifdef CONFIG_STACKPROTECTOR
1007 tsk->stack_canary = get_random_canary();
1008 #endif
1009 if (orig->cpus_ptr == &orig->cpus_mask)
1010 tsk->cpus_ptr = &tsk->cpus_mask;
1011 dup_user_cpus_ptr(tsk, orig, node);
1012
/*
 * One for the user space visible state that goes away when reaped.
 * One for the scheduler.
 */
1017 refcount_set(&tsk->rcu_users, 2);
1018
1019 refcount_set(&tsk->usage, 1);
1020 #ifdef CONFIG_BLK_DEV_IO_TRACE
1021 tsk->btrace_seq = 0;
1022 #endif
1023 tsk->splice_pipe = NULL;
1024 tsk->task_frag.page = NULL;
1025 tsk->wake_q.next = NULL;
1026 tsk->worker_private = NULL;
1027
1028 kcov_task_init(tsk);
1029 kmap_local_fork(tsk);
1030
1031 #ifdef CONFIG_FAULT_INJECTION
1032 tsk->fail_nth = 0;
1033 #endif
1034
1035 #ifdef CONFIG_BLK_CGROUP
1036 tsk->throttle_queue = NULL;
1037 tsk->use_memdelay = 0;
1038 #endif
1039
1040 #ifdef CONFIG_IOMMU_SVA
1041 tsk->pasid_activated = 0;
1042 #endif
1043
1044 #ifdef CONFIG_MEMCG
1045 tsk->active_memcg = NULL;
1046 #endif
1047
1048 #ifdef CONFIG_CPU_SUP_INTEL
1049 tsk->reported_split_lock = 0;
1050 #endif
1051
1052 return tsk;
1053
1054 free_stack:
1055 exit_task_stack_account(tsk);
1056 free_thread_stack(tsk);
1057 free_tsk:
1058 free_task_struct(tsk);
1059 return NULL;
1060 }
1061
1062 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
1063
1064 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
1065
1066 static int __init coredump_filter_setup(char *s)
1067 {
1068 default_dump_filter =
1069 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
1070 MMF_DUMP_FILTER_MASK;
1071 return 1;
1072 }
1073
1074 __setup("coredump_filter=", coredump_filter_setup);
1075
1076 #include <linux/init_task.h>
1077
1078 static void mm_init_aio(struct mm_struct *mm)
1079 {
1080 #ifdef CONFIG_AIO
1081 spin_lock_init(&mm->ioctx_lock);
1082 mm->ioctx_table = NULL;
1083 #endif
1084 }
1085
1086 static __always_inline void mm_clear_owner(struct mm_struct *mm,
1087 struct task_struct *p)
1088 {
1089 #ifdef CONFIG_MEMCG
1090 if (mm->owner == p)
1091 WRITE_ONCE(mm->owner, NULL);
1092 #endif
1093 }
1094
1095 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1096 {
1097 #ifdef CONFIG_MEMCG
1098 mm->owner = p;
1099 #endif
1100 }
1101
1102 static void mm_init_uprobes_state(struct mm_struct *mm)
1103 {
1104 #ifdef CONFIG_UPROBES
1105 mm->uprobes_state.xol_area = NULL;
1106 #endif
1107 }
1108
1109 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1110 struct user_namespace *user_ns)
1111 {
1112 mm->mmap = NULL;
1113 mm->mm_rb = RB_ROOT;
1114 mm->vmacache_seqnum = 0;
1115 atomic_set(&mm->mm_users, 1);
1116 atomic_set(&mm->mm_count, 1);
1117 seqcount_init(&mm->write_protect_seq);
1118 mmap_init_lock(mm);
1119 INIT_LIST_HEAD(&mm->mmlist);
1120 mm_pgtables_bytes_init(mm);
1121 mm->map_count = 0;
1122 mm->locked_vm = 0;
1123 atomic64_set(&mm->pinned_vm, 0);
1124 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1125 spin_lock_init(&mm->page_table_lock);
1126 spin_lock_init(&mm->arg_lock);
1127 mm_init_cpumask(mm);
1128 mm_init_aio(mm);
1129 mm_init_owner(mm, p);
1130 mm_pasid_init(mm);
1131 RCU_INIT_POINTER(mm->exe_file, NULL);
1132 mmu_notifier_subscriptions_init(mm);
1133 init_tlb_flush_pending(mm);
1134 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
1135 mm->pmd_huge_pte = NULL;
1136 #endif
1137 mm_init_uprobes_state(mm);
1138 hugetlb_count_init(mm);
1139
1140 if (current->mm) {
1141 mm->flags = current->mm->flags & MMF_INIT_MASK;
1142 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1143 } else {
1144 mm->flags = default_dump_filter;
1145 mm->def_flags = 0;
1146 }
1147
1148 if (mm_alloc_pgd(mm))
1149 goto fail_nopgd;
1150
1151 if (init_new_context(p, mm))
1152 goto fail_nocontext;
1153
1154 mm->user_ns = get_user_ns(user_ns);
1155 return mm;
1156
1157 fail_nocontext:
1158 mm_free_pgd(mm);
1159 fail_nopgd:
1160 free_mm(mm);
1161 return NULL;
1162 }
1163
/*
 * Allocate and initialize an mm_struct.
 */
1167 struct mm_struct *mm_alloc(void)
1168 {
1169 struct mm_struct *mm;
1170
1171 mm = allocate_mm();
1172 if (!mm)
1173 return NULL;
1174
1175 memset(mm, 0, sizeof(*mm));
1176 return mm_init(mm, current, current_user_ns());
1177 }
1178
1179 static inline void __mmput(struct mm_struct *mm)
1180 {
1181 VM_BUG_ON(atomic_read(&mm->mm_users));
1182
1183 uprobe_clear_state(mm);
1184 exit_aio(mm);
1185 ksm_exit(mm);
1186 khugepaged_exit(mm);
1187 exit_mmap(mm);
1188 mm_put_huge_zero_page(mm);
1189 set_mm_exe_file(mm, NULL);
1190 if (!list_empty(&mm->mmlist)) {
1191 spin_lock(&mmlist_lock);
1192 list_del(&mm->mmlist);
1193 spin_unlock(&mmlist_lock);
1194 }
1195 if (mm->binfmt)
1196 module_put(mm->binfmt->module);
1197 mmdrop(mm);
1198 }
1199
/*
 * Decrement the use count and release all resources for an mm.
 */
1203 void mmput(struct mm_struct *mm)
1204 {
1205 might_sleep();
1206
1207 if (atomic_dec_and_test(&mm->mm_users))
1208 __mmput(mm);
1209 }
1210 EXPORT_SYMBOL_GPL(mmput);
1211
1212 #ifdef CONFIG_MMU
1213 static void mmput_async_fn(struct work_struct *work)
1214 {
1215 struct mm_struct *mm = container_of(work, struct mm_struct,
1216 async_put_work);
1217
1218 __mmput(mm);
1219 }
1220
1221 void mmput_async(struct mm_struct *mm)
1222 {
1223 if (atomic_dec_and_test(&mm->mm_users)) {
1224 INIT_WORK(&mm->async_put_work, mmput_async_fn);
1225 schedule_work(&mm->async_put_work);
1226 }
1227 }
1228 EXPORT_SYMBOL_GPL(mmput_async);
1229 #endif
1230
/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive is left, in execve it happens
 * before the new mm is made visible to anyone.
 *
 * Can only fail if new_exe_file != NULL.
 */
1242 int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1243 {
1244 struct file *old_exe_file;
1245
1246
/*
 * It is safe to dereference the exe_file without RCU as
 * this function is only called if nobody else can access
 * this mm -- see comment above for justification.
 */
1251 old_exe_file = rcu_dereference_raw(mm->exe_file);
1252
1253 if (new_exe_file) {
1254
1255
1256
1257
1258 if (unlikely(deny_write_access(new_exe_file)))
1259 return -EACCES;
1260 get_file(new_exe_file);
1261 }
1262 rcu_assign_pointer(mm->exe_file, new_exe_file);
1263 if (old_exe_file) {
1264 allow_write_access(old_exe_file);
1265 fput(old_exe_file);
1266 }
1267 return 0;
1268 }
1269
1270
/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
 * dealing with concurrent invocation and without grabbing the mmap lock
 * in write mode.
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 */
1279 int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1280 {
1281 struct vm_area_struct *vma;
1282 struct file *old_exe_file;
1283 int ret = 0;
1284
1285
1286 old_exe_file = get_mm_exe_file(mm);
1287 if (old_exe_file) {
1288 mmap_read_lock(mm);
1289 for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
1290 if (!vma->vm_file)
1291 continue;
1292 if (path_equal(&vma->vm_file->f_path,
1293 &old_exe_file->f_path))
1294 ret = -EBUSY;
1295 }
1296 mmap_read_unlock(mm);
1297 fput(old_exe_file);
1298 if (ret)
1299 return ret;
1300 }
1301
1302
1303 ret = deny_write_access(new_exe_file);
1304 if (ret)
1305 return -EACCES;
1306 get_file(new_exe_file);
1307
1308 old_exe_file = xchg(&mm->exe_file, new_exe_file);
1309 if (old_exe_file) {
1310
1311
1312
1313
1314 mmap_read_lock(mm);
1315 allow_write_access(old_exe_file);
1316 fput(old_exe_file);
1317 mmap_read_unlock(mm);
1318 }
1319 return 0;
1320 }
1321
1322
/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
1328 struct file *get_mm_exe_file(struct mm_struct *mm)
1329 {
1330 struct file *exe_file;
1331
1332 rcu_read_lock();
1333 exe_file = rcu_dereference(mm->exe_file);
1334 if (exe_file && !get_file_rcu(exe_file))
1335 exe_file = NULL;
1336 rcu_read_unlock();
1337 return exe_file;
1338 }
1339
1340
/**
 * get_task_exe_file - acquire a reference to the task's executable file
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
1347 struct file *get_task_exe_file(struct task_struct *task)
1348 {
1349 struct file *exe_file = NULL;
1350 struct mm_struct *mm;
1351
1352 task_lock(task);
1353 mm = task->mm;
1354 if (mm) {
1355 if (!(task->flags & PF_KTHREAD))
1356 exe_file = get_mm_exe_file(mm);
1357 }
1358 task_unlock(task);
1359 return exe_file;
1360 }
1361
1362
/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm. Checks that PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm) is not set,
 * and if so returns a reference to the mm after bumping its use count.
 * User must release the mm via mmput() after use. Typically used by
 * /proc and ptrace.
 */
1371 struct mm_struct *get_task_mm(struct task_struct *task)
1372 {
1373 struct mm_struct *mm;
1374
1375 task_lock(task);
1376 mm = task->mm;
1377 if (mm) {
1378 if (task->flags & PF_KTHREAD)
1379 mm = NULL;
1380 else
1381 mmget(mm);
1382 }
1383 task_unlock(task);
1384 return mm;
1385 }
1386 EXPORT_SYMBOL_GPL(get_task_mm);
1387
1388 struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1389 {
1390 struct mm_struct *mm;
1391 int err;
1392
1393 err = down_read_killable(&task->signal->exec_update_lock);
1394 if (err)
1395 return ERR_PTR(err);
1396
1397 mm = get_task_mm(task);
1398 if (mm && mm != current->mm &&
1399 !ptrace_may_access(task, mode)) {
1400 mmput(mm);
1401 mm = ERR_PTR(-EACCES);
1402 }
1403 up_read(&task->signal->exec_update_lock);
1404
1405 return mm;
1406 }
1407
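/* Wake up a parent sleeping in vfork()/CLONE_VFORK, if there is one. */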
1408 static void complete_vfork_done(struct task_struct *tsk)
1409 {
1410 struct completion *vfork;
1411
1412 task_lock(tsk);
1413 vfork = tsk->vfork_done;
1414 if (likely(vfork)) {
1415 tsk->vfork_done = NULL;
1416 complete(vfork);
1417 }
1418 task_unlock(tsk);
1419 }
1420
1421 static int wait_for_vfork_done(struct task_struct *child,
1422 struct completion *vfork)
1423 {
1424 int killed;
1425
1426 freezer_do_not_count();
1427 cgroup_enter_frozen();
1428 killed = wait_for_completion_killable(vfork);
1429 cgroup_leave_frozen(false);
1430 freezer_count();
1431
1432 if (killed) {
1433 task_lock(child);
1434 child->vfork_done = NULL;
1435 task_unlock(child);
1436 }
1437
1438 put_task_struct(child);
1439 return killed;
1440 }
1441
/*
 * Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one: we mmput the new mm_struct before restoring the old one.
 */
1455 static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1456 {
1457 uprobe_free_utask(tsk);
1458
1459
1460 deactivate_mm(tsk, mm);
1461
/*
 * Signal userspace if we're not exiting with a core dump
 * because we want to leave the value intact for debugging
 * purposes.
 */
1467 if (tsk->clear_child_tid) {
1468 if (atomic_read(&mm->mm_users) > 1) {
1469
1470
1471
1472
1473 put_user(0, tsk->clear_child_tid);
1474 do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1475 1, NULL, NULL, 0, 0);
1476 }
1477 tsk->clear_child_tid = NULL;
1478 }
1479
/*
 * All done, finally we can wake up the parent and return this mm to it.
 * Also kthread_stop() uses this completion for synchronization.
 */
1484 if (tsk->vfork_done)
1485 complete_vfork_done(tsk);
1486 }
1487
1488 void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1489 {
1490 futex_exit_release(tsk);
1491 mm_release(tsk, mm);
1492 }
1493
1494 void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1495 {
1496 futex_exec_release(tsk);
1497 mm_release(tsk, mm);
1498 }
1499
/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */
1510 static struct mm_struct *dup_mm(struct task_struct *tsk,
1511 struct mm_struct *oldmm)
1512 {
1513 struct mm_struct *mm;
1514 int err;
1515
1516 mm = allocate_mm();
1517 if (!mm)
1518 goto fail_nomem;
1519
1520 memcpy(mm, oldmm, sizeof(*mm));
1521
1522 if (!mm_init(mm, tsk, mm->user_ns))
1523 goto fail_nomem;
1524
1525 err = dup_mmap(mm, oldmm);
1526 if (err)
1527 goto free_pt;
1528
1529 mm->hiwater_rss = get_mm_rss(mm);
1530 mm->hiwater_vm = mm->total_vm;
1531
1532 if (mm->binfmt && !try_module_get(mm->binfmt->module))
1533 goto free_pt;
1534
1535 return mm;
1536
1537 free_pt:
1538
1539 mm->binfmt = NULL;
1540 mm_init_owner(mm, NULL);
1541 mmput(mm);
1542
1543 fail_nomem:
1544 return NULL;
1545 }
1546
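/*
 * Set up the child's mm: if the forking task has no mm (a kernel thread),
 * the child gets none either; CLONE_VM shares the parent's mm; otherwise
 * the mm is duplicated with dup_mm().
 */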
1547 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1548 {
1549 struct mm_struct *mm, *oldmm;
1550
1551 tsk->min_flt = tsk->maj_flt = 0;
1552 tsk->nvcsw = tsk->nivcsw = 0;
1553 #ifdef CONFIG_DETECT_HUNG_TASK
1554 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1555 tsk->last_switch_time = 0;
1556 #endif
1557
1558 tsk->mm = NULL;
1559 tsk->active_mm = NULL;
1560
1561
1562
1563
1564
1565
1566 oldmm = current->mm;
1567 if (!oldmm)
1568 return 0;
1569
1570
1571 vmacache_flush(tsk);
1572
1573 if (clone_flags & CLONE_VM) {
1574 mmget(oldmm);
1575 mm = oldmm;
1576 } else {
1577 mm = dup_mm(tsk, current->mm);
1578 if (!mm)
1579 return -ENOMEM;
1580 }
1581
1582 tsk->mm = mm;
1583 tsk->active_mm = mm;
1584 return 0;
1585 }
1586
1587 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1588 {
1589 struct fs_struct *fs = current->fs;
1590 if (clone_flags & CLONE_FS) {
1591
1592 spin_lock(&fs->lock);
1593 if (fs->in_exec) {
1594 spin_unlock(&fs->lock);
1595 return -EAGAIN;
1596 }
1597 fs->users++;
1598 spin_unlock(&fs->lock);
1599 return 0;
1600 }
1601 tsk->fs = copy_fs_struct(fs);
1602 if (!tsk->fs)
1603 return -ENOMEM;
1604 return 0;
1605 }
1606
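/*
 * Share the parent's file descriptor table for CLONE_FILES, otherwise
 * duplicate it with dup_fd().
 */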
1607 static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
1608 {
1609 struct files_struct *oldf, *newf;
1610 int error = 0;
1611
1612
1613
1614
1615 oldf = current->files;
1616 if (!oldf)
1617 goto out;
1618
1619 if (clone_flags & CLONE_FILES) {
1620 atomic_inc(&oldf->count);
1621 goto out;
1622 }
1623
1624 newf = dup_fd(oldf, NR_OPEN_MAX, &error);
1625 if (!newf)
1626 goto out;
1627
1628 tsk->files = newf;
1629 error = 0;
1630 out:
1631 return error;
1632 }
1633
1634 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1635 {
1636 struct sighand_struct *sig;
1637
1638 if (clone_flags & CLONE_SIGHAND) {
refcount_inc(&current->sighand->count);
1640 return 0;
1641 }
1642 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1643 RCU_INIT_POINTER(tsk->sighand, sig);
1644 if (!sig)
1645 return -ENOMEM;
1646
1647 refcount_set(&sig->count, 1);
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
1651
1652
1653 if (clone_flags & CLONE_CLEAR_SIGHAND)
1654 flush_signal_handlers(tsk, 0);
1655
1656 return 0;
1657 }
1658
1659 void __cleanup_sighand(struct sighand_struct *sighand)
1660 {
1661 if (refcount_dec_and_test(&sighand->count)) {
1662 signalfd_cleanup(sighand);
1663
1664
1665
1666
1667 kmem_cache_free(sighand_cachep, sighand);
1668 }
1669 }
1670
1671
1672
1673
1674 static void posix_cpu_timers_init_group(struct signal_struct *sig)
1675 {
1676 struct posix_cputimers *pct = &sig->posix_cputimers;
1677 unsigned long cpu_limit;
1678
1679 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1680 posix_cputimers_group_init(pct, cpu_limit);
1681 }
1682
1683 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1684 {
1685 struct signal_struct *sig;
1686
1687 if (clone_flags & CLONE_THREAD)
1688 return 0;
1689
1690 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1691 tsk->signal = sig;
1692 if (!sig)
1693 return -ENOMEM;
1694
1695 sig->nr_threads = 1;
1696 atomic_set(&sig->live, 1);
1697 refcount_set(&sig->sigcnt, 1);
1698
1699
1700 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1701 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1702
1703 init_waitqueue_head(&sig->wait_chldexit);
1704 sig->curr_target = tsk;
1705 init_sigpending(&sig->shared_pending);
1706 INIT_HLIST_HEAD(&sig->multiprocess);
1707 seqlock_init(&sig->stats_lock);
1708 prev_cputime_init(&sig->prev_cputime);
1709
1710 #ifdef CONFIG_POSIX_TIMERS
1711 INIT_LIST_HEAD(&sig->posix_timers);
1712 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1713 sig->real_timer.function = it_real_fn;
1714 #endif
1715
1716 task_lock(current->group_leader);
1717 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1718 task_unlock(current->group_leader);
1719
1720 posix_cpu_timers_init_group(sig);
1721
1722 tty_audit_fork(sig);
1723 sched_autogroup_fork(sig);
1724
1725 sig->oom_score_adj = current->signal->oom_score_adj;
1726 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1727
1728 mutex_init(&sig->cred_guard_mutex);
1729 init_rwsem(&sig->exec_update_lock);
1730
1731 return 0;
1732 }
1733
1734 static void copy_seccomp(struct task_struct *p)
1735 {
1736 #ifdef CONFIG_SECCOMP
/*
 * Must be called with sighand->lock held, which is common to
 * all threads in the group. Holding cred_guard_mutex is not
 * needed because this new task is not yet running and cannot
 * be racing exec.
 */
assert_spin_locked(&current->sighand->siglock);
1744
1745
1746 get_seccomp_filter(current);
1747 p->seccomp = current->seccomp;
1748
1749
1750
1751
1752
1753
1754 if (task_no_new_privs(current))
1755 task_set_no_new_privs(p);
1756
1757
1758
1759
1760
1761
1762 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1763 set_task_syscall_work(p, SECCOMP);
1764 #endif
1765 }
1766
1767 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1768 {
1769 current->clear_child_tid = tidptr;
1770
1771 return task_pid_vnr(current);
1772 }
1773
1774 static void rt_mutex_init_task(struct task_struct *p)
1775 {
1776 raw_spin_lock_init(&p->pi_lock);
1777 #ifdef CONFIG_RT_MUTEXES
1778 p->pi_waiters = RB_ROOT_CACHED;
1779 p->pi_top_task = NULL;
1780 p->pi_blocked_on = NULL;
1781 #endif
1782 }
1783
1784 static inline void init_task_pid_links(struct task_struct *task)
1785 {
1786 enum pid_type type;
1787
1788 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1789 INIT_HLIST_NODE(&task->pid_links[type]);
1790 }
1791
1792 static inline void
1793 init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1794 {
1795 if (type == PIDTYPE_PID)
1796 task->thread_pid = pid;
1797 else
1798 task->signal->pids[type] = pid;
1799 }
1800
1801 static inline void rcu_copy_process(struct task_struct *p)
1802 {
1803 #ifdef CONFIG_PREEMPT_RCU
1804 p->rcu_read_lock_nesting = 0;
1805 p->rcu_read_unlock_special.s = 0;
1806 p->rcu_blocked_node = NULL;
1807 INIT_LIST_HEAD(&p->rcu_node_entry);
1808 #endif
1809 #ifdef CONFIG_TASKS_RCU
1810 p->rcu_tasks_holdout = false;
1811 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1812 p->rcu_tasks_idle_cpu = -1;
1813 #endif
1814 #ifdef CONFIG_TASKS_TRACE_RCU
1815 p->trc_reader_nesting = 0;
1816 p->trc_reader_special.s = 0;
1817 INIT_LIST_HEAD(&p->trc_holdout_list);
1818 INIT_LIST_HEAD(&p->trc_blkd_node);
1819 #endif
1820 }
1821
1822 struct pid *pidfd_pid(const struct file *file)
1823 {
1824 if (file->f_op == &pidfd_fops)
1825 return file->private_data;
1826
1827 return ERR_PTR(-EBADF);
1828 }
1829
1830 static int pidfd_release(struct inode *inode, struct file *file)
1831 {
1832 struct pid *pid = file->private_data;
1833
1834 file->private_data = NULL;
1835 put_pid(pid);
1836 return 0;
1837 }
1838
1839 #ifdef CONFIG_PROC_FS
/*
 * pidfd_show_fdinfo - print information about a pidfd to an fdinfo file
 *
 * Pid:
 * The pid that the pidfd refers to, as seen from the pid namespace of the
 * procfs instance: -1 if the process has already been reaped, 0 if the pid
 * is not reachable from that pid namespace.
 *
 * NSpid:
 * When CONFIG_PID_NS is enabled, the pid of the process in every pid
 * namespace from the procfs instance's namespace down to the namespace the
 * process itself lives in, subject to the same reachability rules as Pid.
 */
1875 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1876 {
1877 struct pid *pid = f->private_data;
1878 struct pid_namespace *ns;
1879 pid_t nr = -1;
1880
1881 if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1882 ns = proc_pid_ns(file_inode(m->file)->i_sb);
1883 nr = pid_nr_ns(pid, ns);
1884 }
1885
1886 seq_put_decimal_ll(m, "Pid:\t", nr);
1887
1888 #ifdef CONFIG_PID_NS
1889 seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1890 if (nr > 0) {
1891 int i;
1892
1893
1894
1895
1896
1897
1898 for (i = ns->level + 1; i <= pid->level; i++)
1899 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1900 }
1901 #endif
1902 seq_putc(m, '\n');
1903 }
1904 #endif
1905
/*
 * Poll support for process exit notification.
 */
1909 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1910 {
1911 struct pid *pid = file->private_data;
1912 __poll_t poll_flags = 0;
1913
1914 poll_wait(file, &pid->wait_pidfd, pts);
1915
/*
 * Inform pollers only when the whole thread group exits.
 * If the thread group leader exits before all other threads in the
 * group, then poll(2) should block, similar to the wait(2) family.
 */
1921 if (thread_group_exited(pid))
1922 poll_flags = EPOLLIN | EPOLLRDNORM;
1923
1924 return poll_flags;
1925 }
1926
1927 const struct file_operations pidfd_fops = {
1928 .release = pidfd_release,
1929 .poll = pidfd_poll,
1930 #ifdef CONFIG_PROC_FS
1931 .show_fdinfo = pidfd_show_fdinfo,
1932 #endif
1933 };
1934
1935 static void __delayed_free_task(struct rcu_head *rhp)
1936 {
1937 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1938
1939 free_task(tsk);
1940 }
1941
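/*
 * Free a task_struct from the copy_process() error path; with CONFIG_MEMCG
 * the free is deferred by an RCU grace period because the task may still be
 * reachable under RCU (e.g. through mm->owner).
 */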
1942 static __always_inline void delayed_free_task(struct task_struct *tsk)
1943 {
1944 if (IS_ENABLED(CONFIG_MEMCG))
1945 call_rcu(&tsk->rcu, __delayed_free_task);
1946 else
1947 free_task(tsk);
1948 }
1949
1950 static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
1951 {
1952
1953 if (!tsk->mm)
1954 return;
1955
1956
1957 if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
1958 return;
1959
1960
1961 mutex_lock(&oom_adj_mutex);
1962 set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
1963
1964 tsk->signal->oom_score_adj = current->signal->oom_score_adj;
1965 tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
1966 mutex_unlock(&oom_adj_mutex);
1967 }
1968
1969 #ifdef CONFIG_RV
1970 static void rv_task_fork(struct task_struct *p)
1971 {
1972 int i;
1973
1974 for (i = 0; i < RV_PER_TASK_MONITORS; i++)
1975 p->rv[i].da_mon.monitoring = false;
1976 }
1977 #else
1978 #define rv_task_fork(p) do {} while (0)
1979 #endif
1980
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
1989 static __latent_entropy struct task_struct *copy_process(
1990 struct pid *pid,
1991 int trace,
1992 int node,
1993 struct kernel_clone_args *args)
1994 {
1995 int pidfd = -1, retval;
1996 struct task_struct *p;
1997 struct multiprocess_signals delayed;
1998 struct file *pidfile = NULL;
1999 const u64 clone_flags = args->flags;
2000 struct nsproxy *nsp = current->nsproxy;
2001
/*
 * Don't allow sharing the root directory with processes in a different
 * namespace
 */
2006 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
2007 return ERR_PTR(-EINVAL);
2008
2009 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
2010 return ERR_PTR(-EINVAL);
2011
/*
 * Thread groups must share signals as well, and detached threads
 * can only be started up within the thread group.
 */
2016 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
2017 return ERR_PTR(-EINVAL);
2018
2019
2020
2021
2022
2023
2024 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
2025 return ERR_PTR(-EINVAL);
2026
2027
2028
2029
2030
2031
2032
2033 if ((clone_flags & CLONE_PARENT) &&
2034 current->signal->flags & SIGNAL_UNKILLABLE)
2035 return ERR_PTR(-EINVAL);
2036
2037
2038
2039
2040
2041 if (clone_flags & CLONE_THREAD) {
2042 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
2043 (task_active_pid_ns(current) != nsp->pid_ns_for_children))
2044 return ERR_PTR(-EINVAL);
2045 }
2046
2047
2048
2049
2050
2051 if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
2052 if (nsp->time_ns != nsp->time_ns_for_children)
2053 return ERR_PTR(-EINVAL);
2054 }
2055
2056 if (clone_flags & CLONE_PIDFD) {
2057
2058
2059
2060
2061
2062 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
2063 return ERR_PTR(-EINVAL);
2064 }
2065
/*
 * Force any signals received before this point to be delivered
 * before the fork happens.  Collect up signals sent to multiple
 * processes that happen during the fork and delay them so that
 * they appear to happen after the fork.
 */
2072 sigemptyset(&delayed.signal);
2073 INIT_HLIST_NODE(&delayed.node);
2074
spin_lock_irq(&current->sighand->siglock);
if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, &current->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
2080 retval = -ERESTARTNOINTR;
2081 if (task_sigpending(current))
2082 goto fork_out;
2083
2084 retval = -ENOMEM;
2085 p = dup_task_struct(current, node);
2086 if (!p)
2087 goto fork_out;
2088 p->flags &= ~PF_KTHREAD;
2089 if (args->kthread)
2090 p->flags |= PF_KTHREAD;
2091 if (args->io_thread) {
2092
2093
2094
2095
2096 p->flags |= PF_IO_WORKER;
2097 siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2098 }
2099
2100 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
2101
2102
2103
2104 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2105
2106 ftrace_graph_init_task(p);
2107
2108 rt_mutex_init_task(p);
2109
2110 lockdep_assert_irqs_enabled();
2111 #ifdef CONFIG_PROVE_LOCKING
2112 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2113 #endif
2114 retval = copy_creds(p, clone_flags);
2115 if (retval < 0)
2116 goto bad_fork_free;
2117
2118 retval = -EAGAIN;
2119 if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
2120 if (p->real_cred->user != INIT_USER &&
2121 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2122 goto bad_fork_cleanup_count;
2123 }
2124 current->flags &= ~PF_NPROC_EXCEEDED;
2125
2126
2127
2128
2129
2130
2131 retval = -EAGAIN;
2132 if (data_race(nr_threads >= max_threads))
2133 goto bad_fork_cleanup_count;
2134
2135 delayacct_tsk_init(p);
2136 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2137 p->flags |= PF_FORKNOEXEC;
2138 INIT_LIST_HEAD(&p->children);
2139 INIT_LIST_HEAD(&p->sibling);
2140 rcu_copy_process(p);
2141 p->vfork_done = NULL;
2142 spin_lock_init(&p->alloc_lock);
2143
2144 init_sigpending(&p->pending);
2145
2146 p->utime = p->stime = p->gtime = 0;
2147 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2148 p->utimescaled = p->stimescaled = 0;
2149 #endif
2150 prev_cputime_init(&p->prev_cputime);
2151
2152 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2153 seqcount_init(&p->vtime.seqcount);
2154 p->vtime.starttime = 0;
2155 p->vtime.state = VTIME_INACTIVE;
2156 #endif
2157
2158 #ifdef CONFIG_IO_URING
2159 p->io_uring = NULL;
2160 #endif
2161
2162 #if defined(SPLIT_RSS_COUNTING)
2163 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
2164 #endif
2165
2166 p->default_timer_slack_ns = current->timer_slack_ns;
2167
2168 #ifdef CONFIG_PSI
2169 p->psi_flags = 0;
2170 #endif
2171
2172 task_io_accounting_init(&p->ioac);
2173 acct_clear_integrals(p);
2174
2175 posix_cputimers_init(&p->posix_cputimers);
2176
2177 p->io_context = NULL;
2178 audit_set_context(p, NULL);
2179 cgroup_fork(p);
2180 if (args->kthread) {
2181 if (!set_kthread_struct(p))
2182 goto bad_fork_cleanup_delayacct;
2183 }
2184 #ifdef CONFIG_NUMA
2185 p->mempolicy = mpol_dup(p->mempolicy);
2186 if (IS_ERR(p->mempolicy)) {
2187 retval = PTR_ERR(p->mempolicy);
2188 p->mempolicy = NULL;
2189 goto bad_fork_cleanup_delayacct;
2190 }
2191 #endif
2192 #ifdef CONFIG_CPUSETS
2193 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2194 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
2195 seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2196 #endif
2197 #ifdef CONFIG_TRACE_IRQFLAGS
2198 memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2199 p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2200 p->irqtrace.softirq_enable_ip = _THIS_IP_;
2201 p->softirqs_enabled = 1;
2202 p->softirq_context = 0;
2203 #endif
2204
2205 p->pagefault_disabled = 0;
2206
2207 #ifdef CONFIG_LOCKDEP
2208 lockdep_init_task(p);
2209 #endif
2210
2211 #ifdef CONFIG_DEBUG_MUTEXES
2212 p->blocked_on = NULL;
2213 #endif
2214 #ifdef CONFIG_BCACHE
2215 p->sequential_io = 0;
2216 p->sequential_io_avg = 0;
2217 #endif
2218 #ifdef CONFIG_BPF_SYSCALL
2219 RCU_INIT_POINTER(p->bpf_storage, NULL);
2220 p->bpf_ctx = NULL;
2221 #endif
2222
2223
2224 retval = sched_fork(clone_flags, p);
2225 if (retval)
2226 goto bad_fork_cleanup_policy;
2227
2228 retval = perf_event_init_task(p, clone_flags);
2229 if (retval)
2230 goto bad_fork_cleanup_policy;
2231 retval = audit_alloc(p);
2232 if (retval)
2233 goto bad_fork_cleanup_perf;
2234
2235 shm_init_task(p);
2236 retval = security_task_alloc(p, clone_flags);
2237 if (retval)
2238 goto bad_fork_cleanup_audit;
2239 retval = copy_semundo(clone_flags, p);
2240 if (retval)
2241 goto bad_fork_cleanup_security;
2242 retval = copy_files(clone_flags, p);
2243 if (retval)
2244 goto bad_fork_cleanup_semundo;
2245 retval = copy_fs(clone_flags, p);
2246 if (retval)
2247 goto bad_fork_cleanup_files;
2248 retval = copy_sighand(clone_flags, p);
2249 if (retval)
2250 goto bad_fork_cleanup_fs;
2251 retval = copy_signal(clone_flags, p);
2252 if (retval)
2253 goto bad_fork_cleanup_sighand;
2254 retval = copy_mm(clone_flags, p);
2255 if (retval)
2256 goto bad_fork_cleanup_signal;
2257 retval = copy_namespaces(clone_flags, p);
2258 if (retval)
2259 goto bad_fork_cleanup_mm;
2260 retval = copy_io(clone_flags, p);
2261 if (retval)
2262 goto bad_fork_cleanup_namespaces;
2263 retval = copy_thread(p, args);
2264 if (retval)
2265 goto bad_fork_cleanup_io;
2266
2267 stackleak_task_init(p);
2268
2269 if (pid != &init_struct_pid) {
2270 pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2271 args->set_tid_size);
2272 if (IS_ERR(pid)) {
2273 retval = PTR_ERR(pid);
2274 goto bad_fork_cleanup_thread;
2275 }
2276 }
2277
2278
2279
2280
2281
2282
2283 if (clone_flags & CLONE_PIDFD) {
2284 retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2285 if (retval < 0)
2286 goto bad_fork_free_pid;
2287
2288 pidfd = retval;
2289
2290 pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2291 O_RDWR | O_CLOEXEC);
2292 if (IS_ERR(pidfile)) {
2293 put_unused_fd(pidfd);
2294 retval = PTR_ERR(pidfile);
2295 goto bad_fork_free_pid;
2296 }
2297 get_pid(pid);
2298
2299 retval = put_user(pidfd, args->pidfd);
2300 if (retval)
2301 goto bad_fork_put_pidfd;
2302 }
2303
2304 #ifdef CONFIG_BLOCK
2305 p->plug = NULL;
2306 #endif
2307 futex_init_task(p);
2308
2309
2310
2311
2312 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2313 sas_ss_reset(p);
2314
2315
2316
2317
2318
2319 user_disable_single_step(p);
2320 clear_task_syscall_work(p, SYSCALL_TRACE);
2321 #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
2322 clear_task_syscall_work(p, SYSCALL_EMU);
2323 #endif
2324 clear_tsk_latency_tracing(p);
2325
2326
2327 p->pid = pid_nr(pid);
2328 if (clone_flags & CLONE_THREAD) {
2329 p->group_leader = current->group_leader;
2330 p->tgid = current->tgid;
2331 } else {
2332 p->group_leader = p;
2333 p->tgid = p->pid;
2334 }
2335
2336 p->nr_dirtied = 0;
2337 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2338 p->dirty_paused_when = 0;
2339
2340 p->pdeath_signal = 0;
2341 INIT_LIST_HEAD(&p->thread_group);
2342 p->task_works = NULL;
2343 clear_posix_cputimers_work(p);
2344
2345 #ifdef CONFIG_KRETPROBES
2346 p->kretprobe_instances.first = NULL;
2347 #endif
2348 #ifdef CONFIG_RETHOOK
2349 p->rethooks.first = NULL;
2350 #endif
2351
2352
2353
2354
2355
2356
2357
2358 retval = cgroup_can_fork(p, args);
2359 if (retval)
2360 goto bad_fork_put_pidfd;
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371 sched_cgroup_fork(p, args);
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381 p->start_time = ktime_get_ns();
2382 p->start_boottime = ktime_get_boottime_ns();
2383
2384
2385
2386
2387
2388 write_lock_irq(&tasklist_lock);
2389
2390
2391 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2392 p->real_parent = current->real_parent;
2393 p->parent_exec_id = current->parent_exec_id;
2394 if (clone_flags & CLONE_THREAD)
2395 p->exit_signal = -1;
2396 else
2397 p->exit_signal = current->group_leader->exit_signal;
2398 } else {
2399 p->real_parent = current;
2400 p->parent_exec_id = current->self_exec_id;
2401 p->exit_signal = args->exit_signal;
2402 }
2403
2404 klp_copy_process(p);
2405
2406 sched_core_fork(p);
2407
spin_lock(&current->sighand->siglock);
2409
2410
2411
2412
2413
2414 copy_seccomp(p);
2415
2416 rv_task_fork(p);
2417
2418 rseq_fork(p, clone_flags);
2419
2420
2421 if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2422 retval = -ENOMEM;
2423 goto bad_fork_cancel_cgroup;
2424 }
2425
2426
2427 if (fatal_signal_pending(current)) {
2428 retval = -EINTR;
2429 goto bad_fork_cancel_cgroup;
2430 }
2431
2432 init_task_pid_links(p);
2433 if (likely(p->pid)) {
2434 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2435
2436 init_task_pid(p, PIDTYPE_PID, pid);
2437 if (thread_group_leader(p)) {
2438 init_task_pid(p, PIDTYPE_TGID, pid);
2439 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2440 init_task_pid(p, PIDTYPE_SID, task_session(current));
2441
2442 if (is_child_reaper(pid)) {
2443 ns_of_pid(pid)->child_reaper = p;
2444 p->signal->flags |= SIGNAL_UNKILLABLE;
2445 }
2446 p->signal->shared_pending.signal = delayed.signal;
2447 p->signal->tty = tty_kref_get(current->signal->tty);
2448
2449
2450
2451
2452
2453 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2454 p->real_parent->signal->is_child_subreaper;
2455 list_add_tail(&p->sibling, &p->real_parent->children);
2456 list_add_tail_rcu(&p->tasks, &init_task.tasks);
2457 attach_pid(p, PIDTYPE_TGID);
2458 attach_pid(p, PIDTYPE_PGID);
2459 attach_pid(p, PIDTYPE_SID);
2460 __this_cpu_inc(process_counts);
2461 } else {
2462 current->signal->nr_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
2465 task_join_group_stop(p);
2466 list_add_tail_rcu(&p->thread_group,
2467 &p->group_leader->thread_group);
2468 list_add_tail_rcu(&p->thread_node,
2469 &p->signal->thread_head);
2470 }
2471 attach_pid(p, PIDTYPE_PID);
2472 nr_threads++;
2473 }
2474 total_forks++;
2475 hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
2477 syscall_tracepoint_update(p);
2478 write_unlock_irq(&tasklist_lock);
2479
2480 if (pidfile)
2481 fd_install(pidfd, pidfile);
2482
2483 proc_fork_connector(p);
2484 sched_post_fork(p);
2485 cgroup_post_fork(p, args);
2486 perf_event_fork(p);
2487
2488 trace_task_newtask(p, clone_flags);
2489 uprobe_copy_process(p, clone_flags);
2490
2491 copy_oom_score_adj(clone_flags, p);
2492
2493 return p;
2494
2495 bad_fork_cancel_cgroup:
2496 sched_core_free(p);
spin_unlock(&current->sighand->siglock);
2498 write_unlock_irq(&tasklist_lock);
2499 cgroup_cancel_fork(p, args);
2500 bad_fork_put_pidfd:
2501 if (clone_flags & CLONE_PIDFD) {
2502 fput(pidfile);
2503 put_unused_fd(pidfd);
2504 }
2505 bad_fork_free_pid:
2506 if (pid != &init_struct_pid)
2507 free_pid(pid);
2508 bad_fork_cleanup_thread:
2509 exit_thread(p);
2510 bad_fork_cleanup_io:
2511 if (p->io_context)
2512 exit_io_context(p);
2513 bad_fork_cleanup_namespaces:
2514 exit_task_namespaces(p);
2515 bad_fork_cleanup_mm:
2516 if (p->mm) {
2517 mm_clear_owner(p->mm, p);
2518 mmput(p->mm);
2519 }
2520 bad_fork_cleanup_signal:
2521 if (!(clone_flags & CLONE_THREAD))
2522 free_signal_struct(p->signal);
2523 bad_fork_cleanup_sighand:
2524 __cleanup_sighand(p->sighand);
2525 bad_fork_cleanup_fs:
2526 exit_fs(p);
2527 bad_fork_cleanup_files:
2528 exit_files(p);
2529 bad_fork_cleanup_semundo:
2530 exit_sem(p);
2531 bad_fork_cleanup_security:
2532 security_task_free(p);
2533 bad_fork_cleanup_audit:
2534 audit_free(p);
2535 bad_fork_cleanup_perf:
2536 perf_event_free_task(p);
2537 bad_fork_cleanup_policy:
2538 lockdep_free_task(p);
2539 #ifdef CONFIG_NUMA
2540 mpol_put(p->mempolicy);
2541 #endif
2542 bad_fork_cleanup_delayacct:
2543 delayacct_tsk_free(p);
2544 bad_fork_cleanup_count:
2545 dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
2546 exit_creds(p);
2547 bad_fork_free:
2548 WRITE_ONCE(p->__state, TASK_DEAD);
2549 exit_task_stack_account(p);
2550 put_task_stack(p);
2551 delayed_free_task(p);
2552 fork_out:
2553 spin_lock_irq(&current->sighand->siglock);
2554 hlist_del_init(&delayed.node);
2555 spin_unlock_irq(&current->sighand->siglock);
2556 return ERR_PTR(retval);
2557 }
2558
2559 static inline void init_idle_pids(struct task_struct *idle)
2560 {
2561 enum pid_type type;
2562
2563 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2564 INIT_HLIST_NODE(&idle->pid_links[type]);
2565 init_task_pid(idle, type, &init_struct_pid);
2566 }
2567 }
2568
2569 static int idle_dummy(void *dummy)
2570 {
2571 /* This function is never called; the idle task runs the idle loop instead. */
2572 return 0;
2573 }
2574
2575 struct task_struct * __init fork_idle(int cpu)
2576 {
2577 struct task_struct *task;
2578 struct kernel_clone_args args = {
2579 .flags = CLONE_VM,
2580 .fn = &idle_dummy,
2581 .fn_arg = NULL,
2582 .kthread = 1,
2583 .idle = 1,
2584 };
2585
2586 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2587 if (!IS_ERR(task)) {
2588 init_idle_pids(task);
2589 init_idle(task, cpu);
2590 }
2591
2592 return task;
2593 }
2594
2595 struct mm_struct *copy_init_mm(void)
2596 {
2597 return dup_mm(NULL, &init_mm);
2598 }
2599
2600
2601 /*
2602  * This is like kernel_thread(), but for io_uring worker threads: the
2603  * new task shares the caller's VM, files, fs, signal handlers and I/O
2604  * context, and is marked as an io_thread (PF_IO_WORKER).
2605  */
2606 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2607 {
2608 unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2609 CLONE_IO;
2610 struct kernel_clone_args args = {
2611 .flags = ((lower_32_bits(flags) | CLONE_VM |
2612 CLONE_UNTRACED) & ~CSIGNAL),
2613 .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2614 .fn = fn,
2615 .fn_arg = arg,
2616 .io_thread = 1,
2617 };
2618
2619 return copy_process(NULL, 0, node, &args);
2620 }
2621
2622
2623 /*
2624  * Ok, this is the main fork-routine.
2625  *
2626  * It copies the process, and if successful kick-starts it and waits
2627  * for it to finish using the VM if required (CLONE_VFORK).
2628  * args->exit_signal is expected to be checked for sanity by the caller.
2629  */
2630 pid_t kernel_clone(struct kernel_clone_args *args)
2631 {
2632 u64 clone_flags = args->flags;
2633 struct completion vfork;
2634 struct pid *pid;
2635 struct task_struct *p;
2636 int trace = 0;
2637 pid_t nr;
2638
2639
2640 /*
2641  * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2642  * to return the pidfd, so CLONE_PIDFD and CLONE_PARENT_SETTID are
2643  * mutually exclusive. With clone3(), CLONE_PIDFD has its own field in
2644  * struct clone_args, and it still makes no sense for both to point at
2645  * the same memory location. Performing the check here avoids a
2646  * separate helper for legacy clone().
2647  */
2648 if ((args->flags & CLONE_PIDFD) &&
2649 (args->flags & CLONE_PARENT_SETTID) &&
2650 (args->pidfd == args->parent_tid))
2651 return -EINVAL;
2652
2653
2654 /*
2655  * Determine whether and which event to report to the ptracer. When
2656  * CLONE_UNTRACED is explicitly requested no event is reported;
2657  * otherwise report the VFORK, CLONE or FORK event if it is enabled.
2658  */
2659 if (!(clone_flags & CLONE_UNTRACED)) {
2660 if (clone_flags & CLONE_VFORK)
2661 trace = PTRACE_EVENT_VFORK;
2662 else if (args->exit_signal != SIGCHLD)
2663 trace = PTRACE_EVENT_CLONE;
2664 else
2665 trace = PTRACE_EVENT_FORK;
2666
2667 if (likely(!ptrace_event_enabled(current, trace)))
2668 trace = 0;
2669 }
2670
2671 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2672 add_latent_entropy();
2673
2674 if (IS_ERR(p))
2675 return PTR_ERR(p);
2676
2677 /*
2678  * Do this prior to waking up the new thread - the thread pointer
2679  * might become invalid if the thread exits quickly.
2680  */
2681 trace_sched_process_fork(current, p);
2682
2683 pid = get_task_pid(p, PIDTYPE_PID);
2684 nr = pid_vnr(pid);
2685
2686 if (clone_flags & CLONE_PARENT_SETTID)
2687 put_user(nr, args->parent_tid);
2688
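/*
 * For vfork(), set up a completion that the parent will wait on until
 * the child releases its reference to the mm (on exec or exit).
 */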
2689 if (clone_flags & CLONE_VFORK) {
2690 p->vfork_done = &vfork;
2691 init_completion(&vfork);
2692 get_task_struct(p);
2693 }
2694
2695 wake_up_new_task(p);
2696
2697 /* Forking is complete and the child has started to run; tell the ptracer. */
2698 if (unlikely(trace))
2699 ptrace_event_pid(trace, pid);
2700
2701 if (clone_flags & CLONE_VFORK) {
2702 if (!wait_for_vfork_done(p, &vfork))
2703 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2704 }
2705
2706 put_pid(pid);
2707 return nr;
2708 }
2709
2710
2711
2712 /* Create a kernel thread. */
2713 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2714 {
2715 struct kernel_clone_args args = {
2716 .flags = ((lower_32_bits(flags) | CLONE_VM |
2717 CLONE_UNTRACED) & ~CSIGNAL),
2718 .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2719 .fn = fn,
2720 .fn_arg = arg,
2721 .kthread = 1,
2722 };
2723
2724 return kernel_clone(&args);
2725 }
2726
2727
2728
2729 /* Create a user mode thread. */
2730 pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
2731 {
2732 struct kernel_clone_args args = {
2733 .flags = ((lower_32_bits(flags) | CLONE_VM |
2734 CLONE_UNTRACED) & ~CSIGNAL),
2735 .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2736 .fn = fn,
2737 .fn_arg = arg,
2738 };
2739
2740 return kernel_clone(&args);
2741 }
2742
2743 #ifdef __ARCH_WANT_SYS_FORK
2744 SYSCALL_DEFINE0(fork)
2745 {
2746 #ifdef CONFIG_MMU
2747 struct kernel_clone_args args = {
2748 .exit_signal = SIGCHLD,
2749 };
2750
2751 return kernel_clone(&args);
2752 #else
2753 /* fork() cannot be supported without an MMU */
2754 return -EINVAL;
2755 #endif
2756 }
2757 #endif
2758
2759 #ifdef __ARCH_WANT_SYS_VFORK
2760 SYSCALL_DEFINE0(vfork)
2761 {
2762 struct kernel_clone_args args = {
2763 .flags = CLONE_VFORK | CLONE_VM,
2764 .exit_signal = SIGCHLD,
2765 };
2766
2767 return kernel_clone(&args);
2768 }
2769 #endif
2770
2771 #ifdef __ARCH_WANT_SYS_CLONE
2772 #ifdef CONFIG_CLONE_BACKWARDS
2773 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2774 int __user *, parent_tidptr,
2775 unsigned long, tls,
2776 int __user *, child_tidptr)
2777 #elif defined(CONFIG_CLONE_BACKWARDS2)
2778 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2779 int __user *, parent_tidptr,
2780 int __user *, child_tidptr,
2781 unsigned long, tls)
2782 #elif defined(CONFIG_CLONE_BACKWARDS3)
2783 SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2784 int, stack_size,
2785 int __user *, parent_tidptr,
2786 int __user *, child_tidptr,
2787 unsigned long, tls)
2788 #else
2789 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2790 int __user *, parent_tidptr,
2791 int __user *, child_tidptr,
2792 unsigned long, tls)
2793 #endif
2794 {
2795 struct kernel_clone_args args = {
2796 .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2797 .pidfd = parent_tidptr,
2798 .child_tid = child_tidptr,
2799 .parent_tid = parent_tidptr,
2800 .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2801 .stack = newsp,
2802 .tls = tls,
2803 };
2804
2805 return kernel_clone(&args);
2806 }
2807 #endif
2808
2809 #ifdef __ARCH_WANT_SYS_CLONE3
2810
2811 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2812 struct clone_args __user *uargs,
2813 size_t usize)
2814 {
2815 int err;
2816 struct clone_args args;
2817 pid_t *kset_tid = kargs->set_tid;
2818
2819 BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2820 CLONE_ARGS_SIZE_VER0);
2821 BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2822 CLONE_ARGS_SIZE_VER1);
2823 BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2824 CLONE_ARGS_SIZE_VER2);
2825 BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2826
2827 if (unlikely(usize > PAGE_SIZE))
2828 return -E2BIG;
2829 if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2830 return -EINVAL;
2831
2832 err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2833 if (err)
2834 return err;
2835
2836 if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2837 return -EINVAL;
2838
2839 if (unlikely(!args.set_tid && args.set_tid_size > 0))
2840 return -EINVAL;
2841
2842 if (unlikely(args.set_tid && args.set_tid_size == 0))
2843 return -EINVAL;
2844
2845 /*
2846  * Verify that the upper 32 bits of exit_signal are clear and that
2847  * the low bits form a valid signal number.
2848  */
2849 if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2850 !valid_signal(args.exit_signal)))
2851 return -EINVAL;
2852
2853 if ((args.flags & CLONE_INTO_CGROUP) &&
2854 (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
2855 return -EINVAL;
2856
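/* Translate the userspace struct clone_args into kernel_clone_args. */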
2857 *kargs = (struct kernel_clone_args){
2858 .flags = args.flags,
2859 .pidfd = u64_to_user_ptr(args.pidfd),
2860 .child_tid = u64_to_user_ptr(args.child_tid),
2861 .parent_tid = u64_to_user_ptr(args.parent_tid),
2862 .exit_signal = args.exit_signal,
2863 .stack = args.stack,
2864 .stack_size = args.stack_size,
2865 .tls = args.tls,
2866 .set_tid_size = args.set_tid_size,
2867 .cgroup = args.cgroup,
2868 };
2869
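/* Copy the caller-supplied array of PIDs to request in each pid namespace. */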
2870 if (args.set_tid &&
2871 copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2872 (kargs->set_tid_size * sizeof(pid_t))))
2873 return -EFAULT;
2874
2875 kargs->set_tid = kset_tid;
2876
2877 return 0;
2878 }
2879
2880
2881 /**
2882  * clone3_stack_valid - check and prepare stack
2883  * @kargs: kernel clone args
2884  *
2885  * Verify that the stack arguments userspace gave us are sane and, where
2886  * possible, adjust the stack pointer for the stack-growth direction.
2887  */
2888 static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2889 {
2890 if (kargs->stack == 0) {
2891 if (kargs->stack_size > 0)
2892 return false;
2893 } else {
2894 if (kargs->stack_size == 0)
2895 return false;
2896
2897 if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2898 return false;
2899
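/*
 * On architectures where the stack grows down, userspace passes the base
 * of the stack area, so point the stack argument at its top.
 */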
2900 #if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
2901 kargs->stack += kargs->stack_size;
2902 #endif
2903 }
2904
2905 return true;
2906 }
2907
2908 static bool clone3_args_valid(struct kernel_clone_args *kargs)
2909 {
2910 /* Verify that no unknown flags are passed along. */
2911 if (kargs->flags &
2912 ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2913 return false;
2914
2915 /*
2916  * - make the CLONE_DETACHED bit reusable for clone3
2917  * - make the CSIGNAL bits reusable for clone3
2918  */
2919 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2920 return false;
2921
2922 if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2923 (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2924 return false;
2925
2926 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2927 kargs->exit_signal)
2928 return false;
2929
2930 if (!clone3_stack_valid(kargs))
2931 return false;
2932
2933 return true;
2934 }
2935
2936
2937 /**
2938  * sys_clone3 - create a new process with specific properties
2939  * @uargs: argument structure
2940  * @size:  size of @uargs
2941  *
2942  * clone3() is the extensible successor to clone()/clone2(); it takes a
2943  * struct as argument that is versioned by size.
2944  *
2945  * Return: On success, a positive PID for the child; on error, a negative errno.
2946  */
2947 SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2948 {
2949 int err;
2950
2951 struct kernel_clone_args kargs;
2952 pid_t set_tid[MAX_PID_NS_LEVEL];
2953
2954 kargs.set_tid = set_tid;
2955
2956 err = copy_clone_args_from_user(&kargs, uargs, size);
2957 if (err)
2958 return err;
2959
2960 if (!clone3_args_valid(&kargs))
2961 return -EINVAL;
2962
2963 return kernel_clone(&kargs);
2964 }
2965 #endif
2966
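/*
 * Depth-first walk of the process tree below @top. A positive return from
 * @visitor descends into that child's children, a negative return aborts
 * the walk, and zero continues with the next sibling.
 */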
2967 void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2968 {
2969 struct task_struct *leader, *parent, *child;
2970 int res;
2971
2972 read_lock(&tasklist_lock);
2973 leader = top = top->group_leader;
2974 down:
2975 for_each_thread(leader, parent) {
2976 list_for_each_entry(child, &parent->children, sibling) {
2977 res = visitor(child, data);
2978 if (res) {
2979 if (res < 0)
2980 goto out;
2981 leader = child;
2982 goto down;
2983 }
2984 up:
2985 ;
2986 }
2987 }
2988
2989 if (leader != top) {
2990 child = leader;
2991 parent = child->real_parent;
2992 leader = parent->group_leader;
2993 goto up;
2994 }
2995 out:
2996 read_unlock(&tasklist_lock);
2997 }
2998
2999 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
3000 #define ARCH_MIN_MMSTRUCT_ALIGN 0
3001 #endif
3002
3003 static void sighand_ctor(void *data)
3004 {
3005 struct sighand_struct *sighand = data;
3006
3007 spin_lock_init(&sighand->siglock);
3008 init_waitqueue_head(&sighand->signalfd_wqh);
3009 }
3010
3011 void __init proc_caches_init(void)
3012 {
3013 unsigned int mm_size;
3014
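/*
 * sighand_struct is allocated from a SLAB_TYPESAFE_BY_RCU cache so that
 * lockless users such as lock_task_sighand() can dereference a possibly
 * recycled pointer and then revalidate it under the lock.
 */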
3015 sighand_cachep = kmem_cache_create("sighand_cache",
3016 sizeof(struct sighand_struct), 0,
3017 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
3018 SLAB_ACCOUNT, sighand_ctor);
3019 signal_cachep = kmem_cache_create("signal_cache",
3020 sizeof(struct signal_struct), 0,
3021 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3022 NULL);
3023 files_cachep = kmem_cache_create("files_cache",
3024 sizeof(struct files_struct), 0,
3025 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3026 NULL);
3027 fs_cachep = kmem_cache_create("fs_cache",
3028 sizeof(struct fs_struct), 0,
3029 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3030 NULL);
3031
3032 /*
3033  * The mm_cpumask is located at the end of mm_struct, and is
3034  * dynamically sized based on the maximum number of CPUs this system
3035  * can have, taking CPU hotplug into account.
3036  */
3037 mm_size = sizeof(struct mm_struct) + cpumask_size();
3038
3039 mm_cachep = kmem_cache_create_usercopy("mm_struct",
3040 mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
3041 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3042 offsetof(struct mm_struct, saved_auxv),
3043 sizeof_field(struct mm_struct, saved_auxv),
3044 NULL);
3045 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
3046 mmap_init();
3047 nsproxy_cache_init();
3048 }
3049
3050
3051
3052 /* Check constraints on flags passed to the unshare system call. */
3053 static int check_unshare_flags(unsigned long unshare_flags)
3054 {
3055 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
3056 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
3057 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
3058 CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
3059 CLONE_NEWTIME))
3060 return -EINVAL;
3061
3062 /*
3063  * Not implemented, but pretend it works if there is nothing to
3064  * unshare. Note that unsharing the address space or the signal
3065  * handlers also requires unsharing the signal queues (aka CLONE_THREAD).
3066  */
3067 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
3068 if (!thread_group_empty(current))
3069 return -EINVAL;
3070 }
3071 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
3072 if (refcount_read(&current->sighand->count) > 1)
3073 return -EINVAL;
3074 }
3075 if (unshare_flags & CLONE_VM) {
3076 if (!current_is_single_threaded())
3077 return -EINVAL;
3078 }
3079
3080 return 0;
3081 }
3082
3083
3084
3085 /* Unshare the filesystem structure if it is being shared. */
3086 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
3087 {
3088 struct fs_struct *fs = current->fs;
3089
3090 if (!(unshare_flags & CLONE_FS) || !fs)
3091 return 0;
3092
3093 /* No lock needed here; in the worst case we make a useless extra copy. */
3094 if (fs->users == 1)
3095 return 0;
3096
3097 *new_fsp = copy_fs_struct(fs);
3098 if (!*new_fsp)
3099 return -ENOMEM;
3100
3101 return 0;
3102 }
3103
3104
3105
3106 /* Unshare the file descriptor table if it is being shared. */
3107 int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
3108 struct files_struct **new_fdp)
3109 {
3110 struct files_struct *fd = current->files;
3111 int error = 0;
3112
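/* Only duplicate the descriptor table if it is actually shared. */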
3113 if ((unshare_flags & CLONE_FILES) &&
3114 (fd && atomic_read(&fd->count) > 1)) {
3115 *new_fdp = dup_fd(fd, max_fds, &error);
3116 if (!*new_fdp)
3117 return error;
3118 }
3119
3120 return 0;
3121 }
3122
3123
3124 /*
3125  * unshare allows a process to 'unshare' part of the process context
3126  * which was originally shared using clone. The copy_* functions used
3127  * by kernel_clone() cannot be used here directly because they modify a
3128  * not-yet-running task_struct, whereas here we are modifying the
3129  * current, active task_struct.
3130  */
3131 int ksys_unshare(unsigned long unshare_flags)
3132 {
3133 struct fs_struct *fs, *new_fs = NULL;
3134 struct files_struct *new_fd = NULL;
3135 struct cred *new_cred = NULL;
3136 struct nsproxy *new_nsproxy = NULL;
3137 int do_sysvsem = 0;
3138 int err;
3139
3140 /*
3141  * If unsharing a user namespace, we must also unshare the thread group
3142  * and unshare the filesystem root and working directories.
3143  */
3144 if (unshare_flags & CLONE_NEWUSER)
3145 unshare_flags |= CLONE_THREAD | CLONE_FS;
3146
3147
3148 /* If unsharing vm, must also unshare signal handlers. */
3149 if (unshare_flags & CLONE_VM)
3150 unshare_flags |= CLONE_SIGHAND;
3151
3152
3153 /* If unsharing signal handlers, must also unshare the signal queues. */
3154 if (unshare_flags & CLONE_SIGHAND)
3155 unshare_flags |= CLONE_THREAD;
3156
3157
3158 /* If unsharing a mount namespace, must also unshare filesystem information. */
3159 if (unshare_flags & CLONE_NEWNS)
3160 unshare_flags |= CLONE_FS;
3161
3162 err = check_unshare_flags(unshare_flags);
3163 if (err)
3164 goto bad_unshare_out;
3165
3166 /*
3167  * CLONE_NEWIPC must also detach from the undolist: after switching to a
3168  * new ipc namespace, the old namespace's semaphore arrays are unreachable.
3169  */
3170 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
3171 do_sysvsem = 1;
3172 err = unshare_fs(unshare_flags, &new_fs);
3173 if (err)
3174 goto bad_unshare_out;
3175 err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
3176 if (err)
3177 goto bad_unshare_cleanup_fs;
3178 err = unshare_userns(unshare_flags, &new_cred);
3179 if (err)
3180 goto bad_unshare_cleanup_fd;
3181 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
3182 new_cred, new_fs);
3183 if (err)
3184 goto bad_unshare_cleanup_cred;
3185
3186 if (new_cred) {
3187 err = set_cred_ucounts(new_cred);
3188 if (err)
3189 goto bad_unshare_cleanup_cred;
3190 }
3191
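/* Install the newly unshared copies, if anything was actually unshared. */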
3192 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
3193 if (do_sysvsem) {
3194
3195
3196 /* Release SysV semaphore undo state, as on exit. */
3197 exit_sem(current);
3198 }
3199 if (unshare_flags & CLONE_NEWIPC) {
3200 /* Drop SysV shm state tied to the old ipc namespace. */
3201 exit_shm(current);
3202 shm_init_task(current);
3203 }
3204
3205 if (new_nsproxy)
3206 switch_task_namespaces(current, new_nsproxy);
3207
3208 task_lock(current);
3209
3210 if (new_fs) {
3211 fs = current->fs;
3212 spin_lock(&fs->lock);
3213 current->fs = new_fs;
3214 if (--fs->users)
3215 new_fs = NULL;
3216 else
3217 new_fs = fs;
3218 spin_unlock(&fs->lock);
3219 }
3220
3221 if (new_fd)
3222 swap(current->files, new_fd);
3223
3224 task_unlock(current);
3225
3226 if (new_cred) {
3227 /* Install the new credentials, including the new user namespace. */
3228 commit_creds(new_cred);
3229 new_cred = NULL;
3230 }
3231 }
3232
3233 perf_event_namespaces(current);
3234
3235 bad_unshare_cleanup_cred:
3236 if (new_cred)
3237 put_cred(new_cred);
3238 bad_unshare_cleanup_fd:
3239 if (new_fd)
3240 put_files_struct(new_fd);
3241
3242 bad_unshare_cleanup_fs:
3243 if (new_fs)
3244 free_fs_struct(new_fs);
3245
3246 bad_unshare_out:
3247 return err;
3248 }
3249
3250 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
3251 {
3252 return ksys_unshare(unshare_flags);
3253 }
3254
3255
3256 /*
3257  * Helper to unshare the files of the current task, so that the task
3258  * gets a private copy of its file descriptor table if it is currently
3259  * shared with other tasks.
3260  */
3261 int unshare_files(void)
3262 {
3263 struct task_struct *task = current;
3264 struct files_struct *old, *copy = NULL;
3265 int error;
3266
3267 error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
3268 if (error || !copy)
3269 return error;
3270
3271 old = task->files;
3272 task_lock(task);
3273 task->files = copy;
3274 task_unlock(task);
3275 put_files_struct(old);
3276 return 0;
3277 }
3278
3279 int sysctl_max_threads(struct ctl_table *table, int write,
3280 void *buffer, size_t *lenp, loff_t *ppos)
3281 {
3282 struct ctl_table t;
3283 int ret;
3284 int threads = max_threads;
3285 int min = 1;
3286 int max = MAX_THREADS;
3287
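/*
 * Operate on a local copy of the table so the [1, MAX_THREADS] clamp can
 * be applied without modifying the shared ctl_table.
 */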
3288 t = *table;
3289 t.data = &threads;
3290 t.extra1 = &min;
3291 t.extra2 = &max;
3292
3293 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3294 if (ret || !write)
3295 return ret;
3296
3297 max_threads = threads;
3298
3299 return 0;
3300 }