Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  linux/fs/proc/base.c
0004  *
0005  *  Copyright (C) 1991, 1992 Linus Torvalds
0006  *
0007  *  proc base directory handling functions
0008  *
0009  *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
0010  *  Instead of using magical inumbers to determine the kind of object
0011  *  we allocate and fill in-core inodes upon lookup. They don't even
0012  *  go into icache. We cache the reference to task_struct upon lookup too.
0013  *  Eventually it should become a filesystem in its own. We don't use the
0014  *  rest of procfs anymore.
0015  *
0016  *
0017  *  Changelog:
0018  *  17-Jan-2005
0019  *  Allan Bezerra
0020  *  Bruna Moreira <bruna.moreira@indt.org.br>
0021  *  Edjard Mota <edjard.mota@indt.org.br>
0022  *  Ilias Biris <ilias.biris@indt.org.br>
0023  *  Mauricio Lin <mauricio.lin@indt.org.br>
0024  *
0025  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
0026  *
0027  *  A new process specific entry (smaps) included in /proc. It shows the
0028  *  size of rss for each memory area. The maps entry lacks information
0029  *  about physical memory size (rss) for each mapped file, i.e.,
0030  *  rss information for executables and library files.
0031  *  This additional information is useful for any tools that need to know
0032  *  about physical memory consumption for a process specific library.
0033  *
0034  *  Changelog:
0035  *  21-Feb-2005
0036  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
0037  *  Pud inclusion in the page table walking.
0038  *
0039  *  ChangeLog:
0040  *  10-Mar-2005
0041  *  10LE Instituto Nokia de Tecnologia - INdT:
0042  *  A better way to walks through the page table as suggested by Hugh Dickins.
0043  *
0044  *  Simo Piiroinen <simo.piiroinen@nokia.com>:
0045  *  Smaps information related to shared, private, clean and dirty pages.
0046  *
0047  *  Paul Mundt <paul.mundt@nokia.com>:
0048  *  Overall revision about smaps.
0049  */
0050 
0051 #include <linux/uaccess.h>
0052 
0053 #include <linux/errno.h>
0054 #include <linux/time.h>
0055 #include <linux/proc_fs.h>
0056 #include <linux/stat.h>
0057 #include <linux/task_io_accounting_ops.h>
0058 #include <linux/init.h>
0059 #include <linux/capability.h>
0060 #include <linux/file.h>
0061 #include <linux/fdtable.h>
0062 #include <linux/generic-radix-tree.h>
0063 #include <linux/string.h>
0064 #include <linux/seq_file.h>
0065 #include <linux/namei.h>
0066 #include <linux/mnt_namespace.h>
0067 #include <linux/mm.h>
0068 #include <linux/swap.h>
0069 #include <linux/rcupdate.h>
0070 #include <linux/kallsyms.h>
0071 #include <linux/stacktrace.h>
0072 #include <linux/resource.h>
0073 #include <linux/module.h>
0074 #include <linux/mount.h>
0075 #include <linux/security.h>
0076 #include <linux/ptrace.h>
0077 #include <linux/printk.h>
0078 #include <linux/cache.h>
0079 #include <linux/cgroup.h>
0080 #include <linux/cpuset.h>
0081 #include <linux/audit.h>
0082 #include <linux/poll.h>
0083 #include <linux/nsproxy.h>
0084 #include <linux/oom.h>
0085 #include <linux/elf.h>
0086 #include <linux/pid_namespace.h>
0087 #include <linux/user_namespace.h>
0088 #include <linux/fs_struct.h>
0089 #include <linux/slab.h>
0090 #include <linux/sched/autogroup.h>
0091 #include <linux/sched/mm.h>
0092 #include <linux/sched/coredump.h>
0093 #include <linux/sched/debug.h>
0094 #include <linux/sched/stat.h>
0095 #include <linux/posix-timers.h>
0096 #include <linux/time_namespace.h>
0097 #include <linux/resctrl.h>
0098 #include <linux/cn_proc.h>
0099 #include <trace/events/oom.h>
0100 #include "internal.h"
0101 #include "fd.h"
0102 
0103 #include "../../lib/kstrtox.h"
0104 
0105 /* NOTE:
0106  *  Implementing inode permission operations in /proc is almost
0107  *  certainly an error.  Permission checks need to happen during
0108  *  each system call not at open time.  The reason is that most of
0109  *  what we wish to check for permissions in /proc varies at runtime.
0110  *
0111  *  The classic example of a problem is opening file descriptors
0112  *  in /proc for a task before it execs a suid executable.
0113  */
0114 
/* Link counts for /proc/<tid> and /proc/<tgid> directories; written once
 * during init (__ro_after_init) — presumably from pid_entry_nlink(), confirm. */
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

/*
 * Describes one entry of a /proc/<pid> (or /proc/<pid>/task/<tid>)
 * directory: its name, mode, and the operations used when the entry's
 * inode is instantiated at lookup time.
 */
struct pid_entry {
    const char *name;                   /* entry name */
    unsigned int len;                   /* strlen(name), precomputed */
    umode_t mode;                       /* S_IF* type plus permission bits */
    const struct inode_operations *iop; /* NULL -> default inode ops */
    const struct file_operations *fop;  /* NULL -> no file ops */
    union proc_op op;                   /* per-entry payload (show/link/lsm) */
};
0126 
/*
 * Initializer helpers for pid_entry tables.  NOD() fills in a complete
 * entry; the wrappers below cover the common entry flavors.
 * Note: sizeof(NAME) - 1 is the literal's length without the NUL.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {         \
    .name = (NAME),                 \
    .len  = sizeof(NAME) - 1,           \
    .mode = MODE,                   \
    .iop  = IOP,                    \
    .fop  = FOP,                    \
    .op   = OP,                 \
}

/* a subdirectory with its own inode and file operations */
#define DIR(NAME, MODE, iops, fops) \
    NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/* a symlink resolved at follow time by the get_link() callback */
#define LNK(NAME, get_link)                 \
    NOD(NAME, (S_IFLNK|S_IRWXUGO),              \
        &proc_pid_link_inode_operations, NULL,      \
        { .proc_get_link = get_link } )
/* a regular file with dedicated file_operations */
#define REG(NAME, MODE, fops)               \
    NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/* a regular file whose whole contents come from one show() callback */
#define ONE(NAME, MODE, show)               \
    NOD(NAME, (S_IFREG|(MODE)),         \
        NULL, &proc_single_file_operations, \
        { .proc_show = show } )
/* an LSM attribute file (/proc/<pid>/attr/...) */
#define ATTR(LSM, NAME, MODE)               \
    NOD(NAME, (S_IFREG|(MODE)),         \
        NULL, &proc_pid_attr_operations,    \
        { .lsm = LSM })
0152 
0153 /*
0154  * Count the number of hardlinks for the pid_entry table, excluding the .
0155  * and .. links.
0156  */
0157 static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
0158     unsigned int n)
0159 {
0160     unsigned int i;
0161     unsigned int count;
0162 
0163     count = 2;
0164     for (i = 0; i < n; ++i) {
0165         if (S_ISDIR(entries[i].mode))
0166             ++count;
0167     }
0168 
0169     return count;
0170 }
0171 
0172 static int get_task_root(struct task_struct *task, struct path *root)
0173 {
0174     int result = -ENOENT;
0175 
0176     task_lock(task);
0177     if (task->fs) {
0178         get_fs_root(task->fs, root);
0179         result = 0;
0180     }
0181     task_unlock(task);
0182     return result;
0183 }
0184 
0185 static int proc_cwd_link(struct dentry *dentry, struct path *path)
0186 {
0187     struct task_struct *task = get_proc_task(d_inode(dentry));
0188     int result = -ENOENT;
0189 
0190     if (task) {
0191         task_lock(task);
0192         if (task->fs) {
0193             get_fs_pwd(task->fs, path);
0194             result = 0;
0195         }
0196         task_unlock(task);
0197         put_task_struct(task);
0198     }
0199     return result;
0200 }
0201 
0202 static int proc_root_link(struct dentry *dentry, struct path *path)
0203 {
0204     struct task_struct *task = get_proc_task(d_inode(dentry));
0205     int result = -ENOENT;
0206 
0207     if (task) {
0208         result = get_task_root(task, path);
0209         put_task_struct(task);
0210     }
0211     return result;
0212 }
0213 
/*
 * If the user used setproctitle(), we just get the string from
 * user space at arg_start, and limit it to a maximum of one page.
 *
 * @pos is the offset into that (at most one page) view; returns the
 * number of bytes copied to @buf, 0 at/after EOF, or a -errno.
 */
static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
                size_t count, unsigned long pos,
                unsigned long arg_start)
{
    char *page;
    int ret, got;

    /* the proctitle view is capped at one page */
    if (pos >= PAGE_SIZE)
        return 0;

    page = (char *)__get_free_page(GFP_KERNEL);
    if (!page)
        return -ENOMEM;

    ret = 0;
    /* FOLL_ANON: only read anonymous memory backing the arg strings */
    got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
    if (got > 0) {
        int len = strnlen(page, got);

        /* Include the NUL character if it was found */
        if (len < got)
            len++;

        if (len > pos) {
            len -= pos;
            if (len > count)
                len = count;
            /* len becomes the number of bytes that reached user space */
            len -= copy_to_user(buf, page+pos, len);
            if (!len)
                len = -EFAULT;  /* nothing copied at all: fault */
            ret = len;
        }
    }
    free_page((unsigned long)page);
    return ret;
}
0254 
/*
 * Read the command line of @mm into @buf, honouring a possible
 * setproctitle() overwrite.  Returns bytes copied, 0 at EOF, or -errno.
 */
static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
                  size_t count, loff_t *ppos)
{
    unsigned long arg_start, arg_end, env_start, env_end;
    unsigned long pos, len;
    char *page, c;

    /* Check if process spawned far enough to have cmdline. */
    if (!mm->env_end)
        return 0;

    /* snapshot the arg/env boundaries under arg_lock */
    spin_lock(&mm->arg_lock);
    arg_start = mm->arg_start;
    arg_end = mm->arg_end;
    env_start = mm->env_start;
    env_end = mm->env_end;
    spin_unlock(&mm->arg_lock);

    if (arg_start >= arg_end)
        return 0;

    /*
     * We allow setproctitle() to overwrite the argument
     * strings, and overflow past the original end. But
     * only when it overflows into the environment area.
     */
    if (env_start != arg_end || env_end < env_start)
        env_start = env_end = arg_end;
    len = env_end - arg_start;

    /* We're not going to care if "*ppos" has high bits set */
    pos = *ppos;
    if (pos >= len)
        return 0;
    if (count > len - pos)
        count = len - pos;
    if (!count)
        return 0;

    /*
     * Magical special case: if the argv[] end byte is not
     * zero, the user has overwritten it with setproctitle(3).
     *
     * Possible future enhancement: do this only once when
     * pos is 0, and set a flag in the 'struct file'.
     */
    if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
        return get_mm_proctitle(mm, buf, count, pos, arg_start);

    /*
     * For the non-setproctitle() case we limit things strictly
     * to the [arg_start, arg_end[ range.
     */
    pos += arg_start;
    if (pos < arg_start || pos >= arg_end)
        return 0;
    if (count > arg_end - pos)
        count = arg_end - pos;

    page = (char *)__get_free_page(GFP_KERNEL);
    if (!page)
        return -ENOMEM;

    /* copy the target's memory out one page-sized chunk at a time */
    len = 0;
    while (count) {
        int got;
        size_t size = min_t(size_t, PAGE_SIZE, count);

        got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
        if (got <= 0)
            break;
        /* got becomes the number of bytes that reached user space */
        got -= copy_to_user(buf, page, got);
        if (unlikely(!got)) {
            if (!len)
                len = -EFAULT;  /* faulted before any progress */
            break;
        }
        pos += got;
        buf += got;
        len += got;
        count -= got;
    }

    free_page((unsigned long)page);
    return len;
}
0341 
0342 static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
0343                 size_t count, loff_t *pos)
0344 {
0345     struct mm_struct *mm;
0346     ssize_t ret;
0347 
0348     mm = get_task_mm(tsk);
0349     if (!mm)
0350         return 0;
0351 
0352     ret = get_mm_cmdline(mm, buf, count, pos);
0353     mmput(mm);
0354     return ret;
0355 }
0356 
0357 static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
0358                      size_t count, loff_t *pos)
0359 {
0360     struct task_struct *tsk;
0361     ssize_t ret;
0362 
0363     BUG_ON(*pos < 0);
0364 
0365     tsk = get_proc_task(file_inode(file));
0366     if (!tsk)
0367         return -ESRCH;
0368     ret = get_task_cmdline(tsk, buf, count, pos);
0369     put_task_struct(tsk);
0370     if (ret > 0)
0371         *pos += ret;
0372     return ret;
0373 }
0374 
/* /proc/<pid>/cmdline: read-only, seekable */
static const struct file_operations proc_pid_cmdline_ops = {
    .read   = proc_pid_cmdline_read,
    .llseek = generic_file_llseek,
};
0379 
0380 #ifdef CONFIG_KALLSYMS
0381 /*
0382  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
0383  * Returns the resolved symbol.  If that fails, simply return the address.
0384  */
0385 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
0386               struct pid *pid, struct task_struct *task)
0387 {
0388     unsigned long wchan;
0389     char symname[KSYM_NAME_LEN];
0390 
0391     if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
0392         goto print0;
0393 
0394     wchan = get_wchan(task);
0395     if (wchan && !lookup_symbol_name(wchan, symname)) {
0396         seq_puts(m, symname);
0397         return 0;
0398     }
0399 
0400 print0:
0401     seq_putc(m, '0');
0402     return 0;
0403 }
0404 #endif /* CONFIG_KALLSYMS */
0405 
0406 static int lock_trace(struct task_struct *task)
0407 {
0408     int err = down_read_killable(&task->signal->exec_update_lock);
0409     if (err)
0410         return err;
0411     if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
0412         up_read(&task->signal->exec_update_lock);
0413         return -EPERM;
0414     }
0415     return 0;
0416 }
0417 
/* Release the lock taken by a successful lock_trace(). */
static void unlock_trace(struct task_struct *task)
{
    up_read(&task->signal->exec_update_lock);
}
0422 
0423 #ifdef CONFIG_STACKTRACE
0424 
0425 #define MAX_STACK_TRACE_DEPTH   64
0426 
0427 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
0428               struct pid *pid, struct task_struct *task)
0429 {
0430     unsigned long *entries;
0431     int err;
0432 
0433     /*
0434      * The ability to racily run the kernel stack unwinder on a running task
0435      * and then observe the unwinder output is scary; while it is useful for
0436      * debugging kernel issues, it can also allow an attacker to leak kernel
0437      * stack contents.
0438      * Doing this in a manner that is at least safe from races would require
0439      * some work to ensure that the remote task can not be scheduled; and
0440      * even then, this would still expose the unwinder as local attack
0441      * surface.
0442      * Therefore, this interface is restricted to root.
0443      */
0444     if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
0445         return -EACCES;
0446 
0447     entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
0448                 GFP_KERNEL);
0449     if (!entries)
0450         return -ENOMEM;
0451 
0452     err = lock_trace(task);
0453     if (!err) {
0454         unsigned int i, nr_entries;
0455 
0456         nr_entries = stack_trace_save_tsk(task, entries,
0457                           MAX_STACK_TRACE_DEPTH, 0);
0458 
0459         for (i = 0; i < nr_entries; i++) {
0460             seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
0461         }
0462 
0463         unlock_trace(task);
0464     }
0465     kfree(entries);
0466 
0467     return err;
0468 }
0469 #endif
0470 
0471 #ifdef CONFIG_SCHED_INFO
0472 /*
0473  * Provides /proc/PID/schedstat
0474  */
0475 static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
0476                   struct pid *pid, struct task_struct *task)
0477 {
0478     if (unlikely(!sched_info_on()))
0479         seq_puts(m, "0 0 0\n");
0480     else
0481         seq_printf(m, "%llu %llu %lu\n",
0482            (unsigned long long)task->se.sum_exec_runtime,
0483            (unsigned long long)task->sched_info.run_delay,
0484            task->sched_info.pcount);
0485 
0486     return 0;
0487 }
0488 #endif
0489 
0490 #ifdef CONFIG_LATENCYTOP
0491 static int lstats_show_proc(struct seq_file *m, void *v)
0492 {
0493     int i;
0494     struct inode *inode = m->private;
0495     struct task_struct *task = get_proc_task(inode);
0496 
0497     if (!task)
0498         return -ESRCH;
0499     seq_puts(m, "Latency Top version : v0.1\n");
0500     for (i = 0; i < LT_SAVECOUNT; i++) {
0501         struct latency_record *lr = &task->latency_record[i];
0502         if (lr->backtrace[0]) {
0503             int q;
0504             seq_printf(m, "%i %li %li",
0505                    lr->count, lr->time, lr->max);
0506             for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
0507                 unsigned long bt = lr->backtrace[q];
0508 
0509                 if (!bt)
0510                     break;
0511                 seq_printf(m, " %ps", (void *)bt);
0512             }
0513             seq_putc(m, '\n');
0514         }
0515 
0516     }
0517     put_task_struct(task);
0518     return 0;
0519 }
0520 
/* open() for /proc/<pid>/latency: single-record seq_file. */
static int lstats_open(struct inode *inode, struct file *file)
{
    return single_open(file, lstats_show_proc, inode);
}
0525 
/*
 * Any write to /proc/<pid>/latency clears the task's accumulated
 * latency records; the written bytes themselves are ignored.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
                size_t count, loff_t *offs)
{
    struct task_struct *task = get_proc_task(file_inode(file));

    if (!task)
        return -ESRCH;
    clear_tsk_latency_tracing(task);
    put_task_struct(task);

    /* claim the whole write was consumed */
    return count;
}
0538 
/* /proc/<pid>/latency: seq_file read, write-to-clear */
static const struct file_operations proc_lstats_operations = {
    .open       = lstats_open,
    .read       = seq_read,
    .write      = lstats_write,
    .llseek     = seq_lseek,
    .release    = single_release,
};
0546 
0547 #endif
0548 
/*
 * Show /proc/<pid>/oom_score: the OOM badness heuristic for this task.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
              struct pid *pid, struct task_struct *task)
{
    unsigned long totalpages = totalram_pages() + total_swap_pages;
    unsigned long points = 0;
    long badness;

    badness = oom_badness(task, totalpages);
    /*
     * Special-case OOM_SCORE_ADJ_MIN (badness == LONG_MIN -> score 0);
     * for all others, scale the badness value into the [0, 2000] range
     * which we have been exporting for a long time, so userspace might
     * depend on it.
     */
    if (badness != LONG_MIN)
        points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;

    seq_printf(m, "%lu\n", points);

    return 0;
}
0569 
/* Display name and unit string for one rlimit row in /proc/<pid>/limits. */
struct limit_names {
    const char *name;
    const char *unit;   /* NULL -> no unit column printed */
};

/* One row per RLIMIT_* constant, indexed by the rlimit number. */
static const struct limit_names lnames[RLIM_NLIMITS] = {
    [RLIMIT_CPU] = {"Max cpu time", "seconds"},
    [RLIMIT_FSIZE] = {"Max file size", "bytes"},
    [RLIMIT_DATA] = {"Max data size", "bytes"},
    [RLIMIT_STACK] = {"Max stack size", "bytes"},
    [RLIMIT_CORE] = {"Max core file size", "bytes"},
    [RLIMIT_RSS] = {"Max resident set", "bytes"},
    [RLIMIT_NPROC] = {"Max processes", "processes"},
    [RLIMIT_NOFILE] = {"Max open files", "files"},
    [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
    [RLIMIT_AS] = {"Max address space", "bytes"},
    [RLIMIT_LOCKS] = {"Max file locks", "locks"},
    [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
    [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
    [RLIMIT_NICE] = {"Max nice priority", NULL},
    [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
    [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};
0593 
0594 /* Display limits for a process */
0595 static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
0596                struct pid *pid, struct task_struct *task)
0597 {
0598     unsigned int i;
0599     unsigned long flags;
0600 
0601     struct rlimit rlim[RLIM_NLIMITS];
0602 
0603     if (!lock_task_sighand(task, &flags))
0604         return 0;
0605     memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
0606     unlock_task_sighand(task, &flags);
0607 
0608     /*
0609      * print the file header
0610      */
0611     seq_puts(m, "Limit                     "
0612         "Soft Limit           "
0613         "Hard Limit           "
0614         "Units     \n");
0615 
0616     for (i = 0; i < RLIM_NLIMITS; i++) {
0617         if (rlim[i].rlim_cur == RLIM_INFINITY)
0618             seq_printf(m, "%-25s %-20s ",
0619                    lnames[i].name, "unlimited");
0620         else
0621             seq_printf(m, "%-25s %-20lu ",
0622                    lnames[i].name, rlim[i].rlim_cur);
0623 
0624         if (rlim[i].rlim_max == RLIM_INFINITY)
0625             seq_printf(m, "%-20s ", "unlimited");
0626         else
0627             seq_printf(m, "%-20lu ", rlim[i].rlim_max);
0628 
0629         if (lnames[i].unit)
0630             seq_printf(m, "%-10s\n", lnames[i].unit);
0631         else
0632             seq_putc(m, '\n');
0633     }
0634 
0635     return 0;
0636 }
0637 
0638 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * Show /proc/<pid>/syscall: "running", or the current syscall number
 * followed by its six argument registers, the stack pointer and the
 * instruction pointer.
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
                struct pid *pid, struct task_struct *task)
{
    struct syscall_info info;
    u64 *args = &info.data.args[0];
    int res;

    /* needs ptrace-attach rights; also excludes a concurrent exec */
    res = lock_trace(task);
    if (res)
        return res;

    if (task_current_syscall(task, &info))
        seq_puts(m, "running\n");
    else if (info.data.nr < 0)
        /* not inside a syscall: only SP and PC are meaningful */
        seq_printf(m, "%d 0x%llx 0x%llx\n",
               info.data.nr, info.sp, info.data.instruction_pointer);
    else
        seq_printf(m,
               "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
               info.data.nr,
               args[0], args[1], args[2], args[3], args[4], args[5],
               info.sp, info.data.instruction_pointer);
    unlock_trace(task);

    return 0;
}
0665 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
0666 
0667 /************************************************************************/
0668 /*                       Here the fs part begins                        */
0669 /************************************************************************/
0670 
0671 /* permission checks */
0672 static bool proc_fd_access_allowed(struct inode *inode)
0673 {
0674     struct task_struct *task;
0675     bool allowed = false;
0676     /* Allow access to a task's file descriptors if it is us or we
0677      * may use ptrace attach to the process and find out that
0678      * information.
0679      */
0680     task = get_proc_task(inode);
0681     if (task) {
0682         allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
0683         put_task_struct(task);
0684     }
0685     return allowed;
0686 }
0687 
/*
 * Generic setattr for proc inodes: mode changes are rejected outright,
 * everything else goes through the normal VFS attribute helpers.
 * NOTE(review): init_user_ns is used instead of mnt_userns below —
 * presumably because procfs can't be idmap-mounted; confirm.
 */
int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
         struct iattr *attr)
{
    int error;
    struct inode *inode = d_inode(dentry);

    /* chmod on proc files is refused */
    if (attr->ia_valid & ATTR_MODE)
        return -EPERM;

    error = setattr_prepare(&init_user_ns, dentry, attr);
    if (error)
        return error;

    setattr_copy(&init_user_ns, inode, attr);
    mark_inode_dirty(inode);
    return 0;
}
0705 
/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 */
static bool has_pid_permissions(struct proc_fs_info *fs_info,
                 struct task_struct *task,
                 enum proc_hidepid hide_pid_min)
{
    /*
     * If 'hidpid' mount option is set force a ptrace check,
     * we indicate that we are using a filesystem syscall
     * by passing PTRACE_MODE_READ_FSCREDS
     */
    if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
        return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);

    /* mount's hidepid level below the threshold: everyone may look */
    if (fs_info->hide_pid < hide_pid_min)
        return true;
    /* members of the mount's gid= group are exempt */
    if (in_group_p(fs_info->pid_gid))
        return true;
    /* otherwise fall back to a ptrace-read check */
    return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
0728 
0729 
0730 static int proc_pid_permission(struct user_namespace *mnt_userns,
0731                    struct inode *inode, int mask)
0732 {
0733     struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
0734     struct task_struct *task;
0735     bool has_perms;
0736 
0737     task = get_proc_task(inode);
0738     if (!task)
0739         return -ESRCH;
0740     has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
0741     put_task_struct(task);
0742 
0743     if (!has_perms) {
0744         if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
0745             /*
0746              * Let's make getdents(), stat(), and open()
0747              * consistent with each other.  If a process
0748              * may not stat() a file, it shouldn't be seen
0749              * in procfs at all.
0750              */
0751             return -ENOENT;
0752         }
0753 
0754         return -EPERM;
0755     }
0756     return generic_permission(&init_user_ns, inode, mask);
0757 }
0758 
0759 
0760 
/* default inode ops for proc entries: only setattr is special */
static const struct inode_operations proc_def_inode_operations = {
    .setattr    = proc_setattr,
};
0764 
0765 static int proc_single_show(struct seq_file *m, void *v)
0766 {
0767     struct inode *inode = m->private;
0768     struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
0769     struct pid *pid = proc_pid(inode);
0770     struct task_struct *task;
0771     int ret;
0772 
0773     task = get_pid_task(pid, PIDTYPE_PID);
0774     if (!task)
0775         return -ESRCH;
0776 
0777     ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
0778 
0779     put_task_struct(task);
0780     return ret;
0781 }
0782 
/* open() used by all ONE() entries */
static int proc_single_open(struct inode *inode, struct file *filp)
{
    return single_open(filp, proc_single_show, inode);
}
0787 
/* file ops shared by all ONE() entries */
static const struct file_operations proc_single_file_operations = {
    .open       = proc_single_open,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = single_release,
};
0794 
0795 
/*
 * Get the mm of the task behind a /proc inode for later remote access.
 * Returns ERR_PTR(-ESRCH) if the task is gone, an ERR_PTR from
 * mm_access() on permission failure, or the mm (possibly NULL) with
 * an mm_count reference held.
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
    struct task_struct *task = get_proc_task(inode);
    struct mm_struct *mm = ERR_PTR(-ESRCH);

    if (task) {
        mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
        put_task_struct(task);

        if (!IS_ERR_OR_NULL(mm)) {
            /* ensure this mm_struct can't be freed */
            mmgrab(mm);
            /* but do not pin its memory */
            mmput(mm);
        }
    }

    return mm;
}
0815 
0816 static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
0817 {
0818     struct mm_struct *mm = proc_mem_open(inode, mode);
0819 
0820     if (IS_ERR(mm))
0821         return PTR_ERR(mm);
0822 
0823     file->private_data = mm;
0824     return 0;
0825 }
0826 
/* open() for /proc/<pid>/mem: requires full ptrace-attach rights. */
static int mem_open(struct inode *inode, struct file *file)
{
    int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);

    /* OK to pass negative loff_t, we can catch out-of-range */
    file->f_mode |= FMODE_UNSIGNED_OFFSET;

    return ret;
}
0836 
/*
 * Common read/write engine for /proc/<pid>/mem.  The file offset is the
 * target task's virtual address.  Copies page-sized chunks through a
 * bounce page; returns bytes transferred, or -errno when nothing was.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
            size_t count, loff_t *ppos, int write)
{
    struct mm_struct *mm = file->private_data;
    unsigned long addr = *ppos;
    ssize_t copied;
    char *page;
    unsigned int flags;

    /* task had no mm at open time: behave as an empty file */
    if (!mm)
        return 0;

    page = (char *)__get_free_page(GFP_KERNEL);
    if (!page)
        return -ENOMEM;

    copied = 0;
    /* the mm may have gone away since open; pin its memory if not */
    if (!mmget_not_zero(mm))
        goto free;

    /* FOLL_FORCE: access the target as a debugger would */
    flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);

    while (count > 0) {
        size_t this_len = min_t(size_t, count, PAGE_SIZE);

        if (write && copy_from_user(page, buf, this_len)) {
            copied = -EFAULT;
            break;
        }

        this_len = access_remote_vm(mm, addr, page, this_len, flags);
        if (!this_len) {
            /* report -EIO only if no progress was made at all */
            if (!copied)
                copied = -EIO;
            break;
        }

        if (!write && copy_to_user(buf, page, this_len)) {
            copied = -EFAULT;
            break;
        }

        buf += this_len;
        addr += this_len;
        copied += this_len;
        count -= this_len;
    }
    /* leave the file offset at the last address touched */
    *ppos = addr;

    mmput(mm);
free:
    free_page((unsigned long) page);
    return copied;
}
0891 
/* read() for /proc/<pid>/mem */
static ssize_t mem_read(struct file *file, char __user *buf,
            size_t count, loff_t *ppos)
{
    return mem_rw(file, buf, count, ppos, 0);
}
0897 
/* write() for /proc/<pid>/mem; cast is safe, mem_rw() only reads buf here */
static ssize_t mem_write(struct file *file, const char __user *buf,
             size_t count, loff_t *ppos)
{
    return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
0903 
0904 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
0905 {
0906     switch (orig) {
0907     case 0:
0908         file->f_pos = offset;
0909         break;
0910     case 1:
0911         file->f_pos += offset;
0912         break;
0913     default:
0914         return -EINVAL;
0915     }
0916     force_successful_syscall_return();
0917     return file->f_pos;
0918 }
0919 
/* release(): drop the mm_count reference taken in proc_mem_open(). */
static int mem_release(struct inode *inode, struct file *file)
{
    struct mm_struct *mm = file->private_data;
    /* NULL when the task had no mm at open time */
    if (mm)
        mmdrop(mm);
    return 0;
}
0927 
/* /proc/<pid>/mem: read/write the target's address space */
static const struct file_operations proc_mem_operations = {
    .llseek     = mem_lseek,
    .read       = mem_read,
    .write      = mem_write,
    .open       = mem_open,
    .release    = mem_release,
};
0935 
/* open() for /proc/<pid>/environ: ptrace-read rights suffice. */
static int environ_open(struct inode *inode, struct file *file)
{
    return __mem_open(inode, file, PTRACE_MODE_READ);
}
0940 
/*
 * read() for /proc/<pid>/environ: copy the target's [env_start, env_end)
 * region out through a bounce page.  *ppos is the offset into that region.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
            size_t count, loff_t *ppos)
{
    char *page;
    unsigned long src = *ppos;
    int ret = 0;
    struct mm_struct *mm = file->private_data;
    unsigned long env_start, env_end;

    /* Ensure the process spawned far enough to have an environment. */
    if (!mm || !mm->env_end)
        return 0;

    page = (char *)__get_free_page(GFP_KERNEL);
    if (!page)
        return -ENOMEM;

    ret = 0;
    /* the mm may have gone away since open; pin its memory if not */
    if (!mmget_not_zero(mm))
        goto free;

    /* snapshot the environment boundaries under arg_lock */
    spin_lock(&mm->arg_lock);
    env_start = mm->env_start;
    env_end = mm->env_end;
    spin_unlock(&mm->arg_lock);

    while (count > 0) {
        size_t this_len, max_len;
        int retval;

        /* past the end of the environment region: EOF */
        if (src >= (env_end - env_start))
            break;

        this_len = env_end - (env_start + src);

        /* cap at one bounce page and at the caller's count */
        max_len = min_t(size_t, PAGE_SIZE, count);
        this_len = min(max_len, this_len);

        retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);

        if (retval <= 0) {
            ret = retval;
            break;
        }

        if (copy_to_user(buf, page, retval)) {
            ret = -EFAULT;
            break;
        }

        ret += retval;
        src += retval;
        buf += retval;
        count -= retval;
    }
    *ppos = src;
    mmput(mm);

free:
    free_page((unsigned long) page);
    return ret;
}
1003 
/* /proc/<pid>/environ: read-only view of the environment block */
static const struct file_operations proc_environ_operations = {
    .open       = environ_open,
    .read       = environ_read,
    .llseek     = generic_file_llseek,
    .release    = mem_release,
};
1010 
/*
 * Open /proc/<pid>/auxv: defer to __mem_open() with
 * PTRACE_MODE_READ_FSCREDS (ptrace check against fs credentials).
 */
1011 static int auxv_open(struct inode *inode, struct file *file)
1012 {
1013     return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
1014 }
1015 
/*
 * Read /proc/<pid>/auxv: expose mm->saved_auxv up to and including the
 * terminating AT_NULL pair.  Entries come in (type, value) pairs, hence
 * the walk advances two words at a time.
 */
1016 static ssize_t auxv_read(struct file *file, char __user *buf,
1017             size_t count, loff_t *ppos)
1018 {
1019     struct mm_struct *mm = file->private_data;
1020     unsigned int nwords = 0;
1021 
1022     if (!mm)
1023         return 0;
1024     do {
1025         nwords += 2;
1026     } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
1027     return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
1028                        nwords * sizeof(mm->saved_auxv[0]));
1029 }
1030 
/* File operations for /proc/<pid>/auxv. */
1031 static const struct file_operations proc_auxv_operations = {
1032     .open       = auxv_open,
1033     .read       = auxv_read,
1034     .llseek     = generic_file_llseek,
1035     .release    = mem_release,
1036 };
1037 
/*
 * Read /proc/<pid>/oom_adj: map the task's oom_score_adj back onto the
 * legacy oom_adj scale (linear mapping; OOM_SCORE_ADJ_MAX maps to
 * OOM_ADJUST_MAX, and the result is clamped to OOM_ADJUST_MAX).
 */
1038 static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1039                 loff_t *ppos)
1040 {
1041     struct task_struct *task = get_proc_task(file_inode(file));
1042     char buffer[PROC_NUMBUF];
1043     int oom_adj = OOM_ADJUST_MIN;
1044     size_t len;
1045 
1046     if (!task)
1047         return -ESRCH;
1048     if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
1049         oom_adj = OOM_ADJUST_MAX;
1050     else
1051         oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
1052               OOM_SCORE_ADJ_MAX;
1053     put_task_struct(task);
1054     if (oom_adj > OOM_ADJUST_MAX)
1055         oom_adj = OOM_ADJUST_MAX;
1056     len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
1057     return simple_read_from_buffer(buf, count, ppos, buffer, len);
1058 }
1059 
/*
 * Apply @oom_adj as the task's oom_score_adj (and, when the caller has
 * CAP_SYS_RESOURCE and !@legacy, also as oom_score_adj_min), then
 * propagate the value to every other process sharing the same mm
 * (MMF_MULTIPROCESS) so all sharers keep a consistent OOM policy.
 * @legacy selects the deprecated /proc/<pid>/oom_adj permission rules.
 * Returns 0 on success, -ESRCH if the task is gone, -EACCES if the
 * caller may not lower the value.
 */
1060 static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
1061 {
1062     struct mm_struct *mm = NULL;
1063     struct task_struct *task;
1064     int err = 0;
1065 
1066     task = get_proc_task(file_inode(file));
1067     if (!task)
1068         return -ESRCH;
1069 
1070     mutex_lock(&oom_adj_mutex);
1071     if (legacy) {
1072         if (oom_adj < task->signal->oom_score_adj &&
1073                 !capable(CAP_SYS_RESOURCE)) {
1074             err = -EACCES;
1075             goto err_unlock;
1076         }
1077         /*
1078          * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
1079          * /proc/pid/oom_score_adj instead.
1080          */
1081         pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1082               current->comm, task_pid_nr(current), task_pid_nr(task),
1083               task_pid_nr(task));
1084     } else {
1085         if ((short)oom_adj < task->signal->oom_score_adj_min &&
1086                 !capable(CAP_SYS_RESOURCE)) {
1087             err = -EACCES;
1088             goto err_unlock;
1089         }
1090     }
1091 
1092     /*
1093      * Make sure we will check other processes sharing the mm if this is
1094      * not vfork which wants its own oom_score_adj.
1095      * pin the mm so it doesn't go away and get reused after task_unlock
1096      */
1097     if (!task->vfork_done) {
1098         struct task_struct *p = find_lock_task_mm(task);
1099 
1100         if (p) {
1101             if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
1102                 mm = p->mm;
1103                 mmgrab(mm);
1104             }
1105             task_unlock(p);
1106         }
1107     }
1108 
1109     task->signal->oom_score_adj = oom_adj;
1110     if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1111         task->signal->oom_score_adj_min = (short)oom_adj;
1112     trace_oom_score_adj_update(task);
1113 
1114     if (mm) {
1115         struct task_struct *p;
1116 
    /* Propagate to all other mm sharers, skipping our own thread group,
     * kernel threads, global init, and vfork children. */
1117         rcu_read_lock();
1118         for_each_process(p) {
1119             if (same_thread_group(task, p))
1120                 continue;
1121 
1122             /* do not touch kernel threads or the global init */
1123             if (p->flags & PF_KTHREAD || is_global_init(p))
1124                 continue;
1125 
1126             task_lock(p);
1127             if (!p->vfork_done && process_shares_mm(p, mm)) {
1128                 p->signal->oom_score_adj = oom_adj;
1129                 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1130                     p->signal->oom_score_adj_min = (short)oom_adj;
1131             }
1132             task_unlock(p);
1133         }
1134         rcu_read_unlock();
1135         mmdrop(mm);
1136     }
1137 err_unlock:
1138     mutex_unlock(&oom_adj_mutex);
1139     put_task_struct(task);
1140     return err;
1141 }
1142 
1143 /*
1144  * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1145  * kernels.  The effective policy is defined by oom_score_adj, which has a
1146  * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1147  * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1148  * Processes that become oom disabled via oom_adj will still be oom disabled
1149  * with this implementation.
1150  *
1151  * oom_adj cannot be removed since existing userspace binaries use it.
1152  */
1153 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1154                  size_t count, loff_t *ppos)
1155 {
1156     char buffer[PROC_NUMBUF];
1157     int oom_adj;
1158     int err;
1159 
    /* Accept at most PROC_NUMBUF-1 characters; keep the buffer NUL-terminated. */
1160     memset(buffer, 0, sizeof(buffer));
1161     if (count > sizeof(buffer) - 1)
1162         count = sizeof(buffer) - 1;
1163     if (copy_from_user(buffer, buf, count)) {
1164         err = -EFAULT;
1165         goto out;
1166     }
1167 
1168     err = kstrtoint(strstrip(buffer), 0, &oom_adj);
1169     if (err)
1170         goto out;
1171     if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
1172          oom_adj != OOM_DISABLE) {
1173         err = -EINVAL;
1174         goto out;
1175     }
1176 
1177     /*
1178      * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1179      * value is always attainable.
1180      */
1181     if (oom_adj == OOM_ADJUST_MAX)
1182         oom_adj = OOM_SCORE_ADJ_MAX;
1183     else
1184         oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
1185 
1186     err = __set_oom_adj(file, oom_adj, true);
1187 out:
1188     return err < 0 ? err : count;
1189 }
1190 
/* File operations for the legacy /proc/<pid>/oom_adj. */
1191 static const struct file_operations proc_oom_adj_operations = {
1192     .read       = oom_adj_read,
1193     .write      = oom_adj_write,
1194     .llseek     = generic_file_llseek,
1195 };
1196 
/* Read /proc/<pid>/oom_score_adj: report the task's current value. */
1197 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1198                     size_t count, loff_t *ppos)
1199 {
1200     struct task_struct *task = get_proc_task(file_inode(file));
1201     char buffer[PROC_NUMBUF];
1202     short oom_score_adj = OOM_SCORE_ADJ_MIN;
1203     size_t len;
1204 
1205     if (!task)
1206         return -ESRCH;
1207     oom_score_adj = task->signal->oom_score_adj;
1208     put_task_struct(task);
1209     len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1210     return simple_read_from_buffer(buf, count, ppos, buffer, len);
1211 }
1212 
/*
 * Write /proc/<pid>/oom_score_adj: parse an integer in
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] and apply it via
 * __set_oom_adj() (non-legacy permission rules).
 */
1213 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1214                     size_t count, loff_t *ppos)
1215 {
1216     char buffer[PROC_NUMBUF];
1217     int oom_score_adj;
1218     int err;
1219 
1220     memset(buffer, 0, sizeof(buffer));
1221     if (count > sizeof(buffer) - 1)
1222         count = sizeof(buffer) - 1;
1223     if (copy_from_user(buffer, buf, count)) {
1224         err = -EFAULT;
1225         goto out;
1226     }
1227 
1228     err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1229     if (err)
1230         goto out;
1231     if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1232             oom_score_adj > OOM_SCORE_ADJ_MAX) {
1233         err = -EINVAL;
1234         goto out;
1235     }
1236 
1237     err = __set_oom_adj(file, oom_score_adj, false);
1238 out:
1239     return err < 0 ? err : count;
1240 }
1241 
/* File operations for /proc/<pid>/oom_score_adj. */
1242 static const struct file_operations proc_oom_score_adj_operations = {
1243     .read       = oom_score_adj_read,
1244     .write      = oom_score_adj_write,
1245     .llseek     = default_llseek,
1246 };
1247 
1248 #ifdef CONFIG_AUDIT
1249 #define TMPBUFLEN 11
/*
 * Read /proc/<pid>/loginuid: print the audit loginuid, translated into
 * the opener's user namespace (file->f_cred->user_ns).
 */
1250 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1251                   size_t count, loff_t *ppos)
1252 {
1253     struct inode * inode = file_inode(file);
1254     struct task_struct *task = get_proc_task(inode);
1255     ssize_t length;
1256     char tmpbuf[TMPBUFLEN];
1257 
1258     if (!task)
1259         return -ESRCH;
1260     length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1261                from_kuid(file->f_cred->user_ns,
1262                      audit_get_loginuid(task)));
1263     put_task_struct(task);
1264     return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1265 }
1266 
/*
 * Write /proc/<pid>/loginuid: a task may only set its own loginuid
 * (and kthreads may not).  AUDIT_UID_UNSET clears it; any other value
 * is mapped through the opener's user namespace.  No partial writes.
 */
1267 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1268                    size_t count, loff_t *ppos)
1269 {
1270     struct inode * inode = file_inode(file);
1271     uid_t loginuid;
1272     kuid_t kloginuid;
1273     int rv;
1274 
1275     /* Don't let kthreads write their own loginuid */
1276     if (current->flags & PF_KTHREAD)
1277         return -EPERM;
1278 
1279     rcu_read_lock();
1280     if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1281         rcu_read_unlock();
1282         return -EPERM;
1283     }
1284     rcu_read_unlock();
1285 
1286     if (*ppos != 0) {
1287         /* No partial writes. */
1288         return -EINVAL;
1289     }
1290 
1291     rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1292     if (rv < 0)
1293         return rv;
1294 
1295     /* is userspace trying to explicitly UNSET the loginuid? */
1296     if (loginuid == AUDIT_UID_UNSET) {
1297         kloginuid = INVALID_UID;
1298     } else {
1299         kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1300         if (!uid_valid(kloginuid))
1301             return -EINVAL;
1302     }
1303 
1304     rv = audit_set_loginuid(kloginuid);
1305     if (rv < 0)
1306         return rv;
1307     return count;
1308 }
1309 
/* File operations for /proc/<pid>/loginuid (CONFIG_AUDIT). */
1310 static const struct file_operations proc_loginuid_operations = {
1311     .read       = proc_loginuid_read,
1312     .write      = proc_loginuid_write,
1313     .llseek     = generic_file_llseek,
1314 };
1315 
/* Read /proc/<pid>/sessionid: print the task's audit session id. */
1316 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1317                   size_t count, loff_t *ppos)
1318 {
1319     struct inode * inode = file_inode(file);
1320     struct task_struct *task = get_proc_task(inode);
1321     ssize_t length;
1322     char tmpbuf[TMPBUFLEN];
1323 
1324     if (!task)
1325         return -ESRCH;
1326     length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1327                 audit_get_sessionid(task));
1328     put_task_struct(task);
1329     return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1330 }
1331 
/* File operations for the read-only /proc/<pid>/sessionid (CONFIG_AUDIT). */
1332 static const struct file_operations proc_sessionid_operations = {
1333     .read       = proc_sessionid_read,
1334     .llseek     = generic_file_llseek,
1335 };
1336 #endif
1337 
1338 #ifdef CONFIG_FAULT_INJECTION
/* Read /proc/<pid>/make-it-fail: report the task's fault-injection flag. */
1339 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1340                       size_t count, loff_t *ppos)
1341 {
1342     struct task_struct *task = get_proc_task(file_inode(file));
1343     char buffer[PROC_NUMBUF];
1344     size_t len;
1345     int make_it_fail;
1346 
1347     if (!task)
1348         return -ESRCH;
1349     make_it_fail = task->make_it_fail;
1350     put_task_struct(task);
1351 
1352     len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1353 
1354     return simple_read_from_buffer(buf, count, ppos, buffer, len);
1355 }
1356 
/*
 * Write /proc/<pid>/make-it-fail: set the task's fault-injection flag
 * to 0 or 1.  Requires CAP_SYS_RESOURCE.
 */
1357 static ssize_t proc_fault_inject_write(struct file * file,
1358             const char __user * buf, size_t count, loff_t *ppos)
1359 {
1360     struct task_struct *task;
1361     char buffer[PROC_NUMBUF];
1362     int make_it_fail;
1363     int rv;
1364 
1365     if (!capable(CAP_SYS_RESOURCE))
1366         return -EPERM;
1367     memset(buffer, 0, sizeof(buffer));
1368     if (count > sizeof(buffer) - 1)
1369         count = sizeof(buffer) - 1;
1370     if (copy_from_user(buffer, buf, count))
1371         return -EFAULT;
1372     rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1373     if (rv < 0)
1374         return rv;
1375     if (make_it_fail < 0 || make_it_fail > 1)
1376         return -EINVAL;
1377 
1378     task = get_proc_task(file_inode(file));
1379     if (!task)
1380         return -ESRCH;
1381     task->make_it_fail = make_it_fail;
1382     put_task_struct(task);
1383 
1384     return count;
1385 }
1386 
/* File operations for /proc/<pid>/make-it-fail (CONFIG_FAULT_INJECTION). */
1387 static const struct file_operations proc_fault_inject_operations = {
1388     .read       = proc_fault_inject_read,
1389     .write      = proc_fault_inject_write,
1390     .llseek     = generic_file_llseek,
1391 };
1392 
/* Write /proc/<pid>/fail-nth: set task->fail_nth to the parsed unsigned. */
1393 static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
1394                    size_t count, loff_t *ppos)
1395 {
1396     struct task_struct *task;
1397     int err;
1398     unsigned int n;
1399 
1400     err = kstrtouint_from_user(buf, count, 0, &n);
1401     if (err)
1402         return err;
1403 
1404     task = get_proc_task(file_inode(file));
1405     if (!task)
1406         return -ESRCH;
1407     task->fail_nth = n;
1408     put_task_struct(task);
1409 
1410     return count;
1411 }
1412 
/* Read /proc/<pid>/fail-nth: report the task's current fail_nth counter. */
1413 static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
1414                   size_t count, loff_t *ppos)
1415 {
1416     struct task_struct *task;
1417     char numbuf[PROC_NUMBUF];
1418     ssize_t len;
1419 
1420     task = get_proc_task(file_inode(file));
1421     if (!task)
1422         return -ESRCH;
1423     len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
1424     put_task_struct(task);
1425     return simple_read_from_buffer(buf, count, ppos, numbuf, len);
1426 }
1427 
/* File operations for /proc/<pid>/fail-nth (note: no llseek). */
1428 static const struct file_operations proc_fail_nth_operations = {
1429     .read       = proc_fail_nth_read,
1430     .write      = proc_fail_nth_write,
1431 };
1432 #endif
1433 
1434 
1435 #ifdef CONFIG_SCHED_DEBUG
1436 /*
1437  * Print out various scheduling related per-task fields:
1438  */
/* seq_file show callback for /proc/<pid>/sched; -ESRCH if the task is gone. */
1439 static int sched_show(struct seq_file *m, void *v)
1440 {
1441     struct inode *inode = m->private;
1442     struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
1443     struct task_struct *p;
1444 
1445     p = get_proc_task(inode);
1446     if (!p)
1447         return -ESRCH;
1448     proc_sched_show_task(p, ns, m);
1449 
1450     put_task_struct(p);
1451 
1452     return 0;
1453 }
1454 
/*
 * Write to /proc/<pid>/sched: any write resets the task's scheduling
 * statistics via proc_sched_set_task(); the data itself is ignored.
 */
1455 static ssize_t
1456 sched_write(struct file *file, const char __user *buf,
1457         size_t count, loff_t *offset)
1458 {
1459     struct inode *inode = file_inode(file);
1460     struct task_struct *p;
1461 
1462     p = get_proc_task(inode);
1463     if (!p)
1464         return -ESRCH;
1465     proc_sched_set_task(p);
1466 
1467     put_task_struct(p);
1468 
1469     return count;
1470 }
1471 
/* Open /proc/<pid>/sched as a single-show seq_file. */
1472 static int sched_open(struct inode *inode, struct file *filp)
1473 {
1474     return single_open(filp, sched_show, inode);
1475 }
1476 
/* File operations for /proc/<pid>/sched (CONFIG_SCHED_DEBUG). */
1477 static const struct file_operations proc_pid_sched_operations = {
1478     .open       = sched_open,
1479     .read       = seq_read,
1480     .write      = sched_write,
1481     .llseek     = seq_lseek,
1482     .release    = single_release,
1483 };
1484 
1485 #endif
1486 
1487 #ifdef CONFIG_SCHED_AUTOGROUP
1488 /*
1489  * Print out autogroup related information:
1490  */
/* seq_file show callback for /proc/<pid>/autogroup. */
1491 static int sched_autogroup_show(struct seq_file *m, void *v)
1492 {
1493     struct inode *inode = m->private;
1494     struct task_struct *p;
1495 
1496     p = get_proc_task(inode);
1497     if (!p)
1498         return -ESRCH;
1499     proc_sched_autogroup_show_task(p, m);
1500 
1501     put_task_struct(p);
1502 
1503     return 0;
1504 }
1505 
/*
 * Write /proc/<pid>/autogroup: parse a nice value and apply it to the
 * task's autogroup via proc_sched_autogroup_set_nice().
 */
1506 static ssize_t
1507 sched_autogroup_write(struct file *file, const char __user *buf,
1508         size_t count, loff_t *offset)
1509 {
1510     struct inode *inode = file_inode(file);
1511     struct task_struct *p;
1512     char buffer[PROC_NUMBUF];
1513     int nice;
1514     int err;
1515 
1516     memset(buffer, 0, sizeof(buffer));
1517     if (count > sizeof(buffer) - 1)
1518         count = sizeof(buffer) - 1;
1519     if (copy_from_user(buffer, buf, count))
1520         return -EFAULT;
1521 
1522     err = kstrtoint(strstrip(buffer), 0, &nice);
1523     if (err < 0)
1524         return err;
1525 
1526     p = get_proc_task(inode);
1527     if (!p)
1528         return -ESRCH;
1529 
    /* On failure, return the error code instead of the byte count. */
1530     err = proc_sched_autogroup_set_nice(p, nice);
1531     if (err)
1532         count = err;
1533 
1534     put_task_struct(p);
1535 
1536     return count;
1537 }
1538 
/*
 * Open /proc/<pid>/autogroup: single_open() with the inode stashed in
 * the seq_file's private field after the open succeeds.
 */
1539 static int sched_autogroup_open(struct inode *inode, struct file *filp)
1540 {
1541     int ret;
1542 
1543     ret = single_open(filp, sched_autogroup_show, NULL);
1544     if (!ret) {
1545         struct seq_file *m = filp->private_data;
1546 
1547         m->private = inode;
1548     }
1549     return ret;
1550 }
1551 
/* File operations for /proc/<pid>/autogroup (CONFIG_SCHED_AUTOGROUP). */
1552 static const struct file_operations proc_pid_sched_autogroup_operations = {
1553     .open       = sched_autogroup_open,
1554     .read       = seq_read,
1555     .write      = sched_autogroup_write,
1556     .llseek     = seq_lseek,
1557     .release    = single_release,
1558 };
1559 
1560 #endif /* CONFIG_SCHED_AUTOGROUP */
1561 
1562 #ifdef CONFIG_TIME_NS
/* seq_file show callback for /proc/<pid>/timens_offsets. */
1563 static int timens_offsets_show(struct seq_file *m, void *v)
1564 {
1565     struct task_struct *p;
1566 
1567     p = get_proc_task(file_inode(m->file));
1568     if (!p)
1569         return -ESRCH;
1570     proc_timens_show_offsets(p, m);
1571 
1572     put_task_struct(p);
1573 
1574     return 0;
1575 }
1576 
/*
 * Write /proc/<pid>/timens_offsets: parse up to two lines of the form
 *   "<clock> <seconds> <nanoseconds>"
 * where <clock> is "monotonic"/"boottime" or the corresponding numeric
 * clockid, then apply them through proc_timens_set_offset().
 * Only whole writes at offset 0 and smaller than a page are accepted.
 */
1577 static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
1578                     size_t count, loff_t *ppos)
1579 {
1580     struct inode *inode = file_inode(file);
1581     struct proc_timens_offset offsets[2];
1582     char *kbuf = NULL, *pos, *next_line;
1583     struct task_struct *p;
1584     int ret, noffsets;
1585 
1586     /* Only allow < page size writes at the beginning of the file */
1587     if ((*ppos != 0) || (count >= PAGE_SIZE))
1588         return -EINVAL;
1589 
1590     /* Slurp in the user data */
1591     kbuf = memdup_user_nul(buf, count);
1592     if (IS_ERR(kbuf))
1593         return PTR_ERR(kbuf);
1594 
1595     /* Parse the user data */
1596     ret = -EINVAL;
1597     noffsets = 0;
1598     for (pos = kbuf; pos; pos = next_line) {
1599         struct proc_timens_offset *off = &offsets[noffsets];
1600         char clock[10];
1601         int err;
1602 
1603         /* Find the end of line and ensure we don't look past it */
1604         next_line = strchr(pos, '\n');
1605         if (next_line) {
1606             *next_line = '\0';
1607             next_line++;
1608             if (*next_line == '\0')
1609                 next_line = NULL;
1610         }
1611 
1612         err = sscanf(pos, "%9s %lld %lu", clock,
1613                 &off->val.tv_sec, &off->val.tv_nsec);
1614         if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
1615             goto out;
1616 
1617         clock[sizeof(clock) - 1] = 0;
1618         if (strcmp(clock, "monotonic") == 0 ||
1619             strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
1620             off->clockid = CLOCK_MONOTONIC;
1621         else if (strcmp(clock, "boottime") == 0 ||
1622              strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
1623             off->clockid = CLOCK_BOOTTIME;
1624         else
1625             goto out;
1626 
1627         noffsets++;
1628         if (noffsets == ARRAY_SIZE(offsets)) {
    /* Buffer full: report only the bytes actually consumed. */
1629             if (next_line)
1630                 count = next_line - kbuf;
1631             break;
1632         }
1633     }
1634 
1635     ret = -ESRCH;
1636     p = get_proc_task(inode);
1637     if (!p)
1638         goto out;
1639     ret = proc_timens_set_offset(file, p, offsets, noffsets);
1640     put_task_struct(p);
1641     if (ret)
1642         goto out;
1643 
1644     ret = count;
1645 out:
1646     kfree(kbuf);
1647     return ret;
1648 }
1649 
/* Open /proc/<pid>/timens_offsets as a single-show seq_file. */
1650 static int timens_offsets_open(struct inode *inode, struct file *filp)
1651 {
1652     return single_open(filp, timens_offsets_show, inode);
1653 }
1654 
/* File operations for /proc/<pid>/timens_offsets (CONFIG_TIME_NS). */
1655 static const struct file_operations proc_timens_offsets_operations = {
1656     .open       = timens_offsets_open,
1657     .read       = seq_read,
1658     .write      = timens_offsets_write,
1659     .llseek     = seq_lseek,
1660     .release    = single_release,
1661 };
1662 #endif /* CONFIG_TIME_NS */
1663 
/*
 * Write /proc/<pid>/comm: rename the task.  Input is silently truncated
 * to TASK_COMM_LEN - 1 bytes.  Only a writer in the same thread group
 * as the target may rename it; otherwise -EINVAL.
 */
1664 static ssize_t comm_write(struct file *file, const char __user *buf,
1665                 size_t count, loff_t *offset)
1666 {
1667     struct inode *inode = file_inode(file);
1668     struct task_struct *p;
1669     char buffer[TASK_COMM_LEN];
1670     const size_t maxlen = sizeof(buffer) - 1;
1671 
1672     memset(buffer, 0, sizeof(buffer));
1673     if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1674         return -EFAULT;
1675 
1676     p = get_proc_task(inode);
1677     if (!p)
1678         return -ESRCH;
1679 
1680     if (same_thread_group(current, p)) {
1681         set_task_comm(p, buffer);
    /* Notify the process-events connector about the name change. */
1682         proc_comm_connector(p);
1683     }
1684     else
1685         count = -EINVAL;
1686 
1687     put_task_struct(p);
1688 
1689     return count;
1690 }
1691 
/* seq_file show callback for /proc/<pid>/comm: task name plus newline. */
1692 static int comm_show(struct seq_file *m, void *v)
1693 {
1694     struct inode *inode = m->private;
1695     struct task_struct *p;
1696 
1697     p = get_proc_task(inode);
1698     if (!p)
1699         return -ESRCH;
1700 
1701     proc_task_name(m, p, false);
1702     seq_putc(m, '\n');
1703 
1704     put_task_struct(p);
1705 
1706     return 0;
1707 }
1708 
/* Open /proc/<pid>/comm as a single-show seq_file. */
1709 static int comm_open(struct inode *inode, struct file *filp)
1710 {
1711     return single_open(filp, comm_show, inode);
1712 }
1713 
/* File operations for /proc/<pid>/comm. */
1714 static const struct file_operations proc_pid_set_comm_operations = {
1715     .open       = comm_open,
1716     .read       = seq_read,
1717     .write      = comm_write,
1718     .llseek     = seq_lseek,
1719     .release    = single_release,
1720 };
1721 
/*
 * proc_get_link callback for /proc/<pid>/exe: resolve the task's
 * executable file into *exe_path, taking an extra path reference that
 * the caller must drop.  Returns -ENOENT if the task or exe is gone.
 */
1722 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1723 {
1724     struct task_struct *task;
1725     struct file *exe_file;
1726 
1727     task = get_proc_task(d_inode(dentry));
1728     if (!task)
1729         return -ENOENT;
1730     exe_file = get_task_exe_file(task);
1731     put_task_struct(task);
1732     if (exe_file) {
1733         *exe_path = exe_file->f_path;
1734         path_get(&exe_file->f_path);
1735         fput(exe_file);
1736         return 0;
1737     } else
1738         return -ENOENT;
1739 }
1740 
/*
 * get_link for /proc/<pid> symlinks (exe, cwd, root, fd/*): after a
 * permission check, resolve via the per-inode proc_get_link op and jump
 * to the result with nd_jump_link().  Refuses RCU-walk (-ECHILD).
 */
1741 static const char *proc_pid_get_link(struct dentry *dentry,
1742                      struct inode *inode,
1743                      struct delayed_call *done)
1744 {
1745     struct path path;
1746     int error = -EACCES;
1747 
1748     if (!dentry)
1749         return ERR_PTR(-ECHILD);
1750 
1751     /* Are we allowed to snoop on the tasks file descriptors? */
1752     if (!proc_fd_access_allowed(inode))
1753         goto out;
1754 
1755     error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1756     if (error)
1757         goto out;
1758 
1759     error = nd_jump_link(&path);
1760 out:
1761     return ERR_PTR(error);
1762 }
1763 
/*
 * Copy the textual path of @path into the user buffer @buffer (at most
 * @buflen bytes, not NUL-terminated).  Returns the number of bytes
 * copied, or a negative errno.  Note the result is silently truncated
 * when it exceeds @buflen, matching readlink(2) semantics.
 */
1764 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1765 {
1766     char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
1767     char *pathname;
1768     int len;
1769 
1770     if (!tmp)
1771         return -ENOMEM;
1772 
    /* d_path() builds the string at the end of tmp, hence the length math. */
1773     pathname = d_path(path, tmp, PATH_MAX);
1774     len = PTR_ERR(pathname);
1775     if (IS_ERR(pathname))
1776         goto out;
1777     len = tmp + PATH_MAX - 1 - pathname;
1778 
1779     if (len > buflen)
1780         len = buflen;
1781     if (copy_to_user(buffer, pathname, len))
1782         len = -EFAULT;
1783  out:
1784     kfree(tmp);
1785     return len;
1786 }
1787 
/*
 * readlink for /proc/<pid> symlinks: permission check, resolve the
 * target via the per-inode proc_get_link op, then copy its path to
 * userspace with do_proc_readlink().
 */
1788 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1789 {
1790     int error = -EACCES;
1791     struct inode *inode = d_inode(dentry);
1792     struct path path;
1793 
1794     /* Are we allowed to snoop on the tasks file descriptors? */
1795     if (!proc_fd_access_allowed(inode))
1796         goto out;
1797 
1798     error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1799     if (error)
1800         goto out;
1801 
1802     error = do_proc_readlink(&path, buffer, buflen);
1803     path_put(&path);
1804 out:
1805     return error;
1806 }
1807 
/* Inode operations shared by all /proc/<pid> symlink entries. */
1808 const struct inode_operations proc_pid_link_inode_operations = {
1809     .readlink   = proc_pid_readlink,
1810     .get_link   = proc_pid_get_link,
1811     .setattr    = proc_setattr,
1812 };
1813 
1814 
1815 /* building an inode */
1816 
/*
 * Compute the uid/gid that should own a /proc file for @task, based on
 * the task's dumpable state.  Kernel threads are owned by global root.
 * Otherwise default to the task's effective creds; for non-directory
 * modes, a missing mm or a non-SUID_DUMP_USER dumpable state downgrades
 * ownership to (namespace) root.  Results go to *ruid / *rgid.
 */
1817 void task_dump_owner(struct task_struct *task, umode_t mode,
1818              kuid_t *ruid, kgid_t *rgid)
1819 {
1820     /* Depending on the state of dumpable compute who should own a
1821      * proc file for a task.
1822      */
1823     const struct cred *cred;
1824     kuid_t uid;
1825     kgid_t gid;
1826 
1827     if (unlikely(task->flags & PF_KTHREAD)) {
1828         *ruid = GLOBAL_ROOT_UID;
1829         *rgid = GLOBAL_ROOT_GID;
1830         return;
1831     }
1832 
1833     /* Default to the tasks effective ownership */
1834     rcu_read_lock();
1835     cred = __task_cred(task);
1836     uid = cred->euid;
1837     gid = cred->egid;
1838     rcu_read_unlock();
1839 
1840     /*
1841      * Before the /proc/pid/status file was created the only way to read
1842      * the effective uid of a process was to stat /proc/pid.  Reading
1843      * /proc/pid/status is slow enough that procps and other packages
1844      * kept stating /proc/pid.  To keep the rules in /proc simple I have
1845      * made this apply to all per process world readable and executable
1846      * directories.
1847      */
1848     if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1849         struct mm_struct *mm;
1850         task_lock(task);
1851         mm = task->mm;
1852         /* Make non-dumpable tasks owned by some root */
1853         if (mm) {
1854             if (get_dumpable(mm) != SUID_DUMP_USER) {
1855                 struct user_namespace *user_ns = mm->user_ns;
1856 
1857                 uid = make_kuid(user_ns, 0);
1858                 if (!uid_valid(uid))
1859                     uid = GLOBAL_ROOT_UID;
1860 
1861                 gid = make_kgid(user_ns, 0);
1862                 if (!gid_valid(gid))
1863                     gid = GLOBAL_ROOT_GID;
1864             }
1865         } else {
1866             uid = GLOBAL_ROOT_UID;
1867             gid = GLOBAL_ROOT_GID;
1868         }
1869         task_unlock(task);
1870     }
1871     *ruid = uid;
1872     *rgid = gid;
1873 }
1874 
/*
 * Inode-eviction hook for /proc/<pid> inodes: unhook directory inodes
 * from the pid's sibling_inodes list and drop the pid reference taken
 * by proc_pid_make_inode().
 */
1875 void proc_pid_evict_inode(struct proc_inode *ei)
1876 {
1877     struct pid *pid = ei->pid;
1878 
1879     if (S_ISDIR(ei->vfs_inode.i_mode)) {
1880         spin_lock(&pid->lock);
1881         hlist_del_init_rcu(&ei->sibling_inodes);
1882         spin_unlock(&pid->lock);
1883     }
1884 
1885     put_pid(pid);
1886 }
1887 
/*
 * Allocate and initialize a /proc inode for @task with the given @mode:
 * fresh inode number, current timestamps, default inode ops, a pinned
 * struct pid reference in the proc_inode, ownership from
 * task_dump_owner(), and the LSM's task-to-inode labeling.
 * Returns NULL on allocation failure or if the task's pid is gone.
 */
1888 struct inode *proc_pid_make_inode(struct super_block *sb,
1889                   struct task_struct *task, umode_t mode)
1890 {
1891     struct inode * inode;
1892     struct proc_inode *ei;
1893     struct pid *pid;
1894 
1895     /* We need a new inode */
1896 
1897     inode = new_inode(sb);
1898     if (!inode)
1899         goto out;
1900 
1901     /* Common stuff */
1902     ei = PROC_I(inode);
1903     inode->i_mode = mode;
1904     inode->i_ino = get_next_ino();
1905     inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
1906     inode->i_op = &proc_def_inode_operations;
1907 
1908     /*
1909      * grab the reference to task.
1910      */
1911     pid = get_task_pid(task, PIDTYPE_PID);
1912     if (!pid)
1913         goto out_unlock;
1914 
1915     /* Let the pid remember us for quick removal */
1916     ei->pid = pid;
1917 
1918     task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1919     security_task_to_inode(task, inode);
1920 
1921 out:
1922     return inode;
1923 
1924 out_unlock:
1925     iput(inode);
1926     return NULL;
1927 }
1928 
1929 /*
1930  * Generating an inode and adding it into @pid->inodes, so that task will
1931  * invalidate inode's dentry before being released.
1932  *
1933  * This helper is used for creating dir-type entries under '/proc' and
1934  * '/proc/<tgid>/task'. Other entries (e.g. fd, stat) under '/proc/<tgid>'
1935  * can be released by invalidating '/proc/<tgid>' dentry.
1936  * In theory, dentries under '/proc/<tgid>/task' can also be released by
1937  * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
1938  * thread exiting situation: Any one of threads should invalidate its
1939  * '/proc/<tgid>/task/<pid>' dentry before released.
1940  */
1941 static struct inode *proc_pid_make_base_inode(struct super_block *sb,
1942                 struct task_struct *task, umode_t mode)
1943 {
1944     struct inode *inode;
1945     struct proc_inode *ei;
1946     struct pid *pid;
1947 
1948     inode = proc_pid_make_inode(sb, task, mode);
1949     if (!inode)
1950         return NULL;
1951 
1952     /* Let proc_flush_pid find this directory inode */
1953     ei = PROC_I(inode);
1954     pid = ei->pid;
1955     spin_lock(&pid->lock);
1956     hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
1957     spin_unlock(&pid->lock);
1958 
1959     return inode;
1960 }
1961 
/*
 * getattr for /proc/<pid> entries: fill generic attributes, then derive
 * ownership from the (live) task via task_dump_owner().  Returns
 * -ENOENT when hidepid hides the task from the caller, keeping
 * getattr() consistent with readdir().  Dead tasks report root
 * ownership.  Note attributes are filled against init_user_ns, not
 * @mnt_userns.
 */
1962 int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
1963         struct kstat *stat, u32 request_mask, unsigned int query_flags)
1964 {
1965     struct inode *inode = d_inode(path->dentry);
1966     struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
1967     struct task_struct *task;
1968 
1969     generic_fillattr(&init_user_ns, inode, stat);
1970 
1971     stat->uid = GLOBAL_ROOT_UID;
1972     stat->gid = GLOBAL_ROOT_GID;
1973     rcu_read_lock();
1974     task = pid_task(proc_pid(inode), PIDTYPE_PID);
1975     if (task) {
1976         if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
1977             rcu_read_unlock();
1978             /*
1979              * This doesn't prevent learning whether PID exists,
1980              * it only makes getattr() consistent with readdir().
1981              */
1982             return -ENOENT;
1983         }
1984         task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1985     }
1986     rcu_read_unlock();
1987     return 0;
1988 }
1989 
1990 /* dentry stuff */
1991 
1992 /*
1993  * Set <pid>/... inode ownership (can change due to setuid(), etc.)
1994  */
/* Also strips setuid/setgid bits and refreshes the LSM inode label. */
1995 void pid_update_inode(struct task_struct *task, struct inode *inode)
1996 {
1997     task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1998 
1999     inode->i_mode &= ~(S_ISUID | S_ISGID);
2000     security_task_to_inode(task, inode);
2001 }
2002 
2003 /*
2004  * Rewrite the inode's ownerships here because the owning task may have
2005  * performed a setuid(), etc.
2006  *
2007  */
/*
 * d_revalidate for /proc/<pid> dentries: valid (1) only while the task
 * is alive, refreshing inode ownership on the way; 0 drops the dentry.
 * Runs entirely under RCU, so it is safe in LOOKUP_RCU walks.
 */
2008 static int pid_revalidate(struct dentry *dentry, unsigned int flags)
2009 {
2010     struct inode *inode;
2011     struct task_struct *task;
2012     int ret = 0;
2013 
2014     rcu_read_lock();
2015     inode = d_inode_rcu(dentry);
2016     if (!inode)
2017         goto out;
2018     task = pid_task(proc_pid(inode), PIDTYPE_PID);
2019 
2020     if (task) {
2021         pid_update_inode(task, inode);
2022         ret = 1;
2023     }
2024 out:
2025     rcu_read_unlock();
2026     return ret;
2027 }
2028 
/* True when no task is attached to the inode's pid any more. */
2029 static inline bool proc_inode_is_dead(struct inode *inode)
2030 {
2031     return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
2032 }
2033 
int pid_delete_dentry(const struct dentry *dentry)
{
    /*
     * Dentries for dead tasks are useless: ask the dcache to kill
     * them immediately instead of parking them on the LRU list.
     */
    struct inode *inode = d_inode(dentry);

    return proc_inode_is_dead(inode);
}
2042 
/* Shared dentry ops for per-task proc entries. */
const struct dentry_operations pid_dentry_operations =
{
    .d_revalidate   = pid_revalidate,
    .d_delete   = pid_delete_dentry,
};
2048 
2049 /* Lookups */
2050 
2051 /*
2052  * Fill a directory entry.
2053  *
2054  * If possible create the dcache entry and derive our inode number and
2055  * file type from dcache entry.
2056  *
2057  * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached.  This means creating
2059  * the dcache entry in readdir is necessary to keep the inode numbers
2060  * reported by readdir in sync with the inode numbers reported
2061  * by stat.
2062  */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
    const char *name, unsigned int len,
    instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
    struct dentry *child, *dir = file->f_path.dentry;
    struct qstr qname = QSTR_INIT(name, len);
    struct inode *inode;
    /* Fallback values if we fail to get a dentry: fake ino, unknown type. */
    unsigned type = DT_UNKNOWN;
    ino_t ino = 1;

    child = d_hash_and_lookup(dir, &qname);
    if (!child) {
        /* Not in the dcache: instantiate it, serialized against
         * concurrent lookups of the same name via d_alloc_parallel(). */
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
        child = d_alloc_parallel(dir, &qname, &wq);
        if (IS_ERR(child))
            goto end_instantiate;
        if (d_in_lookup(child)) {
            /* We won the race: fill in the dentry ourselves. */
            struct dentry *res;
            res = instantiate(child, task, ptr);
            d_lookup_done(child);
            if (unlikely(res)) {
                /* instantiate() returned a different (or error)
                 * dentry; drop ours and use that one instead. */
                dput(child);
                child = res;
                if (IS_ERR(child))
                    goto end_instantiate;
            }
        }
    }
    /* Report the real inode number/type so readdir matches stat. */
    inode = d_inode(child);
    ino = inode->i_ino;
    type = inode->i_mode >> 12;
    dput(child);
end_instantiate:
    return dir_emit(ctx, name, len, ino, type);
}
2098 
2099 /*
2100  * dname_to_vma_addr - maps a dentry name into two unsigned longs
2101  * which represent vma start and end addresses.
2102  */
static int dname_to_vma_addr(struct dentry *dentry,
                 unsigned long *start, unsigned long *end)
{
    const char *str = dentry->d_name.name;
    unsigned long long sval, eval;
    unsigned int len;

    /*
     * The name must be exactly "%lx-%lx" in canonical form: reject
     * leading zeroes so each address pair has a unique spelling.
     */
    if (str[0] == '0' && str[1] != '-')
        return -EINVAL;
    len = _parse_integer(str, 16, &sval);
    if (len & KSTRTOX_OVERFLOW)
        return -EINVAL;
    /* Value must fit in unsigned long (matters on 32-bit). */
    if (sval != (unsigned long)sval)
        return -EINVAL;
    str += len;

    if (*str != '-')
        return -EINVAL;
    str++;

    /* Same canonical-form check for the end address. */
    if (str[0] == '0' && str[1])
        return -EINVAL;
    len = _parse_integer(str, 16, &eval);
    if (len & KSTRTOX_OVERFLOW)
        return -EINVAL;
    if (eval != (unsigned long)eval)
        return -EINVAL;
    str += len;

    /* No trailing garbage allowed. */
    if (*str != '\0')
        return -EINVAL;

    *start = sval;
    *end = eval;

    return 0;
}
2140 
static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
    unsigned long vm_start, vm_end;
    bool exact_vma_exists = false;
    struct mm_struct *mm = NULL;
    struct task_struct *task;
    struct inode *inode;
    int status = 0;

    /* We sleep (mm_access, mmap lock): cannot run in RCU-walk mode. */
    if (flags & LOOKUP_RCU)
        return -ECHILD;

    inode = d_inode(dentry);
    task = get_proc_task(inode);
    if (!task)
        goto out_notask;

    /* Same ptrace-style access check as the lookup path. */
    mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
    if (IS_ERR_OR_NULL(mm))
        goto out;

    /* Valid only if a VMA with exactly these bounds still exists. */
    if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
        status = mmap_read_lock_killable(mm);
        if (!status) {
            exact_vma_exists = !!find_exact_vma(mm, vm_start,
                                vm_end);
            mmap_read_unlock(mm);
        }
    }

    mmput(mm);

    if (exact_vma_exists) {
        /* Refresh ownership/LSM state before declaring it valid. */
        task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);

        security_task_to_inode(task, inode);
        status = 1;
    }

out:
    put_task_struct(task);

out_notask:
    return status;
}
2186 
/* Dentry ops for entries under /proc/<pid>/map_files/. */
static const struct dentry_operations tid_map_files_dentry_operations = {
    .d_revalidate   = map_files_d_revalidate,
    .d_delete   = pid_delete_dentry,
};
2191 
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
    unsigned long vm_start, vm_end;
    struct vm_area_struct *vma;
    struct task_struct *task;
    struct mm_struct *mm;
    int rc;

    rc = -ENOENT;
    task = get_proc_task(d_inode(dentry));
    if (!task)
        goto out;

    /* We only need the mm; drop the task reference right away. */
    mm = get_task_mm(task);
    put_task_struct(task);
    if (!mm)
        goto out;

    /* Recover the VMA bounds encoded in the dentry name. */
    rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
    if (rc)
        goto out_mmput;

    rc = mmap_read_lock_killable(mm);
    if (rc)
        goto out_mmput;

    rc = -ENOENT;
    vma = find_exact_vma(mm, vm_start, vm_end);
    if (vma && vma->vm_file) {
        /* Hand back a referenced path to the mapped file. */
        *path = vma->vm_file->f_path;
        path_get(path);
        rc = 0;
    }
    mmap_read_unlock(mm);

out_mmput:
    mmput(mm);
out:
    return rc;
}
2232 
/* Per-VMA snapshot collected under mmap_lock for map_files readdir. */
struct map_files_info {
    unsigned long   start;  /* vma->vm_start */
    unsigned long   end;    /* vma->vm_end */
    fmode_t     mode;   /* f_mode of the backing file */
};
2238 
2239 /*
2240  * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
2241  * to concerns about how the symlinks may be used to bypass permissions on
2242  * ancestor directories in the path to the file in question.
2243  */
2244 static const char *
2245 proc_map_files_get_link(struct dentry *dentry,
2246             struct inode *inode,
2247                 struct delayed_call *done)
2248 {
2249     if (!checkpoint_restore_ns_capable(&init_user_ns))
2250         return ERR_PTR(-EPERM);
2251 
2252     return proc_pid_get_link(dentry, inode, done);
2253 }
2254 
2255 /*
2256  * Identical to proc_pid_link_inode_operations except for get_link()
2257  */
/* Symlink inode ops for map_files entries (capability-gated get_link). */
static const struct inode_operations proc_map_files_link_inode_operations = {
    .readlink   = proc_pid_readlink,
    .get_link   = proc_map_files_get_link,
    .setattr    = proc_setattr,
};
2263 
/*
 * Create the symlink inode for one map_files entry. @ptr carries the
 * backing file's fmode_t, which determines the link's r/w permission bits.
 */
static struct dentry *
proc_map_files_instantiate(struct dentry *dentry,
               struct task_struct *task, const void *ptr)
{
    fmode_t mode = (fmode_t)(unsigned long)ptr;
    struct proc_inode *ei;
    struct inode *inode;

    inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
                    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
                    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
    if (!inode)
        return ERR_PTR(-ENOENT);

    ei = PROC_I(inode);
    ei->op.proc_get_link = map_files_get_link;

    inode->i_op = &proc_map_files_link_inode_operations;
    /* Nominal size for readlink buffers; links are generated on the fly. */
    inode->i_size = 64;

    d_set_d_op(dentry, &tid_map_files_dentry_operations);
    return d_splice_alias(inode, dentry);
}
2287 
/*
 * Look up one /proc/<pid>/map_files/<start>-<end> entry: the name must
 * parse to an address pair matching an existing file-backed VMA.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
        struct dentry *dentry, unsigned int flags)
{
    unsigned long vm_start, vm_end;
    struct vm_area_struct *vma;
    struct task_struct *task;
    struct dentry *result;
    struct mm_struct *mm;

    result = ERR_PTR(-ENOENT);
    task = get_proc_task(dir);
    if (!task)
        goto out;

    /* Reading another task's mappings requires ptrace-read access. */
    result = ERR_PTR(-EACCES);
    if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
        goto out_put_task;

    result = ERR_PTR(-ENOENT);
    if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
        goto out_put_task;

    mm = get_task_mm(task);
    if (!mm)
        goto out_put_task;

    result = ERR_PTR(-EINTR);
    if (mmap_read_lock_killable(mm))
        goto out_put_mm;

    result = ERR_PTR(-ENOENT);
    vma = find_exact_vma(mm, vm_start, vm_end);
    if (!vma)
        goto out_no_vma;

    /* Only file-backed mappings appear in map_files. */
    if (vma->vm_file)
        result = proc_map_files_instantiate(dentry, task,
                (void *)(unsigned long)vma->vm_file->f_mode);

out_no_vma:
    mmap_read_unlock(mm);
out_put_mm:
    mmput(mm);
out_put_task:
    put_task_struct(task);
out:
    return result;
}
2336 
/* Inode ops for the /proc/<pid>/map_files directory itself. */
static const struct inode_operations proc_map_files_inode_operations = {
    .lookup     = proc_map_files_lookup,
    .permission = proc_fd_permission,
    .setattr    = proc_setattr,
};
2342 
/*
 * readdir for /proc/<pid>/map_files: emit one "<start>-<end>" entry per
 * file-backed VMA of the target task.
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
    struct vm_area_struct *vma;
    struct task_struct *task;
    struct mm_struct *mm;
    unsigned long nr_files, pos, i;
    GENRADIX(struct map_files_info) fa;
    struct map_files_info *p;
    int ret;

    genradix_init(&fa);

    ret = -ENOENT;
    task = get_proc_task(file_inode(file));
    if (!task)
        goto out;

    ret = -EACCES;
    if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
        goto out_put_task;

    ret = 0;
    if (!dir_emit_dots(file, ctx))
        goto out_put_task;

    mm = get_task_mm(task);
    if (!mm)
        goto out_put_task;

    ret = mmap_read_lock_killable(mm);
    if (ret) {
        mmput(mm);
        goto out_put_task;
    }

    nr_files = 0;

    /*
     * We need two passes here:
     *
     *  1) Collect vmas of mapped files with mmap_lock taken
     *  2) Release mmap_lock and instantiate entries
     *
     * otherwise we get lockdep complained, since filldir()
     * routine might require mmap_lock taken in might_fault().
     */

    /* pos starts at 2 to account for the "." and ".." entries. */
    for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
        if (!vma->vm_file)
            continue;
        if (++pos <= ctx->pos)
            continue;   /* already emitted on a previous call */

        p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
        if (!p) {
            ret = -ENOMEM;
            mmap_read_unlock(mm);
            mmput(mm);
            goto out_put_task;
        }

        /* Snapshot what we need; the VMA may vanish after unlock. */
        p->start = vma->vm_start;
        p->end = vma->vm_end;
        p->mode = vma->vm_file->f_mode;
    }
    mmap_read_unlock(mm);
    mmput(mm);

    /* Pass 2: instantiate dentries and emit entries, locks dropped. */
    for (i = 0; i < nr_files; i++) {
        char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
        unsigned int len;

        p = genradix_ptr(&fa, i);
        len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
        if (!proc_fill_cache(file, ctx,
                      buf, len,
                      proc_map_files_instantiate,
                      task,
                      (void *)(unsigned long)p->mode))
            break;  /* dir buffer full; resume here next call */
        ctx->pos++;
    }

out_put_task:
    put_task_struct(task);
out:
    genradix_free(&fa);
    return ret;
}
2433 
/* File ops for the /proc/<pid>/map_files directory. */
static const struct file_operations proc_map_files_operations = {
    .read       = generic_read_dir,
    .iterate_shared = proc_map_files_readdir,
    .llseek     = generic_file_llseek,
};
2439 
2440 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
/* seq_file private state for /proc/<pid>/timers. */
struct timers_private {
    struct pid *pid;            /* target, pinned at open time */
    struct task_struct *task;       /* resolved in timers_start() */
    struct sighand_struct *sighand;     /* held locked across a sequence */
    struct pid_namespace *ns;       /* for pid translation in output */
    unsigned long flags;            /* saved irq flags for sighand lock */
};
2448 
static void *timers_start(struct seq_file *m, loff_t *pos)
{
    struct timers_private *tp = m->private;

    tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
    if (!tp->task)
        return ERR_PTR(-ESRCH);

    /*
     * Hold the sighand lock for the whole sequence so the posix_timers
     * list cannot change underneath us; timers_stop() releases it.
     */
    tp->sighand = lock_task_sighand(tp->task, &tp->flags);
    if (!tp->sighand)
        return ERR_PTR(-ESRCH);

    return seq_list_start(&tp->task->signal->posix_timers, *pos);
}
2463 
2464 static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
2465 {
2466     struct timers_private *tp = m->private;
2467     return seq_list_next(v, &tp->task->signal->posix_timers, pos);
2468 }
2469 
static void timers_stop(struct seq_file *m, void *v)
{
    struct timers_private *tp = m->private;

    /* Drop the sighand lock first, then the task reference. */
    if (tp->sighand) {
        unlock_task_sighand(tp->task, &tp->flags);
        tp->sighand = NULL;
    }

    if (tp->task) {
        put_task_struct(tp->task);
        tp->task = NULL;
    }
}
2484 
/* Emit one posix timer's details (called with sighand lock held). */
static int show_timer(struct seq_file *m, void *v)
{
    struct k_itimer *timer;
    struct timers_private *tp = m->private;
    int notify;
    static const char * const nstr[] = {
        [SIGEV_SIGNAL] = "signal",
        [SIGEV_NONE] = "none",
        [SIGEV_THREAD] = "thread",
    };

    timer = list_entry((struct list_head *)v, struct k_itimer, list);
    notify = timer->it_sigev_notify;

    seq_printf(m, "ID: %d\n", timer->it_id);
    seq_printf(m, "signal: %d/%px\n",
           timer->sigq->info.si_signo,
           timer->sigq->info.si_value.sival_ptr);
    /* SIGEV_THREAD_ID is a flag on top of the base notify mode. */
    seq_printf(m, "notify: %s/%s.%d\n",
           nstr[notify & ~SIGEV_THREAD_ID],
           (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
           pid_nr_ns(timer->it_pid, tp->ns));
    seq_printf(m, "ClockID: %d\n", timer->it_clock);

    return 0;
}
2511 
/* seq_file iterator over a task's posix timers. */
static const struct seq_operations proc_timers_seq_ops = {
    .start  = timers_start,
    .next   = timers_next,
    .stop   = timers_stop,
    .show   = show_timer,
};
2518 
static int proc_timers_open(struct inode *inode, struct file *file)
{
    struct timers_private *tp;

    tp = __seq_open_private(file, &proc_timers_seq_ops,
            sizeof(struct timers_private));
    if (!tp)
        return -ENOMEM;

    /* Pin the pid/ns now; the task itself is resolved per-read. */
    tp->pid = proc_pid(inode);
    tp->ns = proc_pid_ns(inode->i_sb);
    return 0;
}
2532 
/* File ops for /proc/<pid>/timers. */
static const struct file_operations proc_timers_operations = {
    .open       = proc_timers_open,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = seq_release_private,
};
2539 #endif
2540 
/*
 * Write handler for /proc/<pid>/timerslack_ns. Writing 0 restores the
 * task's default slack; any other value is set verbatim. Changing another
 * task's slack requires CAP_SYS_NICE in that task's user namespace.
 */
static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
                    size_t count, loff_t *offset)
{
    struct inode *inode = file_inode(file);
    struct task_struct *p;
    u64 slack_ns;
    int err;

    err = kstrtoull_from_user(buf, count, 10, &slack_ns);
    if (err < 0)
        return err;

    p = get_proc_task(inode);
    if (!p)
        return -ESRCH;

    if (p != current) {
        rcu_read_lock();
        if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
            rcu_read_unlock();
            /* reuse count as the (negative) return value */
            count = -EPERM;
            goto out;
        }
        rcu_read_unlock();

        err = security_task_setscheduler(p);
        if (err) {
            count = err;
            goto out;
        }
    }

    task_lock(p);
    if (slack_ns == 0)
        p->timer_slack_ns = p->default_timer_slack_ns;
    else
        p->timer_slack_ns = slack_ns;
    task_unlock(p);

out:
    put_task_struct(p);

    return count;
}
2585 
/* Show the current timer slack; same permission model as the write path. */
static int timerslack_ns_show(struct seq_file *m, void *v)
{
    struct inode *inode = m->private;
    struct task_struct *p;
    int err = 0;

    p = get_proc_task(inode);
    if (!p)
        return -ESRCH;

    if (p != current) {
        rcu_read_lock();
        if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
            rcu_read_unlock();
            err = -EPERM;
            goto out;
        }
        rcu_read_unlock();

        err = security_task_getscheduler(p);
        if (err)
            goto out;
    }

    task_lock(p);
    seq_printf(m, "%llu\n", p->timer_slack_ns);
    task_unlock(p);

out:
    put_task_struct(p);

    return err;
}
2619 
/* Single-record seq_file open; the inode identifies the target task. */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
    return single_open(filp, timerslack_ns_show, inode);
}
2624 
/* File ops for /proc/<pid>/timerslack_ns. */
static const struct file_operations proc_pid_set_timerslack_ns_operations = {
    .open       = timerslack_ns_open,
    .read       = seq_read,
    .write      = timerslack_ns_write,
    .llseek     = seq_lseek,
    .release    = single_release,
};
2632 
/* Build the inode/dentry for one static pid_entry (@ptr) of a pid dir. */
static struct dentry *proc_pident_instantiate(struct dentry *dentry,
    struct task_struct *task, const void *ptr)
{
    const struct pid_entry *p = ptr;
    struct inode *inode;
    struct proc_inode *ei;

    inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
    if (!inode)
        return ERR_PTR(-ENOENT);

    ei = PROC_I(inode);
    if (S_ISDIR(inode->i_mode))
        set_nlink(inode, 2);    /* Use getattr to fix if necessary */
    if (p->iop)
        inode->i_op = p->iop;
    if (p->fop)
        inode->i_fop = p->fop;
    ei->op = p->op;
    pid_update_inode(task, inode);
    d_set_d_op(dentry, &pid_dentry_operations);
    return d_splice_alias(inode, dentry);
}
2656 
/* Linear-scan lookup over a static pid_entry table [@p, @end). */
static struct dentry *proc_pident_lookup(struct inode *dir, 
                     struct dentry *dentry,
                     const struct pid_entry *p,
                     const struct pid_entry *end)
{
    struct task_struct *task = get_proc_task(dir);
    struct dentry *res = ERR_PTR(-ENOENT);

    if (!task)
        goto out_no_task;

    /*
     * Yes, it does not scale. And it should not. Don't add
     * new entries into /proc/<tgid>/ without very good reasons.
     */
    for (; p < end; p++) {
        /* Compare length first: cheap rejection before memcmp. */
        if (p->len != dentry->d_name.len)
            continue;
        if (!memcmp(dentry->d_name.name, p->name, p->len)) {
            res = proc_pident_instantiate(dentry, task, p);
            break;
        }
    }
    put_task_struct(task);
out_no_task:
    return res;
}
2684 
/* Generic readdir over a static pid_entry table (@ents, @nents). */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
        const struct pid_entry *ents, unsigned int nents)
{
    struct task_struct *task = get_proc_task(file_inode(file));
    const struct pid_entry *p;

    if (!task)
        return -ENOENT;

    if (!dir_emit_dots(file, ctx))
        goto out;

    /* Positions 0 and 1 are "." and ".."; table entries follow. */
    if (ctx->pos >= nents + 2)
        goto out;

    for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
        if (!proc_fill_cache(file, ctx, p->name, p->len,
                proc_pident_instantiate, task, p))
            break;
        ctx->pos++;
    }
out:
    put_task_struct(task);
    return 0;
}
2710 
2711 #ifdef CONFIG_SECURITY
static int proc_pid_attr_open(struct inode *inode, struct file *file)
{
    file->private_data = NULL;
    /*
     * Record the opener's mm in private_data so the write path can
     * reject writes from any other task (see proc_pid_attr_write()).
     */
    __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
    return 0;
}
2718 
/* Read an LSM attribute of the task; the LSM allocates the buffer (*p). */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
                  size_t count, loff_t *ppos)
{
    struct inode * inode = file_inode(file);
    char *p = NULL;
    ssize_t length;
    struct task_struct *task = get_proc_task(inode);

    if (!task)
        return -ESRCH;

    /* The dentry name selects which attribute (current/prev/exec/...). */
    length = security_getprocattr(task, PROC_I(inode)->op.lsm,
                      (char*)file->f_path.dentry->d_name.name,
                      &p);
    put_task_struct(task);
    if (length > 0)
        length = simple_read_from_buffer(buf, count, ppos, p, length);
    kfree(p);   /* kfree(NULL) is a no-op on the error paths */
    return length;
}
2739 
/*
 * Write an LSM attribute. Only the task itself, using the fd it opened,
 * with unoverridden credentials, may change its own attributes.
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                   size_t count, loff_t *ppos)
{
    struct inode * inode = file_inode(file);
    struct task_struct *task;
    void *page;
    int rv;

    /* A task may only write when it was the opener. */
    if (file->private_data != current->mm)
        return -EPERM;

    rcu_read_lock();
    task = pid_task(proc_pid(inode), PIDTYPE_PID);
    if (!task) {
        rcu_read_unlock();
        return -ESRCH;
    }
    /* A task may only write its own attributes. */
    if (current != task) {
        rcu_read_unlock();
        return -EACCES;
    }
    /* Prevent changes to overridden credentials. */
    if (current_cred() != current_real_cred()) {
        rcu_read_unlock();
        return -EBUSY;
    }
    rcu_read_unlock();

    if (count > PAGE_SIZE)
        count = PAGE_SIZE;

    /* No partial writes. */
    if (*ppos != 0)
        return -EINVAL;

    page = memdup_user(buf, count);
    if (IS_ERR(page)) {
        rv = PTR_ERR(page);
        goto out;
    }

    /* Guard against adverse ptrace interaction */
    rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
    if (rv < 0)
        goto out_free;

    rv = security_setprocattr(PROC_I(inode)->op.lsm,
                  file->f_path.dentry->d_name.name, page,
                  count);
    mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
    kfree(page);
out:
    return rv;
}
2797 
/* File ops for /proc/<pid>/attr/* entries. */
static const struct file_operations proc_pid_attr_operations = {
    .open       = proc_pid_attr_open,
    .read       = proc_pid_attr_read,
    .write      = proc_pid_attr_write,
    .llseek     = generic_file_llseek,
    .release    = mem_release,
};
2805 
/*
 * LSM_DIR_OPS(LSM) - generate the readdir/lookup glue and the fops /
 * inode ops for a per-LSM subdirectory of /proc/<pid>/attr/, driven by
 * the LSM's <LSM>_attr_dir_stuff pid_entry table.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
                 struct dir_context *ctx) \
{ \
    return proc_pident_readdir(filp, ctx, \
                   LSM##_attr_dir_stuff, \
                   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
    .read       = generic_read_dir, \
    .iterate    = proc_##LSM##_attr_dir_iterate, \
    .llseek     = default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
                struct dentry *dentry, unsigned int flags) \
{ \
    return proc_pident_lookup(dir, dentry, \
                  LSM##_attr_dir_stuff, \
                  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
    .lookup     = proc_##LSM##_attr_dir_lookup, \
    .getattr    = pid_getattr, \
    .setattr    = proc_setattr, \
}
2834 
2835 #ifdef CONFIG_SECURITY_SMACK
/* Entries of /proc/<pid>/attr/smack/. */
static const struct pid_entry smack_attr_dir_stuff[] = {
    ATTR("smack", "current",    0666),
};
LSM_DIR_OPS(smack);
2840 #endif
2841 
2842 #ifdef CONFIG_SECURITY_APPARMOR
/* Entries of /proc/<pid>/attr/apparmor/. */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
    ATTR("apparmor", "current", 0666),
    ATTR("apparmor", "prev",    0444),
    ATTR("apparmor", "exec",    0666),
};
LSM_DIR_OPS(apparmor);
2849 #endif
2850 
/* Top-level /proc/<pid>/attr/ entries (NULL lsm = the default major LSM). */
static const struct pid_entry attr_dir_stuff[] = {
    ATTR(NULL, "current",       0666),
    ATTR(NULL, "prev",      0444),
    ATTR(NULL, "exec",      0666),
    ATTR(NULL, "fscreate",      0666),
    ATTR(NULL, "keycreate",     0666),
    ATTR(NULL, "sockcreate",    0666),
#ifdef CONFIG_SECURITY_SMACK
    DIR("smack",            0555,
        proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
    DIR("apparmor",         0555,
        proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};
2867 
/* readdir for /proc/<pid>/attr/, driven by the static table above. */
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
    return proc_pident_readdir(file, ctx, 
                   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
2873 
/* File ops for the /proc/<pid>/attr directory. */
static const struct file_operations proc_attr_dir_operations = {
    .read       = generic_read_dir,
    .iterate_shared = proc_attr_dir_readdir,
    .llseek     = generic_file_llseek,
};
2879 
/* lookup for /proc/<pid>/attr/, driven by the static table above. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
    return proc_pident_lookup(dir, dentry,
                  attr_dir_stuff,
                  attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
}
2887 
/* Inode ops for the /proc/<pid>/attr directory. */
static const struct inode_operations proc_attr_dir_inode_operations = {
    .lookup     = proc_attr_dir_lookup,
    .getattr    = pid_getattr,
    .setattr    = proc_setattr,
};
2893 
2894 #endif
2895 
2896 #ifdef CONFIG_ELF_CORE
2897 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2898                      size_t count, loff_t *ppos)
2899 {
2900     struct task_struct *task = get_proc_task(file_inode(file));
2901     struct mm_struct *mm;
2902     char buffer[PROC_NUMBUF];
2903     size_t len;
2904     int ret;
2905 
2906     if (!task)
2907         return -ESRCH;
2908 
2909     ret = 0;
2910     mm = get_task_mm(task);
2911     if (mm) {
2912         len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2913                    ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2914                 MMF_DUMP_FILTER_SHIFT));
2915         mmput(mm);
2916         ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2917     }
2918 
2919     put_task_struct(task);
2920 
2921     return ret;
2922 }
2923 
/* Parse a hex/dec/oct value and apply it bit-by-bit to mm's dump filter. */
static ssize_t proc_coredump_filter_write(struct file *file,
                      const char __user *buf,
                      size_t count,
                      loff_t *ppos)
{
    struct task_struct *task;
    struct mm_struct *mm;
    unsigned int val;
    int ret;
    int i;
    unsigned long mask;

    ret = kstrtouint_from_user(buf, count, 0, &val);
    if (ret < 0)
        return ret;

    ret = -ESRCH;
    task = get_proc_task(file_inode(file));
    if (!task)
        goto out_no_task;

    mm = get_task_mm(task);
    if (!mm)
        goto out_no_mm;
    ret = 0;

    /* set_bit/clear_bit are atomic, so no lock on mm->flags is needed. */
    for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
        if (val & mask)
            set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
        else
            clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
    }

    mmput(mm);
 out_no_mm:
    put_task_struct(task);
 out_no_task:
    if (ret < 0)
        return ret;
    return count;
}
2965 
/* File ops for /proc/<pid>/coredump_filter. */
static const struct file_operations proc_coredump_filter_operations = {
    .read       = proc_coredump_filter_read,
    .write      = proc_coredump_filter_write,
    .llseek     = generic_file_llseek,
};
2971 #endif
2972 
2973 #ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Print a task's I/O accounting counters. @whole != 0 sums the counters
 * over the whole thread group (plus the signal struct's totals for dead
 * threads); otherwise only @task's own counters are shown.
 */
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
    /* Start from this task's own counters (struct copy). */
    struct task_io_accounting acct = task->ioac;
    unsigned long flags;
    int result;

    /* Serialize against exec(), which changes the task's credentials. */
    result = down_read_killable(&task->signal->exec_update_lock);
    if (result)
        return result;

    if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
        result = -EACCES;
        goto out_unlock;
    }

    if (whole && lock_task_sighand(task, &flags)) {
        struct task_struct *t = task;

        /* signal->ioac accumulates counters of already-dead threads. */
        task_io_accounting_add(&acct, &task->signal->ioac);
        while_each_thread(task, t)
            task_io_accounting_add(&acct, &t->ioac);

        unlock_task_sighand(task, &flags);
    }
    seq_printf(m,
           "rchar: %llu\n"
           "wchar: %llu\n"
           "syscr: %llu\n"
           "syscw: %llu\n"
           "read_bytes: %llu\n"
           "write_bytes: %llu\n"
           "cancelled_write_bytes: %llu\n",
           (unsigned long long)acct.rchar,
           (unsigned long long)acct.wchar,
           (unsigned long long)acct.syscr,
           (unsigned long long)acct.syscw,
           (unsigned long long)acct.read_bytes,
           (unsigned long long)acct.write_bytes,
           (unsigned long long)acct.cancelled_write_bytes);
    result = 0;

out_unlock:
    up_read(&task->signal->exec_update_lock);
    return result;
}
3019 
/* /proc/<tid>/io: per-thread counters only. */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                  struct pid *pid, struct task_struct *task)
{
    return do_io_accounting(task, m, 0);
}
3025 
/* /proc/<tgid>/io: counters summed over the whole thread group. */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                   struct pid *pid, struct task_struct *task)
{
    return do_io_accounting(task, m, 1);
}
3031 #endif /* CONFIG_TASK_IO_ACCOUNTING */
3032 
3033 #ifdef CONFIG_USER_NS
/*
 * Common open helper for uid_map/gid_map/projid_map: pin the task's user
 * namespace and stash it as the seq_file private data for @seq_ops.
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
    const struct seq_operations *seq_ops)
{
    struct user_namespace *ns = NULL;
    struct task_struct *task;
    struct seq_file *seq;
    int ret = -EINVAL;

    task = get_proc_task(inode);
    if (task) {
        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(task, user_ns));
        rcu_read_unlock();
        put_task_struct(task);
    }
    if (!ns)
        goto err;   /* task already gone: no namespace to show */

    ret = seq_open(file, seq_ops);
    if (ret)
        goto err_put_ns;

    /* Ownership of the ns reference passes to the seq_file;
     * proc_id_map_release() drops it. */
    seq = file->private_data;
    seq->private = ns;

    return 0;
err_put_ns:
    put_user_ns(ns);
err:
    return ret;
}
3065 
3066 static int proc_id_map_release(struct inode *inode, struct file *file)
3067 {
3068     struct seq_file *seq = file->private_data;
3069     struct user_namespace *ns = seq->private;
3070     put_user_ns(ns);
3071     return seq_release(inode, file);
3072 }
3073 
3074 static int proc_uid_map_open(struct inode *inode, struct file *file)
3075 {
3076     return proc_id_map_open(inode, file, &proc_uid_seq_operations);
3077 }
3078 
3079 static int proc_gid_map_open(struct inode *inode, struct file *file)
3080 {
3081     return proc_id_map_open(inode, file, &proc_gid_seq_operations);
3082 }
3083 
3084 static int proc_projid_map_open(struct inode *inode, struct file *file)
3085 {
3086     return proc_id_map_open(inode, file, &proc_projid_seq_operations);
3087 }
3088 
/* File operations for /proc/<pid>/uid_map (seq_file read, privileged write). */
static const struct file_operations proc_uid_map_operations = {
    .open       = proc_uid_map_open,
    .write      = proc_uid_map_write,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = proc_id_map_release,
};
3096 
/* File operations for /proc/<pid>/gid_map (seq_file read, privileged write). */
static const struct file_operations proc_gid_map_operations = {
    .open       = proc_gid_map_open,
    .write      = proc_gid_map_write,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = proc_id_map_release,
};
3104 
/* File operations for /proc/<pid>/projid_map (seq_file read, privileged write). */
static const struct file_operations proc_projid_map_operations = {
    .open       = proc_projid_map_open,
    .write      = proc_projid_map_write,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = proc_id_map_release,
};
3112 
3113 static int proc_setgroups_open(struct inode *inode, struct file *file)
3114 {
3115     struct user_namespace *ns = NULL;
3116     struct task_struct *task;
3117     int ret;
3118 
3119     ret = -ESRCH;
3120     task = get_proc_task(inode);
3121     if (task) {
3122         rcu_read_lock();
3123         ns = get_user_ns(task_cred_xxx(task, user_ns));
3124         rcu_read_unlock();
3125         put_task_struct(task);
3126     }
3127     if (!ns)
3128         goto err;
3129 
3130     if (file->f_mode & FMODE_WRITE) {
3131         ret = -EACCES;
3132         if (!ns_capable(ns, CAP_SYS_ADMIN))
3133             goto err_put_ns;
3134     }
3135 
3136     ret = single_open(file, &proc_setgroups_show, ns);
3137     if (ret)
3138         goto err_put_ns;
3139 
3140     return 0;
3141 err_put_ns:
3142     put_user_ns(ns);
3143 err:
3144     return ret;
3145 }
3146 
3147 static int proc_setgroups_release(struct inode *inode, struct file *file)
3148 {
3149     struct seq_file *seq = file->private_data;
3150     struct user_namespace *ns = seq->private;
3151     int ret = single_release(inode, file);
3152     put_user_ns(ns);
3153     return ret;
3154 }
3155 
/* File operations for /proc/<pid>/setgroups. */
static const struct file_operations proc_setgroups_operations = {
    .open       = proc_setgroups_open,
    .write      = proc_setgroups_write,
    .read       = seq_read,
    .llseek     = seq_lseek,
    .release    = proc_setgroups_release,
};
3163 #endif /* CONFIG_USER_NS */
3164 
3165 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
3166                 struct pid *pid, struct task_struct *task)
3167 {
3168     int err = lock_trace(task);
3169     if (!err) {
3170         seq_printf(m, "%08x\n", task->personality);
3171         unlock_trace(task);
3172     }
3173     return err;
3174 }
3175 
3176 #ifdef CONFIG_LIVEPATCH
/* /proc/<pid>/patch_state: the task's livepatch state as a plain integer. */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
                struct pid *pid, struct task_struct *task)
{
    seq_printf(m, "%d\n", task->patch_state);
    return 0;
}
3183 #endif /* CONFIG_LIVEPATCH */
3184 
3185 #ifdef CONFIG_KSM
3186 static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
3187                 struct pid *pid, struct task_struct *task)
3188 {
3189     struct mm_struct *mm;
3190 
3191     mm = get_task_mm(task);
3192     if (mm) {
3193         seq_printf(m, "%lu\n", mm->ksm_merging_pages);
3194         mmput(mm);
3195     }
3196 
3197     return 0;
3198 }
3199 #endif /* CONFIG_KSM */
3200 
3201 #ifdef CONFIG_STACKLEAK_METRICS
3202 static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
3203                 struct pid *pid, struct task_struct *task)
3204 {
3205     unsigned long prev_depth = THREAD_SIZE -
3206                 (task->prev_lowest_stack & (THREAD_SIZE - 1));
3207     unsigned long depth = THREAD_SIZE -
3208                 (task->lowest_stack & (THREAD_SIZE - 1));
3209 
3210     seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
3211                             prev_depth, depth);
3212     return 0;
3213 }
3214 #endif /* CONFIG_STACKLEAK_METRICS */
3215 
3216 /*
3217  * Thread groups
3218  */
3219 static const struct file_operations proc_task_operations;
3220 static const struct inode_operations proc_task_inode_operations;
3221 
/*
 * Static directory entries for /proc/<tgid> (the thread-group level).
 * Looked up via proc_tgid_base_lookup() and listed by
 * proc_tgid_base_readdir(); also used to size nlink_tgid in
 * set_proc_pid_nlink().
 */
static const struct pid_entry tgid_base_stuff[] = {
    DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
    DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
    DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
    DIR("fdinfo",     S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
    DIR("ns",     S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
    DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
    REG("environ",    S_IRUSR, proc_environ_operations),
    REG("auxv",       S_IRUSR, proc_auxv_operations),
    ONE("status",     S_IRUGO, proc_pid_status),
    ONE("personality", S_IRUSR, proc_pid_personality),
    ONE("limits",     S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
    REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
    REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
#ifdef CONFIG_TIME_NS
    REG("timens_offsets",  S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
#endif
    REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
    ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
    REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
    ONE("stat",       S_IRUGO, proc_tgid_stat),
    ONE("statm",      S_IRUGO, proc_pid_statm),
    REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
    REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
    REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
    LNK("cwd",        proc_cwd_link),
    LNK("root",       proc_root_link),
    LNK("exe",        proc_exe_link),
    REG("mounts",     S_IRUGO, proc_mounts_operations),
    REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
    REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
    REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
    REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
    REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
    REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
    DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
    ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
    ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
    ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
    REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
    ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
    ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
    ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
    ONE("oom_score",  S_IRUGO, proc_oom_score),
    REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
    REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
    REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
    REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
    REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
    REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
    REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
    ONE("io",   S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
    REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
    REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
    REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
    REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
    REG("timers",     S_IRUGO, proc_timers_operations),
#endif
    REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
    ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_STACKLEAK_METRICS
    ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
    ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
    ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
    ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
#endif
};
3336 
3337 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
3338 {
3339     return proc_pident_readdir(file, ctx,
3340                    tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3341 }
3342 
/* File operations for the /proc/<tgid> directory itself. */
static const struct file_operations proc_tgid_base_operations = {
    .read       = generic_read_dir,
    .iterate_shared = proc_tgid_base_readdir,
    .llseek     = generic_file_llseek,
};
3348 
3349 struct pid *tgid_pidfd_to_pid(const struct file *file)
3350 {
3351     if (file->f_op != &proc_tgid_base_operations)
3352         return ERR_PTR(-EBADF);
3353 
3354     return proc_pid(file_inode(file));
3355 }
3356 
3357 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3358 {
3359     return proc_pident_lookup(dir, dentry,
3360                   tgid_base_stuff,
3361                   tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
3362 }
3363 
/* Inode operations for the /proc/<tgid> directory itself. */
static const struct inode_operations proc_tgid_base_inode_operations = {
    .lookup     = proc_tgid_base_lookup,
    .getattr    = pid_getattr,
    .setattr    = proc_setattr,
    .permission = proc_pid_permission,
};
3370 
3371 /**
3372  * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
3373  * @pid: pid that should be flushed.
3374  *
3375  * This function walks a list of inodes (that belong to any proc
3376  * filesystem) that are attached to the pid and flushes them from
3377  * the dentry cache.
3378  *
3379  * It is safe and reasonable to cache /proc entries for a task until
3380  * that task exits.  After that they just clog up the dcache with
3381  * useless entries, possibly causing useful dcache entries to be
3382  * flushed instead.  This routine is provided to flush those useless
3383  * dcache entries when a process is reaped.
3384  *
3385  * NOTE: This routine is just an optimization so it does not guarantee
3386  *       that no dcache entries will exist after a process is reaped
3387  *       it just makes it very unlikely that any will persist.
3388  */
3389 
void proc_flush_pid(struct pid *pid)
{
    /* pid->inodes chains every proc inode created for this pid. */
    proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}
3394 
3395 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
3396                    struct task_struct *task, const void *ptr)
3397 {
3398     struct inode *inode;
3399 
3400     inode = proc_pid_make_base_inode(dentry->d_sb, task,
3401                      S_IFDIR | S_IRUGO | S_IXUGO);
3402     if (!inode)
3403         return ERR_PTR(-ENOENT);
3404 
3405     inode->i_op = &proc_tgid_base_inode_operations;
3406     inode->i_fop = &proc_tgid_base_operations;
3407     inode->i_flags|=S_IMMUTABLE;
3408 
3409     set_nlink(inode, nlink_tgid);
3410     pid_update_inode(task, inode);
3411 
3412     d_set_d_op(dentry, &pid_dentry_operations);
3413     return d_splice_alias(inode, dentry);
3414 }
3415 
3416 struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
3417 {
3418     struct task_struct *task;
3419     unsigned tgid;
3420     struct proc_fs_info *fs_info;
3421     struct pid_namespace *ns;
3422     struct dentry *result = ERR_PTR(-ENOENT);
3423 
3424     tgid = name_to_int(&dentry->d_name);
3425     if (tgid == ~0U)
3426         goto out;
3427 
3428     fs_info = proc_sb_info(dentry->d_sb);
3429     ns = fs_info->pid_ns;
3430     rcu_read_lock();
3431     task = find_task_by_pid_ns(tgid, ns);
3432     if (task)
3433         get_task_struct(task);
3434     rcu_read_unlock();
3435     if (!task)
3436         goto out;
3437 
3438     /* Limit procfs to only ptraceable tasks */
3439     if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
3440         if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
3441             goto out_put_task;
3442     }
3443 
3444     result = proc_pid_instantiate(dentry, task, NULL);
3445 out_put_task:
3446     put_task_struct(task);
3447 out:
3448     return result;
3449 }
3450 
3451 /*
3452  * Find the first task with tgid >= tgid
3453  *
3454  */
3455 struct tgid_iter {
3456     unsigned int tgid;
3457     struct task_struct *task;
3458 };
/*
 * Advance @iter to the next thread-group leader with tgid >= iter.tgid.
 * Drops the reference held on the previous task (if any) and takes one
 * on the returned task; iter.task is NULL when the scan is exhausted.
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
    struct pid *pid;

    if (iter.task)
        put_task_struct(iter.task);
    rcu_read_lock();
retry:
    iter.task = NULL;
    pid = find_ge_pid(iter.tgid, ns);
    if (pid) {
        iter.tgid = pid_nr_ns(pid, ns);
        /* NULL when this pid is not a thread-group leader: skip it. */
        iter.task = pid_task(pid, PIDTYPE_TGID);
        if (!iter.task) {
            iter.tgid += 1;
            goto retry;
        }
        get_task_struct(iter.task);
    }
    rcu_read_unlock();
    return iter;
}
3481 
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)

/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
    struct tgid_iter iter;
    struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
    struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
    loff_t pos = ctx->pos;

    if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
        return 0;

    /* The two fixed symlinks "self" and "thread-self" come first. */
    if (pos == TGID_OFFSET - 2) {
        struct inode *inode = d_inode(fs_info->proc_self);
        if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
            return 0;
        ctx->pos = pos = pos + 1;
    }
    if (pos == TGID_OFFSET - 1) {
        struct inode *inode = d_inode(fs_info->proc_thread_self);
        if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
            return 0;
        ctx->pos = pos = pos + 1;
    }
    /* Remaining directory positions map onto tgids shifted by TGID_OFFSET. */
    iter.tgid = pos - TGID_OFFSET;
    iter.task = NULL;
    for (iter = next_tgid(ns, iter);
         iter.task;
         iter.tgid += 1, iter = next_tgid(ns, iter)) {
        char name[10 + 1];
        unsigned int len;

        cond_resched();
        /* Honour the hidepid= mount option. */
        if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
            continue;

        len = snprintf(name, sizeof(name), "%u", iter.tgid);
        ctx->pos = iter.tgid + TGID_OFFSET;
        if (!proc_fill_cache(file, ctx, name, len,
                     proc_pid_instantiate, iter.task, NULL)) {
            /* Buffer full: drop the iterator's task reference and stop. */
            put_task_struct(iter.task);
            return 0;
        }
    }
    ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
    return 0;
}
3530 
3531 /*
3532  * proc_tid_comm_permission is a special permission function exclusively
3533  * used for the node /proc/<pid>/task/<tid>/comm.
3534  * It bypasses generic permission checks in the case where a task of the same
3535  * task group attempts to access the node.
3536  * The rationale behind this is that glibc and bionic access this node for
3537  * cross thread naming (pthread_set/getname_np(!self)). However, if
3538  * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
3539  * which locks out the cross thread naming implementation.
3540  * This function makes sure that the node is always accessible for members of
3541  * same thread group.
3542  */
3543 static int proc_tid_comm_permission(struct user_namespace *mnt_userns,
3544                     struct inode *inode, int mask)
3545 {
3546     bool is_same_tgroup;
3547     struct task_struct *task;
3548 
3549     task = get_proc_task(inode);
3550     if (!task)
3551         return -ESRCH;
3552     is_same_tgroup = same_thread_group(current, task);
3553     put_task_struct(task);
3554 
3555     if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
3556         /* This file (/proc/<pid>/task/<tid>/comm) can always be
3557          * read or written by the members of the corresponding
3558          * thread group.
3559          */
3560         return 0;
3561     }
3562 
3563     return generic_permission(&init_user_ns, inode, mask);
3564 }
3565 
/* Inode operations for comm; only the special permission hook above. */
static const struct inode_operations proc_tid_comm_inode_operations = {
        .permission = proc_tid_comm_permission,
};
3569 
3570 /*
3571  * Tasks
3572  */
/*
 * Static directory entries for /proc/<pid>/task/<tid> (the per-thread
 * level).  Looked up via proc_tid_base_lookup() and listed by
 * proc_tid_base_readdir(); also used to size nlink_tid in
 * set_proc_pid_nlink().
 */
static const struct pid_entry tid_base_stuff[] = {
    DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
    DIR("fdinfo",    S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
    DIR("ns",    S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
    DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
    REG("environ",   S_IRUSR, proc_environ_operations),
    REG("auxv",      S_IRUSR, proc_auxv_operations),
    ONE("status",    S_IRUGO, proc_pid_status),
    ONE("personality", S_IRUSR, proc_pid_personality),
    ONE("limits",    S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
    REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
    NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
             &proc_tid_comm_inode_operations,
             &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
    ONE("syscall",   S_IRUSR, proc_pid_syscall),
#endif
    REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
    ONE("stat",      S_IRUGO, proc_tid_stat),
    ONE("statm",     S_IRUGO, proc_pid_statm),
    REG("maps",      S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
    REG("children",  S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
    REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
    REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
    LNK("cwd",       proc_cwd_link),
    LNK("root",      proc_root_link),
    LNK("exe",       proc_exe_link),
    REG("mounts",    S_IRUGO, proc_mounts_operations),
    REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
    REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
    REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
    REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
    REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
    DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
    ONE("wchan",     S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
    ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
    ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
    REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
    ONE("cpuset",    S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
    ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
    ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
    ONE("oom_score", S_IRUGO, proc_oom_score),
    REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
    REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
    REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
    REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
    REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
    REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
    ONE("io",   S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
    REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
    REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
    REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
    REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
    ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
    ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
    ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
    ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
#endif
};
3673 
3674 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3675 {
3676     return proc_pident_readdir(file, ctx,
3677                    tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3678 }
3679 
3680 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3681 {
3682     return proc_pident_lookup(dir, dentry,
3683                   tid_base_stuff,
3684                   tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
3685 }
3686 
/* File operations for the /proc/<pid>/task/<tid> directory itself. */
static const struct file_operations proc_tid_base_operations = {
    .read       = generic_read_dir,
    .iterate_shared = proc_tid_base_readdir,
    .llseek     = generic_file_llseek,
};
3692 
/* Inode operations for the /proc/<pid>/task/<tid> directory itself. */
static const struct inode_operations proc_tid_base_inode_operations = {
    .lookup     = proc_tid_base_lookup,
    .getattr    = pid_getattr,
    .setattr    = proc_setattr,
};
3698 
3699 static struct dentry *proc_task_instantiate(struct dentry *dentry,
3700     struct task_struct *task, const void *ptr)
3701 {
3702     struct inode *inode;
3703     inode = proc_pid_make_base_inode(dentry->d_sb, task,
3704                      S_IFDIR | S_IRUGO | S_IXUGO);
3705     if (!inode)
3706         return ERR_PTR(-ENOENT);
3707 
3708     inode->i_op = &proc_tid_base_inode_operations;
3709     inode->i_fop = &proc_tid_base_operations;
3710     inode->i_flags |= S_IMMUTABLE;
3711 
3712     set_nlink(inode, nlink_tid);
3713     pid_update_inode(task, inode);
3714 
3715     d_set_d_op(dentry, &pid_dentry_operations);
3716     return d_splice_alias(inode, dentry);
3717 }
3718 
3719 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3720 {
3721     struct task_struct *task;
3722     struct task_struct *leader = get_proc_task(dir);
3723     unsigned tid;
3724     struct proc_fs_info *fs_info;
3725     struct pid_namespace *ns;
3726     struct dentry *result = ERR_PTR(-ENOENT);
3727 
3728     if (!leader)
3729         goto out_no_task;
3730 
3731     tid = name_to_int(&dentry->d_name);
3732     if (tid == ~0U)
3733         goto out;
3734 
3735     fs_info = proc_sb_info(dentry->d_sb);
3736     ns = fs_info->pid_ns;
3737     rcu_read_lock();
3738     task = find_task_by_pid_ns(tid, ns);
3739     if (task)
3740         get_task_struct(task);
3741     rcu_read_unlock();
3742     if (!task)
3743         goto out;
3744     if (!same_thread_group(leader, task))
3745         goto out_drop_task;
3746 
3747     result = proc_task_instantiate(dentry, task, NULL);
3748 out_drop_task:
3749     put_task_struct(task);
3750 out:
3751     put_task_struct(leader);
3752 out_no_task:
3753     return result;
3754 }
3755 
3756 /*
3757  * Find the first tid of a thread group to return to user space.
3758  *
3759  * Usually this is just the thread group leader, but if the users
3760  * buffer was too small or there was a seek into the middle of the
3761  * directory we have more work todo.
3762  *
3763  * In the case of a short read we start with find_task_by_pid.
3764  *
3765  * In the case of a seek we start with the leader and walk nr
3766  * threads past it.
3767  */
3768 static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3769                     struct pid_namespace *ns)
3770 {
3771     struct task_struct *pos, *task;
3772     unsigned long nr = f_pos;
3773 
3774     if (nr != f_pos)    /* 32bit overflow? */
3775         return NULL;
3776 
3777     rcu_read_lock();
3778     task = pid_task(pid, PIDTYPE_PID);
3779     if (!task)
3780         goto fail;
3781 
3782     /* Attempt to start with the tid of a thread */
3783     if (tid && nr) {
3784         pos = find_task_by_pid_ns(tid, ns);
3785         if (pos && same_thread_group(pos, task))
3786             goto found;
3787     }
3788 
3789     /* If nr exceeds the number of threads there is nothing todo */
3790     if (nr >= get_nr_threads(task))
3791         goto fail;
3792 
3793     /* If we haven't found our starting place yet start
3794      * with the leader and walk nr threads forward.
3795      */
3796     pos = task = task->group_leader;
3797     do {
3798         if (!nr--)
3799             goto found;
3800     } while_each_thread(task, pos);
3801 fail:
3802     pos = NULL;
3803     goto out;
3804 found:
3805     get_task_struct(pos);
3806 out:
3807     rcu_read_unlock();
3808     return pos;
3809 }
3810 
3811 /*
3812  * Find the next thread in the thread list.
3813  * Return NULL if there is an error or no next thread.
3814  *
3815  * The reference to the input task_struct is released.
3816  */
3817 static struct task_struct *next_tid(struct task_struct *start)
3818 {
3819     struct task_struct *pos = NULL;
3820     rcu_read_lock();
3821     if (pid_alive(start)) {
3822         pos = next_thread(start);
3823         if (thread_group_leader(pos))
3824             pos = NULL;
3825         else
3826             get_task_struct(pos);
3827     }
3828     rcu_read_unlock();
3829     put_task_struct(start);
3830     return pos;
3831 }
3832 
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
    struct inode *inode = file_inode(file);
    struct task_struct *task;
    struct pid_namespace *ns;
    int tid;

    if (proc_inode_is_dead(inode))
        return -ENOENT;

    if (!dir_emit_dots(file, ctx))
        return 0;

    /* f_version caches the tid value that the last readdir call couldn't
     * return. lseek aka telldir automagically resets f_version to 0.
     */
    ns = proc_pid_ns(inode->i_sb);
    tid = (int)file->f_version;
    file->f_version = 0;
    for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
         task;
         task = next_tid(task), ctx->pos++) {
        char name[10 + 1];
        unsigned int len;

        tid = task_pid_nr_ns(task, ns);
        if (!tid)
            continue;   /* The task has just exited. */
        len = snprintf(name, sizeof(name), "%u", tid);
        if (!proc_fill_cache(file, ctx, name, len,
                proc_task_instantiate, task, NULL)) {
            /* returning this tid failed, save it as the first
             * tid for the next readdir call */
            file->f_version = (u64)tid;
            put_task_struct(task);
            break;
        }
    }

    return 0;
}
3875 
3876 static int proc_task_getattr(struct user_namespace *mnt_userns,
3877                  const struct path *path, struct kstat *stat,
3878                  u32 request_mask, unsigned int query_flags)
3879 {
3880     struct inode *inode = d_inode(path->dentry);
3881     struct task_struct *p = get_proc_task(inode);
3882     generic_fillattr(&init_user_ns, inode, stat);
3883 
3884     if (p) {
3885         stat->nlink += get_nr_threads(p);
3886         put_task_struct(p);
3887     }
3888 
3889     return 0;
3890 }
3891 
/* Inode operations for the /proc/<pid>/task directory. */
static const struct inode_operations proc_task_inode_operations = {
    .lookup     = proc_task_lookup,
    .getattr    = proc_task_getattr,
    .setattr    = proc_setattr,
    .permission = proc_pid_permission,
};
3898 
/* File operations for the /proc/<pid>/task directory. */
static const struct file_operations proc_task_operations = {
    .read       = generic_read_dir,
    .iterate_shared = proc_task_readdir,
    .llseek     = generic_file_llseek,
};
3904 
3905 void __init set_proc_pid_nlink(void)
3906 {
3907     nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3908     nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3909 }