// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

struct bpf_iter_seq_task_common {
    struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
    /* The first field must be struct bpf_iter_seq_task_common.
     * This is assumed by the {init, fini}_seq_pidns() callback functions.
     */
    struct bpf_iter_seq_task_common common;
    u32 tid;
};

static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
                         u32 *tid,
                         bool skip_if_dup_files)
{
    struct task_struct *task = NULL;
    struct pid *pid;

    rcu_read_lock();
retry:
    pid = find_ge_pid(*tid, ns);
    if (pid) {
        *tid = pid_nr_ns(pid, ns);
        task = get_pid_task(pid, PIDTYPE_PID);
        if (!task) {
            ++*tid;
            goto retry;
        } else if (skip_if_dup_files && !thread_group_leader(task) &&
               task->files == task->group_leader->files) {
            put_task_struct(task);
            task = NULL;
            ++*tid;
            goto retry;
        }
    }
    rcu_read_unlock();

    return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
    struct bpf_iter_seq_task_info *info = seq->private;
    struct task_struct *task;

    task = task_seq_get_next(info->common.ns, &info->tid, false);
    if (!task)
        return NULL;

    if (*pos == 0)
        ++*pos;
    return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    struct bpf_iter_seq_task_info *info = seq->private;
    struct task_struct *task;

    ++*pos;
    ++info->tid;
    put_task_struct((struct task_struct *)v);
    task = task_seq_get_next(info->common.ns, &info->tid, false);
    if (!task)
        return NULL;

    return task;
}

struct bpf_iter__task {
    __bpf_md_ptr(struct bpf_iter_meta *, meta);
    __bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
               bool in_stop)
{
    struct bpf_iter_meta meta;
    struct bpf_iter__task ctx;
    struct bpf_prog *prog;

    meta.seq = seq;
    prog = bpf_iter_get_info(&meta, in_stop);
    if (!prog)
        return 0;

    ctx.meta = &meta;
    ctx.task = task;
    return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
    return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
    if (!v)
        (void)__task_seq_show(seq, v, true);
    else
        put_task_struct((struct task_struct *)v);
}

static const struct seq_operations task_seq_ops = {
    .start  = task_seq_start,
    .next   = task_seq_next,
    .stop   = task_seq_stop,
    .show   = task_seq_show,
};
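/*
 * Illustrative BPF-side consumer of the "task" iterator registered above;
 * a minimal sketch modeled on the kernel BPF selftests, not part of this
 * file. vmlinux.h, the SEC("iter/task") section convention, and
 * BPF_SEQ_PRINTF() come from BTF/libbpf tooling and are assumptions of
 * the sketch.
 *
 * #include <vmlinux.h>
 * #include <bpf/bpf_helpers.h>
 * #include <bpf/bpf_tracing.h>
 *
 * char _license[] SEC("license") = "GPL";
 *
 * SEC("iter/task")
 * int dump_task(struct bpf_iter__task *ctx)
 * {
 *     struct seq_file *seq = ctx->meta->seq;
 *     struct task_struct *task = ctx->task;
 *
 *     // task is NULL on the final invocation (in_stop == true)
 *     if (!task)
 *         return 0;
 *
 *     BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *     return 0;
 * }
 */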
struct bpf_iter_seq_task_file_info {
    /* The first field must be struct bpf_iter_seq_task_common.
     * This is assumed by the {init, fini}_seq_pidns() callback functions.
     */
    struct bpf_iter_seq_task_common common;
    struct task_struct *task;
    u32 tid;
    u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
    struct pid_namespace *ns = info->common.ns;
    u32 curr_tid = info->tid;
    struct task_struct *curr_task;
    unsigned int curr_fd = info->fd;

    /* If this function returns a non-NULL file object,
     * it held a reference to the task/file.
     * Otherwise, it does not hold any reference.
     */
again:
    if (info->task) {
        curr_task = info->task;
        curr_fd = info->fd;
    } else {
        curr_task = task_seq_get_next(ns, &curr_tid, true);
        if (!curr_task) {
            info->task = NULL;
            info->tid = curr_tid;
            return NULL;
        }

        /* set info->task and info->tid */
        info->task = curr_task;
        if (curr_tid == info->tid) {
            curr_fd = info->fd;
        } else {
            info->tid = curr_tid;
            curr_fd = 0;
        }
    }

    rcu_read_lock();
    for (;; curr_fd++) {
        struct file *f;
        f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
        if (!f)
            break;
        if (!get_file_rcu(f))
            continue;

        /* set info->fd */
        info->fd = curr_fd;
        rcu_read_unlock();
        return f;
    }

    /* the current task is done, go to the next task */
    rcu_read_unlock();
    put_task_struct(curr_task);
    info->task = NULL;
    info->fd = 0;
    curr_tid = ++(info->tid);
    goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
    struct bpf_iter_seq_task_file_info *info = seq->private;
    struct file *file;

    info->task = NULL;
    file = task_file_seq_get_next(info);
    if (file && *pos == 0)
        ++*pos;

    return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    struct bpf_iter_seq_task_file_info *info = seq->private;

    ++*pos;
    ++info->fd;
    fput((struct file *)v);
    return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
    __bpf_md_ptr(struct bpf_iter_meta *, meta);
    __bpf_md_ptr(struct task_struct *, task);
    u32 fd __aligned(8);
    __bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
             struct task_struct *task, u32 fd,
             struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
                bool in_stop)
{
    struct bpf_iter_seq_task_file_info *info = seq->private;
    struct bpf_iter__task_file ctx;
    struct bpf_iter_meta meta;
    struct bpf_prog *prog;

    meta.seq = seq;
    prog = bpf_iter_get_info(&meta, in_stop);
    if (!prog)
        return 0;

    ctx.meta = &meta;
    ctx.task = info->task;
    ctx.fd = info->fd;
    ctx.file = file;
    return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
    return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
    struct bpf_iter_seq_task_file_info *info = seq->private;

    if (!v) {
        (void)__task_file_seq_show(seq, v, true);
    } else {
        fput((struct file *)v);
        put_task_struct(info->task);
        info->task = NULL;
    }
}

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
    struct bpf_iter_seq_task_common *common = priv_data;

    common->ns = get_pid_ns(task_active_pid_ns(current));
    return 0;
}

static void fini_seq_pidns(void *priv_data)
{
    struct bpf_iter_seq_task_common *common = priv_data;

    put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
    .start  = task_file_seq_start,
    .next   = task_file_seq_next,
    .stop   = task_file_seq_stop,
    .show   = task_file_seq_show,
};
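/*
 * Illustrative BPF-side consumer of the "task_file" iterator; a minimal
 * sketch modeled on the kernel BPF selftests, not part of this file. It
 * assumes the same vmlinux.h/libbpf setup as the iter/task sketch above.
 *
 * SEC("iter/task_file")
 * int dump_task_file(struct bpf_iter__task_file *ctx)
 * {
 *     struct seq_file *seq = ctx->meta->seq;
 *     struct task_struct *task = ctx->task;
 *     struct file *file = ctx->file;
 *
 *     // both are NULL on the final (in_stop) invocation
 *     if (!task || !file)
 *         return 0;
 *
 *     BPF_SEQ_PRINTF(seq, "%8d %8d %lx\n", task->tgid, ctx->fd,
 *                    (long)file->f_op);
 *     return 0;
 * }
 */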
struct bpf_iter_seq_task_vma_info {
    /* The first field must be struct bpf_iter_seq_task_common.
     * This is assumed by the {init, fini}_seq_pidns() callback functions.
     */
    struct bpf_iter_seq_task_common common;
    struct task_struct *task;
    struct vm_area_struct *vma;
    u32 tid;
    unsigned long prev_vm_start;
    unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
    task_vma_iter_first_vma,   /* use mm->mmap */
    task_vma_iter_next_vma,    /* use curr_vma->vm_next */
    task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
    struct pid_namespace *ns = info->common.ns;
    enum bpf_task_vma_iter_find_op op;
    struct vm_area_struct *curr_vma;
    struct task_struct *curr_task;
    u32 curr_tid = info->tid;

    /* If this function returns a non-NULL vma, it holds a reference to
     * the task_struct, and holds read lock on vma->mm->mmap_lock.
     * If this function returns NULL, it does not hold any reference or
     * lock.
     */
    if (info->task) {
        curr_task = info->task;
        curr_vma = info->vma;
        /* In case of lock contention, drop mmap_lock to unblock
         * the writer.
         *
         * After relock, call find_vma(mm, prev_vm_end - 1) to find
         * the new vma to process.
         *
         *   +------+------+-----------+
         *   | VMA1 | VMA2 | VMA3      |
         *   +------+------+-----------+
         *   |      |      |           |
         *  4k     8k     16k         400k
         *
         * For example, curr_vma == VMA2. Before unlock, we set
         *
         *    prev_vm_start = 8k
         *    prev_vm_end   = 16k
         *
         * There are a few cases:
         *
         * 1) VMA2 is freed, but VMA3 exists.
         *
         *    find_vma() will return VMA3, just process VMA3.
         *
         * 2) VMA2 still exists.
         *
         *    find_vma() will return VMA2, process VMA2->next.
         *
         * 3) no more vma in this mm.
         *
         *    Process the next task.
         *
         * 4) find_vma() returns a different vma, VMA2'.
         *
         *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
         *         because we already covered the range;
         *    4.2) VMA2 and VMA2' cover different ranges, process
         *         VMA2'.
         */
        if (mmap_lock_is_contended(curr_task->mm)) {
            info->prev_vm_start = curr_vma->vm_start;
            info->prev_vm_end = curr_vma->vm_end;
            op = task_vma_iter_find_vma;
            mmap_read_unlock(curr_task->mm);
            if (mmap_read_lock_killable(curr_task->mm))
                goto finish;
        } else {
            op = task_vma_iter_next_vma;
        }
    } else {
again:
        curr_task = task_seq_get_next(ns, &curr_tid, true);
        if (!curr_task) {
            info->tid = curr_tid + 1;
            goto finish;
        }

        if (curr_tid != info->tid) {
            info->tid = curr_tid;
            /* new task, process the first vma */
            op = task_vma_iter_first_vma;
        } else {
            /* Found the same tid, which means the user space
             * finished data in previous buffer and read more.
             * We dropped mmap_lock before returning to user
             * space, so it is necessary to use find_vma() to
             * find the next vma to process.
             */
            op = task_vma_iter_find_vma;
        }

        if (!curr_task->mm)
            goto next_task;

        if (mmap_read_lock_killable(curr_task->mm))
            goto finish;
    }

    switch (op) {
    case task_vma_iter_first_vma:
        curr_vma = curr_task->mm->mmap;
        break;
    case task_vma_iter_next_vma:
        curr_vma = curr_vma->vm_next;
        break;
    case task_vma_iter_find_vma:
        /* We dropped mmap_lock so it is necessary to use find_vma
         * to find the next vma. This is similar to the mechanism
         * in show_smaps_rollup().
         */
        curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
        /* case 1) and 4.2) above just use curr_vma */

        /* check for case 2) or case 4.1) above */
        if (curr_vma &&
            curr_vma->vm_start == info->prev_vm_start &&
            curr_vma->vm_end == info->prev_vm_end)
            curr_vma = curr_vma->vm_next;
        break;
    }
    if (!curr_vma) {
        /* case 3) above, or case 2) 4.1) with vma->next == NULL */
        mmap_read_unlock(curr_task->mm);
        goto next_task;
    }
    info->task = curr_task;
    info->vma = curr_vma;
    return curr_vma;

next_task:
    put_task_struct(curr_task);
    info->task = NULL;
    curr_tid++;
    goto again;

finish:
    if (curr_task)
        put_task_struct(curr_task);
    info->task = NULL;
    info->vma = NULL;
    return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
    struct bpf_iter_seq_task_vma_info *info = seq->private;
    struct vm_area_struct *vma;

    vma = task_vma_seq_get_next(info);
    if (vma && *pos == 0)
        ++*pos;

    return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    struct bpf_iter_seq_task_vma_info *info = seq->private;

    ++*pos;
    return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
    __bpf_md_ptr(struct bpf_iter_meta *, meta);
    __bpf_md_ptr(struct task_struct *, task);
    __bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
             struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
    struct bpf_iter_seq_task_vma_info *info = seq->private;
    struct bpf_iter__task_vma ctx;
    struct bpf_iter_meta meta;
    struct bpf_prog *prog;

    meta.seq = seq;
    prog = bpf_iter_get_info(&meta, in_stop);
    if (!prog)
        return 0;

    ctx.meta = &meta;
    ctx.task = info->task;
    ctx.vma = info->vma;
    return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
    return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
    struct bpf_iter_seq_task_vma_info *info = seq->private;

    if (!v) {
        (void)__task_vma_seq_show(seq, true);
    } else {
        /* info->vma has not been seen by the BPF program. If the
         * user space reads more, task_vma_seq_get_next should
         * return this vma again. Set prev_vm_start to ~0UL,
         * so that we don't skip the vma returned by the next
         * find_vma() (case task_vma_iter_find_vma in
         * task_vma_seq_get_next()).
         */
        info->prev_vm_start = ~0UL;
        info->prev_vm_end = info->vma->vm_end;
        mmap_read_unlock(info->task->mm);
        put_task_struct(info->task);
        info->task = NULL;
    }
}

static const struct seq_operations task_vma_seq_ops = {
    .start  = task_vma_seq_start,
    .next   = task_vma_seq_next,
    .stop   = task_vma_seq_stop,
    .show   = task_vma_seq_show,
};
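/*
 * Illustrative BPF-side consumer of the "task_vma" iterator; a minimal
 * sketch modeled on the kernel BPF selftests, not part of this file. The
 * program runs with mmap_lock read-held, as guaranteed by
 * task_vma_seq_get_next() above.
 *
 * SEC("iter/task_vma")
 * int dump_task_vma(struct bpf_iter__task_vma *ctx)
 * {
 *     struct seq_file *seq = ctx->meta->seq;
 *     struct task_struct *task = ctx->task;
 *     struct vm_area_struct *vma = ctx->vma;
 *
 *     if (!task || !vma)
 *         return 0;
 *
 *     BPF_SEQ_PRINTF(seq, "%8d %lx-%lx\n", task->pid,
 *                    vma->vm_start, vma->vm_end);
 *     return 0;
 * }
 */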
static const struct bpf_iter_seq_info task_seq_info = {
    .seq_ops        = &task_seq_ops,
    .init_seq_private   = init_seq_pidns,
    .fini_seq_private   = fini_seq_pidns,
    .seq_priv_size      = sizeof(struct bpf_iter_seq_task_info),
};

static struct bpf_iter_reg task_reg_info = {
    .target         = "task",
    .feature        = BPF_ITER_RESCHED,
    .ctx_arg_info_size  = 1,
    .ctx_arg_info       = {
        { offsetof(struct bpf_iter__task, task),
          PTR_TO_BTF_ID_OR_NULL },
    },
    .seq_info       = &task_seq_info,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
    .seq_ops        = &task_file_seq_ops,
    .init_seq_private   = init_seq_pidns,
    .fini_seq_private   = fini_seq_pidns,
    .seq_priv_size      = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
    .target         = "task_file",
    .feature        = BPF_ITER_RESCHED,
    .ctx_arg_info_size  = 2,
    .ctx_arg_info       = {
        { offsetof(struct bpf_iter__task_file, task),
          PTR_TO_BTF_ID_OR_NULL },
        { offsetof(struct bpf_iter__task_file, file),
          PTR_TO_BTF_ID_OR_NULL },
    },
    .seq_info       = &task_file_seq_info,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
    .seq_ops        = &task_vma_seq_ops,
    .init_seq_private   = init_seq_pidns,
    .fini_seq_private   = fini_seq_pidns,
    .seq_priv_size      = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
    .target         = "task_vma",
    .feature        = BPF_ITER_RESCHED,
    .ctx_arg_info_size  = 2,
    .ctx_arg_info       = {
        { offsetof(struct bpf_iter__task_vma, task),
          PTR_TO_BTF_ID_OR_NULL },
        { offsetof(struct bpf_iter__task_vma, vma),
          PTR_TO_BTF_ID_OR_NULL },
    },
    .seq_info       = &task_vma_seq_info,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
       bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
    struct mmap_unlock_irq_work *work = NULL;
    struct vm_area_struct *vma;
    bool irq_work_busy = false;
    struct mm_struct *mm;
    int ret = -ENOENT;

    if (flags)
        return -EINVAL;

    if (!task)
        return -ENOENT;

    mm = task->mm;
    if (!mm)
        return -ENOENT;

    irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

    if (irq_work_busy || !mmap_read_trylock(mm))
        return -EBUSY;

    vma = find_vma(mm, start);

    if (vma && vma->vm_start <= start && vma->vm_end > start) {
        callback_fn((u64)(long)task, (u64)(long)vma,
                (u64)(long)callback_ctx, 0, 0);
        ret = 0;
    }
    bpf_mmap_unlock_mm(work, mm);
    return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
    .func       = bpf_find_vma,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_BTF_ID,
    .arg1_btf_id    = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
    .arg2_type  = ARG_ANYTHING,
    .arg3_type  = ARG_PTR_TO_FUNC,
    .arg4_type  = ARG_PTR_TO_STACK_OR_NULL,
    .arg5_type  = ARG_ANYTHING,
};
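/*
 * Illustrative use of the bpf_find_vma() helper from a BPF program; a
 * minimal sketch modeled on the kernel BPF selftests, not part of this
 * file. "addr_of_interest" and "handle_pe" are hypothetical names; user
 * space would set addr_of_interest before the program runs.
 *
 * const volatile __u64 addr_of_interest;
 *
 * // callback runs under mmap_lock if a vma containing the address exists
 * static long check_vma(struct task_struct *task,
 *                       struct vm_area_struct *vma, void *data)
 * {
 *     // e.g. record vma->vm_start and vma->vm_end into *data
 *     return 0;
 * }
 *
 * SEC("perf_event")
 * int handle_pe(void *ctx)
 * {
 *     struct task_struct *task = bpf_get_current_task_btf();
 *     long unused = 0;
 *
 *     // returns -EBUSY if mmap_lock cannot be taken, see above
 *     bpf_find_vma(task, addr_of_interest, check_vma, &unused, 0);
 *     return 0;
 * }
 */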
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
    struct mmap_unlock_irq_work *work;

    if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
        return;

    work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
    mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
    struct mmap_unlock_irq_work *work;
    int ret, cpu;

    for_each_possible_cpu(cpu) {
        work = per_cpu_ptr(&mmap_unlock_work, cpu);
        init_irq_work(&work->irq_work, do_mmap_read_unlock);
    }

    task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
    ret = bpf_iter_reg_target(&task_reg_info);
    if (ret)
        return ret;

    task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
    task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
    ret = bpf_iter_reg_target(&task_file_reg_info);
    if (ret)
        return ret;

    task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
    task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
    return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
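/*
 * User-space sketch of driving one of the iterators registered above with
 * libbpf; hypothetical names, assuming a skeleton-loaded BPF object whose
 * program uses one of the SEC("iter/...") types shown earlier. Each read()
 * on the iterator fd runs the corresponding seq_ops start/show/next/stop
 * cycle in this file.
 *
 * #include <stdio.h>
 * #include <unistd.h>
 * #include <bpf/bpf.h>
 * #include <bpf/libbpf.h>
 *
 * static int read_iter(struct bpf_program *prog)
 * {
 *     struct bpf_link *link;
 *     char buf[4096];
 *     ssize_t n;
 *     int iter_fd;
 *
 *     link = bpf_program__attach_iter(prog, NULL);
 *     if (!link)
 *         return -1;
 *
 *     iter_fd = bpf_iter_create(bpf_link__fd(link));
 *     if (iter_fd < 0) {
 *         bpf_link__destroy(link);
 *         return -1;
 *     }
 *
 *     while ((n = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
 *         buf[n] = '\0';
 *         fputs(buf, stdout);
 *     }
 *
 *     close(iter_fd);
 *     bpf_link__destroy(link);
 *     return 0;
 * }
 */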