Back to home page

LXR

 
 

    


0001 /*
0002  * taskstats.c - Export per-task statistics to userland
0003  *
0004  * Copyright (C) Shailabh Nagar, IBM Corp. 2006
0005  *           (C) Balbir Singh,   IBM Corp. 2006
0006  *
0007  * This program is free software; you can redistribute it and/or modify
0008  * it under the terms of the GNU General Public License as published by
0009  * the Free Software Foundation; either version 2 of the License, or
0010  * (at your option) any later version.
0011  *
0012  * This program is distributed in the hope that it will be useful,
0013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0015  * GNU General Public License for more details.
0016  *
0017  */
0018 
0019 #include <linux/kernel.h>
0020 #include <linux/taskstats_kern.h>
0021 #include <linux/tsacct_kern.h>
0022 #include <linux/delayacct.h>
0023 #include <linux/cpumask.h>
0024 #include <linux/percpu.h>
0025 #include <linux/slab.h>
0026 #include <linux/cgroupstats.h>
0027 #include <linux/cgroup.h>
0028 #include <linux/fs.h>
0029 #include <linux/file.h>
0030 #include <linux/pid_namespace.h>
0031 #include <net/genetlink.h>
0032 #include <linux/atomic.h>
0033 
0034 /*
0035  * Maximum length of a cpumask that can be specified in
0036  * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
0037  */
0038 #define TASKSTATS_CPUMASK_MAXLEN    (100+6*NR_CPUS)
0039 
0040 static DEFINE_PER_CPU(__u32, taskstats_seqnum);
0041 static int family_registered;
0042 struct kmem_cache *taskstats_cache;
0043 
0044 static struct genl_family family;
0045 
0046 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
0047     [TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
0048     [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
0049     [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
0050     [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
0051 
0052 /*
0053  * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
0054  * Make sure they are always aligned.
0055  */
0056 static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
0057     [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
0058 };
0059 
0060 struct listener {
0061     struct list_head list;
0062     pid_t pid;
0063     char valid;
0064 };
0065 
0066 struct listener_list {
0067     struct rw_semaphore sem;
0068     struct list_head list;
0069 };
0070 static DEFINE_PER_CPU(struct listener_list, listener_array);
0071 
0072 enum actions {
0073     REGISTER,
0074     DEREGISTER,
0075     CPU_DONT_CARE
0076 };
0077 
0078 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
0079                 size_t size)
0080 {
0081     struct sk_buff *skb;
0082     void *reply;
0083 
0084     /*
0085      * If new attributes are added, please revisit this allocation
0086      */
0087     skb = genlmsg_new(size, GFP_KERNEL);
0088     if (!skb)
0089         return -ENOMEM;
0090 
0091     if (!info) {
0092         int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
0093 
0094         reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
0095     } else
0096         reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
0097     if (reply == NULL) {
0098         nlmsg_free(skb);
0099         return -EINVAL;
0100     }
0101 
0102     *skbp = skb;
0103     return 0;
0104 }
0105 
/*
 * Finalise the message in @skb and send it back to the requester
 * described by @info. Returns the result of genlmsg_reply().
 */
static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
    struct genlmsghdr *hdr = nlmsg_data(nlmsg_hdr(skb));

    genlmsg_end(skb, genlmsg_data(hdr));
    return genlmsg_reply(skb, info);
}
0118 
0119 /*
0120  * Send taskstats data in @skb to listeners registered for @cpu's exit data
0121  */
0122 static void send_cpu_listeners(struct sk_buff *skb,
0123                     struct listener_list *listeners)
0124 {
0125     struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
0126     struct listener *s, *tmp;
0127     struct sk_buff *skb_next, *skb_cur = skb;
0128     void *reply = genlmsg_data(genlhdr);
0129     int rc, delcount = 0;
0130 
0131     genlmsg_end(skb, reply);
0132 
0133     rc = 0;
0134     down_read(&listeners->sem);
0135     list_for_each_entry(s, &listeners->list, list) {
0136         skb_next = NULL;
0137         if (!list_is_last(&s->list, &listeners->list)) {
0138             skb_next = skb_clone(skb_cur, GFP_KERNEL);
0139             if (!skb_next)
0140                 break;
0141         }
0142         rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
0143         if (rc == -ECONNREFUSED) {
0144             s->valid = 0;
0145             delcount++;
0146         }
0147         skb_cur = skb_next;
0148     }
0149     up_read(&listeners->sem);
0150 
0151     if (skb_cur)
0152         nlmsg_free(skb_cur);
0153 
0154     if (!delcount)
0155         return;
0156 
0157     /* Delete invalidated entries */
0158     down_write(&listeners->sem);
0159     list_for_each_entry_safe(s, tmp, &listeners->list, list) {
0160         if (!s->valid) {
0161             list_del(&s->list);
0162             kfree(s);
0163         }
0164     }
0165     up_write(&listeners->sem);
0166 }
0167 
0168 static void fill_stats(struct user_namespace *user_ns,
0169                struct pid_namespace *pid_ns,
0170                struct task_struct *tsk, struct taskstats *stats)
0171 {
0172     memset(stats, 0, sizeof(*stats));
0173     /*
0174      * Each accounting subsystem adds calls to its functions to
0175      * fill in relevant parts of struct taskstsats as follows
0176      *
0177      *  per-task-foo(stats, tsk);
0178      */
0179 
0180     delayacct_add_tsk(stats, tsk);
0181 
0182     /* fill in basic acct fields */
0183     stats->version = TASKSTATS_VERSION;
0184     stats->nvcsw = tsk->nvcsw;
0185     stats->nivcsw = tsk->nivcsw;
0186     bacct_add_tsk(user_ns, pid_ns, stats, tsk);
0187 
0188     /* fill in extended acct fields */
0189     xacct_add_tsk(stats, tsk);
0190 }
0191 
0192 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
0193 {
0194     struct task_struct *tsk;
0195 
0196     rcu_read_lock();
0197     tsk = find_task_by_vpid(pid);
0198     if (tsk)
0199         get_task_struct(tsk);
0200     rcu_read_unlock();
0201     if (!tsk)
0202         return -ESRCH;
0203     fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
0204     put_task_struct(tsk);
0205     return 0;
0206 }
0207 
0208 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
0209 {
0210     struct task_struct *tsk, *first;
0211     unsigned long flags;
0212     int rc = -ESRCH;
0213 
0214     /*
0215      * Add additional stats from live tasks except zombie thread group
0216      * leaders who are already counted with the dead tasks
0217      */
0218     rcu_read_lock();
0219     first = find_task_by_vpid(tgid);
0220 
0221     if (!first || !lock_task_sighand(first, &flags))
0222         goto out;
0223 
0224     if (first->signal->stats)
0225         memcpy(stats, first->signal->stats, sizeof(*stats));
0226     else
0227         memset(stats, 0, sizeof(*stats));
0228 
0229     tsk = first;
0230     do {
0231         if (tsk->exit_state)
0232             continue;
0233         /*
0234          * Accounting subsystem can call its functions here to
0235          * fill in relevant parts of struct taskstsats as follows
0236          *
0237          *  per-task-foo(stats, tsk);
0238          */
0239         delayacct_add_tsk(stats, tsk);
0240 
0241         stats->nvcsw += tsk->nvcsw;
0242         stats->nivcsw += tsk->nivcsw;
0243     } while_each_thread(first, tsk);
0244 
0245     unlock_task_sighand(first, &flags);
0246     rc = 0;
0247 out:
0248     rcu_read_unlock();
0249 
0250     stats->version = TASKSTATS_VERSION;
0251     /*
0252      * Accounting subsystems can also add calls here to modify
0253      * fields of taskstats.
0254      */
0255     return rc;
0256 }
0257 
0258 static void fill_tgid_exit(struct task_struct *tsk)
0259 {
0260     unsigned long flags;
0261 
0262     spin_lock_irqsave(&tsk->sighand->siglock, flags);
0263     if (!tsk->signal->stats)
0264         goto ret;
0265 
0266     /*
0267      * Each accounting subsystem calls its functions here to
0268      * accumalate its per-task stats for tsk, into the per-tgid structure
0269      *
0270      *  per-task-foo(tsk->signal->stats, tsk);
0271      */
0272     delayacct_add_tsk(tsk->signal->stats, tsk);
0273 ret:
0274     spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
0275     return;
0276 }
0277 
0278 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
0279 {
0280     struct listener_list *listeners;
0281     struct listener *s, *tmp, *s2;
0282     unsigned int cpu;
0283     int ret = 0;
0284 
0285     if (!cpumask_subset(mask, cpu_possible_mask))
0286         return -EINVAL;
0287 
0288     if (current_user_ns() != &init_user_ns)
0289         return -EINVAL;
0290 
0291     if (task_active_pid_ns(current) != &init_pid_ns)
0292         return -EINVAL;
0293 
0294     if (isadd == REGISTER) {
0295         for_each_cpu(cpu, mask) {
0296             s = kmalloc_node(sizeof(struct listener),
0297                     GFP_KERNEL, cpu_to_node(cpu));
0298             if (!s) {
0299                 ret = -ENOMEM;
0300                 goto cleanup;
0301             }
0302             s->pid = pid;
0303             s->valid = 1;
0304 
0305             listeners = &per_cpu(listener_array, cpu);
0306             down_write(&listeners->sem);
0307             list_for_each_entry(s2, &listeners->list, list) {
0308                 if (s2->pid == pid && s2->valid)
0309                     goto exists;
0310             }
0311             list_add(&s->list, &listeners->list);
0312             s = NULL;
0313 exists:
0314             up_write(&listeners->sem);
0315             kfree(s); /* nop if NULL */
0316         }
0317         return 0;
0318     }
0319 
0320     /* Deregister or cleanup */
0321 cleanup:
0322     for_each_cpu(cpu, mask) {
0323         listeners = &per_cpu(listener_array, cpu);
0324         down_write(&listeners->sem);
0325         list_for_each_entry_safe(s, tmp, &listeners->list, list) {
0326             if (s->pid == pid) {
0327                 list_del(&s->list);
0328                 kfree(s);
0329                 break;
0330             }
0331         }
0332         up_write(&listeners->sem);
0333     }
0334     return ret;
0335 }
0336 
0337 static int parse(struct nlattr *na, struct cpumask *mask)
0338 {
0339     char *data;
0340     int len;
0341     int ret;
0342 
0343     if (na == NULL)
0344         return 1;
0345     len = nla_len(na);
0346     if (len > TASKSTATS_CPUMASK_MAXLEN)
0347         return -E2BIG;
0348     if (len < 1)
0349         return -EINVAL;
0350     data = kmalloc(len, GFP_KERNEL);
0351     if (!data)
0352         return -ENOMEM;
0353     nla_strlcpy(data, na, len);
0354     ret = cpulist_parse(data, mask);
0355     kfree(data);
0356     return ret;
0357 }
0358 
0359 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
0360 {
0361     struct nlattr *na, *ret;
0362     int aggr;
0363 
0364     aggr = (type == TASKSTATS_TYPE_PID)
0365             ? TASKSTATS_TYPE_AGGR_PID
0366             : TASKSTATS_TYPE_AGGR_TGID;
0367 
0368     na = nla_nest_start(skb, aggr);
0369     if (!na)
0370         goto err;
0371 
0372     if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
0373         nla_nest_cancel(skb, na);
0374         goto err;
0375     }
0376     ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS,
0377                 sizeof(struct taskstats), TASKSTATS_TYPE_NULL);
0378     if (!ret) {
0379         nla_nest_cancel(skb, na);
0380         goto err;
0381     }
0382     nla_nest_end(skb, na);
0383 
0384     return nla_data(ret);
0385 err:
0386     return NULL;
0387 }
0388 
0389 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
0390 {
0391     int rc = 0;
0392     struct sk_buff *rep_skb;
0393     struct cgroupstats *stats;
0394     struct nlattr *na;
0395     size_t size;
0396     u32 fd;
0397     struct fd f;
0398 
0399     na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
0400     if (!na)
0401         return -EINVAL;
0402 
0403     fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
0404     f = fdget(fd);
0405     if (!f.file)
0406         return 0;
0407 
0408     size = nla_total_size(sizeof(struct cgroupstats));
0409 
0410     rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
0411                 size);
0412     if (rc < 0)
0413         goto err;
0414 
0415     na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
0416                 sizeof(struct cgroupstats));
0417     if (na == NULL) {
0418         nlmsg_free(rep_skb);
0419         rc = -EMSGSIZE;
0420         goto err;
0421     }
0422 
0423     stats = nla_data(na);
0424     memset(stats, 0, sizeof(*stats));
0425 
0426     rc = cgroupstats_build(stats, f.file->f_path.dentry);
0427     if (rc < 0) {
0428         nlmsg_free(rep_skb);
0429         goto err;
0430     }
0431 
0432     rc = send_reply(rep_skb, info);
0433 
0434 err:
0435     fdput(f);
0436     return rc;
0437 }
0438 
0439 static int cmd_attr_register_cpumask(struct genl_info *info)
0440 {
0441     cpumask_var_t mask;
0442     int rc;
0443 
0444     if (!alloc_cpumask_var(&mask, GFP_KERNEL))
0445         return -ENOMEM;
0446     rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
0447     if (rc < 0)
0448         goto out;
0449     rc = add_del_listener(info->snd_portid, mask, REGISTER);
0450 out:
0451     free_cpumask_var(mask);
0452     return rc;
0453 }
0454 
0455 static int cmd_attr_deregister_cpumask(struct genl_info *info)
0456 {
0457     cpumask_var_t mask;
0458     int rc;
0459 
0460     if (!alloc_cpumask_var(&mask, GFP_KERNEL))
0461         return -ENOMEM;
0462     rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
0463     if (rc < 0)
0464         goto out;
0465     rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
0466 out:
0467     free_cpumask_var(mask);
0468     return rc;
0469 }
0470 
0471 static size_t taskstats_packet_size(void)
0472 {
0473     size_t size;
0474 
0475     size = nla_total_size(sizeof(u32)) +
0476         nla_total_size_64bit(sizeof(struct taskstats)) +
0477         nla_total_size(0);
0478 
0479     return size;
0480 }
0481 
0482 static int cmd_attr_pid(struct genl_info *info)
0483 {
0484     struct taskstats *stats;
0485     struct sk_buff *rep_skb;
0486     size_t size;
0487     u32 pid;
0488     int rc;
0489 
0490     size = taskstats_packet_size();
0491 
0492     rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
0493     if (rc < 0)
0494         return rc;
0495 
0496     rc = -EINVAL;
0497     pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
0498     stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
0499     if (!stats)
0500         goto err;
0501 
0502     rc = fill_stats_for_pid(pid, stats);
0503     if (rc < 0)
0504         goto err;
0505     return send_reply(rep_skb, info);
0506 err:
0507     nlmsg_free(rep_skb);
0508     return rc;
0509 }
0510 
0511 static int cmd_attr_tgid(struct genl_info *info)
0512 {
0513     struct taskstats *stats;
0514     struct sk_buff *rep_skb;
0515     size_t size;
0516     u32 tgid;
0517     int rc;
0518 
0519     size = taskstats_packet_size();
0520 
0521     rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
0522     if (rc < 0)
0523         return rc;
0524 
0525     rc = -EINVAL;
0526     tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
0527     stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
0528     if (!stats)
0529         goto err;
0530 
0531     rc = fill_stats_for_tgid(tgid, stats);
0532     if (rc < 0)
0533         goto err;
0534     return send_reply(rep_skb, info);
0535 err:
0536     nlmsg_free(rep_skb);
0537     return rc;
0538 }
0539 
0540 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
0541 {
0542     if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
0543         return cmd_attr_register_cpumask(info);
0544     else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
0545         return cmd_attr_deregister_cpumask(info);
0546     else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
0547         return cmd_attr_pid(info);
0548     else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
0549         return cmd_attr_tgid(info);
0550     else
0551         return -EINVAL;
0552 }
0553 
0554 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
0555 {
0556     struct signal_struct *sig = tsk->signal;
0557     struct taskstats *stats;
0558 
0559     if (sig->stats || thread_group_empty(tsk))
0560         goto ret;
0561 
0562     /* No problem if kmem_cache_zalloc() fails */
0563     stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
0564 
0565     spin_lock_irq(&tsk->sighand->siglock);
0566     if (!sig->stats) {
0567         sig->stats = stats;
0568         stats = NULL;
0569     }
0570     spin_unlock_irq(&tsk->sighand->siglock);
0571 
0572     if (stats)
0573         kmem_cache_free(taskstats_cache, stats);
0574 ret:
0575     return sig->stats;
0576 }
0577 
0578 /* Send pid data out on exit */
0579 void taskstats_exit(struct task_struct *tsk, int group_dead)
0580 {
0581     int rc;
0582     struct listener_list *listeners;
0583     struct taskstats *stats;
0584     struct sk_buff *rep_skb;
0585     size_t size;
0586     int is_thread_group;
0587 
0588     if (!family_registered)
0589         return;
0590 
0591     /*
0592      * Size includes space for nested attributes
0593      */
0594     size = taskstats_packet_size();
0595 
0596     is_thread_group = !!taskstats_tgid_alloc(tsk);
0597     if (is_thread_group) {
0598         /* PID + STATS + TGID + STATS */
0599         size = 2 * size;
0600         /* fill the tsk->signal->stats structure */
0601         fill_tgid_exit(tsk);
0602     }
0603 
0604     listeners = raw_cpu_ptr(&listener_array);
0605     if (list_empty(&listeners->list))
0606         return;
0607 
0608     rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
0609     if (rc < 0)
0610         return;
0611 
0612     stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
0613              task_pid_nr_ns(tsk, &init_pid_ns));
0614     if (!stats)
0615         goto err;
0616 
0617     fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
0618 
0619     /*
0620      * Doesn't matter if tsk is the leader or the last group member leaving
0621      */
0622     if (!is_thread_group || !group_dead)
0623         goto send;
0624 
0625     stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
0626              task_tgid_nr_ns(tsk, &init_pid_ns));
0627     if (!stats)
0628         goto err;
0629 
0630     memcpy(stats, tsk->signal->stats, sizeof(*stats));
0631 
0632 send:
0633     send_cpu_listeners(rep_skb, listeners);
0634     return;
0635 err:
0636     nlmsg_free(rep_skb);
0637 }
0638 
0639 static const struct genl_ops taskstats_ops[] = {
0640     {
0641         .cmd        = TASKSTATS_CMD_GET,
0642         .doit       = taskstats_user_cmd,
0643         .policy     = taskstats_cmd_get_policy,
0644         .flags      = GENL_ADMIN_PERM,
0645     },
0646     {
0647         .cmd        = CGROUPSTATS_CMD_GET,
0648         .doit       = cgroupstats_user_cmd,
0649         .policy     = cgroupstats_cmd_get_policy,
0650     },
0651 };
0652 
0653 static struct genl_family family __ro_after_init = {
0654     .name       = TASKSTATS_GENL_NAME,
0655     .version    = TASKSTATS_GENL_VERSION,
0656     .maxattr    = TASKSTATS_CMD_ATTR_MAX,
0657     .module     = THIS_MODULE,
0658     .ops        = taskstats_ops,
0659     .n_ops      = ARRAY_SIZE(taskstats_ops),
0660 };
0661 
0662 /* Needed early in initialization */
0663 void __init taskstats_init_early(void)
0664 {
0665     unsigned int i;
0666 
0667     taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
0668     for_each_possible_cpu(i) {
0669         INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
0670         init_rwsem(&(per_cpu(listener_array, i).sem));
0671     }
0672 }
0673 
0674 static int __init taskstats_init(void)
0675 {
0676     int rc;
0677 
0678     rc = genl_register_family(&family);
0679     if (rc)
0680         return rc;
0681 
0682     family_registered = 1;
0683     pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
0684     return 0;
0685 }
0686 
0687 /*
0688  * late initcall ensures initialization of statistics collection
0689  * mechanisms precedes initialization of the taskstats interface
0690  */
0691 late_initcall(taskstats_init);