/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given IDs on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages whose bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. In the worst case,
 * when all but one of the roughly 1 million possible PIDs are already
 * allocated, allocation scans 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>

#define pid_hashfn(nr, ns)  \
    hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
static struct hlist_head *pid_hash;
static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;

int pid_max = PID_MAX_DEFAULT;

#define RESERVED_PIDS       300

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

static inline int mk_pid(struct pid_namespace *pid_ns,
        struct pidmap *map, int off)
{
    return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}
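
/*
 * For illustration (assuming 4 KiB pages, so BITS_PER_PAGE == 32768):
 * bit 5 of the second pidmap page, i.e. map == &pid_ns->pidmap[1] and
 * off == 5, yields 1 * 32768 + 5 == PID 32773.  mk_pid() is simply the
 * inverse of the nr / BITS_PER_PAGE and nr & BITS_PER_PAGE_MASK split
 * that free_pidmap() below performs.
 */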

#define find_next_offset(map, off)                  \
        find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

/*
 * PID-map pages start out as NULL; they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales, at runtime, to up to 4 million PIDs.
 */
struct pid_namespace init_pid_ns = {
    .kref = {
        .refcount       = ATOMIC_INIT(2),
    },
    .pidmap = {
        [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
    },
    .last_pid = 0,
    .nr_hashed = PIDNS_HASH_ADDING,
    .level = 0,
    .child_reaper = &init_task,
    .user_ns = &init_user_ns,
    .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
    .ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */
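
/*
 * A sketch of the interleaving the irq-disabling avoids:
 *
 *    CPU 0                                CPU 1
 *    write_lock_irq(&tasklist_lock);      spin_lock(&pidmap_lock);
 *    detach_pid() -> free_pid():          <interrupt>
 *      spin_lock(&pidmap_lock); (spins)     read_lock(&tasklist_lock); (blocks)
 *
 * CPU 0 waits for the pidmap_lock held by CPU 1, while CPU 1's interrupt
 * handler waits for the tasklist_lock held by CPU 0: neither can proceed.
 */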

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

static void free_pidmap(struct upid *upid)
{
    int nr = upid->nr;
    struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
    int offset = nr & BITS_PER_PAGE_MASK;

    clear_bit(offset, map->page);
    atomic_inc(&map->nr_free);
}

/*
 * If we started walking pids at 'base', is 'a' seen before 'b'?
 */
static int pid_before(int base, int a, int b)
{
    /*
     * This is the same as saying
     *
     * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
     * and that mapping orders 'a' and 'b' with respect to 'base'.
     */
    return (unsigned)(a - base) < (unsigned)(b - base);
}
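
/*
 * Worked example (32-bit unsigned arithmetic): with base == 0xfffffff0,
 * a == 5 (the counter has wrapped) and b == 0xfffffff8,
 * (unsigned)(a - base) == 0x15 while (unsigned)(b - base) == 0x8, so
 * pid_before(base, a, b) is false: 'b' is seen before 'a' even though
 * a < b numerically.
 */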

/*
 * We might be racing with someone else trying to set pid_ns->last_pid
 * at pid allocation time (there's also a sysctl for this, but racing
 * with this one is OK, see comment in kernel/pid_namespace.c about it).
 * We want the winner to have the "later" value, because if the
 * "earlier" value prevails, then a pid may get reused immediately.
 *
 * Since pids roll over, it is not sufficient to just pick the bigger
 * value.  We have to consider where we started counting from.
 *
 * 'base' is the value of pid_ns->last_pid that we observed when
 * we started looking for a pid.
 *
 * 'pid' is the pid that we eventually found.
 */
static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
{
    int prev;
    int last_write = base;
    do {
        prev = last_write;
        last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
    } while ((prev != last_write) && (pid_before(base, last_write, pid)));
}
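
/*
 * Example of the race this resolves: suppose we observed base == 100 and
 * allocated pid == 105, while another task concurrently allocated 103 and
 * stored last_pid = 103.  Our first cmpxchg(.., 100, 105) fails and returns
 * 103; pid_before(100, 103, 105) is true, so we retry with prev == 103 and
 * succeed, leaving last_pid at the "later" value 105.
 */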

static int alloc_pidmap(struct pid_namespace *pid_ns)
{
    int i, offset, max_scan, pid, last = pid_ns->last_pid;
    struct pidmap *map;

    pid = last + 1;
    if (pid >= pid_max)
        pid = RESERVED_PIDS;
    offset = pid & BITS_PER_PAGE_MASK;
    map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
    /*
     * If last_pid points into the middle of the map->page we
     * want to scan this bitmap block twice, the second time
     * we start with offset == 0 (or RESERVED_PIDS).
     */
    max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
    for (i = 0; i <= max_scan; ++i) {
        if (unlikely(!map->page)) {
            void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
            /*
             * Free the page if someone raced with us
             * installing it:
             */
            spin_lock_irq(&pidmap_lock);
            if (!map->page) {
                map->page = page;
                page = NULL;
            }
            spin_unlock_irq(&pidmap_lock);
            kfree(page);
            if (unlikely(!map->page))
                return -ENOMEM;
        }
        if (likely(atomic_read(&map->nr_free))) {
            for ( ; ; ) {
                if (!test_and_set_bit(offset, map->page)) {
                    atomic_dec(&map->nr_free);
                    set_last_pid(pid_ns, last, pid);
                    return pid;
                }
                offset = find_next_offset(map, offset);
                if (offset >= BITS_PER_PAGE)
                    break;
                pid = mk_pid(pid_ns, map, offset);
                if (pid >= pid_max)
                    break;
            }
        }
        if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
            ++map;
            offset = 0;
        } else {
            map = &pid_ns->pidmap[0];
            offset = RESERVED_PIDS;
            if (unlikely(last == offset))
                break;
        }
        pid = mk_pid(pid_ns, map, offset);
    }
    return -EAGAIN;
}
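
/*
 * Worked example (assuming 4 KiB pages, so BITS_PER_PAGE == 32768, and the
 * default pid_max of 32768): the whole pid space fits in pidmap[0], so
 * max_scan works out to 1 and the loop visits that single page at most
 * twice -- first from offset onwards, then once more from RESERVED_PIDS
 * after wrapping.
 */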

int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
    int offset;
    struct pidmap *map, *end;

    if (last >= PID_MAX_LIMIT)
        return -1;

    offset = (last + 1) & BITS_PER_PAGE_MASK;
    map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
    end = &pid_ns->pidmap[PIDMAP_ENTRIES];
    for (; map < end; map++, offset = 0) {
        if (unlikely(!map->page))
            continue;
        offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
        if (offset < BITS_PER_PAGE)
            return mk_pid(pid_ns, map, offset);
    }
    return -1;
}

void put_pid(struct pid *pid)
{
    struct pid_namespace *ns;

    if (!pid)
        return;

    ns = pid->numbers[pid->level].ns;
    if ((atomic_read(&pid->count) == 1) ||
         atomic_dec_and_test(&pid->count)) {
        kmem_cache_free(ns->pid_cachep, pid);
        put_pid_ns(ns);
    }
}
EXPORT_SYMBOL_GPL(put_pid);

static void delayed_put_pid(struct rcu_head *rhp)
{
    struct pid *pid = container_of(rhp, struct pid, rcu);
    put_pid(pid);
}

void free_pid(struct pid *pid)
{
    /* We can be called with write_lock_irq(&tasklist_lock) held */
    int i;
    unsigned long flags;

    spin_lock_irqsave(&pidmap_lock, flags);
    for (i = 0; i <= pid->level; i++) {
        struct upid *upid = pid->numbers + i;
        struct pid_namespace *ns = upid->ns;
        hlist_del_rcu(&upid->pid_chain);
        switch(--ns->nr_hashed) {
        case 2:
        case 1:
            /* When all that is left in the pid namespace
             * is the reaper, wake up the reaper.  The reaper
             * may be sleeping in zap_pid_ns_processes().
             */
            wake_up_process(ns->child_reaper);
            break;
        case PIDNS_HASH_ADDING:
            /* Handle a fork failure of the first process */
            WARN_ON(ns->child_reaper);
            ns->nr_hashed = 0;
            /* fall through */
        case 0:
            schedule_work(&ns->proc_work);
            break;
        }
    }
    spin_unlock_irqrestore(&pidmap_lock, flags);

    for (i = 0; i <= pid->level; i++)
        free_pidmap(pid->numbers + i);

    call_rcu(&pid->rcu, delayed_put_pid);
}

struct pid *alloc_pid(struct pid_namespace *ns)
{
    struct pid *pid;
    enum pid_type type;
    int i, nr;
    struct pid_namespace *tmp;
    struct upid *upid;
    int retval = -ENOMEM;

    pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
    if (!pid)
        return ERR_PTR(retval);

    tmp = ns;
    pid->level = ns->level;
    for (i = ns->level; i >= 0; i--) {
        nr = alloc_pidmap(tmp);
        if (nr < 0) {
            retval = nr;
            goto out_free;
        }

        pid->numbers[i].nr = nr;
        pid->numbers[i].ns = tmp;
        tmp = tmp->parent;
    }

    if (unlikely(is_child_reaper(pid))) {
        if (pid_ns_prepare_proc(ns))
            goto out_free;
    }

    get_pid_ns(ns);
    atomic_set(&pid->count, 1);
    for (type = 0; type < PIDTYPE_MAX; ++type)
        INIT_HLIST_HEAD(&pid->tasks[type]);

    upid = pid->numbers + ns->level;
    spin_lock_irq(&pidmap_lock);
    if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
        goto out_unlock;
    for ( ; upid >= pid->numbers; --upid) {
        hlist_add_head_rcu(&upid->pid_chain,
                &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
        upid->ns->nr_hashed++;
    }
    spin_unlock_irq(&pidmap_lock);

    return pid;

out_unlock:
    spin_unlock_irq(&pidmap_lock);
    put_pid_ns(ns);

out_free:
    while (++i <= ns->level)
        free_pidmap(pid->numbers + i);

    kmem_cache_free(ns->pid_cachep, pid);
    return ERR_PTR(retval);
}
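
/*
 * Typical use (sketch of the caller in kernel/fork.c): copy_process()
 * allocates the struct pid for a new task roughly as
 *
 *     pid = alloc_pid(p->nsproxy->pid_ns_for_children);
 *     if (IS_ERR(pid)) {
 *         retval = PTR_ERR(pid);
 *         goto bad_fork_cleanup;
 *     }
 *
 * and later links the task to it via attach_pid().  (The label name is
 * illustrative; the real error-handling path differs in detail.)
 */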

void disable_pid_allocation(struct pid_namespace *ns)
{
    spin_lock_irq(&pidmap_lock);
    ns->nr_hashed &= ~PIDNS_HASH_ADDING;
    spin_unlock_irq(&pidmap_lock);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
    struct upid *pnr;

    hlist_for_each_entry_rcu(pnr,
            &pid_hash[pid_hashfn(nr, ns)], pid_chain)
        if (pnr->nr == nr && pnr->ns == ns)
            return container_of(pnr, struct pid,
                    numbers[ns->level]);

    return NULL;
}
EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
    return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
    struct pid_link *link = &task->pids[type];
    hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
}

static void __change_pid(struct task_struct *task, enum pid_type type,
            struct pid *new)
{
    struct pid_link *link;
    struct pid *pid;
    int tmp;

    link = &task->pids[type];
    pid = link->pid;

    hlist_del_rcu(&link->node);
    link->pid = new;

    for (tmp = PIDTYPE_MAX; --tmp >= 0; )
        if (!hlist_empty(&pid->tasks[tmp]))
            return;

    free_pid(pid);
}

void detach_pid(struct task_struct *task, enum pid_type type)
{
    __change_pid(task, type, NULL);
}

void change_pid(struct task_struct *task, enum pid_type type,
        struct pid *pid)
{
    __change_pid(task, type, pid);
    attach_pid(task, type);
}

/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
               enum pid_type type)
{
    new->pids[type].pid = old->pids[type].pid;
    hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
}

struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
    struct task_struct *result = NULL;
    if (pid) {
        struct hlist_node *first;
        first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                          lockdep_tasklist_lock_is_held());
        if (first)
            result = hlist_entry(first, struct task_struct, pids[(type)].node);
    }
    return result;
}
EXPORT_SYMBOL(pid_task);

/*
 * Must be called under rcu_read_lock().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
    RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
             "find_task_by_pid_ns() needs rcu_read_lock() protection");
    return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct task_struct *find_task_by_vpid(pid_t vnr)
{
    return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
    struct pid *pid;
    rcu_read_lock();
    if (type != PIDTYPE_PID)
        task = task->group_leader;
    pid = get_pid(rcu_dereference(task->pids[type].pid));
    rcu_read_unlock();
    return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
    struct task_struct *result;
    rcu_read_lock();
    result = pid_task(pid, type);
    if (result)
        get_task_struct(result);
    rcu_read_unlock();
    return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

struct pid *find_get_pid(pid_t nr)
{
    struct pid *pid;

    rcu_read_lock();
    pid = get_pid(find_vpid(nr));
    rcu_read_unlock();

    return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
    struct upid *upid;
    pid_t nr = 0;

    if (pid && ns->level <= pid->level) {
        upid = &pid->numbers[ns->level];
        if (upid->ns == ns)
            nr = upid->nr;
    }
    return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);
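
/*
 * Illustration (hypothetical numbers): a task living in a level-1 pid
 * namespace has two upids, e.g. numbers[0] = { 4711, &init_pid_ns } and
 * numbers[1] = { 2, &child_ns }.  pid_nr_ns(pid, &init_pid_ns) returns
 * 4711, pid_nr_ns(pid, &child_ns) returns 2, and asking about a namespace
 * deeper than pid->level (or an unrelated one at the same level) returns 0.
 */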

pid_t pid_vnr(struct pid *pid)
{
    return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
            struct pid_namespace *ns)
{
    pid_t nr = 0;

    rcu_read_lock();
    if (!ns)
        ns = task_active_pid_ns(current);
    if (likely(pid_alive(task))) {
        if (type != PIDTYPE_PID)
            task = task->group_leader;
        nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
    }
    rcu_read_unlock();

    return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
    return pid_nr_ns(task_tgid(tsk), ns);
}
EXPORT_SYMBOL(task_tgid_nr_ns);

struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
    return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr, this function is exactly the same as find_pid_ns().
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
    struct pid *pid;

    do {
        pid = find_pid_ns(nr, ns);
        if (pid)
            break;
        nr = next_pidmap(ns, nr);
    } while (nr > 0);

    return pid;
}
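
/*
 * Illustrative use (sketch; the real iterator lives in fs/proc/base.c and
 * also pins each task): a seq_file-style walk over a namespace can run,
 * under rcu_read_lock(), something like
 *
 *     pid = find_ge_pid(nr, ns);
 *     while (pid) {
 *         ... emit one entry ...
 *         pid = find_ge_pid(pid_nr_ns(pid, ns) + 1, ns);
 *     }
 *
 * relying on next_pidmap() to skip over unallocated ranges cheaply.
 */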

/*
 * The pid hash table is scaled according to the amount of memory in the
 * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
 * more.
 */
void __init pidhash_init(void)
{
    unsigned int i, pidhash_size;

    pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
                       HASH_EARLY | HASH_SMALL,
                       &pidhash_shift, NULL,
                       0, 4096);
    pidhash_size = 1U << pidhash_shift;

    for (i = 0; i < pidhash_size; i++)
        INIT_HLIST_HEAD(&pid_hash[i]);
}

void __init pidmap_init(void)
{
    /* Verify no one has done anything silly: */
    BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);

    /* bump default and minimum pid_max based on number of cpus */
    pid_max = min(pid_max_max, max_t(int, pid_max,
                PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
    pid_max_min = max_t(int, pid_max_min,
                PIDS_PER_CPU_MIN * num_possible_cpus());
    pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

    init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
    /* Reserve PID 0. We never call free_pidmap(0) */
    set_bit(0, init_pid_ns.pidmap[0].page);
    atomic_dec(&init_pid_ns.pidmap[0].nr_free);

    init_pid_ns.pid_cachep = KMEM_CACHE(pid,
            SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}