/*
 * SLOB Allocator: Simple List Of Blocks
 *
 * Matt Mackall <mpm@selenic.com> 12/30/03
 *
 * NUMA support by Paul Mundt, 2007.
 *
 * How SLOB works:
 *
 * The core of SLOB is a traditional K&R style heap allocator, with
 * support for returning aligned objects. The granularity of this
 * allocator is as little as 2 bytes; however, most architectures will
 * typically require 4 bytes on 32-bit and 8 bytes on 64-bit.
 *
 * The slob heap is a set of linked lists of pages from alloc_pages(),
 * and within each page, there is a singly-linked list of free blocks
 * (slob_t). The heap is grown on demand. To reduce fragmentation,
 * heap pages are segregated into three lists, for objects less than
 * 256 bytes, objects less than 1024 bytes, and all other objects.
 *
 * Allocation from the heap involves first searching for a page with
 * sufficient free blocks (using a next-fit-like approach) followed by
 * a first-fit scan of the page. Deallocation inserts objects back
 * into the free list in address order, so this is effectively an
 * address-ordered first fit.
 *
 * Above this is an implementation of kmalloc/kfree. Blocks returned
 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
 * alloc_pages() directly, allocating compound pages so the page order
 * does not have to be separately tracked.
 * These objects are detected in kfree() because PageSlab()
 * is false for them.
 *
 * SLAB is emulated on top of SLOB by simply calling constructors and
 * destructors for every SLAB allocation. Objects are returned with
 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
 * case the low-level allocator will fragment blocks to create the proper
 * alignment. Again, objects of page-size or greater are allocated by
 * calling alloc_pages(). As SLAB objects know their size, no separate
 * size bookkeeping is necessary and there is essentially no allocation
 * space overhead, and compound pages aren't needed for multi-page
 * allocations.
 *
 * NUMA support in SLOB is fairly simplistic, pushing most of the real
 * logic down to the page allocator, and simply doing the node accounting
 * on the upper levels. In the event that a node id is explicitly
 * provided, __alloc_pages_node() with the specified node id is used
 * instead. The common case (or when the node id isn't explicitly provided)
 * will default to the current node, as per numa_node_id().
 *
 * Node aware pages are still inserted into the global freelist, and
 * these are scanned for by matching against the node id encoded in the
 * page flags. As a result, block allocations that can be satisfied from
 * the freelist are only done from pages residing on the same node,
 * in order to prevent random node placement.
 */
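/*
 * kmalloc() block layout, as implemented in __do_kmalloc_node() below:
 * a header of 'align' bytes (the kmalloc minimum alignment) precedes the
 * object, with the requested size recorded at its start so that kfree()
 * and ksize() can recover it.
 *
 *   |<-- align bytes -->|<-------- size bytes -------->|
 *   +-------------------+------------------------------+
 *   | size |  padding   |            object            |
 *   +-------------------+------------------------------+
 *   ^ m                 ^ pointer returned to the caller
 */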

#include <linux/kernel.h>
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kmemleak.h>

#include <trace/events/kmem.h>

#include <linux/atomic.h>

#include "slab.h"
/*
 * slob_block has a field 'units', which indicates size of block if +ve,
 * or offset of next block if -ve (in SLOB_UNITs).
 *
 * Free blocks of size 1 unit simply contain the offset of the next block.
 * Those with larger size contain their size in the first SLOB_UNIT of
 * memory, and the offset of the next free block in the second SLOB_UNIT.
 */
#if PAGE_SIZE <= (32767 * 2)
typedef s16 slobidx_t;
#else
typedef s32 slobidx_t;
#endif

struct slob_block {
	slobidx_t units;
};
typedef struct slob_block slob_t;

/*
 * All partially free slob pages go on these lists.
 */
#define SLOB_BREAK1 256
#define SLOB_BREAK2 1024
static LIST_HEAD(free_slob_small);
static LIST_HEAD(free_slob_medium);
static LIST_HEAD(free_slob_large);

/*
 * slob_page_free: true for pages on free_slob_pages list.
 */
static inline int slob_page_free(struct page *sp)
{
	return PageSlobFree(sp);
}

static void set_slob_page_free(struct page *sp, struct list_head *list)
{
	list_add(&sp->lru, list);
	__SetPageSlobFree(sp);
}

static inline void clear_slob_page_free(struct page *sp)
{
	list_del(&sp->lru);
	__ClearPageSlobFree(sp);
}

#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)

/*
 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
 * the block using call_rcu.
 */
struct slob_rcu {
	struct rcu_head head;
	int size;
};

/*
 * slob_lock protects all slob allocator structures.
 */
static DEFINE_SPINLOCK(slob_lock);

/*
 * Encode the given size and next info into a free slob block s.
 */
static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t offset = next - base;

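	/*
	 * Free-list encoding: a block of more than one unit stores its size
	 * in the first unit and the offset of the next free block in the
	 * second; a single-unit block stores only the negated offset of the
	 * next free block.
	 */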
	if (size > 1) {
		s[0].units = size;
		s[1].units = offset;
	} else
		s[0].units = -offset;
}

/*
 * Return the size of a slob block.
 */
static slobidx_t slob_units(slob_t *s)
{
	if (s->units > 0)
		return s->units;
	return 1;
}

/*
 * Return the next free slob block pointer after this one.
 */
static slob_t *slob_next(slob_t *s)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t next;

	if (s[0].units < 0)
		next = -s[0].units;
	else
		next = s[1].units;
	return base+next;
}

/*
 * Returns true if s is the last free block in its page.
 */
static int slob_last(slob_t *s)
{
	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
}

static void *slob_new_pages(gfp_t gfp, int order, int node)
{
	void *page;

#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE)
		page = __alloc_pages_node(node, gfp, order);
	else
#endif
		page = alloc_pages(gfp, order);

	if (!page)
		return NULL;

	return page_address(page);
}

static void slob_free_pages(void *b, int order)
{
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += 1 << order;
	free_pages((unsigned long)b, order);
}

/*
 * Allocate a slob block within a given slob_page sp.
 */
static void *slob_page_alloc(struct page *sp, size_t size, int align)
{
	slob_t *prev, *cur, *aligned = NULL;
	int delta = 0, units = SLOB_UNITS(size);

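	/*
	 * First-fit walk of the page's free list.  When alignment is
	 * requested, 'delta' is the number of units needed to pad 'cur' up
	 * to the aligned address; that padding is split off as its own free
	 * block before the allocation is carved out.
	 */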
	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
		slobidx_t avail = slob_units(cur);

		if (align) {
			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
			delta = aligned - cur;
		}
		if (avail >= units + delta) { /* room enough? */
			slob_t *next;

			if (delta) { /* need to fragment head to align? */
				next = slob_next(cur);
				set_slob(aligned, avail - delta, next);
				set_slob(cur, delta, aligned);
				prev = cur;
				cur = aligned;
				avail = slob_units(cur);
			}

			next = slob_next(cur);
			if (avail == units) { /* exact fit? unlink. */
				if (prev)
					set_slob(prev, slob_units(prev), next);
				else
					sp->freelist = next;
			} else { /* fragment */
				if (prev)
					set_slob(prev, slob_units(prev), cur + units);
				else
					sp->freelist = cur + units;
				set_slob(cur + units, avail - units, next);
			}

			sp->units -= units;
			if (!sp->units)
				clear_slob_page_free(sp);
			return cur;
		}
		if (slob_last(cur))
			return NULL;
	}
}

/*
 * slob_alloc: entry point into the slob allocator.
 */
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
	struct page *sp;
	struct list_head *prev;
	struct list_head *slob_list;
	slob_t *b = NULL;
	unsigned long flags;

	if (size < SLOB_BREAK1)
		slob_list = &free_slob_small;
	else if (size < SLOB_BREAK2)
		slob_list = &free_slob_medium;
	else
		slob_list = &free_slob_large;

	spin_lock_irqsave(&slob_lock, flags);
	/* Iterate through each partially free page, try to find room */
	list_for_each_entry(sp, slob_list, lru) {
#ifdef CONFIG_NUMA
		/*
		 * If there's a node specification, search for a partial
		 * page with a matching node id in the freelist.
		 */
		if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
			continue;
#endif
		/* Enough room on this page? */
		if (sp->units < SLOB_UNITS(size))
			continue;

		/* Attempt to alloc */
		prev = sp->lru.prev;
		b = slob_page_alloc(sp, size, align);
		if (!b)
			continue;

		/* Improve fragment distribution and reduce our average
		 * search time by starting our next search here. (see
		 * Knuth vol 1, sec 2.5, pg 449) */
		if (prev != slob_list->prev &&
				slob_list->next != prev->next)
			list_move_tail(slob_list, prev->next);
		break;
	}
	spin_unlock_irqrestore(&slob_lock, flags);

	/* Not enough space: must allocate a new page */
	if (!b) {
		b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
		if (!b)
			return NULL;
		sp = virt_to_page(b);
		__SetPageSlab(sp);

		spin_lock_irqsave(&slob_lock, flags);
		sp->units = SLOB_UNITS(PAGE_SIZE);
		sp->freelist = b;
		INIT_LIST_HEAD(&sp->lru);
		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
		set_slob_page_free(sp, slob_list);
		b = slob_page_alloc(sp, size, align);
		BUG_ON(!b);
		spin_unlock_irqrestore(&slob_lock, flags);
	}
	if (unlikely((gfp & __GFP_ZERO) && b))
		memset(b, 0, size);
	return b;
}

/*
 * slob_free: entry point into the slob allocator.
 */
static void slob_free(void *block, int size)
{
	struct page *sp;
	slob_t *prev, *next, *b = (slob_t *)block;
	slobidx_t units;
	unsigned long flags;
	struct list_head *slob_list;

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	BUG_ON(!size);

	sp = virt_to_page(block);
	units = SLOB_UNITS(size);

	spin_lock_irqsave(&slob_lock, flags);

	if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
		/* Go directly to page allocator. Do not pass slob allocator */
		if (slob_page_free(sp))
			clear_slob_page_free(sp);
		spin_unlock_irqrestore(&slob_lock, flags);
		__ClearPageSlab(sp);
		page_mapcount_reset(sp);
		slob_free_pages(b, 0);
		return;
	}

	if (!slob_page_free(sp)) {
		/* This slob page is about to become partially free. Easy! */
		sp->units = units;
		sp->freelist = b;
		set_slob(b, units,
			(void *)((unsigned long)(b +
					SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
		if (size < SLOB_BREAK1)
			slob_list = &free_slob_small;
		else if (size < SLOB_BREAK2)
			slob_list = &free_slob_medium;
		else
			slob_list = &free_slob_large;
		set_slob_page_free(sp, slob_list);
		goto out;
	}

	/*
	 * Otherwise the page is already partially free, so find reinsertion
	 * point.
	 */
	sp->units += units;

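	/*
	 * The freed block sits before the current free-list head: make it
	 * the new head, coalescing with the old head if the two blocks are
	 * contiguous.
	 */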
	if (b < (slob_t *)sp->freelist) {
		if (b + units == sp->freelist) {
			units += slob_units(sp->freelist);
			sp->freelist = slob_next(sp->freelist);
		}
		set_slob(b, units, sp->freelist);
		sp->freelist = b;
	} else {
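		/*
		 * Walk the address-ordered free list to find the insertion
		 * point, then coalesce with the following and preceding
		 * blocks where they are contiguous.
		 */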
		prev = sp->freelist;
		next = slob_next(prev);
		while (b > next) {
			prev = next;
			next = slob_next(prev);
		}

		if (!slob_last(prev) && b + units == next) {
			units += slob_units(next);
			set_slob(b, units, slob_next(next));
		} else
			set_slob(b, units, next);

		if (prev + slob_units(prev) == b) {
			units = slob_units(b) + slob_units(prev);
			set_slob(prev, units, slob_next(b));
		} else
			set_slob(prev, slob_units(prev), b);
	}
out:
	spin_unlock_irqrestore(&slob_lock, flags);
}

/*
 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
 */

static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
	unsigned int *m;
	int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
	void *ret;

	gfp &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp);

	if (size < PAGE_SIZE - align) {
		if (!size)
			return ZERO_SIZE_PTR;

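		/*
		 * Reserve an extra 'align' bytes in front of the object and
		 * record the requested size there, so that kfree() and
		 * ksize() can recover it later.
		 */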
		m = slob_alloc(size + align, gfp, align, node);

		if (!m)
			return NULL;
		*m = size;
		ret = (void *)m + align;

		trace_kmalloc_node(caller, ret,
				   size, size + align, gfp, node);
	} else {
		unsigned int order = get_order(size);

		if (likely(order))
			gfp |= __GFP_COMP;
		ret = slob_new_pages(gfp, order, node);

		trace_kmalloc_node(caller, ret,
				   size, PAGE_SIZE << order, gfp, node);
	}

	kmemleak_alloc(ret, size, 1, gfp);
	return ret;
}

void *__kmalloc(size_t size, gfp_t gfp)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);

void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
}

#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
					int node, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, node, caller);
}
#endif

void kfree(const void *block)
{
	struct page *sp;

	trace_kfree(_RET_IP_, block);

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	kmemleak_free(block);

	sp = virt_to_page(block);
	if (PageSlab(sp)) {
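		/*
		 * kmalloc()ed object: the requested size was stored in a
		 * header of 'align' bytes just before the returned pointer.
		 */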
		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
		unsigned int *m = (unsigned int *)(block - align);
		slob_free(m, *m + align);
	} else
		__free_pages(sp, compound_order(sp));
}
EXPORT_SYMBOL(kfree);

/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t ksize(const void *block)
{
	struct page *sp;
	int align;
	unsigned int *m;

	BUG_ON(!block);
	if (unlikely(block == ZERO_SIZE_PTR))
		return 0;

	sp = virt_to_page(block);
	if (unlikely(!PageSlab(sp)))
		return PAGE_SIZE << compound_order(sp);

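	/*
	 * Read back the size recorded in the kmalloc header and round it up
	 * to whole SLOB units.
	 */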
	align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
	m = (unsigned int *)(block - align);
	return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(ksize);

int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
{
	if (flags & SLAB_DESTROY_BY_RCU) {
		/* leave room for rcu footer at the end of object */
		c->size += sizeof(struct slob_rcu);
	}
	c->flags = flags;
	return 0;
}

static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
	void *b;

	flags &= gfp_allowed_mask;

	lockdep_trace_alloc(flags);

	if (c->size < PAGE_SIZE) {
		b = slob_alloc(c->size, flags, c->align, node);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    SLOB_UNITS(c->size) * SLOB_UNIT,
					    flags, node);
	} else {
		b = slob_new_pages(flags, get_order(c->size), node);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    PAGE_SIZE << get_order(c->size),
					    flags, node);
	}

	if (b && c->ctor)
		c->ctor(b);

	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
	return b;
}

void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc);

#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);

void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
{
	return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif

static void __kmem_cache_free(void *b, int size)
{
	if (size < PAGE_SIZE)
		slob_free(b, size);
	else
		slob_free_pages(b, get_order(size));
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
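	/* The rcu footer sits at the tail of the object; step back to its start. */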
	void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));

	__kmem_cache_free(b, slob_rcu->size);
}

void kmem_cache_free(struct kmem_cache *c, void *b)
{
	kmemleak_free_recursive(b, c->flags);
	if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
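		/*
		 * Defer the actual free until after an RCU grace period:
		 * record the object size in the rcu footer at its tail and
		 * hand it to kmem_rcu_free().
		 */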
		struct slob_rcu *slob_rcu;
		slob_rcu = b + (c->size - sizeof(struct slob_rcu));
		slob_rcu->size = c->size;
		call_rcu(&slob_rcu->head, kmem_rcu_free);
	} else {
		__kmem_cache_free(b, c->size);
	}

	trace_kmem_cache_free(_RET_IP_, b);
}
EXPORT_SYMBOL(kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
	__kmem_cache_free_bulk(s, size, p);
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
								void **p)
{
	return __kmem_cache_alloc_bulk(s, flags, size, p);
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);

int __kmem_cache_shutdown(struct kmem_cache *c)
{
	/* No way to check for remaining objects */
	return 0;
}

void __kmem_cache_release(struct kmem_cache *c)
{
}

int __kmem_cache_shrink(struct kmem_cache *d)
{
	return 0;
}

struct kmem_cache kmem_cache_boot = {
	.name = "kmem_cache",
	.size = sizeof(struct kmem_cache),
	.flags = SLAB_PANIC,
	.align = ARCH_KMALLOC_MINALIGN,
};

void __init kmem_cache_init(void)
{
	kmem_cache = &kmem_cache_boot;
	slab_state = UP;
}

void __init kmem_cache_init_late(void)
{
	slab_state = FULL;
}