the-tree/mm/slub.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * SLUB: A slab allocator that limits cache line use instead of queuing
0004  * objects in per cpu and per node lists.
0005  *
0006  * The allocator synchronizes using per slab locks or atomic operations
0007  * and only uses a centralized lock to manage a pool of partial slabs.
0008  *
0009  * (C) 2007 SGI, Christoph Lameter
0010  * (C) 2011 Linux Foundation, Christoph Lameter
0011  */
0012
0013 #include <linux/mm.h>
0014 #include <linux/swap.h> /* struct reclaim_state */
0015 #include <linux/module.h>
0016 #include <linux/bit_spinlock.h>
0017 #include <linux/interrupt.h>
0018 #include <linux/swab.h>
0019 #include <linux/bitops.h>
0020 #include <linux/slab.h>
0021 #include "slab.h"
0022 #include <linux/proc_fs.h>
0023 #include <linux/seq_file.h>
0024 #include <linux/kasan.h>
0025 #include <linux/cpu.h>
0026 #include <linux/cpuset.h>
0027 #include <linux/mempolicy.h>
0028 #include <linux/ctype.h>
0029 #include <linux/stackdepot.h>
0030 #include <linux/debugobjects.h>
0031 #include <linux/kallsyms.h>
0032 #include <linux/kfence.h>
0033 #include <linux/memory.h>
0034 #include <linux/math64.h>
0035 #include <linux/fault-inject.h>
0036 #include <linux/stacktrace.h>
0037 #include <linux/prefetch.h>
0038 #include <linux/memcontrol.h>
0039 #include <linux/random.h>
0040 #include <kunit/test.h>
0041 #include <linux/sort.h>
0042
0043 #include <linux/debugfs.h>
0044 #include <trace/events/kmem.h>
0045
0046 #include "internal.h"
0047
0048 /*
0049  * Lock order:
0050  *   1. slab_mutex (Global Mutex)
0051  *   2. node->list_lock (Spinlock)
0052  *   3. kmem_cache->cpu_slab->lock (Local lock)
0053  *   4. slab_lock(slab) (Only on some arches or for debugging)
0054  *   5. object_map_lock (Only for debugging)
0055  *
0056  *   slab_mutex
0057  *
0058  *   The role of the slab_mutex is to protect the list of all the slabs
0059  *   and to synchronize major metadata changes to slab cache structures.
0060  *   Also synchronizes memory hotplug callbacks.
0061  *
0062  *   slab_lock
0063  *
0064  *   The slab_lock is a wrapper around the page lock, thus it is a bit
0065  *   spinlock.
0066  *
0067  *   The slab_lock is only used for debugging and on arches that do not
0068  *   have the ability to do a cmpxchg_double. It only protects:
0069  *  A. slab->freelist   -> List of free objects in a slab
0070  *  B. slab->inuse      -> Number of objects in use
0071  *  C. slab->objects    -> Number of objects in slab
0072  *  D. slab->frozen     -> frozen state
0073  *
0074  *   Frozen slabs
0075  *
0076  *   If a slab is frozen then it is exempt from list management. It is not
0077  *   on any list except per cpu partial list. The processor that froze the
0078  *   slab is the one who can perform list operations on the slab. Other
0079  *   processors may put objects onto the freelist but the processor that
0080  *   froze the slab is the only one that can retrieve the objects from the
0081  *   slab's freelist.
0082  *
0083  *   list_lock
0084  *
0085  *   The list_lock protects the partial and full list on each node and
0086  *   the partial slab counter. If taken then no new slabs may be added or
0087  *   removed from the lists nor make the number of partial slabs be modified.
0088  *   (Note that the total number of slabs is an atomic value that may be
0089  *   modified without taking the list lock).
0090  *
0091  *   The list_lock is a centralized lock and thus we avoid taking it as
0092  *   much as possible. As long as SLUB does not have to handle partial
0093  *   slabs, operations can continue without any centralized lock. F.e.
0094  *   allocating a long series of objects that fill up slabs does not require
0095  *   the list lock.
0096  *
0097  *   cpu_slab->lock local lock
0098  *
0099  *   This locks protect slowpath manipulation of all kmem_cache_cpu fields
0100  *   except the stat counters. This is a percpu structure manipulated only by
0101  *   the local cpu, so the lock protects against being preempted or interrupted
0102  *   by an irq. Fast path operations rely on lockless operations instead.
0103  *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
0104  *   prevent the lockless operations), so fastpath operations also need to take
0105  *   the lock and are no longer lockless.
0106  *
0107  *   lockless fastpaths
0108  *
0109  *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
0110  *   are fully lockless when satisfied from the percpu slab (and when
0111  *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
0112  *   They also don't disable preemption or migration or irqs. They rely on
0113  *   the transaction id (tid) field to detect being preempted or moved to
0114  *   another cpu.
0115  *
0116  *   irq, preemption, migration considerations
0117  *
0118  *   Interrupts are disabled as part of list_lock or local_lock operations, or
0119  *   around the slab_lock operation, in order to make the slab allocator safe
0120  *   to use in the context of an irq.
0121  *
0122  *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
0123  *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
0124  *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
0125  *   doesn't have to be revalidated in each section protected by the local lock.
0126  *
0127  * SLUB assigns one slab for allocation to each processor.
0128  * Allocations only occur from these slabs called cpu slabs.
0129  *
0130  * Slabs with free elements are kept on a partial list and during regular
0131  * operations no list for full slabs is used. If an object in a full slab is
0132  * freed then the slab will show up again on the partial lists.
0133  * We track full slabs for debugging purposes though because otherwise we
0134  * cannot scan all objects.
0135  *
0136  * Slabs are freed when they become empty. Teardown and setup is
0137  * minimal so we rely on the page allocators per cpu caches for
0138  * fast frees and allocs.
0139  *
0140  * slab->frozen     The slab is frozen and exempt from list processing.
0141  *          This means that the slab is dedicated to a purpose
0142  *          such as satisfying allocations for a specific
0143  *          processor. Objects may be freed in the slab while
0144  *          it is frozen but slab_free will then skip the usual
0145  *          list operations. It is up to the processor holding
0146  *          the slab to integrate the slab into the slab lists
0147  *          when the slab is no longer needed.
0148  *
0149  *          One use of this flag is to mark slabs that are
0150  *          used for allocations. Then such a slab becomes a cpu
0151  *          slab. The cpu slab may be equipped with an additional
0152  *          freelist that allows lockless access to
0153  *          free objects in addition to the regular freelist
0154  *          that requires the slab lock.
0155  *
0156  * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
0157  *          options set. This moves slab handling out of
0158  *          the fast path and disables lockless freelists.
0159  */
0160
0161 /*
0162  * We could simply use migrate_disable()/enable() but as long as it's a
0163  * function call even on !PREEMPT_RT, use inline preempt_disable() there.
0164  */
0165 #ifndef CONFIG_PREEMPT_RT
0166 #define slub_get_cpu_ptr(var)   get_cpu_ptr(var)
0167 #define slub_put_cpu_ptr(var)   put_cpu_ptr(var)
0168 #else
0169 #define slub_get_cpu_ptr(var)       \
0170 ({                  \
0171     migrate_disable();      \
0172     this_cpu_ptr(var);      \
0173 })
0174 #define slub_put_cpu_ptr(var)       \
0175 do {                    \
0176     (void)(var);            \
0177     migrate_enable();       \
0178 } while (0)
0179 #endif
0180
0181 #ifdef CONFIG_SLUB_DEBUG
0182 #ifdef CONFIG_SLUB_DEBUG_ON
0183 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
0184 #else
0185 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
0186 #endif
0187 #endif      /* CONFIG_SLUB_DEBUG */
0188
0189 static inline bool kmem_cache_debug(struct kmem_cache *s)
0190 {
0191     return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
0192 }
0193
0194 void *fixup_red_left(struct kmem_cache *s, void *p)
0195 {
0196     if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
0197         p += s->red_left_pad;
0198
0199     return p;
0200 }
0201
0202 static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
0203 {
0204 #ifdef CONFIG_SLUB_CPU_PARTIAL
0205     return !kmem_cache_debug(s);
0206 #else
0207     return false;
0208 #endif
0209 }
0210
0211 /*
0212  * Issues still to be resolved:
0213  *
0214  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
0215  *
0216  * - Variable sizing of the per node arrays
0217  */
0218
0219 /* Enable to log cmpxchg failures */
0220 #undef SLUB_DEBUG_CMPXCHG
0221
0222 /*
0223  * Minimum number of partial slabs. These will be left on the partial
0224  * lists even if they are empty. kmem_cache_shrink may reclaim them.
0225  */
0226 #define MIN_PARTIAL 5
0227
0228 /*
0229  * Maximum number of desirable partial slabs.
0230  * The existence of more partial slabs makes kmem_cache_shrink
0231  * sort the partial list by the number of objects in use.
0232  */
0233 #define MAX_PARTIAL 10
0234
0235 #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
0236                 SLAB_POISON | SLAB_STORE_USER)
0237
0238 /*
0239  * These debug flags cannot use CMPXCHG because there might be consistency
0240  * issues when checking or reading debug information
0241  */
0242 #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
0243                 SLAB_TRACE)
0244
0245
0246 /*
0247  * Debugging flags that require metadata to be stored in the slab.  These get
0248  * disabled when slub_debug=O is used and a cache's min order increases with
0249  * metadata.
0250  */
0251 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
0252
0253 #define OO_SHIFT    16
0254 #define OO_MASK     ((1 << OO_SHIFT) - 1)
0255 #define MAX_OBJS_PER_PAGE   32767 /* since slab.objects is u15 */
0256
0257 /* Internal SLUB flags */
0258 /* Poison object */
0259 #define __OBJECT_POISON     ((slab_flags_t __force)0x80000000U)
0260 /* Use cmpxchg_double */
0261 #define __CMPXCHG_DOUBLE    ((slab_flags_t __force)0x40000000U)
0262
0263 /*
0264  * Tracking user of a slab.
0265  */
0266 #define TRACK_ADDRS_COUNT 16
0267 struct track {
0268     unsigned long addr; /* Called from address */
0269 #ifdef CONFIG_STACKDEPOT
0270     depot_stack_handle_t handle;
0271 #endif
0272     int cpu;        /* Was running on cpu */
0273     int pid;        /* Pid context */
0274     unsigned long when; /* When did the operation occur */
0275 };
0276
0277 enum track_item { TRACK_ALLOC, TRACK_FREE };
0278
0279 #ifdef CONFIG_SYSFS
0280 static int sysfs_slab_add(struct kmem_cache *);
0281 static int sysfs_slab_alias(struct kmem_cache *, const char *);
0282 #else
0283 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
0284 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
0285                             { return 0; }
0286 #endif
0287
0288 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
0289 static void debugfs_slab_add(struct kmem_cache *);
0290 #else
0291 static inline void debugfs_slab_add(struct kmem_cache *s) { }
0292 #endif
0293
0294 static inline void stat(const struct kmem_cache *s, enum stat_item si)
0295 {
0296 #ifdef CONFIG_SLUB_STATS
0297     /*
0298      * The rmw is racy on a preemptible kernel but this is acceptable, so
0299      * avoid this_cpu_add()'s irq-disable overhead.
0300      */
0301     raw_cpu_inc(s->cpu_slab->stat[si]);
0302 #endif
0303 }
0304
0305 /*
0306  * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
0307  * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
0308  * differ during memory hotplug/hotremove operations.
0309  * Protected by slab_mutex.
0310  */
0311 static nodemask_t slab_nodes;
0312
0313 /*
0314  * Workqueue used for flush_cpu_slab().
0315  */
0316 static struct workqueue_struct *flushwq;
0317
0318 /********************************************************************
0319  *          Core slab cache functions
0320  *******************************************************************/
0321
0322 /*
0323  * Returns freelist pointer (ptr). With hardening, this is obfuscated
0324  * with an XOR of the address where the pointer is held and a per-cache
0325  * random number.
0326  */
0327 static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
0328                  unsigned long ptr_addr)
0329 {
0330 #ifdef CONFIG_SLAB_FREELIST_HARDENED
0331     /*
0332      * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
0333      * Normally, this doesn't cause any issues, as both set_freepointer()
0334      * and get_freepointer() are called with a pointer with the same tag.
0335      * However, there are some issues with CONFIG_SLUB_DEBUG code. For
0336      * example, when __free_slub() iterates over objects in a cache, it
0337      * passes untagged pointers to check_object(). check_object() in turns
0338      * calls get_freepointer() with an untagged pointer, which causes the
0339      * freepointer to be restored incorrectly.
0340      */
0341     return (void *)((unsigned long)ptr ^ s->random ^
0342             swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
0343 #else
0344     return ptr;
0345 #endif
0346 }
0347
0348 /* Returns the freelist pointer recorded at location ptr_addr. */
0349 static inline void *freelist_dereference(const struct kmem_cache *s,
0350                      void *ptr_addr)
0351 {
0352     return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
0353                 (unsigned long)ptr_addr);
0354 }
0355
0356 static inline void *get_freepointer(struct kmem_cache *s, void *object)
0357 {
0358     object = kasan_reset_tag(object);
0359     return freelist_dereference(s, object + s->offset);
0360 }
0361
0362 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
0363 {
0364     prefetchw(object + s->offset);
0365 }
0366
0367 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
0368 {
0369     unsigned long freepointer_addr;
0370     void *p;
0371
0372     if (!debug_pagealloc_enabled_static())
0373         return get_freepointer(s, object);
0374
0375     object = kasan_reset_tag(object);
0376     freepointer_addr = (unsigned long)object + s->offset;
0377     copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
0378     return freelist_ptr(s, p, freepointer_addr);
0379 }
0380
0381 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
0382 {
0383     unsigned long freeptr_addr = (unsigned long)object + s->offset;
0384
0385 #ifdef CONFIG_SLAB_FREELIST_HARDENED
0386     BUG_ON(object == fp); /* naive detection of double free or corruption */
0387 #endif
0388
0389     freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
0390     *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
0391 }
0392
0393 /* Loop over all objects in a slab */
0394 #define for_each_object(__p, __s, __addr, __objects) \
0395     for (__p = fixup_red_left(__s, __addr); \
0396         __p < (__addr) + (__objects) * (__s)->size; \
0397         __p += (__s)->size)
0398
0399 static inline unsigned int order_objects(unsigned int order, unsigned int size)
0400 {
0401     return ((unsigned int)PAGE_SIZE << order) / size;
0402 }
0403
0404 static inline struct kmem_cache_order_objects oo_make(unsigned int order,
0405         unsigned int size)
0406 {
0407     struct kmem_cache_order_objects x = {
0408         (order << OO_SHIFT) + order_objects(order, size)
0409     };
0410
0411     return x;
0412 }
0413
0414 static inline unsigned int oo_order(struct kmem_cache_order_objects x)
0415 {
0416     return x.x >> OO_SHIFT;
0417 }
0418
0419 static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
0420 {
0421     return x.x & OO_MASK;
0422 }
0423
0424 #ifdef CONFIG_SLUB_CPU_PARTIAL
0425 static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
0426 {
0427     unsigned int nr_slabs;
0428
0429     s->cpu_partial = nr_objects;
0430
0431     /*
0432      * We take the number of objects but actually limit the number of
0433      * slabs on the per cpu partial list, in order to limit excessive
0434      * growth of the list. For simplicity we assume that the slabs will
0435      * be half-full.
0436      */
0437     nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
0438     s->cpu_partial_slabs = nr_slabs;
0439 }
0440 #else
0441 static inline void
0442 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
0443 {
0444 }
0445 #endif /* CONFIG_SLUB_CPU_PARTIAL */
0446
0447 /*
0448  * Per slab locking using the pagelock
0449  */
0450 static __always_inline void __slab_lock(struct slab *slab)
0451 {
0452     struct page *page = slab_page(slab);
0453
0454     VM_BUG_ON_PAGE(PageTail(page), page);
0455     bit_spin_lock(PG_locked, &page->flags);
0456 }
0457
0458 static __always_inline void __slab_unlock(struct slab *slab)
0459 {
0460     struct page *page = slab_page(slab);
0461
0462     VM_BUG_ON_PAGE(PageTail(page), page);
0463     __bit_spin_unlock(PG_locked, &page->flags);
0464 }
0465
0466 static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
0467 {
0468     if (IS_ENABLED(CONFIG_PREEMPT_RT))
0469         local_irq_save(*flags);
0470     __slab_lock(slab);
0471 }
0472
0473 static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
0474 {
0475     __slab_unlock(slab);
0476     if (IS_ENABLED(CONFIG_PREEMPT_RT))
0477         local_irq_restore(*flags);
0478 }
0479
0480 /*
0481  * Interrupts must be disabled (for the fallback code to work right), typically
0482  * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
0483  * so we disable interrupts as part of slab_[un]lock().
0484  */
0485 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
0486         void *freelist_old, unsigned long counters_old,
0487         void *freelist_new, unsigned long counters_new,
0488         const char *n)
0489 {
0490     if (!IS_ENABLED(CONFIG_PREEMPT_RT))
0491         lockdep_assert_irqs_disabled();
0492 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
0493     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
0494     if (s->flags & __CMPXCHG_DOUBLE) {
0495         if (cmpxchg_double(&slab->freelist, &slab->counters,
0496                    freelist_old, counters_old,
0497                    freelist_new, counters_new))
0498             return true;
0499     } else
0500 #endif
0501     {
0502         /* init to 0 to prevent spurious warnings */
0503         unsigned long flags = 0;
0504
0505         slab_lock(slab, &flags);
0506         if (slab->freelist == freelist_old &&
0507                     slab->counters == counters_old) {
0508             slab->freelist = freelist_new;
0509             slab->counters = counters_new;
0510             slab_unlock(slab, &flags);
0511             return true;
0512         }
0513         slab_unlock(slab, &flags);
0514     }
0515
0516     cpu_relax();
0517     stat(s, CMPXCHG_DOUBLE_FAIL);
0518
0519 #ifdef SLUB_DEBUG_CMPXCHG
0520     pr_info("%s %s: cmpxchg double redo ", n, s->name);
0521 #endif
0522
0523     return false;
0524 }
0525
0526 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
0527         void *freelist_old, unsigned long counters_old,
0528         void *freelist_new, unsigned long counters_new,
0529         const char *n)
0530 {
0531 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
0532     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
0533     if (s->flags & __CMPXCHG_DOUBLE) {
0534         if (cmpxchg_double(&slab->freelist, &slab->counters,
0535                    freelist_old, counters_old,
0536                    freelist_new, counters_new))
0537             return true;
0538     } else
0539 #endif
0540     {
0541         unsigned long flags;
0542
0543         local_irq_save(flags);
0544         __slab_lock(slab);
0545         if (slab->freelist == freelist_old &&
0546                     slab->counters == counters_old) {
0547             slab->freelist = freelist_new;
0548             slab->counters = counters_new;
0549             __slab_unlock(slab);
0550             local_irq_restore(flags);
0551             return true;
0552         }
0553         __slab_unlock(slab);
0554         local_irq_restore(flags);
0555     }
0556
0557     cpu_relax();
0558     stat(s, CMPXCHG_DOUBLE_FAIL);
0559
0560 #ifdef SLUB_DEBUG_CMPXCHG
0561     pr_info("%s %s: cmpxchg double redo ", n, s->name);
0562 #endif
0563
0564     return false;
0565 }
0566
0567 #ifdef CONFIG_SLUB_DEBUG
0568 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
0569 static DEFINE_RAW_SPINLOCK(object_map_lock);
0570
0571 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
0572                struct slab *slab)
0573 {
0574     void *addr = slab_address(slab);
0575     void *p;
0576
0577     bitmap_zero(obj_map, slab->objects);
0578
0579     for (p = slab->freelist; p; p = get_freepointer(s, p))
0580         set_bit(__obj_to_index(s, addr, p), obj_map);
0581 }
0582
0583 #if IS_ENABLED(CONFIG_KUNIT)
0584 static bool slab_add_kunit_errors(void)
0585 {
0586     struct kunit_resource *resource;
0587
0588     if (likely(!current->kunit_test))
0589         return false;
0590
0591     resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
0592     if (!resource)
0593         return false;
0594
0595     (*(int *)resource->data)++;
0596     kunit_put_resource(resource);
0597     return true;
0598 }
0599 #else
0600 static inline bool slab_add_kunit_errors(void) { return false; }
0601 #endif
0602
0603 /*
0604  * Determine a map of objects in use in a slab.
0605  *
0606  * Node listlock must be held to guarantee that the slab does
0607  * not vanish from under us.
0608  */
0609 static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
0610     __acquires(&object_map_lock)
0611 {
0612     VM_BUG_ON(!irqs_disabled());
0613
0614     raw_spin_lock(&object_map_lock);
0615
0616     __fill_map(object_map, s, slab);
0617
0618     return object_map;
0619 }
0620
0621 static void put_map(unsigned long *map) __releases(&object_map_lock)
0622 {
0623     VM_BUG_ON(map != object_map);
0624     raw_spin_unlock(&object_map_lock);
0625 }
0626
0627 static inline unsigned int size_from_object(struct kmem_cache *s)
0628 {
0629     if (s->flags & SLAB_RED_ZONE)
0630         return s->size - s->red_left_pad;
0631
0632     return s->size;
0633 }
0634
0635 static inline void *restore_red_left(struct kmem_cache *s, void *p)
0636 {
0637     if (s->flags & SLAB_RED_ZONE)
0638         p -= s->red_left_pad;
0639
0640     return p;
0641 }
0642
0643 /*
0644  * Debug settings:
0645  */
0646 #if defined(CONFIG_SLUB_DEBUG_ON)
0647 static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
0648 #else
0649 static slab_flags_t slub_debug;
0650 #endif
0651
0652 static char *slub_debug_string;
0653 static int disable_higher_order_debug;
0654
0655 /*
0656  * slub is about to manipulate internal object metadata.  This memory lies
0657  * outside the range of the allocated object, so accessing it would normally
0658  * be reported by kasan as a bounds error.  metadata_access_enable() is used
0659  * to tell kasan that these accesses are OK.
0660  */
0661 static inline void metadata_access_enable(void)
0662 {
0663     kasan_disable_current();
0664 }
0665
0666 static inline void metadata_access_disable(void)
0667 {
0668     kasan_enable_current();
0669 }
0670
0671 /*
0672  * Object debugging
0673  */
0674
0675 /* Verify that a pointer has an address that is valid within a slab page */
0676 static inline int check_valid_pointer(struct kmem_cache *s,
0677                 struct slab *slab, void *object)
0678 {
0679     void *base;
0680
0681     if (!object)
0682         return 1;
0683
0684     base = slab_address(slab);
0685     object = kasan_reset_tag(object);
0686     object = restore_red_left(s, object);
0687     if (object < base || object >= base + slab->objects * s->size ||
0688         (object - base) % s->size) {
0689         return 0;
0690     }
0691
0692     return 1;
0693 }
0694
0695 static void print_section(char *level, char *text, u8 *addr,
0696               unsigned int length)
0697 {
0698     metadata_access_enable();
0699     print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
0700             16, 1, kasan_reset_tag((void *)addr), length, 1);
0701     metadata_access_disable();
0702 }
0703
0704 /*
0705  * See comment in calculate_sizes().
0706  */
0707 static inline bool freeptr_outside_object(struct kmem_cache *s)
0708 {
0709     return s->offset >= s->inuse;
0710 }
0711
0712 /*
0713  * Return offset of the end of info block which is inuse + free pointer if
0714  * not overlapping with object.
0715  */
0716 static inline unsigned int get_info_end(struct kmem_cache *s)
0717 {
0718     if (freeptr_outside_object(s))
0719         return s->inuse + sizeof(void *);
0720     else
0721         return s->inuse;
0722 }
0723
0724 static struct track *get_track(struct kmem_cache *s, void *object,
0725     enum track_item alloc)
0726 {
0727     struct track *p;
0728
0729     p = object + get_info_end(s);
0730
0731     return kasan_reset_tag(p + alloc);
0732 }
0733
0734 #ifdef CONFIG_STACKDEPOT
0735 static noinline depot_stack_handle_t set_track_prepare(void)
0736 {
0737     depot_stack_handle_t handle;
0738     unsigned long entries[TRACK_ADDRS_COUNT];
0739     unsigned int nr_entries;
0740
0741     nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
0742     handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
0743
0744     return handle;
0745 }
0746 #else
0747 static inline depot_stack_handle_t set_track_prepare(void)
0748 {
0749     return 0;
0750 }
0751 #endif
0752
0753 static void set_track_update(struct kmem_cache *s, void *object,
0754                  enum track_item alloc, unsigned long addr,
0755                  depot_stack_handle_t handle)
0756 {
0757     struct track *p = get_track(s, object, alloc);
0758
0759 #ifdef CONFIG_STACKDEPOT
0760     p->handle = handle;
0761 #endif
0762     p->addr = addr;
0763     p->cpu = smp_processor_id();
0764     p->pid = current->pid;
0765     p->when = jiffies;
0766 }
0767
0768 static __always_inline void set_track(struct kmem_cache *s, void *object,
0769                       enum track_item alloc, unsigned long addr)
0770 {
0771     depot_stack_handle_t handle = set_track_prepare();
0772
0773     set_track_update(s, object, alloc, addr, handle);
0774 }
0775
0776 static void init_tracking(struct kmem_cache *s, void *object)
0777 {
0778     struct track *p;
0779
0780     if (!(s->flags & SLAB_STORE_USER))
0781         return;
0782
0783     p = get_track(s, object, TRACK_ALLOC);
0784     memset(p, 0, 2*sizeof(struct track));
0785 }
0786
0787 static void print_track(const char *s, struct track *t, unsigned long pr_time)
0788 {
0789     depot_stack_handle_t handle __maybe_unused;
0790
0791     if (!t->addr)
0792         return;
0793
0794     pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
0795            s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
0796 #ifdef CONFIG_STACKDEPOT
0797     handle = READ_ONCE(t->handle);
0798     if (handle)
0799         stack_depot_print(handle);
0800     else
0801         pr_err("object allocation/free stack trace missing\n");
0802 #endif
0803 }
0804
0805 void print_tracking(struct kmem_cache *s, void *object)
0806 {
0807     unsigned long pr_time = jiffies;
0808     if (!(s->flags & SLAB_STORE_USER))
0809         return;
0810
0811     print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
0812     print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
0813 }
0814
0815 static void print_slab_info(const struct slab *slab)
0816 {
0817     struct folio *folio = (struct folio *)slab_folio(slab);
0818
0819     pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
0820            slab, slab->objects, slab->inuse, slab->freelist,
0821            folio_flags(folio, 0));
0822 }
0823
0824 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
0825 {
0826     struct va_format vaf;
0827     va_list args;
0828
0829     va_start(args, fmt);
0830     vaf.fmt = fmt;
0831     vaf.va = &args;
0832     pr_err("=============================================================================\n");
0833     pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
0834     pr_err("-----------------------------------------------------------------------------\n\n");
0835     va_end(args);
0836 }
0837
0838 __printf(2, 3)
0839 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
0840 {
0841     struct va_format vaf;
0842     va_list args;
0843
0844     if (slab_add_kunit_errors())
0845         return;
0846
0847     va_start(args, fmt);
0848     vaf.fmt = fmt;
0849     vaf.va = &args;
0850     pr_err("FIX %s: %pV\n", s->name, &vaf);
0851     va_end(args);
0852 }
0853
0854 static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
0855 {
0856     unsigned int off;   /* Offset of last byte */
0857     u8 *addr = slab_address(slab);
0858
0859     print_tracking(s, p);
0860
0861     print_slab_info(slab);
0862
0863     pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
0864            p, p - addr, get_freepointer(s, p));
0865
0866     if (s->flags & SLAB_RED_ZONE)
0867         print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
0868                   s->red_left_pad);
0869     else if (p > addr + 16)
0870         print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
0871
0872     print_section(KERN_ERR,         "Object   ", p,
0873               min_t(unsigned int, s->object_size, PAGE_SIZE));
0874     if (s->flags & SLAB_RED_ZONE)
0875         print_section(KERN_ERR, "Redzone  ", p + s->object_size,
0876             s->inuse - s->object_size);
0877
0878     off = get_info_end(s);
0879
0880     if (s->flags & SLAB_STORE_USER)
0881         off += 2 * sizeof(struct track);
0882
0883     off += kasan_metadata_size(s);
0884
0885     if (off != size_from_object(s))
0886         /* Beginning of the filler is the free pointer */
0887         print_section(KERN_ERR, "Padding  ", p + off,
0888                   size_from_object(s) - off);
0889
0890     dump_stack();
0891 }
0892
0893 static void object_err(struct kmem_cache *s, struct slab *slab,
0894             u8 *object, char *reason)
0895 {
0896     if (slab_add_kunit_errors())
0897         return;
0898
0899     slab_bug(s, "%s", reason);
0900     print_trailer(s, slab, object);
0901     add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
0902 }
0903
0904 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
0905                    void **freelist, void *nextfree)
0906 {
0907     if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
0908         !check_valid_pointer(s, slab, nextfree) && freelist) {
0909         object_err(s, slab, *freelist, "Freechain corrupt");
0910         *freelist = NULL;
0911         slab_fix(s, "Isolate corrupted freechain");
0912         return true;
0913     }
0914
0915     return false;
0916 }
0917
0918 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
0919             const char *fmt, ...)
0920 {
0921     va_list args;
0922     char buf[100];
0923
0924     if (slab_add_kunit_errors())
0925         return;
0926
0927     va_start(args, fmt);
0928     vsnprintf(buf, sizeof(buf), fmt, args);
0929     va_end(args);
0930     slab_bug(s, "%s", buf);
0931     print_slab_info(slab);
0932     dump_stack();
0933     add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
0934 }
0935
0936 static void init_object(struct kmem_cache *s, void *object, u8 val)
0937 {
0938     u8 *p = kasan_reset_tag(object);
0939
0940     if (s->flags & SLAB_RED_ZONE)
0941         memset(p - s->red_left_pad, val, s->red_left_pad);
0942
0943     if (s->flags & __OBJECT_POISON) {
0944         memset(p, POISON_FREE, s->object_size - 1);
0945         p[s->object_size - 1] = POISON_END;
0946     }
0947
0948     if (s->flags & SLAB_RED_ZONE)
0949         memset(p + s->object_size, val, s->inuse - s->object_size);
0950 }
0951
0952 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
0953                         void *from, void *to)
0954 {
0955     slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
0956     memset(from, data, to - from);
0957 }
0958
0959 static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
0960             u8 *object, char *what,
0961             u8 *start, unsigned int value, unsigned int bytes)
0962 {
0963     u8 *fault;
0964     u8 *end;
0965     u8 *addr = slab_address(slab);
0966
0967     metadata_access_enable();
0968     fault = memchr_inv(kasan_reset_tag(start), value, bytes);
0969     metadata_access_disable();
0970     if (!fault)
0971         return 1;
0972
0973     end = start + bytes;
0974     while (end > fault && end[-1] == value)
0975         end--;
0976
0977     if (slab_add_kunit_errors())
0978         goto skip_bug_print;
0979
0980     slab_bug(s, "%s overwritten", what);
0981     pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
0982                     fault, end - 1, fault - addr,
0983                     fault[0], value);
0984     print_trailer(s, slab, object);
0985     add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
0986
0987 skip_bug_print:
0988     restore_bytes(s, what, value, fault, end);
0989     return 0;
0990 }
0991
0992 /*
0993  * Object layout:
0994  *
0995  * object address
0996  *  Bytes of the object to be managed.
0997  *  If the freepointer may overlay the object then the free
0998  *  pointer is at the middle of the object.
0999  *
1000  *  Poisoning uses 0x6b (POISON_FREE) and the last byte is
1001  *  0xa5 (POISON_END)
1002  *
1003  * object + s->object_size
1004  *  Padding to reach word boundary. This is also used for Redzoning.
1005  *  Padding is extended by another word if Redzoning is enabled and
1006  *  object_size == inuse.
1007  *
1008  *  We fill with 0xbb (RED_INACTIVE) for inactive objects and with
1009  *  0xcc (RED_ACTIVE) for objects in use.
1010  *
1011  * object + s->inuse
1012  *  Meta data starts here.
1013  *
1014  *  A. Free pointer (if we cannot overwrite object on free)
1015  *  B. Tracking data for SLAB_STORE_USER
1016  *  C. Padding to reach required alignment boundary or at minimum
1017  *      one word if debugging is on to be able to detect writes
1018  *      before the word boundary.
1019  *
1020  *  Padding is done using 0x5a (POISON_INUSE)
1021  *
1022  * object + s->size
1023  *  Nothing is used beyond s->size.
1024  *
1025  * If slabcaches are merged then the object_size and inuse boundaries are mostly
1026  * ignored. And therefore no slab options that rely on these boundaries
1027  * may be used with merged slabcaches.
1028  */
1029
1030 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
1031 {
1032     unsigned long off = get_info_end(s);    /* The end of info */
1033
1034     if (s->flags & SLAB_STORE_USER)
1035         /* We also have user information there */
1036         off += 2 * sizeof(struct track);
1037
1038     off += kasan_metadata_size(s);
1039
1040     if (size_from_object(s) == off)
1041         return 1;
1042
1043     return check_bytes_and_report(s, slab, p, "Object padding",
1044             p + off, POISON_INUSE, size_from_object(s) - off);
1045 }
1046
1047 /* Check the pad bytes at the end of a slab page */
1048 static void slab_pad_check(struct kmem_cache *s, struct slab *slab)
1049 {
1050     u8 *start;
1051     u8 *fault;
1052     u8 *end;
1053     u8 *pad;
1054     int length;
1055     int remainder;
1056
1057     if (!(s->flags & SLAB_POISON))
1058         return;
1059
1060     start = slab_address(slab);
1061     length = slab_size(slab);
1062     end = start + length;
1063     remainder = length % s->size;
1064     if (!remainder)
1065         return;
1066
1067     pad = end - remainder;
1068     metadata_access_enable();
1069     fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
1070     metadata_access_disable();
1071     if (!fault)
1072         return;
1073     while (end > fault && end[-1] == POISON_INUSE)
1074         end--;
1075
1076     slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
1077             fault, end - 1, fault - start);
1078     print_section(KERN_ERR, "Padding ", pad, remainder);
1079
1080     restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
1081 }
1082
1083 static int check_object(struct kmem_cache *s, struct slab *slab,
1084                     void *object, u8 val)
1085 {
1086     u8 *p = object;
1087     u8 *endobject = object + s->object_size;
1088
1089     if (s->flags & SLAB_RED_ZONE) {
1090         if (!check_bytes_and_report(s, slab, object, "Left Redzone",
1091             object - s->red_left_pad, val, s->red_left_pad))
1092             return 0;
1093
1094         if (!check_bytes_and_report(s, slab, object, "Right Redzone",
1095             endobject, val, s->inuse - s->object_size))
1096             return 0;
1097     } else {
1098         if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
1099             check_bytes_and_report(s, slab, p, "Alignment padding",
1100                 endobject, POISON_INUSE,
1101                 s->inuse - s->object_size);
1102         }
1103     }
1104
1105     if (s->flags & SLAB_POISON) {
1106         if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
1107             (!check_bytes_and_report(s, slab, p, "Poison", p,
1108                     POISON_FREE, s->object_size - 1) ||
1109              !check_bytes_and_report(s, slab, p, "End Poison",
1110                 p + s->object_size - 1, POISON_END, 1)))
1111             return 0;
1112         /*
1113          * check_pad_bytes cleans up on its own.
1114          */
1115         check_pad_bytes(s, slab, p);
1116     }
1117
1118     if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
1119         /*
1120          * Object and freepointer overlap. Cannot check
1121          * freepointer while object is allocated.
1122          */
1123         return 1;
1124
1125     /* Check free pointer validity */
1126     if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
1127         object_err(s, slab, p, "Freepointer corrupt");
1128         /*
1129          * No choice but to zap it and thus lose the remainder
1130          * of the free objects in this slab. May cause
1131          * another error because the object count is now wrong.
1132          */
1133         set_freepointer(s, p, NULL);
1134         return 0;
1135     }
1136     return 1;
1137 }
1138
1139 static int check_slab(struct kmem_cache *s, struct slab *slab)
1140 {
1141     int maxobj;
1142
1143     if (!folio_test_slab(slab_folio(slab))) {
1144         slab_err(s, slab, "Not a valid slab page");
1145         return 0;
1146     }
1147
1148     maxobj = order_objects(slab_order(slab), s->size);
1149     if (slab->objects > maxobj) {
1150         slab_err(s, slab, "objects %u > max %u",
1151             slab->objects, maxobj);
1152         return 0;
1153     }
1154     if (slab->inuse > slab->objects) {
1155         slab_err(s, slab, "inuse %u > max %u",
1156             slab->inuse, slab->objects);
1157         return 0;
1158     }
1159     /* Slab_pad_check fixes things up after itself */
1160     slab_pad_check(s, slab);
1161     return 1;
1162 }
1163
1164 /*
1165  * Determine if a certain object in a slab is on the freelist. Must hold the
1166  * slab lock to guarantee that the chains are in a consistent state.
1167  */
1168 static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
1169 {
1170     int nr = 0;
1171     void *fp;
1172     void *object = NULL;
1173     int max_objects;
1174
1175     fp = slab->freelist;
1176     while (fp && nr <= slab->objects) {
1177         if (fp == search)
1178             return 1;
1179         if (!check_valid_pointer(s, slab, fp)) {
1180             if (object) {
1181                 object_err(s, slab, object,
1182                     "Freechain corrupt");
1183                 set_freepointer(s, object, NULL);
1184             } else {
1185                 slab_err(s, slab, "Freepointer corrupt");
1186                 slab->freelist = NULL;
1187                 slab->inuse = slab->objects;
1188                 slab_fix(s, "Freelist cleared");
1189                 return 0;
1190             }
1191             break;
1192         }
1193         object = fp;
1194         fp = get_freepointer(s, object);
1195         nr++;
1196     }
1197
1198     max_objects = order_objects(slab_order(slab), s->size);
1199     if (max_objects > MAX_OBJS_PER_PAGE)
1200         max_objects = MAX_OBJS_PER_PAGE;
1201
1202     if (slab->objects != max_objects) {
1203         slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
1204              slab->objects, max_objects);
1205         slab->objects = max_objects;
1206         slab_fix(s, "Number of objects adjusted");
1207     }
1208     if (slab->inuse != slab->objects - nr) {
1209         slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
1210              slab->inuse, slab->objects - nr);
1211         slab->inuse = slab->objects - nr;
1212         slab_fix(s, "Object count adjusted");
1213     }
1214     return search == NULL;
1215 }
1216
1217 static void trace(struct kmem_cache *s, struct slab *slab, void *object,
1218                                 int alloc)
1219 {
1220     if (s->flags & SLAB_TRACE) {
1221         pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1222             s->name,
1223             alloc ? "alloc" : "free",
1224             object, slab->inuse,
1225             slab->freelist);
1226
1227         if (!alloc)
1228             print_section(KERN_INFO, "Object ", (void *)object,
1229                     s->object_size);
1230
1231         dump_stack();
1232     }
1233 }
1234
1235 /*
1236  * Tracking of fully allocated slabs for debugging purposes.
1237  */
1238 static void add_full(struct kmem_cache *s,
1239     struct kmem_cache_node *n, struct slab *slab)
1240 {
1241     if (!(s->flags & SLAB_STORE_USER))
1242         return;
1243
1244     lockdep_assert_held(&n->list_lock);
1245     list_add(&slab->slab_list, &n->full);
1246 }
1247
1248 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
1249 {
1250     if (!(s->flags & SLAB_STORE_USER))
1251         return;
1252
1253     lockdep_assert_held(&n->list_lock);
1254     list_del(&slab->slab_list);
1255 }
1256
1257 /* Tracking of the number of slabs for debugging purposes */
1258 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1259 {
1260     struct kmem_cache_node *n = get_node(s, node);
1261
1262     return atomic_long_read(&n->nr_slabs);
1263 }
1264
1265 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1266 {
1267     return atomic_long_read(&n->nr_slabs);
1268 }
1269
1270 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1271 {
1272     struct kmem_cache_node *n = get_node(s, node);
1273
1274     /*
1275      * May be called early in order to allocate a slab for the
1276      * kmem_cache_node structure. Solve the chicken-egg
1277      * dilemma by deferring the increment of the count during
1278      * bootstrap (see early_kmem_cache_node_alloc).
1279      */
1280     if (likely(n)) {
1281         atomic_long_inc(&n->nr_slabs);
1282         atomic_long_add(objects, &n->total_objects);
1283     }
1284 }
1285 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1286 {
1287     struct kmem_cache_node *n = get_node(s, node);
1288
1289     atomic_long_dec(&n->nr_slabs);
1290     atomic_long_sub(objects, &n->total_objects);
1291 }
1292
1293 /* Object debug checks for alloc/free paths */
1294 static void setup_object_debug(struct kmem_cache *s, void *object)
1295 {
1296     if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1297         return;
1298
1299     init_object(s, object, SLUB_RED_INACTIVE);
1300     init_tracking(s, object);
1301 }
1302
1303 static
1304 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
1305 {
1306     if (!kmem_cache_debug_flags(s, SLAB_POISON))
1307         return;
1308
1309     metadata_access_enable();
1310     memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
1311     metadata_access_disable();
1312 }
1313
1314 static inline int alloc_consistency_checks(struct kmem_cache *s,
1315                     struct slab *slab, void *object)
1316 {
1317     if (!check_slab(s, slab))
1318         return 0;
1319
1320     if (!check_valid_pointer(s, slab, object)) {
1321         object_err(s, slab, object, "Freelist Pointer check fails");
1322         return 0;
1323     }
1324
1325     if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
1326         return 0;
1327
1328     return 1;
1329 }
1330
1331 static noinline int alloc_debug_processing(struct kmem_cache *s,
1332                     struct slab *slab,
1333                     void *object, unsigned long addr)
1334 {
1335     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1336         if (!alloc_consistency_checks(s, slab, object))
1337             goto bad;
1338     }
1339
1340     /* Success perform special debug activities for allocs */
1341     if (s->flags & SLAB_STORE_USER)
1342         set_track(s, object, TRACK_ALLOC, addr);
1343     trace(s, slab, object, 1);
1344     init_object(s, object, SLUB_RED_ACTIVE);
1345     return 1;
1346
1347 bad:
1348     if (folio_test_slab(slab_folio(slab))) {
1349         /*
1350          * If this is a slab page then lets do the best we can
1351          * to avoid issues in the future. Marking all objects
1352          * as used avoids touching the remaining objects.
1353          */
1354         slab_fix(s, "Marking all objects used");
1355         slab->inuse = slab->objects;
1356         slab->freelist = NULL;
1357     }
1358     return 0;
1359 }
1360
1361 static inline int free_consistency_checks(struct kmem_cache *s,
1362         struct slab *slab, void *object, unsigned long addr)
1363 {
1364     if (!check_valid_pointer(s, slab, object)) {
1365         slab_err(s, slab, "Invalid object pointer 0x%p", object);
1366         return 0;
1367     }
1368
1369     if (on_freelist(s, slab, object)) {
1370         object_err(s, slab, object, "Object already free");
1371         return 0;
1372     }
1373
1374     if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
1375         return 0;
1376
1377     if (unlikely(s != slab->slab_cache)) {
1378         if (!folio_test_slab(slab_folio(slab))) {
1379             slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
1380                  object);
1381         } else if (!slab->slab_cache) {
1382             pr_err("SLUB <none>: no slab for object 0x%p.\n",
1383                    object);
1384             dump_stack();
1385         } else
1386             object_err(s, slab, object,
1387                     "page slab pointer corrupt.");
1388         return 0;
1389     }
1390     return 1;
1391 }
1392
1393 /* Supports checking bulk free of a constructed freelist */
1394 static noinline int free_debug_processing(
1395     struct kmem_cache *s, struct slab *slab,
1396     void *head, void *tail, int bulk_cnt,
1397     unsigned long addr)
1398 {
1399     struct kmem_cache_node *n = get_node(s, slab_nid(slab));
1400     void *object = head;
1401     int cnt = 0;
1402     unsigned long flags, flags2;
1403     int ret = 0;
1404     depot_stack_handle_t handle = 0;
1405
1406     if (s->flags & SLAB_STORE_USER)
1407         handle = set_track_prepare();
1408
1409     spin_lock_irqsave(&n->list_lock, flags);
1410     slab_lock(slab, &flags2);
1411
1412     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1413         if (!check_slab(s, slab))
1414             goto out;
1415     }
1416
1417 next_object:
1418     cnt++;
1419
1420     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1421         if (!free_consistency_checks(s, slab, object, addr))
1422             goto out;
1423     }
1424
1425     if (s->flags & SLAB_STORE_USER)
1426         set_track_update(s, object, TRACK_FREE, addr, handle);
1427     trace(s, slab, object, 0);
1428     /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
1429     init_object(s, object, SLUB_RED_INACTIVE);
1430
1431     /* Reached end of constructed freelist yet? */
1432     if (object != tail) {
1433         object = get_freepointer(s, object);
1434         goto next_object;
1435     }
1436     ret = 1;
1437
1438 out:
1439     if (cnt != bulk_cnt)
1440         slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
1441              bulk_cnt, cnt);
1442
1443     slab_unlock(slab, &flags2);
1444     spin_unlock_irqrestore(&n->list_lock, flags);
1445     if (!ret)
1446         slab_fix(s, "Object at 0x%p not freed", object);
1447     return ret;
1448 }
1449
1450 /*
1451  * Parse a block of slub_debug options. Blocks are delimited by ';'
1452  *
1453  * @str:    start of block
1454  * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1455  * @slabs:  return start of list of slabs, or NULL when there's no list
1456  * @init:   assume this is initial parsing and not per-kmem-create parsing
1457  *
1458  * returns the start of next block if there's any, or NULL
1459  */
1460 static char *
1461 parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1462 {
1463     bool higher_order_disable = false;
1464
1465     /* Skip any completely empty blocks */
1466     while (*str && *str == ';')
1467         str++;
1468
1469     if (*str == ',') {
1470         /*
1471          * No options but restriction on slabs. This means full
1472          * debugging for slabs matching a pattern.
1473          */
1474         *flags = DEBUG_DEFAULT_FLAGS;
1475         goto check_slabs;
1476     }
1477     *flags = 0;
1478
1479     /* Determine which debug features should be switched on */
1480     for (; *str && *str != ',' && *str != ';'; str++) {
1481         switch (tolower(*str)) {
1482         case '-':
1483             *flags = 0;
1484             break;
1485         case 'f':
1486             *flags |= SLAB_CONSISTENCY_CHECKS;
1487             break;
1488         case 'z':
1489             *flags |= SLAB_RED_ZONE;
1490             break;
1491         case 'p':
1492             *flags |= SLAB_POISON;
1493             break;
1494         case 'u':
1495             *flags |= SLAB_STORE_USER;
1496             break;
1497         case 't':
1498             *flags |= SLAB_TRACE;
1499             break;
1500         case 'a':
1501             *flags |= SLAB_FAILSLAB;
1502             break;
1503         case 'o':
1504             /*
1505              * Avoid enabling debugging on caches if its minimum
1506              * order would increase as a result.
1507              */
1508             higher_order_disable = true;
1509             break;
1510         default:
1511             if (init)
1512                 pr_err("slub_debug option '%c' unknown. skipped\n", *str);
1513         }
1514     }
1515 check_slabs:
1516     if (*str == ',')
1517         *slabs = ++str;
1518     else
1519         *slabs = NULL;
1520
1521     /* Skip over the slab list */
1522     while (*str && *str != ';')
1523         str++;
1524
1525     /* Skip any completely empty blocks */
1526     while (*str && *str == ';')
1527         str++;
1528
1529     if (init && higher_order_disable)
1530         disable_higher_order_debug = 1;
1531
1532     if (*str)
1533         return str;
1534     else
1535         return NULL;
1536 }
1537
1538 static int __init setup_slub_debug(char *str)
1539 {
1540     slab_flags_t flags;
1541     slab_flags_t global_flags;
1542     char *saved_str;
1543     char *slab_list;
1544     bool global_slub_debug_changed = false;
1545     bool slab_list_specified = false;
1546
1547     global_flags = DEBUG_DEFAULT_FLAGS;
1548     if (*str++ != '=' || !*str)
1549         /*
1550          * No options specified. Switch on full debugging.
1551          */
1552         goto out;
1553
1554     saved_str = str;
1555     while (str) {
1556         str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1557
1558         if (!slab_list) {
1559             global_flags = flags;
1560             global_slub_debug_changed = true;
1561         } else {
1562             slab_list_specified = true;
1563             if (flags & SLAB_STORE_USER)
1564                 stack_depot_want_early_init();
1565         }
1566     }
1567
1568     /*
1569      * For backwards compatibility, a single list of flags with list of
1570      * slabs means debugging is only changed for those slabs, so the global
1571      * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1572      * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
1573      * long as there is no option specifying flags without a slab list.
1574      */
1575     if (slab_list_specified) {
1576         if (!global_slub_debug_changed)
1577             global_flags = slub_debug;
1578         slub_debug_string = saved_str;
1579     }
1580 out:
1581     slub_debug = global_flags;
1582     if (slub_debug & SLAB_STORE_USER)
1583         stack_depot_want_early_init();
1584     if (slub_debug != 0 || slub_debug_string)
1585         static_branch_enable(&slub_debug_enabled);
1586     else
1587         static_branch_disable(&slub_debug_enabled);
1588     if ((static_branch_unlikely(&init_on_alloc) ||
1589          static_branch_unlikely(&init_on_free)) &&
1590         (slub_debug & SLAB_POISON))
1591         pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1592     return 1;
1593 }
1594
1595 __setup("slub_debug", setup_slub_debug);
1596
1597 /*
1598  * kmem_cache_flags - apply debugging options to the cache
1599  * @object_size:    the size of an object without meta data
1600  * @flags:      flags to set
1601  * @name:       name of the cache
1602  *
1603  * Debug option(s) are applied to @flags. In addition to the debug
1604  * option(s), if a slab name (or multiple) is specified i.e.
1605  * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1606  * then only the select slabs will receive the debug option(s).
1607  */
1608 slab_flags_t kmem_cache_flags(unsigned int object_size,
1609     slab_flags_t flags, const char *name)
1610 {
1611     char *iter;
1612     size_t len;
1613     char *next_block;
1614     slab_flags_t block_flags;
1615     slab_flags_t slub_debug_local = slub_debug;
1616
1617     if (flags & SLAB_NO_USER_FLAGS)
1618         return flags;
1619
1620     /*
1621      * If the slab cache is for debugging (e.g. kmemleak) then
1622      * don't store user (stack trace) information by default,
1623      * but let the user enable it via the command line below.
1624      */
1625     if (flags & SLAB_NOLEAKTRACE)
1626         slub_debug_local &= ~SLAB_STORE_USER;
1627
1628     len = strlen(name);
1629     next_block = slub_debug_string;
1630     /* Go through all blocks of debug options, see if any matches our slab's name */
1631     while (next_block) {
1632         next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1633         if (!iter)
1634             continue;
1635         /* Found a block that has a slab list, search it */
1636         while (*iter) {
1637             char *end, *glob;
1638             size_t cmplen;
1639
1640             end = strchrnul(iter, ',');
1641             if (next_block && next_block < end)
1642                 end = next_block - 1;
1643
1644             glob = strnchr(iter, end - iter, '*');
1645             if (glob)
1646                 cmplen = glob - iter;
1647             else
1648                 cmplen = max_t(size_t, len, (end - iter));
1649
1650             if (!strncmp(name, iter, cmplen)) {
1651                 flags |= block_flags;
1652                 return flags;
1653             }
1654
1655             if (!*end || *end == ';')
1656                 break;
1657             iter = end + 1;
1658         }
1659     }
1660
1661     return flags | slub_debug_local;
1662 }
1663 #else /* !CONFIG_SLUB_DEBUG */
1664 static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
1665 static inline
1666 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
1667
1668 static inline int alloc_debug_processing(struct kmem_cache *s,
1669     struct slab *slab, void *object, unsigned long addr) { return 0; }
1670
1671 static inline int free_debug_processing(
1672     struct kmem_cache *s, struct slab *slab,
1673     void *head, void *tail, int bulk_cnt,
1674     unsigned long addr) { return 0; }
1675
1676 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
1677 static inline int check_object(struct kmem_cache *s, struct slab *slab,
1678             void *object, u8 val) { return 1; }
1679 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1680                     struct slab *slab) {}
1681 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1682                     struct slab *slab) {}
1683 slab_flags_t kmem_cache_flags(unsigned int object_size,
1684     slab_flags_t flags, const char *name)
1685 {
1686     return flags;
1687 }
1688 #define slub_debug 0
1689
1690 #define disable_higher_order_debug 0
1691
1692 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1693                             { return 0; }
1694 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1695                             { return 0; }
1696 static inline void inc_slabs_node(struct kmem_cache *s, int node,
1697                             int objects) {}
1698 static inline void dec_slabs_node(struct kmem_cache *s, int node,
1699                             int objects) {}
1700
1701 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1702                    void **freelist, void *nextfree)
1703 {
1704     return false;
1705 }
1706 #endif /* CONFIG_SLUB_DEBUG */
1707
1708 /*
1709  * Hooks for other subsystems that check memory allocations. In a typical
1710  * production configuration these hooks all should produce no code at all.
1711  */
1712 static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1713 {
1714     ptr = kasan_kmalloc_large(ptr, size, flags);
1715     /* As ptr might get tagged, call kmemleak hook after KASAN. */
1716     kmemleak_alloc(ptr, size, 1, flags);
1717     return ptr;
1718 }
1719
1720 static __always_inline void kfree_hook(void *x)
1721 {
1722     kmemleak_free(x);
1723     kasan_kfree_large(x);
1724 }
1725
1726 static __always_inline bool slab_free_hook(struct kmem_cache *s,
1727                         void *x, bool init)
1728 {
1729     kmemleak_free_recursive(x, s->flags);
1730
1731     debug_check_no_locks_freed(x, s->object_size);
1732
1733     if (!(s->flags & SLAB_DEBUG_OBJECTS))
1734         debug_check_no_obj_freed(x, s->object_size);
1735
1736     /* Use KCSAN to help debug racy use-after-free. */
1737     if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
1738         __kcsan_check_access(x, s->object_size,
1739                      KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
1740
1741     /*
1742      * As memory initialization might be integrated into KASAN,
1743      * kasan_slab_free and initialization memset's must be
1744      * kept together to avoid discrepancies in behavior.
1745      *
1746      * The initialization memset's clear the object and the metadata,
1747      * but don't touch the SLAB redzone.
1748      */
1749     if (init) {
1750         int rsize;
1751
1752         if (!kasan_has_integrated_init())
1753             memset(kasan_reset_tag(x), 0, s->object_size);
1754         rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
1755         memset((char *)kasan_reset_tag(x) + s->inuse, 0,
1756                s->size - s->inuse - rsize);
1757     }
1758     /* KASAN might put x into memory quarantine, delaying its reuse. */
1759     return kasan_slab_free(s, x, init);
1760 }
1761
1762 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
1763                        void **head, void **tail,
1764                        int *cnt)
1765 {
1766
1767     void *object;
1768     void *next = *head;
1769     void *old_tail = *tail ? *tail : *head;
1770
1771     if (is_kfence_address(next)) {
1772         slab_free_hook(s, next, false);
1773         return true;
1774     }
1775
1776     /* Head and tail of the reconstructed freelist */
1777     *head = NULL;
1778     *tail = NULL;
1779
1780     do {
1781         object = next;
1782         next = get_freepointer(s, object);
1783
1784         /* If object's reuse doesn't have to be delayed */
1785         if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
1786             /* Move object to the new freelist */
1787             set_freepointer(s, object, *head);
1788             *head = object;
1789             if (!*tail)
1790                 *tail = object;
1791         } else {
1792             /*
1793              * Adjust the reconstructed freelist depth
1794              * accordingly if object's reuse is delayed.
1795              */
1796             --(*cnt);
1797         }
1798     } while (object != old_tail);
1799
1800     if (*head == *tail)
1801         *tail = NULL;
1802
1803     return *head != NULL;
1804 }
1805
1806 static void *setup_object(struct kmem_cache *s, void *object)
1807 {
1808     setup_object_debug(s, object);
1809     object = kasan_init_slab_obj(s, object);
1810     if (unlikely(s->ctor)) {
1811         kasan_unpoison_object_data(s, object);
1812         s->ctor(object);
1813         kasan_poison_object_data(s, object);
1814     }
1815     return object;
1816 }
1817
1818 /*
1819  * Slab allocation and freeing
1820  */
1821 static inline struct slab *alloc_slab_page(gfp_t flags, int node,
1822         struct kmem_cache_order_objects oo)
1823 {
1824     struct folio *folio;
1825     struct slab *slab;
1826     unsigned int order = oo_order(oo);
1827
1828     if (node == NUMA_NO_NODE)
1829         folio = (struct folio *)alloc_pages(flags, order);
1830     else
1831         folio = (struct folio *)__alloc_pages_node(node, flags, order);
1832
1833     if (!folio)
1834         return NULL;
1835
1836     slab = folio_slab(folio);
1837     __folio_set_slab(folio);
1838     if (page_is_pfmemalloc(folio_page(folio, 0)))
1839         slab_set_pfmemalloc(slab);
1840
1841     return slab;
1842 }
1843
1844 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1845 /* Pre-initialize the random sequence cache */
1846 static int init_cache_random_seq(struct kmem_cache *s)
1847 {
1848     unsigned int count = oo_objects(s->oo);
1849     int err;
1850
1851     /* Bailout if already initialised */
1852     if (s->random_seq)
1853         return 0;
1854
1855     err = cache_random_seq_create(s, count, GFP_KERNEL);
1856     if (err) {
1857         pr_err("SLUB: Unable to initialize free list for %s\n",
1858             s->name);
1859         return err;
1860     }
1861
1862     /* Transform to an offset on the set of pages */
1863     if (s->random_seq) {
1864         unsigned int i;
1865
1866         for (i = 0; i < count; i++)
1867             s->random_seq[i] *= s->size;
1868     }
1869     return 0;
1870 }
1871
1872 /* Initialize each random sequence freelist per cache */
1873 static void __init init_freelist_randomization(void)
1874 {
1875     struct kmem_cache *s;
1876
1877     mutex_lock(&slab_mutex);
1878
1879     list_for_each_entry(s, &slab_caches, list)
1880         init_cache_random_seq(s);
1881
1882     mutex_unlock(&slab_mutex);
1883 }
1884
1885 /* Get the next entry on the pre-computed freelist randomized */
1886 static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
1887                 unsigned long *pos, void *start,
1888                 unsigned long page_limit,
1889                 unsigned long freelist_count)
1890 {
1891     unsigned int idx;
1892
1893     /*
1894      * If the target page allocation failed, the number of objects on the
1895      * page might be smaller than the usual size defined by the cache.
1896      */
1897     do {
1898         idx = s->random_seq[*pos];
1899         *pos += 1;
1900         if (*pos >= freelist_count)
1901             *pos = 0;
1902     } while (unlikely(idx >= page_limit));
1903
1904     return (char *)start + idx;
1905 }
1906
1907 /* Shuffle the single linked freelist based on a random pre-computed sequence */
1908 static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
1909 {
1910     void *start;
1911     void *cur;
1912     void *next;
1913     unsigned long idx, pos, page_limit, freelist_count;
1914
1915     if (slab->objects < 2 || !s->random_seq)
1916         return false;
1917
1918     freelist_count = oo_objects(s->oo);
1919     pos = get_random_int() % freelist_count;
1920
1921     page_limit = slab->objects * s->size;
1922     start = fixup_red_left(s, slab_address(slab));
1923
1924     /* First entry is used as the base of the freelist */
1925     cur = next_freelist_entry(s, slab, &pos, start, page_limit,
1926                 freelist_count);
1927     cur = setup_object(s, cur);
1928     slab->freelist = cur;
1929
1930     for (idx = 1; idx < slab->objects; idx++) {
1931         next = next_freelist_entry(s, slab, &pos, start, page_limit,
1932             freelist_count);
1933         next = setup_object(s, next);
1934         set_freepointer(s, cur, next);
1935         cur = next;
1936     }
1937     set_freepointer(s, cur, NULL);
1938
1939     return true;
1940 }
1941 #else
1942 static inline int init_cache_random_seq(struct kmem_cache *s)
1943 {
1944     return 0;
1945 }
1946 static inline void init_freelist_randomization(void) { }
1947 static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
1948 {
1949     return false;
1950 }
1951 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1952
1953 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1954 {
1955     struct slab *slab;
1956     struct kmem_cache_order_objects oo = s->oo;
1957     gfp_t alloc_gfp;
1958     void *start, *p, *next;
1959     int idx;
1960     bool shuffle;
1961
1962     flags &= gfp_allowed_mask;
1963
1964     flags |= s->allocflags;
1965
1966     /*
1967      * Let the initial higher-order allocation fail under memory pressure
1968      * so we fall-back to the minimum order allocation.
1969      */
1970     alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1971     if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1972         alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
1973
1974     slab = alloc_slab_page(alloc_gfp, node, oo);
1975     if (unlikely(!slab)) {
1976         oo = s->min;
1977         alloc_gfp = flags;
1978         /*
1979          * Allocation may have failed due to fragmentation.
1980          * Try a lower order alloc if possible
1981          */
1982         slab = alloc_slab_page(alloc_gfp, node, oo);
1983         if (unlikely(!slab))
1984             goto out;
1985         stat(s, ORDER_FALLBACK);
1986     }
1987
1988     slab->objects = oo_objects(oo);
1989
1990     account_slab(slab, oo_order(oo), s, flags);
1991
1992     slab->slab_cache = s;
1993
1994     kasan_poison_slab(slab);
1995
1996     start = slab_address(slab);
1997
1998     setup_slab_debug(s, slab, start);
1999
2000     shuffle = shuffle_freelist(s, slab);
2001
2002     if (!shuffle) {
2003         start = fixup_red_left(s, start);
2004         start = setup_object(s, start);
2005         slab->freelist = start;
2006         for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
2007             next = p + s->size;
2008             next = setup_object(s, next);
2009             set_freepointer(s, p, next);
2010             p = next;
2011         }
2012         set_freepointer(s, p, NULL);
2013     }
2014
2015     slab->inuse = slab->objects;
2016     slab->frozen = 1;
2017
2018 out:
2019     if (!slab)
2020         return NULL;
2021
2022     inc_slabs_node(s, slab_nid(slab), slab->objects);
2023
2024     return slab;
2025 }
2026
2027 static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
2028 {
2029     if (unlikely(flags & GFP_SLAB_BUG_MASK))
2030         flags = kmalloc_fix_flags(flags);
2031
2032     WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
2033
2034     return allocate_slab(s,
2035         flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
2036 }
2037
2038 static void __free_slab(struct kmem_cache *s, struct slab *slab)
2039 {
2040     struct folio *folio = slab_folio(slab);
2041     int order = folio_order(folio);
2042     int pages = 1 << order;
2043
2044     if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
2045         void *p;
2046
2047         slab_pad_check(s, slab);
2048         for_each_object(p, s, slab_address(slab), slab->objects)
2049             check_object(s, slab, p, SLUB_RED_INACTIVE);
2050     }
2051
2052     __slab_clear_pfmemalloc(slab);
2053     __folio_clear_slab(folio);
2054     folio->mapping = NULL;
2055     if (current->reclaim_state)
2056         current->reclaim_state->reclaimed_slab += pages;
2057     unaccount_slab(slab, order, s);
2058     __free_pages(folio_page(folio, 0), order);
2059 }
2060
2061 static void rcu_free_slab(struct rcu_head *h)
2062 {
2063     struct slab *slab = container_of(h, struct slab, rcu_head);
2064
2065     __free_slab(slab->slab_cache, slab);
2066 }
2067
2068 static void free_slab(struct kmem_cache *s, struct slab *slab)
2069 {
2070     if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
2071         call_rcu(&slab->rcu_head, rcu_free_slab);
2072     } else
2073         __free_slab(s, slab);
2074 }
2075
2076 static void discard_slab(struct kmem_cache *s, struct slab *slab)
2077 {
2078     dec_slabs_node(s, slab_nid(slab), slab->objects);
2079     free_slab(s, slab);
2080 }
2081
2082 /*
2083  * Management of partially allocated slabs.
2084  */
2085 static inline void
2086 __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
2087 {
2088     n->nr_partial++;
2089     if (tail == DEACTIVATE_TO_TAIL)
2090         list_add_tail(&slab->slab_list, &n->partial);
2091     else
2092         list_add(&slab->slab_list, &n->partial);
2093 }
2094
2095 static inline void add_partial(struct kmem_cache_node *n,
2096                 struct slab *slab, int tail)
2097 {
2098     lockdep_assert_held(&n->list_lock);
2099     __add_partial(n, slab, tail);
2100 }
2101
2102 static inline void remove_partial(struct kmem_cache_node *n,
2103                     struct slab *slab)
2104 {
2105     lockdep_assert_held(&n->list_lock);
2106     list_del(&slab->slab_list);
2107     n->nr_partial--;
2108 }
2109
2110 /*
2111  * Remove slab from the partial list, freeze it and
2112  * return the pointer to the freelist.
2113  *
2114  * Returns a list of objects or NULL if it fails.
2115  */
2116 static inline void *acquire_slab(struct kmem_cache *s,
2117         struct kmem_cache_node *n, struct slab *slab,
2118         int mode)
2119 {
2120     void *freelist;
2121     unsigned long counters;
2122     struct slab new;
2123
2124     lockdep_assert_held(&n->list_lock);
2125
2126     /*
2127      * Zap the freelist and set the frozen bit.
2128      * The old freelist is the list of objects for the
2129      * per cpu allocation list.
2130      */
2131     freelist = slab->freelist;
2132     counters = slab->counters;
2133     new.counters = counters;
2134     if (mode) {
2135         new.inuse = slab->objects;
2136         new.freelist = NULL;
2137     } else {
2138         new.freelist = freelist;
2139     }
2140
2141     VM_BUG_ON(new.frozen);
2142     new.frozen = 1;
2143
2144     if (!__cmpxchg_double_slab(s, slab,
2145             freelist, counters,
2146             new.freelist, new.counters,
2147             "acquire_slab"))
2148         return NULL;
2149
2150     remove_partial(n, slab);
2151     WARN_ON(!freelist);
2152     return freelist;
2153 }
2154
2155 #ifdef CONFIG_SLUB_CPU_PARTIAL
2156 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
2157 #else
2158 static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
2159                    int drain) { }
2160 #endif
2161 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
2162
2163 /*
2164  * Try to allocate a partial slab from a specific node.
2165  */
2166 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
2167                   struct slab **ret_slab, gfp_t gfpflags)
2168 {
2169     struct slab *slab, *slab2;
2170     void *object = NULL;
2171     unsigned long flags;
2172     unsigned int partial_slabs = 0;
2173
2174     /*
2175      * Racy check. If we mistakenly see no partial slabs then we
2176      * just allocate an empty slab. If we mistakenly try to get a
2177      * partial slab and there is none available then get_partial()
2178      * will return NULL.
2179      */
2180     if (!n || !n->nr_partial)
2181         return NULL;
2182
2183     spin_lock_irqsave(&n->list_lock, flags);
2184     list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
2185         void *t;
2186
2187         if (!pfmemalloc_match(slab, gfpflags))
2188             continue;
2189
2190         t = acquire_slab(s, n, slab, object == NULL);
2191         if (!t)
2192             break;
2193
2194         if (!object) {
2195             *ret_slab = slab;
2196             stat(s, ALLOC_FROM_PARTIAL);
2197             object = t;
2198         } else {
2199             put_cpu_partial(s, slab, 0);
2200             stat(s, CPU_PARTIAL_NODE);
2201             partial_slabs++;
2202         }
2203 #ifdef CONFIG_SLUB_CPU_PARTIAL
2204         if (!kmem_cache_has_cpu_partial(s)
2205             || partial_slabs > s->cpu_partial_slabs / 2)
2206             break;
2207 #else
2208         break;
2209 #endif
2210
2211     }
2212     spin_unlock_irqrestore(&n->list_lock, flags);
2213     return object;
2214 }
2215
2216 /*
2217  * Get a slab from somewhere. Search in increasing NUMA distances.
2218  */
2219 static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
2220                  struct slab **ret_slab)
2221 {
2222 #ifdef CONFIG_NUMA
2223     struct zonelist *zonelist;
2224     struct zoneref *z;
2225     struct zone *zone;
2226     enum zone_type highest_zoneidx = gfp_zone(flags);
2227     void *object;
2228     unsigned int cpuset_mems_cookie;
2229
2230     /*
2231      * The defrag ratio allows a configuration of the tradeoffs between
2232      * inter node defragmentation and node local allocations. A lower
2233      * defrag_ratio increases the tendency to do local allocations
2234      * instead of attempting to obtain partial slabs from other nodes.
2235      *
2236      * If the defrag_ratio is set to 0 then kmalloc() always
2237      * returns node local objects. If the ratio is higher then kmalloc()
2238      * may return off node objects because partial slabs are obtained
2239      * from other nodes and filled up.
2240      *
2241      * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
2242      * (which makes defrag_ratio = 1000) then every (well almost)
2243      * allocation will first attempt to defrag slab caches on other nodes.
2244      * This means scanning over all nodes to look for partial slabs which
2245      * may be expensive if we do it every time we are trying to find a slab
2246      * with available objects.
2247      */
2248     if (!s->remote_node_defrag_ratio ||
2249             get_cycles() % 1024 > s->remote_node_defrag_ratio)
2250         return NULL;
2251
2252     do {
2253         cpuset_mems_cookie = read_mems_allowed_begin();
2254         zonelist = node_zonelist(mempolicy_slab_node(), flags);
2255         for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
2256             struct kmem_cache_node *n;
2257
2258             n = get_node(s, zone_to_nid(zone));
2259
2260             if (n && cpuset_zone_allowed(zone, flags) &&
2261                     n->nr_partial > s->min_partial) {
2262                 object = get_partial_node(s, n, ret_slab, flags);
2263                 if (object) {
2264                     /*
2265                      * Don't check read_mems_allowed_retry()
2266                      * here - if mems_allowed was updated in
2267                      * parallel, that was a harmless race
2268                      * between allocation and the cpuset
2269                      * update
2270                      */
2271                     return object;
2272                 }
2273             }
2274         }
2275     } while (read_mems_allowed_retry(cpuset_mems_cookie));
2276 #endif  /* CONFIG_NUMA */
2277     return NULL;
2278 }
2279
2280 /*
2281  * Get a partial slab, lock it and return it.
2282  */
2283 static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
2284              struct slab **ret_slab)
2285 {
2286     void *object;
2287     int searchnode = node;
2288
2289     if (node == NUMA_NO_NODE)
2290         searchnode = numa_mem_id();
2291
2292     object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
2293     if (object || node != NUMA_NO_NODE)
2294         return object;
2295
2296     return get_any_partial(s, flags, ret_slab);
2297 }
2298
2299 #ifdef CONFIG_PREEMPTION
2300 /*
2301  * Calculate the next globally unique transaction for disambiguation
2302  * during cmpxchg. The transactions start with the cpu number and are then
2303  * incremented by CONFIG_NR_CPUS.
2304  */
2305 #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
2306 #else
2307 /*
2308  * No preemption supported therefore also no need to check for
2309  * different cpus.
2310  */
2311 #define TID_STEP 1
2312 #endif
2313
2314 static inline unsigned long next_tid(unsigned long tid)
2315 {
2316     return tid + TID_STEP;
2317 }
2318
2319 #ifdef SLUB_DEBUG_CMPXCHG
2320 static inline unsigned int tid_to_cpu(unsigned long tid)
2321 {
2322     return tid % TID_STEP;
2323 }
2324
2325 static inline unsigned long tid_to_event(unsigned long tid)
2326 {
2327     return tid / TID_STEP;
2328 }
2329 #endif
2330
2331 static inline unsigned int init_tid(int cpu)
2332 {
2333     return cpu;
2334 }
2335
2336 static inline void note_cmpxchg_failure(const char *n,
2337         const struct kmem_cache *s, unsigned long tid)
2338 {
2339 #ifdef SLUB_DEBUG_CMPXCHG
2340     unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
2341
2342     pr_info("%s %s: cmpxchg redo ", n, s->name);
2343
2344 #ifdef CONFIG_PREEMPTION
2345     if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
2346         pr_warn("due to cpu change %d -> %d\n",
2347             tid_to_cpu(tid), tid_to_cpu(actual_tid));
2348     else
2349 #endif
2350     if (tid_to_event(tid) != tid_to_event(actual_tid))
2351         pr_warn("due to cpu running other code. Event %ld->%ld\n",
2352             tid_to_event(tid), tid_to_event(actual_tid));
2353     else
2354         pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
2355             actual_tid, tid, next_tid(tid));
2356 #endif
2357     stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
2358 }
2359
2360 static void init_kmem_cache_cpus(struct kmem_cache *s)
2361 {
2362     int cpu;
2363     struct kmem_cache_cpu *c;
2364
2365     for_each_possible_cpu(cpu) {
2366         c = per_cpu_ptr(s->cpu_slab, cpu);
2367         local_lock_init(&c->lock);
2368         c->tid = init_tid(cpu);
2369     }
2370 }
2371
2372 /*
2373  * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
2374  * unfreezes the slabs and puts it on the proper list.
2375  * Assumes the slab has been already safely taken away from kmem_cache_cpu
2376  * by the caller.
2377  */
2378 static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
2379                 void *freelist)
2380 {
2381     enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
2382     struct kmem_cache_node *n = get_node(s, slab_nid(slab));
2383     int free_delta = 0;
2384     enum slab_modes mode = M_NONE;
2385     void *nextfree, *freelist_iter, *freelist_tail;
2386     int tail = DEACTIVATE_TO_HEAD;
2387     unsigned long flags = 0;
2388     struct slab new;
2389     struct slab old;
2390
2391     if (slab->freelist) {
2392         stat(s, DEACTIVATE_REMOTE_FREES);
2393         tail = DEACTIVATE_TO_TAIL;
2394     }
2395
2396     /*
2397      * Stage one: Count the objects on cpu's freelist as free_delta and
2398      * remember the last object in freelist_tail for later splicing.
2399      */
2400     freelist_tail = NULL;
2401     freelist_iter = freelist;
2402     while (freelist_iter) {
2403         nextfree = get_freepointer(s, freelist_iter);
2404
2405         /*
2406          * If 'nextfree' is invalid, it is possible that the object at
2407          * 'freelist_iter' is already corrupted.  So isolate all objects
2408          * starting at 'freelist_iter' by skipping them.
2409          */
2410         if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
2411             break;
2412
2413         freelist_tail = freelist_iter;
2414         free_delta++;
2415
2416         freelist_iter = nextfree;
2417     }
2418
2419     /*
2420      * Stage two: Unfreeze the slab while splicing the per-cpu
2421      * freelist to the head of slab's freelist.
2422      *
2423      * Ensure that the slab is unfrozen while the list presence
2424      * reflects the actual number of objects during unfreeze.
2425      *
2426      * We first perform cmpxchg holding lock and insert to list
2427      * when it succeed. If there is mismatch then the slab is not
2428      * unfrozen and number of objects in the slab may have changed.
2429      * Then release lock and retry cmpxchg again.
2430      */
2431 redo:
2432
2433     old.freelist = READ_ONCE(slab->freelist);
2434     old.counters = READ_ONCE(slab->counters);
2435     VM_BUG_ON(!old.frozen);
2436
2437     /* Determine target state of the slab */
2438     new.counters = old.counters;
2439     if (freelist_tail) {
2440         new.inuse -= free_delta;
2441         set_freepointer(s, freelist_tail, old.freelist);
2442         new.freelist = freelist;
2443     } else
2444         new.freelist = old.freelist;
2445
2446     new.frozen = 0;
2447
2448     if (!new.inuse && n->nr_partial >= s->min_partial) {
2449         mode = M_FREE;
2450     } else if (new.freelist) {
2451         mode = M_PARTIAL;
2452         /*
2453          * Taking the spinlock removes the possibility that
2454          * acquire_slab() will see a slab that is frozen
2455          */
2456         spin_lock_irqsave(&n->list_lock, flags);
2457     } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
2458         mode = M_FULL;
2459         /*
2460          * This also ensures that the scanning of full
2461          * slabs from diagnostic functions will not see
2462          * any frozen slabs.
2463          */
2464         spin_lock_irqsave(&n->list_lock, flags);
2465     } else {
2466         mode = M_FULL_NOLIST;
2467     }
2468
2469
2470     if (!cmpxchg_double_slab(s, slab,
2471                 old.freelist, old.counters,
2472                 new.freelist, new.counters,
2473                 "unfreezing slab")) {
2474         if (mode == M_PARTIAL || mode == M_FULL)
2475             spin_unlock_irqrestore(&n->list_lock, flags);
2476         goto redo;
2477     }
2478
2479
2480     if (mode == M_PARTIAL) {
2481         add_partial(n, slab, tail);
2482         spin_unlock_irqrestore(&n->list_lock, flags);
2483         stat(s, tail);
2484     } else if (mode == M_FREE) {
2485         stat(s, DEACTIVATE_EMPTY);
2486         discard_slab(s, slab);
2487         stat(s, FREE_SLAB);
2488     } else if (mode == M_FULL) {
2489         add_full(s, n, slab);
2490         spin_unlock_irqrestore(&n->list_lock, flags);
2491         stat(s, DEACTIVATE_FULL);
2492     } else if (mode == M_FULL_NOLIST) {
2493         stat(s, DEACTIVATE_FULL);
2494     }
2495 }
2496
2497 #ifdef CONFIG_SLUB_CPU_PARTIAL
2498 static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
2499 {
2500     struct kmem_cache_node *n = NULL, *n2 = NULL;
2501     struct slab *slab, *slab_to_discard = NULL;
2502     unsigned long flags = 0;
2503
2504     while (partial_slab) {
2505         struct slab new;
2506         struct slab old;
2507
2508         slab = partial_slab;
2509         partial_slab = slab->next;
2510
2511         n2 = get_node(s, slab_nid(slab));
2512         if (n != n2) {
2513             if (n)
2514                 spin_unlock_irqrestore(&n->list_lock, flags);
2515
2516             n = n2;
2517             spin_lock_irqsave(&n->list_lock, flags);
2518         }
2519
2520         do {
2521
2522             old.freelist = slab->freelist;
2523             old.counters = slab->counters;
2524             VM_BUG_ON(!old.frozen);
2525
2526             new.counters = old.counters;
2527             new.freelist = old.freelist;
2528
2529             new.frozen = 0;
2530
2531         } while (!__cmpxchg_double_slab(s, slab,
2532                 old.freelist, old.counters,
2533                 new.freelist, new.counters,
2534                 "unfreezing slab"));
2535
2536         if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
2537             slab->next = slab_to_discard;
2538             slab_to_discard = slab;
2539         } else {
2540             add_partial(n, slab, DEACTIVATE_TO_TAIL);
2541             stat(s, FREE_ADD_PARTIAL);
2542         }
2543     }
2544
2545     if (n)
2546         spin_unlock_irqrestore(&n->list_lock, flags);
2547
2548     while (slab_to_discard) {
2549         slab = slab_to_discard;
2550         slab_to_discard = slab_to_discard->next;
2551
2552         stat(s, DEACTIVATE_EMPTY);
2553         discard_slab(s, slab);
2554         stat(s, FREE_SLAB);
2555     }
2556 }
2557
2558 /*
2559  * Unfreeze all the cpu partial slabs.
2560  */
2561 static void unfreeze_partials(struct kmem_cache *s)
2562 {
2563     struct slab *partial_slab;
2564     unsigned long flags;
2565
2566     local_lock_irqsave(&s->cpu_slab->lock, flags);
2567     partial_slab = this_cpu_read(s->cpu_slab->partial);
2568     this_cpu_write(s->cpu_slab->partial, NULL);
2569     local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2570
2571     if (partial_slab)
2572         __unfreeze_partials(s, partial_slab);
2573 }
2574
2575 static void unfreeze_partials_cpu(struct kmem_cache *s,
2576                   struct kmem_cache_cpu *c)
2577 {
2578     struct slab *partial_slab;
2579
2580     partial_slab = slub_percpu_partial(c);
2581     c->partial = NULL;
2582
2583     if (partial_slab)
2584         __unfreeze_partials(s, partial_slab);
2585 }
2586
2587 /*
2588  * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
2589  * partial slab slot if available.
2590  *
2591  * If we did not find a slot then simply move all the partials to the
2592  * per node partial list.
2593  */
2594 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
2595 {
2596     struct slab *oldslab;
2597     struct slab *slab_to_unfreeze = NULL;
2598     unsigned long flags;
2599     int slabs = 0;
2600
2601     local_lock_irqsave(&s->cpu_slab->lock, flags);
2602
2603     oldslab = this_cpu_read(s->cpu_slab->partial);
2604
2605     if (oldslab) {
2606         if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
2607             /*
2608              * Partial array is full. Move the existing set to the
2609              * per node partial list. Postpone the actual unfreezing
2610              * outside of the critical section.
2611              */
2612             slab_to_unfreeze = oldslab;
2613             oldslab = NULL;
2614         } else {
2615             slabs = oldslab->slabs;
2616         }
2617     }
2618
2619     slabs++;
2620
2621     slab->slabs = slabs;
2622     slab->next = oldslab;
2623
2624     this_cpu_write(s->cpu_slab->partial, slab);
2625
2626     local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2627
2628     if (slab_to_unfreeze) {
2629         __unfreeze_partials(s, slab_to_unfreeze);
2630         stat(s, CPU_PARTIAL_DRAIN);
2631     }
2632 }
2633
2634 #else   /* CONFIG_SLUB_CPU_PARTIAL */
2635
2636 static inline void unfreeze_partials(struct kmem_cache *s) { }
2637 static inline void unfreeze_partials_cpu(struct kmem_cache *s,
2638                   struct kmem_cache_cpu *c) { }
2639
2640 #endif  /* CONFIG_SLUB_CPU_PARTIAL */
2641
2642 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2643 {
2644     unsigned long flags;
2645     struct slab *slab;
2646     void *freelist;
2647
2648     local_lock_irqsave(&s->cpu_slab->lock, flags);
2649
2650     slab = c->slab;
2651     freelist = c->freelist;
2652
2653     c->slab = NULL;
2654     c->freelist = NULL;
2655     c->tid = next_tid(c->tid);
2656
2657     local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2658
2659     if (slab) {
2660         deactivate_slab(s, slab, freelist);
2661         stat(s, CPUSLAB_FLUSH);
2662     }
2663 }
2664
2665 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2666 {
2667     struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2668     void *freelist = c->freelist;
2669     struct slab *slab = c->slab;
2670
2671     c->slab = NULL;
2672     c->freelist = NULL;
2673     c->tid = next_tid(c->tid);
2674
2675     if (slab) {
2676         deactivate_slab(s, slab, freelist);
2677         stat(s, CPUSLAB_FLUSH);
2678     }
2679
2680     unfreeze_partials_cpu(s, c);
2681 }
2682
2683 struct slub_flush_work {
2684     struct work_struct work;
2685     struct kmem_cache *s;
2686     bool skip;
2687 };
2688
2689 /*
2690  * Flush cpu slab.
2691  *
2692  * Called from CPU work handler with migration disabled.
2693  */
2694 static void flush_cpu_slab(struct work_struct *w)
2695 {
2696     struct kmem_cache *s;
2697     struct kmem_cache_cpu *c;
2698     struct slub_flush_work *sfw;
2699
2700     sfw = container_of(w, struct slub_flush_work, work);
2701
2702     s = sfw->s;
2703     c = this_cpu_ptr(s->cpu_slab);
2704
2705     if (c->slab)
2706         flush_slab(s, c);
2707
2708     unfreeze_partials(s);
2709 }
2710
2711 static bool has_cpu_slab(int cpu, struct kmem_cache *s)
2712 {
2713     struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2714
2715     return c->slab || slub_percpu_partial(c);
2716 }
2717
2718 static DEFINE_MUTEX(flush_lock);
2719 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
2720
2721 static void flush_all_cpus_locked(struct kmem_cache *s)
2722 {
2723     struct slub_flush_work *sfw;
2724     unsigned int cpu;
2725
2726     lockdep_assert_cpus_held();
2727     mutex_lock(&flush_lock);
2728
2729     for_each_online_cpu(cpu) {
2730         sfw = &per_cpu(slub_flush, cpu);
2731         if (!has_cpu_slab(cpu, s)) {
2732             sfw->skip = true;
2733             continue;
2734         }
2735         INIT_WORK(&sfw->work, flush_cpu_slab);
2736         sfw->skip = false;
2737         sfw->s = s;
2738         queue_work_on(cpu, flushwq, &sfw->work);
2739     }
2740
2741     for_each_online_cpu(cpu) {
2742         sfw = &per_cpu(slub_flush, cpu);
2743         if (sfw->skip)
2744             continue;
2745         flush_work(&sfw->work);
2746     }
2747
2748     mutex_unlock(&flush_lock);
2749 }
2750
2751 static void flush_all(struct kmem_cache *s)
2752 {
2753     cpus_read_lock();
2754     flush_all_cpus_locked(s);
2755     cpus_read_unlock();
2756 }
2757
2758 /*
2759  * Use the cpu notifier to insure that the cpu slabs are flushed when
2760  * necessary.
2761  */
2762 static int slub_cpu_dead(unsigned int cpu)
2763 {
2764     struct kmem_cache *s;
2765
2766     mutex_lock(&slab_mutex);
2767     list_for_each_entry(s, &slab_caches, list)
2768         __flush_cpu_slab(s, cpu);
2769     mutex_unlock(&slab_mutex);
2770     return 0;
2771 }
2772
2773 /*
2774  * Check if the objects in a per cpu structure fit numa
2775  * locality expectations.
2776  */
2777 static inline int node_match(struct slab *slab, int node)
2778 {
2779 #ifdef CONFIG_NUMA
2780     if (node != NUMA_NO_NODE && slab_nid(slab) != node)
2781         return 0;
2782 #endif
2783     return 1;
2784 }
2785
2786 #ifdef CONFIG_SLUB_DEBUG
2787 static int count_free(struct slab *slab)
2788 {
2789     return slab->objects - slab->inuse;
2790 }
2791
2792 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2793 {
2794     return atomic_long_read(&n->total_objects);
2795 }
2796 #endif /* CONFIG_SLUB_DEBUG */
2797
2798 #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2799 static unsigned long count_partial(struct kmem_cache_node *n,
2800                     int (*get_count)(struct slab *))
2801 {
2802     unsigned long flags;
2803     unsigned long x = 0;
2804     struct slab *slab;
2805
2806     spin_lock_irqsave(&n->list_lock, flags);
2807     list_for_each_entry(slab, &n->partial, slab_list)
2808         x += get_count(slab);
2809     spin_unlock_irqrestore(&n->list_lock, flags);
2810     return x;
2811 }
2812 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2813
2814 static noinline void
2815 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2816 {
2817 #ifdef CONFIG_SLUB_DEBUG
2818     static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2819                       DEFAULT_RATELIMIT_BURST);
2820     int node;
2821     struct kmem_cache_node *n;
2822
2823     if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2824         return;
2825
2826     pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2827         nid, gfpflags, &gfpflags);
2828     pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
2829         s->name, s->object_size, s->size, oo_order(s->oo),
2830         oo_order(s->min));
2831
2832     if (oo_order(s->min) > get_order(s->object_size))
2833         pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
2834             s->name);
2835
2836     for_each_kmem_cache_node(s, node, n) {
2837         unsigned long nr_slabs;
2838         unsigned long nr_objs;
2839         unsigned long nr_free;
2840
2841         nr_free  = count_partial(n, count_free);
2842         nr_slabs = node_nr_slabs(n);
2843         nr_objs  = node_nr_objs(n);
2844
2845         pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2846             node, nr_slabs, nr_objs, nr_free);
2847     }
2848 #endif
2849 }
2850
2851 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
2852 {
2853     if (unlikely(slab_test_pfmemalloc(slab)))
2854         return gfp_pfmemalloc_allowed(gfpflags);
2855
2856     return true;
2857 }
2858
2859 /*
2860  * Check the slab->freelist and either transfer the freelist to the
2861  * per cpu freelist or deactivate the slab.
2862  *
2863  * The slab is still frozen if the return value is not NULL.
2864  *
2865  * If this function returns NULL then the slab has been unfrozen.
2866  */
2867 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
2868 {
2869     struct slab new;
2870     unsigned long counters;
2871     void *freelist;
2872
2873     lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
2874
2875     do {
2876         freelist = slab->freelist;
2877         counters = slab->counters;
2878
2879         new.counters = counters;
2880         VM_BUG_ON(!new.frozen);
2881
2882         new.inuse = slab->objects;
2883         new.frozen = freelist != NULL;
2884
2885     } while (!__cmpxchg_double_slab(s, slab,
2886         freelist, counters,
2887         NULL, new.counters,
2888         "get_freelist"));
2889
2890     return freelist;
2891 }
2892
2893 /*
2894  * Slow path. The lockless freelist is empty or we need to perform
2895  * debugging duties.
2896  *
2897  * Processing is still very fast if new objects have been freed to the
2898  * regular freelist. In that case we simply take over the regular freelist
2899  * as the lockless freelist and zap the regular freelist.
2900  *
2901  * If that is not working then we fall back to the partial lists. We take the
2902  * first element of the freelist as the object to allocate now and move the
2903  * rest of the freelist to the lockless freelist.
2904  *
2905  * And if we were unable to get a new slab from the partial slab lists then
2906  * we need to allocate a new slab. This is the slowest path since it involves
2907  * a call to the page allocator and the setup of a new slab.
2908  *
2909  * Version of __slab_alloc to use when we know that preemption is
2910  * already disabled (which is the case for bulk allocation).
2911  */
2912 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2913               unsigned long addr, struct kmem_cache_cpu *c)
2914 {
2915     void *freelist;
2916     struct slab *slab;
2917     unsigned long flags;
2918
2919     stat(s, ALLOC_SLOWPATH);
2920
2921 reread_slab:
2922
2923     slab = READ_ONCE(c->slab);
2924     if (!slab) {
2925         /*
2926          * if the node is not online or has no normal memory, just
2927          * ignore the node constraint
2928          */
2929         if (unlikely(node != NUMA_NO_NODE &&
2930                  !node_isset(node, slab_nodes)))
2931             node = NUMA_NO_NODE;
2932         goto new_slab;
2933     }
2934 redo:
2935
2936     if (unlikely(!node_match(slab, node))) {
2937         /*
2938          * same as above but node_match() being false already
2939          * implies node != NUMA_NO_NODE
2940          */
2941         if (!node_isset(node, slab_nodes)) {
2942             node = NUMA_NO_NODE;
2943         } else {
2944             stat(s, ALLOC_NODE_MISMATCH);
2945             goto deactivate_slab;
2946         }
2947     }
2948
2949     /*
2950      * By rights, we should be searching for a slab page that was
2951      * PFMEMALLOC but right now, we are losing the pfmemalloc
2952      * information when the page leaves the per-cpu allocator
2953      */
2954     if (unlikely(!pfmemalloc_match(slab, gfpflags)))
2955         goto deactivate_slab;
2956
2957     /* must check again c->slab in case we got preempted and it changed */
2958     local_lock_irqsave(&s->cpu_slab->lock, flags);
2959     if (unlikely(slab != c->slab)) {
2960         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2961         goto reread_slab;
2962     }
2963     freelist = c->freelist;
2964     if (freelist)
2965         goto load_freelist;
2966
2967     freelist = get_freelist(s, slab);
2968
2969     if (!freelist) {
2970         c->slab = NULL;
2971         c->tid = next_tid(c->tid);
2972         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2973         stat(s, DEACTIVATE_BYPASS);
2974         goto new_slab;
2975     }
2976
2977     stat(s, ALLOC_REFILL);
2978
2979 load_freelist:
2980
2981     lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
2982
2983     /*
2984      * freelist is pointing to the list of objects to be used.
2985      * slab is pointing to the slab from which the objects are obtained.
2986      * That slab must be frozen for per cpu allocations to work.
2987      */
2988     VM_BUG_ON(!c->slab->frozen);
2989     c->freelist = get_freepointer(s, freelist);
2990     c->tid = next_tid(c->tid);
2991     local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2992     return freelist;
2993
2994 deactivate_slab:
2995
2996     local_lock_irqsave(&s->cpu_slab->lock, flags);
2997     if (slab != c->slab) {
2998         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
2999         goto reread_slab;
3000     }
3001     freelist = c->freelist;
3002     c->slab = NULL;
3003     c->freelist = NULL;
3004     c->tid = next_tid(c->tid);
3005     local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3006     deactivate_slab(s, slab, freelist);
3007
3008 new_slab:
3009
3010     if (slub_percpu_partial(c)) {
3011         local_lock_irqsave(&s->cpu_slab->lock, flags);
3012         if (unlikely(c->slab)) {
3013             local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3014             goto reread_slab;
3015         }
3016         if (unlikely(!slub_percpu_partial(c))) {
3017             local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3018             /* we were preempted and partial list got empty */
3019             goto new_objects;
3020         }
3021
3022         slab = c->slab = slub_percpu_partial(c);
3023         slub_set_percpu_partial(c, slab);
3024         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3025         stat(s, CPU_PARTIAL_ALLOC);
3026         goto redo;
3027     }
3028
3029 new_objects:
3030
3031     freelist = get_partial(s, gfpflags, node, &slab);
3032     if (freelist)
3033         goto check_new_slab;
3034
3035     slub_put_cpu_ptr(s->cpu_slab);
3036     slab = new_slab(s, gfpflags, node);
3037     c = slub_get_cpu_ptr(s->cpu_slab);
3038
3039     if (unlikely(!slab)) {
3040         slab_out_of_memory(s, gfpflags, node);
3041         return NULL;
3042     }
3043
3044     /*
3045      * No other reference to the slab yet so we can
3046      * muck around with it freely without cmpxchg
3047      */
3048     freelist = slab->freelist;
3049     slab->freelist = NULL;
3050
3051     stat(s, ALLOC_SLAB);
3052
3053 check_new_slab:
3054
3055     if (kmem_cache_debug(s)) {
3056         if (!alloc_debug_processing(s, slab, freelist, addr)) {
3057             /* Slab failed checks. Next slab needed */
3058             goto new_slab;
3059         } else {
3060             /*
3061              * For debug case, we don't load freelist so that all
3062              * allocations go through alloc_debug_processing()
3063              */
3064             goto return_single;
3065         }
3066     }
3067
3068     if (unlikely(!pfmemalloc_match(slab, gfpflags)))
3069         /*
3070          * For !pfmemalloc_match() case we don't load freelist so that
3071          * we don't make further mismatched allocations easier.
3072          */
3073         goto return_single;
3074
3075 retry_load_slab:
3076
3077     local_lock_irqsave(&s->cpu_slab->lock, flags);
3078     if (unlikely(c->slab)) {
3079         void *flush_freelist = c->freelist;
3080         struct slab *flush_slab = c->slab;
3081
3082         c->slab = NULL;
3083         c->freelist = NULL;
3084         c->tid = next_tid(c->tid);
3085
3086         local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3087
3088         deactivate_slab(s, flush_slab, flush_freelist);
3089
3090         stat(s, CPUSLAB_FLUSH);
3091
3092         goto retry_load_slab;
3093     }
3094     c->slab = slab;
3095
3096     goto load_freelist;
3097
3098 return_single:
3099
3100     deactivate_slab(s, slab, get_freepointer(s, freelist));
3101     return freelist;
3102 }
3103
3104 /*
3105  * A wrapper for ___slab_alloc() for contexts where preemption is not yet
3106  * disabled. Compensates for possible cpu changes by refetching the per cpu area
3107  * pointer.
3108  */
3109 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
3110               unsigned long addr, struct kmem_cache_cpu *c)
3111 {
3112     void *p;
3113
3114 #ifdef CONFIG_PREEMPT_COUNT
3115     /*
3116      * We may have been preempted and rescheduled on a different
3117      * cpu before disabling preemption. Need to reload cpu area
3118      * pointer.
3119      */
3120     c = slub_get_cpu_ptr(s->cpu_slab);
3121 #endif
3122
3123     p = ___slab_alloc(s, gfpflags, node, addr, c);
3124 #ifdef CONFIG_PREEMPT_COUNT
3125     slub_put_cpu_ptr(s->cpu_slab);
3126 #endif
3127     return p;
3128 }
3129
3130 /*
3131  * If the object has been wiped upon free, make sure it's fully initialized by
3132  * zeroing out freelist pointer.
3133  */
3134 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
3135                            void *obj)
3136 {
3137     if (unlikely(slab_want_init_on_free(s)) && obj)
3138         memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
3139             0, sizeof(void *));
3140 }
3141
3142 /*
3143  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
3144  * have the fastpath folded into their functions. So no function call
3145  * overhead for requests that can be satisfied on the fastpath.
3146  *
3147  * The fastpath works by first checking if the lockless freelist can be used.
3148  * If not then __slab_alloc is called for slow processing.
3149  *
3150  * Otherwise we can simply pick the next object from the lockless free list.
3151  */
3152 static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
3153         gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
3154 {
3155     void *object;
3156     struct kmem_cache_cpu *c;
3157     struct slab *slab;
3158     unsigned long tid;
3159     struct obj_cgroup *objcg = NULL;
3160     bool init = false;
3161
3162     s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
3163     if (!s)
3164         return NULL;
3165
3166     object = kfence_alloc(s, orig_size, gfpflags);
3167     if (unlikely(object))
3168         goto out;
3169
3170 redo:
3171     /*
3172      * Must read kmem_cache cpu data via this cpu ptr. Preemption is
3173      * enabled. We may switch back and forth between cpus while
3174      * reading from one cpu area. That does not matter as long
3175      * as we end up on the original cpu again when doing the cmpxchg.
3176      *
3177      * We must guarantee that tid and kmem_cache_cpu are retrieved on the
3178      * same cpu. We read first the kmem_cache_cpu pointer and use it to read
3179      * the tid. If we are preempted and switched to another cpu between the
3180      * two reads, it's OK as the two are still associated with the same cpu
3181      * and cmpxchg later will validate the cpu.
3182      */
3183     c = raw_cpu_ptr(s->cpu_slab);
3184     tid = READ_ONCE(c->tid);
3185
3186     /*
3187      * Irqless object alloc/free algorithm used here depends on sequence
3188      * of fetching cpu_slab's data. tid should be fetched before anything
3189      * on c to guarantee that object and slab associated with previous tid
3190      * won't be used with current tid. If we fetch tid first, object and
3191      * slab could be one associated with next tid and our alloc/free
3192      * request will be failed. In this case, we will retry. So, no problem.
3193      */
3194     barrier();
3195
3196     /*
3197      * The transaction ids are globally unique per cpu and per operation on
3198      * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
3199      * occurs on the right processor and that there was no operation on the
3200      * linked list in between.
3201      */
3202
3203     object = c->freelist;
3204     slab = c->slab;
3205     /*
3206      * We cannot use the lockless fastpath on PREEMPT_RT because if a
3207      * slowpath has taken the local_lock_irqsave(), it is not protected
3208      * against a fast path operation in an irq handler. So we need to take
3209      * the slow path which uses local_lock. It is still relatively fast if
3210      * there is a suitable cpu freelist.
3211      */
3212     if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
3213         unlikely(!object || !slab || !node_match(slab, node))) {
3214         object = __slab_alloc(s, gfpflags, node, addr, c);
3215     } else {
3216         void *next_object = get_freepointer_safe(s, object);
3217
3218         /*
3219          * The cmpxchg will only match if there was no additional
3220          * operation and if we are on the right processor.
3221          *
3222          * The cmpxchg does the following atomically (without lock
3223          * semantics!)
3224          * 1. Relocate first pointer to the current per cpu area.
3225          * 2. Verify that tid and freelist have not been changed
3226          * 3. If they were not changed replace tid and freelist
3227          *
3228          * Since this is without lock semantics the protection is only
3229          * against code executing on this cpu *not* from access by
3230          * other cpus.
3231          */
3232         if (unlikely(!this_cpu_cmpxchg_double(
3233                 s->cpu_slab->freelist, s->cpu_slab->tid,
3234                 object, tid,
3235                 next_object, next_tid(tid)))) {
3236
3237             note_cmpxchg_failure("slab_alloc", s, tid);
3238             goto redo;
3239         }
3240         prefetch_freepointer(s, next_object);
3241         stat(s, ALLOC_FASTPATH);
3242     }
3243
3244     maybe_wipe_obj_freeptr(s, object);
3245     init = slab_want_init_on_alloc(gfpflags, s);
3246
3247 out:
3248     slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
3249
3250     return object;
3251 }
3252
3253 static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
3254         gfp_t gfpflags, unsigned long addr, size_t orig_size)
3255 {
3256     return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
3257 }
3258
3259 static __always_inline
3260 void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
3261                  gfp_t gfpflags)
3262 {
3263     void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
3264
3265     trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
3266                 s->size, gfpflags);
3267
3268     return ret;
3269 }
3270
3271 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
3272 {
3273     return __kmem_cache_alloc_lru(s, NULL, gfpflags);
3274 }
3275 EXPORT_SYMBOL(kmem_cache_alloc);
3276
3277 void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
3278                gfp_t gfpflags)
3279 {
3280     return __kmem_cache_alloc_lru(s, lru, gfpflags);
3281 }
3282 EXPORT_SYMBOL(kmem_cache_alloc_lru);
3283
3284 #ifdef CONFIG_TRACING
3285 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
3286 {
3287     void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
3288     trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
3289     ret = kasan_kmalloc(s, ret, size, gfpflags);
3290     return ret;
3291 }
3292 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3293 #endif
3294
3295 #ifdef CONFIG_NUMA
3296 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
3297 {
3298     void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
3299
3300     trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
3301                     s->object_size, s->size, gfpflags, node);
3302
3303     return ret;
3304 }
3305 EXPORT_SYMBOL(kmem_cache_alloc_node);
3306
3307 #ifdef CONFIG_TRACING
3308 void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
3309                     gfp_t gfpflags,
3310                     int node, size_t size)
3311 {
3312     void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
3313
3314     trace_kmalloc_node(_RET_IP_, ret, s,
3315                size, s->size, gfpflags, node);
3316
3317     ret = kasan_kmalloc(s, ret, size, gfpflags);
3318     return ret;
3319 }
3320 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3321 #endif
3322 #endif  /* CONFIG_NUMA */
3323
3324 /*
3325  * Slow path handling. This may still be called frequently since objects
3326  * have a longer lifetime than the cpu slabs in most processing loads.
3327  *
3328  * So we still attempt to reduce cache line usage. Just take the slab
3329  * lock and free the item. If there is no additional partial slab
3330  * handling required then we can return immediately.
3331  */
3332 static void __slab_free(struct kmem_cache *s, struct slab *slab,
3333             void *head, void *tail, int cnt,
3334             unsigned long addr)
3335
3336 {
3337     void *prior;
3338     int was_frozen;
3339     struct slab new;
3340     unsigned long counters;
3341     struct kmem_cache_node *n = NULL;
3342     unsigned long flags;
3343
3344     stat(s, FREE_SLOWPATH);
3345
3346     if (kfence_free(head))
3347         return;
3348
3349     if (kmem_cache_debug(s) &&
3350         !free_debug_processing(s, slab, head, tail, cnt, addr))
3351         return;
3352
3353     do {
3354         if (unlikely(n)) {
3355             spin_unlock_irqrestore(&n->list_lock, flags);
3356             n = NULL;
3357         }
3358         prior = slab->freelist;
3359         counters = slab->counters;
3360         set_freepointer(s, tail, prior);
3361         new.counters = counters;
3362         was_frozen = new.frozen;
3363         new.inuse -= cnt;
3364         if ((!new.inuse || !prior) && !was_frozen) {
3365
3366             if (kmem_cache_has_cpu_partial(s) && !prior) {
3367
3368                 /*
3369                  * Slab was on no list before and will be
3370                  * partially empty
3371                  * We can defer the list move and instead
3372                  * freeze it.
3373                  */
3374                 new.frozen = 1;
3375
3376             } else { /* Needs to be taken off a list */
3377
3378                 n = get_node(s, slab_nid(slab));
3379                 /*
3380                  * Speculatively acquire the list_lock.
3381                  * If the cmpxchg does not succeed then we may
3382                  * drop the list_lock without any processing.
3383                  *
3384                  * Otherwise the list_lock will synchronize with
3385                  * other processors updating the list of slabs.
3386                  */
3387                 spin_lock_irqsave(&n->list_lock, flags);
3388
3389             }
3390         }
3391
3392     } while (!cmpxchg_double_slab(s, slab,
3393         prior, counters,
3394         head, new.counters,
3395         "__slab_free"));
3396
3397     if (likely(!n)) {
3398
3399         if (likely(was_frozen)) {
3400             /*
3401              * The list lock was not taken therefore no list
3402              * activity can be necessary.
3403              */
3404             stat(s, FREE_FROZEN);
3405         } else if (new.frozen) {
3406             /*
3407              * If we just froze the slab then put it onto the
3408              * per cpu partial list.
3409              */
3410             put_cpu_partial(s, slab, 1);
3411             stat(s, CPU_PARTIAL_FREE);
3412         }
3413
3414         return;
3415     }
3416
3417     if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
3418         goto slab_empty;
3419
3420     /*
3421      * Objects left in the slab. If it was not on the partial list before
3422      * then add it.
3423      */
3424     if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
3425         remove_full(s, n, slab);
3426         add_partial(n, slab, DEACTIVATE_TO_TAIL);
3427         stat(s, FREE_ADD_PARTIAL);
3428     }
3429     spin_unlock_irqrestore(&n->list_lock, flags);
3430     return;
3431
3432 slab_empty:
3433     if (prior) {
3434         /*
3435          * Slab on the partial list.
3436          */
3437         remove_partial(n, slab);
3438         stat(s, FREE_REMOVE_PARTIAL);
3439     } else {
3440         /* Slab must be on the full list */
3441         remove_full(s, n, slab);
3442     }
3443
3444     spin_unlock_irqrestore(&n->list_lock, flags);
3445     stat(s, FREE_SLAB);
3446     discard_slab(s, slab);
3447 }
3448
3449 /*
3450  * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
3451  * can perform fastpath freeing without additional function calls.
3452  *
3453  * The fastpath is only possible if we are freeing to the current cpu slab
3454  * of this processor. This typically the case if we have just allocated
3455  * the item before.
3456  *
3457  * If fastpath is not possible then fall back to __slab_free where we deal
3458  * with all sorts of special processing.
3459  *
3460  * Bulk free of a freelist with several objects (all pointing to the
3461  * same slab) possible by specifying head and tail ptr, plus objects
3462  * count (cnt). Bulk free indicated by tail pointer being set.
3463  */
3464 static __always_inline void do_slab_free(struct kmem_cache *s,
3465                 struct slab *slab, void *head, void *tail,
3466                 int cnt, unsigned long addr)
3467 {
3468     void *tail_obj = tail ? : head;
3469     struct kmem_cache_cpu *c;
3470     unsigned long tid;
3471
3472 redo:
3473     /*
3474      * Determine the currently cpus per cpu slab.
3475      * The cpu may change afterward. However that does not matter since
3476      * data is retrieved via this pointer. If we are on the same cpu
3477      * during the cmpxchg then the free will succeed.
3478      */
3479     c = raw_cpu_ptr(s->cpu_slab);
3480     tid = READ_ONCE(c->tid);
3481
3482     /* Same with comment on barrier() in slab_alloc_node() */
3483     barrier();
3484
3485     if (likely(slab == c->slab)) {
3486 #ifndef CONFIG_PREEMPT_RT
3487         void **freelist = READ_ONCE(c->freelist);
3488
3489         set_freepointer(s, tail_obj, freelist);
3490
3491         if (unlikely(!this_cpu_cmpxchg_double(
3492                 s->cpu_slab->freelist, s->cpu_slab->tid,
3493                 freelist, tid,
3494                 head, next_tid(tid)))) {
3495
3496             note_cmpxchg_failure("slab_free", s, tid);
3497             goto redo;
3498         }
3499 #else /* CONFIG_PREEMPT_RT */
3500         /*
3501          * We cannot use the lockless fastpath on PREEMPT_RT because if
3502          * a slowpath has taken the local_lock_irqsave(), it is not
3503          * protected against a fast path operation in an irq handler. So
3504          * we need to take the local_lock. We shouldn't simply defer to
3505          * __slab_free() as that wouldn't use the cpu freelist at all.
3506          */
3507         void **freelist;
3508
3509         local_lock(&s->cpu_slab->lock);
3510         c = this_cpu_ptr(s->cpu_slab);
3511         if (unlikely(slab != c->slab)) {
3512             local_unlock(&s->cpu_slab->lock);
3513             goto redo;
3514         }
3515         tid = c->tid;
3516         freelist = c->freelist;
3517
3518         set_freepointer(s, tail_obj, freelist);
3519         c->freelist = head;
3520         c->tid = next_tid(tid);
3521
3522         local_unlock(&s->cpu_slab->lock);
3523 #endif
3524         stat(s, FREE_FASTPATH);
3525     } else
3526         __slab_free(s, slab, head, tail_obj, cnt, addr);
3527
3528 }
3529
3530 static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
3531                       void *head, void *tail, void **p, int cnt,
3532                       unsigned long addr)
3533 {
3534     memcg_slab_free_hook(s, slab, p, cnt);
3535     /*
3536      * With KASAN enabled slab_free_freelist_hook modifies the freelist
3537      * to remove objects, whose reuse must be delayed.
3538      */
3539     if (slab_free_freelist_hook(s, &head, &tail, &cnt))
3540         do_slab_free(s, slab, head, tail, cnt, addr);
3541 }
3542
3543 #ifdef CONFIG_KASAN_GENERIC
3544 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
3545 {
3546     do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
3547 }
3548 #endif
3549
3550 void kmem_cache_free(struct kmem_cache *s, void *x)
3551 {
3552     s = cache_from_obj(s, x);
3553     if (!s)
3554         return;
3555     trace_kmem_cache_free(_RET_IP_, x, s->name);
3556     slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
3557 }
3558 EXPORT_SYMBOL(kmem_cache_free);
3559
3560 struct detached_freelist {
3561     struct slab *slab;
3562     void *tail;
3563     void *freelist;
3564     int cnt;
3565     struct kmem_cache *s;
3566 };
3567
3568 static inline void free_large_kmalloc(struct folio *folio, void *object)
3569 {
3570     unsigned int order = folio_order(folio);
3571
3572     if (WARN_ON_ONCE(order == 0))
3573         pr_warn_once("object pointer: 0x%p\n", object);
3574
3575     kfree_hook(object);
3576     mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
3577                   -(PAGE_SIZE << order));
3578     __free_pages(folio_page(folio, 0), order);
3579 }
3580
3581 /*
3582  * This function progressively scans the array with free objects (with
3583  * a limited look ahead) and extract objects belonging to the same
3584  * slab.  It builds a detached freelist directly within the given
3585  * slab/objects.  This can happen without any need for
3586  * synchronization, because the objects are owned by running process.
3587  * The freelist is build up as a single linked list in the objects.
3588  * The idea is, that this detached freelist can then be bulk
3589  * transferred to the real freelist(s), but only requiring a single
3590  * synchronization primitive.  Look ahead in the array is limited due
3591  * to performance reasons.
3592  */
3593 static inline
3594 int build_detached_freelist(struct kmem_cache *s, size_t size,
3595                 void **p, struct detached_freelist *df)
3596 {
3597     int lookahead = 3;
3598     void *object;
3599     struct folio *folio;
3600     size_t same;
3601
3602     object = p[--size];
3603     folio = virt_to_folio(object);
3604     if (!s) {
3605         /* Handle kalloc'ed objects */
3606         if (unlikely(!folio_test_slab(folio))) {
3607             free_large_kmalloc(folio, object);
3608             df->slab = NULL;
3609             return size;
3610         }
3611         /* Derive kmem_cache from object */
3612         df->slab = folio_slab(folio);
3613         df->s = df->slab->slab_cache;
3614     } else {
3615         df->slab = folio_slab(folio);
3616         df->s = cache_from_obj(s, object); /* Support for memcg */
3617     }
3618
3619     /* Start new detached freelist */
3620     df->tail = object;
3621     df->freelist = object;
3622     df->cnt = 1;
3623
3624     if (is_kfence_address(object))
3625         return size;
3626
3627     set_freepointer(df->s, object, NULL);
3628
3629     same = size;
3630     while (size) {
3631         object = p[--size];
3632         /* df->slab is always set at this point */
3633         if (df->slab == virt_to_slab(object)) {
3634             /* Opportunity build freelist */
3635             set_freepointer(df->s, object, df->freelist);
3636             df->freelist = object;
3637             df->cnt++;
3638             same--;
3639             if (size != same)
3640                 swap(p[size], p[same]);
3641             continue;
3642         }
3643
3644         /* Limit look ahead search */
3645         if (!--lookahead)
3646             break;
3647     }
3648
3649     return same;
3650 }
3651
3652 /* Note that interrupts must be enabled when calling this function. */
3653 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3654 {
3655     if (!size)
3656         return;
3657
3658     do {
3659         struct detached_freelist df;
3660
3661         size = build_detached_freelist(s, size, p, &df);
3662         if (!df.slab)
3663             continue;
3664
3665         slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
3666               _RET_IP_);
3667     } while (likely(size));
3668 }
3669 EXPORT_SYMBOL(kmem_cache_free_bulk);
3670
3671 /* Note that interrupts must be enabled when calling this function. */
3672 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3673               void **p)
3674 {
3675     struct kmem_cache_cpu *c;
3676     int i;
3677     struct obj_cgroup *objcg = NULL;
3678
3679     /* memcg and kmem_cache debug support */
3680     s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
3681     if (unlikely(!s))
3682         return false;
3683     /*
3684      * Drain objects in the per cpu slab, while disabling local
3685      * IRQs, which protects against PREEMPT and interrupts
3686      * handlers invoking normal fastpath.
3687      */
3688     c = slub_get_cpu_ptr(s->cpu_slab);
3689     local_lock_irq(&s->cpu_slab->lock);
3690
3691     for (i = 0; i < size; i++) {
3692         void *object = kfence_alloc(s, s->object_size, flags);
3693
3694         if (unlikely(object)) {
3695             p[i] = object;
3696             continue;
3697         }
3698
3699         object = c->freelist;
3700         if (unlikely(!object)) {
3701             /*
3702              * We may have removed an object from c->freelist using
3703              * the fastpath in the previous iteration; in that case,
3704              * c->tid has not been bumped yet.
3705              * Since ___slab_alloc() may reenable interrupts while
3706              * allocating memory, we should bump c->tid now.
3707              */
3708             c->tid = next_tid(c->tid);
3709
3710             local_unlock_irq(&s->cpu_slab->lock);
3711
3712             /*
3713              * Invoking slow path likely have side-effect
3714              * of re-populating per CPU c->freelist
3715              */
3716             p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
3717                         _RET_IP_, c);
3718             if (unlikely(!p[i]))
3719                 goto error;
3720
3721             c = this_cpu_ptr(s->cpu_slab);
3722             maybe_wipe_obj_freeptr(s, p[i]);
3723
3724             local_lock_irq(&s->cpu_slab->lock);
3725
3726             continue; /* goto for-loop */
3727         }
3728         c->freelist = get_freepointer(s, object);
3729         p[i] = object;
3730         maybe_wipe_obj_freeptr(s, p[i]);
3731     }
3732     c->tid = next_tid(c->tid);
3733     local_unlock_irq(&s->cpu_slab->lock);
3734     slub_put_cpu_ptr(s->cpu_slab);
3735
3736     /*
3737      * memcg and kmem_cache debug support and memory initialization.
3738      * Done outside of the IRQ disabled fastpath loop.
3739      */
3740     slab_post_alloc_hook(s, objcg, flags, size, p,
3741                 slab_want_init_on_alloc(flags, s));
3742     return i;
3743 error:
3744     slub_put_cpu_ptr(s->cpu_slab);
3745     slab_post_alloc_hook(s, objcg, flags, i, p, false);
3746     kmem_cache_free_bulk(s, i, p);
3747     return 0;
3748 }
3749 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3750
3751
3752 /*
3753  * Object placement in a slab is made very easy because we always start at
3754  * offset 0. If we tune the size of the object to the alignment then we can
3755  * get the required alignment by putting one properly sized object after
3756  * another.
3757  *
3758  * Notice that the allocation order determines the sizes of the per cpu
3759  * caches. Each processor has always one slab available for allocations.
3760  * Increasing the allocation order reduces the number of times that slabs
3761  * must be moved on and off the partial lists and is therefore a factor in
3762  * locking overhead.
3763  */
3764
3765 /*
3766  * Minimum / Maximum order of slab pages. This influences locking overhead
3767  * and slab fragmentation. A higher order reduces the number of partial slabs
3768  * and increases the number of allocations possible without having to
3769  * take the list_lock.
3770  */
3771 static unsigned int slub_min_order;
3772 static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
3773 static unsigned int slub_min_objects;
3774
3775 /*
3776  * Calculate the order of allocation given an slab object size.
3777  *
3778  * The order of allocation has significant impact on performance and other
3779  * system components. Generally order 0 allocations should be preferred since
3780  * order 0 does not cause fragmentation in the page allocator. Larger objects
3781  * be problematic to put into order 0 slabs because there may be too much
3782  * unused space left. We go to a higher order if more than 1/16th of the slab
3783  * would be wasted.
3784  *
3785  * In order to reach satisfactory performance we must ensure that a minimum
3786  * number of objects is in one slab. Otherwise we may generate too much
3787  * activity on the partial lists which requires taking the list_lock. This is
3788  * less a concern for large slabs though which are rarely used.
3789  *
3790  * slub_max_order specifies the order where we begin to stop considering the
3791  * number of objects in a slab as critical. If we reach slub_max_order then
3792  * we try to keep the page order as low as possible. So we accept more waste
3793  * of space in favor of a small page order.
3794  *
3795  * Higher order allocations also allow the placement of more objects in a
3796  * slab and thereby reduce object handling overhead. If the user has
3797  * requested a higher minimum order then we start with that one instead of
3798  * the smallest order which will fit the object.
3799  */
3800 static inline unsigned int calc_slab_order(unsigned int size,
3801         unsigned int min_objects, unsigned int max_order,
3802         unsigned int fract_leftover)
3803 {
3804     unsigned int min_order = slub_min_order;
3805     unsigned int order;
3806
3807     if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
3808         return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3809
3810     for (order = max(min_order, (unsigned int)get_order(min_objects * size));
3811             order <= max_order; order++) {
3812
3813         unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
3814         unsigned int rem;
3815
3816         rem = slab_size % size;
3817
3818         if (rem <= slab_size / fract_leftover)
3819             break;
3820     }
3821
3822     return order;
3823 }
3824
3825 static inline int calculate_order(unsigned int size)
3826 {
3827     unsigned int order;
3828     unsigned int min_objects;
3829     unsigned int max_objects;
3830     unsigned int nr_cpus;
3831
3832     /*
3833      * Attempt to find best configuration for a slab. This
3834      * works by first attempting to generate a layout with
3835      * the best configuration and backing off gradually.
3836      *
3837      * First we increase the acceptable waste in a slab. Then
3838      * we reduce the minimum objects required in a slab.
3839      */
3840     min_objects = slub_min_objects;
3841     if (!min_objects) {
3842         /*
3843          * Some architectures will only update present cpus when
3844          * onlining them, so don't trust the number if it's just 1. But
3845          * we also don't want to use nr_cpu_ids always, as on some other
3846          * architectures, there can be many possible cpus, but never
3847          * onlined. Here we compromise between trying to avoid too high
3848          * order on systems that appear larger than they are, and too
3849          * low order on systems that appear smaller than they are.
3850          */
3851         nr_cpus = num_present_cpus();
3852         if (nr_cpus <= 1)
3853             nr_cpus = nr_cpu_ids;
3854         min_objects = 4 * (fls(nr_cpus) + 1);
3855     }
3856     max_objects = order_objects(slub_max_order, size);
3857     min_objects = min(min_objects, max_objects);
3858
3859     while (min_objects > 1) {
3860         unsigned int fraction;
3861
3862         fraction = 16;
3863         while (fraction >= 4) {
3864             order = calc_slab_order(size, min_objects,
3865                     slub_max_order, fraction);
3866             if (order <= slub_max_order)
3867                 return order;
3868             fraction /= 2;
3869         }
3870         min_objects--;
3871     }
3872
3873     /*
3874      * We were unable to place multiple objects in a slab. Now
3875      * lets see if we can place a single object there.
3876      */
3877     order = calc_slab_order(size, 1, slub_max_order, 1);
3878     if (order <= slub_max_order)
3879         return order;
3880
3881     /*
3882      * Doh this slab cannot be placed using slub_max_order.
3883      */
3884     order = calc_slab_order(size, 1, MAX_ORDER, 1);
3885     if (order < MAX_ORDER)
3886         return order;
3887     return -ENOSYS;
3888 }
3889
3890 static void
3891 init_kmem_cache_node(struct kmem_cache_node *n)
3892 {
3893     n->nr_partial = 0;
3894     spin_lock_init(&n->list_lock);
3895     INIT_LIST_HEAD(&n->partial);
3896 #ifdef CONFIG_SLUB_DEBUG
3897     atomic_long_set(&n->nr_slabs, 0);
3898     atomic_long_set(&n->total_objects, 0);
3899     INIT_LIST_HEAD(&n->full);
3900 #endif
3901 }
3902
3903 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
3904 {
3905     BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3906             KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3907
3908     /*
3909      * Must align to double word boundary for the double cmpxchg
3910      * instructions to work; see __pcpu_double_call_return_bool().
3911      */
3912     s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
3913                      2 * sizeof(void *));
3914
3915     if (!s->cpu_slab)
3916         return 0;
3917
3918     init_kmem_cache_cpus(s);
3919
3920     return 1;
3921 }
3922
3923 static struct kmem_cache *kmem_cache_node;
3924
3925 /*
3926  * No kmalloc_node yet so do it by hand. We know that this is the first
3927  * slab on the node for this slabcache. There are no concurrent accesses
3928  * possible.
3929  *
3930  * Note that this function only works on the kmem_cache_node
3931  * when allocating for the kmem_cache_node. This is used for bootstrapping
3932  * memory on a fresh node that has no slab structures yet.
3933  */
3934 static void early_kmem_cache_node_alloc(int node)
3935 {
3936     struct slab *slab;
3937     struct kmem_cache_node *n;
3938
3939     BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
3940
3941     slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
3942
3943     BUG_ON(!slab);
3944     if (slab_nid(slab) != node) {
3945         pr_err("SLUB: Unable to allocate memory from node %d\n", node);
3946         pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
3947     }
3948
3949     n = slab->freelist;
3950     BUG_ON(!n);
3951 #ifdef CONFIG_SLUB_DEBUG
3952     init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3953     init_tracking(kmem_cache_node, n);
3954 #endif
3955     n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
3956     slab->freelist = get_freepointer(kmem_cache_node, n);
3957     slab->inuse = 1;
3958     slab->frozen = 0;
3959     kmem_cache_node->node[node] = n;
3960     init_kmem_cache_node(n);
3961     inc_slabs_node(kmem_cache_node, node, slab->objects);
3962
3963     /*
3964      * No locks need to be taken here as it has just been
3965      * initialized and there is no concurrent access.
3966      */
3967     __add_partial(n, slab, DEACTIVATE_TO_HEAD);
3968 }
3969
3970 static void free_kmem_cache_nodes(struct kmem_cache *s)
3971 {
3972     int node;
3973     struct kmem_cache_node *n;
3974
3975     for_each_kmem_cache_node(s, node, n) {
3976         s->node[node] = NULL;
3977         kmem_cache_free(kmem_cache_node, n);
3978     }
3979 }
3980
3981 void __kmem_cache_release(struct kmem_cache *s)
3982 {
3983     cache_random_seq_destroy(s);
3984     free_percpu(s->cpu_slab);
3985     free_kmem_cache_nodes(s);
3986 }
3987
3988 static int init_kmem_cache_nodes(struct kmem_cache *s)
3989 {
3990     int node;
3991
3992     for_each_node_mask(node, slab_nodes) {
3993         struct kmem_cache_node *n;
3994
3995         if (slab_state == DOWN) {
3996             early_kmem_cache_node_alloc(node);
3997             continue;
3998         }
3999         n = kmem_cache_alloc_node(kmem_cache_node,
4000                         GFP_KERNEL, node);
4001
4002         if (!n) {
4003             free_kmem_cache_nodes(s);
4004             return 0;
4005         }
4006
4007         init_kmem_cache_node(n);
4008         s->node[node] = n;
4009     }
4010     return 1;
4011 }
4012
4013 static void set_cpu_partial(struct kmem_cache *s)
4014 {
4015 #ifdef CONFIG_SLUB_CPU_PARTIAL
4016     unsigned int nr_objects;
4017
4018     /*
4019      * cpu_partial determined the maximum number of objects kept in the
4020      * per cpu partial lists of a processor.
4021      *
4022      * Per cpu partial lists mainly contain slabs that just have one
4023      * object freed. If they are used for allocation then they can be
4024      * filled up again with minimal effort. The slab will never hit the
4025      * per node partial lists and therefore no locking will be required.
4026      *
4027      * For backwards compatibility reasons, this is determined as number
4028      * of objects, even though we now limit maximum number of pages, see
4029      * slub_set_cpu_partial()
4030      */
4031     if (!kmem_cache_has_cpu_partial(s))
4032         nr_objects = 0;
4033     else if (s->size >= PAGE_SIZE)
4034         nr_objects = 6;
4035     else if (s->size >= 1024)
4036         nr_objects = 24;
4037     else if (s->size >= 256)
4038         nr_objects = 52;
4039     else
4040         nr_objects = 120;
4041
4042     slub_set_cpu_partial(s, nr_objects);
4043 #endif
4044 }
4045
4046 /*
4047  * calculate_sizes() determines the order and the distribution of data within
4048  * a slab object.
4049  */
4050 static int calculate_sizes(struct kmem_cache *s)
4051 {
4052     slab_flags_t flags = s->flags;
4053     unsigned int size = s->object_size;
4054     unsigned int order;
4055
4056     /*
4057      * Round up object size to the next word boundary. We can only
4058      * place the free pointer at word boundaries and this determines
4059      * the possible location of the free pointer.
4060      */
4061     size = ALIGN(size, sizeof(void *));
4062
4063 #ifdef CONFIG_SLUB_DEBUG
4064     /*
4065      * Determine if we can poison the object itself. If the user of
4066      * the slab may touch the object after free or before allocation
4067      * then we should never poison the object itself.
4068      */
4069     if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
4070             !s->ctor)
4071         s->flags |= __OBJECT_POISON;
4072     else
4073         s->flags &= ~__OBJECT_POISON;
4074
4075
4076     /*
4077      * If we are Redzoning then check if there is some space between the
4078      * end of the object and the free pointer. If not then add an
4079      * additional word to have some bytes to store Redzone information.
4080      */
4081     if ((flags & SLAB_RED_ZONE) && size == s->object_size)
4082         size += sizeof(void *);
4083 #endif
4084
4085     /*
4086      * With that we have determined the number of bytes in actual use
4087      * by the object and redzoning.
4088      */
4089     s->inuse = size;
4090
4091     if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
4092         ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
4093         s->ctor) {
4094         /*
4095          * Relocate free pointer after the object if it is not
4096          * permitted to overwrite the first word of the object on
4097          * kmem_cache_free.
4098          *
4099          * This is the case if we do RCU, have a constructor or
4100          * destructor, are poisoning the objects, or are
4101          * redzoning an object smaller than sizeof(void *).
4102          *
4103          * The assumption that s->offset >= s->inuse means free
4104          * pointer is outside of the object is used in the
4105          * freeptr_outside_object() function. If that is no
4106          * longer true, the function needs to be modified.
4107          */
4108         s->offset = size;
4109         size += sizeof(void *);
4110     } else {
4111         /*
4112          * Store freelist pointer near middle of object to keep
4113          * it away from the edges of the object to avoid small
4114          * sized over/underflows from neighboring allocations.
4115          */
4116         s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
4117     }
4118
4119 #ifdef CONFIG_SLUB_DEBUG
4120     if (flags & SLAB_STORE_USER)
4121         /*
4122          * Need to store information about allocs and frees after
4123          * the object.
4124          */
4125         size += 2 * sizeof(struct track);
4126 #endif
4127
4128     kasan_cache_create(s, &size, &s->flags);
4129 #ifdef CONFIG_SLUB_DEBUG
4130     if (flags & SLAB_RED_ZONE) {
4131         /*
4132          * Add some empty padding so that we can catch
4133          * overwrites from earlier objects rather than let
4134          * tracking information or the free pointer be
4135          * corrupted if a user writes before the start
4136          * of the object.
4137          */
4138         size += sizeof(void *);
4139
4140         s->red_left_pad = sizeof(void *);
4141         s->red_left_pad = ALIGN(s->red_left_pad, s->align);
4142         size += s->red_left_pad;
4143     }
4144 #endif
4145
4146     /*
4147      * SLUB stores one object immediately after another beginning from
4148      * offset 0. In order to align the objects we have to simply size
4149      * each object to conform to the alignment.
4150      */
4151     size = ALIGN(size, s->align);
4152     s->size = size;
4153     s->reciprocal_size = reciprocal_value(size);
4154     order = calculate_order(size);
4155
4156     if ((int)order < 0)
4157         return 0;
4158
4159     s->allocflags = 0;
4160     if (order)
4161         s->allocflags |= __GFP_COMP;
4162
4163     if (s->flags & SLAB_CACHE_DMA)
4164         s->allocflags |= GFP_DMA;
4165
4166     if (s->flags & SLAB_CACHE_DMA32)
4167         s->allocflags |= GFP_DMA32;
4168
4169     if (s->flags & SLAB_RECLAIM_ACCOUNT)
4170         s->allocflags |= __GFP_RECLAIMABLE;
4171
4172     /*
4173      * Determine the number of objects per slab
4174      */
4175     s->oo = oo_make(order, size);
4176     s->min = oo_make(get_order(size), size);
4177
4178     return !!oo_objects(s->oo);
4179 }
4180
4181 static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
4182 {
4183     s->flags = kmem_cache_flags(s->size, flags, s->name);
4184 #ifdef CONFIG_SLAB_FREELIST_HARDENED
4185     s->random = get_random_long();
4186 #endif
4187
4188     if (!calculate_sizes(s))
4189         goto error;
4190     if (disable_higher_order_debug) {
4191         /*
4192          * Disable debugging flags that store metadata if the min slab
4193          * order increased.
4194          */
4195         if (get_order(s->size) > get_order(s->object_size)) {
4196             s->flags &= ~DEBUG_METADATA_FLAGS;
4197             s->offset = 0;
4198             if (!calculate_sizes(s))
4199                 goto error;
4200         }
4201     }
4202
4203 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
4204     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
4205     if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
4206         /* Enable fast mode */
4207         s->flags |= __CMPXCHG_DOUBLE;
4208 #endif
4209
4210     /*
4211      * The larger the object size is, the more slabs we want on the partial
4212      * list to avoid pounding the page allocator excessively.
4213      */
4214     s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
4215     s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
4216
4217     set_cpu_partial(s);
4218
4219 #ifdef CONFIG_NUMA
4220     s->remote_node_defrag_ratio = 1000;
4221 #endif
4222
4223     /* Initialize the pre-computed randomized freelist if slab is up */
4224     if (slab_state >= UP) {
4225         if (init_cache_random_seq(s))
4226             goto error;
4227     }
4228
4229     if (!init_kmem_cache_nodes(s))
4230         goto error;
4231
4232     if (alloc_kmem_cache_cpus(s))
4233         return 0;
4234
4235 error:
4236     __kmem_cache_release(s);
4237     return -EINVAL;
4238 }
4239
4240 static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
4241                   const char *text)
4242 {
4243 #ifdef CONFIG_SLUB_DEBUG
4244     void *addr = slab_address(slab);
4245     unsigned long flags;
4246     unsigned long *map;
4247     void *p;
4248
4249     slab_err(s, slab, text, s->name);
4250     slab_lock(slab, &flags);
4251
4252     map = get_map(s, slab);
4253     for_each_object(p, s, addr, slab->objects) {
4254
4255         if (!test_bit(__obj_to_index(s, addr, p), map)) {
4256             pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
4257             print_tracking(s, p);
4258         }
4259     }
4260     put_map(map);
4261     slab_unlock(slab, &flags);
4262 #endif
4263 }
4264
4265 /*
4266  * Attempt to free all partial slabs on a node.
4267  * This is called from __kmem_cache_shutdown(). We must take list_lock
4268  * because sysfs file might still access partial list after the shutdowning.
4269  */
4270 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
4271 {
4272     LIST_HEAD(discard);
4273     struct slab *slab, *h;
4274
4275     BUG_ON(irqs_disabled());
4276     spin_lock_irq(&n->list_lock);
4277     list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
4278         if (!slab->inuse) {
4279             remove_partial(n, slab);
4280             list_add(&slab->slab_list, &discard);
4281         } else {
4282             list_slab_objects(s, slab,
4283               "Objects remaining in %s on __kmem_cache_shutdown()");
4284         }
4285     }
4286     spin_unlock_irq(&n->list_lock);
4287
4288     list_for_each_entry_safe(slab, h, &discard, slab_list)
4289         discard_slab(s, slab);
4290 }
4291
4292 bool __kmem_cache_empty(struct kmem_cache *s)
4293 {
4294     int node;
4295     struct kmem_cache_node *n;
4296
4297     for_each_kmem_cache_node(s, node, n)
4298         if (n->nr_partial || slabs_node(s, node))
4299             return false;
4300     return true;
4301 }
4302
4303 /*
4304  * Release all resources used by a slab cache.
4305  */
4306 int __kmem_cache_shutdown(struct kmem_cache *s)
4307 {
4308     int node;
4309     struct kmem_cache_node *n;
4310
4311     flush_all_cpus_locked(s);
4312     /* Attempt to free all objects */
4313     for_each_kmem_cache_node(s, node, n) {
4314         free_partial(s, n);
4315         if (n->nr_partial || slabs_node(s, node))
4316             return 1;
4317     }
4318     return 0;
4319 }
4320
4321 #ifdef CONFIG_PRINTK
4322 void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
4323 {
4324     void *base;
4325     int __maybe_unused i;
4326     unsigned int objnr;
4327     void *objp;
4328     void *objp0;
4329     struct kmem_cache *s = slab->slab_cache;
4330     struct track __maybe_unused *trackp;
4331
4332     kpp->kp_ptr = object;
4333     kpp->kp_slab = slab;
4334     kpp->kp_slab_cache = s;
4335     base = slab_address(slab);
4336     objp0 = kasan_reset_tag(object);
4337 #ifdef CONFIG_SLUB_DEBUG
4338     objp = restore_red_left(s, objp0);
4339 #else
4340     objp = objp0;
4341 #endif
4342     objnr = obj_to_index(s, slab, objp);
4343     kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
4344     objp = base + s->size * objnr;
4345     kpp->kp_objp = objp;
4346     if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
4347              || (objp - base) % s->size) ||
4348         !(s->flags & SLAB_STORE_USER))
4349         return;
4350 #ifdef CONFIG_SLUB_DEBUG
4351     objp = fixup_red_left(s, objp);
4352     trackp = get_track(s, objp, TRACK_ALLOC);
4353     kpp->kp_ret = (void *)trackp->addr;
4354 #ifdef CONFIG_STACKDEPOT
4355     {
4356         depot_stack_handle_t handle;
4357         unsigned long *entries;
4358         unsigned int nr_entries;
4359
4360         handle = READ_ONCE(trackp->handle);
4361         if (handle) {
4362             nr_entries = stack_depot_fetch(handle, &entries);
4363             for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
4364                 kpp->kp_stack[i] = (void *)entries[i];
4365         }
4366
4367         trackp = get_track(s, objp, TRACK_FREE);
4368         handle = READ_ONCE(trackp->handle);
4369         if (handle) {
4370             nr_entries = stack_depot_fetch(handle, &entries);
4371             for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
4372                 kpp->kp_free_stack[i] = (void *)entries[i];
4373         }
4374     }
4375 #endif
4376 #endif
4377 }
4378 #endif
4379
4380 /********************************************************************
4381  *      Kmalloc subsystem
4382  *******************************************************************/
4383
4384 static int __init setup_slub_min_order(char *str)
4385 {
4386     get_option(&str, (int *)&slub_min_order);
4387
4388     return 1;
4389 }
4390
4391 __setup("slub_min_order=", setup_slub_min_order);
4392
4393 static int __init setup_slub_max_order(char *str)
4394 {
4395     get_option(&str, (int *)&slub_max_order);
4396     slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
4397
4398     return 1;
4399 }
4400
4401 __setup("slub_max_order=", setup_slub_max_order);
4402
4403 static int __init setup_slub_min_objects(char *str)
4404 {
4405     get_option(&str, (int *)&slub_min_objects);
4406
4407     return 1;
4408 }
4409
4410 __setup("slub_min_objects=", setup_slub_min_objects);
4411
4412 void *__kmalloc(size_t size, gfp_t flags)
4413 {
4414     struct kmem_cache *s;
4415     void *ret;
4416
4417     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4418         return kmalloc_large(size, flags);
4419
4420     s = kmalloc_slab(size, flags);
4421
4422     if (unlikely(ZERO_OR_NULL_PTR(s)))
4423         return s;
4424
4425     ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
4426
4427     trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
4428
4429     ret = kasan_kmalloc(s, ret, size, flags);
4430
4431     return ret;
4432 }
4433 EXPORT_SYMBOL(__kmalloc);
4434
4435 #ifdef CONFIG_NUMA
4436 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
4437 {
4438     struct page *page;
4439     void *ptr = NULL;
4440     unsigned int order = get_order(size);
4441
4442     flags |= __GFP_COMP;
4443     page = alloc_pages_node(node, flags, order);
4444     if (page) {
4445         ptr = page_address(page);
4446         mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
4447                       PAGE_SIZE << order);
4448     }
4449
4450     return kmalloc_large_node_hook(ptr, size, flags);
4451 }
4452
4453 void *__kmalloc_node(size_t size, gfp_t flags, int node)
4454 {
4455     struct kmem_cache *s;
4456     void *ret;
4457
4458     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4459         ret = kmalloc_large_node(size, flags, node);
4460
4461         trace_kmalloc_node(_RET_IP_, ret, NULL,
4462                    size, PAGE_SIZE << get_order(size),
4463                    flags, node);
4464
4465         return ret;
4466     }
4467
4468     s = kmalloc_slab(size, flags);
4469
4470     if (unlikely(ZERO_OR_NULL_PTR(s)))
4471         return s;
4472
4473     ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
4474
4475     trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
4476
4477     ret = kasan_kmalloc(s, ret, size, flags);
4478
4479     return ret;
4480 }
4481 EXPORT_SYMBOL(__kmalloc_node);
4482 #endif  /* CONFIG_NUMA */
4483
4484 #ifdef CONFIG_HARDENED_USERCOPY
4485 /*
4486  * Rejects incorrectly sized objects and objects that are to be copied
4487  * to/from userspace but do not fall entirely within the containing slab
4488  * cache's usercopy region.
4489  *
4490  * Returns NULL if check passes, otherwise const char * to name of cache
4491  * to indicate an error.
4492  */
4493 void __check_heap_object(const void *ptr, unsigned long n,
4494              const struct slab *slab, bool to_user)
4495 {
4496     struct kmem_cache *s;
4497     unsigned int offset;
4498     bool is_kfence = is_kfence_address(ptr);
4499
4500     ptr = kasan_reset_tag(ptr);
4501
4502     /* Find object and usable object size. */
4503     s = slab->slab_cache;
4504
4505     /* Reject impossible pointers. */
4506     if (ptr < slab_address(slab))
4507         usercopy_abort("SLUB object not in SLUB page?!", NULL,
4508                    to_user, 0, n);
4509
4510     /* Find offset within object. */
4511     if (is_kfence)
4512         offset = ptr - kfence_object_start(ptr);
4513     else
4514         offset = (ptr - slab_address(slab)) % s->size;
4515
4516     /* Adjust for redzone and reject if within the redzone. */
4517     if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
4518         if (offset < s->red_left_pad)
4519             usercopy_abort("SLUB object in left red zone",
4520                        s->name, to_user, offset, n);
4521         offset -= s->red_left_pad;
4522     }
4523
4524     /* Allow address range falling entirely within usercopy region. */
4525     if (offset >= s->useroffset &&
4526         offset - s->useroffset <= s->usersize &&
4527         n <= s->useroffset - offset + s->usersize)
4528         return;
4529
4530     usercopy_abort("SLUB object", s->name, to_user, offset, n);
4531 }
4532 #endif /* CONFIG_HARDENED_USERCOPY */
4533
4534 size_t __ksize(const void *object)
4535 {
4536     struct folio *folio;
4537
4538     if (unlikely(object == ZERO_SIZE_PTR))
4539         return 0;
4540
4541     folio = virt_to_folio(object);
4542
4543     if (unlikely(!folio_test_slab(folio)))
4544         return folio_size(folio);
4545
4546     return slab_ksize(folio_slab(folio)->slab_cache);
4547 }
4548 EXPORT_SYMBOL(__ksize);
4549
4550 void kfree(const void *x)
4551 {
4552     struct folio *folio;
4553     struct slab *slab;
4554     void *object = (void *)x;
4555
4556     trace_kfree(_RET_IP_, x);
4557
4558     if (unlikely(ZERO_OR_NULL_PTR(x)))
4559         return;
4560
4561     folio = virt_to_folio(x);
4562     if (unlikely(!folio_test_slab(folio))) {
4563         free_large_kmalloc(folio, object);
4564         return;
4565     }
4566     slab = folio_slab(folio);
4567     slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
4568 }
4569 EXPORT_SYMBOL(kfree);
4570
4571 #define SHRINK_PROMOTE_MAX 32
4572
4573 /*
4574  * kmem_cache_shrink discards empty slabs and promotes the slabs filled
4575  * up most to the head of the partial lists. New allocations will then
4576  * fill those up and thus they can be removed from the partial lists.
4577  *
4578  * The slabs with the least items are placed last. This results in them
4579  * being allocated from last increasing the chance that the last objects
4580  * are freed in them.
4581  */
4582 static int __kmem_cache_do_shrink(struct kmem_cache *s)
4583 {
4584     int node;
4585     int i;
4586     struct kmem_cache_node *n;
4587     struct slab *slab;
4588     struct slab *t;
4589     struct list_head discard;
4590     struct list_head promote[SHRINK_PROMOTE_MAX];
4591     unsigned long flags;
4592     int ret = 0;
4593
4594     for_each_kmem_cache_node(s, node, n) {
4595         INIT_LIST_HEAD(&discard);
4596         for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
4597             INIT_LIST_HEAD(promote + i);
4598
4599         spin_lock_irqsave(&n->list_lock, flags);
4600
4601         /*
4602          * Build lists of slabs to discard or promote.
4603          *
4604          * Note that concurrent frees may occur while we hold the
4605          * list_lock. slab->inuse here is the upper limit.
4606          */
4607         list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
4608             int free = slab->objects - slab->inuse;
4609
4610             /* Do not reread slab->inuse */
4611             barrier();
4612
4613             /* We do not keep full slabs on the list */
4614             BUG_ON(free <= 0);
4615
4616             if (free == slab->objects) {
4617                 list_move(&slab->slab_list, &discard);
4618                 n->nr_partial--;
4619             } else if (free <= SHRINK_PROMOTE_MAX)
4620                 list_move(&slab->slab_list, promote + free - 1);
4621         }
4622
4623         /*
4624          * Promote the slabs filled up most to the head of the
4625          * partial list.
4626          */
4627         for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
4628             list_splice(promote + i, &n->partial);
4629
4630         spin_unlock_irqrestore(&n->list_lock, flags);
4631
4632         /* Release empty slabs */
4633         list_for_each_entry_safe(slab, t, &discard, slab_list)
4634             discard_slab(s, slab);
4635
4636         if (slabs_node(s, node))
4637             ret = 1;
4638     }
4639
4640     return ret;
4641 }
4642
4643 int __kmem_cache_shrink(struct kmem_cache *s)
4644 {
4645     flush_all(s);
4646     return __kmem_cache_do_shrink(s);
4647 }
4648
4649 static int slab_mem_going_offline_callback(void *arg)
4650 {
4651     struct kmem_cache *s;
4652
4653     mutex_lock(&slab_mutex);
4654     list_for_each_entry(s, &slab_caches, list) {
4655         flush_all_cpus_locked(s);
4656         __kmem_cache_do_shrink(s);
4657     }
4658     mutex_unlock(&slab_mutex);
4659
4660     return 0;
4661 }
4662
4663 static void slab_mem_offline_callback(void *arg)
4664 {
4665     struct memory_notify *marg = arg;
4666     int offline_node;
4667
4668     offline_node = marg->status_change_nid_normal;
4669
4670     /*
4671      * If the node still has available memory. we need kmem_cache_node
4672      * for it yet.
4673      */
4674     if (offline_node < 0)
4675         return;
4676
4677     mutex_lock(&slab_mutex);
4678     node_clear(offline_node, slab_nodes);
4679     /*
4680      * We no longer free kmem_cache_node structures here, as it would be
4681      * racy with all get_node() users, and infeasible to protect them with
4682      * slab_mutex.
4683      */
4684     mutex_unlock(&slab_mutex);
4685 }
4686
4687 static int slab_mem_going_online_callback(void *arg)
4688 {
4689     struct kmem_cache_node *n;
4690     struct kmem_cache *s;
4691     struct memory_notify *marg = arg;
4692     int nid = marg->status_change_nid_normal;
4693     int ret = 0;
4694
4695     /*
4696      * If the node's memory is already available, then kmem_cache_node is
4697      * already created. Nothing to do.
4698      */
4699     if (nid < 0)
4700         return 0;
4701
4702     /*
4703      * We are bringing a node online. No memory is available yet. We must
4704      * allocate a kmem_cache_node structure in order to bring the node
4705      * online.
4706      */
4707     mutex_lock(&slab_mutex);
4708     list_for_each_entry(s, &slab_caches, list) {
4709         /*
4710          * The structure may already exist if the node was previously
4711          * onlined and offlined.
4712          */
4713         if (get_node(s, nid))
4714             continue;
4715         /*
4716          * XXX: kmem_cache_alloc_node will fallback to other nodes
4717          *      since memory is not yet available from the node that
4718          *      is brought up.
4719          */
4720         n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
4721         if (!n) {
4722             ret = -ENOMEM;
4723             goto out;
4724         }
4725         init_kmem_cache_node(n);
4726         s->node[nid] = n;
4727     }
4728     /*
4729      * Any cache created after this point will also have kmem_cache_node
4730      * initialized for the new node.
4731      */
4732     node_set(nid, slab_nodes);
4733 out:
4734     mutex_unlock(&slab_mutex);
4735     return ret;
4736 }
4737
4738 static int slab_memory_callback(struct notifier_block *self,
4739                 unsigned long action, void *arg)
4740 {
4741     int ret = 0;
4742
4743     switch (action) {
4744     case MEM_GOING_ONLINE:
4745         ret = slab_mem_going_online_callback(arg);
4746         break;
4747     case MEM_GOING_OFFLINE:
4748         ret = slab_mem_going_offline_callback(arg);
4749         break;
4750     case MEM_OFFLINE:
4751     case MEM_CANCEL_ONLINE:
4752         slab_mem_offline_callback(arg);
4753         break;
4754     case MEM_ONLINE:
4755     case MEM_CANCEL_OFFLINE:
4756         break;
4757     }
4758     if (ret)
4759         ret = notifier_from_errno(ret);
4760     else
4761         ret = NOTIFY_OK;
4762     return ret;
4763 }
4764
4765 static struct notifier_block slab_memory_callback_nb = {
4766     .notifier_call = slab_memory_callback,
4767     .priority = SLAB_CALLBACK_PRI,
4768 };
4769
4770 /********************************************************************
4771  *          Basic setup of slabs
4772  *******************************************************************/
4773
4774 /*
4775  * Used for early kmem_cache structures that were allocated using
4776  * the page allocator. Allocate them properly then fix up the pointers
4777  * that may be pointing to the wrong kmem_cache structure.
4778  */
4779
4780 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
4781 {
4782     int node;
4783     struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
4784     struct kmem_cache_node *n;
4785
4786     memcpy(s, static_cache, kmem_cache->object_size);
4787
4788     /*
4789      * This runs very early, and only the boot processor is supposed to be
4790      * up.  Even if it weren't true, IRQs are not up so we couldn't fire
4791      * IPIs around.
4792      */
4793     __flush_cpu_slab(s, smp_processor_id());
4794     for_each_kmem_cache_node(s, node, n) {
4795         struct slab *p;
4796
4797         list_for_each_entry(p, &n->partial, slab_list)
4798             p->slab_cache = s;
4799
4800 #ifdef CONFIG_SLUB_DEBUG
4801         list_for_each_entry(p, &n->full, slab_list)
4802             p->slab_cache = s;
4803 #endif
4804     }
4805     list_add(&s->list, &slab_caches);
4806     return s;
4807 }
4808
4809 void __init kmem_cache_init(void)
4810 {
4811     static __initdata struct kmem_cache boot_kmem_cache,
4812         boot_kmem_cache_node;
4813     int node;
4814
4815     if (debug_guardpage_minorder())
4816         slub_max_order = 0;
4817
4818     /* Print slub debugging pointers without hashing */
4819     if (__slub_debug_enabled())
4820         no_hash_pointers_enable(NULL);
4821
4822     kmem_cache_node = &boot_kmem_cache_node;
4823     kmem_cache = &boot_kmem_cache;
4824
4825     /*
4826      * Initialize the nodemask for which we will allocate per node
4827      * structures. Here we don't need taking slab_mutex yet.
4828      */
4829     for_each_node_state(node, N_NORMAL_MEMORY)
4830         node_set(node, slab_nodes);
4831
4832     create_boot_cache(kmem_cache_node, "kmem_cache_node",
4833         sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
4834
4835     register_hotmemory_notifier(&slab_memory_callback_nb);
4836
4837     /* Able to allocate the per node structures */
4838     slab_state = PARTIAL;
4839
4840     create_boot_cache(kmem_cache, "kmem_cache",
4841             offsetof(struct kmem_cache, node) +
4842                 nr_node_ids * sizeof(struct kmem_cache_node *),
4843                SLAB_HWCACHE_ALIGN, 0, 0);
4844
4845     kmem_cache = bootstrap(&boot_kmem_cache);
4846     kmem_cache_node = bootstrap(&boot_kmem_cache_node);
4847
4848     /* Now we can use the kmem_cache to allocate kmalloc slabs */
4849     setup_kmalloc_cache_index_table();
4850     create_kmalloc_caches(0);
4851
4852     /* Setup random freelists for each cache */
4853     init_freelist_randomization();
4854
4855     cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
4856                   slub_cpu_dead);
4857
4858     pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
4859         cache_line_size(),
4860         slub_min_order, slub_max_order, slub_min_objects,
4861         nr_cpu_ids, nr_node_ids);
4862 }
4863
4864 void __init kmem_cache_init_late(void)
4865 {
4866     flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
4867     WARN_ON(!flushwq);
4868 }
4869
4870 struct kmem_cache *
4871 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
4872            slab_flags_t flags, void (*ctor)(void *))
4873 {
4874     struct kmem_cache *s;
4875
4876     s = find_mergeable(size, align, flags, name, ctor);
4877     if (s) {
4878         if (sysfs_slab_alias(s, name))
4879             return NULL;
4880
4881         s->refcount++;
4882
4883         /*
4884          * Adjust the object sizes so that we clear
4885          * the complete object on kzalloc.
4886          */
4887         s->object_size = max(s->object_size, size);
4888         s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
4889     }
4890
4891     return s;
4892 }
4893
4894 int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4895 {
4896     int err;
4897
4898     err = kmem_cache_open(s, flags);
4899     if (err)
4900         return err;
4901
4902     /* Mutex is not taken during early boot */
4903     if (slab_state <= UP)
4904         return 0;
4905
4906     err = sysfs_slab_add(s);
4907     if (err) {
4908         __kmem_cache_release(s);
4909         return err;
4910     }
4911
4912     if (s->flags & SLAB_STORE_USER)
4913         debugfs_slab_add(s);
4914
4915     return 0;
4916 }
4917
4918 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4919 {
4920     struct kmem_cache *s;
4921     void *ret;
4922
4923     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4924         return kmalloc_large(size, gfpflags);
4925
4926     s = kmalloc_slab(size, gfpflags);
4927
4928     if (unlikely(ZERO_OR_NULL_PTR(s)))
4929         return s;
4930
4931     ret = slab_alloc(s, NULL, gfpflags, caller, size);
4932
4933     /* Honor the call site pointer we received. */
4934     trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
4935
4936     ret = kasan_kmalloc(s, ret, size, gfpflags);
4937
4938     return ret;
4939 }
4940 EXPORT_SYMBOL(__kmalloc_track_caller);
4941
4942 #ifdef CONFIG_NUMA
4943 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4944                     int node, unsigned long caller)
4945 {
4946     struct kmem_cache *s;
4947     void *ret;
4948
4949     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4950         ret = kmalloc_large_node(size, gfpflags, node);
4951
4952         trace_kmalloc_node(caller, ret, NULL,
4953                    size, PAGE_SIZE << get_order(size),
4954                    gfpflags, node);
4955
4956         return ret;
4957     }
4958
4959     s = kmalloc_slab(size, gfpflags);
4960
4961     if (unlikely(ZERO_OR_NULL_PTR(s)))
4962         return s;
4963
4964     ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
4965
4966     /* Honor the call site pointer we received. */
4967     trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
4968
4969     ret = kasan_kmalloc(s, ret, size, gfpflags);
4970
4971     return ret;
4972 }
4973 EXPORT_SYMBOL(__kmalloc_node_track_caller);
4974 #endif
4975
4976 #ifdef CONFIG_SYSFS
4977 static int count_inuse(struct slab *slab)
4978 {
4979     return slab->inuse;
4980 }
4981
4982 static int count_total(struct slab *slab)
4983 {
4984     return slab->objects;
4985 }
4986 #endif
4987
4988 #ifdef CONFIG_SLUB_DEBUG
4989 static void validate_slab(struct kmem_cache *s, struct slab *slab,
4990               unsigned long *obj_map)
4991 {
4992     void *p;
4993     void *addr = slab_address(slab);
4994     unsigned long flags;
4995
4996     slab_lock(slab, &flags);
4997
4998     if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
4999         goto unlock;
5000
5001     /* Now we know that a valid freelist exists */
5002     __fill_map(obj_map, s, slab);
5003     for_each_object(p, s, addr, slab->objects) {
5004         u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
5005              SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
5006
5007         if (!check_object(s, slab, p, val))
5008             break;
5009     }
5010 unlock:
5011     slab_unlock(slab, &flags);
5012 }
5013
5014 static int validate_slab_node(struct kmem_cache *s,
5015         struct kmem_cache_node *n, unsigned long *obj_map)
5016 {
5017     unsigned long count = 0;
5018     struct slab *slab;
5019     unsigned long flags;
5020
5021     spin_lock_irqsave(&n->list_lock, flags);
5022
5023     list_for_each_entry(slab, &n->partial, slab_list) {
5024         validate_slab(s, slab, obj_map);
5025         count++;
5026     }
5027     if (count != n->nr_partial) {
5028         pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
5029                s->name, count, n->nr_partial);
5030         slab_add_kunit_errors();
5031     }
5032
5033     if (!(s->flags & SLAB_STORE_USER))
5034         goto out;
5035
5036     list_for_each_entry(slab, &n->full, slab_list) {
5037         validate_slab(s, slab, obj_map);
5038         count++;
5039     }
5040     if (count != atomic_long_read(&n->nr_slabs)) {
5041         pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
5042                s->name, count, atomic_long_read(&n->nr_slabs));
5043         slab_add_kunit_errors();
5044     }
5045
5046 out:
5047     spin_unlock_irqrestore(&n->list_lock, flags);
5048     return count;
5049 }
5050
5051 long validate_slab_cache(struct kmem_cache *s)
5052 {
5053     int node;
5054     unsigned long count = 0;
5055     struct kmem_cache_node *n;
5056     unsigned long *obj_map;
5057
5058     obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
5059     if (!obj_map)
5060         return -ENOMEM;
5061
5062     flush_all(s);
5063     for_each_kmem_cache_node(s, node, n)
5064         count += validate_slab_node(s, n, obj_map);
5065
5066     bitmap_free(obj_map);
5067
5068     return count;
5069 }
5070 EXPORT_SYMBOL(validate_slab_cache);
5071
5072 #ifdef CONFIG_DEBUG_FS
5073 /*
5074  * Generate lists of code addresses where slabcache objects are allocated
5075  * and freed.
5076  */
5077
5078 struct location {
5079     depot_stack_handle_t handle;
5080     unsigned long count;
5081     unsigned long addr;
5082     long long sum_time;
5083     long min_time;
5084     long max_time;
5085     long min_pid;
5086     long max_pid;
5087     DECLARE_BITMAP(cpus, NR_CPUS);
5088     nodemask_t nodes;
5089 };
5090
5091 struct loc_track {
5092     unsigned long max;
5093     unsigned long count;
5094     struct location *loc;
5095     loff_t idx;
5096 };
5097
5098 static struct dentry *slab_debugfs_root;
5099
5100 static void free_loc_track(struct loc_track *t)
5101 {
5102     if (t->max)
5103         free_pages((unsigned long)t->loc,
5104             get_order(sizeof(struct location) * t->max));
5105 }
5106
5107 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
5108 {
5109     struct location *l;
5110     int order;
5111
5112     order = get_order(sizeof(struct location) * max);
5113
5114     l = (void *)__get_free_pages(flags, order);
5115     if (!l)
5116         return 0;
5117
5118     if (t->count) {
5119         memcpy(l, t->loc, sizeof(struct location) * t->count);
5120         free_loc_track(t);
5121     }
5122     t->max = max;
5123     t->loc = l;
5124     return 1;
5125 }
5126
5127 static int add_location(struct loc_track *t, struct kmem_cache *s,
5128                 const struct track *track)
5129 {
5130     long start, end, pos;
5131     struct location *l;
5132     unsigned long caddr, chandle;
5133     unsigned long age = jiffies - track->when;
5134     depot_stack_handle_t handle = 0;
5135
5136 #ifdef CONFIG_STACKDEPOT
5137     handle = READ_ONCE(track->handle);
5138 #endif
5139     start = -1;
5140     end = t->count;
5141
5142     for ( ; ; ) {
5143         pos = start + (end - start + 1) / 2;
5144
5145         /*
5146          * There is nothing at "end". If we end up there
5147          * we need to add something to before end.
5148          */
5149         if (pos == end)
5150             break;
5151
5152         caddr = t->loc[pos].addr;
5153         chandle = t->loc[pos].handle;
5154         if ((track->addr == caddr) && (handle == chandle)) {
5155
5156             l = &t->loc[pos];
5157             l->count++;
5158             if (track->when) {
5159                 l->sum_time += age;
5160                 if (age < l->min_time)
5161                     l->min_time = age;
5162                 if (age > l->max_time)
5163                     l->max_time = age;
5164
5165                 if (track->pid < l->min_pid)
5166                     l->min_pid = track->pid;
5167                 if (track->pid > l->max_pid)
5168                     l->max_pid = track->pid;
5169
5170                 cpumask_set_cpu(track->cpu,
5171                         to_cpumask(l->cpus));
5172             }
5173             node_set(page_to_nid(virt_to_page(track)), l->nodes);
5174             return 1;
5175         }
5176
5177         if (track->addr < caddr)
5178             end = pos;
5179         else if (track->addr == caddr && handle < chandle)
5180             end = pos;
5181         else
5182             start = pos;
5183     }
5184
5185     /*
5186      * Not found. Insert new tracking element.
5187      */
5188     if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
5189         return 0;
5190
5191     l = t->loc + pos;
5192     if (pos < t->count)
5193         memmove(l + 1, l,
5194             (t->count - pos) * sizeof(struct location));
5195     t->count++;
5196     l->count = 1;
5197     l->addr = track->addr;
5198     l->sum_time = age;
5199     l->min_time = age;
5200     l->max_time = age;
5201     l->min_pid = track->pid;
5202     l->max_pid = track->pid;
5203     l->handle = handle;
5204     cpumask_clear(to_cpumask(l->cpus));
5205     cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
5206     nodes_clear(l->nodes);
5207     node_set(page_to_nid(virt_to_page(track)), l->nodes);
5208     return 1;
5209 }
5210
5211 static void process_slab(struct loc_track *t, struct kmem_cache *s,
5212         struct slab *slab, enum track_item alloc,
5213         unsigned long *obj_map)
5214 {
5215     void *addr = slab_address(slab);
5216     void *p;
5217
5218     __fill_map(obj_map, s, slab);
5219
5220     for_each_object(p, s, addr, slab->objects)
5221         if (!test_bit(__obj_to_index(s, addr, p), obj_map))
5222             add_location(t, s, get_track(s, p, alloc));
5223 }
5224 #endif  /* CONFIG_DEBUG_FS   */
5225 #endif  /* CONFIG_SLUB_DEBUG */
5226
5227 #ifdef CONFIG_SYSFS
5228 enum slab_stat_type {
5229     SL_ALL,         /* All slabs */
5230     SL_PARTIAL,     /* Only partially allocated slabs */
5231     SL_CPU,         /* Only slabs used for cpu caches */
5232     SL_OBJECTS,     /* Determine allocated objects not slabs */
5233     SL_TOTAL        /* Determine object capacity not slabs */
5234 };
5235
5236 #define SO_ALL      (1 << SL_ALL)
5237 #define SO_PARTIAL  (1 << SL_PARTIAL)
5238 #define SO_CPU      (1 << SL_CPU)
5239 #define SO_OBJECTS  (1 << SL_OBJECTS)
5240 #define SO_TOTAL    (1 << SL_TOTAL)
5241
5242 static ssize_t show_slab_objects(struct kmem_cache *s,
5243                  char *buf, unsigned long flags)
5244 {
5245     unsigned long total = 0;
5246     int node;
5247     int x;
5248     unsigned long *nodes;
5249     int len = 0;
5250
5251     nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
5252     if (!nodes)
5253         return -ENOMEM;
5254
5255     if (flags & SO_CPU) {
5256         int cpu;
5257
5258         for_each_possible_cpu(cpu) {
5259             struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
5260                                    cpu);
5261             int node;
5262             struct slab *slab;
5263
5264             slab = READ_ONCE(c->slab);
5265             if (!slab)
5266                 continue;
5267
5268             node = slab_nid(slab);
5269             if (flags & SO_TOTAL)
5270                 x = slab->objects;
5271             else if (flags & SO_OBJECTS)
5272                 x = slab->inuse;
5273             else
5274                 x = 1;
5275
5276             total += x;
5277             nodes[node] += x;
5278
5279 #ifdef CONFIG_SLUB_CPU_PARTIAL
5280             slab = slub_percpu_partial_read_once(c);
5281             if (slab) {
5282                 node = slab_nid(slab);
5283                 if (flags & SO_TOTAL)
5284                     WARN_ON_ONCE(1);
5285                 else if (flags & SO_OBJECTS)
5286                     WARN_ON_ONCE(1);
5287                 else
5288                     x = slab->slabs;
5289                 total += x;
5290                 nodes[node] += x;
5291             }
5292 #endif
5293         }
5294     }
5295
5296     /*
5297      * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
5298      * already held which will conflict with an existing lock order:
5299      *
5300      * mem_hotplug_lock->slab_mutex->kernfs_mutex
5301      *
5302      * We don't really need mem_hotplug_lock (to hold off
5303      * slab_mem_going_offline_callback) here because slab's memory hot
5304      * unplug code doesn't destroy the kmem_cache->node[] data.
5305      */
5306
5307 #ifdef CONFIG_SLUB_DEBUG
5308     if (flags & SO_ALL) {
5309         struct kmem_cache_node *n;
5310
5311         for_each_kmem_cache_node(s, node, n) {
5312
5313             if (flags & SO_TOTAL)
5314                 x = atomic_long_read(&n->total_objects);
5315             else if (flags & SO_OBJECTS)
5316                 x = atomic_long_read(&n->total_objects) -
5317                     count_partial(n, count_free);
5318             else
5319                 x = atomic_long_read(&n->nr_slabs);
5320             total += x;
5321             nodes[node] += x;
5322         }
5323
5324     } else
5325 #endif
5326     if (flags & SO_PARTIAL) {
5327         struct kmem_cache_node *n;
5328
5329         for_each_kmem_cache_node(s, node, n) {
5330             if (flags & SO_TOTAL)
5331                 x = count_partial(n, count_total);
5332             else if (flags & SO_OBJECTS)
5333                 x = count_partial(n, count_inuse);
5334             else
5335                 x = n->nr_partial;
5336             total += x;
5337             nodes[node] += x;
5338         }
5339     }
5340
5341     len += sysfs_emit_at(buf, len, "%lu", total);
5342 #ifdef CONFIG_NUMA
5343     for (node = 0; node < nr_node_ids; node++) {
5344         if (nodes[node])
5345             len += sysfs_emit_at(buf, len, " N%d=%lu",
5346                          node, nodes[node]);
5347     }
5348 #endif
5349     len += sysfs_emit_at(buf, len, "\n");
5350     kfree(nodes);
5351
5352     return len;
5353 }
5354
5355 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
5356 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
5357
5358 struct slab_attribute {
5359     struct attribute attr;
5360     ssize_t (*show)(struct kmem_cache *s, char *buf);
5361     ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
5362 };
5363
5364 #define SLAB_ATTR_RO(_name) \
5365     static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
5366
5367 #define SLAB_ATTR(_name) \
5368     static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
5369
5370 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
5371 {
5372     return sysfs_emit(buf, "%u\n", s->size);
5373 }
5374 SLAB_ATTR_RO(slab_size);
5375
5376 static ssize_t align_show(struct kmem_cache *s, char *buf)
5377 {
5378     return sysfs_emit(buf, "%u\n", s->align);
5379 }
5380 SLAB_ATTR_RO(align);
5381
5382 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
5383 {
5384     return sysfs_emit(buf, "%u\n", s->object_size);
5385 }
5386 SLAB_ATTR_RO(object_size);
5387
5388 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
5389 {
5390     return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
5391 }
5392 SLAB_ATTR_RO(objs_per_slab);
5393
5394 static ssize_t order_show(struct kmem_cache *s, char *buf)
5395 {
5396     return sysfs_emit(buf, "%u\n", oo_order(s->oo));
5397 }
5398 SLAB_ATTR_RO(order);
5399
5400 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
5401 {
5402     return sysfs_emit(buf, "%lu\n", s->min_partial);
5403 }
5404
5405 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
5406                  size_t length)
5407 {
5408     unsigned long min;
5409     int err;
5410
5411     err = kstrtoul(buf, 10, &min);
5412     if (err)
5413         return err;
5414
5415     s->min_partial = min;
5416     return length;
5417 }
5418 SLAB_ATTR(min_partial);
5419
5420 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
5421 {
5422     unsigned int nr_partial = 0;
5423 #ifdef CONFIG_SLUB_CPU_PARTIAL
5424     nr_partial = s->cpu_partial;
5425 #endif
5426
5427     return sysfs_emit(buf, "%u\n", nr_partial);
5428 }
5429
5430 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
5431                  size_t length)
5432 {
5433     unsigned int objects;
5434     int err;
5435
5436     err = kstrtouint(buf, 10, &objects);
5437     if (err)
5438         return err;
5439     if (objects && !kmem_cache_has_cpu_partial(s))
5440         return -EINVAL;
5441
5442     slub_set_cpu_partial(s, objects);
5443     flush_all(s);
5444     return length;
5445 }
5446 SLAB_ATTR(cpu_partial);
5447
5448 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
5449 {
5450     if (!s->ctor)
5451         return 0;
5452     return sysfs_emit(buf, "%pS\n", s->ctor);
5453 }
5454 SLAB_ATTR_RO(ctor);
5455
5456 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
5457 {
5458     return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
5459 }
5460 SLAB_ATTR_RO(aliases);
5461
5462 static ssize_t partial_show(struct kmem_cache *s, char *buf)
5463 {
5464     return show_slab_objects(s, buf, SO_PARTIAL);
5465 }
5466 SLAB_ATTR_RO(partial);
5467
5468 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
5469 {
5470     return show_slab_objects(s, buf, SO_CPU);
5471 }
5472 SLAB_ATTR_RO(cpu_slabs);
5473
5474 static ssize_t objects_show(struct kmem_cache *s, char *buf)
5475 {
5476     return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
5477 }
5478 SLAB_ATTR_RO(objects);
5479
5480 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
5481 {
5482     return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
5483 }
5484 SLAB_ATTR_RO(objects_partial);
5485
5486 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
5487 {
5488     int objects = 0;
5489     int slabs = 0;
5490     int cpu __maybe_unused;
5491     int len = 0;
5492
5493 #ifdef CONFIG_SLUB_CPU_PARTIAL
5494     for_each_online_cpu(cpu) {
5495         struct slab *slab;
5496
5497         slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5498
5499         if (slab)
5500             slabs += slab->slabs;
5501     }
5502 #endif
5503
5504     /* Approximate half-full slabs, see slub_set_cpu_partial() */
5505     objects = (slabs * oo_objects(s->oo)) / 2;
5506     len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
5507
5508 #if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP)
5509     for_each_online_cpu(cpu) {
5510         struct slab *slab;
5511
5512         slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5513         if (slab) {
5514             slabs = READ_ONCE(slab->slabs);
5515             objects = (slabs * oo_objects(s->oo)) / 2;
5516             len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
5517                          cpu, objects, slabs);
5518         }
5519     }
5520 #endif
5521     len += sysfs_emit_at(buf, len, "\n");
5522
5523     return len;
5524 }
5525 SLAB_ATTR_RO(slabs_cpu_partial);
5526
5527 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
5528 {
5529     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
5530 }
5531 SLAB_ATTR_RO(reclaim_account);
5532
5533 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
5534 {
5535     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
5536 }
5537 SLAB_ATTR_RO(hwcache_align);
5538
5539 #ifdef CONFIG_ZONE_DMA
5540 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
5541 {
5542     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
5543 }
5544 SLAB_ATTR_RO(cache_dma);
5545 #endif
5546
5547 static ssize_t usersize_show(struct kmem_cache *s, char *buf)
5548 {
5549     return sysfs_emit(buf, "%u\n", s->usersize);
5550 }
5551 SLAB_ATTR_RO(usersize);
5552
5553 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5554 {
5555     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5556 }
5557 SLAB_ATTR_RO(destroy_by_rcu);
5558
5559 #ifdef CONFIG_SLUB_DEBUG
5560 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
5561 {
5562     return show_slab_objects(s, buf, SO_ALL);
5563 }
5564 SLAB_ATTR_RO(slabs);
5565
5566 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
5567 {
5568     return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
5569 }
5570 SLAB_ATTR_RO(total_objects);
5571
5572 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
5573 {
5574     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
5575 }
5576 SLAB_ATTR_RO(sanity_checks);
5577
5578 static ssize_t trace_show(struct kmem_cache *s, char *buf)
5579 {
5580     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
5581 }
5582 SLAB_ATTR_RO(trace);
5583
5584 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
5585 {
5586     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
5587 }
5588
5589 SLAB_ATTR_RO(red_zone);
5590
5591 static ssize_t poison_show(struct kmem_cache *s, char *buf)
5592 {
5593     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
5594 }
5595
5596 SLAB_ATTR_RO(poison);
5597
5598 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
5599 {
5600     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
5601 }
5602
5603 SLAB_ATTR_RO(store_user);
5604
5605 static ssize_t validate_show(struct kmem_cache *s, char *buf)
5606 {
5607     return 0;
5608 }
5609
5610 static ssize_t validate_store(struct kmem_cache *s,
5611             const char *buf, size_t length)
5612 {
5613     int ret = -EINVAL;
5614
5615     if (buf[0] == '1') {
5616         ret = validate_slab_cache(s);
5617         if (ret >= 0)
5618             ret = length;
5619     }
5620     return ret;
5621 }
5622 SLAB_ATTR(validate);
5623
5624 #endif /* CONFIG_SLUB_DEBUG */
5625
5626 #ifdef CONFIG_FAILSLAB
5627 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
5628 {
5629     return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
5630 }
5631 SLAB_ATTR_RO(failslab);
5632 #endif
5633
5634 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
5635 {
5636     return 0;
5637 }
5638
5639 static ssize_t shrink_store(struct kmem_cache *s,
5640             const char *buf, size_t length)
5641 {
5642     if (buf[0] == '1')
5643         kmem_cache_shrink(s);
5644     else
5645         return -EINVAL;
5646     return length;
5647 }
5648 SLAB_ATTR(shrink);
5649
5650 #ifdef CONFIG_NUMA
5651 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
5652 {
5653     return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
5654 }
5655
5656 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
5657                 const char *buf, size_t length)
5658 {
5659     unsigned int ratio;
5660     int err;
5661
5662     err = kstrtouint(buf, 10, &ratio);
5663     if (err)
5664         return err;
5665     if (ratio > 100)
5666         return -ERANGE;
5667
5668     s->remote_node_defrag_ratio = ratio * 10;
5669
5670     return length;
5671 }
5672 SLAB_ATTR(remote_node_defrag_ratio);
5673 #endif
5674
5675 #ifdef CONFIG_SLUB_STATS
5676 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
5677 {
5678     unsigned long sum  = 0;
5679     int cpu;
5680     int len = 0;
5681     int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
5682
5683     if (!data)
5684         return -ENOMEM;
5685
5686     for_each_online_cpu(cpu) {
5687         unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
5688
5689         data[cpu] = x;
5690         sum += x;
5691     }
5692
5693     len += sysfs_emit_at(buf, len, "%lu", sum);
5694
5695 #ifdef CONFIG_SMP
5696     for_each_online_cpu(cpu) {
5697         if (data[cpu])
5698             len += sysfs_emit_at(buf, len, " C%d=%u",
5699                          cpu, data[cpu]);
5700     }
5701 #endif
5702     kfree(data);
5703     len += sysfs_emit_at(buf, len, "\n");
5704
5705     return len;
5706 }
5707
5708 static void clear_stat(struct kmem_cache *s, enum stat_item si)
5709 {
5710     int cpu;
5711
5712     for_each_online_cpu(cpu)
5713         per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
5714 }
5715
5716 #define STAT_ATTR(si, text)                     \
5717 static ssize_t text##_show(struct kmem_cache *s, char *buf) \
5718 {                               \
5719     return show_stat(s, buf, si);               \
5720 }                               \
5721 static ssize_t text##_store(struct kmem_cache *s,       \
5722                 const char *buf, size_t length) \
5723 {                               \
5724     if (buf[0] != '0')                  \
5725         return -EINVAL;                 \
5726     clear_stat(s, si);                  \
5727     return length;                      \
5728 }                               \
5729 SLAB_ATTR(text);                        \
5730
5731 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
5732 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
5733 STAT_ATTR(FREE_FASTPATH, free_fastpath);
5734 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
5735 STAT_ATTR(FREE_FROZEN, free_frozen);
5736 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
5737 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
5738 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
5739 STAT_ATTR(ALLOC_SLAB, alloc_slab);
5740 STAT_ATTR(ALLOC_REFILL, alloc_refill);
5741 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
5742 STAT_ATTR(FREE_SLAB, free_slab);
5743 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
5744 STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
5745 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
5746 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
5747 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
5748 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5749 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
5750 STAT_ATTR(ORDER_FALLBACK, order_fallback);
5751 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5752 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5753 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5754 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
5755 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5756 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5757 #endif  /* CONFIG_SLUB_STATS */
5758
5759 static struct attribute *slab_attrs[] = {
5760     &slab_size_attr.attr,
5761     &object_size_attr.attr,
5762     &objs_per_slab_attr.attr,
5763     &order_attr.attr,
5764     &min_partial_attr.attr,
5765     &cpu_partial_attr.attr,
5766     &objects_attr.attr,
5767     &objects_partial_attr.attr,
5768     &partial_attr.attr,
5769     &cpu_slabs_attr.attr,
5770     &ctor_attr.attr,
5771     &aliases_attr.attr,
5772     &align_attr.attr,
5773     &hwcache_align_attr.attr,
5774     &reclaim_account_attr.attr,
5775     &destroy_by_rcu_attr.attr,
5776     &shrink_attr.attr,
5777     &slabs_cpu_partial_attr.attr,
5778 #ifdef CONFIG_SLUB_DEBUG
5779     &total_objects_attr.attr,
5780     &slabs_attr.attr,
5781     &sanity_checks_attr.attr,
5782     &trace_attr.attr,
5783     &red_zone_attr.attr,
5784     &poison_attr.attr,
5785     &store_user_attr.attr,
5786     &validate_attr.attr,
5787 #endif
5788 #ifdef CONFIG_ZONE_DMA
5789     &cache_dma_attr.attr,
5790 #endif
5791 #ifdef CONFIG_NUMA
5792     &remote_node_defrag_ratio_attr.attr,
5793 #endif
5794 #ifdef CONFIG_SLUB_STATS
5795     &alloc_fastpath_attr.attr,
5796     &alloc_slowpath_attr.attr,
5797     &free_fastpath_attr.attr,
5798     &free_slowpath_attr.attr,
5799     &free_frozen_attr.attr,
5800     &free_add_partial_attr.attr,
5801     &free_remove_partial_attr.attr,
5802     &alloc_from_partial_attr.attr,
5803     &alloc_slab_attr.attr,
5804     &alloc_refill_attr.attr,
5805     &alloc_node_mismatch_attr.attr,
5806     &free_slab_attr.attr,
5807     &cpuslab_flush_attr.attr,
5808     &deactivate_full_attr.attr,
5809     &deactivate_empty_attr.attr,
5810     &deactivate_to_head_attr.attr,
5811     &deactivate_to_tail_attr.attr,
5812     &deactivate_remote_frees_attr.attr,
5813     &deactivate_bypass_attr.attr,
5814     &order_fallback_attr.attr,
5815     &cmpxchg_double_fail_attr.attr,
5816     &cmpxchg_double_cpu_fail_attr.attr,
5817     &cpu_partial_alloc_attr.attr,
5818     &cpu_partial_free_attr.attr,
5819     &cpu_partial_node_attr.attr,
5820     &cpu_partial_drain_attr.attr,
5821 #endif
5822 #ifdef CONFIG_FAILSLAB
5823     &failslab_attr.attr,
5824 #endif
5825     &usersize_attr.attr,
5826
5827     NULL
5828 };
5829
5830 static const struct attribute_group slab_attr_group = {
5831     .attrs = slab_attrs,
5832 };
5833
5834 static ssize_t slab_attr_show(struct kobject *kobj,
5835                 struct attribute *attr,
5836                 char *buf)
5837 {
5838     struct slab_attribute *attribute;
5839     struct kmem_cache *s;
5840     int err;
5841
5842     attribute = to_slab_attr(attr);
5843     s = to_slab(kobj);
5844
5845     if (!attribute->show)
5846         return -EIO;
5847
5848     err = attribute->show(s, buf);
5849
5850     return err;
5851 }
5852
5853 static ssize_t slab_attr_store(struct kobject *kobj,
5854                 struct attribute *attr,
5855                 const char *buf, size_t len)
5856 {
5857     struct slab_attribute *attribute;
5858     struct kmem_cache *s;
5859     int err;
5860
5861     attribute = to_slab_attr(attr);
5862     s = to_slab(kobj);
5863
5864     if (!attribute->store)
5865         return -EIO;
5866
5867     err = attribute->store(s, buf, len);
5868     return err;
5869 }
5870
5871 static void kmem_cache_release(struct kobject *k)
5872 {
5873     slab_kmem_cache_release(to_slab(k));
5874 }
5875
5876 static const struct sysfs_ops slab_sysfs_ops = {
5877     .show = slab_attr_show,
5878     .store = slab_attr_store,
5879 };
5880
5881 static struct kobj_type slab_ktype = {
5882     .sysfs_ops = &slab_sysfs_ops,
5883     .release = kmem_cache_release,
5884 };
5885
5886 static struct kset *slab_kset;
5887
5888 static inline struct kset *cache_kset(struct kmem_cache *s)
5889 {
5890     return slab_kset;
5891 }
5892
5893 #define ID_STR_LENGTH 64
5894
5895 /* Create a unique string id for a slab cache:
5896  *
5897  * Format   :[flags-]size
5898  */
5899 static char *create_unique_id(struct kmem_cache *s)
5900 {
5901     char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5902     char *p = name;
5903
5904     if (!name)
5905         return ERR_PTR(-ENOMEM);
5906
5907     *p++ = ':';
5908     /*
5909      * First flags affecting slabcache operations. We will only
5910      * get here for aliasable slabs so we do not need to support
5911      * too many flags. The flags here must cover all flags that
5912      * are matched during merging to guarantee that the id is
5913      * unique.
5914      */
5915     if (s->flags & SLAB_CACHE_DMA)
5916         *p++ = 'd';
5917     if (s->flags & SLAB_CACHE_DMA32)
5918         *p++ = 'D';
5919     if (s->flags & SLAB_RECLAIM_ACCOUNT)
5920         *p++ = 'a';
5921     if (s->flags & SLAB_CONSISTENCY_CHECKS)
5922         *p++ = 'F';
5923     if (s->flags & SLAB_ACCOUNT)
5924         *p++ = 'A';
5925     if (p != name + 1)
5926         *p++ = '-';
5927     p += sprintf(p, "%07u", s->size);
5928
5929     BUG_ON(p > name + ID_STR_LENGTH - 1);
5930     return name;
5931 }
5932
5933 static int sysfs_slab_add(struct kmem_cache *s)
5934 {
5935     int err;
5936     const char *name;
5937     struct kset *kset = cache_kset(s);
5938     int unmergeable = slab_unmergeable(s);
5939
5940     if (!kset) {
5941         kobject_init(&s->kobj, &slab_ktype);
5942         return 0;
5943     }
5944
5945     if (!unmergeable && disable_higher_order_debug &&
5946             (slub_debug & DEBUG_METADATA_FLAGS))
5947         unmergeable = 1;
5948
5949     if (unmergeable) {
5950         /*
5951          * Slabcache can never be merged so we can use the name proper.
5952          * This is typically the case for debug situations. In that
5953          * case we can catch duplicate names easily.
5954          */
5955         sysfs_remove_link(&slab_kset->kobj, s->name);
5956         name = s->name;
5957     } else {
5958         /*
5959          * Create a unique name for the slab as a target
5960          * for the symlinks.
5961          */
5962         name = create_unique_id(s);
5963         if (IS_ERR(name))
5964             return PTR_ERR(name);
5965     }
5966
5967     s->kobj.kset = kset;
5968     err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5969     if (err)
5970         goto out;
5971
5972     err = sysfs_create_group(&s->kobj, &slab_attr_group);
5973     if (err)
5974         goto out_del_kobj;
5975
5976     if (!unmergeable) {
5977         /* Setup first alias */
5978         sysfs_slab_alias(s, s->name);
5979     }
5980 out:
5981     if (!unmergeable)
5982         kfree(name);
5983     return err;
5984 out_del_kobj:
5985     kobject_del(&s->kobj);
5986     goto out;
5987 }
5988
5989 void sysfs_slab_unlink(struct kmem_cache *s)
5990 {
5991     if (slab_state >= FULL)
5992         kobject_del(&s->kobj);
5993 }
5994
5995 void sysfs_slab_release(struct kmem_cache *s)
5996 {
5997     if (slab_state >= FULL)
5998         kobject_put(&s->kobj);
5999 }
6000
6001 /*
6002  * Need to buffer aliases during bootup until sysfs becomes
6003  * available lest we lose that information.
6004  */
6005 struct saved_alias {
6006     struct kmem_cache *s;
6007     const char *name;
6008     struct saved_alias *next;
6009 };
6010
6011 static struct saved_alias *alias_list;
6012
6013 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
6014 {
6015     struct saved_alias *al;
6016
6017     if (slab_state == FULL) {
6018         /*
6019          * If we have a leftover link then remove it.
6020          */
6021         sysfs_remove_link(&slab_kset->kobj, name);
6022         return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
6023     }
6024
6025     al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
6026     if (!al)
6027         return -ENOMEM;
6028
6029     al->s = s;
6030     al->name = name;
6031     al->next = alias_list;
6032     alias_list = al;
6033     return 0;
6034 }
6035
6036 static int __init slab_sysfs_init(void)
6037 {
6038     struct kmem_cache *s;
6039     int err;
6040
6041     mutex_lock(&slab_mutex);
6042
6043     slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
6044     if (!slab_kset) {
6045         mutex_unlock(&slab_mutex);
6046         pr_err("Cannot register slab subsystem.\n");
6047         return -ENOSYS;
6048     }
6049
6050     slab_state = FULL;
6051
6052     list_for_each_entry(s, &slab_caches, list) {
6053         err = sysfs_slab_add(s);
6054         if (err)
6055             pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
6056                    s->name);
6057     }
6058
6059     while (alias_list) {
6060         struct saved_alias *al = alias_list;
6061
6062         alias_list = alias_list->next;
6063         err = sysfs_slab_alias(al->s, al->name);
6064         if (err)
6065             pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
6066                    al->name);
6067         kfree(al);
6068     }
6069
6070     mutex_unlock(&slab_mutex);
6071     return 0;
6072 }
6073
6074 __initcall(slab_sysfs_init);
6075 #endif /* CONFIG_SYSFS */
6076
6077 #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
6078 static int slab_debugfs_show(struct seq_file *seq, void *v)
6079 {
6080     struct loc_track *t = seq->private;
6081     struct location *l;
6082     unsigned long idx;
6083
6084     idx = (unsigned long) t->idx;
6085     if (idx < t->count) {
6086         l = &t->loc[idx];
6087
6088         seq_printf(seq, "%7ld ", l->count);
6089
6090         if (l->addr)
6091             seq_printf(seq, "%pS", (void *)l->addr);
6092         else
6093             seq_puts(seq, "<not-available>");
6094
6095         if (l->sum_time != l->min_time) {
6096             seq_printf(seq, " age=%ld/%llu/%ld",
6097                 l->min_time, div_u64(l->sum_time, l->count),
6098                 l->max_time);
6099         } else
6100             seq_printf(seq, " age=%ld", l->min_time);
6101
6102         if (l->min_pid != l->max_pid)
6103             seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
6104         else
6105             seq_printf(seq, " pid=%ld",
6106                 l->min_pid);
6107
6108         if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
6109             seq_printf(seq, " cpus=%*pbl",
6110                  cpumask_pr_args(to_cpumask(l->cpus)));
6111
6112         if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
6113             seq_printf(seq, " nodes=%*pbl",
6114                  nodemask_pr_args(&l->nodes));
6115
6116 #ifdef CONFIG_STACKDEPOT
6117         {
6118             depot_stack_handle_t handle;
6119             unsigned long *entries;
6120             unsigned int nr_entries, j;
6121
6122             handle = READ_ONCE(l->handle);
6123             if (handle) {
6124                 nr_entries = stack_depot_fetch(handle, &entries);
6125                 seq_puts(seq, "\n");
6126                 for (j = 0; j < nr_entries; j++)
6127                     seq_printf(seq, "        %pS\n", (void *)entries[j]);
6128             }
6129         }
6130 #endif
6131         seq_puts(seq, "\n");
6132     }
6133
6134     if (!idx && !t->count)
6135         seq_puts(seq, "No data\n");
6136
6137     return 0;
6138 }
6139
6140 static void slab_debugfs_stop(struct seq_file *seq, void *v)
6141 {
6142 }
6143
6144 static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
6145 {
6146     struct loc_track *t = seq->private;
6147
6148     t->idx = ++(*ppos);
6149     if (*ppos <= t->count)
6150         return ppos;
6151
6152     return NULL;
6153 }
6154
6155 static int cmp_loc_by_count(const void *a, const void *b, const void *data)
6156 {
6157     struct location *loc1 = (struct location *)a;
6158     struct location *loc2 = (struct location *)b;
6159
6160     if (loc1->count > loc2->count)
6161         return -1;
6162     else
6163         return 1;
6164 }
6165
6166 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
6167 {
6168     struct loc_track *t = seq->private;
6169
6170     t->idx = *ppos;
6171     return ppos;
6172 }
6173
6174 static const struct seq_operations slab_debugfs_sops = {
6175     .start  = slab_debugfs_start,
6176     .next   = slab_debugfs_next,
6177     .stop   = slab_debugfs_stop,
6178     .show   = slab_debugfs_show,
6179 };
6180
6181 static int slab_debug_trace_open(struct inode *inode, struct file *filep)
6182 {
6183
6184     struct kmem_cache_node *n;
6185     enum track_item alloc;
6186     int node;
6187     struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
6188                         sizeof(struct loc_track));
6189     struct kmem_cache *s = file_inode(filep)->i_private;
6190     unsigned long *obj_map;
6191
6192     if (!t)
6193         return -ENOMEM;
6194
6195     obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
6196     if (!obj_map) {
6197         seq_release_private(inode, filep);
6198         return -ENOMEM;
6199     }
6200
6201     if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
6202         alloc = TRACK_ALLOC;
6203     else
6204         alloc = TRACK_FREE;
6205
6206     if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
6207         bitmap_free(obj_map);
6208         seq_release_private(inode, filep);
6209         return -ENOMEM;
6210     }
6211
6212     for_each_kmem_cache_node(s, node, n) {
6213         unsigned long flags;
6214         struct slab *slab;
6215
6216         if (!atomic_long_read(&n->nr_slabs))
6217             continue;
6218
6219         spin_lock_irqsave(&n->list_lock, flags);
6220         list_for_each_entry(slab, &n->partial, slab_list)
6221             process_slab(t, s, slab, alloc, obj_map);
6222         list_for_each_entry(slab, &n->full, slab_list)
6223             process_slab(t, s, slab, alloc, obj_map);
6224         spin_unlock_irqrestore(&n->list_lock, flags);
6225     }
6226
6227     /* Sort locations by count */
6228     sort_r(t->loc, t->count, sizeof(struct location),
6229         cmp_loc_by_count, NULL, NULL);
6230
6231     bitmap_free(obj_map);
6232     return 0;
6233 }
6234
6235 static int slab_debug_trace_release(struct inode *inode, struct file *file)
6236 {
6237     struct seq_file *seq = file->private_data;
6238     struct loc_track *t = seq->private;
6239
6240     free_loc_track(t);
6241     return seq_release_private(inode, file);
6242 }
6243
6244 static const struct file_operations slab_debugfs_fops = {
6245     .open    = slab_debug_trace_open,
6246     .read    = seq_read,
6247     .llseek  = seq_lseek,
6248     .release = slab_debug_trace_release,
6249 };
6250
6251 static void debugfs_slab_add(struct kmem_cache *s)
6252 {
6253     struct dentry *slab_cache_dir;
6254
6255     if (unlikely(!slab_debugfs_root))
6256         return;
6257
6258     slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
6259
6260     debugfs_create_file("alloc_traces", 0400,
6261         slab_cache_dir, s, &slab_debugfs_fops);
6262
6263     debugfs_create_file("free_traces", 0400,
6264         slab_cache_dir, s, &slab_debugfs_fops);
6265 }
6266
6267 void debugfs_slab_release(struct kmem_cache *s)
6268 {
6269     debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
6270 }
6271
6272 static int __init slab_debugfs_init(void)
6273 {
6274     struct kmem_cache *s;
6275
6276     slab_debugfs_root = debugfs_create_dir("slab", NULL);
6277
6278     list_for_each_entry(s, &slab_caches, list)
6279         if (s->flags & SLAB_STORE_USER)
6280             debugfs_slab_add(s);
6281
6282     return 0;
6283
6284 }
6285 __initcall(slab_debugfs_init);
6286 #endif
6287 /*
6288  * The /proc/slabinfo ABI
6289  */
6290 #ifdef CONFIG_SLUB_DEBUG
6291 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
6292 {
6293     unsigned long nr_slabs = 0;
6294     unsigned long nr_objs = 0;
6295     unsigned long nr_free = 0;
6296     int node;
6297     struct kmem_cache_node *n;
6298
6299     for_each_kmem_cache_node(s, node, n) {
6300         nr_slabs += node_nr_slabs(n);
6301         nr_objs += node_nr_objs(n);
6302         nr_free += count_partial(n, count_free);
6303     }
6304
6305     sinfo->active_objs = nr_objs - nr_free;
6306     sinfo->num_objs = nr_objs;
6307     sinfo->active_slabs = nr_slabs;
6308     sinfo->num_slabs = nr_slabs;
6309     sinfo->objects_per_slab = oo_objects(s->oo);
6310     sinfo->cache_order = oo_order(s->oo);
6311 }
6312
6313 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
6314 {
6315 }
6316
6317 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
6318                size_t count, loff_t *ppos)
6319 {
6320     return -EIO;
6321 }
6322 #endif /* CONFIG_SLUB_DEBUG */