0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * KFENCE guarded object allocator and fault handling.
0004  *
0005  * Copyright (C) 2020, Google LLC.
0006  */
0007 
0008 #define pr_fmt(fmt) "kfence: " fmt
0009 
0010 #include <linux/atomic.h>
0011 #include <linux/bug.h>
0012 #include <linux/debugfs.h>
0013 #include <linux/hash.h>
0014 #include <linux/irq_work.h>
0015 #include <linux/jhash.h>
0016 #include <linux/kcsan-checks.h>
0017 #include <linux/kfence.h>
0018 #include <linux/kmemleak.h>
0019 #include <linux/list.h>
0020 #include <linux/lockdep.h>
0021 #include <linux/log2.h>
0022 #include <linux/memblock.h>
0023 #include <linux/moduleparam.h>
0024 #include <linux/notifier.h>
0025 #include <linux/panic_notifier.h>
0026 #include <linux/random.h>
0027 #include <linux/rcupdate.h>
0028 #include <linux/sched/clock.h>
0029 #include <linux/sched/sysctl.h>
0030 #include <linux/seq_file.h>
0031 #include <linux/slab.h>
0032 #include <linux/spinlock.h>
0033 #include <linux/string.h>
0034 
0035 #include <asm/kfence.h>
0036 
0037 #include "kfence.h"
0038 
0039 /* Disables KFENCE on the first warning assuming an irrecoverable error. */
0040 #define KFENCE_WARN_ON(cond)                                                   \
0041     ({                                                                     \
0042         const bool __cond = WARN_ON(cond);                             \
0043         if (unlikely(__cond)) {                                        \
0044             WRITE_ONCE(kfence_enabled, false);                     \
0045             disabled_by_warn = true;                               \
0046         }                                                              \
0047         __cond;                                                        \
0048     })
0049 
0050 /* === Data ================================================================= */
0051 
0052 static bool kfence_enabled __read_mostly;
0053 static bool disabled_by_warn __read_mostly;
0054 
0055 unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
0056 EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
0057 
0058 #ifdef MODULE_PARAM_PREFIX
0059 #undef MODULE_PARAM_PREFIX
0060 #endif
0061 #define MODULE_PARAM_PREFIX "kfence."
0062 
0063 static int kfence_enable_late(void);
0064 static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
0065 {
0066     unsigned long num;
0067     int ret = kstrtoul(val, 0, &num);
0068 
0069     if (ret < 0)
0070         return ret;
0071 
0072     /* Using 0 to indicate KFENCE is disabled. */
0073     if (!num && READ_ONCE(kfence_enabled)) {
0074         pr_info("disabled\n");
0075         WRITE_ONCE(kfence_enabled, false);
0076     }
0077 
0078     *((unsigned long *)kp->arg) = num;
0079 
0080     if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
0081         return disabled_by_warn ? -EINVAL : kfence_enable_late();
0082     return 0;
0083 }
0084 
0085 static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
0086 {
0087     if (!READ_ONCE(kfence_enabled))
0088         return sprintf(buffer, "0\n");
0089 
0090     return param_get_ulong(buffer, kp);
0091 }
0092 
0093 static const struct kernel_param_ops sample_interval_param_ops = {
0094     .set = param_set_sample_interval,
0095     .get = param_get_sample_interval,
0096 };
0097 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
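/*
 * Usage note (editorial, not part of the original source): with the "kfence."
 * prefix and mode 0600 above, the sample interval can be set at boot via the
 * kernel command line, e.g. kfence.sample_interval=100 (milliseconds), or at
 * runtime by root via /sys/module/kfence/parameters/sample_interval. Writing 0
 * disables KFENCE; writing a non-zero value attempts to re-enable it through
 * kfence_enable_late(), unless KFENCE was already disabled by KFENCE_WARN_ON().
 */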
0098 
0099 /* Pool usage% threshold above which currently covered allocations are skipped. */
0100 static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
0101 module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
0102 
0103 /* If true, use a deferrable timer. */
0104 static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
0105 module_param_named(deferrable, kfence_deferrable, bool, 0444);
0106 
0107 /* If true, check all canary bytes on panic. */
0108 static bool kfence_check_on_panic __read_mostly;
0109 module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);
0110 
0111 /* The pool of pages used for guard pages and objects. */
0112 char *__kfence_pool __read_mostly;
0113 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
0114 
0115 /*
0116  * Per-object metadata, with one-to-one mapping of object metadata to
0117  * backing pages (in __kfence_pool).
0118  */
0119 static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
0120 struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
0121 
0122 /* Freelist with available objects. */
0123 static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
0124 static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
0125 
0126 /*
0127  * The static key that gates KFENCE allocations. If static keys are not used to
0128  * gate allocations, it merely avoids a load and compare when KFENCE is disabled.
0129  */
0130 DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
0131 
0132 /* Gates the allocation, ensuring only one succeeds in a given period. */
0133 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
0134 
0135 /*
0136  * A Counting Bloom filter of allocation coverage: limits currently covered
0137  * allocations of the same source from filling up the pool.
0138  *
0139  * Assuming a range of 15%-85% unique allocations in the pool at any point in
0140  * time, the below parameters provide a probability of 0.02-0.33 for false
0141  * positive hits respectively:
0142  *
0143  *  P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
0144  */
0145 #define ALLOC_COVERED_HNUM  2
0146 #define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
0147 #define ALLOC_COVERED_SIZE  (1 << ALLOC_COVERED_ORDER)
0148 #define ALLOC_COVERED_HNEXT(h)  hash_32(h, ALLOC_COVERED_ORDER)
0149 #define ALLOC_COVERED_MASK  (ALLOC_COVERED_SIZE - 1)
0150 static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
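/*
 * Worked example (editorial sketch, assuming the default
 * CONFIG_KFENCE_NUM_OBJECTS=255): ALLOC_COVERED_ORDER = const_ilog2(255) + 2 = 9,
 * so ALLOC_COVERED_SIZE = 512 and ALLOC_COVERED_HNUM = 2. With 15% unique
 * allocations (~38 distinct stack hashes inserted),
 * P = (1 - e^(-2 * 38/512))^2 ~= 0.02; with 85% (~217 hashes),
 * P = (1 - e^(-2 * 217/512))^2 ~= 0.33 -- matching the 0.02-0.33 range quoted
 * above.
 */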
0151 
0152 /* Stack depth used to determine uniqueness of an allocation. */
0153 #define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
0154 
0155 /*
0156  * Randomness for stack hashes, making the same collisions across reboots and
0157  * different machines less likely.
0158  */
0159 static u32 stack_hash_seed __ro_after_init;
0160 
0161 /* Statistics counters for debugfs. */
0162 enum kfence_counter_id {
0163     KFENCE_COUNTER_ALLOCATED,
0164     KFENCE_COUNTER_ALLOCS,
0165     KFENCE_COUNTER_FREES,
0166     KFENCE_COUNTER_ZOMBIES,
0167     KFENCE_COUNTER_BUGS,
0168     KFENCE_COUNTER_SKIP_INCOMPAT,
0169     KFENCE_COUNTER_SKIP_CAPACITY,
0170     KFENCE_COUNTER_SKIP_COVERED,
0171     KFENCE_COUNTER_COUNT,
0172 };
0173 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
0174 static const char *const counter_names[] = {
0175     [KFENCE_COUNTER_ALLOCATED]  = "currently allocated",
0176     [KFENCE_COUNTER_ALLOCS]     = "total allocations",
0177     [KFENCE_COUNTER_FREES]      = "total frees",
0178     [KFENCE_COUNTER_ZOMBIES]    = "zombie allocations",
0179     [KFENCE_COUNTER_BUGS]       = "total bugs",
0180     [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
0181     [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
0182     [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
0183 };
0184 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
0185 
0186 /* === Internals ============================================================ */
0187 
0188 static inline bool should_skip_covered(void)
0189 {
0190     unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
0191 
0192     return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
0193 }
0194 
0195 static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
0196 {
0197     num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
0198     num_entries = filter_irq_stacks(stack_entries, num_entries);
0199     return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
0200 }
0201 
0202 /*
0203  * Adds (or subtracts) count @val for allocation stack trace hash
0204  * @alloc_stack_hash to/from the Counting Bloom filter.
0205  */
0206 static void alloc_covered_add(u32 alloc_stack_hash, int val)
0207 {
0208     int i;
0209 
0210     for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
0211         atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
0212         alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
0213     }
0214 }
0215 
0216 /*
0217  * Returns true if the allocation stack trace hash @alloc_stack_hash is
0218  * currently contained (non-zero count) in the Counting Bloom filter.
0219  */
0220 static bool alloc_covered_contains(u32 alloc_stack_hash)
0221 {
0222     int i;
0223 
0224     for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
0225         if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
0226             return false;
0227         alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
0228     }
0229 
0230     return true;
0231 }
0232 
0233 static bool kfence_protect(unsigned long addr)
0234 {
0235     return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
0236 }
0237 
0238 static bool kfence_unprotect(unsigned long addr)
0239 {
0240     return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
0241 }
0242 
0243 static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
0244 {
0245     unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
0246     unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
0247 
0248     /* The checks do not affect performance; only called from slow-paths. */
0249 
0250     /* Only call with a pointer into kfence_metadata. */
0251     if (KFENCE_WARN_ON(meta < kfence_metadata ||
0252                meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
0253         return 0;
0254 
0255     /*
0256      * This metadata object only ever maps to 1 page; verify that the stored
0257      * address is in the expected range.
0258      */
0259     if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
0260         return 0;
0261 
0262     return pageaddr;
0263 }
0264 
0265 /*
0266  * Update the object's metadata state, including updating the alloc/free stacks
0267  * depending on the state transition.
0268  */
0269 static noinline void
0270 metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
0271               unsigned long *stack_entries, size_t num_stack_entries)
0272 {
0273     struct kfence_track *track =
0274         next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
0275 
0276     lockdep_assert_held(&meta->lock);
0277 
0278     if (stack_entries) {
0279         memcpy(track->stack_entries, stack_entries,
0280                num_stack_entries * sizeof(stack_entries[0]));
0281     } else {
0282         /*
0283          * Skip over 1 (this) function; noinline ensures we do not
0284          * accidentally skip over the caller by never inlining.
0285          */
0286         num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
0287     }
0288     track->num_stack_entries = num_stack_entries;
0289     track->pid = task_pid_nr(current);
0290     track->cpu = raw_smp_processor_id();
0291     track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
0292 
0293     /*
0294      * Pairs with READ_ONCE() in
0295      *  kfence_shutdown_cache(),
0296      *  kfence_handle_page_fault().
0297      */
0298     WRITE_ONCE(meta->state, next);
0299 }
0300 
0301 /* Write canary byte to @addr. */
0302 static inline bool set_canary_byte(u8 *addr)
0303 {
0304     *addr = KFENCE_CANARY_PATTERN(addr);
0305     return true;
0306 }
0307 
0308 /* Check canary byte at @addr. */
0309 static inline bool check_canary_byte(u8 *addr)
0310 {
0311     struct kfence_metadata *meta;
0312     unsigned long flags;
0313 
0314     if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
0315         return true;
0316 
0317     atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
0318 
0319     meta = addr_to_metadata((unsigned long)addr);
0320     raw_spin_lock_irqsave(&meta->lock, flags);
0321     kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
0322     raw_spin_unlock_irqrestore(&meta->lock, flags);
0323 
0324     return false;
0325 }
0326 
0327 /* __always_inline this to ensure we won't do an indirect call to fn. */
0328 static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
0329 {
0330     const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
0331     unsigned long addr;
0332 
0333     /*
0334      * We'll iterate over each canary byte per-side until fn() returns
0335      * false. However, we'll still iterate over the canary bytes to the
0336      * right of the object even if there was an error in the canary bytes to
0337      * the left of the object. Specifically, if check_canary_byte()
0338      * generates an error, showing both sides might give more clues as to
0339      * what the error is about when displaying which bytes were corrupted.
0340      */
0341 
0342     /* Apply to left of object. */
0343     for (addr = pageaddr; addr < meta->addr; addr++) {
0344         if (!fn((u8 *)addr))
0345             break;
0346     }
0347 
0348     /* Apply to right of object. */
0349     for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
0350         if (!fn((u8 *)addr))
0351             break;
0352     }
0353 }
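/*
 * In-page layout covered by for_each_canary() (editorial note): within the
 * object's data page, every byte outside [meta->addr, meta->addr + meta->size)
 * holds a canary byte:
 *
 *   [ canary ... ][ object of meta->size bytes ][ ... canary ]
 *   ^ page start                                   page end ^
 *
 * Canary bytes to the left of the object only exist when the object was placed
 * on the "right" side of its page (see kfence_guarded_alloc() below).
 */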
0354 
0355 static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
0356                   unsigned long *stack_entries, size_t num_stack_entries,
0357                   u32 alloc_stack_hash)
0358 {
0359     struct kfence_metadata *meta = NULL;
0360     unsigned long flags;
0361     struct slab *slab;
0362     void *addr;
0363     const bool random_right_allocate = prandom_u32_max(2);
0364     const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
0365                   !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS);
0366 
0367     /* Try to obtain a free object. */
0368     raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
0369     if (!list_empty(&kfence_freelist)) {
0370         meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
0371         list_del_init(&meta->list);
0372     }
0373     raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
0374     if (!meta) {
0375         atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
0376         return NULL;
0377     }
0378 
0379     if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
0380         /*
0381          * This is extremely unlikely -- we are reporting on a
0382          * use-after-free, which locked meta->lock, and the reporting
0383          * code via printk calls kmalloc() which ends up in
0384          * kfence_alloc() and tries to grab the same object that we're
0385          * reporting on. While it has never been observed, lockdep does
0386          * report that there is a possibility of deadlock. Fix it by
0387          * using trylock and bailing out gracefully.
0388          */
0389         raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
0390         /* Put the object back on the freelist. */
0391         list_add_tail(&meta->list, &kfence_freelist);
0392         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
0393 
0394         return NULL;
0395     }
0396 
0397     meta->addr = metadata_to_pageaddr(meta);
0398     /* Unprotect if we're reusing this page. */
0399     if (meta->state == KFENCE_OBJECT_FREED)
0400         kfence_unprotect(meta->addr);
0401 
0402     /*
0403      * Note: for allocations made before RNG initialization, prandom_u32_max()
0404      * will always return zero. We still benefit from enabling KFENCE as early as
0405      * possible, even when the RNG is not yet available, as this will allow
0406      * KFENCE to detect bugs due to earlier allocations. The only downside
0407      * is that the out-of-bounds accesses detected are deterministic for
0408      * such allocations.
0409      */
0410     if (random_right_allocate) {
0411         /* Allocate on the "right" side, re-calculate address. */
0412         meta->addr += PAGE_SIZE - size;
0413         meta->addr = ALIGN_DOWN(meta->addr, cache->align);
0414     }
0415 
0416     addr = (void *)meta->addr;
0417 
0418     /* Update remaining metadata. */
0419     metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
0420     /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
0421     WRITE_ONCE(meta->cache, cache);
0422     meta->size = size;
0423     meta->alloc_stack_hash = alloc_stack_hash;
0424     raw_spin_unlock_irqrestore(&meta->lock, flags);
0425 
0426     alloc_covered_add(alloc_stack_hash, 1);
0427 
0428     /* Set required slab fields. */
0429     slab = virt_to_slab((void *)meta->addr);
0430     slab->slab_cache = cache;
0431 #if defined(CONFIG_SLUB)
0432     slab->objects = 1;
0433 #elif defined(CONFIG_SLAB)
0434     slab->s_mem = addr;
0435 #endif
0436 
0437     /* Memory initialization. */
0438     for_each_canary(meta, set_canary_byte);
0439 
0440     /*
0441      * We check slab_want_init_on_alloc() ourselves, rather than letting
0442      * SL*B do the initialization, as otherwise we might overwrite KFENCE's
0443      * redzone.
0444      */
0445     if (unlikely(slab_want_init_on_alloc(gfp, cache)))
0446         memzero_explicit(addr, size);
0447     if (cache->ctor)
0448         cache->ctor(addr);
0449 
0450     if (random_fault)
0451         kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
0452 
0453     atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
0454     atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
0455 
0456     return addr;
0457 }
0458 
0459 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
0460 {
0461     struct kcsan_scoped_access assert_page_exclusive;
0462     unsigned long flags;
0463     bool init;
0464 
0465     raw_spin_lock_irqsave(&meta->lock, flags);
0466 
0467     if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
0468         /* Invalid or double-free, bail out. */
0469         atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
0470         kfence_report_error((unsigned long)addr, false, NULL, meta,
0471                     KFENCE_ERROR_INVALID_FREE);
0472         raw_spin_unlock_irqrestore(&meta->lock, flags);
0473         return;
0474     }
0475 
0476     /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
0477     kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
0478                   KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
0479                   &assert_page_exclusive);
0480 
0481     if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
0482         kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
0483 
0484     /* Restore page protection if there was an OOB access. */
0485     if (meta->unprotected_page) {
0486         memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
0487         kfence_protect(meta->unprotected_page);
0488         meta->unprotected_page = 0;
0489     }
0490 
0491     /* Mark the object as freed. */
0492     metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
0493     init = slab_want_init_on_free(meta->cache);
0494     raw_spin_unlock_irqrestore(&meta->lock, flags);
0495 
0496     alloc_covered_add(meta->alloc_stack_hash, -1);
0497 
0498     /* Check canary bytes for memory corruption. */
0499     for_each_canary(meta, check_canary_byte);
0500 
0501     /*
0502      * Clear memory if init-on-free is set. While we protect the page, the
0503      * data is still there, and after a use-after-free is detected, we
0504      * unprotect the page, so the data is still accessible.
0505      */
0506     if (!zombie && unlikely(init))
0507         memzero_explicit(addr, meta->size);
0508 
0509     /* Protect to detect use-after-frees. */
0510     kfence_protect((unsigned long)addr);
0511 
0512     kcsan_end_scoped_access(&assert_page_exclusive);
0513     if (!zombie) {
0514         /* Add it to the tail of the freelist for reuse. */
0515         raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
0516         KFENCE_WARN_ON(!list_empty(&meta->list));
0517         list_add_tail(&meta->list, &kfence_freelist);
0518         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
0519 
0520         atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
0521         atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
0522     } else {
0523         /* See kfence_shutdown_cache(). */
0524         atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
0525     }
0526 }
0527 
0528 static void rcu_guarded_free(struct rcu_head *h)
0529 {
0530     struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
0531 
0532     kfence_guarded_free((void *)meta->addr, meta, false);
0533 }
0534 
0535 /*
0536  * Initialization of the KFENCE pool after its allocation.
0537  * Returns 0 on success; otherwise returns the address up to
0538  * which partial initialization succeeded.
0539  */
0540 static unsigned long kfence_init_pool(void)
0541 {
0542     unsigned long addr = (unsigned long)__kfence_pool;
0543     struct page *pages;
0544     int i;
0545 
0546     if (!arch_kfence_init_pool())
0547         return addr;
0548 
0549     pages = virt_to_page(__kfence_pool);
0550 
0551     /*
0552      * Set up object pages: they must have PG_slab set, to avoid freeing
0553      * these as real pages.
0554      *
0555      * We also want to avoid inserting kfence_free() in the kfree()
0556      * fast-path in SLUB, and therefore need to ensure kfree() correctly
0557      * enters __slab_free() slow-path.
0558      */
0559     for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
0560         struct slab *slab = page_slab(&pages[i]);
0561 
0562         if (!i || (i % 2))
0563             continue;
0564 
0565         /* Verify we do not have a compound head page. */
0566         if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
0567             return addr;
0568 
0569         __folio_set_slab(slab_folio(slab));
0570 #ifdef CONFIG_MEMCG
0571         slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
0572                    MEMCG_DATA_OBJCGS;
0573 #endif
0574     }
0575 
0576     /*
0577      * Protect the first 2 pages. The first page is mostly unnecessary, and
0578      * merely serves as an extended guard page. However, adding one
0579      * additional page in the beginning gives us an even number of pages,
0580      * which simplifies the mapping of address to metadata index.
0581      */
0582     for (i = 0; i < 2; i++) {
0583         if (unlikely(!kfence_protect(addr)))
0584             return addr;
0585 
0586         addr += PAGE_SIZE;
0587     }
0588 
0589     for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
0590         struct kfence_metadata *meta = &kfence_metadata[i];
0591 
0592         /* Initialize metadata. */
0593         INIT_LIST_HEAD(&meta->list);
0594         raw_spin_lock_init(&meta->lock);
0595         meta->state = KFENCE_OBJECT_UNUSED;
0596         meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
0597         list_add_tail(&meta->list, &kfence_freelist);
0598 
0599         /* Protect the right redzone. */
0600         if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
0601             return addr;
0602 
0603         addr += 2 * PAGE_SIZE;
0604     }
0605 
0606     return 0;
0607 }
0608 
0609 static bool __init kfence_init_pool_early(void)
0610 {
0611     unsigned long addr;
0612 
0613     if (!__kfence_pool)
0614         return false;
0615 
0616     addr = kfence_init_pool();
0617 
0618     if (!addr) {
0619         /*
0620          * The pool is live and will never be deallocated from this point on.
0621          * Ignore the pool object from the kmemleak phys object tree, as it would
0622          * otherwise overlap with allocations returned by kfence_alloc(), which
0623          * are registered with kmemleak through the slab post-alloc hook.
0624          */
0625         kmemleak_ignore_phys(__pa(__kfence_pool));
0626         return true;
0627     }
0628 
0629     /*
0630      * Only release unprotected pages, and do not try to go back and change
0631      * page attributes due to risk of failing to do so as well. If changing
0632      * page attributes for some pages fails, it is very likely that it also
0633      * fails for the first page, and therefore expect addr==__kfence_pool in
0634      * most failure cases.
0635      */
0636     for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
0637         struct slab *slab = virt_to_slab(p);
0638 
0639         if (!slab)
0640             continue;
0641 #ifdef CONFIG_MEMCG
0642         slab->memcg_data = 0;
0643 #endif
0644         __folio_clear_slab(slab_folio(slab));
0645     }
0646     memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
0647     __kfence_pool = NULL;
0648     return false;
0649 }
0650 
0651 static bool kfence_init_pool_late(void)
0652 {
0653     unsigned long addr, free_size;
0654 
0655     addr = kfence_init_pool();
0656 
0657     if (!addr)
0658         return true;
0659 
0660     /* Same as above. */
0661     free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
0662 #ifdef CONFIG_CONTIG_ALLOC
0663     free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
0664 #else
0665     free_pages_exact((void *)addr, free_size);
0666 #endif
0667     __kfence_pool = NULL;
0668     return false;
0669 }
0670 
0671 /* === DebugFS Interface ==================================================== */
0672 
0673 static int stats_show(struct seq_file *seq, void *v)
0674 {
0675     int i;
0676 
0677     seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
0678     for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
0679         seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
0680 
0681     return 0;
0682 }
0683 DEFINE_SHOW_ATTRIBUTE(stats);
0684 
0685 /*
0686  * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
0687  * start_object() and next_object() return the object index + 1, because NULL is used
0688  * to stop iteration.
0689  */
0690 static void *start_object(struct seq_file *seq, loff_t *pos)
0691 {
0692     if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
0693         return (void *)((long)*pos + 1);
0694     return NULL;
0695 }
0696 
0697 static void stop_object(struct seq_file *seq, void *v)
0698 {
0699 }
0700 
0701 static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
0702 {
0703     ++*pos;
0704     if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
0705         return (void *)((long)*pos + 1);
0706     return NULL;
0707 }
0708 
0709 static int show_object(struct seq_file *seq, void *v)
0710 {
0711     struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
0712     unsigned long flags;
0713 
0714     raw_spin_lock_irqsave(&meta->lock, flags);
0715     kfence_print_object(seq, meta);
0716     raw_spin_unlock_irqrestore(&meta->lock, flags);
0717     seq_puts(seq, "---------------------------------\n");
0718 
0719     return 0;
0720 }
0721 
0722 static const struct seq_operations object_seqops = {
0723     .start = start_object,
0724     .next = next_object,
0725     .stop = stop_object,
0726     .show = show_object,
0727 };
0728 
0729 static int open_objects(struct inode *inode, struct file *file)
0730 {
0731     return seq_open(file, &object_seqops);
0732 }
0733 
0734 static const struct file_operations objects_fops = {
0735     .open = open_objects,
0736     .read = seq_read,
0737     .llseek = seq_lseek,
0738     .release = seq_release,
0739 };
0740 
0741 static int __init kfence_debugfs_init(void)
0742 {
0743     struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
0744 
0745     debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
0746     debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
0747     return 0;
0748 }
0749 
0750 late_initcall(kfence_debugfs_init);
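/*
 * Usage note (editorial): with the directory created above, the counters can be
 * read from /sys/kernel/debug/kfence/stats, and the metadata of all objects
 * (including allocation/free stack traces) from
 * /sys/kernel/debug/kfence/objects, e.g. "cat /sys/kernel/debug/kfence/stats".
 */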
0751 
0752 /* === Panic Notifier ====================================================== */
0753 
0754 static void kfence_check_all_canary(void)
0755 {
0756     int i;
0757 
0758     for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
0759         struct kfence_metadata *meta = &kfence_metadata[i];
0760 
0761         if (meta->state == KFENCE_OBJECT_ALLOCATED)
0762             for_each_canary(meta, check_canary_byte);
0763     }
0764 }
0765 
0766 static int kfence_check_canary_callback(struct notifier_block *nb,
0767                     unsigned long reason, void *arg)
0768 {
0769     kfence_check_all_canary();
0770     return NOTIFY_OK;
0771 }
0772 
0773 static struct notifier_block kfence_check_canary_notifier = {
0774     .notifier_call = kfence_check_canary_callback,
0775 };
0776 
0777 /* === Allocation Gate Timer ================================================ */
0778 
0779 static struct delayed_work kfence_timer;
0780 
0781 #ifdef CONFIG_KFENCE_STATIC_KEYS
0782 /* Wait queue to wake up allocation-gate timer task. */
0783 static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
0784 
0785 static void wake_up_kfence_timer(struct irq_work *work)
0786 {
0787     wake_up(&allocation_wait);
0788 }
0789 static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
0790 #endif
0791 
0792 /*
0793  * Set up delayed work, which will enable and disable the static key. We need to
0794  * use a work queue (rather than a simple timer), since enabling and disabling a
0795  * static key cannot be done from an interrupt.
0796  *
0797  * Note: Toggling a static branch currently causes IPIs, and here we'll end up
0798  * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
0799  * more aggressive sampling intervals), we could get away with a variant that
0800  * avoids IPIs, at the cost of not immediately capturing allocations if the
0801  * instructions remain cached.
0802  */
0803 static void toggle_allocation_gate(struct work_struct *work)
0804 {
0805     if (!READ_ONCE(kfence_enabled))
0806         return;
0807 
0808     atomic_set(&kfence_allocation_gate, 0);
0809 #ifdef CONFIG_KFENCE_STATIC_KEYS
0810     /* Enable static key, and await allocation to happen. */
0811     static_branch_enable(&kfence_allocation_key);
0812 
0813     if (sysctl_hung_task_timeout_secs) {
0814         /*
0815          * During low activity with no allocations we might wait a
0816          * while; let's avoid the hung task warning.
0817          */
0818         wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
0819                     sysctl_hung_task_timeout_secs * HZ / 2);
0820     } else {
0821         wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
0822     }
0823 
0824     /* Disable static key and reset timer. */
0825     static_branch_disable(&kfence_allocation_key);
0826 #endif
0827     queue_delayed_work(system_unbound_wq, &kfence_timer,
0828                msecs_to_jiffies(kfence_sample_interval));
0829 }
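/*
 * One sampling round with CONFIG_KFENCE_STATIC_KEYS (editorial summary of the
 * code above and __kfence_alloc() below): the delayed work resets
 * kfence_allocation_gate to 0 and enables kfence_allocation_key; the
 * kfence_alloc() that wins atomic_inc_return() on the gate performs a guarded
 * allocation and wakes this worker via irq_work; the worker then disables the
 * static key and re-arms itself after kfence_sample_interval milliseconds.
 */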
0830 
0831 /* === Public interface ===================================================== */
0832 
0833 void __init kfence_alloc_pool(void)
0834 {
0835     if (!kfence_sample_interval)
0836         return;
0837 
0838     __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
0839 
0840     if (!__kfence_pool)
0841         pr_err("failed to allocate pool\n");
0842 }
0843 
0844 static void kfence_init_enable(void)
0845 {
0846     if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
0847         static_branch_enable(&kfence_allocation_key);
0848 
0849     if (kfence_deferrable)
0850         INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
0851     else
0852         INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
0853 
0854     if (kfence_check_on_panic)
0855         atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);
0856 
0857     WRITE_ONCE(kfence_enabled, true);
0858     queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
0859 
0860     pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
0861         CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
0862         (void *)(__kfence_pool + KFENCE_POOL_SIZE));
0863 }
0864 
0865 void __init kfence_init(void)
0866 {
0867     stack_hash_seed = (u32)random_get_entropy();
0868 
0869     /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
0870     if (!kfence_sample_interval)
0871         return;
0872 
0873     if (!kfence_init_pool_early()) {
0874         pr_err("%s failed\n", __func__);
0875         return;
0876     }
0877 
0878     kfence_init_enable();
0879 }
0880 
0881 static int kfence_init_late(void)
0882 {
0883     const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
0884 #ifdef CONFIG_CONTIG_ALLOC
0885     struct page *pages;
0886 
0887     pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
0888     if (!pages)
0889         return -ENOMEM;
0890     __kfence_pool = page_to_virt(pages);
0891 #else
0892     if (nr_pages > MAX_ORDER_NR_PAGES) {
0893         pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
0894         return -EINVAL;
0895     }
0896     __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
0897     if (!__kfence_pool)
0898         return -ENOMEM;
0899 #endif
0900 
0901     if (!kfence_init_pool_late()) {
0902         pr_err("%s failed\n", __func__);
0903         return -EBUSY;
0904     }
0905 
0906     kfence_init_enable();
0907     return 0;
0908 }
0909 
0910 static int kfence_enable_late(void)
0911 {
0912     if (!__kfence_pool)
0913         return kfence_init_late();
0914 
0915     WRITE_ONCE(kfence_enabled, true);
0916     queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
0917     pr_info("re-enabled\n");
0918     return 0;
0919 }
0920 
0921 void kfence_shutdown_cache(struct kmem_cache *s)
0922 {
0923     unsigned long flags;
0924     struct kfence_metadata *meta;
0925     int i;
0926 
0927     for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
0928         bool in_use;
0929 
0930         meta = &kfence_metadata[i];
0931 
0932         /*
0933          * If we observe some inconsistent cache and state pair where we
0934          * should have returned false here, cache destruction is racing
0935          * with either kmem_cache_alloc() or kmem_cache_free(). Taking
0936          * the lock will not help, as different critical section
0937          * serialization will have the same outcome.
0938          */
0939         if (READ_ONCE(meta->cache) != s ||
0940             READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
0941             continue;
0942 
0943         raw_spin_lock_irqsave(&meta->lock, flags);
0944         in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
0945         raw_spin_unlock_irqrestore(&meta->lock, flags);
0946 
0947         if (in_use) {
0948             /*
0949              * This cache still has allocations, and we should not
0950              * release them back into the freelist so they can still
0951              * safely be used and retain the kernel's default
0952              * behaviour of keeping the allocations alive (leak the
0953              * cache); however, they effectively become "zombie
0954              * allocations" as the KFENCE objects are the only ones
0955              * still in use and the owning cache is being destroyed.
0956              *
0957              * We mark them freed, so that any subsequent use shows
0958              * more useful error messages that will include stack
0959              * traces of the user of the object, the original
0960              * allocation, and caller to shutdown_cache().
0961              */
0962             kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
0963         }
0964     }
0965 
0966     for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
0967         meta = &kfence_metadata[i];
0968 
0969         /* See above. */
0970         if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
0971             continue;
0972 
0973         raw_spin_lock_irqsave(&meta->lock, flags);
0974         if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
0975             meta->cache = NULL;
0976         raw_spin_unlock_irqrestore(&meta->lock, flags);
0977     }
0978 }
0979 
0980 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
0981 {
0982     unsigned long stack_entries[KFENCE_STACK_DEPTH];
0983     size_t num_stack_entries;
0984     u32 alloc_stack_hash;
0985 
0986     /*
0987      * Perform size check before switching kfence_allocation_gate, so that
0988      * we don't disable KFENCE without making an allocation.
0989      */
0990     if (size > PAGE_SIZE) {
0991         atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
0992         return NULL;
0993     }
0994 
0995     /*
0996      * Skip allocations from non-default zones, including DMA. We cannot
0997      * guarantee that pages in the KFENCE pool will have the requested
0998      * properties (e.g. reside in DMAable memory).
0999      */
1000     if ((flags & GFP_ZONEMASK) ||
1001         (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
1002         atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
1003         return NULL;
1004     }
1005 
1006     if (atomic_inc_return(&kfence_allocation_gate) > 1)
1007         return NULL;
1008 #ifdef CONFIG_KFENCE_STATIC_KEYS
1009     /*
1010      * waitqueue_active() is fully ordered after the update of
1011      * kfence_allocation_gate per atomic_inc_return().
1012      */
1013     if (waitqueue_active(&allocation_wait)) {
1014         /*
1015          * Calling wake_up() here may deadlock when allocations happen
1016          * from within timer code. Use an irq_work to defer it.
1017          */
1018         irq_work_queue(&wake_up_kfence_timer_work);
1019     }
1020 #endif
1021 
1022     if (!READ_ONCE(kfence_enabled))
1023         return NULL;
1024 
1025     num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
1026 
1027     /*
1028      * Do expensive check for coverage of allocation in slow-path after
1029      * allocation_gate has already become non-zero, even though it might
1030      * mean not making any allocation within a given sample interval.
1031      *
1032      * This ensures reasonable allocation coverage when the pool is almost
1033      * full, including avoiding long-lived allocations of the same source
1034      * filling up the pool (e.g. pagecache allocations).
1035      */
1036     alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
1037     if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
1038         atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
1039         return NULL;
1040     }
1041 
1042     return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
1043                     alloc_stack_hash);
1044 }
1045 
1046 size_t kfence_ksize(const void *addr)
1047 {
1048     const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
1049 
1050     /*
1051      * Read locklessly -- if there is a race with __kfence_alloc(), this is
1052      * either a use-after-free or invalid access.
1053      */
1054     return meta ? meta->size : 0;
1055 }
1056 
1057 void *kfence_object_start(const void *addr)
1058 {
1059     const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
1060 
1061     /*
1062      * Read locklessly -- if there is a race with __kfence_alloc(), this is
1063      * either a use-after-free or invalid access.
1064      */
1065     return meta ? (void *)meta->addr : NULL;
1066 }
1067 
1068 void __kfence_free(void *addr)
1069 {
1070     struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
1071 
1072 #ifdef CONFIG_MEMCG
1073     KFENCE_WARN_ON(meta->objcg);
1074 #endif
1075     /*
1076      * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
1077      * the object, as the object page may be recycled for other-typed
1078      * objects once it has been freed. meta->cache may be NULL if the cache
1079      * was destroyed.
1080      */
1081     if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
1082         call_rcu(&meta->rcu_head, rcu_guarded_free);
1083     else
1084         kfence_guarded_free(addr, meta, false);
1085 }
1086 
1087 bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
1088 {
1089     const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
1090     struct kfence_metadata *to_report = NULL;
1091     enum kfence_error_type error_type;
1092     unsigned long flags;
1093 
1094     if (!is_kfence_address((void *)addr))
1095         return false;
1096 
1097     if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
1098         return kfence_unprotect(addr); /* ... unprotect and proceed. */
1099 
1100     atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
1101 
1102     if (page_index % 2) {
1103         /* This is a redzone, report a buffer overflow. */
1104         struct kfence_metadata *meta;
1105         int distance = 0;
1106 
1107         meta = addr_to_metadata(addr - PAGE_SIZE);
1108         if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
1109             to_report = meta;
1110             /* Data race ok; distance calculation approximate. */
1111             distance = addr - data_race(meta->addr + meta->size);
1112         }
1113 
1114         meta = addr_to_metadata(addr + PAGE_SIZE);
1115         if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
1116             /* Data race ok; distance calculation approximate. */
1117             if (!to_report || distance > data_race(meta->addr) - addr)
1118                 to_report = meta;
1119         }
1120 
1121         if (!to_report)
1122             goto out;
1123 
1124         raw_spin_lock_irqsave(&to_report->lock, flags);
1125         to_report->unprotected_page = addr;
1126         error_type = KFENCE_ERROR_OOB;
1127 
1128         /*
1129          * If the object was freed before we took the look we can still
1130          * report this as an OOB -- the report will simply show the
1131          * stacktrace of the free as well.
1132          */
1133     } else {
1134         to_report = addr_to_metadata(addr);
1135         if (!to_report)
1136             goto out;
1137 
1138         raw_spin_lock_irqsave(&to_report->lock, flags);
1139         error_type = KFENCE_ERROR_UAF;
1140         /*
1141          * We may race with __kfence_alloc(), and it is possible that a
1142          * freed object may be reallocated. We simply report this as a
1143          * use-after-free, with the stack trace showing the place where
1144          * the object was re-allocated.
1145          */
1146     }
1147 
1148 out:
1149     if (to_report) {
1150         kfence_report_error(addr, is_write, regs, to_report, error_type);
1151         raw_spin_unlock_irqrestore(&to_report->lock, flags);
1152     } else {
1153         /* This may be a UAF or OOB access, but we can't be sure. */
1154         kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
1155     }
1156 
1157     return kfence_unprotect(addr); /* Unprotect and let access proceed. */
1158 }