0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * z3fold.c
0004  *
0005  * Author: Vitaly Wool <vitaly.wool@konsulko.com>
0006  * Copyright (C) 2016, Sony Mobile Communications Inc.
0007  *
0008  * This implementation is based on zbud written by Seth Jennings.
0009  *
0010  * z3fold is a special purpose allocator for storing compressed pages. It
0011  * can store up to three compressed pages per page, which improves the
0012  * compression ratio of zbud while retaining its main concepts (e.g. always
0013  * storing an integral number of objects per page) and simplicity.
0014  * It still has simple and deterministic reclaim properties that make it
0015  * preferable to a higher density approach (with no requirement on an
0016  * integral number of objects per page) when reclaim is used.
0017  *
0018  * As in zbud, pages are divided into "chunks".  The size of the chunks is
0019  * fixed at compile time and is determined by NCHUNKS_ORDER below.
0020  *
0021  * z3fold doesn't export any API and is meant to be used via zpool API.
0022  */
0023 
0024 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0025 
0026 #include <linux/atomic.h>
0027 #include <linux/sched.h>
0028 #include <linux/cpumask.h>
0029 #include <linux/list.h>
0030 #include <linux/mm.h>
0031 #include <linux/module.h>
0032 #include <linux/page-flags.h>
0033 #include <linux/migrate.h>
0034 #include <linux/node.h>
0035 #include <linux/compaction.h>
0036 #include <linux/percpu.h>
0037 #include <linux/preempt.h>
0038 #include <linux/workqueue.h>
0039 #include <linux/slab.h>
0040 #include <linux/spinlock.h>
0041 #include <linux/zpool.h>
0042 #include <linux/kmemleak.h>
0043 
0044 /*
0045  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
0046  * adjusting internal fragmentation.  It also determines the number of
0047  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
0048  * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
0049  * in the beginning of an allocated page are occupied by the z3fold header, so
0050  * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
0051  * which is the maximum number of free chunks in a z3fold page; accordingly,
0052  * there will be 63 (or 62, respectively) freelists per pool.
0053  */
0054 #define NCHUNKS_ORDER   6
0055 
0056 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
0057 #define CHUNK_SIZE  (1 << CHUNK_SHIFT)
0058 #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
0059 #define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
0060 #define TOTAL_CHUNKS    (PAGE_SIZE >> CHUNK_SHIFT)
0061 #define NCHUNKS     (TOTAL_CHUNKS - ZHDR_CHUNKS)
0062 
0063 #define BUDDY_MASK  (0x3)
0064 #define BUDDY_SHIFT 2
0065 #define SLOTS_ALIGN (0x40)
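
As a quick check of the arithmetic described above, here is a minimal stand-alone userspace sketch, assuming a 4 KiB page and a 64-byte header (the real value is sizeof(struct z3fold_header) and depends on the kernel configuration); everything prefixed EX_ is invented for illustration and is not part of the kernel source.

#include <stdio.h>

#define EX_PAGE_SHIFT 12                                   /* assumed: 4 KiB pages */
#define EX_PAGE_SIZE (1UL << EX_PAGE_SHIFT)
#define EX_NCHUNKS_ORDER 6
#define EX_CHUNK_SHIFT (EX_PAGE_SHIFT - EX_NCHUNKS_ORDER)
#define EX_CHUNK_SIZE (1UL << EX_CHUNK_SHIFT)
#define EX_ZHDR_SIZE 64UL                                  /* assumed header size */
#define EX_ZHDR_SIZE_ALIGNED \
	(((EX_ZHDR_SIZE + EX_CHUNK_SIZE - 1) / EX_CHUNK_SIZE) * EX_CHUNK_SIZE)
#define EX_ZHDR_CHUNKS (EX_ZHDR_SIZE_ALIGNED >> EX_CHUNK_SHIFT)
#define EX_TOTAL_CHUNKS (EX_PAGE_SIZE >> EX_CHUNK_SHIFT)
#define EX_NCHUNKS (EX_TOTAL_CHUNKS - EX_ZHDR_CHUNKS)

int main(void)
{
	printf("CHUNK_SIZE   = %lu bytes\n", EX_CHUNK_SIZE);   /* 64 */
	printf("TOTAL_CHUNKS = %lu\n", EX_TOTAL_CHUNKS);       /* 64 */
	printf("ZHDR_CHUNKS  = %lu\n", EX_ZHDR_CHUNKS);        /* 1  */
	printf("NCHUNKS      = %lu\n", EX_NCHUNKS);            /* 63 */
	return 0;
}

With CONFIG_DEBUG_SPINLOCK=y the header grows beyond one chunk, which is where the 62 in the comment above comes from.
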
0066 
0067 /*****************
0068  * Structures
0069 *****************/
0070 struct z3fold_pool;
0071 struct z3fold_ops {
0072     int (*evict)(struct z3fold_pool *pool, unsigned long handle);
0073 };
0074 
0075 enum buddy {
0076     HEADLESS = 0,
0077     FIRST,
0078     MIDDLE,
0079     LAST,
0080     BUDDIES_MAX = LAST
0081 };
0082 
0083 struct z3fold_buddy_slots {
0084     /*
0085      * we are using BUDDY_MASK in handle_to_buddy etc. so there should
0086      * be enough slots to hold all possible variants
0087      */
0088     unsigned long slot[BUDDY_MASK + 1];
0089     unsigned long pool; /* back link */
0090     rwlock_t lock;
0091 };
0092 #define HANDLE_FLAG_MASK    (0x03)
0093 
0094 /*
0095  * struct z3fold_header - z3fold page metadata occupying first chunks of each
0096  *          z3fold page, except for HEADLESS pages
0097  * @buddy:      links the z3fold page into the relevant list in the
0098  *          pool
0099  * @page_lock:      per-page lock
0100  * @refcount:       reference count for the z3fold page
0101  * @work:       work_struct for page layout optimization
0102  * @slots:      pointer to the structure holding buddy slots
0103  * @pool:       pointer to the containing pool
0104  * @cpu:        CPU which this page "belongs" to
0105  * @first_chunks:   the size of the first buddy in chunks, 0 if free
0106  * @middle_chunks:  the size of the middle buddy in chunks, 0 if free
0107  * @last_chunks:    the size of the last buddy in chunks, 0 if free
0108  * @first_num:      the starting number (for the first handle)
0109  * @mapped_count:   the number of objects currently mapped
0110  */
0111 struct z3fold_header {
0112     struct list_head buddy;
0113     spinlock_t page_lock;
0114     struct kref refcount;
0115     struct work_struct work;
0116     struct z3fold_buddy_slots *slots;
0117     struct z3fold_pool *pool;
0118     short cpu;
0119     unsigned short first_chunks;
0120     unsigned short middle_chunks;
0121     unsigned short last_chunks;
0122     unsigned short start_middle;
0123     unsigned short first_num:2;
0124     unsigned short mapped_count:2;
0125     unsigned short foreign_handles:2;
0126 };
0127 
0128 /**
0129  * struct z3fold_pool - stores metadata for each z3fold pool
0130  * @name:   pool name
0131  * @lock:   protects pool unbuddied/lru lists
0132  * @stale_lock: protects pool stale page list
0133  * @unbuddied:  per-cpu array of lists tracking z3fold pages that contain at
0134  *      most two buddies (i.e. still have free space); the list each
0135  *      z3fold page is added to depends on the size of its free region.
0136  * @lru:    list tracking the z3fold pages in LRU order by most recently
0137  *      added buddy.
0138  * @stale:  list of pages marked for freeing
0139  * @pages_nr:   number of z3fold pages in the pool.
0140  * @c_handle:   cache for z3fold_buddy_slots allocation
0141  * @ops:    pointer to a structure of user defined operations specified at
0142  *      pool creation time.
0143  * @zpool:  zpool driver
0144  * @zpool_ops:  zpool operations structure with an evict callback
0145  * @compact_wq: workqueue for page layout background optimization
0146  * @release_wq: workqueue for safe page release
0147  * @work:   work_struct for safe page release
0148  *
0149  * This structure is allocated at pool creation time and maintains metadata
0150  * pertaining to a particular z3fold pool.
0151  */
0152 struct z3fold_pool {
0153     const char *name;
0154     spinlock_t lock;
0155     spinlock_t stale_lock;
0156     struct list_head *unbuddied;
0157     struct list_head lru;
0158     struct list_head stale;
0159     atomic64_t pages_nr;
0160     struct kmem_cache *c_handle;
0161     const struct z3fold_ops *ops;
0162     struct zpool *zpool;
0163     const struct zpool_ops *zpool_ops;
0164     struct workqueue_struct *compact_wq;
0165     struct workqueue_struct *release_wq;
0166     struct work_struct work;
0167 };
0168 
0169 /*
0170  * Internal z3fold page flags
0171  */
0172 enum z3fold_page_flags {
0173     PAGE_HEADLESS = 0,
0174     MIDDLE_CHUNK_MAPPED,
0175     NEEDS_COMPACTING,
0176     PAGE_STALE,
0177     PAGE_CLAIMED, /* by either reclaim or free */
0178     PAGE_MIGRATED, /* page is migrated and soon to be released */
0179 };
0180 
0181 /*
0182  * handle flags, go under HANDLE_FLAG_MASK
0183  */
0184 enum z3fold_handle_flags {
0185     HANDLES_NOFREE = 0,
0186 };
0187 
0188 /*
0189  * Forward declarations
0190  */
0191 static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
0192 static void compact_page_work(struct work_struct *w);
0193 
0194 /*****************
0195  * Helpers
0196 *****************/
0197 
0198 /* Converts an allocation size in bytes to size in z3fold chunks */
0199 static int size_to_chunks(size_t size)
0200 {
0201     return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
0202 }
0203 
0204 #define for_each_unbuddied_list(_iter, _begin) \
0205     for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
0206 
0207 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
0208                             gfp_t gfp)
0209 {
0210     struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
0211                                  gfp);
0212 
0213     if (slots) {
0214         /* It will be freed separately in free_handle(). */
0215         kmemleak_not_leak(slots);
0216         slots->pool = (unsigned long)pool;
0217         rwlock_init(&slots->lock);
0218     }
0219 
0220     return slots;
0221 }
0222 
0223 static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
0224 {
0225     return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
0226 }
0227 
0228 static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
0229 {
0230     return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
0231 }
0232 
0233 /* Lock a z3fold page */
0234 static inline void z3fold_page_lock(struct z3fold_header *zhdr)
0235 {
0236     spin_lock(&zhdr->page_lock);
0237 }
0238 
0239 /* Try to lock a z3fold page */
0240 static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
0241 {
0242     return spin_trylock(&zhdr->page_lock);
0243 }
0244 
0245 /* Unlock a z3fold page */
0246 static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
0247 {
0248     spin_unlock(&zhdr->page_lock);
0249 }
0250 
0251 /* return locked z3fold page if it's not headless */
0252 static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
0253 {
0254     struct z3fold_buddy_slots *slots;
0255     struct z3fold_header *zhdr;
0256     int locked = 0;
0257 
0258     if (!(handle & (1 << PAGE_HEADLESS))) {
0259         slots = handle_to_slots(handle);
0260         do {
0261             unsigned long addr;
0262 
0263             read_lock(&slots->lock);
0264             addr = *(unsigned long *)handle;
0265             zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
0266             locked = z3fold_page_trylock(zhdr);
0267             read_unlock(&slots->lock);
0268             if (locked) {
0269                 struct page *page = virt_to_page(zhdr);
0270 
0271                 if (!test_bit(PAGE_MIGRATED, &page->private))
0272                     break;
0273                 z3fold_page_unlock(zhdr);
0274             }
0275             cpu_relax();
0276         } while (true);
0277     } else {
0278         zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
0279     }
0280 
0281     return zhdr;
0282 }
0283 
0284 static inline void put_z3fold_header(struct z3fold_header *zhdr)
0285 {
0286     struct page *page = virt_to_page(zhdr);
0287 
0288     if (!test_bit(PAGE_HEADLESS, &page->private))
0289         z3fold_page_unlock(zhdr);
0290 }
0291 
0292 static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
0293 {
0294     struct z3fold_buddy_slots *slots;
0295     int i;
0296     bool is_free;
0297 
0298     if (WARN_ON(*(unsigned long *)handle == 0))
0299         return;
0300 
0301     slots = handle_to_slots(handle);
0302     write_lock(&slots->lock);
0303     *(unsigned long *)handle = 0;
0304 
0305     if (test_bit(HANDLES_NOFREE, &slots->pool)) {
0306         write_unlock(&slots->lock);
0307         return; /* simple case, nothing else to do */
0308     }
0309 
0310     if (zhdr->slots != slots)
0311         zhdr->foreign_handles--;
0312 
0313     is_free = true;
0314     for (i = 0; i <= BUDDY_MASK; i++) {
0315         if (slots->slot[i]) {
0316             is_free = false;
0317             break;
0318         }
0319     }
0320     write_unlock(&slots->lock);
0321 
0322     if (is_free) {
0323         struct z3fold_pool *pool = slots_to_pool(slots);
0324 
0325         if (zhdr->slots == slots)
0326             zhdr->slots = NULL;
0327         kmem_cache_free(pool->c_handle, slots);
0328     }
0329 }
0330 
0331 /* Initializes the z3fold header of a newly allocated z3fold page */
0332 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
0333                     struct z3fold_pool *pool, gfp_t gfp)
0334 {
0335     struct z3fold_header *zhdr = page_address(page);
0336     struct z3fold_buddy_slots *slots;
0337 
0338     INIT_LIST_HEAD(&page->lru);
0339     clear_bit(PAGE_HEADLESS, &page->private);
0340     clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
0341     clear_bit(NEEDS_COMPACTING, &page->private);
0342     clear_bit(PAGE_STALE, &page->private);
0343     clear_bit(PAGE_CLAIMED, &page->private);
0344     clear_bit(PAGE_MIGRATED, &page->private);
0345     if (headless)
0346         return zhdr;
0347 
0348     slots = alloc_slots(pool, gfp);
0349     if (!slots)
0350         return NULL;
0351 
0352     memset(zhdr, 0, sizeof(*zhdr));
0353     spin_lock_init(&zhdr->page_lock);
0354     kref_init(&zhdr->refcount);
0355     zhdr->cpu = -1;
0356     zhdr->slots = slots;
0357     zhdr->pool = pool;
0358     INIT_LIST_HEAD(&zhdr->buddy);
0359     INIT_WORK(&zhdr->work, compact_page_work);
0360     return zhdr;
0361 }
0362 
0363 /* Resets the struct page fields and frees the page */
0364 static void free_z3fold_page(struct page *page, bool headless)
0365 {
0366     if (!headless) {
0367         lock_page(page);
0368         __ClearPageMovable(page);
0369         unlock_page(page);
0370     }
0371     __free_page(page);
0372 }
0373 
0374 /* Helper function to build the index */
0375 static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
0376 {
0377     return (bud + zhdr->first_num) & BUDDY_MASK;
0378 }
0379 
0380 /*
0381  * Encodes the handle of a particular buddy within a z3fold page
0382  * Pool lock should be held as this function accesses first_num
0383  */
0384 static unsigned long __encode_handle(struct z3fold_header *zhdr,
0385                 struct z3fold_buddy_slots *slots,
0386                 enum buddy bud)
0387 {
0388     unsigned long h = (unsigned long)zhdr;
0389     int idx = 0;
0390 
0391     /*
0392      * For a headless page, its handle is its pointer with the extra
0393      * PAGE_HEADLESS bit set
0394      */
0395     if (bud == HEADLESS)
0396         return h | (1 << PAGE_HEADLESS);
0397 
0398     /* otherwise, return pointer to encoded handle */
0399     idx = __idx(zhdr, bud);
0400     h += idx;
0401     if (bud == LAST)
0402         h |= (zhdr->last_chunks << BUDDY_SHIFT);
0403 
0404     write_lock(&slots->lock);
0405     slots->slot[idx] = h;
0406     write_unlock(&slots->lock);
0407     return (unsigned long)&slots->slot[idx];
0408 }
0409 
0410 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
0411 {
0412     return __encode_handle(zhdr, zhdr->slots, bud);
0413 }
0414 
0415 /* only for LAST bud, returns zero otherwise */
0416 static unsigned short handle_to_chunks(unsigned long handle)
0417 {
0418     struct z3fold_buddy_slots *slots = handle_to_slots(handle);
0419     unsigned long addr;
0420 
0421     read_lock(&slots->lock);
0422     addr = *(unsigned long *)handle;
0423     read_unlock(&slots->lock);
0424     return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
0425 }
0426 
0427 /*
0428  * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle,
0429  * but that doesn't matter, because the masking will result in the
0430  * correct buddy number.
0431  */
0432 static enum buddy handle_to_buddy(unsigned long handle)
0433 {
0434     struct z3fold_header *zhdr;
0435     struct z3fold_buddy_slots *slots = handle_to_slots(handle);
0436     unsigned long addr;
0437 
0438     read_lock(&slots->lock);
0439     WARN_ON(handle & (1 << PAGE_HEADLESS));
0440     addr = *(unsigned long *)handle;
0441     read_unlock(&slots->lock);
0442     zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
0443     return (addr - zhdr->first_num) & BUDDY_MASK;
0444 }
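
To illustrate the handle layout used by __encode_handle(), handle_to_chunks() and handle_to_buddy() above: the low BUDDY_SHIFT bits of the stored value carry the rotated buddy index, and a LAST buddy additionally packs its size in chunks above them. The sketch below is stand-alone userspace code; the header address and field values are invented.

#include <stdio.h>
#include <stdint.h>

#define EX_BUDDY_MASK  0x3UL
#define EX_BUDDY_SHIFT 2
#define EX_PAGE_MASK   (~0xfffUL)            /* assumed: 4 KiB pages */

enum ex_buddy { EX_FIRST = 1, EX_MIDDLE, EX_LAST };

int main(void)
{
	uintptr_t zhdr = 0x7f1234560000UL;   /* hypothetical page-aligned header */
	unsigned int first_num = 1;          /* rotates when the page is compacted */
	unsigned int last_chunks = 5;

	/* encode a LAST buddy, as __encode_handle() does */
	unsigned int idx = (EX_LAST + first_num) & EX_BUDDY_MASK;
	uintptr_t val = zhdr + idx;                          /* buddy index in low bits */
	val |= (uintptr_t)last_chunks << EX_BUDDY_SHIFT;     /* LAST also stores its size */

	/* decode, as handle_to_buddy() and handle_to_chunks() do */
	printf("buddy  = %lu (expect %d)\n",
	       (unsigned long)((val - first_num) & EX_BUDDY_MASK), EX_LAST);
	printf("chunks = %lu (expect %u)\n",
	       (unsigned long)((val & ~EX_PAGE_MASK) >> EX_BUDDY_SHIFT), last_chunks);
	return 0;
}

Note that the handle given out to callers is not this value itself but the address of the slot that stores it, which is why handle_to_slots() can recover the SLOTS_ALIGN-aligned slots structure simply by masking the handle.
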
0445 
0446 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
0447 {
0448     return zhdr->pool;
0449 }
0450 
0451 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
0452 {
0453     struct page *page = virt_to_page(zhdr);
0454     struct z3fold_pool *pool = zhdr_to_pool(zhdr);
0455 
0456     WARN_ON(!list_empty(&zhdr->buddy));
0457     set_bit(PAGE_STALE, &page->private);
0458     clear_bit(NEEDS_COMPACTING, &page->private);
0459     spin_lock(&pool->lock);
0460     if (!list_empty(&page->lru))
0461         list_del_init(&page->lru);
0462     spin_unlock(&pool->lock);
0463 
0464     if (locked)
0465         z3fold_page_unlock(zhdr);
0466 
0467     spin_lock(&pool->stale_lock);
0468     list_add(&zhdr->buddy, &pool->stale);
0469     queue_work(pool->release_wq, &pool->work);
0470     spin_unlock(&pool->stale_lock);
0471 
0472     atomic64_dec(&pool->pages_nr);
0473 }
0474 
0475 static void release_z3fold_page_locked(struct kref *ref)
0476 {
0477     struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
0478                         refcount);
0479     WARN_ON(z3fold_page_trylock(zhdr));
0480     __release_z3fold_page(zhdr, true);
0481 }
0482 
0483 static void release_z3fold_page_locked_list(struct kref *ref)
0484 {
0485     struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
0486                            refcount);
0487     struct z3fold_pool *pool = zhdr_to_pool(zhdr);
0488 
0489     spin_lock(&pool->lock);
0490     list_del_init(&zhdr->buddy);
0491     spin_unlock(&pool->lock);
0492 
0493     WARN_ON(z3fold_page_trylock(zhdr));
0494     __release_z3fold_page(zhdr, true);
0495 }
0496 
0497 static void free_pages_work(struct work_struct *w)
0498 {
0499     struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
0500 
0501     spin_lock(&pool->stale_lock);
0502     while (!list_empty(&pool->stale)) {
0503         struct z3fold_header *zhdr = list_first_entry(&pool->stale,
0504                         struct z3fold_header, buddy);
0505         struct page *page = virt_to_page(zhdr);
0506 
0507         list_del(&zhdr->buddy);
0508         if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
0509             continue;
0510         spin_unlock(&pool->stale_lock);
0511         cancel_work_sync(&zhdr->work);
0512         free_z3fold_page(page, false);
0513         cond_resched();
0514         spin_lock(&pool->stale_lock);
0515     }
0516     spin_unlock(&pool->stale_lock);
0517 }
0518 
0519 /*
0520  * Returns the number of free chunks in a z3fold page.
0521  * NB: can't be used with HEADLESS pages.
0522  */
0523 static int num_free_chunks(struct z3fold_header *zhdr)
0524 {
0525     int nfree;
0526     /*
0527      * If there is a middle object, pick up the bigger free space
0528      * either before or after it. Otherwise just subtract the number
0529      * of chunks occupied by the first and the last objects.
0530      */
0531     if (zhdr->middle_chunks != 0) {
0532         int nfree_before = zhdr->first_chunks ?
0533             0 : zhdr->start_middle - ZHDR_CHUNKS;
0534         int nfree_after = zhdr->last_chunks ?
0535             0 : TOTAL_CHUNKS -
0536                 (zhdr->start_middle + zhdr->middle_chunks);
0537         nfree = max(nfree_before, nfree_after);
0538     } else
0539         nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
0540     return nfree;
0541 }
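
To make the rule above concrete, here is a minimal stand-alone sketch that mirrors num_free_chunks() for an assumed 64-chunk page with a one-chunk header; the object sizes are invented for illustration.

#include <stdio.h>

#define EX_TOTAL_CHUNKS 64   /* assumed: 4 KiB page, 64-byte chunks */
#define EX_ZHDR_CHUNKS   1
#define EX_NCHUNKS      (EX_TOTAL_CHUNKS - EX_ZHDR_CHUNKS)

/* mirrors num_free_chunks() above */
static int ex_free_chunks(int first, int middle, int start_middle, int last)
{
	if (middle) {
		int before = first ? 0 : start_middle - EX_ZHDR_CHUNKS;
		int after  = last  ? 0 : EX_TOTAL_CHUNKS - (start_middle + middle);
		return before > after ? before : after;
	}
	return EX_NCHUNKS - first - last;
}

int main(void)
{
	/* only a 10-chunk middle object starting at chunk 20:
	 * 19 chunks free before it, 34 after it -> report 34 */
	printf("%d\n", ex_free_chunks(0, 10, 20, 0));
	/* first (8 chunks) and last (12 chunks) objects, no middle:
	 * 63 - 8 - 12 -> 43 free chunks */
	printf("%d\n", ex_free_chunks(8, 0, 0, 12));
	return 0;
}
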
0542 
0543 /* Add to the appropriate unbuddied list */
0544 static inline void add_to_unbuddied(struct z3fold_pool *pool,
0545                 struct z3fold_header *zhdr)
0546 {
0547     if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
0548             zhdr->middle_chunks == 0) {
0549         struct list_head *unbuddied;
0550         int freechunks = num_free_chunks(zhdr);
0551 
0552         migrate_disable();
0553         unbuddied = this_cpu_ptr(pool->unbuddied);
0554         spin_lock(&pool->lock);
0555         list_add(&zhdr->buddy, &unbuddied[freechunks]);
0556         spin_unlock(&pool->lock);
0557         zhdr->cpu = smp_processor_id();
0558         migrate_enable();
0559     }
0560 }
0561 
0562 static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
0563 {
0564     enum buddy bud = HEADLESS;
0565 
0566     if (zhdr->middle_chunks) {
0567         if (!zhdr->first_chunks &&
0568             chunks <= zhdr->start_middle - ZHDR_CHUNKS)
0569             bud = FIRST;
0570         else if (!zhdr->last_chunks)
0571             bud = LAST;
0572     } else {
0573         if (!zhdr->first_chunks)
0574             bud = FIRST;
0575         else if (!zhdr->last_chunks)
0576             bud = LAST;
0577         else
0578             bud = MIDDLE;
0579     }
0580 
0581     return bud;
0582 }
0583 
0584 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
0585                 unsigned short dst_chunk)
0586 {
0587     void *beg = zhdr;
0588     return memmove(beg + (dst_chunk << CHUNK_SHIFT),
0589                beg + (zhdr->start_middle << CHUNK_SHIFT),
0590                zhdr->middle_chunks << CHUNK_SHIFT);
0591 }
0592 
0593 static inline bool buddy_single(struct z3fold_header *zhdr)
0594 {
0595     return !((zhdr->first_chunks && zhdr->middle_chunks) ||
0596             (zhdr->first_chunks && zhdr->last_chunks) ||
0597             (zhdr->middle_chunks && zhdr->last_chunks));
0598 }
0599 
0600 static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
0601 {
0602     struct z3fold_pool *pool = zhdr_to_pool(zhdr);
0603     void *p = zhdr;
0604     unsigned long old_handle = 0;
0605     size_t sz = 0;
0606     struct z3fold_header *new_zhdr = NULL;
0607     int first_idx = __idx(zhdr, FIRST);
0608     int middle_idx = __idx(zhdr, MIDDLE);
0609     int last_idx = __idx(zhdr, LAST);
0610     unsigned short *moved_chunks = NULL;
0611 
0612     /*
0613      * No need to protect slots here -- all the slots are "local" and
0614      * the page lock is already taken
0615      */
0616     if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
0617         p += ZHDR_SIZE_ALIGNED;
0618         sz = zhdr->first_chunks << CHUNK_SHIFT;
0619         old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
0620         moved_chunks = &zhdr->first_chunks;
0621     } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
0622         p += zhdr->start_middle << CHUNK_SHIFT;
0623         sz = zhdr->middle_chunks << CHUNK_SHIFT;
0624         old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
0625         moved_chunks = &zhdr->middle_chunks;
0626     } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
0627         p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
0628         sz = zhdr->last_chunks << CHUNK_SHIFT;
0629         old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
0630         moved_chunks = &zhdr->last_chunks;
0631     }
0632 
0633     if (sz > 0) {
0634         enum buddy new_bud = HEADLESS;
0635         short chunks = size_to_chunks(sz);
0636         void *q;
0637 
0638         new_zhdr = __z3fold_alloc(pool, sz, false);
0639         if (!new_zhdr)
0640             return NULL;
0641 
0642         if (WARN_ON(new_zhdr == zhdr))
0643             goto out_fail;
0644 
0645         new_bud = get_free_buddy(new_zhdr, chunks);
0646         q = new_zhdr;
0647         switch (new_bud) {
0648         case FIRST:
0649             new_zhdr->first_chunks = chunks;
0650             q += ZHDR_SIZE_ALIGNED;
0651             break;
0652         case MIDDLE:
0653             new_zhdr->middle_chunks = chunks;
0654             new_zhdr->start_middle =
0655                 new_zhdr->first_chunks + ZHDR_CHUNKS;
0656             q += new_zhdr->start_middle << CHUNK_SHIFT;
0657             break;
0658         case LAST:
0659             new_zhdr->last_chunks = chunks;
0660             q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
0661             break;
0662         default:
0663             goto out_fail;
0664         }
0665         new_zhdr->foreign_handles++;
0666         memcpy(q, p, sz);
0667         write_lock(&zhdr->slots->lock);
0668         *(unsigned long *)old_handle = (unsigned long)new_zhdr +
0669             __idx(new_zhdr, new_bud);
0670         if (new_bud == LAST)
0671             *(unsigned long *)old_handle |=
0672                     (new_zhdr->last_chunks << BUDDY_SHIFT);
0673         write_unlock(&zhdr->slots->lock);
0674         add_to_unbuddied(pool, new_zhdr);
0675         z3fold_page_unlock(new_zhdr);
0676 
0677         *moved_chunks = 0;
0678     }
0679 
0680     return new_zhdr;
0681 
0682 out_fail:
0683     if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
0684         add_to_unbuddied(pool, new_zhdr);
0685         z3fold_page_unlock(new_zhdr);
0686     }
0687     return NULL;
0688 
0689 }
0690 
0691 #define BIG_CHUNK_GAP   3
0692 /* Has to be called with lock held */
0693 static int z3fold_compact_page(struct z3fold_header *zhdr)
0694 {
0695     struct page *page = virt_to_page(zhdr);
0696 
0697     if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
0698         return 0; /* can't move middle chunk, it's used */
0699 
0700     if (unlikely(PageIsolated(page)))
0701         return 0;
0702 
0703     if (zhdr->middle_chunks == 0)
0704         return 0; /* nothing to compact */
0705 
0706     if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
0707         /* move to the beginning */
0708         mchunk_memmove(zhdr, ZHDR_CHUNKS);
0709         zhdr->first_chunks = zhdr->middle_chunks;
0710         zhdr->middle_chunks = 0;
0711         zhdr->start_middle = 0;
0712         zhdr->first_num++;
0713         return 1;
0714     }
0715 
0716     /*
0717      * moving data is expensive, so let's only do that if
0718      * there's substantial gain (at least BIG_CHUNK_GAP chunks)
0719      */
0720     if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
0721         zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
0722             BIG_CHUNK_GAP) {
0723         mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
0724         zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
0725         return 1;
0726     } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
0727            TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
0728                     + zhdr->middle_chunks) >=
0729             BIG_CHUNK_GAP) {
0730         unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
0731             zhdr->middle_chunks;
0732         mchunk_memmove(zhdr, new_start);
0733         zhdr->start_middle = new_start;
0734         return 1;
0735     }
0736 
0737     return 0;
0738 }
0739 
0740 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
0741 {
0742     struct z3fold_pool *pool = zhdr_to_pool(zhdr);
0743     struct page *page;
0744 
0745     page = virt_to_page(zhdr);
0746     if (locked)
0747         WARN_ON(z3fold_page_trylock(zhdr));
0748     else
0749         z3fold_page_lock(zhdr);
0750     if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
0751         z3fold_page_unlock(zhdr);
0752         return;
0753     }
0754     spin_lock(&pool->lock);
0755     list_del_init(&zhdr->buddy);
0756     spin_unlock(&pool->lock);
0757 
0758     if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
0759         return;
0760 
0761     if (test_bit(PAGE_STALE, &page->private) ||
0762         test_and_set_bit(PAGE_CLAIMED, &page->private)) {
0763         z3fold_page_unlock(zhdr);
0764         return;
0765     }
0766 
0767     if (!zhdr->foreign_handles && buddy_single(zhdr) &&
0768         zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
0769         if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
0770             clear_bit(PAGE_CLAIMED, &page->private);
0771             z3fold_page_unlock(zhdr);
0772         }
0773         return;
0774     }
0775 
0776     z3fold_compact_page(zhdr);
0777     add_to_unbuddied(pool, zhdr);
0778     clear_bit(PAGE_CLAIMED, &page->private);
0779     z3fold_page_unlock(zhdr);
0780 }
0781 
0782 static void compact_page_work(struct work_struct *w)
0783 {
0784     struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
0785                         work);
0786 
0787     do_compact_page(zhdr, false);
0788 }
0789 
0790 /* returns _locked_ z3fold page header or NULL */
0791 static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
0792                         size_t size, bool can_sleep)
0793 {
0794     struct z3fold_header *zhdr = NULL;
0795     struct page *page;
0796     struct list_head *unbuddied;
0797     int chunks = size_to_chunks(size), i;
0798 
0799 lookup:
0800     migrate_disable();
0801     /* First, try to find an unbuddied z3fold page. */
0802     unbuddied = this_cpu_ptr(pool->unbuddied);
0803     for_each_unbuddied_list(i, chunks) {
0804         struct list_head *l = &unbuddied[i];
0805 
0806         zhdr = list_first_entry_or_null(READ_ONCE(l),
0807                     struct z3fold_header, buddy);
0808 
0809         if (!zhdr)
0810             continue;
0811 
0812         /* Re-check under lock. */
0813         spin_lock(&pool->lock);
0814         if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
0815                         struct z3fold_header, buddy)) ||
0816             !z3fold_page_trylock(zhdr)) {
0817             spin_unlock(&pool->lock);
0818             zhdr = NULL;
0819             migrate_enable();
0820             if (can_sleep)
0821                 cond_resched();
0822             goto lookup;
0823         }
0824         list_del_init(&zhdr->buddy);
0825         zhdr->cpu = -1;
0826         spin_unlock(&pool->lock);
0827 
0828         page = virt_to_page(zhdr);
0829         if (test_bit(NEEDS_COMPACTING, &page->private) ||
0830             test_bit(PAGE_CLAIMED, &page->private)) {
0831             z3fold_page_unlock(zhdr);
0832             zhdr = NULL;
0833             migrate_enable();
0834             if (can_sleep)
0835                 cond_resched();
0836             goto lookup;
0837         }
0838 
0839         /*
0840          * this page could not be removed from its unbuddied
0841          * list while pool lock was held, and then we've taken
0842          * page lock so kref_put could not be called before
0843          * we got here, so it's safe to just call kref_get()
0844          */
0845         kref_get(&zhdr->refcount);
0846         break;
0847     }
0848     migrate_enable();
0849 
0850     if (!zhdr) {
0851         int cpu;
0852 
0853         /* look for _exact_ match on other cpus' lists */
0854         for_each_online_cpu(cpu) {
0855             struct list_head *l;
0856 
0857             unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
0858             spin_lock(&pool->lock);
0859             l = &unbuddied[chunks];
0860 
0861             zhdr = list_first_entry_or_null(READ_ONCE(l),
0862                         struct z3fold_header, buddy);
0863 
0864             if (!zhdr || !z3fold_page_trylock(zhdr)) {
0865                 spin_unlock(&pool->lock);
0866                 zhdr = NULL;
0867                 continue;
0868             }
0869             list_del_init(&zhdr->buddy);
0870             zhdr->cpu = -1;
0871             spin_unlock(&pool->lock);
0872 
0873             page = virt_to_page(zhdr);
0874             if (test_bit(NEEDS_COMPACTING, &page->private) ||
0875                 test_bit(PAGE_CLAIMED, &page->private)) {
0876                 z3fold_page_unlock(zhdr);
0877                 zhdr = NULL;
0878                 if (can_sleep)
0879                     cond_resched();
0880                 continue;
0881             }
0882             kref_get(&zhdr->refcount);
0883             break;
0884         }
0885     }
0886 
0887     if (zhdr && !zhdr->slots) {
0888         zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
0889         if (!zhdr->slots)
0890             goto out_fail;
0891     }
0892     return zhdr;
0893 
0894 out_fail:
0895     if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
0896         add_to_unbuddied(pool, zhdr);
0897         z3fold_page_unlock(zhdr);
0898     }
0899     return NULL;
0900 }
0901 
0902 /*
0903  * API Functions
0904  */
0905 
0906 /**
0907  * z3fold_create_pool() - create a new z3fold pool
0908  * @name:   pool name
0909  * @gfp:    gfp flags when allocating the z3fold pool structure
0910  * @ops:    user-defined operations for the z3fold pool
0911  *
0912  * Return: pointer to the new z3fold pool or NULL if the metadata allocation
0913  * failed.
0914  */
0915 static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
0916         const struct z3fold_ops *ops)
0917 {
0918     struct z3fold_pool *pool = NULL;
0919     int i, cpu;
0920 
0921     pool = kzalloc(sizeof(struct z3fold_pool), gfp);
0922     if (!pool)
0923         goto out;
0924     pool->c_handle = kmem_cache_create("z3fold_handle",
0925                 sizeof(struct z3fold_buddy_slots),
0926                 SLOTS_ALIGN, 0, NULL);
0927     if (!pool->c_handle)
0928         goto out_c;
0929     spin_lock_init(&pool->lock);
0930     spin_lock_init(&pool->stale_lock);
0931     pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
0932                      __alignof__(struct list_head));
0933     if (!pool->unbuddied)
0934         goto out_pool;
0935     for_each_possible_cpu(cpu) {
0936         struct list_head *unbuddied =
0937                 per_cpu_ptr(pool->unbuddied, cpu);
0938         for_each_unbuddied_list(i, 0)
0939             INIT_LIST_HEAD(&unbuddied[i]);
0940     }
0941     INIT_LIST_HEAD(&pool->lru);
0942     INIT_LIST_HEAD(&pool->stale);
0943     atomic64_set(&pool->pages_nr, 0);
0944     pool->name = name;
0945     pool->compact_wq = create_singlethread_workqueue(pool->name);
0946     if (!pool->compact_wq)
0947         goto out_unbuddied;
0948     pool->release_wq = create_singlethread_workqueue(pool->name);
0949     if (!pool->release_wq)
0950         goto out_wq;
0951     INIT_WORK(&pool->work, free_pages_work);
0952     pool->ops = ops;
0953     return pool;
0954 
0955 out_wq:
0956     destroy_workqueue(pool->compact_wq);
0957 out_unbuddied:
0958     free_percpu(pool->unbuddied);
0959 out_pool:
0960     kmem_cache_destroy(pool->c_handle);
0961 out_c:
0962     kfree(pool);
0963 out:
0964     return NULL;
0965 }
0966 
0967 /**
0968  * z3fold_destroy_pool() - destroys an existing z3fold pool
0969  * @pool:   the z3fold pool to be destroyed
0970  *
0971  * The pool should be emptied before this function is called.
0972  */
0973 static void z3fold_destroy_pool(struct z3fold_pool *pool)
0974 {
0975     kmem_cache_destroy(pool->c_handle);
0976 
0977     /*
0978      * We need to destroy pool->compact_wq before pool->release_wq,
0979      * as any pending work on pool->compact_wq will call
0980      * queue_work(pool->release_wq, &pool->work).
0981      *
0982      * There are still outstanding pages until both workqueues are drained,
0983      * so we cannot unregister migration until then.
0984      */
0985 
0986     destroy_workqueue(pool->compact_wq);
0987     destroy_workqueue(pool->release_wq);
0988     free_percpu(pool->unbuddied);
0989     kfree(pool);
0990 }
0991 
0992 static const struct movable_operations z3fold_mops;
0993 
0994 /**
0995  * z3fold_alloc() - allocates a region of a given size
0996  * @pool:   z3fold pool from which to allocate
0997  * @size:   size in bytes of the desired allocation
0998  * @gfp:    gfp flags used if the pool needs to grow
0999  * @handle: handle of the new allocation
1000  *
1001  * This function will attempt to find a free region in the pool large enough to
1002  * satisfy the allocation request.  A search of the unbuddied lists is
1003  * performed first. If no suitable free region is found, then a new page is
1004  * allocated and added to the pool to satisfy the request.
1005  *
1006  * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
1007  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
1008  * a new page.
1009  */
1010 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
1011             unsigned long *handle)
1012 {
1013     int chunks = size_to_chunks(size);
1014     struct z3fold_header *zhdr = NULL;
1015     struct page *page = NULL;
1016     enum buddy bud;
1017     bool can_sleep = gfpflags_allow_blocking(gfp);
1018 
1019     if (!size || (gfp & __GFP_HIGHMEM))
1020         return -EINVAL;
1021 
1022     if (size > PAGE_SIZE)
1023         return -ENOSPC;
1024 
1025     if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
1026         bud = HEADLESS;
1027     else {
1028 retry:
1029         zhdr = __z3fold_alloc(pool, size, can_sleep);
1030         if (zhdr) {
1031             bud = get_free_buddy(zhdr, chunks);
1032             if (bud == HEADLESS) {
1033                 if (!kref_put(&zhdr->refcount,
1034                          release_z3fold_page_locked))
1035                     z3fold_page_unlock(zhdr);
1036                 pr_err("No free chunks in unbuddied\n");
1037                 WARN_ON(1);
1038                 goto retry;
1039             }
1040             page = virt_to_page(zhdr);
1041             goto found;
1042         }
1043         bud = FIRST;
1044     }
1045 
1046     page = alloc_page(gfp);
1047     if (!page)
1048         return -ENOMEM;
1049 
1050     zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
1051     if (!zhdr) {
1052         __free_page(page);
1053         return -ENOMEM;
1054     }
1055     atomic64_inc(&pool->pages_nr);
1056 
1057     if (bud == HEADLESS) {
1058         set_bit(PAGE_HEADLESS, &page->private);
1059         goto headless;
1060     }
1061     if (can_sleep) {
1062         lock_page(page);
1063         __SetPageMovable(page, &z3fold_mops);
1064         unlock_page(page);
1065     } else {
1066         WARN_ON(!trylock_page(page));
1067         __SetPageMovable(page, &z3fold_mops);
1068         unlock_page(page);
1069     }
1070     z3fold_page_lock(zhdr);
1071 
1072 found:
1073     if (bud == FIRST)
1074         zhdr->first_chunks = chunks;
1075     else if (bud == LAST)
1076         zhdr->last_chunks = chunks;
1077     else {
1078         zhdr->middle_chunks = chunks;
1079         zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
1080     }
1081     add_to_unbuddied(pool, zhdr);
1082 
1083 headless:
1084     spin_lock(&pool->lock);
1085     /* Add/move z3fold page to beginning of LRU */
1086     if (!list_empty(&page->lru))
1087         list_del(&page->lru);
1088 
1089     list_add(&page->lru, &pool->lru);
1090 
1091     *handle = encode_handle(zhdr, bud);
1092     spin_unlock(&pool->lock);
1093     if (bud != HEADLESS)
1094         z3fold_page_unlock(zhdr);
1095 
1096     return 0;
1097 }
1098 
1099 /**
1100  * z3fold_free() - frees the allocation associated with the given handle
1101  * @pool:   pool in which the allocation resided
1102  * @handle: handle associated with the allocation returned by z3fold_alloc()
1103  *
1104  * In the case that the z3fold page in which the allocation resides is under
1105  * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
1106  * only sets the first|middle|last_chunks to 0.  The page is actually freed
1107  * once all buddies are evicted (see z3fold_reclaim_page() below).
1108  */
1109 static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
1110 {
1111     struct z3fold_header *zhdr;
1112     struct page *page;
1113     enum buddy bud;
1114     bool page_claimed;
1115 
1116     zhdr = get_z3fold_header(handle);
1117     page = virt_to_page(zhdr);
1118     page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
1119 
1120     if (test_bit(PAGE_HEADLESS, &page->private)) {
1121         /* if a headless page is under reclaim, just leave.
1122          * NB: we use test_and_set_bit for a reason: if the bit
1123          * has not been set before, we release this page
1124          * immediately so we don't care about its value any more.
1125          */
1126         if (!page_claimed) {
1127             spin_lock(&pool->lock);
1128             list_del(&page->lru);
1129             spin_unlock(&pool->lock);
1130             put_z3fold_header(zhdr);
1131             free_z3fold_page(page, true);
1132             atomic64_dec(&pool->pages_nr);
1133         }
1134         return;
1135     }
1136 
1137     /* Non-headless case */
1138     bud = handle_to_buddy(handle);
1139 
1140     switch (bud) {
1141     case FIRST:
1142         zhdr->first_chunks = 0;
1143         break;
1144     case MIDDLE:
1145         zhdr->middle_chunks = 0;
1146         break;
1147     case LAST:
1148         zhdr->last_chunks = 0;
1149         break;
1150     default:
1151         pr_err("%s: unknown bud %d\n", __func__, bud);
1152         WARN_ON(1);
1153         put_z3fold_header(zhdr);
1154         return;
1155     }
1156 
1157     if (!page_claimed)
1158         free_handle(handle, zhdr);
1159     if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
1160         return;
1161     if (page_claimed) {
1162         /* the page has not been claimed by us */
1163         put_z3fold_header(zhdr);
1164         return;
1165     }
1166     if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1167         clear_bit(PAGE_CLAIMED, &page->private);
1168         put_z3fold_header(zhdr);
1169         return;
1170     }
1171     if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1172         zhdr->cpu = -1;
1173         kref_get(&zhdr->refcount);
1174         clear_bit(PAGE_CLAIMED, &page->private);
1175         do_compact_page(zhdr, true);
1176         return;
1177     }
1178     kref_get(&zhdr->refcount);
1179     clear_bit(PAGE_CLAIMED, &page->private);
1180     queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1181     put_z3fold_header(zhdr);
1182 }
1183 
1184 /**
1185  * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1186  * @pool:   pool from which a page will attempt to be evicted
1187  * @retries:    number of pages on the LRU list for which eviction will
1188  *      be attempted before failing
1189  *
1190  * z3fold reclaim is different from normal system reclaim in that it is done
1191  * from the bottom, up. This is because only the bottom layer, z3fold, has
1192  * information on how the allocations are organized within each z3fold page.
1193  * This has the potential to create interesting locking situations between
1194  * z3fold and the user, however.
1195  *
1196  * To avoid these, this is how z3fold_reclaim_page() should be called:
1197  *
1198  * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1199  * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1200  * call the user-defined eviction handler with the pool and handle as
1201  * arguments.
1202  *
1203  * If the handle can not be evicted, the eviction handler should return
1204  * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1205  * appropriate list and try the next z3fold page on the LRU up to
1206  * a user defined number of retries.
1207  *
1208  * If the handle is successfully evicted, the eviction handler should
1209  * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
1210  * contains logic to delay freeing the page if the page is under reclaim,
1211  * as indicated by the PAGE_CLAIMED flag being set on the underlying page.
1212  *
1213  * If all buddies in the z3fold page are successfully evicted, then the
1214  * z3fold page can be freed.
1215  *
1216  * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
1217  * no pages to evict or an eviction handler is not registered, -EAGAIN if
1218  * the retry limit was hit.
1219  */
1220 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1221 {
1222     int i, ret = -1;
1223     struct z3fold_header *zhdr = NULL;
1224     struct page *page = NULL;
1225     struct list_head *pos;
1226     unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1227     struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
1228 
1229     rwlock_init(&slots.lock);
1230     slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
1231 
1232     spin_lock(&pool->lock);
1233     if (!pool->ops || !pool->ops->evict || retries == 0) {
1234         spin_unlock(&pool->lock);
1235         return -EINVAL;
1236     }
1237     for (i = 0; i < retries; i++) {
1238         if (list_empty(&pool->lru)) {
1239             spin_unlock(&pool->lock);
1240             return -EINVAL;
1241         }
1242         list_for_each_prev(pos, &pool->lru) {
1243             page = list_entry(pos, struct page, lru);
1244 
1245             zhdr = page_address(page);
1246             if (test_bit(PAGE_HEADLESS, &page->private)) {
1247                 /*
1248                  * For non-headless pages, we wait to do this
1249                  * until we have the page lock to avoid racing
1250                  * with __z3fold_alloc(). Headless pages don't
1251                  * have a lock (and __z3fold_alloc() will never
1252                  * see them), but we still need to test and set
1253                  * PAGE_CLAIMED to avoid racing with
1254                  * z3fold_free(), so just do it now before
1255                  * leaving the loop.
1256                  */
1257                 if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1258                     continue;
1259 
1260                 break;
1261             }
1262 
1263             if (!z3fold_page_trylock(zhdr)) {
1264                 zhdr = NULL;
1265                 continue; /* can't evict at this point */
1266             }
1267 
1268             /* test_and_set_bit is of course atomic, but we still
1269              * need to do it under page lock, otherwise checking
1270              * that bit in __z3fold_alloc wouldn't make sense
1271              */
1272             if (zhdr->foreign_handles ||
1273                 test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1274                 z3fold_page_unlock(zhdr);
1275                 zhdr = NULL;
1276                 continue; /* can't evict such page */
1277             }
1278             list_del_init(&zhdr->buddy);
1279             zhdr->cpu = -1;
1280             /* See comment in __z3fold_alloc. */
1281             kref_get(&zhdr->refcount);
1282             break;
1283         }
1284 
1285         if (!zhdr)
1286             break;
1287 
1288         list_del_init(&page->lru);
1289         spin_unlock(&pool->lock);
1290 
1291         if (!test_bit(PAGE_HEADLESS, &page->private)) {
1292             /*
1293              * We need to encode the handles before unlocking, and
1294              * use our local slots structure because z3fold_free
1295              * can zero out zhdr->slots and we can't do much
1296              * about that
1297              */
1298             first_handle = 0;
1299             last_handle = 0;
1300             middle_handle = 0;
1301             memset(slots.slot, 0, sizeof(slots.slot));
1302             if (zhdr->first_chunks)
1303                 first_handle = __encode_handle(zhdr, &slots,
1304                                 FIRST);
1305             if (zhdr->middle_chunks)
1306                 middle_handle = __encode_handle(zhdr, &slots,
1307                                 MIDDLE);
1308             if (zhdr->last_chunks)
1309                 last_handle = __encode_handle(zhdr, &slots,
1310                                 LAST);
1311             /*
1312              * it's safe to unlock here because we hold a
1313              * reference to this page
1314              */
1315             z3fold_page_unlock(zhdr);
1316         } else {
1317             first_handle = encode_handle(zhdr, HEADLESS);
1318             last_handle = middle_handle = 0;
1319         }
1320         /* Issue the eviction callback(s) */
1321         if (middle_handle) {
1322             ret = pool->ops->evict(pool, middle_handle);
1323             if (ret)
1324                 goto next;
1325         }
1326         if (first_handle) {
1327             ret = pool->ops->evict(pool, first_handle);
1328             if (ret)
1329                 goto next;
1330         }
1331         if (last_handle) {
1332             ret = pool->ops->evict(pool, last_handle);
1333             if (ret)
1334                 goto next;
1335         }
1336 next:
1337         if (test_bit(PAGE_HEADLESS, &page->private)) {
1338             if (ret == 0) {
1339                 free_z3fold_page(page, true);
1340                 atomic64_dec(&pool->pages_nr);
1341                 return 0;
1342             }
1343             spin_lock(&pool->lock);
1344             list_add(&page->lru, &pool->lru);
1345             spin_unlock(&pool->lock);
1346             clear_bit(PAGE_CLAIMED, &page->private);
1347         } else {
1348             struct z3fold_buddy_slots *slots = zhdr->slots;
1349             z3fold_page_lock(zhdr);
1350             if (kref_put(&zhdr->refcount,
1351                     release_z3fold_page_locked)) {
1352                 kmem_cache_free(pool->c_handle, slots);
1353                 return 0;
1354             }
1355             /*
1356              * if we are here, the page is still not completely
1357              * free. Take the global pool lock so that we can
1358              * add it back to the LRU list.
1359              */
1360             spin_lock(&pool->lock);
1361             list_add(&page->lru, &pool->lru);
1362             spin_unlock(&pool->lock);
1363             if (list_empty(&zhdr->buddy))
1364                 add_to_unbuddied(pool, zhdr);
1365             clear_bit(PAGE_CLAIMED, &page->private);
1366             z3fold_page_unlock(zhdr);
1367         }
1368 
1369         /* We started off locked, so we need to lock the pool again */
1370         spin_lock(&pool->lock);
1371     }
1372     spin_unlock(&pool->lock);
1373     return -EAGAIN;
1374 }
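
The contract spelled out in the comment above can be illustrated by a hedged sketch of an eviction handler; example_evict() and the backing-store step are hypothetical, and in practice such a callback is registered through the zpool layer (see z3fold_zpool_evict() further below).

/* hypothetical handler wired in via struct z3fold_ops at pool creation */
static int example_evict(struct z3fold_pool *pool, unsigned long handle)
{
	void *obj = z3fold_map(pool, handle);

	if (!obj)
		return -EINVAL;		/* non-zero: the page goes back on its lists */

	/* ... write the compressed object back to its backing store here ... */

	z3fold_unmap(pool, handle);
	z3fold_free(pool, handle);	/* required on success ... */
	return 0;			/* ... together with returning 0 */
}
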
1375 
1376 /**
1377  * z3fold_map() - maps the allocation associated with the given handle
1378  * @pool:   pool in which the allocation resides
1379  * @handle: handle associated with the allocation to be mapped
1380  *
1381  * Extracts the buddy number from handle and constructs the pointer to the
1382  * correct starting chunk within the page.
1383  *
1384  * Returns: a pointer to the mapped allocation
1385  */
1386 static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1387 {
1388     struct z3fold_header *zhdr;
1389     struct page *page;
1390     void *addr;
1391     enum buddy buddy;
1392 
1393     zhdr = get_z3fold_header(handle);
1394     addr = zhdr;
1395     page = virt_to_page(zhdr);
1396 
1397     if (test_bit(PAGE_HEADLESS, &page->private))
1398         goto out;
1399 
1400     buddy = handle_to_buddy(handle);
1401     switch (buddy) {
1402     case FIRST:
1403         addr += ZHDR_SIZE_ALIGNED;
1404         break;
1405     case MIDDLE:
1406         addr += zhdr->start_middle << CHUNK_SHIFT;
1407         set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1408         break;
1409     case LAST:
1410         addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1411         break;
1412     default:
1413         pr_err("unknown buddy id %d\n", buddy);
1414         WARN_ON(1);
1415         addr = NULL;
1416         break;
1417     }
1418 
1419     if (addr)
1420         zhdr->mapped_count++;
1421 out:
1422     put_z3fold_header(zhdr);
1423     return addr;
1424 }
1425 
1426 /**
1427  * z3fold_unmap() - unmaps the allocation associated with the given handle
1428  * @pool:   pool in which the allocation resides
1429  * @handle: handle associated with the allocation to be unmapped
1430  */
1431 static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1432 {
1433     struct z3fold_header *zhdr;
1434     struct page *page;
1435     enum buddy buddy;
1436 
1437     zhdr = get_z3fold_header(handle);
1438     page = virt_to_page(zhdr);
1439 
1440     if (test_bit(PAGE_HEADLESS, &page->private))
1441         return;
1442 
1443     buddy = handle_to_buddy(handle);
1444     if (buddy == MIDDLE)
1445         clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1446     zhdr->mapped_count--;
1447     put_z3fold_header(zhdr);
1448 }
1449 
1450 /**
1451  * z3fold_get_pool_size() - gets the z3fold pool size in pages
1452  * @pool:   pool whose size is being queried
1453  *
1454  * Returns: size in pages of the given pool.
1455  */
1456 static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1457 {
1458     return atomic64_read(&pool->pages_nr);
1459 }
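
Taken together, the allocation, map, unmap and free entry points above suggest the following round trip. This is a hedged sketch only: all of these functions are static, z3fold_roundtrip_sketch() is an invented name, and real users reach the same sequence through the zpool layer further below.

static int z3fold_roundtrip_sketch(struct z3fold_pool *pool,
				   const void *src, size_t len)
{
	unsigned long handle;
	void *dst;
	int ret;

	ret = z3fold_alloc(pool, len, GFP_KERNEL, &handle);
	if (ret)
		return ret;			/* -ENOSPC, -ENOMEM or -EINVAL */

	dst = z3fold_map(pool, handle);		/* pointer to the buddy's chunks */
	if (!dst) {
		z3fold_free(pool, handle);
		return -EINVAL;
	}
	memcpy(dst, src, len);
	z3fold_unmap(pool, handle);		/* drop the mapping as soon as possible */

	/* ... later, when the object is no longer needed ... */
	z3fold_free(pool, handle);
	return 0;
}
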
1460 
1461 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1462 {
1463     struct z3fold_header *zhdr;
1464     struct z3fold_pool *pool;
1465 
1466     VM_BUG_ON_PAGE(!PageMovable(page), page);
1467     VM_BUG_ON_PAGE(PageIsolated(page), page);
1468 
1469     if (test_bit(PAGE_HEADLESS, &page->private))
1470         return false;
1471 
1472     zhdr = page_address(page);
1473     z3fold_page_lock(zhdr);
1474     if (test_bit(NEEDS_COMPACTING, &page->private) ||
1475         test_bit(PAGE_STALE, &page->private))
1476         goto out;
1477 
1478     if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
1479         goto out;
1480 
1481     if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1482         goto out;
1483     pool = zhdr_to_pool(zhdr);
1484     spin_lock(&pool->lock);
1485     if (!list_empty(&zhdr->buddy))
1486         list_del_init(&zhdr->buddy);
1487     if (!list_empty(&page->lru))
1488         list_del_init(&page->lru);
1489     spin_unlock(&pool->lock);
1490 
1491     kref_get(&zhdr->refcount);
1492     z3fold_page_unlock(zhdr);
1493     return true;
1494 
1495 out:
1496     z3fold_page_unlock(zhdr);
1497     return false;
1498 }
1499 
1500 static int z3fold_page_migrate(struct page *newpage, struct page *page,
1501         enum migrate_mode mode)
1502 {
1503     struct z3fold_header *zhdr, *new_zhdr;
1504     struct z3fold_pool *pool;
1505 
1506     VM_BUG_ON_PAGE(!PageMovable(page), page);
1507     VM_BUG_ON_PAGE(!PageIsolated(page), page);
1508     VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
1509     VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1510 
1511     zhdr = page_address(page);
1512     pool = zhdr_to_pool(zhdr);
1513 
1514     if (!z3fold_page_trylock(zhdr))
1515         return -EAGAIN;
1516     if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
1517         clear_bit(PAGE_CLAIMED, &page->private);
1518         z3fold_page_unlock(zhdr);
1519         return -EBUSY;
1520     }
1521     if (work_pending(&zhdr->work)) {
1522         z3fold_page_unlock(zhdr);
1523         return -EAGAIN;
1524     }
1525     new_zhdr = page_address(newpage);
1526     memcpy(new_zhdr, zhdr, PAGE_SIZE);
1527     newpage->private = page->private;
1528     set_bit(PAGE_MIGRATED, &page->private);
1529     z3fold_page_unlock(zhdr);
1530     spin_lock_init(&new_zhdr->page_lock);
1531     INIT_WORK(&new_zhdr->work, compact_page_work);
1532     /*
1533      * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
1534      * so we only have to reinitialize it.
1535      */
1536     INIT_LIST_HEAD(&new_zhdr->buddy);
1537     __ClearPageMovable(page);
1538 
1539     get_page(newpage);
1540     z3fold_page_lock(new_zhdr);
1541     if (new_zhdr->first_chunks)
1542         encode_handle(new_zhdr, FIRST);
1543     if (new_zhdr->last_chunks)
1544         encode_handle(new_zhdr, LAST);
1545     if (new_zhdr->middle_chunks)
1546         encode_handle(new_zhdr, MIDDLE);
1547     set_bit(NEEDS_COMPACTING, &newpage->private);
1548     new_zhdr->cpu = smp_processor_id();
1549     spin_lock(&pool->lock);
1550     list_add(&newpage->lru, &pool->lru);
1551     spin_unlock(&pool->lock);
1552     __SetPageMovable(newpage, &z3fold_mops);
1553     z3fold_page_unlock(new_zhdr);
1554 
1555     queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1556 
1557     /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
1558     page->private = 0;
1559     put_page(page);
1560     return 0;
1561 }
1562 
1563 static void z3fold_page_putback(struct page *page)
1564 {
1565     struct z3fold_header *zhdr;
1566     struct z3fold_pool *pool;
1567 
1568     zhdr = page_address(page);
1569     pool = zhdr_to_pool(zhdr);
1570 
1571     z3fold_page_lock(zhdr);
1572     if (!list_empty(&zhdr->buddy))
1573         list_del_init(&zhdr->buddy);
1574     INIT_LIST_HEAD(&page->lru);
1575     if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
1576         return;
1577     spin_lock(&pool->lock);
1578     list_add(&page->lru, &pool->lru);
1579     spin_unlock(&pool->lock);
1580     if (list_empty(&zhdr->buddy))
1581         add_to_unbuddied(pool, zhdr);
1582     clear_bit(PAGE_CLAIMED, &page->private);
1583     z3fold_page_unlock(zhdr);
1584 }
1585 
1586 static const struct movable_operations z3fold_mops = {
1587     .isolate_page = z3fold_page_isolate,
1588     .migrate_page = z3fold_page_migrate,
1589     .putback_page = z3fold_page_putback,
1590 };
1591 
1592 /*****************
1593  * zpool
1594  ****************/
1595 
1596 static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1597 {
1598     if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1599         return pool->zpool_ops->evict(pool->zpool, handle);
1600     else
1601         return -ENOENT;
1602 }
1603 
1604 static const struct z3fold_ops z3fold_zpool_ops = {
1605     .evict =    z3fold_zpool_evict
1606 };
1607 
1608 static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1609                    const struct zpool_ops *zpool_ops,
1610                    struct zpool *zpool)
1611 {
1612     struct z3fold_pool *pool;
1613 
1614     pool = z3fold_create_pool(name, gfp,
1615                 zpool_ops ? &z3fold_zpool_ops : NULL);
1616     if (pool) {
1617         pool->zpool = zpool;
1618         pool->zpool_ops = zpool_ops;
1619     }
1620     return pool;
1621 }
1622 
1623 static void z3fold_zpool_destroy(void *pool)
1624 {
1625     z3fold_destroy_pool(pool);
1626 }
1627 
1628 static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1629             unsigned long *handle)
1630 {
1631     return z3fold_alloc(pool, size, gfp, handle);
1632 }
1633 static void z3fold_zpool_free(void *pool, unsigned long handle)
1634 {
1635     z3fold_free(pool, handle);
1636 }
1637 
1638 static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1639             unsigned int *reclaimed)
1640 {
1641     unsigned int total = 0;
1642     int ret = -EINVAL;
1643 
1644     while (total < pages) {
1645         ret = z3fold_reclaim_page(pool, 8);
1646         if (ret < 0)
1647             break;
1648         total++;
1649     }
1650 
1651     if (reclaimed)
1652         *reclaimed = total;
1653 
1654     return ret;
1655 }
1656 
1657 static void *z3fold_zpool_map(void *pool, unsigned long handle,
1658             enum zpool_mapmode mm)
1659 {
1660     return z3fold_map(pool, handle);
1661 }
1662 static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1663 {
1664     z3fold_unmap(pool, handle);
1665 }
1666 
1667 static u64 z3fold_zpool_total_size(void *pool)
1668 {
1669     return z3fold_get_pool_size(pool) * PAGE_SIZE;
1670 }
1671 
1672 static struct zpool_driver z3fold_zpool_driver = {
1673     .type =     "z3fold",
1674     .sleep_mapped = true,
1675     .owner =    THIS_MODULE,
1676     .create =   z3fold_zpool_create,
1677     .destroy =  z3fold_zpool_destroy,
1678     .malloc =   z3fold_zpool_malloc,
1679     .free =     z3fold_zpool_free,
1680     .shrink =   z3fold_zpool_shrink,
1681     .map =      z3fold_zpool_map,
1682     .unmap =    z3fold_zpool_unmap,
1683     .total_size =   z3fold_zpool_total_size,
1684 };
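
Because z3fold exports no API of its own, a consumer would reach it through zpool roughly as sketched below. The zpool function signatures are assumptions based on the zpool interface contemporary with this file and may differ in other kernel versions; the pool name, example_zpool_evict() and example_use_z3fold() are invented.

#include <linux/zpool.h>
#include <linux/string.h>

static int example_zpool_evict(struct zpool *zpool, unsigned long handle)
{
	/* write the object out, then zpool_free(zpool, handle) and return 0 */
	return -EAGAIN;
}

static const struct zpool_ops example_zpool_ops = {
	.evict = example_zpool_evict,
};

static int example_use_z3fold(void)
{
	struct zpool *zpool;
	unsigned long handle;
	void *buf;

	/* assumed signature: type, name, gfp flags, optional eviction ops */
	zpool = zpool_create_pool("z3fold", "example", GFP_KERNEL,
				  &example_zpool_ops);
	if (!zpool)
		return -ENOMEM;

	if (zpool_malloc(zpool, 100, GFP_KERNEL, &handle)) {
		zpool_destroy_pool(zpool);
		return -ENOMEM;
	}

	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memset(buf, 0, 100);			/* stand-in for compressed data */
	zpool_unmap_handle(zpool, handle);

	zpool_free(zpool, handle);
	zpool_destroy_pool(zpool);
	return 0;
}
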
1685 
1686 MODULE_ALIAS("zpool-z3fold");
1687 
1688 static int __init init_z3fold(void)
1689 {
1690     /*
1691      * Make sure the z3fold header is not larger than the page size and
1692      * there has remaining spaces for its buddy.
1693      */
1694     BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
1695     zpool_register_driver(&z3fold_zpool_driver);
1696 
1697     return 0;
1698 }
1699 
1700 static void __exit exit_z3fold(void)
1701 {
1702     zpool_unregister_driver(&z3fold_zpool_driver);
1703 }
1704 
1705 module_init(init_z3fold);
1706 module_exit(exit_z3fold);
1707 
1708 MODULE_LICENSE("GPL");
1709 MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1710 MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");