// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_mru_cache.h"

/*
 * The MRU Cache data structure consists of a data store, an array of lists and
 * a lock to protect its internal state.  At initialisation time, the client
 * supplies an element lifetime in milliseconds and a group count, as well as a
 * function pointer to call when deleting elements.  A data structure for
 * queueing up work in the form of timed callbacks is also included.
 *
 * The group count controls how many lists are created, and thereby how finely
 * the elements are grouped in time.  When reaping occurs, all the elements in
 * all the lists whose time has expired are deleted.
 *
 * To give an example of how this works in practice, consider a client that
 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
 * five.  Five internal lists will be created, each representing a two second
 * period in time.  When the first element is added, time zero for the data
 * structure is initialised to the current time.
 *
 * All the elements added in the first two seconds are appended to the first
 * list.  Elements added in the third and fourth seconds go into the second
 * list, and so on.  If an element is accessed at any point, it is removed from
 * its list and inserted at the head of the current most-recently-used list.
 *
 * The reaper function will have nothing to do until at least twelve seconds
 * have elapsed since the first element was added.  The reason for this is that
 * if it were called at t=11s, there could be elements in the first list that
 * have only been inactive for nine seconds, so it still does nothing.  If it is
 * called anywhere between t=12 and t=14 seconds, it will delete all the
 * elements that remain in the first list.  It's therefore possible for elements
 * to remain in the data store even after they've been inactive for up to
 * (t + t/g) seconds, where t is the inactive element lifetime and g is the
 * number of groups.
 *
 * The above example assumes that the reaper function gets called at least once
 * every (t/g) seconds.  If it is called less frequently, unused elements will
 * accumulate in the reap list until the reaper function is eventually called.
 * The current implementation uses work queue callbacks to carefully time the
 * reaper function calls, so this should happen rarely, if at all.
 *
 * From a design perspective, the primary reason for the choice of a list array
 * representing discrete time intervals is that it's only practical to reap
 * expired elements in groups of some appreciable size.  This automatically
 * introduces a granularity to element lifetimes, so there's no point storing an
 * individual timeout with each element that specifies a more precise reap time.
 * The bonus is a saving of sizeof(long) bytes of memory per element stored.
 *
 * The elements could have been stored in just one list, but an array of
 * counters or pointers would need to be maintained to allow them to be divided
 * up into discrete time groups.  More critically, the process of touching or
 * removing an element would involve walking large portions of the entire list,
 * which would have a detrimental effect on performance.  The additional memory
 * requirement for the array of list heads is minimal.
 *
 * When an element is touched or deleted, it needs to be removed from its
 * current list.  Doubly linked lists are used to make the list maintenance
 * portion of these operations O(1).  Since reaper timing can be imprecise,
 * inserts and lookups can occur when there are no free lists available.  When
 * this happens, all the elements on the LRU list need to be migrated to the end
 * of the reap list.  To keep the list maintenance portion of these operations
 * O(1) also, list tails need to be accessible without walking the entire list.
 * This is the reason why doubly linked list heads are used.
 */
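
/*
 * To make the timing above concrete, here is the arithmetic for the
 * ten-second, five-group example (a sketch with made-up constants, not
 * values used anywhere in this file):
 *
 *    lifetime_ms = 10000, grp_count = 5
 *    grp_time    = msecs_to_jiffies(10000) / 5      -> two seconds per group
 *
 *    worst-case inactive residence = t + t/g = 10s + 10s/5 = 12s
 *
 * An element inserted right at the start of a group interval can therefore
 * survive for up to (t + t/g) before the reaper frees it.
 */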

/*
 * An MRU Cache is a dynamic data structure that stores its elements in a way
 * that allows efficient lookups, but also groups them into discrete time
 * intervals based on insertion time.  This allows elements to be efficiently
 * and automatically reaped after a fixed period of inactivity.
 *
 * When a client data pointer is stored in the MRU Cache it needs to be added to
 * both the data store and to one of the lists.  It must also be possible to
 * access each of these entries via the other, i.e. to:
 *
 *    a) Walk a list, removing the corresponding data store entry for each item.
 *    b) Look up a data store entry, then access its list entry directly.
 *
 * To achieve both of these goals, each entry must contain both a list entry and
 * a key, in addition to the user's data pointer.  Note that it's not a good
 * idea to have the client embed one of these structures at the top of their own
 * data structure, because inserting the same item more than once would most
 * likely result in a loop in one of the lists.  That's a sure-fire recipe for
 * an infinite loop in the code.
 */
struct xfs_mru_cache {
    struct radix_tree_root  store;     /* Core storage data structure.  */
    struct list_head    *lists;    /* Array of lists, one per grp.  */
    struct list_head    reap_list; /* Elements overdue for reaping. */
    spinlock_t      lock;      /* Lock to protect this struct.  */
    unsigned int        grp_count; /* Number of discrete groups.    */
    unsigned int        grp_time;  /* Time period spanned by grps.  */
    unsigned int        lru_grp;   /* Group containing time zero.   */
    unsigned long       time_zero; /* Time first element was added. */
    xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
    struct delayed_work work;      /* Workqueue data for reaping.   */
    unsigned int        queued;    /* work has been queued          */
    void            *data;
};
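
/*
 * A typical client wraps the cache element in its own structure and recovers
 * it with container_of().  A minimal sketch follows; foo_item and its fields
 * are hypothetical names, not anything defined by this file:
 *
 *    struct foo_item {
 *        struct xfs_mru_cache_elem mru;      // supplies list_node and key
 *        void                      *payload; // client data
 *    };
 *
 *    static struct foo_item *
 *    foo_from_elem(struct xfs_mru_cache_elem *elem)
 *    {
 *        return container_of(elem, struct foo_item, mru);
 *    }
 *
 * The free function passed to xfs_mru_cache_create() receives the embedded
 * element and can free the containing structure the same way.
 */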

static struct workqueue_struct  *xfs_mru_reap_wq;

/*
 * When inserting, destroying or reaping, it's first necessary to update the
 * lists relative to a particular time.  In the case of destroying, that time
 * will be well in the future to ensure that all items are moved to the reap
 * list.  In all other cases though, the time will be the current time.
 *
 * This function enters a loop, moving the contents of the LRU list to the reap
 * list again and again until either a) the lists are all empty, or b) time zero
 * has been advanced sufficiently to be within the immediate element lifetime.
 *
 * Case a) above is detected by counting how many groups are migrated and
 * stopping when they've all been moved.  Case b) is detected by monitoring the
 * time_zero field, which is updated as each group is migrated.
 *
 * The return value is the earliest time that more migration could be needed, or
 * zero if there's no need to schedule more work because the lists are empty.
 */
STATIC unsigned long
_xfs_mru_cache_migrate(
    struct xfs_mru_cache    *mru,
    unsigned long       now)
{
    unsigned int        grp;
    unsigned int        migrated = 0;
    struct list_head    *lru_list;

    /* Nothing to do if the data store is empty. */
    if (!mru->time_zero)
        return 0;

    /* While time zero is older than the time spanned by all the lists. */
    while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {

        /*
         * If the LRU list isn't empty, migrate its elements to the tail
         * of the reap list.
         */
        lru_list = mru->lists + mru->lru_grp;
        if (!list_empty(lru_list))
            list_splice_init(lru_list, mru->reap_list.prev);

        /*
         * Advance the LRU group number, freeing the old LRU list to
         * become the new MRU list; advance time zero accordingly.
         */
        mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
        mru->time_zero += mru->grp_time;

        /*
         * If reaping is so far behind that all the elements on all the
         * lists have been migrated to the reap list, it's now empty.
         */
        if (++migrated == mru->grp_count) {
            mru->lru_grp = 0;
            mru->time_zero = 0;
            return 0;
        }
    }

    /* Find the first non-empty list from the LRU end. */
    for (grp = 0; grp < mru->grp_count; grp++) {

        /* Check the grp'th list from the LRU end. */
        lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
        if (!list_empty(lru_list))
            return mru->time_zero +
                   (mru->grp_count + grp) * mru->grp_time;
    }

    /* All the lists must be empty. */
    mru->lru_grp = 0;
    mru->time_zero = 0;
    return 0;
}
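
/*
 * A worked trace of the migration loop above, reusing the ten-second,
 * five-group example from the top of this file (all numbers illustrative):
 *
 *    grp_time = 2s, grp_count = 6 (the client's 5 plus the extra list
 *    allocated by xfs_mru_cache_create()), time_zero = 0s, now = 13s
 *
 *    pass 1: 0s <= 13s - 12s, so group 0 is spliced onto the reap list,
 *            lru_grp becomes 1 and time_zero becomes 2s
 *    pass 2: 2s <= 1s is false, so the loop exits
 *
 * The function then scans forward from lru_grp for the first non-empty list
 * and returns the time at which that group will be due for reaping.
 */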

/*
 * When inserting or doing a lookup, an element needs to be inserted into the
 * MRU list.  The lists must be migrated first to ensure that they're
 * up-to-date, otherwise the new element could be given a shorter lifetime in
 * the cache than it should.
 */
STATIC void
_xfs_mru_cache_list_insert(
    struct xfs_mru_cache    *mru,
    struct xfs_mru_cache_elem *elem)
{
    unsigned int        grp = 0;
    unsigned long       now = jiffies;

    /*
     * If the data store is empty, initialise time zero, leave grp set to
     * zero and start the work queue timer if necessary.  Otherwise, set grp
     * to the number of group times that have elapsed since time zero.
     */
    if (!_xfs_mru_cache_migrate(mru, now)) {
        mru->time_zero = now;
        if (!mru->queued) {
            mru->queued = 1;
            queue_delayed_work(xfs_mru_reap_wq, &mru->work,
                               mru->grp_count * mru->grp_time);
        }
    } else {
        grp = (now - mru->time_zero) / mru->grp_time;
        grp = (mru->lru_grp + grp) % mru->grp_count;
    }

    /* Insert the element at the tail of the corresponding list. */
    list_add_tail(&elem->list_node, mru->lists + grp);
}
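
/*
 * Continuing the trace above: with time_zero = 2s, grp_time = 2s,
 * grp_count = 6 and lru_grp = 1, an insert at now = 9s computes (again,
 * illustrative numbers only):
 *
 *    grp = (9s - 2s) / 2s = 3        three whole group times have elapsed
 *    grp = (1 + 3) % 6   = 4         absolute index of the target list
 *
 * so the element is appended to the list currently acting as the MRU list.
 */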

/*
 * When destroying or reaping, all the elements that were migrated to the reap
 * list need to be deleted.  For each element this involves removing it from the
 * data store, removing it from the reap list, calling the client's free
 * function and deleting the element from the element cache.
 *
 * We get called holding the mru->lock, which we drop and then reacquire.
 * Sparse needs special help with this to tell it we know what we are doing.
 */
STATIC void
_xfs_mru_cache_clear_reap_list(
    struct xfs_mru_cache    *mru)
        __releases(mru->lock) __acquires(mru->lock)
{
    struct xfs_mru_cache_elem *elem, *next;
    struct list_head    tmp;

    INIT_LIST_HEAD(&tmp);
    list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {

        /* Remove the element from the data store. */
        radix_tree_delete(&mru->store, elem->key);

        /*
         * Move the element to a temporary list so it can be freed
         * without needing to hold the lock.
         */
        list_move(&elem->list_node, &tmp);
    }
    spin_unlock(&mru->lock);

    list_for_each_entry_safe(elem, next, &tmp, list_node) {
        list_del_init(&elem->list_node);
        mru->free_func(mru->data, elem);
    }

    spin_lock(&mru->lock);
}

/*
 * We fire the reap timer every group expiry interval so we always have a
 * reaper ready to run.  This makes shutdown and flushing of the reaper easy
 * to do.  Hence we need to keep track of when the next reap must occur so we
 * can determine at each interval whether there is anything we need to do.
 */
STATIC void
_xfs_mru_cache_reap(
    struct work_struct  *work)
{
    struct xfs_mru_cache    *mru =
        container_of(work, struct xfs_mru_cache, work.work);
    unsigned long       now, next;

    ASSERT(mru && mru->lists);
    if (!mru || !mru->lists)
        return;

    spin_lock(&mru->lock);
    next = _xfs_mru_cache_migrate(mru, jiffies);
    _xfs_mru_cache_clear_reap_list(mru);

    mru->queued = next;
    if (mru->queued > 0) {
        now = jiffies;
        if (next <= now)
            next = 0;
        else
            next -= now;
        queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
    }

    spin_unlock(&mru->lock);
}

int
xfs_mru_cache_init(void)
{
    xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
            XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
    if (!xfs_mru_reap_wq)
        return -ENOMEM;
    return 0;
}

void
xfs_mru_cache_uninit(void)
{
    destroy_workqueue(xfs_mru_reap_wq);
}

/*
 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
 * with the address of the pointer, a lifetime value in milliseconds, a group
 * count and a free function to use when deleting elements.  This function
 * returns 0 if the initialisation was successful.
 */
int
xfs_mru_cache_create(
    struct xfs_mru_cache    **mrup,
    void            *data,
    unsigned int        lifetime_ms,
    unsigned int        grp_count,
    xfs_mru_cache_free_func_t free_func)
{
    struct xfs_mru_cache    *mru = NULL;
    int         err = 0, grp;
    unsigned int        grp_time;

    if (mrup)
        *mrup = NULL;

    if (!mrup || !grp_count || !lifetime_ms || !free_func)
        return -EINVAL;

    if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
        return -EINVAL;

    if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
        return -ENOMEM;

    /* An extra list is needed to avoid reaping up to a grp_time early. */
    mru->grp_count = grp_count + 1;
    mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);

    if (!mru->lists) {
        err = -ENOMEM;
        goto exit;
    }

    for (grp = 0; grp < mru->grp_count; grp++)
        INIT_LIST_HEAD(mru->lists + grp);

    /*
     * We use GFP_KERNEL radix tree preload and do inserts under a
     * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
     */
    INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
    INIT_LIST_HEAD(&mru->reap_list);
    spin_lock_init(&mru->lock);
    INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);

    mru->grp_time  = grp_time;
    mru->free_func = free_func;
    mru->data = data;
    *mrup = mru;

exit:
    if (err && mru && mru->lists)
        kmem_free(mru->lists);
    if (err && mru)
        kmem_free(mru);

    return err;
}
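
/*
 * A minimal creation sketch (the context pointer, constants and callback
 * names are hypothetical; the in-tree user of this API is the filestreams
 * allocator):
 *
 *    struct xfs_mru_cache *cache;
 *    int                  error;
 *
 *    error = xfs_mru_cache_create(&cache, my_ctx, 10000, 10, my_free_func);
 *    if (error)
 *        return error;
 *
 * With these values, an element left untouched for ten seconds is handed to
 * my_free_func() within roughly one further group time (10s / 10 groups).
 */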

/*
 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
 * free functions as they're deleted.  When this function returns, the caller is
 * guaranteed that all the free functions for all the elements have finished
 * executing and the reaper is not running.
 */
static void
xfs_mru_cache_flush(
    struct xfs_mru_cache    *mru)
{
    if (!mru || !mru->lists)
        return;

    spin_lock(&mru->lock);
    if (mru->queued) {
        spin_unlock(&mru->lock);
        cancel_delayed_work_sync(&mru->work);
        spin_lock(&mru->lock);
    }

    _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
    _xfs_mru_cache_clear_reap_list(mru);

    spin_unlock(&mru->lock);
}

void
xfs_mru_cache_destroy(
    struct xfs_mru_cache    *mru)
{
    if (!mru || !mru->lists)
        return;

    xfs_mru_cache_flush(mru);

    kmem_free(mru->lists);
    kmem_free(mru);
}

/*
 * To insert an element, call xfs_mru_cache_insert() with the data store, the
 * element's key and the client data pointer.  This function returns 0 on
 * success, -ENOMEM if memory for the data element couldn't be allocated, or
 * -EEXIST if the key is already present in the data store.
 */
int
xfs_mru_cache_insert(
    struct xfs_mru_cache    *mru,
    unsigned long       key,
    struct xfs_mru_cache_elem *elem)
{
    int         error;

    ASSERT(mru && mru->lists);
    if (!mru || !mru->lists)
        return -EINVAL;

    if (radix_tree_preload(GFP_NOFS))
        return -ENOMEM;

    INIT_LIST_HEAD(&elem->list_node);
    elem->key = key;

    spin_lock(&mru->lock);
    error = radix_tree_insert(&mru->store, key, elem);
    radix_tree_preload_end();
    if (!error)
        _xfs_mru_cache_list_insert(mru, elem);
    spin_unlock(&mru->lock);

    return error;
}
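
/*
 * An insertion sketch building on the hypothetical foo_item wrapper shown
 * near the top of this file:
 *
 *    struct foo_item *item = kmem_zalloc(sizeof(*item), 0);
 *
 *    if (!item)
 *        return -ENOMEM;
 *    error = xfs_mru_cache_insert(cache, my_key, &item->mru);
 *    if (error)
 *        kmem_free(item);
 *
 * Note that the caller allocates the element; this function only links it
 * into the radix tree and the appropriate MRU list.
 */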

/*
 * To remove an element without calling the free function, call
 * xfs_mru_cache_remove() with the data store and the element's key.  On success
 * the client data pointer for the removed element is returned, otherwise this
 * function will return a NULL pointer.
 */
struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
    struct xfs_mru_cache    *mru,
    unsigned long       key)
{
    struct xfs_mru_cache_elem *elem;

    ASSERT(mru && mru->lists);
    if (!mru || !mru->lists)
        return NULL;

    spin_lock(&mru->lock);
    elem = radix_tree_delete(&mru->store, key);
    if (elem)
        list_del(&elem->list_node);
    spin_unlock(&mru->lock);

    return elem;
}

/*
 * To remove an element and call the free function, call xfs_mru_cache_delete()
 * with the data store and the element's key.
 */
void
xfs_mru_cache_delete(
    struct xfs_mru_cache    *mru,
    unsigned long       key)
{
    struct xfs_mru_cache_elem *elem;

    elem = xfs_mru_cache_remove(mru, key);
    if (elem)
        mru->free_func(mru->data, elem);
}

/*
 * To look up an element using its key, call xfs_mru_cache_lookup() with the
 * data store and the element's key.  If found, the element will be moved to the
 * head of the MRU list to indicate that it's been touched.
 *
 * The internal data structures are protected by a spinlock that is STILL HELD
 * when this function returns.  Call xfs_mru_cache_done() to release it.  Note
 * that it is not safe to call any function that might sleep in the interim.
 *
 * The implementation could have used reference counting to avoid this
 * restriction, but since most clients simply want to get, set or test a member
 * of the returned data structure, the extra per-element memory isn't warranted.
 *
 * If the element isn't found, this function returns NULL and the spinlock is
 * released.  xfs_mru_cache_done() should NOT be called when this occurs.
 *
 * Because sparse isn't smart enough to know about conditional lock return
 * status, we need to help it get it right by annotating the path that does
 * not release the lock.
 */
struct xfs_mru_cache_elem *
xfs_mru_cache_lookup(
    struct xfs_mru_cache    *mru,
    unsigned long       key)
{
    struct xfs_mru_cache_elem *elem;

    ASSERT(mru && mru->lists);
    if (!mru || !mru->lists)
        return NULL;

    spin_lock(&mru->lock);
    elem = radix_tree_lookup(&mru->store, key);
    if (elem) {
        list_del(&elem->list_node);
        _xfs_mru_cache_list_insert(mru, elem);
        __release(mru_lock); /* help sparse not be stupid */
    } else
        spin_unlock(&mru->lock);

    return elem;
}
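
/*
 * The lookup/done pairing in practice, again using the hypothetical
 * foo_item wrapper:
 *
 *    struct xfs_mru_cache_elem *elem;
 *
 *    elem = xfs_mru_cache_lookup(cache, my_key);
 *    if (elem) {
 *        struct foo_item *item = foo_from_elem(elem);
 *
 *        // examine or update item here; sleeping is not allowed
 *        // because mru->lock is still held
 *        xfs_mru_cache_done(cache);
 *    }
 *
 * On a miss the lock has already been dropped, so xfs_mru_cache_done() must
 * not be called.
 */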

/*
 * To release the internal data structure spinlock after having performed an
 * xfs_mru_cache_lookup(), call xfs_mru_cache_done() with the data store
 * pointer.
 */
void
xfs_mru_cache_done(
    struct xfs_mru_cache    *mru)
        __releases(mru->lock)
{
    spin_unlock(&mru->lock);
}