// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_mru_cache.h"

/*
 * The MRU Cache data structure consists of a data store, an array of lists and
 * a lock to protect its internal state.  At initialisation time, the client
 * supplies an element lifetime in milliseconds and a group count, as well as a
 * function pointer to call when deleting elements.  A data structure for
 * queueing up work in the form of timed callbacks is also included.
 *
 * The group count controls how many lists are created, and thereby how finely
 * the elements are grouped in time.  When reaping occurs, all the elements in
 * all the lists whose time has expired are deleted.
 *
 * To give an example of how this works in practice, consider a client that
 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
 * five.  Five internal lists will be created, each representing a two second
 * period in time.  When the first element is added, time zero for the data
 * structure is initialised to the current time.
 *
 * All the elements added in the first two seconds are appended to the first
 * list.  Elements added in the next two seconds go into the second list, and
 * so on.  If an element is accessed at any point, it is removed from its list
 * and inserted at the head of the current most-recently-used list.
 *
 * The reaper function will have nothing to do until at least twelve seconds
 * have elapsed since the first element was added.  The reason for this is that
 * if it were called at t=11s, there could be elements in the first list that
 * have only been inactive for nine seconds, so it still does nothing.  If it
 * is called anywhere between t=12 and t=14 seconds, it will delete all the
 * elements that remain in the first list.  It's therefore possible for
 * elements to remain in the data store even after they've been inactive for
 * up to (t + t/g) seconds, where t is the inactive element lifetime and g is
 * the number of groups.
 *
 * The above example assumes that the reaper function gets called at least
 * once every (t/g) seconds.  If it is called less frequently, unused elements
 * will accumulate in the reap list until the reaper function is eventually
 * called.  The current implementation uses work queue callbacks to carefully
 * time the reaper function calls, so this should happen rarely, if at all.
 *
 * From a design perspective, the primary reason for the choice of a list
 * array representing discrete time intervals is that it's only practical to
 * reap expired elements in groups of some appreciable size.  This
 * automatically introduces a granularity to element lifetimes, so there's no
 * point storing an individual timeout with each element that specifies a more
 * precise reap time.  The bonus is a saving of sizeof(long) bytes of memory
 * per element stored.
 *
 * The elements could have been stored in just one list, but an array of
 * counters or pointers would need to be maintained to allow them to be
 * divided up into discrete time groups.  More critically, the process of
 * touching or removing an element would involve walking large portions of the
 * entire list, which would have a detrimental effect on performance.  The
 * additional memory requirement for the array of list heads is minimal.
 *
 * When an element is touched or deleted, it needs to be removed from its
 * current list.  Doubly linked lists are used to make the list maintenance
 * portion of these operations O(1).  Since reaper timing can be imprecise,
 * inserts and lookups can occur when there are no free lists available.  When
 * this happens, all the elements on the LRU list need to be migrated to the
 * end of the reap list.  To keep the list maintenance portion of these
 * operations O(1) also, list tails need to be accessible without walking the
 * entire list.  This is the reason why doubly linked list heads are used.
 */
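/*
 * To make the arithmetic above concrete, here is a minimal sketch using the
 * hypothetical values from the example (none of these variables are defined
 * in this file):
 *
 *	lifetime_ms = 10000;			// t: ten second lifetime
 *	grp_count = 5;				// g: five groups
 *	grp_time = msecs_to_jiffies(lifetime_ms) / grp_count;	// 2s/group
 *
 *	// Worst-case idle residency is t + t/g = 12 seconds:
 *	max_idle = msecs_to_jiffies(lifetime_ms + lifetime_ms / grp_count);
 *
 * An element inserted at t=0s is guaranteed at least ten seconds of idle
 * lifetime, but may survive until t=12s if the reaper runs late within the
 * element's two-second group.
 */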
/*
 * An MRU Cache is a dynamic data structure that stores its elements in a way
 * that allows efficient lookups, but also groups them into discrete time
 * intervals based on insertion time.  This allows elements to be efficiently
 * and automatically reaped after a fixed period of inactivity.
 *
 * When a client data pointer is stored in the MRU Cache it needs to be added
 * to both the data store and to one of the lists.  It must also be possible
 * to access each of these entries via the other, i.e. to:
 *
 *    a) Walk a list, removing the corresponding data store entry for each
 *       item.
 *    b) Look up a data store entry, then access its list entry directly.
 *
 * To achieve both of these goals, each entry must contain both a list entry
 * and a key, in addition to the user's data pointer.  Note that it's not a
 * good idea to have the client embed one of these structures at the top of
 * their own data structure, because inserting the same item more than once
 * would most likely result in a loop in one of the lists.  That's a sure-fire
 * recipe for an infinite loop in the code.
 */
struct xfs_mru_cache {
	struct radix_tree_root	store;     /* Core storage data structure.  */
	struct list_head	*lists;    /* Array of lists, one per grp.  */
	struct list_head	reap_list; /* Elements overdue for reaping. */
	spinlock_t		lock;      /* Lock to protect this struct.  */
	unsigned int		grp_count; /* Number of discrete groups.    */
	unsigned int		grp_time;  /* Time period spanned by grps.  */
	unsigned int		lru_grp;   /* Group containing time zero.   */
	unsigned long		time_zero; /* Time first element was added. */
	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
	struct delayed_work	work;      /* Workqueue data for reaping.   */
	unsigned int		queued;    /* work has been queued          */
	void			*data;
};

static struct workqueue_struct	*xfs_mru_reap_wq;

/*
 * When inserting, destroying or reaping, it's first necessary to update the
 * lists relative to a particular time.  In the case of destroying, that time
 * will be well in the future to ensure that all items are moved to the reap
 * list.  In all other cases though, the time will be the current time.
 *
 * This function enters a loop, moving the contents of the LRU list to the
 * reap list again and again until either a) the lists are all empty, or b)
 * time zero has been advanced sufficiently to be within the immediate element
 * lifetime.
 *
 * Case a) above is detected by counting how many groups are migrated and
 * stopping when they've all been moved.  Case b) is detected by monitoring
 * the time_zero field, which is updated as each group is migrated.
 *
 * The return value is the earliest time that more migration could be needed,
 * or zero if there's no need to schedule more work because the lists are
 * empty.
 */
STATIC unsigned long
_xfs_mru_cache_migrate(
	struct xfs_mru_cache	*mru,
	unsigned long		now)
{
	unsigned int		grp;
	unsigned int		migrated = 0;
	struct list_head	*lru_list;

	/* Nothing to do if the data store is empty. */
	if (!mru->time_zero)
		return 0;

	/* While time zero is older than the time spanned by all the lists. */
	while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {

		/*
		 * If the LRU list isn't empty, migrate its elements to the
		 * tail of the reap list.
		 */
		lru_list = mru->lists + mru->lru_grp;
		if (!list_empty(lru_list))
			list_splice_init(lru_list, mru->reap_list.prev);

		/*
		 * Advance the LRU group number, freeing the old LRU list to
		 * become the new MRU list; advance time zero accordingly.
		 */
		mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
		mru->time_zero += mru->grp_time;

		/*
		 * If reaping is so far behind that all the elements on all
		 * the lists have been migrated to the reap list, it's now
		 * empty.
		 */
		if (++migrated == mru->grp_count) {
			mru->lru_grp = 0;
			mru->time_zero = 0;
			return 0;
		}
	}

	/* Find the first non-empty list from the LRU end. */
	for (grp = 0; grp < mru->grp_count; grp++) {

		/* Check the grp'th list from the LRU end. */
		lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
		if (!list_empty(lru_list))
			return mru->time_zero +
				(mru->grp_count + grp) * mru->grp_time;
	}

	/* All the lists must be empty. */
	mru->lru_grp = 0;
	mru->time_zero = 0;
	return 0;
}
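/*
 * A minimal sketch of one pass of the migration loop above, assuming the
 * hypothetical state grp_count = 5 and lru_grp = 3:
 *
 *	// Move the expired LRU group to the tail of the reap list...
 *	list_splice_init(&mru->lists[3], mru->reap_list.prev);
 *
 *	// ...then recycle it as the new, empty MRU group:
 *	mru->lru_grp = (3 + 1) % 5;		// lru_grp becomes 4
 *	mru->time_zero += mru->grp_time;	// window slides one group
 *
 * Each pass slides the time window forward by one group interval; after
 * grp_count passes every list has been drained and the cache is empty.
 */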
/*
 * When inserting or doing a lookup, an element needs to be inserted into the
 * MRU list.  The lists must be migrated first to ensure that they're
 * up-to-date, otherwise the new element could be given a shorter lifetime in
 * the cache than it should.
 */
STATIC void
_xfs_mru_cache_list_insert(
	struct xfs_mru_cache	*mru,
	struct xfs_mru_cache_elem *elem)
{
	unsigned int		grp = 0;
	unsigned long		now = jiffies;

	/*
	 * If the data store is empty, initialise time zero, leave grp set to
	 * zero and start the work queue timer if necessary.  Otherwise, set
	 * grp to the number of group times that have elapsed since time zero.
	 */
	if (!_xfs_mru_cache_migrate(mru, now)) {
		mru->time_zero = now;
		if (!mru->queued) {
			mru->queued = 1;
			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
					   mru->grp_count * mru->grp_time);
		}
	} else {
		grp = (now - mru->time_zero) / mru->grp_time;
		grp = (mru->lru_grp + grp) % mru->grp_count;
	}

	/* Insert the element at the tail of the corresponding list. */
	list_add_tail(&elem->list_node, mru->lists + grp);
}

/*
 * When destroying or reaping, all the elements that were migrated to the reap
 * list need to be deleted.  For each element this involves removing it from
 * the data store, removing it from the reap list and calling the client's
 * free function.
 *
 * We get called holding the mru->lock, which we drop and then reacquire.
 * Sparse needs special help with this to tell it we know what we are doing.
 */
STATIC void
_xfs_mru_cache_clear_reap_list(
	struct xfs_mru_cache	*mru)
		__releases(mru->lock) __acquires(mru->lock)
{
	struct xfs_mru_cache_elem *elem, *next;
	struct list_head	tmp;

	INIT_LIST_HEAD(&tmp);
	list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {

		/* Remove the element from the data store. */
		radix_tree_delete(&mru->store, elem->key);

		/*
		 * Move it to a temporary list so it can be freed without
		 * needing to hold the lock.
		 */
		list_move(&elem->list_node, &tmp);
	}
	spin_unlock(&mru->lock);

	list_for_each_entry_safe(elem, next, &tmp, list_node) {
		list_del_init(&elem->list_node);
		mru->free_func(mru->data, elem);
	}

	spin_lock(&mru->lock);
}

/*
 * We fire the reap timer every group expiry interval so we always have a
 * reaper ready to run.  This makes shutdown and flushing of the reaper easy
 * to do.  Hence we need to keep track of when the next reap must occur so we
 * can determine at each interval whether there is anything we need to do.
 */
STATIC void
_xfs_mru_cache_reap(
	struct work_struct	*work)
{
	struct xfs_mru_cache	*mru =
		container_of(work, struct xfs_mru_cache, work.work);
	unsigned long		now, next;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return;

	spin_lock(&mru->lock);
	next = _xfs_mru_cache_migrate(mru, jiffies);
	_xfs_mru_cache_clear_reap_list(mru);

	mru->queued = next;
	if (mru->queued > 0) {
		now = jiffies;
		if (next <= now)
			next = 0;
		else
			next -= now;
		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
	}

	spin_unlock(&mru->lock);
}
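/*
 * Sketch of the requeue arithmetic performed above, with hypothetical
 * numbers: suppose the migrate pass reports that the next group expires at
 * absolute time next = 1000200 jiffies while jiffies is currently 1000150.
 *
 *	next = 1000200;				// absolute expiry time
 *	now = jiffies;				// 1000150
 *	next = (next <= now) ? 0 : next - now;	// relative delay: 50 jiffies
 *	queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
 *
 * A delay of zero requeues the reaper to run again as soon as possible,
 * which is how the code copes with reaping having fallen behind.  A nonzero
 * mru->queued keeps the reaper armed; zero lets it lapse until the next
 * insert requeues it.
 */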
int
xfs_mru_cache_init(void)
{
	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
			XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
	if (!xfs_mru_reap_wq)
		return -ENOMEM;
	return 0;
}

void
xfs_mru_cache_uninit(void)
{
	destroy_workqueue(xfs_mru_reap_wq);
}

/*
 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
 * with the address of the pointer, an opaque client data pointer, a lifetime
 * value in milliseconds, a group count and a free function to use when
 * deleting elements.  This function returns 0 if the initialisation was
 * successful.
 */
int
xfs_mru_cache_create(
	struct xfs_mru_cache	**mrup,
	void			*data,
	unsigned int		lifetime_ms,
	unsigned int		grp_count,
	xfs_mru_cache_free_func_t free_func)
{
	struct xfs_mru_cache	*mru = NULL;
	int			err = 0, grp;
	unsigned int		grp_time;

	if (mrup)
		*mrup = NULL;

	if (!mrup || !grp_count || !lifetime_ms || !free_func)
		return -EINVAL;

	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
		return -EINVAL;

	if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
		return -ENOMEM;

	/* An extra list is needed to avoid reaping up to a grp_time early. */
	mru->grp_count = grp_count + 1;
	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);

	if (!mru->lists) {
		err = -ENOMEM;
		goto exit;
	}

	for (grp = 0; grp < mru->grp_count; grp++)
		INIT_LIST_HEAD(mru->lists + grp);

	/*
	 * We use GFP_KERNEL radix tree preload and do inserts under a
	 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
	 */
	INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
	INIT_LIST_HEAD(&mru->reap_list);
	spin_lock_init(&mru->lock);
	INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);

	mru->grp_time = grp_time;
	mru->free_func = free_func;
	mru->data = data;
	*mrup = mru;

exit:
	if (err && mru && mru->lists)
		kmem_free(mru->lists);
	if (err && mru)
		kmem_free(mru);

	return err;
}
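/*
 * A minimal usage sketch for cache setup and teardown (hypothetical client
 * code; my_context and my_free_func are not defined in this file):
 *
 *	struct xfs_mru_cache *cache;
 *	int error;
 *
 *	// Ten second lifetime, five groups, as in the example at the top.
 *	error = xfs_mru_cache_create(&cache, my_context, 10000, 5,
 *			my_free_func);
 *	if (error)
 *		return error;
 *
 *	// ... insert, look up and remove elements ...
 *
 *	xfs_mru_cache_destroy(cache);	// flushes and frees everything
 *
 * my_free_func must match xfs_mru_cache_free_func_t and free a single
 * element; it is called for every element left in the cache at destroy time.
 */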
/*
 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
 * free functions as they're deleted.  When this function returns, the caller
 * is guaranteed that all the free functions for all the elements have
 * finished executing and the reaper is not running.
 */
static void
xfs_mru_cache_flush(
	struct xfs_mru_cache	*mru)
{
	if (!mru || !mru->lists)
		return;

	spin_lock(&mru->lock);
	if (mru->queued) {
		spin_unlock(&mru->lock);
		cancel_delayed_work_sync(&mru->work);
		spin_lock(&mru->lock);
	}

	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
	_xfs_mru_cache_clear_reap_list(mru);

	spin_unlock(&mru->lock);
}

void
xfs_mru_cache_destroy(
	struct xfs_mru_cache	*mru)
{
	if (!mru || !mru->lists)
		return;

	xfs_mru_cache_flush(mru);

	kmem_free(mru->lists);
	kmem_free(mru);
}

/*
 * To insert an element, call xfs_mru_cache_insert() with the data store, the
 * element's key and the element itself.  This function returns 0 on success
 * or -ENOMEM if memory for the data element couldn't be allocated.
 */
int
xfs_mru_cache_insert(
	struct xfs_mru_cache	*mru,
	unsigned long		key,
	struct xfs_mru_cache_elem *elem)
{
	int			error;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return -EINVAL;

	if (radix_tree_preload(GFP_NOFS))
		return -ENOMEM;

	INIT_LIST_HEAD(&elem->list_node);
	elem->key = key;

	spin_lock(&mru->lock);
	error = radix_tree_insert(&mru->store, key, elem);
	radix_tree_preload_end();
	if (!error)
		_xfs_mru_cache_list_insert(mru, elem);
	spin_unlock(&mru->lock);

	return error;
}

/*
 * To remove an element without calling the free function, call
 * xfs_mru_cache_remove() with the data store and the element's key.  On
 * success the removed element is returned, otherwise this function returns a
 * NULL pointer.
 */
struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	struct xfs_mru_cache_elem *elem;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return NULL;

	spin_lock(&mru->lock);
	elem = radix_tree_delete(&mru->store, key);
	if (elem)
		list_del(&elem->list_node);
	spin_unlock(&mru->lock);

	return elem;
}

/*
 * To remove an element and call the free function, call
 * xfs_mru_cache_delete() with the data store and the element's key.
 */
void
xfs_mru_cache_delete(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	struct xfs_mru_cache_elem *elem;

	elem = xfs_mru_cache_remove(mru, key);
	if (elem)
		mru->free_func(mru->data, elem);
}
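/*
 * Sketch of a typical insert (hypothetical client structure; note the
 * warning earlier in this file that a given element must only be inserted
 * once at a time):
 *
 *	struct my_item {
 *		struct xfs_mru_cache_elem	mru;
 *		// ... client data ...
 *	};
 *
 *	struct my_item *item = kmem_zalloc(sizeof(*item), 0);
 *
 *	error = xfs_mru_cache_insert(cache, key, &item->mru);
 *	if (error)
 *		kmem_free(item);
 *
 * On success the cache owns the element until it is removed, deleted or
 * reaped; the registered free function then uses container_of() to recover
 * and free the containing struct my_item.
 */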
/*
 * To look up an element using its key, call xfs_mru_cache_lookup() with the
 * data store and the element's key.  If found, the element will be moved to
 * the head of the MRU list to indicate that it's been touched.
 *
 * The internal data structures are protected by a spinlock that is STILL HELD
 * when this function returns.  Call xfs_mru_cache_done() to release it.  Note
 * that it is not safe to call any function that might sleep in the interim.
 *
 * The implementation could have used reference counting to avoid this
 * restriction, but since most clients simply want to get, set or test a
 * member of the returned data structure, the extra per-element memory isn't
 * warranted.
 *
 * If the element isn't found, this function returns NULL and the spinlock is
 * released.  xfs_mru_cache_done() should NOT be called when this occurs.
 *
 * Because sparse isn't smart enough to know about conditional lock return
 * status, we need to help it get it right by annotating the path that does
 * not release the lock.
 */
struct xfs_mru_cache_elem *
xfs_mru_cache_lookup(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	struct xfs_mru_cache_elem *elem;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return NULL;

	spin_lock(&mru->lock);
	elem = radix_tree_lookup(&mru->store, key);
	if (elem) {
		list_del(&elem->list_node);
		_xfs_mru_cache_list_insert(mru, elem);
		__release(mru_lock); /* help sparse not be stupid */
	} else
		spin_unlock(&mru->lock);

	return elem;
}

/*
 * To release the internal data structure spinlock after having performed an
 * xfs_mru_cache_lookup(), call xfs_mru_cache_done() with the data store
 * pointer.
 */
void
xfs_mru_cache_done(
	struct xfs_mru_cache	*mru)
		__releases(mru->lock)
{
	spin_unlock(&mru->lock);
}
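/*
 * Sketch of the lookup/done pairing (hypothetical client code, continuing
 * the struct my_item example above):
 *
 *	struct xfs_mru_cache_elem *elem;
 *
 *	elem = xfs_mru_cache_lookup(cache, key);
 *	if (elem) {
 *		struct my_item *item =
 *			container_of(elem, struct my_item, mru);
 *		// ... touch item; the lock is held, so do not sleep ...
 *		xfs_mru_cache_done(cache);
 *	}
 *
 * xfs_mru_cache_done() must be called exactly once after a hit and never
 * after a miss, since the spinlock is only held on the hit path.
 */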