Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * This file is part of UBIFS.
0004  *
0005  * Copyright (C) 2006-2008 Nokia Corporation.
0006  *
0007  * Authors: Adrian Hunter
0008  *          Artem Bityutskiy (Битюцкий Артём)
0009  */
0010 
0011 /*
0012  * This file implements garbage collection. The procedure for garbage collection
0013  * is different depending on whether a LEB is an index LEB (contains index
0014  * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
0015  * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
0016  * nodes to the journal, at which point the garbage-collected LEB is free to be
0017  * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
0018  * dirty in the TNC, and after the next commit, the garbage-collected LEB is
0019  * free to be reused. Garbage collection will cause the number of dirty index nodes
0020  * to grow, however sufficient space is reserved for the index to ensure the
0021  * commit will never run out of space.
0022  *
0023  * Notes about dead watermark. At current UBIFS implementation we assume that
0024  * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
0025  * and not worth garbage-collecting. The dead watermark is one min. I/O unit
0026  * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
0027  * Garbage Collector has to synchronize the GC head's write buffer before
0028  * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
0029  * actually reclaim even very small pieces of dirty space by garbage collecting
0030  * enough dirty LEBs, but we do not bother doing this at this implementation.
0031  *
0032  * Notes about dark watermark. The results of GC work depends on how big are
0033  * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
0034  * if GC moves data from LEB A to LEB B and nodes in LEB A are large, GC would
0035  * have to waste large pieces of free space at the end of LEB B, because nodes
0036  * from LEB A would not fit. And the worst situation is when all nodes are of
0037  * maximum size. So dark watermark is the amount of free + dirty space in LEB
0038  * which are guaranteed to be reclaimable. If LEB has less space, the GC might
0039  * be unable to reclaim it. So, LEBs with free + dirty greater than dark
0040  * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
0041  * good, and GC takes extra care when moving them.
0042  */
0043 
0044 #include <linux/slab.h>
0045 #include <linux/pagemap.h>
0046 #include <linux/list_sort.h>
0047 #include "ubifs.h"
0048 
0049 /*
0050  * GC may need to move more than one LEB to make progress. The below constants
0051  * define "soft" and "hard" limits on the number of LEBs the garbage collector
0052  * may move.
0053  */
0054 #define SOFT_LEBS_LIMIT 4
0055 #define HARD_LEBS_LIMIT 32
0056 
/**
 * switch_gc_head - switch the garbage collection journal head.
 * @c: UBIFS file-system description object
 *
 * This function switches the GC head to the next LEB which is reserved in
 * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
 * and other negative error code in case of failures.
 */
static int switch_gc_head(struct ubifs_info *c)
{
	int err, gc_lnum = c->gc_lnum;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;

	/* A reserved GC LEB must exist before we can switch to it */
	ubifs_assert(c, gc_lnum != -1);
	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
	       c->leb_size - wbuf->offs - wbuf->used);

	/* Flush the current GC head's write-buffer to the media first */
	err = ubifs_wbuf_sync_nolock(wbuf);
	if (err)
		return err;

	/*
	 * The GC write-buffer was synchronized, we may safely unmap
	 * 'c->gc_lnum'.
	 */
	err = ubifs_leb_unmap(c, gc_lnum);
	if (err)
		return err;

	/* Make the reserved LEB a bud of the GC journal head */
	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
	if (err)
		return err;

	/*
	 * The reserved LEB is consumed now: clear @c->gc_lnum so that a new
	 * one must be found before the next switch, and point the GC head's
	 * write-buffer at the start of the new LEB.
	 */
	c->gc_lnum = -1;
	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0);
	return err;
}
0095 
0096 /**
0097  * data_nodes_cmp - compare 2 data nodes.
0098  * @priv: UBIFS file-system description object
0099  * @a: first data node
0100  * @b: second data node
0101  *
0102  * This function compares data nodes @a and @b. Returns %1 if @a has greater
0103  * inode or block number, and %-1 otherwise.
0104  */
0105 static int data_nodes_cmp(void *priv, const struct list_head *a,
0106               const struct list_head *b)
0107 {
0108     ino_t inuma, inumb;
0109     struct ubifs_info *c = priv;
0110     struct ubifs_scan_node *sa, *sb;
0111 
0112     cond_resched();
0113     if (a == b)
0114         return 0;
0115 
0116     sa = list_entry(a, struct ubifs_scan_node, list);
0117     sb = list_entry(b, struct ubifs_scan_node, list);
0118 
0119     ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DATA_KEY);
0120     ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DATA_KEY);
0121     ubifs_assert(c, sa->type == UBIFS_DATA_NODE);
0122     ubifs_assert(c, sb->type == UBIFS_DATA_NODE);
0123 
0124     inuma = key_inum(c, &sa->key);
0125     inumb = key_inum(c, &sb->key);
0126 
0127     if (inuma == inumb) {
0128         unsigned int blka = key_block(c, &sa->key);
0129         unsigned int blkb = key_block(c, &sb->key);
0130 
0131         if (blka <= blkb)
0132             return -1;
0133     } else if (inuma <= inumb)
0134         return -1;
0135 
0136     return 1;
0137 }
0138 
/**
 * nondata_nodes_cmp - compare 2 non-data nodes.
 * @priv: UBIFS file-system description object
 * @a: first node
 * @b: second node
 *
 * This function compares nodes @a and @b. It makes sure that inode nodes go
 * first and sorted by length in descending order. Directory entry nodes go
 * after inode nodes and are sorted in ascending hash value order.
 */
static int nondata_nodes_cmp(void *priv, const struct list_head *a,
			     const struct list_head *b)
{
	ino_t inuma, inumb;
	struct ubifs_info *c = priv;
	struct ubifs_scan_node *sa, *sb;

	cond_resched();
	if (a == b)
		return 0;

	sa = list_entry(a, struct ubifs_scan_node, list);
	sb = list_entry(b, struct ubifs_scan_node, list);

	/* Data nodes are sorted separately by 'data_nodes_cmp()' */
	ubifs_assert(c, key_type(c, &sa->key) != UBIFS_DATA_KEY &&
		     key_type(c, &sb->key) != UBIFS_DATA_KEY);
	ubifs_assert(c, sa->type != UBIFS_DATA_NODE &&
		     sb->type != UBIFS_DATA_NODE);

	/* Inodes go before directory entries */
	if (sa->type == UBIFS_INO_NODE) {
		if (sb->type == UBIFS_INO_NODE)
			/* Both are inodes - longer nodes first */
			return sb->len - sa->len;
		return -1;
	}
	if (sb->type == UBIFS_INO_NODE)
		return 1;

	/* Both must now be directory entries or extended attribute entries */
	ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DENT_KEY ||
		     key_type(c, &sa->key) == UBIFS_XENT_KEY);
	ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DENT_KEY ||
		     key_type(c, &sb->key) == UBIFS_XENT_KEY);
	ubifs_assert(c, sa->type == UBIFS_DENT_NODE ||
		     sa->type == UBIFS_XENT_NODE);
	ubifs_assert(c, sb->type == UBIFS_DENT_NODE ||
		     sb->type == UBIFS_XENT_NODE);

	inuma = key_inum(c, &sa->key);
	inumb = key_inum(c, &sb->key);

	if (inuma == inumb) {
		/* Same parent inode - order by name hash, ascending */
		uint32_t hasha = key_hash(c, &sa->key);
		uint32_t hashb = key_hash(c, &sb->key);

		if (hasha <= hashb)
			return -1;
	} else if (inuma <= inumb)
		return -1;

	return 1;
}
0200 
/**
 * sort_nodes - sort nodes for GC.
 * @c: UBIFS file-system description object
 * @sleb: describes nodes to sort and contains the result on exit
 * @nondata: contains non-data nodes on exit
 * @min: minimum node size is returned here
 *
 * This function sorts the list of inodes to garbage collect. First of all, it
 * kills obsolete nodes and separates data and non-data nodes to the
 * @sleb->nodes and @nondata lists correspondingly.
 *
 * Data nodes are then sorted in block number order - this is important for
 * bulk-read; data nodes with lower inode number go before data nodes with
 * higher inode number, and data nodes with lower block number go before data
 * nodes with higher block number;
 *
 * Non-data nodes are sorted as follows.
 *   o First go inode nodes - they are sorted in descending length order.
 *   o Then go directory entry nodes - they are sorted in hash order, which
 *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
 *     inode number go before direntry nodes with higher parent inode number,
 *     and direntry nodes with lower name hash values go before direntry nodes
 *     with higher name hash values.
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
		      struct list_head *nondata, int *min)
{
	int err;
	struct ubifs_scan_node *snod, *tmp;

	*min = INT_MAX;

	/* Separate data nodes and non-data nodes */
	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
		ubifs_assert(c, snod->type == UBIFS_INO_NODE  ||
			     snod->type == UBIFS_DATA_NODE ||
			     snod->type == UBIFS_DENT_NODE ||
			     snod->type == UBIFS_XENT_NODE ||
			     snod->type == UBIFS_TRUN_NODE ||
			     snod->type == UBIFS_AUTH_NODE);

		/* Only inode, data and (x)dent nodes are worth moving */
		if (snod->type != UBIFS_INO_NODE  &&
		    snod->type != UBIFS_DATA_NODE &&
		    snod->type != UBIFS_DENT_NODE &&
		    snod->type != UBIFS_XENT_NODE) {
			/* Probably truncation node, zap it */
			list_del(&snod->list);
			kfree(snod);
			continue;
		}

		ubifs_assert(c, key_type(c, &snod->key) == UBIFS_DATA_KEY ||
			     key_type(c, &snod->key) == UBIFS_INO_KEY  ||
			     key_type(c, &snod->key) == UBIFS_DENT_KEY ||
			     key_type(c, &snod->key) == UBIFS_XENT_KEY);

		/* Ask the TNC whether this node is still referenced */
		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
					 snod->offs, 0);
		if (err < 0)
			return err;

		if (!err) {
			/* The node is obsolete, remove it from the list */
			list_del(&snod->list);
			kfree(snod);
			continue;
		}

		/* Track the smallest surviving node size for the caller */
		if (snod->len < *min)
			*min = snod->len;

		/* Data nodes stay on @sleb->nodes, the rest go to @nondata */
		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
			list_move_tail(&snod->list, nondata);
	}

	/* Sort data and non-data nodes */
	list_sort(c, &sleb->nodes, &data_nodes_cmp);
	list_sort(c, nondata, &nondata_nodes_cmp);

	err = dbg_check_data_nodes_order(c, &sleb->nodes);
	if (err)
		return err;
	err = dbg_check_nondata_nodes_order(c, nondata);
	if (err)
		return err;
	return 0;
}
0291 
/**
 * move_node - move a node.
 * @c: UBIFS file-system description object
 * @sleb: describes the LEB to move nodes from
 * @snod: the node to move
 * @wbuf: write-buffer to move node to
 *
 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
 * destroys @snod. Returns zero in case of success and a negative error code in
 * case of failure.
 */
static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
{
	/* The node's new address is the current GC head write position */
	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;

	cond_resched();
	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
	if (err)
		return err;

	/* Re-point the TNC from the old location to the new one */
	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
				snod->offs, new_lnum, new_offs,
				snod->len);
	list_del(&snod->list);
	kfree(snod);
	return err;
}
0320 
0321 /**
0322  * move_nodes - move nodes.
0323  * @c: UBIFS file-system description object
0324  * @sleb: describes the LEB to move nodes from
0325  *
0326  * This function moves valid nodes from data LEB described by @sleb to the GC
0327  * journal head. This function returns zero in case of success, %-EAGAIN if
0328  * commit is required, and other negative error codes in case of other
0329  * failures.
0330  */
0331 static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
0332 {
0333     int err, min;
0334     LIST_HEAD(nondata);
0335     struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
0336 
0337     if (wbuf->lnum == -1) {
0338         /*
0339          * The GC journal head is not set, because it is the first GC
0340          * invocation since mount.
0341          */
0342         err = switch_gc_head(c);
0343         if (err)
0344             return err;
0345     }
0346 
0347     err = sort_nodes(c, sleb, &nondata, &min);
0348     if (err)
0349         goto out;
0350 
0351     /* Write nodes to their new location. Use the first-fit strategy */
0352     while (1) {
0353         int avail, moved = 0;
0354         struct ubifs_scan_node *snod, *tmp;
0355 
0356         /* Move data nodes */
0357         list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
0358             avail = c->leb_size - wbuf->offs - wbuf->used -
0359                     ubifs_auth_node_sz(c);
0360             if  (snod->len > avail)
0361                 /*
0362                  * Do not skip data nodes in order to optimize
0363                  * bulk-read.
0364                  */
0365                 break;
0366 
0367             err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
0368                          snod->node, snod->len);
0369             if (err)
0370                 goto out;
0371 
0372             err = move_node(c, sleb, snod, wbuf);
0373             if (err)
0374                 goto out;
0375             moved = 1;
0376         }
0377 
0378         /* Move non-data nodes */
0379         list_for_each_entry_safe(snod, tmp, &nondata, list) {
0380             avail = c->leb_size - wbuf->offs - wbuf->used -
0381                     ubifs_auth_node_sz(c);
0382             if (avail < min)
0383                 break;
0384 
0385             if  (snod->len > avail) {
0386                 /*
0387                  * Keep going only if this is an inode with
0388                  * some data. Otherwise stop and switch the GC
0389                  * head. IOW, we assume that data-less inode
0390                  * nodes and direntry nodes are roughly of the
0391                  * same size.
0392                  */
0393                 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
0394                     snod->len == UBIFS_INO_NODE_SZ)
0395                     break;
0396                 continue;
0397             }
0398 
0399             err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
0400                          snod->node, snod->len);
0401             if (err)
0402                 goto out;
0403 
0404             err = move_node(c, sleb, snod, wbuf);
0405             if (err)
0406                 goto out;
0407             moved = 1;
0408         }
0409 
0410         if (ubifs_authenticated(c) && moved) {
0411             struct ubifs_auth_node *auth;
0412 
0413             auth = kmalloc(ubifs_auth_node_sz(c), GFP_NOFS);
0414             if (!auth) {
0415                 err = -ENOMEM;
0416                 goto out;
0417             }
0418 
0419             err = ubifs_prepare_auth_node(c, auth,
0420                         c->jheads[GCHD].log_hash);
0421             if (err) {
0422                 kfree(auth);
0423                 goto out;
0424             }
0425 
0426             err = ubifs_wbuf_write_nolock(wbuf, auth,
0427                               ubifs_auth_node_sz(c));
0428             if (err) {
0429                 kfree(auth);
0430                 goto out;
0431             }
0432 
0433             ubifs_add_dirt(c, wbuf->lnum, ubifs_auth_node_sz(c));
0434         }
0435 
0436         if (list_empty(&sleb->nodes) && list_empty(&nondata))
0437             break;
0438 
0439         /*
0440          * Waste the rest of the space in the LEB and switch to the
0441          * next LEB.
0442          */
0443         err = switch_gc_head(c);
0444         if (err)
0445             goto out;
0446     }
0447 
0448     return 0;
0449 
0450 out:
0451     list_splice_tail(&nondata, &sleb->nodes);
0452     return err;
0453 }
0454 
0455 /**
0456  * gc_sync_wbufs - sync write-buffers for GC.
0457  * @c: UBIFS file-system description object
0458  *
0459  * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
0460  * be in a write-buffer instead. That is, a node could be written to a
0461  * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
0462  * erased before the write-buffer is sync'd and then there is an unclean
0463  * unmount, then an existing node is lost. To avoid this, we sync all
0464  * write-buffers.
0465  *
0466  * This function returns %0 on success or a negative error code on failure.
0467  */
0468 static int gc_sync_wbufs(struct ubifs_info *c)
0469 {
0470     int err, i;
0471 
0472     for (i = 0; i < c->jhead_cnt; i++) {
0473         if (i == GCHD)
0474             continue;
0475         err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
0476         if (err)
0477             return err;
0478     }
0479     return 0;
0480 }
0481 
/**
 * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
 * @c: UBIFS file-system description object
 * @lp: describes the LEB to garbage collect
 *
 * This function garbage-collects an LEB and returns one of the @LEB_FREED,
 * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
 * required, and other negative error codes in case of failures.
 */
int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
	int err = 0, lnum = lp->lnum;

	ubifs_assert(c, c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
		     c->need_recovery);
	/* The LEB being GC'ed must not be the reserved or GC head LEB */
	ubifs_assert(c, c->gc_lnum != lnum);
	ubifs_assert(c, wbuf->lnum != lnum);

	if (lp->free + lp->dirty == c->leb_size) {
		/* Special case - a free LEB  */
		dbg_gc("LEB %d is free, return it", lp->lnum);
		ubifs_assert(c, !(lp->flags & LPROPS_INDEX));

		if (lp->free != c->leb_size) {
			/*
			 * Write buffers must be sync'd before unmapping
			 * freeable LEBs, because one of them may contain data
			 * which obsoletes something in 'lp->lnum'.
			 */
			err = gc_sync_wbufs(c);
			if (err)
				return err;
			/* Turn the freeable LEB into a fully free one */
			err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
						  0, 0, 0, 0);
			if (err)
				return err;
		}
		err = ubifs_leb_unmap(c, lp->lnum);
		if (err)
			return err;

		/* Keep this LEB as the reserved GC LEB if we have none */
		if (c->gc_lnum == -1) {
			c->gc_lnum = lnum;
			return LEB_RETAINED;
		}

		return LEB_FREED;
	}

	/*
	 * We scan the entire LEB even though we only really need to scan up to
	 * (c->leb_size - lp->free).
	 */
	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);

	ubifs_assert(c, !list_empty(&sleb->nodes));
	/* The first node's type tells whether this is an index LEB */
	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);

	if (snod->type == UBIFS_IDX_NODE) {
		struct ubifs_gced_idx_leb *idx_gc;

		dbg_gc("indexing LEB %d (free %d, dirty %d)",
		       lnum, lp->free, lp->dirty);
		/* Mark every live index node dirty in the TNC */
		list_for_each_entry(snod, &sleb->nodes, list) {
			struct ubifs_idx_node *idx = snod->node;
			int level = le16_to_cpu(idx->level);

			ubifs_assert(c, snod->type == UBIFS_IDX_NODE);
			key_read(c, ubifs_idx_key(c, idx), &snod->key);
			err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
						   snod->offs);
			if (err)
				goto out;
		}

		/* Remember this LEB so it can be unmapped after the commit */
		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
		if (!idx_gc) {
			err = -ENOMEM;
			goto out;
		}

		idx_gc->lnum = lnum;
		idx_gc->unmap = 0;
		list_add(&idx_gc->list, &c->idx_gc);

		/*
		 * Don't release the LEB until after the next commit, because
		 * it may contain data which is needed for recovery. So
		 * although we freed this LEB, it will become usable only after
		 * the commit.
		 */
		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
					  LPROPS_INDEX, 1);
		if (err)
			goto out;
		err = LEB_FREED_IDX;
	} else {
		dbg_gc("data LEB %d (free %d, dirty %d)",
		       lnum, lp->free, lp->dirty);

		err = move_nodes(c, sleb);
		if (err)
			goto out_inc_seq;

		/* Obsoleting nodes must be on flash before the LEB is freed */
		err = gc_sync_wbufs(c);
		if (err)
			goto out_inc_seq;

		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
		if (err)
			goto out_inc_seq;

		/* Allow for races with TNC */
		c->gced_lnum = lnum;
		smp_wmb();
		c->gc_seq += 1;
		smp_wmb();

		if (c->gc_lnum == -1) {
			/* Retain the LEB as the reserved GC LEB */
			c->gc_lnum = lnum;
			err = LEB_RETAINED;
		} else {
			err = ubifs_wbuf_sync_nolock(wbuf);
			if (err)
				goto out;

			err = ubifs_leb_unmap(c, lnum);
			if (err)
				goto out;

			err = LEB_FREED;
		}
	}

out:
	ubifs_scan_destroy(sleb);
	return err;

out_inc_seq:
	/* We may have moved at least some nodes so allow for races with TNC */
	c->gced_lnum = lnum;
	smp_wmb();
	c->gc_seq += 1;
	smp_wmb();
	goto out;
}
0633 
/**
 * ubifs_garbage_collect - UBIFS garbage collector.
 * @c: UBIFS file-system description object
 * @anyway: do GC even if there are free LEBs
 *
 * This function does out-of-place garbage collection. The return codes are:
 *   o positive LEB number if the LEB has been freed and may be used;
 *   o %-EAGAIN if the caller has to run commit;
 *   o %-ENOSPC if GC failed to make any progress;
 *   o other negative error codes in case of other errors.
 *
 * Garbage collector writes data to the journal when GC'ing data LEBs, and just
 * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
 * commit may be required. But commit cannot be run from inside GC, because the
 * caller might be holding the commit lock, so %-EAGAIN is returned instead;
 * And this error code means that the caller has to run commit, and re-run GC
 * if there is still no free space.
 *
 * There are many reasons why this function may return %-EAGAIN:
 * o the log is full and there is no space to write an LEB reference for
 *   @c->gc_lnum;
 * o the journal is too large and exceeds size limitations;
 * o GC moved indexing LEBs, but they can be used only after the commit;
 * o the shrinker fails to find clean znodes to free and requests the commit;
 * o etc.
 *
 * Note, if the file-system is close to being full, this function may return
 * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
 * the function. E.g., this happens if the limits on the journal size are too
 * tough and GC writes too much to the journal before an LEB is freed. This
 * might also mean that the journal is too large, and the TNC becomes too big,
 * so that the shrinker is constantly called, finds no clean znodes to free,
 * and requests commit. Well, this may also happen if the journal is all right,
 * but another kernel process consumes too much memory. Anyway, infinite
 * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
 */
int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
{
	int i, err, ret, min_space = c->dead_wm;
	struct ubifs_lprops lp;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;

	ubifs_assert_cmt_locked(c);
	ubifs_assert(c, !c->ro_media && !c->ro_mount);

	if (ubifs_gc_should_commit(c))
		return -EAGAIN;

	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);

	if (c->ro_error) {
		ret = -EROFS;
		goto out_unlock;
	}

	/* We expect the write-buffer to be empty on entry */
	ubifs_assert(c, !wbuf->used);

	for (i = 0; ; i++) {
		int space_before, space_after;

		/* Maybe continue after find and break before find */
		lp.lnum = -1;

		cond_resched();

		/* Give the commit an opportunity to run */
		if (ubifs_gc_should_commit(c)) {
			ret = -EAGAIN;
			break;
		}

		if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
			/*
			 * We've done enough iterations. Indexing LEBs were
			 * moved and will be available after the commit.
			 */
			dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
			ubifs_commit_required(c);
			ret = -EAGAIN;
			break;
		}

		if (i > HARD_LEBS_LIMIT) {
			/*
			 * We've moved too many LEBs and have not made
			 * progress, give up.
			 */
			dbg_gc("hard limit, -ENOSPC");
			ret = -ENOSPC;
			break;
		}

		/*
		 * Empty and freeable LEBs can turn up while we waited for
		 * the wbuf lock, or while we have been running GC. In that
		 * case, we should just return one of those instead of
		 * continuing to GC dirty LEBs. Hence we request
		 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
		 */
		ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
		if (ret) {
			if (ret == -ENOSPC)
				dbg_gc("no more dirty LEBs");
			break;
		}

		dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
		       lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
		       min_space);

		/* Record free space in the GC head to detect progress later */
		space_before = c->leb_size - wbuf->offs - wbuf->used;
		if (wbuf->lnum == -1)
			space_before = 0;

		ret = ubifs_garbage_collect_leb(c, &lp);
		if (ret < 0) {
			if (ret == -EAGAIN) {
				/*
				 * This is not error, so we have to return the
				 * LEB to lprops. But if 'ubifs_return_leb()'
				 * fails, its failure code is propagated to the
				 * caller instead of the original '-EAGAIN'.
				 */
				err = ubifs_return_leb(c, lp.lnum);
				if (err) {
					ret = err;
					/*
					 * An LEB may always be "taken",
					 * so setting ubifs to read-only,
					 * and then executing sync wbuf will
					 * return -EROFS and enter the "out"
					 * error branch.
					 */
					ubifs_ro_mode(c, ret);
				}
				/*  Maybe double return LEB if goto out */
				lp.lnum = -1;
				break;
			}
			goto out;
		}

		if (ret == LEB_FREED) {
			/* An LEB has been freed and is ready for use */
			dbg_gc("LEB %d freed, return", lp.lnum);
			ret = lp.lnum;
			break;
		}

		if (ret == LEB_FREED_IDX) {
			/*
			 * This was an indexing LEB and it cannot be
			 * immediately used. And instead of requesting the
			 * commit straight away, we try to garbage collect some
			 * more.
			 */
			dbg_gc("indexing LEB %d freed, continue", lp.lnum);
			continue;
		}

		ubifs_assert(c, ret == LEB_RETAINED);
		space_after = c->leb_size - wbuf->offs - wbuf->used;
		dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
		       space_after - space_before);

		if (space_after > space_before) {
			/* GC makes progress, keep working */
			min_space >>= 1;
			if (min_space < c->dead_wm)
				min_space = c->dead_wm;
			continue;
		}

		dbg_gc("did not make progress");

		/*
		 * GC moved an LEB but has not made any progress. This means
		 * that the previous GC head LEB contained too little free
		 * space and the LEB which was GC'ed contained only large
		 * nodes which did not fit that space.
		 *
		 * We can do 2 things:
		 * 1. pick another LEB in a hope it'll contain a small node
		 *    which will fit the space we have at the end of current GC
		 *    head LEB, but there is no guarantee, so we try this out
		 *    unless we have already been working for too long;
		 * 2. request an LEB with more dirty space, which will force
		 *    'ubifs_find_dirty_leb()' to start scanning the lprops
		 *    table, instead of just picking one from the heap
		 *    (previously it already picked the dirtiest LEB).
		 */
		if (i < SOFT_LEBS_LIMIT) {
			dbg_gc("try again");
			continue;
		}

		min_space <<= 1;
		if (min_space > c->dark_wm)
			min_space = c->dark_wm;
		dbg_gc("set min. space to %d", min_space);
	}

	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
		dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
		ubifs_commit_required(c);
		ret = -EAGAIN;
	}

	/* Leave with an empty write-buffer and an unmapped reserved GC LEB */
	err = ubifs_wbuf_sync_nolock(wbuf);
	if (!err)
		err = ubifs_leb_unmap(c, c->gc_lnum);
	if (err) {
		ret = err;
		goto out;
	}
out_unlock:
	mutex_unlock(&wbuf->io_mutex);
	return ret;

out:
	ubifs_assert(c, ret < 0);
	ubifs_assert(c, ret != -ENOSPC && ret != -EAGAIN);
	ubifs_wbuf_sync_nolock(wbuf);
	ubifs_ro_mode(c, ret);
	mutex_unlock(&wbuf->io_mutex);
	if (lp.lnum != -1)
		ubifs_return_leb(c, lp.lnum);
	return ret;
}
0864 
0865 /**
0866  * ubifs_gc_start_commit - garbage collection at start of commit.
0867  * @c: UBIFS file-system description object
0868  *
0869  * If a LEB has only dirty and free space, then we may safely unmap it and make
0870  * it free.  Note, we cannot do this with indexing LEBs because dirty space may
 * correspond to index nodes that are required for recovery.  In that case, the
0872  * LEB cannot be unmapped until after the next commit.
0873  *
0874  * This function returns %0 upon success and a negative error code upon failure.
0875  */
int ubifs_gc_start_commit(struct ubifs_info *c)
{
    struct ubifs_gced_idx_leb *idx_gc;
    const struct ubifs_lprops *lp;
    int err = 0, flags;

    /* Take the lprops lock for the duration of both scans below */
    ubifs_get_lprops(c);

    /*
     * Unmap (non-index) freeable LEBs. Note that recovery requires that all
     * wbufs are sync'd before this, which is done in 'do_commit()'.
     */
    while (1) {
        lp = ubifs_fast_find_freeable(c);
        if (!lp)
            break;
        ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
        ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
        err = ubifs_leb_unmap(c, lp->lnum);
        if (err)
            goto out;
        /* The LEB is now empty - account it as entirely free space */
        lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
        if (IS_ERR(lp)) {
            err = PTR_ERR(lp);
            goto out;
        }
        ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
        ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
    }

    /* Mark GC'd index LEBs OK to unmap after this commit finishes */
    list_for_each_entry(idx_gc, &c->idx_gc, list)
        idx_gc->unmap = 1;

    /* Record index freeable LEBs for unmapping after commit */
    while (1) {
        lp = ubifs_fast_find_frdi_idx(c);
        if (IS_ERR(lp)) {
            err = PTR_ERR(lp);
            goto out;
        }
        if (!lp)
            break;
        /* GFP_NOFS: do not recurse into the filesystem while committing */
        idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
        if (!idx_gc) {
            err = -ENOMEM;
            goto out;
        }
        ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
        ubifs_assert(c, lp->flags & LPROPS_INDEX);
        /* Don't release the LEB until after the next commit */
        /*
         * Set TAKEN and clear INDEX; the assert above guarantees
         * LPROPS_INDEX is set, so the XOR clears it.
         */
        flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
        lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
        if (IS_ERR(lp)) {
            err = PTR_ERR(lp);
            kfree(idx_gc);
            goto out;
        }
        ubifs_assert(c, lp->flags & LPROPS_TAKEN);
        ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
        idx_gc->lnum = lp->lnum;
        idx_gc->unmap = 1;
        list_add(&idx_gc->list, &c->idx_gc);
    }
out:
    ubifs_release_lprops(c);
    return err;
}
0944 
0945 /**
0946  * ubifs_gc_end_commit - garbage collection at end of commit.
0947  * @c: UBIFS file-system description object
0948  *
0949  * This function completes out-of-place garbage collection of index LEBs.
0950  */
0951 int ubifs_gc_end_commit(struct ubifs_info *c)
0952 {
0953     struct ubifs_gced_idx_leb *idx_gc, *tmp;
0954     struct ubifs_wbuf *wbuf;
0955     int err = 0;
0956 
0957     wbuf = &c->jheads[GCHD].wbuf;
0958     mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
0959     list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
0960         if (idx_gc->unmap) {
0961             dbg_gc("LEB %d", idx_gc->lnum);
0962             err = ubifs_leb_unmap(c, idx_gc->lnum);
0963             if (err)
0964                 goto out;
0965             err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
0966                       LPROPS_NC, 0, LPROPS_TAKEN, -1);
0967             if (err)
0968                 goto out;
0969             list_del(&idx_gc->list);
0970             kfree(idx_gc);
0971         }
0972 out:
0973     mutex_unlock(&wbuf->io_mutex);
0974     return err;
0975 }
0976 
0977 /**
0978  * ubifs_destroy_idx_gc - destroy idx_gc list.
0979  * @c: UBIFS file-system description object
0980  *
0981  * This function destroys the @c->idx_gc list. It is called when unmounting
 * so locks are not needed.
0984  */
0985 void ubifs_destroy_idx_gc(struct ubifs_info *c)
0986 {
0987     while (!list_empty(&c->idx_gc)) {
0988         struct ubifs_gced_idx_leb *idx_gc;
0989 
0990         idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
0991                     list);
0992         c->idx_gc_cnt -= 1;
0993         list_del(&idx_gc->list);
0994         kfree(idx_gc);
0995     }
0996 }
0997 
0998 /**
0999  * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
1000  * @c: UBIFS file-system description object
1001  *
 * Called during start commit so locks are not needed. Returns the LEB number
 * of the first entry on the @c->idx_gc list (removing and freeing the entry),
 * or %-ENOSPC if the list is empty.
1003  */
1004 int ubifs_get_idx_gc_leb(struct ubifs_info *c)
1005 {
1006     struct ubifs_gced_idx_leb *idx_gc;
1007     int lnum;
1008 
1009     if (list_empty(&c->idx_gc))
1010         return -ENOSPC;
1011     idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
1012     lnum = idx_gc->lnum;
1013     /* c->idx_gc_cnt is updated by the caller when lprops are updated */
1014     list_del(&idx_gc->list);
1015     kfree(idx_gc);
1016     return lnum;
1017 }