// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK     - records directory entry unlink
 * - EXT4_FC_TAG_LINK       - records directory entry link
 * - EXT4_FC_TAG_CREAT      - records inode and directory entry creation
 *
 * (B) File-specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE  - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE  - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE      - records the inode that should be replayed
 *                during recovery. Note that the iblocks field is
 *                not replayed and is instead derived during
 *                replay.
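 *
 * For the reader's reference, the on-disk TLV header is just a little-endian
 * tag/length pair (a sketch of struct ext4_fc_tl from fast_commit.h; see the
 * header file for the authoritative definition):
 *
 *     struct ext4_fc_tl {
 *         __le16 fc_tag;   // one of the EXT4_FC_TAG_* values above
 *         __le16 fc_len;   // number of value bytes that follow
 *     };
 *
 * The value bytes follow the header directly, so a fast commit log is walked
 * by repeatedly reading one header and then skipping fc_len bytes.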
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity; please read the
 *     following section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * Every inode update must call ext4_fc_start_update() before starting the
 * update, and mark its completion by calling ext4_fc_stop_update(). If a
 * fast commit finds an update in progress on an inode, it waits for that
 * update to complete, as shown in the sketch below.
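 *
 * A minimal sketch of the expected call pattern (illustrative only; the
 * real call sites are in ext4's VFS entry points elsewhere in the tree):
 *
 *     ext4_fc_start_update(inode);
 *     ... modify the inode: write data, update timestamps, etc. ...
 *     ext4_fc_stop_update(inode);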
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the EXT4_FC_TAG_TAIL tag, which marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least one valid tail present. Every fast
 * commit operation ends with one tail, so we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
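 *
 * A rough sketch of such a scan (pseudocode only, not the actual replay
 * implementation):
 *
 *     for each block in the fast commit area:
 *         for each TLV in the block:
 *             if tag == EXT4_FC_TAG_TAIL and its CRC is valid:
 *                 everything up to this point is replayable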
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during
 * replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then try to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode-level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
    BUFFER_TRACE(bh, "");
    if (uptodate) {
        ext4_debug("%s: Block %lld up-to-date",
               __func__, bh->b_blocknr);
        set_buffer_uptodate(bh);
    } else {
        ext4_debug("%s: Block %lld not up-to-date",
               __func__, bh->b_blocknr);
        clear_buffer_uptodate(bh);
    }

    unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
    struct ext4_inode_info *ei = EXT4_I(inode);

    ei->i_fc_lblk_start = 0;
    ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
    struct ext4_inode_info *ei = EXT4_I(inode);

    ext4_fc_reset_inode(inode);
    ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
    INIT_LIST_HEAD(&ei->i_fc_list);
    INIT_LIST_HEAD(&ei->i_fc_dilist);
    init_waitqueue_head(&ei->i_fc_wait);
    atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
    wait_queue_head_t *wq;
    struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
    DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
            EXT4_STATE_FC_COMMITTING);
    wq = bit_waitqueue(&ei->i_state_flags,
                EXT4_STATE_FC_COMMITTING);
#else
    DEFINE_WAIT_BIT(wait, &ei->i_flags,
            EXT4_STATE_FC_COMMITTING);
    wq = bit_waitqueue(&ei->i_flags,
                EXT4_STATE_FC_COMMITTING);
#endif
    lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
    prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
    spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
    schedule();
    finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit sub-system about the start of an inode update.
 *
 * This function is called by the high-level VFS callbacks before performing
 * any inode update. This function blocks if there's an ongoing fast commit
 * on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
    struct ext4_inode_info *ei = EXT4_I(inode);

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
        return;

restart:
    spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
    if (list_empty(&ei->i_fc_list))
        goto out;

    if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
        ext4_fc_wait_committing_inode(inode);
        goto restart;
    }
out:
    atomic_inc(&ei->i_fc_updates);
    spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
    struct ext4_inode_info *ei = EXT4_I(inode);

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (atomic_dec_and_test(&ei->i_fc_updates))
        wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove the inode from the fast commit list. If the inode is being
 * committed, we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
    struct ext4_inode_info *ei = EXT4_I(inode);
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    struct ext4_fc_dentry_update *fc_dentry;

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
        return;

restart:
    spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
    if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        return;
    }

    if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
        ext4_fc_wait_committing_inode(inode);
        goto restart;
    }

    if (!list_empty(&ei->i_fc_list))
        list_del_init(&ei->i_fc_list);

    /*
     * Since this inode is getting removed, let's also remove all FC
     * dentry create references, since there is no need to log them anyway.
     */
    if (list_empty(&ei->i_fc_dilist)) {
        spin_unlock(&sbi->s_fc_lock);
        return;
    }

    fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
    WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
    list_del_init(&fc_dentry->fcd_list);
    list_del_init(&fc_dentry->fcd_dilist);

    WARN_ON(!list_empty(&ei->i_fc_dilist));
    spin_unlock(&sbi->s_fc_lock);

    if (fc_dentry->fcd_name.name &&
        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
        kfree(fc_dentry->fcd_name.name);
    kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

    return;
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that, up to the recorded
 * transaction, a commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    tid_t tid;

    if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
        (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
        return;

    ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
    if (handle && !IS_ERR(handle))
        tid = handle->h_transaction->t_tid;
    else {
        read_lock(&sbi->s_journal->j_state_lock);
        tid = sbi->s_journal->j_running_transaction ?
                sbi->s_journal->j_running_transaction->t_tid : 0;
        read_unlock(&sbi->s_journal->j_state_lock);
    }
    spin_lock(&sbi->s_fc_lock);
    if (sbi->s_fc_ineligible_tid < tid)
        sbi->s_fc_ineligible_tid = tid;
    spin_unlock(&sbi->s_fc_lock);
    WARN_ON(reason >= EXT4_FC_REASON_MAX);
    sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
    handle_t *handle, struct inode *inode,
    int (*__fc_track_fn)(struct inode *, void *, bool),
    void *args, int enqueue)
{
    bool update = false;
    struct ext4_inode_info *ei = EXT4_I(inode);
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    tid_t tid = 0;
    int ret;

    tid = handle->h_transaction->t_tid;
    mutex_lock(&ei->i_fc_lock);
    if (tid == ei->i_sync_tid) {
        update = true;
    } else {
        ext4_fc_reset_inode(inode);
        ei->i_sync_tid = tid;
    }
    ret = __fc_track_fn(inode, args, update);
    mutex_unlock(&ei->i_fc_lock);

    if (!enqueue)
        return ret;

    spin_lock(&sbi->s_fc_lock);
    if (list_empty(&EXT4_I(inode)->i_fc_list))
        list_add_tail(&EXT4_I(inode)->i_fc_list,
                (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
                &sbi->s_fc_q[FC_Q_STAGING] :
                &sbi->s_fc_q[FC_Q_MAIN]);
    spin_unlock(&sbi->s_fc_lock);

    return ret;
}

struct __track_dentry_update_args {
    struct dentry *dentry;
    int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
    struct ext4_fc_dentry_update *node;
    struct ext4_inode_info *ei = EXT4_I(inode);
    struct __track_dentry_update_args *dentry_update =
        (struct __track_dentry_update_args *)arg;
    struct dentry *dentry = dentry_update->dentry;
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

    mutex_unlock(&ei->i_fc_lock);
    node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
    if (!node) {
        ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
        mutex_lock(&ei->i_fc_lock);
        return -ENOMEM;
    }

    node->fcd_op = dentry_update->op;
    node->fcd_parent = dentry->d_parent->d_inode->i_ino;
    node->fcd_ino = inode->i_ino;
    if (dentry->d_name.len > DNAME_INLINE_LEN) {
        node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
        if (!node->fcd_name.name) {
            kmem_cache_free(ext4_fc_dentry_cachep, node);
            ext4_fc_mark_ineligible(inode->i_sb,
                EXT4_FC_REASON_NOMEM, NULL);
            mutex_lock(&ei->i_fc_lock);
            return -ENOMEM;
        }
        memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
            dentry->d_name.len);
    } else {
        memcpy(node->fcd_iname, dentry->d_name.name,
            dentry->d_name.len);
        node->fcd_name.name = node->fcd_iname;
    }
    node->fcd_name.len = dentry->d_name.len;
    INIT_LIST_HEAD(&node->fcd_dilist);
    spin_lock(&sbi->s_fc_lock);
    if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
        sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
        list_add_tail(&node->fcd_list,
                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
    else
        list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

    /*
     * This helps us keep track of all the fc_dentry updates that are part
     * of this ext4 inode. So in case the inode is getting unlinked before
     * we even get a chance to fsync, we can remove all fc_dentry
     * references while evicting the inode in ext4_fc_del().
     * This also means we don't need to loop over all the inodes in
     * sbi->s_fc_q to get the corresponding inode in
     * ext4_fc_commit_dentry_updates().
     */
    if (dentry_update->op == EXT4_FC_TAG_CREAT) {
        WARN_ON(!list_empty(&ei->i_fc_dilist));
        list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
    }
    spin_unlock(&sbi->s_fc_lock);
    mutex_lock(&ei->i_fc_lock);

    return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
    struct __track_dentry_update_args args;
    int ret;

    args.dentry = dentry;
    args.op = EXT4_FC_TAG_UNLINK;

    ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                    (void *)&args, 0);
    trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
    struct inode *inode = d_inode(dentry);
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (sbi->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
        return;

    __ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
    struct inode *inode, struct dentry *dentry)
{
    struct __track_dentry_update_args args;
    int ret;

    args.dentry = dentry;
    args.op = EXT4_FC_TAG_LINK;

    ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                    (void *)&args, 0);
    trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
    struct inode *inode = d_inode(dentry);
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (sbi->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
        return;

    __ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
              struct dentry *dentry)
{
    struct __track_dentry_update_args args;
    int ret;

    args.dentry = dentry;
    args.op = EXT4_FC_TAG_CREAT;

    ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                    (void *)&args, 0);
    trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
    struct inode *inode = d_inode(dentry);
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (sbi->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
        return;

    __ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
    if (update)
        return -EEXIST;

    EXT4_I(inode)->i_fc_lblk_len = 0;

    return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    int ret;

    if (S_ISDIR(inode->i_mode))
        return;

    if (ext4_should_journal_data(inode)) {
        ext4_fc_mark_ineligible(inode->i_sb,
                    EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
        return;
    }

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (sbi->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
        return;

    ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
    trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
    ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
    struct ext4_inode_info *ei = EXT4_I(inode);
    ext4_lblk_t oldstart;
    struct __track_range_args *__arg =
        (struct __track_range_args *)arg;

    if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
        ext4_debug("Special inode %ld being modified\n", inode->i_ino);
        return -ECANCELED;
    }

    oldstart = ei->i_fc_lblk_start;

    if (update && ei->i_fc_lblk_len > 0) {
        ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
        ei->i_fc_lblk_len =
            max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
                ei->i_fc_lblk_start + 1;
    } else {
        ei->i_fc_lblk_start = __arg->start;
        ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
    }

    return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
             ext4_lblk_t end)
{
    struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    struct __track_range_args args;
    int ret;

    if (S_ISDIR(inode->i_mode))
        return;

    if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
        (sbi->s_mount_state & EXT4_FC_REPLAY))
        return;

    if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
        return;

    args.start = start;
    args.end = end;

    ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

    trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
    blk_opf_t write_flags = REQ_SYNC;
    struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

    /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
    if (test_opt(sb, BARRIER) && is_tail)
        write_flags |= REQ_FUA | REQ_PREFLUSH;
    lock_buffer(bh);
    set_buffer_dirty(bh);
    set_buffer_uptodate(bh);
    bh->b_end_io = ext4_end_buffer_io_sync;
    submit_bh(REQ_OP_WRITE | write_flags, bh);
    EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
                u32 *crc)
{
    void *ret;

    ret = memset(dst, 0, len);
    if (crc)
        *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
    return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * At commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
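/*
 * Worked example (illustrative numbers only, assuming a 4096-byte journal
 * block and the 4-byte struct ext4_fc_tl): with off = 4000, a request for
 * len = 200 fails the "fits in the current block" check because
 * 4096 - 4000 - 1 = 95 is not greater than 200 + 4. A PAD TLV is therefore
 * written at offset 4000 with fc_len = 4096 - 4000 - 1 - 4 = 91, the block
 * is submitted, and the 200 bytes are served from the start of a new block.
 */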
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
    struct ext4_fc_tl *tl;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct buffer_head *bh;
    int bsize = sbi->s_journal->j_blocksize;
    int ret, off = sbi->s_fc_bytes % bsize;
    int pad_len;

    /*
     * After allocating len bytes, we should still have space for at
     * least a zero-length padding TLV.
     */
    if (len + sizeof(struct ext4_fc_tl) > bsize)
        return NULL;

    if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
        /*
         * Only allocate from the current buffer if we have enough space
         * for this request AND we have space to add a zero byte padding.
         */
        if (!sbi->s_fc_bh) {
            ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
            if (ret)
                return NULL;
            sbi->s_fc_bh = bh;
        }
        sbi->s_fc_bytes += len;
        return sbi->s_fc_bh->b_data + off;
    }
    /* Need to add a PAD tag */
    tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
    tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
    pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
    tl->fc_len = cpu_to_le16(pad_len);
    if (crc)
        *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
    if (pad_len > 0)
        ext4_fc_memzero(sb, tl + 1, pad_len, crc);
    ext4_fc_submit_bh(sb, false);

    ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
    if (ret)
        return NULL;
    sbi->s_fc_bh = bh;
    sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
    return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
                int len, u32 *crc)
{
    if (crc)
        *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
    return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_fc_tl tl;
    struct ext4_fc_tail tail;
    int off, bsize = sbi->s_journal->j_blocksize;
    u8 *dst;

    /*
     * ext4_fc_reserve_space takes care of allocating an extra block if
     * there's not enough space on this block to accommodate this tail.
     */
    dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
    if (!dst)
        return -ENOSPC;

    off = sbi->s_fc_bytes % bsize;

    tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
    tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
    sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

    ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
    dst += sizeof(tl);
    tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
    ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
    dst += sizeof(tail.fc_tid);
    tail.fc_crc = cpu_to_le32(crc);
    ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

    ext4_fc_submit_bh(sb, true);

    return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the TLV was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
               u32 *crc)
{
    struct ext4_fc_tl tl;
    u8 *dst;

    dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
    if (!dst)
        return false;

    tl.fc_tag = cpu_to_le16(tag);
    tl.fc_len = cpu_to_le16(len);

    ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
    ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

    return true;
}

/* Same as above, but adds a dentry TLV. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                   struct ext4_fc_dentry_update *fc_dentry)
{
    struct ext4_fc_dentry_info fcd;
    struct ext4_fc_tl tl;
    int dlen = fc_dentry->fcd_name.len;
    u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
                    crc);

    if (!dst)
        return false;

    fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
    fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
    tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
    tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
    ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
    dst += sizeof(tl);
    ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
    dst += sizeof(fcd);
    ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

    return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
    struct ext4_inode_info *ei = EXT4_I(inode);
    int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
    int ret;
    struct ext4_iloc iloc;
    struct ext4_fc_inode fc_inode;
    struct ext4_fc_tl tl;
    u8 *dst;

    ret = ext4_get_inode_loc(inode, &iloc);
    if (ret)
        return ret;

    if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
        inode_len = EXT4_INODE_SIZE(inode->i_sb);
    else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
        inode_len += ei->i_extra_isize;

    fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
    tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
    tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

    dst = ext4_fc_reserve_space(inode->i_sb,
            sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
    if (!dst)
        return -ECANCELED;

    if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
        return -ECANCELED;
    dst += sizeof(tl);
    if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
        return -ECANCELED;
    dst += sizeof(fc_inode);
    if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
                    inode_len, crc))
        return -ECANCELED;

    return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
    ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
    struct ext4_inode_info *ei = EXT4_I(inode);
    struct ext4_map_blocks map;
    struct ext4_fc_add_range fc_ext;
    struct ext4_fc_del_range lrange;
    struct ext4_extent *ex;
    int ret;

    mutex_lock(&ei->i_fc_lock);
    if (ei->i_fc_lblk_len == 0) {
        mutex_unlock(&ei->i_fc_lock);
        return 0;
    }
    old_blk_size = ei->i_fc_lblk_start;
    new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
    ei->i_fc_lblk_len = 0;
    mutex_unlock(&ei->i_fc_lock);

    cur_lblk_off = old_blk_size;
    ext4_debug("will try writing %d to %d for inode %ld\n",
           cur_lblk_off, new_blk_size, inode->i_ino);

    while (cur_lblk_off <= new_blk_size) {
        map.m_lblk = cur_lblk_off;
        map.m_len = new_blk_size - cur_lblk_off + 1;
        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret < 0)
            return -ECANCELED;

        if (map.m_len == 0) {
            cur_lblk_off++;
            continue;
        }

        if (ret == 0) {
            lrange.fc_ino = cpu_to_le32(inode->i_ino);
            lrange.fc_lblk = cpu_to_le32(map.m_lblk);
            lrange.fc_len = cpu_to_le32(map.m_len);
            if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                        sizeof(lrange), (u8 *)&lrange, crc))
                return -ENOSPC;
        } else {
            unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

            /* Limit the number of blocks in one extent */
            map.m_len = min(max, map.m_len);

            fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
            ex = (struct ext4_extent *)&fc_ext.fc_ex;
            ex->ee_block = cpu_to_le32(map.m_lblk);
            ex->ee_len = cpu_to_le16(map.m_len);
            ext4_ext_store_pblock(ex, map.m_pblk);
            if (map.m_flags & EXT4_MAP_UNWRITTEN)
                ext4_ext_mark_unwritten(ex);
            else
                ext4_ext_mark_initialized(ex);
            if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                        sizeof(fc_ext), (u8 *)&fc_ext, crc))
                return -ENOSPC;
        }

        cur_lblk_off += map.m_len;
    }

    return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_inode_info *ei;
    int ret = 0;

    spin_lock(&sbi->s_fc_lock);
    list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
        ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
        while (atomic_read(&ei->i_fc_updates)) {
            DEFINE_WAIT(wait);

            prepare_to_wait(&ei->i_fc_wait, &wait,
                        TASK_UNINTERRUPTIBLE);
            if (atomic_read(&ei->i_fc_updates)) {
                spin_unlock(&sbi->s_fc_lock);
                schedule();
                spin_lock(&sbi->s_fc_lock);
            }
            finish_wait(&ei->i_fc_wait, &wait);
        }
        spin_unlock(&sbi->s_fc_lock);
        ret = jbd2_submit_inode_data(ei->jinode);
        if (ret)
            return ret;
        spin_lock(&sbi->s_fc_lock);
    }
    spin_unlock(&sbi->s_fc_lock);

    return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_inode_info *pos, *n;
    int ret = 0;

    spin_lock(&sbi->s_fc_lock);
    list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
        if (!ext4_test_inode_state(&pos->vfs_inode,
                       EXT4_STATE_FC_COMMITTING))
            continue;
        spin_unlock(&sbi->s_fc_lock);

        ret = jbd2_wait_inode_data(journal, pos->jinode);
        if (ret)
            return ret;
        spin_lock(&sbi->s_fc_lock);
    }
    spin_unlock(&sbi->s_fc_lock);

    return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
    struct inode *inode;
    struct ext4_inode_info *ei;
    int ret;

    if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
        return 0;
    list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
        if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
            spin_unlock(&sbi->s_fc_lock);
            if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                ret = -ENOSPC;
                goto lock_and_exit;
            }
            spin_lock(&sbi->s_fc_lock);
            continue;
        }
        /*
         * With fcd_dilist we need not loop over sbi->s_fc_q to get
         * the corresponding inode pointer
         */
        WARN_ON(list_empty(&fc_dentry->fcd_dilist));
        ei = list_first_entry(&fc_dentry->fcd_dilist,
                struct ext4_inode_info, i_fc_dilist);
        inode = &ei->vfs_inode;
        WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

        spin_unlock(&sbi->s_fc_lock);

        /*
         * We first write the inode and then the create dirent. This
         * allows the recovery code to create an unnamed inode first
         * and then link it to a directory entry. This allows us
         * to use namei.c routines almost as is and simplifies
         * the recovery code.
         */
        ret = ext4_fc_write_inode(inode, crc);
        if (ret)
            goto lock_and_exit;

        ret = ext4_fc_write_inode_data(inode, crc);
        if (ret)
            goto lock_and_exit;

        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
            ret = -ENOSPC;
            goto lock_and_exit;
        }

        spin_lock(&sbi->s_fc_lock);
    }
    return 0;
lock_and_exit:
    spin_lock(&sbi->s_fc_lock);
    return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_inode_info *iter;
    struct ext4_fc_head head;
    struct inode *inode;
    struct blk_plug plug;
    int ret = 0;
    u32 crc = 0;

    ret = ext4_fc_submit_inode_data_all(journal);
    if (ret)
        return ret;

    ret = ext4_fc_wait_inode_data_all(journal);
    if (ret)
        return ret;

    /*
     * If the file system device is different from the journal device,
     * issue a cache flush before we start writing fast commit blocks.
     */
    if (journal->j_fs_dev != journal->j_dev)
        blkdev_issue_flush(journal->j_fs_dev);

    blk_start_plug(&plug);
    if (sbi->s_fc_bytes == 0) {
        /*
         * Add a head tag only if this is the first fast commit
         * in this TID.
         */
        head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
        head.fc_tid = cpu_to_le32(
            sbi->s_journal->j_running_transaction->t_tid);
        if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
            (u8 *)&head, &crc)) {
            ret = -ENOSPC;
            goto out;
        }
    }

    spin_lock(&sbi->s_fc_lock);
    ret = ext4_fc_commit_dentry_updates(journal, &crc);
    if (ret) {
        spin_unlock(&sbi->s_fc_lock);
        goto out;
    }

    list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
        inode = &iter->vfs_inode;
        if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
            continue;

        spin_unlock(&sbi->s_fc_lock);
        ret = ext4_fc_write_inode_data(inode, &crc);
        if (ret)
            goto out;
        ret = ext4_fc_write_inode(inode, &crc);
        if (ret)
            goto out;
        spin_lock(&sbi->s_fc_lock);
    }
    spin_unlock(&sbi->s_fc_lock);

    ret = ext4_fc_write_tail(sb, crc);

out:
    blk_finish_plug(&plug);
    return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
                 u64 commit_time, int nblks, tid_t commit_tid)
{
    struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

    ext4_debug("Fast commit ended with status = %d for tid %u",
            status, commit_tid);
    if (status == EXT4_FC_STATUS_OK) {
        stats->fc_num_commits++;
        stats->fc_numblks += nblks;
        if (likely(stats->s_fc_avg_commit_time))
            stats->s_fc_avg_commit_time =
                (commit_time +
                 stats->s_fc_avg_commit_time * 3) / 4;
        else
            stats->s_fc_avg_commit_time = commit_time;
    } else if (status == EXT4_FC_STATUS_FAILED ||
           status == EXT4_FC_STATUS_INELIGIBLE) {
        if (status == EXT4_FC_STATUS_FAILED)
            stats->fc_failed_commits++;
        stats->fc_ineligible_commits++;
    } else {
        stats->fc_skipped_commits++;
    }
    trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    int nblks = 0, ret, bsize = journal->j_blocksize;
    int subtid = atomic_read(&sbi->s_fc_subtid);
    int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
    ktime_t start_time, commit_time;

    if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
        return jbd2_complete_transaction(journal, commit_tid);

    trace_ext4_fc_commit_start(sb, commit_tid);

    start_time = ktime_get();

restart_fc:
    ret = jbd2_fc_begin_commit(journal, commit_tid);
    if (ret == -EALREADY) {
        /* There was an ongoing commit, check if we need to restart */
        if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
            commit_tid > journal->j_commit_sequence)
            goto restart_fc;
        ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
                commit_tid);
        return 0;
    } else if (ret) {
        /*
         * Commit couldn't start. Just update stats and perform a
         * full commit.
         */
        ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
                commit_tid);
        return jbd2_complete_transaction(journal, commit_tid);
    }

    /*
     * After establishing the journal barrier via jbd2_fc_begin_commit(),
     * check if we are fast commit ineligible.
     */
    if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
        status = EXT4_FC_STATUS_INELIGIBLE;
        goto fallback;
    }

    fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
    ret = ext4_fc_perform_commit(journal);
    if (ret < 0) {
        status = EXT4_FC_STATUS_FAILED;
        goto fallback;
    }
    nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
    ret = jbd2_fc_wait_bufs(journal, nblks);
    if (ret < 0) {
        status = EXT4_FC_STATUS_FAILED;
        goto fallback;
    }
    atomic_inc(&sbi->s_fc_subtid);
    ret = jbd2_fc_end_commit(journal);
    /*
     * Weight the average commit time higher than the latest commit time
     * so we don't react too strongly to vast changes in the commit time.
     */
    commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
    ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
    return ret;

fallback:
    ret = jbd2_fc_end_commit_fallback(journal);
    ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
    return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
    struct super_block *sb = journal->j_private;
    struct ext4_sb_info *sbi = EXT4_SB(sb);
    struct ext4_inode_info *iter, *iter_n;
    struct ext4_fc_dentry_update *fc_dentry;

    if (full && sbi->s_fc_bh)
        sbi->s_fc_bh = NULL;

    trace_ext4_fc_cleanup(journal, full, tid);
    jbd2_fc_release_bufs(journal);

    spin_lock(&sbi->s_fc_lock);
    list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                 i_fc_list) {
        list_del_init(&iter->i_fc_list);
        ext4_clear_inode_state(&iter->vfs_inode,
                       EXT4_STATE_FC_COMMITTING);
        if (iter->i_sync_tid <= tid)
            ext4_fc_reset_inode(&iter->vfs_inode);
        /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
        smp_mb();
#if (BITS_PER_LONG < 64)
        wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
        wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
    }

    while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
        fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                         struct ext4_fc_dentry_update,
                         fcd_list);
        list_del_init(&fc_dentry->fcd_list);
        list_del_init(&fc_dentry->fcd_dilist);
        spin_unlock(&sbi->s_fc_lock);

        if (fc_dentry->fcd_name.name &&
            fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
            kfree(fc_dentry->fcd_name.name);
        kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
        spin_lock(&sbi->s_fc_lock);
    }

    list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
    list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                &sbi->s_fc_q[FC_Q_MAIN]);

    if (tid >= sbi->s_fc_ineligible_tid) {
        sbi->s_fc_ineligible_tid = 0;
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
    }

    if (full)
        sbi->s_fc_bytes = 0;
    spin_unlock(&sbi->s_fc_lock);
    trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
    int parent_ino, dname_len, ino, inode_len;
    char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
                  struct ext4_fc_tl *tl, u8 *val)
{
    struct ext4_fc_dentry_info fcd;

    memcpy(&fcd, val, sizeof(fcd));

    darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
    darg->ino = le32_to_cpu(fcd.fc_ino);
    darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
    darg->dname_len = le16_to_cpu(tl->fc_len) -
        sizeof(struct ext4_fc_dentry_info);
}
/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                 u8 *val)
{
    struct inode *inode, *old_parent;
    struct qstr entry;
    struct dentry_info_args darg;
    int ret = 0;

    tl_to_darg(&darg, tl, val);

    trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
            darg.parent_ino, darg.dname_len);

    entry.name = darg.dname;
    entry.len = darg.dname_len;
    inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

    if (IS_ERR(inode)) {
        ext4_debug("Inode %d not found", darg.ino);
        return 0;
    }

    old_parent = ext4_iget(sb, darg.parent_ino,
                EXT4_IGET_NORMAL);
    if (IS_ERR(old_parent)) {
        ext4_debug("Dir with inode %d not found", darg.parent_ino);
        iput(inode);
        return 0;
    }

    ret = __ext4_unlink(NULL, old_parent, &entry, inode);
    /* -ENOENT is OK because the dirent might not exist anymore. */
    if (ret == -ENOENT)
        ret = 0;
    iput(old_parent);
    iput(inode);
    return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
                struct dentry_info_args *darg,
                struct inode *inode)
{
    struct inode *dir = NULL;
    struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
    struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
    int ret = 0;

    dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
    if (IS_ERR(dir)) {
        ext4_debug("Dir with inode %d not found.", darg->parent_ino);
        dir = NULL;
        goto out;
    }

    dentry_dir = d_obtain_alias(dir);
    if (IS_ERR(dentry_dir)) {
        ext4_debug("Failed to obtain dentry");
        dentry_dir = NULL;
        goto out;
    }

    dentry_inode = d_alloc(dentry_dir, &qstr_dname);
    if (!dentry_inode) {
        ext4_debug("Inode dentry not created.");
        ret = -ENOMEM;
        goto out;
    }

    ret = __ext4_link(dir, inode, dentry_inode);
    /*
     * It's possible that the link already existed, either because the
     * data blocks for the dir in question were persisted before we
     * crashed, OR because we replayed this tag and crashed before the
     * entire replay could complete.
     */
    if (ret && ret != -EEXIST) {
        ext4_debug("Failed to link\n");
        goto out;
    }

    ret = 0;
out:
    if (dentry_dir) {
        d_drop(dentry_dir);
        dput(dentry_dir);
    } else if (dir) {
        iput(dir);
    }
    if (dentry_inode) {
        d_drop(dentry_inode);
        dput(dentry_inode);
    }

    return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
                   u8 *val)
{
    struct inode *inode;
    struct dentry_info_args darg;
    int ret = 0;

    tl_to_darg(&darg, tl, val);
    trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
            darg.parent_ino, darg.dname_len);

    inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
    if (IS_ERR(inode)) {
        ext4_debug("Inode not found.");
        return 0;
    }

    ret = ext4_fc_replay_link_internal(sb, &darg, inode);
    iput(inode);
    return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
    struct ext4_fc_replay_state *state;
    int i;

    state = &EXT4_SB(sb)->s_fc_replay_state;
    for (i = 0; i < state->fc_modified_inodes_used; i++)
        if (state->fc_modified_inodes[i] == ino)
            return 0;
    if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
        int *fc_modified_inodes;

        /*
         * krealloc() via a temporary so the old array is not leaked
         * on allocation failure.
         */
        fc_modified_inodes = krealloc(state->fc_modified_inodes,
                sizeof(int) * (state->fc_modified_inodes_size +
                EXT4_FC_REPLAY_REALLOC_INCREMENT),
                GFP_KERNEL);
        if (!fc_modified_inodes)
            return -ENOMEM;
        state->fc_modified_inodes = fc_modified_inodes;
        state->fc_modified_inodes_size +=
            EXT4_FC_REPLAY_REALLOC_INCREMENT;
    }
    state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
    return 0;
}
1507 
1508 /*
1509  * Inode replay function
1510  */
1511 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1512                 u8 *val)
1513 {
1514     struct ext4_fc_inode fc_inode;
1515     struct ext4_inode *raw_inode;
1516     struct ext4_inode *raw_fc_inode;
1517     struct inode *inode = NULL;
1518     struct ext4_iloc iloc;
1519     int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1520     struct ext4_extent_header *eh;
1521 
1522     memcpy(&fc_inode, val, sizeof(fc_inode));
1523 
1524     ino = le32_to_cpu(fc_inode.fc_ino);
1525     trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1526 
1527     inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1528     if (!IS_ERR(inode)) {
1529         ext4_ext_clear_bb(inode);
1530         iput(inode);
1531     }
1532     inode = NULL;
1533 
1534     ret = ext4_fc_record_modified_inode(sb, ino);
1535     if (ret)
1536         goto out;
1537 
1538     raw_fc_inode = (struct ext4_inode *)
1539         (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1540     ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1541     if (ret)
1542         goto out;
1543 
1544     inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1545     raw_inode = ext4_raw_inode(&iloc);
1546 
1547     memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1548     memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1549         inode_len - offsetof(struct ext4_inode, i_generation));
1550     if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1551         eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1552         if (eh->eh_magic != EXT4_EXT_MAGIC) {
1553             memset(eh, 0, sizeof(*eh));
1554             eh->eh_magic = EXT4_EXT_MAGIC;
1555             eh->eh_max = cpu_to_le16(
1556                 (sizeof(raw_inode->i_block) -
1557                  sizeof(struct ext4_extent_header))
1558                  / sizeof(struct ext4_extent));
1559         }
1560     } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1561         memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1562             sizeof(raw_inode->i_block));
1563     }
1564 
1565     /* Immediately update the inode on disk. */
1566     ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1567     if (ret)
1568         goto out;
1569     ret = sync_dirty_buffer(iloc.bh);
1570     if (ret)
1571         goto out;
1572     ret = ext4_mark_inode_used(sb, ino);
1573     if (ret)
1574         goto out;
1575 
1576     /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1577     inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1578     if (IS_ERR(inode)) {
1579         ext4_debug("Inode not found.");
1580         return -EFSCORRUPTED;
1581     }
1582 
1583     /*
1584      * Our allocator could have made different decisions than before
1585      * crashing. This should be fixed but until then, we calculate
1586      * the number of blocks for the inode.
1587      */
1588     if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1589         ext4_ext_replay_set_iblocks(inode);
1590 
1591     inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1592     ext4_reset_inode_seed(inode);
1593 
1594     ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1595     ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1596     sync_dirty_buffer(iloc.bh);
1597     brelse(iloc.bh);
1598 out:
1599     iput(inode);
1600     if (!ret)
1601         blkdev_issue_flush(sb->s_bdev);
1602 
1603     return 0;
1604 }
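
/*
 * The two memcpy() calls above splice the recovered inode image around
 * i_block using offsetof(): everything before i_block comes from the fast
 * commit record, i_block itself is reconstructed according to the inode
 * flags, and everything from i_generation onward is copied verbatim. A
 * self-contained illustration of that offsetof() splice, with a made-up
 * struct standing in for struct ext4_inode:
 */

#include <stddef.h>
#include <string.h>

/* Made-up record layout; only the field order matters for the splice. */
struct record {
    int  header;        /* copied: everything before payload */
    char payload[60];   /* skipped: rebuilt separately by the caller */
    int  generation;    /* copied: everything from here to the end */
    int  checksum;
};

/* Copy src into dst while leaving dst->payload untouched. */
static void splice_copy(struct record *dst, const struct record *src)
{
    /* bytes [0, payload) */
    memcpy(dst, src, offsetof(struct record, payload));
    /* bytes [generation, end of struct) */
    memcpy(&dst->generation, &src->generation,
           sizeof(*dst) - offsetof(struct record, generation));
}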
1605 
1606 /*
1607  * Dentry create replay function.
1608  *
1609  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
1610  * inode for which we are trying to create a dentry here should already
1611  * have been replayed before we get here.
1612  */
1613 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1614                  u8 *val)
1615 {
1616     int ret = 0;
1617     struct inode *inode = NULL;
1618     struct inode *dir = NULL;
1619     struct dentry_info_args darg;
1620 
1621     tl_to_darg(&darg, tl, val);
1622 
1623     trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1624             darg.parent_ino, darg.dname_len);
1625 
1626     /* This takes care of updating the group descriptor and other metadata */
1627     ret = ext4_mark_inode_used(sb, darg.ino);
1628     if (ret)
1629         goto out;
1630 
1631     inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1632     if (IS_ERR(inode)) {
1633         ext4_debug("inode %d not found.", darg.ino);
1634         inode = NULL;
1635         ret = -EINVAL;
1636         goto out;
1637     }
1638 
1639     if (S_ISDIR(inode->i_mode)) {
1640         /*
1641          * If we are creating a directory, we need to make sure that the
1642          * dot and dot dot dirents are set up properly.
1643          */
1644         dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1645         if (IS_ERR(dir)) {
1646             ext4_debug("Dir %d not found.", darg.ino);
1647             goto out;
1648         }
1649         ret = ext4_init_new_dir(NULL, dir, inode);
1650         iput(dir);
1651         if (ret) {
1652             ret = 0;
1653             goto out;
1654         }
1655     }
1656     ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1657     if (ret)
1658         goto out;
1659     set_nlink(inode, 1);
1660     ext4_mark_inode_dirty(NULL, inode);
1661 out:
1662     iput(inode);
1663     return ret;
1664 }
1665 
1666 /*
1667  * Record physical disk regions which are in use, as per the fast commit
1668  * area, and used by inodes during the replay phase. Our simple replay phase
1669  * allocator excludes these regions from allocation.
1670  */
1671 int ext4_fc_record_regions(struct super_block *sb, int ino,
1672         ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1673 {
1674     struct ext4_fc_replay_state *state;
1675     struct ext4_fc_alloc_region *region;
1676 
1677     state = &EXT4_SB(sb)->s_fc_replay_state;
1678     /*
1679      * During the replay phase, fc_regions_valid may not be the same as
1680      * fc_regions_used; update it when making new additions.
1681      */
1682     if (replay && state->fc_regions_used != state->fc_regions_valid)
1683         state->fc_regions_used = state->fc_regions_valid;
1684     if (state->fc_regions_used == state->fc_regions_size) {
1685         state->fc_regions_size +=
1686             EXT4_FC_REPLAY_REALLOC_INCREMENT;
1687         state->fc_regions = krealloc(
1688                     state->fc_regions,
1689                     state->fc_regions_size *
1690                     sizeof(struct ext4_fc_alloc_region),
1691                     GFP_KERNEL);
1692         if (!state->fc_regions)
1693             return -ENOMEM;
1694     }
1695     region = &state->fc_regions[state->fc_regions_used++];
1696     region->ino = ino;
1697     region->lblk = lblk;
1698     region->pblk = pblk;
1699     region->len = len;
1700 
1701     if (replay)
1702         state->fc_regions_valid++;
1703 
1704     return 0;
1705 }
1706 
1707 /* Replay add range tag */
1708 static int ext4_fc_replay_add_range(struct super_block *sb,
1709                     struct ext4_fc_tl *tl, u8 *val)
1710 {
1711     struct ext4_fc_add_range fc_add_ex;
1712     struct ext4_extent newex, *ex;
1713     struct inode *inode;
1714     ext4_lblk_t start, cur;
1715     int remaining, len;
1716     ext4_fsblk_t start_pblk;
1717     struct ext4_map_blocks map;
1718     struct ext4_ext_path *path = NULL;
1719     int ret;
1720 
1721     memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1722     ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1723 
1724     trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1725         le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1726         ext4_ext_get_actual_len(ex));
1727 
1728     inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1729     if (IS_ERR(inode)) {
1730         ext4_debug("Inode not found.");
1731         return 0;
1732     }
1733 
1734     ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1735     if (ret)
1736         goto out;
1737 
1738     start = le32_to_cpu(ex->ee_block);
1739     start_pblk = ext4_ext_pblock(ex);
1740     len = ext4_ext_get_actual_len(ex);
1741 
1742     cur = start;
1743     remaining = len;
1744     ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1745           start, start_pblk, len, ext4_ext_is_unwritten(ex),
1746           inode->i_ino);
1747 
1748     while (remaining > 0) {
1749         map.m_lblk = cur;
1750         map.m_len = remaining;
1751         map.m_pblk = 0;
1752         ret = ext4_map_blocks(NULL, inode, &map, 0);
1753 
1754         if (ret < 0)
1755             goto out;
1756 
1757         if (ret == 0) {
1758             /* Range is not mapped */
1759             path = ext4_find_extent(inode, cur, NULL, 0);
1760             if (IS_ERR(path))
1761                 goto out;
1762             memset(&newex, 0, sizeof(newex));
1763             newex.ee_block = cpu_to_le32(cur);
1764             ext4_ext_store_pblock(
1765                 &newex, start_pblk + cur - start);
1766             newex.ee_len = cpu_to_le16(map.m_len);
1767             if (ext4_ext_is_unwritten(ex))
1768                 ext4_ext_mark_unwritten(&newex);
1769             down_write(&EXT4_I(inode)->i_data_sem);
1770             ret = ext4_ext_insert_extent(
1771                 NULL, inode, &path, &newex, 0);
1772             up_write((&EXT4_I(inode)->i_data_sem));
1773             ext4_ext_drop_refs(path);
1774             kfree(path);
1775             if (ret)
1776                 goto out;
1777             goto next;
1778         }
1779 
1780         if (start_pblk + cur - start != map.m_pblk) {
1781             /*
1782              * Logical to physical mapping changed. This can happen
1783              * if this range was removed and then reallocated to
1784              * map to new physical blocks during a fast commit.
1785              */
1786             ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1787                     ext4_ext_is_unwritten(ex),
1788                     start_pblk + cur - start);
1789             if (ret)
1790                 goto out;
1791             /*
1792              * Mark the old blocks as free since they aren't used
1793              * anymore. We maintain an array of all the modified
1794              * inodes. In case these blocks are still used at either
1795              * a different logical range in the same inode or in
1796              * some different inode, we will mark them as allocated
1797              * at the end of the FC replay using our array of
1798              * modified inodes.
1799              */
1800             ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1801             goto next;
1802         }
1803 
1804         /* Range is mapped and needs a state change */
1805         ext4_debug("Converting from %ld to %d %lld",
1806                 map.m_flags & EXT4_MAP_UNWRITTEN,
1807             ext4_ext_is_unwritten(ex), map.m_pblk);
1808         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1809                     ext4_ext_is_unwritten(ex), map.m_pblk);
1810         if (ret)
1811             goto out;
1812         /*
1813          * We may have split the extent tree while toggling the state.
1814          * Try to shrink the extent tree now.
1815          */
1816         ext4_ext_replay_shrink_inode(inode, start + len);
1817 next:
1818         cur += map.m_len;
1819         remaining -= map.m_len;
1820     }
1821     ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1822                     sb->s_blocksize_bits);
1823 out:
1824     iput(inode);
1825     return 0;
1826 }
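
/*
 * The loop above consumes the recorded extent in contiguous chunks: each
 * ext4_map_blocks() call reports how much of the remaining range is mapped
 * (or how far the hole extends), the chunk is handled as an insertion, a
 * remap, or a state change, and the cursor advances by map.m_len. A
 * simplified userspace sketch of that chunked walk, with a toy mapper
 * standing in for ext4_map_blocks():
 */

#include <stdint.h>
#include <stdio.h>

/*
 * Toy mapper: pretend logical blocks 0..7 are mapped and everything above
 * is a hole. Fills *chunk with the contiguous run length starting at lblk
 * (clamped to max_len) and returns 1 for mapped, 0 for a hole.
 */
static int map_blocks(uint32_t lblk, uint32_t max_len, uint32_t *chunk)
{
    if (lblk < 8) {
        *chunk = (8 - lblk < max_len) ? 8 - lblk : max_len;
        return 1;
    }
    *chunk = max_len;
    return 0;
}

/* Walk [start, start + len) one contiguous chunk at a time. */
static void replay_range(uint32_t start, uint32_t len)
{
    uint32_t cur = start, remaining = len, chunk;

    while (remaining > 0) {
        int mapped = map_blocks(cur, remaining, &chunk);

        if (mapped)
            printf("lblk %u..%u mapped: fix up in place\n",
                   cur, cur + chunk - 1);
        else
            printf("lblk %u..%u hole: insert a new extent\n",
                   cur, cur + chunk - 1);
        cur += chunk;           /* advance by whatever the mapper covered */
        remaining -= chunk;
    }
}

int main(void)
{
    replay_range(4, 10);        /* one mapped chunk, then one hole */
    return 0;
}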
1827 
1828 /* Replay DEL_RANGE tag */
1829 static int
1830 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1831              u8 *val)
1832 {
1833     struct inode *inode;
1834     struct ext4_fc_del_range lrange;
1835     struct ext4_map_blocks map;
1836     ext4_lblk_t cur, remaining;
1837     int ret;
1838 
1839     memcpy(&lrange, val, sizeof(lrange));
1840     cur = le32_to_cpu(lrange.fc_lblk);
1841     remaining = le32_to_cpu(lrange.fc_len);
1842 
1843     trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1844         le32_to_cpu(lrange.fc_ino), cur, remaining);
1845 
1846     inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1847     if (IS_ERR(inode)) {
1848         ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1849         return 0;
1850     }
1851 
1852     ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1853     if (ret)
1854         goto out;
1855 
1856     ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1857             inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1858             le32_to_cpu(lrange.fc_len));
1859     while (remaining > 0) {
1860         map.m_lblk = cur;
1861         map.m_len = remaining;
1862 
1863         ret = ext4_map_blocks(NULL, inode, &map, 0);
1864         if (ret < 0)
1865             goto out;
1866         if (ret > 0) {
1867             remaining -= ret;
1868             cur += ret;
1869             ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1870         } else {
1871             remaining -= map.m_len;
1872             cur += map.m_len;
1873         }
1874     }
1875 
1876     down_write(&EXT4_I(inode)->i_data_sem);
1877     ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1878                 le32_to_cpu(lrange.fc_lblk) +
1879                 le32_to_cpu(lrange.fc_len) - 1);
1880     up_write(&EXT4_I(inode)->i_data_sem);
1881     if (ret)
1882         goto out;
1883     ext4_ext_replay_shrink_inode(inode,
1884         i_size_read(inode) >> sb->s_blocksize_bits);
1885     ext4_mark_inode_dirty(NULL, inode);
1886 out:
1887     iput(inode);
1888     return 0;
1889 }
1890 
1891 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1892 {
1893     struct ext4_fc_replay_state *state;
1894     struct inode *inode;
1895     struct ext4_ext_path *path = NULL;
1896     struct ext4_map_blocks map;
1897     int i, ret, j;
1898     ext4_lblk_t cur, end;
1899 
1900     state = &EXT4_SB(sb)->s_fc_replay_state;
1901     for (i = 0; i < state->fc_modified_inodes_used; i++) {
1902         inode = ext4_iget(sb, state->fc_modified_inodes[i],
1903             EXT4_IGET_NORMAL);
1904         if (IS_ERR(inode)) {
1905             ext4_debug("Inode %d not found.",
1906                 state->fc_modified_inodes[i]);
1907             continue;
1908         }
1909         cur = 0;
1910         end = EXT_MAX_BLOCKS;
1911         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1912             iput(inode);
1913             continue;
1914         }
1915         while (cur < end) {
1916             map.m_lblk = cur;
1917             map.m_len = end - cur;
1918 
1919             ret = ext4_map_blocks(NULL, inode, &map, 0);
1920             if (ret < 0)
1921                 break;
1922 
1923             if (ret > 0) {
1924                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1925                 if (!IS_ERR(path)) {
1926                     for (j = 0; j < path->p_depth; j++)
1927                         ext4_mb_mark_bb(inode->i_sb,
1928                             path[j].p_block, 1, 1);
1929                     ext4_ext_drop_refs(path);
1930                     kfree(path);
1931                 }
1932                 cur += ret;
1933                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934                             map.m_len, 1);
1935             } else {
1936                 cur = cur + (map.m_len ? map.m_len : 1);
1937             }
1938         }
1939         iput(inode);
1940     }
1941 }
1942 
1943 /*
1944  * Check if block is in excluded regions for block allocation. The simple
1945  * allocator that runs during the replay phase calls this function to see
1946  * if it is okay to use a block.
1947  */
1948 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949 {
1950     int i;
1951     struct ext4_fc_replay_state *state;
1952 
1953     state = &EXT4_SB(sb)->s_fc_replay_state;
1954     for (i = 0; i < state->fc_regions_valid; i++) {
1955         if (state->fc_regions[i].ino == 0 ||
1956             state->fc_regions[i].len == 0)
1957             continue;
1958         if (in_range(blk, state->fc_regions[i].pblk,
1959                     state->fc_regions[i].len))
1960             return true;
1961     }
1962     return false;
1963 }
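
/*
 * Since in_range(b, start, len) is simply start <= b < start + len, the
 * whole exclusion check reduces to a linear scan over the recorded
 * regions. A compact userspace equivalent, assuming a region layout like
 * the one filled in by ext4_fc_record_regions() above:
 */

#include <stdbool.h>
#include <stdint.h>

struct alloc_region {
    uint32_t ino;       /* 0 means the slot is unused */
    uint64_t pblk;      /* first physical block of the region */
    uint32_t len;       /* number of blocks in the region */
};

/* True if blk falls inside any valid recorded region. */
static bool check_excluded(const struct alloc_region *regions, int nr,
                           uint64_t blk)
{
    int i;

    for (i = 0; i < nr; i++) {
        if (regions[i].ino == 0 || regions[i].len == 0)
            continue;   /* skip empty or invalidated slots */
        if (blk >= regions[i].pblk &&
            blk < regions[i].pblk + regions[i].len)
            return true;
    }
    return false;
}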
1964 
1965 /* Cleanup function called after replay */
1966 void ext4_fc_replay_cleanup(struct super_block *sb)
1967 {
1968     struct ext4_sb_info *sbi = EXT4_SB(sb);
1969 
1970     sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971     kfree(sbi->s_fc_replay_state.fc_regions);
1972     kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973 }
1974 
1975 /*
1976  * Recovery Scan phase handler
1977  *
1978  * This function is called during the scan phase and is responsible
1979  * for doing the following things:
1980  * - Make sure the fast commit area has valid tags for replay
1981  * - Count number of tags that need to be replayed by the replay handler
1982  * - Verify CRC
1983  * - Create a list of excluded blocks for allocation during replay phase
1984  *
1985  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1986  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1987  * to indicate that scan has finished and JBD2 can now start replay phase.
1988  * It returns a negative error to indicate that there was an error. At the end
1989  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1990  * to indicate the number of tags that need to be replayed during the replay phase.
1991  */
1992 static int ext4_fc_replay_scan(journal_t *journal,
1993                 struct buffer_head *bh, int off,
1994                 tid_t expected_tid)
1995 {
1996     struct super_block *sb = journal->j_private;
1997     struct ext4_sb_info *sbi = EXT4_SB(sb);
1998     struct ext4_fc_replay_state *state;
1999     int ret = JBD2_FC_REPLAY_CONTINUE;
2000     struct ext4_fc_add_range ext;
2001     struct ext4_fc_tl tl;
2002     struct ext4_fc_tail tail;
2003     __u8 *start, *end, *cur, *val;
2004     struct ext4_fc_head head;
2005     struct ext4_extent *ex;
2006 
2007     state = &sbi->s_fc_replay_state;
2008 
2009     start = (u8 *)bh->b_data;
2010     end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2011 
2012     if (state->fc_replay_expected_off == 0) {
2013         state->fc_cur_tag = 0;
2014         state->fc_replay_num_tags = 0;
2015         state->fc_crc = 0;
2016         state->fc_regions = NULL;
2017         state->fc_regions_valid = state->fc_regions_used =
2018             state->fc_regions_size = 0;
2019         /* Check if we can stop early */
2020         if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2021             != EXT4_FC_TAG_HEAD)
2022             return 0;
2023     }
2024 
2025     if (off != state->fc_replay_expected_off) {
2026         ret = -EFSCORRUPTED;
2027         goto out_err;
2028     }
2029 
2030     state->fc_replay_expected_off++;
2031     for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2032         memcpy(&tl, cur, sizeof(tl));
2033         val = cur + sizeof(tl);
2034         ext4_debug("Scan phase, tag:%s, blk %lld\n",
2035               tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2036         switch (le16_to_cpu(tl.fc_tag)) {
2037         case EXT4_FC_TAG_ADD_RANGE:
2038             memcpy(&ext, val, sizeof(ext));
2039             ex = (struct ext4_extent *)&ext.fc_ex;
2040             ret = ext4_fc_record_regions(sb,
2041                 le32_to_cpu(ext.fc_ino),
2042                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2043                 ext4_ext_get_actual_len(ex), 0);
2044             if (ret < 0)
2045                 break;
2046             ret = JBD2_FC_REPLAY_CONTINUE;
2047             fallthrough;
2048         case EXT4_FC_TAG_DEL_RANGE:
2049         case EXT4_FC_TAG_LINK:
2050         case EXT4_FC_TAG_UNLINK:
2051         case EXT4_FC_TAG_CREAT:
2052         case EXT4_FC_TAG_INODE:
2053         case EXT4_FC_TAG_PAD:
2054             state->fc_cur_tag++;
2055             state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2056                     sizeof(tl) + le16_to_cpu(tl.fc_len));
2057             break;
2058         case EXT4_FC_TAG_TAIL:
2059             state->fc_cur_tag++;
2060             memcpy(&tail, val, sizeof(tail));
2061             state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2062                         sizeof(tl) +
2063                         offsetof(struct ext4_fc_tail,
2064                         fc_crc));
2065             if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2066                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2067                 state->fc_replay_num_tags = state->fc_cur_tag;
2068                 state->fc_regions_valid =
2069                     state->fc_regions_used;
2070             } else {
2071                 ret = state->fc_replay_num_tags ?
2072                     JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2073             }
2074             state->fc_crc = 0;
2075             break;
2076         case EXT4_FC_TAG_HEAD:
2077             memcpy(&head, val, sizeof(head));
2078             if (le32_to_cpu(head.fc_features) &
2079                 ~EXT4_FC_SUPPORTED_FEATURES) {
2080                 ret = -EOPNOTSUPP;
2081                 break;
2082             }
2083             if (le32_to_cpu(head.fc_tid) != expected_tid) {
2084                 ret = JBD2_FC_REPLAY_STOP;
2085                 break;
2086             }
2087             state->fc_cur_tag++;
2088             state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2089                         sizeof(tl) + le16_to_cpu(tl.fc_len));
2090             break;
2091         default:
2092             ret = state->fc_replay_num_tags ?
2093                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2094         }
2095         if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2096             break;
2097     }
2098 
2099 out_err:
2100     trace_ext4_fc_replay_scan(sb, ret, off);
2101     return ret;
2102 }
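
/*
 * Both the scan above and the replay loop below walk the fast commit block
 * as a flat TLV stream: copy the fixed tag-length header, locate the value
 * bytes right after it, then advance the cursor by the header size plus
 * the recorded length. A minimal userspace sketch of that cursor walk with
 * a running checksum; the header layout and checksum are simplifications
 * (a little-endian host is assumed, so the le16_to_cpu() conversions are
 * omitted), not the on-disk format:
 */

#include <stdint.h>
#include <string.h>

struct tl {             /* simplified tag-length header */
    uint16_t tag;
    uint16_t len;
};

/* Stand-in checksum; the kernel uses ext4_chksum() here. */
static uint32_t toy_csum(uint32_t crc, const uint8_t *p, size_t n)
{
    while (n--)
        crc = (crc << 5) + crc + *p++;
    return crc;
}

/* Walk every TLV in one block, accumulating a checksum over each record. */
static uint32_t scan_block(const uint8_t *start, size_t blocksize)
{
    const uint8_t *end = start + blocksize;
    const uint8_t *cur;
    struct tl tl;
    uint32_t crc = 0;

    for (cur = start; cur + sizeof(tl) <= end;
         cur += sizeof(tl) + tl.len) {
        memcpy(&tl, cur, sizeof(tl));   /* header may be unaligned */
        if (cur + sizeof(tl) + tl.len > end)
            break;                      /* malformed length, stop */
        /*
         * A real scanner dispatches on tl.tag here and decodes the value
         * bytes starting at cur + sizeof(tl).
         */
        crc = toy_csum(crc, cur, sizeof(tl) + tl.len);
    }
    return crc;
}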
2103 
2104 /*
2105  * Main recovery path entry point.
2106  * The meaning of return codes is similar to the above.
2107  */
2108 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2109                 enum passtype pass, int off, tid_t expected_tid)
2110 {
2111     struct super_block *sb = journal->j_private;
2112     struct ext4_sb_info *sbi = EXT4_SB(sb);
2113     struct ext4_fc_tl tl;
2114     __u8 *start, *end, *cur, *val;
2115     int ret = JBD2_FC_REPLAY_CONTINUE;
2116     struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2117     struct ext4_fc_tail tail;
2118 
2119     if (pass == PASS_SCAN) {
2120         state->fc_current_pass = PASS_SCAN;
2121         return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2122     }
2123 
2124     if (state->fc_current_pass != pass) {
2125         state->fc_current_pass = pass;
2126         sbi->s_mount_state |= EXT4_FC_REPLAY;
2127     }
2128     if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2129         ext4_debug("Replay stops\n");
2130         ext4_fc_set_bitmaps_and_counters(sb);
2131         return 0;
2132     }
2133 
2134 #ifdef CONFIG_EXT4_DEBUG
2135     if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2136         pr_warn("Dropping fc block %d because max_replay set\n", off);
2137         return JBD2_FC_REPLAY_STOP;
2138     }
2139 #endif
2140 
2141     start = (u8 *)bh->b_data;
2142     end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2143 
2144     for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2145         memcpy(&tl, cur, sizeof(tl));
2146         val = cur + sizeof(tl);
2147 
2148         if (state->fc_replay_num_tags == 0) {
2149             ret = JBD2_FC_REPLAY_STOP;
2150             ext4_fc_set_bitmaps_and_counters(sb);
2151             break;
2152         }
2153         ext4_debug("Replay phase, tag:%s\n",
2154                 tag2str(le16_to_cpu(tl.fc_tag)));
2155         state->fc_replay_num_tags--;
2156         switch (le16_to_cpu(tl.fc_tag)) {
2157         case EXT4_FC_TAG_LINK:
2158             ret = ext4_fc_replay_link(sb, &tl, val);
2159             break;
2160         case EXT4_FC_TAG_UNLINK:
2161             ret = ext4_fc_replay_unlink(sb, &tl, val);
2162             break;
2163         case EXT4_FC_TAG_ADD_RANGE:
2164             ret = ext4_fc_replay_add_range(sb, &tl, val);
2165             break;
2166         case EXT4_FC_TAG_CREAT:
2167             ret = ext4_fc_replay_create(sb, &tl, val);
2168             break;
2169         case EXT4_FC_TAG_DEL_RANGE:
2170             ret = ext4_fc_replay_del_range(sb, &tl, val);
2171             break;
2172         case EXT4_FC_TAG_INODE:
2173             ret = ext4_fc_replay_inode(sb, &tl, val);
2174             break;
2175         case EXT4_FC_TAG_PAD:
2176             trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2177                          le16_to_cpu(tl.fc_len), 0);
2178             break;
2179         case EXT4_FC_TAG_TAIL:
2180             trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2181                          le16_to_cpu(tl.fc_len), 0);
2182             memcpy(&tail, val, sizeof(tail));
2183             WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2184             break;
2185         case EXT4_FC_TAG_HEAD:
2186             break;
2187         default:
2188             trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2189                          le16_to_cpu(tl.fc_len), 0);
2190             ret = -ECANCELED;
2191             break;
2192         }
2193         if (ret < 0)
2194             break;
2195         ret = JBD2_FC_REPLAY_CONTINUE;
2196     }
2197     return ret;
2198 }
2199 
2200 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2201 {
2202     /*
2203      * We set the replay callback even if fast commit is disabled because we
2204      * could still have fast commit blocks that need to be replayed even if
2205      * fast commit has now been turned off.
2206      */
2207     journal->j_fc_replay_callback = ext4_fc_replay;
2208     if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2209         return;
2210     journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2211 }
2212 
2213 static const char *fc_ineligible_reasons[] = {
2214     "Extended attributes changed",
2215     "Cross rename",
2216     "Journal flag changed",
2217     "Insufficient memory",
2218     "Swap boot",
2219     "Resize",
2220     "Dir renamed",
2221     "Falloc range op",
2222     "Data journalling",
2223     "FC Commit Failed"
2224 };
2225 
2226 int ext4_fc_info_show(struct seq_file *seq, void *v)
2227 {
2228     struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2229     struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2230     int i;
2231 
2232     if (v != SEQ_START_TOKEN)
2233         return 0;
2234 
2235     seq_printf(seq,
2236         "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2237            stats->fc_num_commits, stats->fc_ineligible_commits,
2238            stats->fc_numblks,
2239            div_u64(stats->s_fc_avg_commit_time, 1000));
2240     seq_puts(seq, "Ineligible reasons:\n");
2241     for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2242         seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2243             stats->fc_ineligible_reason_count[i]);
2244 
2245     return 0;
2246 }
2247 
2248 int __init ext4_fc_init_dentry_cache(void)
2249 {
2250     ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2251                        SLAB_RECLAIM_ACCOUNT);
2252 
2253     if (ext4_fc_dentry_cachep == NULL)
2254         return -ENOMEM;
2255 
2256     return 0;
2257 }
2258 
2259 void ext4_fc_destroy_dentry_cache(void)
2260 {
2261     kmem_cache_destroy(ext4_fc_dentry_cachep);
2262 }