// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
    struct buffer_head *orig_bh = bh->b_private;

    BUFFER_TRACE(bh, "");
    if (uptodate)
        set_buffer_uptodate(bh);
    else
        clear_buffer_uptodate(bh);
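    /*
     * If this temporary buffer shadows a metadata buffer still attached
     * to the committing transaction, clear BH_Shadow on the original and
     * wake any waiter (see do_get_write_access()) now that the journal
     * write of the shadow copy has completed.
     */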
    if (orig_bh) {
        clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
        smp_mb__after_atomic();
        wake_up_bit(&orig_bh->b_state, BH_Shadow);
    }
    unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
    struct folio *folio;
    struct page *page;

    if (buffer_dirty(bh))
        goto nope;
    if (atomic_read(&bh->b_count) != 1)
        goto nope;
    page = bh->b_page;
    if (!page)
        goto nope;
    folio = page_folio(page);
    if (folio->mapping)
        goto nope;

    /* OK, it's a truncated page */
    if (!folio_trylock(folio))
        goto nope;

    folio_get(folio);
    __brelse(bh);
    try_to_free_buffers(folio);
    folio_unlock(folio);
    folio_put(folio);
    return;

nope:
    __brelse(bh);
}

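/*
 * Compute the checksum stored in the commit block when the v2/v3 checksum
 * feature is enabled: the whole block is checksummed with the checksum
 * fields zeroed, and the result lands in h_chksum[0].
 */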
static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
    struct commit_header *h;
    __u32 csum;

    if (!jbd2_journal_has_csum_v2or3(j))
        return;

    h = (struct commit_header *)(bh->b_data);
    h->h_chksum_type = 0;
    h->h_chksum_size = 0;
    h->h_chksum[0] = 0;
    csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
    h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                    transaction_t *commit_transaction,
                    struct buffer_head **cbh,
                    __u32 crc32_sum)
{
    struct commit_header *tmp;
    struct buffer_head *bh;
    int ret;
    struct timespec64 now;

    *cbh = NULL;

    if (is_journal_aborted(journal))
        return 0;

    bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
                        JBD2_COMMIT_BLOCK);
    if (!bh)
        return 1;

    tmp = (struct commit_header *)bh->b_data;
    ktime_get_coarse_real_ts64(&now);
    tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
    tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

    if (jbd2_has_feature_checksum(journal)) {
        tmp->h_chksum_type  = JBD2_CRC32_CHKSUM;
        tmp->h_chksum_size  = JBD2_CRC32_CHKSUM_SIZE;
        tmp->h_chksum[0]    = cpu_to_be32(crc32_sum);
    }
    jbd2_commit_block_csum_set(journal, bh);

    BUFFER_TRACE(bh, "submit commit block");
    lock_buffer(bh);
    clear_buffer_dirty(bh);
    set_buffer_uptodate(bh);
    bh->b_end_io = journal_end_buffer_io_sync;

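    /*
     * With barriers enabled and no async-commit checksum to protect us,
     * the commit block must not reach the media before the blocks it
     * commits, so issue it with a cache preflush and FUA.
     */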
    if (journal->j_flags & JBD2_BARRIER &&
        !jbd2_has_feature_async_commit(journal))
        ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH |
                REQ_FUA, bh);
    else
        ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);

    *cbh = bh;
    return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
                     struct buffer_head *bh)
{
    int ret = 0;

    clear_buffer_dirty(bh);
    wait_on_buffer(bh);

    if (unlikely(!buffer_uptodate(bh)))
        ret = -EIO;
    put_bh(bh);            /* One for getblk() */

    return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc, and we don't
 * use writepages() because with delayed allocation writepages() may
 * allocate blocks.
 */
int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
    struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
    struct writeback_control wbc = {
        .sync_mode =  WB_SYNC_ALL,
        .nr_to_write = mapping->nrpages * 2,
        .range_start = jinode->i_dirty_start,
        .range_end = jinode->i_dirty_end,
    };

    /*
     * Submit the inode data buffers. We use writepage instead of
     * writepages because writepages can do block allocation with
     * delalloc, and we must write only already-allocated blocks here.
     */
    return generic_writepages(mapping, &wbc);
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(struct jbd2_inode *jinode)
{
    if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
        return 0;

    trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
    return jbd2_journal_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

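/*
 * Wait for the inode's data in the committed dirty range to reach disk,
 * preserving rather than clearing any I/O error for later reporting.
 */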
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
    if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
        !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
        return 0;
    return filemap_fdatawait_range_keep_errors(
        jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
        jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction, so no new inode can be added to our
 * inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we are
 * currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
        transaction_t *commit_transaction)
{
    struct jbd2_inode *jinode;
    int err, ret = 0;

    spin_lock(&journal->j_list_lock);
    list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
        if (!(jinode->i_flags & JI_WRITE_DATA))
            continue;
        jinode->i_flags |= JI_COMMIT_RUNNING;
        spin_unlock(&journal->j_list_lock);
        /* submit the inode data buffers. */
        trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
        if (journal->j_submit_inode_data_buffers) {
            err = journal->j_submit_inode_data_buffers(jinode);
            if (!ret)
                ret = err;
        }
        spin_lock(&journal->j_list_lock);
        J_ASSERT(jinode->i_transaction == commit_transaction);
        jinode->i_flags &= ~JI_COMMIT_RUNNING;
        smp_mb();
        wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
    }
    spin_unlock(&journal->j_list_lock);
    return ret;
}

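/*
 * Wait for writeback in the inode's dirty range, keeping any error
 * recorded in the mapping so the commit path can report it.  This is
 * typically installed as the journal's ->j_finish_inode_data_buffers
 * callback, as the counterpart to
 * jbd2_journal_submit_inode_data_buffers() above.
 */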
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
    struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

    return filemap_fdatawait_range_keep_errors(mapping,
                           jinode->i_dirty_start,
                           jinode->i_dirty_end);
}

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
        transaction_t *commit_transaction)
{
    struct jbd2_inode *jinode, *next_i;
    int err, ret = 0;

    /* For locking, see the comment in journal_submit_data_buffers() */
    spin_lock(&journal->j_list_lock);
    list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
        if (!(jinode->i_flags & JI_WAIT_DATA))
            continue;
        jinode->i_flags |= JI_COMMIT_RUNNING;
        spin_unlock(&journal->j_list_lock);
        /* wait for writeout of the inode data buffers. */
        if (journal->j_finish_inode_data_buffers) {
            err = journal->j_finish_inode_data_buffers(jinode);
            if (!ret)
                ret = err;
        }
        spin_lock(&journal->j_list_lock);
        jinode->i_flags &= ~JI_COMMIT_RUNNING;
        smp_mb();
        wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
    }

    /* Now refile inodes to proper lists */
    list_for_each_entry_safe(jinode, next_i,
                 &commit_transaction->t_inode_list, i_list) {
        list_del(&jinode->i_list);
        if (jinode->i_next_transaction) {
            jinode->i_transaction = jinode->i_next_transaction;
            jinode->i_next_transaction = NULL;
            list_add(&jinode->i_list,
                &jinode->i_transaction->t_inode_list);
        } else {
            jinode->i_transaction = NULL;
            jinode->i_dirty_start = 0;
            jinode->i_dirty_end = 0;
        }
    }
    spin_unlock(&journal->j_list_lock);

    return ret;
}

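/*
 * Fold the contents of bh into the running crc32 used by the
 * JBD2_FEATURE_COMPAT_CHECKSUM commit block.  The page is mapped with
 * kmap_atomic() because it may live in highmem.
 */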
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
    struct page *page = bh->b_page;
    char *addr;
    __u32 checksum;

    addr = kmap_atomic(page);
    checksum = crc32_be(crc32_sum,
        (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
    kunmap_atomic(addr);

    return checksum;
}

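/*
 * Record a block number in an on-disk tag.  The low 32 bits always go in
 * t_blocknr; with the 64bit feature the high bits go in t_blocknr_high.
 * The shift is written as (block >> 31) >> 1, presumably so it stays
 * well-defined even where a 32-bit block type would make ">> 32"
 * undefined behaviour.
 */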
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
                   unsigned long long block)
{
    tag->t_blocknr = cpu_to_be32(block & (u32)~0);
    if (jbd2_has_feature_64bit(j))
        tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

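/*
 * Checksum a tag's block: the v2/v3 checksum covers the big-endian
 * transaction sequence number followed by the buffer contents.  csum3
 * stores the full 32 bits; csum2 truncates the result to 16 bits.
 */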
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
                    struct buffer_head *bh, __u32 sequence)
{
    journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
    struct page *page = bh->b_page;
    __u8 *addr;
    __u32 csum32;
    __be32 seq;

    if (!jbd2_journal_has_csum_v2or3(j))
        return;

    seq = cpu_to_be32(sequence);
    addr = kmap_atomic(page);
    csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
    csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
                 bh->b_size);
    kunmap_atomic(addr);

    if (jbd2_has_feature_csum3(j))
        tag3->t_checksum = cpu_to_be32(csum32);
    else
        tag->t_checksum = cpu_to_be16(csum32);
}
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
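 *
 * Roughly, the commit walks the numbered phases traced below: lock down
 * the running transaction (phase 1), submit data buffers and the revoke,
 * descriptor and metadata blocks (phases 2a/2b), wait for that IO to
 * complete (phases 3 and 4), write the commit record (phase 5), do
 * checkpoint processing for the forget list (phase 6), and finish with
 * statistics and cleanup (phase 7).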
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
    struct transaction_stats_s stats;
    transaction_t *commit_transaction;
    struct journal_head *jh;
    struct buffer_head *descriptor;
    struct buffer_head **wbuf = journal->j_wbuf;
    int bufs;
    int flags;
    int err;
    unsigned long long blocknr;
    ktime_t start_time;
    u64 commit_time;
    char *tagp = NULL;
    journal_block_tag_t *tag = NULL;
    int space_left = 0;
    int first_tag = 0;
    int tag_flag;
    int i;
    int tag_bytes = journal_tag_bytes(journal);
    struct buffer_head *cbh = NULL; /* For transactional checksums */
    __u32 crc32_sum = ~0;
    struct blk_plug plug;
    /* Tail of the journal */
    unsigned long first_block;
    tid_t first_tid;
    int update_tail;
    int csum_size = 0;
    LIST_HEAD(io_bufs);
    LIST_HEAD(log_bufs);

    if (jbd2_journal_has_csum_v2or3(journal))
        csum_size = sizeof(struct jbd2_journal_block_tail);

    /*
     * First job: lock down the current transaction and wait for
     * all outstanding updates to complete.
     */

    /* Do we need to erase the effects of a prior jbd2_journal_flush? */
    if (journal->j_flags & JBD2_FLUSHED) {
        jbd2_debug(3, "super block updated\n");
        mutex_lock_io(&journal->j_checkpoint_mutex);
        /*
         * We hold j_checkpoint_mutex so tail cannot change under us.
         * We don't need any special data guarantees for writing sb
         * since journal is empty and it is ok for write to be
         * flushed only with transaction commit.
         */
        jbd2_journal_update_sb_log_tail(journal,
                        journal->j_tail_sequence,
                        journal->j_tail,
                        REQ_SYNC);
        mutex_unlock(&journal->j_checkpoint_mutex);
    } else {
        jbd2_debug(3, "superblock not updated\n");
    }

    J_ASSERT(journal->j_running_transaction != NULL);
    J_ASSERT(journal->j_committing_transaction == NULL);

    write_lock(&journal->j_state_lock);
    journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
    while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
        DEFINE_WAIT(wait);

        prepare_to_wait(&journal->j_fc_wait, &wait,
                TASK_UNINTERRUPTIBLE);
        write_unlock(&journal->j_state_lock);
        schedule();
        write_lock(&journal->j_state_lock);
        finish_wait(&journal->j_fc_wait, &wait);
        /*
         * TODO: by blocking fast commits here, we are increasing
         * fsync() latency slightly. Strictly speaking, we don't need
         * to block fast commits until the transaction enters T_FLUSH
         * state. So an optimization is possible where we block new fast
         * commits here and wait for existing ones to complete
         * just before we enter T_FLUSH. That way, the existing fast
         * commits and this full commit could proceed in parallel.
         */
    }
    write_unlock(&journal->j_state_lock);

    commit_transaction = journal->j_running_transaction;

    trace_jbd2_start_commit(journal, commit_transaction);
    jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
            commit_transaction->t_tid);

    write_lock(&journal->j_state_lock);
    journal->j_fc_off = 0;
    J_ASSERT(commit_transaction->t_state == T_RUNNING);
    commit_transaction->t_state = T_LOCKED;

    trace_jbd2_commit_locking(journal, commit_transaction);
    stats.run.rs_wait = commit_transaction->t_max_wait;
    stats.run.rs_request_delay = 0;
    stats.run.rs_locked = jiffies;
    if (commit_transaction->t_requested)
        stats.run.rs_request_delay =
            jbd2_time_diff(commit_transaction->t_requested,
                       stats.run.rs_locked);
    stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                          stats.run.rs_locked);

    // waits for any t_updates to finish
    jbd2_journal_wait_updates(journal);

    commit_transaction->t_state = T_SWITCH;

    J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
            journal->j_max_transaction_buffers);

    /*
     * First thing we are allowed to do is to discard any remaining
     * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
     * that there are no such buffers: if a large filesystem
     * operation like a truncate needs to split itself over multiple
     * transactions, then it may try to do a jbd2_journal_restart() while
     * there are still BJ_Reserved buffers outstanding.  These must
     * be released cleanly from the current transaction.
     *
     * In this case, the filesystem must still reserve write access
     * again before modifying the buffer in the new transaction, but
     * we do not require it to remember exactly which old buffers it
     * has reserved.  This is consistent with the existing behaviour
     * that multiple jbd2_journal_get_write_access() calls to the same
     * buffer are perfectly permissible.
     * We use journal->j_state_lock here to serialize processing of
     * t_reserved_list with eviction of buffers from journal_unmap_buffer().
     */
    while (commit_transaction->t_reserved_list) {
        jh = commit_transaction->t_reserved_list;
        JBUFFER_TRACE(jh, "reserved, unused: refile");
        /*
         * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
         * leave undo-committed data.
         */
        if (jh->b_committed_data) {
            struct buffer_head *bh = jh2bh(jh);

            spin_lock(&jh->b_state_lock);
            jbd2_free(jh->b_committed_data, bh->b_size);
            jh->b_committed_data = NULL;
            spin_unlock(&jh->b_state_lock);
        }
        jbd2_journal_refile_buffer(journal, jh);
    }

    write_unlock(&journal->j_state_lock);
    /*
     * Now try to drop any written-back buffers from the journal's
     * checkpoint lists.  We do this *before* commit because it potentially
     * frees some memory
     */
    spin_lock(&journal->j_list_lock);
    __jbd2_journal_clean_checkpoint_list(journal, false);
    spin_unlock(&journal->j_list_lock);

    jbd2_debug(3, "JBD2: commit phase 1\n");

    /*
     * Clear the revoked flag to reflect that there are no revoked buffers
     * in the next transaction which is going to be started.
     */
    jbd2_clear_buffer_revoked_flags(journal);

    /*
     * Switch to a new revoke table.
     */
    jbd2_journal_switch_revoke_table(journal);

    write_lock(&journal->j_state_lock);
    /*
     * Reserved credits cannot be claimed anymore, free them
     */
    atomic_sub(atomic_read(&journal->j_reserved_credits),
           &commit_transaction->t_outstanding_credits);

    trace_jbd2_commit_flushing(journal, commit_transaction);
    stats.run.rs_flushing = jiffies;
    stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
                         stats.run.rs_flushing);

    commit_transaction->t_state = T_FLUSH;
    journal->j_committing_transaction = commit_transaction;
    journal->j_running_transaction = NULL;
    start_time = ktime_get();
    commit_transaction->t_log_start = journal->j_head;
    wake_up(&journal->j_wait_transaction_locked);
    write_unlock(&journal->j_state_lock);

    jbd2_debug(3, "JBD2: commit phase 2a\n");

    /*
     * Now start flushing things to disk, in the order they appear
     * on the transaction lists.  Data blocks go first.
     */
    err = journal_submit_data_buffers(journal, commit_transaction);
    if (err)
        jbd2_journal_abort(journal, err);

    blk_start_plug(&plug);
    jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

    jbd2_debug(3, "JBD2: commit phase 2b\n");

    /*
     * Way to go: we have now written out all of the data for a
     * transaction!  Now comes the tricky part: we need to write out
     * metadata.  Loop over the transaction's entire buffer list:
     */
    write_lock(&journal->j_state_lock);
    commit_transaction->t_state = T_COMMIT;
    write_unlock(&journal->j_state_lock);

    trace_jbd2_commit_logging(journal, commit_transaction);
    stats.run.rs_logging = jiffies;
    stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                           stats.run.rs_logging);
    stats.run.rs_blocks = commit_transaction->t_nr_buffers;
    stats.run.rs_blocks_logged = 0;

    J_ASSERT(commit_transaction->t_nr_buffers <=
         atomic_read(&commit_transaction->t_outstanding_credits));

    err = 0;
    bufs = 0;
    descriptor = NULL;
    while (commit_transaction->t_buffers) {

        /* Find the next buffer to be journaled... */

        jh = commit_transaction->t_buffers;

        /* If we're in abort mode, we just un-journal the buffer and
           release it. */

        if (is_journal_aborted(journal)) {
            clear_buffer_jbddirty(jh2bh(jh));
            JBUFFER_TRACE(jh, "journal is aborting: refile");
            jbd2_buffer_abort_trigger(jh,
                          jh->b_frozen_data ?
                          jh->b_frozen_triggers :
                          jh->b_triggers);
            jbd2_journal_refile_buffer(journal, jh);
            /* If that was the last one, we need to clean up
             * any descriptor buffers which may have been
             * already allocated, even if we are now
             * aborting. */
            if (!commit_transaction->t_buffers)
                goto start_journal_io;
            continue;
        }

        /* Make sure we have a descriptor block in which to
           record the metadata buffer. */

        if (!descriptor) {
            J_ASSERT (bufs == 0);

            jbd2_debug(4, "JBD2: get descriptor\n");

            descriptor = jbd2_journal_get_descriptor_buffer(
                            commit_transaction,
                            JBD2_DESCRIPTOR_BLOCK);
            if (!descriptor) {
                jbd2_journal_abort(journal, -EIO);
                continue;
            }

            jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
                (unsigned long long)descriptor->b_blocknr,
                descriptor->b_data);
            tagp = &descriptor->b_data[sizeof(journal_header_t)];
            space_left = descriptor->b_size -
                        sizeof(journal_header_t);
            first_tag = 1;
            set_buffer_jwrite(descriptor);
            set_buffer_dirty(descriptor);
            wbuf[bufs++] = descriptor;

            /* Record it so that we can wait for IO
                           completion later */
            BUFFER_TRACE(descriptor, "ph3: file as descriptor");
            jbd2_file_log_bh(&log_bufs, descriptor);
        }

        /* Where is the buffer to be written? */

        err = jbd2_journal_next_log_block(journal, &blocknr);
        /* If the block mapping failed, just abandon the buffer
           and repeat this loop: we'll fall into the
           refile-on-abort condition above. */
        if (err) {
            jbd2_journal_abort(journal, err);
            continue;
        }

        /*
         * start_this_handle() uses t_outstanding_credits to determine
         * the free space in the log.
         */
        atomic_dec(&commit_transaction->t_outstanding_credits);

        /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
        atomic_inc(&jh2bh(jh)->b_count);

        /*
         * Make a temporary IO buffer with which to write it out
         * (this will requeue the metadata buffer to BJ_Shadow).
         */
        set_bit(BH_JWrite, &jh2bh(jh)->b_state);
        JBUFFER_TRACE(jh, "ph3: write metadata");
        flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                        jh, &wbuf[bufs], blocknr);
        if (flags < 0) {
            jbd2_journal_abort(journal, flags);
            continue;
        }
        jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

        /* Record the new block's tag in the current descriptor
                   buffer */

        tag_flag = 0;
        if (flags & 1)
            tag_flag |= JBD2_FLAG_ESCAPE;
        if (!first_tag)
            tag_flag |= JBD2_FLAG_SAME_UUID;

        tag = (journal_block_tag_t *) tagp;
        write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
        tag->t_flags = cpu_to_be16(tag_flag);
        jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
                    commit_transaction->t_tid);
        tagp += tag_bytes;
        space_left -= tag_bytes;
        bufs++;

        if (first_tag) {
            memcpy (tagp, journal->j_uuid, 16);
            tagp += 16;
            space_left -= 16;
            first_tag = 0;
        }

        /* If there's no more to do, or if the descriptor is full,
           let the IO rip! */

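        /*
         * "Full" means there is no longer room for another tag, the
         * 16-byte UUID a first tag would need, and the descriptor
         * block checksum tail, if one is in use.
         */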
        if (bufs == journal->j_wbufsize ||
            commit_transaction->t_buffers == NULL ||
            space_left < tag_bytes + 16 + csum_size) {

            jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);

            /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

            tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
            if (descriptor)
                jbd2_descriptor_block_csum_set(journal,
                            descriptor);

            for (i = 0; i < bufs; i++) {
                struct buffer_head *bh = wbuf[i];
                /*
                 * Compute checksum.
                 */
                if (jbd2_has_feature_checksum(journal)) {
                    crc32_sum =
                        jbd2_checksum_data(crc32_sum, bh);
                }

                lock_buffer(bh);
                clear_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                bh->b_end_io = journal_end_buffer_io_sync;
                submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
            }
            cond_resched();

            /* Force a new descriptor to be generated next
                           time round the loop. */
            descriptor = NULL;
            bufs = 0;
        }
    }

    err = journal_finish_inode_data_buffers(journal, commit_transaction);
    if (err) {
        printk(KERN_WARNING
            "JBD2: Detected IO errors while flushing file data "
               "on %s\n", journal->j_devname);
        if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
            jbd2_journal_abort(journal, err);
        err = 0;
    }

    /*
     * Get current oldest transaction in the log before we issue flush
     * to the filesystem device. After the flush we can be sure that
     * blocks of all older transactions are checkpointed to persistent
     * storage and we will be safe to update journal start in the
     * superblock with the numbers we get here.
     */
    update_tail =
        jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

    write_lock(&journal->j_state_lock);
    if (update_tail) {
        long freed = first_block - journal->j_tail;

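        /*
         * The log is circular; if first_block wrapped around below the
         * current tail, add the size of the whole journal back in.
         */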
        if (first_block < journal->j_tail)
            freed += journal->j_last - journal->j_first;
        /* Update tail only if we free significant amount of space */
        if (freed < jbd2_journal_get_max_txn_bufs(journal))
            update_tail = 0;
    }
    J_ASSERT(commit_transaction->t_state == T_COMMIT);
    commit_transaction->t_state = T_COMMIT_DFLUSH;
    write_unlock(&journal->j_state_lock);

    /*
     * If the journal is not located on the file system device,
     * then we must flush the file system device before we issue
     * the commit record
     */
    if (commit_transaction->t_need_data_flush &&
        (journal->j_fs_dev != journal->j_dev) &&
        (journal->j_flags & JBD2_BARRIER))
        blkdev_issue_flush(journal->j_fs_dev);

    /* Done it all: now write the commit record asynchronously. */
    if (jbd2_has_feature_async_commit(journal)) {
        err = journal_submit_commit_record(journal, commit_transaction,
                         &cbh, crc32_sum);
        if (err)
            jbd2_journal_abort(journal, err);
    }

    blk_finish_plug(&plug);

    /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

       Wait for the buffers in reverse order.  That way we are
       less likely to be woken up until all IOs have completed, and
       so we incur less scheduling load.
    */

    jbd2_debug(3, "JBD2: commit phase 3\n");

    while (!list_empty(&io_bufs)) {
        struct buffer_head *bh = list_entry(io_bufs.prev,
                            struct buffer_head,
                            b_assoc_buffers);

        wait_on_buffer(bh);
        cond_resched();

        if (unlikely(!buffer_uptodate(bh)))
            err = -EIO;
        jbd2_unfile_log_bh(bh);
        stats.run.rs_blocks_logged++;

        /*
         * The list contains temporary buffer heads created by
         * jbd2_journal_write_metadata_buffer().
         */
        BUFFER_TRACE(bh, "dumping temporary bh");
        __brelse(bh);
        J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
        free_buffer_head(bh);

        /* We also have to refile the corresponding shadowed buffer */
        jh = commit_transaction->t_shadow_list->b_tprev;
        bh = jh2bh(jh);
        clear_buffer_jwrite(bh);
        J_ASSERT_BH(bh, buffer_jbddirty(bh));
        J_ASSERT_BH(bh, !buffer_shadow(bh));

        /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
        JBUFFER_TRACE(jh, "file as BJ_Forget");
        jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
        JBUFFER_TRACE(jh, "brelse shadowed buffer");
        __brelse(bh);
    }

    J_ASSERT (commit_transaction->t_shadow_list == NULL);

    jbd2_debug(3, "JBD2: commit phase 4\n");

    /* Here we wait for the revoke record and descriptor record buffers */
    while (!list_empty(&log_bufs)) {
        struct buffer_head *bh;

        bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
        wait_on_buffer(bh);
        cond_resched();

        if (unlikely(!buffer_uptodate(bh)))
            err = -EIO;

        BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
        clear_buffer_jwrite(bh);
        jbd2_unfile_log_bh(bh);
        stats.run.rs_blocks_logged++;
        __brelse(bh);       /* One for getblk */
        /* AKPM: bforget here */
    }

    if (err)
        jbd2_journal_abort(journal, err);

    jbd2_debug(3, "JBD2: commit phase 5\n");
    write_lock(&journal->j_state_lock);
    J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
    commit_transaction->t_state = T_COMMIT_JFLUSH;
    write_unlock(&journal->j_state_lock);

    if (!jbd2_has_feature_async_commit(journal)) {
        err = journal_submit_commit_record(journal, commit_transaction,
                        &cbh, crc32_sum);
        if (err)
            jbd2_journal_abort(journal, err);
    }
    if (cbh)
        err = journal_wait_on_commit_record(journal, cbh);
    stats.run.rs_blocks_logged++;
    if (jbd2_has_feature_async_commit(journal) &&
        journal->j_flags & JBD2_BARRIER) {
        blkdev_issue_flush(journal->j_dev);
    }

    if (err)
        jbd2_journal_abort(journal, err);

    WARN_ON_ONCE(
        atomic_read(&commit_transaction->t_outstanding_credits) < 0);

    /*
     * Now disk caches for filesystem device are flushed so we are safe to
     * erase checkpointed transactions from the log by updating journal
     * superblock.
     */
    if (update_tail)
        jbd2_update_log_tail(journal, first_tid, first_block);

    /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

    jbd2_debug(3, "JBD2: commit phase 6\n");

    J_ASSERT(list_empty(&commit_transaction->t_inode_list));
    J_ASSERT(commit_transaction->t_buffers == NULL);
    J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
    J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
    /*
     * As there are other places (journal_unmap_buffer()) adding buffers
     * to this list we have to be careful and hold the j_list_lock.
     */
    spin_lock(&journal->j_list_lock);
    while (commit_transaction->t_forget) {
        transaction_t *cp_transaction;
        struct buffer_head *bh;
        int try_to_free = 0;
        bool drop_ref;

        jh = commit_transaction->t_forget;
        spin_unlock(&journal->j_list_lock);
        bh = jh2bh(jh);
        /*
         * Get a reference so that bh cannot be freed before we are
         * done with it.
         */
        get_bh(bh);
        spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

        /*
         * If there is undo-protected committed data against
         * this buffer, then we can remove it now.  If it is a
         * buffer needing such protection, the old frozen_data
         * field now points to a committed version of the
         * buffer, so rotate that field to the new committed
         * data.
         *
         * Otherwise, we can just throw away the frozen data now.
         *
         * We also know that the frozen data has already fired
         * its triggers if they exist, so we can clear that too.
         */
        if (jh->b_committed_data) {
            jbd2_free(jh->b_committed_data, bh->b_size);
            jh->b_committed_data = NULL;
            if (jh->b_frozen_data) {
                jh->b_committed_data = jh->b_frozen_data;
                jh->b_frozen_data = NULL;
                jh->b_frozen_triggers = NULL;
            }
        } else if (jh->b_frozen_data) {
            jbd2_free(jh->b_frozen_data, bh->b_size);
            jh->b_frozen_data = NULL;
            jh->b_frozen_triggers = NULL;
        }

        spin_lock(&journal->j_list_lock);
        cp_transaction = jh->b_cp_transaction;
        if (cp_transaction) {
            JBUFFER_TRACE(jh, "remove from old cp transaction");
            cp_transaction->t_chp_stats.cs_dropped++;
            __jbd2_journal_remove_checkpoint(jh);
        }

        /* Only re-checkpoint the buffer_head if it is marked
         * dirty.  If the buffer was added to the BJ_Forget list
         * by jbd2_journal_forget, it may no longer be dirty and
         * there's no point in keeping a checkpoint record for
         * it. */

        /*
         * A buffer which has been freed while still being journaled by a
         * previous transaction is refiled to BJ_Forget of the running
         * transaction. If the just-committed transaction contains an
         * "add to orphan" operation, we can completely invalidate the
         * buffer now. We are rather thorough about that because the
         * buffer may still be accessible when blocksize < pagesize and
         * it is attached to the last partial page.
         */
        if (buffer_freed(bh) && !jh->b_next_transaction) {
            struct address_space *mapping;

            clear_buffer_freed(bh);
            clear_buffer_jbddirty(bh);

            /*
             * Block device buffers need to stay mapped all the
             * time, so it is enough to clear buffer_jbddirty and
             * buffer_freed bits. For the file mapping buffers (i.e.
             * journalled data) we need to unmap buffer and clear
             * more bits. We also need to be careful about the check
             * because the data page mapping can get cleared under
             * our hands. Note that if mapping == NULL, we don't
             * need to make buffer unmapped because the page is
             * already detached from the mapping and buffers cannot
             * get reused.
             */
            mapping = READ_ONCE(bh->b_page->mapping);
            if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
                clear_buffer_mapped(bh);
                clear_buffer_new(bh);
                clear_buffer_req(bh);
                bh->b_bdev = NULL;
            }
        }

        if (buffer_jbddirty(bh)) {
            JBUFFER_TRACE(jh, "add to new checkpointing trans");
            __jbd2_journal_insert_checkpoint(jh, commit_transaction);
            if (is_journal_aborted(journal))
                clear_buffer_jbddirty(bh);
        } else {
            J_ASSERT_BH(bh, !buffer_dirty(bh));
            /*
             * The buffer on BJ_Forget list and not jbddirty means
             * it has been freed by this transaction and hence it
             * could not have been reallocated until this
             * transaction has committed. *BUT* it could be
             * reallocated once we have written all the data to
             * disk and before we process the buffer on BJ_Forget
             * list.
             */
            if (!jh->b_next_transaction)
                try_to_free = 1;
        }
        JBUFFER_TRACE(jh, "refile or unfile buffer");
        drop_ref = __jbd2_journal_refile_buffer(jh);
        spin_unlock(&jh->b_state_lock);
        if (drop_ref)
            jbd2_journal_put_journal_head(jh);
        if (try_to_free)
            release_buffer_page(bh);    /* Drops bh reference */
        else
            __brelse(bh);
        cond_resched_lock(&journal->j_list_lock);
    }
    spin_unlock(&journal->j_list_lock);
    /*
     * This is a bit sleazy.  We use j_list_lock to protect transition
     * of a transaction into T_FINISHED state and calling
     * __jbd2_journal_drop_transaction(). Otherwise we could race with
     * other checkpointing code processing the transaction...
     */
    write_lock(&journal->j_state_lock);
    spin_lock(&journal->j_list_lock);
    /*
     * Now recheck if some buffers did not get attached to the transaction
     * while the lock was dropped...
     */
    if (commit_transaction->t_forget) {
        spin_unlock(&journal->j_list_lock);
        write_unlock(&journal->j_state_lock);
        goto restart_loop;
    }

    /* Add the transaction to the checkpoint list
     * __journal_remove_checkpoint() can not destroy transaction
     * under us because it is not marked as T_FINISHED yet */
    if (journal->j_checkpoint_transactions == NULL) {
        journal->j_checkpoint_transactions = commit_transaction;
        commit_transaction->t_cpnext = commit_transaction;
        commit_transaction->t_cpprev = commit_transaction;
    } else {
        commit_transaction->t_cpnext =
            journal->j_checkpoint_transactions;
        commit_transaction->t_cpprev =
            commit_transaction->t_cpnext->t_cpprev;
        commit_transaction->t_cpnext->t_cpprev =
            commit_transaction;
        commit_transaction->t_cpprev->t_cpnext =
                commit_transaction;
    }
    spin_unlock(&journal->j_list_lock);

    /* Done with this transaction! */

    jbd2_debug(3, "JBD2: commit phase 7\n");

    J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

    commit_transaction->t_start = jiffies;
    stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
                          commit_transaction->t_start);

    /*
     * File the transaction statistics
     */
    stats.ts_tid = commit_transaction->t_tid;
    stats.run.rs_handle_count =
        atomic_read(&commit_transaction->t_handle_count);
    trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                 commit_transaction->t_tid, &stats.run);
    stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

    commit_transaction->t_state = T_COMMIT_CALLBACK;
    J_ASSERT(commit_transaction == journal->j_committing_transaction);
    journal->j_commit_sequence = commit_transaction->t_tid;
    journal->j_committing_transaction = NULL;
    commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

    /*
     * weight the commit time higher than the average time so we don't
     * react too strongly to vast changes in the commit time
     */
    if (likely(journal->j_average_commit_time))
        journal->j_average_commit_time = (commit_time +
                journal->j_average_commit_time*3) / 4;
    else
        journal->j_average_commit_time = commit_time;

    write_unlock(&journal->j_state_lock);

    if (journal->j_commit_callback)
        journal->j_commit_callback(journal, commit_transaction);
    if (journal->j_fc_cleanup_callback)
        journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);

    trace_jbd2_end_commit(journal, commit_transaction);
    jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
          journal->j_commit_sequence, journal->j_tail_sequence);

    write_lock(&journal->j_state_lock);
    journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
    journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
    spin_lock(&journal->j_list_lock);
    commit_transaction->t_state = T_FINISHED;
    /* Check if the transaction can be dropped now that we are finished */
    if (commit_transaction->t_checkpoint_list == NULL &&
        commit_transaction->t_checkpoint_io_list == NULL) {
        __jbd2_journal_drop_transaction(journal, commit_transaction);
        jbd2_journal_free_transaction(commit_transaction);
    }
    spin_unlock(&journal->j_list_lock);
    write_unlock(&journal->j_state_lock);
    wake_up(&journal->j_wait_done_commit);
    wake_up(&journal->j_fc_wait);

    /*
     * Calculate overall stats
     */
    spin_lock(&journal->j_history_lock);
    journal->j_stats.ts_tid++;
    journal->j_stats.ts_requested += stats.ts_requested;
    journal->j_stats.run.rs_wait += stats.run.rs_wait;
    journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
    journal->j_stats.run.rs_running += stats.run.rs_running;
    journal->j_stats.run.rs_locked += stats.run.rs_locked;
    journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
    journal->j_stats.run.rs_logging += stats.run.rs_logging;
    journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
    journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
    journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
    spin_unlock(&journal->j_history_lock);
}