// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * io.c
 *
 * Buffer cache handling
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/bio.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"

/*
 * Bits on bh->b_state used by ocfs2.
 *
 * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
 */
enum ocfs2_state_bits {
    BH_NeedsValidate = BH_JBDPrivateStart,
};

/* Expand the magic b_state functions */
BUFFER_FNS(NeedsValidate, needs_validate);
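/*
 * Note: BUFFER_FNS(NeedsValidate, needs_validate) expands into the
 * buffer_needs_validate(), set_buffer_needs_validate() and
 * clear_buffer_needs_validate() helpers used by ocfs2_read_blocks()
 * below to remember that a newly read block still has to be passed
 * through its validate() callback.
 */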

int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
              struct ocfs2_caching_info *ci)
{
    int ret = 0;

    trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);

    BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
    BUG_ON(buffer_jbd(bh));

    /* No need to check for a soft readonly file system here.
     * Non-journalled writes are only ever done on system files,
     * which can get modified during recovery even if read-only. */
    if (ocfs2_is_hard_readonly(osb)) {
        ret = -EROFS;
        mlog_errno(ret);
        goto out;
    }

    ocfs2_metadata_cache_io_lock(ci);

    lock_buffer(bh);
    set_buffer_uptodate(bh);

    /* remove from dirty list before I/O. */
    clear_buffer_dirty(bh);

    get_bh(bh); /* for end_buffer_write_sync() */
    bh->b_end_io = end_buffer_write_sync;
    submit_bh(REQ_OP_WRITE, bh);

    wait_on_buffer(bh);

    if (buffer_uptodate(bh)) {
        ocfs2_set_buffer_uptodate(ci, bh);
    } else {
        /* We don't need to remove the clustered uptodate
         * information for this bh as it's not marked locally
         * uptodate. */
        ret = -EIO;
        mlog_errno(ret);
    }

    ocfs2_metadata_cache_io_unlock(ci);
out:
    return ret;
}

/* Caller must provide a bhs[] with either all NULL or all non-NULL
 * entries, so that read failures are easier to handle.
 */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
               unsigned int nr, struct buffer_head *bhs[])
{
    int status = 0;
    unsigned int i;
    struct buffer_head *bh;
    int new_bh = 0;

    trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);

    if (!nr)
        goto bail;

    /* Don't put a buffer head and re-assign it to NULL if it was
     * allocated outside, since the caller can't be aware of this
     * alteration!
     */
    new_bh = (bhs[0] == NULL);

    for (i = 0 ; i < nr ; i++) {
        if (bhs[i] == NULL) {
            bhs[i] = sb_getblk(osb->sb, block++);
            if (bhs[i] == NULL) {
                status = -ENOMEM;
                mlog_errno(status);
                break;
            }
        }
        bh = bhs[i];

        if (buffer_jbd(bh)) {
            trace_ocfs2_read_blocks_sync_jbd(
                    (unsigned long long)bh->b_blocknr);
            continue;
        }

        if (buffer_dirty(bh)) {
            /* This should probably be a BUG, or
             * at least return an error. */
            mlog(ML_ERROR,
                 "trying to sync read a dirty "
                 "buffer! (blocknr = %llu), skipping\n",
                 (unsigned long long)bh->b_blocknr);
            continue;
        }

        lock_buffer(bh);
        if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
            mlog(ML_ERROR,
                 "block %llu had the JBD bit set "
                 "while I was in lock_buffer!",
                 (unsigned long long)bh->b_blocknr);
            BUG();
#else
            unlock_buffer(bh);
            continue;
#endif
        }

        get_bh(bh); /* for end_buffer_read_sync() */
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, bh);
    }

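    /*
     * Second pass: wait for the reads issued above, walking the array
     * in reverse submission order.  If anything failed (an allocation
     * above, or an I/O error detected below), buffer heads allocated by
     * this function are put and their entries reset to NULL, while
     * caller-provided buffers only have their uptodate bit cleared.
     */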
read_failure:
    for (i = nr; i > 0; i--) {
        bh = bhs[i - 1];

        if (unlikely(status)) {
            if (new_bh && bh) {
                /* If a middle bh fails, let the previous bh
                 * finish its read and then put it to
                 * avoid a bh leak.
                 */
                if (!buffer_jbd(bh))
                    wait_on_buffer(bh);
                put_bh(bh);
                bhs[i - 1] = NULL;
            } else if (bh && buffer_uptodate(bh)) {
                clear_buffer_uptodate(bh);
            }
            continue;
        }

        /* No need to wait on the buffer if it's managed by JBD. */
        if (!buffer_jbd(bh))
            wait_on_buffer(bh);

        if (!buffer_uptodate(bh)) {
            /* Status won't be cleared from here on out,
             * so we can safely record this and loop back
             * to cleanup the other buffers. */
            status = -EIO;
            goto read_failure;
        }
    }

bail:
    return status;
}
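
/*
 * Usage sketch (hypothetical caller): the bhs[] contract above means the
 * entries must be either all NULL (this function allocates the buffer
 * heads) or all non-NULL (the caller already holds them), e.g.:
 *
 *     struct buffer_head *bhs[2] = { NULL, NULL };
 *     int rc = ocfs2_read_blocks_sync(osb, blkno, 2, bhs);
 *
 *     if (!rc) {
 *             ... use bhs[0]->b_data, bhs[1]->b_data ...
 *             brelse(bhs[0]);
 *             brelse(bhs[1]);
 *     }
 *
 * On failure, any entries this function allocated have already been put
 * and reset to NULL, so the caller only needs to release what it passed in.
 */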

/* Caller must provide a bhs[] with either all NULL or all non-NULL
 * entries, so that read failures are easier to handle.
 */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
              struct buffer_head *bhs[], int flags,
              int (*validate)(struct super_block *sb,
                      struct buffer_head *bh))
{
    int status = 0;
    int i, ignore_cache = 0;
    struct buffer_head *bh;
    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
    int new_bh = 0;

    trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);

    BUG_ON(!ci);
    BUG_ON((flags & OCFS2_BH_READAHEAD) &&
           (flags & OCFS2_BH_IGNORE_CACHE));

    if (bhs == NULL) {
        status = -EINVAL;
        mlog_errno(status);
        goto bail;
    }

    if (nr < 0) {
        mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
        status = -EINVAL;
        mlog_errno(status);
        goto bail;
    }

    if (nr == 0) {
        status = 0;
        goto bail;
    }

    /* Don't put a buffer head and re-assign it to NULL if it was
     * allocated outside, since the caller can't be aware of this
     * alteration!
     */
    new_bh = (bhs[0] == NULL);

    ocfs2_metadata_cache_io_lock(ci);
    for (i = 0 ; i < nr ; i++) {
        if (bhs[i] == NULL) {
            bhs[i] = sb_getblk(sb, block++);
            if (bhs[i] == NULL) {
                ocfs2_metadata_cache_io_unlock(ci);
                status = -ENOMEM;
                mlog_errno(status);
                /* Don't forget to put the previous bhs! */
                break;
            }
        }
        bh = bhs[i];
        ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);

        /* There are three read-ahead cases here which we need to
         * be concerned with. All three assume a buffer has
         * previously been submitted with OCFS2_BH_READAHEAD
         * and it hasn't yet completed I/O.
         *
         * 1) The current request is sync to disk. This rarely
         *    happens these days, and never when performance
         *    matters - the code can just wait on the buffer
         *    lock and re-submit.
         *
         * 2) The current request is cached, but not
         *    readahead. ocfs2_buffer_uptodate() will return
         *    false anyway, so we'll wind up waiting on the
         *    buffer lock to do I/O. We re-check the request
         *    after getting the lock to avoid a re-submit.
         *
         * 3) The current request is readahead (and so must
         *    also be a caching one). We short circuit if the
         *    buffer is locked (under I/O) and if it's in the
         *    uptodate cache. The re-check from #2 catches the
         *    case that the previous read-ahead completes just
         *    before our is-it-in-flight check.
         */

        if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
            trace_ocfs2_read_blocks_from_disk(
                 (unsigned long long)bh->b_blocknr,
                 (unsigned long long)ocfs2_metadata_cache_owner(ci));
            /* We're using ignore_cache here to say
             * "go to disk" */
            ignore_cache = 1;
        }

        trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
            ignore_cache, buffer_jbd(bh), buffer_dirty(bh));

        if (buffer_jbd(bh)) {
            continue;
        }

        if (ignore_cache) {
            if (buffer_dirty(bh)) {
                /* This should probably be a BUG, or
                 * at least return an error. */
                continue;
            }

            /* A read-ahead request was made - if the
             * buffer is already under read-ahead from a
             * previously submitted request, then we are
             * done here. */
            if ((flags & OCFS2_BH_READAHEAD)
                && ocfs2_buffer_read_ahead(ci, bh))
                continue;

            lock_buffer(bh);
            if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
                mlog(ML_ERROR, "block %llu had the JBD bit set "
                           "while I was in lock_buffer!",
                     (unsigned long long)bh->b_blocknr);
                BUG();
#else
                unlock_buffer(bh);
                continue;
#endif
            }

            /* Re-check ocfs2_buffer_uptodate() as a
             * previously read-ahead buffer may have
             * completed I/O while we were waiting for the
             * buffer lock. */
            if (!(flags & OCFS2_BH_IGNORE_CACHE)
                && !(flags & OCFS2_BH_READAHEAD)
                && ocfs2_buffer_uptodate(ci, bh)) {
                unlock_buffer(bh);
                continue;
            }

            get_bh(bh); /* for end_buffer_read_sync() */
            if (validate)
                set_buffer_needs_validate(bh);
            bh->b_end_io = end_buffer_read_sync;
            submit_bh(REQ_OP_READ, bh);
            continue;
        }
    }

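    /*
     * Completion pass: walk the array in reverse, wait for the I/O
     * issued above, run the optional validate() callback on each block
     * that was actually read, and finally insert every buffer into the
     * uptodate cache.  A failure sets status and jumps back to
     * read_failure so the remaining buffers get unwound too.
     */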
read_failure:
    for (i = (nr - 1); i >= 0; i--) {
        bh = bhs[i];

        if (!(flags & OCFS2_BH_READAHEAD)) {
            if (unlikely(status)) {
                /* Clear the buffers on error, including any
                 * that had already been read successfully.
                 */
                if (new_bh && bh) {
                    /* If a middle bh fails, let the previous bh
                     * finish its read and then put it to
                     * avoid a bh leak.
                     */
                    if (!buffer_jbd(bh))
                        wait_on_buffer(bh);
                    put_bh(bh);
                    bhs[i] = NULL;
                } else if (bh && buffer_uptodate(bh)) {
                    clear_buffer_uptodate(bh);
                }
                continue;
            }
            /* We know this can't have changed as we hold the
             * owner sem. Avoid doing any work on the bh if the
             * journal has it. */
            if (!buffer_jbd(bh))
                wait_on_buffer(bh);

            if (!buffer_uptodate(bh)) {
                /* Status won't be cleared from here on out,
                 * so we can safely record this and loop back
                 * to cleanup the other buffers. Don't need to
                 * remove the clustered uptodate information
                 * for this bh as it's not marked locally
                 * uptodate. */
                status = -EIO;
                clear_buffer_needs_validate(bh);
                goto read_failure;
            }

            if (buffer_needs_validate(bh)) {
                /* We never set NeedsValidate if the
                 * buffer was held by the journal, so
                 * that better not have changed. */
                BUG_ON(buffer_jbd(bh));
                clear_buffer_needs_validate(bh);
                status = validate(sb, bh);
                if (status)
                    goto read_failure;
            }
        }

        /* Always set the buffer in the cache, even if it was
         * a forced read, or read-ahead which hasn't yet
         * completed. */
        ocfs2_set_buffer_uptodate(ci, bh);
    }
    ocfs2_metadata_cache_io_unlock(ci);

    trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
                    flags, ignore_cache);

bail:

    return status;
}
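
/*
 * Callers of ocfs2_read_blocks() typically pass a metadata-specific
 * validate() callback - for example, ocfs2_validate_inode_block() in
 * inode.c, which checks the block's signature and ECC.  The callback
 * only runs for blocks that were actually submitted for I/O here:
 * cached and journalled buffers never get NeedsValidate set, so they
 * skip validation on re-read.
 */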

/* Check whether the blkno is the super block or one of the backups. */
static void ocfs2_check_super_or_backup(struct super_block *sb,
                    sector_t blkno)
{
    int i;
    u64 backup_blkno;

    if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
        return;

    for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
        backup_blkno = ocfs2_backup_super_blkno(sb, i);
        if (backup_blkno == blkno)
            return;
    }

    BUG();
}

/*
 * Writing the super block and its backups doesn't need to collaborate
 * with the journal, so we don't need to lock ip_io_mutex and ci doesn't
 * need to be passed into this function.
 */
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                struct buffer_head *bh)
{
    int ret = 0;
    struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

    BUG_ON(buffer_jbd(bh));
    ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);

    if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
        ret = -EROFS;
        mlog_errno(ret);
        goto out;
    }

    lock_buffer(bh);
    set_buffer_uptodate(bh);

    /* remove from dirty list before I/O. */
    clear_buffer_dirty(bh);

    get_bh(bh); /* for end_buffer_write_sync() */
    bh->b_end_io = end_buffer_write_sync;
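    /*
     * This write bypasses the journal, so refresh the metaecc check
     * field by hand before submitting; journalled metadata normally
     * gets its ECC recomputed when the journal commits the block.
     */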
    ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
    submit_bh(REQ_OP_WRITE, bh);

    wait_on_buffer(bh);

    if (!buffer_uptodate(bh)) {
        ret = -EIO;
        mlog_errno(ret);
    }

out:
    return ret;
}