drivers/md/raid1.h

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef _RAID1_H
0003 #define _RAID1_H
0004
0005 /*
0006  * each barrier unit size is 64MB for now
0007  * note: it must be larger than RESYNC_DEPTH
0008  */
0009 #define BARRIER_UNIT_SECTOR_BITS    17    /* log2 of the barrier unit size, counted in sectors */
0010 #define BARRIER_UNIT_SECTOR_SIZE    (1<<17)    /* barrier unit size in sectors; the literal 17 duplicates BARRIER_UNIT_SECTOR_BITS, keep both in sync */
0011 /*
0012  * In struct r1conf, the following members are related to I/O barrier
0013  * buckets,
0014  *  atomic_t    *nr_pending;
0015  *  atomic_t    *nr_waiting;
0016  *  atomic_t    *nr_queued;
0017  *  atomic_t    *barrier;
0018  * Each of them points to an array of atomic_t variables; each array is
0019  * designed to have BARRIER_BUCKETS_NR elements and occupy a single
0020  * memory page. The data width of an atomic_t variable is 4 bytes, equal
0021  * to 1<<(ilog2(sizeof(atomic_t))). BARRIER_BUCKETS_NR_BITS is defined
0022  * as (PAGE_SHIFT - ilog2(sizeof(atomic_t))) to make sure an array of
0023  * atomic_t variables with BARRIER_BUCKETS_NR elements exactly
0024  * occupies a single memory page.
0025  */
0026 #define BARRIER_BUCKETS_NR_BITS     (PAGE_SHIFT - ilog2(sizeof(atomic_t)))    /* log2 of the number of barrier buckets per array */
0027 #define BARRIER_BUCKETS_NR      (1<<BARRIER_BUCKETS_NR_BITS)    /* number of atomic_t buckets in each barrier array */
0028
0029 /* Note: raid1_info.rdev can be set to NULL asynchronously by raid1_remove_disk.
0030  * There are three safe ways to access raid1_info.rdev.
0031  * 1/ when holding mddev->reconfig_mutex
0032  * 2/ when resync/recovery is known to be happening - i.e. in code that is
0033  *    called as part of performing resync/recovery.
0034  * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
0035  *    and if it is non-NULL, increment rdev->nr_pending before dropping the
0036  *    RCU lock.
0037  * When .rdev is set to NULL, the nr_pending count is checked again and if it
0038  * has been incremented, the pointer is put back in .rdev.
0039  */
0040
0041 struct raid1_info {    /* element type of the r1conf.mirrors array */
0042     struct md_rdev  *rdev;    /* may become NULL asynchronously; follow the access rules in the comment above */
0043     sector_t    head_position;    /* NOTE(review): presumably the last known head position used by read_balance() - confirm in raid1.c */
0044
0045     /* When choosing the best device for a read (read_balance())
0046      * we try to keep sequential reads on the same device
0047      */
0048     sector_t    next_seq_sect;
0049     sector_t    seq_start;
0050 };
0051
0052 /*
0053  * memory pools need a pointer to the mddev, so they can force an unplug
0054  * when memory is tight, and a count of the number of drives that the
0055  * pool was allocated for, so they know how much to allocate and free.
0056  * mddev->raid_disks cannot be used, as it can change while a pool is active.
0057  * These two items are stored in a kmalloced struct.
0058  * The 'raid_disks' here is twice the raid_disks in r1conf.
0059  * This leaves room so that each 'real' device can have a replacement in the
0060  * second half of the array.
0061  */
0062
0063 struct pool_info {    /* per-mempool bookkeeping, stored in a kmalloced struct */
0064     struct mddev *mddev;    /* lets the pool force an unplug when memory is tight */
0065     int raid_disks;    /* drive count the pool was sized for: twice r1conf.raid_disks */
0066 };
0067
0068 struct r1conf {    /* RAID1 state: mirror table, retry lists, barrier buckets, mempools */
0069     struct mddev        *mddev;
0070     struct raid1_info   *mirrors;   /* twice 'raid_disks' to
0071                          * allow for replacements.
0072                          */
0073     int         raid_disks;    /* mirrors[] holds twice this many entries */
0074
0075     spinlock_t      device_lock;
0076
0077     /* list of 'struct r1bio' that need to be processed by raid1d,
0078      * whether to retry a read, write out a resync or recovery
0079      * block, or anything else.
0080      */
0081     struct list_head    retry_list;
0082     /* A separate list of r1bio which just need raid_end_bio_io called.
0083      * This mustn't happen for writes which had any errors if the superblock
0084      * needs to be written.
0085      */
0086     struct list_head    bio_end_io_list;
0087
0088     /* queue pending writes to be submitted on unplug */
0089     struct bio_list     pending_bio_list;
0090
0091     /* for use when syncing mirrors:
0092      * We don't allow both normal IO and resync/recovery IO at
0093      * the same time - resync/recovery can only happen when there
0094      * is no other IO.  So when either is active, the other has to wait.
0095      * See the more detailed description in raid1.c near raise_barrier().
0096      */
0097     wait_queue_head_t   wait_barrier;
0098     spinlock_t      resync_lock;
0099     atomic_t        nr_sync_pending;
0100     atomic_t        *nr_pending;    /* this and the next three: one atomic_t per barrier bucket, BARRIER_BUCKETS_NR each */
0101     atomic_t        *nr_waiting;
0102     atomic_t        *nr_queued;
0103     atomic_t        *barrier;
0104     int         array_frozen;
0105
0106     /* Set to 1 if a full sync is needed, (fresh device added).
0107      * Cleared when a sync completes.
0108      */
0109     int         fullsync;
0110
0111     /* When the same as mddev->recovery_disabled we don't allow
0112      * recovery to be attempted as we expect a read error.
0113      */
0114     int         recovery_disabled;
0115
0116     /* poolinfo contains information about the content of the
0117      * mempools - it changes when the array grows or shrinks
0118      */
0119     struct pool_info    *poolinfo;
0120     mempool_t       r1bio_pool;
0121     mempool_t       r1buf_pool;
0122
0123     struct bio_set      bio_split;
0124
0125     /* temporary buffer for synchronous IO when attempting to repair
0126      * a read error.
0127      */
0128     struct page     *tmppage;
0129
0130     /* When taking over an array from a different personality, we store
0131      * the new thread here until we fully activate the array.
0132      */
0133     struct md_thread    *thread;
0134
0135     /* Keep track of cluster resync window to send to other
0136      * nodes.
0137      */
0138     sector_t        cluster_sync_low;
0139     sector_t        cluster_sync_high;
0140
0141 };
0142
0143 /*
0144  * this is our 'private' RAID1 bio.
0145  *
0146  * it contains information about what kind of IO operations were started
0147  * for this RAID1 operation, and about their status:
0148  */
0149
0150 struct r1bio {    /* allocated with a trailing bios[] array, see the end of the struct */
0151     atomic_t        remaining; /* 'have we finished' count,
0152                         * used from IRQ handlers
0153                         */
0154     atomic_t        behind_remaining; /* number of write-behind ios remaining
0155                          * in this BehindIO request
0156                          */
0157     sector_t        sector;
0158     int         sectors;
0159     unsigned long       state;    /* bits from enum r1bio_state */
0160     unsigned long       start_time;
0161     struct mddev        *mddev;
0162     /*
0163      * original bio going to /dev/mdx
0164      */
0165     struct bio      *master_bio;
0166     /*
0167      * if the IO is in READ direction, then this is where we read
0168      */
0169     int         read_disk;
0170
0171     struct list_head    retry_list;    /* NOTE(review): presumably the link into r1conf.retry_list / bio_end_io_list - confirm in raid1.c */
0172
0173     /*
0174      * When R1BIO_BehindIO is set, we store pages for write behind
0175      * in behind_master_bio.
0176      */
0177     struct bio      *behind_master_bio;
0178
0179     /*
0180      * if the IO is in WRITE direction, then multiple bios are used.
0181      * We choose the number when they are allocated.
0182      */
0183     struct bio      *bios[];    /* flexible array member: must remain the last field */
0184     /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously allocated */
0185 };
0186
0187 /* bits for r1bio.state: the enumerators are consecutive values, i.e. bit numbers rather than masks */
0188 enum r1bio_state {
0189     R1BIO_Uptodate,
0190     R1BIO_IsSync,
0191     R1BIO_Degraded,
0192     R1BIO_BehindIO,
0193 /* Set ReadError on bios that experience a read error so that
0194  * raid1d knows what to do with them.
0195  */
0196     R1BIO_ReadError,
0197 /* For write-behind requests, we call bi_end_io when
0198  * the last non-write-behind device completes, providing
0199  * any write was successful.  Otherwise we call when
0200  * any write-behind write succeeds, otherwise we call
0201  * with failure when last write completes (and all failed).
0202  * Record that bi_end_io was called with this flag...
0203  */
0204     R1BIO_Returned,
0205 /* If a write for this request means we can clear some
0206  * known-bad-block records, we set this flag
0207  */
0208     R1BIO_MadeGood,
0209     R1BIO_WriteError,
0210     R1BIO_FailFast,
0211 };
0212 /* Map a sector to its barrier bucket: hash the barrier unit number down to BARRIER_BUCKETS_NR_BITS bits. */
0213 static inline int sector_to_idx(sector_t sector)
0214 {
0215     const sector_t unit = sector >> BARRIER_UNIT_SECTOR_BITS; /* barrier unit containing the sector */
0216     return hash_long(unit, BARRIER_BUCKETS_NR_BITS);
0217 }
0218 #endif