Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
0002 /*
0003    md_p.h : physical layout of Linux RAID devices
0004           Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
0005       
0006    This program is free software; you can redistribute it and/or modify
0007    it under the terms of the GNU General Public License as published by
0008    the Free Software Foundation; either version 2, or (at your option)
0009    any later version.
0010    
0011    You should have received a copy of the GNU General Public License
0012    (for example /usr/src/linux/COPYING); if not, write to the Free
0013    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
0014 */
0015 
0016 #ifndef _MD_P_H
0017 #define _MD_P_H
0018 
0019 #include <linux/types.h>
0020 #include <asm/byteorder.h>
0021 
0022 /*
0023  * RAID superblock.
0024  *
0025  * The RAID superblock maintains some statistics on each RAID configuration.
0026  * Each real device in the RAID set contains it near the end of the device.
0027  * Some of the ideas are copied from the ext2fs implementation.
0028  *
0029  * We currently use 4096 bytes as follows:
0030  *
0031  *  word offset function
0032  *
0033  *     0  -    31   Constant generic RAID device information.
0034  *        32  -    63   Generic state information.
0035  *    64  -   127   Personality specific information.
 *   128  -   991   27 32-word descriptors of the disks in the raid set.
 *      (none)      Reserved; MD_SB_RESERVED_WORDS currently evaluates to 0.
 *   992  -  1023   Disk specific descriptor.
0039  */
0040 
0041 /*
0042  * If x is the real device size in bytes, we return an apparent size of:
0043  *
0044  *  y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
0045  *
0046  * and place the 4kB superblock at offset y.
0047  */
/* Size of the region reserved for the superblock, in bytes and in 512-byte sectors. */
#define MD_RESERVED_BYTES       (64 * 1024)
#define MD_RESERVED_SECTORS     (MD_RESERVED_BYTES / 512)

/*
 * Sector-based form of the formula above: round x down to a multiple of
 * MD_RESERVED_SECTORS, then subtract MD_RESERVED_SECTORS.
 *
 * The argument is parenthesized so that compound arguments such as
 * "a | b" or "cond ? a : b" are masked as a whole instead of being split
 * by the higher precedence of '&'.
 */
#define MD_NEW_SIZE_SECTORS(x)      (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
0052 
/* Size of the RAID superblock in bytes, in 32-bit words and in 512-byte sectors. */
#define MD_SB_BYTES     4096
#define MD_SB_WORDS     (MD_SB_BYTES / 4)
#define MD_SB_SECTORS   (MD_SB_BYTES / 512)
0056 
/*
 * Superblock section layout.  Every value below is counted in 32-bit
 * words, not bytes.
 */

/* Size of each section */
#define MD_SB_GENERIC_CONSTANT_WORDS    32
#define MD_SB_GENERIC_STATE_WORDS       32
#define MD_SB_GENERIC_WORDS \
    (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
#define MD_SB_PERSONALITY_WORDS         64
#define MD_SB_DESCRIPTOR_WORDS          32
#define MD_SB_DISKS                     27
#define MD_SB_DISKS_WORDS               (MD_SB_DISKS * MD_SB_DESCRIPTOR_WORDS)

/* Whatever is left of the 1024 words once all other sections are accounted for */
#define MD_SB_RESERVED_WORDS \
    (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - \
     MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)

/* Generic + personality + disks sections taken together */
#define MD_SB_EQUAL_WORDS \
    (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)

/* First word of each section */
#define MD_SB_GENERIC_OFFSET            0
#define MD_SB_PERSONALITY_OFFSET        64
#define MD_SB_DISKS_OFFSET              128
#define MD_SB_DESCRIPTOR_OFFSET         992
0074 
/*
 * Device "operational" state bits, listed in ascending numeric order.
 */
#define MD_DISK_FAULTY          0   /* disk is faulty / operational */
#define MD_DISK_ACTIVE          1   /* disk is running or spare disk */
#define MD_DISK_SYNC            2   /* disk is in sync with the raid set */
#define MD_DISK_REMOVED         3   /* disk is removed from the raid set
                                     * NOTE(review): the previous comment here
                                     * duplicated MD_DISK_SYNC's text; confirm.
                                     */
#define MD_DISK_CLUSTER_ADD     4   /* Initiate a disk add across the cluster.
                                     * For clustered environments only.
                                     */
#define MD_DISK_CANDIDATE       5   /* disk is added as spare (local) until
                                     * confirmed.
                                     * For clustered environments only.
                                     */
#define MD_DISK_WRITEMOSTLY     9   /* disk is "write-mostly" in RAID1 config.
                                     * Read requests will only be sent here in
                                     * dire need.
                                     */
#define MD_DISK_FAILFAST        10  /* Send REQ_FAILFAST if there are multiple
                                     * devices available - and don't try to
                                     * correct read errors.
                                     */
#define MD_DISK_JOURNAL         18  /* disk is used as the write journal in RAID-5/6 */
0098 
/*
 * Special device role values.  Anything up to MD_DISK_ROLE_MAX is a
 * regular disk role.
 */
#define MD_DISK_ROLE_MAX        0xff00  /* max value of regular disk role */
#define MD_DISK_ROLE_JOURNAL    0xfffd
#define MD_DISK_ROLE_FAULTY     0xfffe
#define MD_DISK_ROLE_SPARE      0xffff
0103 
0104 typedef struct mdp_device_descriptor_s {
0105     __u32 number;       /* 0 Device number in the entire set          */
0106     __u32 major;        /* 1 Device major number              */
0107     __u32 minor;        /* 2 Device minor number              */
0108     __u32 raid_disk;    /* 3 The role of the device in the raid set   */
0109     __u32 state;        /* 4 Operational state                */
0110     __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
0111 } mdp_disk_t;
0112 
/* Value stored in the superblock magic field (md_magic / magic). */
#define MD_SB_MAGIC 0xa92b4efc
0114 
/*
 * Superblock state bits
 */
#define MD_SB_CLEAN             0
#define MD_SB_ERRORS            1
#define MD_SB_CLUSTERED         5   /* MD is clustered */
#define MD_SB_BITMAP_PRESENT    8   /* bitmap may be present nearby */
0123 
0124 /*
0125  * Notes:
0126  * - if an array is being reshaped (restriped) in order to change
0127  *   the number of active devices in the array, 'raid_disks' will be
0128  *   the larger of the old and new numbers.  'delta_disks' will
0129  *   be the "new - old".  So if +ve, raid_disks is the new value, and
0130  *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the
0131  *   old value and "raid_disks+delta_disks" is the new (smaller) value.
0132  */
0133 
0134 
0135 typedef struct mdp_superblock_s {
0136     /*
0137      * Constant generic information
0138      */
0139     __u32 md_magic;     /*  0 MD identifier                   */
0140     __u32 major_version;    /*  1 major version to which the set conforms */
0141     __u32 minor_version;    /*  2 minor version ...               */
0142     __u32 patch_version;    /*  3 patchlevel version ...              */
0143     __u32 gvalid_words; /*  4 Number of used words in this section    */
0144     __u32 set_uuid0;    /*  5 Raid set identifier             */
0145     __u32 ctime;        /*  6 Creation time               */
0146     __u32 level;        /*  7 Raid personality                */
0147     __u32 size;     /*  8 Apparent size of each individual disk   */
0148     __u32 nr_disks;     /*  9 total disks in the raid set         */
0149     __u32 raid_disks;   /* 10 disks in a fully functional raid set    */
0150     __u32 md_minor;     /* 11 preferred MD minor device number        */
0151     __u32 not_persistent;   /* 12 does it have a persistent superblock    */
0152     __u32 set_uuid1;    /* 13 Raid set identifier #2              */
0153     __u32 set_uuid2;    /* 14 Raid set identifier #3              */
0154     __u32 set_uuid3;    /* 15 Raid set identifier #4              */
0155     __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
0156 
0157     /*
0158      * Generic state information
0159      */
0160     __u32 utime;        /*  0 Superblock update time              */
0161     __u32 state;        /*  1 State bits (clean, ...)             */
0162     __u32 active_disks; /*  2 Number of currently active disks        */
0163     __u32 working_disks;    /*  3 Number of working disks             */
0164     __u32 failed_disks; /*  4 Number of failed disks              */
0165     __u32 spare_disks;  /*  5 Number of spare disks           */
0166     __u32 sb_csum;      /*  6 checksum of the whole superblock        */
0167 #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
0168     __u32 events_hi;    /*  7 high-order of superblock update count   */
0169     __u32 events_lo;    /*  8 low-order of superblock update count    */
0170     __u32 cp_events_hi; /*  9 high-order of checkpoint update count   */
0171     __u32 cp_events_lo; /* 10 low-order of checkpoint update count    */
0172 #elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
0173     __u32 events_lo;    /*  7 low-order of superblock update count    */
0174     __u32 events_hi;    /*  8 high-order of superblock update count   */
0175     __u32 cp_events_lo; /*  9 low-order of checkpoint update count    */
0176     __u32 cp_events_hi; /* 10 high-order of checkpoint update count   */
0177 #else
0178 #error unspecified endianness
0179 #endif
0180     __u32 recovery_cp;  /* 11 recovery checkpoint sector count        */
0181     /* There are only valid for minor_version > 90 */
0182     __u64 reshape_position; /* 12,13 next address in array-space for reshape */
0183     __u32 new_level;    /* 14 new level we are reshaping to       */
0184     __u32 delta_disks;  /* 15 change in number of raid_disks          */
0185     __u32 new_layout;   /* 16 new layout                  */
0186     __u32 new_chunk;    /* 17 new chunk size (bytes)              */
0187     __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
0188 
0189     /*
0190      * Personality information
0191      */
0192     __u32 layout;       /*  0 the array's physical layout         */
0193     __u32 chunk_size;   /*  1 chunk size in bytes             */
0194     __u32 root_pv;      /*  2 LV root PV */
0195     __u32 root_block;   /*  3 LV root block */
0196     __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
0197 
0198     /*
0199      * Disks information
0200      */
0201     mdp_disk_t disks[MD_SB_DISKS];
0202 
0203     /*
0204      * Reserved
0205      */
0206     __u32 reserved[MD_SB_RESERVED_WORDS];
0207 
0208     /*
0209      * Active descriptor
0210      */
0211     mdp_disk_t this_disk;
0212 
0213 } mdp_super_t;
0214 
0215 static inline __u64 md_event(mdp_super_t *sb) {
0216     __u64 ev = sb->events_hi;
0217     return (ev<<32)| sb->events_lo;
0218 }
0219 
/* Mask selecting the low 40 bits, which hold the seconds in version-1 time fields. */
#define MD_SUPERBLOCK_1_TIME_SEC_MASK   ((1ULL << 40) - 1ULL)
0221 
/*
 * The version-1 superblock.
 *
 * All numeric fields are little-endian.
 *
 * Total size: 256 bytes plus 2 bytes per device, so a 1K superblock
 * allows 384 devices.
 */
struct mdp_superblock_1 {
    /* ---- constant array information - 128 bytes ---- */
    __le32  magic;          /* MD_SB_MAGIC: 0xa92b4efc - little endian */
    __le32  major_version;  /* 1 */
    __le32  feature_map;    /* bit 0 set if 'bitmap_offset' is meaningful */
    __le32  pad0;           /* always set to 0 when writing */

    __u8    set_uuid[16];   /* user-space generated. */
    char    set_name[32];   /* set and interpreted by user-space */

    __le64  ctime;          /* lo 40 bits are seconds, top 24 are microseconds or 0 */
    __le32  level;          /* -4 (multipath), -1 (linear), 0,1,4,5 */
    __le32  layout;         /* only for raid5 and raid10 currently */
    __le64  size;           /* used size of component devices, in 512byte sectors */

    __le32  chunksize;      /* in 512byte sectors */
    __le32  raid_disks;
    union {
        /*
         * Sectors after start of superblock that bitmap starts.
         * NOTE: signed, so bitmap can be before superblock.
         * Only meaningful if feature_map[0] is set.
         */
        __le32  bitmap_offset;

        /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
        struct {
            __le16 offset;  /* sectors from start of superblock that ppl starts (signed) */
            __le16 size;    /* ppl size in sectors */
        } ppl;
    };

    /* These are only valid with feature bit '4' */
    __le32  new_level;          /* new level we are reshaping to */
    __le64  reshape_position;   /* next address in array-space for reshape */
    __le32  delta_disks;        /* change in number of raid_disks */
    __le32  new_layout;         /* new layout */
    __le32  new_chunk;          /* new chunk size (512byte sectors) */
    /*
     * Signed number to add to data_offset in new layout.  0 == no-change.
     * This can be different on each device in the array.
     */
    __le32  new_offset;

    /* ---- constant this-device information - 64 bytes ---- */
    __le64  data_offset;    /* sector start of data, often 0 */
    __le64  data_size;      /* sectors in this device that can be used for data */
    __le64  super_offset;   /* sector start of this superblock */
    union {
        /* sectors before this offset (from data_offset) have been recovered */
        __le64  recovery_offset;
        /* journal tail of journal device (from data_offset) */
        __le64  journal_tail;
    };
    __le32  dev_number;         /* permanent identifier of this device - not role in raid */
    __le32  cnt_corrected_read; /* number of read errors that were corrected by re-writing */
    __u8    device_uuid[16];    /* user-space settable, ignored by kernel */
    __u8    devflags;           /* per-device flags.  Only two defined... */
#define WriteMostly1    1   /* mask for writemostly flag in above */
#define FailFast1   2   /* Should avoid retries and fixups and just fail */
    /*
     * Bad block log.  If there are any bad blocks the feature flag is set.
     * If offset and size are non-zero, that space is reserved and available.
     */
    __u8    bblog_shift;    /* shift from sectors to block size */
    __le16  bblog_size;     /* number of sectors reserved for list */
    __le32  bblog_offset;   /* sector offset from superblock to bblog,
                             * signed - not unsigned */

    /* ---- array state information - 64 bytes ---- */
    __le64  utime;          /* 40 bits second, 24 bits microseconds */
    __le64  events;         /* incremented when superblock updated */
    __le64  resync_offset;  /* data before this offset (from data_offset) known to be in sync */
    __le32  sb_csum;        /* checksum up to devs[max_dev] */
    __le32  max_dev;        /* size of devs[] array to consider */
    __u8    pad3[64-32];    /* set to 0 when writing */

    /*
     * Device state information, indexed by dev_number, 2 bytes per device.
     * Note there are no per-device state flags: state information is
     * rolled into the 'roles' value.  If a device is spare or faulty, then
     * it doesn't have a meaningful role.
     */
    __le16  dev_roles[];    /* role in array, or 0xffff for a spare, or 0xfffe for faulty */
};
0308 
/*
 * feature_map bits.  Each feature occupies one bit; MD_FEATURE_ALL is the
 * union of every feature defined here.
 */
#define MD_FEATURE_BITMAP_OFFSET        (1 << 0)
#define MD_FEATURE_RECOVERY_OFFSET      (1 << 1)    /* recovery_offset is present and
                                                     * must be honoured
                                                     */
#define MD_FEATURE_RESHAPE_ACTIVE       (1 << 2)
#define MD_FEATURE_BAD_BLOCKS           (1 << 3)    /* badblock list is not empty */
#define MD_FEATURE_REPLACEMENT          (1 << 4)    /* This device is replacing an
                                                     * active device with same 'role'.
                                                     * 'recovery_offset' is also set.
                                                     */
#define MD_FEATURE_RESHAPE_BACKWARDS    (1 << 5)    /* Reshape doesn't change number
                                                     * of devices, but is going
                                                     * backwards anyway.
                                                     */
#define MD_FEATURE_NEW_OFFSET           (1 << 6)    /* new_offset must be honoured */
#define MD_FEATURE_RECOVERY_BITMAP      (1 << 7)    /* recovery that is happening
                                                     * is guided by bitmap.
                                                     */
#define MD_FEATURE_CLUSTERED            (1 << 8)    /* clustered MD */
#define MD_FEATURE_JOURNAL              (1 << 9)    /* support write cache */
#define MD_FEATURE_PPL                  (1 << 10)   /* support PPL */
#define MD_FEATURE_MULTIPLE_PPLS        (1 << 11)   /* support for multiple PPLs */
#define MD_FEATURE_RAID0_LAYOUT         (1 << 12)   /* layout is meaningful for RAID0 */

#define MD_FEATURE_ALL \
    (MD_FEATURE_BITMAP_OFFSET | \
     MD_FEATURE_RECOVERY_OFFSET | \
     MD_FEATURE_RESHAPE_ACTIVE | \
     MD_FEATURE_BAD_BLOCKS | \
     MD_FEATURE_REPLACEMENT | \
     MD_FEATURE_RESHAPE_BACKWARDS | \
     MD_FEATURE_NEW_OFFSET | \
     MD_FEATURE_RECOVERY_BITMAP | \
     MD_FEATURE_CLUSTERED | \
     MD_FEATURE_JOURNAL | \
     MD_FEATURE_PPL | \
     MD_FEATURE_MULTIPLE_PPLS | \
     MD_FEATURE_RAID0_LAYOUT)
0347 
/*
 * Header placed at the start of each r5l payload structure.  Packed, so
 * it occupies exactly two 16-bit fields.
 */
struct r5l_payload_header {
    __le16 type;    /* NOTE(review): presumably an enum r5l_payload_type value - confirm */
    __le16 flags;
} __attribute__((__packed__));
0352 
/* Kinds of r5l payload: data, parity and flush. */
enum r5l_payload_type {
    R5LOG_PAYLOAD_DATA      = 0,
    R5LOG_PAYLOAD_PARITY    = 1,
    R5LOG_PAYLOAD_FLUSH     = 2,
};
0358 
0359 struct r5l_payload_data_parity {
0360     struct r5l_payload_header header;
0361     __le32 size;        /* sector. data/parity size. each 4k
0362                  * has a checksum */
0363     __le64 location;    /* sector. For data, it's raid sector. For
0364                  * parity, it's stripe sector */
0365     __le32 checksum[];
0366 } __attribute__ ((__packed__));
0367 
/* Flag values for data/parity payloads. */
enum r5l_payload_data_parity_flag {
    /* payload is discard */
    R5LOG_PAYLOAD_FLAG_DISCARD = 1,

    /*
     * RESHAPED/RESHAPING is only set when there is reshape activity.
     * Note, both data/parity of a stripe should have the same flag set.
     */

    /* reshape is running, and this stripe finished reshape */
    R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
    /* reshape is running, and this stripe isn't reshaped */
    R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
};
0380 
0381 struct r5l_payload_flush {
0382     struct r5l_payload_header header;
0383     __le32 size; /* flush_stripes size, bytes */
0384     __le64 flush_stripes[];
0385 } __attribute__ ((__packed__));
0386 
/* Flag values for flush payloads. */
enum r5l_payload_flush_flag {
    /* data represents whole stripe */
    R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1,
};
0390 
0391 struct r5l_meta_block {
0392     __le32 magic;
0393     __le32 checksum;
0394     __u8 version;
0395     __u8 __zero_pading_1;
0396     __le16 __zero_pading_2;
0397     __le32 meta_size; /* whole size of the block */
0398 
0399     __le64 seq;
0400     __le64 position; /* sector, start from rdev->data_offset, current position */
0401     struct r5l_payload_header payloads[];
0402 } __attribute__ ((__packed__));
0403 
/* r5l version and magic constants. */
#define R5LOG_VERSION   0x1
#define R5LOG_MAGIC     0x6433c509
0406 
/* One entry of the PPL header's entry array (packed, 24 bytes). */
struct ppl_header_entry {
    __le64 data_sector;     /* raid sector of the new data */
    __le32 pp_size;         /* length of partial parity */
    __le32 data_size;       /* length of data */
    __le32 parity_disk;     /* member disk containing parity */
    /* checksum of partial parity data for this entry (~crc32c) */
    __le32 checksum;
} __attribute__((__packed__));
0415 
0416 #define PPL_HEADER_SIZE 4096
0417 #define PPL_HDR_RESERVED 512
0418 #define PPL_HDR_ENTRY_SPACE \
0419     (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__le32) - sizeof(__le64))
0420 #define PPL_HDR_MAX_ENTRIES \
0421     (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
0422 
0423 struct ppl_header {
0424     __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
0425     __le32 signature;       /* signature (family number of volume) */
0426     __le32 padding;         /* zero pad */
0427     __le64 generation;      /* generation number of the header */
0428     __le32 entries_count;       /* number of entries in entry array */
0429     __le32 checksum;        /* checksum of the header (~crc32c) */
0430     struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
0431 } __attribute__ ((__packed__));
0432 
0433 #endif