/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
   md.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

*/

#ifndef _MD_MD_H
#define _MD_MD_H

#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include "md-cluster.h"

#define MaxSector (~(sector_t)0)

/*
 * These flags should really be called "NO_RETRY" rather than
 * "FAILFAST" because they don't make any promise about time lapse,
 * only about the number of retries, which will be zero.
 * REQ_FAILFAST_DRIVER is not included because
 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
 * seems to suggest that the errors it avoids retrying should usually
 * be retried.
 */
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
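
/*
 * Example (illustrative sketch, not part of md.h): callers typically OR
 * these flags into a bio's bi_opf when the target rdev has the FailFast
 * bit (defined below in enum flag_bits) set:
 *
 *     if (test_bit(FailFast, &rdev->flags))
 *         bio->bi_opf |= MD_FAILFAST;
 */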

/*
 * The struct embedded in rdev is used to serialize IO.
 */
struct serial_in_rdev {
    struct rb_root_cached serial_rb;
    spinlock_t serial_lock;
    wait_queue_head_t serial_io_wait;
};
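
/*
 * Illustrative sketch (not part of md.h): initializing one
 * serial_in_rdev instance.  The helper name is hypothetical, and this
 * assumes <linux/rbtree.h> and <linux/spinlock.h> are pulled in
 * transitively, but it mirrors what md does when serialization is
 * enabled for an rdev.
 */
static inline void example_serial_in_rdev_init(struct serial_in_rdev *serial)
{
    serial->serial_rb = RB_ROOT_CACHED;
    spin_lock_init(&serial->serial_lock);
    init_waitqueue_head(&serial->serial_io_wait);
}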

/*
 * MD's 'extended' device
 */
struct md_rdev {
    struct list_head same_set;  /* RAID devices within the same set */

    sector_t sectors;       /* Device size (in 512-byte sectors) */
    struct mddev *mddev;        /* RAID array if running */
    int last_events;        /* IO event timestamp */

    /*
     * If meta_bdev is non-NULL, it means that a separate device is
     * being used to store the metadata (superblock/bitmap) which
     * would otherwise be contained on the same device as the data (bdev).
     */
    struct block_device *meta_bdev;
    struct block_device *bdev;  /* block device handle */

    struct page *sb_page, *bb_page;
    int     sb_loaded;
    __u64       sb_events;
    sector_t    data_offset;    /* start of data in array */
    sector_t    new_data_offset;/* only relevant while reshaping */
    sector_t    sb_start;   /* offset of the super block (in 512-byte sectors) */
    int     sb_size;    /* bytes in the superblock */
    int     preferred_minor;    /* autorun support */

    struct kobject  kobj;

    /* A device can be in one of three states based on two flags:
     * Not working:   faulty==1 in_sync==0
     * Fully working: faulty==0 in_sync==1
     * Working, but not
     * in sync with array
     *                faulty==0 in_sync==0
     *
     * It can never have faulty==1, in_sync==1
     * This reduces the burden of testing multiple flags in many cases
     */

    unsigned long   flags;  /* bit set of 'enum flag_bits' bits. */
    wait_queue_head_t blocked_wait;

    int desc_nr;            /* descriptor index in the superblock */
    int raid_disk;          /* role of device in array */
    int new_raid_disk;      /* role that the device will have in
                     * the array after a level-change completes.
                     */
    int saved_raid_disk;        /* role that device used to have in the
                     * array and could again if we did a partial
                     * resync from the bitmap
                     */
    union {
        sector_t recovery_offset;/* If this device has been partially
                     * recovered, this is where we were
                     * up to.
                     */
        sector_t journal_tail;  /* If this device is a journal device,
                     * this is the journal tail (journal
                     * recovery start point)
                     */
    };

    atomic_t    nr_pending; /* number of pending requests.
                     * only maintained for arrays that
                     * support hot removal
                     */
    atomic_t    read_errors;    /* number of consecutive read errors that
                     * we have tried to ignore.
                     */
    time64_t    last_read_error;    /* monotonic time since our
                         * last read error
                         */
    atomic_t    corrected_errors; /* number of corrected read errors,
                       * for reporting to userspace and storing
                       * in superblock.
                       */

    struct serial_in_rdev *serial;  /* used for raid1 io serialization */

    struct work_struct del_work;    /* used for delayed sysfs removal */

    struct kernfs_node *sysfs_state; /* handle for 'state'
                       * sysfs entry */
    /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
    struct kernfs_node *sysfs_unack_badblocks;
    /* handle for 'bad_blocks' sysfs dentry */
    struct kernfs_node *sysfs_badblocks;
    struct badblocks badblocks;

    struct {
        short offset;   /* Offset from superblock to start of PPL.
                 * Not used by external metadata. */
        unsigned int size;  /* Size in sectors of the PPL space */
        sector_t sector;    /* First sector of the PPL space */
    } ppl;
};
enum flag_bits {
    Faulty,         /* device is known to have a fault */
    In_sync,        /* device is in_sync with rest of array */
    Bitmap_sync,        /* ..actually, not quite In_sync.  Need a
                 * bitmap-based recovery to get fully in sync.
                 * The bit is only meaningful before device
                 * has been passed to pers->hot_add_disk.
                 */
    WriteMostly,        /* Avoid reading if at all possible */
    AutoDetected,       /* added by auto-detect */
    Blocked,        /* An error occurred but has not yet
                 * been acknowledged by the metadata
                 * handler, so don't allow writes
                 * until it is cleared */
    WriteErrorSeen,     /* A write error has been seen on this
                 * device
                 */
    FaultRecorded,      /* Intermediate state for clearing
                 * Blocked.  The Fault is/will-be
                 * recorded in the metadata, but that
                 * metadata hasn't been stored safely
                 * on disk yet.
                 */
    BlockedBadBlocks,   /* A writer is blocked because they
                 * found an unacknowledged bad-block.
                 * This can safely be cleared at any
                 * time, and the writer will re-check.
                 * It may be set at any time, and at
                 * worst the writer will timeout and
                 * re-check.  So setting it as
                 * accurately as possible is good, but
                 * not absolutely critical.
                 */
    WantReplacement,    /* This device is a candidate to be
                 * hot-replaced, either because it has
                 * reported some faults, or because
                 * of explicit request.
                 */
    Replacement,        /* This device is a replacement for
                 * a want_replacement device with same
                 * raid_disk number.
                 */
    Candidate,      /* For clustered environments only:
                 * This device is seen locally but not
                 * by the whole cluster
                 */
    Journal,        /* This device is used as journal for
                 * raid-5/6.
                 * Usually, this device should be faster
                 * than other devices in the array
                 */
    ClusterRemove,
    RemoveSynchronized, /* synchronize_rcu() was called after
                 * this device was known to be faulty,
                 * so it is safe to remove without
                 * another synchronize_rcu() call.
                 */
    ExternalBbl,            /* External metadata provides bad
                 * block management for a disk
                 */
    FailFast,       /* Minimal retries should be attempted on
                 * this device, so use REQ_FAILFAST_DEV.
                 * Also don't try to repair failed reads.
                 * It is expected that no bad block log
                 * is present.
                 */
    LastDev,        /* Seems to be the last working dev as
                 * it didn't fail, so don't use FailFast
                 * any more for metadata
                 */
    CollisionCheck,     /*
                 * check if there is a collision between
                 * raid1 serial bios.
                 */
};
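
/*
 * Illustrative sketch (not part of md.h): rdev->flags is manipulated
 * with the generic atomic bit helpers.  For instance, the three-state
 * model described above struct md_rdev can be probed like this
 * (the function name is hypothetical):
 */
static inline bool example_rdev_fully_working(struct md_rdev *rdev)
{
    return !test_bit(Faulty, &rdev->flags) &&
           test_bit(In_sync, &rdev->flags);
}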

static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
                  sector_t *first_bad, int *bad_sectors)
{
    if (unlikely(rdev->badblocks.count)) {
        int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
                    sectors,
                    first_bad, bad_sectors);
        if (rv)
            *first_bad -= rdev->data_offset;
        return rv;
    }
    return 0;
}
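
/*
 * Illustrative sketch (not part of md.h): a read path would normally
 * consult is_badblock() before issuing IO and avoid (or narrow) the
 * request when the range intersects a recorded bad block.  Per
 * badblocks_check(), 0 means a clean range, 1 means all overlapping
 * bad blocks are acknowledged, and -1 means some are not.  The helper
 * name below is hypothetical:
 */
static inline bool example_range_is_clean(struct md_rdev *rdev,
                      sector_t sector, int sectors)
{
    sector_t first_bad;
    int bad_sectors;

    return is_badblock(rdev, sector, sectors,
               &first_bad, &bad_sectors) == 0;
}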
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                  int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                int is_new);
struct md_cluster_info;

/**
 * enum mddev_flags - md device flags.
 * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
 * @MD_CLOSING: If set, we are closing the array, so do not open it.
 * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
 * @MD_HAS_JOURNAL: The raid array has journal feature set.
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only; the node has already taken
 *                 the resync lock and needs to release it.
 * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
 *              calls to md_error() will never cause the array to
 *              become failed.
 * @MD_HAS_PPL:  The raid array has PPL feature set.
 * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
 * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
 *           without taking reconfig_mutex.
 * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
 *           explicitly holding reconfig_mutex.
 * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
 *         the array is ready yet.
 * @MD_BROKEN: This is used to stop writes and mark array as failed.
 * @MD_DELETED: This device is being deleted
 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
 */
enum mddev_flags {
    MD_ARRAY_FIRST_USE,
    MD_CLOSING,
    MD_JOURNAL_CLEAN,
    MD_HAS_JOURNAL,
    MD_CLUSTER_RESYNC_LOCKED,
    MD_FAILFAST_SUPPORTED,
    MD_HAS_PPL,
    MD_HAS_MULTIPLE_PPLS,
    MD_ALLOW_SB_UPDATE,
    MD_UPDATING_SB,
    MD_NOT_READY,
    MD_BROKEN,
    MD_DELETED,
};

enum mddev_sb_flags {
    MD_SB_CHANGE_DEVS,      /* Some device status has changed */
    MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */
    MD_SB_CHANGE_PENDING,   /* switch from 'clean' to 'active' in progress */
    MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};

#define NR_SERIAL_INFOS     8
/* record current range of serialized IOs */
struct serial_info {
    struct rb_node node;
    sector_t start;     /* start sector of rb node */
    sector_t last;      /* end sector of rb node */
    sector_t _subtree_last; /* highest sector in subtree of rb node */
};

/*
 * mddev->curr_resync stores the current sector of the resync but
 * also has some overloaded values.
 */
enum {
    /* No resync in progress */
    MD_RESYNC_NONE = 0,
    /* Yielded to allow another conflicting resync to commence */
    MD_RESYNC_YIELDED = 1,
    /* Delayed to check that there is no conflict with another sync */
    MD_RESYNC_DELAYED = 2,
    /* Any value greater than or equal to this is in an active resync */
    MD_RESYNC_ACTIVE = 3,
};
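
/*
 * Illustrative sketch (not part of md.h): because the low values of
 * curr_resync are overloaded, tests against it compare with the enum
 * rather than with raw sector numbers (the helper name is
 * hypothetical):
 */
static inline bool example_resync_is_active(sector_t curr_resync)
{
    return curr_resync >= MD_RESYNC_ACTIVE;
}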

struct mddev {
    void                *private;
    struct md_personality       *pers;
    dev_t               unit;
    int             md_minor;
    struct list_head        disks;
    unsigned long           flags;
    unsigned long           sb_flags;

    int             suspended;
    atomic_t            active_io;
    int             ro;
    int             sysfs_active; /* set when sysfs deletes
                               * are happening, so run/
                               * takeover/stop are not safe
                               */
    struct gendisk          *gendisk;

    struct kobject          kobj;
    int             hold_active;
#define UNTIL_IOCTL 1
#define UNTIL_STOP  2

    /* Superblock information */
    int             major_version,
                    minor_version,
                    patch_version;
    int             persistent;
    int             external;   /* metadata is
                             * managed externally */
    char                metadata_type[17]; /* externally set */
    int             chunk_sectors;
    time64_t            ctime, utime;
    int             level, layout;
    char                clevel[16];
    int             raid_disks;
    int             max_disks;
    sector_t            dev_sectors;    /* used size of
                             * component devices */
    sector_t            array_sectors; /* exported array size */
    int             external_size; /* size managed
                            * externally */
    __u64               events;
    /* If the last 'event' was simply a clean->dirty transition, and
     * we didn't write it to the spares, then it is safe and simple
     * to just decrement the event count on a dirty->clean transition.
     * So we record that possibility here.
     */
    int             can_decrease_events;

    char                uuid[16];

    /* If the array is being reshaped, we need to record the
     * new shape and an indication of where we are up to.
     * This is written to the superblock.
     * If reshape_position is MaxSector, then no reshape is happening (yet).
     */
    sector_t            reshape_position;
    int             delta_disks, new_level, new_layout;
    int             new_chunk_sectors;
    int             reshape_backwards;

    struct md_thread        *thread;    /* management thread */
    struct md_thread        *sync_thread;   /* doing resync or reconstruct */

    /* 'last_sync_action' is initialized to "none".  It is set when a
     * sync operation (i.e. "data-check", "requested-resync", "resync",
     * "recovery", or "reshape") is started.  It holds this value even
     * when the sync thread is "frozen" (interrupted) or "idle" (stopped
     * or finished).  It is overwritten when a new sync operation is begun.
     */
    char                *last_sync_action;
    sector_t            curr_resync;    /* last block scheduled */
    /* As resync requests can complete out of order, we cannot easily track
     * how much resync has been completed.  So we occasionally pause until
     * everything completes, then set curr_resync_completed to curr_resync.
     * As such it may be well behind the real resync mark, but it is a value
     * we are certain of.
     */
    sector_t            curr_resync_completed;
    unsigned long           resync_mark;    /* a recent timestamp */
    sector_t            resync_mark_cnt;/* blocks written at resync_mark */
    sector_t            curr_mark_cnt; /* blocks scheduled now */

    sector_t            resync_max_sectors; /* may be set by personality */

    atomic64_t          resync_mismatches; /* count of sectors where
                                * parity/replica mismatch found
                                */

    /* allow user-space to request suspension of IO to regions of the array */
    sector_t            suspend_lo;
    sector_t            suspend_hi;
    /* if zero, use the system-wide default */
    int             sync_speed_min;
    int             sync_speed_max;

    /* resync even though the same disks are shared among md-devices */
    int             parallel_resync;

    int             ok_start_degraded;

    unsigned long           recovery;
    /* If a RAID personality determines that recovery (of a particular
     * device) will fail due to a read error on the source device, it
     * takes a copy of this number and does not attempt recovery again
     * until this number changes.
     */
    int             recovery_disabled;

    int             in_sync;    /* known to not need resync */
    /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
     * that we are never stopping an array while it is open.
     * 'reconfig_mutex' protects all other reconfiguration.
     * These locks are separate due to conflicting interactions
     * with disk->open_mutex.
     * Lock ordering is:
     *  reconfig_mutex -> disk->open_mutex
     *  disk->open_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
     */
    struct mutex            open_mutex;
    struct mutex            reconfig_mutex;
    atomic_t            active;     /* general refcount */
    atomic_t            openers;    /* number of active opens */

    int             changed;    /* True if we might need to
                             * reread partition info */
    int             degraded;   /* whether md should consider
                             * adding a spare
                             */

    atomic_t            recovery_active; /* blocks scheduled, but not written */
    wait_queue_head_t       recovery_wait;
    sector_t            recovery_cp;
    sector_t            resync_min; /* user requested sync
                             * starts here */
    sector_t            resync_max; /* resync should pause
                             * when it gets here */

    struct kernfs_node      *sysfs_state;   /* handle for 'array_state'
                             * file in sysfs.
                             */
    struct kernfs_node      *sysfs_action;  /* handle for 'sync_action' */
    struct kernfs_node      *sysfs_completed;   /* handle for 'sync_completed' */
    struct kernfs_node      *sysfs_degraded;    /* handle for 'degraded' */
    struct kernfs_node      *sysfs_level;       /* handle for 'level' */

    struct work_struct del_work;    /* used for delayed sysfs removal */

    /* "lock" protects:
     *   flush_bio transition from NULL to !NULL
     *   rdev superblocks, events
     *   clearing MD_CHANGE_*
     *   in_sync - and related safemode and MD_CHANGE changes
     *   pers (also protected by reconfig_mutex and pending IO).
     *   clearing ->bitmap
     *   clearing ->bitmap_info.file
     *   changing ->resync_{min,max}
     *   setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
     */
    spinlock_t          lock;
    wait_queue_head_t       sb_wait;    /* for waiting on superblock updates */
    atomic_t            pending_writes; /* number of active superblock writes */

    unsigned int            safemode;   /* if set, update "clean" superblock
                             * when no writes pending.
                             */
    unsigned int            safemode_delay;
    struct timer_list       safemode_timer;
    struct percpu_ref       writes_pending;
    int             sync_checkers;  /* # of threads checking writes_pending */
    struct request_queue        *queue; /* for plugging ... */

    struct bitmap           *bitmap; /* the bitmap for the device */
    struct {
        struct file     *file; /* the bitmap file */
        loff_t          offset; /* offset from superblock of
                         * start of bitmap. May be
                         * negative, but not '0'
                         * For external metadata, offset
                         * from start of device.
                         */
        unsigned long       space; /* space available at this offset */
        loff_t          default_offset; /* this is the offset to use when
                             * hot-adding a bitmap.  It should
                             * eventually be settable by sysfs.
                             */
        unsigned long       default_space; /* space available at
                            * default offset */
        struct mutex        mutex;
        unsigned long       chunksize;
        unsigned long       daemon_sleep; /* how many jiffies between updates? */
        unsigned long       max_write_behind; /* write-behind mode */
        int         external;
        int         nodes; /* Maximum number of nodes in the cluster */
        char                    cluster_name[64]; /* Name of the cluster */
    } bitmap_info;

    atomic_t            max_corr_read_errors; /* max read retries */
    struct list_head        all_mddevs;

    const struct attribute_group    *to_remove;

    struct bio_set          bio_set;
    struct bio_set          sync_set; /* for sync operations like
                           * metadata and bitmap writes
                           */
    struct bio_set          io_acct_set; /* for raid0 and raid5 io accounting */

    /* Generic flush handling.
     * The last to finish preflush schedules a worker to submit
     * the rest of the request (without the REQ_PREFLUSH flag).
     */
    struct bio *flush_bio;
    atomic_t flush_pending;
    ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
                        * flush was started.
                        */
    struct work_struct flush_work;
    struct work_struct event_work;  /* used by dm to report failure event */
    mempool_t *serial_info_pool;
    void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
    struct md_cluster_info      *cluster_info;
    unsigned int            good_device_nr; /* good device num within cluster raid */
    unsigned int            noio_flag; /* for memalloc scope API */

    bool    has_superblocks:1;
    bool    fail_last_dev:1;
    bool    serialize_policy:1;
};

enum recovery_flags {
    /*
     * If neither SYNC nor RESHAPE is set, then it is a recovery.
     */
    MD_RECOVERY_RUNNING,    /* a thread is running, or about to be started */
    MD_RECOVERY_SYNC,   /* actually doing a resync, not a recovery */
    MD_RECOVERY_RECOVER,    /* doing recovery, or need to try it. */
    MD_RECOVERY_INTR,   /* resync needs to be aborted for some reason */
    MD_RECOVERY_DONE,   /* thread is done and is waiting to be reaped */
    MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
    MD_RECOVERY_REQUESTED,  /* user-space has requested a sync (used with SYNC) */
    MD_RECOVERY_CHECK,  /* user-space request for check-only, no repair */
    MD_RECOVERY_RESHAPE,    /* A reshape is happening */
    MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
    MD_RECOVERY_ERROR,  /* sync-action interrupted because of an IO error */
    MD_RECOVERY_WAIT,   /* waiting for pers->start() to finish */
    MD_RESYNCING_REMOTE,    /* remote node is running resync thread */
};

static inline int __must_check mddev_lock(struct mddev *mddev)
{
    return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

/* Sometimes we need to take the lock in a situation where
 * failure due to interrupts is not acceptable.
 */
static inline void mddev_lock_nointr(struct mddev *mddev)
{
    mutex_lock(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(struct mddev *mddev)
{
    return mutex_trylock(&mddev->reconfig_mutex);
}
extern void mddev_unlock(struct mddev *mddev);
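
/*
 * Illustrative sketch (not part of md.h): the usual pattern around
 * reconfig_mutex.  mddev_lock() is interruptible and can fail with
 * -EINTR, hence the __must_check (the function name is hypothetical):
 */
static inline int example_reconfigure(struct mddev *mddev)
{
    int err = mddev_lock(mddev);

    if (err)
        return err;
    /* ... reconfiguration protected by reconfig_mutex ... */
    mddev_unlock(mddev);
    return 0;
}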

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
    atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
}

static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
{
    md_sync_acct(bio->bi_bdev, nr_sectors);
}

struct md_personality
{
    char *name;
    int level;
    struct list_head list;
    struct module *owner;
    bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
    /*
     * start-up work that does NOT require an md_thread; tasks that
     * require an md_thread should go into start()
     */
    int (*run)(struct mddev *mddev);
    /* start-up work that requires md threads */
    int (*start)(struct mddev *mddev);
    void (*free)(struct mddev *mddev, void *priv);
    void (*status)(struct seq_file *seq, struct mddev *mddev);
    /* error_handler must set ->faulty and clear ->in_sync
     * if appropriate, and should abort recovery if needed
     */
    void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
    int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
    int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
    int (*spare_active) (struct mddev *mddev);
    sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
    int (*resize) (struct mddev *mddev, sector_t sectors);
    sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
    int (*check_reshape) (struct mddev *mddev);
    int (*start_reshape) (struct mddev *mddev);
    void (*finish_reshape) (struct mddev *mddev);
    void (*update_reshape_pos) (struct mddev *mddev);
    /* quiesce suspends or resumes internal processing.
     * 1 - stop new actions and wait for action io to complete
     * 0 - return to normal behaviour
     */
    void (*quiesce) (struct mddev *mddev, int quiesce);
    /* takeover is used to transition an array from one
     * personality to another.  The new personality must be able
     * to handle the data in the current layout.
     * e.g. 2drive raid1 -> 2drive raid5
     *      ndrive raid5 -> degraded n+1drive raid6 with special layout
     * If the takeover succeeds, a new 'private' structure is returned.
     * This needs to be installed and then ->run used to activate the
     * array.
     */
    void *(*takeover) (struct mddev *mddev);
    /* Changes the consistency policy of an active array. */
    int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
};
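
/*
 * Illustrative sketch (not part of md.h): a minimal, do-nothing
 * personality showing how the ops table is filled in.  All example_*
 * names are hypothetical; real personalities (raid0, raid1, ...) live
 * in their own modules, implement the optional hooks they need, and
 * pass their table to register_md_personality() (declared further
 * down) from module init.  Assumes THIS_MODULE is visible via the
 * usual module headers.
 */
static bool example_make_request(struct mddev *mddev, struct bio *bio)
{
    bio_io_error(bio);  /* this toy personality accepts no IO */
    return true;
}

static int example_run(struct mddev *mddev)
{
    return 0;       /* nothing to set up */
}

static void example_free(struct mddev *mddev, void *priv)
{
}

static struct md_personality example_personality = {
    .name       = "example",
    .level      = -1000,    /* a level no real personality uses */
    .owner      = THIS_MODULE,
    .make_request   = example_make_request,
    .run        = example_run,
    .free       = example_free,
};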

struct md_sysfs_entry {
    struct attribute attr;
    ssize_t (*show)(struct mddev *, char *);
    ssize_t (*store)(struct mddev *, const char *, size_t);
};
extern const struct attribute_group md_bitmap_group;

static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
    if (sd)
        return sysfs_get_dirent(sd, name);
    return sd;
}
static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
{
    if (sd)
        sysfs_notify_dirent(sd);
}
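
/*
 * Illustrative sketch (not part of md.h): md notifies userspace of a
 * state change by poking the cached kernfs node, which these _safe
 * wrappers make harmless even before the sysfs file exists (the
 * helper name is hypothetical):
 */
static inline void example_notify_array_state(struct mddev *mddev)
{
    sysfs_notify_dirent_safe(mddev->sysfs_state);
}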

static inline char *mdname(struct mddev *mddev)
{
    return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}

static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
    char nm[20];
    if (!test_bit(Replacement, &rdev->flags) &&
        !test_bit(Journal, &rdev->flags) &&
        mddev->kobj.sd) {
        sprintf(nm, "rd%d", rdev->raid_disk);
        return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
    } else
        return 0;
}

static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
    char nm[20];
    if (!test_bit(Replacement, &rdev->flags) &&
        !test_bit(Journal, &rdev->flags) &&
        mddev->kobj.sd) {
        sprintf(nm, "rd%d", rdev->raid_disk);
        sysfs_remove_link(&mddev->kobj, nm);
    }
}

/*
 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)             \
    list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * iterates through the 'same array disks' ringlist
 */
#define rdev_for_each(rdev, mddev)              \
    list_for_each_entry(rdev, &((mddev)->disks), same_set)

#define rdev_for_each_safe(rdev, tmp, mddev)                \
    list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

#define rdev_for_each_rcu(rdev, mddev)              \
    list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
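
/*
 * Illustrative sketch (not part of md.h): walking the member devices
 * while holding reconfig_mutex; the _rcu variant would be used under
 * rcu_read_lock() instead (the helper name is hypothetical):
 */
static inline int example_count_in_sync(struct mddev *mddev)
{
    struct md_rdev *rdev;
    int cnt = 0;

    rdev_for_each(rdev, mddev)
        if (test_bit(In_sync, &rdev->flags))
            cnt++;
    return cnt;
}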

struct md_thread {
    void            (*run) (struct md_thread *thread);
    struct mddev        *mddev;
    wait_queue_head_t   wqueue;
    unsigned long       flags;
    struct task_struct  *tsk;
    unsigned long       timeout;
    void            *private;
};

struct md_io_acct {
    struct bio *orig_bio;
    unsigned long start_time;
    struct bio bio_clone;
};

#define THREAD_WAKEUP  0

static inline void safe_put_page(struct page *p)
{
    if (p) put_page(p);
}

extern int register_md_personality(struct md_personality *p);
extern int unregister_md_personality(struct md_personality *p);
extern int register_md_cluster_operations(struct md_cluster_operations *ops,
        struct module *module);
extern int unregister_md_cluster_operations(void);
extern int md_setup_cluster(struct mddev *mddev, int nodes);
extern void md_cluster_stop(struct mddev *mddev);
extern struct md_thread *md_register_thread(
    void (*run)(struct md_thread *thread),
    struct mddev *mddev,
    const char *name);
extern void md_unregister_thread(struct md_thread **threadp);
extern void md_wakeup_thread(struct md_thread *thread);
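
/*
 * Illustrative sketch (not part of md.h): personalities create their
 * daemon via md_register_thread() and later kick it with
 * md_wakeup_thread() (the names below are hypothetical):
 */
static inline int example_start_daemon(struct mddev *mddev,
                       void (*fn)(struct md_thread *))
{
    mddev->thread = md_register_thread(fn, mddev, "example");
    if (!mddev->thread)
        return -ENOMEM;
    md_wakeup_thread(mddev->thread);
    return 0;
}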
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
            struct bio *bio, sector_t start, sector_t size);
int acct_bioset_init(struct mddev *mddev);
void acct_bioset_exit(struct mddev *mddev);
void md_account_bio(struct mddev *mddev, struct bio **bio);

extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
               sector_t sector, int size, struct page *page);
extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
        struct page *page, blk_opf_t opf, bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(void);
extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);

extern void mddev_init(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);
extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);

extern void md_handle_request(struct mddev *mddev, struct bio *bio);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);

extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev *rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
                     bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
                      bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
{
    if (!disk_live(rdev->bdev->bd_disk)) {
        if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
            pr_warn("md: %s: %s array has a missing/failed member\n",
                mdname(rdev->mddev), md_type);
        return true;
    }
    return false;
}
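
/*
 * Example (illustrative, not part of md.h): a personality without
 * redundancy would typically fail IO once a member disappears,
 * roughly:
 *
 *     if (unlikely(is_mddev_broken(rdev, "raid0"))) {
 *         bio_io_error(bio);
 *         return true;
 *     }
 */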

static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
    int faulty = test_bit(Faulty, &rdev->flags);
    if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
    }
}
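
/*
 * Example (illustrative, not part of md.h): the nr_pending protocol.
 * A submitter pins the rdev before issuing IO, and the completion
 * path drops the pin with rdev_dec_pending(), which may flag recovery
 * if the device failed in the meantime:
 *
 *     atomic_inc(&rdev->nr_pending);
 *     ... submit the bio ...
 *     rdev_dec_pending(rdev, mddev);   (in the endio path)
 */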

extern struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
    return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
}

/* clear unsupported mddev_flags */
static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
    unsigned long unsupported_flags)
{
    mddev->flags &= ~unsupported_flags;
}

static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
{
    if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
        !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
        mddev->queue->limits.max_write_zeroes_sectors = 0;
}

struct mdu_array_info_s;
struct mdu_disk_info_s;

extern int mdp_major;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);

extern const struct block_device_operations md_fops;

#endif /* _MD_MD_H */