/*
 * md.c : Multiple Devices driver for Linux
 */
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/*
 * pers_list is the list of registered personalities, protected by
 * pers_lock.  pers_lock also guards accesses to mddev->thread when the
 * reconfig mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
static struct workqueue_struct *md_rdev_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev before
 * ejecting it from the array.  The read error count is halved for every
 * hour that passes between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20

/* Default safemode delay: 200 msec, rounded up to at least one jiffy. */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)

/*
 * Resync/reconstruction is throttled so that it does not swamp the
 * system: sync proceeds at no less than speed_limit_min KB/sec (so
 * progress is guaranteed) and at no more than speed_limit_max KB/sec,
 * although the full available bandwidth is used when the array is
 * otherwise idle.  Both limits can be changed at run time via
 * /proc/sys/dev/raid/speed_limit_{min,max} or, per array, via
 * /sys/block/mdX/md/sync_speed_{min,max} (a value of 0 there falls back
 * to the system-wide limit).
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}
0130
0131 static void rdev_uninit_serial(struct md_rdev *rdev)
0132 {
0133 if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
0134 return;
0135
0136 kvfree(rdev->serial);
0137 rdev->serial = NULL;
0138 }
0139
0140 static void rdevs_uninit_serial(struct mddev *mddev)
0141 {
0142 struct md_rdev *rdev;
0143
0144 rdev_for_each(rdev, mddev)
0145 rdev_uninit_serial(rdev);
0146 }
0147
0148 static int rdev_init_serial(struct md_rdev *rdev)
0149 {
0150
0151 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
0152 struct serial_in_rdev *serial = NULL;
0153
0154 if (test_bit(CollisionCheck, &rdev->flags))
0155 return 0;
0156
0157 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
0158 GFP_KERNEL);
0159 if (!serial)
0160 return -ENOMEM;
0161
0162 for (i = 0; i < serial_nums; i++) {
0163 struct serial_in_rdev *serial_tmp = &serial[i];
0164
0165 spin_lock_init(&serial_tmp->serial_lock);
0166 serial_tmp->serial_rb = RB_ROOT_CACHED;
0167 init_waitqueue_head(&serial_tmp->serial_io_wait);
0168 }
0169
0170 rdev->serial = serial;
0171 set_bit(CollisionCheck, &rdev->flags);
0172
0173 return 0;
0174 }
0175
0176 static int rdevs_init_serial(struct mddev *mddev)
0177 {
0178 struct md_rdev *rdev;
0179 int ret = 0;
0180
0181 rdev_for_each(rdev, mddev) {
0182 ret = rdev_init_serial(rdev);
0183 if (ret)
0184 break;
0185 }

	/* Free all resources if the pool has not been created yet */
0188 if (ret && !mddev->serial_info_pool)
0189 rdevs_uninit_serial(mddev);
0190
0191 return ret;
0192 }

/*
 * An rdev needs the serialization machinery when both conditions hold:
 * 1. it is a multi-queue device flagged WriteMostly, and
 * 2. write-behind mode is enabled for the array.
 */
0199 static int rdev_need_serial(struct md_rdev *rdev)
0200 {
0201 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
0202 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
0203 test_bit(WriteMostly, &rdev->flags));
0204 }

/*
 * Init the serial resources for rdev(s) and create serial_info_pool if:
 * 1. rdev is the first device that needs serialization, or
 * 2. rdev is NULL, meaning serialization is being enabled for all rdevs.
 */
0211 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
0212 bool is_suspend)
0213 {
0214 int ret = 0;
0215
0216 if (rdev && !rdev_need_serial(rdev) &&
0217 !test_bit(CollisionCheck, &rdev->flags))
0218 return;
0219
0220 if (!is_suspend)
0221 mddev_suspend(mddev);
0222
0223 if (!rdev)
0224 ret = rdevs_init_serial(mddev);
0225 else
0226 ret = rdev_init_serial(rdev);
0227 if (ret)
0228 goto abort;
0229
0230 if (mddev->serial_info_pool == NULL) {
		/*
		 * We are already inside a memalloc_noio scope (set up by
		 * mddev_suspend() above), so this allocation cannot recurse
		 * into filesystem or MD I/O.
		 */
0235 mddev->serial_info_pool =
0236 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
0237 sizeof(struct serial_info));
0238 if (!mddev->serial_info_pool) {
0239 rdevs_uninit_serial(mddev);
0240 pr_err("can't alloc memory pool for serialization\n");
0241 }
0242 }
0243
0244 abort:
0245 if (!is_suspend)
0246 mddev_resume(mddev);
0247 }

/*
 * Free the serial resources for rdev(s) and destroy serial_info_pool:
 * 1. when rdev is the last device flagged with CollisionCheck,
 * 2. when the bitmap is destroyed while serialize_policy is not enabled,
 * 3. for disable policy, only when no other rdev still needs the pool.
 */
0255 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
0256 bool is_suspend)
0257 {
0258 if (rdev && !test_bit(CollisionCheck, &rdev->flags))
0259 return;
0260
0261 if (mddev->serial_info_pool) {
0262 struct md_rdev *temp;
0263 int num = 0;
0264
0265 if (!is_suspend)
0266 mddev_suspend(mddev);
0267 rdev_for_each(temp, mddev) {
0268 if (!rdev) {
0269 if (!mddev->serialize_policy ||
0270 !rdev_need_serial(temp))
0271 rdev_uninit_serial(temp);
0272 else
0273 num++;
0274 } else if (temp != rdev &&
0275 test_bit(CollisionCheck, &temp->flags))
0276 num++;
0277 }
0278
0279 if (rdev)
0280 rdev_uninit_serial(rdev);
0281
0282 if (num)
0283 pr_info("The mempool could be used by other devices\n");
0284 else {
0285 mempool_destroy(mddev->serial_info_pool);
0286 mddev->serial_info_pool = NULL;
0287 }
0288 if (!is_suspend)
0289 mddev_resume(mddev);
0290 }
0291 }
0292
0293 static struct ctl_table_header *raid_table_header;
0294
0295 static struct ctl_table raid_table[] = {
0296 {
0297 .procname = "speed_limit_min",
0298 .data = &sysctl_speed_limit_min,
0299 .maxlen = sizeof(int),
0300 .mode = S_IRUGO|S_IWUSR,
0301 .proc_handler = proc_dointvec,
0302 },
0303 {
0304 .procname = "speed_limit_max",
0305 .data = &sysctl_speed_limit_max,
0306 .maxlen = sizeof(int),
0307 .mode = S_IRUGO|S_IWUSR,
0308 .proc_handler = proc_dointvec,
0309 },
0310 { }
0311 };
0312
0313 static struct ctl_table raid_dir_table[] = {
0314 {
0315 .procname = "raid",
0316 .maxlen = 0,
0317 .mode = S_IRUGO|S_IXUGO,
0318 .child = raid_table,
0319 },
0320 { }
0321 };
0322
0323 static struct ctl_table raid_root_table[] = {
0324 {
0325 .procname = "dev",
0326 .maxlen = 0,
0327 .mode = 0555,
0328 .child = raid_dir_table,
0329 },
0330 { }
0331 };
0332
0333 static int start_readonly;

/*
 * The original way to create an md device is to create a node in /dev
 * and open it, which races against device close; the preferred method
 * is to write to the "new_array" module parameter.  create_on_open
 * keeps the legacy create-on-open behaviour enabled.
 */
0343 static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented on any
 * 'interesting' event, and readers of /proc/mdstat can use 'poll' or
 * 'select' to find out when the event count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
0355 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
0356 static atomic_t md_event_count;
0357 void md_new_event(void)
0358 {
0359 atomic_inc(&md_event_count);
0360 wake_up(&md_event_waiters);
0361 }
0362 EXPORT_SYMBOL_GPL(md_new_event);

/*
 * all_mddevs is the list of every mddev in the system, protected by
 * all_mddevs_lock; it allows iteration over all existing arrays.
 */
0368 static LIST_HEAD(all_mddevs);
0369 static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * Rather than calling a personality's make_request function directly,
 * I/O first comes through md_handle_request() so we can check whether
 * the device is being suspended pending a reconfiguration.  We hold a
 * reference (active_io) over the call to ->make_request; by the time
 * that call returns the bio is linked into internal structures and is
 * visible to ->quiesce(), so the reference is no longer needed.
 */
0378 static bool is_suspended(struct mddev *mddev, struct bio *bio)
0379 {
0380 if (mddev->suspended)
0381 return true;
0382 if (bio_data_dir(bio) != WRITE)
0383 return false;
0384 if (mddev->suspend_lo >= mddev->suspend_hi)
0385 return false;
0386 if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
0387 return false;
0388 if (bio_end_sector(bio) < mddev->suspend_lo)
0389 return false;
0390 return true;
0391 }
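
/*
 * Example of the range check above (illustrative numbers only): with
 * suspend_lo = 1000 and suspend_hi = 2000, a WRITE covering sectors
 * 1500..1600 is held until the array is resumed or the suspended range
 * changes, while a WRITE starting at sector 2000 or any READ proceeds
 * immediately (assuming the array is not fully suspended).
 * suspend_lo >= suspend_hi means no range is suspended.
 */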
0392
0393 void md_handle_request(struct mddev *mddev, struct bio *bio)
0394 {
0395 check_suspended:
0396 rcu_read_lock();
0397 if (is_suspended(mddev, bio)) {
0398 DEFINE_WAIT(__wait);
0399
0400 if (bio->bi_opf & REQ_NOWAIT) {
0401 rcu_read_unlock();
0402 bio_wouldblock_error(bio);
0403 return;
0404 }
0405 for (;;) {
0406 prepare_to_wait(&mddev->sb_wait, &__wait,
0407 TASK_UNINTERRUPTIBLE);
0408 if (!is_suspended(mddev, bio))
0409 break;
0410 rcu_read_unlock();
0411 schedule();
0412 rcu_read_lock();
0413 }
0414 finish_wait(&mddev->sb_wait, &__wait);
0415 }
0416 atomic_inc(&mddev->active_io);
0417 rcu_read_unlock();
0418
0419 if (!mddev->pers->make_request(mddev, bio)) {
0420 atomic_dec(&mddev->active_io);
0421 wake_up(&mddev->sb_wait);
0422 goto check_suspended;
0423 }
0424
0425 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
0426 wake_up(&mddev->sb_wait);
0427 }
0428 EXPORT_SYMBOL(md_handle_request);
0429
0430 static void md_submit_bio(struct bio *bio)
0431 {
0432 const int rw = bio_data_dir(bio);
0433 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
0434
0435 if (mddev == NULL || mddev->pers == NULL) {
0436 bio_io_error(bio);
0437 return;
0438 }
0439
0440 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
0441 bio_io_error(bio);
0442 return;
0443 }
0444
	bio = bio_split_to_limits(bio);
	/* bio_split_to_limits() may have already ended the bio on error */
	if (!bio)
		return;

0447 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
0448 if (bio_sectors(bio) != 0)
0449 bio->bi_status = BLK_STS_IOERR;
0450 bio_endio(bio);
0451 return;
0452 }
0453
0454
0455 bio->bi_opf &= ~REQ_NOMERGE;
0456
0457 md_handle_request(mddev, bio);
0458 }

/*
 * mddev_suspend() makes sure no new requests are submitted to the array,
 * and that any requests that have already been submitted are completely
 * handled before it returns.  Once mddev_detach() is called and
 * completes, the module is completely unused.
 */
0466 void mddev_suspend(struct mddev *mddev)
0467 {
0468 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
0469 lockdep_assert_held(&mddev->reconfig_mutex);
0470 if (mddev->suspended++)
0471 return;
0472 synchronize_rcu();
0473 wake_up(&mddev->sb_wait);
0474 set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
0475 smp_mb__after_atomic();
0476 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
0477 mddev->pers->quiesce(mddev, 1);
0478 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
0479 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
0480
0481 del_timer_sync(&mddev->safemode_timer);
0482
0483 mddev->noio_flag = memalloc_noio_save();
0484 }
0485 EXPORT_SYMBOL_GPL(mddev_suspend);
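
/*
 * Note: ->suspended is a depth counter, so mddev_suspend()/mddev_resume()
 * calls nest and must stay balanced; both run with reconfig_mutex held.
 * The memalloc_noio_save() above is undone by mddev_resume(), so memory
 * reclaim cannot recurse into I/O while the array is suspended.
 */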
0486
0487 void mddev_resume(struct mddev *mddev)
0488 {
0489
0490 memalloc_noio_restore(mddev->noio_flag);
0491 lockdep_assert_held(&mddev->reconfig_mutex);
0492 if (--mddev->suspended)
0493 return;
0494 wake_up(&mddev->sb_wait);
0495 mddev->pers->quiesce(mddev, 0);
0496
0497 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
0498 md_wakeup_thread(mddev->thread);
0499 md_wakeup_thread(mddev->sync_thread);
0500 }
0501 EXPORT_SYMBOL_GPL(mddev_resume);

/*
 * Generic flush handling for md.
 */

0507 static void md_end_flush(struct bio *bio)
0508 {
0509 struct md_rdev *rdev = bio->bi_private;
0510 struct mddev *mddev = rdev->mddev;
0511
0512 rdev_dec_pending(rdev, mddev);
0513
0514 if (atomic_dec_and_test(&mddev->flush_pending)) {
0515
0516 queue_work(md_wq, &mddev->flush_work);
0517 }
0518 bio_put(bio);
0519 }
0520
0521 static void md_submit_flush_data(struct work_struct *ws);
0522
0523 static void submit_flushes(struct work_struct *ws)
0524 {
0525 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
0526 struct md_rdev *rdev;
0527
0528 mddev->start_flush = ktime_get_boottime();
0529 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
0530 atomic_set(&mddev->flush_pending, 1);
0531 rcu_read_lock();
0532 rdev_for_each_rcu(rdev, mddev)
0533 if (rdev->raid_disk >= 0 &&
0534 !test_bit(Faulty, &rdev->flags)) {
			/*
			 * Take two references to the rdev: one is dropped
			 * when the flush bio completes, the other after we
			 * re-take rcu_read_lock() below.
			 */
0539 struct bio *bi;
0540 atomic_inc(&rdev->nr_pending);
0541 atomic_inc(&rdev->nr_pending);
0542 rcu_read_unlock();
0543 bi = bio_alloc_bioset(rdev->bdev, 0,
0544 REQ_OP_WRITE | REQ_PREFLUSH,
0545 GFP_NOIO, &mddev->bio_set);
0546 bi->bi_end_io = md_end_flush;
0547 bi->bi_private = rdev;
0548 atomic_inc(&mddev->flush_pending);
0549 submit_bio(bi);
0550 rcu_read_lock();
0551 rdev_dec_pending(rdev, mddev);
0552 }
0553 rcu_read_unlock();
0554 if (atomic_dec_and_test(&mddev->flush_pending))
0555 queue_work(md_wq, &mddev->flush_work);
0556 }
0557
0558 static void md_submit_flush_data(struct work_struct *ws)
0559 {
0560 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
0561 struct bio *bio = mddev->flush_bio;

	/*
	 * flush_bio must be reset before calling into md_handle_request():
	 * other bios held at md_handle_request()'s suspend check may be
	 * waiting for this flush to finish, and the request below could in
	 * turn wait for those bios, so clearing it first avoids a deadlock.
	 */
0569 spin_lock_irq(&mddev->lock);
0570 mddev->prev_flush_start = mddev->start_flush;
0571 mddev->flush_bio = NULL;
0572 spin_unlock_irq(&mddev->lock);
0573 wake_up(&mddev->sb_wait);
0574
0575 if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
0577 bio_endio(bio);
0578 } else {
0579 bio->bi_opf &= ~REQ_PREFLUSH;
0580 md_handle_request(mddev, bio);
0581 }
0582 }

/*
 * Manage consolidation of flushes and submit any flush needed for a bio
 * carrying REQ_PREFLUSH.  Returns true if the bio is finished or will be
 * handled entirely by the flush machinery; returns false if the caller
 * should go on to submit the data part itself, with REQ_PREFLUSH now
 * cleared from the bio.
 */
0590 bool md_flush_request(struct mddev *mddev, struct bio *bio)
0591 {
0592 ktime_t req_start = ktime_get_boottime();
0593 spin_lock_irq(&mddev->lock);
	/*
	 * Flush requests wait until any ongoing flush completes, which
	 * coalesces all the pending flush requests into one.
	 */
0597 wait_event_lock_irq(mddev->sb_wait,
0598 !mddev->flush_bio ||
0599 ktime_before(req_start, mddev->prev_flush_start),
0600 mddev->lock);
0601
0602 if (ktime_after(req_start, mddev->prev_flush_start)) {
0603 WARN_ON(mddev->flush_bio);
0604 mddev->flush_bio = bio;
0605 bio = NULL;
0606 }
0607 spin_unlock_irq(&mddev->lock);
0608
0609 if (!bio) {
0610 INIT_WORK(&mddev->flush_work, submit_flushes);
0611 queue_work(md_wq, &mddev->flush_work);
0612 } else {
		/* flush was performed for some other bio while we waited */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
0616 bio_endio(bio);
0617 else {
0618 bio->bi_opf &= ~REQ_PREFLUSH;
0619 return false;
0620 }
0621 }
0622 return true;
0623 }
0624 EXPORT_SYMBOL(md_flush_request);
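
/*
 * Sketch of how a personality's make_request method is expected to use
 * md_flush_request() (illustrative, not copied from any one personality):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;	// flush queued or already satisfied
 *	// otherwise REQ_PREFLUSH has been cleared; handle the data payload
 */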
0625
0626 static inline struct mddev *mddev_get(struct mddev *mddev)
0627 {
0628 lockdep_assert_held(&all_mddevs_lock);
0629
0630 if (test_bit(MD_DELETED, &mddev->flags))
0631 return NULL;
0632 atomic_inc(&mddev->active);
0633 return mddev;
0634 }
0635
0636 static void mddev_delayed_delete(struct work_struct *ws);
0637
0638 void mddev_put(struct mddev *mddev)
0639 {
0640 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
0641 return;
0642 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
0643 mddev->ctime == 0 && !mddev->hold_active) {
		/*
		 * The array is not configured at all and not held active,
		 * so destroy it.
		 */
		set_bit(MD_DELETED, &mddev->flags);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
0653 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
0654 queue_work(md_misc_wq, &mddev->del_work);
0655 }
0656 spin_unlock(&all_mddevs_lock);
0657 }
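
/*
 * mddev_get()/mddev_put() manage the 'active' refcount.  mddev_get() must
 * be called under all_mddevs_lock and fails once MD_DELETED is set; when
 * the last reference is dropped on an unconfigured, unheld array the
 * actual teardown is deferred to a workqueue (mddev_delayed_delete) so it
 * never runs under all_mddevs_lock.
 */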
0658
0659 static void md_safemode_timeout(struct timer_list *t);
0660
0661 void mddev_init(struct mddev *mddev)
0662 {
0663 mutex_init(&mddev->open_mutex);
0664 mutex_init(&mddev->reconfig_mutex);
0665 mutex_init(&mddev->bitmap_info.mutex);
0666 INIT_LIST_HEAD(&mddev->disks);
0667 INIT_LIST_HEAD(&mddev->all_mddevs);
0668 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
0669 atomic_set(&mddev->active, 1);
0670 atomic_set(&mddev->openers, 0);
0671 atomic_set(&mddev->active_io, 0);
0672 spin_lock_init(&mddev->lock);
0673 atomic_set(&mddev->flush_pending, 0);
0674 init_waitqueue_head(&mddev->sb_wait);
0675 init_waitqueue_head(&mddev->recovery_wait);
0676 mddev->reshape_position = MaxSector;
0677 mddev->reshape_backwards = 0;
0678 mddev->last_sync_action = "none";
0679 mddev->resync_min = 0;
0680 mddev->resync_max = MaxSector;
0681 mddev->level = LEVEL_NONE;
0682 }
0683 EXPORT_SYMBOL_GPL(mddev_init);
0684
0685 static struct mddev *mddev_find_locked(dev_t unit)
0686 {
0687 struct mddev *mddev;
0688
0689 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
0690 if (mddev->unit == unit)
0691 return mddev;
0692
0693 return NULL;
0694 }
0695
0696
0697 static dev_t mddev_alloc_unit(void)
0698 {
0699 static int next_minor = 512;
0700 int start = next_minor;
0701 bool is_free = 0;
0702 dev_t dev = 0;
0703
0704 while (!is_free) {
0705 dev = MKDEV(MD_MAJOR, next_minor);
0706 next_minor++;
0707 if (next_minor > MINORMASK)
0708 next_minor = 0;
0709 if (next_minor == start)
0710 return 0;
0711 is_free = !mddev_find_locked(dev);
0712 }
0713
0714 return dev;
0715 }
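
/*
 * Unit numbers for arrays created without an explicit dev_t start at
 * minor 512 and wrap around MINORMASK, skipping minors that are already
 * in use; a return value of 0 means the minor space is exhausted.
 */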
0716
0717 static struct mddev *mddev_alloc(dev_t unit)
0718 {
0719 struct mddev *new;
0720 int error;
0721
0722 if (unit && MAJOR(unit) != MD_MAJOR)
0723 unit &= ~((1 << MdpMinorShift) - 1);
0724
0725 new = kzalloc(sizeof(*new), GFP_KERNEL);
0726 if (!new)
0727 return ERR_PTR(-ENOMEM);
0728 mddev_init(new);
0729
0730 spin_lock(&all_mddevs_lock);
0731 if (unit) {
0732 error = -EEXIST;
0733 if (mddev_find_locked(unit))
0734 goto out_free_new;
0735 new->unit = unit;
0736 if (MAJOR(unit) == MD_MAJOR)
0737 new->md_minor = MINOR(unit);
0738 else
0739 new->md_minor = MINOR(unit) >> MdpMinorShift;
0740 new->hold_active = UNTIL_IOCTL;
0741 } else {
0742 error = -ENODEV;
0743 new->unit = mddev_alloc_unit();
0744 if (!new->unit)
0745 goto out_free_new;
0746 new->md_minor = MINOR(new->unit);
0747 new->hold_active = UNTIL_STOP;
0748 }
0749
0750 list_add(&new->all_mddevs, &all_mddevs);
0751 spin_unlock(&all_mddevs_lock);
0752 return new;
0753 out_free_new:
0754 spin_unlock(&all_mddevs_lock);
0755 kfree(new);
0756 return ERR_PTR(error);
0757 }
0758
0759 static void mddev_free(struct mddev *mddev)
0760 {
0761 spin_lock(&all_mddevs_lock);
0762 list_del(&mddev->all_mddevs);
0763 spin_unlock(&all_mddevs_lock);
0764
0765 kfree(mddev);
0766 }
0767
0768 static const struct attribute_group md_redundancy_group;
0769
0770 void mddev_unlock(struct mddev *mddev)
0771 {
0772 if (mddev->to_remove) {
		/*
		 * These sysfs groups cannot be removed while reconfig_mutex
		 * is held: a concurrent sysfs store method may be blocked on
		 * the mutex, and removing the group would then deadlock in
		 * kernfs.  So remember what has to go, mark sysfs_active so
		 * other reconfiguration paths know removal is in progress,
		 * drop the mutex, and only then remove the groups.
		 */
0785 const struct attribute_group *to_remove = mddev->to_remove;
0786 mddev->to_remove = NULL;
0787 mddev->sysfs_active = 1;
0788 mutex_unlock(&mddev->reconfig_mutex);
0789
0790 if (mddev->kobj.sd) {
0791 if (to_remove != &md_redundancy_group)
0792 sysfs_remove_group(&mddev->kobj, to_remove);
0793 if (mddev->pers == NULL ||
0794 mddev->pers->sync_request == NULL) {
0795 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
0796 if (mddev->sysfs_action)
0797 sysfs_put(mddev->sysfs_action);
0798 if (mddev->sysfs_completed)
0799 sysfs_put(mddev->sysfs_completed);
0800 if (mddev->sysfs_degraded)
0801 sysfs_put(mddev->sysfs_degraded);
0802 mddev->sysfs_action = NULL;
0803 mddev->sysfs_completed = NULL;
0804 mddev->sysfs_degraded = NULL;
0805 }
0806 }
0807 mddev->sysfs_active = 0;
0808 } else
0809 mutex_unlock(&mddev->reconfig_mutex);

	/*
	 * As we've dropped the mutex we need a spinlock to make sure the
	 * thread doesn't disappear while we wake it up.
	 */
0814 spin_lock(&pers_lock);
0815 md_wakeup_thread(mddev->thread);
0816 wake_up(&mddev->sb_wait);
0817 spin_unlock(&pers_lock);
0818 }
0819 EXPORT_SYMBOL_GPL(mddev_unlock);
0820
0821 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
0822 {
0823 struct md_rdev *rdev;
0824
0825 rdev_for_each_rcu(rdev, mddev)
0826 if (rdev->desc_nr == nr)
0827 return rdev;
0828
0829 return NULL;
0830 }
0831 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
0832
0833 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
0834 {
0835 struct md_rdev *rdev;
0836
0837 rdev_for_each(rdev, mddev)
0838 if (rdev->bdev->bd_dev == dev)
0839 return rdev;
0840
0841 return NULL;
0842 }
0843
0844 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
0845 {
0846 struct md_rdev *rdev;
0847
0848 rdev_for_each_rcu(rdev, mddev)
0849 if (rdev->bdev->bd_dev == dev)
0850 return rdev;
0851
0852 return NULL;
0853 }
0854 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
0855
0856 static struct md_personality *find_pers(int level, char *clevel)
0857 {
0858 struct md_personality *pers;
0859 list_for_each_entry(pers, &pers_list, list) {
0860 if (level != LEVEL_NONE && pers->level == level)
0861 return pers;
0862 if (strcmp(pers->name, clevel)==0)
0863 return pers;
0864 }
0865 return NULL;
0866 }
0867
0868
0869 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
0870 {
0871 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
0872 }
0873
0874 static int alloc_disk_sb(struct md_rdev *rdev)
0875 {
0876 rdev->sb_page = alloc_page(GFP_KERNEL);
0877 if (!rdev->sb_page)
0878 return -ENOMEM;
0879 return 0;
0880 }
0881
0882 void md_rdev_clear(struct md_rdev *rdev)
0883 {
0884 if (rdev->sb_page) {
0885 put_page(rdev->sb_page);
0886 rdev->sb_loaded = 0;
0887 rdev->sb_page = NULL;
0888 rdev->sb_start = 0;
0889 rdev->sectors = 0;
0890 }
0891 if (rdev->bb_page) {
0892 put_page(rdev->bb_page);
0893 rdev->bb_page = NULL;
0894 }
0895 badblocks_exit(&rdev->badblocks);
0896 }
0897 EXPORT_SYMBOL_GPL(md_rdev_clear);
0898
0899 static void super_written(struct bio *bio)
0900 {
0901 struct md_rdev *rdev = bio->bi_private;
0902 struct mddev *mddev = rdev->mddev;
0903
0904 if (bio->bi_status) {
0905 pr_err("md: %s gets error=%d\n", __func__,
0906 blk_status_to_errno(bio->bi_status));
0907 md_error(mddev, rdev);
0908 if (!test_bit(Faulty, &rdev->flags)
0909 && (bio->bi_opf & MD_FAILFAST)) {
0910 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
0911 set_bit(LastDev, &rdev->flags);
0912 }
0913 } else
0914 clear_bit(LastDev, &rdev->flags);
0915
0916 if (atomic_dec_and_test(&mddev->pending_writes))
0917 wake_up(&mddev->sb_wait);
0918 rdev_dec_pending(rdev, mddev);
0919 bio_put(bio);
0920 }
0921
0922 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
0923 sector_t sector, int size, struct page *page)
0924 {
	/*
	 * Write the first 'size' bytes of 'page' to 'sector' of rdev.
	 * Increment mddev->pending_writes before returning and decrement it
	 * on completion, waking up sb_wait when it reaches zero.  If a write
	 * error occurs, md_error() is called from the completion handler.
	 */
0931 struct bio *bio;
0932
0933 if (!page)
0934 return;
0935
0936 if (test_bit(Faulty, &rdev->flags))
0937 return;
0938
0939 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
0940 1,
0941 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
0942 GFP_NOIO, &mddev->sync_set);
0943
0944 atomic_inc(&rdev->nr_pending);
0945
0946 bio->bi_iter.bi_sector = sector;
0947 bio_add_page(bio, page, size, 0);
0948 bio->bi_private = rdev;
0949 bio->bi_end_io = super_written;
0950
0951 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
0952 test_bit(FailFast, &rdev->flags) &&
0953 !test_bit(LastDev, &rdev->flags))
0954 bio->bi_opf |= MD_FAILFAST;
0955
0956 atomic_inc(&mddev->pending_writes);
0957 submit_bio(bio);
0958 }
0959
0960 int md_super_wait(struct mddev *mddev)
0961 {
	/* wait for all superblock writes that were scheduled to complete */
0963 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
0964 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
0965 return -EAGAIN;
0966 return 0;
0967 }
0968
0969 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
0970 struct page *page, blk_opf_t opf, bool metadata_op)
0971 {
0972 struct bio bio;
0973 struct bio_vec bvec;
0974
0975 if (metadata_op && rdev->meta_bdev)
0976 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
0977 else
0978 bio_init(&bio, rdev->bdev, &bvec, 1, opf);
0979
0980 if (metadata_op)
0981 bio.bi_iter.bi_sector = sector + rdev->sb_start;
0982 else if (rdev->mddev->reshape_position != MaxSector &&
0983 (rdev->mddev->reshape_backwards ==
0984 (sector >= rdev->mddev->reshape_position)))
0985 bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
0986 else
0987 bio.bi_iter.bi_sector = sector + rdev->data_offset;
0988 bio_add_page(&bio, page, size, 0);
0989
0990 submit_bio_wait(&bio);
0991
0992 return !bio.bi_status;
0993 }
0994 EXPORT_SYMBOL_GPL(sync_page_io);
0995
0996 static int read_disk_sb(struct md_rdev *rdev, int size)
0997 {
0998 if (rdev->sb_loaded)
0999 return 0;
1000
1001 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1002 goto fail;
1003 rdev->sb_loaded = 1;
1004 return 0;
1005
1006 fail:
1007 pr_err("md: disabled device %pg, could not read superblock.\n",
1008 rdev->bdev);
1009 return -EINVAL;
1010 }
1011
1012 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1013 {
1014 return sb1->set_uuid0 == sb2->set_uuid0 &&
1015 sb1->set_uuid1 == sb2->set_uuid1 &&
1016 sb1->set_uuid2 == sb2->set_uuid2 &&
1017 sb1->set_uuid3 == sb2->set_uuid3;
1018 }
1019
1020 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1021 {
1022 int ret;
1023 mdp_super_t *tmp1, *tmp2;
1024
1025 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1026 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1027
1028 if (!tmp1 || !tmp2) {
1029 ret = 0;
1030 goto abort;
1031 }
1032
1033 *tmp1 = *sb1;
1034 *tmp2 = *sb2;

	/*
	 * nr_disks is not constant, so ignore it in the comparison
	 */
1039 tmp1->nr_disks = 0;
1040 tmp2->nr_disks = 0;
1041
1042 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1043 abort:
1044 kfree(tmp1);
1045 kfree(tmp2);
1046 return ret;
1047 }
1048
1049 static u32 md_csum_fold(u32 csum)
1050 {
1051 csum = (csum & 0xffff) + (csum >> 16);
1052 return (csum & 0xffff) + (csum >> 16);
1053 }
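
/*
 * md_csum_fold() reduces a 32-bit sum to 16 bits with end-around carry,
 * e.g. 0x0001ffff folds to 0x0001.  Comparing folded values keeps 0.90
 * superblock checksum checks consistent across architectures that
 * historically computed sb_csum differently (see super_90_load()).
 */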
1054
1055 static unsigned int calc_sb_csum(mdp_super_t *sb)
1056 {
1057 u64 newcsum = 0;
1058 u32 *sb32 = (u32*)sb;
1059 int i;
1060 unsigned int disk_csum, csum;
1061
1062 disk_csum = sb->sb_csum;
1063 sb->sb_csum = 0;
1064
1065 for (i = 0; i < MD_SB_BYTES/4 ; i++)
1066 newcsum += sb32[i];
1067 csum = (newcsum & 0xffffffff) + (newcsum>>32);
1068
1069 #ifdef CONFIG_ALPHA
	/*
	 * This used to use csum_partial, which was wrong for several reasons,
	 * including that different results are returned on different
	 * architectures.  It isn't critical that we get exactly the same
	 * return value as before (we always csum_fold before testing, which
	 * removes any differences), but as csum_partial always returned a
	 * 16-bit value on alphas, do a fold here to maximise conformity with
	 * previous behaviour.
	 */
1078 sb->sb_csum = md_csum_fold(disk_csum);
1079 #else
1080 sb->sb_csum = disk_csum;
1081 #endif
1082 return csum;
1083 }

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats, so we have a
 * common interface to them all and an array of different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *      The first time, mddev->raid_disks will be 0, and data from dev
 *      should be merged in.  Subsequent calls check that dev is new
 *      enough.  Return 0 or -EINVAL.
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *      Update the superblock for rdev with data in mddev.
 *      This does not write to disc.
 */
1115 struct super_type {
1116 char *name;
1117 struct module *owner;
1118 int (*load_super)(struct md_rdev *rdev,
1119 struct md_rdev *refdev,
1120 int minor_version);
1121 int (*validate_super)(struct mddev *mddev,
1122 struct md_rdev *rdev);
1123 void (*sync_super)(struct mddev *mddev,
1124 struct md_rdev *rdev);
1125 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1126 sector_t num_sectors);
1127 int (*allow_new_offset)(struct md_rdev *rdev,
1128 unsigned long long new_offset);
1129 };

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that
 * do not support bitmaps.  It prints an error message and returns non-zero
 * if mddev has a bitmap; otherwise it returns 0.
 */
1139 int md_check_no_bitmap(struct mddev *mddev)
1140 {
1141 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1142 return 0;
1143 pr_warn("%s: bitmaps are not supported for %s\n",
1144 mdname(mddev), mddev->pers->name);
1145 return 1;
1146 }
1147 EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
1152 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1153 {
1154 mdp_super_t *sb;
1155 int ret;
1156 bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (in 512-byte sectors):
	 * it is at the end of the device, aligned so that it also happens
	 * to land on a 4KiB boundary.
	 */
1164 rdev->sb_start = calc_dev_sboffset(rdev);
1165
1166 ret = read_disk_sb(rdev, MD_SB_BYTES);
1167 if (ret)
1168 return ret;
1169
1170 ret = -EINVAL;
1171
1172 sb = page_address(rdev->sb_page);
1173
1174 if (sb->md_magic != MD_SB_MAGIC) {
1175 pr_warn("md: invalid raid superblock magic on %pg\n",
1176 rdev->bdev);
1177 goto abort;
1178 }
1179
1180 if (sb->major_version != 0 ||
1181 sb->minor_version < 90 ||
1182 sb->minor_version > 91) {
1183 pr_warn("Bad version number %d.%d on %pg\n",
1184 sb->major_version, sb->minor_version, rdev->bdev);
1185 goto abort;
1186 }
1187
1188 if (sb->raid_disks <= 0)
1189 goto abort;
1190
1191 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1192 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1193 goto abort;
1194 }
1195
1196 rdev->preferred_minor = sb->md_minor;
1197 rdev->data_offset = 0;
1198 rdev->new_data_offset = 0;
1199 rdev->sb_size = MD_SB_BYTES;
1200 rdev->badblocks.shift = -1;
1201
1202 if (sb->level == LEVEL_MULTIPATH)
1203 rdev->desc_nr = -1;
1204 else
1205 rdev->desc_nr = sb->this_disk.number;
1206
1207
1208 if (sb->level == LEVEL_MULTIPATH ||
1209 (rdev->desc_nr >= 0 &&
1210 rdev->desc_nr < MD_SB_DISKS &&
1211 sb->disks[rdev->desc_nr].state &
1212 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
1213 spare_disk = false;
1214
1215 if (!refdev) {
1216 if (!spare_disk)
1217 ret = 1;
1218 else
1219 ret = 0;
1220 } else {
1221 __u64 ev1, ev2;
1222 mdp_super_t *refsb = page_address(refdev->sb_page);
1223 if (!md_uuid_equal(refsb, sb)) {
1224 pr_warn("md: %pg has different UUID to %pg\n",
1225 rdev->bdev, refdev->bdev);
1226 goto abort;
1227 }
1228 if (!md_sb_equal(refsb, sb)) {
1229 pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1230 rdev->bdev, refdev->bdev);
1231 goto abort;
1232 }
1233 ev1 = md_event(sb);
1234 ev2 = md_event(refsb);
1235
1236 if (!spare_disk && ev1 > ev2)
1237 ret = 1;
1238 else
1239 ret = 0;
1240 }
1241 rdev->sectors = rdev->sb_start;
	/* Limit the device size to 4TB: 0.90 metadata cannot record more
	 * than that (not needed for Linear and RAID0, which do not record
	 * this size in the superblock).
	 */
1246 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1247 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1248
1249 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1250
1251 ret = -EINVAL;
1252
1253 abort:
1254 return ret;
1255 }

/*
 * validate_super for 0.90.0
 */
1260 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1261 {
1262 mdp_disk_t *desc;
1263 mdp_super_t *sb = page_address(rdev->sb_page);
1264 __u64 ev1 = md_event(sb);
1265
1266 rdev->raid_disk = -1;
1267 clear_bit(Faulty, &rdev->flags);
1268 clear_bit(In_sync, &rdev->flags);
1269 clear_bit(Bitmap_sync, &rdev->flags);
1270 clear_bit(WriteMostly, &rdev->flags);
1271
1272 if (mddev->raid_disks == 0) {
1273 mddev->major_version = 0;
1274 mddev->minor_version = sb->minor_version;
1275 mddev->patch_version = sb->patch_version;
1276 mddev->external = 0;
1277 mddev->chunk_sectors = sb->chunk_size >> 9;
1278 mddev->ctime = sb->ctime;
1279 mddev->utime = sb->utime;
1280 mddev->level = sb->level;
1281 mddev->clevel[0] = 0;
1282 mddev->layout = sb->layout;
1283 mddev->raid_disks = sb->raid_disks;
1284 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1285 mddev->events = ev1;
1286 mddev->bitmap_info.offset = 0;
1287 mddev->bitmap_info.space = 0;
		/* The bitmap can use the 60K following the 4K superblock */
1289 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1290 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1291 mddev->reshape_backwards = 0;
1292
1293 if (mddev->minor_version >= 91) {
1294 mddev->reshape_position = sb->reshape_position;
1295 mddev->delta_disks = sb->delta_disks;
1296 mddev->new_level = sb->new_level;
1297 mddev->new_layout = sb->new_layout;
1298 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1299 if (mddev->delta_disks < 0)
1300 mddev->reshape_backwards = 1;
1301 } else {
1302 mddev->reshape_position = MaxSector;
1303 mddev->delta_disks = 0;
1304 mddev->new_level = mddev->level;
1305 mddev->new_layout = mddev->layout;
1306 mddev->new_chunk_sectors = mddev->chunk_sectors;
1307 }
1308 if (mddev->level == 0)
1309 mddev->layout = -1;
1310
1311 if (sb->state & (1<<MD_SB_CLEAN))
1312 mddev->recovery_cp = MaxSector;
1313 else {
1314 if (sb->events_hi == sb->cp_events_hi &&
1315 sb->events_lo == sb->cp_events_lo) {
1316 mddev->recovery_cp = sb->recovery_cp;
1317 } else
1318 mddev->recovery_cp = 0;
1319 }
1320
1321 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1322 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1323 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1324 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1325
1326 mddev->max_disks = MD_SB_DISKS;
1327
1328 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1329 mddev->bitmap_info.file == NULL) {
1330 mddev->bitmap_info.offset =
1331 mddev->bitmap_info.default_offset;
1332 mddev->bitmap_info.space =
1333 mddev->bitmap_info.default_space;
1334 }
1335
1336 } else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except
		 * for spares (which don't need an event count). */
1339 ++ev1;
1340 if (sb->disks[rdev->desc_nr].state & (
1341 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1342 if (ev1 < mddev->events)
1343 return -EINVAL;
1344 } else if (mddev->bitmap) {
		/* If adding to an array with a bitmap, then we can accept
		 * an older device, but not one that is too old.
		 */
1348 if (ev1 < mddev->bitmap->events_cleared)
1349 return 0;
1350 if (ev1 < mddev->events)
1351 set_bit(Bitmap_sync, &rdev->flags);
1352 } else {
1353 if (ev1 < mddev->events)
1354
1355 return 0;
1356 }
1357
1358 if (mddev->level != LEVEL_MULTIPATH) {
1359 desc = sb->disks + rdev->desc_nr;
1360
1361 if (desc->state & (1<<MD_DISK_FAULTY))
1362 set_bit(Faulty, &rdev->flags);
1363 else if (desc->state & (1<<MD_DISK_SYNC)
1364 ) {
1365 set_bit(In_sync, &rdev->flags);
1366 rdev->raid_disk = desc->raid_disk;
1367 rdev->saved_raid_disk = desc->raid_disk;
1368 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* Active but not in sync implies recovery up to the
			 * reshape position.  We don't know exactly where that
			 * is, so set it to zero for now.
			 */
1372 if (mddev->minor_version >= 91) {
1373 rdev->recovery_offset = 0;
1374 rdev->raid_disk = desc->raid_disk;
1375 }
1376 }
1377 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1378 set_bit(WriteMostly, &rdev->flags);
1379 if (desc->state & (1<<MD_DISK_FAILFAST))
1380 set_bit(FailFast, &rdev->flags);
1381 } else
1382 set_bit(In_sync, &rdev->flags);
1383 return 0;
1384 }

/*
 * sync_super for 0.90.0
 */
1389 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1390 {
1391 mdp_super_t *sb;
1392 struct md_rdev *rdev2;
1393 int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ add info for each disk, keeping track of the highest desc_nr
	 *    (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because we cannot be sure
	 * from the other fields whether it has been initialised or not.
	 */
1405 int i;
1406 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1407
1408 rdev->sb_size = MD_SB_BYTES;
1409
1410 sb = page_address(rdev->sb_page);
1411
1412 memset(sb, 0, sizeof(*sb));
1413
1414 sb->md_magic = MD_SB_MAGIC;
1415 sb->major_version = mddev->major_version;
1416 sb->patch_version = mddev->patch_version;
1417 sb->gvalid_words = 0;
1418 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1419 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1420 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1421 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1422
1423 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1424 sb->level = mddev->level;
1425 sb->size = mddev->dev_sectors / 2;
1426 sb->raid_disks = mddev->raid_disks;
1427 sb->md_minor = mddev->md_minor;
1428 sb->not_persistent = 0;
1429 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1430 sb->state = 0;
1431 sb->events_hi = (mddev->events>>32);
1432 sb->events_lo = (u32)mddev->events;
1433
1434 if (mddev->reshape_position == MaxSector)
1435 sb->minor_version = 90;
1436 else {
1437 sb->minor_version = 91;
1438 sb->reshape_position = mddev->reshape_position;
1439 sb->new_level = mddev->new_level;
1440 sb->delta_disks = mddev->delta_disks;
1441 sb->new_layout = mddev->new_layout;
1442 sb->new_chunk = mddev->new_chunk_sectors << 9;
1443 }
1444 mddev->minor_version = sb->minor_version;
1445 if (mddev->in_sync)
1446 {
1447 sb->recovery_cp = mddev->recovery_cp;
1448 sb->cp_events_hi = (mddev->events>>32);
1449 sb->cp_events_lo = (u32)mddev->events;
1450 if (mddev->recovery_cp == MaxSector)
1451 sb->state = (1<< MD_SB_CLEAN);
1452 } else
1453 sb->recovery_cp = 0;
1454
1455 sb->layout = mddev->layout;
1456 sb->chunk_size = mddev->chunk_sectors << 9;
1457
1458 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1459 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1460
1461 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1462 rdev_for_each(rdev2, mddev) {
1463 mdp_disk_t *d;
1464 int desc_nr;
1465 int is_active = test_bit(In_sync, &rdev2->flags);
1466
1467 if (rdev2->raid_disk >= 0 &&
1468 sb->minor_version >= 91)
			/* We have nowhere to store the recovery_offset, but
			 * if it is not below the reshape position we can
			 * piggy-back on that.
			 */
1473 is_active = 1;
1474 if (rdev2->raid_disk < 0 ||
1475 test_bit(Faulty, &rdev2->flags))
1476 is_active = 0;
1477 if (is_active)
1478 desc_nr = rdev2->raid_disk;
1479 else
1480 desc_nr = next_spare++;
1481 rdev2->desc_nr = desc_nr;
1482 d = &sb->disks[rdev2->desc_nr];
1483 nr_disks++;
1484 d->number = rdev2->desc_nr;
1485 d->major = MAJOR(rdev2->bdev->bd_dev);
1486 d->minor = MINOR(rdev2->bdev->bd_dev);
1487 if (is_active)
1488 d->raid_disk = rdev2->raid_disk;
1489 else
1490 d->raid_disk = rdev2->desc_nr;
1491 if (test_bit(Faulty, &rdev2->flags))
1492 d->state = (1<<MD_DISK_FAULTY);
1493 else if (is_active) {
1494 d->state = (1<<MD_DISK_ACTIVE);
1495 if (test_bit(In_sync, &rdev2->flags))
1496 d->state |= (1<<MD_DISK_SYNC);
1497 active++;
1498 working++;
1499 } else {
1500 d->state = 0;
1501 spare++;
1502 working++;
1503 }
1504 if (test_bit(WriteMostly, &rdev2->flags))
1505 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1506 if (test_bit(FailFast, &rdev2->flags))
1507 d->state |= (1<<MD_DISK_FAILFAST);
1508 }
1509
1510 for (i=0 ; i < mddev->raid_disks ; i++) {
1511 mdp_disk_t *d = &sb->disks[i];
1512 if (d->state == 0 && d->number == 0) {
1513 d->number = i;
1514 d->raid_disk = i;
1515 d->state = (1<<MD_DISK_REMOVED);
1516 d->state |= (1<<MD_DISK_FAULTY);
1517 failed++;
1518 }
1519 }
1520 sb->nr_disks = nr_disks;
1521 sb->active_disks = active;
1522 sb->working_disks = working;
1523 sb->failed_disks = failed;
1524 sb->spare_disks = spare;
1525
1526 sb->this_disk = sb->disks[rdev->desc_nr];
1527 sb->sb_csum = calc_sb_csum(sb);
1528 }

/*
 * rdev_size_change for 0.90.0
 */
1533 static unsigned long long
1534 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1535 {
1536 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1537 return 0;
1538 if (rdev->mddev->bitmap_info.offset)
1539 return 0;
1540 rdev->sb_start = calc_dev_sboffset(rdev);
1541 if (!num_sectors || num_sectors > rdev->sb_start)
1542 num_sectors = rdev->sb_start;
	/* Limit to 4TB as 0.90 metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
1546 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1547 num_sectors = (sector_t)(2ULL << 32) - 2;
1548 do {
1549 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1550 rdev->sb_page);
1551 } while (md_super_wait(rdev->mddev) < 0);
1552 return num_sectors;
1553 }
1554
1555 static int
1556 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1557 {
	/* non-zero data offsets are not possible with 0.90 metadata */
1559 return new_offset == 0;
1560 }

/*
 * version 1 superblock
 */
1566 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1567 {
1568 __le32 disk_csum;
1569 u32 csum;
1570 unsigned long long newcsum;
1571 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1572 __le32 *isuper = (__le32*)sb;
1573
1574 disk_csum = sb->sb_csum;
1575 sb->sb_csum = 0;
1576 newcsum = 0;
1577 for (; size >= 4; size -= 4)
1578 newcsum += le32_to_cpu(*isuper++);
1579
1580 if (size == 2)
1581 newcsum += le16_to_cpu(*(__le16*) isuper);
1582
1583 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1584 sb->sb_csum = disk_csum;
1585 return cpu_to_le32(csum);
1586 }
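
/*
 * The v1.x superblock checksum covers the 256-byte fixed header plus two
 * bytes of dev_roles[] per possible device (256 + max_dev * 2 bytes); the
 * trailing 2-byte remainder, if any, is folded in by the le16 step above.
 */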
1587
1588 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1589 {
1590 struct mdp_superblock_1 *sb;
1591 int ret;
1592 sector_t sb_start;
1593 sector_t sectors;
1594 int bmask;
1595 bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512-byte sectors.
	 * It is always aligned to a 4K boundary and, depending on
	 * minor_version, it can be:
	 * 0: at least 8K, but less than 12K, from the end of the device
	 * 1: at the start of the device
	 * 2: 4K from the start of the device
	 */
1605 switch(minor_version) {
1606 case 0:
1607 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1608 sb_start &= ~(sector_t)(4*2-1);
1609 break;
1610 case 1:
1611 sb_start = 0;
1612 break;
1613 case 2:
1614 sb_start = 8;
1615 break;
1616 default:
1617 return -EINVAL;
1618 }
1619 rdev->sb_start = sb_start;

	/* The superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4K, so do that.
	 */
1624 ret = read_disk_sb(rdev, 4096);
1625 if (ret) return ret;
1626
1627 sb = page_address(rdev->sb_page);
1628
1629 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1630 sb->major_version != cpu_to_le32(1) ||
1631 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1632 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1633 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1634 return -EINVAL;
1635
1636 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1637 pr_warn("md: invalid superblock checksum on %pg\n",
1638 rdev->bdev);
1639 return -EINVAL;
1640 }
1641 if (le64_to_cpu(sb->data_size) < 10) {
1642 pr_warn("md: data_size too small on %pg\n",
1643 rdev->bdev);
1644 return -EINVAL;
1645 }
1646 if (sb->pad0 ||
1647 sb->pad3[0] ||
1648 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1649
1650 return -EINVAL;
1651
1652 rdev->preferred_minor = 0xffff;
1653 rdev->data_offset = le64_to_cpu(sb->data_offset);
1654 rdev->new_data_offset = rdev->data_offset;
1655 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1656 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1657 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1658 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1659
1660 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1661 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1662 if (rdev->sb_size & bmask)
1663 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1664
1665 if (minor_version
1666 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1667 return -EINVAL;
1668 if (minor_version
1669 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1670 return -EINVAL;
1671
1672 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1673 rdev->desc_nr = -1;
1674 else
1675 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1676
1677 if (!rdev->bb_page) {
1678 rdev->bb_page = alloc_page(GFP_KERNEL);
1679 if (!rdev->bb_page)
1680 return -ENOMEM;
1681 }
1682 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1683 rdev->badblocks.count == 0) {
		/* Need to load the bad block list.
		 * Currently we limit it to one page.
		 */
1687 s32 offset;
1688 sector_t bb_sector;
1689 __le64 *bbp;
1690 int i;
1691 int sectors = le16_to_cpu(sb->bblog_size);
1692 if (sectors > (PAGE_SIZE / 512))
1693 return -EINVAL;
1694 offset = le32_to_cpu(sb->bblog_offset);
1695 if (offset == 0)
1696 return -EINVAL;
1697 bb_sector = (long long)offset;
1698 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1699 rdev->bb_page, REQ_OP_READ, true))
1700 return -EIO;
1701 bbp = (__le64 *)page_address(rdev->bb_page);
1702 rdev->badblocks.shift = sb->bblog_shift;
1703 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1704 u64 bb = le64_to_cpu(*bbp);
1705 int count = bb & (0x3ff);
1706 u64 sector = bb >> 10;
1707 sector <<= sb->bblog_shift;
1708 count <<= sb->bblog_shift;
1709 if (bb + 1 == 0)
1710 break;
1711 if (badblocks_set(&rdev->badblocks, sector, count, 1))
1712 return -EINVAL;
1713 }
1714 } else if (sb->bblog_offset != 0)
1715 rdev->badblocks.shift = 0;
1716
1717 if ((le32_to_cpu(sb->feature_map) &
1718 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1719 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1720 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1721 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1722 }
1723
1724 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1725 sb->level != 0)
1726 return -EINVAL;
1727
1728
1729 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
1730 (rdev->desc_nr >= 0 &&
1731 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1732 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1733 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
1734 spare_disk = false;
1735
1736 if (!refdev) {
1737 if (!spare_disk)
1738 ret = 1;
1739 else
1740 ret = 0;
1741 } else {
1742 __u64 ev1, ev2;
1743 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1744
1745 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1746 sb->level != refsb->level ||
1747 sb->layout != refsb->layout ||
1748 sb->chunksize != refsb->chunksize) {
1749 pr_warn("md: %pg has strangely different superblock to %pg\n",
1750 rdev->bdev,
1751 refdev->bdev);
1752 return -EINVAL;
1753 }
1754 ev1 = le64_to_cpu(sb->events);
1755 ev2 = le64_to_cpu(refsb->events);
1756
1757 if (!spare_disk && ev1 > ev2)
1758 ret = 1;
1759 else
1760 ret = 0;
1761 }
1762 if (minor_version)
1763 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1764 else
1765 sectors = rdev->sb_start;
1766 if (sectors < le64_to_cpu(sb->data_size))
1767 return -EINVAL;
1768 rdev->sectors = le64_to_cpu(sb->data_size);
1769 return ret;
1770 }
1771
1772 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1773 {
1774 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1775 __u64 ev1 = le64_to_cpu(sb->events);
1776
1777 rdev->raid_disk = -1;
1778 clear_bit(Faulty, &rdev->flags);
1779 clear_bit(In_sync, &rdev->flags);
1780 clear_bit(Bitmap_sync, &rdev->flags);
1781 clear_bit(WriteMostly, &rdev->flags);
1782
1783 if (mddev->raid_disks == 0) {
1784 mddev->major_version = 1;
1785 mddev->patch_version = 0;
1786 mddev->external = 0;
1787 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1788 mddev->ctime = le64_to_cpu(sb->ctime);
1789 mddev->utime = le64_to_cpu(sb->utime);
1790 mddev->level = le32_to_cpu(sb->level);
1791 mddev->clevel[0] = 0;
1792 mddev->layout = le32_to_cpu(sb->layout);
1793 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1794 mddev->dev_sectors = le64_to_cpu(sb->size);
1795 mddev->events = ev1;
1796 mddev->bitmap_info.offset = 0;
1797 mddev->bitmap_info.space = 0;
		/* The default bitmap location is 1K after the superblock,
		 * using up to 3K of the space before the data (4K total).
		 */
1801 mddev->bitmap_info.default_offset = 1024 >> 9;
1802 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1803 mddev->reshape_backwards = 0;
1804
1805 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1806 memcpy(mddev->uuid, sb->set_uuid, 16);
1807
1808 mddev->max_disks = (4096-256)/2;
1809
1810 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1811 mddev->bitmap_info.file == NULL) {
1812 mddev->bitmap_info.offset =
1813 (__s32)le32_to_cpu(sb->bitmap_offset);
			/* The metadata doesn't record how much space is
			 * available.  For 1.0 assume we can use up to the
			 * superblock if the bitmap sits before it, else up
			 * to 4K beyond the superblock.  For other versions
			 * assume no change is possible.
			 */
1819 if (mddev->minor_version > 0)
1820 mddev->bitmap_info.space = 0;
1821 else if (mddev->bitmap_info.offset > 0)
1822 mddev->bitmap_info.space =
1823 8 - mddev->bitmap_info.offset;
1824 else
1825 mddev->bitmap_info.space =
1826 -mddev->bitmap_info.offset;
1827 }
1828
1829 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1830 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1831 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1832 mddev->new_level = le32_to_cpu(sb->new_level);
1833 mddev->new_layout = le32_to_cpu(sb->new_layout);
1834 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1835 if (mddev->delta_disks < 0 ||
1836 (mddev->delta_disks == 0 &&
1837 (le32_to_cpu(sb->feature_map)
1838 & MD_FEATURE_RESHAPE_BACKWARDS)))
1839 mddev->reshape_backwards = 1;
1840 } else {
1841 mddev->reshape_position = MaxSector;
1842 mddev->delta_disks = 0;
1843 mddev->new_level = mddev->level;
1844 mddev->new_layout = mddev->layout;
1845 mddev->new_chunk_sectors = mddev->chunk_sectors;
1846 }
1847
1848 if (mddev->level == 0 &&
1849 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1850 mddev->layout = -1;
1851
1852 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1853 set_bit(MD_HAS_JOURNAL, &mddev->flags);
1854
1855 if (le32_to_cpu(sb->feature_map) &
1856 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1857 if (le32_to_cpu(sb->feature_map) &
1858 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1859 return -EINVAL;
1860 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1861 (le32_to_cpu(sb->feature_map) &
1862 MD_FEATURE_MULTIPLE_PPLS))
1863 return -EINVAL;
1864 set_bit(MD_HAS_PPL, &mddev->flags);
1865 }
1866 } else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except
		 * for spares (which don't need an event count). */
1869 ++ev1;
1870 if (rdev->desc_nr >= 0 &&
1871 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1872 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1873 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1874 if (ev1 < mddev->events)
1875 return -EINVAL;
1876 } else if (mddev->bitmap) {
		/* If adding to an array with a bitmap, then we can accept
		 * an older device, but not one that is too old.
		 */
1880 if (ev1 < mddev->bitmap->events_cleared)
1881 return 0;
1882 if (ev1 < mddev->events)
1883 set_bit(Bitmap_sync, &rdev->flags);
1884 } else {
1885 if (ev1 < mddev->events)
1886
1887 return 0;
1888 }
1889 if (mddev->level != LEVEL_MULTIPATH) {
1890 int role;
1891 if (rdev->desc_nr < 0 ||
1892 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1893 role = MD_DISK_ROLE_SPARE;
1894 rdev->desc_nr = -1;
1895 } else
1896 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1897 switch(role) {
1898 case MD_DISK_ROLE_SPARE:
1899 break;
1900 case MD_DISK_ROLE_FAULTY:
1901 set_bit(Faulty, &rdev->flags);
1902 break;
1903 case MD_DISK_ROLE_JOURNAL:
1904 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1905
1906 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1907 return -EINVAL;
1908 }
1909 set_bit(Journal, &rdev->flags);
1910 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1911 rdev->raid_disk = 0;
1912 break;
1913 default:
1914 rdev->saved_raid_disk = role;
1915 if ((le32_to_cpu(sb->feature_map) &
1916 MD_FEATURE_RECOVERY_OFFSET)) {
1917 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1918 if (!(le32_to_cpu(sb->feature_map) &
1919 MD_FEATURE_RECOVERY_BITMAP))
1920 rdev->saved_raid_disk = -1;
1921 } else {
				/*
				 * If the array is FROZEN, the device can't
				 * be in_sync with the rest of the array.
				 */
1926 if (!test_bit(MD_RECOVERY_FROZEN,
1927 &mddev->recovery))
1928 set_bit(In_sync, &rdev->flags);
1929 }
1930 rdev->raid_disk = role;
1931 break;
1932 }
1933 if (sb->devflags & WriteMostly1)
1934 set_bit(WriteMostly, &rdev->flags);
1935 if (sb->devflags & FailFast1)
1936 set_bit(FailFast, &rdev->flags);
1937 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1938 set_bit(Replacement, &rdev->flags);
1939 } else
1940 set_bit(In_sync, &rdev->flags);
1941
1942 return 0;
1943 }
1944
1945 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1946 {
1947 struct mdp_superblock_1 *sb;
1948 struct md_rdev *rdev2;
1949 int max_dev, i;
1950
1951
1952 sb = page_address(rdev->sb_page);
1953
1954 sb->feature_map = 0;
1955 sb->pad0 = 0;
1956 sb->recovery_offset = cpu_to_le64(0);
1957 memset(sb->pad3, 0, sizeof(sb->pad3));
1958
1959 sb->utime = cpu_to_le64((__u64)mddev->utime);
1960 sb->events = cpu_to_le64(mddev->events);
1961 if (mddev->in_sync)
1962 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1963 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1964 sb->resync_offset = cpu_to_le64(MaxSector);
1965 else
1966 sb->resync_offset = cpu_to_le64(0);
1967
1968 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1969
1970 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1971 sb->size = cpu_to_le64(mddev->dev_sectors);
1972 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1973 sb->level = cpu_to_le32(mddev->level);
1974 sb->layout = cpu_to_le32(mddev->layout);
1975 if (test_bit(FailFast, &rdev->flags))
1976 sb->devflags |= FailFast1;
1977 else
1978 sb->devflags &= ~FailFast1;
1979
1980 if (test_bit(WriteMostly, &rdev->flags))
1981 sb->devflags |= WriteMostly1;
1982 else
1983 sb->devflags &= ~WriteMostly1;
1984 sb->data_offset = cpu_to_le64(rdev->data_offset);
1985 sb->data_size = cpu_to_le64(rdev->sectors);
1986
1987 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1988 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1989 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1990 }
1991
1992 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1993 !test_bit(In_sync, &rdev->flags)) {
1994 sb->feature_map |=
1995 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1996 sb->recovery_offset =
1997 cpu_to_le64(rdev->recovery_offset);
1998 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1999 sb->feature_map |=
2000 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2001 }
2002
2003 if (test_bit(Journal, &rdev->flags))
2004 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2005 if (test_bit(Replacement, &rdev->flags))
2006 sb->feature_map |=
2007 cpu_to_le32(MD_FEATURE_REPLACEMENT);
2008
2009 if (mddev->reshape_position != MaxSector) {
2010 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2011 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2012 sb->new_layout = cpu_to_le32(mddev->new_layout);
2013 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2014 sb->new_level = cpu_to_le32(mddev->new_level);
2015 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2016 if (mddev->delta_disks == 0 &&
2017 mddev->reshape_backwards)
2018 sb->feature_map
2019 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2020 if (rdev->new_data_offset != rdev->data_offset) {
2021 sb->feature_map
2022 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2023 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2024 - rdev->data_offset));
2025 }
2026 }
2027
2028 if (mddev_is_clustered(mddev))
2029 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2030
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks */ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
2035 md_error(mddev, rdev);
2036 else {
2037 struct badblocks *bb = &rdev->badblocks;
2038 __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2039 u64 *p = bb->page;
2040 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2041 if (bb->changed) {
2042 unsigned seq;
2043
2044 retry:
2045 seq = read_seqbegin(&bb->lock);
2046
2047 memset(bbp, 0xff, PAGE_SIZE);
2048
2049 for (i = 0 ; i < bb->count ; i++) {
2050 u64 internal_bb = p[i];
2051 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2052 | BB_LEN(internal_bb));
2053 bbp[i] = cpu_to_le64(store_bb);
2054 }
2055 bb->changed = 0;
2056 if (read_seqretry(&bb->lock, seq))
2057 goto retry;
2058
2059 bb->sector = (rdev->sb_start +
2060 (int)le32_to_cpu(sb->bblog_offset));
2061 bb->size = le16_to_cpu(sb->bblog_size);
2062 }
2063 }
2064
2065 max_dev = 0;
2066 rdev_for_each(rdev2, mddev)
2067 if (rdev2->desc_nr+1 > max_dev)
2068 max_dev = rdev2->desc_nr+1;
2069
2070 if (max_dev > le32_to_cpu(sb->max_dev)) {
2071 int bmask;
2072 sb->max_dev = cpu_to_le32(max_dev);
2073 rdev->sb_size = max_dev * 2 + 256;
2074 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2075 if (rdev->sb_size & bmask)
2076 rdev->sb_size = (rdev->sb_size | bmask) + 1;
2077 } else
2078 max_dev = le32_to_cpu(sb->max_dev);
2079
2080 for (i=0; i<max_dev;i++)
2081 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2082
2083 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2084 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2085
2086 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2087 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2088 sb->feature_map |=
2089 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2090 else
2091 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2092 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2093 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2094 }
2095
2096 rdev_for_each(rdev2, mddev) {
2097 i = rdev2->desc_nr;
2098 if (test_bit(Faulty, &rdev2->flags))
2099 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2100 else if (test_bit(In_sync, &rdev2->flags))
2101 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2102 else if (test_bit(Journal, &rdev2->flags))
2103 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2104 else if (rdev2->raid_disk >= 0)
2105 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2106 else
2107 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2108 }
2109
2110 sb->sb_csum = calc_sb_1_csum(sb);
2111 }
2112
2113 static sector_t super_1_choose_bm_space(sector_t dev_size)
2114 {
2115 sector_t bm_space;

	/* If the device is bigger than 8GiB, save 64K for bitmap usage;
	 * if bigger than 200GiB, save 128K.
	 */
2120 if (dev_size < 64*2)
2121 bm_space = 0;
2122 else if (dev_size - 64*2 >= 200*1024*1024*2)
2123 bm_space = 128*2;
2124 else if (dev_size - 4*2 > 8*1024*1024*2)
2125 bm_space = 64*2;
2126 else
2127 bm_space = 4*2;
2128 return bm_space;
2129 }
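
/*
 * Worked example for the thresholds above (sizes in 512-byte sectors):
 * a device smaller than 64KiB gets no bitmap space, anything up to about
 * 8GiB gets 4KiB, up to about 200GiB gets 64KiB, and larger devices
 * (e.g. a 1TiB member) reserve 128KiB for the internal bitmap.
 */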
2130
2131 static unsigned long long
2132 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2133 {
2134 struct mdp_superblock_1 *sb;
2135 sector_t max_sectors;
2136 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2137 return 0;
2138 if (rdev->data_offset != rdev->new_data_offset)
2139 return 0;
2140 if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2: superblock before data */
2142 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2143 if (!num_sectors || num_sectors > max_sectors)
2144 num_sectors = max_sectors;
2145 } else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
2147 return 0;
2148 } else {
		/* minor version 0: superblock after data */
2150 sector_t sb_start, bm_space;
2151 sector_t dev_size = bdev_nr_sectors(rdev->bdev);

		/* 8K is reserved for the superblock */
2154 sb_start = dev_size - 8*2;
2155 sb_start &= ~(sector_t)(4*2 - 1);
2156
2157 bm_space = super_1_choose_bm_space(dev_size);

		/* The space usable for data must leave room for the
		 * superblock, the bitmap and the 4K bad block log.
		 */
2162 max_sectors = sb_start - bm_space - 4*2;
2163
2164 if (!num_sectors || num_sectors > max_sectors)
2165 num_sectors = max_sectors;
2166 rdev->sb_start = sb_start;
2167 }
2168 sb = page_address(rdev->sb_page);
2169 sb->data_size = cpu_to_le64(num_sectors);
2170 sb->super_offset = cpu_to_le64(rdev->sb_start);
2171 sb->sb_csum = calc_sb_1_csum(sb);
2172 do {
2173 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2174 rdev->sb_page);
2175 } while (md_super_wait(rdev->mddev) < 0);
2176 return num_sectors;
2177
2178 }
2179
2180 static int
2181 super_1_allow_new_offset(struct md_rdev *rdev,
2182 unsigned long long new_offset)
2183 {
	/* All necessary checks on new >= old have already been done */
2185 struct bitmap *bitmap;
2186 if (new_offset >= rdev->data_offset)
2187 return 1;

	/* With 1.0 metadata there is no metadata before the data, so we
	 * can always move the data offset back. */
2191 if (rdev->mddev->minor_version == 0)
2192 return 1;

	/* Otherwise we must be sure not to step on any metadata: check that
	 * the new data offset stays clear of the superblock (plus the space
	 * reserved immediately after it), of any internal bitmap, and of
	 * the bad block list.
	 */
2200 if (rdev->sb_start + (32+4)*2 > new_offset)
2201 return 0;
2202 bitmap = rdev->mddev->bitmap;
2203 if (bitmap && !rdev->mddev->bitmap_info.file &&
2204 rdev->sb_start + rdev->mddev->bitmap_info.offset +
2205 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2206 return 0;
2207 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2208 return 0;
2209
2210 return 1;
2211 }
2212
2213 static struct super_type super_types[] = {
2214 [0] = {
2215 .name = "0.90.0",
2216 .owner = THIS_MODULE,
2217 .load_super = super_90_load,
2218 .validate_super = super_90_validate,
2219 .sync_super = super_90_sync,
2220 .rdev_size_change = super_90_rdev_size_change,
2221 .allow_new_offset = super_90_allow_new_offset,
2222 },
2223 [1] = {
2224 .name = "md-1",
2225 .owner = THIS_MODULE,
2226 .load_super = super_1_load,
2227 .validate_super = super_1_validate,
2228 .sync_super = super_1_sync,
2229 .rdev_size_change = super_1_rdev_size_change,
2230 .allow_new_offset = super_1_allow_new_offset,
2231 },
2232 };
2233
2234 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2235 {
2236 if (mddev->sync_super) {
2237 mddev->sync_super(mddev, rdev);
2238 return;
2239 }
2240
2241 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2242
2243 super_types[mddev->major_version].sync_super(mddev, rdev);
2244 }
2245
2246 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2247 {
2248 struct md_rdev *rdev, *rdev2;
2249
2250 rcu_read_lock();
2251 rdev_for_each_rcu(rdev, mddev1) {
2252 if (test_bit(Faulty, &rdev->flags) ||
2253 test_bit(Journal, &rdev->flags) ||
2254 rdev->raid_disk == -1)
2255 continue;
2256 rdev_for_each_rcu(rdev2, mddev2) {
2257 if (test_bit(Faulty, &rdev2->flags) ||
2258 test_bit(Journal, &rdev2->flags) ||
2259 rdev2->raid_disk == -1)
2260 continue;
2261 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2262 rcu_read_unlock();
2263 return 1;
2264 }
2265 }
2266 }
2267 rcu_read_unlock();
2268 return 0;
2269 }
2270
2271 static LIST_HEAD(pending_raid_disks);
2272
2273 /*
2274  * Try to register a data integrity profile for an mddev.
2275  *
2276  * All active member devices must expose the same integrity profile;
2277  * if they do, it is registered on the array's gendisk and an
2278  * integrity bio pool is allocated for the array.
2279  */
2280 int md_integrity_register(struct mddev *mddev)
2281 {
2282 struct md_rdev *rdev, *reference = NULL;
2283
2284 if (list_empty(&mddev->disks))
2285 return 0;
2286 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2287 return 0;
2288 rdev_for_each(rdev, mddev) {
2289
2290 if (test_bit(Faulty, &rdev->flags))
2291 continue;
2292 if (rdev->raid_disk < 0)
2293 continue;
2294 if (!reference) {
2295
2296 reference = rdev;
2297 continue;
2298 }
2299
2300 if (blk_integrity_compare(reference->bdev->bd_disk,
2301 rdev->bdev->bd_disk) < 0)
2302 return -EINVAL;
2303 }
2304 if (!reference || !bdev_get_integrity(reference->bdev))
2305 return 0;
2306
2307
2308
2309
2310 blk_integrity_register(mddev->gendisk,
2311 bdev_get_integrity(reference->bdev));
2312
2313 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2314 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
2315 (mddev->level != 1 && mddev->level != 10 &&
2316 bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
2317
2318
2319
2320
2321
2322
2323 pr_err("md: failed to create integrity pool for %s\n",
2324 mdname(mddev));
2325 return -EINVAL;
2326 }
2327 return 0;
2328 }
2329 EXPORT_SYMBOL(md_integrity_register);
2330
2331 /*
2332  * Attempt to add an rdev, but only if it is consistent with the
2333  * array's current integrity profile.
2334  */
2335 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2336 {
2337 struct blk_integrity *bi_mddev;
2338
2339 if (!mddev->gendisk)
2340 return 0;
2341
2342 bi_mddev = blk_get_integrity(mddev->gendisk);
2343
2344 if (!bi_mddev)
2345 return 0;
2346
2347 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2348 pr_err("%s: incompatible integrity profile for %pg\n",
2349 mdname(mddev), rdev->bdev);
2350 return -ENXIO;
2351 }
2352
2353 return 0;
2354 }
2355 EXPORT_SYMBOL(md_integrity_add_rdev);
2356
2357 static bool rdev_read_only(struct md_rdev *rdev)
2358 {
2359 return bdev_read_only(rdev->bdev) ||
2360 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2361 }
2362
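/*
 * Attach an rdev to an array: reject duplicates and (for running arrays)
 * read-only or undersized devices, pick a descriptor number, register the
 * "dev-xxx" kobject and its sysfs links, and add the device to
 * mddev->disks under RCU.
 */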
2363 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2364 {
2365 char b[BDEVNAME_SIZE];
2366 int err;
2367
2368
2369 if (find_rdev(mddev, rdev->bdev->bd_dev))
2370 return -EEXIST;
2371
2372 if (rdev_read_only(rdev) && mddev->pers)
2373 return -EROFS;
2374
2375
2376 if (!test_bit(Journal, &rdev->flags) &&
2377 rdev->sectors &&
2378 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2379 if (mddev->pers) {
2380
2381
2382
2383
2384 if (mddev->level > 0)
2385 return -ENOSPC;
2386 } else
2387 mddev->dev_sectors = rdev->sectors;
2388 }
2389
2390
2391
2392
2393
2394 rcu_read_lock();
2395 if (rdev->desc_nr < 0) {
2396 int choice = 0;
2397 if (mddev->pers)
2398 choice = mddev->raid_disks;
2399 while (md_find_rdev_nr_rcu(mddev, choice))
2400 choice++;
2401 rdev->desc_nr = choice;
2402 } else {
2403 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2404 rcu_read_unlock();
2405 return -EBUSY;
2406 }
2407 }
2408 rcu_read_unlock();
2409 if (!test_bit(Journal, &rdev->flags) &&
2410 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2411 pr_warn("md: %s: array is limited to %d devices\n",
2412 mdname(mddev), mddev->max_disks);
2413 return -EBUSY;
2414 }
2415 snprintf(b, sizeof(b), "%pg", rdev->bdev);
2416 strreplace(b, '/', '!');
2417
2418 rdev->mddev = mddev;
2419 pr_debug("md: bind<%s>\n", b);
2420
2421 if (mddev->raid_disks)
2422 mddev_create_serial_pool(mddev, rdev, false);
2423
2424 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2425 goto fail;
2426
2427
2428 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2429 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2430 rdev->sysfs_unack_badblocks =
2431 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2432 rdev->sysfs_badblocks =
2433 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2434
2435 list_add_rcu(&rdev->same_set, &mddev->disks);
2436 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2437
2438
2439 mddev->recovery_disabled++;
2440
2441 return 0;
2442
2443 fail:
2444 pr_warn("md: failed to register dev-%s for %s\n",
2445 b, mdname(mddev));
2446 return err;
2447 }
2448
2449 static void rdev_delayed_delete(struct work_struct *ws)
2450 {
2451 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2452 kobject_del(&rdev->kobj);
2453 kobject_put(&rdev->kobj);
2454 }
2455
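/*
 * Detach an rdev from its array: drop it from the disks list and sysfs,
 * tear down its serial pool and bad-block state, then hand the final
 * kobject removal to a workqueue (rdev_delayed_delete above).
 */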
2456 static void unbind_rdev_from_array(struct md_rdev *rdev)
2457 {
2458 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2459 list_del_rcu(&rdev->same_set);
2460 pr_debug("md: unbind<%pg>\n", rdev->bdev);
2461 mddev_destroy_serial_pool(rdev->mddev, rdev, false);
2462 rdev->mddev = NULL;
2463 sysfs_remove_link(&rdev->kobj, "block");
2464 sysfs_put(rdev->sysfs_state);
2465 sysfs_put(rdev->sysfs_unack_badblocks);
2466 sysfs_put(rdev->sysfs_badblocks);
2467 rdev->sysfs_state = NULL;
2468 rdev->sysfs_unack_badblocks = NULL;
2469 rdev->sysfs_badblocks = NULL;
2470 rdev->badblocks.count = 0;
2471 /* The rdev was removed from the list with list_del_rcu() above, so
2472  * wait for any RCU readers to finish before tearing it down, and
2473  * push the kobject removal out to a workqueue.
2474  */
2475 synchronize_rcu();
2476 INIT_WORK(&rdev->del_work, rdev_delayed_delete);
2477 kobject_get(&rdev->kobj);
2478 queue_work(md_rdev_misc_wq, &rdev->del_work);
2479 }
2480
2481 /*
2482  * Prevent the device from being mounted, repartitioned or otherwise
2483  * reused by a RAID array (or any other kernel subsystem) by opening
2484  * it exclusively with blkdev_get_by_dev(FMODE_EXCL).
2485  */
2486 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2487 {
2488 int err = 0;
2489 struct block_device *bdev;
2490
2491 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2492 shared ? (struct md_rdev *)lock_rdev : rdev);
2493 if (IS_ERR(bdev)) {
2494 pr_warn("md: could not open device unknown-block(%u,%u).\n",
2495 MAJOR(dev), MINOR(dev));
2496 return PTR_ERR(bdev);
2497 }
2498 rdev->bdev = bdev;
2499 return err;
2500 }
2501
2502 static void unlock_rdev(struct md_rdev *rdev)
2503 {
2504 struct block_device *bdev = rdev->bdev;
2505 rdev->bdev = NULL;
2506 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2507 }
2508
2509 void md_autodetect_dev(dev_t dev);
2510
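/*
 * Release an rdev that is no longer part of any array: free its pages,
 * remember it for autodetection when built in, drop the exclusive open
 * on the block device and put the last kobject reference.
 */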
2511 static void export_rdev(struct md_rdev *rdev)
2512 {
2513 pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2514 md_rdev_clear(rdev);
2515 #ifndef MODULE
2516 if (test_bit(AutoDetected, &rdev->flags))
2517 md_autodetect_dev(rdev->bdev->bd_dev);
2518 #endif
2519 unlock_rdev(rdev);
2520 kobject_put(&rdev->kobj);
2521 }
2522
2523 void md_kick_rdev_from_array(struct md_rdev *rdev)
2524 {
2525 unbind_rdev_from_array(rdev);
2526 export_rdev(rdev);
2527 }
2528 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2529
2530 static void export_array(struct mddev *mddev)
2531 {
2532 struct md_rdev *rdev;
2533
2534 while (!list_empty(&mddev->disks)) {
2535 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2536 same_set);
2537 md_kick_rdev_from_array(rdev);
2538 }
2539 mddev->raid_disks = 0;
2540 mddev->major_version = 0;
2541 }
2542
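/*
 * Called with mddev->lock held.  Switch writes_pending to atomic mode so
 * we can reliably check that no writes are in flight; if none are, mark
 * the array in_sync and request a "clean" superblock update.  Returns the
 * resulting in_sync value.
 */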
2543 static bool set_in_sync(struct mddev *mddev)
2544 {
2545 lockdep_assert_held(&mddev->lock);
2546 if (!mddev->in_sync) {
2547 mddev->sync_checkers++;
2548 spin_unlock(&mddev->lock);
2549 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2550 spin_lock(&mddev->lock);
2551 if (!mddev->in_sync &&
2552 percpu_ref_is_zero(&mddev->writes_pending)) {
2553 mddev->in_sync = 1;
2554
2555
2556
2557
2558 smp_mb();
2559 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2560 sysfs_notify_dirent_safe(mddev->sysfs_state);
2561 }
2562 if (--mddev->sync_checkers == 0)
2563 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2564 }
2565 if (mddev->safemode == 1)
2566 mddev->safemode = 0;
2567 return mddev->in_sync;
2568 }
2569
2570 static void sync_sbs(struct mddev *mddev, int nospares)
2571 {
2572 /* Update the in-memory superblock image of each device, but when
2573  * allowed (nospares), skip spares that already carry the right
2574  * event count, or one earlier - i.e. devices that are not being
2575  * marked dirty along with the rest of the array.
2576  */
2577
2578 struct md_rdev *rdev;
2579 rdev_for_each(rdev, mddev) {
2580 if (rdev->sb_events == mddev->events ||
2581 (nospares &&
2582 rdev->raid_disk < 0 &&
2583 rdev->sb_events+1 == mddev->events)) {
2584
2585 rdev->sb_loaded = 2;
2586 } else {
2587 sync_super(mddev, rdev);
2588 rdev->sb_loaded = 1;
2589 }
2590 }
2591 }
2592
2593 static bool does_sb_need_changing(struct mddev *mddev)
2594 {
2595 struct md_rdev *rdev = NULL, *iter;
2596 struct mdp_superblock_1 *sb;
2597 int role;
2598
2599
2600 rdev_for_each(iter, mddev)
2601 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2602 rdev = iter;
2603 break;
2604 }
2605
2606
2607 if (!rdev)
2608 return false;
2609
2610 sb = page_address(rdev->sb_page);
2611
2612 rdev_for_each(rdev, mddev) {
2613 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2614
2615 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2616 !test_bit(Faulty, &rdev->flags))
2617 return true;
2618
2619 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2620 return true;
2621 }
2622
2623
2624 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2625 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2626 (mddev->layout != le32_to_cpu(sb->layout)) ||
2627 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2628 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2629 return true;
2630
2631 return false;
2632 }
2633
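/*
 * Write updated superblocks to all member devices.  Handles the event
 * count (including the optional decrement that lets spare superblocks be
 * left untouched), cluster metadata locking, bad-block log flushing, and
 * repeats until md_super_wait() reports that all writes succeeded.
 */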
2634 void md_update_sb(struct mddev *mddev, int force_change)
2635 {
2636 struct md_rdev *rdev;
2637 int sync_req;
2638 int nospares = 0;
2639 int any_badblocks_changed = 0;
2640 int ret = -1;
2641
2642 if (mddev->ro) {
2643 if (force_change)
2644 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2645 return;
2646 }
2647
2648 repeat:
2649 if (mddev_is_clustered(mddev)) {
2650 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2651 force_change = 1;
2652 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2653 nospares = 1;
2654 ret = md_cluster_ops->metadata_update_start(mddev);
2655
2656 if (!does_sb_need_changing(mddev)) {
2657 if (ret == 0)
2658 md_cluster_ops->metadata_update_cancel(mddev);
2659 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2660 BIT(MD_SB_CHANGE_DEVS) |
2661 BIT(MD_SB_CHANGE_CLEAN));
2662 return;
2663 }
2664 }
2665
2666 /*
2667  * First make sure individual recovery_offsets are correct.
2668  * curr_resync_completed can only be used during recovery;
2669  * during reshape/resync it might use array addresses rather
2670  * than device addresses.
2671  */
2672 rdev_for_each(rdev, mddev) {
2673 if (rdev->raid_disk >= 0 &&
2674 mddev->delta_disks >= 0 &&
2675 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2676 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2677 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2678 !test_bit(Journal, &rdev->flags) &&
2679 !test_bit(In_sync, &rdev->flags) &&
2680 mddev->curr_resync_completed > rdev->recovery_offset)
2681 rdev->recovery_offset = mddev->curr_resync_completed;
2682
2683 }
2684 if (!mddev->persistent) {
2685 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2686 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2687 if (!mddev->external) {
2688 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2689 rdev_for_each(rdev, mddev) {
2690 if (rdev->badblocks.changed) {
2691 rdev->badblocks.changed = 0;
2692 ack_all_badblocks(&rdev->badblocks);
2693 md_error(mddev, rdev);
2694 }
2695 clear_bit(Blocked, &rdev->flags);
2696 clear_bit(BlockedBadBlocks, &rdev->flags);
2697 wake_up(&rdev->blocked_wait);
2698 }
2699 }
2700 wake_up(&mddev->sb_wait);
2701 return;
2702 }
2703
2704 spin_lock(&mddev->lock);
2705
2706 mddev->utime = ktime_get_real_seconds();
2707
2708 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2709 force_change = 1;
2710 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2711 /* Just a clean<->dirty transition: possibly leave the spares
2712  * alone, though if the event count does not end up with the
2713  * right parity we will have to write the spares after all.
2714  */
2715 nospares = 1;
2716 if (force_change)
2717 nospares = 0;
2718 if (mddev->degraded)
2719 /* If the array is degraded, skipping spares is both dangerous
2720  * and pointless.
2721  * Dangerous because a device that was removed from the array
2722  * might still have an event count that looks up to date, so it
2723  * could be re-added without a resync.
2724  * Pointless because any spares will trigger a recovery soon,
2725  * after which the array is no longer degraded anyway.
2726  */
2727
2728 nospares = 0;
2729
2730 sync_req = mddev->in_sync;
2731
2732 /* For a pure clean<->dirty transition on a fully clean array we can
2733  * decrement the event count instead, avoiding superblock writes on spares. */
2734 if (nospares
2735 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2736 && mddev->can_decrease_events
2737 && mddev->events != 1) {
2738 mddev->events--;
2739 mddev->can_decrease_events = 0;
2740 } else {
2741
2742 mddev->events ++;
2743 mddev->can_decrease_events = nospares;
2744 }
2745
2746 /*
2747  * This 64-bit event counter should never wrap: at one event per
2748  * second that is roughly half a trillion years, so seeing zero
2749  * here after an increment indicates a bug.
2750  */
2751 WARN_ON(mddev->events == 0);
2752
2753 rdev_for_each(rdev, mddev) {
2754 if (rdev->badblocks.changed)
2755 any_badblocks_changed++;
2756 if (test_bit(Faulty, &rdev->flags))
2757 set_bit(FaultRecorded, &rdev->flags);
2758 }
2759
2760 sync_sbs(mddev, nospares);
2761 spin_unlock(&mddev->lock);
2762
2763 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2764 mdname(mddev), mddev->in_sync);
2765
2766 if (mddev->queue)
2767 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2768 rewrite:
2769 md_bitmap_update_sb(mddev->bitmap);
2770 rdev_for_each(rdev, mddev) {
2771 if (rdev->sb_loaded != 1)
2772 continue;
2773
2774 if (!test_bit(Faulty, &rdev->flags)) {
2775 md_super_write(mddev,rdev,
2776 rdev->sb_start, rdev->sb_size,
2777 rdev->sb_page);
2778 pr_debug("md: (write) %pg's sb offset: %llu\n",
2779 rdev->bdev,
2780 (unsigned long long)rdev->sb_start);
2781 rdev->sb_events = mddev->events;
2782 if (rdev->badblocks.size) {
2783 md_super_write(mddev, rdev,
2784 rdev->badblocks.sector,
2785 rdev->badblocks.size << 9,
2786 rdev->bb_page);
2787 rdev->badblocks.size = 0;
2788 }
2789
2790 } else
2791 pr_debug("md: %pg (skipping faulty)\n",
2792 rdev->bdev);
2793
2794 if (mddev->level == LEVEL_MULTIPATH)
2795
2796 break;
2797 }
2798 if (md_super_wait(mddev) < 0)
2799 goto rewrite;
2800
2801
2802 if (mddev_is_clustered(mddev) && ret == 0)
2803 md_cluster_ops->metadata_update_finish(mddev);
2804
2805 if (mddev->in_sync != sync_req ||
2806 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2807 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2808
2809 goto repeat;
2810 wake_up(&mddev->sb_wait);
2811 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2812 sysfs_notify_dirent_safe(mddev->sysfs_completed);
2813
2814 rdev_for_each(rdev, mddev) {
2815 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2816 clear_bit(Blocked, &rdev->flags);
2817
2818 if (any_badblocks_changed)
2819 ack_all_badblocks(&rdev->badblocks);
2820 clear_bit(BlockedBadBlocks, &rdev->flags);
2821 wake_up(&rdev->blocked_wait);
2822 }
2823 }
2824 EXPORT_SYMBOL(md_update_sb);
2825
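/*
 * Finish adding an rdev that has just been bound to the array: validate
 * it against the array's superblock, hot-add it through the personality
 * (suspending the array around journal devices), and schedule recovery.
 */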
2826 static int add_bound_rdev(struct md_rdev *rdev)
2827 {
2828 struct mddev *mddev = rdev->mddev;
2829 int err = 0;
2830 bool add_journal = test_bit(Journal, &rdev->flags);
2831
2832 if (!mddev->pers->hot_remove_disk || add_journal) {
2833
2834
2835
2836
2837 super_types[mddev->major_version].
2838 validate_super(mddev, rdev);
2839 if (add_journal)
2840 mddev_suspend(mddev);
2841 err = mddev->pers->hot_add_disk(mddev, rdev);
2842 if (add_journal)
2843 mddev_resume(mddev);
2844 if (err) {
2845 md_kick_rdev_from_array(rdev);
2846 return err;
2847 }
2848 }
2849 sysfs_notify_dirent_safe(rdev->sysfs_state);
2850
2851 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2852 if (mddev->degraded)
2853 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2854 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2855 md_new_event();
2856 md_wakeup_thread(mddev->thread);
2857 return 0;
2858 }
2859
2860
2861
2862
2863 static int cmd_match(const char *cmd, const char *str)
2864 {
2865 /* See if cmd, written into a sysfs file, matches str.
2866  * They must either be the same, or cmd can have a trailing
2867  * newline.
2868  */
2869 while (*cmd && *str && *cmd == *str) {
2870 cmd++;
2871 str++;
2872 }
2873 if (*cmd == '\n')
2874 cmd++;
2875 if (*str || *cmd)
2876 return 0;
2877 return 1;
2878 }
2879
2880 struct rdev_sysfs_entry {
2881 struct attribute attr;
2882 ssize_t (*show)(struct md_rdev *, char *);
2883 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2884 };
2885
2886 static ssize_t
2887 state_show(struct md_rdev *rdev, char *page)
2888 {
2889 char *sep = ",";
2890 size_t len = 0;
2891 unsigned long flags = READ_ONCE(rdev->flags);
2892
2893 if (test_bit(Faulty, &flags) ||
2894 (!test_bit(ExternalBbl, &flags) &&
2895 rdev->badblocks.unacked_exist))
2896 len += sprintf(page+len, "faulty%s", sep);
2897 if (test_bit(In_sync, &flags))
2898 len += sprintf(page+len, "in_sync%s", sep);
2899 if (test_bit(Journal, &flags))
2900 len += sprintf(page+len, "journal%s", sep);
2901 if (test_bit(WriteMostly, &flags))
2902 len += sprintf(page+len, "write_mostly%s", sep);
2903 if (test_bit(Blocked, &flags) ||
2904 (rdev->badblocks.unacked_exist
2905 && !test_bit(Faulty, &flags)))
2906 len += sprintf(page+len, "blocked%s", sep);
2907 if (!test_bit(Faulty, &flags) &&
2908 !test_bit(Journal, &flags) &&
2909 !test_bit(In_sync, &flags))
2910 len += sprintf(page+len, "spare%s", sep);
2911 if (test_bit(WriteErrorSeen, &flags))
2912 len += sprintf(page+len, "write_error%s", sep);
2913 if (test_bit(WantReplacement, &flags))
2914 len += sprintf(page+len, "want_replacement%s", sep);
2915 if (test_bit(Replacement, &flags))
2916 len += sprintf(page+len, "replacement%s", sep);
2917 if (test_bit(ExternalBbl, &flags))
2918 len += sprintf(page+len, "external_bbl%s", sep);
2919 if (test_bit(FailFast, &flags))
2920 len += sprintf(page+len, "failfast%s", sep);
2921
2922 if (len)
2923 len -= strlen(sep);
2924
2925 return len+sprintf(page+len, "\n");
2926 }
2927
2928 static ssize_t
2929 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2930 {
2931 /* Accepted tokens (each may be '\n' terminated):
2932  *  faulty       - simulate an error on this device
2933  *  remove       - disconnect the device from the array
2934  *  writemostly / -writemostly  - set/clear WriteMostly
2935  *  blocked / -blocked          - set/clear Blocked; clearing may report an
2936  *                 error if unacknowledged bad blocks exist
2937  *  insync       - mark in sync (only while the device has no slot)
2938  *  -insync      - clear in sync, so the device gets rebuilt from the bitmap
2939  *  write_error / -write_error  - set/clear WriteErrorSeen
2940  *  failfast / -failfast        - set/clear FailFast
2941  *  want_replacement / -want_replacement - request/cancel a replacement
2942  *  replacement / -replacement  - mark/unmark as a replacement (inactive only)
2943  *  re-add       - re-add a previously failed device using its saved slot
2944  *  external_bbl / -external_bbl - enable/disable externally managed bad blocks
2945  */
2946 struct mddev *mddev = rdev->mddev;
2947 int err = -EINVAL;
2948 bool need_update_sb = false;
2949
2950 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2951 md_error(rdev->mddev, rdev);
2952
2953 if (test_bit(MD_BROKEN, &rdev->mddev->flags))
2954 err = -EBUSY;
2955 else
2956 err = 0;
2957 } else if (cmd_match(buf, "remove")) {
2958 if (rdev->mddev->pers) {
2959 clear_bit(Blocked, &rdev->flags);
2960 remove_and_add_spares(rdev->mddev, rdev);
2961 }
2962 if (rdev->raid_disk >= 0)
2963 err = -EBUSY;
2964 else {
2965 err = 0;
2966 if (mddev_is_clustered(mddev))
2967 err = md_cluster_ops->remove_disk(mddev, rdev);
2968
2969 if (err == 0) {
2970 md_kick_rdev_from_array(rdev);
2971 if (mddev->pers) {
2972 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2973 md_wakeup_thread(mddev->thread);
2974 }
2975 md_new_event();
2976 }
2977 }
2978 } else if (cmd_match(buf, "writemostly")) {
2979 set_bit(WriteMostly, &rdev->flags);
2980 mddev_create_serial_pool(rdev->mddev, rdev, false);
2981 need_update_sb = true;
2982 err = 0;
2983 } else if (cmd_match(buf, "-writemostly")) {
2984 mddev_destroy_serial_pool(rdev->mddev, rdev, false);
2985 clear_bit(WriteMostly, &rdev->flags);
2986 need_update_sb = true;
2987 err = 0;
2988 } else if (cmd_match(buf, "blocked")) {
2989 set_bit(Blocked, &rdev->flags);
2990 err = 0;
2991 } else if (cmd_match(buf, "-blocked")) {
2992 if (!test_bit(Faulty, &rdev->flags) &&
2993 !test_bit(ExternalBbl, &rdev->flags) &&
2994 rdev->badblocks.unacked_exist) {
2995
2996
2997
2998 md_error(rdev->mddev, rdev);
2999 }
3000 clear_bit(Blocked, &rdev->flags);
3001 clear_bit(BlockedBadBlocks, &rdev->flags);
3002 wake_up(&rdev->blocked_wait);
3003 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3004 md_wakeup_thread(rdev->mddev->thread);
3005
3006 err = 0;
3007 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3008 set_bit(In_sync, &rdev->flags);
3009 err = 0;
3010 } else if (cmd_match(buf, "failfast")) {
3011 set_bit(FailFast, &rdev->flags);
3012 need_update_sb = true;
3013 err = 0;
3014 } else if (cmd_match(buf, "-failfast")) {
3015 clear_bit(FailFast, &rdev->flags);
3016 need_update_sb = true;
3017 err = 0;
3018 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3019 !test_bit(Journal, &rdev->flags)) {
3020 if (rdev->mddev->pers == NULL) {
3021 clear_bit(In_sync, &rdev->flags);
3022 rdev->saved_raid_disk = rdev->raid_disk;
3023 rdev->raid_disk = -1;
3024 err = 0;
3025 }
3026 } else if (cmd_match(buf, "write_error")) {
3027 set_bit(WriteErrorSeen, &rdev->flags);
3028 err = 0;
3029 } else if (cmd_match(buf, "-write_error")) {
3030 clear_bit(WriteErrorSeen, &rdev->flags);
3031 err = 0;
3032 } else if (cmd_match(buf, "want_replacement")) {
3033
3034
3035
3036
3037 if (rdev->raid_disk >= 0 &&
3038 !test_bit(Journal, &rdev->flags) &&
3039 !test_bit(Replacement, &rdev->flags))
3040 set_bit(WantReplacement, &rdev->flags);
3041 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3042 md_wakeup_thread(rdev->mddev->thread);
3043 err = 0;
3044 } else if (cmd_match(buf, "-want_replacement")) {
3045
3046
3047
3048 err = 0;
3049 clear_bit(WantReplacement, &rdev->flags);
3050 } else if (cmd_match(buf, "replacement")) {
3051
3052
3053
3054
3055 if (rdev->mddev->pers)
3056 err = -EBUSY;
3057 else {
3058 set_bit(Replacement, &rdev->flags);
3059 err = 0;
3060 }
3061 } else if (cmd_match(buf, "-replacement")) {
3062
3063 if (rdev->mddev->pers)
3064 err = -EBUSY;
3065 else {
3066 clear_bit(Replacement, &rdev->flags);
3067 err = 0;
3068 }
3069 } else if (cmd_match(buf, "re-add")) {
3070 if (!rdev->mddev->pers)
3071 err = -EINVAL;
3072 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3073 rdev->saved_raid_disk >= 0) {
3074
3075
3076
3077
3078
3079
3080 if (!mddev_is_clustered(rdev->mddev) ||
3081 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
3082 clear_bit(Faulty, &rdev->flags);
3083 err = add_bound_rdev(rdev);
3084 }
3085 } else
3086 err = -EBUSY;
3087 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3088 set_bit(ExternalBbl, &rdev->flags);
3089 rdev->badblocks.shift = 0;
3090 err = 0;
3091 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3092 clear_bit(ExternalBbl, &rdev->flags);
3093 err = 0;
3094 }
3095 if (need_update_sb)
3096 md_update_sb(mddev, 1);
3097 if (!err)
3098 sysfs_notify_dirent_safe(rdev->sysfs_state);
3099 return err ? err : len;
3100 }
3101 static struct rdev_sysfs_entry rdev_state =
3102 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3103
3104 static ssize_t
3105 errors_show(struct md_rdev *rdev, char *page)
3106 {
3107 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3108 }
3109
3110 static ssize_t
3111 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3112 {
3113 unsigned int n;
3114 int rv;
3115
3116 rv = kstrtouint(buf, 10, &n);
3117 if (rv < 0)
3118 return rv;
3119 atomic_set(&rdev->corrected_errors, n);
3120 return len;
3121 }
3122 static struct rdev_sysfs_entry rdev_errors =
3123 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3124
3125 static ssize_t
3126 slot_show(struct md_rdev *rdev, char *page)
3127 {
3128 if (test_bit(Journal, &rdev->flags))
3129 return sprintf(page, "journal\n");
3130 else if (rdev->raid_disk < 0)
3131 return sprintf(page, "none\n");
3132 else
3133 return sprintf(page, "%d\n", rdev->raid_disk);
3134 }
3135
3136 static ssize_t
3137 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3138 {
3139 int slot;
3140 int err;
3141
3142 if (test_bit(Journal, &rdev->flags))
3143 return -EBUSY;
3144 if (strncmp(buf, "none", 4)==0)
3145 slot = -1;
3146 else {
3147 err = kstrtouint(buf, 10, (unsigned int *)&slot);
3148 if (err < 0)
3149 return err;
3150 }
3151 if (rdev->mddev->pers && slot == -1) {
3152 /* Setting 'slot' to 'none' on an active array requires also
3153  * updating the 'rd%d' link and telling the personality via
3154  * ->hot_remove_disk().
3155  * For now we only support removing failed or spare devices;
3156  * this normally happens automatically, but not when the
3157  * metadata is managed externally.
3158  */
3159 if (rdev->raid_disk == -1)
3160 return -EEXIST;
3161
3162 if (rdev->mddev->pers->hot_remove_disk == NULL)
3163 return -EINVAL;
3164 clear_bit(Blocked, &rdev->flags);
3165 remove_and_add_spares(rdev->mddev, rdev);
3166 if (rdev->raid_disk >= 0)
3167 return -EBUSY;
3168 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3169 md_wakeup_thread(rdev->mddev->thread);
3170 } else if (rdev->mddev->pers) {
3171 /* Activating a spare: assign the requested slot and ask the
3172  * personality to hot-add the device.
3173  */
3174 int err;
3175
3176 if (rdev->raid_disk != -1)
3177 return -EBUSY;
3178
3179 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3180 return -EBUSY;
3181
3182 if (rdev->mddev->pers->hot_add_disk == NULL)
3183 return -EINVAL;
3184
3185 if (slot >= rdev->mddev->raid_disks &&
3186 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3187 return -ENOSPC;
3188
3189 rdev->raid_disk = slot;
3190 if (test_bit(In_sync, &rdev->flags))
3191 rdev->saved_raid_disk = slot;
3192 else
3193 rdev->saved_raid_disk = -1;
3194 clear_bit(In_sync, &rdev->flags);
3195 clear_bit(Bitmap_sync, &rdev->flags);
3196 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3197 if (err) {
3198 rdev->raid_disk = -1;
3199 return err;
3200 } else
3201 sysfs_notify_dirent_safe(rdev->sysfs_state);
3202 /* failure here is OK */;
3203 sysfs_link_rdev(rdev->mddev, rdev);
3204 /* don't wake anyone up; leave that to userspace */
3205 } else {
3206 if (slot >= rdev->mddev->raid_disks &&
3207 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3208 return -ENOSPC;
3209 rdev->raid_disk = slot;
3210
3211 clear_bit(Faulty, &rdev->flags);
3212 clear_bit(WriteMostly, &rdev->flags);
3213 set_bit(In_sync, &rdev->flags);
3214 sysfs_notify_dirent_safe(rdev->sysfs_state);
3215 }
3216 return len;
3217 }
3218
3219 static struct rdev_sysfs_entry rdev_slot =
3220 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3221
3222 static ssize_t
3223 offset_show(struct md_rdev *rdev, char *page)
3224 {
3225 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3226 }
3227
3228 static ssize_t
3229 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3230 {
3231 unsigned long long offset;
3232 if (kstrtoull(buf, 10, &offset) < 0)
3233 return -EINVAL;
3234 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3235 return -EBUSY;
3236 if (rdev->sectors && rdev->mddev->external)
3237
3238
3239 return -EBUSY;
3240 rdev->data_offset = offset;
3241 rdev->new_data_offset = offset;
3242 return len;
3243 }
3244
3245 static struct rdev_sysfs_entry rdev_offset =
3246 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3247
3248 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3249 {
3250 return sprintf(page, "%llu\n",
3251 (unsigned long long)rdev->new_data_offset);
3252 }
3253
3254 static ssize_t new_offset_store(struct md_rdev *rdev,
3255 const char *buf, size_t len)
3256 {
3257 unsigned long long new_offset;
3258 struct mddev *mddev = rdev->mddev;
3259
3260 if (kstrtoull(buf, 10, &new_offset) < 0)
3261 return -EINVAL;
3262
3263 if (mddev->sync_thread ||
3264 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3265 return -EBUSY;
3266 if (new_offset == rdev->data_offset)
3267
3268 ;
3269 else if (new_offset > rdev->data_offset) {
3270
3271 if (new_offset - rdev->data_offset
3272 + mddev->dev_sectors > rdev->sectors)
3273 return -E2BIG;
3274 }
3275 /* The metadata handler worries about other space details. */
3276
3277 /* Decreasing the offset is inconsistent with a backwards
3278  * reshape.
3279  */
3280 if (new_offset < rdev->data_offset &&
3281 mddev->reshape_backwards)
3282 return -EINVAL;
3283 /* Increasing the offset is inconsistent with a forwards
3284  * reshape; reshape_backwards must be set first.
3285  */
3286
3287 if (new_offset > rdev->data_offset &&
3288 !mddev->reshape_backwards)
3289 return -EINVAL;
3290
3291 if (mddev->pers && mddev->persistent &&
3292 !super_types[mddev->major_version]
3293 .allow_new_offset(rdev, new_offset))
3294 return -E2BIG;
3295 rdev->new_data_offset = new_offset;
3296 if (new_offset > rdev->data_offset)
3297 mddev->reshape_backwards = 1;
3298 else if (new_offset < rdev->data_offset)
3299 mddev->reshape_backwards = 0;
3300
3301 return len;
3302 }
3303 static struct rdev_sysfs_entry rdev_new_offset =
3304 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3305
3306 static ssize_t
3307 rdev_size_show(struct md_rdev *rdev, char *page)
3308 {
3309 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3310 }
3311
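/*
 * Two rdevs overlap when their data areas [data_offset, data_offset +
 * sectors) intersect.  md_rdev_overlaps() scans every known array for
 * another rdev on the same block device whose data range overlaps this
 * one, which would indicate a misconfiguration.
 */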
3312 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3313 {
3314
3315 if (a->data_offset + a->sectors <= b->data_offset)
3316 return false;
3317 if (b->data_offset + b->sectors <= a->data_offset)
3318 return false;
3319 return true;
3320 }
3321
3322 static bool md_rdev_overlaps(struct md_rdev *rdev)
3323 {
3324 struct mddev *mddev;
3325 struct md_rdev *rdev2;
3326
3327 spin_lock(&all_mddevs_lock);
3328 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3329 if (test_bit(MD_DELETED, &mddev->flags))
3330 continue;
3331 rdev_for_each(rdev2, mddev) {
3332 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3333 md_rdevs_overlap(rdev, rdev2)) {
3334 spin_unlock(&all_mddevs_lock);
3335 return true;
3336 }
3337 }
3338 }
3339 spin_unlock(&all_mddevs_lock);
3340 return false;
3341 }
3342
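/*
 * Parse a block (KiB) count from sysfs and convert it to 512-byte
 * sectors, rejecting input with the sign bit set and values whose
 * doubling would overflow sector_t.
 */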
3343 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3344 {
3345 unsigned long long blocks;
3346 sector_t new;
3347
3348 if (kstrtoull(buf, 10, &blocks) < 0)
3349 return -EINVAL;
3350
3351 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3352 return -EINVAL;
3353
3354 new = blocks * 2;
3355 if (new != blocks * 2)
3356 return -EINVAL;
3357
3358 *sectors = new;
3359 return 0;
3360 }
3361
3362 static ssize_t
3363 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3364 {
3365 struct mddev *my_mddev = rdev->mddev;
3366 sector_t oldsectors = rdev->sectors;
3367 sector_t sectors;
3368
3369 if (test_bit(Journal, &rdev->flags))
3370 return -EBUSY;
3371 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3372 return -EINVAL;
3373 if (rdev->data_offset != rdev->new_data_offset)
3374 return -EINVAL;
3375 if (my_mddev->pers && rdev->raid_disk >= 0) {
3376 if (my_mddev->persistent) {
3377 sectors = super_types[my_mddev->major_version].
3378 rdev_size_change(rdev, sectors);
3379 if (!sectors)
3380 return -EBUSY;
3381 } else if (!sectors)
3382 sectors = bdev_nr_sectors(rdev->bdev) -
3383 rdev->data_offset;
3384 if (!my_mddev->pers->resize)
3385
3386 return -EINVAL;
3387 }
3388 if (sectors < my_mddev->dev_sectors)
3389 return -EINVAL;
3390
3391 rdev->sectors = sectors;
3392
3393
3394
3395
3396
3397
3398 if (sectors > oldsectors && my_mddev->external &&
3399 md_rdev_overlaps(rdev)) {
3400
3401
3402
3403
3404
3405 rdev->sectors = oldsectors;
3406 return -EBUSY;
3407 }
3408 return len;
3409 }
3410
3411 static struct rdev_sysfs_entry rdev_size =
3412 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3413
3414 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3415 {
3416 unsigned long long recovery_start = rdev->recovery_offset;
3417
3418 if (test_bit(In_sync, &rdev->flags) ||
3419 recovery_start == MaxSector)
3420 return sprintf(page, "none\n");
3421
3422 return sprintf(page, "%llu\n", recovery_start);
3423 }
3424
3425 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3426 {
3427 unsigned long long recovery_start;
3428
3429 if (cmd_match(buf, "none"))
3430 recovery_start = MaxSector;
3431 else if (kstrtoull(buf, 10, &recovery_start))
3432 return -EINVAL;
3433
3434 if (rdev->mddev->pers &&
3435 rdev->raid_disk >= 0)
3436 return -EBUSY;
3437
3438 rdev->recovery_offset = recovery_start;
3439 if (recovery_start == MaxSector)
3440 set_bit(In_sync, &rdev->flags);
3441 else
3442 clear_bit(In_sync, &rdev->flags);
3443 return len;
3444 }
3445
3446 static struct rdev_sysfs_entry rdev_recovery_start =
3447 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3448
3449 /* sysfs access to the bad-blocks list.
3450  * We present two files.
3451  * 'bad_blocks' lists sector numbers and lengths of ranges that
3452  *    are recorded as bad.  The list is truncated to fit within
3453  *    the one-page limit of sysfs.
3454  *    Writing "sector length" to this file adds an acknowledged
3455  *    bad block.
3456  * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3457  *    been acknowledged.  Writing to this file adds bad blocks
3458  *    without acknowledging them.  This is largely for testing.
3459  */
3460 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3461 {
3462 return badblocks_show(&rdev->badblocks, page, 0);
3463 }
3464 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3465 {
3466 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3467
3468 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3469 wake_up(&rdev->blocked_wait);
3470 return rv;
3471 }
3472 static struct rdev_sysfs_entry rdev_bad_blocks =
3473 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3474
3475 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3476 {
3477 return badblocks_show(&rdev->badblocks, page, 1);
3478 }
3479 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3480 {
3481 return badblocks_store(&rdev->badblocks, page, len, 1);
3482 }
3483 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3484 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3485
3486 static ssize_t
3487 ppl_sector_show(struct md_rdev *rdev, char *page)
3488 {
3489 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3490 }
3491
3492 static ssize_t
3493 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3494 {
3495 unsigned long long sector;
3496
3497 if (kstrtoull(buf, 10, &sector) < 0)
3498 return -EINVAL;
3499 if (sector != (sector_t)sector)
3500 return -EINVAL;
3501
3502 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3503 rdev->raid_disk >= 0)
3504 return -EBUSY;
3505
3506 if (rdev->mddev->persistent) {
3507 if (rdev->mddev->major_version == 0)
3508 return -EINVAL;
3509 if ((sector > rdev->sb_start &&
3510 sector - rdev->sb_start > S16_MAX) ||
3511 (sector < rdev->sb_start &&
3512 rdev->sb_start - sector > -S16_MIN))
3513 return -EINVAL;
3514 rdev->ppl.offset = sector - rdev->sb_start;
3515 } else if (!rdev->mddev->external) {
3516 return -EBUSY;
3517 }
3518 rdev->ppl.sector = sector;
3519 return len;
3520 }
3521
3522 static struct rdev_sysfs_entry rdev_ppl_sector =
3523 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3524
3525 static ssize_t
3526 ppl_size_show(struct md_rdev *rdev, char *page)
3527 {
3528 return sprintf(page, "%u\n", rdev->ppl.size);
3529 }
3530
3531 static ssize_t
3532 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3533 {
3534 unsigned int size;
3535
3536 if (kstrtouint(buf, 10, &size) < 0)
3537 return -EINVAL;
3538
3539 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3540 rdev->raid_disk >= 0)
3541 return -EBUSY;
3542
3543 if (rdev->mddev->persistent) {
3544 if (rdev->mddev->major_version == 0)
3545 return -EINVAL;
3546 if (size > U16_MAX)
3547 return -EINVAL;
3548 } else if (!rdev->mddev->external) {
3549 return -EBUSY;
3550 }
3551 rdev->ppl.size = size;
3552 return len;
3553 }
3554
3555 static struct rdev_sysfs_entry rdev_ppl_size =
3556 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3557
3558 static struct attribute *rdev_default_attrs[] = {
3559 &rdev_state.attr,
3560 &rdev_errors.attr,
3561 &rdev_slot.attr,
3562 &rdev_offset.attr,
3563 &rdev_new_offset.attr,
3564 &rdev_size.attr,
3565 &rdev_recovery_start.attr,
3566 &rdev_bad_blocks.attr,
3567 &rdev_unack_bad_blocks.attr,
3568 &rdev_ppl_sector.attr,
3569 &rdev_ppl_size.attr,
3570 NULL,
3571 };
3572 ATTRIBUTE_GROUPS(rdev_default);
3573 static ssize_t
3574 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3575 {
3576 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3577 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3578
3579 if (!entry->show)
3580 return -EIO;
3581 if (!rdev->mddev)
3582 return -ENODEV;
3583 return entry->show(rdev, page);
3584 }
3585
3586 static ssize_t
3587 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3588 const char *page, size_t length)
3589 {
3590 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3591 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3592 ssize_t rv;
3593 struct mddev *mddev = rdev->mddev;
3594
3595 if (!entry->store)
3596 return -EIO;
3597 if (!capable(CAP_SYS_ADMIN))
3598 return -EACCES;
3599 rv = mddev ? mddev_lock(mddev) : -ENODEV;
3600 if (!rv) {
3601 if (rdev->mddev == NULL)
3602 rv = -ENODEV;
3603 else
3604 rv = entry->store(rdev, page, length);
3605 mddev_unlock(mddev);
3606 }
3607 return rv;
3608 }
3609
3610 static void rdev_free(struct kobject *ko)
3611 {
3612 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3613 kfree(rdev);
3614 }
3615 static const struct sysfs_ops rdev_sysfs_ops = {
3616 .show = rdev_attr_show,
3617 .store = rdev_attr_store,
3618 };
3619 static struct kobj_type rdev_ktype = {
3620 .release = rdev_free,
3621 .sysfs_ops = &rdev_sysfs_ops,
3622 .default_groups = rdev_default_groups,
3623 };
3624
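/*
 * Initialise a freshly allocated md_rdev to a safe, unattached state
 * (no slot, no offsets, zeroed error counters) and allocate its
 * bad-block list.
 */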
3625 int md_rdev_init(struct md_rdev *rdev)
3626 {
3627 rdev->desc_nr = -1;
3628 rdev->saved_raid_disk = -1;
3629 rdev->raid_disk = -1;
3630 rdev->flags = 0;
3631 rdev->data_offset = 0;
3632 rdev->new_data_offset = 0;
3633 rdev->sb_events = 0;
3634 rdev->last_read_error = 0;
3635 rdev->sb_loaded = 0;
3636 rdev->bb_page = NULL;
3637 atomic_set(&rdev->nr_pending, 0);
3638 atomic_set(&rdev->read_errors, 0);
3639 atomic_set(&rdev->corrected_errors, 0);
3640
3641 INIT_LIST_HEAD(&rdev->same_set);
3642 init_waitqueue_head(&rdev->blocked_wait);
3643
3644
3645
3646
3647
3648 return badblocks_init(&rdev->badblocks, 0);
3649 }
3650 EXPORT_SYMBOL_GPL(md_rdev_init);
3651
3652 /*
3653  * Import a device.  If 'super_format' >= 0, then sanity check the
3654  * superblock.
3655  *
3656  * Fail the import if:
3657  *   - the device is nonexistent (zero size), or
3658  *   - the device has no valid superblock.
3659  * On failure an ERR_PTR is returned and nothing is kept.
3660  */
3661 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3662 {
3663 int err;
3664 struct md_rdev *rdev;
3665 sector_t size;
3666
3667 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3668 if (!rdev)
3669 return ERR_PTR(-ENOMEM);
3670
3671 err = md_rdev_init(rdev);
3672 if (err)
3673 goto abort_free;
3674 err = alloc_disk_sb(rdev);
3675 if (err)
3676 goto abort_free;
3677
3678 err = lock_rdev(rdev, newdev, super_format == -2);
3679 if (err)
3680 goto abort_free;
3681
3682 kobject_init(&rdev->kobj, &rdev_ktype);
3683
3684 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3685 if (!size) {
3686 pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3687 rdev->bdev);
3688 err = -EINVAL;
3689 goto abort_free;
3690 }
3691
3692 if (super_format >= 0) {
3693 err = super_types[super_format].
3694 load_super(rdev, NULL, super_minor);
3695 if (err == -EINVAL) {
3696 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3697 rdev->bdev,
3698 super_format, super_minor);
3699 goto abort_free;
3700 }
3701 if (err < 0) {
3702 pr_warn("md: could not read %pg's sb, not importing!\n",
3703 rdev->bdev);
3704 goto abort_free;
3705 }
3706 }
3707
3708 return rdev;
3709
3710 abort_free:
3711 if (rdev->bdev)
3712 unlock_rdev(rdev);
3713 md_rdev_clear(rdev);
3714 kfree(rdev);
3715 return ERR_PTR(err);
3716 }
3717
3718 /*
3719  * Check a full RAID array for plausibility: load and validate the
3720  * superblock on every member, kicking out stale or invalid devices.
3721  */
3722 static int analyze_sbs(struct mddev *mddev)
3723 {
3724 int i;
3725 struct md_rdev *rdev, *freshest, *tmp;
3726
3727 freshest = NULL;
3728 rdev_for_each_safe(rdev, tmp, mddev)
3729 switch (super_types[mddev->major_version].
3730 load_super(rdev, freshest, mddev->minor_version)) {
3731 case 1:
3732 freshest = rdev;
3733 break;
3734 case 0:
3735 break;
3736 default:
3737 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3738 rdev->bdev);
3739 md_kick_rdev_from_array(rdev);
3740 }
3741
3742
3743 if (!freshest) {
3744 pr_warn("md: cannot find a valid disk\n");
3745 return -EINVAL;
3746 }
3747
3748 super_types[mddev->major_version].
3749 validate_super(mddev, freshest);
3750
3751 i = 0;
3752 rdev_for_each_safe(rdev, tmp, mddev) {
3753 if (mddev->max_disks &&
3754 (rdev->desc_nr >= mddev->max_disks ||
3755 i > mddev->max_disks)) {
3756 pr_warn("md: %s: %pg: only %d devices permitted\n",
3757 mdname(mddev), rdev->bdev,
3758 mddev->max_disks);
3759 md_kick_rdev_from_array(rdev);
3760 continue;
3761 }
3762 if (rdev != freshest) {
3763 if (super_types[mddev->major_version].
3764 validate_super(mddev, rdev)) {
3765 pr_warn("md: kicking non-fresh %pg from array!\n",
3766 rdev->bdev);
3767 md_kick_rdev_from_array(rdev);
3768 continue;
3769 }
3770 }
3771 if (mddev->level == LEVEL_MULTIPATH) {
3772 rdev->desc_nr = i++;
3773 rdev->raid_disk = rdev->desc_nr;
3774 set_bit(In_sync, &rdev->flags);
3775 } else if (rdev->raid_disk >=
3776 (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3777 !test_bit(Journal, &rdev->flags)) {
3778 rdev->raid_disk = -1;
3779 clear_bit(In_sync, &rdev->flags);
3780 }
3781 }
3782
3783 return 0;
3784 }
3785
3786 /* Read a fixed-point number.
3787  * Numbers in sysfs attributes should be in "standard" units where
3788  * possible, so time should be in seconds.
3789  * However we internally use a much smaller unit such as
3790  * milliseconds or jiffies.
3791  * This function takes a decimal number with a possible fractional
3792  * component, and produces an integer which is the result of
3793  * multiplying that number by 10^'scale'.
3794  * All without any floating-point arithmetic.
3795  */
3796 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3797 {
3798 unsigned long result = 0;
3799 long decimals = -1;
3800 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3801 if (*cp == '.')
3802 decimals = 0;
3803 else if (decimals < scale) {
3804 unsigned int value;
3805 value = *cp - '0';
3806 result = result * 10 + value;
3807 if (decimals >= 0)
3808 decimals++;
3809 }
3810 cp++;
3811 }
3812 if (*cp == '\n')
3813 cp++;
3814 if (*cp)
3815 return -EINVAL;
3816 if (decimals < 0)
3817 decimals = 0;
3818 *res = result * int_pow(10, scale - decimals);
3819 return 0;
3820 }
3821
3822 static ssize_t
3823 safe_delay_show(struct mddev *mddev, char *page)
3824 {
3825 int msec = (mddev->safemode_delay*1000)/HZ;
3826 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3827 }
3828 static ssize_t
3829 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3830 {
3831 unsigned long msec;
3832
3833 if (mddev_is_clustered(mddev)) {
3834 pr_warn("md: Safemode is disabled for clustered mode\n");
3835 return -EINVAL;
3836 }
3837
3838 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3839 return -EINVAL;
3840 if (msec == 0)
3841 mddev->safemode_delay = 0;
3842 else {
3843 unsigned long old_delay = mddev->safemode_delay;
3844 unsigned long new_delay = (msec*HZ)/1000;
3845
3846 if (new_delay == 0)
3847 new_delay = 1;
3848 mddev->safemode_delay = new_delay;
3849 if (new_delay < old_delay || old_delay == 0)
3850 mod_timer(&mddev->safemode_timer, jiffies+1);
3851 }
3852 return len;
3853 }
3854 static struct md_sysfs_entry md_safe_delay =
3855 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3856
3857 static ssize_t
3858 level_show(struct mddev *mddev, char *page)
3859 {
3860 struct md_personality *p;
3861 int ret;
3862 spin_lock(&mddev->lock);
3863 p = mddev->pers;
3864 if (p)
3865 ret = sprintf(page, "%s\n", p->name);
3866 else if (mddev->clevel[0])
3867 ret = sprintf(page, "%s\n", mddev->clevel);
3868 else if (mddev->level != LEVEL_NONE)
3869 ret = sprintf(page, "%d\n", mddev->level);
3870 else
3871 ret = 0;
3872 spin_unlock(&mddev->lock);
3873 return ret;
3874 }
3875
3876 static ssize_t
3877 level_store(struct mddev *mddev, const char *buf, size_t len)
3878 {
3879 char clevel[16];
3880 ssize_t rv;
3881 size_t slen = len;
3882 struct md_personality *pers, *oldpers;
3883 long level;
3884 void *priv, *oldpriv;
3885 struct md_rdev *rdev;
3886
3887 if (slen == 0 || slen >= sizeof(clevel))
3888 return -EINVAL;
3889
3890 rv = mddev_lock(mddev);
3891 if (rv)
3892 return rv;
3893
3894 if (mddev->pers == NULL) {
3895 strncpy(mddev->clevel, buf, slen);
3896 if (mddev->clevel[slen-1] == '\n')
3897 slen--;
3898 mddev->clevel[slen] = 0;
3899 mddev->level = LEVEL_NONE;
3900 rv = len;
3901 goto out_unlock;
3902 }
3903 rv = -EROFS;
3904 if (mddev->ro)
3905 goto out_unlock;
3906
3907 /* Request to change the personality.  Need to ensure:
3908  *  - the array is not engaged in resync/recovery/reshape,
3909  *  - the old personality can be suspended, and
3910  *  - sysfs is not actively poking at the array.
3911  */
3912
3913 rv = -EBUSY;
3914 if (mddev->sync_thread ||
3915 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3916 mddev->reshape_position != MaxSector ||
3917 mddev->sysfs_active)
3918 goto out_unlock;
3919
3920 rv = -EINVAL;
3921 if (!mddev->pers->quiesce) {
3922 pr_warn("md: %s: %s does not support online personality change\n",
3923 mdname(mddev), mddev->pers->name);
3924 goto out_unlock;
3925 }
3926
3927
3928 strncpy(clevel, buf, slen);
3929 if (clevel[slen-1] == '\n')
3930 slen--;
3931 clevel[slen] = 0;
3932 if (kstrtol(clevel, 10, &level))
3933 level = LEVEL_NONE;
3934
3935 if (request_module("md-%s", clevel) != 0)
3936 request_module("md-level-%s", clevel);
3937 spin_lock(&pers_lock);
3938 pers = find_pers(level, clevel);
3939 if (!pers || !try_module_get(pers->owner)) {
3940 spin_unlock(&pers_lock);
3941 pr_warn("md: personality %s not loaded\n", clevel);
3942 rv = -EINVAL;
3943 goto out_unlock;
3944 }
3945 spin_unlock(&pers_lock);
3946
3947 if (pers == mddev->pers) {
3948
3949 module_put(pers->owner);
3950 rv = len;
3951 goto out_unlock;
3952 }
3953 if (!pers->takeover) {
3954 module_put(pers->owner);
3955 pr_warn("md: %s: %s does not support personality takeover\n",
3956 mdname(mddev), clevel);
3957 rv = -EINVAL;
3958 goto out_unlock;
3959 }
3960
3961 rdev_for_each(rdev, mddev)
3962 rdev->new_raid_disk = rdev->raid_disk;
3963
3964
3965
3966
3967 priv = pers->takeover(mddev);
3968 if (IS_ERR(priv)) {
3969 mddev->new_level = mddev->level;
3970 mddev->new_layout = mddev->layout;
3971 mddev->new_chunk_sectors = mddev->chunk_sectors;
3972 mddev->raid_disks -= mddev->delta_disks;
3973 mddev->delta_disks = 0;
3974 mddev->reshape_backwards = 0;
3975 module_put(pers->owner);
3976 pr_warn("md: %s: %s would not accept array\n",
3977 mdname(mddev), clevel);
3978 rv = PTR_ERR(priv);
3979 goto out_unlock;
3980 }
3981
3982
3983 mddev_suspend(mddev);
3984 mddev_detach(mddev);
3985
3986 spin_lock(&mddev->lock);
3987 oldpers = mddev->pers;
3988 oldpriv = mddev->private;
3989 mddev->pers = pers;
3990 mddev->private = priv;
3991 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3992 mddev->level = mddev->new_level;
3993 mddev->layout = mddev->new_layout;
3994 mddev->chunk_sectors = mddev->new_chunk_sectors;
3995 mddev->delta_disks = 0;
3996 mddev->reshape_backwards = 0;
3997 mddev->degraded = 0;
3998 spin_unlock(&mddev->lock);
3999
4000 if (oldpers->sync_request == NULL &&
4001 mddev->external) {
4002
4003
4004
4005
4006
4007
4008
4009 mddev->in_sync = 0;
4010 mddev->safemode_delay = 0;
4011 mddev->safemode = 0;
4012 }
4013
4014 oldpers->free(mddev, oldpriv);
4015
4016 if (oldpers->sync_request == NULL &&
4017 pers->sync_request != NULL) {
4018
4019 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4020 pr_warn("md: cannot register extra attributes for %s\n",
4021 mdname(mddev));
4022 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4023 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4024 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4025 }
4026 if (oldpers->sync_request != NULL &&
4027 pers->sync_request == NULL) {
4028
4029 if (mddev->to_remove == NULL)
4030 mddev->to_remove = &md_redundancy_group;
4031 }
4032
4033 module_put(oldpers->owner);
4034
4035 rdev_for_each(rdev, mddev) {
4036 if (rdev->raid_disk < 0)
4037 continue;
4038 if (rdev->new_raid_disk >= mddev->raid_disks)
4039 rdev->new_raid_disk = -1;
4040 if (rdev->new_raid_disk == rdev->raid_disk)
4041 continue;
4042 sysfs_unlink_rdev(mddev, rdev);
4043 }
4044 rdev_for_each(rdev, mddev) {
4045 if (rdev->raid_disk < 0)
4046 continue;
4047 if (rdev->new_raid_disk == rdev->raid_disk)
4048 continue;
4049 rdev->raid_disk = rdev->new_raid_disk;
4050 if (rdev->raid_disk < 0)
4051 clear_bit(In_sync, &rdev->flags);
4052 else {
4053 if (sysfs_link_rdev(mddev, rdev))
4054 pr_warn("md: cannot register rd%d for %s after level change\n",
4055 rdev->raid_disk, mdname(mddev));
4056 }
4057 }
4058
4059 if (pers->sync_request == NULL) {
4060
4061
4062
4063 mddev->in_sync = 1;
4064 del_timer_sync(&mddev->safemode_timer);
4065 }
4066 blk_set_stacking_limits(&mddev->queue->limits);
4067 pers->run(mddev);
4068 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4069 mddev_resume(mddev);
4070 if (!mddev->thread)
4071 md_update_sb(mddev, 1);
4072 sysfs_notify_dirent_safe(mddev->sysfs_level);
4073 md_new_event();
4074 rv = len;
4075 out_unlock:
4076 mddev_unlock(mddev);
4077 return rv;
4078 }
4079
4080 static struct md_sysfs_entry md_level =
4081 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4082
4083 static ssize_t
4084 layout_show(struct mddev *mddev, char *page)
4085 {
4086
4087 if (mddev->reshape_position != MaxSector &&
4088 mddev->layout != mddev->new_layout)
4089 return sprintf(page, "%d (%d)\n",
4090 mddev->new_layout, mddev->layout);
4091 return sprintf(page, "%d\n", mddev->layout);
4092 }
4093
4094 static ssize_t
4095 layout_store(struct mddev *mddev, const char *buf, size_t len)
4096 {
4097 unsigned int n;
4098 int err;
4099
4100 err = kstrtouint(buf, 10, &n);
4101 if (err < 0)
4102 return err;
4103 err = mddev_lock(mddev);
4104 if (err)
4105 return err;
4106
4107 if (mddev->pers) {
4108 if (mddev->pers->check_reshape == NULL)
4109 err = -EBUSY;
4110 else if (mddev->ro)
4111 err = -EROFS;
4112 else {
4113 mddev->new_layout = n;
4114 err = mddev->pers->check_reshape(mddev);
4115 if (err)
4116 mddev->new_layout = mddev->layout;
4117 }
4118 } else {
4119 mddev->new_layout = n;
4120 if (mddev->reshape_position == MaxSector)
4121 mddev->layout = n;
4122 }
4123 mddev_unlock(mddev);
4124 return err ?: len;
4125 }
4126 static struct md_sysfs_entry md_layout =
4127 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4128
4129 static ssize_t
4130 raid_disks_show(struct mddev *mddev, char *page)
4131 {
4132 if (mddev->raid_disks == 0)
4133 return 0;
4134 if (mddev->reshape_position != MaxSector &&
4135 mddev->delta_disks != 0)
4136 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4137 mddev->raid_disks - mddev->delta_disks);
4138 return sprintf(page, "%d\n", mddev->raid_disks);
4139 }
4140
4141 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4142
4143 static ssize_t
4144 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4145 {
4146 unsigned int n;
4147 int err;
4148
4149 err = kstrtouint(buf, 10, &n);
4150 if (err < 0)
4151 return err;
4152
4153 err = mddev_lock(mddev);
4154 if (err)
4155 return err;
4156 if (mddev->pers)
4157 err = update_raid_disks(mddev, n);
4158 else if (mddev->reshape_position != MaxSector) {
4159 struct md_rdev *rdev;
4160 int olddisks = mddev->raid_disks - mddev->delta_disks;
4161
4162 err = -EINVAL;
4163 rdev_for_each(rdev, mddev) {
4164 if (olddisks < n &&
4165 rdev->data_offset < rdev->new_data_offset)
4166 goto out_unlock;
4167 if (olddisks > n &&
4168 rdev->data_offset > rdev->new_data_offset)
4169 goto out_unlock;
4170 }
4171 err = 0;
4172 mddev->delta_disks = n - olddisks;
4173 mddev->raid_disks = n;
4174 mddev->reshape_backwards = (mddev->delta_disks < 0);
4175 } else
4176 mddev->raid_disks = n;
4177 out_unlock:
4178 mddev_unlock(mddev);
4179 return err ? err : len;
4180 }
4181 static struct md_sysfs_entry md_raid_disks =
4182 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4183
4184 static ssize_t
4185 uuid_show(struct mddev *mddev, char *page)
4186 {
4187 return sprintf(page, "%pU\n", mddev->uuid);
4188 }
4189 static struct md_sysfs_entry md_uuid =
4190 __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4191
4192 static ssize_t
4193 chunk_size_show(struct mddev *mddev, char *page)
4194 {
4195 if (mddev->reshape_position != MaxSector &&
4196 mddev->chunk_sectors != mddev->new_chunk_sectors)
4197 return sprintf(page, "%d (%d)\n",
4198 mddev->new_chunk_sectors << 9,
4199 mddev->chunk_sectors << 9);
4200 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4201 }
4202
4203 static ssize_t
4204 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4205 {
4206 unsigned long n;
4207 int err;
4208
4209 err = kstrtoul(buf, 10, &n);
4210 if (err < 0)
4211 return err;
4212
4213 err = mddev_lock(mddev);
4214 if (err)
4215 return err;
4216 if (mddev->pers) {
4217 if (mddev->pers->check_reshape == NULL)
4218 err = -EBUSY;
4219 else if (mddev->ro)
4220 err = -EROFS;
4221 else {
4222 mddev->new_chunk_sectors = n >> 9;
4223 err = mddev->pers->check_reshape(mddev);
4224 if (err)
4225 mddev->new_chunk_sectors = mddev->chunk_sectors;
4226 }
4227 } else {
4228 mddev->new_chunk_sectors = n >> 9;
4229 if (mddev->reshape_position == MaxSector)
4230 mddev->chunk_sectors = n >> 9;
4231 }
4232 mddev_unlock(mddev);
4233 return err ?: len;
4234 }
4235 static struct md_sysfs_entry md_chunk_size =
4236 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4237
4238 static ssize_t
4239 resync_start_show(struct mddev *mddev, char *page)
4240 {
4241 if (mddev->recovery_cp == MaxSector)
4242 return sprintf(page, "none\n");
4243 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4244 }
4245
4246 static ssize_t
4247 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4248 {
4249 unsigned long long n;
4250 int err;
4251
4252 if (cmd_match(buf, "none"))
4253 n = MaxSector;
4254 else {
4255 err = kstrtoull(buf, 10, &n);
4256 if (err < 0)
4257 return err;
4258 if (n != (sector_t)n)
4259 return -EINVAL;
4260 }
4261
4262 err = mddev_lock(mddev);
4263 if (err)
4264 return err;
4265 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4266 err = -EBUSY;
4267
4268 if (!err) {
4269 mddev->recovery_cp = n;
4270 if (mddev->pers)
4271 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4272 }
4273 mddev_unlock(mddev);
4274 return err ?: len;
4275 }
4276 static struct md_sysfs_entry md_resync_start =
4277 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4278 resync_start_show, resync_start_store);
4279
4280
4281 /*
4282  * The array state can be:
4283  *
4284  *  clear
4285  *     No devices, no size, no level.
4286  *     Equivalent to the STOP_ARRAY ioctl.
4287  *  inactive
4288  *     May have some settings and devices, but no active array.
4289  *  suspended (not supported yet)
4290  *     All IO requests will block.  The array can be reconfigured.
4291  *     Writing this, if accepted, will block until the array is quiescent.
4292  *  readonly
4293  *     No resync can happen.  No superblocks get written.
4294  *     Write requests fail.
4295  *  read-auto
4296  *     like readonly, but behaves like 'clean' on a write request.
4297  *
4298  *  clean - no pending writes, but otherwise active.
4299  *     When written to an inactive array, starts without resync.
4300  *     If a write request arrives then
4301  *       if metadata is known, mark 'dirty' and switch to 'active',
4302  *       if not known, block and switch to write-pending.
4303  *     If written to an active array that has pending writes, it fails.
4304  *  active
4305  *     fully active: IO and resync can be happening.
4306  *     When written to an inactive array, starts with resync.
4307  *
4308  *  write-pending
4309  *     clean, but writes are blocked waiting for 'active' to be written.
4310  *
4311  *  active-idle
4312  *     like active, but no writes have been seen for a while
4313  *     (safe_mode_delay), so the array is about to be marked clean.
4314  *  broken
4315  *     Array is failed.  It is useful because mounted arrays are not
4316  *     stopped when the array fails, so this state at least alerts the
4317  *     user that something is wrong.
4318  */
4319
4320 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4321 write_pending, active_idle, broken, bad_word};
4322 static char *array_states[] = {
4323 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4324 "write-pending", "active-idle", "broken", NULL };
4325
4326 static int match_word(const char *word, char **list)
4327 {
4328 int n;
4329 for (n=0; list[n]; n++)
4330 if (cmd_match(word, list[n]))
4331 break;
4332 return n;
4333 }
4334
4335 static ssize_t
4336 array_state_show(struct mddev *mddev, char *page)
4337 {
4338 enum array_state st = inactive;
4339
4340 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4341 switch(mddev->ro) {
4342 case 1:
4343 st = readonly;
4344 break;
4345 case 2:
4346 st = read_auto;
4347 break;
4348 case 0:
4349 spin_lock(&mddev->lock);
4350 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4351 st = write_pending;
4352 else if (mddev->in_sync)
4353 st = clean;
4354 else if (mddev->safemode)
4355 st = active_idle;
4356 else
4357 st = active;
4358 spin_unlock(&mddev->lock);
4359 }
4360
4361 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4362 st = broken;
4363 } else {
4364 if (list_empty(&mddev->disks) &&
4365 mddev->raid_disks == 0 &&
4366 mddev->dev_sectors == 0)
4367 st = clear;
4368 else
4369 st = inactive;
4370 }
4371 return sprintf(page, "%s\n", array_states[st]);
4372 }
4373
4374 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4375 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4376 static int restart_array(struct mddev *mddev);
4377
4378 static ssize_t
4379 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4380 {
4381 int err = 0;
4382 enum array_state st = match_word(buf, array_states);
4383
4384 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4385
4386
4387
4388 spin_lock(&mddev->lock);
4389 if (st == active) {
4390 restart_array(mddev);
4391 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4392 md_wakeup_thread(mddev->thread);
4393 wake_up(&mddev->sb_wait);
4394 } else {
4395 restart_array(mddev);
4396 if (!set_in_sync(mddev))
4397 err = -EBUSY;
4398 }
4399 if (!err)
4400 sysfs_notify_dirent_safe(mddev->sysfs_state);
4401 spin_unlock(&mddev->lock);
4402 return err ?: len;
4403 }
4404 err = mddev_lock(mddev);
4405 if (err)
4406 return err;
4407 err = -EINVAL;
4408 switch(st) {
4409 case bad_word:
4410 break;
4411 case clear:
4412
4413 err = do_md_stop(mddev, 0, NULL);
4414 break;
4415 case inactive:
4416
4417 if (mddev->pers)
4418 err = do_md_stop(mddev, 2, NULL);
4419 else
4420 err = 0;
4421 break;
4422 case suspended:
4423 break;
4424 case readonly:
4425 if (mddev->pers)
4426 err = md_set_readonly(mddev, NULL);
4427 else {
4428 mddev->ro = 1;
4429 set_disk_ro(mddev->gendisk, 1);
4430 err = do_md_run(mddev);
4431 }
4432 break;
4433 case read_auto:
4434 if (mddev->pers) {
4435 if (mddev->ro == 0)
4436 err = md_set_readonly(mddev, NULL);
4437 else if (mddev->ro == 1)
4438 err = restart_array(mddev);
4439 if (err == 0) {
4440 mddev->ro = 2;
4441 set_disk_ro(mddev->gendisk, 0);
4442 }
4443 } else {
4444 mddev->ro = 2;
4445 err = do_md_run(mddev);
4446 }
4447 break;
4448 case clean:
4449 if (mddev->pers) {
4450 err = restart_array(mddev);
4451 if (err)
4452 break;
4453 spin_lock(&mddev->lock);
4454 if (!set_in_sync(mddev))
4455 err = -EBUSY;
4456 spin_unlock(&mddev->lock);
4457 } else
4458 err = -EINVAL;
4459 break;
4460 case active:
4461 if (mddev->pers) {
4462 err = restart_array(mddev);
4463 if (err)
4464 break;
4465 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4466 wake_up(&mddev->sb_wait);
4467 err = 0;
4468 } else {
4469 mddev->ro = 0;
4470 set_disk_ro(mddev->gendisk, 0);
4471 err = do_md_run(mddev);
4472 }
4473 break;
4474 case write_pending:
4475 case active_idle:
4476 case broken:
4477
4478 break;
4479 }
4480
4481 if (!err) {
4482 if (mddev->hold_active == UNTIL_IOCTL)
4483 mddev->hold_active = 0;
4484 sysfs_notify_dirent_safe(mddev->sysfs_state);
4485 }
4486 mddev_unlock(mddev);
4487 return err ?: len;
4488 }
4489 static struct md_sysfs_entry md_array_state =
4490 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4491
4492 static ssize_t
4493 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4494 return sprintf(page, "%d\n",
4495 atomic_read(&mddev->max_corr_read_errors));
4496 }
4497
4498 static ssize_t
4499 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4500 {
4501 unsigned int n;
4502 int rv;
4503
4504 rv = kstrtouint(buf, 10, &n);
4505 if (rv < 0)
4506 return rv;
4507 atomic_set(&mddev->max_corr_read_errors, n);
4508 return len;
4509 }
4510
4511 static struct md_sysfs_entry max_corr_read_errors =
4512 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4513 max_corrected_read_errors_store);
4514
4515 static ssize_t
4516 null_show(struct mddev *mddev, char *page)
4517 {
4518 return -EINVAL;
4519 }
4520
4521
4522 static void flush_rdev_wq(struct mddev *mddev)
4523 {
4524 struct md_rdev *rdev;
4525
4526 rcu_read_lock();
4527 rdev_for_each_rcu(rdev, mddev)
4528 if (work_pending(&rdev->del_work)) {
4529 flush_workqueue(md_rdev_misc_wq);
4530 break;
4531 }
4532 rcu_read_unlock();
4533 }
4534
4535 static ssize_t
4536 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4537 {
4538
4539
4540
4541
4542
4543
4544
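/* buf must be "<major>:<minor>", naming the device to add.
 * For arrays with persistent superblocks the new device's superblock is
 * loaded and checked against an existing member (if any); otherwise the
 * only validation is the size checking done in bind_rdev_to_array().
 */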
4545 char *e;
4546 int major = simple_strtoul(buf, &e, 10);
4547 int minor;
4548 dev_t dev;
4549 struct md_rdev *rdev;
4550 int err;
4551
4552 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4553 return -EINVAL;
4554 minor = simple_strtoul(e+1, &e, 10);
4555 if (*e && *e != '\n')
4556 return -EINVAL;
4557 dev = MKDEV(major, minor);
4558 if (major != MAJOR(dev) ||
4559 minor != MINOR(dev))
4560 return -EOVERFLOW;
4561
4562 flush_rdev_wq(mddev);
4563 err = mddev_lock(mddev);
4564 if (err)
4565 return err;
4566 if (mddev->persistent) {
4567 rdev = md_import_device(dev, mddev->major_version,
4568 mddev->minor_version);
4569 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4570 struct md_rdev *rdev0
4571 = list_entry(mddev->disks.next,
4572 struct md_rdev, same_set);
4573 err = super_types[mddev->major_version]
4574 .load_super(rdev, rdev0, mddev->minor_version);
4575 if (err < 0)
4576 goto out;
4577 }
4578 } else if (mddev->external)
4579 rdev = md_import_device(dev, -2, -1);
4580 else
4581 rdev = md_import_device(dev, -1, -1);
4582
4583 if (IS_ERR(rdev)) {
4584 mddev_unlock(mddev);
4585 return PTR_ERR(rdev);
4586 }
4587 err = bind_rdev_to_array(rdev, mddev);
4588 out:
4589 if (err)
4590 export_rdev(rdev);
4591 mddev_unlock(mddev);
4592 if (!err)
4593 md_new_event();
4594 return err ? err : len;
4595 }
4596
4597 static struct md_sysfs_entry md_new_device =
4598 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4599
4600 static ssize_t
4601 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4602 {
4603 char *end;
4604 unsigned long chunk, end_chunk;
4605 int err;
4606
4607 err = mddev_lock(mddev);
4608 if (err)
4609 return err;
4610 if (!mddev->bitmap)
4611 goto out;
4612
4613 while (*buf) {
4614 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4615 if (buf == end) break;
4616 if (*end == '-') {
4617 buf = end + 1;
4618 end_chunk = simple_strtoul(buf, &end, 0);
4619 if (buf == end) break;
4620 }
4621 if (*end && !isspace(*end)) break;
4622 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4623 buf = skip_spaces(end);
4624 }
4625 md_bitmap_unplug(mddev->bitmap);
4626 out:
4627 mddev_unlock(mddev);
4628 return len;
4629 }
4630
4631 static struct md_sysfs_entry md_bitmap =
4632 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4633
4634 static ssize_t
4635 size_show(struct mddev *mddev, char *page)
4636 {
4637 return sprintf(page, "%llu\n",
4638 (unsigned long long)mddev->dev_sectors / 2);
4639 }
4640
4641 static int update_size(struct mddev *mddev, sector_t num_sectors);
4642
4643 static ssize_t
4644 size_store(struct mddev *mddev, const char *buf, size_t len)
4645 {
4646
4647
4648
4649
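/* If the array is active, attempt an on-line resize through the
 * personality.  If it is inactive, the recorded component size may be
 * reduced (or set from zero) but not increased.
 */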
4650 sector_t sectors;
4651 int err = strict_blocks_to_sectors(buf, &sectors);
4652
4653 if (err < 0)
4654 return err;
4655 err = mddev_lock(mddev);
4656 if (err)
4657 return err;
4658 if (mddev->pers) {
4659 err = update_size(mddev, sectors);
4660 if (err == 0)
4661 md_update_sb(mddev, 1);
4662 } else {
4663 if (mddev->dev_sectors == 0 ||
4664 mddev->dev_sectors > sectors)
4665 mddev->dev_sectors = sectors;
4666 else
4667 err = -ENOSPC;
4668 }
4669 mddev_unlock(mddev);
4670 return err ? err : len;
4671 }
4672
4673 static struct md_sysfs_entry md_size =
4674 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4675
4676
4677
4678
4679
4680
4681
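/* Metadata version:
 * This is either "none" (arrays with no on-disk metadata),
 * "external:<type>" for externally managed metadata, or "N.M" for one
 * of the internally known superblock formats.
 */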
4682 static ssize_t
4683 metadata_show(struct mddev *mddev, char *page)
4684 {
4685 if (mddev->persistent)
4686 return sprintf(page, "%d.%d\n",
4687 mddev->major_version, mddev->minor_version);
4688 else if (mddev->external)
4689 return sprintf(page, "external:%s\n", mddev->metadata_type);
4690 else
4691 return sprintf(page, "none\n");
4692 }
4693
4694 static ssize_t
4695 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4696 {
4697 int major, minor;
4698 char *e;
4699 int err;
4700
4701
4702
4703
4704
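/* The details of "external" metadata may always be refined; any other
 * change is only permitted while the array has no member devices.
 */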
4705 err = mddev_lock(mddev);
4706 if (err)
4707 return err;
4708 err = -EBUSY;
4709 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4710 ;
4711 else if (!list_empty(&mddev->disks))
4712 goto out_unlock;
4713
4714 err = 0;
4715 if (cmd_match(buf, "none")) {
4716 mddev->persistent = 0;
4717 mddev->external = 0;
4718 mddev->major_version = 0;
4719 mddev->minor_version = 90;
4720 goto out_unlock;
4721 }
4722 if (strncmp(buf, "external:", 9) == 0) {
4723 size_t namelen = len-9;
4724 if (namelen >= sizeof(mddev->metadata_type))
4725 namelen = sizeof(mddev->metadata_type)-1;
4726 strncpy(mddev->metadata_type, buf+9, namelen);
4727 mddev->metadata_type[namelen] = 0;
4728 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4729 mddev->metadata_type[--namelen] = 0;
4730 mddev->persistent = 0;
4731 mddev->external = 1;
4732 mddev->major_version = 0;
4733 mddev->minor_version = 90;
4734 goto out_unlock;
4735 }
4736 major = simple_strtoul(buf, &e, 10);
4737 err = -EINVAL;
4738 if (e==buf || *e != '.')
4739 goto out_unlock;
4740 buf = e+1;
4741 minor = simple_strtoul(buf, &e, 10);
4742 if (e==buf || (*e && *e != '\n') )
4743 goto out_unlock;
4744 err = -ENOENT;
4745 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4746 goto out_unlock;
4747 mddev->major_version = major;
4748 mddev->minor_version = minor;
4749 mddev->persistent = 1;
4750 mddev->external = 0;
4751 err = 0;
4752 out_unlock:
4753 mddev_unlock(mddev);
4754 return err ?: len;
4755 }
4756
4757 static struct md_sysfs_entry md_metadata =
4758 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4759
4760 static ssize_t
4761 action_show(struct mddev *mddev, char *page)
4762 {
4763 char *type = "idle";
4764 unsigned long recovery = mddev->recovery;
4765 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4766 type = "frozen";
4767 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4768 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4769 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4770 type = "reshape";
4771 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4772 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4773 type = "resync";
4774 else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4775 type = "check";
4776 else
4777 type = "repair";
4778 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4779 type = "recover";
4780 else if (mddev->reshape_position != MaxSector)
4781 type = "reshape";
4782 }
4783 return sprintf(page, "%s\n", type);
4784 }
4785
4786 static ssize_t
4787 action_store(struct mddev *mddev, const char *page, size_t len)
4788 {
4789 if (!mddev->pers || !mddev->pers->sync_request)
4790 return -EINVAL;
4791
4792
4793 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4794 if (cmd_match(page, "frozen"))
4795 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4796 else
4797 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4798 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4799 mddev_lock(mddev) == 0) {
4800 if (work_pending(&mddev->del_work))
4801 flush_workqueue(md_misc_wq);
4802 if (mddev->sync_thread) {
4803 sector_t save_rp = mddev->reshape_position;
4804
4805 mddev_unlock(mddev);
4806 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4807 md_unregister_thread(&mddev->sync_thread);
4808 mddev_lock_nointr(mddev);
4809
4810
4811
4812
4813
4814
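/* Set MD_RECOVERY_INTR again and restore reshape_position in case
 * another thread changed them while the reconfig lock was dropped.
 */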
4815 mddev->reshape_position = save_rp;
4816 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4817 md_reap_sync_thread(mddev);
4818 }
4819 mddev_unlock(mddev);
4820 }
4821 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4822 return -EBUSY;
4823 else if (cmd_match(page, "resync"))
4824 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4825 else if (cmd_match(page, "recover")) {
4826 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4827 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4828 } else if (cmd_match(page, "reshape")) {
4829 int err;
4830 if (mddev->pers->start_reshape == NULL)
4831 return -EINVAL;
4832 err = mddev_lock(mddev);
4833 if (!err) {
4834 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4835 err = -EBUSY;
4836 else {
4837 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4838 err = mddev->pers->start_reshape(mddev);
4839 }
4840 mddev_unlock(mddev);
4841 }
4842 if (err)
4843 return err;
4844 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4845 } else {
4846 if (cmd_match(page, "check"))
4847 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4848 else if (!cmd_match(page, "repair"))
4849 return -EINVAL;
4850 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4851 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4852 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4853 }
4854 if (mddev->ro == 2) {
4855
4856
4857
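/* A write to sync_action is enough to justify dropping read-auto
 * (ro == 2) mode and going fully read-write.
 */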
4858 mddev->ro = 0;
4859 md_wakeup_thread(mddev->sync_thread);
4860 }
4861 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4862 md_wakeup_thread(mddev->thread);
4863 sysfs_notify_dirent_safe(mddev->sysfs_action);
4864 return len;
4865 }
4866
4867 static struct md_sysfs_entry md_scan_mode =
4868 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4869
4870 static ssize_t
4871 last_sync_action_show(struct mddev *mddev, char *page)
4872 {
4873 return sprintf(page, "%s\n", mddev->last_sync_action);
4874 }
4875
4876 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4877
4878 static ssize_t
4879 mismatch_cnt_show(struct mddev *mddev, char *page)
4880 {
4881 return sprintf(page, "%llu\n",
4882 (unsigned long long)
4883 atomic64_read(&mddev->resync_mismatches));
4884 }
4885
4886 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4887
4888 static ssize_t
4889 sync_min_show(struct mddev *mddev, char *page)
4890 {
4891 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4892 mddev->sync_speed_min ? "local": "system");
4893 }
4894
4895 static ssize_t
4896 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4897 {
4898 unsigned int min;
4899 int rv;
4900
4901 if (strncmp(buf, "system", 6)==0) {
4902 min = 0;
4903 } else {
4904 rv = kstrtouint(buf, 10, &min);
4905 if (rv < 0)
4906 return rv;
4907 if (min == 0)
4908 return -EINVAL;
4909 }
4910 mddev->sync_speed_min = min;
4911 return len;
4912 }
4913
4914 static struct md_sysfs_entry md_sync_min =
4915 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4916
4917 static ssize_t
4918 sync_max_show(struct mddev *mddev, char *page)
4919 {
4920 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4921 mddev->sync_speed_max ? "local": "system");
4922 }
4923
4924 static ssize_t
4925 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4926 {
4927 unsigned int max;
4928 int rv;
4929
4930 if (strncmp(buf, "system", 6)==0) {
4931 max = 0;
4932 } else {
4933 rv = kstrtouint(buf, 10, &max);
4934 if (rv < 0)
4935 return rv;
4936 if (max == 0)
4937 return -EINVAL;
4938 }
4939 mddev->sync_speed_max = max;
4940 return len;
4941 }
4942
4943 static struct md_sysfs_entry md_sync_max =
4944 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4945
4946 static ssize_t
4947 degraded_show(struct mddev *mddev, char *page)
4948 {
4949 return sprintf(page, "%d\n", mddev->degraded);
4950 }
4951 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4952
4953 static ssize_t
4954 sync_force_parallel_show(struct mddev *mddev, char *page)
4955 {
4956 return sprintf(page, "%d\n", mddev->parallel_resync);
4957 }
4958
4959 static ssize_t
4960 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4961 {
4962 long n;
4963
4964 if (kstrtol(buf, 10, &n))
4965 return -EINVAL;
4966
4967 if (n != 0 && n != 1)
4968 return -EINVAL;
4969
4970 mddev->parallel_resync = n;
4971
4972 if (mddev->sync_thread)
4973 wake_up(&resync_wait);
4974
4975 return len;
4976 }
4977
4978
4979 static struct md_sysfs_entry md_sync_force_parallel =
4980 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4981 sync_force_parallel_show, sync_force_parallel_store);
4982
4983 static ssize_t
4984 sync_speed_show(struct mddev *mddev, char *page)
4985 {
4986 unsigned long resync, dt, db;
4987 if (mddev->curr_resync == MD_RESYNC_NONE)
4988 return sprintf(page, "none\n");
4989 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4990 dt = (jiffies - mddev->resync_mark) / HZ;
4991 if (!dt) dt++;
4992 db = resync - mddev->resync_mark_cnt;
4993 return sprintf(page, "%lu\n", db/dt/2);
4994 }
4995
4996 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4997
4998 static ssize_t
4999 sync_completed_show(struct mddev *mddev, char *page)
5000 {
5001 unsigned long long max_sectors, resync;
5002
5003 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5004 return sprintf(page, "none\n");
5005
5006 if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5007 mddev->curr_resync == MD_RESYNC_DELAYED)
5008 return sprintf(page, "delayed\n");
5009
5010 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5011 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5012 max_sectors = mddev->resync_max_sectors;
5013 else
5014 max_sectors = mddev->dev_sectors;
5015
5016 resync = mddev->curr_resync_completed;
5017 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5018 }
5019
5020 static struct md_sysfs_entry md_sync_completed =
5021 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5022
5023 static ssize_t
5024 min_sync_show(struct mddev *mddev, char *page)
5025 {
5026 return sprintf(page, "%llu\n",
5027 (unsigned long long)mddev->resync_min);
5028 }
5029 static ssize_t
5030 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5031 {
5032 unsigned long long min;
5033 int err;
5034
5035 if (kstrtoull(buf, 10, &min))
5036 return -EINVAL;
5037
5038 spin_lock(&mddev->lock);
5039 err = -EINVAL;
5040 if (min > mddev->resync_max)
5041 goto out_unlock;
5042
5043 err = -EBUSY;
5044 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5045 goto out_unlock;
5046
5047
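/* Round down to a multiple of 8 sectors (4KiB) for safety */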
5048 mddev->resync_min = round_down(min, 8);
5049 err = 0;
5050
5051 out_unlock:
5052 spin_unlock(&mddev->lock);
5053 return err ?: len;
5054 }
5055
5056 static struct md_sysfs_entry md_min_sync =
5057 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5058
5059 static ssize_t
5060 max_sync_show(struct mddev *mddev, char *page)
5061 {
5062 if (mddev->resync_max == MaxSector)
5063 return sprintf(page, "max\n");
5064 else
5065 return sprintf(page, "%llu\n",
5066 (unsigned long long)mddev->resync_max);
5067 }
5068 static ssize_t
5069 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5070 {
5071 int err;
5072 spin_lock(&mddev->lock);
5073 if (strncmp(buf, "max", 3) == 0)
5074 mddev->resync_max = MaxSector;
5075 else {
5076 unsigned long long max;
5077 int chunk;
5078
5079 err = -EINVAL;
5080 if (kstrtoull(buf, 10, &max))
5081 goto out_unlock;
5082 if (max < mddev->resync_min)
5083 goto out_unlock;
5084
5085 err = -EBUSY;
5086 if (max < mddev->resync_max &&
5087 mddev->ro == 0 &&
5088 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5089 goto out_unlock;
5090
5091
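/* Must be a multiple of chunk_size, if the array has one */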
5092 chunk = mddev->chunk_sectors;
5093 if (chunk) {
5094 sector_t temp = max;
5095
5096 err = -EINVAL;
5097 if (sector_div(temp, chunk))
5098 goto out_unlock;
5099 }
5100 mddev->resync_max = max;
5101 }
5102 wake_up(&mddev->recovery_wait);
5103 err = 0;
5104 out_unlock:
5105 spin_unlock(&mddev->lock);
5106 return err ?: len;
5107 }
5108
5109 static struct md_sysfs_entry md_max_sync =
5110 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5111
5112 static ssize_t
5113 suspend_lo_show(struct mddev *mddev, char *page)
5114 {
5115 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
5116 }
5117
5118 static ssize_t
5119 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5120 {
5121 unsigned long long new;
5122 int err;
5123
5124 err = kstrtoull(buf, 10, &new);
5125 if (err < 0)
5126 return err;
5127 if (new != (sector_t)new)
5128 return -EINVAL;
5129
5130 err = mddev_lock(mddev);
5131 if (err)
5132 return err;
5133 err = -EINVAL;
5134 if (mddev->pers == NULL ||
5135 mddev->pers->quiesce == NULL)
5136 goto unlock;
5137 mddev_suspend(mddev);
5138 mddev->suspend_lo = new;
5139 mddev_resume(mddev);
5140
5141 err = 0;
5142 unlock:
5143 mddev_unlock(mddev);
5144 return err ?: len;
5145 }
5146 static struct md_sysfs_entry md_suspend_lo =
5147 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5148
5149 static ssize_t
5150 suspend_hi_show(struct mddev *mddev, char *page)
5151 {
5152 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
5153 }
5154
5155 static ssize_t
5156 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5157 {
5158 unsigned long long new;
5159 int err;
5160
5161 err = kstrtoull(buf, 10, &new);
5162 if (err < 0)
5163 return err;
5164 if (new != (sector_t)new)
5165 return -EINVAL;
5166
5167 err = mddev_lock(mddev);
5168 if (err)
5169 return err;
5170 err = -EINVAL;
5171 if (mddev->pers == NULL)
5172 goto unlock;
5173
5174 mddev_suspend(mddev);
5175 mddev->suspend_hi = new;
5176 mddev_resume(mddev);
5177
5178 err = 0;
5179 unlock:
5180 mddev_unlock(mddev);
5181 return err ?: len;
5182 }
5183 static struct md_sysfs_entry md_suspend_hi =
5184 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5185
5186 static ssize_t
5187 reshape_position_show(struct mddev *mddev, char *page)
5188 {
5189 if (mddev->reshape_position != MaxSector)
5190 return sprintf(page, "%llu\n",
5191 (unsigned long long)mddev->reshape_position);
5192 strcpy(page, "none\n");
5193 return 5;
5194 }
5195
5196 static ssize_t
5197 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5198 {
5199 struct md_rdev *rdev;
5200 unsigned long long new;
5201 int err;
5202
5203 err = kstrtoull(buf, 10, &new);
5204 if (err < 0)
5205 return err;
5206 if (new != (sector_t)new)
5207 return -EINVAL;
5208 err = mddev_lock(mddev);
5209 if (err)
5210 return err;
5211 err = -EBUSY;
5212 if (mddev->pers)
5213 goto unlock;
5214 mddev->reshape_position = new;
5215 mddev->delta_disks = 0;
5216 mddev->reshape_backwards = 0;
5217 mddev->new_level = mddev->level;
5218 mddev->new_layout = mddev->layout;
5219 mddev->new_chunk_sectors = mddev->chunk_sectors;
5220 rdev_for_each(rdev, mddev)
5221 rdev->new_data_offset = rdev->data_offset;
5222 err = 0;
5223 unlock:
5224 mddev_unlock(mddev);
5225 return err ?: len;
5226 }
5227
5228 static struct md_sysfs_entry md_reshape_position =
5229 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5230 reshape_position_store);
5231
5232 static ssize_t
5233 reshape_direction_show(struct mddev *mddev, char *page)
5234 {
5235 return sprintf(page, "%s\n",
5236 mddev->reshape_backwards ? "backwards" : "forwards");
5237 }
5238
5239 static ssize_t
5240 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5241 {
5242 int backwards = 0;
5243 int err;
5244
5245 if (cmd_match(buf, "forwards"))
5246 backwards = 0;
5247 else if (cmd_match(buf, "backwards"))
5248 backwards = 1;
5249 else
5250 return -EINVAL;
5251 if (mddev->reshape_backwards == backwards)
5252 return len;
5253
5254 err = mddev_lock(mddev);
5255 if (err)
5256 return err;
5257
5258 if (mddev->delta_disks)
5259 err = -EBUSY;
5260 else if (mddev->persistent &&
5261 mddev->major_version == 0)
5262 err = -EINVAL;
5263 else
5264 mddev->reshape_backwards = backwards;
5265 mddev_unlock(mddev);
5266 return err ?: len;
5267 }
5268
5269 static struct md_sysfs_entry md_reshape_direction =
5270 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5271 reshape_direction_store);
5272
5273 static ssize_t
5274 array_size_show(struct mddev *mddev, char *page)
5275 {
5276 if (mddev->external_size)
5277 return sprintf(page, "%llu\n",
5278 (unsigned long long)mddev->array_sectors/2);
5279 else
5280 return sprintf(page, "default\n");
5281 }
5282
5283 static ssize_t
5284 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5285 {
5286 sector_t sectors;
5287 int err;
5288
5289 err = mddev_lock(mddev);
5290 if (err)
5291 return err;
5292
5293
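/* Clustered arrays do not support changing array_size */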
5294 if (mddev_is_clustered(mddev)) {
5295 mddev_unlock(mddev);
5296 return -EINVAL;
5297 }
5298
5299 if (strncmp(buf, "default", 7) == 0) {
5300 if (mddev->pers)
5301 sectors = mddev->pers->size(mddev, 0, 0);
5302 else
5303 sectors = mddev->array_sectors;
5304
5305 mddev->external_size = 0;
5306 } else {
5307 if (strict_blocks_to_sectors(buf, &sectors) < 0)
5308 err = -EINVAL;
5309 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5310 err = -E2BIG;
5311 else
5312 mddev->external_size = 1;
5313 }
5314
5315 if (!err) {
5316 mddev->array_sectors = sectors;
5317 if (mddev->pers)
5318 set_capacity_and_notify(mddev->gendisk,
5319 mddev->array_sectors);
5320 }
5321 mddev_unlock(mddev);
5322 return err ?: len;
5323 }
5324
5325 static struct md_sysfs_entry md_array_size =
5326 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5327 array_size_store);
5328
5329 static ssize_t
5330 consistency_policy_show(struct mddev *mddev, char *page)
5331 {
5332 int ret;
5333
5334 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5335 ret = sprintf(page, "journal\n");
5336 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5337 ret = sprintf(page, "ppl\n");
5338 } else if (mddev->bitmap) {
5339 ret = sprintf(page, "bitmap\n");
5340 } else if (mddev->pers) {
5341 if (mddev->pers->sync_request)
5342 ret = sprintf(page, "resync\n");
5343 else
5344 ret = sprintf(page, "none\n");
5345 } else {
5346 ret = sprintf(page, "unknown\n");
5347 }
5348
5349 return ret;
5350 }
5351
5352 static ssize_t
5353 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5354 {
5355 int err = 0;
5356
5357 if (mddev->pers) {
5358 if (mddev->pers->change_consistency_policy)
5359 err = mddev->pers->change_consistency_policy(mddev, buf);
5360 else
5361 err = -EBUSY;
5362 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5363 set_bit(MD_HAS_PPL, &mddev->flags);
5364 } else {
5365 err = -EINVAL;
5366 }
5367
5368 return err ? err : len;
5369 }
5370
5371 static struct md_sysfs_entry md_consistency_policy =
5372 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5373 consistency_policy_store);
5374
5375 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5376 {
5377 return sprintf(page, "%d\n", mddev->fail_last_dev);
5378 }
5379
5380
5381
5382
5383
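/* Setting fail_last_dev allows the last working device in the array to
 * be forcibly marked Faulty (consulted by the raid1/raid10 code).
 */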
5384 static ssize_t
5385 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5386 {
5387 int ret;
5388 bool value;
5389
5390 ret = kstrtobool(buf, &value);
5391 if (ret)
5392 return ret;
5393
5394 if (value != mddev->fail_last_dev)
5395 mddev->fail_last_dev = value;
5396
5397 return len;
5398 }
5399 static struct md_sysfs_entry md_fail_last_dev =
5400 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5401 fail_last_dev_store);
5402
5403 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5404 {
5405 if (mddev->pers == NULL || (mddev->pers->level != 1))
5406 return sprintf(page, "n/a\n");
5407 else
5408 return sprintf(page, "%d\n", mddev->serialize_policy);
5409 }
5410
5411
5412
5413
5414
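/* When serialize_policy is set, write IO to the array is serialised so
 * that overlapping writes cannot be reordered; as the store handler
 * below enforces, this is only effective for raid1.
 */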
5415 static ssize_t
5416 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5417 {
5418 int err;
5419 bool value;
5420
5421 err = kstrtobool(buf, &value);
5422 if (err)
5423 return err;
5424
5425 if (value == mddev->serialize_policy)
5426 return len;
5427
5428 err = mddev_lock(mddev);
5429 if (err)
5430 return err;
5431 if (mddev->pers == NULL || (mddev->pers->level != 1)) {
5432 pr_err("md: serialize_policy is only effective for raid1\n");
5433 err = -EINVAL;
5434 goto unlock;
5435 }
5436
5437 mddev_suspend(mddev);
5438 if (value)
5439 mddev_create_serial_pool(mddev, NULL, true);
5440 else
5441 mddev_destroy_serial_pool(mddev, NULL, true);
5442 mddev->serialize_policy = value;
5443 mddev_resume(mddev);
5444 unlock:
5445 mddev_unlock(mddev);
5446 return err ?: len;
5447 }
5448
5449 static struct md_sysfs_entry md_serialize_policy =
5450 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5451 serialize_policy_store);
5452
5453
5454 static struct attribute *md_default_attrs[] = {
5455 &md_level.attr,
5456 &md_layout.attr,
5457 &md_raid_disks.attr,
5458 &md_uuid.attr,
5459 &md_chunk_size.attr,
5460 &md_size.attr,
5461 &md_resync_start.attr,
5462 &md_metadata.attr,
5463 &md_new_device.attr,
5464 &md_safe_delay.attr,
5465 &md_array_state.attr,
5466 &md_reshape_position.attr,
5467 &md_reshape_direction.attr,
5468 &md_array_size.attr,
5469 &max_corr_read_errors.attr,
5470 &md_consistency_policy.attr,
5471 &md_fail_last_dev.attr,
5472 &md_serialize_policy.attr,
5473 NULL,
5474 };
5475
5476 static const struct attribute_group md_default_group = {
5477 .attrs = md_default_attrs,
5478 };
5479
5480 static struct attribute *md_redundancy_attrs[] = {
5481 &md_scan_mode.attr,
5482 &md_last_scan_mode.attr,
5483 &md_mismatches.attr,
5484 &md_sync_min.attr,
5485 &md_sync_max.attr,
5486 &md_sync_speed.attr,
5487 &md_sync_force_parallel.attr,
5488 &md_sync_completed.attr,
5489 &md_min_sync.attr,
5490 &md_max_sync.attr,
5491 &md_suspend_lo.attr,
5492 &md_suspend_hi.attr,
5493 &md_bitmap.attr,
5494 &md_degraded.attr,
5495 NULL,
5496 };
5497 static const struct attribute_group md_redundancy_group = {
5498 .name = NULL,
5499 .attrs = md_redundancy_attrs,
5500 };
5501
5502 static const struct attribute_group *md_attr_groups[] = {
5503 &md_default_group,
5504 &md_bitmap_group,
5505 NULL,
5506 };
5507
5508 static ssize_t
5509 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5510 {
5511 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5512 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5513 ssize_t rv;
5514
5515 if (!entry->show)
5516 return -EIO;
5517 spin_lock(&all_mddevs_lock);
5518 if (!mddev_get(mddev)) {
5519 spin_unlock(&all_mddevs_lock);
5520 return -EBUSY;
5521 }
5522 spin_unlock(&all_mddevs_lock);
5523
5524 rv = entry->show(mddev, page);
5525 mddev_put(mddev);
5526 return rv;
5527 }
5528
5529 static ssize_t
5530 md_attr_store(struct kobject *kobj, struct attribute *attr,
5531 const char *page, size_t length)
5532 {
5533 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5534 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5535 ssize_t rv;
5536
5537 if (!entry->store)
5538 return -EIO;
5539 if (!capable(CAP_SYS_ADMIN))
5540 return -EACCES;
5541 spin_lock(&all_mddevs_lock);
5542 if (!mddev_get(mddev)) {
5543 spin_unlock(&all_mddevs_lock);
5544 return -EBUSY;
5545 }
5546 spin_unlock(&all_mddevs_lock);
5547 rv = entry->store(mddev, page, length);
5548 mddev_put(mddev);
5549 return rv;
5550 }
5551
5552 static void md_kobj_release(struct kobject *ko)
5553 {
5554 struct mddev *mddev = container_of(ko, struct mddev, kobj);
5555
5556 if (mddev->sysfs_state)
5557 sysfs_put(mddev->sysfs_state);
5558 if (mddev->sysfs_level)
5559 sysfs_put(mddev->sysfs_level);
5560
5561 del_gendisk(mddev->gendisk);
5562 put_disk(mddev->gendisk);
5563 }
5564
5565 static const struct sysfs_ops md_sysfs_ops = {
5566 .show = md_attr_show,
5567 .store = md_attr_store,
5568 };
5569 static struct kobj_type md_ktype = {
5570 .release = md_kobj_release,
5571 .sysfs_ops = &md_sysfs_ops,
5572 .default_groups = md_attr_groups,
5573 };
5574
5575 int mdp_major = 0;
5576
5577 static void mddev_delayed_delete(struct work_struct *ws)
5578 {
5579 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5580
5581 kobject_put(&mddev->kobj);
5582 }
5583
5584 static void no_op(struct percpu_ref *r) {}
5585
5586 int mddev_init_writes_pending(struct mddev *mddev)
5587 {
5588 if (mddev->writes_pending.percpu_count_ptr)
5589 return 0;
5590 if (percpu_ref_init(&mddev->writes_pending, no_op,
5591 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
5592 return -ENOMEM;
5593
5594 percpu_ref_put(&mddev->writes_pending);
5595 return 0;
5596 }
5597 EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5598
5599 struct mddev *md_alloc(dev_t dev, char *name)
5600 {
5601
5602
5603
5604
5605
5606
5607
5608
5609
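/* If dev is zero, "name" gives the name of the device to allocate and
 * an arbitrary free minor number is used.  If dev is non-zero it must
 * be a device number with a major of MD_MAJOR or mdp_major; a non-NULL
 * "name" then means the array is being created via the "new_array"
 * module parameter rather than by opening a node in /dev.
 */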
5610 static DEFINE_MUTEX(disks_mutex);
5611 struct mddev *mddev;
5612 struct gendisk *disk;
5613 int partitioned;
5614 int shift;
5615 int unit;
5616 int error;
5617
5618
5619
5620
5621
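/* Wait for any previous instance of this device to be completely
 * removed (mddev_delayed_delete).
 */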
5622 flush_workqueue(md_misc_wq);
5623 flush_workqueue(md_rdev_misc_wq);
5624
5625 mutex_lock(&disks_mutex);
5626 mddev = mddev_alloc(dev);
5627 if (IS_ERR(mddev)) {
5628 error = PTR_ERR(mddev);
5629 goto out_unlock;
5630 }
5631
5632 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5633 shift = partitioned ? MdpMinorShift : 0;
5634 unit = MINOR(mddev->unit) >> shift;
5635
5636 if (name && !dev) {
5637
5638
5639 struct mddev *mddev2;
5640 spin_lock(&all_mddevs_lock);
5641
5642 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5643 if (mddev2->gendisk &&
5644 strcmp(mddev2->gendisk->disk_name, name) == 0) {
5645 spin_unlock(&all_mddevs_lock);
5646 error = -EEXIST;
5647 goto out_free_mddev;
5648 }
5649 spin_unlock(&all_mddevs_lock);
5650 }
5651 if (name && dev)
5652
5653
5654
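/* Array is being created by writing to "new_array", so keep it
 * around until it is explicitly stopped.
 */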
5655 mddev->hold_active = UNTIL_STOP;
5656
5657 error = -ENOMEM;
5658 disk = blk_alloc_disk(NUMA_NO_NODE);
5659 if (!disk)
5660 goto out_free_mddev;
5661
5662 disk->major = MAJOR(mddev->unit);
5663 disk->first_minor = unit << shift;
5664 disk->minors = 1 << shift;
5665 if (name)
5666 strcpy(disk->disk_name, name);
5667 else if (partitioned)
5668 sprintf(disk->disk_name, "md_d%d", unit);
5669 else
5670 sprintf(disk->disk_name, "md%d", unit);
5671 disk->fops = &md_fops;
5672 disk->private_data = mddev;
5673
5674 mddev->queue = disk->queue;
5675 blk_set_stacking_limits(&mddev->queue->limits);
5676 blk_queue_write_cache(mddev->queue, true, true);
5677 disk->events |= DISK_EVENT_MEDIA_CHANGE;
5678 mddev->gendisk = disk;
5679 error = add_disk(disk);
5680 if (error)
5681 goto out_put_disk;
5682
5683 kobject_init(&mddev->kobj, &md_ktype);
5684 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5685 if (error) {
5686
5687
5688
5689
5690
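/* The disk is already live at this point.  Clear the hold flag and
 * let mddev_put() take care of the deletion, just as for a normal
 * close on last release.
 */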
5691 mddev->hold_active = 0;
5692 mutex_unlock(&disks_mutex);
5693 mddev_put(mddev);
5694 return ERR_PTR(error);
5695 }
5696
5697 kobject_uevent(&mddev->kobj, KOBJ_ADD);
5698 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5699 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5700 mutex_unlock(&disks_mutex);
5701 return mddev;
5702
5703 out_put_disk:
5704 put_disk(disk);
5705 out_free_mddev:
5706 mddev_free(mddev);
5707 out_unlock:
5708 mutex_unlock(&disks_mutex);
5709 return ERR_PTR(error);
5710 }
5711
5712 static int md_alloc_and_put(dev_t dev, char *name)
5713 {
5714 struct mddev *mddev = md_alloc(dev, name);
5715
5716 if (IS_ERR(mddev))
5717 return PTR_ERR(mddev);
5718 mddev_put(mddev);
5719 return 0;
5720 }
5721
5722 static void md_probe(dev_t dev)
5723 {
5724 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
5725 return;
5726 if (create_on_open)
5727 md_alloc_and_put(dev, NULL);
5728 }
5729
5730 static int add_named_array(const char *val, const struct kernel_param *kp)
5731 {
5732
5733
5734
5735
5736
5737
5738
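/* val must be "md_*" or "mdNNN".  For "md_*" an array with that name
 * and an arbitrary free minor is allocated; for "mdNNN" the array gets
 * minor number NNN, which must not already be in use.
 */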
5739 int len = strlen(val);
5740 char buf[DISK_NAME_LEN];
5741 unsigned long devnum;
5742
5743 while (len && val[len-1] == '\n')
5744 len--;
5745 if (len >= DISK_NAME_LEN)
5746 return -E2BIG;
5747 strscpy(buf, val, len+1);
5748 if (strncmp(buf, "md_", 3) == 0)
5749 return md_alloc_and_put(0, buf);
5750 if (strncmp(buf, "md", 2) == 0 &&
5751 isdigit(buf[2]) &&
5752 kstrtoul(buf+2, 10, &devnum) == 0 &&
5753 devnum <= MINORMASK)
5754 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
5755
5756 return -EINVAL;
5757 }
5758
5759 static void md_safemode_timeout(struct timer_list *t)
5760 {
5761 struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5762
5763 mddev->safemode = 1;
5764 if (mddev->external)
5765 sysfs_notify_dirent_safe(mddev->sysfs_state);
5766
5767 md_wakeup_thread(mddev->thread);
5768 }
5769
5770 static int start_dirty_degraded;
5771
5772 int md_run(struct mddev *mddev)
5773 {
5774 int err;
5775 struct md_rdev *rdev;
5776 struct md_personality *pers;
5777 bool nowait = true;
5778
5779 if (list_empty(&mddev->disks))
5780
5781 return -EINVAL;
5782
5783 if (mddev->pers)
5784 return -EBUSY;
5785
5786 if (mddev->sysfs_active)
5787 return -EBUSY;
5788
5789
5790
5791
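/*
 * Analyze all RAID superblock(s)
 */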
5792 if (!mddev->raid_disks) {
5793 if (!mddev->persistent)
5794 return -EINVAL;
5795 err = analyze_sbs(mddev);
5796 if (err)
5797 return -EINVAL;
5798 }
5799
5800 if (mddev->level != LEVEL_NONE)
5801 request_module("md-level-%d", mddev->level);
5802 else if (mddev->clevel[0])
5803 request_module("md-%s", mddev->clevel);
5804
5805
5806
5807
5808
5809
5810 mddev->has_superblocks = false;
5811 rdev_for_each(rdev, mddev) {
5812 if (test_bit(Faulty, &rdev->flags))
5813 continue;
5814 sync_blockdev(rdev->bdev);
5815 invalidate_bdev(rdev->bdev);
5816 if (mddev->ro != 1 && rdev_read_only(rdev)) {
5817 mddev->ro = 1;
5818 if (mddev->gendisk)
5819 set_disk_ro(mddev->gendisk, 1);
5820 }
5821
5822 if (rdev->sb_page)
5823 mddev->has_superblocks = true;
5824
5825
5826
5827
5828
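/* Perform some consistency tests on the device: the data area and the
 * metadata (superblock) must not overlap.
 */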
5829 if (rdev->meta_bdev) {
5830 ;
5831 } else if (rdev->data_offset < rdev->sb_start) {
5832 if (mddev->dev_sectors &&
5833 rdev->data_offset + mddev->dev_sectors
5834 > rdev->sb_start) {
5835 pr_warn("md: %s: data overlaps metadata\n",
5836 mdname(mddev));
5837 return -EINVAL;
5838 }
5839 } else {
5840 if (rdev->sb_start + rdev->sb_size/512
5841 > rdev->data_offset) {
5842 pr_warn("md: %s: metadata overlaps data\n",
5843 mdname(mddev));
5844 return -EINVAL;
5845 }
5846 }
5847 sysfs_notify_dirent_safe(rdev->sysfs_state);
5848 nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
5849 }
5850
5851 if (!bioset_initialized(&mddev->bio_set)) {
5852 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5853 if (err)
5854 return err;
5855 }
5856 if (!bioset_initialized(&mddev->sync_set)) {
5857 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5858 if (err)
5859 goto exit_bio_set;
5860 }
5861
5862 spin_lock(&pers_lock);
5863 pers = find_pers(mddev->level, mddev->clevel);
5864 if (!pers || !try_module_get(pers->owner)) {
5865 spin_unlock(&pers_lock);
5866 if (mddev->level != LEVEL_NONE)
5867 pr_warn("md: personality for level %d is not loaded!\n",
5868 mddev->level);
5869 else
5870 pr_warn("md: personality for level %s is not loaded!\n",
5871 mddev->clevel);
5872 err = -EINVAL;
5873 goto abort;
5874 }
5875 spin_unlock(&pers_lock);
5876 if (mddev->level != pers->level) {
5877 mddev->level = pers->level;
5878 mddev->new_level = pers->level;
5879 }
5880 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5881
5882 if (mddev->reshape_position != MaxSector &&
5883 pers->start_reshape == NULL) {
5884
5885 module_put(pers->owner);
5886 err = -EINVAL;
5887 goto abort;
5888 }
5889
5890 if (pers->sync_request) {
5891
5892
5893
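/* Warn if two array members appear to live on the same physical disk;
 * redundancy against single-disk failure would then be compromised.
 */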
5894 struct md_rdev *rdev2;
5895 int warned = 0;
5896
5897 rdev_for_each(rdev, mddev)
5898 rdev_for_each(rdev2, mddev) {
5899 if (rdev < rdev2 &&
5900 rdev->bdev->bd_disk ==
5901 rdev2->bdev->bd_disk) {
5902 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
5903 mdname(mddev),
5904 rdev->bdev,
5905 rdev2->bdev);
5906 warned = 1;
5907 }
5908 }
5909
5910 if (warned)
5911 pr_warn("True protection against single-disk failure might be compromised.\n");
5912 }
5913
5914 mddev->recovery = 0;
5915
5916 mddev->resync_max_sectors = mddev->dev_sectors;
5917
5918 mddev->ok_start_degraded = start_dirty_degraded;
5919
5920 if (start_readonly && mddev->ro == 0)
5921 mddev->ro = 2;
5922
5923 err = pers->run(mddev);
5924 if (err)
5925 pr_warn("md: pers->run() failed ...\n");
5926 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5927 WARN_ONCE(!mddev->external_size,
5928 "%s: default size too small, but 'external_size' not in effect?\n",
5929 __func__);
5930 pr_warn("md: invalid array_size %llu > default size %llu\n",
5931 (unsigned long long)mddev->array_sectors / 2,
5932 (unsigned long long)pers->size(mddev, 0, 0) / 2);
5933 err = -EINVAL;
5934 }
5935 if (err == 0 && pers->sync_request &&
5936 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5937 struct bitmap *bitmap;
5938
5939 bitmap = md_bitmap_create(mddev, -1);
5940 if (IS_ERR(bitmap)) {
5941 err = PTR_ERR(bitmap);
5942 pr_warn("%s: failed to create bitmap (%d)\n",
5943 mdname(mddev), err);
5944 } else
5945 mddev->bitmap = bitmap;
5946
5947 }
5948 if (err)
5949 goto bitmap_abort;
5950
5951 if (mddev->bitmap_info.max_write_behind > 0) {
5952 bool create_pool = false;
5953
5954 rdev_for_each(rdev, mddev) {
5955 if (test_bit(WriteMostly, &rdev->flags) &&
5956 rdev_init_serial(rdev))
5957 create_pool = true;
5958 }
5959 if (create_pool && mddev->serial_info_pool == NULL) {
5960 mddev->serial_info_pool =
5961 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
5962 sizeof(struct serial_info));
5963 if (!mddev->serial_info_pool) {
5964 err = -ENOMEM;
5965 goto bitmap_abort;
5966 }
5967 }
5968 }
5969
5970 if (mddev->queue) {
5971 bool nonrot = true;
5972
5973 rdev_for_each(rdev, mddev) {
5974 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) {
5975 nonrot = false;
5976 break;
5977 }
5978 }
5979 if (mddev->degraded)
5980 nonrot = false;
5981 if (nonrot)
5982 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
5983 else
5984 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5985 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
5986
5987
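/* Only advertise REQ_NOWAIT support if every member device supports it */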
5988 if (nowait)
5989 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
5990 }
5991 if (pers->sync_request) {
5992 if (mddev->kobj.sd &&
5993 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5994 pr_warn("md: cannot register extra attributes for %s\n",
5995 mdname(mddev));
5996 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5997 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
5998 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
5999 } else if (mddev->ro == 2)
6000 mddev->ro = 0;
6001
6002 atomic_set(&mddev->max_corr_read_errors,
6003 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6004 mddev->safemode = 0;
6005 if (mddev_is_clustered(mddev))
6006 mddev->safemode_delay = 0;
6007 else
6008 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6009 mddev->in_sync = 1;
6010 smp_wmb();
6011 spin_lock(&mddev->lock);
6012 mddev->pers = pers;
6013 spin_unlock(&mddev->lock);
6014 rdev_for_each(rdev, mddev)
6015 if (rdev->raid_disk >= 0)
6016 sysfs_link_rdev(mddev, rdev);
6017
6018 if (mddev->degraded && !mddev->ro)
6019
6020
6021
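/* A degraded read-write array should try to recover onto spares; flag
 * this immediately so the recovering status is visible via sysfs even
 * before a lack of spares is confirmed.
 */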
6022 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6023 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6024
6025 if (mddev->sb_flags)
6026 md_update_sb(mddev, 0);
6027
6028 md_new_event();
6029 return 0;
6030
6031 bitmap_abort:
6032 mddev_detach(mddev);
6033 if (mddev->private)
6034 pers->free(mddev, mddev->private);
6035 mddev->private = NULL;
6036 module_put(pers->owner);
6037 md_bitmap_destroy(mddev);
6038 abort:
6039 bioset_exit(&mddev->sync_set);
6040 exit_bio_set:
6041 bioset_exit(&mddev->bio_set);
6042 return err;
6043 }
6044 EXPORT_SYMBOL_GPL(md_run);
6045
6046 int do_md_run(struct mddev *mddev)
6047 {
6048 int err;
6049
6050 set_bit(MD_NOT_READY, &mddev->flags);
6051 err = md_run(mddev);
6052 if (err)
6053 goto out;
6054 err = md_bitmap_load(mddev);
6055 if (err) {
6056 md_bitmap_destroy(mddev);
6057 goto out;
6058 }
6059
6060 if (mddev_is_clustered(mddev))
6061 md_allow_write(mddev);
6062
6063
6064 md_start(mddev);
6065
6066 md_wakeup_thread(mddev->thread);
6067 md_wakeup_thread(mddev->sync_thread);
6068
6069 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6070 clear_bit(MD_NOT_READY, &mddev->flags);
6071 mddev->changed = 1;
6072 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6073 sysfs_notify_dirent_safe(mddev->sysfs_state);
6074 sysfs_notify_dirent_safe(mddev->sysfs_action);
6075 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6076 out:
6077 clear_bit(MD_NOT_READY, &mddev->flags);
6078 return err;
6079 }
6080
6081 int md_start(struct mddev *mddev)
6082 {
6083 int ret = 0;
6084
6085 if (mddev->pers->start) {
6086 set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6087 md_wakeup_thread(mddev->thread);
6088 ret = mddev->pers->start(mddev);
6089 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6090 md_wakeup_thread(mddev->sync_thread);
6091 }
6092 return ret;
6093 }
6094 EXPORT_SYMBOL_GPL(md_start);
6095
6096 static int restart_array(struct mddev *mddev)
6097 {
6098 struct gendisk *disk = mddev->gendisk;
6099 struct md_rdev *rdev;
6100 bool has_journal = false;
6101 bool has_readonly = false;
6102
6103
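/* Complain if it has no devices */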
6104 if (list_empty(&mddev->disks))
6105 return -ENXIO;
6106 if (!mddev->pers)
6107 return -EINVAL;
6108 if (!mddev->ro)
6109 return -EBUSY;
6110
6111 rcu_read_lock();
6112 rdev_for_each_rcu(rdev, mddev) {
6113 if (test_bit(Journal, &rdev->flags) &&
6114 !test_bit(Faulty, &rdev->flags))
6115 has_journal = true;
6116 if (rdev_read_only(rdev))
6117 has_readonly = true;
6118 }
6119 rcu_read_unlock();
6120 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6121
6122 return -EINVAL;
6123 if (has_readonly)
6124 return -EROFS;
6125
6126 mddev->safemode = 0;
6127 mddev->ro = 0;
6128 set_disk_ro(disk, 0);
6129 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6130
6131 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6132 md_wakeup_thread(mddev->thread);
6133 md_wakeup_thread(mddev->sync_thread);
6134 sysfs_notify_dirent_safe(mddev->sysfs_state);
6135 return 0;
6136 }
6137
6138 static void md_clean(struct mddev *mddev)
6139 {
6140 mddev->array_sectors = 0;
6141 mddev->external_size = 0;
6142 mddev->dev_sectors = 0;
6143 mddev->raid_disks = 0;
6144 mddev->recovery_cp = 0;
6145 mddev->resync_min = 0;
6146 mddev->resync_max = MaxSector;
6147 mddev->reshape_position = MaxSector;
6148 mddev->external = 0;
6149 mddev->persistent = 0;
6150 mddev->level = LEVEL_NONE;
6151 mddev->clevel[0] = 0;
6152 mddev->flags = 0;
6153 mddev->sb_flags = 0;
6154 mddev->ro = 0;
6155 mddev->metadata_type[0] = 0;
6156 mddev->chunk_sectors = 0;
6157 mddev->ctime = mddev->utime = 0;
6158 mddev->layout = 0;
6159 mddev->max_disks = 0;
6160 mddev->events = 0;
6161 mddev->can_decrease_events = 0;
6162 mddev->delta_disks = 0;
6163 mddev->reshape_backwards = 0;
6164 mddev->new_level = LEVEL_NONE;
6165 mddev->new_layout = 0;
6166 mddev->new_chunk_sectors = 0;
6167 mddev->curr_resync = 0;
6168 atomic64_set(&mddev->resync_mismatches, 0);
6169 mddev->suspend_lo = mddev->suspend_hi = 0;
6170 mddev->sync_speed_min = mddev->sync_speed_max = 0;
6171 mddev->recovery = 0;
6172 mddev->in_sync = 0;
6173 mddev->changed = 0;
6174 mddev->degraded = 0;
6175 mddev->safemode = 0;
6176 mddev->private = NULL;
6177 mddev->cluster_info = NULL;
6178 mddev->bitmap_info.offset = 0;
6179 mddev->bitmap_info.default_offset = 0;
6180 mddev->bitmap_info.default_space = 0;
6181 mddev->bitmap_info.chunksize = 0;
6182 mddev->bitmap_info.daemon_sleep = 0;
6183 mddev->bitmap_info.max_write_behind = 0;
6184 mddev->bitmap_info.nodes = 0;
6185 }
6186
6187 static void __md_stop_writes(struct mddev *mddev)
6188 {
6189 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6190 if (work_pending(&mddev->del_work))
6191 flush_workqueue(md_misc_wq);
6192 if (mddev->sync_thread) {
6193 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6194 md_unregister_thread(&mddev->sync_thread);
6195 md_reap_sync_thread(mddev);
6196 }
6197
6198 del_timer_sync(&mddev->safemode_timer);
6199
6200 if (mddev->pers && mddev->pers->quiesce) {
6201 mddev->pers->quiesce(mddev, 1);
6202 mddev->pers->quiesce(mddev, 0);
6203 }
6204 md_bitmap_flush(mddev);
6205
6206 if (mddev->ro == 0 &&
6207 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6208 mddev->sb_flags)) {
6209
6210 if (!mddev_is_clustered(mddev))
6211 mddev->in_sync = 1;
6212 md_update_sb(mddev, 1);
6213 }
6214
6215 mddev->serialize_policy = 0;
6216 mddev_destroy_serial_pool(mddev, NULL, true);
6217 }
6218
6219 void md_stop_writes(struct mddev *mddev)
6220 {
6221 mddev_lock_nointr(mddev);
6222 __md_stop_writes(mddev);
6223 mddev_unlock(mddev);
6224 }
6225 EXPORT_SYMBOL_GPL(md_stop_writes);
6226
6227 static void mddev_detach(struct mddev *mddev)
6228 {
6229 md_bitmap_wait_behind_writes(mddev);
6230 if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) {
6231 mddev->pers->quiesce(mddev, 1);
6232 mddev->pers->quiesce(mddev, 0);
6233 }
6234 md_unregister_thread(&mddev->thread);
6235 if (mddev->queue)
6236 blk_sync_queue(mddev->queue);
6237 }
6238
6239 static void __md_stop(struct mddev *mddev)
6240 {
6241 struct md_personality *pers = mddev->pers;
6242 md_bitmap_destroy(mddev);
6243 mddev_detach(mddev);
6244
6245 if (mddev->event_work.func)
6246 flush_workqueue(md_misc_wq);
6247 spin_lock(&mddev->lock);
6248 mddev->pers = NULL;
6249 spin_unlock(&mddev->lock);
6250 if (mddev->private)
6251 pers->free(mddev, mddev->private);
6252 mddev->private = NULL;
6253 if (pers->sync_request && mddev->to_remove == NULL)
6254 mddev->to_remove = &md_redundancy_group;
6255 module_put(pers->owner);
6256 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6257 }
6258
6259 void md_stop(struct mddev *mddev)
6260 {
6261
6262
6263
6264 __md_stop_writes(mddev);
6265 __md_stop(mddev);
6266 bioset_exit(&mddev->bio_set);
6267 bioset_exit(&mddev->sync_set);
6268 }
6269
6270 EXPORT_SYMBOL_GPL(md_stop);
6271
6272 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6273 {
6274 int err = 0;
6275 int did_freeze = 0;
6276
6277 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6278 did_freeze = 1;
6279 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6280 md_wakeup_thread(mddev->thread);
6281 }
6282 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6283 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6284 if (mddev->sync_thread)
6285
6286
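/* The sync thread might be blocked waiting for a metadata update
 * that will now never happen; wake it so it notices the interruption.
 */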
6287 wake_up_process(mddev->sync_thread->tsk);
6288
6289 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6290 return -EBUSY;
6291 mddev_unlock(mddev);
6292 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6293 &mddev->recovery));
6294 wait_event(mddev->sb_wait,
6295 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6296 mddev_lock_nointr(mddev);
6297
6298 mutex_lock(&mddev->open_mutex);
6299 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6300 mddev->sync_thread ||
6301 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6302 pr_warn("md: %s still in use.\n",mdname(mddev));
6303 if (did_freeze) {
6304 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6305 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6306 md_wakeup_thread(mddev->thread);
6307 }
6308 err = -EBUSY;
6309 goto out;
6310 }
6311 if (mddev->pers) {
6312 __md_stop_writes(mddev);
6313
6314 err = -ENXIO;
6315 if (mddev->ro==1)
6316 goto out;
6317 mddev->ro = 1;
6318 set_disk_ro(mddev->gendisk, 1);
6319 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6320 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6321 md_wakeup_thread(mddev->thread);
6322 sysfs_notify_dirent_safe(mddev->sysfs_state);
6323 err = 0;
6324 }
6325 out:
6326 mutex_unlock(&mddev->open_mutex);
6327 return err;
6328 }
6329
6330
6331
6332
6333
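/* mode:
 *   0 - completely stop and dis-assemble the array
 *   2 - stop but do not disassemble the array
 */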
6334 static int do_md_stop(struct mddev *mddev, int mode,
6335 struct block_device *bdev)
6336 {
6337 struct gendisk *disk = mddev->gendisk;
6338 struct md_rdev *rdev;
6339 int did_freeze = 0;
6340
6341 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6342 did_freeze = 1;
6343 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6344 md_wakeup_thread(mddev->thread);
6345 }
6346 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6347 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6348 if (mddev->sync_thread)
6349
6350
6351 wake_up_process(mddev->sync_thread->tsk);
6352
6353 mddev_unlock(mddev);
6354 wait_event(resync_wait, (mddev->sync_thread == NULL &&
6355 !test_bit(MD_RECOVERY_RUNNING,
6356 &mddev->recovery)));
6357 mddev_lock_nointr(mddev);
6358
6359 mutex_lock(&mddev->open_mutex);
6360 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6361 mddev->sysfs_active ||
6362 mddev->sync_thread ||
6363 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6364 pr_warn("md: %s still in use.\n",mdname(mddev));
6365 mutex_unlock(&mddev->open_mutex);
6366 if (did_freeze) {
6367 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6368 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6369 md_wakeup_thread(mddev->thread);
6370 }
6371 return -EBUSY;
6372 }
6373 if (mddev->pers) {
6374 if (mddev->ro)
6375 set_disk_ro(disk, 0);
6376
6377 __md_stop_writes(mddev);
6378 __md_stop(mddev);
6379
6380
6381 sysfs_notify_dirent_safe(mddev->sysfs_state);
6382
6383 rdev_for_each(rdev, mddev)
6384 if (rdev->raid_disk >= 0)
6385 sysfs_unlink_rdev(mddev, rdev);
6386
6387 set_capacity_and_notify(disk, 0);
6388 mutex_unlock(&mddev->open_mutex);
6389 mddev->changed = 1;
6390
6391 if (mddev->ro)
6392 mddev->ro = 0;
6393 } else
6394 mutex_unlock(&mddev->open_mutex);
6395
6396
6397
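/*
 * Free resources and forget the member devices if this is a final stop
 */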
6398 if (mode == 0) {
6399 pr_info("md: %s stopped.\n", mdname(mddev));
6400
6401 if (mddev->bitmap_info.file) {
6402 struct file *f = mddev->bitmap_info.file;
6403 spin_lock(&mddev->lock);
6404 mddev->bitmap_info.file = NULL;
6405 spin_unlock(&mddev->lock);
6406 fput(f);
6407 }
6408 mddev->bitmap_info.offset = 0;
6409
6410 export_array(mddev);
6411
6412 md_clean(mddev);
6413 if (mddev->hold_active == UNTIL_STOP)
6414 mddev->hold_active = 0;
6415 }
6416 md_new_event();
6417 sysfs_notify_dirent_safe(mddev->sysfs_state);
6418 return 0;
6419 }
6420
6421 #ifndef MODULE
6422 static void autorun_array(struct mddev *mddev)
6423 {
6424 struct md_rdev *rdev;
6425 int err;
6426
6427 if (list_empty(&mddev->disks))
6428 return;
6429
6430 pr_info("md: running: ");
6431
6432 rdev_for_each(rdev, mddev) {
6433 pr_cont("<%pg>", rdev->bdev);
6434 }
6435 pr_cont("\n");
6436
6437 err = do_md_run(mddev);
6438 if (err) {
6439 pr_warn("md: do_md_run() returned %d\n", err);
6440 do_md_stop(mddev, 0, NULL);
6441 }
6442 }
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
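/* Try to run arrays based on all the disks that have arrived so far
 * (pending_raid_disks): pick the first pending disk, collect every
 * disk whose superblock says it belongs to the same array, move them
 * onto a candidate list, allocate the matching md device and attempt
 * to bind and run them.  Anything left over is exported again.
 */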
6456 static void autorun_devices(int part)
6457 {
6458 struct md_rdev *rdev0, *rdev, *tmp;
6459 struct mddev *mddev;
6460
6461 pr_info("md: autorun ...\n");
6462 while (!list_empty(&pending_raid_disks)) {
6463 int unit;
6464 dev_t dev;
6465 LIST_HEAD(candidates);
6466 rdev0 = list_entry(pending_raid_disks.next,
6467 struct md_rdev, same_set);
6468
6469 pr_debug("md: considering %pg ...\n", rdev0->bdev);
6470 INIT_LIST_HEAD(&candidates);
6471 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6472 if (super_90_load(rdev, rdev0, 0) >= 0) {
6473 pr_debug("md: adding %pg ...\n",
6474 rdev->bdev);
6475 list_move(&rdev->same_set, &candidates);
6476 }
6477
6478
6479
6480
6481
6482 if (part) {
6483 dev = MKDEV(mdp_major,
6484 rdev0->preferred_minor << MdpMinorShift);
6485 unit = MINOR(dev) >> MdpMinorShift;
6486 } else {
6487 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6488 unit = MINOR(dev);
6489 }
6490 if (rdev0->preferred_minor != unit) {
6491 pr_warn("md: unit number in %pg is bad: %d\n",
6492 rdev0->bdev, rdev0->preferred_minor);
6493 break;
6494 }
6495
6496 mddev = md_alloc(dev, NULL);
6497 if (IS_ERR(mddev))
6498 break;
6499
6500 if (mddev_lock(mddev))
6501 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6502 else if (mddev->raid_disks || mddev->major_version
6503 || !list_empty(&mddev->disks)) {
6504 pr_warn("md: %s already running, cannot run %pg\n",
6505 mdname(mddev), rdev0->bdev);
6506 mddev_unlock(mddev);
6507 } else {
6508 pr_debug("md: created %s\n", mdname(mddev));
6509 mddev->persistent = 1;
6510 rdev_for_each_list(rdev, tmp, &candidates) {
6511 list_del_init(&rdev->same_set);
6512 if (bind_rdev_to_array(rdev, mddev))
6513 export_rdev(rdev);
6514 }
6515 autorun_array(mddev);
6516 mddev_unlock(mddev);
6517 }
6518
6519
6520
6521 rdev_for_each_list(rdev, tmp, &candidates) {
6522 list_del_init(&rdev->same_set);
6523 export_rdev(rdev);
6524 }
6525 mddev_put(mddev);
6526 }
6527 pr_info("md: ... autorun DONE.\n");
6528 }
6529 #endif
6530
6531 static int get_version(void __user *arg)
6532 {
6533 mdu_version_t ver;
6534
6535 ver.major = MD_MAJOR_VERSION;
6536 ver.minor = MD_MINOR_VERSION;
6537 ver.patchlevel = MD_PATCHLEVEL_VERSION;
6538
6539 if (copy_to_user(arg, &ver, sizeof(ver)))
6540 return -EFAULT;
6541
6542 return 0;
6543 }
6544
6545 static int get_array_info(struct mddev *mddev, void __user *arg)
6546 {
6547 mdu_array_info_t info;
6548 int nr,working,insync,failed,spare;
6549 struct md_rdev *rdev;
6550
6551 nr = working = insync = failed = spare = 0;
6552 rcu_read_lock();
6553 rdev_for_each_rcu(rdev, mddev) {
6554 nr++;
6555 if (test_bit(Faulty, &rdev->flags))
6556 failed++;
6557 else {
6558 working++;
6559 if (test_bit(In_sync, &rdev->flags))
6560 insync++;
6561 else if (test_bit(Journal, &rdev->flags))
6562
6563 ;
6564 else
6565 spare++;
6566 }
6567 }
6568 rcu_read_unlock();
6569
6570 info.major_version = mddev->major_version;
6571 info.minor_version = mddev->minor_version;
6572 info.patch_version = MD_PATCHLEVEL_VERSION;
6573 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6574 info.level = mddev->level;
6575 info.size = mddev->dev_sectors / 2;
6576 if (info.size != mddev->dev_sectors / 2)
6577 info.size = -1;
6578 info.nr_disks = nr;
6579 info.raid_disks = mddev->raid_disks;
6580 info.md_minor = mddev->md_minor;
6581 info.not_persistent= !mddev->persistent;
6582
6583 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6584 info.state = 0;
6585 if (mddev->in_sync)
6586 info.state = (1<<MD_SB_CLEAN);
6587 if (mddev->bitmap && mddev->bitmap_info.offset)
6588 info.state |= (1<<MD_SB_BITMAP_PRESENT);
6589 if (mddev_is_clustered(mddev))
6590 info.state |= (1<<MD_SB_CLUSTERED);
6591 info.active_disks = insync;
6592 info.working_disks = working;
6593 info.failed_disks = failed;
6594 info.spare_disks = spare;
6595
6596 info.layout = mddev->layout;
6597 info.chunk_size = mddev->chunk_sectors << 9;
6598
6599 if (copy_to_user(arg, &info, sizeof(info)))
6600 return -EFAULT;
6601
6602 return 0;
6603 }
6604
6605 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6606 {
6607 mdu_bitmap_file_t *file = NULL;
6608 char *ptr;
6609 int err;
6610
6611 file = kzalloc(sizeof(*file), GFP_NOIO);
6612 if (!file)
6613 return -ENOMEM;
6614
6615 err = 0;
6616 spin_lock(&mddev->lock);
6617
6618 if (mddev->bitmap_info.file) {
6619 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6620 sizeof(file->pathname));
6621 if (IS_ERR(ptr))
6622 err = PTR_ERR(ptr);
6623 else
6624 memmove(file->pathname, ptr,
6625 sizeof(file->pathname)-(ptr-file->pathname));
6626 }
6627 spin_unlock(&mddev->lock);
6628
6629 if (err == 0 &&
6630 copy_to_user(arg, file, sizeof(*file)))
6631 err = -EFAULT;
6632
6633 kfree(file);
6634 return err;
6635 }
6636
6637 static int get_disk_info(struct mddev *mddev, void __user * arg)
6638 {
6639 mdu_disk_info_t info;
6640 struct md_rdev *rdev;
6641
6642 if (copy_from_user(&info, arg, sizeof(info)))
6643 return -EFAULT;
6644
6645 rcu_read_lock();
6646 rdev = md_find_rdev_nr_rcu(mddev, info.number);
6647 if (rdev) {
6648 info.major = MAJOR(rdev->bdev->bd_dev);
6649 info.minor = MINOR(rdev->bdev->bd_dev);
6650 info.raid_disk = rdev->raid_disk;
6651 info.state = 0;
6652 if (test_bit(Faulty, &rdev->flags))
6653 info.state |= (1<<MD_DISK_FAULTY);
6654 else if (test_bit(In_sync, &rdev->flags)) {
6655 info.state |= (1<<MD_DISK_ACTIVE);
6656 info.state |= (1<<MD_DISK_SYNC);
6657 }
6658 if (test_bit(Journal, &rdev->flags))
6659 info.state |= (1<<MD_DISK_JOURNAL);
6660 if (test_bit(WriteMostly, &rdev->flags))
6661 info.state |= (1<<MD_DISK_WRITEMOSTLY);
6662 if (test_bit(FailFast, &rdev->flags))
6663 info.state |= (1<<MD_DISK_FAILFAST);
6664 } else {
6665 info.major = info.minor = 0;
6666 info.raid_disk = -1;
6667 info.state = (1<<MD_DISK_REMOVED);
6668 }
6669 rcu_read_unlock();
6670
6671 if (copy_to_user(arg, &info, sizeof(info)))
6672 return -EFAULT;
6673
6674 return 0;
6675 }
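/*
 * Hedged illustration, not part of the driver: GET_DISK_INFO is a lookup
 * by descriptor number, so userspace typically iterates "number" upwards
 * and skips slots that come back with MD_DISK_REMOVED set (MD_SB_DISKS
 * and the MD_DISK_* bits live in linux/raid/md_p.h):
 *
 *	mdu_disk_info_t d;
 *	int i;
 *
 *	for (i = 0; i < MD_SB_DISKS; i++) {
 *		d.number = i;
 *		if (ioctl(fd, GET_DISK_INFO, &d) < 0)
 *			break;
 *		if (d.state & (1 << MD_DISK_REMOVED))
 *			continue;
 *		printf("slot %d: dev %d:%d state %#x\n",
 *		       d.number, d.major, d.minor, d.state);
 *	}
 */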
6676
6677 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6678 {
6679 struct md_rdev *rdev;
6680 dev_t dev = MKDEV(info->major,info->minor);
6681
6682 if (mddev_is_clustered(mddev) &&
6683 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6684 pr_warn("%s: Cannot add to clustered mddev.\n",
6685 mdname(mddev));
6686 return -EINVAL;
6687 }
6688
6689 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6690 return -EOVERFLOW;
6691
6692 if (!mddev->raid_disks) {
6693 int err;
6694
6695 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6696 if (IS_ERR(rdev)) {
6697 pr_warn("md: md_import_device returned %ld\n",
6698 PTR_ERR(rdev));
6699 return PTR_ERR(rdev);
6700 }
6701 if (!list_empty(&mddev->disks)) {
6702 struct md_rdev *rdev0
6703 = list_entry(mddev->disks.next,
6704 struct md_rdev, same_set);
6705 err = super_types[mddev->major_version]
6706 .load_super(rdev, rdev0, mddev->minor_version);
6707 if (err < 0) {
6708 pr_warn("md: %pg has different UUID to %pg\n",
6709 rdev->bdev,
6710 rdev0->bdev);
6711 export_rdev(rdev);
6712 return -EINVAL;
6713 }
6714 }
6715 err = bind_rdev_to_array(rdev, mddev);
6716 if (err)
6717 export_rdev(rdev);
6718 return err;
6719 }
6720
6721 /*
6722  * md_add_new_disk can also be used once the array is running, to add
6723  * hot spares or re-add members; a persistent array validates the
6724  * device's existing superblock before binding it.
6725  */
6726 if (mddev->pers) {
6727 int err;
6728 if (!mddev->pers->hot_add_disk) {
6729 pr_warn("%s: personality does not support diskops!\n",
6730 mdname(mddev));
6731 return -EINVAL;
6732 }
6733 if (mddev->persistent)
6734 rdev = md_import_device(dev, mddev->major_version,
6735 mddev->minor_version);
6736 else
6737 rdev = md_import_device(dev, -1, -1);
6738 if (IS_ERR(rdev)) {
6739 pr_warn("md: md_import_device returned %ld\n",
6740 PTR_ERR(rdev));
6741 return PTR_ERR(rdev);
6742 }
6743
6744 if (!mddev->persistent) {
6745 if (info->state & (1<<MD_DISK_SYNC) &&
6746 info->raid_disk < mddev->raid_disks) {
6747 rdev->raid_disk = info->raid_disk;
6748 set_bit(In_sync, &rdev->flags);
6749 clear_bit(Bitmap_sync, &rdev->flags);
6750 } else
6751 rdev->raid_disk = -1;
6752 rdev->saved_raid_disk = rdev->raid_disk;
6753 } else
6754 super_types[mddev->major_version].
6755 validate_super(mddev, rdev);
6756 if ((info->state & (1<<MD_DISK_SYNC)) &&
6757 rdev->raid_disk != info->raid_disk) {
6758 /*
6759  * the device cannot take the in-sync slot userspace asked for
6760  */
6761 export_rdev(rdev);
6762 return -EINVAL;
6763 }
6764
6765 clear_bit(In_sync, &rdev->flags);
6766 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6767 set_bit(WriteMostly, &rdev->flags);
6768 else
6769 clear_bit(WriteMostly, &rdev->flags);
6770 if (info->state & (1<<MD_DISK_FAILFAST))
6771 set_bit(FailFast, &rdev->flags);
6772 else
6773 clear_bit(FailFast, &rdev->flags);
6774
6775 if (info->state & (1<<MD_DISK_JOURNAL)) {
6776 struct md_rdev *rdev2;
6777 bool has_journal = false;
6778
6779
6780 rdev_for_each(rdev2, mddev) {
6781 if (test_bit(Journal, &rdev2->flags)) {
6782 has_journal = true;
6783 break;
6784 }
6785 }
6786 if (has_journal || mddev->bitmap) {
6787 export_rdev(rdev);
6788 return -EBUSY;
6789 }
6790 set_bit(Journal, &rdev->flags);
6791 }
6792
6793
6794
6795 if (mddev_is_clustered(mddev)) {
6796 if (info->state & (1 << MD_DISK_CANDIDATE))
6797 set_bit(Candidate, &rdev->flags);
6798 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6799
6800 err = md_cluster_ops->add_new_disk(mddev, rdev);
6801 if (err) {
6802 export_rdev(rdev);
6803 return err;
6804 }
6805 }
6806 }
6807
6808 rdev->raid_disk = -1;
6809 err = bind_rdev_to_array(rdev, mddev);
6810
6811 if (err)
6812 export_rdev(rdev);
6813
6814 if (mddev_is_clustered(mddev)) {
6815 if (info->state & (1 << MD_DISK_CANDIDATE)) {
6816 if (!err) {
6817 err = md_cluster_ops->new_disk_ack(mddev,
6818 err == 0);
6819 if (err)
6820 md_kick_rdev_from_array(rdev);
6821 }
6822 } else {
6823 if (err)
6824 md_cluster_ops->add_new_disk_cancel(mddev);
6825 else
6826 err = add_bound_rdev(rdev);
6827 }
6828
6829 } else if (!err)
6830 err = add_bound_rdev(rdev);
6831
6832 return err;
6833 }
6834
6835 /*
6836  * otherwise we are building a new array by hand: only 0.90 metadata
6837  */
6838 if (mddev->major_version != 0) {
6839 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6840 return -EINVAL;
6841 }
6842
6843 if (!(info->state & (1<<MD_DISK_FAULTY))) {
6844 int err;
6845 rdev = md_import_device(dev, -1, 0);
6846 if (IS_ERR(rdev)) {
6847 pr_warn("md: error, md_import_device() returned %ld\n",
6848 PTR_ERR(rdev));
6849 return PTR_ERR(rdev);
6850 }
6851 rdev->desc_nr = info->number;
6852 if (info->raid_disk < mddev->raid_disks)
6853 rdev->raid_disk = info->raid_disk;
6854 else
6855 rdev->raid_disk = -1;
6856
6857 if (rdev->raid_disk < mddev->raid_disks)
6858 if (info->state & (1<<MD_DISK_SYNC))
6859 set_bit(In_sync, &rdev->flags);
6860
6861 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6862 set_bit(WriteMostly, &rdev->flags);
6863 if (info->state & (1<<MD_DISK_FAILFAST))
6864 set_bit(FailFast, &rdev->flags);
6865
6866 if (!mddev->persistent) {
6867 pr_debug("md: nonpersistent superblock ...\n");
6868 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
6869 } else
6870 rdev->sb_start = calc_dev_sboffset(rdev);
6871 rdev->sectors = rdev->sb_start;
6872
6873 err = bind_rdev_to_array(rdev, mddev);
6874 if (err) {
6875 export_rdev(rdev);
6876 return err;
6877 }
6878 }
6879
6880 return 0;
6881 }
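/*
 * Hedged illustration, not part of the driver: all three branches above
 * are driven by the ADD_NEW_DISK ioctl.  To offer an existing block
 * device to the array, userspace fills an mdu_disk_info_t with its
 * device numbers; raid_disk = -1 lets md pick a slot:
 *
 *	struct stat st;
 *	mdu_disk_info_t d = { 0 };
 *
 *	if (stat("/dev/sdb1", &st) < 0)
 *		return 1;
 *	d.major = major(st.st_rdev);
 *	d.minor = minor(st.st_rdev);
 *	d.raid_disk = -1;
 *	if (ioctl(fd, ADD_NEW_DISK, &d) < 0)
 *		perror("ADD_NEW_DISK");
 *
 * Whether the device is treated as a member being assembled, hot-added
 * to a running array, or bound into a new 0.90 array under construction
 * depends on mddev->raid_disks and mddev->pers, exactly as above.
 */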
6882
6883 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6884 {
6885 struct md_rdev *rdev;
6886
6887 if (!mddev->pers)
6888 return -ENODEV;
6889
6890 rdev = find_rdev(mddev, dev);
6891 if (!rdev)
6892 return -ENXIO;
6893
6894 if (rdev->raid_disk < 0)
6895 goto kick_rdev;
6896
6897 clear_bit(Blocked, &rdev->flags);
6898 remove_and_add_spares(mddev, rdev);
6899
6900 if (rdev->raid_disk >= 0)
6901 goto busy;
6902
6903 kick_rdev:
6904 if (mddev_is_clustered(mddev)) {
6905 if (md_cluster_ops->remove_disk(mddev, rdev))
6906 goto busy;
6907 }
6908
6909 md_kick_rdev_from_array(rdev);
6910 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6911 if (mddev->thread)
6912 md_wakeup_thread(mddev->thread);
6913 else
6914 md_update_sb(mddev, 1);
6915 md_new_event();
6916
6917 return 0;
6918 busy:
6919 pr_debug("md: cannot remove active disk %pg from %s ...\n",
6920 rdev->bdev, mdname(mddev));
6921 return -EBUSY;
6922 }
6923
6924 static int hot_add_disk(struct mddev *mddev, dev_t dev)
6925 {
6926 int err;
6927 struct md_rdev *rdev;
6928
6929 if (!mddev->pers)
6930 return -ENODEV;
6931
6932 if (mddev->major_version != 0) {
6933 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6934 mdname(mddev));
6935 return -EINVAL;
6936 }
6937 if (!mddev->pers->hot_add_disk) {
6938 pr_warn("%s: personality does not support diskops!\n",
6939 mdname(mddev));
6940 return -EINVAL;
6941 }
6942
6943 rdev = md_import_device(dev, -1, 0);
6944 if (IS_ERR(rdev)) {
6945 pr_warn("md: error, md_import_device() returned %ld\n",
6946 PTR_ERR(rdev));
6947 return -EINVAL;
6948 }
6949
6950 if (mddev->persistent)
6951 rdev->sb_start = calc_dev_sboffset(rdev);
6952 else
6953 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
6954
6955 rdev->sectors = rdev->sb_start;
6956
6957 if (test_bit(Faulty, &rdev->flags)) {
6958 pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
6959 rdev->bdev, mdname(mddev));
6960 err = -EINVAL;
6961 goto abort_export;
6962 }
6963
6964 clear_bit(In_sync, &rdev->flags);
6965 rdev->desc_nr = -1;
6966 rdev->saved_raid_disk = -1;
6967 err = bind_rdev_to_array(rdev, mddev);
6968 if (err)
6969 goto abort_export;
6970
6971 /*
6972  * the device joins as a spare (raid_disk == -1); the recovery
6973  * thread will move it into a slot and start rebuilding onto it
6974  */
6975
6976 rdev->raid_disk = -1;
6977
6978 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6979 if (!mddev->thread)
6980 md_update_sb(mddev, 1);
6981
6982 /*
6983  * a member without nowait support disables nowait for the whole array
6984  */
6985 if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
6986 pr_info("%s: Disabling nowait because %pg does not support nowait\n",
6987 mdname(mddev), rdev->bdev);
6988 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
6989 }
6990
6991 /*
6992  * kick the recovery thread: the new spare may be needed right away
6993  */
6994 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6995 md_wakeup_thread(mddev->thread);
6996 md_new_event();
6997 return 0;
6998
6999 abort_export:
7000 export_rdev(rdev);
7001 return err;
7002 }
7003
7004 static int set_bitmap_file(struct mddev *mddev, int fd)
7005 {
7006 int err = 0;
7007
7008 if (mddev->pers) {
7009 if (!mddev->pers->quiesce || !mddev->thread)
7010 return -EBUSY;
7011 if (mddev->recovery || mddev->sync_thread)
7012 return -EBUSY;
7013
7014 }
7015
7016 if (fd >= 0) {
7017 struct inode *inode;
7018 struct file *f;
7019
7020 if (mddev->bitmap || mddev->bitmap_info.file)
7021 return -EEXIST;
7022 f = fget(fd);
7023
7024 if (f == NULL) {
7025 pr_warn("%s: error: failed to get bitmap file\n",
7026 mdname(mddev));
7027 return -EBADF;
7028 }
7029
7030 inode = f->f_mapping->host;
7031 if (!S_ISREG(inode->i_mode)) {
7032 pr_warn("%s: error: bitmap file must be a regular file\n",
7033 mdname(mddev));
7034 err = -EBADF;
7035 } else if (!(f->f_mode & FMODE_WRITE)) {
7036 pr_warn("%s: error: bitmap file must open for write\n",
7037 mdname(mddev));
7038 err = -EBADF;
7039 } else if (atomic_read(&inode->i_writecount) != 1) {
7040 pr_warn("%s: error: bitmap file is already in use\n",
7041 mdname(mddev));
7042 err = -EBUSY;
7043 }
7044 if (err) {
7045 fput(f);
7046 return err;
7047 }
7048 mddev->bitmap_info.file = f;
7049 mddev->bitmap_info.offset = 0;
7050 } else if (mddev->bitmap == NULL)
7051 return -ENOENT;
7052 err = 0;
7053 if (mddev->pers) {
7054 if (fd >= 0) {
7055 struct bitmap *bitmap;
7056
7057 bitmap = md_bitmap_create(mddev, -1);
7058 mddev_suspend(mddev);
7059 if (!IS_ERR(bitmap)) {
7060 mddev->bitmap = bitmap;
7061 err = md_bitmap_load(mddev);
7062 } else
7063 err = PTR_ERR(bitmap);
7064 if (err) {
7065 md_bitmap_destroy(mddev);
7066 fd = -1;
7067 }
7068 mddev_resume(mddev);
7069 } else if (fd < 0) {
7070 mddev_suspend(mddev);
7071 md_bitmap_destroy(mddev);
7072 mddev_resume(mddev);
7073 }
7074 }
7075 if (fd < 0) {
7076 struct file *f = mddev->bitmap_info.file;
7077 if (f) {
7078 spin_lock(&mddev->lock);
7079 mddev->bitmap_info.file = NULL;
7080 spin_unlock(&mddev->lock);
7081 fput(f);
7082 }
7083 }
7084
7085 return err;
7086 }
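/*
 * Hedged illustration, not part of the driver: SET_BITMAP_FILE takes an
 * already-open file descriptor (or -1 to drop a file-backed bitmap), not
 * a pathname.  The path below is only an example:
 *
 *	int bfd = open("/var/lib/md/md0-bitmap", O_RDWR);
 *
 *	if (bfd < 0 || ioctl(fd, SET_BITMAP_FILE, bfd) < 0)
 *		perror("SET_BITMAP_FILE");
 *	close(bfd);
 *
 * The file must be a regular file, open for write, and not otherwise in
 * use, as the checks above enforce; the kernel keeps its own reference
 * so the caller may close bfd immediately.
 */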
7087
7088
7089 /*
7090  * md_set_array_info() serves two purposes:
7091  *  - assembling an existing array: info->raid_disks is 0 and only the
7092  *    superblock version fields matter, telling md which metadata
7093  *    format to look for on the member devices;
7094  *  - creating a new array: info->raid_disks is non-zero and level,
7095  *    size, layout, chunk_size and not_persistent describe the shape
7096  *    of the array, which will carry a version-0.90 superblock.
7097  */
7098
7099
7100
7101 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7102 {
7103 if (info->raid_disks == 0) {
7104 /* assembling: just record which superblock format to scan for */
7105 if (info->major_version < 0 ||
7106 info->major_version >= ARRAY_SIZE(super_types) ||
7107 super_types[info->major_version].name == NULL) {
7108
7109 pr_warn("md: superblock version %d not known\n",
7110 info->major_version);
7111 return -EINVAL;
7112 }
7113 mddev->major_version = info->major_version;
7114 mddev->minor_version = info->minor_version;
7115 mddev->patch_version = info->patch_version;
7116 mddev->persistent = !info->not_persistent;
7117
7118
7119
7120 mddev->ctime = ktime_get_real_seconds();
7121 return 0;
7122 }
7123 mddev->major_version = MD_MAJOR_VERSION;
7124 mddev->minor_version = MD_MINOR_VERSION;
7125 mddev->patch_version = MD_PATCHLEVEL_VERSION;
7126 mddev->ctime = ktime_get_real_seconds();
7127
7128 mddev->level = info->level;
7129 mddev->clevel[0] = 0;
7130 mddev->dev_sectors = 2 * (sector_t)info->size;
7131 mddev->raid_disks = info->raid_disks;
7132
7133
7134
7135 if (info->state & (1<<MD_SB_CLEAN))
7136 mddev->recovery_cp = MaxSector;
7137 else
7138 mddev->recovery_cp = 0;
7139 mddev->persistent = !info->not_persistent;
7140 mddev->external = 0;
7141
7142 mddev->layout = info->layout;
7143 if (mddev->level == 0)
7144 /* raid0 layout cannot be trusted at this point; leave it unspecified */
7145 mddev->layout = -1;
7146 mddev->chunk_sectors = info->chunk_size >> 9;
7147
7148 if (mddev->persistent) {
7149 mddev->max_disks = MD_SB_DISKS;
7150 mddev->flags = 0;
7151 mddev->sb_flags = 0;
7152 }
7153 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7154
7155 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7156 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7157 mddev->bitmap_info.offset = 0;
7158
7159 mddev->reshape_position = MaxSector;
7160
7161 /*
7162  * generate a fresh 128-bit array UUID
7163  */
7164 get_random_bytes(mddev->uuid, 16);
7165
7166 mddev->new_level = mddev->level;
7167 mddev->new_chunk_sectors = mddev->chunk_sectors;
7168 mddev->new_layout = mddev->layout;
7169 mddev->delta_disks = 0;
7170 mddev->reshape_backwards = 0;
7171
7172 return 0;
7173 }
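/*
 * Hedged illustration, not part of the driver: SET_ARRAY_INFO lands in
 * md_set_array_info() while the array has no personality yet.  For
 * assembly only the superblock version matters and raid_disks stays 0
 * (major/minor 1.2 is just an example):
 *
 *	mdu_array_info_t info = { 0 };
 *
 *	info.major_version = 1;
 *	info.minor_version = 2;
 *	if (ioctl(fd, SET_ARRAY_INFO, &info) < 0)
 *		perror("SET_ARRAY_INFO");
 *
 * A non-zero raid_disks instead describes a brand-new array (level,
 * size in KiB, layout, chunk_size in bytes) that will be given a
 * version-0.90 superblock.
 */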
7174
7175 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7176 {
7177 lockdep_assert_held(&mddev->reconfig_mutex);
7178
7179 if (mddev->external_size)
7180 return;
7181
7182 mddev->array_sectors = array_sectors;
7183 }
7184 EXPORT_SYMBOL(md_set_array_sectors);
7185
7186 static int update_size(struct mddev *mddev, sector_t num_sectors)
7187 {
7188 struct md_rdev *rdev;
7189 int rv;
7190 int fit = (num_sectors == 0);
7191 sector_t old_dev_sectors = mddev->dev_sectors;
7192
7193 if (mddev->pers->resize == NULL)
7194 return -EINVAL;
7195
7196
7197 /*
7198  * num_sectors is the per-device size to use; zero means "use as much
7199  * as fits".  Resizing is only attempted while no resync/reshape is
7200  * running, the array is writable, and every member has at least
7201  * num_sectors available.
7202  */
7203
7204 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7205 mddev->sync_thread)
7206 return -EBUSY;
7207 if (mddev->ro)
7208 return -EROFS;
7209
7210 rdev_for_each(rdev, mddev) {
7211 sector_t avail = rdev->sectors;
7212
7213 if (fit && (num_sectors == 0 || num_sectors > avail))
7214 num_sectors = avail;
7215 if (avail < num_sectors)
7216 return -ENOSPC;
7217 }
7218 rv = mddev->pers->resize(mddev, num_sectors);
7219 if (!rv) {
7220 if (mddev_is_clustered(mddev))
7221 md_cluster_ops->update_size(mddev, old_dev_sectors);
7222 else if (mddev->queue) {
7223 set_capacity_and_notify(mddev->gendisk,
7224 mddev->array_sectors);
7225 }
7226 }
7227 return rv;
7228 }
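/*
 * Hedged note on the units above: num_sectors is the per-device data
 * size in 512-byte sectors, derived by update_array_info() from the
 * KiB-based info.size field, e.g.
 *
 *	info.size = 0        ->  num_sectors = 0, fit = 1 (use all space)
 *	info.size = 1048576  ->  num_sectors = 2097152 (1 GiB per device)
 *
 * With fit set, the loop above lowers num_sectors to the smallest member
 * before the personality's resize() method is asked to apply it.
 */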
7229
7230 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7231 {
7232 int rv;
7233 struct md_rdev *rdev;
7234
7235 if (mddev->pers->check_reshape == NULL)
7236 return -EINVAL;
7237 if (mddev->ro)
7238 return -EROFS;
7239 if (raid_disks <= 0 ||
7240 (mddev->max_disks && raid_disks >= mddev->max_disks))
7241 return -EINVAL;
7242 if (mddev->sync_thread ||
7243 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7244 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7245 mddev->reshape_position != MaxSector)
7246 return -EBUSY;
7247
7248 rdev_for_each(rdev, mddev) {
7249 if (mddev->raid_disks < raid_disks &&
7250 rdev->data_offset < rdev->new_data_offset)
7251 return -EINVAL;
7252 if (mddev->raid_disks > raid_disks &&
7253 rdev->data_offset > rdev->new_data_offset)
7254 return -EINVAL;
7255 }
7256
7257 mddev->delta_disks = raid_disks - mddev->raid_disks;
7258 if (mddev->delta_disks < 0)
7259 mddev->reshape_backwards = 1;
7260 else if (mddev->delta_disks > 0)
7261 mddev->reshape_backwards = 0;
7262
7263 rv = mddev->pers->check_reshape(mddev);
7264 if (rv < 0) {
7265 mddev->delta_disks = 0;
7266 mddev->reshape_backwards = 0;
7267 }
7268 return rv;
7269 }
7270
7271
7272 /*
7273  * update_array_info() changes the configuration of a running array.
7274  * Version, ctime, level, persistence and chunk size must match the
7275  * current values; exactly one of size, raid_disks, layout or the
7276  * bitmap-present flag may change per call.
7277  */
7278
7279 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7280 {
7281 int rv = 0;
7282 int cnt = 0;
7283 int state = 0;
7284
7285
7286 if (mddev->bitmap && mddev->bitmap_info.offset)
7287 state |= (1 << MD_SB_BITMAP_PRESENT);
7288
7289 if (mddev->major_version != info->major_version ||
7290 mddev->minor_version != info->minor_version ||
7291
7292 mddev->ctime != info->ctime ||
7293 mddev->level != info->level ||
7294
7295 mddev->persistent != !info->not_persistent ||
7296 mddev->chunk_sectors != info->chunk_size >> 9 ||
7297 /* ignore the low state bits, which include the bitmap-present flag */
7298 ((state^info->state) & 0xfffffe00)
7299 )
7300 return -EINVAL;
7301
7302 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7303 cnt++;
7304 if (mddev->raid_disks != info->raid_disks)
7305 cnt++;
7306 if (mddev->layout != info->layout)
7307 cnt++;
7308 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7309 cnt++;
7310 if (cnt == 0)
7311 return 0;
7312 if (cnt > 1)
7313 return -EINVAL;
7314
7315 if (mddev->layout != info->layout) {
7316 /*
7317  * layout changes are handled entirely by the personality's
7318  * check_reshape method
7319  */
7320 if (mddev->pers->check_reshape == NULL)
7321 return -EINVAL;
7322 else {
7323 mddev->new_layout = info->layout;
7324 rv = mddev->pers->check_reshape(mddev);
7325 if (rv)
7326 mddev->new_layout = mddev->layout;
7327 return rv;
7328 }
7329 }
7330 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7331 rv = update_size(mddev, (sector_t)info->size * 2);
7332
7333 if (mddev->raid_disks != info->raid_disks)
7334 rv = update_raid_disks(mddev, info->raid_disks);
7335
7336 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7337 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7338 rv = -EINVAL;
7339 goto err;
7340 }
7341 if (mddev->recovery || mddev->sync_thread) {
7342 rv = -EBUSY;
7343 goto err;
7344 }
7345 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7346 struct bitmap *bitmap;
7347
7348 if (mddev->bitmap) {
7349 rv = -EEXIST;
7350 goto err;
7351 }
7352 if (mddev->bitmap_info.default_offset == 0) {
7353 rv = -EINVAL;
7354 goto err;
7355 }
7356 mddev->bitmap_info.offset =
7357 mddev->bitmap_info.default_offset;
7358 mddev->bitmap_info.space =
7359 mddev->bitmap_info.default_space;
7360 bitmap = md_bitmap_create(mddev, -1);
7361 mddev_suspend(mddev);
7362 if (!IS_ERR(bitmap)) {
7363 mddev->bitmap = bitmap;
7364 rv = md_bitmap_load(mddev);
7365 } else
7366 rv = PTR_ERR(bitmap);
7367 if (rv)
7368 md_bitmap_destroy(mddev);
7369 mddev_resume(mddev);
7370 } else {
7371 /* removing the bitmap */
7372 if (!mddev->bitmap) {
7373 rv = -ENOENT;
7374 goto err;
7375 }
7376 if (mddev->bitmap->storage.file) {
7377 rv = -EINVAL;
7378 goto err;
7379 }
7380 if (mddev->bitmap_info.nodes) {
7381
7382 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7383 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7384 rv = -EPERM;
7385 md_cluster_ops->unlock_all_bitmaps(mddev);
7386 goto err;
7387 }
7388
7389 mddev->bitmap_info.nodes = 0;
7390 md_cluster_ops->leave(mddev);
7391 module_put(md_cluster_mod);
7392 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7393 }
7394 mddev_suspend(mddev);
7395 md_bitmap_destroy(mddev);
7396 mddev_resume(mddev);
7397 mddev->bitmap_info.offset = 0;
7398 }
7399 }
7400 md_update_sb(mddev, 1);
7401 return rv;
7402 err:
7403 return rv;
7404 }
7405
7406 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7407 {
7408 struct md_rdev *rdev;
7409 int err = 0;
7410
7411 if (mddev->pers == NULL)
7412 return -ENODEV;
7413
7414 rcu_read_lock();
7415 rdev = md_find_rdev_rcu(mddev, dev);
7416 if (!rdev)
7417 err = -ENODEV;
7418 else {
7419 md_error(mddev, rdev);
7420 if (test_bit(MD_BROKEN, &mddev->flags))
7421 err = -EBUSY;
7422 }
7423 rcu_read_unlock();
7424 return err;
7425 }
7426
7427
7428 /*
7429  * An md array has no natural CHS geometry, so pretend to have 2 heads
7430  * and 4 sectors per track and let the cylinder count absorb the size.
7431  */
7432
7433 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7434 {
7435 struct mddev *mddev = bdev->bd_disk->private_data;
7436
7437 geo->heads = 2;
7438 geo->sectors = 4;
7439 geo->cylinders = mddev->array_sectors / 8;
7440 return 0;
7441 }
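/*
 * Hedged worked example: with the fake 2-head, 4-sector geometry each
 * "cylinder" covers 8 sectors, so an array of 16384 sectors (8 MiB)
 * reports 2048 cylinders.  The numbers only exist to keep legacy
 * HDIO_GETGEO users and partitioning tools happy; they carry no
 * information beyond the array size.
 */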
7442
7443 static inline bool md_ioctl_valid(unsigned int cmd)
7444 {
7445 switch (cmd) {
7446 case ADD_NEW_DISK:
7447 case GET_ARRAY_INFO:
7448 case GET_BITMAP_FILE:
7449 case GET_DISK_INFO:
7450 case HOT_ADD_DISK:
7451 case HOT_REMOVE_DISK:
7452 case RAID_VERSION:
7453 case RESTART_ARRAY_RW:
7454 case RUN_ARRAY:
7455 case SET_ARRAY_INFO:
7456 case SET_BITMAP_FILE:
7457 case SET_DISK_FAULTY:
7458 case STOP_ARRAY:
7459 case STOP_ARRAY_RO:
7460 case CLUSTERED_DISK_NACK:
7461 return true;
7462 default:
7463 return false;
7464 }
7465 }
7466
7467 static int md_ioctl(struct block_device *bdev, fmode_t mode,
7468 unsigned int cmd, unsigned long arg)
7469 {
7470 int err = 0;
7471 void __user *argp = (void __user *)arg;
7472 struct mddev *mddev = NULL;
7473 bool did_set_md_closing = false;
7474
7475 if (!md_ioctl_valid(cmd))
7476 return -ENOTTY;
7477
7478 switch (cmd) {
7479 case RAID_VERSION:
7480 case GET_ARRAY_INFO:
7481 case GET_DISK_INFO:
7482 break;
7483 default:
7484 if (!capable(CAP_SYS_ADMIN))
7485 return -EACCES;
7486 }
7487
7488
7489 /*
7490  * Commands dealing with the RAID driver itself, not any one array:
7491  */
7492 switch (cmd) {
7493 case RAID_VERSION:
7494 err = get_version(argp);
7495 goto out;
7496 default:;
7497 }
7498
7499
7500 /*
7501  * Commands operating on a particular array need the mddev:
7502  */
7503 mddev = bdev->bd_disk->private_data;
7504
7505 if (!mddev) {
7506 BUG();
7507 goto out;
7508 }
7509
7510 /* Some actions do not need the reconfig mutex: */
7511 switch (cmd) {
7512 case GET_ARRAY_INFO:
7513 if (!mddev->raid_disks && !mddev->external)
7514 err = -ENODEV;
7515 else
7516 err = get_array_info(mddev, argp);
7517 goto out;
7518
7519 case GET_DISK_INFO:
7520 if (!mddev->raid_disks && !mddev->external)
7521 err = -ENODEV;
7522 else
7523 err = get_disk_info(mddev, argp);
7524 goto out;
7525
7526 case SET_DISK_FAULTY:
7527 err = set_disk_faulty(mddev, new_decode_dev(arg));
7528 goto out;
7529
7530 case GET_BITMAP_FILE:
7531 err = get_bitmap_file(mddev, argp);
7532 goto out;
7533
7534 }
7535
7536 if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
7537 flush_rdev_wq(mddev);
7538
7539 if (cmd == HOT_REMOVE_DISK)
7540 /* give the recovery thread a chance to run first */
7541 wait_event_interruptible_timeout(mddev->sb_wait,
7542 !test_bit(MD_RECOVERY_NEEDED,
7543 &mddev->recovery),
7544 msecs_to_jiffies(5000));
7545 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7546 /*
7547  * Stopping: make sure no-one else has the array open, then flush it.
7548  */
7549 mutex_lock(&mddev->open_mutex);
7550 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7551 mutex_unlock(&mddev->open_mutex);
7552 err = -EBUSY;
7553 goto out;
7554 }
7555 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
7556 mutex_unlock(&mddev->open_mutex);
7557 err = -EBUSY;
7558 goto out;
7559 }
7560 did_set_md_closing = true;
7561 mutex_unlock(&mddev->open_mutex);
7562 sync_blockdev(bdev);
7563 }
7564 err = mddev_lock(mddev);
7565 if (err) {
7566 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7567 err, cmd);
7568 goto out;
7569 }
7570
7571 if (cmd == SET_ARRAY_INFO) {
7572 mdu_array_info_t info;
7573 if (!arg)
7574 memset(&info, 0, sizeof(info));
7575 else if (copy_from_user(&info, argp, sizeof(info))) {
7576 err = -EFAULT;
7577 goto unlock;
7578 }
7579 if (mddev->pers) {
7580 err = update_array_info(mddev, &info);
7581 if (err) {
7582 pr_warn("md: couldn't update array info. %d\n", err);
7583 goto unlock;
7584 }
7585 goto unlock;
7586 }
7587 if (!list_empty(&mddev->disks)) {
7588 pr_warn("md: array %s already has disks!\n", mdname(mddev));
7589 err = -EBUSY;
7590 goto unlock;
7591 }
7592 if (mddev->raid_disks) {
7593 pr_warn("md: array %s already initialised!\n", mdname(mddev));
7594 err = -EBUSY;
7595 goto unlock;
7596 }
7597 err = md_set_array_info(mddev, &info);
7598 if (err) {
7599 pr_warn("md: couldn't set array info. %d\n", err);
7600 goto unlock;
7601 }
7602 goto unlock;
7603 }
7604
7605
7606 /*
7607  * Until the array is initialised, only ADD_NEW_DISK, STOP_ARRAY,
7608  * RUN_ARRAY and the bitmap-file ioctls are meaningful.
7609  */
7610 if ((!mddev->raid_disks && !mddev->external)
7611 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7612 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7613 && cmd != GET_BITMAP_FILE) {
7614 err = -ENODEV;
7615 goto unlock;
7616 }
7617
7618 /*
7619  * Commands even a read-only array can execute:
7620  */
7621 switch (cmd) {
7622 case RESTART_ARRAY_RW:
7623 err = restart_array(mddev);
7624 goto unlock;
7625
7626 case STOP_ARRAY:
7627 err = do_md_stop(mddev, 0, bdev);
7628 goto unlock;
7629
7630 case STOP_ARRAY_RO:
7631 err = md_set_readonly(mddev, bdev);
7632 goto unlock;
7633
7634 case HOT_REMOVE_DISK:
7635 err = hot_remove_disk(mddev, new_decode_dev(arg));
7636 goto unlock;
7637
7638 case ADD_NEW_DISK:
7639 /*
7640  * only re-adding an in-sync device works without write access;
7641  * anything else falls through to the read-write path below
7642  */
7643 if (mddev->pers) {
7644 mdu_disk_info_t info;
7645 if (copy_from_user(&info, argp, sizeof(info)))
7646 err = -EFAULT;
7647 else if (!(info.state & (1<<MD_DISK_SYNC)))
7648 /* not a simple re-add: needs write access, handled further down */
7649 break;
7650 else
7651 err = md_add_new_disk(mddev, &info);
7652 goto unlock;
7653 }
7654 break;
7655 }
7656
7657
7658 /*
7659  * The remaining ioctls need write access; ro == 2 is cleared first.
7660  */
7661 if (mddev->ro && mddev->pers) {
7662 if (mddev->ro == 2) {
7663 mddev->ro = 0;
7664 sysfs_notify_dirent_safe(mddev->sysfs_state);
7665 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7666 /*
7667  * if a superblock update is pending, drop the lock and let
7668  * the md thread write it out before continuing
7669  */
7670 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7671 mddev_unlock(mddev);
7672 wait_event(mddev->sb_wait,
7673 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7674 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7675 mddev_lock_nointr(mddev);
7676 }
7677 } else {
7678 err = -EROFS;
7679 goto unlock;
7680 }
7681 }
7682
7683 switch (cmd) {
7684 case ADD_NEW_DISK:
7685 {
7686 mdu_disk_info_t info;
7687 if (copy_from_user(&info, argp, sizeof(info)))
7688 err = -EFAULT;
7689 else
7690 err = md_add_new_disk(mddev, &info);
7691 goto unlock;
7692 }
7693
7694 case CLUSTERED_DISK_NACK:
7695 if (mddev_is_clustered(mddev))
7696 md_cluster_ops->new_disk_ack(mddev, false);
7697 else
7698 err = -EINVAL;
7699 goto unlock;
7700
7701 case HOT_ADD_DISK:
7702 err = hot_add_disk(mddev, new_decode_dev(arg));
7703 goto unlock;
7704
7705 case RUN_ARRAY:
7706 err = do_md_run(mddev);
7707 goto unlock;
7708
7709 case SET_BITMAP_FILE:
7710 err = set_bitmap_file(mddev, (int)arg);
7711 goto unlock;
7712
7713 default:
7714 err = -EINVAL;
7715 goto unlock;
7716 }
7717
7718 unlock:
7719 if (mddev->hold_active == UNTIL_IOCTL &&
7720 err != -EINVAL)
7721 mddev->hold_active = 0;
7722 mddev_unlock(mddev);
7723 out:
7724 if (did_set_md_closing)
7725 clear_bit(MD_CLOSING, &mddev->flags);
7726 return err;
7727 }
7728 #ifdef CONFIG_COMPAT
7729 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7730 unsigned int cmd, unsigned long arg)
7731 {
7732 switch (cmd) {
7733 case HOT_REMOVE_DISK:
7734 case HOT_ADD_DISK:
7735 case SET_DISK_FAULTY:
7736 case SET_BITMAP_FILE:
7737 /* these take an integer argument, not a pointer: no conversion */
7738 break;
7739 default:
7740 arg = (unsigned long)compat_ptr(arg);
7741 break;
7742 }
7743
7744 return md_ioctl(bdev, mode, cmd, arg);
7745 }
7746 #endif
7747
7748 static int md_set_read_only(struct block_device *bdev, bool ro)
7749 {
7750 struct mddev *mddev = bdev->bd_disk->private_data;
7751 int err;
7752
7753 err = mddev_lock(mddev);
7754 if (err)
7755 return err;
7756
7757 if (!mddev->raid_disks && !mddev->external) {
7758 err = -ENODEV;
7759 goto out_unlock;
7760 }
7761
7762
7763 /*
7764  * clearing read-only on a running array leaves it in read-auto mode
7765  */
7766 if (!ro && mddev->ro == 1 && mddev->pers) {
7767 err = restart_array(mddev);
7768 if (err)
7769 goto out_unlock;
7770 mddev->ro = 2;
7771 }
7772
7773 out_unlock:
7774 mddev_unlock(mddev);
7775 return err;
7776 }
7777
7778 static int md_open(struct block_device *bdev, fmode_t mode)
7779 {
7780 struct mddev *mddev;
7781 int err;
7782
7783 spin_lock(&all_mddevs_lock);
7784 mddev = mddev_get(bdev->bd_disk->private_data);
7785 spin_unlock(&all_mddevs_lock);
7786 if (!mddev)
7787 return -ENODEV;
7788
7789 err = mutex_lock_interruptible(&mddev->open_mutex);
7790 if (err)
7791 goto out;
7792
7793 err = -ENODEV;
7794 if (test_bit(MD_CLOSING, &mddev->flags))
7795 goto out_unlock;
7796
7797 atomic_inc(&mddev->openers);
7798 mutex_unlock(&mddev->open_mutex);
7799
7800 bdev_check_media_change(bdev);
7801 return 0;
7802
7803 out_unlock:
7804 mutex_unlock(&mddev->open_mutex);
7805 out:
7806 mddev_put(mddev);
7807 return err;
7808 }
7809
7810 static void md_release(struct gendisk *disk, fmode_t mode)
7811 {
7812 struct mddev *mddev = disk->private_data;
7813
7814 BUG_ON(!mddev);
7815 atomic_dec(&mddev->openers);
7816 mddev_put(mddev);
7817 }
7818
7819 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
7820 {
7821 struct mddev *mddev = disk->private_data;
7822 unsigned int ret = 0;
7823
7824 if (mddev->changed)
7825 ret = DISK_EVENT_MEDIA_CHANGE;
7826 mddev->changed = 0;
7827 return ret;
7828 }
7829
7830 static void md_free_disk(struct gendisk *disk)
7831 {
7832 struct mddev *mddev = disk->private_data;
7833
7834 percpu_ref_exit(&mddev->writes_pending);
7835 bioset_exit(&mddev->bio_set);
7836 bioset_exit(&mddev->sync_set);
7837
7838 mddev_free(mddev);
7839 }
7840
7841 const struct block_device_operations md_fops =
7842 {
7843 .owner = THIS_MODULE,
7844 .submit_bio = md_submit_bio,
7845 .open = md_open,
7846 .release = md_release,
7847 .ioctl = md_ioctl,
7848 #ifdef CONFIG_COMPAT
7849 .compat_ioctl = md_compat_ioctl,
7850 #endif
7851 .getgeo = md_getgeo,
7852 .check_events = md_check_events,
7853 .set_read_only = md_set_read_only,
7854 .free_disk = md_free_disk,
7855 };
7856
7857 static int md_thread(void *arg)
7858 {
7859 struct md_thread *thread = arg;
7860
7861 /*
7862  * md service threads spend their life sleeping on thread->wqueue and
7863  * run thread->run() once per wakeup.  THREAD_WAKEUP is the only
7864  * hand-shake: md_wakeup_thread() sets it and wakes the queue, and the
7865  * loop below clears it before calling the handler, so a wakeup that
7866  * arrives while the handler runs is not lost.  The optional
7867  * thread->timeout turns the sleep into a periodic poll.  The sleep is
7868  * interruptible so the thread does not count towards load average,
7869  * which is why pending signals have to be flushed each iteration;
7870  * the thread only ever exits through kthread_stop().
7871  */
7872
7873 allow_signal(SIGKILL);
7874 while (!kthread_should_stop()) {
7875
7876
7877
7878
7879
7880
7881 if (signal_pending(current))
7882 flush_signals(current);
7883
7884 wait_event_interruptible_timeout
7885 (thread->wqueue,
7886 test_bit(THREAD_WAKEUP, &thread->flags)
7887 || kthread_should_stop() || kthread_should_park(),
7888 thread->timeout);
7889
7890 clear_bit(THREAD_WAKEUP, &thread->flags);
7891 if (kthread_should_park())
7892 kthread_parkme();
7893 if (!kthread_should_stop())
7894 thread->run(thread);
7895 }
7896
7897 return 0;
7898 }
7899
7900 void md_wakeup_thread(struct md_thread *thread)
7901 {
7902 if (thread) {
7903 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7904 set_bit(THREAD_WAKEUP, &thread->flags);
7905 wake_up(&thread->wqueue);
7906 }
7907 }
7908 EXPORT_SYMBOL(md_wakeup_thread);
7909
7910 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7911 struct mddev *mddev, const char *name)
7912 {
7913 struct md_thread *thread;
7914
7915 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7916 if (!thread)
7917 return NULL;
7918
7919 init_waitqueue_head(&thread->wqueue);
7920
7921 thread->run = run;
7922 thread->mddev = mddev;
7923 thread->timeout = MAX_SCHEDULE_TIMEOUT;
7924 thread->tsk = kthread_run(md_thread, thread,
7925 "%s_%s",
7926 mdname(thread->mddev),
7927 name);
7928 if (IS_ERR(thread->tsk)) {
7929 kfree(thread);
7930 return NULL;
7931 }
7932 return thread;
7933 }
7934 EXPORT_SYMBOL(md_register_thread);
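/*
 * Hedged sketch of the usual calling pattern; the handler and suffix
 * names are taken from the raid1 personality and are not defined here.
 * A personality creates its service thread from run(), pokes it with
 * md_wakeup_thread() whenever there is work, and tears it down with
 * md_unregister_thread():
 *
 *	mddev->thread = md_register_thread(raid1d, mddev, "raid1");
 *	if (!mddev->thread)
 *		return -ENOMEM;
 *	...
 *	md_wakeup_thread(mddev->thread);
 *	...
 *	md_unregister_thread(&mddev->thread);
 *
 * The kthread shows up with an "md0_raid1"-style name because the
 * thread name is mdname() joined to the suffix with '_'.
 */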
7935
7936 void md_unregister_thread(struct md_thread **threadp)
7937 {
7938 struct md_thread *thread;
7939
7940
7941
7942
7943
7944 spin_lock(&pers_lock);
7945 thread = *threadp;
7946 if (!thread) {
7947 spin_unlock(&pers_lock);
7948 return;
7949 }
7950 *threadp = NULL;
7951 spin_unlock(&pers_lock);
7952
7953 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7954 kthread_stop(thread->tsk);
7955 kfree(thread);
7956 }
7957 EXPORT_SYMBOL(md_unregister_thread);
7958
7959 void md_error(struct mddev *mddev, struct md_rdev *rdev)
7960 {
7961 if (!rdev || test_bit(Faulty, &rdev->flags))
7962 return;
7963
7964 if (!mddev->pers || !mddev->pers->error_handler)
7965 return;
7966 mddev->pers->error_handler(mddev, rdev);
7967
7968 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
7969 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7970 sysfs_notify_dirent_safe(rdev->sysfs_state);
7971 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7972 if (!test_bit(MD_BROKEN, &mddev->flags)) {
7973 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7974 md_wakeup_thread(mddev->thread);
7975 }
7976 if (mddev->event_work.func)
7977 queue_work(md_misc_wq, &mddev->event_work);
7978 md_new_event();
7979 }
7980 EXPORT_SYMBOL(md_error);
7981
7982
7983
7984 static void status_unused(struct seq_file *seq)
7985 {
7986 int i = 0;
7987 struct md_rdev *rdev;
7988
7989 seq_printf(seq, "unused devices: ");
7990
7991 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7992 i++;
7993 seq_printf(seq, "%pg ", rdev->bdev);
7994 }
7995 if (!i)
7996 seq_printf(seq, "<none>");
7997
7998 seq_printf(seq, "\n");
7999 }
8000
8001 static int status_resync(struct seq_file *seq, struct mddev *mddev)
8002 {
8003 sector_t max_sectors, resync, res;
8004 unsigned long dt, db = 0;
8005 sector_t rt, curr_mark_cnt, resync_mark_cnt;
8006 int scale, recovery_active;
8007 unsigned int per_milli;
8008
8009 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8010 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8011 max_sectors = mddev->resync_max_sectors;
8012 else
8013 max_sectors = mddev->dev_sectors;
8014
8015 resync = mddev->curr_resync;
8016 if (resync < MD_RESYNC_ACTIVE) {
8017 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8018
8019 resync = max_sectors;
8020 } else if (resync > max_sectors) {
8021 resync = max_sectors;
8022 } else {
8023 resync -= atomic_read(&mddev->recovery_active);
8024 if (resync < MD_RESYNC_ACTIVE) {
8025
8026
8027
8028
8029
8030
8031 resync = MD_RESYNC_ACTIVE;
8032 }
8033 }
8034
8035 if (resync == MD_RESYNC_NONE) {
8036 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8037 struct md_rdev *rdev;
8038
8039 rdev_for_each(rdev, mddev)
8040 if (rdev->raid_disk >= 0 &&
8041 !test_bit(Faulty, &rdev->flags) &&
8042 rdev->recovery_offset != MaxSector &&
8043 rdev->recovery_offset) {
8044 seq_printf(seq, "\trecover=REMOTE");
8045 return 1;
8046 }
8047 if (mddev->reshape_position != MaxSector)
8048 seq_printf(seq, "\treshape=REMOTE");
8049 else
8050 seq_printf(seq, "\tresync=REMOTE");
8051 return 1;
8052 }
8053 if (mddev->recovery_cp < MaxSector) {
8054 seq_printf(seq, "\tresync=PENDING");
8055 return 1;
8056 }
8057 return 0;
8058 }
8059 if (resync < MD_RESYNC_ACTIVE) {
8060 seq_printf(seq, "\tresync=DELAYED");
8061 return 1;
8062 }
8063
8064 WARN_ON(max_sectors == 0);
8065
8066
8067
8068
8069
8070 scale = 10;
8071 if (sizeof(sector_t) > sizeof(unsigned long)) {
8072 while ( max_sectors/2 > (1ULL<<(scale+32)))
8073 scale++;
8074 }
8075 res = (resync>>scale)*1000;
8076 sector_div(res, (u32)((max_sectors>>scale)+1));
8077
8078 per_milli = res;
8079 {
8080 int i, x = per_milli/50, y = 20-x;
8081 seq_printf(seq, "[");
8082 for (i = 0; i < x; i++)
8083 seq_printf(seq, "=");
8084 seq_printf(seq, ">");
8085 for (i = 0; i < y; i++)
8086 seq_printf(seq, ".");
8087 seq_printf(seq, "] ");
8088 }
8089 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8090 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8091 "reshape" :
8092 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8093 "check" :
8094 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8095 "resync" : "recovery"))),
8096 per_milli/10, per_milli % 10,
8097 (unsigned long long) resync/2,
8098 (unsigned long long) max_sectors/2);
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117 dt = ((jiffies - mddev->resync_mark) / HZ);
8118 if (!dt) dt++;
8119
8120 curr_mark_cnt = mddev->curr_mark_cnt;
8121 recovery_active = atomic_read(&mddev->recovery_active);
8122 resync_mark_cnt = mddev->resync_mark_cnt;
8123
8124 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8125 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8126
8127 rt = max_sectors - resync;
8128 rt = div64_u64(rt, db/32+1);
8129 rt *= dt;
8130 rt >>= 5;
8131
8132 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8133 ((unsigned long)rt % 60)/6);
8134
8135 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8136 return 1;
8137 }
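/*
 * Hedged worked example of the arithmetic above: with max_sectors of
 * 2000000, resync at 1000000, and db = 20000 sectors moved in dt = 10
 * seconds since the last mark:
 *
 *	per_milli ~= 1000000 * 1000 / 2000000          = 500   -> "50.0%"
 *	speed      = db / 2 / dt                       = 1000  -> "1000K/sec"
 *	rt        ~= (max_sectors - resync) * dt / db  = 500s  -> "8.3min"
 *
 * The ">> 5" and the "db/32" divisor cancel out; they only keep the
 * intermediate 64-bit division in range.
 */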
8138
8139 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8140 {
8141 struct list_head *tmp;
8142 loff_t l = *pos;
8143 struct mddev *mddev;
8144
8145 if (l == 0x10000) {
8146 ++*pos;
8147 return (void *)2;
8148 }
8149 if (l > 0x10000)
8150 return NULL;
8151 if (!l--)
8152
8153 return (void*)1;
8154
8155 spin_lock(&all_mddevs_lock);
8156 list_for_each(tmp,&all_mddevs)
8157 if (!l--) {
8158 mddev = list_entry(tmp, struct mddev, all_mddevs);
8160 if (!mddev_get(mddev))
8161 continue;
8162 spin_unlock(&all_mddevs_lock);
8163 return mddev;
8164 }
8165 spin_unlock(&all_mddevs_lock);
8166 if (!l--)
8167 return (void*)2;
8168 return NULL;
8169 }
8170
8171 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8172 {
8173 struct list_head *tmp;
8174 struct mddev *next_mddev, *mddev = v;
8175 struct mddev *to_put = NULL;
8176
8177 ++*pos;
8178 if (v == (void*)2)
8179 return NULL;
8180
8181 spin_lock(&all_mddevs_lock);
8182 if (v == (void*)1) {
8183 tmp = all_mddevs.next;
8184 } else {
8185 to_put = mddev;
8186 tmp = mddev->all_mddevs.next;
8187 }
8188
8189 for (;;) {
8190 if (tmp == &all_mddevs) {
8191 next_mddev = (void*)2;
8192 *pos = 0x10000;
8193 break;
8194 }
8195 next_mddev = list_entry(tmp, struct mddev, all_mddevs);
8196 if (mddev_get(next_mddev))
8197 break;
8198 mddev = next_mddev;
8199 tmp = mddev->all_mddevs.next;
8200 }
8201 spin_unlock(&all_mddevs_lock);
8202
8203 if (to_put)
8204 mddev_put(mddev);
8205 return next_mddev;
8206
8207 }
8208
8209 static void md_seq_stop(struct seq_file *seq, void *v)
8210 {
8211 struct mddev *mddev = v;
8212
8213 if (mddev && v != (void*)1 && v != (void*)2)
8214 mddev_put(mddev);
8215 }
8216
8217 static int md_seq_show(struct seq_file *seq, void *v)
8218 {
8219 struct mddev *mddev = v;
8220 sector_t sectors;
8221 struct md_rdev *rdev;
8222
8223 if (v == (void*)1) {
8224 struct md_personality *pers;
8225 seq_printf(seq, "Personalities : ");
8226 spin_lock(&pers_lock);
8227 list_for_each_entry(pers, &pers_list, list)
8228 seq_printf(seq, "[%s] ", pers->name);
8229
8230 spin_unlock(&pers_lock);
8231 seq_printf(seq, "\n");
8232 seq->poll_event = atomic_read(&md_event_count);
8233 return 0;
8234 }
8235 if (v == (void*)2) {
8236 status_unused(seq);
8237 return 0;
8238 }
8239
8240 spin_lock(&mddev->lock);
8241 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8242 seq_printf(seq, "%s : %sactive", mdname(mddev),
8243 mddev->pers ? "" : "in");
8244 if (mddev->pers) {
8245 if (mddev->ro==1)
8246 seq_printf(seq, " (read-only)");
8247 if (mddev->ro==2)
8248 seq_printf(seq, " (auto-read-only)");
8249 seq_printf(seq, " %s", mddev->pers->name);
8250 }
8251
8252 sectors = 0;
8253 rcu_read_lock();
8254 rdev_for_each_rcu(rdev, mddev) {
8255 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8256
8257 if (test_bit(WriteMostly, &rdev->flags))
8258 seq_printf(seq, "(W)");
8259 if (test_bit(Journal, &rdev->flags))
8260 seq_printf(seq, "(J)");
8261 if (test_bit(Faulty, &rdev->flags)) {
8262 seq_printf(seq, "(F)");
8263 continue;
8264 }
8265 if (rdev->raid_disk < 0)
8266 seq_printf(seq, "(S)");
8267 if (test_bit(Replacement, &rdev->flags))
8268 seq_printf(seq, "(R)");
8269 sectors += rdev->sectors;
8270 }
8271 rcu_read_unlock();
8272
8273 if (!list_empty(&mddev->disks)) {
8274 if (mddev->pers)
8275 seq_printf(seq, "\n %llu blocks",
8276 (unsigned long long)
8277 mddev->array_sectors / 2);
8278 else
8279 seq_printf(seq, "\n %llu blocks",
8280 (unsigned long long)sectors / 2);
8281 }
8282 if (mddev->persistent) {
8283 if (mddev->major_version != 0 ||
8284 mddev->minor_version != 90) {
8285 seq_printf(seq," super %d.%d",
8286 mddev->major_version,
8287 mddev->minor_version);
8288 }
8289 } else if (mddev->external)
8290 seq_printf(seq, " super external:%s",
8291 mddev->metadata_type);
8292 else
8293 seq_printf(seq, " super non-persistent");
8294
8295 if (mddev->pers) {
8296 mddev->pers->status(seq, mddev);
8297 seq_printf(seq, "\n ");
8298 if (mddev->pers->sync_request) {
8299 if (status_resync(seq, mddev))
8300 seq_printf(seq, "\n ");
8301 }
8302 } else
8303 seq_printf(seq, "\n ");
8304
8305 md_bitmap_status(seq, mddev->bitmap);
8306
8307 seq_printf(seq, "\n");
8308 }
8309 spin_unlock(&mddev->lock);
8310
8311 return 0;
8312 }
8313
8314 static const struct seq_operations md_seq_ops = {
8315 .start = md_seq_start,
8316 .next = md_seq_next,
8317 .stop = md_seq_stop,
8318 .show = md_seq_show,
8319 };
8320
8321 static int md_seq_open(struct inode *inode, struct file *file)
8322 {
8323 struct seq_file *seq;
8324 int error;
8325
8326 error = seq_open(file, &md_seq_ops);
8327 if (error)
8328 return error;
8329
8330 seq = file->private_data;
8331 seq->poll_event = atomic_read(&md_event_count);
8332 return error;
8333 }
8334
8335 static int md_unloading;
8336 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8337 {
8338 struct seq_file *seq = filp->private_data;
8339 __poll_t mask;
8340
8341 if (md_unloading)
8342 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8343 poll_wait(filp, &md_event_waiters, wait);
8344
8345
8346 mask = EPOLLIN | EPOLLRDNORM;
8347
8348 if (seq->poll_event != atomic_read(&md_event_count))
8349 mask |= EPOLLERR | EPOLLPRI;
8350 return mask;
8351 }
8352
8353 static const struct proc_ops mdstat_proc_ops = {
8354 .proc_open = md_seq_open,
8355 .proc_read = seq_read,
8356 .proc_lseek = seq_lseek,
8357 .proc_release = seq_release,
8358 .proc_poll = mdstat_poll,
8359 };
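/*
 * Hedged illustration, not part of the driver: because mdstat_poll()
 * raises EPOLLPRI whenever md_event_count has moved past the value
 * sampled at the last read, a monitor can sleep on /proc/mdstat rather
 * than re-reading it in a loop:
 *
 *	struct pollfd p;
 *	char buf[4096];
 *
 *	p.fd = open("/proc/mdstat", O_RDONLY);
 *	p.events = POLLPRI;
 *	for (;;) {
 *		read(p.fd, buf, sizeof(buf));
 *		lseek(p.fd, 0, SEEK_SET);
 *		poll(&p, 1, -1);
 *	}
 *
 * Each wakeup only means "something changed"; the new state still has
 * to be read back from the file.
 */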
8360
8361 int register_md_personality(struct md_personality *p)
8362 {
8363 pr_debug("md: %s personality registered for level %d\n",
8364 p->name, p->level);
8365 spin_lock(&pers_lock);
8366 list_add_tail(&p->list, &pers_list);
8367 spin_unlock(&pers_lock);
8368 return 0;
8369 }
8370 EXPORT_SYMBOL(register_md_personality);
8371
8372 int unregister_md_personality(struct md_personality *p)
8373 {
8374 pr_debug("md: %s personality unregistered\n", p->name);
8375 spin_lock(&pers_lock);
8376 list_del_init(&p->list);
8377 spin_unlock(&pers_lock);
8378 return 0;
8379 }
8380 EXPORT_SYMBOL(unregister_md_personality);
8381
8382 int register_md_cluster_operations(struct md_cluster_operations *ops,
8383 struct module *module)
8384 {
8385 int ret = 0;
8386 spin_lock(&pers_lock);
8387 if (md_cluster_ops != NULL)
8388 ret = -EALREADY;
8389 else {
8390 md_cluster_ops = ops;
8391 md_cluster_mod = module;
8392 }
8393 spin_unlock(&pers_lock);
8394 return ret;
8395 }
8396 EXPORT_SYMBOL(register_md_cluster_operations);
8397
8398 int unregister_md_cluster_operations(void)
8399 {
8400 spin_lock(&pers_lock);
8401 md_cluster_ops = NULL;
8402 spin_unlock(&pers_lock);
8403 return 0;
8404 }
8405 EXPORT_SYMBOL(unregister_md_cluster_operations);
8406
8407 int md_setup_cluster(struct mddev *mddev, int nodes)
8408 {
8409 int ret;
8410 if (!md_cluster_ops)
8411 request_module("md-cluster");
8412 spin_lock(&pers_lock);
8413
8414 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8415 pr_warn("can't find md-cluster module or get its reference.\n");
8416 spin_unlock(&pers_lock);
8417 return -ENOENT;
8418 }
8419 spin_unlock(&pers_lock);
8420
8421 ret = md_cluster_ops->join(mddev, nodes);
8422 if (!ret)
8423 mddev->safemode_delay = 0;
8424 return ret;
8425 }
8426
8427 void md_cluster_stop(struct mddev *mddev)
8428 {
8429 if (!md_cluster_ops)
8430 return;
8431 md_cluster_ops->leave(mddev);
8432 module_put(md_cluster_mod);
8433 }
8434
8435 static int is_mddev_idle(struct mddev *mddev, int init)
8436 {
8437 struct md_rdev *rdev;
8438 int idle;
8439 int curr_events;
8440
8441 idle = 1;
8442 rcu_read_lock();
8443 rdev_for_each_rcu(rdev, mddev) {
8444 struct gendisk *disk = rdev->bdev->bd_disk;
8445 curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
8446 atomic_read(&disk->sync_io);
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469 if (init || curr_events - rdev->last_events > 64) {
8470 rdev->last_events = curr_events;
8471 idle = 0;
8472 }
8473 }
8474 rcu_read_unlock();
8475 return idle;
8476 }
8477
8478 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8479 {
8480
8481 atomic_sub(blocks, &mddev->recovery_active);
8482 wake_up(&mddev->recovery_wait);
8483 if (!ok) {
8484 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8485 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8486 md_wakeup_thread(mddev->thread);
8487
8488 }
8489 }
8490 EXPORT_SYMBOL(md_done_sync);
8491
8492
8493
8494
8495
8496
8497
8498
8499 bool md_write_start(struct mddev *mddev, struct bio *bi)
8500 {
8501 int did_change = 0;
8502
8503 if (bio_data_dir(bi) != WRITE)
8504 return true;
8505
8506 BUG_ON(mddev->ro == 1);
8507 if (mddev->ro == 2) {
8508
8509 mddev->ro = 0;
8510 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8511 md_wakeup_thread(mddev->thread);
8512 md_wakeup_thread(mddev->sync_thread);
8513 did_change = 1;
8514 }
8515 rcu_read_lock();
8516 percpu_ref_get(&mddev->writes_pending);
8517 smp_mb();
8518 if (mddev->safemode == 1)
8519 mddev->safemode = 0;
8520
8521 if (mddev->in_sync || mddev->sync_checkers) {
8522 spin_lock(&mddev->lock);
8523 if (mddev->in_sync) {
8524 mddev->in_sync = 0;
8525 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8526 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8527 md_wakeup_thread(mddev->thread);
8528 did_change = 1;
8529 }
8530 spin_unlock(&mddev->lock);
8531 }
8532 rcu_read_unlock();
8533 if (did_change)
8534 sysfs_notify_dirent_safe(mddev->sysfs_state);
8535 if (!mddev->has_superblocks)
8536 return true;
8537 wait_event(mddev->sb_wait,
8538 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8539 mddev->suspended);
8540 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8541 percpu_ref_put(&mddev->writes_pending);
8542 return false;
8543 }
8544 return true;
8545 }
8546 EXPORT_SYMBOL(md_write_start);
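/*
 * Hedged sketch of how a personality is expected to use this pair; the
 * function below is hypothetical, the real call sites live in raid1,
 * raid5 and friends.  Every write bio is bracketed by md_write_start()
 * and md_write_end() so the array is marked dirty before data reaches
 * the members and can drop back to "clean" once writes drain:
 *
 *	static bool my_make_request(struct mddev *mddev, struct bio *bio)
 *	{
 *		if (!md_write_start(mddev, bio))
 *			return false;
 *		... map and submit the write to the member devices ...
 *		return true;
 *	}
 *
 * and, when the last per-member completion for that bio arrives:
 *
 *	md_write_end(mddev);
 */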
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556 void md_write_inc(struct mddev *mddev, struct bio *bi)
8557 {
8558 if (bio_data_dir(bi) != WRITE)
8559 return;
8560 WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8561 percpu_ref_get(&mddev->writes_pending);
8562 }
8563 EXPORT_SYMBOL(md_write_inc);
8564
8565 void md_write_end(struct mddev *mddev)
8566 {
8567 percpu_ref_put(&mddev->writes_pending);
8568
8569 if (mddev->safemode == 2)
8570 md_wakeup_thread(mddev->thread);
8571 else if (mddev->safemode_delay)
8572
8573
8574
8575 mod_timer(&mddev->safemode_timer,
8576 roundup(jiffies, mddev->safemode_delay) +
8577 mddev->safemode_delay);
8578 }
8579
8580 EXPORT_SYMBOL(md_write_end);
8581
8582
8583 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8584 struct bio *bio, sector_t start, sector_t size)
8585 {
8586 struct bio *discard_bio = NULL;
8587
8588 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8589 &discard_bio) || !discard_bio)
8590 return;
8591
8592 bio_chain(discard_bio, bio);
8593 bio_clone_blkg_association(discard_bio, bio);
8594 if (mddev->gendisk)
8595 trace_block_bio_remap(discard_bio,
8596 disk_devt(mddev->gendisk),
8597 bio->bi_iter.bi_sector);
8598 submit_bio_noacct(discard_bio);
8599 }
8600 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8601
8602 int acct_bioset_init(struct mddev *mddev)
8603 {
8604 int err = 0;
8605
8606 if (!bioset_initialized(&mddev->io_acct_set))
8607 err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
8608 offsetof(struct md_io_acct, bio_clone), 0);
8609 return err;
8610 }
8611 EXPORT_SYMBOL_GPL(acct_bioset_init);
8612
8613 void acct_bioset_exit(struct mddev *mddev)
8614 {
8615 bioset_exit(&mddev->io_acct_set);
8616 }
8617 EXPORT_SYMBOL_GPL(acct_bioset_exit);
8618
8619 static void md_end_io_acct(struct bio *bio)
8620 {
8621 struct md_io_acct *md_io_acct = bio->bi_private;
8622 struct bio *orig_bio = md_io_acct->orig_bio;
8623
8624 orig_bio->bi_status = bio->bi_status;
8625
8626 bio_end_io_acct(orig_bio, md_io_acct->start_time);
8627 bio_put(bio);
8628 bio_endio(orig_bio);
8629 }
8630
8631
8632
8633
8634
8635 void md_account_bio(struct mddev *mddev, struct bio **bio)
8636 {
8637 struct block_device *bdev = (*bio)->bi_bdev;
8638 struct md_io_acct *md_io_acct;
8639 struct bio *clone;
8640
8641 if (!blk_queue_io_stat(bdev->bd_disk->queue))
8642 return;
8643
8644 clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
8645 md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
8646 md_io_acct->orig_bio = *bio;
8647 md_io_acct->start_time = bio_start_io_acct(*bio);
8648
8649 clone->bi_end_io = md_end_io_acct;
8650 clone->bi_private = md_io_acct;
8651 *bio = clone;
8652 }
8653 EXPORT_SYMBOL_GPL(md_account_bio);
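/*
 * Hedged sketch with a hypothetical personality (pick_member() is a
 * stand-in for the level's mapping logic): a level that remaps the
 * incoming bio instead of cloning it per member can still get correct
 * I/O accounting by letting md wrap the bio first:
 *
 *	static bool my_make_request(struct mddev *mddev, struct bio *bio)
 *	{
 *		if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
 *		    md_flush_request(mddev, bio))
 *			return true;
 *		md_account_bio(mddev, &bio);
 *		bio_set_dev(bio, pick_member(mddev, bio)->bdev);
 *		submit_bio_noacct(bio);
 *		return true;
 *	}
 *
 * md_account_bio() swaps *bio for a clone from io_acct_set whose
 * completion handler (md_end_io_acct above) finishes the accounting and
 * ends the original bio, so acct_bioset_init() must have been called
 * from the personality's run() method beforehand.
 */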
8654
8655
8656
8657
8658
8659
8660
8661 void md_allow_write(struct mddev *mddev)
8662 {
8663 if (!mddev->pers)
8664 return;
8665 if (mddev->ro)
8666 return;
8667 if (!mddev->pers->sync_request)
8668 return;
8669
8670 spin_lock(&mddev->lock);
8671 if (mddev->in_sync) {
8672 mddev->in_sync = 0;
8673 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8674 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8675 if (mddev->safemode_delay &&
8676 mddev->safemode == 0)
8677 mddev->safemode = 1;
8678 spin_unlock(&mddev->lock);
8679 md_update_sb(mddev, 0);
8680 sysfs_notify_dirent_safe(mddev->sysfs_state);
8681
8682 wait_event(mddev->sb_wait,
8683 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8684 } else
8685 spin_unlock(&mddev->lock);
8686 }
8687 EXPORT_SYMBOL_GPL(md_allow_write);
8688
8689 #define SYNC_MARKS 10
8690 #define SYNC_MARK_STEP (3*HZ)
8691 #define UPDATE_FREQUENCY (5*60*HZ)
8692 void md_do_sync(struct md_thread *thread)
8693 {
8694 struct mddev *mddev = thread->mddev;
8695 struct mddev *mddev2;
8696 unsigned int currspeed = 0, window;
8697 sector_t max_sectors,j, io_sectors, recovery_done;
8698 unsigned long mark[SYNC_MARKS];
8699 unsigned long update_time;
8700 sector_t mark_cnt[SYNC_MARKS];
8701 int last_mark,m;
8702 sector_t last_check;
8703 int skipped = 0;
8704 struct md_rdev *rdev;
8705 char *desc, *action = NULL;
8706 struct blk_plug plug;
8707 int ret;
8708
8709
8710 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8711 test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
8712 return;
8713 if (mddev->ro) {
8714 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8715 return;
8716 }
8717
8718 if (mddev_is_clustered(mddev)) {
8719 ret = md_cluster_ops->resync_start(mddev);
8720 if (ret)
8721 goto skip;
8722
8723 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8724 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8725 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8726 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8727 && ((unsigned long long)mddev->curr_resync_completed
8728 < (unsigned long long)mddev->resync_max_sectors))
8729 goto skip;
8730 }
8731
8732 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8733 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8734 desc = "data-check";
8735 action = "check";
8736 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8737 desc = "requested-resync";
8738 action = "repair";
8739 } else
8740 desc = "resync";
8741 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8742 desc = "reshape";
8743 else
8744 desc = "recovery";
8745
8746 mddev->last_sync_action = action ?: desc;
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758 do {
8759 int mddev2_minor = -1;
8760 mddev->curr_resync = MD_RESYNC_DELAYED;
8761
8762 try_again:
8763 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8764 goto skip;
8765 spin_lock(&all_mddevs_lock);
8766 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
8767 if (test_bit(MD_DELETED, &mddev2->flags))
8768 continue;
8769 if (mddev2 == mddev)
8770 continue;
8771 if (!mddev->parallel_resync
8772 && mddev2->curr_resync
8773 && match_mddev_units(mddev, mddev2)) {
8774 DEFINE_WAIT(wq);
8775 if (mddev < mddev2 &&
8776 mddev->curr_resync == MD_RESYNC_DELAYED) {
8777
8778 mddev->curr_resync = MD_RESYNC_YIELDED;
8779 wake_up(&resync_wait);
8780 }
8781 if (mddev > mddev2 &&
8782 mddev->curr_resync == MD_RESYNC_YIELDED)
8783
8784
8785
8786 continue;
8787
8788
8789
8790
8791 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8792 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8793 mddev2->curr_resync >= mddev->curr_resync) {
8794 if (mddev2_minor != mddev2->md_minor) {
8795 mddev2_minor = mddev2->md_minor;
8796 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8797 desc, mdname(mddev),
8798 mdname(mddev2));
8799 }
8800 spin_unlock(&all_mddevs_lock);
8801
8802 if (signal_pending(current))
8803 flush_signals(current);
8804 schedule();
8805 finish_wait(&resync_wait, &wq);
8806 goto try_again;
8807 }
8808 finish_wait(&resync_wait, &wq);
8809 }
8810 }
8811 spin_unlock(&all_mddevs_lock);
8812 } while (mddev->curr_resync < MD_RESYNC_DELAYED);
8813
8814 j = 0;
8815 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8816
8817
8818
8819 max_sectors = mddev->resync_max_sectors;
8820 atomic64_set(&mddev->resync_mismatches, 0);
8821
8822 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8823 j = mddev->resync_min;
8824 else if (!mddev->bitmap)
8825 j = mddev->recovery_cp;
8826
8827 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8828 max_sectors = mddev->resync_max_sectors;
8829
8830
8831
8832
8833
8834 if (mddev_is_clustered(mddev) &&
8835 mddev->reshape_position != MaxSector)
8836 j = mddev->reshape_position;
8837 } else {
8838
8839 max_sectors = mddev->dev_sectors;
8840 j = MaxSector;
8841 rcu_read_lock();
8842 rdev_for_each_rcu(rdev, mddev)
8843 if (rdev->raid_disk >= 0 &&
8844 !test_bit(Journal, &rdev->flags) &&
8845 !test_bit(Faulty, &rdev->flags) &&
8846 !test_bit(In_sync, &rdev->flags) &&
8847 rdev->recovery_offset < j)
8848 j = rdev->recovery_offset;
8849 rcu_read_unlock();
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859 if (mddev->bitmap) {
8860 mddev->pers->quiesce(mddev, 1);
8861 mddev->pers->quiesce(mddev, 0);
8862 }
8863 }
8864
8865 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8866 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
8867 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8868 speed_max(mddev), desc);
8869
8870 is_mddev_idle(mddev, 1);
8871
8872 io_sectors = 0;
8873 for (m = 0; m < SYNC_MARKS; m++) {
8874 mark[m] = jiffies;
8875 mark_cnt[m] = io_sectors;
8876 }
8877 last_mark = 0;
8878 mddev->resync_mark = mark[last_mark];
8879 mddev->resync_mark_cnt = mark_cnt[last_mark];
8880
8881
8882
8883
8884 window = 32 * (PAGE_SIZE / 512);
8885 pr_debug("md: using %dk window, over a total of %lluk.\n",
8886 window/2, (unsigned long long)max_sectors/2);
8887
8888 atomic_set(&mddev->recovery_active, 0);
8889 last_check = 0;
8890
8891 if (j>2) {
8892 pr_debug("md: resuming %s of %s from checkpoint.\n",
8893 desc, mdname(mddev));
8894 mddev->curr_resync = j;
8895 } else
8896 mddev->curr_resync = MD_RESYNC_ACTIVE;
8897 mddev->curr_resync_completed = j;
8898 sysfs_notify_dirent_safe(mddev->sysfs_completed);
8899 md_new_event();
8900 update_time = jiffies;
8901
8902 blk_start_plug(&plug);
8903 while (j < max_sectors) {
8904 sector_t sectors;
8905
8906 skipped = 0;
8907
8908 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8909 ((mddev->curr_resync > mddev->curr_resync_completed &&
8910 (mddev->curr_resync - mddev->curr_resync_completed)
8911 > (max_sectors >> 4)) ||
8912 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8913 (j - mddev->curr_resync_completed)*2
8914 >= mddev->resync_max - mddev->curr_resync_completed ||
8915 mddev->curr_resync_completed > mddev->resync_max
8916 )) {
8917
8918 wait_event(mddev->recovery_wait,
8919 atomic_read(&mddev->recovery_active) == 0);
8920 mddev->curr_resync_completed = j;
8921 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8922 j > mddev->recovery_cp)
8923 mddev->recovery_cp = j;
8924 update_time = jiffies;
8925 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8926 sysfs_notify_dirent_safe(mddev->sysfs_completed);
8927 }
8928
8929 while (j >= mddev->resync_max &&
8930 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8931
8932
8933
8934
8935 flush_signals(current);
8936 wait_event_interruptible(mddev->recovery_wait,
8937 mddev->resync_max > j
8938 || test_bit(MD_RECOVERY_INTR,
8939 &mddev->recovery));
8940 }
8941
8942 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8943 break;
8944
8945 sectors = mddev->pers->sync_request(mddev, j, &skipped);
8946 if (sectors == 0) {
8947 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8948 break;
8949 }
8950
8951 if (!skipped) {
8952 io_sectors += sectors;
8953 atomic_add(sectors, &mddev->recovery_active);
8954 }
8955
8956 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8957 break;
8958
8959 j += sectors;
8960 if (j > max_sectors)
8961
8962 j = max_sectors;
8963 if (j > 2)
8964 mddev->curr_resync = j;
8965 mddev->curr_mark_cnt = io_sectors;
8966 if (last_check == 0)
8967
8968
8969
8970 md_new_event();
8971
8972 if (last_check + window > io_sectors || j == max_sectors)
8973 continue;
8974
8975 last_check = io_sectors;
8976 repeat:
8977 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8978
8979 int next = (last_mark+1) % SYNC_MARKS;
8980
8981 mddev->resync_mark = mark[next];
8982 mddev->resync_mark_cnt = mark_cnt[next];
8983 mark[next] = jiffies;
8984 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8985 last_mark = next;
8986 }
8987
8988 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8989 break;
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999 cond_resched();
9000
9001 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9002 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9003 /((jiffies-mddev->resync_mark)/HZ +1) +1;
9004
9005 if (currspeed > speed_min(mddev)) {
9006 if (currspeed > speed_max(mddev)) {
9007 msleep(500);
9008 goto repeat;
9009 }
9010 if (!is_mddev_idle(mddev, 0)) {
9011
9012
9013
9014
9015 wait_event(mddev->recovery_wait,
9016 !atomic_read(&mddev->recovery_active));
9017 }
9018 }
9019 }
9020 pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9021 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9022 ? "interrupted" : "done");
9023
9024
9025
9026 blk_finish_plug(&plug);
9027 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9028
9029 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9030 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9031 mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9032 mddev->curr_resync_completed = mddev->curr_resync;
9033 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9034 }
9035 mddev->pers->sync_request(mddev, max_sectors, &skipped);
9036
9037 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9038 mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9039 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9040 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9041 if (mddev->curr_resync >= mddev->recovery_cp) {
9042 pr_debug("md: checkpointing %s of %s.\n",
9043 desc, mdname(mddev));
9044 if (test_bit(MD_RECOVERY_ERROR,
9045 &mddev->recovery))
9046 mddev->recovery_cp =
9047 mddev->curr_resync_completed;
9048 else
9049 mddev->recovery_cp =
9050 mddev->curr_resync;
9051 }
9052 } else
9053 mddev->recovery_cp = MaxSector;
9054 } else {
9055 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9056 mddev->curr_resync = MaxSector;
9057 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9058 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9059 rcu_read_lock();
9060 rdev_for_each_rcu(rdev, mddev)
9061 if (rdev->raid_disk >= 0 &&
9062 mddev->delta_disks >= 0 &&
9063 !test_bit(Journal, &rdev->flags) &&
9064 !test_bit(Faulty, &rdev->flags) &&
9065 !test_bit(In_sync, &rdev->flags) &&
9066 rdev->recovery_offset < mddev->curr_resync)
9067 rdev->recovery_offset = mddev->curr_resync;
9068 rcu_read_unlock();
9069 }
9070 }
9071 }
9072 skip:
9073 /* Request another superblock update so the final state of the
9074  * sync is recorded; this is harmless if nothing actually
9075  * changed. */
9076 set_mask_bits(&mddev->sb_flags, 0,
9077 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9078
9079 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9080 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9081 mddev->delta_disks > 0 &&
9082 mddev->pers->finish_reshape &&
9083 mddev->pers->size &&
9084 mddev->queue) {
9085 mddev_lock_nointr(mddev);
9086 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9087 mddev_unlock(mddev);
9088 if (!mddev_is_clustered(mddev))
9089 set_capacity_and_notify(mddev->gendisk,
9090 mddev->array_sectors);
9091 }
9092
9093 spin_lock(&mddev->lock);
9094 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9095
9096 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9097 mddev->resync_min = 0;
9098 mddev->resync_max = MaxSector;
9099 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9100 mddev->resync_min = mddev->curr_resync_completed;
9101 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9102 mddev->curr_resync = MD_RESYNC_NONE;
9103 spin_unlock(&mddev->lock);
9104
9105 wake_up(&resync_wait);
9106 md_wakeup_thread(mddev->thread);
9107 return;
9108 }
9109 EXPORT_SYMBOL_GPL(md_do_sync);
9110
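/*
 * Remove failed devices that are no longer in use and try to add spares
 * back into the array.  If @this is non-NULL, only that device is
 * considered.  Returns the number of devices that still need recovery,
 * so the caller knows whether a recovery should be started.
 */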
9111 static int remove_and_add_spares(struct mddev *mddev,
9112 struct md_rdev *this)
9113 {
9114 struct md_rdev *rdev;
9115 int spares = 0;
9116 int removed = 0;
9117 bool remove_some = false;
9118
9119 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9120 /* Mustn't remove devices when a resync thread is running */
9121 return 0;
9122
9123 rdev_for_each(rdev, mddev) {
9124 if ((this == NULL || rdev == this) &&
9125 rdev->raid_disk >= 0 &&
9126 !test_bit(Blocked, &rdev->flags) &&
9127 test_bit(Faulty, &rdev->flags) &&
9128 atomic_read(&rdev->nr_pending)==0) {
9129 /* Faulty, non-Blocked devices with nr_pending == 0
9130  * never get nr_pending incremented, never get Faulty
9131  * cleared, and never get Blocked set, so we can do a
9132  * single synchronize_rcu() for all of them rather than
9133  * one per device. */
9134 remove_some = true;
9135 set_bit(RemoveSynchronized, &rdev->flags);
9136 }
9137 }
9138
9139 if (remove_some)
9140 synchronize_rcu();
9141 rdev_for_each(rdev, mddev) {
9142 if ((this == NULL || rdev == this) &&
9143 rdev->raid_disk >= 0 &&
9144 !test_bit(Blocked, &rdev->flags) &&
9145 ((test_bit(RemoveSynchronized, &rdev->flags) ||
9146 (!test_bit(In_sync, &rdev->flags) &&
9147 !test_bit(Journal, &rdev->flags))) &&
9148 atomic_read(&rdev->nr_pending)==0)) {
9149 if (mddev->pers->hot_remove_disk(
9150 mddev, rdev) == 0) {
9151 sysfs_unlink_rdev(mddev, rdev);
9152 rdev->saved_raid_disk = rdev->raid_disk;
9153 rdev->raid_disk = -1;
9154 removed++;
9155 }
9156 }
9157 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
9158 clear_bit(RemoveSynchronized, &rdev->flags);
9159 }
9160
9161 if (removed && mddev->kobj.sd)
9162 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9163
9164 if (this && removed)
9165 goto no_add;
9166
9167 rdev_for_each(rdev, mddev) {
9168 if (this && this != rdev)
9169 continue;
9170 if (test_bit(Candidate, &rdev->flags))
9171 continue;
9172 if (rdev->raid_disk >= 0 &&
9173 !test_bit(In_sync, &rdev->flags) &&
9174 !test_bit(Journal, &rdev->flags) &&
9175 !test_bit(Faulty, &rdev->flags))
9176 spares++;
9177 if (rdev->raid_disk >= 0)
9178 continue;
9179 if (test_bit(Faulty, &rdev->flags))
9180 continue;
9181 if (!test_bit(Journal, &rdev->flags)) {
9182 if (mddev->ro &&
9183 ! (rdev->saved_raid_disk >= 0 &&
9184 !test_bit(Bitmap_sync, &rdev->flags)))
9185 continue;
9186
9187 rdev->recovery_offset = 0;
9188 }
9189 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9190 /* failure to create the sysfs link here is not fatal */
9191 sysfs_link_rdev(mddev, rdev);
9192 if (!test_bit(Journal, &rdev->flags))
9193 spares++;
9194 md_new_event();
9195 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9196 }
9197 }
9198 no_add:
9199 if (removed)
9200 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9201 return spares;
9202 }
9203
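/*
 * Work handler queued from md_check_recovery(): create and wake the
 * "resync" thread, or roll the MD_RECOVERY_* flags back if the thread
 * cannot be started.
 */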
9204 static void md_start_sync(struct work_struct *ws)
9205 {
9206 struct mddev *mddev = container_of(ws, struct mddev, del_work);
9207
9208 mddev->sync_thread = md_register_thread(md_do_sync,
9209 mddev,
9210 "resync");
9211 if (!mddev->sync_thread) {
9212 pr_warn("%s: could not start resync thread...\n",
9213 mdname(mddev));
9214 /* leave the spares where they are, it shouldn't hurt */
9215 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9216 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9217 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9218 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9219 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9220 wake_up(&resync_wait);
9221 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9222 &mddev->recovery))
9223 if (mddev->sysfs_action)
9224 sysfs_notify_dirent_safe(mddev->sysfs_action);
9225 } else
9226 md_wakeup_thread(mddev->sync_thread);
9227 sysfs_notify_dirent_safe(mddev->sysfs_action);
9228 md_new_event();
9229 }
9230
9231
9232 /*
9233  * This routine is regularly called by all per-raid-array threads to
9234  * deal with generic issues like resync and super-block update.
9235  * Raid personalities that don't have a thread (linear/raid0) do not
9236  * need this as they never do any recovery or update the superblock.
9237  *
9238  * It does not do any resync itself, but rather "forks" off other
9239  * threads to do that as needed.
9240  * When it is determined that resync is needed, we set
9241  * MD_RECOVERY_RUNNING in "->recovery" and create a thread at
9242  * ->sync_thread.  When that thread finishes it sets MD_RECOVERY_DONE
9243  * and wakes this thread, which reaps it and finishes up.
9244  * This thread also removes any faulty devices (with nr_pending == 0).
9245  *
9246  * The overall approach is:
9247  *  1/ if the superblock needs updating, update it.
9248  *  2/ If a recovery thread is running, don't do anything else.
9249  *  3/ If recovery has finished, clean up, possibly marking spares active.
9250  *  4/ If there is no recovery thread running, but something needs
9251  *     doing, start a recovery.
9252  */
9253 void md_check_recovery(struct mddev *mddev)
9254 {
9255 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
9256 /* A superblock update was requested while updates are explicitly
9257  * allowed (MD_ALLOW_SB_UPDATE); write it out and wake anyone
9258  * waiting on sb_wait. */
9259 set_bit(MD_UPDATING_SB, &mddev->flags);
9260 smp_mb__after_atomic();
9261 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
9262 md_update_sb(mddev, 0);
9263 clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
9264 wake_up(&mddev->sb_wait);
9265 }
9266
9267 if (mddev->suspended)
9268 return;
9269
9270 if (mddev->bitmap)
9271 md_bitmap_daemon_work(mddev);
9272
9273 if (signal_pending(current)) {
9274 if (mddev->pers->sync_request && !mddev->external) {
9275 pr_debug("md: %s in immediate safe mode\n",
9276 mdname(mddev));
9277 mddev->safemode = 2;
9278 }
9279 flush_signals(current);
9280 }
9281
9282 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
9283 return;
9284 if ( ! (
9285 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
9286 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9287 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9288 (mddev->external == 0 && mddev->safemode == 1) ||
9289 (mddev->safemode == 2
9290 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9291 ))
9292 return;
9293
9294 if (mddev_trylock(mddev)) {
9295 int spares = 0;
9296 bool try_set_sync = mddev->safemode != 0;
9297
9298 if (!mddev->external && mddev->safemode == 1)
9299 mddev->safemode = 0;
9300
9301 if (mddev->ro) {
9302 struct md_rdev *rdev;
9303 if (!mddev->external && mddev->in_sync)
9304 /* The 'Blocked' flag is not needed while the array is
9305  * read-only: failed devices will be recorded if the
9306  * array is switched back to read-write.  Leaving it
9307  * set would only prevent the device from being
9308  * removed. */
9309 rdev_for_each(rdev, mddev)
9310 clear_bit(Blocked, &rdev->flags);
9311
9312 /* On a read-only array we can:
9313  *  - remove failed devices
9314  *  - add already-in_sync devices if the array itself is in-sync.
9315  * As we only add devices that are already in_sync, we can
9316  * activate the spares immediately.
9317  */
9318 remove_and_add_spares(mddev, NULL);
9319
9320 /* There is no sync thread here, but reaping still makes
9321  * ->spare_active run and clears saved_raid_disk. */
9322 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9323 md_unregister_thread(&mddev->sync_thread);
9324 md_reap_sync_thread(mddev);
9325 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9326 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9327 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9328 goto unlock;
9329 }
9330
9331 if (mddev_is_clustered(mddev)) {
9332 struct md_rdev *rdev, *tmp;
9333
9334 /* kick a device out of the array if another node has
9335  * asked for its removal (ClusterRemove) */
9336 rdev_for_each_safe(rdev, tmp, mddev) {
9337 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9338 rdev->raid_disk < 0)
9339 md_kick_rdev_from_array(rdev);
9340 }
9341 }
9342
9343 if (try_set_sync && !mddev->external && !mddev->in_sync) {
9344 spin_lock(&mddev->lock);
9345 set_in_sync(mddev);
9346 spin_unlock(&mddev->lock);
9347 }
9348
9349 if (mddev->sb_flags)
9350 md_update_sb(mddev, 0);
9351
9352 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
9353 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9354
9355 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9356 goto unlock;
9357 }
9358 if (mddev->sync_thread) {
9359 md_unregister_thread(&mddev->sync_thread);
9360 md_reap_sync_thread(mddev);
9361 goto unlock;
9362 }
9363
9364 /* Set RUNNING before clearing NEEDED to avoid any
9365  * transients in the value of "sync_action". */
9366 mddev->curr_resync_completed = 0;
9367 spin_lock(&mddev->lock);
9368 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9369 spin_unlock(&mddev->lock);
9370
9371 /* Clear bits that no longer mean anything but might have
9372  * been left set by a previous action. */
9373 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9374 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9375
9376 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9377 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
9378 goto not_running;
9379
9380 /* No recovery is running at this point.
9381  * Remove any failed drives, then add spares if possible.
9382  * Spares are also removed and re-added, to allow the
9383  * personality to fail the re-add.
9384  */
9385
9386 if (mddev->reshape_position != MaxSector) {
9387 if (mddev->pers->check_reshape == NULL ||
9388 mddev->pers->check_reshape(mddev) != 0)
9389
9390 goto not_running;
9391 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9392 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9393 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
9394 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9395 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9396 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9397 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9398 } else if (mddev->recovery_cp < MaxSector) {
9399 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9400 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9401 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9402
9403 goto not_running;
9404
9405 if (mddev->pers->sync_request) {
9406 if (spares) {
9407 /* We are adding devices to an array which may keep its
9408  * bitmap on all devices, so make sure all bitmap pages
9409  * get written out before the recovery starts.
9410  */
9411 md_bitmap_write_all(mddev->bitmap);
9412 }
9413 INIT_WORK(&mddev->del_work, md_start_sync);
9414 queue_work(md_misc_wq, &mddev->del_work);
9415 goto unlock;
9416 }
9417 not_running:
9418 if (!mddev->sync_thread) {
9419 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9420 wake_up(&resync_wait);
9421 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9422 &mddev->recovery))
9423 if (mddev->sysfs_action)
9424 sysfs_notify_dirent_safe(mddev->sysfs_action);
9425 }
9426 unlock:
9427 wake_up(&mddev->sb_wait);
9428 mddev_unlock(mddev);
9429 }
9430 }
9431 EXPORT_SYMBOL(md_check_recovery);
9432
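/*
 * Collect the results of a finished sync thread: activate spares on
 * success, finish any reshape, discard stale saved_raid_disk info,
 * write the superblocks and clear the recovery state so that a new
 * action can be scheduled.
 */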
9433 void md_reap_sync_thread(struct mddev *mddev)
9434 {
9435 struct md_rdev *rdev;
9436 sector_t old_dev_sectors = mddev->dev_sectors;
9437 bool is_reshaped = false;
9438
9439 /* the resync/recovery has finished; collect its result */
9440 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9441 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9442 mddev->degraded != mddev->raid_disks) {
9443
9444
9445 if (mddev->pers->spare_active(mddev)) {
9446 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9447 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9448 }
9449 }
9450 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9451 mddev->pers->finish_reshape) {
9452 mddev->pers->finish_reshape(mddev);
9453 if (mddev_is_clustered(mddev))
9454 is_reshaped = true;
9455 }
9456
9457 /* If the array is no longer degraded, any saved_raid_disk
9458  * information is stale and must be discarded.
9459  */
9460 if (!mddev->degraded)
9461 rdev_for_each(rdev, mddev)
9462 rdev->saved_raid_disk = -1;
9463
9464 md_update_sb(mddev, 1);
9465
9466
9467
9468 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9469 md_cluster_ops->resync_finish(mddev);
9470 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9471 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9472 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9473 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9474 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9475 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9476
9477 /*
9478  * If a reshape completed on a clustered array, tell the other
9479  * nodes about the new device size now that the superblock has
9480  * been updated. */
9481 if (mddev_is_clustered(mddev) && is_reshaped
9482 && !test_bit(MD_CLOSING, &mddev->flags))
9483 md_cluster_ops->update_size(mddev, old_dev_sectors);
9484 wake_up(&resync_wait);
9485
9486 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9487 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9488 sysfs_notify_dirent_safe(mddev->sysfs_action);
9489 md_new_event();
9490 if (mddev->event_work.func)
9491 queue_work(md_misc_wq, &mddev->event_work);
9492 }
9493 EXPORT_SYMBOL(md_reap_sync_thread);
9494
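/*
 * Wait (for up to five seconds) until @rdev is no longer Blocked or
 * blocked by unacknowledged bad blocks, then drop the pending
 * reference taken by the caller.
 */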
9495 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9496 {
9497 sysfs_notify_dirent_safe(rdev->sysfs_state);
9498 wait_event_timeout(rdev->blocked_wait,
9499 !test_bit(Blocked, &rdev->flags) &&
9500 !test_bit(BlockedBadBlocks, &rdev->flags),
9501 msecs_to_jiffies(5000));
9502 rdev_dec_pending(rdev, mddev);
9503 }
9504 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9505
9506 void md_finish_reshape(struct mddev *mddev)
9507 {
9508 /* called by the personality module when a reshape completes */
9509 struct md_rdev *rdev;
9510
9511 rdev_for_each(rdev, mddev) {
9512 if (rdev->data_offset > rdev->new_data_offset)
9513 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9514 else
9515 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9516 rdev->data_offset = rdev->new_data_offset;
9517 }
9518 }
9519 EXPORT_SYMBOL(md_finish_reshape);
9520
9521
9522 /* Bad-block management.
9523  * Returns 1 if the bad blocks were recorded, 0 on failure. */
9524 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9525 int is_new)
9526 {
9527 struct mddev *mddev = rdev->mddev;
9528 int rv;
9529 if (is_new)
9530 s += rdev->new_data_offset;
9531 else
9532 s += rdev->data_offset;
9533 rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9534 if (rv == 0) {
9535
9536 if (test_bit(ExternalBbl, &rdev->flags))
9537 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9538 sysfs_notify_dirent_safe(rdev->sysfs_state);
9539 set_mask_bits(&mddev->sb_flags, 0,
9540 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9541 md_wakeup_thread(rdev->mddev->thread);
9542 return 1;
9543 } else
9544 return 0;
9545 }
9546 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9547
9548 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9549 int is_new)
9550 {
9551 int rv;
9552 if (is_new)
9553 s += rdev->new_data_offset;
9554 else
9555 s += rdev->data_offset;
9556 rv = badblocks_clear(&rdev->badblocks, s, sectors);
9557 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9558 sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9559 return rv;
9560 }
9561 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9562
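/*
 * Reboot notifier: stop writes on every running array and switch it to
 * safe mode so the superblocks are marked clean before the machine
 * goes down.
 */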
9563 static int md_notify_reboot(struct notifier_block *this,
9564 unsigned long code, void *x)
9565 {
9566 struct mddev *mddev, *n;
9567 int need_delay = 0;
9568
9569 spin_lock(&all_mddevs_lock);
9570 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
9571 if (!mddev_get(mddev))
9572 continue;
9573 spin_unlock(&all_mddevs_lock);
9574 if (mddev_trylock(mddev)) {
9575 if (mddev->pers)
9576 __md_stop_writes(mddev);
9577 if (mddev->persistent)
9578 mddev->safemode = 2;
9579 mddev_unlock(mddev);
9580 }
9581 need_delay = 1;
9582 mddev_put(mddev);
9583 spin_lock(&all_mddevs_lock);
9584 }
9585 spin_unlock(&all_mddevs_lock);
9586
9587
9588 /*
9589  * If any array was active, give the system a moment so that the
9590  * stopped writes and clean superblocks actually reach the devices
9591  * before the machine reboots.
9592  */
9593 if (need_delay)
9594 msleep(1000);
9595
9596 return NOTIFY_DONE;
9597 }
9598
9599 static struct notifier_block md_notifier = {
9600 .notifier_call = md_notify_reboot,
9601 .next = NULL,
9602 .priority = INT_MAX,
9603 };
9604
9605 static void md_geninit(void)
9606 {
9607 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9608
9609 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
9610 }
9611
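/*
 * Module initialisation: allocate the md workqueues, register the "md"
 * and "mdp" block majors, the reboot notifier and the sysctl table.
 */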
9612 static int __init md_init(void)
9613 {
9614 int ret = -ENOMEM;
9615
9616 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9617 if (!md_wq)
9618 goto err_wq;
9619
9620 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9621 if (!md_misc_wq)
9622 goto err_misc_wq;
9623
9624 md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
9625 if (!md_rdev_misc_wq)
9626 goto err_rdev_misc_wq;
9627
9628 ret = __register_blkdev(MD_MAJOR, "md", md_probe);
9629 if (ret < 0)
9630 goto err_md;
9631
9632 ret = __register_blkdev(0, "mdp", md_probe);
9633 if (ret < 0)
9634 goto err_mdp;
9635 mdp_major = ret;
9636
9637 register_reboot_notifier(&md_notifier);
9638 raid_table_header = register_sysctl_table(raid_root_table);
9639
9640 md_geninit();
9641 return 0;
9642
9643 err_mdp:
9644 unregister_blkdev(MD_MAJOR, "md");
9645 err_md:
9646 destroy_workqueue(md_rdev_misc_wq);
9647 err_rdev_misc_wq:
9648 destroy_workqueue(md_misc_wq);
9649 err_misc_wq:
9650 destroy_workqueue(md_wq);
9651 err_wq:
9652 return ret;
9653 }
9654
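/*
 * Used by clustered MD after a remote metadata update: resize the
 * array if its size changed on another node, activate or fail devices
 * whose role changed, adjust raid_disks, pick up any reshape in
 * progress, and finally adopt the new event count.
 */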
9655 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9656 {
9657 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9658 struct md_rdev *rdev2, *tmp;
9659 int role, ret;
9660
9661 /*
9662  * If the array size was changed on another node, resize locally
9663  * as well and update the bitmap superblock.
9664  */
9665 if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9666 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9667 if (ret)
9668 pr_info("md-cluster: resize failed\n");
9669 else
9670 md_bitmap_update_sb(mddev->bitmap);
9671 }
9672
9673
9674 rdev_for_each_safe(rdev2, tmp, mddev) {
9675 if (test_bit(Faulty, &rdev2->flags))
9676 continue;
9677
9678
9679 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9680
9681 if (test_bit(Candidate, &rdev2->flags)) {
9682 if (role == MD_DISK_ROLE_FAULTY) {
9683 pr_info("md: Removing Candidate device %pg because add failed\n",
9684 rdev2->bdev);
9685 md_kick_rdev_from_array(rdev2);
9686 continue;
9687 }
9688 else
9689 clear_bit(Candidate, &rdev2->flags);
9690 }
9691
9692 if (role != rdev2->raid_disk) {
9693 /*
9694  * The device was activated on another node (spare ->
9695  * active), unless a reshape is currently in progress. */
9696 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
9697 !(le32_to_cpu(sb->feature_map) &
9698 MD_FEATURE_RESHAPE_ACTIVE)) {
9699 rdev2->saved_raid_disk = role;
9700 ret = remove_and_add_spares(mddev, rdev2);
9701 pr_info("Activated spare: %pg\n",
9702 rdev2->bdev);
9703
9704
9705 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9706 md_wakeup_thread(mddev->thread);
9707 }
9708
9709 /*
9710  * The device was failed on another node.  Do the minimum
9711  * needed to mark it faulty locally; the node that raised
9712  * the error handles the actual recovery. */
9713 if (role == MD_DISK_ROLE_FAULTY ||
9714 role == MD_DISK_ROLE_JOURNAL) {
9715 md_error(mddev, rdev2);
9716 clear_bit(Blocked, &rdev2->flags);
9717 }
9718 }
9719 }
9720
9721 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
9722 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9723 if (ret)
9724 pr_warn("md: updating array disks failed. %d\n", ret);
9725 }
9726
9727 /*
9728  * mddev->delta_disks has already been updated by
9729  * update_raid_disks() above, so now check whether a reshape
9730  * needs to be started or finished on this node as well. */
9731 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9732 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9733 /*
9734  * A reshape is in progress on the remote node: pick up the
9735  * new reshape_position and start the reshape locally.
9736  */
9737 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9738 if (mddev->pers->update_reshape_pos)
9739 mddev->pers->update_reshape_pos(mddev);
9740 if (mddev->pers->start_reshape)
9741 mddev->pers->start_reshape(mddev);
9742 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9743 mddev->reshape_position != MaxSector &&
9744 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9745
9746 mddev->reshape_position = MaxSector;
9747 if (mddev->pers->update_reshape_pos)
9748 mddev->pers->update_reshape_pos(mddev);
9749 }
9750
9751
9752 mddev->events = le64_to_cpu(sb->events);
9753 }
9754
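/*
 * Re-read the superblock of @rdev from disk, keeping the old page so
 * it can be restored if the read fails, and pick up any recovery
 * progress recorded by another node.
 */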
9755 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9756 {
9757 int err;
9758 struct page *swapout = rdev->sb_page;
9759 struct mdp_superblock_1 *sb;
9760
9761 /*
9762  * Keep the current superblock page in 'swapout' so it can be
9763  * restored if re-reading the superblock fails. */
9764 rdev->sb_page = NULL;
9765 err = alloc_disk_sb(rdev);
9766 if (err == 0) {
9767 ClearPageUptodate(rdev->sb_page);
9768 rdev->sb_loaded = 0;
9769 err = super_types[mddev->major_version].
9770 load_super(rdev, NULL, mddev->minor_version);
9771 }
9772 if (err < 0) {
9773 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9774 __func__, __LINE__, rdev->desc_nr, err);
9775 if (rdev->sb_page)
9776 put_page(rdev->sb_page);
9777 rdev->sb_page = swapout;
9778 rdev->sb_loaded = 1;
9779 return err;
9780 }
9781
9782 sb = page_address(rdev->sb_page);
9783
9784 /*
9785  * Pick up the recovery offset recorded by the other node, if
9786  * the superblock says one is valid. */
9787 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9788 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9789
9790 /*
9791  * If the other node finished the recovery, activate the device
9792  * locally so mddev->degraded is updated accordingly. */
9793 if (rdev->recovery_offset == MaxSector &&
9794 !test_bit(In_sync, &rdev->flags) &&
9795 mddev->pers->spare_active(mddev))
9796 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9797
9798 put_page(swapout);
9799 return 0;
9800 }
9801
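/*
 * Reload the superblock of the device with descriptor number @nr and
 * apply any changes made by another cluster node, then refresh the
 * remaining devices' superblocks as well.
 */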
9802 void md_reload_sb(struct mddev *mddev, int nr)
9803 {
9804 struct md_rdev *rdev = NULL, *iter;
9805 int err;
9806
9807
9808 rdev_for_each_rcu(iter, mddev) {
9809 if (iter->desc_nr == nr) {
9810 rdev = iter;
9811 break;
9812 }
9813 }
9814
9815 if (!rdev) {
9816 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9817 return;
9818 }
9819
9820 err = read_rdev(mddev, rdev);
9821 if (err < 0)
9822 return;
9823
9824 check_sb_changes(mddev, rdev);
9825
9826
9827 rdev_for_each_rcu(rdev, mddev) {
9828 if (!test_bit(Faulty, &rdev->flags))
9829 read_rdev(mddev, rdev);
9830 }
9831 }
9832 EXPORT_SYMBOL(md_reload_sb);
9833
9834 #ifndef MODULE
9835
9836 /*
9837  * At boot time, devices detected as RAID autodetect partitions are
9838  * queued here and assembled later by md_autostart_arrays().
9839  */
9840
9841 static DEFINE_MUTEX(detected_devices_mutex);
9842 static LIST_HEAD(all_detected_devices);
9843 struct detected_devices_node {
9844 struct list_head list;
9845 dev_t dev;
9846 };
9847
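/*
 * Record a device discovered during partition scanning so that
 * md_autostart_arrays() can try to assemble it into an array later.
 */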
9848 void md_autodetect_dev(dev_t dev)
9849 {
9850 struct detected_devices_node *node_detected_dev;
9851
9852 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9853 if (node_detected_dev) {
9854 node_detected_dev->dev = dev;
9855 mutex_lock(&detected_devices_mutex);
9856 list_add_tail(&node_detected_dev->list, &all_detected_devices);
9857 mutex_unlock(&detected_devices_mutex);
9858 }
9859 }
9860
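/*
 * Import every queued autodetected device and hand the resulting list
 * to autorun_devices() for assembly.
 */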
9861 void md_autostart_arrays(int part)
9862 {
9863 struct md_rdev *rdev;
9864 struct detected_devices_node *node_detected_dev;
9865 dev_t dev;
9866 int i_scanned, i_passed;
9867
9868 i_scanned = 0;
9869 i_passed = 0;
9870
9871 pr_info("md: Autodetecting RAID arrays.\n");
9872
9873 mutex_lock(&detected_devices_mutex);
9874 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9875 i_scanned++;
9876 node_detected_dev = list_entry(all_detected_devices.next,
9877 struct detected_devices_node, list);
9878 list_del(&node_detected_dev->list);
9879 dev = node_detected_dev->dev;
9880 kfree(node_detected_dev);
9881 mutex_unlock(&detected_devices_mutex);
9882 rdev = md_import_device(dev, 0, 90);
9883 mutex_lock(&detected_devices_mutex);
9884 if (IS_ERR(rdev))
9885 continue;
9886
9887 if (test_bit(Faulty, &rdev->flags))
9888 continue;
9889
9890 set_bit(AutoDetected, &rdev->flags);
9891 list_add(&rdev->same_set, &pending_raid_disks);
9892 i_passed++;
9893 }
9894 mutex_unlock(&detected_devices_mutex);
9895
9896 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9897
9898 autorun_devices(part);
9899 }
9900
9901 #endif
9902
9903 static __exit void md_exit(void)
9904 {
9905 struct mddev *mddev, *n;
9906 int delay = 1;
9907
9908 unregister_blkdev(MD_MAJOR, "md");
9909 unregister_blkdev(mdp_major, "mdp");
9910 unregister_reboot_notifier(&md_notifier);
9911 unregister_sysctl_table(raid_table_header);
9912
9913 /*
9914  * We cannot unload while some process is still waiting on
9915  * md_event_waiters; keep waking them until none remain. */
9916 md_unloading = 1;
9917 while (waitqueue_active(&md_event_waiters)) {
9918
9919 wake_up(&md_event_waiters);
9920 msleep(delay);
9921 delay += delay;
9922 }
9923 remove_proc_entry("mdstat", NULL);
9924
9925 spin_lock(&all_mddevs_lock);
9926 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
9927 if (!mddev_get(mddev))
9928 continue;
9929 spin_unlock(&all_mddevs_lock);
9930 export_array(mddev);
9931 mddev->ctime = 0;
9932 mddev->hold_active = 0;
9933 /*
9934  * The mddev is now fully cleared; mddev_put() will schedule it
9935  * for destruction on a workqueue, and the destroy_workqueue()
9936  * calls below wait for that work to complete.
9937  */
9938 mddev_put(mddev);
9939 spin_lock(&all_mddevs_lock);
9940 }
9941 spin_unlock(&all_mddevs_lock);
9942
9943 destroy_workqueue(md_rdev_misc_wq);
9944 destroy_workqueue(md_misc_wq);
9945 destroy_workqueue(md_wq);
9946 }
9947
9948 subsys_initcall(md_init);
9949 module_exit(md_exit)
9950
9951 static int get_ro(char *buffer, const struct kernel_param *kp)
9952 {
9953 return sprintf(buffer, "%d\n", start_readonly);
9954 }
9955 static int set_ro(const char *val, const struct kernel_param *kp)
9956 {
9957 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9958 }
9959
9960 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9961 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9962 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9963 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
9964
9965 MODULE_LICENSE("GPL");
9966 MODULE_DESCRIPTION("MD RAID framework");
9967 MODULE_ALIAS("md");
9968 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);