/*
 * raid5.c : Multiple Devices driver for Linux
 *
 * RAID-4/5/6 management functions.
 */

0038 #include <linux/blkdev.h>
0039 #include <linux/kthread.h>
0040 #include <linux/raid/pq.h>
0041 #include <linux/async_tx.h>
0042 #include <linux/module.h>
0043 #include <linux/async.h>
0044 #include <linux/seq_file.h>
0045 #include <linux/cpu.h>
0046 #include <linux/slab.h>
0047 #include <linux/ratelimit.h>
0048 #include <linux/nodemask.h>
0049
0050 #include <trace/events/block.h>
0051 #include <linux/list_sort.h>
0052
0053 #include "md.h"
0054 #include "raid5.h"
0055 #include "raid0.h"
0056 #include "md-bitmap.h"
0057 #include "raid5-log.h"
0058
0059 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
0060
0061 #define cpu_to_group(cpu) cpu_to_node(cpu)
0062 #define ANY_GROUP NUMA_NO_NODE
0063
0064 #define RAID5_MAX_REQ_STRIPES 256
0065
0066 static bool devices_handle_discard_safely = false;
0067 module_param(devices_handle_discard_safely, bool, 0644);
0068 MODULE_PARM_DESC(devices_handle_discard_safely,
0069 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
0070 static struct workqueue_struct *raid5_wq;
0071
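/*
 * Map a stripe's sector to its bucket in the stripe hash table and to the
 * matching entry in the hash_locks array that protects that bucket.
 */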
0072 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
0073 {
0074 int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
0075 return &conf->stripe_hashtbl[hash];
0076 }
0077
0078 static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
0079 {
0080 return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
0081 }
0082
0083 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
0084 __acquires(&conf->device_lock)
0085 {
0086 spin_lock_irq(conf->hash_locks + hash);
0087 spin_lock(&conf->device_lock);
0088 }
0089
0090 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
0091 __releases(&conf->device_lock)
0092 {
0093 spin_unlock(&conf->device_lock);
0094 spin_unlock_irq(conf->hash_locks + hash);
0095 }
0096
0097 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
0098 __acquires(&conf->device_lock)
0099 {
0100 int i;
0101 spin_lock_irq(conf->hash_locks);
0102 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
0103 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
0104 spin_lock(&conf->device_lock);
0105 }
0106
0107 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
0108 __releases(&conf->device_lock)
0109 {
0110 int i;
0111 spin_unlock(&conf->device_lock);
0112 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
0113 spin_unlock(conf->hash_locks + i);
0114 spin_unlock_irq(conf->hash_locks);
0115 }
0116
/* Find first data disk in a raid6 stripe */
0118 static inline int raid6_d0(struct stripe_head *sh)
0119 {
0120 if (sh->ddf_layout)
/* ddf always starts from the first device */
0122 return 0;
/* md starts just after the Q block */
0124 if (sh->qd_idx == sh->disks - 1)
0125 return 0;
0126 else
0127 return sh->qd_idx + 1;
0128 }
0129 static inline int raid6_next_disk(int disk, int raid_disks)
0130 {
0131 disk++;
0132 return (disk < raid_disks) ? disk : 0;
0133 }
0134
/*
 * We need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
0140 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
0141 int *count, int syndrome_disks)
0142 {
0143 int slot = *count;
0144
0145 if (sh->ddf_layout)
0146 (*count)++;
0147 if (idx == sh->pd_idx)
0148 return syndrome_disks;
0149 if (idx == sh->qd_idx)
0150 return syndrome_disks + 1;
0151 if (!sh->ddf_layout)
0152 (*count)++;
0153 return slot;
0154 }
0155
0156 static void print_raid5_conf (struct r5conf *conf);
0157
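/*
 * True while any asynchronous stripe operation (check, reconstruct,
 * biofill or compute) is still in flight for this stripe.
 */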
0158 static int stripe_operations_active(struct stripe_head *sh)
0159 {
0160 return sh->check_state || sh->reconstruct_state ||
0161 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
0162 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
0163 }
0164
0165 static bool stripe_is_lowprio(struct stripe_head *sh)
0166 {
0167 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
0168 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
0169 !test_bit(STRIPE_R5C_CACHING, &sh->state);
0170 }
0171
0172 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
0173 __must_hold(&sh->raid_conf->device_lock)
0174 {
0175 struct r5conf *conf = sh->raid_conf;
0176 struct r5worker_group *group;
0177 int thread_cnt;
0178 int i, cpu = sh->cpu;
0179
0180 if (!cpu_online(cpu)) {
0181 cpu = cpumask_any(cpu_online_mask);
0182 sh->cpu = cpu;
0183 }
0184
0185 if (list_empty(&sh->lru)) {
0186 struct r5worker_group *group;
0187 group = conf->worker_groups + cpu_to_group(cpu);
0188 if (stripe_is_lowprio(sh))
0189 list_add_tail(&sh->lru, &group->loprio_list);
0190 else
0191 list_add_tail(&sh->lru, &group->handle_list);
0192 group->stripes_cnt++;
0193 sh->group = group;
0194 }
0195
0196 if (conf->worker_cnt_per_group == 0) {
0197 md_wakeup_thread(conf->mddev->thread);
0198 return;
0199 }
0200
0201 group = conf->worker_groups + cpu_to_group(sh->cpu);
0202
0203 group->workers[0].working = true;
0204
0205 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
0206
0207 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
0208
0209 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
0210 if (group->workers[i].working == false) {
0211 group->workers[i].working = true;
0212 queue_work_on(sh->cpu, raid5_wq,
0213 &group->workers[i].work);
0214 thread_cnt--;
0215 }
0216 }
0217 }
0218
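/*
 * Requeue or retire a stripe whose last reference has just been dropped.
 * With conf->device_lock held, place the stripe on the appropriate
 * handle/delayed/bitmap list, hand it to a worker group, or move it towards
 * the inactive (or r5c cached) lists when nothing remains to be done.
 */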
0219 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
0220 struct list_head *temp_inactive_list)
0221 __must_hold(&conf->device_lock)
0222 {
0223 int i;
0224 int injournal = 0;
0225
0226 BUG_ON(!list_empty(&sh->lru));
0227 BUG_ON(atomic_read(&conf->active_stripes)==0);
0228
0229 if (r5c_is_writeback(conf->log))
0230 for (i = sh->disks; i--; )
0231 if (test_bit(R5_InJournal, &sh->dev[i].flags))
0232 injournal++;
/*
 * In the following cases, the stripe cannot be released to cached
 * lists. Therefore, we make the stripe write out and set
 * STRIPE_HANDLE:
 *   1. when quiesce in r5c write back;
 *   2. when resync is requested for the stripe.
 */
0240 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
0241 (conf->quiesce && r5c_is_writeback(conf->log) &&
0242 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
0243 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
0244 r5c_make_stripe_write_out(sh);
0245 set_bit(STRIPE_HANDLE, &sh->state);
0246 }
0247
0248 if (test_bit(STRIPE_HANDLE, &sh->state)) {
0249 if (test_bit(STRIPE_DELAYED, &sh->state) &&
0250 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
0251 list_add_tail(&sh->lru, &conf->delayed_list);
0252 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
0253 sh->bm_seq - conf->seq_write > 0)
0254 list_add_tail(&sh->lru, &conf->bitmap_list);
0255 else {
0256 clear_bit(STRIPE_DELAYED, &sh->state);
0257 clear_bit(STRIPE_BIT_DELAY, &sh->state);
0258 if (conf->worker_cnt_per_group == 0) {
0259 if (stripe_is_lowprio(sh))
0260 list_add_tail(&sh->lru,
0261 &conf->loprio_list);
0262 else
0263 list_add_tail(&sh->lru,
0264 &conf->handle_list);
0265 } else {
0266 raid5_wakeup_stripe_thread(sh);
0267 return;
0268 }
0269 }
0270 md_wakeup_thread(conf->mddev->thread);
0271 } else {
0272 BUG_ON(stripe_operations_active(sh));
0273 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
0274 if (atomic_dec_return(&conf->preread_active_stripes)
0275 < IO_THRESHOLD)
0276 md_wakeup_thread(conf->mddev->thread);
0277 atomic_dec(&conf->active_stripes);
0278 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
0279 if (!r5c_is_writeback(conf->log))
0280 list_add_tail(&sh->lru, temp_inactive_list);
0281 else {
0282 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
0283 if (injournal == 0)
0284 list_add_tail(&sh->lru, temp_inactive_list);
0285 else if (injournal == conf->raid_disks - conf->max_degraded) {
/* full stripe */
0287 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
0288 atomic_inc(&conf->r5c_cached_full_stripes);
0289 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
0290 atomic_dec(&conf->r5c_cached_partial_stripes);
0291 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
0292 r5c_check_cached_full_stripe(conf);
0293 } else
/*
 * STRIPE_R5C_PARTIAL_STRIPE is set in
 * r5c_try_caching_write(). No need to
 * set it again.
 */
0299 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
0300 }
0301 }
0302 }
0303 }
0304
0305 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
0306 struct list_head *temp_inactive_list)
0307 __must_hold(&conf->device_lock)
0308 {
0309 if (atomic_dec_and_test(&sh->count))
0310 do_release_stripe(conf, sh, temp_inactive_list);
0311 }
0312
/*
 * Splice the stripes collected on temp_inactive_list back onto the per-hash
 * inactive lists and wake up any waiters.
 *
 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is an
 * array covering every hash bucket; otherwise only a single list is spliced.
 */
0320 static void release_inactive_stripe_list(struct r5conf *conf,
0321 struct list_head *temp_inactive_list,
0322 int hash)
0323 {
0324 int size;
0325 bool do_wakeup = false;
0326 unsigned long flags;
0327
0328 if (hash == NR_STRIPE_HASH_LOCKS) {
0329 size = NR_STRIPE_HASH_LOCKS;
0330 hash = NR_STRIPE_HASH_LOCKS - 1;
0331 } else
0332 size = 1;
0333 while (size) {
0334 struct list_head *list = &temp_inactive_list[size - 1];
0335
/*
 * We don't hold any lock here yet, raid5_get_active_stripe() might
 * remove stripes from the list
 */
0340 if (!list_empty_careful(list)) {
0341 spin_lock_irqsave(conf->hash_locks + hash, flags);
0342 if (list_empty(conf->inactive_list + hash) &&
0343 !list_empty(list))
0344 atomic_dec(&conf->empty_inactive_list_nr);
0345 list_splice_tail_init(list, conf->inactive_list + hash);
0346 do_wakeup = true;
0347 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
0348 }
0349 size--;
0350 hash--;
0351 }
0352
0353 if (do_wakeup) {
0354 wake_up(&conf->wait_for_stripe);
0355 if (atomic_read(&conf->active_stripes) == 0)
0356 wake_up(&conf->wait_for_quiescent);
0357 if (conf->retry_read_aligned)
0358 md_wakeup_thread(conf->mddev->thread);
0359 }
0360 }
0361
0362 static int release_stripe_list(struct r5conf *conf,
0363 struct list_head *temp_inactive_list)
0364 __must_hold(&conf->device_lock)
0365 {
0366 struct stripe_head *sh, *t;
0367 int count = 0;
0368 struct llist_node *head;
0369
0370 head = llist_del_all(&conf->released_stripes);
0371 head = llist_reverse_order(head);
0372 llist_for_each_entry_safe(sh, t, head, release_list) {
0373 int hash;
0374
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
0376 smp_mb();
0377 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
/*
 * Don't worry that the bit is set here, because if the bit is set
 * again, the count is always > 1. This is true for the
 * STRIPE_ON_UNPLUG_LIST bit too.
 */
0383 hash = sh->hash_lock_index;
0384 __release_stripe(conf, sh, &temp_inactive_list[hash]);
0385 count++;
0386 }
0387
0388 return count;
0389 }
0390
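/*
 * Drop a reference to a stripe.  The fast path decrements the count or queues
 * the stripe on the lockless released_stripes list for the md thread; the
 * slow path takes device_lock and releases the stripe directly.
 */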
0391 void raid5_release_stripe(struct stripe_head *sh)
0392 {
0393 struct r5conf *conf = sh->raid_conf;
0394 unsigned long flags;
0395 struct list_head list;
0396 int hash;
0397 bool wakeup;
0398
/* Avoid release_list until the last reference. */
0401 if (atomic_add_unless(&sh->count, -1, 1))
0402 return;
0403
0404 if (unlikely(!conf->mddev->thread) ||
0405 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
0406 goto slow_path;
0407 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
0408 if (wakeup)
0409 md_wakeup_thread(conf->mddev->thread);
0410 return;
0411 slow_path:
/* we are ok here whether STRIPE_ON_RELEASE_LIST is set or not */
0413 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
0414 INIT_LIST_HEAD(&list);
0415 hash = sh->hash_lock_index;
0416 do_release_stripe(conf, sh, &list);
0417 spin_unlock_irqrestore(&conf->device_lock, flags);
0418 release_inactive_stripe_list(conf, &list, hash);
0419 }
0420 }
0421
0422 static inline void remove_hash(struct stripe_head *sh)
0423 {
0424 pr_debug("remove_hash(), stripe %llu\n",
0425 (unsigned long long)sh->sector);
0426
0427 hlist_del_init(&sh->hash);
0428 }
0429
0430 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
0431 {
0432 struct hlist_head *hp = stripe_hash(conf, sh->sector);
0433
0434 pr_debug("insert_hash(), stripe %llu\n",
0435 (unsigned long long)sh->sector);
0436
0437 hlist_add_head(&sh->hash, hp);
0438 }
0439
/* find an idle stripe, remove it from the hash, and return it */
0441 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
0442 {
0443 struct stripe_head *sh = NULL;
0444 struct list_head *first;
0445
0446 if (list_empty(conf->inactive_list + hash))
0447 goto out;
0448 first = (conf->inactive_list + hash)->next;
0449 sh = list_entry(first, struct stripe_head, lru);
0450 list_del_init(first);
0451 remove_hash(sh);
0452 atomic_inc(&conf->active_stripes);
0453 BUG_ON(hash != sh->hash_lock_index);
0454 if (list_empty(conf->inactive_list + hash))
0455 atomic_inc(&conf->empty_inactive_list_nr);
0456 out:
0457 return sh;
0458 }
0459
0460 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
0461 static void free_stripe_pages(struct stripe_head *sh)
0462 {
0463 int i;
0464 struct page *p;
0465
/* The page pool has not been allocated yet */
0467 if (!sh->pages)
0468 return;
0469
0470 for (i = 0; i < sh->nr_pages; i++) {
0471 p = sh->pages[i];
0472 if (p)
0473 put_page(p);
0474 sh->pages[i] = NULL;
0475 }
0476 }
0477
0478 static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
0479 {
0480 int i;
0481 struct page *p;
0482
0483 for (i = 0; i < sh->nr_pages; i++) {
/* This page has already been allocated. */
0485 if (sh->pages[i])
0486 continue;
0487
0488 p = alloc_page(gfp);
0489 if (!p) {
0490 free_stripe_pages(sh);
0491 return -ENOMEM;
0492 }
0493 sh->pages[i] = p;
0494 }
0495 return 0;
0496 }
0497
0498 static int
0499 init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
0500 {
0501 int nr_pages, cnt;
0502
0503 if (sh->pages)
0504 return 0;
0505
/* Each of the sh->dev[i] needs one conf->stripe_size */
0507 cnt = PAGE_SIZE / conf->stripe_size;
0508 nr_pages = (disks + cnt - 1) / cnt;
0509
0510 sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
0511 if (!sh->pages)
0512 return -ENOMEM;
0513 sh->nr_pages = nr_pages;
0514 sh->stripes_per_page = cnt;
0515 return 0;
0516 }
0517 #endif
0518
0519 static void shrink_buffers(struct stripe_head *sh)
0520 {
0521 int i;
0522 int num = sh->raid_conf->pool_size;
0523
0524 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
0525 for (i = 0; i < num ; i++) {
0526 struct page *p;
0527
0528 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
0529 p = sh->dev[i].page;
0530 if (!p)
0531 continue;
0532 sh->dev[i].page = NULL;
0533 put_page(p);
0534 }
0535 #else
0536 for (i = 0; i < num; i++)
0537 sh->dev[i].page = NULL;
0538 free_stripe_pages(sh);
0539 #endif
0540 }
0541
0542 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
0543 {
0544 int i;
0545 int num = sh->raid_conf->pool_size;
0546
0547 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
0548 for (i = 0; i < num; i++) {
0549 struct page *page;
0550
0551 if (!(page = alloc_page(gfp))) {
0552 return 1;
0553 }
0554 sh->dev[i].page = page;
0555 sh->dev[i].orig_page = page;
0556 sh->dev[i].offset = 0;
0557 }
0558 #else
0559 if (alloc_stripe_pages(sh, gfp))
0560 return -ENOMEM;
0561
0562 for (i = 0; i < num; i++) {
0563 sh->dev[i].page = raid5_get_dev_page(sh, i);
0564 sh->dev[i].orig_page = sh->dev[i].page;
0565 sh->dev[i].offset = raid5_get_page_offset(sh, i);
0566 }
0567 #endif
0568 return 0;
0569 }
0570
0571 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
0572 struct stripe_head *sh);
0573
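/* (Re)initialise an inactive stripe_head for a new sector before use. */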
0574 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
0575 {
0576 struct r5conf *conf = sh->raid_conf;
0577 int i, seq;
0578
0579 BUG_ON(atomic_read(&sh->count) != 0);
0580 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
0581 BUG_ON(stripe_operations_active(sh));
0582 BUG_ON(sh->batch_head);
0583
0584 pr_debug("init_stripe called, stripe %llu\n",
0585 (unsigned long long)sector);
0586 retry:
0587 seq = read_seqcount_begin(&conf->gen_lock);
0588 sh->generation = conf->generation - previous;
0589 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
0590 sh->sector = sector;
0591 stripe_set_idx(sector, conf, previous, sh);
0592 sh->state = 0;
0593
0594 for (i = sh->disks; i--; ) {
0595 struct r5dev *dev = &sh->dev[i];
0596
0597 if (dev->toread || dev->read || dev->towrite || dev->written ||
0598 test_bit(R5_LOCKED, &dev->flags)) {
0599 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
0600 (unsigned long long)sh->sector, i, dev->toread,
0601 dev->read, dev->towrite, dev->written,
0602 test_bit(R5_LOCKED, &dev->flags));
0603 WARN_ON(1);
0604 }
0605 dev->flags = 0;
0606 dev->sector = raid5_compute_blocknr(sh, i, previous);
0607 }
0608 if (read_seqcount_retry(&conf->gen_lock, seq))
0609 goto retry;
0610 sh->overwrite_disks = 0;
0611 insert_hash(conf, sh);
0612 sh->cpu = smp_processor_id();
0613 set_bit(STRIPE_BATCH_READY, &sh->state);
0614 }
0615
0616 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
0617 short generation)
0618 {
0619 struct stripe_head *sh;
0620
0621 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
0622 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
0623 if (sh->sector == sector && sh->generation == generation)
0624 return sh;
0625 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
0626 return NULL;
0627 }
0628
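/*
 * Look up a stripe in the hash table and take a reference to it.  If the
 * reference count was zero, the stripe is removed from whichever list
 * (inactive, handle, worker group) it was resting on, under device_lock.
 */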
0629 static struct stripe_head *find_get_stripe(struct r5conf *conf,
0630 sector_t sector, short generation, int hash)
0631 {
0632 int inc_empty_inactive_list_flag;
0633 struct stripe_head *sh;
0634
0635 sh = __find_stripe(conf, sector, generation);
0636 if (!sh)
0637 return NULL;
0638
0639 if (atomic_inc_not_zero(&sh->count))
0640 return sh;
0641
/*
 * Slow path. The reference count is zero which means the stripe must
 * be on a list (sh->lru). Must remove the stripe from the list that
 * references it with the device_lock held.
 */

0648 spin_lock(&conf->device_lock);
0649 if (!atomic_read(&sh->count)) {
0650 if (!test_bit(STRIPE_HANDLE, &sh->state))
0651 atomic_inc(&conf->active_stripes);
0652 BUG_ON(list_empty(&sh->lru) &&
0653 !test_bit(STRIPE_EXPANDING, &sh->state));
0654 inc_empty_inactive_list_flag = 0;
0655 if (!list_empty(conf->inactive_list + hash))
0656 inc_empty_inactive_list_flag = 1;
0657 list_del_init(&sh->lru);
0658 if (list_empty(conf->inactive_list + hash) &&
0659 inc_empty_inactive_list_flag)
0660 atomic_inc(&conf->empty_inactive_list_nr);
0661 if (sh->group) {
0662 sh->group->stripes_cnt--;
0663 sh->group = NULL;
0664 }
0665 }
0666 atomic_inc(&sh->count);
0667 spin_unlock(&conf->device_lock);
0668
0669 return sh;
0670 }
0671
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
0689 int raid5_calc_degraded(struct r5conf *conf)
0690 {
0691 int degraded, degraded2;
0692 int i;
0693
0694 rcu_read_lock();
0695 degraded = 0;
0696 for (i = 0; i < conf->previous_raid_disks; i++) {
0697 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
0698 if (rdev && test_bit(Faulty, &rdev->flags))
0699 rdev = rcu_dereference(conf->disks[i].replacement);
0700 if (!rdev || test_bit(Faulty, &rdev->flags))
0701 degraded++;
0702 else if (test_bit(In_sync, &rdev->flags))
0703 ;
0704 else
/* not in-sync or faulty.
 * If the reshape increases the number of devices,
 * this is being recovered by the reshape, so
 * this 'previous' section is not in_sync.
 * If the number of devices is being reduced however,
 * the device can only be part of the array if
 * we are reverting a reshape, so this section will
 * be in-sync.
 */
0714 if (conf->raid_disks >= conf->previous_raid_disks)
0715 degraded++;
0716 }
0717 rcu_read_unlock();
0718 if (conf->raid_disks == conf->previous_raid_disks)
0719 return degraded;
0720 rcu_read_lock();
0721 degraded2 = 0;
0722 for (i = 0; i < conf->raid_disks; i++) {
0723 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
0724 if (rdev && test_bit(Faulty, &rdev->flags))
0725 rdev = rcu_dereference(conf->disks[i].replacement);
0726 if (!rdev || test_bit(Faulty, &rdev->flags))
0727 degraded2++;
0728 else if (test_bit(In_sync, &rdev->flags))
0729 ;
0730 else
/* not in-sync or faulty.
 * If reshape increases the number of devices, this
 * section has already been recovered, else it
 * almost certainly hasn't.
 */
0736 if (conf->raid_disks <= conf->previous_raid_disks)
0737 degraded2++;
0738 }
0739 rcu_read_unlock();
0740 if (degraded2 > degraded)
0741 return degraded2;
0742 return degraded;
0743 }
0744
0745 static bool has_failed(struct r5conf *conf)
0746 {
0747 int degraded = conf->mddev->degraded;
0748
0749 if (test_bit(MD_BROKEN, &conf->mddev->flags))
0750 return true;
0751
0752 if (conf->mddev->reshape_position != MaxSector)
0753 degraded = raid5_calc_degraded(conf);
0754
0755 return degraded > conf->max_degraded;
0756 }
0757
0758 enum stripe_result {
0759 STRIPE_SUCCESS = 0,
0760 STRIPE_RETRY,
0761 STRIPE_SCHEDULE_AND_RETRY,
0762 STRIPE_FAIL,
0763 };
0764
struct stripe_request_ctx {
	/* a reference to the last stripe_head for batching */
	struct stripe_head *batch_last;

	/* first sector in the request */
	sector_t first_sector;

	/* last sector in the request */
	sector_t last_sector;

	/*
	 * bitmap to track stripe sectors that have been added to stripes
	 * add one bit for max_degraded so we can flag the last stripe
	 */
	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);

	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
	bool do_flush;
};
0784
/*
 * Block until another thread clears R5_INACTIVE_BLOCKED or
 * there are fewer than 3/4 the maximum number of active stripes
 * and there is an inactive stripe available.
 */
0790 static bool is_inactive_blocked(struct r5conf *conf, int hash)
0791 {
0792 int active = atomic_read(&conf->active_stripes);
0793
0794 if (list_empty(conf->inactive_list + hash))
0795 return false;
0796
0797 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
0798 return true;
0799
0800 return active < (conf->max_nr_stripes * 3 / 4);
0801 }
0802
0803 static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
0804 struct stripe_request_ctx *ctx, sector_t sector,
0805 bool previous, bool noblock, bool noquiesce)
0806 {
0807 struct stripe_head *sh;
0808 int hash = stripe_hash_locks_hash(conf, sector);
0809
0810 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
0811
0812 spin_lock_irq(conf->hash_locks + hash);
0813
0814 retry:
0815 if (!noquiesce && conf->quiesce) {
/*
 * Must release the reference to batch_last before waiting,
 * on quiesce, otherwise the batch_last will hold a reference
 * to a stripe and raid5_quiesce() will deadlock waiting for
 * active_stripes to go to zero.
 */
0822 if (ctx && ctx->batch_last) {
0823 raid5_release_stripe(ctx->batch_last);
0824 ctx->batch_last = NULL;
0825 }
0826
0827 wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
0828 *(conf->hash_locks + hash));
0829 }
0830
0831 sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
0832 if (sh)
0833 goto out;
0834
0835 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
0836 goto wait_for_stripe;
0837
0838 sh = get_free_stripe(conf, hash);
0839 if (sh) {
0840 r5c_check_stripe_cache_usage(conf);
0841 init_stripe(sh, sector, previous);
0842 atomic_inc(&sh->count);
0843 goto out;
0844 }
0845
0846 if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
0847 set_bit(R5_ALLOC_MORE, &conf->cache_state);
0848
0849 wait_for_stripe:
0850 if (noblock)
0851 goto out;
0852
0853 set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
0854 r5l_wake_reclaim(conf->log, 0);
0855 wait_event_lock_irq(conf->wait_for_stripe,
0856 is_inactive_blocked(conf, hash),
0857 *(conf->hash_locks + hash));
0858 clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
0859 goto retry;
0860
0861 out:
0862 spin_unlock_irq(conf->hash_locks + hash);
0863 return sh;
0864 }
0865
0866 struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
0867 sector_t sector, bool previous, bool noblock, bool noquiesce)
0868 {
0869 return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
0870 noquiesce);
0871 }
0872
0873 static bool is_full_stripe_write(struct stripe_head *sh)
0874 {
0875 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
0876 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
0877 }
0878
0879 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
0880 __acquires(&sh1->stripe_lock)
0881 __acquires(&sh2->stripe_lock)
0882 {
0883 if (sh1 > sh2) {
0884 spin_lock_irq(&sh2->stripe_lock);
0885 spin_lock_nested(&sh1->stripe_lock, 1);
0886 } else {
0887 spin_lock_irq(&sh1->stripe_lock);
0888 spin_lock_nested(&sh2->stripe_lock, 1);
0889 }
0890 }
0891
0892 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
0893 __releases(&sh1->stripe_lock)
0894 __releases(&sh2->stripe_lock)
0895 {
0896 spin_unlock(&sh1->stripe_lock);
0897 spin_unlock_irq(&sh2->stripe_lock);
0898 }
0899
/* Only a fresh, full-stripe normal write stripe can be added to a batch list */
0901 static bool stripe_can_batch(struct stripe_head *sh)
0902 {
0903 struct r5conf *conf = sh->raid_conf;
0904
0905 if (raid5_has_log(conf) || raid5_has_ppl(conf))
0906 return false;
0907 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
0908 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
0909 is_full_stripe_write(sh);
0910 }
0911
0912
0913 static void stripe_add_to_batch_list(struct r5conf *conf,
0914 struct stripe_head *sh, struct stripe_head *last_sh)
0915 {
0916 struct stripe_head *head;
0917 sector_t head_sector, tmp_sec;
0918 int hash;
0919 int dd_idx;
0920
/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
0922 tmp_sec = sh->sector;
0923 if (!sector_div(tmp_sec, conf->chunk_sectors))
0924 return;
0925 head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
0926
0927 if (last_sh && head_sector == last_sh->sector) {
0928 head = last_sh;
0929 atomic_inc(&head->count);
0930 } else {
0931 hash = stripe_hash_locks_hash(conf, head_sector);
0932 spin_lock_irq(conf->hash_locks + hash);
0933 head = find_get_stripe(conf, head_sector, conf->generation,
0934 hash);
0935 spin_unlock_irq(conf->hash_locks + hash);
0936 if (!head)
0937 return;
0938 if (!stripe_can_batch(head))
0939 goto out;
0940 }
0941
0942 lock_two_stripes(head, sh);
0943
0944 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
0945 goto unlock_out;
0946
0947 if (sh->batch_head)
0948 goto unlock_out;
0949
0950 dd_idx = 0;
0951 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
0952 dd_idx++;
0953 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
0954 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
0955 goto unlock_out;
0956
0957 if (head->batch_head) {
0958 spin_lock(&head->batch_head->batch_lock);
/* This batch list is already running */
0960 if (!stripe_can_batch(head)) {
0961 spin_unlock(&head->batch_head->batch_lock);
0962 goto unlock_out;
0963 }
0964
/*
 * We must assign batch_head of this stripe within the
 * batch_lock, otherwise clear_batch_ready of the batch head
 * stripe could clear the BATCH_READY bit of this stripe if
 * this stripe is added to a batch list before we return.
 */
0971 sh->batch_head = head->batch_head;
0972
/*
 * at this point, head's BATCH_READY could be cleared, but we
 * can still add the stripe to the batch list
 */
0977 list_add(&sh->batch_list, &head->batch_list);
0978 spin_unlock(&head->batch_head->batch_lock);
0979 } else {
0980 head->batch_head = head;
0981 sh->batch_head = head->batch_head;
0982 spin_lock(&head->batch_lock);
0983 list_add_tail(&sh->batch_list, &head->batch_list);
0984 spin_unlock(&head->batch_lock);
0985 }
0986
0987 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
0988 if (atomic_dec_return(&conf->preread_active_stripes)
0989 < IO_THRESHOLD)
0990 md_wakeup_thread(conf->mddev->thread);
0991
0992 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
0993 int seq = sh->bm_seq;
0994 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
0995 sh->batch_head->bm_seq > seq)
0996 seq = sh->batch_head->bm_seq;
0997 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
0998 sh->batch_head->bm_seq = seq;
0999 }
1000
1001 atomic_inc(&sh->count);
1002 unlock_out:
1003 unlock_two_stripes(head, sh);
1004 out:
1005 raid5_release_stripe(head);
1006 }
1007
/*
 * Decide whether I/O for this stripe should use the new (post-reshape)
 * data offset or the old one.
 */
1011 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
1012 {
1013 sector_t progress = conf->reshape_progress;
1014
/* Need a memory barrier to make sure we see the value of
 * conf->generation, or ->data_offset, that was set before
 * reshape_progress was updated.
 */
1018 smp_rmb();
1019 if (progress == MaxSector)
1020 return 0;
1021 if (sh->generation == conf->generation - 1)
1022 return 0;
1023
/* A reshape is in progress and this stripe belongs to the current
 * generation, so the new data offset applies.
 */
1026 return 1;
1027 }
1028
1029 static void dispatch_bio_list(struct bio_list *tmp)
1030 {
1031 struct bio *bio;
1032
1033 while ((bio = bio_list_pop(tmp)))
1034 submit_bio_noacct(bio);
1035 }
1036
1037 static int cmp_stripe(void *priv, const struct list_head *a,
1038 const struct list_head *b)
1039 {
1040 const struct r5pending_data *da = list_entry(a,
1041 struct r5pending_data, sibling);
1042 const struct r5pending_data *db = list_entry(b,
1043 struct r5pending_data, sibling);
1044 if (da->sector > db->sector)
1045 return 1;
1046 if (da->sector < db->sector)
1047 return -1;
1048 return 0;
1049 }
1050
1051 static void dispatch_defer_bios(struct r5conf *conf, int target,
1052 struct bio_list *list)
1053 {
1054 struct r5pending_data *data;
1055 struct list_head *first, *next = NULL;
1056 int cnt = 0;
1057
1058 if (conf->pending_data_cnt == 0)
1059 return;
1060
1061 list_sort(NULL, &conf->pending_list, cmp_stripe);
1062
1063 first = conf->pending_list.next;
1064
/* temporarily move the head */
1066 if (conf->next_pending_data)
1067 list_move_tail(&conf->pending_list,
1068 &conf->next_pending_data->sibling);
1069
1070 while (!list_empty(&conf->pending_list)) {
1071 data = list_first_entry(&conf->pending_list,
1072 struct r5pending_data, sibling);
1073 if (&data->sibling == first)
1074 first = data->sibling.next;
1075 next = data->sibling.next;
1076
1077 bio_list_merge(list, &data->bios);
1078 list_move(&data->sibling, &conf->free_list);
1079 cnt++;
1080 if (cnt >= target)
1081 break;
1082 }
1083 conf->pending_data_cnt -= cnt;
1084 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1085
1086 if (next != &conf->pending_list)
1087 conf->next_pending_data = list_entry(next,
1088 struct r5pending_data, sibling);
1089 else
1090 conf->next_pending_data = NULL;
1091
1092 if (first != &conf->pending_list)
1093 list_move_tail(&conf->pending_list, first);
1094 }
1095
1096 static void flush_deferred_bios(struct r5conf *conf)
1097 {
1098 struct bio_list tmp = BIO_EMPTY_LIST;
1099
1100 if (conf->pending_data_cnt == 0)
1101 return;
1102
1103 spin_lock(&conf->pending_bios_lock);
1104 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1105 BUG_ON(conf->pending_data_cnt != 0);
1106 spin_unlock(&conf->pending_bios_lock);
1107
1108 dispatch_bio_list(&tmp);
1109 }
1110
1111 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1112 struct bio_list *bios)
1113 {
1114 struct bio_list tmp = BIO_EMPTY_LIST;
1115 struct r5pending_data *ent;
1116
1117 spin_lock(&conf->pending_bios_lock);
1118 ent = list_first_entry(&conf->free_list, struct r5pending_data,
1119 sibling);
1120 list_move_tail(&ent->sibling, &conf->pending_list);
1121 ent->sector = sector;
1122 bio_list_init(&ent->bios);
1123 bio_list_merge(&ent->bios, bios);
1124 conf->pending_data_cnt++;
1125 if (conf->pending_data_cnt >= PENDING_IO_MAX)
1126 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1127
1128 spin_unlock(&conf->pending_bios_lock);
1129
1130 dispatch_bio_list(&tmp);
1131 }
1132
1133 static void
1134 raid5_end_read_request(struct bio *bi);
1135 static void
1136 raid5_end_write_request(struct bio *bi);
1137
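/*
 * Issue the actual reads and writes for this stripe (and any stripes batched
 * behind it) to the member devices, honouring replacement devices, bad
 * blocks and the deferred-bio batching path.
 */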
1138 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1139 {
1140 struct r5conf *conf = sh->raid_conf;
1141 int i, disks = sh->disks;
1142 struct stripe_head *head_sh = sh;
1143 struct bio_list pending_bios = BIO_EMPTY_LIST;
1144 struct r5dev *dev;
1145 bool should_defer;
1146
1147 might_sleep();
1148
1149 if (log_stripe(sh, s) == 0)
1150 return;
1151
1152 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1153
1154 for (i = disks; i--; ) {
1155 enum req_op op;
1156 blk_opf_t op_flags = 0;
1157 int replace_only = 0;
1158 struct bio *bi, *rbi;
1159 struct md_rdev *rdev, *rrdev = NULL;
1160
1161 sh = head_sh;
1162 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1163 op = REQ_OP_WRITE;
1164 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1165 op_flags = REQ_FUA;
1166 if (test_bit(R5_Discard, &sh->dev[i].flags))
1167 op = REQ_OP_DISCARD;
1168 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1169 op = REQ_OP_READ;
1170 else if (test_and_clear_bit(R5_WantReplace,
1171 &sh->dev[i].flags)) {
1172 op = REQ_OP_WRITE;
1173 replace_only = 1;
1174 } else
1175 continue;
1176 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1177 op_flags |= REQ_SYNC;
1178
1179 again:
1180 dev = &sh->dev[i];
1181 bi = &dev->req;
1182 rbi = &dev->rreq;
1183
1184 rcu_read_lock();
1185 rrdev = rcu_dereference(conf->disks[i].replacement);
1186 smp_mb();
1187 rdev = rcu_dereference(conf->disks[i].rdev);
1188 if (!rdev) {
1189 rdev = rrdev;
1190 rrdev = NULL;
1191 }
1192 if (op_is_write(op)) {
1193 if (replace_only)
1194 rdev = NULL;
1195 if (rdev == rrdev)
/* We raced and saw duplicates */
1197 rrdev = NULL;
1198 } else {
1199 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1200 rdev = rrdev;
1201 rrdev = NULL;
1202 }
1203
1204 if (rdev && test_bit(Faulty, &rdev->flags))
1205 rdev = NULL;
1206 if (rdev)
1207 atomic_inc(&rdev->nr_pending);
1208 if (rrdev && test_bit(Faulty, &rrdev->flags))
1209 rrdev = NULL;
1210 if (rrdev)
1211 atomic_inc(&rrdev->nr_pending);
1212 rcu_read_unlock();
1213
/*
 * We have already checked bad blocks for reads.  Now
 * need to check for writes.  We never accept write errors
 * on the replacement, so we don't need to check rrdev.
 */
1218 while (op_is_write(op) && rdev &&
1219 test_bit(WriteErrorSeen, &rdev->flags)) {
1220 sector_t first_bad;
1221 int bad_sectors;
1222 int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1223 &first_bad, &bad_sectors);
1224 if (!bad)
1225 break;
1226
1227 if (bad < 0) {
1228 set_bit(BlockedBadBlocks, &rdev->flags);
1229 if (!conf->mddev->external &&
1230 conf->mddev->sb_flags) {
/*
 * It is very unlikely, but we might still need to write out the
 * bad block log - better give it a chance.
 */
1235 md_check_recovery(conf->mddev);
1236 }
/*
 * Because md_wait_for_blocked_rdev
 * will dec nr_pending, we must
 * increment it first.
 */
1242 atomic_inc(&rdev->nr_pending);
1243 md_wait_for_blocked_rdev(rdev, conf->mddev);
1244 } else {
/* Acknowledged bad block - skip the write */
1246 rdev_dec_pending(rdev, conf->mddev);
1247 rdev = NULL;
1248 }
1249 }
1250
1251 if (rdev) {
1252 if (s->syncing || s->expanding || s->expanded
1253 || s->replacing)
1254 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1255
1256 set_bit(STRIPE_IO_STARTED, &sh->state);
1257
1258 bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
1259 bi->bi_end_io = op_is_write(op)
1260 ? raid5_end_write_request
1261 : raid5_end_read_request;
1262 bi->bi_private = sh;
1263
1264 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1265 __func__, (unsigned long long)sh->sector,
1266 bi->bi_opf, i);
1267 atomic_inc(&sh->count);
1268 if (sh != head_sh)
1269 atomic_inc(&head_sh->count);
1270 if (use_new_offset(conf, sh))
1271 bi->bi_iter.bi_sector = (sh->sector
1272 + rdev->new_data_offset);
1273 else
1274 bi->bi_iter.bi_sector = (sh->sector
1275 + rdev->data_offset);
1276 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1277 bi->bi_opf |= REQ_NOMERGE;
1278
1279 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1280 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1281
1282 if (!op_is_write(op) &&
1283 test_bit(R5_InJournal, &sh->dev[i].flags))
/*
 * issuing a read for a page in the journal, this
 * must be preparing for prexor in rmw; read
 * the data into orig_page
 */
1289 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1290 else
1291 sh->dev[i].vec.bv_page = sh->dev[i].page;
1292 bi->bi_vcnt = 1;
1293 bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1294 bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1295 bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
/*
 * If this is a discard request, set bi_vcnt to 0. We don't
 * want to confuse SCSI because SCSI will replace the payload
 */
1300 if (op == REQ_OP_DISCARD)
1301 bi->bi_vcnt = 0;
1302 if (rrdev)
1303 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1304
1305 if (conf->mddev->gendisk)
1306 trace_block_bio_remap(bi,
1307 disk_devt(conf->mddev->gendisk),
1308 sh->dev[i].sector);
1309 if (should_defer && op_is_write(op))
1310 bio_list_add(&pending_bios, bi);
1311 else
1312 submit_bio_noacct(bi);
1313 }
1314 if (rrdev) {
1315 if (s->syncing || s->expanding || s->expanded
1316 || s->replacing)
1317 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1318
1319 set_bit(STRIPE_IO_STARTED, &sh->state);
1320
1321 bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
1322 BUG_ON(!op_is_write(op));
1323 rbi->bi_end_io = raid5_end_write_request;
1324 rbi->bi_private = sh;
1325
1326 pr_debug("%s: for %llu schedule op %d on "
1327 "replacement disc %d\n",
1328 __func__, (unsigned long long)sh->sector,
1329 rbi->bi_opf, i);
1330 atomic_inc(&sh->count);
1331 if (sh != head_sh)
1332 atomic_inc(&head_sh->count);
1333 if (use_new_offset(conf, sh))
1334 rbi->bi_iter.bi_sector = (sh->sector
1335 + rrdev->new_data_offset);
1336 else
1337 rbi->bi_iter.bi_sector = (sh->sector
1338 + rrdev->data_offset);
1339 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1340 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1341 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1342 rbi->bi_vcnt = 1;
1343 rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1344 rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1345 rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
/*
 * If this is a discard request, set bi_vcnt to 0. We don't
 * want to confuse SCSI because SCSI will replace the payload
 */
1350 if (op == REQ_OP_DISCARD)
1351 rbi->bi_vcnt = 0;
1352 if (conf->mddev->gendisk)
1353 trace_block_bio_remap(rbi,
1354 disk_devt(conf->mddev->gendisk),
1355 sh->dev[i].sector);
1356 if (should_defer && op_is_write(op))
1357 bio_list_add(&pending_bios, rbi);
1358 else
1359 submit_bio_noacct(rbi);
1360 }
1361 if (!rdev && !rrdev) {
1362 if (op_is_write(op))
1363 set_bit(STRIPE_DEGRADED, &sh->state);
1364 pr_debug("skip op %d on disc %d for sector %llu\n",
1365 bi->bi_opf, i, (unsigned long long)sh->sector);
1366 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1367 set_bit(STRIPE_HANDLE, &sh->state);
1368 }
1369
1370 if (!head_sh->batch_head)
1371 continue;
1372 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1373 batch_list);
1374 if (sh != head_sh)
1375 goto again;
1376 }
1377
1378 if (should_defer && !bio_list_empty(&pending_bios))
1379 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1380 }
1381
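/*
 * Copy data between a bio and a stripe cache page with the async_tx API,
 * optionally borrowing the bio page directly when skip_copy is allowed.
 */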
1382 static struct dma_async_tx_descriptor *
1383 async_copy_data(int frombio, struct bio *bio, struct page **page,
1384 unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1385 struct stripe_head *sh, int no_skipcopy)
1386 {
1387 struct bio_vec bvl;
1388 struct bvec_iter iter;
1389 struct page *bio_page;
1390 int page_offset;
1391 struct async_submit_ctl submit;
1392 enum async_tx_flags flags = 0;
1393 struct r5conf *conf = sh->raid_conf;
1394
1395 if (bio->bi_iter.bi_sector >= sector)
1396 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1397 else
1398 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1399
1400 if (frombio)
1401 flags |= ASYNC_TX_FENCE;
1402 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1403
1404 bio_for_each_segment(bvl, bio, iter) {
1405 int len = bvl.bv_len;
1406 int clen;
1407 int b_offset = 0;
1408
1409 if (page_offset < 0) {
1410 b_offset = -page_offset;
1411 page_offset += b_offset;
1412 len -= b_offset;
1413 }
1414
1415 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1416 clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1417 else
1418 clen = len;
1419
1420 if (clen > 0) {
1421 b_offset += bvl.bv_offset;
1422 bio_page = bvl.bv_page;
1423 if (frombio) {
1424 if (conf->skip_copy &&
1425 b_offset == 0 && page_offset == 0 &&
1426 clen == RAID5_STRIPE_SIZE(conf) &&
1427 !no_skipcopy)
1428 *page = bio_page;
1429 else
1430 tx = async_memcpy(*page, bio_page, page_offset + poff,
1431 b_offset, clen, &submit);
1432 } else
1433 tx = async_memcpy(bio_page, *page, b_offset,
1434 page_offset + poff, clen, &submit);
1435 }
1436
1437 submit.depend_tx = tx;
1438
1439 if (clen < len)
1440 break;
1441 page_offset += len;
1442 }
1443
1444 return tx;
1445 }
1446
1447 static void ops_complete_biofill(void *stripe_head_ref)
1448 {
1449 struct stripe_head *sh = stripe_head_ref;
1450 int i;
1451 struct r5conf *conf = sh->raid_conf;
1452
1453 pr_debug("%s: stripe %llu\n", __func__,
1454 (unsigned long long)sh->sector);
1455
/* clear completed biofills */
1457 for (i = sh->disks; i--; ) {
1458 struct r5dev *dev = &sh->dev[i];
1459
/* acknowledge completion of a biofill operation */
/* and check if we need to reply to a read request,
 * new R5_Wantfill requests are held off until
 * !STRIPE_BIOFILL_RUN
 */
1465 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1466 struct bio *rbi, *rbi2;
1467
1468 BUG_ON(!dev->read);
1469 rbi = dev->read;
1470 dev->read = NULL;
1471 while (rbi && rbi->bi_iter.bi_sector <
1472 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1473 rbi2 = r5_next_bio(conf, rbi, dev->sector);
1474 bio_endio(rbi);
1475 rbi = rbi2;
1476 }
1477 }
1478 }
1479 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1480
1481 set_bit(STRIPE_HANDLE, &sh->state);
1482 raid5_release_stripe(sh);
1483 }
1484
1485 static void ops_run_biofill(struct stripe_head *sh)
1486 {
1487 struct dma_async_tx_descriptor *tx = NULL;
1488 struct async_submit_ctl submit;
1489 int i;
1490 struct r5conf *conf = sh->raid_conf;
1491
1492 BUG_ON(sh->batch_head);
1493 pr_debug("%s: stripe %llu\n", __func__,
1494 (unsigned long long)sh->sector);
1495
1496 for (i = sh->disks; i--; ) {
1497 struct r5dev *dev = &sh->dev[i];
1498 if (test_bit(R5_Wantfill, &dev->flags)) {
1499 struct bio *rbi;
1500 spin_lock_irq(&sh->stripe_lock);
1501 dev->read = rbi = dev->toread;
1502 dev->toread = NULL;
1503 spin_unlock_irq(&sh->stripe_lock);
1504 while (rbi && rbi->bi_iter.bi_sector <
1505 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1506 tx = async_copy_data(0, rbi, &dev->page,
1507 dev->offset,
1508 dev->sector, tx, sh, 0);
1509 rbi = r5_next_bio(conf, rbi, dev->sector);
1510 }
1511 }
1512 }
1513
1514 atomic_inc(&sh->count);
1515 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1516 async_trigger_callback(&submit);
1517 }
1518
1519 static void mark_target_uptodate(struct stripe_head *sh, int target)
1520 {
1521 struct r5dev *tgt;
1522
1523 if (target < 0)
1524 return;
1525
1526 tgt = &sh->dev[target];
1527 set_bit(R5_UPTODATE, &tgt->flags);
1528 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1529 clear_bit(R5_Wantcompute, &tgt->flags);
1530 }
1531
1532 static void ops_complete_compute(void *stripe_head_ref)
1533 {
1534 struct stripe_head *sh = stripe_head_ref;
1535
1536 pr_debug("%s: stripe %llu\n", __func__,
1537 (unsigned long long)sh->sector);
1538
/* mark the computed target(s) as uptodate */
1540 mark_target_uptodate(sh, sh->ops.target);
1541 mark_target_uptodate(sh, sh->ops.target2);
1542
1543 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1544 if (sh->check_state == check_state_compute_run)
1545 sh->check_state = check_state_compute_result;
1546 set_bit(STRIPE_HANDLE, &sh->state);
1547 raid5_release_stripe(sh);
1548 }
1549
/* return the i-th scribble buffer for this per-cpu context */
1551 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1552 {
1553 return percpu->scribble + i * percpu->scribble_obj_size;
1554 }
1555
/* return a pointer to the address conversion region of the scribble buffer */
1557 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1558 struct raid5_percpu *percpu, int i)
1559 {
1560 return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1561 }
1562
/*
 * Return a pointer to the region of the scribble buffer that records
 * the per-device page offsets.
 */
1566 static unsigned int *
1567 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1568 {
1569 return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
1570 }
1571
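/*
 * RAID5 compute block: rebuild the single target block by XOR-ing the
 * remaining data/parity blocks of the stripe.
 */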
1572 static struct dma_async_tx_descriptor *
1573 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1574 {
1575 int disks = sh->disks;
1576 struct page **xor_srcs = to_addr_page(percpu, 0);
1577 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1578 int target = sh->ops.target;
1579 struct r5dev *tgt = &sh->dev[target];
1580 struct page *xor_dest = tgt->page;
1581 unsigned int off_dest = tgt->offset;
1582 int count = 0;
1583 struct dma_async_tx_descriptor *tx;
1584 struct async_submit_ctl submit;
1585 int i;
1586
1587 BUG_ON(sh->batch_head);
1588
1589 pr_debug("%s: stripe %llu block: %d\n",
1590 __func__, (unsigned long long)sh->sector, target);
1591 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1592
1593 for (i = disks; i--; ) {
1594 if (i != target) {
1595 off_srcs[count] = sh->dev[i].offset;
1596 xor_srcs[count++] = sh->dev[i].page;
1597 }
1598 }
1599
1600 atomic_inc(&sh->count);
1601
1602 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1603 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1604 if (unlikely(count == 1))
1605 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1606 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1607 else
1608 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1609 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1610
1611 return tx;
1612 }
1613
/*
 * set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @offs - (unsigned int) array of offsets for each page
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
1624 static int set_syndrome_sources(struct page **srcs,
1625 unsigned int *offs,
1626 struct stripe_head *sh,
1627 int srctype)
1628 {
1629 int disks = sh->disks;
1630 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1631 int d0_idx = raid6_d0(sh);
1632 int count;
1633 int i;
1634
1635 for (i = 0; i < disks; i++)
1636 srcs[i] = NULL;
1637
1638 count = 0;
1639 i = d0_idx;
1640 do {
1641 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1642 struct r5dev *dev = &sh->dev[i];
1643
1644 if (i == sh->qd_idx || i == sh->pd_idx ||
1645 (srctype == SYNDROME_SRC_ALL) ||
1646 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1647 (test_bit(R5_Wantdrain, &dev->flags) ||
1648 test_bit(R5_InJournal, &dev->flags))) ||
1649 (srctype == SYNDROME_SRC_WRITTEN &&
1650 (dev->written ||
1651 test_bit(R5_InJournal, &dev->flags)))) {
1652 if (test_bit(R5_InJournal, &dev->flags))
1653 srcs[slot] = sh->dev[i].orig_page;
1654 else
1655 srcs[slot] = sh->dev[i].page;

/*
 * For R5_InJournal, PAGE_SIZE must be 4KB and the page is
 * not shared. In that case, dev[i].offset is 0.
 */
1661 offs[slot] = sh->dev[i].offset;
1662 }
1663 i = raid6_next_disk(i, disks);
1664 } while (i != d0_idx);
1665
1666 return syndrome_disks;
1667 }
1668
1669 static struct dma_async_tx_descriptor *
1670 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1671 {
1672 int disks = sh->disks;
1673 struct page **blocks = to_addr_page(percpu, 0);
1674 unsigned int *offs = to_addr_offs(sh, percpu);
1675 int target;
1676 int qd_idx = sh->qd_idx;
1677 struct dma_async_tx_descriptor *tx;
1678 struct async_submit_ctl submit;
1679 struct r5dev *tgt;
1680 struct page *dest;
1681 unsigned int dest_off;
1682 int i;
1683 int count;
1684
1685 BUG_ON(sh->batch_head);
1686 if (sh->ops.target < 0)
1687 target = sh->ops.target2;
1688 else if (sh->ops.target2 < 0)
1689 target = sh->ops.target;
1690 else
/* we should only have one valid target */
1692 BUG();
1693 BUG_ON(target < 0);
1694 pr_debug("%s: stripe %llu block: %d\n",
1695 __func__, (unsigned long long)sh->sector, target);
1696
1697 tgt = &sh->dev[target];
1698 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1699 dest = tgt->page;
1700 dest_off = tgt->offset;
1701
1702 atomic_inc(&sh->count);
1703
1704 if (target == qd_idx) {
1705 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1706 blocks[count] = NULL;
1707 BUG_ON(blocks[count+1] != dest);
1708 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1709 ops_complete_compute, sh,
1710 to_addr_conv(sh, percpu, 0));
1711 tx = async_gen_syndrome(blocks, offs, count+2,
1712 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1713 } else {
/* Compute any data- or p-drive using XOR */
1715 count = 0;
1716 for (i = disks; i-- ; ) {
1717 if (i == target || i == qd_idx)
1718 continue;
1719 offs[count] = sh->dev[i].offset;
1720 blocks[count++] = sh->dev[i].page;
1721 }
1722
1723 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1724 NULL, ops_complete_compute, sh,
1725 to_addr_conv(sh, percpu, 0));
1726 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1727 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1728 }
1729
1730 return tx;
1731 }
1732
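/*
 * RAID6 double-failure compute: rebuild two missing blocks, choosing
 * between P+Q regeneration, D+Q, D+P or D+D recovery as appropriate.
 */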
1733 static struct dma_async_tx_descriptor *
1734 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1735 {
1736 int i, count, disks = sh->disks;
1737 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1738 int d0_idx = raid6_d0(sh);
1739 int faila = -1, failb = -1;
1740 int target = sh->ops.target;
1741 int target2 = sh->ops.target2;
1742 struct r5dev *tgt = &sh->dev[target];
1743 struct r5dev *tgt2 = &sh->dev[target2];
1744 struct dma_async_tx_descriptor *tx;
1745 struct page **blocks = to_addr_page(percpu, 0);
1746 unsigned int *offs = to_addr_offs(sh, percpu);
1747 struct async_submit_ctl submit;
1748
1749 BUG_ON(sh->batch_head);
1750 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1751 __func__, (unsigned long long)sh->sector, target, target2);
1752 BUG_ON(target < 0 || target2 < 0);
1753 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1754 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1755
/* we need to open-code set_syndrome_sources to handle the
 * slot number conversion for 'faila' and 'failb'
 */
1759 for (i = 0; i < disks ; i++) {
1760 offs[i] = 0;
1761 blocks[i] = NULL;
1762 }
1763 count = 0;
1764 i = d0_idx;
1765 do {
1766 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1767
1768 offs[slot] = sh->dev[i].offset;
1769 blocks[slot] = sh->dev[i].page;
1770
1771 if (i == target)
1772 faila = slot;
1773 if (i == target2)
1774 failb = slot;
1775 i = raid6_next_disk(i, disks);
1776 } while (i != d0_idx);
1777
1778 BUG_ON(faila == failb);
1779 if (failb < faila)
1780 swap(faila, failb);
1781 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1782 __func__, (unsigned long long)sh->sector, faila, failb);
1783
1784 atomic_inc(&sh->count);
1785
1786 if (failb == syndrome_disks+1) {
/* Q disk is one of the missing disks */
1788 if (faila == syndrome_disks) {
/* Missing P+Q, just recompute both */
1790 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1791 ops_complete_compute, sh,
1792 to_addr_conv(sh, percpu, 0));
1793 return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1794 RAID5_STRIPE_SIZE(sh->raid_conf),
1795 &submit);
1796 } else {
1797 struct page *dest;
1798 unsigned int dest_off;
1799 int data_target;
1800 int qd_idx = sh->qd_idx;
1801
/* Missing D+Q: recompute D from P, then recompute Q */
1803 if (target == qd_idx)
1804 data_target = target2;
1805 else
1806 data_target = target;
1807
1808 count = 0;
1809 for (i = disks; i-- ; ) {
1810 if (i == data_target || i == qd_idx)
1811 continue;
1812 offs[count] = sh->dev[i].offset;
1813 blocks[count++] = sh->dev[i].page;
1814 }
1815 dest = sh->dev[data_target].page;
1816 dest_off = sh->dev[data_target].offset;
1817 init_async_submit(&submit,
1818 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1819 NULL, NULL, NULL,
1820 to_addr_conv(sh, percpu, 0));
1821 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1822 RAID5_STRIPE_SIZE(sh->raid_conf),
1823 &submit);
1824
1825 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1826 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1827 ops_complete_compute, sh,
1828 to_addr_conv(sh, percpu, 0));
1829 return async_gen_syndrome(blocks, offs, count+2,
1830 RAID5_STRIPE_SIZE(sh->raid_conf),
1831 &submit);
1832 }
1833 } else {
1834 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1835 ops_complete_compute, sh,
1836 to_addr_conv(sh, percpu, 0));
1837 if (failb == syndrome_disks) {
/* We're missing D+P. */
1839 return async_raid6_datap_recov(syndrome_disks+2,
1840 RAID5_STRIPE_SIZE(sh->raid_conf),
1841 faila,
1842 blocks, offs, &submit);
1843 } else {
/* We're missing D+D. */
1845 return async_raid6_2data_recov(syndrome_disks+2,
1846 RAID5_STRIPE_SIZE(sh->raid_conf),
1847 faila, failb,
1848 blocks, offs, &submit);
1849 }
1850 }
1851 }
1852
1853 static void ops_complete_prexor(void *stripe_head_ref)
1854 {
1855 struct stripe_head *sh = stripe_head_ref;
1856
1857 pr_debug("%s: stripe %llu\n", __func__,
1858 (unsigned long long)sh->sector);
1859
1860 if (r5c_is_writeback(sh->raid_conf->log))
/*
 * raid5-cache write back uses orig_page during prexor.
 * After prexor, it is time to free orig_page
 */
1865 r5c_release_extra_page(sh);
1866 }
1867
1868 static struct dma_async_tx_descriptor *
1869 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1870 struct dma_async_tx_descriptor *tx)
1871 {
1872 int disks = sh->disks;
1873 struct page **xor_srcs = to_addr_page(percpu, 0);
1874 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1875 int count = 0, pd_idx = sh->pd_idx, i;
1876 struct async_submit_ctl submit;
1877
/* existing parity data subtracted */
1879 unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1880 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1881
1882 BUG_ON(sh->batch_head);
1883 pr_debug("%s: stripe %llu\n", __func__,
1884 (unsigned long long)sh->sector);
1885
1886 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i];
1888
1889 if (test_bit(R5_InJournal, &dev->flags)) {
/*
 * For this case, PAGE_SIZE must be equal to 4KB and
 * the page offset is zero.
 */
1894 off_srcs[count] = dev->offset;
1895 xor_srcs[count++] = dev->orig_page;
1896 } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1897 off_srcs[count] = dev->offset;
1898 xor_srcs[count++] = dev->page;
1899 }
1900 }
1901
1902 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1903 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1904 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1905 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1906
1907 return tx;
1908 }
1909
1910 static struct dma_async_tx_descriptor *
1911 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1912 struct dma_async_tx_descriptor *tx)
1913 {
1914 struct page **blocks = to_addr_page(percpu, 0);
1915 unsigned int *offs = to_addr_offs(sh, percpu);
1916 int count;
1917 struct async_submit_ctl submit;
1918
1919 pr_debug("%s: stripe %llu\n", __func__,
1920 (unsigned long long)sh->sector);
1921
1922 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1923
1924 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1925 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1926 tx = async_gen_syndrome(blocks, offs, count+2,
1927 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1928
1929 return tx;
1930 }
1931
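/*
 * Drain queued write bios into the stripe cache pages so that parity can
 * be calculated against the new data.
 */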
1932 static struct dma_async_tx_descriptor *
1933 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1934 {
1935 struct r5conf *conf = sh->raid_conf;
1936 int disks = sh->disks;
1937 int i;
1938 struct stripe_head *head_sh = sh;
1939
1940 pr_debug("%s: stripe %llu\n", __func__,
1941 (unsigned long long)sh->sector);
1942
1943 for (i = disks; i--; ) {
1944 struct r5dev *dev;
1945 struct bio *chosen;
1946
1947 sh = head_sh;
1948 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1949 struct bio *wbi;
1950
1951 again:
1952 dev = &sh->dev[i];
/*
 * clear R5_InJournal, so when rewriting a page in the
 * journal, it is not skipped by r5l_log_stripe()
 */
1957 clear_bit(R5_InJournal, &dev->flags);
1958 spin_lock_irq(&sh->stripe_lock);
1959 chosen = dev->towrite;
1960 dev->towrite = NULL;
1961 sh->overwrite_disks = 0;
1962 BUG_ON(dev->written);
1963 wbi = dev->written = chosen;
1964 spin_unlock_irq(&sh->stripe_lock);
1965 WARN_ON(dev->page != dev->orig_page);
1966
1967 while (wbi && wbi->bi_iter.bi_sector <
1968 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1969 if (wbi->bi_opf & REQ_FUA)
1970 set_bit(R5_WantFUA, &dev->flags);
1971 if (wbi->bi_opf & REQ_SYNC)
1972 set_bit(R5_SyncIO, &dev->flags);
1973 if (bio_op(wbi) == REQ_OP_DISCARD)
1974 set_bit(R5_Discard, &dev->flags);
1975 else {
1976 tx = async_copy_data(1, wbi, &dev->page,
1977 dev->offset,
1978 dev->sector, tx, sh,
1979 r5c_is_writeback(conf->log));
1980 if (dev->page != dev->orig_page &&
1981 !r5c_is_writeback(conf->log)) {
1982 set_bit(R5_SkipCopy, &dev->flags);
1983 clear_bit(R5_UPTODATE, &dev->flags);
1984 clear_bit(R5_OVERWRITE, &dev->flags);
1985 }
1986 }
1987 wbi = r5_next_bio(conf, wbi, dev->sector);
1988 }
1989
1990 if (head_sh->batch_head) {
1991 sh = list_first_entry(&sh->batch_list,
1992 struct stripe_head,
1993 batch_list);
1994 if (sh == head_sh)
1995 continue;
1996 goto again;
1997 }
1998 }
1999 }
2000
2001 return tx;
2002 }
2003
2004 static void ops_complete_reconstruct(void *stripe_head_ref)
2005 {
2006 struct stripe_head *sh = stripe_head_ref;
2007 int disks = sh->disks;
2008 int pd_idx = sh->pd_idx;
2009 int qd_idx = sh->qd_idx;
2010 int i;
2011 bool fua = false, sync = false, discard = false;
2012
2013 pr_debug("%s: stripe %llu\n", __func__,
2014 (unsigned long long)sh->sector);
2015
2016 for (i = disks; i--; ) {
2017 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
2018 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
2019 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
2020 }
2021
2022 for (i = disks; i--; ) {
2023 struct r5dev *dev = &sh->dev[i];
2024
2025 if (dev->written || i == pd_idx || i == qd_idx) {
2026 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2027 set_bit(R5_UPTODATE, &dev->flags);
2028 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2029 set_bit(R5_Expanded, &dev->flags);
2030 }
2031 if (fua)
2032 set_bit(R5_WantFUA, &dev->flags);
2033 if (sync)
2034 set_bit(R5_SyncIO, &dev->flags);
2035 }
2036 }
2037
2038 if (sh->reconstruct_state == reconstruct_state_drain_run)
2039 sh->reconstruct_state = reconstruct_state_drain_result;
2040 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2041 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2042 else {
2043 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2044 sh->reconstruct_state = reconstruct_state_result;
2045 }
2046
2047 set_bit(STRIPE_HANDLE, &sh->state);
2048 raid5_release_stripe(sh);
2049 }
2050
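/*
 * Generate new parity for a RAID5 stripe, either by XOR-ing all data
 * blocks or by applying the read-modify-write (prexor) result.
 */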
2051 static void
2052 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
2053 struct dma_async_tx_descriptor *tx)
2054 {
2055 int disks = sh->disks;
2056 struct page **xor_srcs;
2057 unsigned int *off_srcs;
2058 struct async_submit_ctl submit;
2059 int count, pd_idx = sh->pd_idx, i;
2060 struct page *xor_dest;
2061 unsigned int off_dest;
2062 int prexor = 0;
2063 unsigned long flags;
2064 int j = 0;
2065 struct stripe_head *head_sh = sh;
2066 int last_stripe;
2067
2068 pr_debug("%s: stripe %llu\n", __func__,
2069 (unsigned long long)sh->sector);
2070
2071 for (i = 0; i < sh->disks; i++) {
2072 if (pd_idx == i)
2073 continue;
2074 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2075 break;
2076 }
2077 if (i >= sh->disks) {
2078 atomic_inc(&sh->count);
2079 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2080 ops_complete_reconstruct(sh);
2081 return;
2082 }
2083 again:
2084 count = 0;
2085 xor_srcs = to_addr_page(percpu, j);
2086 off_srcs = to_addr_offs(sh, percpu);
2087
/* check if prexor is active, which means we only process blocks
 * that are part of a read-modify-write (written)
 */
2090 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2091 prexor = 1;
2092 off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2093 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2094 for (i = disks; i--; ) {
2095 struct r5dev *dev = &sh->dev[i];
2096 if (head_sh->dev[i].written ||
2097 test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2098 off_srcs[count] = dev->offset;
2099 xor_srcs[count++] = dev->page;
2100 }
2101 }
2102 } else {
2103 xor_dest = sh->dev[pd_idx].page;
2104 off_dest = sh->dev[pd_idx].offset;
2105 for (i = disks; i--; ) {
2106 struct r5dev *dev = &sh->dev[i];
2107 if (i != pd_idx) {
2108 off_srcs[count] = dev->offset;
2109 xor_srcs[count++] = dev->page;
2110 }
2111 }
2112 }
2113
/*
 * 1/ if we prexor'd then the dest is reused as a source
 * 2/ if we did not prexor then we are redoing the parity
 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 * for the synchronous xor case
 */
2119 last_stripe = !head_sh->batch_head ||
2120 list_first_entry(&sh->batch_list,
2121 struct stripe_head, batch_list) == head_sh;
2122 if (last_stripe) {
2123 flags = ASYNC_TX_ACK |
2124 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2125
2126 atomic_inc(&head_sh->count);
2127 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2128 to_addr_conv(sh, percpu, j));
2129 } else {
2130 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2131 init_async_submit(&submit, flags, tx, NULL, NULL,
2132 to_addr_conv(sh, percpu, j));
2133 }
2134
2135 if (unlikely(count == 1))
2136 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2137 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2138 else
2139 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2140 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2141 if (!last_stripe) {
2142 j++;
2143 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2144 batch_list);
2145 goto again;
2146 }
2147 }
2148
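/*
 * RAID6 counterpart of ops_run_reconstruct5(): regenerate the P/Q syndrome
 * from the chosen sources (all blocks, or only the written ones after a
 * prexor) via async_gen_syndrome(), again iterating over the batch list.
 */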
2149 static void
2150 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2151 struct dma_async_tx_descriptor *tx)
2152 {
2153 struct async_submit_ctl submit;
2154 struct page **blocks;
2155 unsigned int *offs;
2156 int count, i, j = 0;
2157 struct stripe_head *head_sh = sh;
2158 int last_stripe;
2159 int synflags;
2160 unsigned long txflags;
2161
2162 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2163
2164 for (i = 0; i < sh->disks; i++) {
2165 if (sh->pd_idx == i || sh->qd_idx == i)
2166 continue;
2167 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2168 break;
2169 }
2170 if (i >= sh->disks) {
2171 atomic_inc(&sh->count);
2172 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2173 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2174 ops_complete_reconstruct(sh);
2175 return;
2176 }
2177
2178 again:
2179 blocks = to_addr_page(percpu, j);
2180 offs = to_addr_offs(sh, percpu);
2181
2182 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2183 synflags = SYNDROME_SRC_WRITTEN;
2184 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2185 } else {
2186 synflags = SYNDROME_SRC_ALL;
2187 txflags = ASYNC_TX_ACK;
2188 }
2189
2190 count = set_syndrome_sources(blocks, offs, sh, synflags);
2191 last_stripe = !head_sh->batch_head ||
2192 list_first_entry(&sh->batch_list,
2193 struct stripe_head, batch_list) == head_sh;
2194
2195 if (last_stripe) {
2196 atomic_inc(&head_sh->count);
2197 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2198 head_sh, to_addr_conv(sh, percpu, j));
2199 } else
2200 init_async_submit(&submit, 0, tx, NULL, NULL,
2201 to_addr_conv(sh, percpu, j));
2202 tx = async_gen_syndrome(blocks, offs, count+2,
2203 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2204 if (!last_stripe) {
2205 j++;
2206 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2207 batch_list);
2208 goto again;
2209 }
2210 }
2211
2212 static void ops_complete_check(void *stripe_head_ref)
2213 {
2214 struct stripe_head *sh = stripe_head_ref;
2215
2216 pr_debug("%s: stripe %llu\n", __func__,
2217 (unsigned long long)sh->sector);
2218
2219 sh->check_state = check_state_check_result;
2220 set_bit(STRIPE_HANDLE, &sh->state);
2221 raid5_release_stripe(sh);
2222 }
2223
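/*
 * Verify RAID4/5 parity: xor the parity block together with all data blocks
 * and record the result in sh->ops.zero_sum_result, then trigger
 * ops_complete_check().
 */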
2224 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2225 {
2226 int disks = sh->disks;
2227 int pd_idx = sh->pd_idx;
2228 int qd_idx = sh->qd_idx;
2229 struct page *xor_dest;
2230 unsigned int off_dest;
2231 struct page **xor_srcs = to_addr_page(percpu, 0);
2232 unsigned int *off_srcs = to_addr_offs(sh, percpu);
2233 struct dma_async_tx_descriptor *tx;
2234 struct async_submit_ctl submit;
2235 int count;
2236 int i;
2237
2238 pr_debug("%s: stripe %llu\n", __func__,
2239 (unsigned long long)sh->sector);
2240
2241 BUG_ON(sh->batch_head);
2242 count = 0;
2243 xor_dest = sh->dev[pd_idx].page;
2244 off_dest = sh->dev[pd_idx].offset;
2245 off_srcs[count] = off_dest;
2246 xor_srcs[count++] = xor_dest;
2247 for (i = disks; i--; ) {
2248 if (i == pd_idx || i == qd_idx)
2249 continue;
2250 off_srcs[count] = sh->dev[i].offset;
2251 xor_srcs[count++] = sh->dev[i].page;
2252 }
2253
2254 init_async_submit(&submit, 0, NULL, NULL, NULL,
2255 to_addr_conv(sh, percpu, 0));
2256 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2257 RAID5_STRIPE_SIZE(sh->raid_conf),
2258 &sh->ops.zero_sum_result, &submit);
2259
2260 atomic_inc(&sh->count);
2261 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2262 tx = async_trigger_callback(&submit);
2263 }
2264
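/*
 * Verify the RAID6 syndrome with async_syndrome_val().  @checkp selects
 * whether P is validated in addition to Q; the outcome is recorded in
 * sh->ops.zero_sum_result.
 */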
2265 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2266 {
2267 struct page **srcs = to_addr_page(percpu, 0);
2268 unsigned int *offs = to_addr_offs(sh, percpu);
2269 struct async_submit_ctl submit;
2270 int count;
2271
2272 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2273 (unsigned long long)sh->sector, checkp);
2274
2275 BUG_ON(sh->batch_head);
2276 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2277 if (!checkp)
2278 srcs[count] = NULL;
2279
2280 atomic_inc(&sh->count);
2281 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2282 sh, to_addr_conv(sh, percpu, 0));
2283 async_syndrome_val(srcs, offs, count+2,
2284 RAID5_STRIPE_SIZE(sh->raid_conf),
2285 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2286 }
2287
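/*
 * Dispatch the asynchronous operations requested in @ops_request for this
 * stripe (biofill, compute, prexor, biodrain, reconstruct, check) using the
 * per-cpu scribble buffers, then wake any R5_Overlap waiters.
 */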
2288 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2289 {
2290 int overlap_clear = 0, i, disks = sh->disks;
2291 struct dma_async_tx_descriptor *tx = NULL;
2292 struct r5conf *conf = sh->raid_conf;
2293 int level = conf->level;
2294 struct raid5_percpu *percpu;
2295
2296 local_lock(&conf->percpu->lock);
2297 percpu = this_cpu_ptr(conf->percpu);
2298 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2299 ops_run_biofill(sh);
2300 overlap_clear++;
2301 }
2302
2303 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2304 if (level < 6)
2305 tx = ops_run_compute5(sh, percpu);
2306 else {
2307 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2308 tx = ops_run_compute6_1(sh, percpu);
2309 else
2310 tx = ops_run_compute6_2(sh, percpu);
2311 }
2312
2313 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2314 async_tx_ack(tx);
2315 }
2316
2317 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2318 if (level < 6)
2319 tx = ops_run_prexor5(sh, percpu, tx);
2320 else
2321 tx = ops_run_prexor6(sh, percpu, tx);
2322 }
2323
2324 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2325 tx = ops_run_partial_parity(sh, percpu, tx);
2326
2327 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2328 tx = ops_run_biodrain(sh, tx);
2329 overlap_clear++;
2330 }
2331
2332 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2333 if (level < 6)
2334 ops_run_reconstruct5(sh, percpu, tx);
2335 else
2336 ops_run_reconstruct6(sh, percpu, tx);
2337 }
2338
2339 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2340 if (sh->check_state == check_state_run)
2341 ops_run_check_p(sh, percpu);
2342 else if (sh->check_state == check_state_run_q)
2343 ops_run_check_pq(sh, percpu, 0);
2344 else if (sh->check_state == check_state_run_pq)
2345 ops_run_check_pq(sh, percpu, 1);
2346 else
2347 BUG();
2348 }
2349
2350 if (overlap_clear && !sh->batch_head) {
2351 for (i = disks; i--; ) {
2352 struct r5dev *dev = &sh->dev[i];
2353 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2354 wake_up(&sh->raid_conf->wait_for_overlap);
2355 }
2356 }
2357 local_unlock(&conf->percpu->lock);
2358 }
2359
2360 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2361 {
2362 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2363 kfree(sh->pages);
2364 #endif
2365 if (sh->ppl_page)
2366 __free_page(sh->ppl_page);
2367 kmem_cache_free(sc, sh);
2368 }
2369
2370 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2371 int disks, struct r5conf *conf)
2372 {
2373 struct stripe_head *sh;
2374
2375 sh = kmem_cache_zalloc(sc, gfp);
2376 if (sh) {
2377 spin_lock_init(&sh->stripe_lock);
2378 spin_lock_init(&sh->batch_lock);
2379 INIT_LIST_HEAD(&sh->batch_list);
2380 INIT_LIST_HEAD(&sh->lru);
2381 INIT_LIST_HEAD(&sh->r5c);
2382 INIT_LIST_HEAD(&sh->log_list);
2383 atomic_set(&sh->count, 1);
2384 sh->raid_conf = conf;
2385 sh->log_start = MaxSector;
2386
2387 if (raid5_has_ppl(conf)) {
2388 sh->ppl_page = alloc_page(gfp);
2389 if (!sh->ppl_page) {
2390 free_stripe(sc, sh);
2391 return NULL;
2392 }
2393 }
2394 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2395 if (init_stripe_shared_pages(sh, conf, disks)) {
2396 free_stripe(sc, sh);
2397 return NULL;
2398 }
2399 #endif
2400 }
2401 return sh;
2402 }
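/*
 * Allocate one stripe_head plus its data buffers and release it onto the
 * inactive lists, growing the stripe cache by a single entry.
 */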
2403 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2404 {
2405 struct stripe_head *sh;
2406
2407 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2408 if (!sh)
2409 return 0;
2410
2411 if (grow_buffers(sh, gfp)) {
2412 shrink_buffers(sh);
2413 free_stripe(conf->slab_cache, sh);
2414 return 0;
2415 }
2416 sh->hash_lock_index =
2417 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2418
2419 atomic_inc(&conf->active_stripes);
2420
2421 raid5_release_stripe(sh);
2422 conf->max_nr_stripes++;
2423 return 1;
2424 }
2425
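/*
 * Create the stripe_head slab cache for this array and populate it with
 * @num stripes.  Returns 0 on success, 1 on allocation failure.
 */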
2426 static int grow_stripes(struct r5conf *conf, int num)
2427 {
2428 struct kmem_cache *sc;
2429 size_t namelen = sizeof(conf->cache_name[0]);
2430 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2431
2432 if (conf->mddev->gendisk)
2433 snprintf(conf->cache_name[0], namelen,
2434 "raid%d-%s", conf->level, mdname(conf->mddev));
2435 else
2436 snprintf(conf->cache_name[0], namelen,
2437 "raid%d-%p", conf->level, conf->mddev);
2438 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2439
2440 conf->active_name = 0;
2441 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2442 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2443 0, 0, NULL);
2444 if (!sc)
2445 return 1;
2446 conf->slab_cache = sc;
2447 conf->pool_size = devs;
2448 while (num--)
2449 if (!grow_one_stripe(conf, GFP_KERNEL))
2450 return 1;
2451
2452 return 0;
2453 }
2454
2455 /**
2456  * scribble_alloc - allocate percpu scribble buffer for required size
2457  *                  of the scribble region
2458  * @percpu: from for_each_present_cpu() of the caller
2459  * @num: total number of disks in the array
2460  * @cnt: scribble objs count for required size of the scribble region
2461  *
2462  * The scribble buffer size must be enough to contain:
2463  * 1/ a struct page pointer for each device in the array +2, and
2464  * 2/ room to convert each entry in (1) to its corresponding dma
2465  *    (dma_map_page()) or page (page_address()) address.
2466  *
2467  * The +2 is for the destination buffers of the ddf/raid6 case where we
2468  * calculate over all devices (not just the data blocks), using zeros in
2469  * place of the P and Q blocks.
2470  */
2471 static int scribble_alloc(struct raid5_percpu *percpu,
2472 int num, int cnt)
2473 {
2474 size_t obj_size =
2475 sizeof(struct page *) * (num + 2) +
2476 sizeof(addr_conv_t) * (num + 2) +
2477 sizeof(unsigned int) * (num + 2);
2478 void *scribble;
2479
2480 /*
2481  * If this is called in raid array suspend context, we are already in
2482  * memalloc noio context, so there is no risk of recursive memory
2483  * reclaim I/O even though GFP_KERNEL is used for the allocation below.
2484  */
2485 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2486 if (!scribble)
2487 return -ENOMEM;
2488
2489 kvfree(percpu->scribble);
2490
2491 percpu->scribble = scribble;
2492 percpu->scribble_obj_size = obj_size;
2493 return 0;
2494 }
2495
2496 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2497 {
2498 unsigned long cpu;
2499 int err = 0;
2500
2501 /*
2502  * Never shrink.  And mddev_suspend() could deadlock if this is called
2503  * from raid5d.  In that case, scribble_disks and scribble_sectors
2504  * should already equal new_disks and new_sectors.
2505  */
2506 if (conf->scribble_disks >= new_disks &&
2507 conf->scribble_sectors >= new_sectors)
2508 return 0;
2509 mddev_suspend(conf->mddev);
2510 cpus_read_lock();
2511
2512 for_each_present_cpu(cpu) {
2513 struct raid5_percpu *percpu;
2514
2515 percpu = per_cpu_ptr(conf->percpu, cpu);
2516 err = scribble_alloc(percpu, new_disks,
2517 new_sectors / RAID5_STRIPE_SECTORS(conf));
2518 if (err)
2519 break;
2520 }
2521
2522 cpus_read_unlock();
2523 mddev_resume(conf->mddev);
2524 if (!err) {
2525 conf->scribble_disks = new_disks;
2526 conf->scribble_sectors = new_sectors;
2527 }
2528 return err;
2529 }
2530
2531 static int resize_stripes(struct r5conf *conf, int newsize)
2532 {
2533 /* Make all the stripes able to hold 'newsize' devices.
2534  * New slots in each stripe get 'page' set to a new page.
2535  *
2536  * This happens in stages:
2537  * 1/ create a new kmem_cache and allocate the required number of
2538  *    stripe_heads.
2539  * 2/ gather all the old stripe_heads and transfer the pages across
2540  *    to the new stripe_heads.  This will have the side effect of
2541  *    freeing the old kmem_cache.
2542  * 3/ while all the stripes are held (so the array is completely
2543  *    stalled), resize conf->disks to make room for the extra
2544  *    devices.
2545  * 4/ allocate new pages for the new slots in the new stripe_heads
2546  *    and release each one back into active service.
2547  *    If a page allocation fails we don't try to shrink the
2548  *    stripe_heads back down again; they are simply left as they
2549  *    are and an error is returned.
2550  *
2551  * Once step 2 has started we cannot afford to wait for a write,
2552  * so GFP_NOIO allocations are used from that point on.
2553  */
2556 struct stripe_head *osh, *nsh;
2557 LIST_HEAD(newstripes);
2558 struct disk_info *ndisks;
2559 int err = 0;
2560 struct kmem_cache *sc;
2561 int i;
2562 int hash, cnt;
2563
2564 md_allow_write(conf->mddev);
2565
2566 /* Step 1 */
2567 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2568 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2569 0, 0, NULL);
2570 if (!sc)
2571 return -ENOMEM;
2572
2573
2574 mutex_lock(&conf->cache_size_mutex);
2575
2576 for (i = conf->max_nr_stripes; i; i--) {
2577 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2578 if (!nsh)
2579 break;
2580
2581 list_add(&nsh->lru, &newstripes);
2582 }
2583 if (i) {
2584
2585 while (!list_empty(&newstripes)) {
2586 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2587 list_del(&nsh->lru);
2588 free_stripe(sc, nsh);
2589 }
2590 kmem_cache_destroy(sc);
2591 mutex_unlock(&conf->cache_size_mutex);
2592 return -ENOMEM;
2593 }
2594
2595 /* Step 2 - Must use GFP_NOIO now.
2596  * OK, we have enough stripes, start collecting inactive
2597  * stripes and copying them over */
2598 hash = 0;
2599 cnt = 0;
2600 list_for_each_entry(nsh, &newstripes, lru) {
2601 lock_device_hash_lock(conf, hash);
2602 wait_event_cmd(conf->wait_for_stripe,
2603 !list_empty(conf->inactive_list + hash),
2604 unlock_device_hash_lock(conf, hash),
2605 lock_device_hash_lock(conf, hash));
2606 osh = get_free_stripe(conf, hash);
2607 unlock_device_hash_lock(conf, hash);
2608
2609 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2610 for (i = 0; i < osh->nr_pages; i++) {
2611 nsh->pages[i] = osh->pages[i];
2612 osh->pages[i] = NULL;
2613 }
2614 #endif
2615 for (i = 0; i < conf->pool_size; i++) {
2616 nsh->dev[i].page = osh->dev[i].page;
2617 nsh->dev[i].orig_page = osh->dev[i].page;
2618 nsh->dev[i].offset = osh->dev[i].offset;
2619 }
2620 nsh->hash_lock_index = hash;
2621 free_stripe(conf->slab_cache, osh);
2622 cnt++;
2623 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2624 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2625 hash++;
2626 cnt = 0;
2627 }
2628 }
2629 kmem_cache_destroy(conf->slab_cache);
2630
2631 /* Step 3.
2632  * At this point, we are holding all the stripes so the array
2633  * is completely stalled, so now is a good time to resize
2634  * conf->disks and the scribble region
2635  */
2636 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2637 if (ndisks) {
2638 for (i = 0; i < conf->pool_size; i++)
2639 ndisks[i] = conf->disks[i];
2640
2641 for (i = conf->pool_size; i < newsize; i++) {
2642 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2643 if (!ndisks[i].extra_page)
2644 err = -ENOMEM;
2645 }
2646
2647 if (err) {
2648 for (i = conf->pool_size; i < newsize; i++)
2649 if (ndisks[i].extra_page)
2650 put_page(ndisks[i].extra_page);
2651 kfree(ndisks);
2652 } else {
2653 kfree(conf->disks);
2654 conf->disks = ndisks;
2655 }
2656 } else
2657 err = -ENOMEM;
2658
2659 conf->slab_cache = sc;
2660 conf->active_name = 1-conf->active_name;
2661
2662 /* Step 4, return new stripes to service */
2663 while (!list_empty(&newstripes)) {
2664 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2665 list_del_init(&nsh->lru);
2666
2667 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2668 for (i = 0; i < nsh->nr_pages; i++) {
2669 if (nsh->pages[i])
2670 continue;
2671 nsh->pages[i] = alloc_page(GFP_NOIO);
2672 if (!nsh->pages[i])
2673 err = -ENOMEM;
2674 }
2675
2676 for (i = conf->raid_disks; i < newsize; i++) {
2677 if (nsh->dev[i].page)
2678 continue;
2679 nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2680 nsh->dev[i].orig_page = nsh->dev[i].page;
2681 nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2682 }
2683 #else
2684 for (i=conf->raid_disks; i < newsize; i++)
2685 if (nsh->dev[i].page == NULL) {
2686 struct page *p = alloc_page(GFP_NOIO);
2687 nsh->dev[i].page = p;
2688 nsh->dev[i].orig_page = p;
2689 nsh->dev[i].offset = 0;
2690 if (!p)
2691 err = -ENOMEM;
2692 }
2693 #endif
2694 raid5_release_stripe(nsh);
2695 }
2696
2697 /* critical section pass, GFP_NOIO no longer needed */
2698 if (!err)
2699 conf->pool_size = newsize;
2700 mutex_unlock(&conf->cache_size_mutex);
2701
2702 return err;
2703 }
2704
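/*
 * Free one inactive stripe, shrinking the cache by a single entry.
 * Returns 1 if a stripe was released, 0 if none was available.
 */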
2705 static int drop_one_stripe(struct r5conf *conf)
2706 {
2707 struct stripe_head *sh;
2708 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2709
2710 spin_lock_irq(conf->hash_locks + hash);
2711 sh = get_free_stripe(conf, hash);
2712 spin_unlock_irq(conf->hash_locks + hash);
2713 if (!sh)
2714 return 0;
2715 BUG_ON(atomic_read(&sh->count));
2716 shrink_buffers(sh);
2717 free_stripe(conf->slab_cache, sh);
2718 atomic_dec(&conf->active_stripes);
2719 conf->max_nr_stripes--;
2720 return 1;
2721 }
2722
2723 static void shrink_stripes(struct r5conf *conf)
2724 {
2725 while (conf->max_nr_stripes &&
2726 drop_one_stripe(conf))
2727 ;
2728
2729 kmem_cache_destroy(conf->slab_cache);
2730 conf->slab_cache = NULL;
2731 }
2732
2733 /*
2734  * This helper wraps rcu_dereference_protected() and can be used when
2735  * it is known that the nr_pending of the rdev is elevated.
2736  */
2737 static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
2738 {
2739 return rcu_dereference_protected(rdev,
2740 atomic_read(&rcu_access_pointer(rdev)->nr_pending));
2741 }
2742
2743 /*
2744  * This helper wraps rcu_dereference_protected() and should be used
2745  * when it is known that the mddev_lock() is held.  This is safe
2746  * because the rdev cannot disappear while the reconfig mutex is held.
2747  */
2748 static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
2749 struct md_rdev __rcu *rdev)
2750 {
2751 return rcu_dereference_protected(rdev,
2752 lockdep_is_held(&mddev->reconfig_mutex));
2753 }
2754
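/*
 * Completion handler for stripe reads: on success clear any earlier read
 * error state; on failure count the error and either arrange a retry
 * (possibly as a rewrite) or record bad blocks / fail the device.
 */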
2755 static void raid5_end_read_request(struct bio * bi)
2756 {
2757 struct stripe_head *sh = bi->bi_private;
2758 struct r5conf *conf = sh->raid_conf;
2759 int disks = sh->disks, i;
2760 struct md_rdev *rdev = NULL;
2761 sector_t s;
2762
2763 for (i = 0; i < disks; i++)
2764 if (bi == &sh->dev[i].req)
2765 break;
2766
2767 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2768 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2769 bi->bi_status);
2770 if (i == disks) {
2771 BUG();
2772 return;
2773 }
2774 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2775 /* If replacement finished while this request was outstanding,
2776  * 'replacement' might be NULL already.
2777  * In that case it moved down to 'rdev'.
2778  * rdev is not removed until all requests are finished.
2779  */
2780 rdev = rdev_pend_deref(conf->disks[i].replacement);
2781 if (!rdev)
2782 rdev = rdev_pend_deref(conf->disks[i].rdev);
2783
2784 if (use_new_offset(conf, sh))
2785 s = sh->sector + rdev->new_data_offset;
2786 else
2787 s = sh->sector + rdev->data_offset;
2788 if (!bi->bi_status) {
2789 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2790 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2791 /* Note that this cannot happen on a
2792  * replacement device.  We just fail those on
2793  * any error
2794  */
2795 pr_info_ratelimited(
2796 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2797 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2798 (unsigned long long)s,
2799 rdev->bdev);
2800 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2801 clear_bit(R5_ReadError, &sh->dev[i].flags);
2802 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2803 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2804 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2805
2806 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2807 /*
2808  * end read for a page in journal, this
2809  * must be preparing for prexor in rmw
2810  */
2811 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2812
2813 if (atomic_read(&rdev->read_errors))
2814 atomic_set(&rdev->read_errors, 0);
2815 } else {
2816 int retry = 0;
2817 int set_bad = 0;
2818
2819 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2820 if (!(bi->bi_status == BLK_STS_PROTECTION))
2821 atomic_inc(&rdev->read_errors);
2822 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2823 pr_warn_ratelimited(
2824 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2825 mdname(conf->mddev),
2826 (unsigned long long)s,
2827 rdev->bdev);
2828 else if (conf->mddev->degraded >= conf->max_degraded) {
2829 set_bad = 1;
2830 pr_warn_ratelimited(
2831 "md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2832 mdname(conf->mddev),
2833 (unsigned long long)s,
2834 rdev->bdev);
2835 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2836
2837 set_bad = 1;
2838 pr_warn_ratelimited(
2839 "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2840 mdname(conf->mddev),
2841 (unsigned long long)s,
2842 rdev->bdev);
2843 } else if (atomic_read(&rdev->read_errors)
2844 > conf->max_nr_stripes) {
2845 if (!test_bit(Faulty, &rdev->flags)) {
2846 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2847 mdname(conf->mddev),
2848 atomic_read(&rdev->read_errors),
2849 conf->max_nr_stripes);
2850 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2851 mdname(conf->mddev), rdev->bdev);
2852 }
2853 } else
2854 retry = 1;
2855 if (set_bad && test_bit(In_sync, &rdev->flags)
2856 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2857 retry = 1;
2858 if (retry)
2859 if (sh->qd_idx >= 0 && sh->pd_idx == i)
2860 set_bit(R5_ReadError, &sh->dev[i].flags);
2861 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2862 set_bit(R5_ReadError, &sh->dev[i].flags);
2863 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2864 } else
2865 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2866 else {
2867 clear_bit(R5_ReadError, &sh->dev[i].flags);
2868 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2869 if (!(set_bad
2870 && test_bit(In_sync, &rdev->flags)
2871 && rdev_set_badblocks(
2872 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2873 md_error(conf->mddev, rdev);
2874 }
2875 }
2876 rdev_dec_pending(rdev, conf->mddev);
2877 bio_uninit(bi);
2878 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2879 set_bit(STRIPE_HANDLE, &sh->state);
2880 raid5_release_stripe(sh);
2881 }
2882
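/*
 * Completion handler for stripe writes, covering both the main rdev and a
 * replacement.  Write errors mark the device as wanting replacement; writes
 * that covered known bad blocks are flagged so the bad blocks can be
 * cleared later.
 */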
2883 static void raid5_end_write_request(struct bio *bi)
2884 {
2885 struct stripe_head *sh = bi->bi_private;
2886 struct r5conf *conf = sh->raid_conf;
2887 int disks = sh->disks, i;
2888 struct md_rdev *rdev;
2889 sector_t first_bad;
2890 int bad_sectors;
2891 int replacement = 0;
2892
2893 for (i = 0 ; i < disks; i++) {
2894 if (bi == &sh->dev[i].req) {
2895 rdev = rdev_pend_deref(conf->disks[i].rdev);
2896 break;
2897 }
2898 if (bi == &sh->dev[i].rreq) {
2899 rdev = rdev_pend_deref(conf->disks[i].replacement);
2900 if (rdev)
2901 replacement = 1;
2902 else
2903 /* rdev was removed and 'replacement'
2904  * replaced it.  rdev is not removed
2905  * until all requests are finished.
2906  */
2907 rdev = rdev_pend_deref(conf->disks[i].rdev);
2908 break;
2909 }
2910 }
2911 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2912 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2913 bi->bi_status);
2914 if (i == disks) {
2915 BUG();
2916 return;
2917 }
2918
2919 if (replacement) {
2920 if (bi->bi_status)
2921 md_error(conf->mddev, rdev);
2922 else if (is_badblock(rdev, sh->sector,
2923 RAID5_STRIPE_SECTORS(conf),
2924 &first_bad, &bad_sectors))
2925 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2926 } else {
2927 if (bi->bi_status) {
2928 set_bit(STRIPE_DEGRADED, &sh->state);
2929 set_bit(WriteErrorSeen, &rdev->flags);
2930 set_bit(R5_WriteError, &sh->dev[i].flags);
2931 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2932 set_bit(MD_RECOVERY_NEEDED,
2933 &rdev->mddev->recovery);
2934 } else if (is_badblock(rdev, sh->sector,
2935 RAID5_STRIPE_SECTORS(conf),
2936 &first_bad, &bad_sectors)) {
2937 set_bit(R5_MadeGood, &sh->dev[i].flags);
2938 if (test_bit(R5_ReadError, &sh->dev[i].flags))
2939 /* That was a successful write so make
2940  * sure it looks like we already did
2941  * a re-write.
2942  */
2943 set_bit(R5_ReWrite, &sh->dev[i].flags);
2944 }
2945 }
2946 rdev_dec_pending(rdev, conf->mddev);
2947
2948 if (sh->batch_head && bi->bi_status && !replacement)
2949 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2950
2951 bio_uninit(bi);
2952 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2953 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2954 set_bit(STRIPE_HANDLE, &sh->state);
2955
2956 if (sh->batch_head && sh != sh->batch_head)
2957 raid5_release_stripe(sh->batch_head);
2958 raid5_release_stripe(sh);
2959 }
2960
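/*
 * md 'error' handler: mark @rdev Faulty, recompute the degraded count and,
 * if the array can no longer operate, flag it MD_BROKEN.
 */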
2961 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2962 {
2963 struct r5conf *conf = mddev->private;
2964 unsigned long flags;
2965 pr_debug("raid456: error called\n");
2966
2967 pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2968 mdname(mddev), rdev->bdev);
2969
2970 spin_lock_irqsave(&conf->device_lock, flags);
2971 set_bit(Faulty, &rdev->flags);
2972 clear_bit(In_sync, &rdev->flags);
2973 mddev->degraded = raid5_calc_degraded(conf);
2974
2975 if (has_failed(conf)) {
2976 set_bit(MD_BROKEN, &conf->mddev->flags);
2977 conf->recovery_disabled = mddev->recovery_disabled;
2978
2979 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2980 mdname(mddev), mddev->degraded, conf->raid_disks);
2981 } else {
2982 pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2983 mdname(mddev), conf->raid_disks - mddev->degraded);
2984 }
2985
2986 spin_unlock_irqrestore(&conf->device_lock, flags);
2987 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2988
2989 set_bit(Blocked, &rdev->flags);
2990 set_mask_bits(&mddev->sb_flags, 0,
2991 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2992 r5c_update_on_rdev_error(mddev, rdev);
2993 }
2994
2995 /*
2996  * Input: a 'big' sector number,
2997  * Output: index of the data and parity disk, and the sector # in them.
2998  */
2999 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
3000 int previous, int *dd_idx,
3001 struct stripe_head *sh)
3002 {
3003 sector_t stripe, stripe2;
3004 sector_t chunk_number;
3005 unsigned int chunk_offset;
3006 int pd_idx, qd_idx;
3007 int ddf_layout = 0;
3008 sector_t new_sector;
3009 int algorithm = previous ? conf->prev_algo
3010 : conf->algorithm;
3011 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3012 : conf->chunk_sectors;
3013 int raid_disks = previous ? conf->previous_raid_disks
3014 : conf->raid_disks;
3015 int data_disks = raid_disks - conf->max_degraded;
3016
3017 /* First compute the information on this sector */
3018
3019 /*
3020  * Compute the chunk number and the sector offset inside the chunk
3021  */
3022 chunk_offset = sector_div(r_sector, sectors_per_chunk);
3023 chunk_number = r_sector;
3024
3025 /*
3026  * Compute the stripe number
3027  */
3028 stripe = chunk_number;
3029 *dd_idx = sector_div(stripe, data_disks);
3030 stripe2 = stripe;
3031
3032 /* Select the parity disk based on the user selected algorithm. */
3033
3034 pd_idx = qd_idx = -1;
3035 switch(conf->level) {
3036 case 4:
3037 pd_idx = data_disks;
3038 break;
3039 case 5:
3040 switch (algorithm) {
3041 case ALGORITHM_LEFT_ASYMMETRIC:
3042 pd_idx = data_disks - sector_div(stripe2, raid_disks);
3043 if (*dd_idx >= pd_idx)
3044 (*dd_idx)++;
3045 break;
3046 case ALGORITHM_RIGHT_ASYMMETRIC:
3047 pd_idx = sector_div(stripe2, raid_disks);
3048 if (*dd_idx >= pd_idx)
3049 (*dd_idx)++;
3050 break;
3051 case ALGORITHM_LEFT_SYMMETRIC:
3052 pd_idx = data_disks - sector_div(stripe2, raid_disks);
3053 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3054 break;
3055 case ALGORITHM_RIGHT_SYMMETRIC:
3056 pd_idx = sector_div(stripe2, raid_disks);
3057 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3058 break;
3059 case ALGORITHM_PARITY_0:
3060 pd_idx = 0;
3061 (*dd_idx)++;
3062 break;
3063 case ALGORITHM_PARITY_N:
3064 pd_idx = data_disks;
3065 break;
3066 default:
3067 BUG();
3068 }
3069 break;
3070 case 6:
3071
3072 switch (algorithm) {
3073 case ALGORITHM_LEFT_ASYMMETRIC:
3074 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3075 qd_idx = pd_idx + 1;
3076 if (pd_idx == raid_disks-1) {
3077 (*dd_idx)++;
3078 qd_idx = 0;
3079 } else if (*dd_idx >= pd_idx)
3080 (*dd_idx) += 2;
3081 break;
3082 case ALGORITHM_RIGHT_ASYMMETRIC:
3083 pd_idx = sector_div(stripe2, raid_disks);
3084 qd_idx = pd_idx + 1;
3085 if (pd_idx == raid_disks-1) {
3086 (*dd_idx)++;
3087 qd_idx = 0;
3088 } else if (*dd_idx >= pd_idx)
3089 (*dd_idx) += 2;
3090 break;
3091 case ALGORITHM_LEFT_SYMMETRIC:
3092 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3093 qd_idx = (pd_idx + 1) % raid_disks;
3094 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3095 break;
3096 case ALGORITHM_RIGHT_SYMMETRIC:
3097 pd_idx = sector_div(stripe2, raid_disks);
3098 qd_idx = (pd_idx + 1) % raid_disks;
3099 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3100 break;
3101
3102 case ALGORITHM_PARITY_0:
3103 pd_idx = 0;
3104 qd_idx = 1;
3105 (*dd_idx) += 2;
3106 break;
3107 case ALGORITHM_PARITY_N:
3108 pd_idx = data_disks;
3109 qd_idx = data_disks + 1;
3110 break;
3111
3112 case ALGORITHM_ROTATING_ZERO_RESTART:
3113 /* Exactly the same as RIGHT_ASYMMETRIC, but the
3114  * order of blocks for computing Q is different.
3115  */
3116 pd_idx = sector_div(stripe2, raid_disks);
3117 qd_idx = pd_idx + 1;
3118 if (pd_idx == raid_disks-1) {
3119 (*dd_idx)++;
3120 qd_idx = 0;
3121 } else if (*dd_idx >= pd_idx)
3122 (*dd_idx) += 2;
3123 ddf_layout = 1;
3124 break;
3125
3126 case ALGORITHM_ROTATING_N_RESTART:
3127 /* Same as left_asymmetric, but the first stripe is
3128  * D D D P Q  rather than
3129  * Q D D D P
3130  */
3131 stripe2 += 1;
3132 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3133 qd_idx = pd_idx + 1;
3134 if (pd_idx == raid_disks-1) {
3135 (*dd_idx)++;
3136 qd_idx = 0;
3137 } else if (*dd_idx >= pd_idx)
3138 (*dd_idx) += 2;
3139 ddf_layout = 1;
3140 break;
3141
3142 case ALGORITHM_ROTATING_N_CONTINUE:
3143
3144 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3145 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3146 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3147 ddf_layout = 1;
3148 break;
3149
3150 case ALGORITHM_LEFT_ASYMMETRIC_6:
3151
3152 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3153 if (*dd_idx >= pd_idx)
3154 (*dd_idx)++;
3155 qd_idx = raid_disks - 1;
3156 break;
3157
3158 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3159 pd_idx = sector_div(stripe2, raid_disks-1);
3160 if (*dd_idx >= pd_idx)
3161 (*dd_idx)++;
3162 qd_idx = raid_disks - 1;
3163 break;
3164
3165 case ALGORITHM_LEFT_SYMMETRIC_6:
3166 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3167 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3168 qd_idx = raid_disks - 1;
3169 break;
3170
3171 case ALGORITHM_RIGHT_SYMMETRIC_6:
3172 pd_idx = sector_div(stripe2, raid_disks-1);
3173 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3174 qd_idx = raid_disks - 1;
3175 break;
3176
3177 case ALGORITHM_PARITY_0_6:
3178 pd_idx = 0;
3179 (*dd_idx)++;
3180 qd_idx = raid_disks - 1;
3181 break;
3182
3183 default:
3184 BUG();
3185 }
3186 break;
3187 }
3188
3189 if (sh) {
3190 sh->pd_idx = pd_idx;
3191 sh->qd_idx = qd_idx;
3192 sh->ddf_layout = ddf_layout;
3193 }
3194
3195 /* Finally, compute the new sector number */
3196
3197 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3198 return new_sector;
3199 }
3200
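/*
 * Inverse of raid5_compute_sector(): given a device index inside a stripe,
 * compute the corresponding logical array sector.  The result is cross
 * checked by mapping it forward again.
 */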
3201 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3202 {
3203 struct r5conf *conf = sh->raid_conf;
3204 int raid_disks = sh->disks;
3205 int data_disks = raid_disks - conf->max_degraded;
3206 sector_t new_sector = sh->sector, check;
3207 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3208 : conf->chunk_sectors;
3209 int algorithm = previous ? conf->prev_algo
3210 : conf->algorithm;
3211 sector_t stripe;
3212 int chunk_offset;
3213 sector_t chunk_number;
3214 int dummy1, dd_idx = i;
3215 sector_t r_sector;
3216 struct stripe_head sh2;
3217
3218 chunk_offset = sector_div(new_sector, sectors_per_chunk);
3219 stripe = new_sector;
3220
3221 if (i == sh->pd_idx)
3222 return 0;
3223 switch(conf->level) {
3224 case 4: break;
3225 case 5:
3226 switch (algorithm) {
3227 case ALGORITHM_LEFT_ASYMMETRIC:
3228 case ALGORITHM_RIGHT_ASYMMETRIC:
3229 if (i > sh->pd_idx)
3230 i--;
3231 break;
3232 case ALGORITHM_LEFT_SYMMETRIC:
3233 case ALGORITHM_RIGHT_SYMMETRIC:
3234 if (i < sh->pd_idx)
3235 i += raid_disks;
3236 i -= (sh->pd_idx + 1);
3237 break;
3238 case ALGORITHM_PARITY_0:
3239 i -= 1;
3240 break;
3241 case ALGORITHM_PARITY_N:
3242 break;
3243 default:
3244 BUG();
3245 }
3246 break;
3247 case 6:
3248 if (i == sh->qd_idx)
3249 return 0;
3250 switch (algorithm) {
3251 case ALGORITHM_LEFT_ASYMMETRIC:
3252 case ALGORITHM_RIGHT_ASYMMETRIC:
3253 case ALGORITHM_ROTATING_ZERO_RESTART:
3254 case ALGORITHM_ROTATING_N_RESTART:
3255 if (sh->pd_idx == raid_disks-1)
3256 i--;
3257 else if (i > sh->pd_idx)
3258 i -= 2;
3259 break;
3260 case ALGORITHM_LEFT_SYMMETRIC:
3261 case ALGORITHM_RIGHT_SYMMETRIC:
3262 if (sh->pd_idx == raid_disks-1)
3263 i--;
3264 else {
3265 /* D D P Q D */
3266 if (i < sh->pd_idx)
3267 i += raid_disks;
3268 i -= (sh->pd_idx + 2);
3269 }
3270 break;
3271 case ALGORITHM_PARITY_0:
3272 i -= 2;
3273 break;
3274 case ALGORITHM_PARITY_N:
3275 break;
3276 case ALGORITHM_ROTATING_N_CONTINUE:
3277
3278 if (sh->pd_idx == 0)
3279 i--;
3280 else {
3281 /* D D Q P D */
3282 if (i < sh->pd_idx)
3283 i += raid_disks;
3284 i -= (sh->pd_idx + 1);
3285 }
3286 break;
3287 case ALGORITHM_LEFT_ASYMMETRIC_6:
3288 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3289 if (i > sh->pd_idx)
3290 i--;
3291 break;
3292 case ALGORITHM_LEFT_SYMMETRIC_6:
3293 case ALGORITHM_RIGHT_SYMMETRIC_6:
3294 if (i < sh->pd_idx)
3295 i += data_disks + 1;
3296 i -= (sh->pd_idx + 1);
3297 break;
3298 case ALGORITHM_PARITY_0_6:
3299 i -= 1;
3300 break;
3301 default:
3302 BUG();
3303 }
3304 break;
3305 }
3306
3307 chunk_number = stripe * data_disks + i;
3308 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3309
3310 check = raid5_compute_sector(conf, r_sector,
3311 previous, &dummy1, &sh2);
3312 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3313 || sh2.qd_idx != sh->qd_idx) {
3314 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3315 mdname(conf->mddev));
3316 return 0;
3317 }
3318 return r_sector;
3319 }
3320
3321 /*
3322  * There are cases where we want handle_stripe_dirtying() and
3323  * schedule_reconstruction() to delay towrite to some dev of a stripe.
3324  *
3325  * This function checks whether we want to delay the towrite. Specifically,
3326  * we delay the towrite when:
3327  *
3328  *   1. degraded stripe has a non-overwrite towrite to a device that is
3329  *      not in-sync, and the stripe has data in the journal
3330  *      (injournal > 0)
3331  *
3332  *   2. journal space is critical (R5C_LOG_CRITICAL is set) and the
3333  *      stripe has data in the journal
3334  *
3335  *   3. the journal has failed (log_failed) and the stripe has data in
3336  *      the journal.  In that case we try to flush cached data to the
3337  *      raid disks based on what is in the stripe cache, and skip the
3338  *      pending writes.
3339  */
3359 static inline bool delay_towrite(struct r5conf *conf,
3360 struct r5dev *dev,
3361 struct stripe_head_state *s)
3362 {
3363 /* case 1 above */
3364 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3365 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3366 return true;
3367 /* case 2 above */
3368 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3369 s->injournal > 0)
3370 return true;
3371 /* case 3 above */
3372 if (s->log_failed && s->injournal)
3373 return true;
3374 return false;
3375 }
3376
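/*
 * Prepare a parity update for this stripe: lock the blocks that will be
 * drained (and prexor'd for read-modify-write), pick the reconstruct state
 * and set the operation flags that raid_run_ops() will act upon.
 */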
3377 static void
3378 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3379 int rcw, int expand)
3380 {
3381 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3382 struct r5conf *conf = sh->raid_conf;
3383 int level = conf->level;
3384
3385 if (rcw) {
3386 /*
3387  * In some cases, handle_stripe_dirtying initially decided to
3388  * run rmw and allocates extra page for prexor. However, rcw is
3389  * cheaper later on. We need to free the extra page now,
3390  * because we won't be able to do that in ops_complete_prexor().
3391  */
3392 r5c_release_extra_page(sh);
3393
3394 for (i = disks; i--; ) {
3395 struct r5dev *dev = &sh->dev[i];
3396
3397 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3398 set_bit(R5_LOCKED, &dev->flags);
3399 set_bit(R5_Wantdrain, &dev->flags);
3400 if (!expand)
3401 clear_bit(R5_UPTODATE, &dev->flags);
3402 s->locked++;
3403 } else if (test_bit(R5_InJournal, &dev->flags)) {
3404 set_bit(R5_LOCKED, &dev->flags);
3405 s->locked++;
3406 }
3407 }
3408
3409 /* if we are not expanding this is a proper write request, and
3410  * there will be bios with new data to be drained into the
3411  * stripe cache */
3412 if (!expand) {
3413 if (!s->locked)
3414 /* False alarm, nothing to do */
3415 return;
3416 sh->reconstruct_state = reconstruct_state_drain_run;
3417 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3418 } else
3419 sh->reconstruct_state = reconstruct_state_run;
3420
3421 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3422
3423 if (s->locked + conf->max_degraded == disks)
3424 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3425 atomic_inc(&conf->pending_full_writes);
3426 } else {
3427 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3428 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3429 BUG_ON(level == 6 &&
3430 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3431 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3432
3433 for (i = disks; i--; ) {
3434 struct r5dev *dev = &sh->dev[i];
3435 if (i == pd_idx || i == qd_idx)
3436 continue;
3437
3438 if (dev->towrite &&
3439 (test_bit(R5_UPTODATE, &dev->flags) ||
3440 test_bit(R5_Wantcompute, &dev->flags))) {
3441 set_bit(R5_Wantdrain, &dev->flags);
3442 set_bit(R5_LOCKED, &dev->flags);
3443 clear_bit(R5_UPTODATE, &dev->flags);
3444 s->locked++;
3445 } else if (test_bit(R5_InJournal, &dev->flags)) {
3446 set_bit(R5_LOCKED, &dev->flags);
3447 s->locked++;
3448 }
3449 }
3450 if (!s->locked)
3451 /* False alarm - nothing to do */
3452 return;
3453 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3454 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3455 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3456 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3457 }
3458
3459 /* keep the parity disk(s) locked while asynchronous operations
3460  * are in flight
3461  */
3462 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3463 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3464 s->locked++;
3465
3466 if (level == 6) {
3467 int qd_idx = sh->qd_idx;
3468 struct r5dev *dev = &sh->dev[qd_idx];
3469
3470 set_bit(R5_LOCKED, &dev->flags);
3471 clear_bit(R5_UPTODATE, &dev->flags);
3472 s->locked++;
3473 }
3474
3475 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3476 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3477 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3478 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3479 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3480
3481 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3482 __func__, (unsigned long long)sh->sector,
3483 s->locked, s->ops_request);
3484 }
3485
3486 static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
3487 int dd_idx, int forwrite)
3488 {
3489 struct r5conf *conf = sh->raid_conf;
3490 struct bio **bip;
3491
3492 pr_debug("checking bi b#%llu to stripe s#%llu\n",
3493 bi->bi_iter.bi_sector, sh->sector);
3494
3495 /* Don't allow new IO added to stripes in batch list */
3496 if (sh->batch_head)
3497 return true;
3498
3499 if (forwrite)
3500 bip = &sh->dev[dd_idx].towrite;
3501 else
3502 bip = &sh->dev[dd_idx].toread;
3503
3504 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3505 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3506 return true;
3507 bip = &(*bip)->bi_next;
3508 }
3509
3510 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3511 return true;
3512
3513 if (forwrite && raid5_has_ppl(conf)) {
3514 /*
3515  * With PPL only writes to consecutive data chunks within a
3516  * stripe are allowed because for a single stripe_head we can
3517  * only have one PPL entry at a time, which describes one data
3518  * range.  Not really an overlap, but wait_for_overlap can be
3519  * used to handle this.
3520  */
3521 sector_t sector;
3522 sector_t first = 0;
3523 sector_t last = 0;
3524 int count = 0;
3525 int i;
3526
3527 for (i = 0; i < sh->disks; i++) {
3528 if (i != sh->pd_idx &&
3529 (i == dd_idx || sh->dev[i].towrite)) {
3530 sector = sh->dev[i].sector;
3531 if (count == 0 || sector < first)
3532 first = sector;
3533 if (sector > last)
3534 last = sector;
3535 count++;
3536 }
3537 }
3538
3539 if (first + conf->chunk_sectors * (count - 1) != last)
3540 return true;
3541 }
3542
3543 return false;
3544 }
3545
3546 static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3547 int dd_idx, int forwrite, int previous)
3548 {
3549 struct r5conf *conf = sh->raid_conf;
3550 struct bio **bip;
3551 int firstwrite = 0;
3552
3553 if (forwrite) {
3554 bip = &sh->dev[dd_idx].towrite;
3555 if (!*bip)
3556 firstwrite = 1;
3557 } else {
3558 bip = &sh->dev[dd_idx].toread;
3559 }
3560
3561 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3562 bip = &(*bip)->bi_next;
3563
3564 if (!forwrite || previous)
3565 clear_bit(STRIPE_BATCH_READY, &sh->state);
3566
3567 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3568 if (*bip)
3569 bi->bi_next = *bip;
3570 *bip = bi;
3571 bio_inc_remaining(bi);
3572 md_write_inc(conf->mddev, bi);
3573
3574 if (forwrite) {
3575 /* check if page is covered */
3576 sector_t sector = sh->dev[dd_idx].sector;
3577 for (bi=sh->dev[dd_idx].towrite;
3578 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3579 bi && bi->bi_iter.bi_sector <= sector;
3580 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3581 if (bio_end_sector(bi) >= sector)
3582 sector = bio_end_sector(bi);
3583 }
3584 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3585 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3586 sh->overwrite_disks++;
3587 }
3588
3589 pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3590 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3591 sh->dev[dd_idx].sector);
3592
3593 if (conf->mddev->bitmap && firstwrite) {
3594 /* Cannot hold spinlock over bitmap_startwrite,
3595  * but must ensure this isn't added to a batch until
3596  * we have added to the bitmap and set bm_seq.
3597  * So set STRIPE_BITMAP_PENDING to prevent
3598  * batching.
3599  * If multiple __add_stripe_bio() calls race here they
3600  * must all set STRIPE_BITMAP_PENDING.  So only the first one
3601  * to complete "bitmap_startwrite" gets to set
3602  * STRIPE_BIT_DELAY.  This is important as once a stripe
3603  * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3604  * any more.
3605  */
3606 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3607 spin_unlock_irq(&sh->stripe_lock);
3608 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3609 RAID5_STRIPE_SECTORS(conf), 0);
3610 spin_lock_irq(&sh->stripe_lock);
3611 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3612 if (!sh->batch_head) {
3613 sh->bm_seq = conf->seq_flush+1;
3614 set_bit(STRIPE_BIT_DELAY, &sh->state);
3615 }
3616 }
3617 }
3618
3619 /*
3620  * Each stripe/dev can have one or more bios attached.
3621  * toread/towrite point to the first in a chain.
3622  * The bi_next chain must be in order.
3623  */
3624 static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3625 int dd_idx, int forwrite, int previous)
3626 {
3627 spin_lock_irq(&sh->stripe_lock);
3628
3629 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3630 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3631 spin_unlock_irq(&sh->stripe_lock);
3632 return false;
3633 }
3634
3635 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3636 spin_unlock_irq(&sh->stripe_lock);
3637 return true;
3638 }
3639
3640 static void end_reshape(struct r5conf *conf);
3641
3642 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3643 struct stripe_head *sh)
3644 {
3645 int sectors_per_chunk =
3646 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3647 int dd_idx;
3648 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3649 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3650
3651 raid5_compute_sector(conf,
3652 stripe * (disks - conf->max_degraded)
3653 *sectors_per_chunk + chunk_offset,
3654 previous,
3655 &dd_idx, sh);
3656 }
3657
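/*
 * Too many devices have failed for this stripe to be handled: record bad
 * blocks where possible and complete every pending read and write bio with
 * an error, finishing any bitmap writes that were started.
 */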
3658 static void
3659 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3660 struct stripe_head_state *s, int disks)
3661 {
3662 int i;
3663 BUG_ON(sh->batch_head);
3664 for (i = disks; i--; ) {
3665 struct bio *bi;
3666 int bitmap_end = 0;
3667
3668 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3669 struct md_rdev *rdev;
3670 rcu_read_lock();
3671 rdev = rcu_dereference(conf->disks[i].rdev);
3672 if (rdev && test_bit(In_sync, &rdev->flags) &&
3673 !test_bit(Faulty, &rdev->flags))
3674 atomic_inc(&rdev->nr_pending);
3675 else
3676 rdev = NULL;
3677 rcu_read_unlock();
3678 if (rdev) {
3679 if (!rdev_set_badblocks(
3680 rdev,
3681 sh->sector,
3682 RAID5_STRIPE_SECTORS(conf), 0))
3683 md_error(conf->mddev, rdev);
3684 rdev_dec_pending(rdev, conf->mddev);
3685 }
3686 }
3687 spin_lock_irq(&sh->stripe_lock);
3688 /* fail all writes first */
3689 bi = sh->dev[i].towrite;
3690 sh->dev[i].towrite = NULL;
3691 sh->overwrite_disks = 0;
3692 spin_unlock_irq(&sh->stripe_lock);
3693 if (bi)
3694 bitmap_end = 1;
3695
3696 log_stripe_write_finished(sh);
3697
3698 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3699 wake_up(&conf->wait_for_overlap);
3700
3701 while (bi && bi->bi_iter.bi_sector <
3702 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3703 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3704
3705 md_write_end(conf->mddev);
3706 bio_io_error(bi);
3707 bi = nextbi;
3708 }
3709 if (bitmap_end)
3710 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3711 RAID5_STRIPE_SECTORS(conf), 0, 0);
3712 bitmap_end = 0;
3713
3714 bi = sh->dev[i].written;
3715 sh->dev[i].written = NULL;
3716 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3717 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3718 sh->dev[i].page = sh->dev[i].orig_page;
3719 }
3720
3721 if (bi) bitmap_end = 1;
3722 while (bi && bi->bi_iter.bi_sector <
3723 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3724 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3725
3726 md_write_end(conf->mddev);
3727 bio_io_error(bi);
3728 bi = bi2;
3729 }
3730
3731 /* fail any reads if this device is non-operational and
3732  * the data has not reached the cache yet.
3733  */
3734 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3735 s->failed > conf->max_degraded &&
3736 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3737 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3738 spin_lock_irq(&sh->stripe_lock);
3739 bi = sh->dev[i].toread;
3740 sh->dev[i].toread = NULL;
3741 spin_unlock_irq(&sh->stripe_lock);
3742 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3743 wake_up(&conf->wait_for_overlap);
3744 if (bi)
3745 s->to_read--;
3746 while (bi && bi->bi_iter.bi_sector <
3747 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3748 struct bio *nextbi =
3749 r5_next_bio(conf, bi, sh->dev[i].sector);
3750
3751 bio_io_error(bi);
3752 bi = nextbi;
3753 }
3754 }
3755 if (bitmap_end)
3756 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3757 RAID5_STRIPE_SECTORS(conf), 0, 0);
3758
3759 /* If we were in the middle of a write the parity block might
3760  * still be locked - so just clear all R5_LOCKED flags */
3761 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3762 }
3763 s->to_write = 0;
3764 s->written = 0;
3765
3766 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3767 if (atomic_dec_and_test(&conf->pending_full_writes))
3768 md_wakeup_thread(conf->mddev->thread);
3769 }
3770
3771 static void
3772 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3773 struct stripe_head_state *s)
3774 {
3775 int abort = 0;
3776 int i;
3777
3778 BUG_ON(sh->batch_head);
3779 clear_bit(STRIPE_SYNCING, &sh->state);
3780 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3781 wake_up(&conf->wait_for_overlap);
3782 s->syncing = 0;
3783 s->replacing = 0;
3784
3785 /* There is nothing more to do for sync/check/repair.
3786  * Don't even need to abort as that is handled elsewhere
3787  * if needed, and not always wanted e.g. if there is a known
3788  * bad block here.
3789  * For recover/replace we need to record a bad block on all
3790  * non-sync devices, or abort the recovery */
3791 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3792 /* During recovery devices cannot be removed, so
3793  * locking and refcounting of rdevs is not needed
3794  */
3795 rcu_read_lock();
3796 for (i = 0; i < conf->raid_disks; i++) {
3797 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3798 if (rdev
3799 && !test_bit(Faulty, &rdev->flags)
3800 && !test_bit(In_sync, &rdev->flags)
3801 && !rdev_set_badblocks(rdev, sh->sector,
3802 RAID5_STRIPE_SECTORS(conf), 0))
3803 abort = 1;
3804 rdev = rcu_dereference(conf->disks[i].replacement);
3805 if (rdev
3806 && !test_bit(Faulty, &rdev->flags)
3807 && !test_bit(In_sync, &rdev->flags)
3808 && !rdev_set_badblocks(rdev, sh->sector,
3809 RAID5_STRIPE_SECTORS(conf), 0))
3810 abort = 1;
3811 }
3812 rcu_read_unlock();
3813 if (abort)
3814 conf->recovery_disabled =
3815 conf->mddev->recovery_disabled;
3816 }
3817 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3818 }
3819
3820 static int want_replace(struct stripe_head *sh, int disk_idx)
3821 {
3822 struct md_rdev *rdev;
3823 int rv = 0;
3824
3825 rcu_read_lock();
3826 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3827 if (rdev
3828 && !test_bit(Faulty, &rdev->flags)
3829 && !test_bit(In_sync, &rdev->flags)
3830 && (rdev->recovery_offset <= sh->sector
3831 || rdev->mddev->recovery_cp <= sh->sector))
3832 rv = 1;
3833 rcu_read_unlock();
3834 return rv;
3835 }
3836
3837 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3838 int disk_idx, int disks)
3839 {
3840 struct r5dev *dev = &sh->dev[disk_idx];
3841 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3842 &sh->dev[s->failed_num[1]] };
3843 int i;
3844 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3845
3846
3847 if (test_bit(R5_LOCKED, &dev->flags) ||
3848 test_bit(R5_UPTODATE, &dev->flags))
3849 /* the block is already uptodate, or an operation on it
3850  * is already in flight - no need to fetch it
3851  */
3852 return 0;
3853
3854 if (dev->toread ||
3855 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3856 /* We need this block to directly satisfy a request */
3857 return 1;
3858
3859 if (s->syncing || s->expanding ||
3860 (s->replacing && want_replace(sh, disk_idx)))
3861 /* When syncing or expanding we read everything.
3862  * When replacing, we need the block on the replaced device.
3863  */
3864 return 1;
3865
3866 if ((s->failed >= 1 && fdev[0]->toread) ||
3867 (s->failed >= 2 && fdev[1]->toread))
3868 /* If we want to read from a failed device, then
3869  * we need to actually read every other device.
3870  */
3871 return 1;
3872
3873 /* Sometimes neither read-modify-write nor reconstruct-write
3874  * cycles can work.  In those cases we read every block we
3875  * can.  Then the parity-update is certain to have enough to
3876  * work with.
3877  * This can only be a problem when we need to write something,
3878  * and some device has failed.  If either of those tests
3879  * fail we need look no further.
3880  */
3881 if (!s->failed || !s->to_write)
3882 return 0;
3883
3884 if (test_bit(R5_Insync, &dev->flags) &&
3885 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3886 /* Pre-reads are not permitted until after a short delay
3887  * to gather multiple requests.  However if this
3888  * device is not Insync, the block could only be computed
3889  * and there is no need to delay that.
3890  */
3891 return 0;
3892
3893 for (i = 0; i < s->failed && i < 2; i++) {
3894 if (fdev[i]->towrite &&
3895 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3896 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3897 /* If we have a partial write to a failed
3898  * device, then we will need to reconstruct
3899  * the content of that device, so all other
3900  * devices must be read.
3901  */
3902 return 1;
3903
3904 if (s->failed >= 2 &&
3905 (fdev[i]->towrite ||
3906 s->failed_num[i] == sh->pd_idx ||
3907 s->failed_num[i] == sh->qd_idx) &&
3908 !test_bit(R5_UPTODATE, &fdev[i]->flags))
3909 /* With two failures, if a failed device is not uptodate
3910  * and either holds P or Q or has a pending write, we
3911  * must use reconstruct-write, so every other device
3912  * needs to be read. */
3913 force_rcw = true;
3914 }
3915
3916 /* If we are forced to do a reconstruct-write, because parity
3917  * cannot be trusted and we are currently recovering it, there
3918  * is extra need to be careful.
3919  * If one of the devices that we would need to read, because
3920  * it is not being overwritten (and maybe not written at all)
3921  * is missing/faulty, then we need to read everything we can.
3922  */
3923 if (!force_rcw &&
3924 sh->sector < sh->raid_conf->mddev->recovery_cp)
3925
3926 return 0;
3927 for (i = 0; i < s->failed && i < 2; i++) {
3928 if (s->failed_num[i] != sh->pd_idx &&
3929 s->failed_num[i] != sh->qd_idx &&
3930 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3931 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3932 return 1;
3933 }
3934
3935 return 0;
3936 }
3937
3938 /* fetch_block - checks the given member device to see if its data needs
3939  * to be read or computed to satisfy a request.
3940  *
3941  * Returns 1 when no more member devices need to be checked, otherwise returns
3942  * 0 to tell the loop in handle_stripe_fill to continue
3943  */
3944 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3945 int disk_idx, int disks)
3946 {
3947 struct r5dev *dev = &sh->dev[disk_idx];
3948
3949 /* is the data in this block needed, and can we get it? */
3950 if (need_this_block(sh, s, disk_idx, disks)) {
3951 /* we would like to get this block, possibly by computing it,
3952  * otherwise read it if the backing disk is insync
3953  */
3954 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3955 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3956 BUG_ON(sh->batch_head);
3957
3958 /*
3959  * In the raid6 case if the only non-uptodate disk is P
3960  * then we already trusted P to compute the other failed
3961  * drives.  It is safe to compute rather than re-read P.
3962  * In other cases we only compute blocks from failed
3963  * devices, otherwise check/repair might fail to detect
3964  * a real inconsistency.
3965  */
3966
3967 if ((s->uptodate == disks - 1) &&
3968 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3969 (s->failed && (disk_idx == s->failed_num[0] ||
3970 disk_idx == s->failed_num[1])))) {
3971 /* have disk failed, and we're requested to fetch it;
3972  * do compute it
3973  */
3974 pr_debug("Computing stripe %llu block %d\n",
3975 (unsigned long long)sh->sector, disk_idx);
3976 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3977 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3978 set_bit(R5_Wantcompute, &dev->flags);
3979 sh->ops.target = disk_idx;
3980 sh->ops.target2 = -1;
3981 s->req_compute = 1;
3982 /* Careful: from this point on 'uptodate' is in the eye
3983  * of raid_run_ops which services 'compute' operations
3984  * before writes. R5_Wantcompute flags a block that will
3985  * be R5_UPTODATE by the time it is needed for a
3986  * subsequent operation.
3987  */
3988 s->uptodate++;
3989 return 1;
3990 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3991 /* Computing 2-failure is *very* expensive; only
3992  * do it if failed >= 2
3993  */
3994 int other;
3995 for (other = disks; other--; ) {
3996 if (other == disk_idx)
3997 continue;
3998 if (!test_bit(R5_UPTODATE,
3999 &sh->dev[other].flags))
4000 break;
4001 }
4002 BUG_ON(other < 0);
4003 pr_debug("Computing stripe %llu blocks %d,%d\n",
4004 (unsigned long long)sh->sector,
4005 disk_idx, other);
4006 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4007 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4008 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
4009 set_bit(R5_Wantcompute, &sh->dev[other].flags);
4010 sh->ops.target = disk_idx;
4011 sh->ops.target2 = other;
4012 s->uptodate += 2;
4013 s->req_compute = 1;
4014 return 1;
4015 } else if (test_bit(R5_Insync, &dev->flags)) {
4016 set_bit(R5_LOCKED, &dev->flags);
4017 set_bit(R5_Wantread, &dev->flags);
4018 s->locked++;
4019 pr_debug("Reading block %d (sync=%d)\n",
4020 disk_idx, s->syncing);
4021 }
4022 }
4023
4024 return 0;
4025 }
4026
4027 /*
4028  * handle_stripe_fill - read or compute data to satisfy pending requests.
4029  */
4030 static void handle_stripe_fill(struct stripe_head *sh,
4031 struct stripe_head_state *s,
4032 int disks)
4033 {
4034 int i;
4035
4036 /* look for blocks to read/compute, skip this if a compute
4037  * is already in flight, or if the stripe contents are in the
4038  * midst of changing due to a write
4039  */
4040 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
4041 !sh->reconstruct_state) {
4042 /*
4043  * For degraded stripe with data in journal, do not handle
4044  * read requests yet, instead, flush the stripe to raid
4045  * disks first, this avoids handling complex rmw of write
4046  * back cache (prexor with orig_page, and then xor with
4047  * page) in the read path
4048  */
4049
4050 if (s->injournal && s->failed) {
4051 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
4052 r5c_make_stripe_write_out(sh);
4053 goto out;
4054 }
4055
4056 for (i = disks; i--; )
4057 if (fetch_block(sh, s, i, disks))
4058 break;
4059 }
4060 out:
4061 set_bit(STRIPE_HANDLE, &sh->state);
4062 }
4063
4064 static void break_stripe_batch_list(struct stripe_head *head_sh,
4065 unsigned long handle_flags);
4066
4067 /* handle_stripe_clean_event
4068  * any written block on an uptodate or failed drive can be returned.
4069  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
4070  * never LOCKED, so we don't need to test 'failed' directly. */
4071 static void handle_stripe_clean_event(struct r5conf *conf,
4072 struct stripe_head *sh, int disks)
4073 {
4074 int i;
4075 struct r5dev *dev;
4076 int discard_pending = 0;
4077 struct stripe_head *head_sh = sh;
4078 bool do_endio = false;
4079
4080 for (i = disks; i--; )
4081 if (sh->dev[i].written) {
4082 dev = &sh->dev[i];
4083 if (!test_bit(R5_LOCKED, &dev->flags) &&
4084 (test_bit(R5_UPTODATE, &dev->flags) ||
4085 test_bit(R5_Discard, &dev->flags) ||
4086 test_bit(R5_SkipCopy, &dev->flags))) {
4087
4088 struct bio *wbi, *wbi2;
4089 pr_debug("Return write for disc %d\n", i);
4090 if (test_and_clear_bit(R5_Discard, &dev->flags))
4091 clear_bit(R5_UPTODATE, &dev->flags);
4092 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
4093 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4094 }
4095 do_endio = true;
4096
4097 returnbi:
4098 dev->page = dev->orig_page;
4099 wbi = dev->written;
4100 dev->written = NULL;
4101 while (wbi && wbi->bi_iter.bi_sector <
4102 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4103 wbi2 = r5_next_bio(conf, wbi, dev->sector);
4104 md_write_end(conf->mddev);
4105 bio_endio(wbi);
4106 wbi = wbi2;
4107 }
4108 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4109 RAID5_STRIPE_SECTORS(conf),
4110 !test_bit(STRIPE_DEGRADED, &sh->state),
4111 0);
4112 if (head_sh->batch_head) {
4113 sh = list_first_entry(&sh->batch_list,
4114 struct stripe_head,
4115 batch_list);
4116 if (sh != head_sh) {
4117 dev = &sh->dev[i];
4118 goto returnbi;
4119 }
4120 }
4121 sh = head_sh;
4122 dev = &sh->dev[i];
4123 } else if (test_bit(R5_Discard, &dev->flags))
4124 discard_pending = 1;
4125 }
4126
4127 log_stripe_write_finished(sh);
4128
4129 if (!discard_pending &&
4130 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4131 int hash;
4132 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4133 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4134 if (sh->qd_idx >= 0) {
4135 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4136 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4137 }
4138
4139 clear_bit(STRIPE_DISCARD, &sh->state);
4140 /*
4141  * SCSI discard will change some bio fields and the stripe has
4142  * no updated data, so remove it from hash list and the stripe
4143  * will be reinitialized
4144  */
4145 unhash:
4146 hash = sh->hash_lock_index;
4147 spin_lock_irq(conf->hash_locks + hash);
4148 remove_hash(sh);
4149 spin_unlock_irq(conf->hash_locks + hash);
4150 if (head_sh->batch_head) {
4151 sh = list_first_entry(&sh->batch_list,
4152 struct stripe_head, batch_list);
4153 if (sh != head_sh)
4154 goto unhash;
4155 }
4156 sh = head_sh;
4157
4158 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4159 set_bit(STRIPE_HANDLE, &sh->state);
4160
4161 }
4162
4163 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4164 if (atomic_dec_and_test(&conf->pending_full_writes))
4165 md_wakeup_thread(conf->mddev->thread);
4166
4167 if (head_sh->batch_head && do_endio)
4168 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4169 }
4170
4171 /*
4172  * For RMW in write back cache, we need an extra page in prexor to store
4173  * the old data.  This page is stored in dev->orig_page.
4174  *
4175  * This function checks whether we have data for prexor.  The exact logic
4176  * is:
4177  *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
4178  */
4179 static inline bool uptodate_for_rmw(struct r5dev *dev)
4180 {
4181 return (test_bit(R5_UPTODATE, &dev->flags)) &&
4182 (!test_bit(R5_InJournal, &dev->flags) ||
4183 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4184 }
4185
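/*
 * Decide how to update parity for the pending writes: count the pre-reads
 * needed for read-modify-write versus reconstruct-write, issue the cheaper
 * set, and schedule the reconstruction once enough data is available.
 */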
4186 static int handle_stripe_dirtying(struct r5conf *conf,
4187 struct stripe_head *sh,
4188 struct stripe_head_state *s,
4189 int disks)
4190 {
4191 int rmw = 0, rcw = 0, i;
4192 sector_t recovery_cp = conf->mddev->recovery_cp;
4193
4194 /* Check whether resync is now happening or should start.
4195  * If yes, then the array is dirty (after unclean shutdown or
4196  * initial creation), so parity in some stripes might be inconsistent.
4197  * In this case, we need to always do reconstruct-write, to ensure
4198  * that in case of drive failure or read-error correction, we
4199  * generate correct data from the parity.
4200  */
4201 if (conf->rmw_level == PARITY_DISABLE_RMW ||
4202 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4203 s->failed == 0)) {
4204 /* Calculate the real rcw later - for now make it
4205  * look like rcw is cheaper
4206  */
4207 rcw = 1; rmw = 2;
4208 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4209 conf->rmw_level, (unsigned long long)recovery_cp,
4210 (unsigned long long)sh->sector);
4211 } else for (i = disks; i--; ) {
4212 /* would I have to read this buffer for read_modify_write */
4213 struct r5dev *dev = &sh->dev[i];
4214 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4215 i == sh->pd_idx || i == sh->qd_idx ||
4216 test_bit(R5_InJournal, &dev->flags)) &&
4217 !test_bit(R5_LOCKED, &dev->flags) &&
4218 !(uptodate_for_rmw(dev) ||
4219 test_bit(R5_Wantcompute, &dev->flags))) {
4220 if (test_bit(R5_Insync, &dev->flags))
4221 rmw++;
4222 else
4223 rmw += 2*disks;
4224 }
4225 /* Would I have to read this buffer for reconstruct_write */
4226 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4227 i != sh->pd_idx && i != sh->qd_idx &&
4228 !test_bit(R5_LOCKED, &dev->flags) &&
4229 !(test_bit(R5_UPTODATE, &dev->flags) ||
4230 test_bit(R5_Wantcompute, &dev->flags))) {
4231 if (test_bit(R5_Insync, &dev->flags))
4232 rcw++;
4233 else
4234 rcw += 2*disks;
4235 }
4236 }
4237
4238 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4239 (unsigned long long)sh->sector, sh->state, rmw, rcw);
4240 set_bit(STRIPE_HANDLE, &sh->state);
4241 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
4242
4243 if (conf->mddev->queue)
4244 blk_add_trace_msg(conf->mddev->queue,
4245 "raid5 rmw %llu %d",
4246 (unsigned long long)sh->sector, rmw);
4247 for (i = disks; i--; ) {
4248 struct r5dev *dev = &sh->dev[i];
4249 if (test_bit(R5_InJournal, &dev->flags) &&
4250 dev->page == dev->orig_page &&
4251 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4252
4253 struct page *p = alloc_page(GFP_NOIO);
4254
4255 if (p) {
4256 dev->orig_page = p;
4257 continue;
4258 }
4259 /*
4260  * alloc_page() failed, try to use the pre-allocated
4261  * disk_info->extra_page instead
4262  */
4263
4264 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4265 &conf->cache_state)) {
4266 r5c_use_extra_page(sh);
4267 break;
4268 }
4269
4270
4271 set_bit(STRIPE_DELAYED, &sh->state);
4272 s->waiting_extra_page = 1;
4273 return -EAGAIN;
4274 }
4275 }
4276
4277 for (i = disks; i--; ) {
4278 struct r5dev *dev = &sh->dev[i];
4279 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4280 i == sh->pd_idx || i == sh->qd_idx ||
4281 test_bit(R5_InJournal, &dev->flags)) &&
4282 !test_bit(R5_LOCKED, &dev->flags) &&
4283 !(uptodate_for_rmw(dev) ||
4284 test_bit(R5_Wantcompute, &dev->flags)) &&
4285 test_bit(R5_Insync, &dev->flags)) {
4286 if (test_bit(STRIPE_PREREAD_ACTIVE,
4287 &sh->state)) {
4288 pr_debug("Read_old block %d for r-m-w\n",
4289 i);
4290 set_bit(R5_LOCKED, &dev->flags);
4291 set_bit(R5_Wantread, &dev->flags);
4292 s->locked++;
4293 } else
4294 set_bit(STRIPE_DELAYED, &sh->state);
4295 }
4296 }
4297 }
4298 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
4299
4300 int qread =0;
4301 rcw = 0;
4302 for (i = disks; i--; ) {
4303 struct r5dev *dev = &sh->dev[i];
4304 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4305 i != sh->pd_idx && i != sh->qd_idx &&
4306 !test_bit(R5_LOCKED, &dev->flags) &&
4307 !(test_bit(R5_UPTODATE, &dev->flags) ||
4308 test_bit(R5_Wantcompute, &dev->flags))) {
4309 rcw++;
4310 if (test_bit(R5_Insync, &dev->flags) &&
4311 test_bit(STRIPE_PREREAD_ACTIVE,
4312 &sh->state)) {
4313 pr_debug("Read_old block "
4314 "%d for Reconstruct\n", i);
4315 set_bit(R5_LOCKED, &dev->flags);
4316 set_bit(R5_Wantread, &dev->flags);
4317 s->locked++;
4318 qread++;
4319 } else
4320 set_bit(STRIPE_DELAYED, &sh->state);
4321 }
4322 }
4323 if (rcw && conf->mddev->queue)
4324 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4325 (unsigned long long)sh->sector,
4326 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4327 }
4328
4329 if (rcw > disks && rmw > disks &&
4330 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4331 set_bit(STRIPE_DELAYED, &sh->state);
4332
4333 /* now if nothing is locked, and if we have enough data,
4334  * we can start a write request
4335  */
4336 /* since handle_stripe can be called at any time we need to handle the
4337  * case where a compute block operation has been submitted and then a
4338  * subsequent call wants to start a write request.  raid_run_ops only
4339  * handles the case where compute block and reconstruct are requested
4340  * simultaneously.  If this is not the case then new writes need to be
4341  * held off until the compute completes.
4342  */
4343 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4344 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4345 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4346 schedule_reconstruction(sh, s, rcw == 0, 0);
4347 return 0;
4348 }
4349
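/*
 * State machine for check/repair of a RAID4/5 stripe: run the parity check,
 * account any mismatch, and rewrite the parity block when repair is
 * permitted.
 */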
4350 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4351 struct stripe_head_state *s, int disks)
4352 {
4353 struct r5dev *dev = NULL;
4354
4355 BUG_ON(sh->batch_head);
4356 set_bit(STRIPE_HANDLE, &sh->state);
4357
4358 switch (sh->check_state) {
4359 case check_state_idle:
4360
4361 if (s->failed == 0) {
4362 BUG_ON(s->uptodate != disks);
4363 sh->check_state = check_state_run;
4364 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4365 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4366 s->uptodate--;
4367 break;
4368 }
4369 dev = &sh->dev[s->failed_num[0]];
4370 fallthrough;
4371 case check_state_compute_result:
4372 sh->check_state = check_state_idle;
4373 if (!dev)
4374 dev = &sh->dev[sh->pd_idx];
4375
4376
4377 if (test_bit(STRIPE_INSYNC, &sh->state))
4378 break;
4379
4380
4381 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4382 BUG_ON(s->uptodate != disks);
4383
4384 set_bit(R5_LOCKED, &dev->flags);
4385 s->locked++;
4386 set_bit(R5_Wantwrite, &dev->flags);
4387
4388 clear_bit(STRIPE_DEGRADED, &sh->state);
4389 set_bit(STRIPE_INSYNC, &sh->state);
4390 break;
4391 case check_state_run:
4392 break;
4393 case check_state_check_result:
4394 sh->check_state = check_state_idle;
4395
4396
4397
4398
4399 if (s->failed)
4400 break;
4401
4402
4403
4404
4405
4406 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4407
4408
4409
4410 set_bit(STRIPE_INSYNC, &sh->state);
4411 else {
4412 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4413 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4414
4415 set_bit(STRIPE_INSYNC, &sh->state);
4416 pr_warn_ratelimited("%s: mismatch sector in range "
4417 "%llu-%llu\n", mdname(conf->mddev),
4418 (unsigned long long) sh->sector,
4419 (unsigned long long) sh->sector +
4420 RAID5_STRIPE_SECTORS(conf));
4421 } else {
4422 sh->check_state = check_state_compute_run;
4423 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4424 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4425 set_bit(R5_Wantcompute,
4426 &sh->dev[sh->pd_idx].flags);
4427 sh->ops.target = sh->pd_idx;
4428 sh->ops.target2 = -1;
4429 s->uptodate++;
4430 }
4431 }
4432 break;
4433 case check_state_compute_run:
4434 break;
4435 default:
4436 pr_err("%s: unknown check_state: %d sector: %llu\n",
4437 __func__, sh->check_state,
4438 (unsigned long long) sh->sector);
4439 BUG();
4440 }
4441 }
4442
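/*
 * handle_parity_checks6 - check/repair state machine for a RAID6 stripe.
 * Depending on which of P/Q (or failed data blocks) can be trusted, it runs
 * a P-only, Q-only or combined P+Q zero-sum check, then writes back any
 * block that was recomputed for a failed device or found to mismatch,
 * again skipping repair when MD_RECOVERY_CHECK is set.
 */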
4443 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4444 struct stripe_head_state *s,
4445 int disks)
4446 {
4447 int pd_idx = sh->pd_idx;
4448 int qd_idx = sh->qd_idx;
4449 struct r5dev *dev;
4450
4451 BUG_ON(sh->batch_head);
4452 set_bit(STRIPE_HANDLE, &sh->state);
4453
4454 BUG_ON(s->failed > 2);
4455
4456 /* Want to check and possibly repair P and Q.
4457  * However there could be one 'failed' device, in which
4458  * case we can only check one of them, possibly using the
4459  * other to generate missing data
4460  */
4461
4462 switch (sh->check_state) {
4463 case check_state_idle:
4464
4465 if (s->failed == s->q_failed) {
4466
4467
4468
4469
4470 sh->check_state = check_state_run;
4471 }
4472 if (!s->q_failed && s->failed < 2) {
4473
4474
4475
4476 if (sh->check_state == check_state_run)
4477 sh->check_state = check_state_run_pq;
4478 else
4479 sh->check_state = check_state_run_q;
4480 }
4481
4482
4483 sh->ops.zero_sum_result = 0;
4484
4485 if (sh->check_state == check_state_run) {
4486
4487 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4488 s->uptodate--;
4489 }
4490 if (sh->check_state >= check_state_run &&
4491 sh->check_state <= check_state_run_pq) {
4492
4493
4494
4495 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4496 break;
4497 }
4498
4499
4500 BUG_ON(s->failed != 2);
4501 fallthrough;
4502 case check_state_compute_result:
4503 sh->check_state = check_state_idle;
4504
4505
4506 if (test_bit(STRIPE_INSYNC, &sh->state))
4507 break;
4508
4509
4510
4511
4512 dev = NULL;
4513 if (s->failed == 2) {
4514 dev = &sh->dev[s->failed_num[1]];
4515 s->locked++;
4516 set_bit(R5_LOCKED, &dev->flags);
4517 set_bit(R5_Wantwrite, &dev->flags);
4518 }
4519 if (s->failed >= 1) {
4520 dev = &sh->dev[s->failed_num[0]];
4521 s->locked++;
4522 set_bit(R5_LOCKED, &dev->flags);
4523 set_bit(R5_Wantwrite, &dev->flags);
4524 }
4525 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4526 dev = &sh->dev[pd_idx];
4527 s->locked++;
4528 set_bit(R5_LOCKED, &dev->flags);
4529 set_bit(R5_Wantwrite, &dev->flags);
4530 }
4531 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4532 dev = &sh->dev[qd_idx];
4533 s->locked++;
4534 set_bit(R5_LOCKED, &dev->flags);
4535 set_bit(R5_Wantwrite, &dev->flags);
4536 }
4537 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4538 "%s: disk%td not up to date\n",
4539 mdname(conf->mddev),
4540 dev - (struct r5dev *) &sh->dev)) {
4541 clear_bit(R5_LOCKED, &dev->flags);
4542 clear_bit(R5_Wantwrite, &dev->flags);
4543 s->locked--;
4544 }
4545 clear_bit(STRIPE_DEGRADED, &sh->state);
4546
4547 set_bit(STRIPE_INSYNC, &sh->state);
4548 break;
4549 case check_state_run:
4550 case check_state_run_q:
4551 case check_state_run_pq:
4552 break;
4553 case check_state_check_result:
4554 sh->check_state = check_state_idle;
4555
4556
4557
4558
4559
4560 if (sh->ops.zero_sum_result == 0) {
4561
4562 if (!s->failed)
4563 set_bit(STRIPE_INSYNC, &sh->state);
4564 else {
4565
4566
4567
4568
4569 sh->check_state = check_state_compute_result;
4570
4571
4572
4573
4574
4575 }
4576 } else {
4577 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4578 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4579
4580 set_bit(STRIPE_INSYNC, &sh->state);
4581 pr_warn_ratelimited("%s: mismatch sector in range "
4582 "%llu-%llu\n", mdname(conf->mddev),
4583 (unsigned long long) sh->sector,
4584 (unsigned long long) sh->sector +
4585 RAID5_STRIPE_SECTORS(conf));
4586 } else {
4587 int *target = &sh->ops.target;
4588
4589 sh->ops.target = -1;
4590 sh->ops.target2 = -1;
4591 sh->check_state = check_state_compute_run;
4592 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4593 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4594 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4595 set_bit(R5_Wantcompute,
4596 &sh->dev[pd_idx].flags);
4597 *target = pd_idx;
4598 target = &sh->ops.target2;
4599 s->uptodate++;
4600 }
4601 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4602 set_bit(R5_Wantcompute,
4603 &sh->dev[qd_idx].flags);
4604 *target = qd_idx;
4605 s->uptodate++;
4606 }
4607 }
4608 }
4609 break;
4610 case check_state_compute_run:
4611 break;
4612 default:
4613 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4614 __func__, sh->check_state,
4615 (unsigned long long) sh->sector);
4616 BUG();
4617 }
4618 }
4619
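/*
 * handle_stripe_expansion - copy blocks from a fully-read source stripe into
 * the destination stripes of an ongoing reshape.  Each data block is mapped
 * to its new location with raid5_compute_blocknr()/raid5_compute_sector(),
 * copied with async_memcpy(), and a destination stripe is marked
 * STRIPE_EXPAND_READY once all of its data blocks have been filled in.
 */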
4620 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4621 {
4622 int i;
4623
4624
4625
4626
4627 struct dma_async_tx_descriptor *tx = NULL;
4628 BUG_ON(sh->batch_head);
4629 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4630 for (i = 0; i < sh->disks; i++)
4631 if (i != sh->pd_idx && i != sh->qd_idx) {
4632 int dd_idx, j;
4633 struct stripe_head *sh2;
4634 struct async_submit_ctl submit;
4635
4636 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4637 sector_t s = raid5_compute_sector(conf, bn, 0,
4638 &dd_idx, NULL);
4639 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4640 if (sh2 == NULL)
4641
4642
4643
4644
4645 continue;
4646 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4647 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4648
4649 raid5_release_stripe(sh2);
4650 continue;
4651 }
4652
4653
4654 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4655 tx = async_memcpy(sh2->dev[dd_idx].page,
4656 sh->dev[i].page, sh2->dev[dd_idx].offset,
4657 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4658 &submit);
4659
4660 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4661 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4662 for (j = 0; j < conf->raid_disks; j++)
4663 if (j != sh2->pd_idx &&
4664 j != sh2->qd_idx &&
4665 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4666 break;
4667 if (j == conf->raid_disks) {
4668 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4669 set_bit(STRIPE_HANDLE, &sh2->state);
4670 }
4671 raid5_release_stripe(sh2);
4672
4673 }
4674
4675 async_tx_quiesce(&tx);
4676 }
4677
4678 /*
4679  * handle_stripe - do things to a stripe.
4680  *
4681  * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4682  * state of various bits to see what needs to be done.
4683  * Possible results:
4684  *    return some read requests which now have data
4685  *    return some write requests which are safely on storage
4686  *    schedule a read on some buffers
4687  *    schedule a write of some buffers
4688  *    return confirmation of parity correctness
4689  *
4690  * analyse_stripe() fills in the stripe_head_state summary used below.
4691  */
4692 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4693 {
4694 struct r5conf *conf = sh->raid_conf;
4695 int disks = sh->disks;
4696 struct r5dev *dev;
4697 int i;
4698 int do_recovery = 0;
4699
4700 memset(s, 0, sizeof(*s));
4701
4702 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4703 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4704 s->failed_num[0] = -1;
4705 s->failed_num[1] = -1;
4706 s->log_failed = r5l_log_disk_error(conf);
4707
4708
4709 rcu_read_lock();
4710 for (i = disks; i--; ) {
4711 struct md_rdev *rdev;
4712 sector_t first_bad;
4713 int bad_sectors;
4714 int is_bad = 0;
4715
4716 dev = &sh->dev[i];
4717
4718 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4719 i, dev->flags,
4720 dev->toread, dev->towrite, dev->written);
4721
4722
4723
4724
4725
4726 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4727 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4728 set_bit(R5_Wantfill, &dev->flags);
4729
4730
4731 if (test_bit(R5_LOCKED, &dev->flags))
4732 s->locked++;
4733 if (test_bit(R5_UPTODATE, &dev->flags))
4734 s->uptodate++;
4735 if (test_bit(R5_Wantcompute, &dev->flags)) {
4736 s->compute++;
4737 BUG_ON(s->compute > 2);
4738 }
4739
4740 if (test_bit(R5_Wantfill, &dev->flags))
4741 s->to_fill++;
4742 else if (dev->toread)
4743 s->to_read++;
4744 if (dev->towrite) {
4745 s->to_write++;
4746 if (!test_bit(R5_OVERWRITE, &dev->flags))
4747 s->non_overwrite++;
4748 }
4749 if (dev->written)
4750 s->written++;
4751
4752
4753
4754 rdev = rcu_dereference(conf->disks[i].replacement);
4755 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4756 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4757 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4758 &first_bad, &bad_sectors))
4759 set_bit(R5_ReadRepl, &dev->flags);
4760 else {
4761 if (rdev && !test_bit(Faulty, &rdev->flags))
4762 set_bit(R5_NeedReplace, &dev->flags);
4763 else
4764 clear_bit(R5_NeedReplace, &dev->flags);
4765 rdev = rcu_dereference(conf->disks[i].rdev);
4766 clear_bit(R5_ReadRepl, &dev->flags);
4767 }
4768 if (rdev && test_bit(Faulty, &rdev->flags))
4769 rdev = NULL;
4770 if (rdev) {
4771 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4772 &first_bad, &bad_sectors);
4773 if (s->blocked_rdev == NULL
4774 && (test_bit(Blocked, &rdev->flags)
4775 || is_bad < 0)) {
4776 if (is_bad < 0)
4777 set_bit(BlockedBadBlocks,
4778 &rdev->flags);
4779 s->blocked_rdev = rdev;
4780 atomic_inc(&rdev->nr_pending);
4781 }
4782 }
4783 clear_bit(R5_Insync, &dev->flags);
4784 if (!rdev)
4785 ;
4786 else if (is_bad) {
4787
4788 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4789 test_bit(R5_UPTODATE, &dev->flags)) {
4790
4791
4792
4793 set_bit(R5_Insync, &dev->flags);
4794 set_bit(R5_ReadError, &dev->flags);
4795 }
4796 } else if (test_bit(In_sync, &rdev->flags))
4797 set_bit(R5_Insync, &dev->flags);
4798 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4799
4800 set_bit(R5_Insync, &dev->flags);
4801 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4802 test_bit(R5_Expanded, &dev->flags))
4803
4804
4805
4806
4807 set_bit(R5_Insync, &dev->flags);
4808
4809 if (test_bit(R5_WriteError, &dev->flags)) {
4810
4811
4812 struct md_rdev *rdev2 = rcu_dereference(
4813 conf->disks[i].rdev);
4814 if (rdev2 == rdev)
4815 clear_bit(R5_Insync, &dev->flags);
4816 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4817 s->handle_bad_blocks = 1;
4818 atomic_inc(&rdev2->nr_pending);
4819 } else
4820 clear_bit(R5_WriteError, &dev->flags);
4821 }
4822 if (test_bit(R5_MadeGood, &dev->flags)) {
4823
4824
4825 struct md_rdev *rdev2 = rcu_dereference(
4826 conf->disks[i].rdev);
4827 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4828 s->handle_bad_blocks = 1;
4829 atomic_inc(&rdev2->nr_pending);
4830 } else
4831 clear_bit(R5_MadeGood, &dev->flags);
4832 }
4833 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4834 struct md_rdev *rdev2 = rcu_dereference(
4835 conf->disks[i].replacement);
4836 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4837 s->handle_bad_blocks = 1;
4838 atomic_inc(&rdev2->nr_pending);
4839 } else
4840 clear_bit(R5_MadeGoodRepl, &dev->flags);
4841 }
4842 if (!test_bit(R5_Insync, &dev->flags)) {
4843
4844 clear_bit(R5_ReadError, &dev->flags);
4845 clear_bit(R5_ReWrite, &dev->flags);
4846 }
4847 if (test_bit(R5_ReadError, &dev->flags))
4848 clear_bit(R5_Insync, &dev->flags);
4849 if (!test_bit(R5_Insync, &dev->flags)) {
4850 if (s->failed < 2)
4851 s->failed_num[s->failed] = i;
4852 s->failed++;
4853 if (rdev && !test_bit(Faulty, &rdev->flags))
4854 do_recovery = 1;
4855 else if (!rdev) {
4856 rdev = rcu_dereference(
4857 conf->disks[i].replacement);
4858 if (rdev && !test_bit(Faulty, &rdev->flags))
4859 do_recovery = 1;
4860 }
4861 }
4862
4863 if (test_bit(R5_InJournal, &dev->flags))
4864 s->injournal++;
4865 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4866 s->just_cached++;
4867 }
4868 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4869 /* If there is a failed device being replaced, we must be
4870  * recovering; else if we are after recovery_cp, we must be
4871  * syncing; else if MD_RECOVERY_REQUESTED is set, we also are
4872  * syncing; else we can only be replacing.
4873  * sync and recovery both need to read all devices, and so
4874  * use the same flag.
4875  */
4876
4877 if (do_recovery ||
4878 sh->sector >= conf->mddev->recovery_cp ||
4879 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4880 s->syncing = 1;
4881 else
4882 s->replacing = 1;
4883 }
4884 rcu_read_unlock();
4885 }
4886
4887 /*
4888  * Return '1' if this is a member of a batch, or '0' if it is a lone
4889  * stripe or a batch head which can now be handled.
4890  */
4891 static int clear_batch_ready(struct stripe_head *sh)
4892 {
4893 struct stripe_head *tmp;
4894 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4895 return (sh->batch_head && sh->batch_head != sh);
4896 spin_lock(&sh->stripe_lock);
4897 if (!sh->batch_head) {
4898 spin_unlock(&sh->stripe_lock);
4899 return 0;
4900 }
4901
4902 /*
4903  * This stripe could have been added to a batch list before we
4904  * checked BATCH_READY; in that case skip it here.
4905  */
4906 if (sh->batch_head != sh) {
4907 spin_unlock(&sh->stripe_lock);
4908 return 1;
4909 }
4910 spin_lock(&sh->batch_lock);
4911 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4912 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4913 spin_unlock(&sh->batch_lock);
4914 spin_unlock(&sh->stripe_lock);
4915
4916
4917
4918
4919
4920 return 0;
4921 }
4922
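/*
 * break_stripe_batch_list - detach every stripe from a write batch headed by
 * head_sh, propagate the head's check/reconstruct state and relevant flags to
 * each member, and re-queue them for individual handling.  Callers pass
 * handle_flags == 0 to unconditionally set STRIPE_HANDLE on the members.
 */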
4923 static void break_stripe_batch_list(struct stripe_head *head_sh,
4924 unsigned long handle_flags)
4925 {
4926 struct stripe_head *sh, *next;
4927 int i;
4928 int do_wakeup = 0;
4929
4930 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4931
4932 list_del_init(&sh->batch_list);
4933
4934 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4935 (1 << STRIPE_SYNCING) |
4936 (1 << STRIPE_REPLACED) |
4937 (1 << STRIPE_DELAYED) |
4938 (1 << STRIPE_BIT_DELAY) |
4939 (1 << STRIPE_FULL_WRITE) |
4940 (1 << STRIPE_BIOFILL_RUN) |
4941 (1 << STRIPE_COMPUTE_RUN) |
4942 (1 << STRIPE_DISCARD) |
4943 (1 << STRIPE_BATCH_READY) |
4944 (1 << STRIPE_BATCH_ERR) |
4945 (1 << STRIPE_BITMAP_PENDING)),
4946 "stripe state: %lx\n", sh->state);
4947 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4948 (1 << STRIPE_REPLACED)),
4949 "head stripe state: %lx\n", head_sh->state);
4950
4951 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4952 (1 << STRIPE_PREREAD_ACTIVE) |
4953 (1 << STRIPE_DEGRADED) |
4954 (1 << STRIPE_ON_UNPLUG_LIST)),
4955 head_sh->state & (1 << STRIPE_INSYNC));
4956
4957 sh->check_state = head_sh->check_state;
4958 sh->reconstruct_state = head_sh->reconstruct_state;
4959 spin_lock_irq(&sh->stripe_lock);
4960 sh->batch_head = NULL;
4961 spin_unlock_irq(&sh->stripe_lock);
4962 for (i = 0; i < sh->disks; i++) {
4963 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4964 do_wakeup = 1;
4965 sh->dev[i].flags = head_sh->dev[i].flags &
4966 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4967 }
4968 if (handle_flags == 0 ||
4969 sh->state & handle_flags)
4970 set_bit(STRIPE_HANDLE, &sh->state);
4971 raid5_release_stripe(sh);
4972 }
4973 spin_lock_irq(&head_sh->stripe_lock);
4974 head_sh->batch_head = NULL;
4975 spin_unlock_irq(&head_sh->stripe_lock);
4976 for (i = 0; i < head_sh->disks; i++)
4977 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4978 do_wakeup = 1;
4979 if (head_sh->state & handle_flags)
4980 set_bit(STRIPE_HANDLE, &head_sh->state);
4981
4982 if (do_wakeup)
4983 wake_up(&head_sh->raid_conf->wait_for_overlap);
4984 }
4985
4986 static void handle_stripe(struct stripe_head *sh)
4987 {
4988 struct stripe_head_state s;
4989 struct r5conf *conf = sh->raid_conf;
4990 int i;
4991 int prexor;
4992 int disks = sh->disks;
4993 struct r5dev *pdev, *qdev;
4994
4995 clear_bit(STRIPE_HANDLE, &sh->state);
4996
4997 /*
4998  * handle_stripe should not continue to handle a batched stripe; only
4999  * the head of a batch list or a lone stripe can continue.  Otherwise
5000  * break_stripe_batch_list() could warn about the STRIPE_ACTIVE flag
5001  * on a stripe that is not processed by raid5d.
5002  */
5003 if (clear_batch_ready(sh))
5004 return;
5005
5006 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
5007
5008
5009 set_bit(STRIPE_HANDLE, &sh->state);
5010 return;
5011 }
5012
5013 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
5014 break_stripe_batch_list(sh, 0);
5015
5016 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
5017 spin_lock(&sh->stripe_lock);
5018 /*
5019  * Cannot process 'sync' concurrently with 'discard'.
5020  * Flush data in the r5cache before starting 'sync'.
5021  */
5022 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
5023 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
5024 !test_bit(STRIPE_DISCARD, &sh->state) &&
5025 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
5026 set_bit(STRIPE_SYNCING, &sh->state);
5027 clear_bit(STRIPE_INSYNC, &sh->state);
5028 clear_bit(STRIPE_REPLACED, &sh->state);
5029 }
5030 spin_unlock(&sh->stripe_lock);
5031 }
5032 clear_bit(STRIPE_DELAYED, &sh->state);
5033
5034 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
5035 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
5036 (unsigned long long)sh->sector, sh->state,
5037 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
5038 sh->check_state, sh->reconstruct_state);
5039
5040 analyse_stripe(sh, &s);
5041
5042 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
5043 goto finish;
5044
5045 if (s.handle_bad_blocks ||
5046 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
5047 set_bit(STRIPE_HANDLE, &sh->state);
5048 goto finish;
5049 }
5050
5051 if (unlikely(s.blocked_rdev)) {
5052 if (s.syncing || s.expanding || s.expanded ||
5053 s.replacing || s.to_write || s.written) {
5054 set_bit(STRIPE_HANDLE, &sh->state);
5055 goto finish;
5056 }
5057
5058 rdev_dec_pending(s.blocked_rdev, conf->mddev);
5059 s.blocked_rdev = NULL;
5060 }
5061
5062 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
5063 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
5064 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
5065 }
5066
5067 pr_debug("locked=%d uptodate=%d to_read=%d"
5068 " to_write=%d failed=%d failed_num=%d,%d\n",
5069 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
5070 s.failed_num[0], s.failed_num[1]);
5071
5072 /*
5073  * Check if the array has lost more than max_degraded devices and, if
5074  * so, some requests might need to be failed.  When the journal device
5075  * has failed (log_failed), only process the stripe if there is data
5076  * that still needs to be written back to the raid disks (injournal).
5077  */
5078 if (s.failed > conf->max_degraded ||
5079 (s.log_failed && s.injournal == 0)) {
5080 sh->check_state = 0;
5081 sh->reconstruct_state = 0;
5082 break_stripe_batch_list(sh, 0);
5083 if (s.to_read+s.to_write+s.written)
5084 handle_failed_stripe(conf, sh, &s, disks);
5085 if (s.syncing + s.replacing)
5086 handle_failed_sync(conf, sh, &s);
5087 }
5088
5089
5090
5091
5092 prexor = 0;
5093 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5094 prexor = 1;
5095 if (sh->reconstruct_state == reconstruct_state_drain_result ||
5096 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5097 sh->reconstruct_state = reconstruct_state_idle;
5098
5099
5100
5101
5102 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5103 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5104 BUG_ON(sh->qd_idx >= 0 &&
5105 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5106 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5107 for (i = disks; i--; ) {
5108 struct r5dev *dev = &sh->dev[i];
5109 if (test_bit(R5_LOCKED, &dev->flags) &&
5110 (i == sh->pd_idx || i == sh->qd_idx ||
5111 dev->written || test_bit(R5_InJournal,
5112 &dev->flags))) {
5113 pr_debug("Writing block %d\n", i);
5114 set_bit(R5_Wantwrite, &dev->flags);
5115 if (prexor)
5116 continue;
5117 if (s.failed > 1)
5118 continue;
5119 if (!test_bit(R5_Insync, &dev->flags) ||
5120 ((i == sh->pd_idx || i == sh->qd_idx) &&
5121 s.failed == 0))
5122 set_bit(STRIPE_INSYNC, &sh->state);
5123 }
5124 }
5125 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5126 s.dec_preread_active = 1;
5127 }
5128
5129 /*
5130  * might be able to return some write requests if the parity blocks
5131  * are safe, or on a failed drive
5132  */
5133 pdev = &sh->dev[sh->pd_idx];
5134 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5135 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5136 qdev = &sh->dev[sh->qd_idx];
5137 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5138 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5139 || conf->level < 6;
5140
5141 if (s.written &&
5142 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5143 && !test_bit(R5_LOCKED, &pdev->flags)
5144 && (test_bit(R5_UPTODATE, &pdev->flags) ||
5145 test_bit(R5_Discard, &pdev->flags))))) &&
5146 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5147 && !test_bit(R5_LOCKED, &qdev->flags)
5148 && (test_bit(R5_UPTODATE, &qdev->flags) ||
5149 test_bit(R5_Discard, &qdev->flags))))))
5150 handle_stripe_clean_event(conf, sh, disks);
5151
5152 if (s.just_cached)
5153 r5c_handle_cached_data_endio(conf, sh, disks);
5154 log_stripe_write_finished(sh);
5155
5156 /* Now we might consider reading some blocks, either to check/generate
5157  * parity, or to satisfy requests,
5158  * or to load a block that is being partially written.
5159  */
5160 if (s.to_read || s.non_overwrite
5161 || (s.to_write && s.failed)
5162 || (s.syncing && (s.uptodate + s.compute < disks))
5163 || s.replacing
5164 || s.expanding)
5165 handle_stripe_fill(sh, &s, disks);
5166
5167
5168
5169
5170
5171
5172 r5c_finish_stripe_write_out(conf, sh, &s);
5173
5174 /*
5175  * Now to consider new write requests, cache write back and what else,
5176  * if anything, should be read.  We do not handle new writes when:
5177  * 1/ A 'write' operation (copy+xor) is already in flight.
5178  * 2/ A 'check' operation is in flight, as it may clobber the parity
5179  *    block.
5180  * 3/ A r5c cache log write is in flight.
5181  */
5182
5183 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5184 if (!r5c_is_writeback(conf->log)) {
5185 if (s.to_write)
5186 handle_stripe_dirtying(conf, sh, &s, disks);
5187 } else {
5188 int ret = 0;
5189
5190
5191 if (s.to_write)
5192 ret = r5c_try_caching_write(conf, sh, &s,
5193 disks);
5194
5195
5196
5197
5198
5199
5200
5201 if (ret == -EAGAIN ||
5202
5203 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5204 s.injournal > 0)) {
5205 ret = handle_stripe_dirtying(conf, sh, &s,
5206 disks);
5207 if (ret == -EAGAIN)
5208 goto finish;
5209 }
5210 }
5211 }
5212
5213 /* Maybe we need to check and possibly fix the parity for this stripe.
5214  * Any reads will already have been scheduled, so we just see if enough
5215  * data is available.  The parity check is held off while parity
5216  * dependent operations are in flight.
5217  */
5218 if (sh->check_state ||
5219 (s.syncing && s.locked == 0 &&
5220 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5221 !test_bit(STRIPE_INSYNC, &sh->state))) {
5222 if (conf->level == 6)
5223 handle_parity_checks6(conf, sh, &s, disks);
5224 else
5225 handle_parity_checks5(conf, sh, &s, disks);
5226 }
5227
5228 if ((s.replacing || s.syncing) && s.locked == 0
5229 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5230 && !test_bit(STRIPE_REPLACED, &sh->state)) {
5231
5232 for (i = 0; i < conf->raid_disks; i++)
5233 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5234 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5235 set_bit(R5_WantReplace, &sh->dev[i].flags);
5236 set_bit(R5_LOCKED, &sh->dev[i].flags);
5237 s.locked++;
5238 }
5239 if (s.replacing)
5240 set_bit(STRIPE_INSYNC, &sh->state);
5241 set_bit(STRIPE_REPLACED, &sh->state);
5242 }
5243 if ((s.syncing || s.replacing) && s.locked == 0 &&
5244 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5245 test_bit(STRIPE_INSYNC, &sh->state)) {
5246 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5247 clear_bit(STRIPE_SYNCING, &sh->state);
5248 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5249 wake_up(&conf->wait_for_overlap);
5250 }
5251
5252 /* If the failed drives are just a ReadError, then we might need
5253  * to progress the repair/check process
5254  */
5255 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5256 for (i = 0; i < s.failed; i++) {
5257 struct r5dev *dev = &sh->dev[s.failed_num[i]];
5258 if (test_bit(R5_ReadError, &dev->flags)
5259 && !test_bit(R5_LOCKED, &dev->flags)
5260 && test_bit(R5_UPTODATE, &dev->flags)
5261 ) {
5262 if (!test_bit(R5_ReWrite, &dev->flags)) {
5263 set_bit(R5_Wantwrite, &dev->flags);
5264 set_bit(R5_ReWrite, &dev->flags);
5265 } else
5266
5267 set_bit(R5_Wantread, &dev->flags);
5268 set_bit(R5_LOCKED, &dev->flags);
5269 s.locked++;
5270 }
5271 }
5272
5273
5274 if (sh->reconstruct_state == reconstruct_state_result) {
5275 struct stripe_head *sh_src
5276 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
5277 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5278
5279
5280
5281 set_bit(STRIPE_DELAYED, &sh->state);
5282 set_bit(STRIPE_HANDLE, &sh->state);
5283 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5284 &sh_src->state))
5285 atomic_inc(&conf->preread_active_stripes);
5286 raid5_release_stripe(sh_src);
5287 goto finish;
5288 }
5289 if (sh_src)
5290 raid5_release_stripe(sh_src);
5291
5292 sh->reconstruct_state = reconstruct_state_idle;
5293 clear_bit(STRIPE_EXPANDING, &sh->state);
5294 for (i = conf->raid_disks; i--; ) {
5295 set_bit(R5_Wantwrite, &sh->dev[i].flags);
5296 set_bit(R5_LOCKED, &sh->dev[i].flags);
5297 s.locked++;
5298 }
5299 }
5300
5301 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5302 !sh->reconstruct_state) {
5303
5304 sh->disks = conf->raid_disks;
5305 stripe_set_idx(sh->sector, conf, 0, sh);
5306 schedule_reconstruction(sh, &s, 1, 1);
5307 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5308 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5309 atomic_dec(&conf->reshape_stripes);
5310 wake_up(&conf->wait_for_overlap);
5311 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5312 }
5313
5314 if (s.expanding && s.locked == 0 &&
5315 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5316 handle_stripe_expansion(conf, sh);
5317
5318 finish:
5319
5320 if (unlikely(s.blocked_rdev)) {
5321 if (conf->mddev->external)
5322 md_wait_for_blocked_rdev(s.blocked_rdev,
5323 conf->mddev);
5324 else
5325
5326
5327
5328
5329 rdev_dec_pending(s.blocked_rdev,
5330 conf->mddev);
5331 }
5332
5333 if (s.handle_bad_blocks)
5334 for (i = disks; i--; ) {
5335 struct md_rdev *rdev;
5336 struct r5dev *dev = &sh->dev[i];
5337 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5338
5339 rdev = rdev_pend_deref(conf->disks[i].rdev);
5340 if (!rdev_set_badblocks(rdev, sh->sector,
5341 RAID5_STRIPE_SECTORS(conf), 0))
5342 md_error(conf->mddev, rdev);
5343 rdev_dec_pending(rdev, conf->mddev);
5344 }
5345 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5346 rdev = rdev_pend_deref(conf->disks[i].rdev);
5347 rdev_clear_badblocks(rdev, sh->sector,
5348 RAID5_STRIPE_SECTORS(conf), 0);
5349 rdev_dec_pending(rdev, conf->mddev);
5350 }
5351 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5352 rdev = rdev_pend_deref(conf->disks[i].replacement);
5353 if (!rdev)
5354
5355 rdev = rdev_pend_deref(conf->disks[i].rdev);
5356 rdev_clear_badblocks(rdev, sh->sector,
5357 RAID5_STRIPE_SECTORS(conf), 0);
5358 rdev_dec_pending(rdev, conf->mddev);
5359 }
5360 }
5361
5362 if (s.ops_request)
5363 raid_run_ops(sh, s.ops_request);
5364
5365 ops_run_io(sh, &s);
5366
5367 if (s.dec_preread_active) {
5368
5369
5370
5371
5372 atomic_dec(&conf->preread_active_stripes);
5373 if (atomic_read(&conf->preread_active_stripes) <
5374 IO_THRESHOLD)
5375 md_wakeup_thread(conf->mddev->thread);
5376 }
5377
5378 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5379 }
5380
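/*
 * Move stripes from the delayed list to the hold list once the number of
 * preread-active stripes has dropped below IO_THRESHOLD, marking each one
 * STRIPE_PREREAD_ACTIVE so its pre-reads may now be issued.
 */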
5381 static void raid5_activate_delayed(struct r5conf *conf)
5382 __must_hold(&conf->device_lock)
5383 {
5384 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5385 while (!list_empty(&conf->delayed_list)) {
5386 struct list_head *l = conf->delayed_list.next;
5387 struct stripe_head *sh;
5388 sh = list_entry(l, struct stripe_head, lru);
5389 list_del_init(l);
5390 clear_bit(STRIPE_DELAYED, &sh->state);
5391 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5392 atomic_inc(&conf->preread_active_stripes);
5393 list_add_tail(&sh->lru, &conf->hold_list);
5394 raid5_wakeup_stripe_thread(sh);
5395 }
5396 }
5397 }
5398
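/*
 * Release the stripes parked on conf->bitmap_list (STRIPE_BIT_DELAY writes
 * held back for the bitmap); each one is re-released via __release_stripe()
 * so it can be handled again.
 */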
5399 static void activate_bit_delay(struct r5conf *conf,
5400 struct list_head *temp_inactive_list)
5401 __must_hold(&conf->device_lock)
5402 {
5403 struct list_head head;
5404 list_add(&head, &conf->bitmap_list);
5405 list_del_init(&conf->bitmap_list);
5406 while (!list_empty(&head)) {
5407 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5408 int hash;
5409 list_del_init(&sh->lru);
5410 atomic_inc(&sh->count);
5411 hash = sh->hash_lock_index;
5412 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5413 }
5414 }
5415
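/*
 * Return true if the bio lies entirely within one chunk (using the smaller
 * of the old and new chunk size while a reshape is pending), i.e. it can be
 * serviced by a single member device.
 */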
5416 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5417 {
5418 struct r5conf *conf = mddev->private;
5419 sector_t sector = bio->bi_iter.bi_sector;
5420 unsigned int chunk_sectors;
5421 unsigned int bio_sectors = bio_sectors(bio);
5422
5423 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5424 return chunk_sectors >=
5425 ((sector & (chunk_sectors - 1)) + bio_sectors);
5426 }
5427
5428 /*
5429  * Add a bio to the retry LIFO in O(1) (we may be in interrupt context);
5430  * raid5d() is woken up to retry it later.
5431  */
5432 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5433 {
5434 unsigned long flags;
5435
5436 spin_lock_irqsave(&conf->device_lock, flags);
5437
5438 bi->bi_next = conf->retry_read_aligned_list;
5439 conf->retry_read_aligned_list = bi;
5440
5441 spin_unlock_irqrestore(&conf->device_lock, flags);
5442 md_wakeup_thread(conf->mddev->thread);
5443 }
5444
5445 static struct bio *remove_bio_from_retry(struct r5conf *conf,
5446 unsigned int *offset)
5447 {
5448 struct bio *bi;
5449
5450 bi = conf->retry_read_aligned;
5451 if (bi) {
5452 *offset = conf->retry_read_offset;
5453 conf->retry_read_aligned = NULL;
5454 return bi;
5455 }
5456 bi = conf->retry_read_aligned_list;
5457 if (bi) {
5458 conf->retry_read_aligned_list = bi->bi_next;
5459 bi->bi_next = NULL;
5460 *offset = 0;
5461 }
5462
5463 return bi;
5464 }
5465
5466 /*
5467  * Completion handler for the cloned bio issued by raid5_read_one_chunk().
5468  * On success the original bio is completed directly (accounting for I/O
5469  * stats if enabled); on error the original bio is queued for retry
5470  * through the stripe cache via add_bio_to_retry().
5471  */
5472 static void raid5_align_endio(struct bio *bi)
5473 {
5474 struct md_io_acct *md_io_acct = bi->bi_private;
5475 struct bio *raid_bi = md_io_acct->orig_bio;
5476 struct mddev *mddev;
5477 struct r5conf *conf;
5478 struct md_rdev *rdev;
5479 blk_status_t error = bi->bi_status;
5480 unsigned long start_time = md_io_acct->start_time;
5481
5482 bio_put(bi);
5483
5484 rdev = (void*)raid_bi->bi_next;
5485 raid_bi->bi_next = NULL;
5486 mddev = rdev->mddev;
5487 conf = mddev->private;
5488
5489 rdev_dec_pending(rdev, conf->mddev);
5490
5491 if (!error) {
5492 if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
5493 bio_end_io_acct(raid_bi, start_time);
5494 bio_endio(raid_bi);
5495 if (atomic_dec_and_test(&conf->active_aligned_reads))
5496 wake_up(&conf->wait_for_quiescent);
5497 return;
5498 }
5499
5500 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5501
5502 add_bio_to_retry(raid_bi, conf);
5503 }
5504
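/*
 * Try to service an aligned, single-chunk read directly from the underlying
 * device, bypassing the stripe cache.  Returns 1 if the cloned bio was
 * submitted, 0 if the caller must fall back to the normal stripe path (e.g.
 * the region is cached by the r5c log, no suitable rdev exists, or the
 * range has bad blocks).
 */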
5505 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5506 {
5507 struct r5conf *conf = mddev->private;
5508 struct bio *align_bio;
5509 struct md_rdev *rdev;
5510 sector_t sector, end_sector, first_bad;
5511 int bad_sectors, dd_idx;
5512 struct md_io_acct *md_io_acct;
5513 bool did_inc;
5514
5515 if (!in_chunk_boundary(mddev, raid_bio)) {
5516 pr_debug("%s: non aligned\n", __func__);
5517 return 0;
5518 }
5519
5520 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5521 &dd_idx, NULL);
5522 end_sector = bio_end_sector(raid_bio);
5523
5524 rcu_read_lock();
5525 if (r5c_big_stripe_cached(conf, sector))
5526 goto out_rcu_unlock;
5527
5528 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5529 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5530 rdev->recovery_offset < end_sector) {
5531 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5532 if (!rdev)
5533 goto out_rcu_unlock;
5534 if (test_bit(Faulty, &rdev->flags) ||
5535 !(test_bit(In_sync, &rdev->flags) ||
5536 rdev->recovery_offset >= end_sector))
5537 goto out_rcu_unlock;
5538 }
5539
5540 atomic_inc(&rdev->nr_pending);
5541 rcu_read_unlock();
5542
5543 if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
5544 &bad_sectors)) {
5545 bio_put(raid_bio);
5546 rdev_dec_pending(rdev, mddev);
5547 return 0;
5548 }
5549
5550 align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
5551 &mddev->io_acct_set);
5552 md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
5553 raid_bio->bi_next = (void *)rdev;
5554 if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
5555 md_io_acct->start_time = bio_start_io_acct(raid_bio);
5556 md_io_acct->orig_bio = raid_bio;
5557
5558 align_bio->bi_end_io = raid5_align_endio;
5559 align_bio->bi_private = md_io_acct;
5560 align_bio->bi_iter.bi_sector = sector;
5561
5562
5563 align_bio->bi_iter.bi_sector += rdev->data_offset;
5564
5565 did_inc = false;
5566 if (conf->quiesce == 0) {
5567 atomic_inc(&conf->active_aligned_reads);
5568 did_inc = true;
5569 }
5570
5571 if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
5572
5573
5574
5575 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
5576 wake_up(&conf->wait_for_quiescent);
5577 spin_lock_irq(&conf->device_lock);
5578 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5579 conf->device_lock);
5580 atomic_inc(&conf->active_aligned_reads);
5581 spin_unlock_irq(&conf->device_lock);
5582 }
5583
5584 if (mddev->gendisk)
5585 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5586 raid_bio->bi_iter.bi_sector);
5587 submit_bio_noacct(align_bio);
5588 return 1;
5589
5590 out_rcu_unlock:
5591 rcu_read_unlock();
5592 return 0;
5593 }
5594
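/*
 * If the bio crosses a chunk boundary, split off the first chunk and
 * re-submit the remainder; then try the fast aligned-read path on the
 * first-chunk bio.  Returns the bio that still needs normal stripe-cache
 * handling, or NULL if it was fully handled here.
 */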
5595 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5596 {
5597 struct bio *split;
5598 sector_t sector = raid_bio->bi_iter.bi_sector;
5599 unsigned chunk_sects = mddev->chunk_sectors;
5600 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5601
5602 if (sectors < bio_sectors(raid_bio)) {
5603 struct r5conf *conf = mddev->private;
5604 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5605 bio_chain(split, raid_bio);
5606 submit_bio_noacct(raid_bio);
5607 raid_bio = split;
5608 }
5609
5610 if (!raid5_read_one_chunk(mddev, raid_bio))
5611 return raid_bio;
5612
5613 return NULL;
5614 }
5615
5616
5617 /* __get_priority_stripe - get the next stripe to process
5618  *
5619  * Full stripe writes are allowed to pass preread active stripes up until
5620  * the bypass_threshold is exceeded.  In general bypass_count increments
5621  * when the handle_list is serviced before the hold_list; it is not
5622  * incremented if STRIPE_IO_STARTED is set, which signifies that the
5623  * stripe already has i/o in flight.  bypass_count is reset when the
5624  * hold_list becomes empty and reduced when a held stripe is serviced.
5625  */
5626 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5627 __must_hold(&conf->device_lock)
5628 {
5629 struct stripe_head *sh, *tmp;
5630 struct list_head *handle_list = NULL;
5631 struct r5worker_group *wg;
5632 bool second_try = !r5c_is_writeback(conf->log) &&
5633 !r5l_log_disk_error(conf);
5634 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5635 r5l_log_disk_error(conf);
5636
5637 again:
5638 wg = NULL;
5639 sh = NULL;
5640 if (conf->worker_cnt_per_group == 0) {
5641 handle_list = try_loprio ? &conf->loprio_list :
5642 &conf->handle_list;
5643 } else if (group != ANY_GROUP) {
5644 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5645 &conf->worker_groups[group].handle_list;
5646 wg = &conf->worker_groups[group];
5647 } else {
5648 int i;
5649 for (i = 0; i < conf->group_cnt; i++) {
5650 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5651 &conf->worker_groups[i].handle_list;
5652 wg = &conf->worker_groups[i];
5653 if (!list_empty(handle_list))
5654 break;
5655 }
5656 }
5657
5658 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5659 __func__,
5660 list_empty(handle_list) ? "empty" : "busy",
5661 list_empty(&conf->hold_list) ? "empty" : "busy",
5662 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5663
5664 if (!list_empty(handle_list)) {
5665 sh = list_entry(handle_list->next, typeof(*sh), lru);
5666
5667 if (list_empty(&conf->hold_list))
5668 conf->bypass_count = 0;
5669 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5670 if (conf->hold_list.next == conf->last_hold)
5671 conf->bypass_count++;
5672 else {
5673 conf->last_hold = conf->hold_list.next;
5674 conf->bypass_count -= conf->bypass_threshold;
5675 if (conf->bypass_count < 0)
5676 conf->bypass_count = 0;
5677 }
5678 }
5679 } else if (!list_empty(&conf->hold_list) &&
5680 ((conf->bypass_threshold &&
5681 conf->bypass_count > conf->bypass_threshold) ||
5682 atomic_read(&conf->pending_full_writes) == 0)) {
5683
5684 list_for_each_entry(tmp, &conf->hold_list, lru) {
5685 if (conf->worker_cnt_per_group == 0 ||
5686 group == ANY_GROUP ||
5687 !cpu_online(tmp->cpu) ||
5688 cpu_to_group(tmp->cpu) == group) {
5689 sh = tmp;
5690 break;
5691 }
5692 }
5693
5694 if (sh) {
5695 conf->bypass_count -= conf->bypass_threshold;
5696 if (conf->bypass_count < 0)
5697 conf->bypass_count = 0;
5698 }
5699 wg = NULL;
5700 }
5701
5702 if (!sh) {
5703 if (second_try)
5704 return NULL;
5705 second_try = true;
5706 try_loprio = !try_loprio;
5707 goto again;
5708 }
5709
5710 if (wg) {
5711 wg->stripes_cnt--;
5712 sh->group = NULL;
5713 }
5714 list_del_init(&sh->lru);
5715 BUG_ON(atomic_inc_return(&sh->count) != 1);
5716 return sh;
5717 }
5718
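/*
 * Per-task plug callback: stripes released while a blk_plug is active are
 * collected on cb->list and only pushed to the handle/inactive lists in
 * raid5_unplug(), batching the device_lock work and the unplug trace event.
 */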
5719 struct raid5_plug_cb {
5720 struct blk_plug_cb cb;
5721 struct list_head list;
5722 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5723 };
5724
5725 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5726 {
5727 struct raid5_plug_cb *cb = container_of(
5728 blk_cb, struct raid5_plug_cb, cb);
5729 struct stripe_head *sh;
5730 struct mddev *mddev = cb->cb.data;
5731 struct r5conf *conf = mddev->private;
5732 int cnt = 0;
5733 int hash;
5734
5735 if (cb->list.next && !list_empty(&cb->list)) {
5736 spin_lock_irq(&conf->device_lock);
5737 while (!list_empty(&cb->list)) {
5738 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5739 list_del_init(&sh->lru);
5740 /*
5741  * Ensure the list_del_init() above is visible before clearing
5742  * STRIPE_ON_UNPLUG_LIST, so release_stripe_plug() never sees the
5743  * bit clear while the stripe is still on our list.
5744  */
5745 smp_mb__before_atomic();
5746 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5747
5748
5749
5750
5751 hash = sh->hash_lock_index;
5752 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5753 cnt++;
5754 }
5755 spin_unlock_irq(&conf->device_lock);
5756 }
5757 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5758 NR_STRIPE_HASH_LOCKS);
5759 if (mddev->queue)
5760 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5761 kfree(cb);
5762 }
5763
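/*
 * Queue a stripe on the current task's plug list if one is active, otherwise
 * release it immediately.  STRIPE_ON_UNPLUG_LIST guards against adding the
 * same stripe twice.
 */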
5764 static void release_stripe_plug(struct mddev *mddev,
5765 struct stripe_head *sh)
5766 {
5767 struct blk_plug_cb *blk_cb = blk_check_plugged(
5768 raid5_unplug, mddev,
5769 sizeof(struct raid5_plug_cb));
5770 struct raid5_plug_cb *cb;
5771
5772 if (!blk_cb) {
5773 raid5_release_stripe(sh);
5774 return;
5775 }
5776
5777 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5778
5779 if (cb->list.next == NULL) {
5780 int i;
5781 INIT_LIST_HEAD(&cb->list);
5782 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5783 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5784 }
5785
5786 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5787 list_add_tail(&sh->lru, &cb->list);
5788 else
5789 raid5_release_stripe(sh);
5790 }
5791
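/*
 * Handle a DISCARD bio: walk the full-stripe-aligned part of the range and
 * attach the bio as a 'towrite' overwrite on every data block of each
 * stripe, so the normal write path can discard the parity as well.  Partial
 * stripes at either end of the range are left alone (not discarded).
 */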
5792 static void make_discard_request(struct mddev *mddev, struct bio *bi)
5793 {
5794 struct r5conf *conf = mddev->private;
5795 sector_t logical_sector, last_sector;
5796 struct stripe_head *sh;
5797 int stripe_sectors;
5798
5799
5800 if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5801 return;
5802
5803 if (mddev->reshape_position != MaxSector)
5804
5805 return;
5806
5807 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5808 last_sector = bio_end_sector(bi);
5809
5810 bi->bi_next = NULL;
5811
5812 stripe_sectors = conf->chunk_sectors *
5813 (conf->raid_disks - conf->max_degraded);
5814 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5815 stripe_sectors);
5816 sector_div(last_sector, stripe_sectors);
5817
5818 logical_sector *= conf->chunk_sectors;
5819 last_sector *= conf->chunk_sectors;
5820
5821 for (; logical_sector < last_sector;
5822 logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5823 DEFINE_WAIT(w);
5824 int d;
5825 again:
5826 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5827 prepare_to_wait(&conf->wait_for_overlap, &w,
5828 TASK_UNINTERRUPTIBLE);
5829 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5830 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5831 raid5_release_stripe(sh);
5832 schedule();
5833 goto again;
5834 }
5835 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5836 spin_lock_irq(&sh->stripe_lock);
5837 for (d = 0; d < conf->raid_disks; d++) {
5838 if (d == sh->pd_idx || d == sh->qd_idx)
5839 continue;
5840 if (sh->dev[d].towrite || sh->dev[d].toread) {
5841 set_bit(R5_Overlap, &sh->dev[d].flags);
5842 spin_unlock_irq(&sh->stripe_lock);
5843 raid5_release_stripe(sh);
5844 schedule();
5845 goto again;
5846 }
5847 }
5848 set_bit(STRIPE_DISCARD, &sh->state);
5849 finish_wait(&conf->wait_for_overlap, &w);
5850 sh->overwrite_disks = 0;
5851 for (d = 0; d < conf->raid_disks; d++) {
5852 if (d == sh->pd_idx || d == sh->qd_idx)
5853 continue;
5854 sh->dev[d].towrite = bi;
5855 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5856 bio_inc_remaining(bi);
5857 md_write_inc(mddev, bi);
5858 sh->overwrite_disks++;
5859 }
5860 spin_unlock_irq(&sh->stripe_lock);
5861 if (conf->mddev->bitmap) {
5862 for (d = 0;
5863 d < conf->raid_disks - conf->max_degraded;
5864 d++)
5865 md_bitmap_startwrite(mddev->bitmap,
5866 sh->sector,
5867 RAID5_STRIPE_SECTORS(conf),
5868 0);
5869 sh->bm_seq = conf->seq_flush + 1;
5870 set_bit(STRIPE_BIT_DELAY, &sh->state);
5871 }
5872
5873 set_bit(STRIPE_HANDLE, &sh->state);
5874 clear_bit(STRIPE_DELAYED, &sh->state);
5875 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5876 atomic_inc(&conf->preread_active_stripes);
5877 release_stripe_plug(mddev, sh);
5878 }
5879
5880 bio_endio(bi);
5881 }
5882
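/*
 * Helpers for reasoning about reshape progress: a sector (or sector range)
 * is "ahead of" the reshape if it still lies in the part of the array that
 * has not been reshaped yet, which depends on whether the reshape is
 * running forwards or backwards.
 */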
5883 static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5884 sector_t reshape_sector)
5885 {
5886 return mddev->reshape_backwards ? sector < reshape_sector :
5887 sector >= reshape_sector;
5888 }
5889
5890 static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5891 sector_t max, sector_t reshape_sector)
5892 {
5893 return mddev->reshape_backwards ? max < reshape_sector :
5894 min >= reshape_sector;
5895 }
5896
5897 static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
5898 struct stripe_head *sh)
5899 {
5900 sector_t max_sector = 0, min_sector = MaxSector;
5901 bool ret = false;
5902 int dd_idx;
5903
5904 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5905 if (dd_idx == sh->pd_idx)
5906 continue;
5907
5908 min_sector = min(min_sector, sh->dev[dd_idx].sector);
5909 max_sector = max(max_sector, sh->dev[dd_idx].sector);
5910 }
5911
5912 spin_lock_irq(&conf->device_lock);
5913
5914 if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
5915 conf->reshape_progress))
5916
5917 ret = true;
5918
5919 spin_unlock_irq(&conf->device_lock);
5920
5921 return ret;
5922 }
5923
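/*
 * Attach the bio to every data device of the stripe that falls inside the
 * request window in one pass.  If any device already has an overlapping
 * bio, R5_Overlap is set and 0 is returned so the caller can wait and
 * retry; otherwise each added block is cleared from ctx->sectors_to_do.
 */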
5924 static int add_all_stripe_bios(struct r5conf *conf,
5925 struct stripe_request_ctx *ctx, struct stripe_head *sh,
5926 struct bio *bi, int forwrite, int previous)
5927 {
5928 int dd_idx;
5929 int ret = 1;
5930
5931 spin_lock_irq(&sh->stripe_lock);
5932
5933 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5934 struct r5dev *dev = &sh->dev[dd_idx];
5935
5936 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5937 continue;
5938
5939 if (dev->sector < ctx->first_sector ||
5940 dev->sector >= ctx->last_sector)
5941 continue;
5942
5943 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5944 set_bit(R5_Overlap, &dev->flags);
5945 ret = 0;
5946 continue;
5947 }
5948 }
5949
5950 if (!ret)
5951 goto out;
5952
5953 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5954 struct r5dev *dev = &sh->dev[dd_idx];
5955
5956 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5957 continue;
5958
5959 if (dev->sector < ctx->first_sector ||
5960 dev->sector >= ctx->last_sector)
5961 continue;
5962
5963 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5964 clear_bit((dev->sector - ctx->first_sector) >>
5965 RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
5966 }
5967
5968 out:
5969 spin_unlock_irq(&sh->stripe_lock);
5970 return ret;
5971 }
5972
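/*
 * Build and queue the stripe for one RAID5_STRIPE_SECTORS-sized slice of
 * the bio: map the logical sector (taking any active reshape into account),
 * grab the stripe head, attach the bio to it, batch it with the previous
 * stripe if possible, and hand it to the state machine.  The return value
 * tells raid5_make_request() whether to continue, retry, or back off and
 * wait.
 */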
5973 static enum stripe_result make_stripe_request(struct mddev *mddev,
5974 struct r5conf *conf, struct stripe_request_ctx *ctx,
5975 sector_t logical_sector, struct bio *bi)
5976 {
5977 const int rw = bio_data_dir(bi);
5978 enum stripe_result ret;
5979 struct stripe_head *sh;
5980 sector_t new_sector;
5981 int previous = 0;
5982 int seq, dd_idx;
5983
5984 seq = read_seqcount_begin(&conf->gen_lock);
5985
5986 if (unlikely(conf->reshape_progress != MaxSector)) {
5987 /*
5988  * Spinlock is needed as reshape_progress may be
5989  * 64bit on a 32bit platform, and so it might be
5990  * possible to see a half-updated value.
5991  * Of course reshape_progress could change after
5992  * the lock is dropped, so once we get a reference
5993  * to the stripe that we think it is, we will have
5994  * to check again.
5995  */
5996 spin_lock_irq(&conf->device_lock);
5997 if (ahead_of_reshape(mddev, logical_sector,
5998 conf->reshape_progress)) {
5999 previous = 1;
6000 } else {
6001 if (ahead_of_reshape(mddev, logical_sector,
6002 conf->reshape_safe)) {
6003 spin_unlock_irq(&conf->device_lock);
6004 return STRIPE_SCHEDULE_AND_RETRY;
6005 }
6006 }
6007 spin_unlock_irq(&conf->device_lock);
6008 }
6009
6010 new_sector = raid5_compute_sector(conf, logical_sector, previous,
6011 &dd_idx, NULL);
6012 pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
6013 new_sector, logical_sector);
6014
6015 sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
6016 (bi->bi_opf & REQ_RAHEAD), 0);
6017 if (unlikely(!sh)) {
6018
6019 bi->bi_status = BLK_STS_IOERR;
6020 return STRIPE_FAIL;
6021 }
6022
6023 if (unlikely(previous) &&
6024 stripe_ahead_of_reshape(mddev, conf, sh)) {
6025 /*
6026  * Expansion moved on while waiting for a stripe.
6027  * Expansion could still move past after this
6028  * test, but as we are holding a reference to
6029  * 'sh', we know that if that happens,
6030  * STRIPE_EXPANDING will get set and the expansion
6031  * won't proceed until we finish with the stripe.
6032  */
6033 ret = STRIPE_SCHEDULE_AND_RETRY;
6034 goto out_release;
6035 }
6036
6037 if (read_seqcount_retry(&conf->gen_lock, seq)) {
6038
6039 ret = STRIPE_RETRY;
6040 goto out_release;
6041 }
6042
6043 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
6044 !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
6045
6046
6047
6048
6049 md_wakeup_thread(mddev->thread);
6050 ret = STRIPE_SCHEDULE_AND_RETRY;
6051 goto out_release;
6052 }
6053
6054 if (stripe_can_batch(sh)) {
6055 stripe_add_to_batch_list(conf, sh, ctx->batch_last);
6056 if (ctx->batch_last)
6057 raid5_release_stripe(ctx->batch_last);
6058 atomic_inc(&sh->count);
6059 ctx->batch_last = sh;
6060 }
6061
6062 if (ctx->do_flush) {
6063 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
6064
6065 ctx->do_flush = false;
6066 }
6067
6068 set_bit(STRIPE_HANDLE, &sh->state);
6069 clear_bit(STRIPE_DELAYED, &sh->state);
6070 if ((!sh->batch_head || sh == sh->batch_head) &&
6071 (bi->bi_opf & REQ_SYNC) &&
6072 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
6073 atomic_inc(&conf->preread_active_stripes);
6074
6075 release_stripe_plug(mddev, sh);
6076 return STRIPE_SUCCESS;
6077
6078 out_release:
6079 raid5_release_stripe(sh);
6080 return ret;
6081 }
6082
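/*
 * Main entry point for reads and writes.  PREFLUSH and DISCARD get special
 * handling, aligned reads may bypass the stripe cache entirely, and
 * everything else is cut into stripe-sized pieces which are fed to
 * make_stripe_request() until the sectors_to_do bitmap is empty.
 */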
6083 static bool raid5_make_request(struct mddev *mddev, struct bio *bi)
6084 {
6085 DEFINE_WAIT_FUNC(wait, woken_wake_function);
6086 struct r5conf *conf = mddev->private;
6087 sector_t logical_sector;
6088 struct stripe_request_ctx ctx = {};
6089 const int rw = bio_data_dir(bi);
6090 enum stripe_result res;
6091 int s, stripe_cnt;
6092
6093 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6094 int ret = log_handle_flush_request(conf, bi);
6095
6096 if (ret == 0)
6097 return true;
6098 if (ret == -ENODEV) {
6099 if (md_flush_request(mddev, bi))
6100 return true;
6101 }
6102
6103 /* log_handle_flush_request() did not complete the flush itself,
6104  * so remember to flush the journal with the first stripe of this
6105  * request (STRIPE_R5C_PREFLUSH in make_stripe_request()).
6106  */
6107 ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6108 }
6109
6110 if (!md_write_start(mddev, bi))
6111 return false;
6112
6113 /* If the array is degraded, better not do chunk aligned read because
6114  * later we might have to read it again in order to reconstruct
6115  * data on failed drives.
6116  */
6117 if (rw == READ && mddev->degraded == 0 &&
6118 mddev->reshape_position == MaxSector) {
6119 bi = chunk_aligned_read(mddev, bi);
6120 if (!bi)
6121 return true;
6122 }
6123
6124 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6125 make_discard_request(mddev, bi);
6126 md_write_end(mddev);
6127 return true;
6128 }
6129
6130 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6131 ctx.first_sector = logical_sector;
6132 ctx.last_sector = bio_end_sector(bi);
6133 bi->bi_next = NULL;
6134
6135 stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6136 RAID5_STRIPE_SECTORS(conf));
6137 bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
6138
6139 pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6140 bi->bi_iter.bi_sector, ctx.last_sector);
6141
6142
6143 if ((bi->bi_opf & REQ_NOWAIT) &&
6144 (conf->reshape_progress != MaxSector) &&
6145 !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
6146 ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
6147 bio_wouldblock_error(bi);
6148 if (rw == WRITE)
6149 md_write_end(mddev);
6150 return true;
6151 }
6152 md_account_bio(mddev, &bi);
6153
6154 add_wait_queue(&conf->wait_for_overlap, &wait);
6155 while (1) {
6156 res = make_stripe_request(mddev, conf, &ctx, logical_sector,
6157 bi);
6158 if (res == STRIPE_FAIL)
6159 break;
6160
6161 if (res == STRIPE_RETRY)
6162 continue;
6163
6164 if (res == STRIPE_SCHEDULE_AND_RETRY) {
6165 /*
6166  * Must release the reference to batch_last before
6167  * scheduling and waiting for work to be done,
6168  * otherwise the batch_last stripe head could prevent
6169  * raid5_activate_delayed() from making progress
6170  * and thus deadlocking.
6171  */
6172 if (ctx.batch_last) {
6173 raid5_release_stripe(ctx.batch_last);
6174 ctx.batch_last = NULL;
6175 }
6176
6177 wait_woken(&wait, TASK_UNINTERRUPTIBLE,
6178 MAX_SCHEDULE_TIMEOUT);
6179 continue;
6180 }
6181
6182 s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
6183 if (s == stripe_cnt)
6184 break;
6185
6186 logical_sector = ctx.first_sector +
6187 (s << RAID5_STRIPE_SHIFT(conf));
6188 }
6189 remove_wait_queue(&conf->wait_for_overlap, &wait);
6190
6191 if (ctx.batch_last)
6192 raid5_release_stripe(ctx.batch_last);
6193
6194 if (rw == WRITE)
6195 md_write_end(mddev);
6196 bio_endio(bi);
6197 return true;
6198 }
6199
6200 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
6201
6202 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
6203 {
6204 /* reshaping is quite different to recovery/resync so it is
6205  * handled quite separately ... here.
6206  *
6207  * On each call to this function, we gather one chunk worth of
6208  * destination stripes and flag them as expanding.
6209  * Then we find all the source stripes and request reads.
6210  * As the reads complete, handle_stripe will copy the data
6211  * into the destination stripe and release that stripe.
6212  */
6213 struct r5conf *conf = mddev->private;
6214 struct stripe_head *sh;
6215 struct md_rdev *rdev;
6216 sector_t first_sector, last_sector;
6217 int raid_disks = conf->previous_raid_disks;
6218 int data_disks = raid_disks - conf->max_degraded;
6219 int new_data_disks = conf->raid_disks - conf->max_degraded;
6220 int i;
6221 int dd_idx;
6222 sector_t writepos, readpos, safepos;
6223 sector_t stripe_addr;
6224 int reshape_sectors;
6225 struct list_head stripes;
6226 sector_t retn;
6227
6228 if (sector_nr == 0) {
6229
6230 if (mddev->reshape_backwards &&
6231 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
6232 sector_nr = raid5_size(mddev, 0, 0)
6233 - conf->reshape_progress;
6234 } else if (mddev->reshape_backwards &&
6235 conf->reshape_progress == MaxSector) {
6236
6237 sector_nr = MaxSector;
6238 } else if (!mddev->reshape_backwards &&
6239 conf->reshape_progress > 0)
6240 sector_nr = conf->reshape_progress;
6241 sector_div(sector_nr, new_data_disks);
6242 if (sector_nr) {
6243 mddev->curr_resync_completed = sector_nr;
6244 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6245 *skipped = 1;
6246 retn = sector_nr;
6247 goto finish;
6248 }
6249 }
6250
6251
6252
6253
6254
6255
6256 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6257
6258
6259
6260
6261
6262
6263
6264 writepos = conf->reshape_progress;
6265 sector_div(writepos, new_data_disks);
6266 readpos = conf->reshape_progress;
6267 sector_div(readpos, data_disks);
6268 safepos = conf->reshape_safe;
6269 sector_div(safepos, data_disks);
6270 if (mddev->reshape_backwards) {
6271 BUG_ON(writepos < reshape_sectors);
6272 writepos -= reshape_sectors;
6273 readpos += reshape_sectors;
6274 safepos += reshape_sectors;
6275 } else {
6276 writepos += reshape_sectors;
6277
6278
6279
6280
6281 readpos -= min_t(sector_t, reshape_sectors, readpos);
6282 safepos -= min_t(sector_t, reshape_sectors, safepos);
6283 }
6284
6285
6286
6287
6288 if (mddev->reshape_backwards) {
6289 BUG_ON(conf->reshape_progress == 0);
6290 stripe_addr = writepos;
6291 BUG_ON((mddev->dev_sectors &
6292 ~((sector_t)reshape_sectors - 1))
6293 - reshape_sectors - stripe_addr
6294 != sector_nr);
6295 } else {
6296 BUG_ON(writepos != sector_nr + reshape_sectors);
6297 stripe_addr = sector_nr;
6298 }
6299
6300 /* 'writepos' is the most advanced device address we might write.
6301  * 'readpos' is the least advanced device address we might read.
6302  * 'safepos' is the least device address recorded in the metadata as
6303  *     having been reshaped.
6304  * If there is a min_offset_diff, these are adjusted either by
6305  * increasing the safepos/readpos if diff is negative, or
6306  * increasing writepos if diff is positive.
6307  * If 'readpos' is then behind 'writepos', there is no way that we can
6308  * ensure safety in the face of a crash - that must be done by userspace
6309  * making a backup of the data.  So in that case there is no particular
6310  * need to update the metadata.
6311  * Otherwise we must update the metadata (and wait for it to be written
6312  * out, below) before the area between reshape_safe and writepos is
6313  * overwritten; we also checkpoint at least every 10 seconds so a crash
6314  * does not force the whole reshape to be repeated.
6315  */
6316
6320 if (conf->min_offset_diff < 0) {
6321 safepos += -conf->min_offset_diff;
6322 readpos += -conf->min_offset_diff;
6323 } else
6324 writepos += conf->min_offset_diff;
6325
6326 if ((mddev->reshape_backwards
6327 ? (safepos > writepos && readpos < writepos)
6328 : (safepos < writepos && readpos > writepos)) ||
6329 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6330
6331 wait_event(conf->wait_for_overlap,
6332 atomic_read(&conf->reshape_stripes)==0
6333 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6334 if (atomic_read(&conf->reshape_stripes) != 0)
6335 return 0;
6336 mddev->reshape_position = conf->reshape_progress;
6337 mddev->curr_resync_completed = sector_nr;
6338 if (!mddev->reshape_backwards)
6339
6340 rdev_for_each(rdev, mddev)
6341 if (rdev->raid_disk >= 0 &&
6342 !test_bit(Journal, &rdev->flags) &&
6343 !test_bit(In_sync, &rdev->flags) &&
6344 rdev->recovery_offset < sector_nr)
6345 rdev->recovery_offset = sector_nr;
6346
6347 conf->reshape_checkpoint = jiffies;
6348 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6349 md_wakeup_thread(mddev->thread);
6350 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6351 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6352 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6353 return 0;
6354 spin_lock_irq(&conf->device_lock);
6355 conf->reshape_safe = mddev->reshape_position;
6356 spin_unlock_irq(&conf->device_lock);
6357 wake_up(&conf->wait_for_overlap);
6358 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6359 }
6360
6361 INIT_LIST_HEAD(&stripes);
6362 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6363 int j;
6364 int skipped_disk = 0;
6365 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
6366 set_bit(STRIPE_EXPANDING, &sh->state);
6367 atomic_inc(&conf->reshape_stripes);
6368
6369
6370
6371 for (j = sh->disks; j--; ) {
6372 sector_t s;
6373 if (j == sh->pd_idx)
6374 continue;
6375 if (conf->level == 6 &&
6376 j == sh->qd_idx)
6377 continue;
6378 s = raid5_compute_blocknr(sh, j, 0);
6379 if (s < raid5_size(mddev, 0, 0)) {
6380 skipped_disk = 1;
6381 continue;
6382 }
6383 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6384 set_bit(R5_Expanded, &sh->dev[j].flags);
6385 set_bit(R5_UPTODATE, &sh->dev[j].flags);
6386 }
6387 if (!skipped_disk) {
6388 set_bit(STRIPE_EXPAND_READY, &sh->state);
6389 set_bit(STRIPE_HANDLE, &sh->state);
6390 }
6391 list_add(&sh->lru, &stripes);
6392 }
6393 spin_lock_irq(&conf->device_lock);
6394 if (mddev->reshape_backwards)
6395 conf->reshape_progress -= reshape_sectors * new_data_disks;
6396 else
6397 conf->reshape_progress += reshape_sectors * new_data_disks;
6398 spin_unlock_irq(&conf->device_lock);
6399
6400
6401
6402
6403
6404 first_sector =
6405 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6406 1, &dd_idx, NULL);
6407 last_sector =
6408 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6409 * new_data_disks - 1),
6410 1, &dd_idx, NULL);
6411 if (last_sector >= mddev->dev_sectors)
6412 last_sector = mddev->dev_sectors - 1;
6413 while (first_sector <= last_sector) {
6414 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
6415 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6416 set_bit(STRIPE_HANDLE, &sh->state);
6417 raid5_release_stripe(sh);
6418 first_sector += RAID5_STRIPE_SECTORS(conf);
6419 }
6420
6421
6422
6423 while (!list_empty(&stripes)) {
6424 sh = list_entry(stripes.next, struct stripe_head, lru);
6425 list_del_init(&sh->lru);
6426 raid5_release_stripe(sh);
6427 }
6428
6429
6430
6431 sector_nr += reshape_sectors;
6432 retn = reshape_sectors;
6433 finish:
6434 if (mddev->curr_resync_completed > mddev->resync_max ||
6435 (sector_nr - mddev->curr_resync_completed) * 2
6436 >= mddev->resync_max - mddev->curr_resync_completed) {
6437
6438 wait_event(conf->wait_for_overlap,
6439 atomic_read(&conf->reshape_stripes) == 0
6440 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6441 if (atomic_read(&conf->reshape_stripes) != 0)
6442 goto ret;
6443 mddev->reshape_position = conf->reshape_progress;
6444 mddev->curr_resync_completed = sector_nr;
6445 if (!mddev->reshape_backwards)
6446
6447 rdev_for_each(rdev, mddev)
6448 if (rdev->raid_disk >= 0 &&
6449 !test_bit(Journal, &rdev->flags) &&
6450 !test_bit(In_sync, &rdev->flags) &&
6451 rdev->recovery_offset < sector_nr)
6452 rdev->recovery_offset = sector_nr;
6453 conf->reshape_checkpoint = jiffies;
6454 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6455 md_wakeup_thread(mddev->thread);
6456 wait_event(mddev->sb_wait,
6457 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6458 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6459 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6460 goto ret;
6461 spin_lock_irq(&conf->device_lock);
6462 conf->reshape_safe = mddev->reshape_position;
6463 spin_unlock_irq(&conf->device_lock);
6464 wake_up(&conf->wait_for_overlap);
6465 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6466 }
6467 ret:
6468 return retn;
6469 }
6470
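/*
 * One step of resync/recovery/reshape: reshape is delegated to
 * reshape_request(); otherwise skip regions the bitmap says are clean,
 * then mark one stripe STRIPE_SYNC_REQUESTED and let handle_stripe() do
 * the actual reads and parity checks.  Returns the number of sectors
 * covered by this call.
 */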
6471 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6472 int *skipped)
6473 {
6474 struct r5conf *conf = mddev->private;
6475 struct stripe_head *sh;
6476 sector_t max_sector = mddev->dev_sectors;
6477 sector_t sync_blocks;
6478 int still_degraded = 0;
6479 int i;
6480
6481 if (sector_nr >= max_sector) {
6482
6483 /* Just being told to finish up... nothing much to do here. */
6484 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6485 end_reshape(conf);
6486 return 0;
6487 }
6488
6489 if (mddev->curr_resync < max_sector)
6490 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6491 &sync_blocks, 1);
6492 else
6493 conf->fullsync = 0;
6494 md_bitmap_close_sync(mddev->bitmap);
6495
6496 return 0;
6497 }
6498
6499 /* Allow raid5_quiesce() to complete */
6500 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6501
6502 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6503 return reshape_request(mddev, sector_nr, skipped);
6504
6505 /* No need to check resync_max here: we never handle more than one
6506  * stripe per call, and resync_max always falls on a chunk boundary,
6507  * so if the check in md_do_sync() did not fire there is no chance
6508  * of overstepping resync_max at this point. */
6509
6510
6511 /* If too many drives have failed and we are trying to resync,
6512  * there is nothing useful we can do: report the remaining range
6513  * as handled and let the caller skip it.
6514  */
6515 if (mddev->degraded >= conf->max_degraded &&
6516 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6517 sector_t rv = mddev->dev_sectors - sector_nr;
6518 *skipped = 1;
6519 return rv;
6520 }
6521 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6522 !conf->fullsync &&
6523 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6524 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6525
6526 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6527 *skipped = 1;
6528
6529 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6530 }
6531
6532 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6533
6534 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6535 if (sh == NULL) {
6536 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6537 /* Could not get the stripe without blocking; sleep briefly so we
6538  * do not swamp the stripe cache while someone else is trying to
6539  * get access. */
6540 schedule_timeout_uninterruptible(1);
6541 }
6542
6543 /* Check whether the array will still be degraded after this
6544  * recovery/resync: with more than one failed drive we may be
6545  * rebuilding one drive while another faulty drive remains. */
6546 rcu_read_lock();
6547 for (i = 0; i < conf->raid_disks; i++) {
6548 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6549
6550 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6551 still_degraded = 1;
6552 }
6553 rcu_read_unlock();
6554
6555 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6556
6557 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6558 set_bit(STRIPE_HANDLE, &sh->state);
6559
6560 raid5_release_stripe(sh);
6561
6562 return RAID5_STRIPE_SECTORS(conf);
6563 }
6564
6565 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6566 unsigned int offset)
6567 {
6568 /* We may not be able to submit the whole bio at once, as there
6569  * may not be enough stripe_heads available, and we cannot
6570  * pre-allocate enough of them because more may be needed than
6571  * exist in the cache.  So we handle one stripe head at a time and
6572  * record in conf->retry_read_offset how many have been done.
6573  *
6574  * We *know* that this entire raid_bio lies within one chunk, so
6575  * it maps to a single dd_idx and needs only one call to
6576  * raid5_compute_sector.
6577  */
6578 struct stripe_head *sh;
6579 int dd_idx;
6580 sector_t sector, logical_sector, last_sector;
6581 int scnt = 0;
6582 int handled = 0;
6583
6584 logical_sector = raid_bio->bi_iter.bi_sector &
6585 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6586 sector = raid5_compute_sector(conf, logical_sector,
6587 0, &dd_idx, NULL);
6588 last_sector = bio_end_sector(raid_bio);
6589
6590 for (; logical_sector < last_sector;
6591 logical_sector += RAID5_STRIPE_SECTORS(conf),
6592 sector += RAID5_STRIPE_SECTORS(conf),
6593 scnt++) {
6594
6595 if (scnt < offset)
6596
6597 continue;
6598
6599 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6600
6601 if (!sh) {
6602
6603 conf->retry_read_aligned = raid_bio;
6604 conf->retry_read_offset = scnt;
6605 return handled;
6606 }
6607
6608 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6609 raid5_release_stripe(sh);
6610 conf->retry_read_aligned = raid_bio;
6611 conf->retry_read_offset = scnt;
6612 return handled;
6613 }
6614
6615 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6616 handle_stripe(sh);
6617 raid5_release_stripe(sh);
6618 handled++;
6619 }
6620
6621 bio_endio(raid_bio);
6622
6623 if (atomic_dec_and_test(&conf->active_aligned_reads))
6624 wake_up(&conf->wait_for_quiescent);
6625 return handled;
6626 }
6627
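/*
 * handle_active_stripes() below batches stripe handling: it pulls up
 * to MAX_STRIPE_BATCH stripes off the priority lists while holding
 * device_lock, drops the lock to run handle_stripe() on each of them,
 * and then re-acquires the lock to move the batch onto the per-hash
 * temporary inactive lists.  A return value of 0 with an empty batch
 * lets the callers below decide whether any work remains.
 */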
6628 static int handle_active_stripes(struct r5conf *conf, int group,
6629 struct r5worker *worker,
6630 struct list_head *temp_inactive_list)
6631 __must_hold(&conf->device_lock)
6632 {
6633 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6634 int i, batch_size = 0, hash;
6635 bool release_inactive = false;
6636
6637 while (batch_size < MAX_STRIPE_BATCH &&
6638 (sh = __get_priority_stripe(conf, group)) != NULL)
6639 batch[batch_size++] = sh;
6640
6641 if (batch_size == 0) {
6642 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6643 if (!list_empty(temp_inactive_list + i))
6644 break;
6645 if (i == NR_STRIPE_HASH_LOCKS) {
6646 spin_unlock_irq(&conf->device_lock);
6647 log_flush_stripe_to_raid(conf);
6648 spin_lock_irq(&conf->device_lock);
6649 return batch_size;
6650 }
6651 release_inactive = true;
6652 }
6653 spin_unlock_irq(&conf->device_lock);
6654
6655 release_inactive_stripe_list(conf, temp_inactive_list,
6656 NR_STRIPE_HASH_LOCKS);
6657
6658 r5l_flush_stripe_to_raid(conf->log);
6659 if (release_inactive) {
6660 spin_lock_irq(&conf->device_lock);
6661 return 0;
6662 }
6663
6664 for (i = 0; i < batch_size; i++)
6665 handle_stripe(batch[i]);
6666 log_write_stripe_run(conf);
6667
6668 cond_resched();
6669
6670 spin_lock_irq(&conf->device_lock);
6671 for (i = 0; i < batch_size; i++) {
6672 hash = batch[i]->hash_lock_index;
6673 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6674 }
6675 return batch_size;
6676 }
6677
6678 static void raid5_do_work(struct work_struct *work)
6679 {
6680 struct r5worker *worker = container_of(work, struct r5worker, work);
6681 struct r5worker_group *group = worker->group;
6682 struct r5conf *conf = group->conf;
6683 struct mddev *mddev = conf->mddev;
6684 int group_id = group - conf->worker_groups;
6685 int handled;
6686 struct blk_plug plug;
6687
6688 pr_debug("+++ raid5worker active\n");
6689
6690 blk_start_plug(&plug);
6691 handled = 0;
6692 spin_lock_irq(&conf->device_lock);
6693 while (1) {
6694 int batch_size, released;
6695
6696 released = release_stripe_list(conf, worker->temp_inactive_list);
6697
6698 batch_size = handle_active_stripes(conf, group_id, worker,
6699 worker->temp_inactive_list);
6700 worker->working = false;
6701 if (!batch_size && !released)
6702 break;
6703 handled += batch_size;
6704 wait_event_lock_irq(mddev->sb_wait,
6705 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6706 conf->device_lock);
6707 }
6708 pr_debug("%d stripes handled\n", handled);
6709
6710 spin_unlock_irq(&conf->device_lock);
6711
6712 flush_deferred_bios(conf);
6713
6714 r5l_flush_stripe_to_raid(conf->log);
6715
6716 async_tx_issue_pending_all();
6717 blk_finish_plug(&plug);
6718
6719 pr_debug("--- raid5worker inactive\n");
6720 }
6721
6722
6723 /*
6724  * This is the raid5 kernel thread.
6725  *
6726  * It scans the hash table for stripes which can be handled now.
6727  * Completed stripes are saved for us by the interrupt handler, so
6728  * they will not have to wait for our next wakeup. */
6729 static void raid5d(struct md_thread *thread)
6730 {
6731 struct mddev *mddev = thread->mddev;
6732 struct r5conf *conf = mddev->private;
6733 int handled;
6734 struct blk_plug plug;
6735
6736 pr_debug("+++ raid5d active\n");
6737
6738 md_check_recovery(mddev);
6739
6740 blk_start_plug(&plug);
6741 handled = 0;
6742 spin_lock_irq(&conf->device_lock);
6743 while (1) {
6744 struct bio *bio;
6745 int batch_size, released;
6746 unsigned int offset;
6747
6748 released = release_stripe_list(conf, conf->temp_inactive_list);
6749 if (released)
6750 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6751
6752 if (!list_empty(&conf->bitmap_list)) {
6753
6754 /* Now is a good time to flush some bitmap updates */
6755 conf->seq_flush++;
6756 spin_unlock_irq(&conf->device_lock);
6757 md_bitmap_unplug(mddev->bitmap);
6758 spin_lock_irq(&conf->device_lock);
6759 conf->seq_write = conf->seq_flush;
6760 activate_bit_delay(conf, conf->temp_inactive_list);
6761 }
6762 raid5_activate_delayed(conf);
6763
6764 while ((bio = remove_bio_from_retry(conf, &offset))) {
6765 int ok;
6766 spin_unlock_irq(&conf->device_lock);
6767 ok = retry_aligned_read(conf, bio, offset);
6768 spin_lock_irq(&conf->device_lock);
6769 if (!ok)
6770 break;
6771 handled++;
6772 }
6773
6774 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6775 conf->temp_inactive_list);
6776 if (!batch_size && !released)
6777 break;
6778 handled += batch_size;
6779
6780 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6781 spin_unlock_irq(&conf->device_lock);
6782 md_check_recovery(mddev);
6783 spin_lock_irq(&conf->device_lock);
6784 }
6785 }
6786 pr_debug("%d stripes handled\n", handled);
6787
6788 spin_unlock_irq(&conf->device_lock);
6789 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6790 mutex_trylock(&conf->cache_size_mutex)) {
6791 grow_one_stripe(conf, __GFP_NOWARN);
6792 /* Set R5_DID_ALLOC even if the allocation failed: this helps to
6793  * slow down further allocation requests while memory is short. */
6794
6795 set_bit(R5_DID_ALLOC, &conf->cache_state);
6796 mutex_unlock(&conf->cache_size_mutex);
6797 }
6798
6799 flush_deferred_bios(conf);
6800
6801 r5l_flush_stripe_to_raid(conf->log);
6802
6803 async_tx_issue_pending_all();
6804 blk_finish_plug(&plug);
6805
6806 pr_debug("--- raid5d inactive\n");
6807 }
6808
6809 static ssize_t
6810 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6811 {
6812 struct r5conf *conf;
6813 int ret = 0;
6814 spin_lock(&mddev->lock);
6815 conf = mddev->private;
6816 if (conf)
6817 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6818 spin_unlock(&mddev->lock);
6819 return ret;
6820 }
6821
6822 int
6823 raid5_set_cache_size(struct mddev *mddev, int size)
6824 {
6825 int result = 0;
6826 struct r5conf *conf = mddev->private;
6827
6828 if (size <= 16 || size > 32768)
6829 return -EINVAL;
6830
6831 conf->min_nr_stripes = size;
6832 mutex_lock(&conf->cache_size_mutex);
6833 while (size < conf->max_nr_stripes &&
6834 drop_one_stripe(conf))
6835 ;
6836 mutex_unlock(&conf->cache_size_mutex);
6837
6838 md_allow_write(mddev);
6839
6840 mutex_lock(&conf->cache_size_mutex);
6841 while (size > conf->max_nr_stripes)
6842 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6843 conf->min_nr_stripes = conf->max_nr_stripes;
6844 result = -ENOMEM;
6845 break;
6846 }
6847 mutex_unlock(&conf->cache_size_mutex);
6848
6849 return result;
6850 }
6851 EXPORT_SYMBOL(raid5_set_cache_size);
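/*
 * The stripe cache size is tuned at run time through sysfs.  As an
 * illustration (the device name md0 is just an example):
 *
 *   echo 4096 > /sys/block/md0/md/stripe_cache_size
 *
 * reaches raid5_store_stripe_cache_size() below, which calls
 * raid5_set_cache_size() above; values must lie between 17 and 32768.
 */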
6852
6853 static ssize_t
6854 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6855 {
6856 struct r5conf *conf;
6857 unsigned long new;
6858 int err;
6859
6860 if (len >= PAGE_SIZE)
6861 return -EINVAL;
6862 if (kstrtoul(page, 10, &new))
6863 return -EINVAL;
6864 err = mddev_lock(mddev);
6865 if (err)
6866 return err;
6867 conf = mddev->private;
6868 if (!conf)
6869 err = -ENODEV;
6870 else
6871 err = raid5_set_cache_size(mddev, new);
6872 mddev_unlock(mddev);
6873
6874 return err ?: len;
6875 }
6876
6877 static struct md_sysfs_entry
6878 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6879 raid5_show_stripe_cache_size,
6880 raid5_store_stripe_cache_size);
6881
6882 static ssize_t
6883 raid5_show_rmw_level(struct mddev *mddev, char *page)
6884 {
6885 struct r5conf *conf = mddev->private;
6886 if (conf)
6887 return sprintf(page, "%d\n", conf->rmw_level);
6888 else
6889 return 0;
6890 }
6891
6892 static ssize_t
6893 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6894 {
6895 struct r5conf *conf = mddev->private;
6896 unsigned long new;
6897
6898 if (!conf)
6899 return -ENODEV;
6900
6901 if (len >= PAGE_SIZE)
6902 return -EINVAL;
6903
6904 if (kstrtoul(page, 10, &new))
6905 return -EINVAL;
6906
6907 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6908 return -EINVAL;
6909
6910 if (new != PARITY_DISABLE_RMW &&
6911 new != PARITY_ENABLE_RMW &&
6912 new != PARITY_PREFER_RMW)
6913 return -EINVAL;
6914
6915 conf->rmw_level = new;
6916 return len;
6917 }
6918
6919 static struct md_sysfs_entry
6920 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6921 raid5_show_rmw_level,
6922 raid5_store_rmw_level);
6923
6924 static ssize_t
6925 raid5_show_stripe_size(struct mddev *mddev, char *page)
6926 {
6927 struct r5conf *conf;
6928 int ret = 0;
6929
6930 spin_lock(&mddev->lock);
6931 conf = mddev->private;
6932 if (conf)
6933 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6934 spin_unlock(&mddev->lock);
6935 return ret;
6936 }
6937
6938 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6939 static ssize_t
6940 raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6941 {
6942 struct r5conf *conf;
6943 unsigned long new;
6944 int err;
6945 int size;
6946
6947 if (len >= PAGE_SIZE)
6948 return -EINVAL;
6949 if (kstrtoul(page, 10, &new))
6950 return -EINVAL;
6951
6952
6953 /*
6954  * The value must not be bigger than PAGE_SIZE.  It is required to
6955  * be a multiple of DEFAULT_STRIPE_SIZE and a power of two.
6956  */
6957 if (new % DEFAULT_STRIPE_SIZE != 0 ||
6958 new > PAGE_SIZE || new == 0 ||
6959 new != roundup_pow_of_two(new))
6960 return -EINVAL;
6961
6962 err = mddev_lock(mddev);
6963 if (err)
6964 return err;
6965
6966 conf = mddev->private;
6967 if (!conf) {
6968 err = -ENODEV;
6969 goto out_unlock;
6970 }
6971
6972 if (new == conf->stripe_size)
6973 goto out_unlock;
6974
6975 pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6976 conf->stripe_size, new);
6977
6978 if (mddev->sync_thread ||
6979 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6980 mddev->reshape_position != MaxSector ||
6981 mddev->sysfs_active) {
6982 err = -EBUSY;
6983 goto out_unlock;
6984 }
6985
6986 mddev_suspend(mddev);
6987 mutex_lock(&conf->cache_size_mutex);
6988 size = conf->max_nr_stripes;
6989
6990 shrink_stripes(conf);
6991
6992 conf->stripe_size = new;
6993 conf->stripe_shift = ilog2(new) - 9;
6994 conf->stripe_sectors = new >> 9;
6995 if (grow_stripes(conf, size)) {
6996 pr_warn("md/raid:%s: couldn't allocate buffers\n",
6997 mdname(mddev));
6998 err = -ENOMEM;
6999 }
7000 mutex_unlock(&conf->cache_size_mutex);
7001 mddev_resume(mddev);
7002
7003 out_unlock:
7004 mddev_unlock(mddev);
7005 return err ?: len;
7006 }
7007
7008 static struct md_sysfs_entry
7009 raid5_stripe_size = __ATTR(stripe_size, 0644,
7010 raid5_show_stripe_size,
7011 raid5_store_stripe_size);
7012 #else
7013 static struct md_sysfs_entry
7014 raid5_stripe_size = __ATTR(stripe_size, 0444,
7015 raid5_show_stripe_size,
7016 NULL);
7017 #endif
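/*
 * Note: stripe_size is only writable when PAGE_SIZE differs from
 * DEFAULT_STRIPE_SIZE (4K), i.e. on kernels built with larger pages.
 * In that case a value such as (device name illustrative)
 *
 *   echo 16384 > /sys/block/md0/md/stripe_size
 *
 * is accepted as long as it is a power-of-two multiple of 4K that
 * does not exceed PAGE_SIZE.
 */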
7018
7019 static ssize_t
7020 raid5_show_preread_threshold(struct mddev *mddev, char *page)
7021 {
7022 struct r5conf *conf;
7023 int ret = 0;
7024 spin_lock(&mddev->lock);
7025 conf = mddev->private;
7026 if (conf)
7027 ret = sprintf(page, "%d\n", conf->bypass_threshold);
7028 spin_unlock(&mddev->lock);
7029 return ret;
7030 }
7031
7032 static ssize_t
7033 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
7034 {
7035 struct r5conf *conf;
7036 unsigned long new;
7037 int err;
7038
7039 if (len >= PAGE_SIZE)
7040 return -EINVAL;
7041 if (kstrtoul(page, 10, &new))
7042 return -EINVAL;
7043
7044 err = mddev_lock(mddev);
7045 if (err)
7046 return err;
7047 conf = mddev->private;
7048 if (!conf)
7049 err = -ENODEV;
7050 else if (new > conf->min_nr_stripes)
7051 err = -EINVAL;
7052 else
7053 conf->bypass_threshold = new;
7054 mddev_unlock(mddev);
7055 return err ?: len;
7056 }
7057
7058 static struct md_sysfs_entry
7059 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7060 S_IRUGO | S_IWUSR,
7061 raid5_show_preread_threshold,
7062 raid5_store_preread_threshold);
7063
7064 static ssize_t
7065 raid5_show_skip_copy(struct mddev *mddev, char *page)
7066 {
7067 struct r5conf *conf;
7068 int ret = 0;
7069 spin_lock(&mddev->lock);
7070 conf = mddev->private;
7071 if (conf)
7072 ret = sprintf(page, "%d\n", conf->skip_copy);
7073 spin_unlock(&mddev->lock);
7074 return ret;
7075 }
7076
7077 static ssize_t
7078 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
7079 {
7080 struct r5conf *conf;
7081 unsigned long new;
7082 int err;
7083
7084 if (len >= PAGE_SIZE)
7085 return -EINVAL;
7086 if (kstrtoul(page, 10, &new))
7087 return -EINVAL;
7088 new = !!new;
7089
7090 err = mddev_lock(mddev);
7091 if (err)
7092 return err;
7093 conf = mddev->private;
7094 if (!conf)
7095 err = -ENODEV;
7096 else if (new != conf->skip_copy) {
7097 struct request_queue *q = mddev->queue;
7098
7099 mddev_suspend(mddev);
7100 conf->skip_copy = new;
7101 if (new)
7102 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
7103 else
7104 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
7105 mddev_resume(mddev);
7106 }
7107 mddev_unlock(mddev);
7108 return err ?: len;
7109 }
7110
7111 static struct md_sysfs_entry
7112 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
7113 raid5_show_skip_copy,
7114 raid5_store_skip_copy);
7115
7116 static ssize_t
7117 stripe_cache_active_show(struct mddev *mddev, char *page)
7118 {
7119 struct r5conf *conf = mddev->private;
7120 if (conf)
7121 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
7122 else
7123 return 0;
7124 }
7125
7126 static struct md_sysfs_entry
7127 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7128
7129 static ssize_t
7130 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
7131 {
7132 struct r5conf *conf;
7133 int ret = 0;
7134 spin_lock(&mddev->lock);
7135 conf = mddev->private;
7136 if (conf)
7137 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
7138 spin_unlock(&mddev->lock);
7139 return ret;
7140 }
7141
7142 static int alloc_thread_groups(struct r5conf *conf, int cnt,
7143 int *group_cnt,
7144 struct r5worker_group **worker_groups);
7145 static ssize_t
7146 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
7147 {
7148 struct r5conf *conf;
7149 unsigned int new;
7150 int err;
7151 struct r5worker_group *new_groups, *old_groups;
7152 int group_cnt;
7153
7154 if (len >= PAGE_SIZE)
7155 return -EINVAL;
7156 if (kstrtouint(page, 10, &new))
7157 return -EINVAL;
7158
7159 if (new > 8192)
7160 return -EINVAL;
7161
7162 err = mddev_lock(mddev);
7163 if (err)
7164 return err;
7165 conf = mddev->private;
7166 if (!conf)
7167 err = -ENODEV;
7168 else if (new != conf->worker_cnt_per_group) {
7169 mddev_suspend(mddev);
7170
7171 old_groups = conf->worker_groups;
7172 if (old_groups)
7173 flush_workqueue(raid5_wq);
7174
7175 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
7176 if (!err) {
7177 spin_lock_irq(&conf->device_lock);
7178 conf->group_cnt = group_cnt;
7179 conf->worker_cnt_per_group = new;
7180 conf->worker_groups = new_groups;
7181 spin_unlock_irq(&conf->device_lock);
7182
7183 if (old_groups)
7184 kfree(old_groups[0].workers);
7185 kfree(old_groups);
7186 }
7187 mddev_resume(mddev);
7188 }
7189 mddev_unlock(mddev);
7190
7191 return err ?: len;
7192 }
7193
7194 static struct md_sysfs_entry
7195 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
7196 raid5_show_group_thread_cnt,
7197 raid5_store_group_thread_cnt);
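/*
 * group_thread_cnt selects how many worker threads are created per
 * group (one group per NUMA node, see alloc_thread_groups() below).
 * For example (device name illustrative):
 *
 *   echo 4 > /sys/block/md0/md/group_thread_cnt
 *
 * creates four workers per group, while writing 0 disables the worker
 * pool so raid5d handles all stripes itself.  Values above 8192 are
 * rejected.
 */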
7198
7199 static struct attribute *raid5_attrs[] = {
7200 &raid5_stripecache_size.attr,
7201 &raid5_stripecache_active.attr,
7202 &raid5_preread_bypass_threshold.attr,
7203 &raid5_group_thread_cnt.attr,
7204 &raid5_skip_copy.attr,
7205 &raid5_rmw_level.attr,
7206 &raid5_stripe_size.attr,
7207 &r5c_journal_mode.attr,
7208 &ppl_write_hint.attr,
7209 NULL,
7210 };
7211 static const struct attribute_group raid5_attrs_group = {
7212 .name = NULL,
7213 .attrs = raid5_attrs,
7214 };
7215
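/*
 * alloc_thread_groups() sets up one r5worker_group per possible NUMA
 * node, each with 'cnt' workers and its own handle/loprio lists, so
 * that stripe handling can be spread across nodes.  A cnt of 0 means
 * no worker groups at all.
 */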
7216 static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
7217 struct r5worker_group **worker_groups)
7218 {
7219 int i, j, k;
7220 ssize_t size;
7221 struct r5worker *workers;
7222
7223 if (cnt == 0) {
7224 *group_cnt = 0;
7225 *worker_groups = NULL;
7226 return 0;
7227 }
7228 *group_cnt = num_possible_nodes();
7229 size = sizeof(struct r5worker) * cnt;
7230 workers = kcalloc(size, *group_cnt, GFP_NOIO);
7231 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
7232 GFP_NOIO);
7233 if (!*worker_groups || !workers) {
7234 kfree(workers);
7235 kfree(*worker_groups);
7236 return -ENOMEM;
7237 }
7238
7239 for (i = 0; i < *group_cnt; i++) {
7240 struct r5worker_group *group;
7241
7242 group = &(*worker_groups)[i];
7243 INIT_LIST_HEAD(&group->handle_list);
7244 INIT_LIST_HEAD(&group->loprio_list);
7245 group->conf = conf;
7246 group->workers = workers + i * cnt;
7247
7248 for (j = 0; j < cnt; j++) {
7249 struct r5worker *worker = group->workers + j;
7250 worker->group = group;
7251 INIT_WORK(&worker->work, raid5_do_work);
7252
7253 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
7254 INIT_LIST_HEAD(worker->temp_inactive_list + k);
7255 }
7256 }
7257
7258 return 0;
7259 }
7260
7261 static void free_thread_groups(struct r5conf *conf)
7262 {
7263 if (conf->worker_groups)
7264 kfree(conf->worker_groups[0].workers);
7265 kfree(conf->worker_groups);
7266 conf->worker_groups = NULL;
7267 }
7268
7269 static sector_t
7270 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7271 {
7272 struct r5conf *conf = mddev->private;
7273
7274 if (!sectors)
7275 sectors = mddev->dev_sectors;
7276 if (!raid_disks)
7277
7278 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7279
7280 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7281 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7282 return sectors * (raid_disks - conf->max_degraded);
7283 }
7284
7285 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7286 {
7287 safe_put_page(percpu->spare_page);
7288 percpu->spare_page = NULL;
7289 kvfree(percpu->scribble);
7290 percpu->scribble = NULL;
7291 }
7292
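/*
 * Each CPU gets a small amount of scratch memory: a spare page used
 * by the RAID6 parity/recovery paths and a "scribble" buffer used as
 * scratch space by the async XOR/syndrome routines.  Both are sized
 * for the larger of the old and new geometry so they remain valid
 * across a reshape.
 */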
7293 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7294 {
7295 if (conf->level == 6 && !percpu->spare_page) {
7296 percpu->spare_page = alloc_page(GFP_KERNEL);
7297 if (!percpu->spare_page)
7298 return -ENOMEM;
7299 }
7300
7301 if (scribble_alloc(percpu,
7302 max(conf->raid_disks,
7303 conf->previous_raid_disks),
7304 max(conf->chunk_sectors,
7305 conf->prev_chunk_sectors)
7306 / RAID5_STRIPE_SECTORS(conf))) {
7307 free_scratch_buffer(conf, percpu);
7308 return -ENOMEM;
7309 }
7310
7311 local_lock_init(&percpu->lock);
7312 return 0;
7313 }
7314
7315 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7316 {
7317 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7318
7319 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7320 return 0;
7321 }
7322
7323 static void raid5_free_percpu(struct r5conf *conf)
7324 {
7325 if (!conf->percpu)
7326 return;
7327
7328 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7329 free_percpu(conf->percpu);
7330 }
7331
7332 static void free_conf(struct r5conf *conf)
7333 {
7334 int i;
7335
7336 log_exit(conf);
7337
7338 unregister_shrinker(&conf->shrinker);
7339 free_thread_groups(conf);
7340 shrink_stripes(conf);
7341 raid5_free_percpu(conf);
7342 for (i = 0; i < conf->pool_size; i++)
7343 if (conf->disks[i].extra_page)
7344 put_page(conf->disks[i].extra_page);
7345 kfree(conf->disks);
7346 bioset_exit(&conf->bio_split);
7347 kfree(conf->stripe_hashtbl);
7348 kfree(conf->pending_data);
7349 kfree(conf);
7350 }
7351
7352 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7353 {
7354 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7355 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7356
7357 if (alloc_scratch_buffer(conf, percpu)) {
7358 pr_warn("%s: failed memory allocation for cpu%u\n",
7359 __func__, cpu);
7360 return -ENOMEM;
7361 }
7362 return 0;
7363 }
7364
7365 static int raid5_alloc_percpu(struct r5conf *conf)
7366 {
7367 int err = 0;
7368
7369 conf->percpu = alloc_percpu(struct raid5_percpu);
7370 if (!conf->percpu)
7371 return -ENOMEM;
7372
7373 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7374 if (!err) {
7375 conf->scribble_disks = max(conf->raid_disks,
7376 conf->previous_raid_disks);
7377 conf->scribble_sectors = max(conf->chunk_sectors,
7378 conf->prev_chunk_sectors);
7379 }
7380 return err;
7381 }
7382
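/*
 * Shrinker callbacks: raid5_cache_count() reports how many stripes
 * sit above min_nr_stripes, and raid5_cache_scan() drops up to
 * nr_to_scan of them, returning SHRINK_STOP when the cache mutex is
 * contended or no stripe can be freed.
 */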
7383 static unsigned long raid5_cache_scan(struct shrinker *shrink,
7384 struct shrink_control *sc)
7385 {
7386 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7387 unsigned long ret = SHRINK_STOP;
7388
7389 if (mutex_trylock(&conf->cache_size_mutex)) {
7390 ret = 0;
7391 while (ret < sc->nr_to_scan &&
7392 conf->max_nr_stripes > conf->min_nr_stripes) {
7393 if (drop_one_stripe(conf) == 0) {
7394 ret = SHRINK_STOP;
7395 break;
7396 }
7397 ret++;
7398 }
7399 mutex_unlock(&conf->cache_size_mutex);
7400 }
7401 return ret;
7402 }
7403
7404 static unsigned long raid5_cache_count(struct shrinker *shrink,
7405 struct shrink_control *sc)
7406 {
7407 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7408
7409 if (conf->max_nr_stripes < conf->min_nr_stripes)
7410
7411 return 0;
7412 return conf->max_nr_stripes - conf->min_nr_stripes;
7413 }
7414
7415 static struct r5conf *setup_conf(struct mddev *mddev)
7416 {
7417 struct r5conf *conf;
7418 int raid_disk, memory, max_disks;
7419 struct md_rdev *rdev;
7420 struct disk_info *disk;
7421 char pers_name[6];
7422 int i;
7423 int group_cnt;
7424 struct r5worker_group *new_group;
7425 int ret = -ENOMEM;
7426
7427 if (mddev->new_level != 5
7428 && mddev->new_level != 4
7429 && mddev->new_level != 6) {
7430 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7431 mdname(mddev), mddev->new_level);
7432 return ERR_PTR(-EIO);
7433 }
7434 if ((mddev->new_level == 5
7435 && !algorithm_valid_raid5(mddev->new_layout)) ||
7436 (mddev->new_level == 6
7437 && !algorithm_valid_raid6(mddev->new_layout))) {
7438 pr_warn("md/raid:%s: layout %d not supported\n",
7439 mdname(mddev), mddev->new_layout);
7440 return ERR_PTR(-EIO);
7441 }
7442 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7443 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7444 mdname(mddev), mddev->raid_disks);
7445 return ERR_PTR(-EINVAL);
7446 }
7447
7448 if (!mddev->new_chunk_sectors ||
7449 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7450 !is_power_of_2(mddev->new_chunk_sectors)) {
7451 pr_warn("md/raid:%s: invalid chunk size %d\n",
7452 mdname(mddev), mddev->new_chunk_sectors << 9);
7453 return ERR_PTR(-EINVAL);
7454 }
7455
7456 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7457 if (conf == NULL)
7458 goto abort;
7459
7460 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7461 conf->stripe_size = DEFAULT_STRIPE_SIZE;
7462 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7463 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7464 #endif
7465 INIT_LIST_HEAD(&conf->free_list);
7466 INIT_LIST_HEAD(&conf->pending_list);
7467 conf->pending_data = kcalloc(PENDING_IO_MAX,
7468 sizeof(struct r5pending_data),
7469 GFP_KERNEL);
7470 if (!conf->pending_data)
7471 goto abort;
7472 for (i = 0; i < PENDING_IO_MAX; i++)
7473 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7474
7475 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7476 conf->group_cnt = group_cnt;
7477 conf->worker_cnt_per_group = 0;
7478 conf->worker_groups = new_group;
7479 } else
7480 goto abort;
7481 spin_lock_init(&conf->device_lock);
7482 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7483 mutex_init(&conf->cache_size_mutex);
7484
7485 init_waitqueue_head(&conf->wait_for_quiescent);
7486 init_waitqueue_head(&conf->wait_for_stripe);
7487 init_waitqueue_head(&conf->wait_for_overlap);
7488 INIT_LIST_HEAD(&conf->handle_list);
7489 INIT_LIST_HEAD(&conf->loprio_list);
7490 INIT_LIST_HEAD(&conf->hold_list);
7491 INIT_LIST_HEAD(&conf->delayed_list);
7492 INIT_LIST_HEAD(&conf->bitmap_list);
7493 init_llist_head(&conf->released_stripes);
7494 atomic_set(&conf->active_stripes, 0);
7495 atomic_set(&conf->preread_active_stripes, 0);
7496 atomic_set(&conf->active_aligned_reads, 0);
7497 spin_lock_init(&conf->pending_bios_lock);
7498 conf->batch_bio_dispatch = true;
7499 rdev_for_each(rdev, mddev) {
7500 if (test_bit(Journal, &rdev->flags))
7501 continue;
7502 if (bdev_nonrot(rdev->bdev)) {
7503 conf->batch_bio_dispatch = false;
7504 break;
7505 }
7506 }
7507
7508 conf->bypass_threshold = BYPASS_THRESHOLD;
7509 conf->recovery_disabled = mddev->recovery_disabled - 1;
7510
7511 conf->raid_disks = mddev->raid_disks;
7512 if (mddev->reshape_position == MaxSector)
7513 conf->previous_raid_disks = mddev->raid_disks;
7514 else
7515 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7516 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7517
7518 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7519 GFP_KERNEL);
7520
7521 if (!conf->disks)
7522 goto abort;
7523
7524 for (i = 0; i < max_disks; i++) {
7525 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7526 if (!conf->disks[i].extra_page)
7527 goto abort;
7528 }
7529
7530 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7531 if (ret)
7532 goto abort;
7533 conf->mddev = mddev;
7534
7535 ret = -ENOMEM;
7536 conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7537 if (!conf->stripe_hashtbl)
7538 goto abort;
7539
7540
7541 /* hash_locks[0] is initialised separately so that it can be used
7542  * as the reference lock in the spin_lock_nest_lock() call in
7543  * lock_all_device_hash_locks_irq(), which convinces lockdep that
7544  * we know what we are doing. */
7545 spin_lock_init(conf->hash_locks);
7546 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7547 spin_lock_init(conf->hash_locks + i);
7548
7549 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7550 INIT_LIST_HEAD(conf->inactive_list + i);
7551
7552 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7553 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7554
7555 atomic_set(&conf->r5c_cached_full_stripes, 0);
7556 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7557 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7558 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7559 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7560 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7561
7562 conf->level = mddev->new_level;
7563 conf->chunk_sectors = mddev->new_chunk_sectors;
7564 ret = raid5_alloc_percpu(conf);
7565 if (ret)
7566 goto abort;
7567
7568 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7569
7570 ret = -EIO;
7571 rdev_for_each(rdev, mddev) {
7572 raid_disk = rdev->raid_disk;
7573 if (raid_disk >= max_disks
7574 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7575 continue;
7576 disk = conf->disks + raid_disk;
7577
7578 if (test_bit(Replacement, &rdev->flags)) {
7579 if (disk->replacement)
7580 goto abort;
7581 RCU_INIT_POINTER(disk->replacement, rdev);
7582 } else {
7583 if (disk->rdev)
7584 goto abort;
7585 RCU_INIT_POINTER(disk->rdev, rdev);
7586 }
7587
7588 if (test_bit(In_sync, &rdev->flags)) {
7589 pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7590 mdname(mddev), rdev->bdev, raid_disk);
7591 } else if (rdev->saved_raid_disk != raid_disk)
7592
7593 conf->fullsync = 1;
7594 }
7595
7596 conf->level = mddev->new_level;
7597 if (conf->level == 6) {
7598 conf->max_degraded = 2;
7599 if (raid6_call.xor_syndrome)
7600 conf->rmw_level = PARITY_ENABLE_RMW;
7601 else
7602 conf->rmw_level = PARITY_DISABLE_RMW;
7603 } else {
7604 conf->max_degraded = 1;
7605 conf->rmw_level = PARITY_ENABLE_RMW;
7606 }
7607 conf->algorithm = mddev->new_layout;
7608 conf->reshape_progress = mddev->reshape_position;
7609 if (conf->reshape_progress != MaxSector) {
7610 conf->prev_chunk_sectors = mddev->chunk_sectors;
7611 conf->prev_algo = mddev->layout;
7612 } else {
7613 conf->prev_chunk_sectors = conf->chunk_sectors;
7614 conf->prev_algo = conf->algorithm;
7615 }
7616
7617 conf->min_nr_stripes = NR_STRIPES;
7618 if (mddev->reshape_position != MaxSector) {
7619 int stripes = max_t(int,
7620 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7621 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7622 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7623 if (conf->min_nr_stripes != NR_STRIPES)
7624 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7625 mdname(mddev), conf->min_nr_stripes);
7626 }
7627 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7628 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7629 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7630 if (grow_stripes(conf, conf->min_nr_stripes)) {
7631 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7632 mdname(mddev), memory);
7633 ret = -ENOMEM;
7634 goto abort;
7635 } else
7636 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7637
7638
7639 /* Register a shrinker so the stripe cache can be trimmed under
7640  * memory pressure, but never below min_nr_stripes (see
7641  * raid5_cache_scan() and raid5_cache_count() above). */
7642 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7643 conf->shrinker.scan_objects = raid5_cache_scan;
7644 conf->shrinker.count_objects = raid5_cache_count;
7645 conf->shrinker.batch = 128;
7646 conf->shrinker.flags = 0;
7647 ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
7648 if (ret) {
7649 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7650 mdname(mddev));
7651 goto abort;
7652 }
7653
7654 sprintf(pers_name, "raid%d", mddev->new_level);
7655 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7656 if (!conf->thread) {
7657 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7658 mdname(mddev));
7659 ret = -ENOMEM;
7660 goto abort;
7661 }
7662
7663 return conf;
7664
7665 abort:
7666 if (conf)
7667 free_conf(conf);
7668 return ERR_PTR(ret);
7669 }
7670
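/*
 * only_parity() returns 1 when, for the given layout, the slot
 * 'raid_disk' holds nothing but parity blocks.  raid5_run() uses it
 * to decide whether an out-of-date device can be tolerated on a
 * dirty array: a device carrying only parity does not put data at
 * risk.
 */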
7671 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7672 {
7673 switch (algo) {
7674 case ALGORITHM_PARITY_0:
7675 if (raid_disk < max_degraded)
7676 return 1;
7677 break;
7678 case ALGORITHM_PARITY_N:
7679 if (raid_disk >= raid_disks - max_degraded)
7680 return 1;
7681 break;
7682 case ALGORITHM_PARITY_0_6:
7683 if (raid_disk == 0 ||
7684 raid_disk == raid_disks - 1)
7685 return 1;
7686 break;
7687 case ALGORITHM_LEFT_ASYMMETRIC_6:
7688 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7689 case ALGORITHM_LEFT_SYMMETRIC_6:
7690 case ALGORITHM_RIGHT_SYMMETRIC_6:
7691 if (raid_disk == raid_disks - 1)
7692 return 1;
7693 }
7694 return 0;
7695 }
7696
7697 static void raid5_set_io_opt(struct r5conf *conf)
7698 {
7699 blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7700 (conf->raid_disks - conf->max_degraded));
7701 }
7702
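/*
 * raid5_run(): bring the array up.  Roughly: validate any in-progress
 * reshape against the per-device data offsets, build (or reuse) the
 * r5conf, count working and dirty-parity devices, refuse to start a
 * dirty degraded array unless explicitly allowed, set the array size
 * and block-queue limits, and restart the reshape thread if a reshape
 * was interrupted.
 */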
7703 static int raid5_run(struct mddev *mddev)
7704 {
7705 struct r5conf *conf;
7706 int working_disks = 0;
7707 int dirty_parity_disks = 0;
7708 struct md_rdev *rdev;
7709 struct md_rdev *journal_dev = NULL;
7710 sector_t reshape_offset = 0;
7711 int i, ret = 0;
7712 long long min_offset_diff = 0;
7713 int first = 1;
7714
7715 if (acct_bioset_init(mddev)) {
7716 pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
7717 return -ENOMEM;
7718 }
7719
7720 if (mddev_init_writes_pending(mddev) < 0) {
7721 ret = -ENOMEM;
7722 goto exit_acct_set;
7723 }
7724
7725 if (mddev->recovery_cp != MaxSector)
7726 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7727 mdname(mddev));
7728
7729 rdev_for_each(rdev, mddev) {
7730 long long diff;
7731
7732 if (test_bit(Journal, &rdev->flags)) {
7733 journal_dev = rdev;
7734 continue;
7735 }
7736 if (rdev->raid_disk < 0)
7737 continue;
7738 diff = (rdev->new_data_offset - rdev->data_offset);
7739 if (first) {
7740 min_offset_diff = diff;
7741 first = 0;
7742 } else if (mddev->reshape_backwards &&
7743 diff < min_offset_diff)
7744 min_offset_diff = diff;
7745 else if (!mddev->reshape_backwards &&
7746 diff > min_offset_diff)
7747 min_offset_diff = diff;
7748 }
7749
7750 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7751 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7752 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7753 mdname(mddev));
7754 ret = -EINVAL;
7755 goto exit_acct_set;
7756 }
7757
7758 if (mddev->reshape_position != MaxSector) {
7759 /* Check that we can continue the reshape.
7760  * Difficulties arise if the stripe we would write to next is at
7761  * or after the stripe we would read from next.
7762  * For a reshape that changes the number of devices, this is only
7763  * possible for a very short time, and mdadm makes sure that time
7764  * appears to have passed before assembling the array; so we fail
7765  * if that time has not passed.
7766  * For a reshape that keeps the number of devices the same there
7767  * is no difficulty.
7768  */
7769
7770
7771 sector_t here_new, here_old;
7772 int old_disks;
7773 int max_degraded = (mddev->level == 6 ? 2 : 1);
7774 int chunk_sectors;
7775 int new_data_disks;
7776
7777 if (journal_dev) {
7778 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7779 mdname(mddev));
7780 ret = -EINVAL;
7781 goto exit_acct_set;
7782 }
7783
7784 if (mddev->new_level != mddev->level) {
7785 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7786 mdname(mddev));
7787 ret = -EINVAL;
7788 goto exit_acct_set;
7789 }
7790 old_disks = mddev->raid_disks - mddev->delta_disks;
7791
7792 /* reshape_position must be on a new-stripe boundary, and one
7793  * further up in the new geometry must map after here in the old
7794  * geometry.
7795  * If the chunk sizes differ, then as the reshape is performed in
7796  * units of the larger of the two, reshape_position needs to be on
7797  * a correspondingly larger multiple. */
7798 here_new = mddev->reshape_position;
7799 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7800 new_data_disks = mddev->raid_disks - max_degraded;
7801 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7802 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7803 mdname(mddev));
7804 ret = -EINVAL;
7805 goto exit_acct_set;
7806 }
7807 reshape_offset = here_new * chunk_sectors;
7808
7809 here_old = mddev->reshape_position;
7810 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7811
7812
7813 if (mddev->delta_disks == 0) {
7814 /* We cannot be sure it is safe to start an in-place reshape:
7815  * that is only safe if user space is monitoring progress and
7816  * taking constant backups.
7817  * mdadm always starts an array in this situation read-only so
7818  * that it can take control before any writes are allowed, so
7819  * just check for that.
7820  */
7821 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7822 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7823 ;
7824 else if (mddev->ro == 0) {
7825 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7826 mdname(mddev));
7827 ret = -EINVAL;
7828 goto exit_acct_set;
7829 }
7830 } else if (mddev->reshape_backwards
7831 ? (here_new * chunk_sectors + min_offset_diff <=
7832 here_old * chunk_sectors)
7833 : (here_new * chunk_sectors >=
7834 here_old * chunk_sectors + (-min_offset_diff))) {
7835
7836 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7837 mdname(mddev));
7838 ret = -EINVAL;
7839 goto exit_acct_set;
7840 }
7841 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7842
7843 } else {
7844 BUG_ON(mddev->level != mddev->new_level);
7845 BUG_ON(mddev->layout != mddev->new_layout);
7846 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7847 BUG_ON(mddev->delta_disks != 0);
7848 }
7849
7850 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7851 test_bit(MD_HAS_PPL, &mddev->flags)) {
7852 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7853 mdname(mddev));
7854 clear_bit(MD_HAS_PPL, &mddev->flags);
7855 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7856 }
7857
7858 if (mddev->private == NULL)
7859 conf = setup_conf(mddev);
7860 else
7861 conf = mddev->private;
7862
7863 if (IS_ERR(conf)) {
7864 ret = PTR_ERR(conf);
7865 goto exit_acct_set;
7866 }
7867
7868 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7869 if (!journal_dev) {
7870 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7871 mdname(mddev));
7872 mddev->ro = 1;
7873 set_disk_ro(mddev->gendisk, 1);
7874 } else if (mddev->recovery_cp == MaxSector)
7875 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7876 }
7877
7878 conf->min_offset_diff = min_offset_diff;
7879 mddev->thread = conf->thread;
7880 conf->thread = NULL;
7881 mddev->private = conf;
7882
7883 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7884 i++) {
7885 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
7886 if (!rdev && conf->disks[i].replacement) {
7887
7888 rdev = rdev_mdlock_deref(mddev,
7889 conf->disks[i].replacement);
7890 conf->disks[i].replacement = NULL;
7891 clear_bit(Replacement, &rdev->flags);
7892 rcu_assign_pointer(conf->disks[i].rdev, rdev);
7893 }
7894 if (!rdev)
7895 continue;
7896 if (rcu_access_pointer(conf->disks[i].replacement) &&
7897 conf->reshape_progress != MaxSector) {
7898
7899 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7900 goto abort;
7901 }
7902 if (test_bit(In_sync, &rdev->flags)) {
7903 working_disks++;
7904 continue;
7905 }
7906 /* This device is not fully in-sync.  However, if it has only
7907  * stored parity beyond its recovery_offset, a dirty array does
7908  * not put data at risk.
7909  * When a reshape runs backwards there are never partially
7910  * completed devices, so only reshape_offset needs checking.
7911  */
7912
7913
7914
7915 if (mddev->major_version == 0 &&
7916 mddev->minor_version > 90)
7917 rdev->recovery_offset = reshape_offset;
7918
7919 if (rdev->recovery_offset < reshape_offset) {
7920
7921 if (!only_parity(rdev->raid_disk,
7922 conf->algorithm,
7923 conf->raid_disks,
7924 conf->max_degraded))
7925 continue;
7926 }
7927 if (!only_parity(rdev->raid_disk,
7928 conf->prev_algo,
7929 conf->previous_raid_disks,
7930 conf->max_degraded))
7931 continue;
7932 dirty_parity_disks++;
7933 }
7934
7935
7936
7937
7938 mddev->degraded = raid5_calc_degraded(conf);
7939
7940 if (has_failed(conf)) {
7941 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7942 mdname(mddev), mddev->degraded, conf->raid_disks);
7943 goto abort;
7944 }
7945
7946
7947 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
7948 mddev->resync_max_sectors = mddev->dev_sectors;
7949
7950 if (mddev->degraded > dirty_parity_disks &&
7951 mddev->recovery_cp != MaxSector) {
7952 if (test_bit(MD_HAS_PPL, &mddev->flags))
7953 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7954 mdname(mddev));
7955 else if (mddev->ok_start_degraded)
7956 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7957 mdname(mddev));
7958 else {
7959 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7960 mdname(mddev));
7961 goto abort;
7962 }
7963 }
7964
7965 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7966 mdname(mddev), conf->level,
7967 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7968 mddev->new_layout);
7969
7970 print_raid5_conf(conf);
7971
7972 if (conf->reshape_progress != MaxSector) {
7973 conf->reshape_safe = conf->reshape_progress;
7974 atomic_set(&conf->reshape_stripes, 0);
7975 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7976 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7977 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7978 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7979 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7980 "reshape");
7981 if (!mddev->sync_thread)
7982 goto abort;
7983 }
7984
7985
7986 if (mddev->to_remove == &raid5_attrs_group)
7987 mddev->to_remove = NULL;
7988 else if (mddev->kobj.sd &&
7989 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7990 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7991 mdname(mddev));
7992 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7993
7994 if (mddev->queue) {
7995 int chunk_size;
7996
7997 /* 'stripe' below works out to one full stripe of data
7998  * (data_disks * chunk size); it is used to size the discard
7999  * granularity. */
8000 int data_disks = conf->previous_raid_disks - conf->max_degraded;
8001 int stripe = data_disks *
8002 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
8003
8004 chunk_size = mddev->chunk_sectors << 9;
8005 blk_queue_io_min(mddev->queue, chunk_size);
8006 raid5_set_io_opt(conf);
8007 mddev->queue->limits.raid_partial_stripes_expensive = 1;
8008
8009 /*
8010  * We can only discard whole stripes: discarding the data blocks
8011  * while leaving the parity in place would make no sense. */
8012 stripe = stripe * PAGE_SIZE;
8013 stripe = roundup_pow_of_two(stripe);
8014 mddev->queue->limits.discard_granularity = stripe;
8015
8016 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
8017
8018 rdev_for_each(rdev, mddev) {
8019 disk_stack_limits(mddev->gendisk, rdev->bdev,
8020 rdev->data_offset << 9);
8021 disk_stack_limits(mddev->gendisk, rdev->bdev,
8022 rdev->new_data_offset << 9);
8023 }
8024
8025 /*
8026  * Unless discarded regions reliably read back as zeroes, data can
8027  * be lost.  Consider this scenario: a stripe is discarded (and may
8028  * now be inconsistent if the discarded blocks are not zeroed); one
8029  * disk of the stripe is then written (the stripe may be
8030  * inconsistent again, depending on which disks are used to
8031  * calculate parity); finally that disk fails, and its stripe data
8032  * is unrecoverable.
8033  *
8034  * So DISCARD is only enabled when the administrator has confirmed,
8035  * via the devices_handle_discard_safely module parameter, that the
8036  * devices zero discarded regions, and when the queue limits show
8037  * that whole-stripe discards are possible.
8038  */
8039
8040 if (!devices_handle_discard_safely ||
8041 mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
8042 mddev->queue->limits.discard_granularity < stripe)
8043 blk_queue_max_discard_sectors(mddev->queue, 0);
8044
8045
8046 /* Requests require having a bitmap for each stripe.
8047  * Limit the max sectors based on this.
8048  */
8049 blk_queue_max_hw_sectors(mddev->queue,
8050 RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
8051
8052
8053 blk_queue_max_segments(mddev->queue, USHRT_MAX);
8054 }
8055
8056 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
8057 goto abort;
8058
8059 return 0;
8060 abort:
8061 md_unregister_thread(&mddev->thread);
8062 print_raid5_conf(conf);
8063 free_conf(conf);
8064 mddev->private = NULL;
8065 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8066 ret = -EIO;
8067 exit_acct_set:
8068 acct_bioset_exit(mddev);
8069 return ret;
8070 }
8071
8072 static void raid5_free(struct mddev *mddev, void *priv)
8073 {
8074 struct r5conf *conf = priv;
8075
8076 free_conf(conf);
8077 acct_bioset_exit(mddev);
8078 mddev->to_remove = &raid5_attrs_group;
8079 }
8080
8081 static void raid5_status(struct seq_file *seq, struct mddev *mddev)
8082 {
8083 struct r5conf *conf = mddev->private;
8084 int i;
8085
8086 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
8087 conf->chunk_sectors / 2, mddev->layout);
8088 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8089 rcu_read_lock();
8090 for (i = 0; i < conf->raid_disks; i++) {
8091 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
8092 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8093 }
8094 rcu_read_unlock();
8095 seq_printf (seq, "]");
8096 }
8097
8098 static void print_raid5_conf (struct r5conf *conf)
8099 {
8100 struct md_rdev *rdev;
8101 int i;
8102
8103 pr_debug("RAID conf printout:\n");
8104 if (!conf) {
8105 pr_debug("(conf==NULL)\n");
8106 return;
8107 }
8108 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8109 conf->raid_disks,
8110 conf->raid_disks - conf->mddev->degraded);
8111
8112 rcu_read_lock();
8113 for (i = 0; i < conf->raid_disks; i++) {
8114 rdev = rcu_dereference(conf->disks[i].rdev);
8115 if (rdev)
8116 pr_debug(" disk %d, o:%d, dev:%pg\n",
8117 i, !test_bit(Faulty, &rdev->flags),
8118 rdev->bdev);
8119 }
8120 rcu_read_unlock();
8121 }
8122
8123 static int raid5_spare_active(struct mddev *mddev)
8124 {
8125 int i;
8126 struct r5conf *conf = mddev->private;
8127 struct md_rdev *rdev, *replacement;
8128 int count = 0;
8129 unsigned long flags;
8130
8131 for (i = 0; i < conf->raid_disks; i++) {
8132 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
8133 replacement = rdev_mdlock_deref(mddev,
8134 conf->disks[i].replacement);
8135 if (replacement
8136 && replacement->recovery_offset == MaxSector
8137 && !test_bit(Faulty, &replacement->flags)
8138 && !test_and_set_bit(In_sync, &replacement->flags)) {
8139
8140 if (!rdev
8141 || !test_and_clear_bit(In_sync, &rdev->flags))
8142 count++;
8143 if (rdev) {
8144 /* The replaced device is not technically faulty, but we need
8145  * to be sure it gets removed and never re-added, so mark it
8146  * Faulty here and notify user space.
8147  */
8148 set_bit(Faulty, &rdev->flags);
8149 sysfs_notify_dirent_safe(
8150 rdev->sysfs_state);
8151 }
8152 sysfs_notify_dirent_safe(replacement->sysfs_state);
8153 } else if (rdev
8154 && rdev->recovery_offset == MaxSector
8155 && !test_bit(Faulty, &rdev->flags)
8156 && !test_and_set_bit(In_sync, &rdev->flags)) {
8157 count++;
8158 sysfs_notify_dirent_safe(rdev->sysfs_state);
8159 }
8160 }
8161 spin_lock_irqsave(&conf->device_lock, flags);
8162 mddev->degraded = raid5_calc_degraded(conf);
8163 spin_unlock_irqrestore(&conf->device_lock, flags);
8164 print_raid5_conf(conf);
8165 return count;
8166 }
8167
8168 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
8169 {
8170 struct r5conf *conf = mddev->private;
8171 int err = 0;
8172 int number = rdev->raid_disk;
8173 struct md_rdev __rcu **rdevp;
8174 struct disk_info *p;
8175 struct md_rdev *tmp;
8176
8177 print_raid5_conf(conf);
8178 if (test_bit(Journal, &rdev->flags) && conf->log) {
8179 /*
8180  * We can't wait for pending writes here, as this is called
8181  * from raid5d and waiting would deadlock.
8182  * Refuse to remove the journal while any stripes are still
8183  * cached or active.
8184  */
8185 if (atomic_read(&conf->active_stripes) ||
8186 atomic_read(&conf->r5c_cached_full_stripes) ||
8187 atomic_read(&conf->r5c_cached_partial_stripes)) {
8188 return -EBUSY;
8189 }
8190 log_exit(conf);
8191 return 0;
8192 }
8193 if (unlikely(number >= conf->pool_size))
8194 return 0;
8195 p = conf->disks + number;
8196 if (rdev == rcu_access_pointer(p->rdev))
8197 rdevp = &p->rdev;
8198 else if (rdev == rcu_access_pointer(p->replacement))
8199 rdevp = &p->replacement;
8200 else
8201 return 0;
8202
8203 if (number >= conf->raid_disks &&
8204 conf->reshape_progress == MaxSector)
8205 clear_bit(In_sync, &rdev->flags);
8206
8207 if (test_bit(In_sync, &rdev->flags) ||
8208 atomic_read(&rdev->nr_pending)) {
8209 err = -EBUSY;
8210 goto abort;
8211 }
8212
8213 /* Only remove non-faulty devices if recovery
8214  * is not possible. */
8215 if (!test_bit(Faulty, &rdev->flags) &&
8216 mddev->recovery_disabled != conf->recovery_disabled &&
8217 !has_failed(conf) &&
8218 (!rcu_access_pointer(p->replacement) ||
8219 rcu_access_pointer(p->replacement) == rdev) &&
8220 number < conf->raid_disks) {
8221 err = -EBUSY;
8222 goto abort;
8223 }
8224 *rdevp = NULL;
8225 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
8226 lockdep_assert_held(&mddev->reconfig_mutex);
8227 synchronize_rcu();
8228 if (atomic_read(&rdev->nr_pending)) {
8229
8230 err = -EBUSY;
8231 rcu_assign_pointer(*rdevp, rdev);
8232 }
8233 }
8234 if (!err) {
8235 err = log_modify(conf, rdev, false);
8236 if (err)
8237 goto abort;
8238 }
8239
8240 tmp = rcu_access_pointer(p->replacement);
8241 if (tmp) {
8242
8243 rcu_assign_pointer(p->rdev, tmp);
8244 clear_bit(Replacement, &tmp->flags);
8245 smp_mb();
8246
8247
8248 rcu_assign_pointer(p->replacement, NULL);
8249
8250 if (!err)
8251 err = log_modify(conf, tmp, true);
8252 }
8253
8254 clear_bit(WantReplacement, &rdev->flags);
8255 abort:
8256
8257 print_raid5_conf(conf);
8258 return err;
8259 }
8260
8261 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
8262 {
8263 struct r5conf *conf = mddev->private;
8264 int ret, err = -EEXIST;
8265 int disk;
8266 struct disk_info *p;
8267 struct md_rdev *tmp;
8268 int first = 0;
8269 int last = conf->raid_disks - 1;
8270
8271 if (test_bit(Journal, &rdev->flags)) {
8272 if (conf->log)
8273 return -EBUSY;
8274
8275 rdev->raid_disk = 0;
8276 /*
8277  * The array is read-only while its journal is missing, so no
8278  * write requests are running; it should be safe to bring the
8279  * journal back online here. */
8280 ret = log_init(conf, rdev, false);
8281 if (ret)
8282 return ret;
8283
8284 ret = r5l_start(conf->log);
8285 if (ret)
8286 return ret;
8287
8288 return 0;
8289 }
8290 if (mddev->recovery_disabled == conf->recovery_disabled)
8291 return -EBUSY;
8292
8293 if (rdev->saved_raid_disk < 0 && has_failed(conf))
8294
8295 return -EINVAL;
8296
8297 if (rdev->raid_disk >= 0)
8298 first = last = rdev->raid_disk;
8299
8300 /*
8301  * Find a free slot for the new device, but prefer
8302  * rdev->saved_raid_disk if it is still available.
8303  */
8304 if (rdev->saved_raid_disk >= first &&
8305 rdev->saved_raid_disk <= last &&
8306 conf->disks[rdev->saved_raid_disk].rdev == NULL)
8307 first = rdev->saved_raid_disk;
8308
8309 for (disk = first; disk <= last; disk++) {
8310 p = conf->disks + disk;
8311 if (p->rdev == NULL) {
8312 clear_bit(In_sync, &rdev->flags);
8313 rdev->raid_disk = disk;
8314 if (rdev->saved_raid_disk != disk)
8315 conf->fullsync = 1;
8316 rcu_assign_pointer(p->rdev, rdev);
8317
8318 err = log_modify(conf, rdev, true);
8319
8320 goto out;
8321 }
8322 }
8323 for (disk = first; disk <= last; disk++) {
8324 p = conf->disks + disk;
8325 tmp = rdev_mdlock_deref(mddev, p->rdev);
8326 if (test_bit(WantReplacement, &tmp->flags) &&
8327 p->replacement == NULL) {
8328 clear_bit(In_sync, &rdev->flags);
8329 set_bit(Replacement, &rdev->flags);
8330 rdev->raid_disk = disk;
8331 err = 0;
8332 conf->fullsync = 1;
8333 rcu_assign_pointer(p->replacement, rdev);
8334 break;
8335 }
8336 }
8337 out:
8338 print_raid5_conf(conf);
8339 return err;
8340 }
8341
8342 static int raid5_resize(struct mddev *mddev, sector_t sectors)
8343 {
8344 /* No resync is happening, and there is enough space on all
8345  * devices, so we can resize.
8346  * We need to make sure resync covers any new space.
8347  * If the array is shrinking we should possibly wait until any
8348  * IO in the removed space completes, but it hardly seems worth
8349  * it.
8350  */
8351 sector_t newsize;
8352 struct r5conf *conf = mddev->private;
8353
8354 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8355 return -EINVAL;
8356 sectors &= ~((sector_t)conf->chunk_sectors - 1);
8357 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8358 if (mddev->external_size &&
8359 mddev->array_sectors > newsize)
8360 return -EINVAL;
8361 if (mddev->bitmap) {
8362 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8363 if (ret)
8364 return ret;
8365 }
8366 md_set_array_sectors(mddev, newsize);
8367 if (sectors > mddev->dev_sectors &&
8368 mddev->recovery_cp > mddev->dev_sectors) {
8369 mddev->recovery_cp = mddev->dev_sectors;
8370 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8371 }
8372 mddev->dev_sectors = sectors;
8373 mddev->resync_max_sectors = sectors;
8374 return 0;
8375 }
8376
8377 static int check_stripe_cache(struct mddev *mddev)
8378 {
8379 /* Can only proceed if there are plenty of stripe_heads.
8380  * We need a minimum of one full stripe, and for sensible
8381  * progress it is best to have about four times that.
8382  * If we require four times, the default of 256 stripe_heads
8383  * may not be enough for a large chunk size, so the cache must
8384  * be allowed to grow before the reshape can start.
8385  */
8386
8387 struct r5conf *conf = mddev->private;
8388 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8389 > conf->min_nr_stripes ||
8390 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8391 > conf->min_nr_stripes) {
8392 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8393 mdname(mddev),
8394 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8395 / RAID5_STRIPE_SIZE(conf))*4);
8396 return 0;
8397 }
8398 return 1;
8399 }
8400
8401 static int check_reshape(struct mddev *mddev)
8402 {
8403 struct r5conf *conf = mddev->private;
8404
8405 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8406 return -EINVAL;
8407 if (mddev->delta_disks == 0 &&
8408 mddev->new_layout == mddev->layout &&
8409 mddev->new_chunk_sectors == mddev->chunk_sectors)
8410 return 0;
8411 if (has_failed(conf))
8412 return -EINVAL;
8413 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
8414 /* We might be able to shrink, but the devices must
8415  * be made bigger first.
8416  * For raid6, 4 is the minimum number of devices;
8417  * otherwise 2 is the minimum.
8418  */
8419 int min = 2;
8420 if (mddev->level == 6)
8421 min = 4;
8422 if (mddev->raid_disks + mddev->delta_disks < min)
8423 return -EINVAL;
8424 }
8425
8426 if (!check_stripe_cache(mddev))
8427 return -ENOSPC;
8428
8429 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8430 mddev->delta_disks > 0)
8431 if (resize_chunks(conf,
8432 conf->previous_raid_disks
8433 + max(0, mddev->delta_disks),
8434 max(mddev->new_chunk_sectors,
8435 mddev->chunk_sectors)
8436 ) < 0)
8437 return -ENOMEM;
8438
8439 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8440 return 0;
8441 return resize_stripes(conf, (conf->previous_raid_disks
8442 + mddev->delta_disks));
8443 }
8444
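/*
 * raid5_start_reshape(), sketched: verify that enough spares and
 * stripe_heads exist, publish the new geometry under
 * device_lock/gen_lock, suspend and resume the array so all IO sees
 * it, add spare devices when the disk count grows, and finally start
 * the "reshape" sync thread.  If that thread cannot be started, the
 * old geometry is restored and -EAGAIN is returned.
 */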
8445 static int raid5_start_reshape(struct mddev *mddev)
8446 {
8447 struct r5conf *conf = mddev->private;
8448 struct md_rdev *rdev;
8449 int spares = 0;
8450 unsigned long flags;
8451
8452 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8453 return -EBUSY;
8454
8455 if (!check_stripe_cache(mddev))
8456 return -ENOSPC;
8457
8458 if (has_failed(conf))
8459 return -EINVAL;
8460
8461 rdev_for_each(rdev, mddev) {
8462 if (!test_bit(In_sync, &rdev->flags)
8463 && !test_bit(Faulty, &rdev->flags))
8464 spares++;
8465 }
8466
8467 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8468 /* Not enough devices even to make a degraded array
8469  * of that size.
8470  */
8471 return -EINVAL;
8472
8473 /* Refuse to reduce the size of the array.  Any reduction in
8474  * array size must be made through an explicit setting of the
8475  * array_size attribute.
8476  */
8477 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8478 < mddev->array_sectors) {
8479 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8480 mdname(mddev));
8481 return -EINVAL;
8482 }
8483
8484 atomic_set(&conf->reshape_stripes, 0);
8485 spin_lock_irq(&conf->device_lock);
8486 write_seqcount_begin(&conf->gen_lock);
8487 conf->previous_raid_disks = conf->raid_disks;
8488 conf->raid_disks += mddev->delta_disks;
8489 conf->prev_chunk_sectors = conf->chunk_sectors;
8490 conf->chunk_sectors = mddev->new_chunk_sectors;
8491 conf->prev_algo = conf->algorithm;
8492 conf->algorithm = mddev->new_layout;
8493 conf->generation++;
8494
8495 /* Code that selects a data_offset needs to see the generation
8496  * update once reshape_progress is set, so a barrier is needed. */
8497 smp_mb();
8498 if (mddev->reshape_backwards)
8499 conf->reshape_progress = raid5_size(mddev, 0, 0);
8500 else
8501 conf->reshape_progress = 0;
8502 conf->reshape_safe = conf->reshape_progress;
8503 write_seqcount_end(&conf->gen_lock);
8504 spin_unlock_irq(&conf->device_lock);
8505
8506
8507
8508
8509
8510 mddev_suspend(mddev);
8511 mddev_resume(mddev);
8512
8513 /* Add as many new drives as will fit; we know there are enough
8514  * to make the newly sized array work.
8515  * Don't add devices when the number of devices is being
8516  * reduced: it is not possible to correctly record the "end" of
8517  * the reshape in that case, and hot removal is handled
8518  * elsewhere.
8519  */
8520 if (mddev->delta_disks >= 0) {
8521 rdev_for_each(rdev, mddev)
8522 if (rdev->raid_disk < 0 &&
8523 !test_bit(Faulty, &rdev->flags)) {
8524 if (raid5_add_disk(mddev, rdev) == 0) {
8525 if (rdev->raid_disk
8526 >= conf->previous_raid_disks)
8527 set_bit(In_sync, &rdev->flags);
8528 else
8529 rdev->recovery_offset = 0;
8530
8531
8532 sysfs_link_rdev(mddev, rdev);
8533 }
8534 } else if (rdev->raid_disk >= conf->previous_raid_disks
8535 && !test_bit(Faulty, &rdev->flags)) {
8536
8537 set_bit(In_sync, &rdev->flags);
8538 }
8539
8540 /* When a reshape changes the number of devices,
8541  * ->degraded is measured against the larger of the
8542  * old and new device counts.
8543  */
8544 spin_lock_irqsave(&conf->device_lock, flags);
8545 mddev->degraded = raid5_calc_degraded(conf);
8546 spin_unlock_irqrestore(&conf->device_lock, flags);
8547 }
8548 mddev->raid_disks = conf->raid_disks;
8549 mddev->reshape_position = conf->reshape_progress;
8550 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8551
8552 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8553 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8554 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8555 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8556 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8557 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8558 "reshape");
8559 if (!mddev->sync_thread) {
8560 mddev->recovery = 0;
8561 spin_lock_irq(&conf->device_lock);
8562 write_seqcount_begin(&conf->gen_lock);
8563 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8564 mddev->new_chunk_sectors =
8565 conf->chunk_sectors = conf->prev_chunk_sectors;
8566 mddev->new_layout = conf->algorithm = conf->prev_algo;
8567 rdev_for_each(rdev, mddev)
8568 rdev->new_data_offset = rdev->data_offset;
8569 smp_wmb();
8570 conf->generation --;
8571 conf->reshape_progress = MaxSector;
8572 mddev->reshape_position = MaxSector;
8573 write_seqcount_end(&conf->gen_lock);
8574 spin_unlock_irq(&conf->device_lock);
8575 return -EAGAIN;
8576 }
8577 conf->reshape_checkpoint = jiffies;
8578 md_wakeup_thread(mddev->sync_thread);
8579 md_new_event();
8580 return 0;
8581 }
8582
8583
8584 /* This is called from the reshape thread and should make any
8585  * changes needed in 'conf'. */
8586 static void end_reshape(struct r5conf *conf)
8587 {
8588
8589 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8590 struct md_rdev *rdev;
8591
8592 spin_lock_irq(&conf->device_lock);
8593 conf->previous_raid_disks = conf->raid_disks;
8594 md_finish_reshape(conf->mddev);
8595 smp_wmb();
8596 conf->reshape_progress = MaxSector;
8597 conf->mddev->reshape_position = MaxSector;
8598 rdev_for_each(rdev, conf->mddev)
8599 if (rdev->raid_disk >= 0 &&
8600 !test_bit(Journal, &rdev->flags) &&
8601 !test_bit(In_sync, &rdev->flags))
8602 rdev->recovery_offset = MaxSector;
8603 spin_unlock_irq(&conf->device_lock);
8604 wake_up(&conf->wait_for_overlap);
8605
8606 if (conf->mddev->queue)
8607 raid5_set_io_opt(conf);
8608 }
8609 }
8610
8611 /* This is called from the raid5d thread with mddev_lock held.
8612  * It makes config changes to the device.
8613  */
8614 static void raid5_finish_reshape(struct mddev *mddev)
8615 {
8616 struct r5conf *conf = mddev->private;
8617 struct md_rdev *rdev;
8618
8619 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8620
8621 if (mddev->delta_disks <= 0) {
8622 int d;
8623 spin_lock_irq(&conf->device_lock);
8624 mddev->degraded = raid5_calc_degraded(conf);
8625 spin_unlock_irq(&conf->device_lock);
8626 for (d = conf->raid_disks;
8627 d < conf->raid_disks - mddev->delta_disks;
8628 d++) {
8629 rdev = rdev_mdlock_deref(mddev,
8630 conf->disks[d].rdev);
8631 if (rdev)
8632 clear_bit(In_sync, &rdev->flags);
8633 rdev = rdev_mdlock_deref(mddev,
8634 conf->disks[d].replacement);
8635 if (rdev)
8636 clear_bit(In_sync, &rdev->flags);
8637 }
8638 }
8639 mddev->layout = conf->algorithm;
8640 mddev->chunk_sectors = conf->chunk_sectors;
8641 mddev->reshape_position = MaxSector;
8642 mddev->delta_disks = 0;
8643 mddev->reshape_backwards = 0;
8644 }
8645 }
8646
8647 static void raid5_quiesce(struct mddev *mddev, int quiesce)
8648 {
8649 struct r5conf *conf = mddev->private;
8650
8651 if (quiesce) {
8652 /* stop all writes */
8653 lock_all_device_hash_locks_irq(conf);
8654 /* '2' tells resync/reshape to pause so that all
8655  * active stripes can drain.
8656  */
8657 r5c_flush_cache(conf, INT_MAX);
8658 /* A memory barrier is needed so that read_one_chunk() sees that
8659  * quiesce has started and falls back to the slow (locked) path.
8660  */
8661 smp_store_release(&conf->quiesce, 2);
8662 wait_event_cmd(conf->wait_for_quiescent,
8663 atomic_read(&conf->active_stripes) == 0 &&
8664 atomic_read(&conf->active_aligned_reads) == 0,
8665 unlock_all_device_hash_locks_irq(conf),
8666 lock_all_device_hash_locks_irq(conf));
8667 conf->quiesce = 1;
8668 unlock_all_device_hash_locks_irq(conf);
8669 /* allow reshape to continue */
8670 wake_up(&conf->wait_for_overlap);
8671 } else {
8672 /* re-enable writes */
8673 lock_all_device_hash_locks_irq(conf);
8674 conf->quiesce = 0;
8675 wake_up(&conf->wait_for_quiescent);
8676 wake_up(&conf->wait_for_overlap);
8677 unlock_all_device_hash_locks_irq(conf);
8678 }
8679 log_quiesce(conf, quiesce);
8680 }
8681
8682 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8683 {
8684 struct r0conf *raid0_conf = mddev->private;
8685 sector_t sectors;
8686
8687 /* for raid0 takeover only one zone is supported */
8688 if (raid0_conf->nr_strip_zones > 1) {
8689 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8690 mdname(mddev));
8691 return ERR_PTR(-EINVAL);
8692 }
8693
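/* The single raid0 zone spans all members evenly, so the per-device
 * size is the zone end divided by the number of devices.
 */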
8694 sectors = raid0_conf->strip_zone[0].zone_end;
8695 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8696 mddev->dev_sectors = sectors;
8697 mddev->new_level = level;
8698 mddev->new_layout = ALGORITHM_PARITY_N;
8699 mddev->new_chunk_sectors = mddev->chunk_sectors;
8700 mddev->raid_disks += 1;
8701 mddev->delta_disks = 1;
8702 /* make sure the array will not be marked as dirty */
8703 mddev->recovery_cp = MaxSector;
8704
8705 return setup_conf(mddev);
8706 }
8707
8708 static void *raid5_takeover_raid1(struct mddev *mddev)
8709 {
8710 int chunksect;
8711 void *ret;
8712
8713 if (mddev->raid_disks != 2 ||
8714 mddev->degraded > 1)
8715 return ERR_PTR(-EINVAL);
8716
8717 /* raid1 has no chunk size, so pick one for the raid5 layout that
8718  * results from this takeover. */
8719 chunksect = 64*2; /* 64K by default */
8720
8721 /* The array size must be an exact multiple of the chunk size */
8722 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8723 chunksect >>= 1;
8724
8725 if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8726 /* array size does not allow a suitable chunk size */
8727 return ERR_PTR(-EINVAL);
8728
8729 mddev->new_level = 5;
8730 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8731 mddev->new_chunk_sectors = chunksect;
8732
8733 ret = setup_conf(mddev);
8734 if (!IS_ERR(ret))
8735 mddev_clear_unsupported_flags(mddev,
8736 UNSUPPORTED_MDDEV_FLAGS);
8737 return ret;
8738 }
8739
8740 static void *raid5_takeover_raid6(struct mddev *mddev)
8741 {
8742 int new_layout;
8743
8744 switch (mddev->layout) {
8745 case ALGORITHM_LEFT_ASYMMETRIC_6:
8746 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8747 break;
8748 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8749 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8750 break;
8751 case ALGORITHM_LEFT_SYMMETRIC_6:
8752 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8753 break;
8754 case ALGORITHM_RIGHT_SYMMETRIC_6:
8755 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8756 break;
8757 case ALGORITHM_PARITY_0_6:
8758 new_layout = ALGORITHM_PARITY_0;
8759 break;
8760 case ALGORITHM_PARITY_N:
8761 new_layout = ALGORITHM_PARITY_N;
8762 break;
8763 default:
8764 return ERR_PTR(-EINVAL);
8765 }
8766 mddev->new_level = 5;
8767 mddev->new_layout = new_layout;
8768 mddev->delta_disks = -1;
8769 mddev->raid_disks -= 1;
8770 return setup_conf(mddev);
8771 }
8772
8773 static int raid5_check_reshape(struct mddev *mddev)
8774 {
8775 /* For a 2-drive array, the layout and chunk size can be changed
8776  * immediately as no restriping is needed.
8777  * For larger arrays we record the new value - after validation -
8778  * to be used by a reshape pass.
8779  */
8780 struct r5conf *conf = mddev->private;
8781 int new_chunk = mddev->new_chunk_sectors;
8782
8783 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8784 return -EINVAL;
8785 if (new_chunk > 0) {
8786 if (!is_power_of_2(new_chunk))
8787 return -EINVAL;
8788 if (new_chunk < (PAGE_SIZE>>9))
8789 return -EINVAL;
8790 if (mddev->array_sectors & (new_chunk-1))
8791 /* not a factor of the array size */
8792 return -EINVAL;
8793 }
8794
8795 /* The new layout and chunk size look valid */
8796
8797 if (mddev->raid_disks == 2) {
8798 /* can make the change immediately */
8799 if (mddev->new_layout >= 0) {
8800 conf->algorithm = mddev->new_layout;
8801 mddev->layout = mddev->new_layout;
8802 }
8803 if (new_chunk > 0) {
8804 conf->chunk_sectors = new_chunk;
8805 mddev->chunk_sectors = new_chunk;
8806 }
8807 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8808 md_wakeup_thread(mddev->thread);
8809 }
8810 return check_reshape(mddev);
8811 }
8812
8813 static int raid6_check_reshape(struct mddev *mddev)
8814 {
8815 int new_chunk = mddev->new_chunk_sectors;
8816
8817 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8818 return -EINVAL;
8819 if (new_chunk > 0) {
8820 if (!is_power_of_2(new_chunk))
8821 return -EINVAL;
8822 if (new_chunk < (PAGE_SIZE >> 9))
8823 return -EINVAL;
8824 if (mddev->array_sectors & (new_chunk-1))
8825 /* not a factor of the array size */
8826 return -EINVAL;
8827 }
8828
8829 /* They look valid */
8830 return check_reshape(mddev);
8831 }
8832
8833 static void *raid5_takeover(struct mddev *mddev)
8834 {
8835 /* raid5 can take over:
8836  *  raid0 - if there is only one strip zone - make it a raid4 layout
8837  *  raid1 - if there are exactly two drives; a chunk size must be chosen
8838  *  raid4 - trivial - just use a raid4 layout
8839  *  raid6 - providing it is a *_6 layout
8840  */
8841 if (mddev->level == 0)
8842 return raid45_takeover_raid0(mddev, 5);
8843 if (mddev->level == 1)
8844 return raid5_takeover_raid1(mddev);
8845 if (mddev->level == 4) {
8846 mddev->new_layout = ALGORITHM_PARITY_N;
8847 mddev->new_level = 5;
8848 return setup_conf(mddev);
8849 }
8850 if (mddev->level == 6)
8851 return raid5_takeover_raid6(mddev);
8852
8853 return ERR_PTR(-EINVAL);
8854 }
8855
8856 static void *raid4_takeover(struct mddev *mddev)
8857 {
8858 /* raid4 can take over:
8859  *  raid0 - if there is only one strip zone
8860  *  raid5 - if the layout is ALGORITHM_PARITY_N
8861  */
8862 if (mddev->level == 0)
8863 return raid45_takeover_raid0(mddev, 4);
8864 if (mddev->level == 5 &&
8865 mddev->layout == ALGORITHM_PARITY_N) {
8866 mddev->new_layout = 0;
8867 mddev->new_level = 4;
8868 return setup_conf(mddev);
8869 }
8870 return ERR_PTR(-EINVAL);
8871 }
8872
8873 static struct md_personality raid5_personality;
8874
8875 static void *raid6_takeover(struct mddev *mddev)
8876 {
8877 /* Currently can only take over a raid5.  We map the
8878  * personality to an equivalent raid6 personality
8879  * with the Q block at the end.
8880  */
8881 int new_layout;
8882
8883 if (mddev->pers != &raid5_personality)
8884 return ERR_PTR(-EINVAL);
8885 if (mddev->degraded > 1)
8886 return ERR_PTR(-EINVAL);
8887 if (mddev->raid_disks > 253)
8888 return ERR_PTR(-EINVAL);
8889 if (mddev->raid_disks < 3)
8890 return ERR_PTR(-EINVAL);
8891
8892 switch (mddev->layout) {
8893 case ALGORITHM_LEFT_ASYMMETRIC:
8894 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8895 break;
8896 case ALGORITHM_RIGHT_ASYMMETRIC:
8897 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8898 break;
8899 case ALGORITHM_LEFT_SYMMETRIC:
8900 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8901 break;
8902 case ALGORITHM_RIGHT_SYMMETRIC:
8903 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8904 break;
8905 case ALGORITHM_PARITY_0:
8906 new_layout = ALGORITHM_PARITY_0_6;
8907 break;
8908 case ALGORITHM_PARITY_N:
8909 new_layout = ALGORITHM_PARITY_N;
8910 break;
8911 default:
8912 return ERR_PTR(-EINVAL);
8913 }
8914 mddev->new_level = 6;
8915 mddev->new_layout = new_layout;
8916 mddev->delta_disks = 1;
8917 mddev->raid_disks += 1;
8918 return setup_conf(mddev);
8919 }
8920
8921 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8922 {
8923 struct r5conf *conf;
8924 int err;
8925
8926 err = mddev_lock(mddev);
8927 if (err)
8928 return err;
8929 conf = mddev->private;
8930 if (!conf) {
8931 mddev_unlock(mddev);
8932 return -ENODEV;
8933 }
8934
8935 if (strncmp(buf, "ppl", 3) == 0) {
8936 /* ppl only works with RAID 5 */
8937 if (!raid5_has_ppl(conf) && conf->level == 5) {
8938 err = log_init(conf, NULL, true);
8939 if (!err) {
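/* Re-allocate the stripe cache so each stripe_head carries the extra
 * page used for its partial parity log entry.
 */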
8940 err = resize_stripes(conf, conf->pool_size);
8941 if (err) {
8942 mddev_suspend(mddev);
8943 log_exit(conf);
8944 mddev_resume(mddev);
8945 }
8946 }
8947 } else
8948 err = -EINVAL;
8949 } else if (strncmp(buf, "resync", 6) == 0) {
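/* Switching back to plain resync: tear down PPL if it is active, or
 * drop a failed journal once no journal device remains.
 */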
8950 if (raid5_has_ppl(conf)) {
8951 mddev_suspend(mddev);
8952 log_exit(conf);
8953 mddev_resume(mddev);
8954 err = resize_stripes(conf, conf->pool_size);
8955 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8956 r5l_log_disk_error(conf)) {
8957 bool journal_dev_exists = false;
8958 struct md_rdev *rdev;
8959
8960 rdev_for_each(rdev, mddev)
8961 if (test_bit(Journal, &rdev->flags)) {
8962 journal_dev_exists = true;
8963 break;
8964 }
8965
8966 if (!journal_dev_exists) {
8967 mddev_suspend(mddev);
8968 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8969 mddev_resume(mddev);
8970 } else
8971 err = -EBUSY;
8972 } else
8973 err = -EINVAL;
8974 } else {
8975 err = -EINVAL;
8976 }
8977
8978 if (!err)
8979 md_update_sb(mddev, 1);
8980
8981 mddev_unlock(mddev);
8982
8983 return err;
8984 }
8985
8986 static int raid5_start(struct mddev *mddev)
8987 {
8988 struct r5conf *conf = mddev->private;
8989
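/* Start the write journal if one is configured; this is a no-op for
 * arrays without a journal device.
 */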
8990 return r5l_start(conf->log);
8991 }
8992
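/* raid4, raid5 and raid6 share one implementation; the personalities
 * below differ only in level, reshape checking and takeover handling.
 */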
8993 static struct md_personality raid6_personality =
8994 {
8995 .name = "raid6",
8996 .level = 6,
8997 .owner = THIS_MODULE,
8998 .make_request = raid5_make_request,
8999 .run = raid5_run,
9000 .start = raid5_start,
9001 .free = raid5_free,
9002 .status = raid5_status,
9003 .error_handler = raid5_error,
9004 .hot_add_disk = raid5_add_disk,
9005 .hot_remove_disk = raid5_remove_disk,
9006 .spare_active = raid5_spare_active,
9007 .sync_request = raid5_sync_request,
9008 .resize = raid5_resize,
9009 .size = raid5_size,
9010 .check_reshape = raid6_check_reshape,
9011 .start_reshape = raid5_start_reshape,
9012 .finish_reshape = raid5_finish_reshape,
9013 .quiesce = raid5_quiesce,
9014 .takeover = raid6_takeover,
9015 .change_consistency_policy = raid5_change_consistency_policy,
9016 };
9017 static struct md_personality raid5_personality =
9018 {
9019 .name = "raid5",
9020 .level = 5,
9021 .owner = THIS_MODULE,
9022 .make_request = raid5_make_request,
9023 .run = raid5_run,
9024 .start = raid5_start,
9025 .free = raid5_free,
9026 .status = raid5_status,
9027 .error_handler = raid5_error,
9028 .hot_add_disk = raid5_add_disk,
9029 .hot_remove_disk = raid5_remove_disk,
9030 .spare_active = raid5_spare_active,
9031 .sync_request = raid5_sync_request,
9032 .resize = raid5_resize,
9033 .size = raid5_size,
9034 .check_reshape = raid5_check_reshape,
9035 .start_reshape = raid5_start_reshape,
9036 .finish_reshape = raid5_finish_reshape,
9037 .quiesce = raid5_quiesce,
9038 .takeover = raid5_takeover,
9039 .change_consistency_policy = raid5_change_consistency_policy,
9040 };
9041
9042 static struct md_personality raid4_personality =
9043 {
9044 .name = "raid4",
9045 .level = 4,
9046 .owner = THIS_MODULE,
9047 .make_request = raid5_make_request,
9048 .run = raid5_run,
9049 .start = raid5_start,
9050 .free = raid5_free,
9051 .status = raid5_status,
9052 .error_handler = raid5_error,
9053 .hot_add_disk = raid5_add_disk,
9054 .hot_remove_disk = raid5_remove_disk,
9055 .spare_active = raid5_spare_active,
9056 .sync_request = raid5_sync_request,
9057 .resize = raid5_resize,
9058 .size = raid5_size,
9059 .check_reshape = raid5_check_reshape,
9060 .start_reshape = raid5_start_reshape,
9061 .finish_reshape = raid5_finish_reshape,
9062 .quiesce = raid5_quiesce,
9063 .takeover = raid4_takeover,
9064 .change_consistency_policy = raid5_change_consistency_policy,
9065 };
9066
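/* Module init: create the shared stripe-handling workqueue, register
 * the CPU hotplug callbacks, then register the three personalities.
 */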
9067 static int __init raid5_init(void)
9068 {
9069 int ret;
9070
9071 raid5_wq = alloc_workqueue("raid5wq",
9072 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
9073 if (!raid5_wq)
9074 return -ENOMEM;
9075
9076 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
9077 "md/raid5:prepare",
9078 raid456_cpu_up_prepare,
9079 raid456_cpu_dead);
9080 if (ret) {
9081 destroy_workqueue(raid5_wq);
9082 return ret;
9083 }
9084 register_md_personality(&raid6_personality);
9085 register_md_personality(&raid5_personality);
9086 register_md_personality(&raid4_personality);
9087 return 0;
9088 }
9089
9090 static void raid5_exit(void)
9091 {
9092 unregister_md_personality(&raid6_personality);
9093 unregister_md_personality(&raid5_personality);
9094 unregister_md_personality(&raid4_personality);
9095 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
9096 destroy_workqueue(raid5_wq);
9097 }
9098
9099 module_init(raid5_init);
9100 module_exit(raid5_exit);
9101 MODULE_LICENSE("GPL");
9102 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9103 MODULE_ALIAS("md-personality-4"); /* RAID5 */
9104 MODULE_ALIAS("md-raid5");
9105 MODULE_ALIAS("md-raid4");
9106 MODULE_ALIAS("md-level-5");
9107 MODULE_ALIAS("md-level-4");
9108 MODULE_ALIAS("md-personality-8"); /* RAID6 */
9109 MODULE_ALIAS("md-raid6");
9110 MODULE_ALIAS("md-level-6");
9111
9112 /* These aliases remain because raid5 and raid6 used to be separate modules */
9113 MODULE_ALIAS("raid5");
9114 MODULE_ALIAS("raid6");