0005
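/*
 * raid5-cache: write-ahead journal and write-back cache for RAID 4/5/6.
 * Stripe data/parity is logged to a dedicated journal device before (or,
 * in write-back mode, instead of) being written to the member disks.
 */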
0006 #include <linux/kernel.h>
0007 #include <linux/wait.h>
0008 #include <linux/blkdev.h>
0009 #include <linux/slab.h>
0010 #include <linux/raid/md_p.h>
0011 #include <linux/crc32c.h>
0012 #include <linux/random.h>
0013 #include <linux/kthread.h>
0014 #include <linux/types.h>
0015 #include "md.h"
0016 #include "raid5.h"
0017 #include "md-bitmap.h"
0018 #include "raid5-log.h"
0019
0020
0021
0022
0023
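/*
 * metadata/data stored on disk with 4k size unit (a block) and
 * aligned to a 4k boundary
 */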
0024 #define BLOCK_SECTORS (8)
0025 #define BLOCK_SECTOR_SHIFT (3)
0026
0027
0028
0029
0030
0031
0032
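/*
 * log->max_free_space is min(1/4 of the log device size, 10GB of
 * reclaimable space, in sectors).  In write-through mode, reclaim runs
 * whenever this much space accumulates, which keeps recovery scans short.
 */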
0033 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2)
0034 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
0035
0036
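/* wake up reclaim thread periodically */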
0037 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
0038
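/* start flush with these full stripes */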
0039 #define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
0040
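/* reclaim stripes in groups */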
0041 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
0042
0043
0044
0045
0046
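/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */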
0047 #define R5L_POOL_SIZE 4
0048
0049 static char *r5c_journal_mode_str[] = {"write-through",
0050 "write-back"};
0051
0081
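/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For a write-back journal, a stripe enters caching phase on write and
 * r5c_make_stripe_write_out() kicks off the writing-out phase by clearing
 * STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks: all writes are
 * committed from the log device and the I/O is returned.  Stripes in
 * writing-out phase calculate parity, write pending data and parity to
 * the journal, and then write data and parity to the raid disks.
 */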
0082 struct r5l_log {
0083 struct md_rdev *rdev;
0084
0085 u32 uuid_checksum;
0086
0087 sector_t device_size;
0088
0089 sector_t max_free_space;
0090
0091
0092 sector_t last_checkpoint;
0093
0094 u64 last_cp_seq;
0095
0096 sector_t log_start;
0097 u64 seq;
0098
0099 sector_t next_checkpoint;
0100
0101 struct mutex io_mutex;
0102 struct r5l_io_unit *current_io;
0103
0104 spinlock_t io_list_lock;
0105 struct list_head running_ios;
0106
0107
0108 struct list_head io_end_ios;
0109
0110
0111 struct list_head flushing_ios;
0112
0113 struct list_head finished_ios;
0114 struct bio flush_bio;
0115
0116 struct list_head no_mem_stripes;
0117
0118 struct kmem_cache *io_kc;
0119 mempool_t io_pool;
0120 struct bio_set bs;
0121 mempool_t meta_pool;
0122
0123 struct md_thread *reclaim_thread;
0124 unsigned long reclaim_target;
0125
0126
0127
0128
0129
0130
0131 wait_queue_head_t iounit_wait;
0132
0133 struct list_head no_space_stripes;
0134 spinlock_t no_space_stripes_lock;
0135
0136 bool need_cache_flush;
0137
0138
0139 enum r5c_journal_mode r5c_journal_mode;
0140
0141
0142 struct list_head stripe_in_journal_list;
0143
0144 spinlock_t stripe_in_journal_lock;
0145 atomic_t stripe_in_journal_count;
0146
0147
0148 struct work_struct deferred_io_work;
0149
0150 struct work_struct disable_writeback_work;
0151
0152
0153 spinlock_t tree_lock;
0154 struct radix_tree_root big_stripe_tree;
0155 };
0156
0187
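/*
 * big_stripe_tree enables chunk_aligned_read() with the write-back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB chunk
 * contains 64 4kB-page stripes).  For chunk_aligned_read, these stripes are
 * grouped into one "big_stripe".  For each big_stripe we count how many of
 * its stripes are in the write-back cache; the counts live in a radix tree
 * (big_stripe_tree) keyed by r5c_tree_index().  chunk_aligned_read() aborts
 * when the big_stripe of the chunk is found in the tree.  Because the radix
 * tree reserves the low bits of each slot, the counter is stored shifted
 * left by R5C_RADIX_COUNT_SHIFT.
 */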
0188 #define R5C_RADIX_COUNT_SHIFT 2
0189
0190
0191
0192
0193
0194
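/*
 * calculate the key of the big_stripe covering sector 'sect' in
 * big_stripe_tree (key = sect / chunk_sectors)
 */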
0195 static inline sector_t r5c_tree_index(struct r5conf *conf,
0196 sector_t sect)
0197 {
0198 sector_div(sect, conf->chunk_sectors);
0199 return sect;
0200 }
0201
0202
0203
0204
0205
0206
0207
0208
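/*
 * An I/O range starts with a meta-data block and ends at the next meta-data
 * block.  The io_unit's meta block tracks the data/parity pages that follow
 * it.  io_units are written to the log device with plain writes: the log
 * disk cache is always flushed before stripes are moved to the raid disks,
 * so the io_unit itself does not need FLUSH/FUA.
 */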
0209 struct r5l_io_unit {
0210 struct r5l_log *log;
0211
0212 struct page *meta_page;
0213 int meta_offset;
0214
0215 struct bio *current_bio;
0216
0217 atomic_t pending_stripe;
0218 u64 seq;
0219 sector_t log_start;
0220 sector_t log_end;
0221 struct list_head log_sibling;
0222 struct list_head stripe_list;
0223
0224 int state;
0225 bool need_split_bio;
0226 struct bio *split_bio;
0227
0228 unsigned int has_flush:1;
0229 unsigned int has_fua:1;
0230 unsigned int has_null_flush:1;
0231 unsigned int has_flush_payload:1;
0232
0233
0234
0235
0236 unsigned int io_deferred:1;
0237
0238 struct bio_list flush_barriers;
0239 };
0240
0241
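/*
 * r5l_io_unit state: RUNNING accepts new I/O, IO_START means the bio is
 * being written to the log, IO_END means the log write has finished, and
 * STRIPE_END means the stripe data has also reached the raid disks.
 */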
0242 enum r5l_io_unit_state {
0243 IO_UNIT_RUNNING = 0,
0244 IO_UNIT_IO_START = 1,
0245
0246 IO_UNIT_IO_END = 2,
0247 IO_UNIT_STRIPE_END = 3,
0248 };
0249
0250 bool r5c_is_writeback(struct r5l_log *log)
0251 {
0252 return (log != NULL &&
0253 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
0254 }
0255
0256 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
0257 {
0258 start += inc;
0259 if (start >= log->device_size)
0260 start = start - log->device_size;
0261 return start;
0262 }
0263
0264 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
0265 sector_t end)
0266 {
0267 if (end >= start)
0268 return end - start;
0269 else
0270 return end + log->device_size - start;
0271 }
0272
0273 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
0274 {
0275 sector_t used_size;
0276
0277 used_size = r5l_ring_distance(log, log->last_checkpoint,
0278 log->log_start);
0279
0280 return log->device_size > used_size + size;
0281 }
0282
0283 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
0284 enum r5l_io_unit_state state)
0285 {
0286 if (WARN_ON(io->state >= state))
0287 return;
0288 io->state = state;
0289 }
0290
0291 static void
0292 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
0293 {
0294 struct bio *wbi, *wbi2;
0295
0296 wbi = dev->written;
0297 dev->written = NULL;
0298 while (wbi && wbi->bi_iter.bi_sector <
0299 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
0300 wbi2 = r5_next_bio(conf, wbi, dev->sector);
0301 md_write_end(conf->mddev);
0302 bio_endio(wbi);
0303 wbi = wbi2;
0304 }
0305 }
0306
0307 void r5c_handle_cached_data_endio(struct r5conf *conf,
0308 struct stripe_head *sh, int disks)
0309 {
0310 int i;
0311
0312 for (i = sh->disks; i--; ) {
0313 if (sh->dev[i].written) {
0314 set_bit(R5_UPTODATE, &sh->dev[i].flags);
0315 r5c_return_dev_pending_writes(conf, &sh->dev[i]);
0316 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
0317 RAID5_STRIPE_SECTORS(conf),
0318 !test_bit(STRIPE_DEGRADED, &sh->state),
0319 0);
0320 }
0321 }
0322 }
0323
0324 void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
0325
0326
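/*
 * Check whether we should flush some stripes to free up stripe cache space.
 * Reclaim is woken when more than half of min_nr_stripes are cached or an
 * inactive list has run empty.
 */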
0327 void r5c_check_stripe_cache_usage(struct r5conf *conf)
0328 {
0329 int total_cached;
0330
0331 if (!r5c_is_writeback(conf->log))
0332 return;
0333
0334 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
0335 atomic_read(&conf->r5c_cached_full_stripes);
0336
0337
0338
0339
0340
0341
0342
0343
0344
0345 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
0346 atomic_read(&conf->empty_inactive_list_nr) > 0)
0347 r5l_wake_reclaim(conf->log, 0);
0348 }
0349
0350
0351
0352
0353
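/*
 * wake up reclaim when the number of cached full stripes reaches
 * R5C_FULL_STRIPE_FLUSH_BATCH or a full chunk's worth of stripes,
 * whichever is smaller
 */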
0354 void r5c_check_cached_full_stripe(struct r5conf *conf)
0355 {
0356 if (!r5c_is_writeback(conf->log))
0357 return;
0358
0359
0360
0361
0362
0363 if (atomic_read(&conf->r5c_cached_full_stripes) >=
0364 min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
0365 conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
0366 r5l_wake_reclaim(conf->log, 0);
0367 }
0368
0396
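/*
 * Total log space (in sectors) needed to flush all data to the raid disks.
 *
 * To avoid deadlock on log space, enough space must stay reserved to write
 * out every stripe currently cached in the journal: parity plus a meta
 * block for each stripe on stripe_in_journal_list, and a full stripe of
 * data for each stripe-handling group that may be in flight.
 */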
0397 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
0398 {
0399 struct r5l_log *log = conf->log;
0400
0401 if (!r5c_is_writeback(log))
0402 return 0;
0403
0404 return BLOCK_SECTORS *
0405 ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
0406 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
0407 }
0408
0409
0410
0411
0412
0413
0414
0415
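/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_CRITICAL is set when free space on the log device drops below
 * 2x the space required to flush the cache; R5C_LOG_TIGHT is set below 3x.
 */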
0416 static inline void r5c_update_log_state(struct r5l_log *log)
0417 {
0418 struct r5conf *conf = log->rdev->mddev->private;
0419 sector_t free_space;
0420 sector_t reclaim_space;
0421 bool wake_reclaim = false;
0422
0423 if (!r5c_is_writeback(log))
0424 return;
0425
0426 free_space = r5l_ring_distance(log, log->log_start,
0427 log->last_checkpoint);
0428 reclaim_space = r5c_log_required_to_flush_cache(conf);
0429 if (free_space < 2 * reclaim_space)
0430 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
0431 else {
0432 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
0433 wake_reclaim = true;
0434 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
0435 }
0436 if (free_space < 3 * reclaim_space)
0437 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
0438 else
0439 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
0440
0441 if (wake_reclaim)
0442 r5l_wake_reclaim(log, 0);
0443 }
0444
0445
0446
0447
0448
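/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This must only be called in write-back mode.
 */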
0449 void r5c_make_stripe_write_out(struct stripe_head *sh)
0450 {
0451 struct r5conf *conf = sh->raid_conf;
0452 struct r5l_log *log = conf->log;
0453
0454 BUG_ON(!r5c_is_writeback(log));
0455
0456 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
0457 clear_bit(STRIPE_R5C_CACHING, &sh->state);
0458
0459 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
0460 atomic_inc(&conf->preread_active_stripes);
0461 }
0462
0463 static void r5c_handle_data_cached(struct stripe_head *sh)
0464 {
0465 int i;
0466
0467 for (i = sh->disks; i--; )
0468 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
0469 set_bit(R5_InJournal, &sh->dev[i].flags);
0470 clear_bit(R5_LOCKED, &sh->dev[i].flags);
0471 }
0472 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
0473 }
0474
0475
0476
0477
0478
0479 static void r5c_handle_parity_cached(struct stripe_head *sh)
0480 {
0481 int i;
0482
0483 for (i = sh->disks; i--; )
0484 if (test_bit(R5_InJournal, &sh->dev[i].flags))
0485 set_bit(R5_Wantwrite, &sh->dev[i].flags);
0486 }
0487
0488
0489
0490
0491
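/*
 * Set proper flags after writing (or flushing) data and/or parity to the
 * log device.  In write-through mode the whole stripe (data and parity) is
 * in the journal at this point, so R5_InJournal is set on the parity dev
 * to mark the stripe ready for the raid-disk write.
 */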
0492 static void r5c_finish_cache_stripe(struct stripe_head *sh)
0493 {
0494 struct r5l_log *log = sh->raid_conf->log;
0495
0496 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
0497 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
0498
0499
0500
0501
0502
0503
0504 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
0505 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
0506 r5c_handle_data_cached(sh);
0507 } else {
0508 r5c_handle_parity_cached(sh);
0509 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
0510 }
0511 }
0512
0513 static void r5l_io_run_stripes(struct r5l_io_unit *io)
0514 {
0515 struct stripe_head *sh, *next;
0516
0517 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
0518 list_del_init(&sh->log_list);
0519
0520 r5c_finish_cache_stripe(sh);
0521
0522 set_bit(STRIPE_HANDLE, &sh->state);
0523 raid5_release_stripe(sh);
0524 }
0525 }
0526
0527 static void r5l_log_run_stripes(struct r5l_log *log)
0528 {
0529 struct r5l_io_unit *io, *next;
0530
0531 lockdep_assert_held(&log->io_list_lock);
0532
0533 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
0534
0535 if (io->state < IO_UNIT_IO_END)
0536 break;
0537
0538 list_move_tail(&io->log_sibling, &log->finished_ios);
0539 r5l_io_run_stripes(io);
0540 }
0541 }
0542
0543 static void r5l_move_to_end_ios(struct r5l_log *log)
0544 {
0545 struct r5l_io_unit *io, *next;
0546
0547 lockdep_assert_held(&log->io_list_lock);
0548
0549 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
0550
0551 if (io->state < IO_UNIT_IO_END)
0552 break;
0553 list_move_tail(&io->log_sibling, &log->io_end_ios);
0554 }
0555 }
0556
0557 static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
0558 static void r5l_log_endio(struct bio *bio)
0559 {
0560 struct r5l_io_unit *io = bio->bi_private;
0561 struct r5l_io_unit *io_deferred;
0562 struct r5l_log *log = io->log;
0563 unsigned long flags;
0564 bool has_null_flush;
0565 bool has_flush_payload;
0566
0567 if (bio->bi_status)
0568 md_error(log->rdev->mddev, log->rdev);
0569
0570 bio_put(bio);
0571 mempool_free(io->meta_page, &log->meta_pool);
0572
0573 spin_lock_irqsave(&log->io_list_lock, flags);
0574 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
0575
0576
0577
0578
0579
0580
0581
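/*
 * If the io_unit has neither a null_flush nor a flush payload it may be
 * freed once io_list_lock is dropped, so snapshot the two flags while the
 * lock is still held.
 */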
0582 has_null_flush = io->has_null_flush;
0583 has_flush_payload = io->has_flush_payload;
0584
0585 if (log->need_cache_flush && !list_empty(&io->stripe_list))
0586 r5l_move_to_end_ios(log);
0587 else
0588 r5l_log_run_stripes(log);
0589 if (!list_empty(&log->running_ios)) {
0590
0591
0592
0593
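/*
 * A FLUSH/FUA io_unit was deferred for ordering; now that this io_unit
 * has finished, the deferred one can be dispatched.
 */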
0594 io_deferred = list_first_entry(&log->running_ios,
0595 struct r5l_io_unit, log_sibling);
0596 if (io_deferred->io_deferred)
0597 schedule_work(&log->deferred_io_work);
0598 }
0599
0600 spin_unlock_irqrestore(&log->io_list_lock, flags);
0601
0602 if (log->need_cache_flush)
0603 md_wakeup_thread(log->rdev->mddev->thread);
0604
0605
0606 if (has_null_flush) {
0607 struct bio *bi;
0608
0609 WARN_ON(bio_list_empty(&io->flush_barriers));
0610 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
0611 bio_endio(bi);
0612 if (atomic_dec_and_test(&io->pending_stripe)) {
0613 __r5l_stripe_write_finished(io);
0614 return;
0615 }
0616 }
0617 }
0618
0619 if (has_flush_payload)
0620 if (atomic_dec_and_test(&io->pending_stripe))
0621 __r5l_stripe_write_finished(io);
0622 }
0623
0624 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
0625 {
0626 unsigned long flags;
0627
0628 spin_lock_irqsave(&log->io_list_lock, flags);
0629 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
0630 spin_unlock_irqrestore(&log->io_list_lock, flags);
0631
0632
0633
0634
0635
0636
0637
0638
0639
0640
0641
0642
0643 if (io->split_bio) {
0644 if (io->has_flush)
0645 io->split_bio->bi_opf |= REQ_PREFLUSH;
0646 if (io->has_fua)
0647 io->split_bio->bi_opf |= REQ_FUA;
0648 submit_bio(io->split_bio);
0649 }
0650
0651 if (io->has_flush)
0652 io->current_bio->bi_opf |= REQ_PREFLUSH;
0653 if (io->has_fua)
0654 io->current_bio->bi_opf |= REQ_FUA;
0655 submit_bio(io->current_bio);
0656 }
0657
0658
0659 static void r5l_submit_io_async(struct work_struct *work)
0660 {
0661 struct r5l_log *log = container_of(work, struct r5l_log,
0662 deferred_io_work);
0663 struct r5l_io_unit *io = NULL;
0664 unsigned long flags;
0665
0666 spin_lock_irqsave(&log->io_list_lock, flags);
0667 if (!list_empty(&log->running_ios)) {
0668 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
0669 log_sibling);
0670 if (!io->io_deferred)
0671 io = NULL;
0672 else
0673 io->io_deferred = 0;
0674 }
0675 spin_unlock_irqrestore(&log->io_list_lock, flags);
0676 if (io)
0677 r5l_do_submit_io(log, io);
0678 }
0679
0680 static void r5c_disable_writeback_async(struct work_struct *work)
0681 {
0682 struct r5l_log *log = container_of(work, struct r5l_log,
0683 disable_writeback_work);
0684 struct mddev *mddev = log->rdev->mddev;
0685 struct r5conf *conf = mddev->private;
0686 int locked = 0;
0687
0688 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
0689 return;
0690 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
0691 mdname(mddev));
0692
0693
0694 wait_event(mddev->sb_wait,
0695 conf->log == NULL ||
0696 (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
0697 (locked = mddev_trylock(mddev))));
0698 if (locked) {
0699 mddev_suspend(mddev);
0700 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
0701 mddev_resume(mddev);
0702 mddev_unlock(mddev);
0703 }
0704 }
0705
0706 static void r5l_submit_current_io(struct r5l_log *log)
0707 {
0708 struct r5l_io_unit *io = log->current_io;
0709 struct r5l_meta_block *block;
0710 unsigned long flags;
0711 u32 crc;
0712 bool do_submit = true;
0713
0714 if (!io)
0715 return;
0716
0717 block = page_address(io->meta_page);
0718 block->meta_size = cpu_to_le32(io->meta_offset);
0719 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
0720 block->checksum = cpu_to_le32(crc);
0721
0722 log->current_io = NULL;
0723 spin_lock_irqsave(&log->io_list_lock, flags);
0724 if (io->has_flush || io->has_fua) {
0725 if (io != list_first_entry(&log->running_ios,
0726 struct r5l_io_unit, log_sibling)) {
0727 io->io_deferred = 1;
0728 do_submit = false;
0729 }
0730 }
0731 spin_unlock_irqrestore(&log->io_list_lock, flags);
0732 if (do_submit)
0733 r5l_do_submit_io(log, io);
0734 }
0735
0736 static struct bio *r5l_bio_alloc(struct r5l_log *log)
0737 {
0738 struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS,
0739 REQ_OP_WRITE, GFP_NOIO, &log->bs);
0740
0741 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
0742
0743 return bio;
0744 }
0745
0746 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
0747 {
0748 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
0749
0750 r5c_update_log_state(log);
0751
0752
0753
0754
0755
0756
0757
0758 if (log->log_start == 0)
0759 io->need_split_bio = true;
0760
0761 io->log_end = log->log_start;
0762 }
0763
0764 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
0765 {
0766 struct r5l_io_unit *io;
0767 struct r5l_meta_block *block;
0768
0769 io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
0770 if (!io)
0771 return NULL;
0772 memset(io, 0, sizeof(*io));
0773
0774 io->log = log;
0775 INIT_LIST_HEAD(&io->log_sibling);
0776 INIT_LIST_HEAD(&io->stripe_list);
0777 bio_list_init(&io->flush_barriers);
0778 io->state = IO_UNIT_RUNNING;
0779
0780 io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
0781 block = page_address(io->meta_page);
0782 clear_page(block);
0783 block->magic = cpu_to_le32(R5LOG_MAGIC);
0784 block->version = R5LOG_VERSION;
0785 block->seq = cpu_to_le64(log->seq);
0786 block->position = cpu_to_le64(log->log_start);
0787
0788 io->log_start = log->log_start;
0789 io->meta_offset = sizeof(struct r5l_meta_block);
0790 io->seq = log->seq++;
0791
0792 io->current_bio = r5l_bio_alloc(log);
0793 io->current_bio->bi_end_io = r5l_log_endio;
0794 io->current_bio->bi_private = io;
0795 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
0796
0797 r5_reserve_log_entry(log, io);
0798
0799 spin_lock_irq(&log->io_list_lock);
0800 list_add_tail(&io->log_sibling, &log->running_ios);
0801 spin_unlock_irq(&log->io_list_lock);
0802
0803 return io;
0804 }
0805
0806 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
0807 {
0808 if (log->current_io &&
0809 log->current_io->meta_offset + payload_size > PAGE_SIZE)
0810 r5l_submit_current_io(log);
0811
0812 if (!log->current_io) {
0813 log->current_io = r5l_new_meta(log);
0814 if (!log->current_io)
0815 return -ENOMEM;
0816 }
0817
0818 return 0;
0819 }
0820
0821 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
0822 sector_t location,
0823 u32 checksum1, u32 checksum2,
0824 bool checksum2_valid)
0825 {
0826 struct r5l_io_unit *io = log->current_io;
0827 struct r5l_payload_data_parity *payload;
0828
0829 payload = page_address(io->meta_page) + io->meta_offset;
0830 payload->header.type = cpu_to_le16(type);
0831 payload->header.flags = cpu_to_le16(0);
0832 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
0833 (PAGE_SHIFT - 9));
0834 payload->location = cpu_to_le64(location);
0835 payload->checksum[0] = cpu_to_le32(checksum1);
0836 if (checksum2_valid)
0837 payload->checksum[1] = cpu_to_le32(checksum2);
0838
0839 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
0840 sizeof(__le32) * (1 + !!checksum2_valid);
0841 }
0842
0843 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
0844 {
0845 struct r5l_io_unit *io = log->current_io;
0846
0847 if (io->need_split_bio) {
0848 BUG_ON(io->split_bio);
0849 io->split_bio = io->current_bio;
0850 io->current_bio = r5l_bio_alloc(log);
0851 bio_chain(io->current_bio, io->split_bio);
0852 io->need_split_bio = false;
0853 }
0854
0855 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
0856 BUG();
0857
0858 r5_reserve_log_entry(log, io);
0859 }
0860
0861 static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
0862 {
0863 struct mddev *mddev = log->rdev->mddev;
0864 struct r5conf *conf = mddev->private;
0865 struct r5l_io_unit *io;
0866 struct r5l_payload_flush *payload;
0867 int meta_size;
0868
0869
0870
0871
0872
0873
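/*
 * payload_flush requires extra writes to the journal.  To avoid handling
 * the extra I/O during quiesce, simply skip the flush payload.
 */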
0874 if (conf->quiesce)
0875 return;
0876
0877 mutex_lock(&log->io_mutex);
0878 meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
0879
0880 if (r5l_get_meta(log, meta_size)) {
0881 mutex_unlock(&log->io_mutex);
0882 return;
0883 }
0884
0885
0886 io = log->current_io;
0887 payload = page_address(io->meta_page) + io->meta_offset;
0888 payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
0889 payload->header.flags = cpu_to_le16(0);
0890 payload->size = cpu_to_le32(sizeof(__le64));
0891 payload->flush_stripes[0] = cpu_to_le64(sect);
0892 io->meta_offset += meta_size;
0893
0894 if (!io->has_flush_payload) {
0895 io->has_flush_payload = 1;
0896 atomic_inc(&io->pending_stripe);
0897 }
0898 mutex_unlock(&log->io_mutex);
0899 }
0900
0901 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
0902 int data_pages, int parity_pages)
0903 {
0904 int i;
0905 int meta_size;
0906 int ret;
0907 struct r5l_io_unit *io;
0908
0909 meta_size =
0910 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
0911 * data_pages) +
0912 sizeof(struct r5l_payload_data_parity) +
0913 sizeof(__le32) * parity_pages;
0914
0915 ret = r5l_get_meta(log, meta_size);
0916 if (ret)
0917 return ret;
0918
0919 io = log->current_io;
0920
0921 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
0922 io->has_flush = 1;
0923
0924 for (i = 0; i < sh->disks; i++) {
0925 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
0926 test_bit(R5_InJournal, &sh->dev[i].flags))
0927 continue;
0928 if (i == sh->pd_idx || i == sh->qd_idx)
0929 continue;
0930 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
0931 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
0932 io->has_fua = 1;
0933
0934
0935
0936
0937 io->has_flush = 1;
0938 }
0939 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
0940 raid5_compute_blocknr(sh, i, 0),
0941 sh->dev[i].log_checksum, 0, false);
0942 r5l_append_payload_page(log, sh->dev[i].page);
0943 }
0944
0945 if (parity_pages == 2) {
0946 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
0947 sh->sector, sh->dev[sh->pd_idx].log_checksum,
0948 sh->dev[sh->qd_idx].log_checksum, true);
0949 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
0950 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
0951 } else if (parity_pages == 1) {
0952 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
0953 sh->sector, sh->dev[sh->pd_idx].log_checksum,
0954 0, false);
0955 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
0956 } else
0957 BUG_ON(parity_pages != 0);
0958
0959 list_add_tail(&sh->log_list, &io->stripe_list);
0960 atomic_inc(&io->pending_stripe);
0961 sh->log_io = io;
0962
0963 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
0964 return 0;
0965
0966 if (sh->log_start == MaxSector) {
0967 BUG_ON(!list_empty(&sh->r5c));
0968 sh->log_start = io->log_start;
0969 spin_lock_irq(&log->stripe_in_journal_lock);
0970 list_add_tail(&sh->r5c,
0971 &log->stripe_in_journal_list);
0972 spin_unlock_irq(&log->stripe_in_journal_lock);
0973 atomic_inc(&log->stripe_in_journal_count);
0974 }
0975 return 0;
0976 }
0977
0978
0979 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
0980 struct stripe_head *sh)
0981 {
0982 spin_lock(&log->no_space_stripes_lock);
0983 list_add_tail(&sh->log_list, &log->no_space_stripes);
0984 spin_unlock(&log->no_space_stripes_lock);
0985 }
0986
0987
0988
0989
0990
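/*
 * running in raid5d, where reclaim could wait for raid5d too (when it
 * flushes data from log to raid disks), so we shouldn't wait for reclaim
 * here
 */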
0991 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
0992 {
0993 struct r5conf *conf = sh->raid_conf;
0994 int write_disks = 0;
0995 int data_pages, parity_pages;
0996 int reserve;
0997 int i;
0998 int ret = 0;
0999 bool wake_reclaim = false;
1000
1001 if (!log)
1002 return -EAGAIN;
1003
1004 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
1005 test_bit(STRIPE_SYNCING, &sh->state)) {
1006
1007 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
1008 return -EAGAIN;
1009 }
1010
1011 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1012
1013 for (i = 0; i < sh->disks; i++) {
1014 void *addr;
1015
1016 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
1017 test_bit(R5_InJournal, &sh->dev[i].flags))
1018 continue;
1019
1020 write_disks++;
1021
1022 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
1023 continue;
1024 addr = kmap_atomic(sh->dev[i].page);
1025 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1026 addr, PAGE_SIZE);
1027 kunmap_atomic(addr);
1028 }
1029 parity_pages = 1 + !!(sh->qd_idx >= 0);
1030 data_pages = write_disks - parity_pages;
1031
1032 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1033
1034
1035
1036
1037 clear_bit(STRIPE_DELAYED, &sh->state);
1038 atomic_inc(&sh->count);
1039
1040 mutex_lock(&log->io_mutex);
1041
1042 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
1043
1044 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1045 if (!r5l_has_free_space(log, reserve)) {
1046 r5l_add_no_space_stripe(log, sh);
1047 wake_reclaim = true;
1048 } else {
1049 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1050 if (ret) {
1051 spin_lock_irq(&log->io_list_lock);
1052 list_add_tail(&sh->log_list,
1053 &log->no_mem_stripes);
1054 spin_unlock_irq(&log->io_list_lock);
1055 }
1056 }
1057 } else {
1058
1059
1060
1061
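/*
 * log space is critical: do not take new stripes into the cache; only
 * stripes that already have journal space (sh->log_start != MaxSector)
 * are processed
 */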
1062 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1063 sh->log_start == MaxSector) {
1064 r5l_add_no_space_stripe(log, sh);
1065 wake_reclaim = true;
1066 reserve = 0;
1067 } else if (!r5l_has_free_space(log, reserve)) {
1068 if (sh->log_start == log->last_checkpoint)
1069 BUG();
1070 else
1071 r5l_add_no_space_stripe(log, sh);
1072 } else {
1073 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1074 if (ret) {
1075 spin_lock_irq(&log->io_list_lock);
1076 list_add_tail(&sh->log_list,
1077 &log->no_mem_stripes);
1078 spin_unlock_irq(&log->io_list_lock);
1079 }
1080 }
1081 }
1082
1083 mutex_unlock(&log->io_mutex);
1084 if (wake_reclaim)
1085 r5l_wake_reclaim(log, reserve);
1086 return 0;
1087 }
1088
1089 void r5l_write_stripe_run(struct r5l_log *log)
1090 {
1091 if (!log)
1092 return;
1093 mutex_lock(&log->io_mutex);
1094 r5l_submit_current_io(log);
1095 mutex_unlock(&log->io_mutex);
1096 }
1097
1098 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
1099 {
1100 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1101
1102
1103
1104
1105
1106
1107
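/*
 * In write-through (journal only) mode the log disk cache is flushed
 * before stripe data is written to the raid disks, so by the time a bio
 * completes its data is already stable in the log and recovery can replay
 * it.  A zero-length flush can therefore complete immediately, and the
 * PREFLUSH flag can be dropped from data bios.
 */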
1108 if (bio->bi_iter.bi_size == 0) {
1109 bio_endio(bio);
1110 return 0;
1111 }
1112 bio->bi_opf &= ~REQ_PREFLUSH;
1113 } else {
1114
1115 if (bio->bi_iter.bi_size == 0) {
1116 mutex_lock(&log->io_mutex);
1117 r5l_get_meta(log, 0);
1118 bio_list_add(&log->current_io->flush_barriers, bio);
1119 log->current_io->has_flush = 1;
1120 log->current_io->has_null_flush = 1;
1121 atomic_inc(&log->current_io->pending_stripe);
1122 r5l_submit_current_io(log);
1123 mutex_unlock(&log->io_mutex);
1124 return 0;
1125 }
1126 }
1127 return -EAGAIN;
1128 }
1129
1130
1131 static void r5l_run_no_space_stripes(struct r5l_log *log)
1132 {
1133 struct stripe_head *sh;
1134
1135 spin_lock(&log->no_space_stripes_lock);
1136 while (!list_empty(&log->no_space_stripes)) {
1137 sh = list_first_entry(&log->no_space_stripes,
1138 struct stripe_head, log_list);
1139 list_del_init(&sh->log_list);
1140 set_bit(STRIPE_HANDLE, &sh->state);
1141 raid5_release_stripe(sh);
1142 }
1143 spin_unlock(&log->no_space_stripes_lock);
1144 }
1145
1146
1147
1148
1149
1150
1151 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1152 {
1153 struct stripe_head *sh;
1154 struct r5l_log *log = conf->log;
1155 sector_t new_cp;
1156 unsigned long flags;
1157
1158 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1159 return log->next_checkpoint;
1160
1161 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1162 if (list_empty(&conf->log->stripe_in_journal_list)) {
1163
1164 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1165 return log->next_checkpoint;
1166 }
1167 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1168 struct stripe_head, r5c);
1169 new_cp = sh->log_start;
1170 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1171 return new_cp;
1172 }
1173
1174 static sector_t r5l_reclaimable_space(struct r5l_log *log)
1175 {
1176 struct r5conf *conf = log->rdev->mddev->private;
1177
1178 return r5l_ring_distance(log, log->last_checkpoint,
1179 r5c_calculate_new_cp(conf));
1180 }
1181
1182 static void r5l_run_no_mem_stripe(struct r5l_log *log)
1183 {
1184 struct stripe_head *sh;
1185
1186 lockdep_assert_held(&log->io_list_lock);
1187
1188 if (!list_empty(&log->no_mem_stripes)) {
1189 sh = list_first_entry(&log->no_mem_stripes,
1190 struct stripe_head, log_list);
1191 list_del_init(&sh->log_list);
1192 set_bit(STRIPE_HANDLE, &sh->state);
1193 raid5_release_stripe(sh);
1194 }
1195 }
1196
1197 static bool r5l_complete_finished_ios(struct r5l_log *log)
1198 {
1199 struct r5l_io_unit *io, *next;
1200 bool found = false;
1201
1202 lockdep_assert_held(&log->io_list_lock);
1203
1204 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1205
1206 if (io->state < IO_UNIT_STRIPE_END)
1207 break;
1208
1209 log->next_checkpoint = io->log_start;
1210
1211 list_del(&io->log_sibling);
1212 mempool_free(io, &log->io_pool);
1213 r5l_run_no_mem_stripe(log);
1214
1215 found = true;
1216 }
1217
1218 return found;
1219 }
1220
1221 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1222 {
1223 struct r5l_log *log = io->log;
1224 struct r5conf *conf = log->rdev->mddev->private;
1225 unsigned long flags;
1226
1227 spin_lock_irqsave(&log->io_list_lock, flags);
1228 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1229
1230 if (!r5l_complete_finished_ios(log)) {
1231 spin_unlock_irqrestore(&log->io_list_lock, flags);
1232 return;
1233 }
1234
1235 if (r5l_reclaimable_space(log) > log->max_free_space ||
1236 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1237 r5l_wake_reclaim(log, 0);
1238
1239 spin_unlock_irqrestore(&log->io_list_lock, flags);
1240 wake_up(&log->iounit_wait);
1241 }
1242
1243 void r5l_stripe_write_finished(struct stripe_head *sh)
1244 {
1245 struct r5l_io_unit *io;
1246
1247 io = sh->log_io;
1248 sh->log_io = NULL;
1249
1250 if (io && atomic_dec_and_test(&io->pending_stripe))
1251 __r5l_stripe_write_finished(io);
1252 }
1253
1254 static void r5l_log_flush_endio(struct bio *bio)
1255 {
1256 struct r5l_log *log = container_of(bio, struct r5l_log,
1257 flush_bio);
1258 unsigned long flags;
1259 struct r5l_io_unit *io;
1260
1261 if (bio->bi_status)
1262 md_error(log->rdev->mddev, log->rdev);
1263
1264 spin_lock_irqsave(&log->io_list_lock, flags);
1265 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1266 r5l_io_run_stripes(io);
1267 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1268 spin_unlock_irqrestore(&log->io_list_lock, flags);
1269
1270 bio_uninit(bio);
1271 }
1272
1273
1274
1286
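/*
 * Start dispatching I/O to the raid disks.
 *
 * The io_unit list order is strictly maintained: a stripe's data/parity
 * must be persistent in the log before that stripe is written to the raid
 * disks, otherwise a broken meta block in the middle of the log would hide
 * later valid meta blocks from recovery.  Therefore only stripes of
 * io_units whose log writes have completed are flushed to the raid disks,
 * and a cache flush is issued to the log device first.
 */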
1287 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1288 {
1289 bool do_flush;
1290
1291 if (!log || !log->need_cache_flush)
1292 return;
1293
1294 spin_lock_irq(&log->io_list_lock);
1295
1296 if (!list_empty(&log->flushing_ios)) {
1297 spin_unlock_irq(&log->io_list_lock);
1298 return;
1299 }
1300 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1301 do_flush = !list_empty(&log->flushing_ios);
1302 spin_unlock_irq(&log->io_list_lock);
1303
1304 if (!do_flush)
1305 return;
1306 bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0,
1307 REQ_OP_WRITE | REQ_PREFLUSH);
1308 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1309 submit_bio(&log->flush_bio);
1310 }
1311
1312 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1313 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1314 sector_t end)
1315 {
1316 struct block_device *bdev = log->rdev->bdev;
1317 struct mddev *mddev;
1318
1319 r5l_write_super(log, end);
1320
1321 if (!bdev_max_discard_sectors(bdev))
1322 return;
1323
1324 mddev = log->rdev->mddev;
1325
1326
1335
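/*
 * Discard could zero data, so before discarding we must make sure the
 * superblock is updated to the new log tail.  Updating the superblock
 * requires the reconfig mutex, but raid5_quiesce() holds that mutex while
 * waiting for the reclaim thread, which may be sitting in this function;
 * taking the mutex unconditionally would deadlock.  Work around it with a
 * trylock; a missed discard is harmless.
 */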
1336 set_mask_bits(&mddev->sb_flags, 0,
1337 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1338 if (!mddev_trylock(mddev))
1339 return;
1340 md_update_sb(mddev, 1);
1341 mddev_unlock(mddev);
1342
1343
1344 if (log->last_checkpoint < end) {
1345 blkdev_issue_discard(bdev,
1346 log->last_checkpoint + log->rdev->data_offset,
1347 end - log->last_checkpoint, GFP_NOIO);
1348 } else {
1349 blkdev_issue_discard(bdev,
1350 log->last_checkpoint + log->rdev->data_offset,
1351 log->device_size - log->last_checkpoint,
1352 GFP_NOIO);
1353 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1354 GFP_NOIO);
1355 }
1356 }
1357
1358
1359
1360
1361
1362
1363
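/*
 * r5c_flush_stripe moves the stripe from the cached list to handle_list.
 * When called, the stripe must be on r5c_cached_full_stripes or
 * r5c_cached_partial_stripes, and conf->device_lock must be held.
 */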
1364 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1365 {
1366 BUG_ON(list_empty(&sh->lru));
1367 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1368 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1369
1370
1371
1372
1373
1374 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1375 lockdep_assert_held(&conf->device_lock);
1376
1377 list_del_init(&sh->lru);
1378 atomic_inc(&sh->count);
1379
1380 set_bit(STRIPE_HANDLE, &sh->state);
1381 atomic_inc(&conf->active_stripes);
1382 r5c_make_stripe_write_out(sh);
1383
1384 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
1385 atomic_inc(&conf->r5c_flushing_partial_stripes);
1386 else
1387 atomic_inc(&conf->r5c_flushing_full_stripes);
1388 raid5_release_stripe(sh);
1389 }
1390
1391
1392
1393
1394
1395
1396
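/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes.  If fewer than num full stripes are
 *             flushed, flush some partial stripes until a total of num
 *             stripes are flushed or there are no more cached stripes.
 */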
1397 void r5c_flush_cache(struct r5conf *conf, int num)
1398 {
1399 int count;
1400 struct stripe_head *sh, *next;
1401
1402 lockdep_assert_held(&conf->device_lock);
1403 if (!conf->log)
1404 return;
1405
1406 count = 0;
1407 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1408 r5c_flush_stripe(conf, sh);
1409 count++;
1410 }
1411
1412 if (count >= num)
1413 return;
1414 list_for_each_entry_safe(sh, next,
1415 &conf->r5c_partial_stripe_list, lru) {
1416 r5c_flush_stripe(conf, sh);
1417 if (++count >= num)
1418 break;
1419 }
1420 }
1421
1422 static void r5c_do_reclaim(struct r5conf *conf)
1423 {
1424 struct r5l_log *log = conf->log;
1425 struct stripe_head *sh;
1426 int count = 0;
1427 unsigned long flags;
1428 int total_cached;
1429 int stripes_to_flush;
1430 int flushing_partial, flushing_full;
1431
1432 if (!r5c_is_writeback(log))
1433 return;
1434
1435 flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
1436 flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
1437 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1438 atomic_read(&conf->r5c_cached_full_stripes) -
1439 flushing_full - flushing_partial;
1440
1441 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1442 atomic_read(&conf->empty_inactive_list_nr) > 0)
1443
1444
1445
1446
1447 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1448 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1449 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
1450 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
1451
1452
1453
1454
1455 stripes_to_flush = 0;
1456 else
1457
1458 stripes_to_flush = -1;
1459
1460 if (stripes_to_flush >= 0) {
1461 spin_lock_irqsave(&conf->device_lock, flags);
1462 r5c_flush_cache(conf, stripes_to_flush);
1463 spin_unlock_irqrestore(&conf->device_lock, flags);
1464 }
1465
1466
1467 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1468 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1469 spin_lock(&conf->device_lock);
1470 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1471
1472
1473
1474
1475
1476
1477
1478
1479 if (!list_empty(&sh->lru) &&
1480 !test_bit(STRIPE_HANDLE, &sh->state) &&
1481 atomic_read(&sh->count) == 0) {
1482 r5c_flush_stripe(conf, sh);
1483 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1484 break;
1485 }
1486 }
1487 spin_unlock(&conf->device_lock);
1488 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1489 }
1490
1491 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1492 r5l_run_no_space_stripes(log);
1493
1494 md_wakeup_thread(conf->mddev->thread);
1495 }
1496
1497 static void r5l_do_reclaim(struct r5l_log *log)
1498 {
1499 struct r5conf *conf = log->rdev->mddev->private;
1500 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1501 sector_t reclaimable;
1502 sector_t next_checkpoint;
1503 bool write_super;
1504
1505 spin_lock_irq(&log->io_list_lock);
1506 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1507 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1508
1509
1510
1511
1512
1513 while (1) {
1514 reclaimable = r5l_reclaimable_space(log);
1515 if (reclaimable >= reclaim_target ||
1516 (list_empty(&log->running_ios) &&
1517 list_empty(&log->io_end_ios) &&
1518 list_empty(&log->flushing_ios) &&
1519 list_empty(&log->finished_ios)))
1520 break;
1521
1522 md_wakeup_thread(log->rdev->mddev->thread);
1523 wait_event_lock_irq(log->iounit_wait,
1524 r5l_reclaimable_space(log) > reclaimable,
1525 log->io_list_lock);
1526 }
1527
1528 next_checkpoint = r5c_calculate_new_cp(conf);
1529 spin_unlock_irq(&log->io_list_lock);
1530
1531 if (reclaimable == 0 || !write_super)
1532 return;
1533
1534
1535
1536
1537
1538
1539 r5l_write_super_and_discard_space(log, next_checkpoint);
1540
1541 mutex_lock(&log->io_mutex);
1542 log->last_checkpoint = next_checkpoint;
1543 r5c_update_log_state(log);
1544 mutex_unlock(&log->io_mutex);
1545
1546 r5l_run_no_space_stripes(log);
1547 }
1548
1549 static void r5l_reclaim_thread(struct md_thread *thread)
1550 {
1551 struct mddev *mddev = thread->mddev;
1552 struct r5conf *conf = mddev->private;
1553 struct r5l_log *log = conf->log;
1554
1555 if (!log)
1556 return;
1557 r5c_do_reclaim(conf);
1558 r5l_do_reclaim(log);
1559 }
1560
1561 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1562 {
1563 unsigned long target;
1564 unsigned long new = (unsigned long)space;
1565
1566 if (!log)
1567 return;
1568 do {
1569 target = log->reclaim_target;
1570 if (new < target)
1571 return;
1572 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1573 md_wakeup_thread(log->reclaim_thread);
1574 }
1575
1576 void r5l_quiesce(struct r5l_log *log, int quiesce)
1577 {
1578 struct mddev *mddev;
1579
1580 if (quiesce) {
1581
1582 mddev = log->rdev->mddev;
1583 wake_up(&mddev->sb_wait);
1584 kthread_park(log->reclaim_thread->tsk);
1585 r5l_wake_reclaim(log, MaxSector);
1586 r5l_do_reclaim(log);
1587 } else
1588 kthread_unpark(log->reclaim_thread->tsk);
1589 }
1590
1591 bool r5l_log_disk_error(struct r5conf *conf)
1592 {
1593 struct r5l_log *log = conf->log;
1594
1595
1596 if (!log)
1597 return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1598 else
1599 return test_bit(Faulty, &log->rdev->flags);
1600 }
1601
1602 #define R5L_RECOVERY_PAGE_POOL_SIZE 256
1603
1604 struct r5l_recovery_ctx {
1605 struct page *meta_page;
1606 sector_t meta_total_blocks;
1607 sector_t pos;
1608 u64 seq;
1609 int data_parity_stripes;
1610 int data_only_stripes;
1611 struct list_head cached_list;
1612
1613
1614
1615
1616
1617
1618
1619
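/*
 * read-ahead page pool (ra_pool)
 *
 * During recovery the log is read sequentially, and reading every page
 * with sync_page_io() is inefficient.  The ra_pool lets the log device be
 * read in chunks of up to R5L_RECOVERY_PAGE_POOL_SIZE pages.
 */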
1620 struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
1621 struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE];
1622 sector_t pool_offset;
1623 int total_pages;
1624 int valid_pages;
1625 };
1626
1627 static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
1628 struct r5l_recovery_ctx *ctx)
1629 {
1630 struct page *page;
1631
1632 ctx->valid_pages = 0;
1633 ctx->total_pages = 0;
1634 while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
1635 page = alloc_page(GFP_KERNEL);
1636
1637 if (!page)
1638 break;
1639 ctx->ra_pool[ctx->total_pages] = page;
1640 ctx->total_pages += 1;
1641 }
1642
1643 if (ctx->total_pages == 0)
1644 return -ENOMEM;
1645
1646 ctx->pool_offset = 0;
1647 return 0;
1648 }
1649
1650 static void r5l_recovery_free_ra_pool(struct r5l_log *log,
1651 struct r5l_recovery_ctx *ctx)
1652 {
1653 int i;
1654
1655 for (i = 0; i < ctx->total_pages; ++i)
1656 put_page(ctx->ra_pool[i]);
1657 }
1658
1659
1660
1661
1662
1663
1664
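/*
 * fetch ctx->valid_pages pages starting at offset.  Normally
 * ctx->valid_pages == ctx->total_pages after the call, but near the end of
 * the log device ctx->valid_pages can be smaller than ctx->total_pages.
 */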
1665 static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
1666 struct r5l_recovery_ctx *ctx,
1667 sector_t offset)
1668 {
1669 struct bio bio;
1670 int ret;
1671
1672 bio_init(&bio, log->rdev->bdev, ctx->ra_bvec,
1673 R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ);
1674 bio.bi_iter.bi_sector = log->rdev->data_offset + offset;
1675
1676 ctx->valid_pages = 0;
1677 ctx->pool_offset = offset;
1678
1679 while (ctx->valid_pages < ctx->total_pages) {
1680 __bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE,
1681 0);
1682 ctx->valid_pages += 1;
1683
1684 offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
1685
1686 if (offset == 0)
1687 break;
1688 }
1689
1690 ret = submit_bio_wait(&bio);
1691 bio_uninit(&bio);
1692 return ret;
1693 }
1694
1695
1696
1697
1698
1699 static int r5l_recovery_read_page(struct r5l_log *log,
1700 struct r5l_recovery_ctx *ctx,
1701 struct page *page,
1702 sector_t offset)
1703 {
1704 int ret;
1705
1706 if (offset < ctx->pool_offset ||
1707 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
1708 ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
1709 if (ret)
1710 return ret;
1711 }
1712
1713 BUG_ON(offset < ctx->pool_offset ||
1714 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
1715
1716 memcpy(page_address(page),
1717 page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
1718 BLOCK_SECTOR_SHIFT]),
1719 PAGE_SIZE);
1720 return 0;
1721 }
1722
1723 static int r5l_recovery_read_meta_block(struct r5l_log *log,
1724 struct r5l_recovery_ctx *ctx)
1725 {
1726 struct page *page = ctx->meta_page;
1727 struct r5l_meta_block *mb;
1728 u32 crc, stored_crc;
1729 int ret;
1730
1731 ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
1732 if (ret != 0)
1733 return ret;
1734
1735 mb = page_address(page);
1736 stored_crc = le32_to_cpu(mb->checksum);
1737 mb->checksum = 0;
1738
1739 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1740 le64_to_cpu(mb->seq) != ctx->seq ||
1741 mb->version != R5LOG_VERSION ||
1742 le64_to_cpu(mb->position) != ctx->pos)
1743 return -EINVAL;
1744
1745 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1746 if (stored_crc != crc)
1747 return -EINVAL;
1748
1749 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1750 return -EINVAL;
1751
1752 ctx->meta_total_blocks = BLOCK_SECTORS;
1753
1754 return 0;
1755 }
1756
1757 static void
1758 r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1759 struct page *page,
1760 sector_t pos, u64 seq)
1761 {
1762 struct r5l_meta_block *mb;
1763
1764 mb = page_address(page);
1765 clear_page(mb);
1766 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1767 mb->version = R5LOG_VERSION;
1768 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1769 mb->seq = cpu_to_le64(seq);
1770 mb->position = cpu_to_le64(pos);
1771 }
1772
1773 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1774 u64 seq)
1775 {
1776 struct page *page;
1777 struct r5l_meta_block *mb;
1778
1779 page = alloc_page(GFP_KERNEL);
1780 if (!page)
1781 return -ENOMEM;
1782 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1783 mb = page_address(page);
1784 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1785 mb, PAGE_SIZE));
1786 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE |
1787 REQ_SYNC | REQ_FUA, false)) {
1788 __free_page(page);
1789 return -EIO;
1790 }
1791 __free_page(page);
1792 return 0;
1793 }
1794
1795
1796
1797
1798
1799
1800
1801
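/*
 * r5l_recovery_load_data() and r5l_recovery_load_parity() use the
 * R5_Wantwrite flag to mark valid (potentially not yet flushed) data in
 * the journal.  Checksums were already verified by
 * r5l_recovery_verify_data_checksum_for_mb(), so no mismatch is expected
 * here.
 */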
1802 static void r5l_recovery_load_data(struct r5l_log *log,
1803 struct stripe_head *sh,
1804 struct r5l_recovery_ctx *ctx,
1805 struct r5l_payload_data_parity *payload,
1806 sector_t log_offset)
1807 {
1808 struct mddev *mddev = log->rdev->mddev;
1809 struct r5conf *conf = mddev->private;
1810 int dd_idx;
1811
1812 raid5_compute_sector(conf,
1813 le64_to_cpu(payload->location), 0,
1814 &dd_idx, sh);
1815 r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
1816 sh->dev[dd_idx].log_checksum =
1817 le32_to_cpu(payload->checksum[0]);
1818 ctx->meta_total_blocks += BLOCK_SECTORS;
1819
1820 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1821 set_bit(STRIPE_R5C_CACHING, &sh->state);
1822 }
1823
1824 static void r5l_recovery_load_parity(struct r5l_log *log,
1825 struct stripe_head *sh,
1826 struct r5l_recovery_ctx *ctx,
1827 struct r5l_payload_data_parity *payload,
1828 sector_t log_offset)
1829 {
1830 struct mddev *mddev = log->rdev->mddev;
1831 struct r5conf *conf = mddev->private;
1832
1833 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1834 r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
1835 sh->dev[sh->pd_idx].log_checksum =
1836 le32_to_cpu(payload->checksum[0]);
1837 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1838
1839 if (sh->qd_idx >= 0) {
1840 r5l_recovery_read_page(
1841 log, ctx, sh->dev[sh->qd_idx].page,
1842 r5l_ring_add(log, log_offset, BLOCK_SECTORS));
1843 sh->dev[sh->qd_idx].log_checksum =
1844 le32_to_cpu(payload->checksum[1]);
1845 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1846 }
1847 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1848 }
1849
1850 static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1851 {
1852 int i;
1853
1854 sh->state = 0;
1855 sh->log_start = MaxSector;
1856 for (i = sh->disks; i--; )
1857 sh->dev[i].flags = 0;
1858 }
1859
1860 static void
1861 r5l_recovery_replay_one_stripe(struct r5conf *conf,
1862 struct stripe_head *sh,
1863 struct r5l_recovery_ctx *ctx)
1864 {
1865 struct md_rdev *rdev, *rrdev;
1866 int disk_index;
1867 int data_count = 0;
1868
1869 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1870 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1871 continue;
1872 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1873 continue;
1874 data_count++;
1875 }
1876
1877
1878
1879
1880
1881
1882 if (data_count == 0)
1883 goto out;
1884
1885 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1886 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1887 continue;
1888
1889
1890 rcu_read_lock();
1891 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1892 if (rdev) {
1893 atomic_inc(&rdev->nr_pending);
1894 rcu_read_unlock();
1895 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1896 sh->dev[disk_index].page, REQ_OP_WRITE,
1897 false);
1898 rdev_dec_pending(rdev, rdev->mddev);
1899 rcu_read_lock();
1900 }
1901 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1902 if (rrdev) {
1903 atomic_inc(&rrdev->nr_pending);
1904 rcu_read_unlock();
1905 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1906 sh->dev[disk_index].page, REQ_OP_WRITE,
1907 false);
1908 rdev_dec_pending(rrdev, rrdev->mddev);
1909 rcu_read_lock();
1910 }
1911 rcu_read_unlock();
1912 }
1913 ctx->data_parity_stripes++;
1914 out:
1915 r5l_recovery_reset_stripe(sh);
1916 }
1917
1918 static struct stripe_head *
1919 r5c_recovery_alloc_stripe(
1920 struct r5conf *conf,
1921 sector_t stripe_sect,
1922 int noblock)
1923 {
1924 struct stripe_head *sh;
1925
1926 sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
1927 if (!sh)
1928 return NULL;
1929
1930 r5l_recovery_reset_stripe(sh);
1931
1932 return sh;
1933 }
1934
1935 static struct stripe_head *
1936 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1937 {
1938 struct stripe_head *sh;
1939
1940 list_for_each_entry(sh, list, lru)
1941 if (sh->sector == sect)
1942 return sh;
1943 return NULL;
1944 }
1945
1946 static void
1947 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1948 struct r5l_recovery_ctx *ctx)
1949 {
1950 struct stripe_head *sh, *next;
1951
1952 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1953 r5l_recovery_reset_stripe(sh);
1954 list_del_init(&sh->lru);
1955 raid5_release_stripe(sh);
1956 }
1957 }
1958
1959 static void
1960 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1961 struct r5l_recovery_ctx *ctx)
1962 {
1963 struct stripe_head *sh, *next;
1964
1965 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1966 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1967 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1968 list_del_init(&sh->lru);
1969 raid5_release_stripe(sh);
1970 }
1971 }
1972
1973
1974 static int
1975 r5l_recovery_verify_data_checksum(struct r5l_log *log,
1976 struct r5l_recovery_ctx *ctx,
1977 struct page *page,
1978 sector_t log_offset, __le32 log_checksum)
1979 {
1980 void *addr;
1981 u32 checksum;
1982
1983 r5l_recovery_read_page(log, ctx, page, log_offset);
1984 addr = kmap_atomic(page);
1985 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1986 kunmap_atomic(addr);
1987 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1988 }
1989
1990
1991
1992
1993
1994 static int
1995 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1996 struct r5l_recovery_ctx *ctx)
1997 {
1998 struct mddev *mddev = log->rdev->mddev;
1999 struct r5conf *conf = mddev->private;
2000 struct r5l_meta_block *mb = page_address(ctx->meta_page);
2001 sector_t mb_offset = sizeof(struct r5l_meta_block);
2002 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2003 struct page *page;
2004 struct r5l_payload_data_parity *payload;
2005 struct r5l_payload_flush *payload_flush;
2006
2007 page = alloc_page(GFP_KERNEL);
2008 if (!page)
2009 return -ENOMEM;
2010
2011 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2012 payload = (void *)mb + mb_offset;
2013 payload_flush = (void *)mb + mb_offset;
2014
2015 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2016 if (r5l_recovery_verify_data_checksum(
2017 log, ctx, page, log_offset,
2018 payload->checksum[0]) < 0)
2019 goto mismatch;
2020 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
2021 if (r5l_recovery_verify_data_checksum(
2022 log, ctx, page, log_offset,
2023 payload->checksum[0]) < 0)
2024 goto mismatch;
2025 if (conf->max_degraded == 2 &&
2026 r5l_recovery_verify_data_checksum(
2027 log, ctx, page,
2028 r5l_ring_add(log, log_offset,
2029 BLOCK_SECTORS),
2030 payload->checksum[1]) < 0)
2031 goto mismatch;
2032 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2033
2034 } else
2035 goto mismatch;
2036
2037 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2038 mb_offset += sizeof(struct r5l_payload_flush) +
2039 le32_to_cpu(payload_flush->size);
2040 } else {
2041
2042 log_offset = r5l_ring_add(log, log_offset,
2043 le32_to_cpu(payload->size));
2044 mb_offset += sizeof(struct r5l_payload_data_parity) +
2045 sizeof(__le32) *
2046 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2047 }
2048
2049 }
2050
2051 put_page(page);
2052 return 0;
2053
2054 mismatch:
2055 put_page(page);
2056 return -EINVAL;
2057 }
2058
2059
2060
2061
2062
2063
2064
2065
2066
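/*
 * Analyze all data/parity payloads in one meta block.
 * Returns:
 * 0       for success
 * -EINVAL for an unknown payload type
 * -EAGAIN for a data page checksum mismatch
 * -ENOMEM when running out of memory (alloc_page failed or out of stripes)
 */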
2067 static int
2068 r5c_recovery_analyze_meta_block(struct r5l_log *log,
2069 struct r5l_recovery_ctx *ctx,
2070 struct list_head *cached_stripe_list)
2071 {
2072 struct mddev *mddev = log->rdev->mddev;
2073 struct r5conf *conf = mddev->private;
2074 struct r5l_meta_block *mb;
2075 struct r5l_payload_data_parity *payload;
2076 struct r5l_payload_flush *payload_flush;
2077 int mb_offset;
2078 sector_t log_offset;
2079 sector_t stripe_sect;
2080 struct stripe_head *sh;
2081 int ret;
2082
2083
2084
2085
2086
2087
2088 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
2089 if (ret == -EINVAL)
2090 return -EAGAIN;
2091 else if (ret)
2092 return ret;
2093
2094 mb = page_address(ctx->meta_page);
2095 mb_offset = sizeof(struct r5l_meta_block);
2096 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2097
2098 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2099 int dd;
2100
2101 payload = (void *)mb + mb_offset;
2102 payload_flush = (void *)mb + mb_offset;
2103
2104 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2105 int i, count;
2106
2107 count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
2108 for (i = 0; i < count; ++i) {
2109 stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
2110 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2111 stripe_sect);
2112 if (sh) {
2113 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2114 r5l_recovery_reset_stripe(sh);
2115 list_del_init(&sh->lru);
2116 raid5_release_stripe(sh);
2117 }
2118 }
2119
2120 mb_offset += sizeof(struct r5l_payload_flush) +
2121 le32_to_cpu(payload_flush->size);
2122 continue;
2123 }
2124
2125
2126 stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
2127 raid5_compute_sector(
2128 conf, le64_to_cpu(payload->location), 0, &dd,
2129 NULL)
2130 : le64_to_cpu(payload->location);
2131
2132 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2133 stripe_sect);
2134
2135 if (!sh) {
2136 sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
2137
2138
2139
2140
2141 if (!sh) {
2142 r5c_recovery_replay_stripes(
2143 cached_stripe_list, ctx);
2144 sh = r5c_recovery_alloc_stripe(
2145 conf, stripe_sect, 1);
2146 }
2147 if (!sh) {
2148 int new_size = conf->min_nr_stripes * 2;
2149 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data from the journal.\n",
2150 mdname(mddev),
2151 new_size);
2152 ret = raid5_set_cache_size(mddev, new_size);
2153 if (conf->min_nr_stripes <= new_size / 2) {
2154 pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
2155 mdname(mddev),
2156 ret,
2157 new_size,
2158 conf->min_nr_stripes,
2159 conf->max_nr_stripes);
2160 return -ENOMEM;
2161 }
2162 sh = r5c_recovery_alloc_stripe(
2163 conf, stripe_sect, 0);
2164 }
2165 if (!sh) {
2166 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
2167 mdname(mddev));
2168 return -ENOMEM;
2169 }
2170 list_add_tail(&sh->lru, cached_stripe_list);
2171 }
2172
2173 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2174 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
2175 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
2176 r5l_recovery_replay_one_stripe(conf, sh, ctx);
2177 list_move_tail(&sh->lru, cached_stripe_list);
2178 }
2179 r5l_recovery_load_data(log, sh, ctx, payload,
2180 log_offset);
2181 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
2182 r5l_recovery_load_parity(log, sh, ctx, payload,
2183 log_offset);
2184 else
2185 return -EINVAL;
2186
2187 log_offset = r5l_ring_add(log, log_offset,
2188 le32_to_cpu(payload->size));
2189
2190 mb_offset += sizeof(struct r5l_payload_data_parity) +
2191 sizeof(__le32) *
2192 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2193 }
2194
2195 return 0;
2196 }
2197
2198
2199
2200
2201
2202 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
2203 struct stripe_head *sh)
2204 {
2205 struct r5dev *dev;
2206 int i;
2207
2208 for (i = sh->disks; i--; ) {
2209 dev = sh->dev + i;
2210 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
2211 set_bit(R5_InJournal, &dev->flags);
2212 set_bit(R5_UPTODATE, &dev->flags);
2213 }
2214 }
2215 }
2216
2217
2232
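/*
 * Scan through the log for all to-be-flushed data.
 *
 * For stripes with data and parity ("data-parity" stripes,
 * STRIPE_R5C_CACHING == 0), simply replay all the writes to the raid disks.
 *
 * For stripes with only data ("data-only" stripes,
 * STRIPE_R5C_CACHING == 1), load them into the stripe cache so the state
 * machine writes them out later.
 *
 * If data is seen after parity for a stripe, the earlier data and parity of
 * that stripe were already flushed to the RAID and are dropped.
 */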
2233 static int r5c_recovery_flush_log(struct r5l_log *log,
2234 struct r5l_recovery_ctx *ctx)
2235 {
2236 struct stripe_head *sh;
2237 int ret = 0;
2238
2239
2240 while (1) {
2241 if (r5l_recovery_read_meta_block(log, ctx))
2242 break;
2243
2244 ret = r5c_recovery_analyze_meta_block(log, ctx,
2245 &ctx->cached_list);
2246
2247
2248
2249
2250 if (ret && ret != -EAGAIN)
2251 break;
2252 ctx->seq++;
2253 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2254 }
2255
2256 if (ret == -ENOMEM) {
2257 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2258 return ret;
2259 }
2260
2261
2262 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
2263
2264
2265 list_for_each_entry(sh, &ctx->cached_list, lru) {
2266 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2267 r5c_recovery_load_one_stripe(log, sh);
2268 ctx->data_only_stripes++;
2269 }
2270
2271 return 0;
2272 }
2273
2274
2342
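/*
 * After recovery, ctx->pos points at an invalid meta block.  New writes
 * will start from ctx->pos, so every data-only stripe recovered above must
 * first be rewritten to the journal from ctx->pos onward (each with a fresh
 * meta block and sequence number); otherwise the only copy of that data
 * could be overwritten before the stripe is written back to the raid disks.
 * log->next_checkpoint is advanced to the log_start of the last rewritten
 * stripe.
 */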
2343 static int
2344 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2345 struct r5l_recovery_ctx *ctx)
2346 {
2347 struct stripe_head *sh;
2348 struct mddev *mddev = log->rdev->mddev;
2349 struct page *page;
2350 sector_t next_checkpoint = MaxSector;
2351
2352 page = alloc_page(GFP_KERNEL);
2353 if (!page) {
2354 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2355 mdname(mddev));
2356 return -ENOMEM;
2357 }
2358
2359 WARN_ON(list_empty(&ctx->cached_list));
2360
2361 list_for_each_entry(sh, &ctx->cached_list, lru) {
2362 struct r5l_meta_block *mb;
2363 int i;
2364 int offset;
2365 sector_t write_pos;
2366
2367 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2368 r5l_recovery_create_empty_meta_block(log, page,
2369 ctx->pos, ctx->seq);
2370 mb = page_address(page);
2371 offset = le32_to_cpu(mb->meta_size);
2372 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2373
2374 for (i = sh->disks; i--; ) {
2375 struct r5dev *dev = &sh->dev[i];
2376 struct r5l_payload_data_parity *payload;
2377 void *addr;
2378
2379 if (test_bit(R5_InJournal, &dev->flags)) {
2380 payload = (void *)mb + offset;
2381 payload->header.type = cpu_to_le16(
2382 R5LOG_PAYLOAD_DATA);
2383 payload->size = cpu_to_le32(BLOCK_SECTORS);
2384 payload->location = cpu_to_le64(
2385 raid5_compute_blocknr(sh, i, 0));
2386 addr = kmap_atomic(dev->page);
2387 payload->checksum[0] = cpu_to_le32(
2388 crc32c_le(log->uuid_checksum, addr,
2389 PAGE_SIZE));
2390 kunmap_atomic(addr);
2391 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2392 dev->page, REQ_OP_WRITE, false);
2393 write_pos = r5l_ring_add(log, write_pos,
2394 BLOCK_SECTORS);
2395 offset += sizeof(__le32) +
2396 sizeof(struct r5l_payload_data_parity);
2397
2398 }
2399 }
2400 mb->meta_size = cpu_to_le32(offset);
2401 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2402 mb, PAGE_SIZE));
2403 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2404 REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false);
2405 sh->log_start = ctx->pos;
2406 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2407 atomic_inc(&log->stripe_in_journal_count);
2408 ctx->pos = write_pos;
2409 ctx->seq += 1;
2410 next_checkpoint = sh->log_start;
2411 }
2412 log->next_checkpoint = next_checkpoint;
2413 __free_page(page);
2414 return 0;
2415 }
2416
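/*
 * Push the recovered data-only stripes out to the RAID disks. The log is
 * temporarily switched to write-back mode so the cached stripes can go
 * through the normal write-out path; once conf->active_stripes drops to zero
 * the mode is restored to write-through. MD_SB_CHANGE_PENDING is cleared for
 * the duration (and restored afterwards) so stripe handling is not blocked on
 * a superblock update during recovery.
 */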
2417 static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2418 struct r5l_recovery_ctx *ctx)
2419 {
2420 struct mddev *mddev = log->rdev->mddev;
2421 struct r5conf *conf = mddev->private;
2422 struct stripe_head *sh, *next;
2423 bool cleared_pending = false;
2424
2425 if (ctx->data_only_stripes == 0)
2426 return;
2427
2428 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2429 cleared_pending = true;
2430 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2431 }
2432 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2433
2434 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2435 r5c_make_stripe_write_out(sh);
2436 set_bit(STRIPE_HANDLE, &sh->state);
2437 list_del_init(&sh->lru);
2438 raid5_release_stripe(sh);
2439 }
2440
2441
2442 wait_event(conf->wait_for_quiescent,
2443 atomic_read(&conf->active_stripes) == 0);
2444
2445 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2446 if (cleared_pending)
2447 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2448 }
2449
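/*
 * Main journal recovery: flush the log, then either write a single empty
 * meta block (no data-only stripes) or re-write the data-only stripes at the
 * new position, update log_start/seq/last_checkpoint and the superblock, and
 * finally flush the data-only stripes to the RAID disks.
 */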
2450 static int r5l_recovery_log(struct r5l_log *log)
2451 {
2452 struct mddev *mddev = log->rdev->mddev;
2453 struct r5l_recovery_ctx *ctx;
2454 int ret;
2455 sector_t pos;
2456
2457 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2458 if (!ctx)
2459 return -ENOMEM;
2460
2461 ctx->pos = log->last_checkpoint;
2462 ctx->seq = log->last_cp_seq;
2463 INIT_LIST_HEAD(&ctx->cached_list);
2464 ctx->meta_page = alloc_page(GFP_KERNEL);
2465
2466 if (!ctx->meta_page) {
2467 ret = -ENOMEM;
2468 goto meta_page;
2469 }
2470
2471 if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
2472 ret = -ENOMEM;
2473 goto ra_pool;
2474 }
2475
2476 ret = r5c_recovery_flush_log(log, ctx);
2477
2478 if (ret)
2479 goto error;
2480
2481 pos = ctx->pos;
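/*
 * Leave a large gap in the sequence numbers: everything written from here on
 * uses seq + 10000, so meta blocks left over from before this recovery can
 * never be taken for a valid continuation of the new log.
 */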
2482 ctx->seq += 10000;
2483
2484 if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
2485 pr_info("md/raid:%s: starting from clean shutdown\n",
2486 mdname(mddev));
2487 else
2488 pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2489 mdname(mddev), ctx->data_only_stripes,
2490 ctx->data_parity_stripes);
2491
2492 if (ctx->data_only_stripes == 0) {
2493 log->next_checkpoint = ctx->pos;
2494 r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
2495 ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2496 } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
2497 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2498 mdname(mddev));
2499 ret = -EIO;
2500 goto error;
2501 }
2502
2503 log->log_start = ctx->pos;
2504 log->seq = ctx->seq;
2505 log->last_checkpoint = pos;
2506 r5l_write_super(log, pos);
2507
2508 r5c_recovery_flush_data_only_stripes(log, ctx);
2509 ret = 0;
2510 error:
2511 r5l_recovery_free_ra_pool(log, ctx);
2512 ra_pool:
2513 __free_page(ctx->meta_page);
2514 meta_page:
2515 kfree(ctx);
2516 return ret;
2517 }
2518
2519 static void r5l_write_super(struct r5l_log *log, sector_t cp)
2520 {
2521 struct mddev *mddev = log->rdev->mddev;
2522
2523 log->rdev->journal_tail = cp;
2524 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2525 }
2526
2527 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2528 {
2529 struct r5conf *conf;
2530 int ret;
2531
2532 ret = mddev_lock(mddev);
2533 if (ret)
2534 return ret;
2535
2536 conf = mddev->private;
2537 if (!conf || !conf->log)
2538 goto out_unlock;
2539
2540 switch (conf->log->r5c_journal_mode) {
2541 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2542 ret = snprintf(
2543 page, PAGE_SIZE, "[%s] %s\n",
2544 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2545 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2546 break;
2547 case R5C_JOURNAL_MODE_WRITE_BACK:
2548 ret = snprintf(
2549 page, PAGE_SIZE, "%s [%s]\n",
2550 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2551 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2552 break;
2553 default:
2554 ret = 0;
2555 }
2556
2557 out_unlock:
2558 mddev_unlock(mddev);
2559 return ret;
2560 }
2561
2562
2563
2564
2565
2566
2567
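/*
 * Set the journal cache mode of @mddev. @mode must be a valid
 * enum r5c_journal_mode value, and switching to write-back is refused while
 * the array is degraded. The array is suspended around the change. Exported
 * for callers outside raid5-cache.
 */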
2568 int r5c_journal_mode_set(struct mddev *mddev, int mode)
2569 {
2570 struct r5conf *conf;
2571
2572 if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2573 mode > R5C_JOURNAL_MODE_WRITE_BACK)
2574 return -EINVAL;
2575
2576 conf = mddev->private;
2577 if (!conf || !conf->log)
2578 return -ENODEV;
2579
2580 if (raid5_calc_degraded(conf) > 0 &&
2581 mode == R5C_JOURNAL_MODE_WRITE_BACK)
2582 return -EINVAL;
2583
2584 mddev_suspend(mddev);
2585 conf->log->r5c_journal_mode = mode;
2586 mddev_resume(mddev);
2587
2588 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2589 mdname(mddev), mode, r5c_journal_mode_str[mode]);
2590 return 0;
2591 }
2592 EXPORT_SYMBOL(r5c_journal_mode_set);
2593
2594 static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2595 const char *page, size_t length)
2596 {
2597 int mode = ARRAY_SIZE(r5c_journal_mode_str);
2598 size_t len = length;
2599 int ret;
2600
2601 if (len < 2)
2602 return -EINVAL;
2603
2604 if (page[len - 1] == '\n')
2605 len--;
2606
2607 while (mode--)
2608 if (strlen(r5c_journal_mode_str[mode]) == len &&
2609 !strncmp(page, r5c_journal_mode_str[mode], len))
2610 break;
2611 ret = mddev_lock(mddev);
2612 if (ret)
2613 return ret;
2614 ret = r5c_journal_mode_set(mddev, mode);
2615 mddev_unlock(mddev);
2616 return ret ?: length;
2617 }
2618
2619 struct md_sysfs_entry
2620 r5c_journal_mode = __ATTR(journal_mode, 0644,
2621 r5c_journal_mode_show, r5c_journal_mode_store);
2622
2623
2624
2625
2626
2627
2628
2629
2630
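/*
 * Try to handle a write in the caching phase (write-back mode only). Returns
 * 0 if all pending writes on the stripe can be cached in the journal: the
 * blocks are marked R5_Wantwrite/R5_Wantdrain/R5_LOCKED and the stripe is
 * accounted in big_stripe_tree as a partial stripe. Returns -EAGAIN when the
 * stripe has to take the normal write-out path instead (write-out already in
 * progress, array degraded or syncing, non-overwrite writes, or the radix
 * tree insertion failed).
 */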
2631 int r5c_try_caching_write(struct r5conf *conf,
2632 struct stripe_head *sh,
2633 struct stripe_head_state *s,
2634 int disks)
2635 {
2636 struct r5l_log *log = conf->log;
2637 int i;
2638 struct r5dev *dev;
2639 int to_cache = 0;
2640 void __rcu **pslot;
2641 sector_t tree_index;
2642 int ret;
2643 uintptr_t refcount;
2644
2645 BUG_ON(!r5c_is_writeback(log));
2646
2647 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
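/*
 * The stripe is not in caching state. If it already has blocks in the
 * journal or writes in flight to the RAID disks it is in the middle of
 * write-out, so return -EAGAIN and let the write-out path finish the job;
 * otherwise this is a fresh write and the stripe switches to caching state.
 */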
2662 if (s->injournal > 0 || s->written > 0)
2663 return -EAGAIN;
2664
2665 set_bit(STRIPE_R5C_CACHING, &sh->state);
2666 }
2667
2668
2669
2670
2671
2672
2673
2674
2675
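/*
 * Writes to a failed (degraded) or resyncing stripe must not be cached;
 * handle them in the write-out phase instead.
 */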
2676 if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
2677 r5c_make_stripe_write_out(sh);
2678 return -EAGAIN;
2679 }
2680
2681 for (i = disks; i--; ) {
2682 dev = &sh->dev[i];
2683
2684 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2685 !test_bit(R5_InJournal, &dev->flags)) {
2686 r5c_make_stripe_write_out(sh);
2687 return -EAGAIN;
2688 }
2689 }
2690
2691
2692 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
2693 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2694 tree_index = r5c_tree_index(conf, sh->sector);
2695 spin_lock(&log->tree_lock);
2696 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2697 tree_index);
2698 if (pslot) {
2699 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2700 pslot, &log->tree_lock) >>
2701 R5C_RADIX_COUNT_SHIFT;
2702 radix_tree_replace_slot(
2703 &log->big_stripe_tree, pslot,
2704 (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
2705 } else {
2706
2707
2708
2709
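/*
 * No radix_tree_preload() is needed: if this GFP_NOWAIT insertion fails we
 * simply fall back to write-out for the stripe.
 */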
2710 ret = radix_tree_insert(
2711 &log->big_stripe_tree, tree_index,
2712 (void *)(1 << R5C_RADIX_COUNT_SHIFT));
2713 if (ret) {
2714 spin_unlock(&log->tree_lock);
2715 r5c_make_stripe_write_out(sh);
2716 return -EAGAIN;
2717 }
2718 }
2719 spin_unlock(&log->tree_lock);
2720
2721
2722
2723
2724
2725 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
2726 atomic_inc(&conf->r5c_cached_partial_stripes);
2727 }
2728
2729 for (i = disks; i--; ) {
2730 dev = &sh->dev[i];
2731 if (dev->towrite) {
2732 set_bit(R5_Wantwrite, &dev->flags);
2733 set_bit(R5_Wantdrain, &dev->flags);
2734 set_bit(R5_LOCKED, &dev->flags);
2735 to_cache++;
2736 }
2737 }
2738
2739 if (to_cache) {
2740 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2741
2742
2743
2744
2745
2746 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2747 }
2748
2749 return 0;
2750 }
2751
2752
2753
2754
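/*
 * Release the pages that were temporarily attached as dev->orig_page (see
 * r5c_use_extra_page()): restore orig_page to the data page, clear
 * R5_OrigPageUPTDODATE, and free the page unless it is the shared per-disk
 * conf->disks[].extra_page, in which case only R5C_EXTRA_PAGE_IN_USE is
 * cleared and raid5d is woken up.
 */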
2755 void r5c_release_extra_page(struct stripe_head *sh)
2756 {
2757 struct r5conf *conf = sh->raid_conf;
2758 int i;
2759 bool using_disk_info_extra_page;
2760
2761 using_disk_info_extra_page =
2762 sh->dev[0].orig_page == conf->disks[0].extra_page;
2763
2764 for (i = sh->disks; i--; )
2765 if (sh->dev[i].page != sh->dev[i].orig_page) {
2766 struct page *p = sh->dev[i].orig_page;
2767
2768 sh->dev[i].orig_page = sh->dev[i].page;
2769 clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2770
2771 if (!using_disk_info_extra_page)
2772 put_page(p);
2773 }
2774
2775 if (using_disk_info_extra_page) {
2776 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2777 md_wakeup_thread(conf->mddev->thread);
2778 }
2779 }
2780
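/*
 * Point every dev->orig_page at the shared conf->disks[i].extra_page
 * (dropping any page previously attached) so that dev->page and
 * dev->orig_page refer to different pages while the stripe is handled.
 */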
2781 void r5c_use_extra_page(struct stripe_head *sh)
2782 {
2783 struct r5conf *conf = sh->raid_conf;
2784 int i;
2785 struct r5dev *dev;
2786
2787 for (i = sh->disks; i--; ) {
2788 dev = &sh->dev[i];
2789 if (dev->orig_page != dev->page)
2790 put_page(dev->orig_page);
2791 dev->orig_page = conf->disks[i].extra_page;
2792 }
2793 }
2794
2795
2796
2797
2798
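/*
 * Clean up a stripe whose data and parity have reached the RAID disks: clear
 * R5_InJournal everywhere, fix up s->injournal, take the stripe off
 * stripe_in_journal_list, drop its reference in big_stripe_tree, update the
 * cached partial/full stripe counters and append a flush payload for the
 * stripe sector. In write-through mode only the parity device's flag needs
 * clearing.
 */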
2799 void r5c_finish_stripe_write_out(struct r5conf *conf,
2800 struct stripe_head *sh,
2801 struct stripe_head_state *s)
2802 {
2803 struct r5l_log *log = conf->log;
2804 int i;
2805 int do_wakeup = 0;
2806 sector_t tree_index;
2807 void __rcu **pslot;
2808 uintptr_t refcount;
2809
2810 if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2811 return;
2812
2813 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2814 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2815
2816 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2817 return;
2818
2819 for (i = sh->disks; i--; ) {
2820 clear_bit(R5_InJournal, &sh->dev[i].flags);
2821 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2822 do_wakeup = 1;
2823 }
2824
2825
2826
2827
2828
2829 s->injournal = 0;
2830
2831 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2832 if (atomic_dec_and_test(&conf->pending_full_writes))
2833 md_wakeup_thread(conf->mddev->thread);
2834
2835 if (do_wakeup)
2836 wake_up(&conf->wait_for_overlap);
2837
2838 spin_lock_irq(&log->stripe_in_journal_lock);
2839 list_del_init(&sh->r5c);
2840 spin_unlock_irq(&log->stripe_in_journal_lock);
2841 sh->log_start = MaxSector;
2842
2843 atomic_dec(&log->stripe_in_journal_count);
2844 r5c_update_log_state(log);
2845
2846
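/* drop this stripe's reference from big_stripe_tree */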
2847 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
2848 test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2849 tree_index = r5c_tree_index(conf, sh->sector);
2850 spin_lock(&log->tree_lock);
2851 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2852 tree_index);
2853 BUG_ON(pslot == NULL);
2854 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2855 pslot, &log->tree_lock) >>
2856 R5C_RADIX_COUNT_SHIFT;
2857 if (refcount == 1)
2858 radix_tree_delete(&log->big_stripe_tree, tree_index);
2859 else
2860 radix_tree_replace_slot(
2861 &log->big_stripe_tree, pslot,
2862 (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
2863 spin_unlock(&log->tree_lock);
2864 }
2865
2866 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
2867 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
2868 atomic_dec(&conf->r5c_flushing_partial_stripes);
2869 atomic_dec(&conf->r5c_cached_partial_stripes);
2870 }
2871
2872 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2873 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
2874 atomic_dec(&conf->r5c_flushing_full_stripes);
2875 atomic_dec(&conf->r5c_cached_full_stripes);
2876 }
2877
2878 r5l_append_flush_payload(log, sh->sector);
2879
2880 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
2881 set_bit(STRIPE_HANDLE, &sh->state);
2882 }
2883
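/*
 * Write the dirty data blocks of a caching stripe to the journal: compute a
 * checksum for every R5_Wantwrite page, then reserve one meta block plus the
 * data pages. If the log is critical or out of space the stripe is parked on
 * no_space_stripes (or no_mem_stripes when r5l_log_stripe() fails) and
 * retried later.
 */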
2884 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
2885 {
2886 struct r5conf *conf = sh->raid_conf;
2887 int pages = 0;
2888 int reserve;
2889 int i;
2890 int ret = 0;
2891
2892 BUG_ON(!log);
2893
2894 for (i = 0; i < sh->disks; i++) {
2895 void *addr;
2896
2897 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2898 continue;
2899 addr = kmap_atomic(sh->dev[i].page);
2900 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2901 addr, PAGE_SIZE);
2902 kunmap_atomic(addr);
2903 pages++;
2904 }
2905 WARN_ON(pages == 0);
2906
2907
2908
2909
2910
2911 clear_bit(STRIPE_DELAYED, &sh->state);
2912 atomic_inc(&sh->count);
2913
2914 mutex_lock(&log->io_mutex);
2915
2916 reserve = (1 + pages) << (PAGE_SHIFT - 9);
2917
2918 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2919 sh->log_start == MaxSector)
2920 r5l_add_no_space_stripe(log, sh);
2921 else if (!r5l_has_free_space(log, reserve)) {
2922 if (sh->log_start == log->last_checkpoint)
2923 BUG();
2924 else
2925 r5l_add_no_space_stripe(log, sh);
2926 } else {
2927 ret = r5l_log_stripe(log, sh, pages, 0);
2928 if (ret) {
2929 spin_lock_irq(&log->io_list_lock);
2930 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2931 spin_unlock_irq(&log->io_list_lock);
2932 }
2933 }
2934
2935 mutex_unlock(&log->io_mutex);
2936 return 0;
2937 }
2938
2939
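/*
 * Check whether the chunk ("big stripe") containing @sect has stripes in the
 * write-back cache. Runs under rcu_read_lock(); a plain lookup is enough
 * because an entry exists in big_stripe_tree only while its count is
 * non-zero.
 */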
2940 bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
2941 {
2942 struct r5l_log *log = conf->log;
2943 sector_t tree_index;
2944 void *slot;
2945
2946 if (!log)
2947 return false;
2948
2949 WARN_ON_ONCE(!rcu_read_lock_held());
2950 tree_index = r5c_tree_index(conf, sect);
2951 slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
2952 return slot != NULL;
2953 }
2954
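/*
 * Read the meta block at rdev->journal_tail and decide how to bring the log
 * up: if the block is missing, mis-placed or fails its checksum, start a
 * fresh log with an empty meta block at sector 0; otherwise recover from the
 * stored checkpoint. Also initializes device_size, max_free_space and
 * last_checkpoint.
 */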
2955 static int r5l_load_log(struct r5l_log *log)
2956 {
2957 struct md_rdev *rdev = log->rdev;
2958 struct page *page;
2959 struct r5l_meta_block *mb;
2960 sector_t cp = log->rdev->journal_tail;
2961 u32 stored_crc, expected_crc;
2962 bool create_super = false;
2963 int ret = 0;
2964
2965
2966 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2967 cp = 0;
2968 page = alloc_page(GFP_KERNEL);
2969 if (!page)
2970 return -ENOMEM;
2971
2972 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) {
2973 ret = -EIO;
2974 goto ioerr;
2975 }
2976 mb = page_address(page);
2977
2978 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2979 mb->version != R5LOG_VERSION) {
2980 create_super = true;
2981 goto create;
2982 }
2983 stored_crc = le32_to_cpu(mb->checksum);
2984 mb->checksum = 0;
2985 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2986 if (stored_crc != expected_crc) {
2987 create_super = true;
2988 goto create;
2989 }
2990 if (le64_to_cpu(mb->position) != cp) {
2991 create_super = true;
2992 goto create;
2993 }
2994 create:
2995 if (create_super) {
2996 log->last_cp_seq = prandom_u32();
2997 cp = 0;
2998 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
2999
3000
3001
3002
3003
3004 r5l_write_super(log, cp);
3005 } else
3006 log->last_cp_seq = le64_to_cpu(mb->seq);
3007
3008 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
3009 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
3010 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
3011 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
3012 log->last_checkpoint = cp;
3013
3014 __free_page(page);
3015
3016 if (create_super) {
3017 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
3018 log->seq = log->last_cp_seq + 1;
3019 log->next_checkpoint = cp;
3020 } else
3021 ret = r5l_recovery_log(log);
3022
3023 r5c_update_log_state(log);
3024 return ret;
3025 ioerr:
3026 __free_page(page);
3027 return ret;
3028 }
3029
3030 int r5l_start(struct r5l_log *log)
3031 {
3032 int ret;
3033
3034 if (!log)
3035 return 0;
3036
3037 ret = r5l_load_log(log);
3038 if (ret) {
3039 struct mddev *mddev = log->rdev->mddev;
3040 struct r5conf *conf = mddev->private;
3041
3042 r5l_exit_log(conf);
3043 }
3044 return ret;
3045 }
3046
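/*
 * Called when a member device fails. If the array is now degraded, or the
 * failed device is the journal itself, write-back caching is no longer safe,
 * so schedule the switch back to write-through.
 */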
3047 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
3048 {
3049 struct r5conf *conf = mddev->private;
3050 struct r5l_log *log = conf->log;
3051
3052 if (!log)
3053 return;
3054
3055 if ((raid5_calc_degraded(conf) > 0 ||
3056 test_bit(Journal, &rdev->flags)) &&
3057 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
3058 schedule_work(&log->disable_writeback_work);
3059 }
3060
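/*
 * Set up the r5l_log for @rdev: allocate the io_unit cache, mempools and bio
 * set, the reclaim thread and big_stripe_tree, and start in write-through
 * mode. Fails for arrays whose per-stripe metadata would not fit into a
 * single meta block page.
 */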
3061 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
3062 {
3063 struct request_queue *q = bdev_get_queue(rdev->bdev);
3064 struct r5l_log *log;
3065 int ret;
3066
3067 pr_debug("md/raid:%s: using device %pg as journal\n",
3068 mdname(conf->mddev), rdev->bdev);
3069
3070 if (PAGE_SIZE != 4096)
3071 return -EINVAL;
3072
3073
3074
3075
3076
3077
3078
3079
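/*
 * One meta block (a single page) must have room for a payload descriptor and
 * checksum for every member disk, which bounds the number of raid_disks the
 * journal can support.
 */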
3080 if (sizeof(struct r5l_meta_block) +
3081 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
3082 conf->raid_disks) > PAGE_SIZE) {
3083 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
3084 mdname(conf->mddev), conf->raid_disks);
3085 return -EINVAL;
3086 }
3087
3088 log = kzalloc(sizeof(*log), GFP_KERNEL);
3089 if (!log)
3090 return -ENOMEM;
3091 log->rdev = rdev;
3092
3093 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
3094
3095 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
3096 sizeof(rdev->mddev->uuid));
3097
3098 mutex_init(&log->io_mutex);
3099
3100 spin_lock_init(&log->io_list_lock);
3101 INIT_LIST_HEAD(&log->running_ios);
3102 INIT_LIST_HEAD(&log->io_end_ios);
3103 INIT_LIST_HEAD(&log->flushing_ios);
3104 INIT_LIST_HEAD(&log->finished_ios);
3105
3106 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
3107 if (!log->io_kc)
3108 goto io_kc;
3109
3110 ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
3111 if (ret)
3112 goto io_pool;
3113
3114 ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
3115 if (ret)
3116 goto io_bs;
3117
3118 ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
3119 if (ret)
3120 goto out_mempool;
3121
3122 spin_lock_init(&log->tree_lock);
3123 INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
3124
3125 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
3126 log->rdev->mddev, "reclaim");
3127 if (!log->reclaim_thread)
3128 goto reclaim_thread;
3129 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
3130
3131 init_waitqueue_head(&log->iounit_wait);
3132
3133 INIT_LIST_HEAD(&log->no_mem_stripes);
3134
3135 INIT_LIST_HEAD(&log->no_space_stripes);
3136 spin_lock_init(&log->no_space_stripes_lock);
3137
3138 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
3139 INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
3140
3141 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
3142 INIT_LIST_HEAD(&log->stripe_in_journal_list);
3143 spin_lock_init(&log->stripe_in_journal_lock);
3144 atomic_set(&log->stripe_in_journal_count, 0);
3145
3146 conf->log = log;
3147
3148 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
3149 return 0;
3150
3151 reclaim_thread:
3152 mempool_exit(&log->meta_pool);
3153 out_mempool:
3154 bioset_exit(&log->bs);
3155 io_bs:
3156 mempool_exit(&log->io_pool);
3157 io_pool:
3158 kmem_cache_destroy(log->io_kc);
3159 io_kc:
3160 kfree(log);
3161 return -EINVAL;
3162 }
3163
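/*
 * Tear the log down: wake up anyone waiting on sb_wait, make sure the
 * writeback-disable work has finished, stop the reclaim thread and free the
 * pools and the log itself.
 */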
3164 void r5l_exit_log(struct r5conf *conf)
3165 {
3166 struct r5l_log *log = conf->log;
3167
3168
3169 wake_up(&conf->mddev->sb_wait);
3170 flush_work(&log->disable_writeback_work);
3171 md_unregister_thread(&log->reclaim_thread);
3172
3173 conf->log = NULL;
3174
3175 mempool_exit(&log->meta_pool);
3176 bioset_exit(&log->bs);
3177 mempool_exit(&log->io_pool);
3178 kmem_cache_destroy(log->io_kc);
3179 kfree(log);
3180 }