/*
 * fs/fs-writeback.c
 *
 * Writeback of dirty pages and dirty inodes: the per-bdi flusher work
 * items, the dirty inode lists (b_dirty, b_io, b_more_io, b_dirty_time)
 * and cgroup-aware (memcg) writeback are all implemented here.  The
 * actual writeout of an inode's metadata is delegated to the filesystem
 * via ->write_inode().
 */
0017 #include <linux/kernel.h>
0018 #include <linux/export.h>
0019 #include <linux/spinlock.h>
0020 #include <linux/slab.h>
0021 #include <linux/sched.h>
0022 #include <linux/fs.h>
0023 #include <linux/mm.h>
0024 #include <linux/pagemap.h>
0025 #include <linux/kthread.h>
0026 #include <linux/writeback.h>
0027 #include <linux/blkdev.h>
0028 #include <linux/backing-dev.h>
0029 #include <linux/tracepoint.h>
0030 #include <linux/device.h>
0031 #include <linux/memcontrol.h>
0032 #include "internal.h"

/*
 * 4MB minimal write chunk size
 */
0037 #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
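/*
 * For reference: with the common 4KiB pages (PAGE_SHIFT == 12) this works
 * out to 4096KiB >> 2 == 1024 pages, i.e. a 4MB minimum chunk; the shift
 * converts a size expressed in KiB into a page count.
 */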

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
0042 struct wb_writeback_work {
0043 long nr_pages;
0044 struct super_block *sb;
0045 enum writeback_sync_modes sync_mode;
0046 unsigned int tagged_writepages:1;
0047 unsigned int for_kupdate:1;
0048 unsigned int range_cyclic:1;
0049 unsigned int for_background:1;
0050 unsigned int for_sync:1;
0051 unsigned int auto_free:1;
0052 enum wb_reason reason;
0053
0054 struct list_head list;
0055 struct wb_completion *done;
0056 };

/*
 * Inodes which are only dirty because of timestamp updates (lazytime) are
 * written back much more lazily than regular dirty data.  This is the
 * expiry interval for such "dirtytime" inodes; the default of 12 hours
 * means timestamps normally hit the disk within 12 hours and, in the
 * worst case, within roughly two intervals.
 */
0068 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
0069
0070 static inline struct inode *wb_inode(struct list_head *head)
0071 {
0072 return list_entry(head, struct inode, i_io_list);
0073 }

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
0080 #define CREATE_TRACE_POINTS
0081 #include <trace/events/writeback.h>
0082
0083 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
0084
0085 static bool wb_io_lists_populated(struct bdi_writeback *wb)
0086 {
0087 if (wb_has_dirty_io(wb)) {
0088 return false;
0089 } else {
0090 set_bit(WB_has_dirty_io, &wb->state);
0091 WARN_ON_ONCE(!wb->avg_write_bandwidth);
0092 atomic_long_add(wb->avg_write_bandwidth,
0093 &wb->bdi->tot_write_bandwidth);
0094 return true;
0095 }
0096 }
0097
0098 static void wb_io_lists_depopulated(struct bdi_writeback *wb)
0099 {
0100 if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
0101 list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
0102 clear_bit(WB_has_dirty_io, &wb->state);
0103 WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
0104 &wb->bdi->tot_write_bandwidth) < 0);
0105 }
0106 }

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and update the WB_has_dirty_io
 * bookkeeping.  Returns %true if @inode is the first occupant of the
 * !dirty_time IO lists; otherwise, %false.
 */
0118 static bool inode_io_list_move_locked(struct inode *inode,
0119 struct bdi_writeback *wb,
0120 struct list_head *head)
0121 {
0122 assert_spin_locked(&wb->list_lock);
0123 assert_spin_locked(&inode->i_lock);
0124
0125 list_move(&inode->i_io_list, head);
0126
0127
0128 if (head != &wb->b_dirty_time)
0129 return wb_io_lists_populated(wb);
0130
0131 wb_io_lists_depopulated(wb);
0132 return false;
0133 }
0134
0135 static void wb_wakeup(struct bdi_writeback *wb)
0136 {
0137 spin_lock_irq(&wb->work_lock);
0138 if (test_bit(WB_registered, &wb->state))
0139 mod_delayed_work(bdi_wq, &wb->dwork, 0);
0140 spin_unlock_irq(&wb->work_lock);
0141 }
0142
0143 static void finish_writeback_work(struct bdi_writeback *wb,
0144 struct wb_writeback_work *work)
0145 {
0146 struct wb_completion *done = work->done;
0147
0148 if (work->auto_free)
0149 kfree(work);
0150 if (done) {
0151 wait_queue_head_t *waitq = done->waitq;
0152
0153
0154 if (atomic_dec_and_test(&done->cnt))
0155 wake_up_all(waitq);
0156 }
0157 }
0158
0159 static void wb_queue_work(struct bdi_writeback *wb,
0160 struct wb_writeback_work *work)
0161 {
0162 trace_writeback_queue(wb, work);
0163
0164 if (work->done)
0165 atomic_inc(&work->done->cnt);
0166
0167 spin_lock_irq(&wb->work_lock);
0168
0169 if (test_bit(WB_registered, &wb->state)) {
0170 list_add_tail(&work->list, &wb->work_list);
0171 mod_delayed_work(bdi_wq, &wb->dwork, 0);
0172 } else
0173 finish_writeback_work(wb, work);
0174
0175 spin_unlock_irq(&wb->work_lock);
0176 }

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued with their ->done field set to
 * @done, which should have been initialized with DEFINE_WB_COMPLETION().
 * This function returns after all such work items are completed.  Work
 * items which are waited upon aren't freed automatically on completion.
 */
0188 void wb_wait_for_completion(struct wb_completion *done)
0189 {
0190 atomic_dec(&done->cnt);
0191 wait_event(*done->waitq, !atomic_read(&done->cnt));
0192 }
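/*
 * Typical use of the completion interface, mirroring what
 * __writeback_inodes_sb_nr() below does (sketch only):
 *
 *	DEFINE_WB_COMPLETION(done, bdi);
 *	work->done = &done;
 *	wb_queue_work(wb, work);
 *	wb_wait_for_completion(&done);
 *
 * The count dropped at the top of wb_wait_for_completion() balances the
 * initial count the completion starts with, so the waiter is only released
 * once every queued work item has gone through finish_writeback_work().
 */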
0193
0194 #ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for the foreign inode detection logic in wbc_detach_inode().
 *
 * Writeback IO time consumed per round is tracked in units of
 * 1/2^WB_FRN_TIME_SHIFT of a second and folded into an exponential moving
 * average (new samples weighted by 1/2^WB_FRN_TIME_AVG_SHIFT).  Rounds
 * consuming less than avg / WB_FRN_TIME_CUT_DIV are ignored.
 *
 * Foreign history is kept in a 16 slot bitmap (i_wb_frn_history) covering
 * WB_FRN_TIME_PERIOD of IO time; a switch to a foreign wb is considered
 * once foreign slots exceed WB_FRN_HIST_THR_SLOTS, and a single round can
 * affect at most WB_FRN_HIST_MAX_SLOTS slots.  WB_FRN_MAX_IN_FLIGHT caps
 * the number of concurrently queued switches.
 */
0215 #define WB_FRN_TIME_SHIFT 13
0216 #define WB_FRN_TIME_AVG_SHIFT 3
0217 #define WB_FRN_TIME_CUT_DIV 8
0218 #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT))
0219
0220 #define WB_FRN_HIST_SLOTS 16
0221 #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
0222
0223 #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
0224
0225 #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
0226
0227 #define WB_FRN_MAX_IN_FLIGHT 1024
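/*
 * Rough calibration of the numbers above (assuming the usual page-based
 * bandwidth accounting): avg_write_bandwidth is in pages/sec, so the IO
 * time computed in wbc_detach_inode() is in units of 1/8192 sec (2^-13).
 * One history slot (WB_FRN_HIST_UNIT) then corresponds to roughly 1/8
 * second of writeback time and the full 16 slot history to ~2 seconds;
 * a switch is considered once more than 8 of those slots were foreign.
 */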

/*
 * Maximum number of inodes per inode_switch_wbs_context; the value is
 * chosen so that the structure fits into a 1024 byte kmalloc allocation.
 */
0233 #define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \
0234 / sizeof(struct inode *))
0235
0236 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
0237 static struct workqueue_struct *isw_wq;
0238
0239 void __inode_attach_wb(struct inode *inode, struct page *page)
0240 {
0241 struct backing_dev_info *bdi = inode_to_bdi(inode);
0242 struct bdi_writeback *wb = NULL;
0243
0244 if (inode_cgwb_enabled(inode)) {
0245 struct cgroup_subsys_state *memcg_css;
0246
0247 if (page) {
0248 memcg_css = mem_cgroup_css_from_page(page);
0249 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0250 } else {
0251
0252 memcg_css = task_get_css(current, memory_cgrp_id);
0253 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0254 css_put(memcg_css);
0255 }
0256 }
0257
0258 if (!wb)
0259 wb = &bdi->wb;

/*
 * There may be multiple instances of this function racing to attach a
 * wb to the same inode.  Use cmpxchg() to tell the winner.
 */
0265 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
0266 wb_put(wb);
0267 }
0268 EXPORT_SYMBOL_GPL(__inode_attach_wb);
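/*
 * Most code reaches this through the inode_attach_wb() wrapper (a static
 * inline in the backing-dev header) which only drops into this slow path
 * while inode->i_wb is still NULL; __mark_inode_dirty() below does exactly
 * that the first time an inode becomes dirty.
 */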

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and if necessary put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
0278 static void inode_cgwb_move_to_attached(struct inode *inode,
0279 struct bdi_writeback *wb)
0280 {
0281 assert_spin_locked(&wb->list_lock);
0282 assert_spin_locked(&inode->i_lock);
0283
0284 inode->i_state &= ~I_SYNC_QUEUED;
0285 if (wb != &wb->bdi->wb)
0286 list_move(&inode->i_io_list, &wb->b_attached);
0287 else
0288 list_del_init(&inode->i_io_list);
0289 wb_io_lists_depopulated(wb);
0290 }

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
0300 static struct bdi_writeback *
0301 locked_inode_to_wb_and_lock_list(struct inode *inode)
0302 __releases(&inode->i_lock)
0303 __acquires(&wb->list_lock)
0304 {
0305 while (true) {
0306 struct bdi_writeback *wb = inode_to_wb(inode);

/*
 * inode_to_wb() association is protected by both @inode->i_lock and
 * @wb->list_lock but list_lock nests outside i_lock.  Deadlock
 * avoidance: hold i_lock while grabbing a reference against wb, then
 * drop i_lock, take list_lock and recheck the association below.
 */
0314 wb_get(wb);
0315 spin_unlock(&inode->i_lock);
0316 spin_lock(&wb->list_lock);
0317
0318
0319 if (likely(wb == inode->i_wb)) {
0320 wb_put(wb);
0321 return wb;
0322 }
0323
0324 spin_unlock(&wb->list_lock);
0325 wb_put(wb);
0326 cpu_relax();
0327 spin_lock(&inode->i_lock);
0328 }
0329 }

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
0338 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
0339 __acquires(&wb->list_lock)
0340 {
0341 spin_lock(&inode->i_lock);
0342 return locked_inode_to_wb_and_lock_list(inode);
0343 }
0344
0345 struct inode_switch_wbs_context {
0346 struct rcu_work work;

/*
 * Multiple inodes can be switched at once.  The switching procedure
 * consists of two parts, separated by a RCU grace period.  To make sure
 * that the second part is executed for each inode gone through the first
 * part, all inode pointers are placed into a NULL-terminated array
 * embedded into struct inode_switch_wbs_context.  Otherwise an inode
 * could be left in a non-consistent state.
 */
0356 struct bdi_writeback *new_wb;
0357 struct inode *inodes[];
0358 };
0359
0360 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
0361 {
0362 down_write(&bdi->wb_switch_rwsem);
0363 }
0364
0365 static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
0366 {
0367 up_write(&bdi->wb_switch_rwsem);
0368 }
0369
0370 static bool inode_do_switch_wbs(struct inode *inode,
0371 struct bdi_writeback *old_wb,
0372 struct bdi_writeback *new_wb)
0373 {
0374 struct address_space *mapping = inode->i_mapping;
0375 XA_STATE(xas, &mapping->i_pages, 0);
0376 struct folio *folio;
0377 bool switched = false;
0378
0379 spin_lock(&inode->i_lock);
0380 xa_lock_irq(&mapping->i_pages);

/*
 * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
 * path owns the inode and we shouldn't modify ->i_io_list.
 */
0386 if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
0387 goto skip_switch;
0388
0389 trace_inode_switch_wbs(inode, old_wb, new_wb);
0390
0391
0392
0393
0394
0395
0396 xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
0397 if (folio_test_dirty(folio)) {
0398 long nr = folio_nr_pages(folio);
0399 wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
0400 wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
0401 }
0402 }
0403
0404 xas_set(&xas, 0);
0405 xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
0406 long nr = folio_nr_pages(folio);
0407 WARN_ON_ONCE(!folio_test_writeback(folio));
0408 wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
0409 wb_stat_mod(new_wb, WB_WRITEBACK, nr);
0410 }
0411
0412 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
0413 atomic_dec(&old_wb->writeback_inodes);
0414 atomic_inc(&new_wb->writeback_inodes);
0415 }
0416
0417 wb_get(new_wb);

/*
 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
 * the specific list @inode was on is ignored and the @inode is put on
 * ->b_dirty which is always correct including from ->b_dirty_time.
 * If the @inode was clean, it was on the b_attached list, so move it
 * onto the b_attached list of @new_wb instead.
 */
0427 if (!list_empty(&inode->i_io_list)) {
0428 inode->i_wb = new_wb;
0429
0430 if (inode->i_state & I_DIRTY_ALL) {
0431 struct inode *pos;
0432
0433 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
0434 if (time_after_eq(inode->dirtied_when,
0435 pos->dirtied_when))
0436 break;
0437 inode_io_list_move_locked(inode, new_wb,
0438 pos->i_io_list.prev);
0439 } else {
0440 inode_cgwb_move_to_attached(inode, new_wb);
0441 }
0442 } else {
0443 inode->i_wb = new_wb;
0444 }
0445
0446
0447 inode->i_wb_frn_winner = 0;
0448 inode->i_wb_frn_avg_time = 0;
0449 inode->i_wb_frn_history = 0;
0450 switched = true;
0451 skip_switch:

/*
 * Paired with the load-acquire in unlocked_inode_to_wb_begin(); makes
 * sure the new wb association is visible before I_WB_SWITCH is cleared.
 */
0456 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
0457
0458 xa_unlock_irq(&mapping->i_pages);
0459 spin_unlock(&inode->i_lock);
0460
0461 return switched;
0462 }
0463
0464 static void inode_switch_wbs_work_fn(struct work_struct *work)
0465 {
0466 struct inode_switch_wbs_context *isw =
0467 container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
0468 struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
0469 struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
0470 struct bdi_writeback *new_wb = isw->new_wb;
0471 unsigned long nr_switched = 0;
0472 struct inode **inodep;
0473
0474
0475
0476
0477
0478 down_read(&bdi->wb_switch_rwsem);
0479
0480
0481
0482
0483
0484
0485
0486
0487
0488
0489
0490 if (old_wb < new_wb) {
0491 spin_lock(&old_wb->list_lock);
0492 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
0493 } else {
0494 spin_lock(&new_wb->list_lock);
0495 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
0496 }
0497
0498 for (inodep = isw->inodes; *inodep; inodep++) {
0499 WARN_ON_ONCE((*inodep)->i_wb != old_wb);
0500 if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
0501 nr_switched++;
0502 }
0503
0504 spin_unlock(&new_wb->list_lock);
0505 spin_unlock(&old_wb->list_lock);
0506
0507 up_read(&bdi->wb_switch_rwsem);
0508
0509 if (nr_switched) {
0510 wb_wakeup(new_wb);
0511 wb_put_many(old_wb, nr_switched);
0512 }
0513
0514 for (inodep = isw->inodes; *inodep; inodep++)
0515 iput(*inodep);
0516 wb_put(new_wb);
0517 kfree(isw);
0518 atomic_dec(&isw_nr_in_flight);
0519 }
0520
0521 static bool inode_prepare_wbs_switch(struct inode *inode,
0522 struct bdi_writeback *new_wb)
0523 {

/*
 * Paired with smp_mb() in cgroup_writeback_umount().  isw_nr_in_flight
 * must be incremented before checking SB_ACTIVE and grabbing the inode,
 * otherwise cgroup_writeback_umount() could observe it as zero and skip
 * flushing isw_wq.
 */
0530 smp_mb();
0531
0532 if (IS_DAX(inode))
0533 return false;
0534
0535
0536 spin_lock(&inode->i_lock);
0537 if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
0538 inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
0539 inode_to_wb(inode) == new_wb) {
0540 spin_unlock(&inode->i_lock);
0541 return false;
0542 }
0543 inode->i_state |= I_WB_SWITCH;
0544 __iget(inode);
0545 spin_unlock(&inode->i_lock);
0546
0547 return true;
0548 }

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
0558 static void inode_switch_wbs(struct inode *inode, int new_wb_id)
0559 {
0560 struct backing_dev_info *bdi = inode_to_bdi(inode);
0561 struct cgroup_subsys_state *memcg_css;
0562 struct inode_switch_wbs_context *isw;
0563
0564
0565 if (inode->i_state & I_WB_SWITCH)
0566 return;
0567
0568
0569 if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
0570 return;
0571
0572 isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
0573 if (!isw)
0574 return;
0575
0576 atomic_inc(&isw_nr_in_flight);
0577
0578
0579 rcu_read_lock();
0580 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
0581 if (memcg_css && !css_tryget(memcg_css))
0582 memcg_css = NULL;
0583 rcu_read_unlock();
0584 if (!memcg_css)
0585 goto out_free;
0586
0587 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0588 css_put(memcg_css);
0589 if (!isw->new_wb)
0590 goto out_free;
0591
0592 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
0593 goto out_free;
0594
0595 isw->inodes[0] = inode;
0596
0597
0598
0599
0600
0601
0602
0603 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
0604 queue_rcu_work(isw_wq, &isw->work);
0605 return;
0606
0607 out_free:
0608 atomic_dec(&isw_nr_in_flight);
0609 if (isw->new_wb)
0610 wb_put(isw->new_wb);
0611 kfree(isw);
0612 }
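/*
 * The switch above is driven from two places in this file: the foreign
 * inode detection in wbc_detach_inode() and the refresh of a dying wb in
 * wbc_attach_and_unlock_inode().  cleanup_offline_cgwb() below performs
 * the same kind of switch in bulk when a wb itself is going away.
 */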

/**
 * cleanup_offline_cgwb - detach associated inodes from a dying wb
 * @wb: target wb
 *
 * Switch the inodes attached to @wb to a nearest living ancestor's wb so
 * that @wb can eventually be released.  Returns %true if not all inodes
 * were switched and the function has to be restarted.
 */
0622 bool cleanup_offline_cgwb(struct bdi_writeback *wb)
0623 {
0624 struct cgroup_subsys_state *memcg_css;
0625 struct inode_switch_wbs_context *isw;
0626 struct inode *inode;
0627 int nr;
0628 bool restart = false;
0629
0630 isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
0631 GFP_KERNEL);
0632 if (!isw)
0633 return restart;
0634
0635 atomic_inc(&isw_nr_in_flight);
0636
0637 for (memcg_css = wb->memcg_css->parent; memcg_css;
0638 memcg_css = memcg_css->parent) {
0639 isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
0640 if (isw->new_wb)
0641 break;
0642 }
0643 if (unlikely(!isw->new_wb))
0644 isw->new_wb = &wb->bdi->wb;
0645
0646 nr = 0;
0647 spin_lock(&wb->list_lock);
0648 list_for_each_entry(inode, &wb->b_attached, i_io_list) {
0649 if (!inode_prepare_wbs_switch(inode, isw->new_wb))
0650 continue;
0651
0652 isw->inodes[nr++] = inode;
0653
0654 if (nr >= WB_MAX_INODES_PER_ISW - 1) {
0655 restart = true;
0656 break;
0657 }
0658 }
0659 spin_unlock(&wb->list_lock);
0660
0661
0662 if (nr == 0) {
0663 atomic_dec(&isw_nr_in_flight);
0664 wb_put(isw->new_wb);
0665 kfree(isw);
0666 return restart;
0667 }
0668
0669
0670
0671
0672
0673
0674
0675 INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
0676 queue_rcu_work(isw_wq, &isw->work);
0677
0678 return restart;
0679 }

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
0691 void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
0692 struct inode *inode)
0693 {
0694 if (!inode_cgwb_enabled(inode)) {
0695 spin_unlock(&inode->i_lock);
0696 return;
0697 }
0698
0699 wbc->wb = inode_to_wb(inode);
0700 wbc->inode = inode;
0701
0702 wbc->wb_id = wbc->wb->memcg_css->id;
0703 wbc->wb_lcand_id = inode->i_wb_frn_winner;
0704 wbc->wb_tcand_id = 0;
0705 wbc->wb_bytes = 0;
0706 wbc->wb_lcand_bytes = 0;
0707 wbc->wb_tcand_bytes = 0;
0708
0709 wb_get(wbc->wb);
0710 spin_unlock(&inode->i_lock);
0711
0712
0713
0714
0715
0716
0717
0718
0719 if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
0720 inode_switch_wbs(inode, wbc->wb_id);
0721 }
0722 EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on a first-use basis, cgroup writeback
 * tracks ownership per-inode.  While an inode being written to by
 * different cgroups at the same time is rare, an inode being written to
 * by different cgroups at different points in time is much more common,
 * and charging only by first use can easily lead to grossly incorrect
 * attribution.
 *
 * To resolve this, cgroup writeback detects the majority dirtier of an
 * inode and transfers the ownership to it.  To avoid oscillation, the
 * detection keeps a history and gives out the switch verdict only if the
 * foreign usage pattern is stable over a certain amount of time and/or
 * writeback attempts.
 *
 * On each writeback attempt, @wbc tracks the bytes written for the current
 * wb, the last round's winner and a majority candidate (Boyer-Moore style
 * vote, see wbc_account_cgroup_owner()).  The winner of the round and the
 * IO time it consumed are folded into inode->i_wb_frn_history; once the
 * recorded foreign IO time crosses the threshold, inode_switch_wbs() is
 * invoked.
 */
0761 void wbc_detach_inode(struct writeback_control *wbc)
0762 {
0763 struct bdi_writeback *wb = wbc->wb;
0764 struct inode *inode = wbc->inode;
0765 unsigned long avg_time, max_bytes, max_time;
0766 u16 history;
0767 int max_id;
0768
0769 if (!wb)
0770 return;
0771
0772 history = inode->i_wb_frn_history;
0773 avg_time = inode->i_wb_frn_avg_time;
0774
0775
0776 if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
0777 wbc->wb_bytes >= wbc->wb_tcand_bytes) {
0778 max_id = wbc->wb_id;
0779 max_bytes = wbc->wb_bytes;
0780 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
0781 max_id = wbc->wb_lcand_id;
0782 max_bytes = wbc->wb_lcand_bytes;
0783 } else {
0784 max_id = wbc->wb_tcand_id;
0785 max_bytes = wbc->wb_tcand_bytes;
0786 }
0787
0788
0789
0790
0791
0792
0793
0794
0795 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
0796 wb->avg_write_bandwidth);
0797 if (avg_time)
0798 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
0799 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
0800 else
0801 avg_time = max_time;
0802
0803 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
0804 int slots;
0805
0806
0807
0808
0809
0810
0811
0812
0813
0814 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
0815 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
0816 history <<= slots;
0817 if (wbc->wb_id != max_id)
0818 history |= (1U << slots) - 1;
0819
0820 if (history)
0821 trace_inode_foreign_history(inode, wbc, history);
0822
0823
0824
0825
0826
0827
0828
0829
0830 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
0831 inode_switch_wbs(inode, max_id);
0832 }
0833
0834
0835
0836
0837
0838 inode->i_wb_frn_winner = max_id;
0839 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
0840 inode->i_wb_frn_history = history;
0841
0842 wb_put(wbc->wb);
0843 wbc->wb = NULL;
0844 }
0845 EXPORT_SYMBOL_GPL(wbc_detach_inode);
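/*
 * The expected calling pattern, as used by writeback_single_inode() and
 * writeback_sb_inodes() in this file (sketch):
 *
 *	spin_lock(&inode->i_lock);
 *	inode->i_state |= I_SYNC;
 *	wbc_attach_and_unlock_inode(wbc, inode);
 *	ret = __writeback_single_inode(inode, wbc);
 *	wbc_detach_inode(wbc);
 *
 * Filesystems writing pages in between account them to the wbc with
 * wbc_account_cgroup_owner() so the foreign detection above has data to
 * work with.
 */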

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
0857 void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
0858 size_t bytes)
0859 {
0860 struct cgroup_subsys_state *css;
0861 int id;
0862
0863
0864
0865
0866
0867
0868
0869 if (!wbc->wb || wbc->no_cgroup_owner)
0870 return;
0871
0872 css = mem_cgroup_css_from_page(page);
0873
0874 if (!(css->flags & CSS_ONLINE))
0875 return;
0876
0877 id = css->id;
0878
0879 if (id == wbc->wb_id) {
0880 wbc->wb_bytes += bytes;
0881 return;
0882 }
0883
0884 if (id == wbc->wb_lcand_id)
0885 wbc->wb_lcand_bytes += bytes;

/* Boyer-Moore majority vote algorithm */
0888 if (!wbc->wb_tcand_bytes)
0889 wbc->wb_tcand_id = id;
0890 if (id == wbc->wb_tcand_id)
0891 wbc->wb_tcand_bytes += bytes;
0892 else
0893 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
0894 }
0895 EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
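/*
 * Worked example of the vote above (cgroups A and B, equal-sized pages,
 * neither matching wb_id or wb_lcand_id): pages owned by A, A, B, A in one
 * round make A the candidate with two units, B cancels one of them, and
 * the final A page brings it back to two.  The round therefore ends with
 * wb_tcand_id == A and positive wb_tcand_bytes for wbc_detach_inode() to
 * weigh against the attached wb and the previous winner.
 */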

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
0906 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
0907 {
0908 unsigned long this_bw = wb->avg_write_bandwidth;
0909 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
0910
0911 if (nr_pages == LONG_MAX)
0912 return LONG_MAX;
0913
0914
0915
0916
0917
0918
0919 if (!tot_bw || this_bw >= tot_bw)
0920 return nr_pages;
0921 else
0922 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
0923 }
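/*
 * Example of the split above: a wb that contributes 25% of the bdi's total
 * write bandwidth gets ceil(nr_pages / 4) of an nr_pages request, while
 * LONG_MAX ("write everything") requests are passed through unsplit.
 */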

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
0936 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
0937 struct wb_writeback_work *base_work,
0938 bool skip_if_busy)
0939 {
0940 struct bdi_writeback *last_wb = NULL;
0941 struct bdi_writeback *wb = list_entry(&bdi->wb_list,
0942 struct bdi_writeback, bdi_node);
0943
0944 might_sleep();
0945 restart:
0946 rcu_read_lock();
0947 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
0948 DEFINE_WB_COMPLETION(fallback_work_done, bdi);
0949 struct wb_writeback_work fallback_work;
0950 struct wb_writeback_work *work;
0951 long nr_pages;
0952
0953 if (last_wb) {
0954 wb_put(last_wb);
0955 last_wb = NULL;
0956 }
0957
0958
0959 if (!wb_has_dirty_io(wb) &&
0960 (base_work->sync_mode == WB_SYNC_NONE ||
0961 list_empty(&wb->b_dirty_time)))
0962 continue;
0963 if (skip_if_busy && writeback_in_progress(wb))
0964 continue;
0965
0966 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
0967
0968 work = kmalloc(sizeof(*work), GFP_ATOMIC);
0969 if (work) {
0970 *work = *base_work;
0971 work->nr_pages = nr_pages;
0972 work->auto_free = 1;
0973 wb_queue_work(wb, work);
0974 continue;
0975 }
0976
0977
0978 work = &fallback_work;
0979 *work = *base_work;
0980 work->nr_pages = nr_pages;
0981 work->auto_free = 0;
0982 work->done = &fallback_work_done;
0983
0984 wb_queue_work(wb, work);
0985
0986
0987
0988
0989
0990
0991 wb_get(wb);
0992 last_wb = wb;
0993
0994 rcu_read_unlock();
0995 wb_wait_for_completion(&fallback_work_done);
0996 goto restart;
0997 }
0998 rcu_read_unlock();
0999
1000 if (last_wb)
1001 wb_put(last_wb);
1002 }

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 */
1014 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
1015 enum wb_reason reason, struct wb_completion *done)
1016 {
1017 struct backing_dev_info *bdi;
1018 struct cgroup_subsys_state *memcg_css;
1019 struct bdi_writeback *wb;
1020 struct wb_writeback_work *work;
1021 unsigned long dirty;
1022 int ret;
1023
1024
1025 bdi = bdi_get_by_id(bdi_id);
1026 if (!bdi)
1027 return -ENOENT;
1028
1029 rcu_read_lock();
1030 memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
1031 if (memcg_css && !css_tryget(memcg_css))
1032 memcg_css = NULL;
1033 rcu_read_unlock();
1034 if (!memcg_css) {
1035 ret = -ENOENT;
1036 goto out_bdi_put;
1037 }
1038
1039
1040
1041
1042
1043 wb = wb_get_lookup(bdi, memcg_css);
1044 if (!wb) {
1045 ret = -ENOENT;
1046 goto out_css_put;
1047 }
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059 dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
1060 dirty = dirty * 10 / 8;
1061
1062
1063 work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
1064 if (work) {
1065 work->nr_pages = dirty;
1066 work->sync_mode = WB_SYNC_NONE;
1067 work->range_cyclic = 1;
1068 work->reason = reason;
1069 work->done = done;
1070 work->auto_free = 1;
1071 wb_queue_work(wb, work);
1072 ret = 0;
1073 } else {
1074 ret = -ENOMEM;
1075 }
1076
1077 wb_put(wb);
1078 out_css_put:
1079 css_put(memcg_css);
1080 out_bdi_put:
1081 bdi_put(bdi);
1082 return ret;
1083 }
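/*
 * Passing a NULL @done to cgroup_writeback_by_id() above makes the flush
 * fire-and-forget; callers that need to wait pass a wb_completion and use
 * wb_wait_for_completion(), like the other wb_completion users in this
 * file.
 */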

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
1095 void cgroup_writeback_umount(void)
1096 {
1097
1098
1099
1100
1101 smp_mb();
1102
1103 if (atomic_read(&isw_nr_in_flight)) {
1104
1105
1106
1107
1108 rcu_barrier();
1109 flush_workqueue(isw_wq);
1110 }
1111 }
1112
1113 static int __init cgroup_writeback_init(void)
1114 {
1115 isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1116 if (!isw_wq)
1117 return -ENOMEM;
1118 return 0;
1119 }
1120 fs_initcall(cgroup_writeback_init);
1121
1122 #else
1123
1124 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1125 static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
1126
1127 static void inode_cgwb_move_to_attached(struct inode *inode,
1128 struct bdi_writeback *wb)
1129 {
1130 assert_spin_locked(&wb->list_lock);
1131 assert_spin_locked(&inode->i_lock);
1132
1133 inode->i_state &= ~I_SYNC_QUEUED;
1134 list_del_init(&inode->i_io_list);
1135 wb_io_lists_depopulated(wb);
1136 }
1137
1138 static struct bdi_writeback *
1139 locked_inode_to_wb_and_lock_list(struct inode *inode)
1140 __releases(&inode->i_lock)
1141 __acquires(&wb->list_lock)
1142 {
1143 struct bdi_writeback *wb = inode_to_wb(inode);
1144
1145 spin_unlock(&inode->i_lock);
1146 spin_lock(&wb->list_lock);
1147 return wb;
1148 }
1149
1150 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
1151 __acquires(&wb->list_lock)
1152 {
1153 struct bdi_writeback *wb = inode_to_wb(inode);
1154
1155 spin_lock(&wb->list_lock);
1156 return wb;
1157 }
1158
1159 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1160 {
1161 return nr_pages;
1162 }
1163
1164 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1165 struct wb_writeback_work *base_work,
1166 bool skip_if_busy)
1167 {
1168 might_sleep();
1169
1170 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1171 base_work->auto_free = 0;
1172 wb_queue_work(&bdi->wb, base_work);
1173 }
1174 }
1175
1176 #endif

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
1182 static unsigned long get_nr_dirty_pages(void)
1183 {
1184 return global_node_page_state(NR_FILE_DIRTY) +
1185 get_nr_dirty_inodes();
1186 }
1187
1188 static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1189 {
1190 if (!wb_has_dirty_io(wb))
1191 return;
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201 if (test_bit(WB_start_all, &wb->state) ||
1202 test_and_set_bit(WB_start_all, &wb->state))
1203 return;
1204
1205 wb->start_all_reason = reason;
1206 wb_wakeup(wb);
1207 }

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens.  When this
 *   function returns, it is only guaranteed that for the given wb some IO
 *   is happening if we are over the background dirty threshold.  The
 *   caller need not hold the sb s_umount semaphore.
 */
1219 void wb_start_background_writeback(struct bdi_writeback *wb)
1220 {
1221
1222
1223
1224
1225 trace_writeback_wake_background(wb);
1226 wb_wakeup(wb);
1227 }

/*
 * Remove the inode from the writeback list it is on.
 */
1232 void inode_io_list_del(struct inode *inode)
1233 {
1234 struct bdi_writeback *wb;
1235
1236 wb = inode_to_wb_and_lock_list(inode);
1237 spin_lock(&inode->i_lock);
1238
1239 inode->i_state &= ~I_SYNC_QUEUED;
1240 list_del_init(&inode->i_io_list);
1241 wb_io_lists_depopulated(wb);
1242
1243 spin_unlock(&inode->i_lock);
1244 spin_unlock(&wb->list_lock);
1245 }
1246 EXPORT_SYMBOL(inode_io_list_del);

/*
 * Mark an inode as under writeback on the sb.
 */
1251 void sb_mark_inode_writeback(struct inode *inode)
1252 {
1253 struct super_block *sb = inode->i_sb;
1254 unsigned long flags;
1255
1256 if (list_empty(&inode->i_wb_list)) {
1257 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1258 if (list_empty(&inode->i_wb_list)) {
1259 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1260 trace_sb_mark_inode_writeback(inode);
1261 }
1262 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1263 }
1264 }

/*
 * Clear an inode as under writeback on the sb.
 */
1269 void sb_clear_inode_writeback(struct inode *inode)
1270 {
1271 struct super_block *sb = inode->i_sb;
1272 unsigned long flags;
1273
1274 if (!list_empty(&inode->i_wb_list)) {
1275 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1276 if (!list_empty(&inode->i_wb_list)) {
1277 list_del_init(&inode->i_wb_list);
1278 trace_sb_clear_inode_writeback(inode);
1279 }
1280 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1281 }
1282 }

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
1293 static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
1294 {
1295 assert_spin_locked(&inode->i_lock);
1296
1297 if (!list_empty(&wb->b_dirty)) {
1298 struct inode *tail;
1299
1300 tail = wb_inode(wb->b_dirty.next);
1301 if (time_before(inode->dirtied_when, tail->dirtied_when))
1302 inode->dirtied_when = jiffies;
1303 }
1304 inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1305 inode->i_state &= ~I_SYNC_QUEUED;
1306 }
1307
1308 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1309 {
1310 spin_lock(&inode->i_lock);
1311 redirty_tail_locked(inode, wb);
1312 spin_unlock(&inode->i_lock);
1313 }
1314
1315
1316
1317
1318 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1319 {
1320 inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1321 }
1322
1323 static void inode_sync_complete(struct inode *inode)
1324 {
1325 inode->i_state &= ~I_SYNC;
1326
1327 inode_add_lru(inode);
1328
1329 smp_mb();
1330 wake_up_bit(&inode->i_state, __I_SYNC);
1331 }
1332
1333 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1334 {
1335 bool ret = time_after(inode->dirtied_when, t);
1336 #ifndef CONFIG_64BIT
1337
1338
1339
1340
1341
1342
1343 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1344 #endif
1345 return ret;
1346 }
1347
1348 #define EXPIRE_DIRTY_ATIME 0x0001
1349
1350
1351
1352
1353
1354 static int move_expired_inodes(struct list_head *delaying_queue,
1355 struct list_head *dispatch_queue,
1356 unsigned long dirtied_before)
1357 {
1358 LIST_HEAD(tmp);
1359 struct list_head *pos, *node;
1360 struct super_block *sb = NULL;
1361 struct inode *inode;
1362 int do_sb_sort = 0;
1363 int moved = 0;
1364
1365 while (!list_empty(delaying_queue)) {
1366 inode = wb_inode(delaying_queue->prev);
1367 if (inode_dirtied_after(inode, dirtied_before))
1368 break;
1369 spin_lock(&inode->i_lock);
1370 list_move(&inode->i_io_list, &tmp);
1371 moved++;
1372 inode->i_state |= I_SYNC_QUEUED;
1373 spin_unlock(&inode->i_lock);
1374 if (sb_is_blkdev_sb(inode->i_sb))
1375 continue;
1376 if (sb && sb != inode->i_sb)
1377 do_sb_sort = 1;
1378 sb = inode->i_sb;
1379 }
1380
1381
1382 if (!do_sb_sort) {
1383 list_splice(&tmp, dispatch_queue);
1384 goto out;
1385 }
1386
1387
1388
1389
1390
1391
1392
1393 while (!list_empty(&tmp)) {
1394 sb = wb_inode(tmp.prev)->i_sb;
1395 list_for_each_prev_safe(pos, node, &tmp) {
1396 inode = wb_inode(pos);
1397 if (inode->i_sb == sb)
1398 list_move(&inode->i_io_list, dispatch_queue);
1399 }
1400 }
1401 out:
1402 return moved;
1403 }

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
1416 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1417 unsigned long dirtied_before)
1418 {
1419 int moved;
1420 unsigned long time_expire_jif = dirtied_before;
1421
1422 assert_spin_locked(&wb->list_lock);
1423 list_splice_init(&wb->b_more_io, &wb->b_io);
1424 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1425 if (!work->for_sync)
1426 time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
1427 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1428 time_expire_jif);
1429 if (moved)
1430 wb_io_lists_populated(wb);
1431 trace_writeback_queue_io(wb, work, dirtied_before, moved);
1432 }
1433
1434 static int write_inode(struct inode *inode, struct writeback_control *wbc)
1435 {
1436 int ret;
1437
1438 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1439 trace_writeback_write_inode_start(inode, wbc);
1440 ret = inode->i_sb->s_op->write_inode(inode, wbc);
1441 trace_writeback_write_inode(inode, wbc);
1442 return ret;
1443 }
1444 return 0;
1445 }
1446
1447
1448
1449
1450
1451 static void __inode_wait_for_writeback(struct inode *inode)
1452 __releases(inode->i_lock)
1453 __acquires(inode->i_lock)
1454 {
1455 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1456 wait_queue_head_t *wqh;
1457
1458 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1459 while (inode->i_state & I_SYNC) {
1460 spin_unlock(&inode->i_lock);
1461 __wait_on_bit(wqh, &wq, bit_wait,
1462 TASK_UNINTERRUPTIBLE);
1463 spin_lock(&inode->i_lock);
1464 }
1465 }
1466
1467
1468
1469
1470 void inode_wait_for_writeback(struct inode *inode)
1471 {
1472 spin_lock(&inode->i_lock);
1473 __inode_wait_for_writeback(inode);
1474 spin_unlock(&inode->i_lock);
1475 }
1476
1477
1478
1479
1480
1481
1482 static void inode_sleep_on_writeback(struct inode *inode)
1483 __releases(inode->i_lock)
1484 {
1485 DEFINE_WAIT(wait);
1486 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1487 int sleep;
1488
1489 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1490 sleep = inode->i_state & I_SYNC;
1491 spin_unlock(&inode->i_lock);
1492 if (sleep)
1493 schedule();
1494 finish_wait(wqh, &wait);
1495 }

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes.  This function can only be called by the flusher thread - nobody
 * else processes all the inodes on the writeback lists and requeueing inodes
 * behind the flusher's back could have unexpected consequences.
 */
1505 static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1506 struct writeback_control *wbc)
1507 {
1508 if (inode->i_state & I_FREEING)
1509 return;
1510
1511
1512
1513
1514
1515
1516 if ((inode->i_state & I_DIRTY) &&
1517 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1518 inode->dirtied_when = jiffies;
1519
1520 if (wbc->pages_skipped) {
1521
1522
1523
1524
1525 redirty_tail_locked(inode, wb);
1526 return;
1527 }
1528
1529 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1530
1531
1532
1533
1534 if (wbc->nr_to_write <= 0) {
1535
1536 requeue_io(inode, wb);
1537 } else {
1538
1539
1540
1541
1542
1543
1544
1545 redirty_tail_locked(inode, wb);
1546 }
1547 } else if (inode->i_state & I_DIRTY) {
1548
1549
1550
1551
1552
1553 redirty_tail_locked(inode, wb);
1554 } else if (inode->i_state & I_DIRTY_TIME) {
1555 inode->dirtied_when = jiffies;
1556 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1557 inode->i_state &= ~I_SYNC_QUEUED;
1558 } else {
1559
1560 inode_cgwb_move_to_attached(inode, wb);
1561 }
1562 }

/*
 * Write out an inode and its dirty pages (or some of its dirty pages,
 * depending on @wbc->nr_to_write), and clear the relevant dirty flags from
 * i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list
 * handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 */
1575 static int
1576 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1577 {
1578 struct address_space *mapping = inode->i_mapping;
1579 long nr_to_write = wbc->nr_to_write;
1580 unsigned dirty;
1581 int ret;
1582
1583 WARN_ON(!(inode->i_state & I_SYNC));
1584
1585 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1586
1587 ret = do_writepages(mapping, wbc);
1588
1589
1590
1591
1592
1593
1594
1595
1596 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1597 int err = filemap_fdatawait(mapping);
1598 if (ret == 0)
1599 ret = err;
1600 }
1601
1602
1603
1604
1605
1606
1607 if ((inode->i_state & I_DIRTY_TIME) &&
1608 (wbc->sync_mode == WB_SYNC_ALL ||
1609 time_after(jiffies, inode->dirtied_time_when +
1610 dirtytime_expire_interval * HZ))) {
1611 trace_writeback_lazytime(inode);
1612 mark_inode_dirty_sync(inode);
1613 }
1614
1615
1616
1617
1618
1619
1620
1621 spin_lock(&inode->i_lock);
1622 dirty = inode->i_state & I_DIRTY;
1623 inode->i_state &= ~dirty;
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636 smp_mb();
1637
1638 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1639 inode->i_state |= I_DIRTY_PAGES;
1640 else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
1641 if (!(inode->i_state & I_DIRTY_PAGES)) {
1642 inode->i_state &= ~I_PINNING_FSCACHE_WB;
1643 wbc->unpinned_fscache_wb = true;
1644 dirty |= I_PINNING_FSCACHE_WB;
1645 }
1646 }
1647
1648 spin_unlock(&inode->i_lock);
1649
1650
1651 if (dirty & ~I_DIRTY_PAGES) {
1652 int err = write_inode(inode, wbc);
1653 if (ret == 0)
1654 ret = err;
1655 }
1656 wbc->unpinned_fscache_wb = false;
1657 trace_writeback_single_inode(inode, wbc, nr_to_write);
1658 return ret;
1659 }

/*
 * Write out an inode's dirty data and metadata on demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a
 * reference to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 */
1670 static int writeback_single_inode(struct inode *inode,
1671 struct writeback_control *wbc)
1672 {
1673 struct bdi_writeback *wb;
1674 int ret = 0;
1675
1676 spin_lock(&inode->i_lock);
1677 if (!atomic_read(&inode->i_count))
1678 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1679 else
1680 WARN_ON(inode->i_state & I_WILL_FREE);
1681
1682 if (inode->i_state & I_SYNC) {
1683
1684
1685
1686
1687
1688
1689 if (wbc->sync_mode != WB_SYNC_ALL)
1690 goto out;
1691 __inode_wait_for_writeback(inode);
1692 }
1693 WARN_ON(inode->i_state & I_SYNC);
1694
1695
1696
1697
1698
1699
1700
1701 if (!(inode->i_state & I_DIRTY_ALL) &&
1702 (wbc->sync_mode != WB_SYNC_ALL ||
1703 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1704 goto out;
1705 inode->i_state |= I_SYNC;
1706 wbc_attach_and_unlock_inode(wbc, inode);
1707
1708 ret = __writeback_single_inode(inode, wbc);
1709
1710 wbc_detach_inode(wbc);
1711
1712 wb = inode_to_wb_and_lock_list(inode);
1713 spin_lock(&inode->i_lock);
1714
1715
1716
1717
1718
1719 if (!(inode->i_state & I_DIRTY_ALL))
1720 inode_cgwb_move_to_attached(inode, wb);
1721 else if (!(inode->i_state & I_SYNC_QUEUED) &&
1722 (inode->i_state & I_DIRTY))
1723 redirty_tail_locked(inode, wb);
1724
1725 spin_unlock(&wb->list_lock);
1726 inode_sync_complete(inode);
1727 out:
1728 spin_unlock(&inode->i_lock);
1729 return ret;
1730 }
1731
1732 static long writeback_chunk_size(struct bdi_writeback *wb,
1733 struct wb_writeback_work *work)
1734 {
1735 long pages;

/*
 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty inodes/pages
 * in one big loop.  Setting nr_to_write to LONG_MAX here avoids calling
 * into writeback_sb_inodes() more than once.
 *
 * The intended call sequence for WB_SYNC_ALL writeback is:
 *
 *      wb_writeback()
 *          writeback_sb_inodes()       <== called only once
 *              write_cache_pages()     <== called once for each inode
 *                  (quickly) tag currently dirty pages
 *                  (maybe slowly) sync all tagged pages
 */
1750 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1751 pages = LONG_MAX;
1752 else {
1753 pages = min(wb->avg_write_bandwidth / 2,
1754 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1755 pages = min(pages, work->nr_pages);
1756 pages = round_down(pages + MIN_WRITEBACK_PAGES,
1757 MIN_WRITEBACK_PAGES);
1758 }
1759
1760 return pages;
1761 }
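/*
 * Example of the chunking above, assuming 4KiB pages: a device averaging
 * 100MB/s has an avg_write_bandwidth of ~25600 pages/s, so a WB_SYNC_NONE
 * round is capped at about half of that (~12800 pages, ~50MB) and at the
 * global dirty limit share, then rounded so the chunk is always a non-zero
 * multiple of MIN_WRITEBACK_PAGES.
 */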

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
1772 static long writeback_sb_inodes(struct super_block *sb,
1773 struct bdi_writeback *wb,
1774 struct wb_writeback_work *work)
1775 {
1776 struct writeback_control wbc = {
1777 .sync_mode = work->sync_mode,
1778 .tagged_writepages = work->tagged_writepages,
1779 .for_kupdate = work->for_kupdate,
1780 .for_background = work->for_background,
1781 .for_sync = work->for_sync,
1782 .range_cyclic = work->range_cyclic,
1783 .range_start = 0,
1784 .range_end = LLONG_MAX,
1785 };
1786 unsigned long start_time = jiffies;
1787 long write_chunk;
1788 long total_wrote = 0;
1789
1790 while (!list_empty(&wb->b_io)) {
1791 struct inode *inode = wb_inode(wb->b_io.prev);
1792 struct bdi_writeback *tmp_wb;
1793 long wrote;
1794
1795 if (inode->i_sb != sb) {
1796 if (work->sb) {
1797
1798
1799
1800
1801
1802 redirty_tail(inode, wb);
1803 continue;
1804 }
1805
1806
1807
1808
1809
1810
1811 break;
1812 }
1813
1814
1815
1816
1817
1818
1819 spin_lock(&inode->i_lock);
1820 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1821 redirty_tail_locked(inode, wb);
1822 spin_unlock(&inode->i_lock);
1823 continue;
1824 }
1825 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835 requeue_io(inode, wb);
1836 spin_unlock(&inode->i_lock);
1837 trace_writeback_sb_inodes_requeue(inode);
1838 continue;
1839 }
1840 spin_unlock(&wb->list_lock);
1841
1842
1843
1844
1845
1846
1847 if (inode->i_state & I_SYNC) {
1848
1849 inode_sleep_on_writeback(inode);
1850
1851 spin_lock(&wb->list_lock);
1852 continue;
1853 }
1854 inode->i_state |= I_SYNC;
1855 wbc_attach_and_unlock_inode(&wbc, inode);
1856
1857 write_chunk = writeback_chunk_size(wb, work);
1858 wbc.nr_to_write = write_chunk;
1859 wbc.pages_skipped = 0;
1860
1861
1862
1863
1864
1865 __writeback_single_inode(inode, &wbc);
1866
1867 wbc_detach_inode(&wbc);
1868 work->nr_pages -= write_chunk - wbc.nr_to_write;
1869 wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
1870 wrote = wrote < 0 ? 0 : wrote;
1871 total_wrote += wrote;
1872
1873 if (need_resched()) {
1874
1875
1876
1877
1878
1879
1880
1881
1882 blk_flush_plug(current->plug, false);
1883 cond_resched();
1884 }
1885
1886
1887
1888
1889
1890 tmp_wb = inode_to_wb_and_lock_list(inode);
1891 spin_lock(&inode->i_lock);
1892 if (!(inode->i_state & I_DIRTY_ALL))
1893 total_wrote++;
1894 requeue_inode(inode, tmp_wb, &wbc);
1895 inode_sync_complete(inode);
1896 spin_unlock(&inode->i_lock);
1897
1898 if (unlikely(tmp_wb != wb)) {
1899 spin_unlock(&tmp_wb->list_lock);
1900 spin_lock(&wb->list_lock);
1901 }
1902
1903
1904
1905
1906
1907 if (total_wrote) {
1908 if (time_is_before_jiffies(start_time + HZ / 10UL))
1909 break;
1910 if (work->nr_pages <= 0)
1911 break;
1912 }
1913 }
1914 return total_wrote;
1915 }
1916
1917 static long __writeback_inodes_wb(struct bdi_writeback *wb,
1918 struct wb_writeback_work *work)
1919 {
1920 unsigned long start_time = jiffies;
1921 long wrote = 0;
1922
1923 while (!list_empty(&wb->b_io)) {
1924 struct inode *inode = wb_inode(wb->b_io.prev);
1925 struct super_block *sb = inode->i_sb;
1926
1927 if (!trylock_super(sb)) {
1928
1929
1930
1931
1932
1933 redirty_tail(inode, wb);
1934 continue;
1935 }
1936 wrote += writeback_sb_inodes(sb, wb, work);
1937 up_read(&sb->s_umount);
1938
1939
1940 if (wrote) {
1941 if (time_is_before_jiffies(start_time + HZ / 10UL))
1942 break;
1943 if (work->nr_pages <= 0)
1944 break;
1945 }
1946 }
1947
1948 return wrote;
1949 }
1950
1951 static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1952 enum wb_reason reason)
1953 {
1954 struct wb_writeback_work work = {
1955 .nr_pages = nr_pages,
1956 .sync_mode = WB_SYNC_NONE,
1957 .range_cyclic = 1,
1958 .reason = reason,
1959 };
1960 struct blk_plug plug;
1961
1962 blk_start_plug(&plug);
1963 spin_lock(&wb->list_lock);
1964 if (list_empty(&wb->b_io))
1965 queue_io(wb, &work, jiffies);
1966 __writeback_inodes_wb(wb, &work);
1967 spin_unlock(&wb->list_lock);
1968 blk_finish_plug(&plug);
1969
1970 return nr_pages - work.nr_pages;
1971 }

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
1988 static long wb_writeback(struct bdi_writeback *wb,
1989 struct wb_writeback_work *work)
1990 {
1991 long nr_pages = work->nr_pages;
1992 unsigned long dirtied_before = jiffies;
1993 struct inode *inode;
1994 long progress;
1995 struct blk_plug plug;
1996
1997 blk_start_plug(&plug);
1998 spin_lock(&wb->list_lock);
1999 for (;;) {
2000
2001
2002
2003 if (work->nr_pages <= 0)
2004 break;
2005
2006
2007
2008
2009
2010
2011
2012 if ((work->for_background || work->for_kupdate) &&
2013 !list_empty(&wb->work_list))
2014 break;
2015
2016
2017
2018
2019
2020 if (work->for_background && !wb_over_bg_thresh(wb))
2021 break;
2022
2023
2024
2025
2026
2027
2028
2029 if (work->for_kupdate) {
2030 dirtied_before = jiffies -
2031 msecs_to_jiffies(dirty_expire_interval * 10);
2032 } else if (work->for_background)
2033 dirtied_before = jiffies;
2034
2035 trace_writeback_start(wb, work);
2036 if (list_empty(&wb->b_io))
2037 queue_io(wb, work, dirtied_before);
2038 if (work->sb)
2039 progress = writeback_sb_inodes(work->sb, wb, work);
2040 else
2041 progress = __writeback_inodes_wb(wb, work);
2042 trace_writeback_written(wb, work);
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052 if (progress)
2053 continue;
2054
2055
2056
2057 if (list_empty(&wb->b_more_io))
2058 break;
2059
2060
2061
2062
2063
2064 trace_writeback_wait(wb, work);
2065 inode = wb_inode(wb->b_more_io.prev);
2066 spin_lock(&inode->i_lock);
2067 spin_unlock(&wb->list_lock);
2068
2069 inode_sleep_on_writeback(inode);
2070 spin_lock(&wb->list_lock);
2071 }
2072 spin_unlock(&wb->list_lock);
2073 blk_finish_plug(&plug);
2074
2075 return nr_pages - work->nr_pages;
2076 }
2077
2078
2079
2080
2081 static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
2082 {
2083 struct wb_writeback_work *work = NULL;
2084
2085 spin_lock_irq(&wb->work_lock);
2086 if (!list_empty(&wb->work_list)) {
2087 work = list_entry(wb->work_list.next,
2088 struct wb_writeback_work, list);
2089 list_del_init(&work->list);
2090 }
2091 spin_unlock_irq(&wb->work_lock);
2092 return work;
2093 }
2094
2095 static long wb_check_background_flush(struct bdi_writeback *wb)
2096 {
2097 if (wb_over_bg_thresh(wb)) {
2098
2099 struct wb_writeback_work work = {
2100 .nr_pages = LONG_MAX,
2101 .sync_mode = WB_SYNC_NONE,
2102 .for_background = 1,
2103 .range_cyclic = 1,
2104 .reason = WB_REASON_BACKGROUND,
2105 };
2106
2107 return wb_writeback(wb, &work);
2108 }
2109
2110 return 0;
2111 }
2112
2113 static long wb_check_old_data_flush(struct bdi_writeback *wb)
2114 {
2115 unsigned long expired;
2116 long nr_pages;
2117
2118
2119
2120
2121 if (!dirty_writeback_interval)
2122 return 0;
2123
2124 expired = wb->last_old_flush +
2125 msecs_to_jiffies(dirty_writeback_interval * 10);
2126 if (time_before(jiffies, expired))
2127 return 0;
2128
2129 wb->last_old_flush = jiffies;
2130 nr_pages = get_nr_dirty_pages();
2131
2132 if (nr_pages) {
2133 struct wb_writeback_work work = {
2134 .nr_pages = nr_pages,
2135 .sync_mode = WB_SYNC_NONE,
2136 .for_kupdate = 1,
2137 .range_cyclic = 1,
2138 .reason = WB_REASON_PERIODIC,
2139 };
2140
2141 return wb_writeback(wb, &work);
2142 }
2143
2144 return 0;
2145 }
2146
2147 static long wb_check_start_all(struct bdi_writeback *wb)
2148 {
2149 long nr_pages;
2150
2151 if (!test_bit(WB_start_all, &wb->state))
2152 return 0;
2153
2154 nr_pages = get_nr_dirty_pages();
2155 if (nr_pages) {
2156 struct wb_writeback_work work = {
2157 .nr_pages = wb_split_bdi_pages(wb, nr_pages),
2158 .sync_mode = WB_SYNC_NONE,
2159 .range_cyclic = 1,
2160 .reason = wb->start_all_reason,
2161 };
2162
2163 nr_pages = wb_writeback(wb, &work);
2164 }
2165
2166 clear_bit(WB_start_all, &wb->state);
2167 return nr_pages;
2168 }
2169
2170
2171
2172
2173
2174 static long wb_do_writeback(struct bdi_writeback *wb)
2175 {
2176 struct wb_writeback_work *work;
2177 long wrote = 0;
2178
2179 set_bit(WB_writeback_running, &wb->state);
2180 while ((work = get_next_work_item(wb)) != NULL) {
2181 trace_writeback_exec(wb, work);
2182 wrote += wb_writeback(wb, work);
2183 finish_writeback_work(wb, work);
2184 }
2185
2186
2187
2188
2189 wrote += wb_check_start_all(wb);
2190
2191
2192
2193
2194 wrote += wb_check_old_data_flush(wb);
2195 wrote += wb_check_background_flush(wb);
2196 clear_bit(WB_writeback_running, &wb->state);
2197
2198 return wrote;
2199 }
2200
2201
2202
2203
2204
2205 void wb_workfn(struct work_struct *work)
2206 {
2207 struct bdi_writeback *wb = container_of(to_delayed_work(work),
2208 struct bdi_writeback, dwork);
2209 long pages_written;
2210
2211 set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
2212
2213 if (likely(!current_is_workqueue_rescuer() ||
2214 !test_bit(WB_registered, &wb->state))) {
2215
2216
2217
2218
2219
2220
2221 do {
2222 pages_written = wb_do_writeback(wb);
2223 trace_writeback_pages_written(pages_written);
2224 } while (!list_empty(&wb->work_list));
2225 } else {
2226
2227
2228
2229
2230
2231 pages_written = writeback_inodes_wb(wb, 1024,
2232 WB_REASON_FORKER_THREAD);
2233 trace_writeback_pages_written(pages_written);
2234 }
2235
2236 if (!list_empty(&wb->work_list))
2237 wb_wakeup(wb);
2238 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2239 wb_wakeup_delayed(wb);
2240 }
2241
2242
2243
2244
2245
2246 static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2247 enum wb_reason reason)
2248 {
2249 struct bdi_writeback *wb;
2250
2251 if (!bdi_has_dirty_io(bdi))
2252 return;
2253
2254 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2255 wb_start_writeback(wb, reason);
2256 }
2257
2258 void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2259 enum wb_reason reason)
2260 {
2261 rcu_read_lock();
2262 __wakeup_flusher_threads_bdi(bdi, reason);
2263 rcu_read_unlock();
2264 }
2265
2266
2267
2268
2269 void wakeup_flusher_threads(enum wb_reason reason)
2270 {
2271 struct backing_dev_info *bdi;
2272
2273
2274
2275
2276 blk_flush_plug(current->plug, true);
2277
2278 rcu_read_lock();
2279 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2280 __wakeup_flusher_threads_bdi(bdi, reason);
2281 rcu_read_unlock();
2282 }

/*
 * Wake up bdi's periodically to make sure dirtytime inodes get
 * written back periodically.  We deliberately do *not* check the
 * b_dirty_time list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * dirtied the file system is a dirtytime inode, we need this
 * infrastructure to be able to periodically push the timestamp update
 * out to disk.
 */
2299 static void wakeup_dirtytime_writeback(struct work_struct *w);
2300 static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2301
2302 static void wakeup_dirtytime_writeback(struct work_struct *w)
2303 {
2304 struct backing_dev_info *bdi;
2305
2306 rcu_read_lock();
2307 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2308 struct bdi_writeback *wb;
2309
2310 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2311 if (!list_empty(&wb->b_dirty_time))
2312 wb_wakeup(wb);
2313 }
2314 rcu_read_unlock();
2315 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2316 }
2317
2318 static int __init start_dirtytime_writeback(void)
2319 {
2320 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2321 return 0;
2322 }
2323 __initcall(start_dirtytime_writeback);
2324
2325 int dirtytime_interval_handler(struct ctl_table *table, int write,
2326 void *buffer, size_t *lenp, loff_t *ppos)
2327 {
2328 int ret;
2329
2330 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2331 if (ret == 0 && write)
2332 mod_delayed_work(system_wq, &dirtytime_work, 0);
2333 return ret;
2334 }
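/*
 * This handler is intended to be wired to the sysctl that exposes
 * dirtytime_expire_interval (vm.dirtytime_expire_seconds in the usual
 * sysctl table, assuming the standard wiring).  Kicking dirtytime_work
 * immediately after a write lets a shortened interval take effect without
 * waiting for the old timeout.
 */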

/**
 * __mark_inode_dirty - internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *         multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *         with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed we write the inode out and put it on the
 * appropriate io list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 */
2363 void __mark_inode_dirty(struct inode *inode, int flags)
2364 {
2365 struct super_block *sb = inode->i_sb;
2366 int dirtytime = 0;
2367 struct bdi_writeback *wb = NULL;
2368
2369 trace_writeback_mark_inode_dirty(inode, flags);
2370
2371 if (flags & I_DIRTY_INODE) {
2372
2373
2374
2375
2376
2377
2378
2379 trace_writeback_dirty_inode_start(inode, flags);
2380 if (sb->s_op->dirty_inode)
2381 sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
2382 trace_writeback_dirty_inode(inode, flags);
2383
2384
2385 flags &= ~I_DIRTY_TIME;
2386 } else {
2387
2388
2389
2390
2391
2392 dirtytime = flags & I_DIRTY_TIME;
2393 WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
2394 }
2395
2396
2397
2398
2399
2400 smp_mb();
2401
2402 if (((inode->i_state & flags) == flags) ||
2403 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2404 return;
2405
2406 spin_lock(&inode->i_lock);
2407 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2408 goto out_unlock_inode;
2409 if ((inode->i_state & flags) != flags) {
2410 const int was_dirty = inode->i_state & I_DIRTY;
2411
2412 inode_attach_wb(inode, NULL);
2413
2414
2415 if (flags & I_DIRTY_INODE)
2416 inode->i_state &= ~I_DIRTY_TIME;
2417 inode->i_state |= flags;
2418
2419
2420
2421
2422
2423
2424
2425 if (!was_dirty) {
2426 wb = locked_inode_to_wb_and_lock_list(inode);
2427 spin_lock(&inode->i_lock);
2428 }
2429
2430
2431
2432
2433
2434
2435
2436 if (inode->i_state & I_SYNC_QUEUED)
2437 goto out_unlock;
2438
2439
2440
2441
2442
2443 if (!S_ISBLK(inode->i_mode)) {
2444 if (inode_unhashed(inode))
2445 goto out_unlock;
2446 }
2447 if (inode->i_state & I_FREEING)
2448 goto out_unlock;
2449
2450
2451
2452
2453
2454 if (!was_dirty) {
2455 struct list_head *dirty_list;
2456 bool wakeup_bdi = false;
2457
2458 inode->dirtied_when = jiffies;
2459 if (dirtytime)
2460 inode->dirtied_time_when = jiffies;
2461
2462 if (inode->i_state & I_DIRTY)
2463 dirty_list = &wb->b_dirty;
2464 else
2465 dirty_list = &wb->b_dirty_time;
2466
2467 wakeup_bdi = inode_io_list_move_locked(inode, wb,
2468 dirty_list);
2469
2470 spin_unlock(&wb->list_lock);
2471 spin_unlock(&inode->i_lock);
2472 trace_writeback_dirty_inode_enqueue(inode);
2473
2474
2475
2476
2477
2478
2479
2480 if (wakeup_bdi &&
2481 (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
2482 wb_wakeup_delayed(wb);
2483 return;
2484 }
2485 }
2486 out_unlock:
2487 if (wb)
2488 spin_unlock(&wb->list_lock);
2489 out_unlock_inode:
2490 spin_unlock(&inode->i_lock);
2491 }
2492 EXPORT_SYMBOL(__mark_inode_dirty);
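/*
 * Callers normally reach this through the mark_inode_dirty() and
 * mark_inode_dirty_sync() wrappers (which pass I_DIRTY and I_DIRTY_SYNC
 * respectively), or through the timestamp update paths that pass
 * I_DIRTY_TIME on lazytime mounts.
 */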

/*
 * Wait for writeback on all inodes of @sb that were under writeback when
 * sync was issued.  @s_sync_lock serialises concurrent sync operations so
 * that concurrent wait_sb_inodes() callers block on the lock instead of
 * doing contending walks of the s_inodes_wb list.
 */
2503 static void wait_sb_inodes(struct super_block *sb)
2504 {
2505 LIST_HEAD(sync_list);
2506
2507
2508
2509
2510
2511 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2512
2513 mutex_lock(&sb->s_sync_lock);
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524 rcu_read_lock();
2525 spin_lock_irq(&sb->s_inode_wblist_lock);
2526 list_splice_init(&sb->s_inodes_wb, &sync_list);
2527
2528
2529
2530
2531
2532
2533
2534
2535 while (!list_empty(&sync_list)) {
2536 struct inode *inode = list_first_entry(&sync_list, struct inode,
2537 i_wb_list);
2538 struct address_space *mapping = inode->i_mapping;
2539
2540
2541
2542
2543
2544
2545
2546 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2547
2548
2549
2550
2551
2552
2553 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2554 continue;
2555
2556 spin_unlock_irq(&sb->s_inode_wblist_lock);
2557
2558 spin_lock(&inode->i_lock);
2559 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2560 spin_unlock(&inode->i_lock);
2561
2562 spin_lock_irq(&sb->s_inode_wblist_lock);
2563 continue;
2564 }
2565 __iget(inode);
2566 spin_unlock(&inode->i_lock);
2567 rcu_read_unlock();
2568
2569
2570
2571
2572
2573
2574 filemap_fdatawait_keep_errors(mapping);
2575
2576 cond_resched();
2577
2578 iput(inode);
2579
2580 rcu_read_lock();
2581 spin_lock_irq(&sb->s_inode_wblist_lock);
2582 }
2583 spin_unlock_irq(&sb->s_inode_wblist_lock);
2584 rcu_read_unlock();
2585 mutex_unlock(&sb->s_sync_lock);
2586 }
2587
2588 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2589 enum wb_reason reason, bool skip_if_busy)
2590 {
2591 struct backing_dev_info *bdi = sb->s_bdi;
2592 DEFINE_WB_COMPLETION(done, bdi);
2593 struct wb_writeback_work work = {
2594 .sb = sb,
2595 .sync_mode = WB_SYNC_NONE,
2596 .tagged_writepages = 1,
2597 .done = &done,
2598 .nr_pages = nr,
2599 .reason = reason,
2600 };
2601
2602 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2603 return;
2604 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2605
2606 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2607 wb_wait_for_completion(&done);
2608 }

/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2620 void writeback_inodes_sb_nr(struct super_block *sb,
2621 unsigned long nr,
2622 enum wb_reason reason)
2623 {
2624 __writeback_inodes_sb_nr(sb, nr, reason, false);
2625 }
2626 EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block.  No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
2637 void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2638 {
2639 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2640 }
2641 EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
2650 void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2651 {
2652 if (!down_read_trylock(&sb->s_umount))
2653 return;
2654
2655 __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2656 up_read(&sb->s_umount);
2657 }
2658 EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
2667 void sync_inodes_sb(struct super_block *sb)
2668 {
2669 struct backing_dev_info *bdi = sb->s_bdi;
2670 DEFINE_WB_COMPLETION(done, bdi);
2671 struct wb_writeback_work work = {
2672 .sb = sb,
2673 .sync_mode = WB_SYNC_ALL,
2674 .nr_pages = LONG_MAX,
2675 .range_cyclic = 0,
2676 .done = &done,
2677 .reason = WB_REASON_SYNC,
2678 .for_sync = 1,
2679 };
2680
2681
2682
2683
2684
2685
2686 if (bdi == &noop_backing_dev_info)
2687 return;
2688 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2689
2690
2691 bdi_down_write_wb_switch_rwsem(bdi);
2692 bdi_split_work_to_wbs(bdi, &work, false);
2693 wb_wait_for_completion(&done);
2694 bdi_up_write_wb_switch_rwsem(bdi);
2695
2696 wait_sb_inodes(sb);
2697 }
2698 EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty.  This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
2710 int write_inode_now(struct inode *inode, int sync)
2711 {
2712 struct writeback_control wbc = {
2713 .nr_to_write = LONG_MAX,
2714 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2715 .range_start = 0,
2716 .range_end = LLONG_MAX,
2717 };
2718
2719 if (!mapping_can_writeback(inode->i_mapping))
2720 wbc.nr_to_write = 0;
2721
2722 might_sleep();
2723 return writeback_single_inode(inode, &wbc);
2724 }
2725 EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
2736 int sync_inode_metadata(struct inode *inode, int wait)
2737 {
2738 struct writeback_control wbc = {
2739 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2740 .nr_to_write = 0,
2741 };
2742
2743 return writeback_single_inode(inode, &wbc);
2744 }
2745 EXPORT_SYMBOL(sync_inode_metadata);