0001 /*
0002  * fs/fs-writeback.c
0003  *
0004  * Copyright (C) 2002, Linus Torvalds.
0005  *
0006  * Contains all the functions related to writing back and waiting
0007  * upon dirty inodes against superblocks, and writing back dirty
0008  * pages against inodes.  ie: data writeback.  Writeout of the
0009  * inode itself is not handled here.
0010  *
0011  * 10Apr2002    Andrew Morton
0012  *      Split out of fs/inode.c
0013  *      Additions for address_space-based writeback
0014  */
0015 
0016 #include <linux/kernel.h>
0017 #include <linux/export.h>
0018 #include <linux/spinlock.h>
0019 #include <linux/slab.h>
0020 #include <linux/sched.h>
0021 #include <linux/fs.h>
0022 #include <linux/mm.h>
0023 #include <linux/pagemap.h>
0024 #include <linux/kthread.h>
0025 #include <linux/writeback.h>
0026 #include <linux/blkdev.h>
0027 #include <linux/backing-dev.h>
0028 #include <linux/tracepoint.h>
0029 #include <linux/device.h>
0030 #include <linux/memcontrol.h>
0031 #include "internal.h"
0032 
0033 /*
0034  * 4MB minimal write chunk size
0035  */
0036 #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
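
/*
 * For example, with 4KB pages (PAGE_SHIFT == 12) this is 4096 >> 2 ==
 * 1024 pages, i.e. 4MB; with 64KB pages it is 64 pages, still 4MB.  The
 * expression simply converts 4096KB into pages for the configured
 * PAGE_SIZE.
 */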
0037 
0038 struct wb_completion {
0039     atomic_t        cnt;
0040 };
0041 
0042 /*
0043  * Passed into wb_writeback(), essentially a subset of writeback_control
0044  */
0045 struct wb_writeback_work {
0046     long nr_pages;
0047     struct super_block *sb;
0048     unsigned long *older_than_this;
0049     enum writeback_sync_modes sync_mode;
0050     unsigned int tagged_writepages:1;
0051     unsigned int for_kupdate:1;
0052     unsigned int range_cyclic:1;
0053     unsigned int for_background:1;
0054     unsigned int for_sync:1;    /* sync(2) WB_SYNC_ALL writeback */
0055     unsigned int auto_free:1;   /* free on completion */
0056     enum wb_reason reason;      /* why was writeback initiated? */
0057 
0058     struct list_head list;      /* pending work list */
0059     struct wb_completion *done; /* set if the caller waits */
0060 };
0061 
0062 /*
0063  * If one wants to wait for one or more wb_writeback_works, each work's
0064  * ->done should be set to a wb_completion defined using the following
0065  * macro.  Once all work items are issued with wb_queue_work(), the caller
0066  * can wait for the completion of all using wb_wait_for_completion().  Work
0067  * items which are waited upon aren't freed automatically on completion.
0068  */
0069 #define DEFINE_WB_COMPLETION_ONSTACK(cmpl)              \
0070     struct wb_completion cmpl = {                   \
0071         .cnt        = ATOMIC_INIT(1),           \
0072     }
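
/*
 * Rough usage sketch, mirroring the on-stack fallback path in
 * bdi_split_work_to_wbs() below: define the completion, point each
 * work's ->done at it, queue the works, then wait.
 *
 *     DEFINE_WB_COMPLETION_ONSTACK(done);
 *     struct wb_writeback_work work = { ... };
 *
 *     work.done = &done;
 *     wb_queue_work(wb, &work);
 *     wb_wait_for_completion(bdi, &done);
 */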
0073 
0074 
0075 /*
0076  * If an inode is constantly having its pages dirtied, but then the
0077  * updates stop dirtytime_expire_interval seconds in the past, it's
0078  * possible for the worst case time between when an inode has its
0079  * timestamps updated and when they finally get written out to be two
0080  * dirtytime_expire_intervals.  We set the default to 12 hours (in
0081  * seconds), which means most of the time inodes will have their
0082  * timestamps written to disk after 12 hours, but in the worst case a
0083  * few inodes might not have their timestamps updated for 24 hours.
0084  */
0085 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
0086 
0087 static inline struct inode *wb_inode(struct list_head *head)
0088 {
0089     return list_entry(head, struct inode, i_io_list);
0090 }
0091 
0092 /*
0093  * Include the creation of the trace points after defining the
0094  * wb_writeback_work structure and inline functions so that the definition
0095  * remains local to this file.
0096  */
0097 #define CREATE_TRACE_POINTS
0098 #include <trace/events/writeback.h>
0099 
0100 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
0101 
0102 static bool wb_io_lists_populated(struct bdi_writeback *wb)
0103 {
0104     if (wb_has_dirty_io(wb)) {
0105         return false;
0106     } else {
0107         set_bit(WB_has_dirty_io, &wb->state);
0108         WARN_ON_ONCE(!wb->avg_write_bandwidth);
0109         atomic_long_add(wb->avg_write_bandwidth,
0110                 &wb->bdi->tot_write_bandwidth);
0111         return true;
0112     }
0113 }
0114 
0115 static void wb_io_lists_depopulated(struct bdi_writeback *wb)
0116 {
0117     if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
0118         list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
0119         clear_bit(WB_has_dirty_io, &wb->state);
0120         WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
0121                     &wb->bdi->tot_write_bandwidth) < 0);
0122     }
0123 }
0124 
0125 /**
0126  * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
0127  * @inode: inode to be moved
0128  * @wb: target bdi_writeback
0129  * @head: one of @wb->b_{dirty|io|more_io}
0130  *
0131  * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
0132  * Returns %true if @inode is the first occupant of the !dirty_time IO
0133  * lists; otherwise, %false.
0134  */
0135 static bool inode_io_list_move_locked(struct inode *inode,
0136                       struct bdi_writeback *wb,
0137                       struct list_head *head)
0138 {
0139     assert_spin_locked(&wb->list_lock);
0140 
0141     list_move(&inode->i_io_list, head);
0142 
0143     /* dirty_time doesn't count as dirty_io until expiration */
0144     if (head != &wb->b_dirty_time)
0145         return wb_io_lists_populated(wb);
0146 
0147     wb_io_lists_depopulated(wb);
0148     return false;
0149 }
0150 
0151 /**
0152  * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
0153  * @inode: inode to be removed
0154  * @wb: bdi_writeback @inode is being removed from
0155  *
0156  * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
0157  * clear %WB_has_dirty_io if all are empty afterwards.
0158  */
0159 static void inode_io_list_del_locked(struct inode *inode,
0160                      struct bdi_writeback *wb)
0161 {
0162     assert_spin_locked(&wb->list_lock);
0163 
0164     list_del_init(&inode->i_io_list);
0165     wb_io_lists_depopulated(wb);
0166 }
0167 
0168 static void wb_wakeup(struct bdi_writeback *wb)
0169 {
0170     spin_lock_bh(&wb->work_lock);
0171     if (test_bit(WB_registered, &wb->state))
0172         mod_delayed_work(bdi_wq, &wb->dwork, 0);
0173     spin_unlock_bh(&wb->work_lock);
0174 }
0175 
0176 static void wb_queue_work(struct bdi_writeback *wb,
0177               struct wb_writeback_work *work)
0178 {
0179     trace_writeback_queue(wb, work);
0180 
0181     spin_lock_bh(&wb->work_lock);
0182     if (!test_bit(WB_registered, &wb->state))
0183         goto out_unlock;
0184     if (work->done)
0185         atomic_inc(&work->done->cnt);
0186     list_add_tail(&work->list, &wb->work_list);
0187     mod_delayed_work(bdi_wq, &wb->dwork, 0);
0188 out_unlock:
0189     spin_unlock_bh(&wb->work_lock);
0190 }
0191 
0192 /**
0193  * wb_wait_for_completion - wait for completion of bdi_writeback_works
0194  * @bdi: bdi work items were issued to
0195  * @done: target wb_completion
0196  *
0197  * Wait for one or more work items issued to @bdi with their ->done field
0198  * set to @done, which should have been defined with
0199  * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
0200  * work items are completed.  Work items which are waited upon aren't freed
0201  * automatically on completion.
0202  */
0203 static void wb_wait_for_completion(struct backing_dev_info *bdi,
0204                    struct wb_completion *done)
0205 {
0206     atomic_dec(&done->cnt);     /* put down the initial count */
0207     wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
0208 }
0209 
0210 #ifdef CONFIG_CGROUP_WRITEBACK
0211 
0212 /* parameters for foreign inode detection, see wbc_detach_inode() */
0213 #define WB_FRN_TIME_SHIFT   13  /* 1s = 2^13, up to 8 secs w/ 16bit */
0214 #define WB_FRN_TIME_AVG_SHIFT   3   /* avg = avg * 7/8 + new * 1/8 */
0215 #define WB_FRN_TIME_CUT_DIV 2   /* ignore rounds < avg / 2 */
0216 #define WB_FRN_TIME_PERIOD  (2 * (1 << WB_FRN_TIME_SHIFT))  /* 2s */
0217 
0218 #define WB_FRN_HIST_SLOTS   16  /* inode->i_wb_frn_history is 16bit */
0219 #define WB_FRN_HIST_UNIT    (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
0220                     /* each slot's duration is 2s / 16 */
0221 #define WB_FRN_HIST_THR_SLOTS   (WB_FRN_HIST_SLOTS / 2)
0222                     /* if foreign slots >= 8, switch */
0223 #define WB_FRN_HIST_MAX_SLOTS   (WB_FRN_HIST_THR_SLOTS / 2 + 1)
0224                     /* one round can affect up to 5 slots */
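
/*
 * Plugging in the numbers above: time is tracked in units of 2^-13 sec,
 * so WB_FRN_TIME_PERIOD is 16384 units (~2s), each of the 16 history
 * slots covers 1024 units (~125ms), the switch threshold is 8 foreign
 * slots (~1s worth of foreign IO within the tracked period), and a
 * single writeback round can shift in at most 5 slots.
 */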
0225 
0226 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
0227 static struct workqueue_struct *isw_wq;
0228 
0229 void __inode_attach_wb(struct inode *inode, struct page *page)
0230 {
0231     struct backing_dev_info *bdi = inode_to_bdi(inode);
0232     struct bdi_writeback *wb = NULL;
0233 
0234     if (inode_cgwb_enabled(inode)) {
0235         struct cgroup_subsys_state *memcg_css;
0236 
0237         if (page) {
0238             memcg_css = mem_cgroup_css_from_page(page);
0239             wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0240         } else {
0241             /* must pin memcg_css, see wb_get_create() */
0242             memcg_css = task_get_css(current, memory_cgrp_id);
0243             wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0244             css_put(memcg_css);
0245         }
0246     }
0247 
0248     if (!wb)
0249         wb = &bdi->wb;
0250 
0251     /*
0252      * There may be multiple instances of this function racing to
0253      * update the same inode.  Use cmpxchg() to tell the winner.
0254      */
0255     if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
0256         wb_put(wb);
0257 }
0258 
0259 /**
0260  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
0261  * @inode: inode of interest with i_lock held
0262  *
0263  * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
0264  * held on entry and is released on return.  The returned wb is guaranteed
0265  * to stay @inode's associated wb until its list_lock is released.
0266  */
0267 static struct bdi_writeback *
0268 locked_inode_to_wb_and_lock_list(struct inode *inode)
0269     __releases(&inode->i_lock)
0270     __acquires(&wb->list_lock)
0271 {
0272     while (true) {
0273         struct bdi_writeback *wb = inode_to_wb(inode);
0274 
0275         /*
0276          * inode_to_wb() association is protected by both
0277          * @inode->i_lock and @wb->list_lock but list_lock nests
0278          * outside i_lock.  Drop i_lock and verify that the
0279          * association hasn't changed after acquiring list_lock.
0280          */
0281         wb_get(wb);
0282         spin_unlock(&inode->i_lock);
0283         spin_lock(&wb->list_lock);
0284 
0285         /* i_wb may have changed in the meantime, can't use inode_to_wb() */
0286         if (likely(wb == inode->i_wb)) {
0287             wb_put(wb); /* @inode already has ref */
0288             return wb;
0289         }
0290 
0291         spin_unlock(&wb->list_lock);
0292         wb_put(wb);
0293         cpu_relax();
0294         spin_lock(&inode->i_lock);
0295     }
0296 }
0297 
0298 /**
0299  * inode_to_wb_and_lock_list - determine an inode's wb and lock it
0300  * @inode: inode of interest
0301  *
0302  * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
0303  * on entry.
0304  */
0305 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
0306     __acquires(&wb->list_lock)
0307 {
0308     spin_lock(&inode->i_lock);
0309     return locked_inode_to_wb_and_lock_list(inode);
0310 }
0311 
0312 struct inode_switch_wbs_context {
0313     struct inode        *inode;
0314     struct bdi_writeback    *new_wb;
0315 
0316     struct rcu_head     rcu_head;
0317     struct work_struct  work;
0318 };
0319 
0320 static void inode_switch_wbs_work_fn(struct work_struct *work)
0321 {
0322     struct inode_switch_wbs_context *isw =
0323         container_of(work, struct inode_switch_wbs_context, work);
0324     struct inode *inode = isw->inode;
0325     struct address_space *mapping = inode->i_mapping;
0326     struct bdi_writeback *old_wb = inode->i_wb;
0327     struct bdi_writeback *new_wb = isw->new_wb;
0328     struct radix_tree_iter iter;
0329     bool switched = false;
0330     void **slot;
0331 
0332     /*
0333      * By the time control reaches here, RCU grace period has passed
0334      * since I_WB_SWITCH assertion and all wb stat update transactions
0335      * between unlocked_inode_to_wb_begin/end() are guaranteed to be
0336      * synchronizing against mapping->tree_lock.
0337      *
0338      * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
0339      * gives us exclusion against all wb related operations on @inode
0340      * including IO list manipulations and stat updates.
0341      */
0342     if (old_wb < new_wb) {
0343         spin_lock(&old_wb->list_lock);
0344         spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
0345     } else {
0346         spin_lock(&new_wb->list_lock);
0347         spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
0348     }
0349     spin_lock(&inode->i_lock);
0350     spin_lock_irq(&mapping->tree_lock);
0351 
0352     /*
0353      * Once I_FREEING is visible under i_lock, the eviction path owns
0354      * the inode and we shouldn't modify ->i_io_list.
0355      */
0356     if (unlikely(inode->i_state & I_FREEING))
0357         goto skip_switch;
0358 
0359     /*
0360      * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
0361      * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
0362  * pages actually under writeback.
0363      */
0364     radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
0365                    PAGECACHE_TAG_DIRTY) {
0366         struct page *page = radix_tree_deref_slot_protected(slot,
0367                             &mapping->tree_lock);
0368         if (likely(page) && PageDirty(page)) {
0369             __dec_wb_stat(old_wb, WB_RECLAIMABLE);
0370             __inc_wb_stat(new_wb, WB_RECLAIMABLE);
0371         }
0372     }
0373 
0374     radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
0375                    PAGECACHE_TAG_WRITEBACK) {
0376         struct page *page = radix_tree_deref_slot_protected(slot,
0377                             &mapping->tree_lock);
0378         if (likely(page)) {
0379             WARN_ON_ONCE(!PageWriteback(page));
0380             __dec_wb_stat(old_wb, WB_WRITEBACK);
0381             __inc_wb_stat(new_wb, WB_WRITEBACK);
0382         }
0383     }
0384 
0385     wb_get(new_wb);
0386 
0387     /*
0388      * Transfer to @new_wb's IO list if necessary.  The specific list
0389      * @inode was on is ignored and the inode is put on ->b_dirty which
0390      * is always correct including from ->b_dirty_time.  The transfer
0391      * preserves @inode->dirtied_when ordering.
0392      */
0393     if (!list_empty(&inode->i_io_list)) {
0394         struct inode *pos;
0395 
0396         inode_io_list_del_locked(inode, old_wb);
0397         inode->i_wb = new_wb;
0398         list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
0399             if (time_after_eq(inode->dirtied_when,
0400                       pos->dirtied_when))
0401                 break;
0402         inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
0403     } else {
0404         inode->i_wb = new_wb;
0405     }
0406 
0407     /* ->i_wb_frn updates may race wbc_detach_inode() but that doesn't matter */
0408     inode->i_wb_frn_winner = 0;
0409     inode->i_wb_frn_avg_time = 0;
0410     inode->i_wb_frn_history = 0;
0411     switched = true;
0412 skip_switch:
0413     /*
0414      * Paired with load_acquire in unlocked_inode_to_wb_begin() and
0415      * ensures that the new wb is visible if they see !I_WB_SWITCH.
0416      */
0417     smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
0418 
0419     spin_unlock_irq(&mapping->tree_lock);
0420     spin_unlock(&inode->i_lock);
0421     spin_unlock(&new_wb->list_lock);
0422     spin_unlock(&old_wb->list_lock);
0423 
0424     if (switched) {
0425         wb_wakeup(new_wb);
0426         wb_put(old_wb);
0427     }
0428     wb_put(new_wb);
0429 
0430     iput(inode);
0431     kfree(isw);
0432 
0433     atomic_dec(&isw_nr_in_flight);
0434 }
0435 
0436 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
0437 {
0438     struct inode_switch_wbs_context *isw = container_of(rcu_head,
0439                 struct inode_switch_wbs_context, rcu_head);
0440 
0441     /* needs to grab bh-unsafe locks, bounce to work item */
0442     INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
0443     queue_work(isw_wq, &isw->work);
0444 }
0445 
0446 /**
0447  * inode_switch_wbs - change the wb association of an inode
0448  * @inode: target inode
0449  * @new_wb_id: ID of the new wb
0450  *
0451  * Switch @inode's wb association to the wb identified by @new_wb_id.  The
0452  * switching is performed asynchronously and may fail silently.
0453  */
0454 static void inode_switch_wbs(struct inode *inode, int new_wb_id)
0455 {
0456     struct backing_dev_info *bdi = inode_to_bdi(inode);
0457     struct cgroup_subsys_state *memcg_css;
0458     struct inode_switch_wbs_context *isw;
0459 
0460     /* noop if seems to be already in progress */
0461     if (inode->i_state & I_WB_SWITCH)
0462         return;
0463 
0464     isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
0465     if (!isw)
0466         return;
0467 
0468     /* find and pin the new wb */
0469     rcu_read_lock();
0470     memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
0471     if (memcg_css)
0472         isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
0473     rcu_read_unlock();
0474     if (!isw->new_wb)
0475         goto out_free;
0476 
0477     /* while holding I_WB_SWITCH, no one else can update the association */
0478     spin_lock(&inode->i_lock);
0479     if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
0480         inode->i_state & (I_WB_SWITCH | I_FREEING) ||
0481         inode_to_wb(inode) == isw->new_wb) {
0482         spin_unlock(&inode->i_lock);
0483         goto out_free;
0484     }
0485     inode->i_state |= I_WB_SWITCH;
0486     __iget(inode);
0487     spin_unlock(&inode->i_lock);
0488 
0489     isw->inode = inode;
0490 
0491     atomic_inc(&isw_nr_in_flight);
0492 
0493     /*
0494      * In addition to synchronizing among switchers, I_WB_SWITCH tells
0495      * the RCU protected stat update paths to grab the mapping's
0496      * tree_lock so that stat transfer can synchronize against them.
0497      * Let's continue after I_WB_SWITCH is guaranteed to be visible.
0498      */
0499     call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
0500     return;
0501 
0502 out_free:
0503     if (isw->new_wb)
0504         wb_put(isw->new_wb);
0505     kfree(isw);
0506 }
0507 
0508 /**
0509  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
0510  * @wbc: writeback_control of interest
0511  * @inode: target inode
0512  *
0513  * @inode is locked and about to be written back under the control of @wbc.
0514  * Record @inode's writeback context into @wbc and unlock the i_lock.  On
0515  * writeback completion, wbc_detach_inode() should be called.  This is used
0516  * to track the cgroup writeback context.
0517  */
0518 void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
0519                  struct inode *inode)
0520 {
0521     if (!inode_cgwb_enabled(inode)) {
0522         spin_unlock(&inode->i_lock);
0523         return;
0524     }
0525 
0526     wbc->wb = inode_to_wb(inode);
0527     wbc->inode = inode;
0528 
0529     wbc->wb_id = wbc->wb->memcg_css->id;
0530     wbc->wb_lcand_id = inode->i_wb_frn_winner;
0531     wbc->wb_tcand_id = 0;
0532     wbc->wb_bytes = 0;
0533     wbc->wb_lcand_bytes = 0;
0534     wbc->wb_tcand_bytes = 0;
0535 
0536     wb_get(wbc->wb);
0537     spin_unlock(&inode->i_lock);
0538 
0539     /*
0540      * A dying wb indicates that the memcg-blkcg mapping has changed
0541      * and a new wb is already serving the memcg.  Switch immediately.
0542      */
0543     if (unlikely(wb_dying(wbc->wb)))
0544         inode_switch_wbs(inode, wbc->wb_id);
0545 }
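
/*
 * Rough calling pattern, as followed by writeback_single_inode() below:
 * take i_lock, set I_SYNC, and then
 *
 *     wbc_attach_and_unlock_inode(wbc, inode);
 *     __writeback_single_inode(inode, wbc);
 *     wbc_detach_inode(wbc);
 *
 * wbc_attach_and_unlock_inode() drops i_lock, and wbc_detach_inode()
 * runs the foreign inode detection described below.
 */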
0546 
0547 /**
0548  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
0549  * @wbc: writeback_control of the just finished writeback
0550  *
0551  * To be called after a writeback attempt of an inode finishes and undoes
0552  * wbc_attach_and_unlock_inode().  Can be called under any context.
0553  *
0554  * As concurrent write sharing of an inode is expected to be very rare and
0555  * memcg only tracks page ownership on a first-use basis, severely confining
0556  * the usefulness of such sharing, cgroup writeback tracks ownership
0557  * per-inode.  While the support for concurrent write sharing of an inode
0558  * is deemed unnecessary, an inode being written to by different cgroups at
0559  * different points in time is a lot more common, and, more importantly,
0560  * charging only by first-use can too readily lead to grossly incorrect
0561  * behaviors (single foreign page can lead to gigabytes of writeback to be
0562  * incorrectly attributed).
0563  *
0564  * To resolve this issue, cgroup writeback detects the majority dirtier of
0565  * an inode and transfers the ownership to it.  To avoid unnecessary
0566  * oscillation, the detection mechanism keeps track of history and gives
0567  * out the switch verdict only if the foreign usage pattern is stable over
0568  * a certain amount of time and/or writeback attempts.
0569  *
0570  * On each writeback attempt, @wbc tries to detect the majority writer
0571  * using Boyer-Moore majority vote algorithm.  In addition to the byte
0572  * count from the majority voting, it also counts the bytes written for the
0573  * current wb and the last round's winner wb (max of last round's current
0574  * wb, the winner from two rounds ago, and the last round's majority
0575  * candidate).  Keeping track of the historical winner helps the algorithm
0576  * to semi-reliably detect the most active writer even when it's not the
0577  * absolute majority.
0578  *
0579  * Once the winner of the round is determined, whether the winner is
0580  * foreign or not and how much IO time the round consumed is recorded in
0581  * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
0582  * over a certain threshold, the switch verdict is given.
0583  */
0584 void wbc_detach_inode(struct writeback_control *wbc)
0585 {
0586     struct bdi_writeback *wb = wbc->wb;
0587     struct inode *inode = wbc->inode;
0588     unsigned long avg_time, max_bytes, max_time;
0589     u16 history;
0590     int max_id;
0591 
0592     if (!wb)
0593         return;
0594 
0595     history = inode->i_wb_frn_history;
0596     avg_time = inode->i_wb_frn_avg_time;
0597 
0598     /* pick the winner of this round */
0599     if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
0600         wbc->wb_bytes >= wbc->wb_tcand_bytes) {
0601         max_id = wbc->wb_id;
0602         max_bytes = wbc->wb_bytes;
0603     } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
0604         max_id = wbc->wb_lcand_id;
0605         max_bytes = wbc->wb_lcand_bytes;
0606     } else {
0607         max_id = wbc->wb_tcand_id;
0608         max_bytes = wbc->wb_tcand_bytes;
0609     }
0610 
0611     /*
0612      * Calculate the amount of IO time the winner consumed and fold it
0613      * into the running average kept per inode.  If the consumed IO
0614  * time is lower than avg_time / WB_FRN_TIME_CUT_DIV, ignore it for
0615      * deciding whether to switch or not.  This is to prevent one-off
0616      * small dirtiers from skewing the verdict.
0617      */
0618     max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
0619                 wb->avg_write_bandwidth);
0620     if (avg_time)
0621         avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
0622                 (avg_time >> WB_FRN_TIME_AVG_SHIFT);
0623     else
0624         avg_time = max_time;    /* immediate catch up on first run */
0625 
0626     if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
0627         int slots;
0628 
0629         /*
0630          * The switch verdict is reached if foreign wb's consume
0631          * more than a certain proportion of IO time in a
0632          * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
0633          * history mask where each bit represents one sixteenth of
0634          * the period.  Determine the number of slots to shift into
0635          * history from @max_time.
0636          */
0637         slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
0638                 (unsigned long)WB_FRN_HIST_MAX_SLOTS);
0639         history <<= slots;
0640         if (wbc->wb_id != max_id)
0641             history |= (1U << slots) - 1;
0642 
0643         /*
0644          * Switch if the current wb isn't the consistent winner.
0645          * If there are multiple closely competing dirtiers, the
0646          * inode may switch across them repeatedly over time, which
0647          * is okay.  The main goal is avoiding keeping an inode on
0648          * the wrong wb for an extended period of time.
0649          */
0650         if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
0651             inode_switch_wbs(inode, max_id);
0652     }
0653 
0654     /*
0655      * Multiple instances of this function may race to update the
0656  * following fields but we don't mind occasional inaccuracies.
0657      */
0658     inode->i_wb_frn_winner = max_id;
0659     inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
0660     inode->i_wb_frn_history = history;
0661 
0662     wb_put(wbc->wb);
0663     wbc->wb = NULL;
0664 }
0665 
0666 /**
0667  * wbc_account_io - account IO issued during writeback
0668  * @wbc: writeback_control of the writeback in progress
0669  * @page: page being written out
0670  * @bytes: number of bytes being written out
0671  *
0672  * @bytes from @page are about to be written out during the writeback
0673  * controlled by @wbc.  Keep the book for foreign inode detection.  See
0674  * wbc_detach_inode().
0675  */
0676 void wbc_account_io(struct writeback_control *wbc, struct page *page,
0677             size_t bytes)
0678 {
0679     int id;
0680 
0681     /*
0682      * pageout() path doesn't attach @wbc to the inode being written
0683      * out.  This is intentional as we don't want the function to block
0684      * behind a slow cgroup.  Ultimately, we want pageout() to kick off
0685      * regular writeback instead of writing things out itself.
0686      */
0687     if (!wbc->wb)
0688         return;
0689 
0690     id = mem_cgroup_css_from_page(page)->id;
0691 
0692     if (id == wbc->wb_id) {
0693         wbc->wb_bytes += bytes;
0694         return;
0695     }
0696 
0697     if (id == wbc->wb_lcand_id)
0698         wbc->wb_lcand_bytes += bytes;
0699 
0700     /* Boyer-Moore majority vote algorithm */
0701     if (!wbc->wb_tcand_bytes)
0702         wbc->wb_tcand_id = id;
0703     if (id == wbc->wb_tcand_id)
0704         wbc->wb_tcand_bytes += bytes;
0705     else
0706         wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
0707 }
0708 EXPORT_SYMBOL_GPL(wbc_account_io);
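
/*
 * Worked example of the bookkeeping above: assume the current wb belongs
 * to cgroup W and, during one writeback round, 4KB pages charged to
 * foreign cgroups F, F, G and F are written in that order (none of them
 * matching wb_lcand_id).  The t-candidate then evolves as
 *
 *     F: tcand_id = F, tcand_bytes = 4096
 *     F: tcand_bytes = 8192
 *     G: tcand_bytes = 4096    (a vote against the candidate)
 *     F: tcand_bytes = 8192
 *
 * so F reaches wbc_detach_inode() as the majority candidate and competes
 * there against W's own byte count and the last round's winner.
 */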
0709 
0710 /**
0711  * inode_congested - test whether an inode is congested
0712  * @inode: inode to test for congestion (may be NULL)
0713  * @cong_bits: mask of WB_[a]sync_congested bits to test
0714  *
0715  * Tests whether @inode is congested.  @cong_bits is the mask of congestion
0716  * bits to test and the return value is the mask of set bits.
0717  *
0718  * If cgroup writeback is enabled for @inode, the congestion state is
0719  * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
0720  * associated with @inode is congested; otherwise, the root wb's congestion
0721  * state is used.
0722  *
0723  * @inode is allowed to be NULL as this function is often called on
0724  * mapping->host which is NULL for the swapper space.
0725  */
0726 int inode_congested(struct inode *inode, int cong_bits)
0727 {
0728     /*
0729      * Once set, ->i_wb never becomes NULL while the inode is alive.
0730      * Start transaction iff ->i_wb is visible.
0731      */
0732     if (inode && inode_to_wb_is_valid(inode)) {
0733         struct bdi_writeback *wb;
0734         bool locked, congested;
0735 
0736         wb = unlocked_inode_to_wb_begin(inode, &locked);
0737         congested = wb_congested(wb, cong_bits);
0738         unlocked_inode_to_wb_end(inode, locked);
0739         return congested;
0740     }
0741 
0742     return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
0743 }
0744 EXPORT_SYMBOL_GPL(inode_congested);
0745 
0746 /**
0747  * wb_split_bdi_pages - split nr_pages to write according to bandwidth
0748  * @wb: target bdi_writeback to split @nr_pages to
0749  * @nr_pages: number of pages to write for the whole bdi
0750  *
0751  * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
0752  * relation to the total write bandwidth of all wb's w/ dirty inodes on
0753  * @wb->bdi.
0754  */
0755 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
0756 {
0757     unsigned long this_bw = wb->avg_write_bandwidth;
0758     unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
0759 
0760     if (nr_pages == LONG_MAX)
0761         return LONG_MAX;
0762 
0763     /*
0764      * This may be called on clean wb's and proportional distribution
0765      * may not make sense, just use the original @nr_pages in those
0766      * cases.  In general, we wanna err on the side of writing more.
0767      */
0768     if (!tot_bw || this_bw >= tot_bw)
0769         return nr_pages;
0770     else
0771         return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
0772 }
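
/*
 * For example, a work asking for 1024 pages on a wb contributing a
 * quarter of the bdi's total write bandwidth gets
 * DIV_ROUND_UP(1024 * this_bw, tot_bw) == 256 pages of that work.
 */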
0773 
0774 /**
0775  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
0776  * @bdi: target backing_dev_info
0777  * @base_work: wb_writeback_work to issue
0778  * @skip_if_busy: skip wb's which already have writeback in progress
0779  *
0780  * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
0781  * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
0782  * distributed to the busy wbs according to each wb's proportion in the
0783  * total active write bandwidth of @bdi.
0784  */
0785 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
0786                   struct wb_writeback_work *base_work,
0787                   bool skip_if_busy)
0788 {
0789     struct bdi_writeback *last_wb = NULL;
0790     struct bdi_writeback *wb = list_entry(&bdi->wb_list,
0791                           struct bdi_writeback, bdi_node);
0792 
0793     might_sleep();
0794 restart:
0795     rcu_read_lock();
0796     list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
0797         DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
0798         struct wb_writeback_work fallback_work;
0799         struct wb_writeback_work *work;
0800         long nr_pages;
0801 
0802         if (last_wb) {
0803             wb_put(last_wb);
0804             last_wb = NULL;
0805         }
0806 
0807         /* SYNC_ALL writes out I_DIRTY_TIME too */
0808         if (!wb_has_dirty_io(wb) &&
0809             (base_work->sync_mode == WB_SYNC_NONE ||
0810              list_empty(&wb->b_dirty_time)))
0811             continue;
0812         if (skip_if_busy && writeback_in_progress(wb))
0813             continue;
0814 
0815         nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
0816 
0817         work = kmalloc(sizeof(*work), GFP_ATOMIC);
0818         if (work) {
0819             *work = *base_work;
0820             work->nr_pages = nr_pages;
0821             work->auto_free = 1;
0822             wb_queue_work(wb, work);
0823             continue;
0824         }
0825 
0826         /* alloc failed, execute synchronously using on-stack fallback */
0827         work = &fallback_work;
0828         *work = *base_work;
0829         work->nr_pages = nr_pages;
0830         work->auto_free = 0;
0831         work->done = &fallback_work_done;
0832 
0833         wb_queue_work(wb, work);
0834 
0835         /*
0836          * Pin @wb so that it stays on @bdi->wb_list.  This allows
0837          * continuing iteration from @wb after dropping and
0838          * regrabbing rcu read lock.
0839          */
0840         wb_get(wb);
0841         last_wb = wb;
0842 
0843         rcu_read_unlock();
0844         wb_wait_for_completion(bdi, &fallback_work_done);
0845         goto restart;
0846     }
0847     rcu_read_unlock();
0848 
0849     if (last_wb)
0850         wb_put(last_wb);
0851 }
0852 
0853 /**
0854  * cgroup_writeback_umount - flush inode wb switches for umount
0855  *
0856  * This function is called when a super_block is about to be destroyed and
0857  * flushes in-flight inode wb switches.  An inode wb switch goes through
0858  * RCU and then workqueue, so the two need to be flushed in order to ensure
0859  * that all previously scheduled switches are finished.  As wb switches are
0860  * rare occurrences and synchronize_rcu() can take a while, perform
0861  * flushing iff wb switches are in flight.
0862  */
0863 void cgroup_writeback_umount(void)
0864 {
0865     if (atomic_read(&isw_nr_in_flight)) {
0866         synchronize_rcu();
0867         flush_workqueue(isw_wq);
0868     }
0869 }
0870 
0871 static int __init cgroup_writeback_init(void)
0872 {
0873     isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
0874     if (!isw_wq)
0875         return -ENOMEM;
0876     return 0;
0877 }
0878 fs_initcall(cgroup_writeback_init);
0879 
0880 #else   /* CONFIG_CGROUP_WRITEBACK */
0881 
0882 static struct bdi_writeback *
0883 locked_inode_to_wb_and_lock_list(struct inode *inode)
0884     __releases(&inode->i_lock)
0885     __acquires(&wb->list_lock)
0886 {
0887     struct bdi_writeback *wb = inode_to_wb(inode);
0888 
0889     spin_unlock(&inode->i_lock);
0890     spin_lock(&wb->list_lock);
0891     return wb;
0892 }
0893 
0894 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
0895     __acquires(&wb->list_lock)
0896 {
0897     struct bdi_writeback *wb = inode_to_wb(inode);
0898 
0899     spin_lock(&wb->list_lock);
0900     return wb;
0901 }
0902 
0903 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
0904 {
0905     return nr_pages;
0906 }
0907 
0908 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
0909                   struct wb_writeback_work *base_work,
0910                   bool skip_if_busy)
0911 {
0912     might_sleep();
0913 
0914     if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
0915         base_work->auto_free = 0;
0916         wb_queue_work(&bdi->wb, base_work);
0917     }
0918 }
0919 
0920 #endif  /* CONFIG_CGROUP_WRITEBACK */
0921 
0922 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
0923             bool range_cyclic, enum wb_reason reason)
0924 {
0925     struct wb_writeback_work *work;
0926 
0927     if (!wb_has_dirty_io(wb))
0928         return;
0929 
0930     /*
0931      * This is WB_SYNC_NONE writeback, so if allocation fails just
0932      * wakeup the thread for old dirty data writeback
0933      */
0934     work = kzalloc(sizeof(*work),
0935                GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
0936     if (!work) {
0937         trace_writeback_nowork(wb);
0938         wb_wakeup(wb);
0939         return;
0940     }
0941 
0942     work->sync_mode = WB_SYNC_NONE;
0943     work->nr_pages  = nr_pages;
0944     work->range_cyclic = range_cyclic;
0945     work->reason    = reason;
0946     work->auto_free = 1;
0947 
0948     wb_queue_work(wb, work);
0949 }
0950 
0951 /**
0952  * wb_start_background_writeback - start background writeback
0953  * @wb: bdi_writeback to write from
0954  *
0955  * Description:
0956  *   This makes sure WB_SYNC_NONE background writeback happens. When
0957  *   this function returns, it is only guaranteed that for the given wb
0958  *   some IO is happening if we are over the background dirty threshold.
0959  *   The caller need not hold the sb s_umount semaphore.
0960  */
0961 void wb_start_background_writeback(struct bdi_writeback *wb)
0962 {
0963     /*
0964      * We just wake up the flusher thread. It will perform background
0965      * writeback as soon as there is no other work to do.
0966      */
0967     trace_writeback_wake_background(wb);
0968     wb_wakeup(wb);
0969 }
0970 
0971 /*
0972  * Remove the inode from the writeback list it is on.
0973  */
0974 void inode_io_list_del(struct inode *inode)
0975 {
0976     struct bdi_writeback *wb;
0977 
0978     wb = inode_to_wb_and_lock_list(inode);
0979     inode_io_list_del_locked(inode, wb);
0980     spin_unlock(&wb->list_lock);
0981 }
0982 
0983 /*
0984  * mark an inode as under writeback on the sb
0985  */
0986 void sb_mark_inode_writeback(struct inode *inode)
0987 {
0988     struct super_block *sb = inode->i_sb;
0989     unsigned long flags;
0990 
0991     if (list_empty(&inode->i_wb_list)) {
0992         spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
0993         if (list_empty(&inode->i_wb_list)) {
0994             list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
0995             trace_sb_mark_inode_writeback(inode);
0996         }
0997         spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
0998     }
0999 }
1000 
1001 /*
1002  * clear an inode as under writeback on the sb
1003  */
1004 void sb_clear_inode_writeback(struct inode *inode)
1005 {
1006     struct super_block *sb = inode->i_sb;
1007     unsigned long flags;
1008 
1009     if (!list_empty(&inode->i_wb_list)) {
1010         spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1011         if (!list_empty(&inode->i_wb_list)) {
1012             list_del_init(&inode->i_wb_list);
1013             trace_sb_clear_inode_writeback(inode);
1014         }
1015         spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1016     }
1017 }
1018 
1019 /*
1020  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
1021  * furthest end of its superblock's dirty-inode list.
1022  *
1023  * Before stamping the inode's ->dirtied_when, we check to see whether it is
1024  * already the most-recently-dirtied inode on the b_dirty list.  If that is
1025  * the case then the inode must have been redirtied while it was being written
1026  * out and we don't reset its dirtied_when.
1027  */
1028 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1029 {
1030     if (!list_empty(&wb->b_dirty)) {
1031         struct inode *tail;
1032 
1033         tail = wb_inode(wb->b_dirty.next);
1034         if (time_before(inode->dirtied_when, tail->dirtied_when))
1035             inode->dirtied_when = jiffies;
1036     }
1037     inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1038 }
1039 
1040 /*
1041  * requeue inode for re-scanning after bdi->b_io list is exhausted.
1042  */
1043 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1044 {
1045     inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1046 }
1047 
1048 static void inode_sync_complete(struct inode *inode)
1049 {
1050     inode->i_state &= ~I_SYNC;
1051     /* If inode is clean and unused, put it into LRU now... */
1052     inode_add_lru(inode);
1053     /* Waiters must see I_SYNC cleared before being woken up */
1054     smp_mb();
1055     wake_up_bit(&inode->i_state, __I_SYNC);
1056 }
1057 
1058 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1059 {
1060     bool ret = time_after(inode->dirtied_when, t);
1061 #ifndef CONFIG_64BIT
1062     /*
1063      * For inodes being constantly redirtied, dirtied_when can get stuck.
1064      * It _appears_ to be in the future, but is actually in the distant past.
1065      * This test is necessary to prevent such wrapped-around relative times
1066      * from permanently stopping the whole bdi writeback.
1067      */
1068     ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1069 #endif
1070     return ret;
1071 }
1072 
1073 #define EXPIRE_DIRTY_ATIME 0x0001
1074 
1075 /*
1076  * Move expired (dirtied before work->older_than_this) dirty inodes from
1077  * @delaying_queue to @dispatch_queue.
1078  */
1079 static int move_expired_inodes(struct list_head *delaying_queue,
1080                    struct list_head *dispatch_queue,
1081                    int flags,
1082                    struct wb_writeback_work *work)
1083 {
1084     unsigned long *older_than_this = NULL;
1085     unsigned long expire_time;
1086     LIST_HEAD(tmp);
1087     struct list_head *pos, *node;
1088     struct super_block *sb = NULL;
1089     struct inode *inode;
1090     int do_sb_sort = 0;
1091     int moved = 0;
1092 
1093     if ((flags & EXPIRE_DIRTY_ATIME) == 0)
1094         older_than_this = work->older_than_this;
1095     else if (!work->for_sync) {
1096         expire_time = jiffies - (dirtytime_expire_interval * HZ);
1097         older_than_this = &expire_time;
1098     }
1099     while (!list_empty(delaying_queue)) {
1100         inode = wb_inode(delaying_queue->prev);
1101         if (older_than_this &&
1102             inode_dirtied_after(inode, *older_than_this))
1103             break;
1104         list_move(&inode->i_io_list, &tmp);
1105         moved++;
1106         if (flags & EXPIRE_DIRTY_ATIME)
1107             set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
1108         if (sb_is_blkdev_sb(inode->i_sb))
1109             continue;
1110         if (sb && sb != inode->i_sb)
1111             do_sb_sort = 1;
1112         sb = inode->i_sb;
1113     }
1114 
1115     /* just one sb in list, splice to dispatch_queue and we're done */
1116     if (!do_sb_sort) {
1117         list_splice(&tmp, dispatch_queue);
1118         goto out;
1119     }
1120 
1121     /* Move inodes from one superblock together */
1122     while (!list_empty(&tmp)) {
1123         sb = wb_inode(tmp.prev)->i_sb;
1124         list_for_each_prev_safe(pos, node, &tmp) {
1125             inode = wb_inode(pos);
1126             if (inode->i_sb == sb)
1127                 list_move(&inode->i_io_list, dispatch_queue);
1128         }
1129     }
1130 out:
1131     return moved;
1132 }
1133 
1134 /*
1135  * Queue all expired dirty inodes for io, eldest first.
1136  * Before
1137  *         newly dirtied     b_dirty    b_io    b_more_io
1138  *         =============>    gf         edc     BA
1139  * After
1140  *         newly dirtied     b_dirty    b_io    b_more_io
1141  *         =============>    g          fBAedc
1142  *                                           |
1143  *                                           +--> dequeue for IO
1144  */
1145 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
1146 {
1147     int moved;
1148 
1149     assert_spin_locked(&wb->list_lock);
1150     list_splice_init(&wb->b_more_io, &wb->b_io);
1151     moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
1152     moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1153                      EXPIRE_DIRTY_ATIME, work);
1154     if (moved)
1155         wb_io_lists_populated(wb);
1156     trace_writeback_queue_io(wb, work, moved);
1157 }
1158 
1159 static int write_inode(struct inode *inode, struct writeback_control *wbc)
1160 {
1161     int ret;
1162 
1163     if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1164         trace_writeback_write_inode_start(inode, wbc);
1165         ret = inode->i_sb->s_op->write_inode(inode, wbc);
1166         trace_writeback_write_inode(inode, wbc);
1167         return ret;
1168     }
1169     return 0;
1170 }
1171 
1172 /*
1173  * Wait for writeback on an inode to complete. Called with i_lock held.
1174  * Caller must make sure inode cannot go away when we drop i_lock.
1175  */
1176 static void __inode_wait_for_writeback(struct inode *inode)
1177     __releases(inode->i_lock)
1178     __acquires(inode->i_lock)
1179 {
1180     DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1181     wait_queue_head_t *wqh;
1182 
1183     wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1184     while (inode->i_state & I_SYNC) {
1185         spin_unlock(&inode->i_lock);
1186         __wait_on_bit(wqh, &wq, bit_wait,
1187                   TASK_UNINTERRUPTIBLE);
1188         spin_lock(&inode->i_lock);
1189     }
1190 }
1191 
1192 /*
1193  * Wait for writeback on an inode to complete. Caller must have inode pinned.
1194  */
1195 void inode_wait_for_writeback(struct inode *inode)
1196 {
1197     spin_lock(&inode->i_lock);
1198     __inode_wait_for_writeback(inode);
1199     spin_unlock(&inode->i_lock);
1200 }
1201 
1202 /*
1203  * Sleep until I_SYNC is cleared. This function must be called with i_lock
1204  * held and drops it. It is aimed for callers not holding any inode reference
1205  * so once i_lock is dropped, inode can go away.
1206  */
1207 static void inode_sleep_on_writeback(struct inode *inode)
1208     __releases(inode->i_lock)
1209 {
1210     DEFINE_WAIT(wait);
1211     wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1212     int sleep;
1213 
1214     prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1215     sleep = inode->i_state & I_SYNC;
1216     spin_unlock(&inode->i_lock);
1217     if (sleep)
1218         schedule();
1219     finish_wait(wqh, &wait);
1220 }
1221 
1222 /*
1223  * Find proper writeback list for the inode depending on its current state and
1224  * possibly also change of its state while we were doing writeback.  Here we
1225  * handle things such as livelock prevention or fairness of writeback among
1226  * inodes. This function can be called only by the flusher thread - no one else
1227  * processes all inodes in writeback lists and requeueing inodes behind flusher
1228  * thread's back can have unexpected consequences.
1229  */
1230 static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1231               struct writeback_control *wbc)
1232 {
1233     if (inode->i_state & I_FREEING)
1234         return;
1235 
1236     /*
1237      * Sync livelock prevention. Each inode is tagged and synced in one
1238      * shot. If still dirty, it will be redirty_tail()'ed below.  Update
1239      * the dirty time to prevent enqueue and sync it again.
1240      */
1241     if ((inode->i_state & I_DIRTY) &&
1242         (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1243         inode->dirtied_when = jiffies;
1244 
1245     if (wbc->pages_skipped) {
1246         /*
1247          * writeback is not making progress due to locked
1248          * buffers. Skip this inode for now.
1249          */
1250         redirty_tail(inode, wb);
1251         return;
1252     }
1253 
1254     if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1255         /*
1256          * We didn't write back all the pages.  nfs_writepages()
1257          * sometimes bales out without doing anything.
1258          */
1259         if (wbc->nr_to_write <= 0) {
1260             /* Slice used up. Queue for next turn. */
1261             requeue_io(inode, wb);
1262         } else {
1263             /*
1264              * Writeback blocked by something other than
1265              * congestion. Delay the inode for some time to
1266              * avoid spinning on the CPU (100% iowait)
1267              * retrying writeback of the dirty page/inode
1268              * that cannot be performed immediately.
1269              */
1270             redirty_tail(inode, wb);
1271         }
1272     } else if (inode->i_state & I_DIRTY) {
1273         /*
1274          * Filesystems can dirty the inode during writeback operations,
1275          * such as delayed allocation during submission or metadata
1276          * updates after data IO completion.
1277          */
1278         redirty_tail(inode, wb);
1279     } else if (inode->i_state & I_DIRTY_TIME) {
1280         inode->dirtied_when = jiffies;
1281         inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1282     } else {
1283         /* The inode is clean. Remove from writeback lists. */
1284         inode_io_list_del_locked(inode, wb);
1285     }
1286 }
1287 
1288 /*
1289  * Write out an inode and its dirty pages. Do not update the writeback list
1290  * linkage. That is left to the caller. The caller is also responsible for
1291  * setting I_SYNC flag and calling inode_sync_complete() to clear it.
1292  */
1293 static int
1294 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1295 {
1296     struct address_space *mapping = inode->i_mapping;
1297     long nr_to_write = wbc->nr_to_write;
1298     unsigned dirty;
1299     int ret;
1300 
1301     WARN_ON(!(inode->i_state & I_SYNC));
1302 
1303     trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1304 
1305     ret = do_writepages(mapping, wbc);
1306 
1307     /*
1308      * Make sure to wait on the data before writing out the metadata.
1309      * This is important for filesystems that modify metadata on data
1310      * I/O completion. We don't do it for sync(2) writeback because it has a
1311      * separate, external IO completion path and ->sync_fs for guaranteeing
1312      * inode metadata is written back correctly.
1313      */
1314     if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1315         int err = filemap_fdatawait(mapping);
1316         if (ret == 0)
1317             ret = err;
1318     }
1319 
1320     /*
1321      * Some filesystems may redirty the inode during the writeback
1322      * due to delalloc, clear dirty metadata flags right before
1323      * write_inode()
1324      */
1325     spin_lock(&inode->i_lock);
1326 
1327     dirty = inode->i_state & I_DIRTY;
1328     if (inode->i_state & I_DIRTY_TIME) {
1329         if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
1330             wbc->sync_mode == WB_SYNC_ALL ||
1331             unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
1332             unlikely(time_after(jiffies,
1333                     (inode->dirtied_time_when +
1334                      dirtytime_expire_interval * HZ)))) {
1335             dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
1336             trace_writeback_lazytime(inode);
1337         }
1338     } else
1339         inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
1340     inode->i_state &= ~dirty;
1341 
1342     /*
1343      * Paired with smp_mb() in __mark_inode_dirty().  This allows
1344      * __mark_inode_dirty() to test i_state without grabbing i_lock -
1345      * either they see the I_DIRTY bits cleared or we see the dirtied
1346      * inode.
1347      *
1348      * I_DIRTY_PAGES is always cleared together above even if @mapping
1349      * still has dirty pages.  The flag is reinstated after smp_mb() if
1350      * necessary.  This guarantees that either __mark_inode_dirty()
1351      * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
1352      */
1353     smp_mb();
1354 
1355     if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1356         inode->i_state |= I_DIRTY_PAGES;
1357 
1358     spin_unlock(&inode->i_lock);
1359 
1360     if (dirty & I_DIRTY_TIME)
1361         mark_inode_dirty_sync(inode);
1362     /* Don't write the inode if only I_DIRTY_PAGES was set */
1363     if (dirty & ~I_DIRTY_PAGES) {
1364         int err = write_inode(inode, wbc);
1365         if (ret == 0)
1366             ret = err;
1367     }
1368     trace_writeback_single_inode(inode, wbc, nr_to_write);
1369     return ret;
1370 }
1371 
1372 /*
1373  * Write out an inode's dirty pages. Either the caller has an active reference
1374  * on the inode or the inode has I_WILL_FREE set.
1375  *
1376  * This function is designed for writing back a single inode, which is what
1377  * e.g. filesystems do. The flusher thread uses __writeback_single_inode()
1378  * and does more profound writeback list handling in writeback_sb_inodes().
1379  */
1380 static int writeback_single_inode(struct inode *inode,
1381                   struct writeback_control *wbc)
1382 {
1383     struct bdi_writeback *wb;
1384     int ret = 0;
1385 
1386     spin_lock(&inode->i_lock);
1387     if (!atomic_read(&inode->i_count))
1388         WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1389     else
1390         WARN_ON(inode->i_state & I_WILL_FREE);
1391 
1392     if (inode->i_state & I_SYNC) {
1393         if (wbc->sync_mode != WB_SYNC_ALL)
1394             goto out;
1395         /*
1396          * It's a data-integrity sync. We must wait. Since callers hold
1397          * inode reference or inode has I_WILL_FREE set, it cannot go
1398          * away under us.
1399          */
1400         __inode_wait_for_writeback(inode);
1401     }
1402     WARN_ON(inode->i_state & I_SYNC);
1403     /*
1404      * Skip inode if it is clean and we have no outstanding writeback in
1405      * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
1406      * function since flusher thread may be doing for example sync in
1407      * parallel and if we move the inode, it could get skipped. So here we
1408      * make sure inode is on some writeback list and leave it there unless
1409      * we have completely cleaned the inode.
1410      */
1411     if (!(inode->i_state & I_DIRTY_ALL) &&
1412         (wbc->sync_mode != WB_SYNC_ALL ||
1413          !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1414         goto out;
1415     inode->i_state |= I_SYNC;
1416     wbc_attach_and_unlock_inode(wbc, inode);
1417 
1418     ret = __writeback_single_inode(inode, wbc);
1419 
1420     wbc_detach_inode(wbc);
1421 
1422     wb = inode_to_wb_and_lock_list(inode);
1423     spin_lock(&inode->i_lock);
1424     /*
1425      * If inode is clean, remove it from writeback lists. Otherwise don't
1426      * touch it. See comment above for explanation.
1427      */
1428     if (!(inode->i_state & I_DIRTY_ALL))
1429         inode_io_list_del_locked(inode, wb);
1430     spin_unlock(&wb->list_lock);
1431     inode_sync_complete(inode);
1432 out:
1433     spin_unlock(&inode->i_lock);
1434     return ret;
1435 }
1436 
1437 static long writeback_chunk_size(struct bdi_writeback *wb,
1438                  struct wb_writeback_work *work)
1439 {
1440     long pages;
1441 
1442     /*
1443      * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
1444      * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
1445      * here avoids calling into writeback_inodes_wb() more than once.
1446      *
1447      * The intended call sequence for WB_SYNC_ALL writeback is:
1448      *
1449      *      wb_writeback()
1450      *          writeback_sb_inodes()       <== called only once
1451      *              write_cache_pages()     <== called once for each inode
1452      *                   (quickly) tag currently dirty pages
1453      *                   (maybe slowly) sync all tagged pages
1454      */
1455     if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1456         pages = LONG_MAX;
1457     else {
1458         pages = min(wb->avg_write_bandwidth / 2,
1459                 global_wb_domain.dirty_limit / DIRTY_SCOPE);
1460         pages = min(pages, work->nr_pages);
1461         pages = round_down(pages + MIN_WRITEBACK_PAGES,
1462                    MIN_WRITEBACK_PAGES);
1463     }
1464 
1465     return pages;
1466 }
1467 
1468 /*
1469  * Write a portion of b_io inodes which belong to @sb.
1470  *
1471  * Return the number of pages and/or inodes written.
1472  *
1473  * NOTE! This is called with wb->list_lock held, and will
1474  * unlock and relock that for each inode it ends up doing
1475  * IO for.
1476  */
1477 static long writeback_sb_inodes(struct super_block *sb,
1478                 struct bdi_writeback *wb,
1479                 struct wb_writeback_work *work)
1480 {
1481     struct writeback_control wbc = {
1482         .sync_mode      = work->sync_mode,
1483         .tagged_writepages  = work->tagged_writepages,
1484         .for_kupdate        = work->for_kupdate,
1485         .for_background     = work->for_background,
1486         .for_sync       = work->for_sync,
1487         .range_cyclic       = work->range_cyclic,
1488         .range_start        = 0,
1489         .range_end      = LLONG_MAX,
1490     };
1491     unsigned long start_time = jiffies;
1492     long write_chunk;
1493     long wrote = 0;  /* count both pages and inodes */
1494 
1495     while (!list_empty(&wb->b_io)) {
1496         struct inode *inode = wb_inode(wb->b_io.prev);
1497         struct bdi_writeback *tmp_wb;
1498 
1499         if (inode->i_sb != sb) {
1500             if (work->sb) {
1501                 /*
1502                  * We only want to write back data for this
1503                  * superblock, move all inodes not belonging
1504                  * to it back onto the dirty list.
1505                  */
1506                 redirty_tail(inode, wb);
1507                 continue;
1508             }
1509 
1510             /*
1511              * The inode belongs to a different superblock.
1512              * Bounce back to the caller to unpin this and
1513              * pin the next superblock.
1514              */
1515             break;
1516         }
1517 
1518         /*
1519          * Don't bother with new inodes or inodes being freed: the first
1520          * kind does not need periodic writeout yet, and for the latter
1521          * kind writeout is handled by the freer.
1522          */
1523         spin_lock(&inode->i_lock);
1524         if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1525             spin_unlock(&inode->i_lock);
1526             redirty_tail(inode, wb);
1527             continue;
1528         }
1529         if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1530             /*
1531              * If this inode is locked for writeback and we are not
1532              * doing writeback-for-data-integrity, move it to
1533              * b_more_io so that writeback can proceed with the
1534              * other inodes on s_io.
1535              *
1536              * We'll have another go at writing back this inode
1537              * when we have completed a full scan of b_io.
1538              */
1539             spin_unlock(&inode->i_lock);
1540             requeue_io(inode, wb);
1541             trace_writeback_sb_inodes_requeue(inode);
1542             continue;
1543         }
1544         spin_unlock(&wb->list_lock);
1545 
1546         /*
1547          * We already requeued the inode if it had I_SYNC set and we
1548          * are doing WB_SYNC_NONE writeback. So this catches only the
1549          * WB_SYNC_ALL case.
1550          */
1551         if (inode->i_state & I_SYNC) {
1552             /* Wait for I_SYNC. This function drops i_lock... */
1553             inode_sleep_on_writeback(inode);
1554             /* Inode may be gone, start again */
1555             spin_lock(&wb->list_lock);
1556             continue;
1557         }
1558         inode->i_state |= I_SYNC;
1559         wbc_attach_and_unlock_inode(&wbc, inode);
1560 
1561         write_chunk = writeback_chunk_size(wb, work);
1562         wbc.nr_to_write = write_chunk;
1563         wbc.pages_skipped = 0;
1564 
1565         /*
1566          * We use I_SYNC to pin the inode in memory. While it is set
1567          * evict_inode() will wait so the inode cannot be freed.
1568          */
1569         __writeback_single_inode(inode, &wbc);
1570 
1571         wbc_detach_inode(&wbc);
1572         work->nr_pages -= write_chunk - wbc.nr_to_write;
1573         wrote += write_chunk - wbc.nr_to_write;
1574 
1575         if (need_resched()) {
1576             /*
1577              * We're trying to balance between building up a nice
1578              * long list of IOs to improve our merge rate, and
1579              * getting those IOs out quickly for anyone throttling
1580              * in balance_dirty_pages().  cond_resched() doesn't
1581              * unplug, so get our IOs out the door before we
1582              * give up the CPU.
1583              */
1584             blk_flush_plug(current);
1585             cond_resched();
1586         }
1587 
1588         /*
1589          * Requeue @inode if still dirty.  Be careful as @inode may
1590          * have been switched to another wb in the meantime.
1591          */
1592         tmp_wb = inode_to_wb_and_lock_list(inode);
1593         spin_lock(&inode->i_lock);
1594         if (!(inode->i_state & I_DIRTY_ALL))
1595             wrote++;
1596         requeue_inode(inode, tmp_wb, &wbc);
1597         inode_sync_complete(inode);
1598         spin_unlock(&inode->i_lock);
1599 
1600         if (unlikely(tmp_wb != wb)) {
1601             spin_unlock(&tmp_wb->list_lock);
1602             spin_lock(&wb->list_lock);
1603         }
1604 
1605         /*
1606          * bail out to wb_writeback() often enough to check
1607          * background threshold and other termination conditions.
1608          */
1609         if (wrote) {
1610             if (time_is_before_jiffies(start_time + HZ / 10UL))
1611                 break;
1612             if (work->nr_pages <= 0)
1613                 break;
1614         }
1615     }
1616     return wrote;
1617 }
1618 
1619 static long __writeback_inodes_wb(struct bdi_writeback *wb,
1620                   struct wb_writeback_work *work)
1621 {
1622     unsigned long start_time = jiffies;
1623     long wrote = 0;
1624 
1625     while (!list_empty(&wb->b_io)) {
1626         struct inode *inode = wb_inode(wb->b_io.prev);
1627         struct super_block *sb = inode->i_sb;
1628 
1629         if (!trylock_super(sb)) {
1630             /*
1631              * trylock_super() may fail consistently due to
1632              * s_umount being grabbed by someone else. Don't use
1633              * requeue_io() to avoid busy retrying the inode/sb.
1634              */
1635             redirty_tail(inode, wb);
1636             continue;
1637         }
1638         wrote += writeback_sb_inodes(sb, wb, work);
1639         up_read(&sb->s_umount);
1640 
1641         /* refer to the same tests at the end of writeback_sb_inodes */
1642         if (wrote) {
1643             if (time_is_before_jiffies(start_time + HZ / 10UL))
1644                 break;
1645             if (work->nr_pages <= 0)
1646                 break;
1647         }
1648     }
1649     /* Leave any unwritten inodes on b_io */
1650     return wrote;
1651 }
1652 
1653 static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1654                 enum wb_reason reason)
1655 {
1656     struct wb_writeback_work work = {
1657         .nr_pages   = nr_pages,
1658         .sync_mode  = WB_SYNC_NONE,
1659         .range_cyclic   = 1,
1660         .reason     = reason,
1661     };
1662     struct blk_plug plug;
1663 
1664     blk_start_plug(&plug);
1665     spin_lock(&wb->list_lock);
1666     if (list_empty(&wb->b_io))
1667         queue_io(wb, &work);
1668     __writeback_inodes_wb(wb, &work);
1669     spin_unlock(&wb->list_lock);
1670     blk_finish_plug(&plug);
1671 
1672     return nr_pages - work.nr_pages;
1673 }
1674 
1675 /*
1676  * Explicit flushing or periodic writeback of "old" data.
1677  *
1678  * Define "old": the first time one of an inode's pages is dirtied, we mark the
1679  * dirtying-time in the inode's address_space.  So this periodic writeback code
1680  * just walks the superblock inode list, writing back any inodes which are
1681  * older than a specific point in time.
1682  *
1683  * Try to run once per dirty_writeback_interval.  But if a writeback event
1684  * takes longer than dirty_writeback_interval, then leave a
1685  * one-second gap.
1686  *
1687  * older_than_this takes precedence over nr_to_write.  So we'll only write back
1688  * all dirty pages if they are all attached to "old" mappings.
1689  */
1690 static long wb_writeback(struct bdi_writeback *wb,
1691              struct wb_writeback_work *work)
1692 {
1693     unsigned long wb_start = jiffies;
1694     long nr_pages = work->nr_pages;
1695     unsigned long oldest_jif;
1696     struct inode *inode;
1697     long progress;
1698     struct blk_plug plug;
1699 
1700     oldest_jif = jiffies;
1701     work->older_than_this = &oldest_jif;
1702 
1703     blk_start_plug(&plug);
1704     spin_lock(&wb->list_lock);
1705     for (;;) {
1706         /*
1707          * Stop writeback when nr_pages has been consumed
1708          */
1709         if (work->nr_pages <= 0)
1710             break;
1711 
1712         /*
1713          * Background writeout and kupdate-style writeback may
1714          * run forever. Stop them if there is other work to do
1715          * so that e.g. sync can proceed. They'll be restarted
1716          * after the other works are all done.
1717          */
1718         if ((work->for_background || work->for_kupdate) &&
1719             !list_empty(&wb->work_list))
1720             break;
1721 
1722         /*
1723          * For background writeout, stop when we are below the
1724          * background dirty threshold
1725          */
1726         if (work->for_background && !wb_over_bg_thresh(wb))
1727             break;
1728 
1729         /*
1730          * Kupdate and background works are special and we want to
1731          * include all inodes that need writing. Livelock avoidance is
1732          * handled by these works yielding to any other work so we are
1733          * safe.
1734          */
1735         if (work->for_kupdate) {
1736             oldest_jif = jiffies -
1737                 msecs_to_jiffies(dirty_expire_interval * 10);
1738         } else if (work->for_background)
1739             oldest_jif = jiffies;
1740 
1741         trace_writeback_start(wb, work);
1742         if (list_empty(&wb->b_io))
1743             queue_io(wb, work);
1744         if (work->sb)
1745             progress = writeback_sb_inodes(work->sb, wb, work);
1746         else
1747             progress = __writeback_inodes_wb(wb, work);
1748         trace_writeback_written(wb, work);
1749 
1750         wb_update_bandwidth(wb, wb_start);
1751 
1752         /*
1753          * Did we write something? Try for more
1754          *
1755          * Dirty inodes are moved to b_io for writeback in batches.
1756          * The completion of the current batch does not necessarily
1757          * mean the overall work is done. So we keep looping as long
1758          * as we made some progress on cleaning pages or inodes.
1759          */
1760         if (progress)
1761             continue;
1762         /*
1763          * No more inodes for IO, bail
1764          */
1765         if (list_empty(&wb->b_more_io))
1766             break;
1767         /*
1768          * Nothing written. Wait for some inode to
1769          * become available for writeback. Otherwise
1770          * we'll just busyloop.
1771          */
1772         trace_writeback_wait(wb, work);
1773         inode = wb_inode(wb->b_more_io.prev);
1774         spin_lock(&inode->i_lock);
1775         spin_unlock(&wb->list_lock);
1776         /* This function drops i_lock... */
1777         inode_sleep_on_writeback(inode);
1778         spin_lock(&wb->list_lock);
1779     }
1780     spin_unlock(&wb->list_lock);
1781     blk_finish_plug(&plug);
1782 
1783     return nr_pages - work->nr_pages;
1784 }
1785 
1786 /*
1787  * Return the next wb_writeback_work struct that hasn't been processed yet.
1788  */
1789 static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1790 {
1791     struct wb_writeback_work *work = NULL;
1792 
1793     spin_lock_bh(&wb->work_lock);
1794     if (!list_empty(&wb->work_list)) {
1795         work = list_entry(wb->work_list.next,
1796                   struct wb_writeback_work, list);
1797         list_del_init(&work->list);
1798     }
1799     spin_unlock_bh(&wb->work_lock);
1800     return work;
1801 }
1802 
1803 /*
1804  * Add in the number of potentially dirty inodes, because each inode
1805  * write can dirty pagecache in the underlying blockdev.
1806  */
1807 static unsigned long get_nr_dirty_pages(void)
1808 {
1809     return global_node_page_state(NR_FILE_DIRTY) +
1810         global_node_page_state(NR_UNSTABLE_NFS) +
1811         get_nr_dirty_inodes();
1812 }
1813 
1814 static long wb_check_background_flush(struct bdi_writeback *wb)
1815 {
1816     if (wb_over_bg_thresh(wb)) {
1817 
1818         struct wb_writeback_work work = {
1819             .nr_pages   = LONG_MAX,
1820             .sync_mode  = WB_SYNC_NONE,
1821             .for_background = 1,
1822             .range_cyclic   = 1,
1823             .reason     = WB_REASON_BACKGROUND,
1824         };
1825 
1826         return wb_writeback(wb, &work);
1827     }
1828 
1829     return 0;
1830 }
1831 
1832 static long wb_check_old_data_flush(struct bdi_writeback *wb)
1833 {
1834     unsigned long expired;
1835     long nr_pages;
1836 
1837     /*
1838      * When set to zero, disable periodic writeback
1839      */
1840     if (!dirty_writeback_interval)
1841         return 0;
1842 
1843     expired = wb->last_old_flush +
1844             msecs_to_jiffies(dirty_writeback_interval * 10);
1845     if (time_before(jiffies, expired))
1846         return 0;
1847 
1848     wb->last_old_flush = jiffies;
1849     nr_pages = get_nr_dirty_pages();
1850 
1851     if (nr_pages) {
1852         struct wb_writeback_work work = {
1853             .nr_pages   = nr_pages,
1854             .sync_mode  = WB_SYNC_NONE,
1855             .for_kupdate    = 1,
1856             .range_cyclic   = 1,
1857             .reason     = WB_REASON_PERIODIC,
1858         };
1859 
1860         return wb_writeback(wb, &work);
1861     }
1862 
1863     return 0;
1864 }
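/*
 * Worked example for the expiry test above: dirty_writeback_interval is
 * kept in centiseconds, hence the "* 10" to get milliseconds.  With the
 * usual default of 500 (an assumption of this example; the actual default
 * lives in mm/page-writeback.c), the periodic flush is attempted again
 * msecs_to_jiffies(5000) jiffies, i.e. five seconds, after last_old_flush.
 */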
1865 
1866 /*
1867  * Retrieve work items and do the writeback they describe
1868  */
1869 static long wb_do_writeback(struct bdi_writeback *wb)
1870 {
1871     struct wb_writeback_work *work;
1872     long wrote = 0;
1873 
1874     set_bit(WB_writeback_running, &wb->state);
1875     while ((work = get_next_work_item(wb)) != NULL) {
1876         struct wb_completion *done = work->done;
1877 
1878         trace_writeback_exec(wb, work);
1879 
1880         wrote += wb_writeback(wb, work);
1881 
1882         if (work->auto_free)
1883             kfree(work);
1884         if (done && atomic_dec_and_test(&done->cnt))
1885             wake_up_all(&wb->bdi->wb_waitq);
1886     }
1887 
1888     /*
1889      * Check for periodic writeback, kupdated() style
1890      */
1891     wrote += wb_check_old_data_flush(wb);
1892     wrote += wb_check_background_flush(wb);
1893     clear_bit(WB_writeback_running, &wb->state);
1894 
1895     return wrote;
1896 }
1897 
1898 /*
1899  * Handle writeback of dirty data for the device backed by this bdi. Also
1900  * reschedules periodically and does kupdated style flushing.
1901  */
1902 void wb_workfn(struct work_struct *work)
1903 {
1904     struct bdi_writeback *wb = container_of(to_delayed_work(work),
1905                         struct bdi_writeback, dwork);
1906     long pages_written;
1907 
1908     set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
1909     current->flags |= PF_SWAPWRITE;
1910 
1911     if (likely(!current_is_workqueue_rescuer() ||
1912            !test_bit(WB_registered, &wb->state))) {
1913         /*
1914          * The normal path.  Keep writing back @wb until its
1915          * work_list is empty.  Note that this path is also taken
1916          * if @wb is shutting down even when we're running off the
1917          * rescuer as work_list needs to be drained.
1918          */
1919         do {
1920             pages_written = wb_do_writeback(wb);
1921             trace_writeback_pages_written(pages_written);
1922         } while (!list_empty(&wb->work_list));
1923     } else {
1924         /*
1925          * bdi_wq can't get enough workers and we're running off
1926          * the emergency worker.  Don't hog it.  Hopefully, 1024 is
1927          * enough for efficient IO.
1928          */
1929         pages_written = writeback_inodes_wb(wb, 1024,
1930                             WB_REASON_FORKER_THREAD);
1931         trace_writeback_pages_written(pages_written);
1932     }
1933 
1934     if (!list_empty(&wb->work_list))
1935         mod_delayed_work(bdi_wq, &wb->dwork, 0);
1936     else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1937         wb_wakeup_delayed(wb);
1938 
1939     current->flags &= ~PF_SWAPWRITE;
1940 }
1941 
1942 /*
1943  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
1944  * the whole world.
1945  */
1946 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1947 {
1948     struct backing_dev_info *bdi;
1949 
1950     /*
1951      * If we are expecting writeback progress we must submit plugged IO.
1952      */
1953     if (blk_needs_flush_plug(current))
1954         blk_schedule_flush_plug(current);
1955 
1956     if (!nr_pages)
1957         nr_pages = get_nr_dirty_pages();
1958 
1959     rcu_read_lock();
1960     list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1961         struct bdi_writeback *wb;
1962 
1963         if (!bdi_has_dirty_io(bdi))
1964             continue;
1965 
1966         list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
1967             wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
1968                        false, reason);
1969     }
1970     rcu_read_unlock();
1971 }
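/*
 * Minimal usage sketch (the example_* name is made up): passing nr_pages == 0
 * asks every bdi to write back all of its dirty pages without waiting for the
 * IO; sync(2) uses this pattern with WB_REASON_SYNC before its synchronous
 * per-superblock pass.
 */
static void example_kick_global_writeback(void)
{
	/* fire-and-forget: queues work on each bdi_writeback, returns immediately */
	wakeup_flusher_threads(0, WB_REASON_SYNC);
}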
1972 
1973 /*
1974  * Wake up bdi's periodically to make sure dirtytime inodes get
1975  * written back periodically.  We deliberately do *not* check the
1976  * b_dirtytime list in wb_has_dirty_io(), since this would cause the
1977  * kernel to be constantly waking up once there are any dirtytime
1978  * inodes on the system.  So instead we define a separate delayed work
1979  * function which gets called much more rarely.  (By default, only
1980  * once every 12 hours.)
1981  *
1982  * If there is any other write activity going on in the file system,
1983  * this function won't be necessary.  But if the only thing that has
1984  * happened on the file system is a dirtytime inode caused by an atime
1985  * update, we need this infrastructure below to make sure that inode
1986  * eventually gets pushed out to disk.
1987  */
1988 static void wakeup_dirtytime_writeback(struct work_struct *w);
1989 static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
1990 
1991 static void wakeup_dirtytime_writeback(struct work_struct *w)
1992 {
1993     struct backing_dev_info *bdi;
1994 
1995     rcu_read_lock();
1996     list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1997         struct bdi_writeback *wb;
1998 
1999         list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2000             if (!list_empty(&wb->b_dirty_time))
2001                 wb_wakeup(wb);
2002     }
2003     rcu_read_unlock();
2004     schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2005 }
2006 
2007 static int __init start_dirtytime_writeback(void)
2008 {
2009     schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2010     return 0;
2011 }
2012 __initcall(start_dirtytime_writeback);
2013 
2014 int dirtytime_interval_handler(struct ctl_table *table, int write,
2015                    void __user *buffer, size_t *lenp, loff_t *ppos)
2016 {
2017     int ret;
2018 
2019     ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2020     if (ret == 0 && write)
2021         mod_delayed_work(system_wq, &dirtytime_work, 0);
2022     return ret;
2023 }
2024 
2025 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2026 {
2027     if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2028         struct dentry *dentry;
2029         const char *name = "?";
2030 
2031         dentry = d_find_alias(inode);
2032         if (dentry) {
2033             spin_lock(&dentry->d_lock);
2034             name = (const char *) dentry->d_name.name;
2035         }
2036         printk(KERN_DEBUG
2037                "%s(%d): dirtied inode %lu (%s) on %s\n",
2038                current->comm, task_pid_nr(current), inode->i_ino,
2039                name, inode->i_sb->s_id);
2040         if (dentry) {
2041             spin_unlock(&dentry->d_lock);
2042             dput(dentry);
2043         }
2044     }
2045 }
2046 
2047 /**
2048  *  __mark_inode_dirty -    internal function
2049  *  @inode: inode to mark
2050  *  @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
2051  *  Mark an inode as dirty. Callers should use mark_inode_dirty or
2052  *      mark_inode_dirty_sync.
2053  *
2054  * Put the inode on the super block's dirty list.
2055  *
2056  * CAREFUL! We mark it dirty unconditionally, but move it onto the
2057  * dirty list only if it is hashed or if it refers to a blockdev.
2058  * If it was not hashed, it will never be added to the dirty list
2059  * even if it is later hashed, as it will have been marked dirty already.
2060  *
2061  * In short, make sure you hash any inodes _before_ you start marking
2062  * them dirty.
2063  *
2064  * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
2065  * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
2066  * the kernel-internal blockdev inode represents the dirtying time of the
2067  * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
2068  * page->mapping->host, so the page-dirtying time is recorded in the internal
2069  * blockdev inode.
2070  */
2071 void __mark_inode_dirty(struct inode *inode, int flags)
2072 {
2073 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
2074     struct super_block *sb = inode->i_sb;
2075     int dirtytime;
2076 
2077     trace_writeback_mark_inode_dirty(inode, flags);
2078 
2079     /*
2080      * Don't do this for I_DIRTY_PAGES - that doesn't actually
2081      * dirty the inode itself
2082      */
2083     if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
2084         trace_writeback_dirty_inode_start(inode, flags);
2085 
2086         if (sb->s_op->dirty_inode)
2087             sb->s_op->dirty_inode(inode, flags);
2088 
2089         trace_writeback_dirty_inode(inode, flags);
2090     }
2091     if (flags & I_DIRTY_INODE)
2092         flags &= ~I_DIRTY_TIME;
2093     dirtytime = flags & I_DIRTY_TIME;
2094 
2095     /*
2096      * Paired with smp_mb() in __writeback_single_inode() for the
2097      * following lockless i_state test.  See there for details.
2098      */
2099     smp_mb();
2100 
2101     if (((inode->i_state & flags) == flags) ||
2102         (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2103         return;
2104 
2105     if (unlikely(block_dump))
2106         block_dump___mark_inode_dirty(inode);
2107 
2108     spin_lock(&inode->i_lock);
2109     if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2110         goto out_unlock_inode;
2111     if ((inode->i_state & flags) != flags) {
2112         const int was_dirty = inode->i_state & I_DIRTY;
2113 
2114         inode_attach_wb(inode, NULL);
2115 
2116         if (flags & I_DIRTY_INODE)
2117             inode->i_state &= ~I_DIRTY_TIME;
2118         inode->i_state |= flags;
2119 
2120         /*
2121          * If the inode is being synced, just update its dirty state.
2122          * The unlocker will place the inode on the appropriate
2123          * superblock list, based upon its state.
2124          */
2125         if (inode->i_state & I_SYNC)
2126             goto out_unlock_inode;
2127 
2128         /*
2129          * Only add valid (hashed) inodes to the superblock's
2130          * dirty list.  Add blockdev inodes as well.
2131          */
2132         if (!S_ISBLK(inode->i_mode)) {
2133             if (inode_unhashed(inode))
2134                 goto out_unlock_inode;
2135         }
2136         if (inode->i_state & I_FREEING)
2137             goto out_unlock_inode;
2138 
2139         /*
2140          * If the inode was already on b_dirty/b_io/b_more_io, don't
2141          * reposition it (that would break b_dirty time-ordering).
2142          */
2143         if (!was_dirty) {
2144             struct bdi_writeback *wb;
2145             struct list_head *dirty_list;
2146             bool wakeup_bdi = false;
2147 
2148             wb = locked_inode_to_wb_and_lock_list(inode);
2149 
2150             WARN(bdi_cap_writeback_dirty(wb->bdi) &&
2151                  !test_bit(WB_registered, &wb->state),
2152                  "bdi-%s not registered\n", wb->bdi->name);
2153 
2154             inode->dirtied_when = jiffies;
2155             if (dirtytime)
2156                 inode->dirtied_time_when = jiffies;
2157 
2158             if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
2159                 dirty_list = &wb->b_dirty;
2160             else
2161                 dirty_list = &wb->b_dirty_time;
2162 
2163             wakeup_bdi = inode_io_list_move_locked(inode, wb,
2164                                    dirty_list);
2165 
2166             spin_unlock(&wb->list_lock);
2167             trace_writeback_dirty_inode_enqueue(inode);
2168 
2169             /*
2170              * If this is the first dirty inode for this bdi,
2171          * we have to wake up the corresponding bdi thread
2172              * to make sure background write-back happens
2173              * later.
2174              */
2175             if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
2176                 wb_wakeup_delayed(wb);
2177             return;
2178         }
2179     }
2180 out_unlock_inode:
2181     spin_unlock(&inode->i_lock);
2182 
2183 #undef I_DIRTY_INODE
2184 }
2185 EXPORT_SYMBOL(__mark_inode_dirty);
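/*
 * Illustrative caller (the myfs_* name is hypothetical): a filesystem that has
 * just changed in-core timestamps marks the inode dirty so it is queued on the
 * wb lists above.  mark_inode_dirty() and mark_inode_dirty_sync() are thin
 * wrappers around __mark_inode_dirty().
 */
static void myfs_update_times(struct inode *inode)
{
	inode->i_mtime = inode->i_ctime = current_time(inode);
	mark_inode_dirty_sync(inode);	/* __mark_inode_dirty(inode, I_DIRTY_SYNC) */
}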
2186 
2187 /*
2188  * The @s_sync_lock is used to serialise concurrent sync operations
2189  * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
2190  * Concurrent callers will block on the s_sync_lock rather than doing contending
2191  * walks. The queueing maintains the behaviour required by sync(2): all the IO that
2192  * has been issued up to the time this function is entered is guaranteed to be
2193  * completed by the time we have gained the lock and waited for all IO that is
2194  * in progress regardless of the order callers are granted the lock.
2195  */
2196 static void wait_sb_inodes(struct super_block *sb)
2197 {
2198     LIST_HEAD(sync_list);
2199 
2200     /*
2201      * We need to be protected against the filesystem going from
2202      * r/o to r/w or vice versa.
2203      */
2204     WARN_ON(!rwsem_is_locked(&sb->s_umount));
2205 
2206     mutex_lock(&sb->s_sync_lock);
2207 
2208     /*
2209      * Splice the writeback list onto a temporary list to avoid waiting on
2210      * inodes that have started writeback after this point.
2211      *
2212      * Use rcu_read_lock() to keep the inodes around until we have a
2213      * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
2214      * the local list because inodes can be dropped from either by writeback
2215      * completion.
2216      */
2217     rcu_read_lock();
2218     spin_lock_irq(&sb->s_inode_wblist_lock);
2219     list_splice_init(&sb->s_inodes_wb, &sync_list);
2220 
2221     /*
2222      * Data integrity sync. Must wait for all pages under writeback, because
2223      * pages may have been dirtied before our sync call but have had their
2224      * writeout started before we got here.  In that case the inode may no
2225      * longer be on the dirty list, but we still have to wait for that
2226      * writeout.
2227      */
2228     while (!list_empty(&sync_list)) {
2229         struct inode *inode = list_first_entry(&sync_list, struct inode,
2230                                i_wb_list);
2231         struct address_space *mapping = inode->i_mapping;
2232 
2233         /*
2234          * Move each inode back to the wb list before we drop the lock
2235          * to preserve consistency between i_wb_list and the mapping
2236          * writeback tag. Writeback completion is responsible for removing
2237          * the inode from either list once the writeback tag is cleared.
2238          */
2239         list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2240 
2241         /*
2242          * The mapping can appear untagged while still on-list since we
2243          * do not have the mapping lock. Skip it here, wb completion
2244          * will remove it.
2245          */
2246         if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2247             continue;
2248 
2249         spin_unlock_irq(&sb->s_inode_wblist_lock);
2250 
2251         spin_lock(&inode->i_lock);
2252         if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2253             spin_unlock(&inode->i_lock);
2254 
2255             spin_lock_irq(&sb->s_inode_wblist_lock);
2256             continue;
2257         }
2258         __iget(inode);
2259         spin_unlock(&inode->i_lock);
2260         rcu_read_unlock();
2261 
2262         /*
2263          * We keep the error status of individual mapping so that
2264          * applications can catch the writeback error using fsync(2).
2265          * See filemap_fdatawait_keep_errors() for details.
2266          */
2267         filemap_fdatawait_keep_errors(mapping);
2268 
2269         cond_resched();
2270 
2271         iput(inode);
2272 
2273         rcu_read_lock();
2274         spin_lock_irq(&sb->s_inode_wblist_lock);
2275     }
2276     spin_unlock_irq(&sb->s_inode_wblist_lock);
2277     rcu_read_unlock();
2278     mutex_unlock(&sb->s_sync_lock);
2279 }
2280 
2281 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2282                      enum wb_reason reason, bool skip_if_busy)
2283 {
2284     DEFINE_WB_COMPLETION_ONSTACK(done);
2285     struct wb_writeback_work work = {
2286         .sb         = sb,
2287         .sync_mode      = WB_SYNC_NONE,
2288         .tagged_writepages  = 1,
2289         .done           = &done,
2290         .nr_pages       = nr,
2291         .reason         = reason,
2292     };
2293     struct backing_dev_info *bdi = sb->s_bdi;
2294 
2295     if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2296         return;
2297     WARN_ON(!rwsem_is_locked(&sb->s_umount));
2298 
2299     bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2300     wb_wait_for_completion(bdi, &done);
2301 }
2302 
2303 /**
2304  * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
2305  * @sb: the superblock
2306  * @nr: the number of pages to write
2307  * @reason: reason why some writeback work was initiated
2308  *
2309  * Start writeback on some inodes on this super_block. No guarantees are made
2310  * on how many (if any) will be written, and this function does not wait
2311  * for IO completion of submitted IO.
2312  */
2313 void writeback_inodes_sb_nr(struct super_block *sb,
2314                 unsigned long nr,
2315                 enum wb_reason reason)
2316 {
2317     __writeback_inodes_sb_nr(sb, nr, reason, false);
2318 }
2319 EXPORT_SYMBOL(writeback_inodes_sb_nr);
2320 
2321 /**
2322  * writeback_inodes_sb  -   writeback dirty inodes from given super_block
2323  * @sb: the superblock
2324  * @reason: reason why some writeback work was initiated
2325  *
2326  * Start writeback on some inodes on this super_block. No guarantees are made
2327  * on how many (if any) will be written, and this function does not wait
2328  * for IO completion of submitted IO.
2329  */
2330 void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2331 {
2332     return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2333 }
2334 EXPORT_SYMBOL(writeback_inodes_sb);
2335 
2336 /**
2337  * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
2338  * @sb: the superblock
2339  * @nr: the number of pages to write
2340  * @reason: the reason of writeback
2341  *
2342  * Invoke writeback_inodes_sb_nr() if no writeback is currently underway.
2343  * Returns true if writeback was started, false if not.
2344  */
2345 bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2346                    enum wb_reason reason)
2347 {
2348     if (!down_read_trylock(&sb->s_umount))
2349         return false;
2350 
2351     __writeback_inodes_sb_nr(sb, nr, reason, true);
2352     up_read(&sb->s_umount);
2353     return true;
2354 }
2355 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
2356 
2357 /**
2358  * try_to_writeback_inodes_sb - try to start writeback if none underway
2359  * @sb: the superblock
2360  * @reason: reason why some writeback work was initiated
2361  *
2362  * Implemented by try_to_writeback_inodes_sb_nr().
2363  * Returns true if writeback was started, false if not.
2364  */
2365 bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2366 {
2367     return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2368 }
2369 EXPORT_SYMBOL(try_to_writeback_inodes_sb);
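/*
 * Illustrative only (myfs_* is hypothetical): a filesystem about to return
 * ENOSPC can first try to flush delayed-allocation data so that reserved
 * space is released, then let the caller retry the allocation.
 * WB_REASON_FS_FREE_SPACE is the reason code used for this kind of kick.
 */
static bool myfs_try_to_free_reserved_space(struct super_block *sb)
{
	/* false means s_umount could not be taken, e.g. during umount */
	return try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
}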
2370 
2371 /**
2372  * sync_inodes_sb   -   sync sb inode pages
2373  * @sb: the superblock
2374  *
2375  * This function writes and waits on any dirty inode belonging to this
2376  * super_block.
2377  */
2378 void sync_inodes_sb(struct super_block *sb)
2379 {
2380     DEFINE_WB_COMPLETION_ONSTACK(done);
2381     struct wb_writeback_work work = {
2382         .sb     = sb,
2383         .sync_mode  = WB_SYNC_ALL,
2384         .nr_pages   = LONG_MAX,
2385         .range_cyclic   = 0,
2386         .done       = &done,
2387         .reason     = WB_REASON_SYNC,
2388         .for_sync   = 1,
2389     };
2390     struct backing_dev_info *bdi = sb->s_bdi;
2391 
2392     /*
2393      * Can't skip on !bdi_has_dirty() because we should wait for inodes
2394      * that are clean but still under writeback, and because I_DIRTY_TIME
2395      * inodes ignored by bdi_has_dirty() need to be written out too.
2396      */
2397     if (bdi == &noop_backing_dev_info)
2398         return;
2399     WARN_ON(!rwsem_is_locked(&sb->s_umount));
2400 
2401     bdi_split_work_to_wbs(bdi, &work, false);
2402     wb_wait_for_completion(bdi, &done);
2403 
2404     wait_sb_inodes(sb);
2405 }
2406 EXPORT_SYMBOL(sync_inodes_sb);
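/*
 * Sketch of a typical caller, loosely modelled on sync_filesystem(): start IO
 * without waiting, give the filesystem a chance to flush its own state, then
 * do the waiting WB_SYNC_ALL pass.  The caller must hold sb->s_umount; the
 * example_* name is made up and error handling is omitted.
 */
static void example_sync_one_sb(struct super_block *sb)
{
	writeback_inodes_sb(sb, WB_REASON_SYNC);	/* async pass, no waiting */
	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, 0);
	sync_inodes_sb(sb);				/* write and wait on everything */
	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, 1);
}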
2407 
2408 /**
2409  * write_inode_now  -   write an inode to disk
2410  * @inode: inode to write to disk
2411  * @sync: whether the write should be synchronous or not
2412  *
2413  * This function commits an inode to disk immediately if it is dirty. This is
2414  * primarily needed by knfsd.
2415  *
2416  * The caller must either have a ref on the inode or have set I_WILL_FREE.
2417  */
2418 int write_inode_now(struct inode *inode, int sync)
2419 {
2420     struct writeback_control wbc = {
2421         .nr_to_write = LONG_MAX,
2422         .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2423         .range_start = 0,
2424         .range_end = LLONG_MAX,
2425     };
2426 
2427     if (!mapping_cap_writeback_dirty(inode->i_mapping))
2428         wbc.nr_to_write = 0;
2429 
2430     might_sleep();
2431     return writeback_single_inode(inode, &wbc);
2432 }
2433 EXPORT_SYMBOL(write_inode_now);
2434 
2435 /**
2436  * sync_inode - write an inode and its pages to disk.
2437  * @inode: the inode to sync
2438  * @wbc: controls the writeback mode
2439  *
2440  * sync_inode() will write an inode and its pages to disk.  It will also
2441  * correctly update the inode on its superblock's dirty inode lists and will
2442  * update inode->i_state.
2443  *
2444  * The caller must have a ref on the inode.
2445  */
2446 int sync_inode(struct inode *inode, struct writeback_control *wbc)
2447 {
2448     return writeback_single_inode(inode, wbc);
2449 }
2450 EXPORT_SYMBOL(sync_inode);
2451 
2452 /**
2453  * sync_inode_metadata - write an inode to disk
2454  * @inode: the inode to sync
2455  * @wait: wait for I/O to complete.
2456  *
2457  * Write an inode to disk and adjust its dirty state after completion.
2458  *
2459  * Note: only writes the actual inode, no associated data or other metadata.
2460  */
2461 int sync_inode_metadata(struct inode *inode, int wait)
2462 {
2463     struct writeback_control wbc = {
2464         .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2465         .nr_to_write = 0, /* metadata-only */
2466     };
2467 
2468     return sync_inode(inode, &wbc);
2469 }
2470 EXPORT_SYMBOL(sync_inode_metadata);
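/*
 * Sketch of a simple ->fsync implementation (myfs_* is hypothetical, loosely
 * modelled on __generic_file_fsync()): write and wait on the data range, then
 * push the inode itself with sync_inode_metadata().
 */
static int myfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	int err;

	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return 0;	/* data is stable and no datasync-critical metadata */
	return sync_inode_metadata(inode, 1);
}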