0001 // SPDX-License-Identifier: GPL-2.0-only
0002 
0003 #include <linux/blkdev.h>
0004 #include <linux/wait.h>
0005 #include <linux/rbtree.h>
0006 #include <linux/kthread.h>
0007 #include <linux/backing-dev.h>
0008 #include <linux/blk-cgroup.h>
0009 #include <linux/freezer.h>
0010 #include <linux/fs.h>
0011 #include <linux/pagemap.h>
0012 #include <linux/mm.h>
0013 #include <linux/sched/mm.h>
0014 #include <linux/sched.h>
0015 #include <linux/module.h>
0016 #include <linux/writeback.h>
0017 #include <linux/device.h>
0018 #include <trace/events/writeback.h>
0019 
0020 struct backing_dev_info noop_backing_dev_info;
0021 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
0022 
0023 static struct class *bdi_class;
0024 static const char *bdi_unknown_name = "(unknown)";
0025 
0026 /*
0027  * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
0028  * reader side locking.
0029  */
0030 DEFINE_SPINLOCK(bdi_lock);
0031 static u64 bdi_id_cursor;
0032 static struct rb_root bdi_tree = RB_ROOT;
0033 LIST_HEAD(bdi_list);
0034 
0035 /* bdi_wq serves all asynchronous writeback tasks */
0036 struct workqueue_struct *bdi_wq;
0037 
0038 #define K(x) ((x) << (PAGE_SHIFT - 10))
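/*
 * K() converts a page count to kilobytes: a page is 2^PAGE_SHIFT bytes, so
 * shifting left by (PAGE_SHIFT - 10) multiplies by PAGE_SIZE / 1024.  With
 * 4 KiB pages (PAGE_SHIFT == 12), for example, K(25) == 25 << 2 == 100 kB.
 */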
0039 
0040 #ifdef CONFIG_DEBUG_FS
0041 #include <linux/debugfs.h>
0042 #include <linux/seq_file.h>
0043 
0044 static struct dentry *bdi_debug_root;
0045 
0046 static void bdi_debug_init(void)
0047 {
0048     bdi_debug_root = debugfs_create_dir("bdi", NULL);
0049 }
0050 
0051 static int bdi_debug_stats_show(struct seq_file *m, void *v)
0052 {
0053     struct backing_dev_info *bdi = m->private;
0054     struct bdi_writeback *wb = &bdi->wb;
0055     unsigned long background_thresh;
0056     unsigned long dirty_thresh;
0057     unsigned long wb_thresh;
0058     unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
0059     struct inode *inode;
0060 
0061     nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
0062     spin_lock(&wb->list_lock);
0063     list_for_each_entry(inode, &wb->b_dirty, i_io_list)
0064         nr_dirty++;
0065     list_for_each_entry(inode, &wb->b_io, i_io_list)
0066         nr_io++;
0067     list_for_each_entry(inode, &wb->b_more_io, i_io_list)
0068         nr_more_io++;
0069     list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
0070         if (inode->i_state & I_DIRTY_TIME)
0071             nr_dirty_time++;
0072     spin_unlock(&wb->list_lock);
0073 
0074     global_dirty_limits(&background_thresh, &dirty_thresh);
0075     wb_thresh = wb_calc_thresh(wb, dirty_thresh);
0076 
0077     seq_printf(m,
0078            "BdiWriteback:       %10lu kB\n"
0079            "BdiReclaimable:     %10lu kB\n"
0080            "BdiDirtyThresh:     %10lu kB\n"
0081            "DirtyThresh:        %10lu kB\n"
0082            "BackgroundThresh:   %10lu kB\n"
0083            "BdiDirtied:         %10lu kB\n"
0084            "BdiWritten:         %10lu kB\n"
0085            "BdiWriteBandwidth:  %10lu kBps\n"
0086            "b_dirty:            %10lu\n"
0087            "b_io:               %10lu\n"
0088            "b_more_io:          %10lu\n"
0089            "b_dirty_time:       %10lu\n"
0090            "bdi_list:           %10u\n"
0091            "state:              %10lx\n",
0092            (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
0093            (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
0094            K(wb_thresh),
0095            K(dirty_thresh),
0096            K(background_thresh),
0097            (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
0098            (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
0099            (unsigned long) K(wb->write_bandwidth),
0100            nr_dirty,
0101            nr_io,
0102            nr_more_io,
0103            nr_dirty_time,
0104            !list_empty(&bdi->bdi_list), bdi->wb.state);
0105 
0106     return 0;
0107 }
0108 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
0109 
0110 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
0111 {
0112     bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
0113 
0114     debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
0115                 &bdi_debug_stats_fops);
0116 }
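/*
 * With debugfs mounted in the usual place, this exposes the counters above
 * at /sys/kernel/debug/bdi/<bdi-name>/stats.
 */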
0117 
0118 static void bdi_debug_unregister(struct backing_dev_info *bdi)
0119 {
0120     debugfs_remove_recursive(bdi->debug_dir);
0121 }
0122 #else
0123 static inline void bdi_debug_init(void)
0124 {
0125 }
0126 static inline void bdi_debug_register(struct backing_dev_info *bdi,
0127                       const char *name)
0128 {
0129 }
0130 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
0131 {
0132 }
0133 #endif
0134 
0135 static ssize_t read_ahead_kb_store(struct device *dev,
0136                   struct device_attribute *attr,
0137                   const char *buf, size_t count)
0138 {
0139     struct backing_dev_info *bdi = dev_get_drvdata(dev);
0140     unsigned long read_ahead_kb;
0141     ssize_t ret;
0142 
0143     ret = kstrtoul(buf, 10, &read_ahead_kb);
0144     if (ret < 0)
0145         return ret;
0146 
0147     bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
0148 
0149     return count;
0150 }
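/*
 * Usage sketch (the bdi name "8:0" is illustrative; actual names vary):
 * writing to the read_ahead_kb attribute of the bdi class device, e.g.
 *
 *     echo 512 > /sys/class/bdi/8:0/read_ahead_kb
 *
 * stores a value in kilobytes, which the store function above converts to
 * pages: with 4 KiB pages, 512 >> (PAGE_SHIFT - 10) == 128 pages of readahead.
 */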
0151 
0152 #define BDI_SHOW(name, expr)                        \
0153 static ssize_t name##_show(struct device *dev,              \
0154                struct device_attribute *attr, char *buf)    \
0155 {                                   \
0156     struct backing_dev_info *bdi = dev_get_drvdata(dev);        \
0157                                     \
0158     return sysfs_emit(buf, "%lld\n", (long long)expr);      \
0159 }                                   \
0160 static DEVICE_ATTR_RW(name);
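/*
 * BDI_SHOW() only generates the _show() half of each attribute; the matching
 * _store() functions are written by hand nearby, and DEVICE_ATTR_RW() binds
 * the name##_show/name##_store pair into dev_attr_<name>.
 */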
0161 
0162 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
0163 
0164 static ssize_t min_ratio_store(struct device *dev,
0165         struct device_attribute *attr, const char *buf, size_t count)
0166 {
0167     struct backing_dev_info *bdi = dev_get_drvdata(dev);
0168     unsigned int ratio;
0169     ssize_t ret;
0170 
0171     ret = kstrtouint(buf, 10, &ratio);
0172     if (ret < 0)
0173         return ret;
0174 
0175     ret = bdi_set_min_ratio(bdi, ratio);
0176     if (!ret)
0177         ret = count;
0178 
0179     return ret;
0180 }
0181 BDI_SHOW(min_ratio, bdi->min_ratio)
0182 
0183 static ssize_t max_ratio_store(struct device *dev,
0184         struct device_attribute *attr, const char *buf, size_t count)
0185 {
0186     struct backing_dev_info *bdi = dev_get_drvdata(dev);
0187     unsigned int ratio;
0188     ssize_t ret;
0189 
0190     ret = kstrtouint(buf, 10, &ratio);
0191     if (ret < 0)
0192         return ret;
0193 
0194     ret = bdi_set_max_ratio(bdi, ratio);
0195     if (!ret)
0196         ret = count;
0197 
0198     return ret;
0199 }
0200 BDI_SHOW(max_ratio, bdi->max_ratio)
0201 
0202 static ssize_t stable_pages_required_show(struct device *dev,
0203                       struct device_attribute *attr,
0204                       char *buf)
0205 {
0206     dev_warn_once(dev,
0207         "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
0208     return sysfs_emit(buf, "%d\n", 0);
0209 }
0210 static DEVICE_ATTR_RO(stable_pages_required);
0211 
0212 static struct attribute *bdi_dev_attrs[] = {
0213     &dev_attr_read_ahead_kb.attr,
0214     &dev_attr_min_ratio.attr,
0215     &dev_attr_max_ratio.attr,
0216     &dev_attr_stable_pages_required.attr,
0217     NULL,
0218 };
0219 ATTRIBUTE_GROUPS(bdi_dev);
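/*
 * ATTRIBUTE_GROUPS() generates bdi_dev_groups, which bdi_class_init() below
 * installs as the bdi class's default device attribute groups.
 */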
0220 
0221 static __init int bdi_class_init(void)
0222 {
0223     bdi_class = class_create(THIS_MODULE, "bdi");
0224     if (IS_ERR(bdi_class))
0225         return PTR_ERR(bdi_class);
0226 
0227     bdi_class->dev_groups = bdi_dev_groups;
0228     bdi_debug_init();
0229 
0230     return 0;
0231 }
0232 postcore_initcall(bdi_class_init);
0233 
0234 static int __init default_bdi_init(void)
0235 {
0236     bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
0237                  WQ_SYSFS, 0);
0238     if (!bdi_wq)
0239         return -ENOMEM;
0240     return 0;
0241 }
0242 subsys_initcall(default_bdi_init);
0243 
0244 /*
0245  * This function is used when the first inode for this wb is marked dirty. It
0246  * wakes up the corresponding bdi thread which should then take care of the
0247  * periodic background write-out of dirty inodes. Since the write-out would
0248  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
0249  * set up a timer which wakes the bdi thread up later.
0250  *
0251  * Note, we wouldn't bother setting up the timer, but this function is on the
0252  * fast-path (used by '__mark_inode_dirty()'), so we save a few context
0252  * switches by delaying the wake-up.
0253  * by delaying the wake-up.
0254  *
0255  * We have to be careful not to postpone flush work if it is scheduled for
0256  * earlier. Thus we use queue_delayed_work().
0257  */
0258 void wb_wakeup_delayed(struct bdi_writeback *wb)
0259 {
0260     unsigned long timeout;
0261 
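    /*
     * dirty_writeback_interval is in centiseconds (500 by default, i.e. 5
     * seconds), so multiply by 10 to get milliseconds before converting to
     * jiffies.
     */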
0262     timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
0263     spin_lock_irq(&wb->work_lock);
0264     if (test_bit(WB_registered, &wb->state))
0265         queue_delayed_work(bdi_wq, &wb->dwork, timeout);
0266     spin_unlock_irq(&wb->work_lock);
0267 }
0268 
0269 static void wb_update_bandwidth_workfn(struct work_struct *work)
0270 {
0271     struct bdi_writeback *wb = container_of(to_delayed_work(work),
0272                         struct bdi_writeback, bw_dwork);
0273 
0274     wb_update_bandwidth(wb);
0275 }
0276 
0277 /*
0278  * Initial write bandwidth: 100 MB/s
0279  */
0280 #define INIT_BW     (100 << (20 - PAGE_SHIFT))
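/*
 * In page units: 100 MB/s is 100 << 20 bytes per second, and dividing by the
 * page size (2^PAGE_SHIFT bytes) leaves 100 << (20 - PAGE_SHIFT).  With 4 KiB
 * pages that is 25600 pages per second.
 */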
0281 
0282 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
0283            gfp_t gfp)
0284 {
0285     int i, err;
0286 
0287     memset(wb, 0, sizeof(*wb));
0288 
0289     wb->bdi = bdi;
0290     wb->last_old_flush = jiffies;
0291     INIT_LIST_HEAD(&wb->b_dirty);
0292     INIT_LIST_HEAD(&wb->b_io);
0293     INIT_LIST_HEAD(&wb->b_more_io);
0294     INIT_LIST_HEAD(&wb->b_dirty_time);
0295     spin_lock_init(&wb->list_lock);
0296 
0297     atomic_set(&wb->writeback_inodes, 0);
0298     wb->bw_time_stamp = jiffies;
0299     wb->balanced_dirty_ratelimit = INIT_BW;
0300     wb->dirty_ratelimit = INIT_BW;
0301     wb->write_bandwidth = INIT_BW;
0302     wb->avg_write_bandwidth = INIT_BW;
0303 
0304     spin_lock_init(&wb->work_lock);
0305     INIT_LIST_HEAD(&wb->work_list);
0306     INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
0307     INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
0308     wb->dirty_sleep = jiffies;
0309 
0310     err = fprop_local_init_percpu(&wb->completions, gfp);
0311     if (err)
0312         return err;
0313 
0314     for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
0315         err = percpu_counter_init(&wb->stat[i], 0, gfp);
0316         if (err)
0317             goto out_destroy_stat;
0318     }
0319 
0320     return 0;
0321 
0322 out_destroy_stat:
0323     while (i--)
0324         percpu_counter_destroy(&wb->stat[i]);
0325     fprop_local_destroy_percpu(&wb->completions);
0326     return err;
0327 }
0328 
0329 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
0330 
0331 /*
0332  * Remove bdi from the global list and shut down any threads we have running
0333  */
0334 static void wb_shutdown(struct bdi_writeback *wb)
0335 {
0336     /* Make sure nobody queues further work */
0337     spin_lock_irq(&wb->work_lock);
0338     if (!test_and_clear_bit(WB_registered, &wb->state)) {
0339         spin_unlock_irq(&wb->work_lock);
0340         return;
0341     }
0342     spin_unlock_irq(&wb->work_lock);
0343 
0344     cgwb_remove_from_bdi_list(wb);
0345     /*
0346      * Drain work list and shutdown the delayed_work.  !WB_registered
0347      * tells wb_workfn() that @wb is dying and its work_list needs to
0348      * be drained no matter what.
0349      */
0350     mod_delayed_work(bdi_wq, &wb->dwork, 0);
0351     flush_delayed_work(&wb->dwork);
0352     WARN_ON(!list_empty(&wb->work_list));
0353     flush_delayed_work(&wb->bw_dwork);
0354 }
0355 
0356 static void wb_exit(struct bdi_writeback *wb)
0357 {
0358     int i;
0359 
0360     WARN_ON(delayed_work_pending(&wb->dwork));
0361 
0362     for (i = 0; i < NR_WB_STAT_ITEMS; i++)
0363         percpu_counter_destroy(&wb->stat[i]);
0364 
0365     fprop_local_destroy_percpu(&wb->completions);
0366 }
0367 
0368 #ifdef CONFIG_CGROUP_WRITEBACK
0369 
0370 #include <linux/memcontrol.h>
0371 
0372 /*
0373  * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
0374  * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
0375  */
0376 static DEFINE_SPINLOCK(cgwb_lock);
0377 static struct workqueue_struct *cgwb_release_wq;
0378 
0379 static LIST_HEAD(offline_cgwbs);
0380 static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
0381 static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
0382 
0383 static void cgwb_release_workfn(struct work_struct *work)
0384 {
0385     struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
0386                         release_work);
0387     struct backing_dev_info *bdi = wb->bdi;
0388 
0389     mutex_lock(&wb->bdi->cgwb_release_mutex);
0390     wb_shutdown(wb);
0391 
0392     css_put(wb->memcg_css);
0393     css_put(wb->blkcg_css);
0394     mutex_unlock(&wb->bdi->cgwb_release_mutex);
0395 
0396     /* triggers blkg destruction if no online users left */
0397     blkcg_unpin_online(wb->blkcg_css);
0398 
0399     fprop_local_destroy_percpu(&wb->memcg_completions);
0400 
0401     spin_lock_irq(&cgwb_lock);
0402     list_del(&wb->offline_node);
0403     spin_unlock_irq(&cgwb_lock);
0404 
0405     percpu_ref_exit(&wb->refcnt);
0406     wb_exit(wb);
0407     bdi_put(bdi);
0408     WARN_ON_ONCE(!list_empty(&wb->b_attached));
0409     kfree_rcu(wb, rcu);
0410 }
0411 
0412 static void cgwb_release(struct percpu_ref *refcnt)
0413 {
0414     struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
0415                         refcnt);
0416     queue_work(cgwb_release_wq, &wb->release_work);
0417 }
0418 
0419 static void cgwb_kill(struct bdi_writeback *wb)
0420 {
0421     lockdep_assert_held(&cgwb_lock);
0422 
0423     WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
0424     list_del(&wb->memcg_node);
0425     list_del(&wb->blkcg_node);
0426     list_add(&wb->offline_node, &offline_cgwbs);
0427     percpu_ref_kill(&wb->refcnt);
0428 }
0429 
0430 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
0431 {
0432     spin_lock_irq(&cgwb_lock);
0433     list_del_rcu(&wb->bdi_node);
0434     spin_unlock_irq(&cgwb_lock);
0435 }
0436 
0437 static int cgwb_create(struct backing_dev_info *bdi,
0438                struct cgroup_subsys_state *memcg_css, gfp_t gfp)
0439 {
0440     struct mem_cgroup *memcg;
0441     struct cgroup_subsys_state *blkcg_css;
0442     struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
0443     struct bdi_writeback *wb;
0444     unsigned long flags;
0445     int ret = 0;
0446 
0447     memcg = mem_cgroup_from_css(memcg_css);
0448     blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
0449     memcg_cgwb_list = &memcg->cgwb_list;
0450     blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
0451 
0452     /* look up again under lock and discard on blkcg mismatch */
0453     spin_lock_irqsave(&cgwb_lock, flags);
0454     wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
0455     if (wb && wb->blkcg_css != blkcg_css) {
0456         cgwb_kill(wb);
0457         wb = NULL;
0458     }
0459     spin_unlock_irqrestore(&cgwb_lock, flags);
0460     if (wb)
0461         goto out_put;
0462 
0463     /* need to create a new one */
0464     wb = kmalloc(sizeof(*wb), gfp);
0465     if (!wb) {
0466         ret = -ENOMEM;
0467         goto out_put;
0468     }
0469 
0470     ret = wb_init(wb, bdi, gfp);
0471     if (ret)
0472         goto err_free;
0473 
0474     ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
0475     if (ret)
0476         goto err_wb_exit;
0477 
0478     ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
0479     if (ret)
0480         goto err_ref_exit;
0481 
0482     wb->memcg_css = memcg_css;
0483     wb->blkcg_css = blkcg_css;
0484     INIT_LIST_HEAD(&wb->b_attached);
0485     INIT_WORK(&wb->release_work, cgwb_release_workfn);
0486     set_bit(WB_registered, &wb->state);
0487     bdi_get(bdi);
0488 
0489     /*
0490      * The root wb determines the registered state of the whole bdi and
0491      * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
0492      * whether they're still online.  Don't link @wb if any is dead.
0493      * See wb_memcg_offline() and wb_blkcg_offline().
0494      */
0495     ret = -ENODEV;
0496     spin_lock_irqsave(&cgwb_lock, flags);
0497     if (test_bit(WB_registered, &bdi->wb.state) &&
0498         blkcg_cgwb_list->next && memcg_cgwb_list->next) {
0499         /* we might have raced another instance of this function */
0500         ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
0501         if (!ret) {
0502             list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
0503             list_add(&wb->memcg_node, memcg_cgwb_list);
0504             list_add(&wb->blkcg_node, blkcg_cgwb_list);
0505             blkcg_pin_online(blkcg_css);
0506             css_get(memcg_css);
0507             css_get(blkcg_css);
0508         }
0509     }
0510     spin_unlock_irqrestore(&cgwb_lock, flags);
0511     if (ret) {
0512         if (ret == -EEXIST)
0513             ret = 0;
0514         goto err_fprop_exit;
0515     }
0516     goto out_put;
0517 
0518 err_fprop_exit:
0519     bdi_put(bdi);
0520     fprop_local_destroy_percpu(&wb->memcg_completions);
0521 err_ref_exit:
0522     percpu_ref_exit(&wb->refcnt);
0523 err_wb_exit:
0524     wb_exit(wb);
0525 err_free:
0526     kfree(wb);
0527 out_put:
0528     css_put(blkcg_css);
0529     return ret;
0530 }
0531 
0532 /**
0533  * wb_get_lookup - get wb for a given memcg
0534  * @bdi: target bdi
0535  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
0536  *
0537  * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
0538  * refcount incremented.
0539  *
0540  * This function uses css_get() on @memcg_css and thus expects its refcnt
0541  * to be positive on invocation.  IOW, rcu_read_lock() protection on
0542  * @memcg_css isn't enough.  try_get it before calling this function.
0543  *
0544  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
0545  * memcg on the default hierarchy, memcg association is guaranteed to be
0546  * more specific (equal to or a descendant of the associated blkcg) and thus can
0547  * identify both the memcg and blkcg associations.
0548  *
0549  * Because the blkcg associated with a memcg may change as blkcg is enabled
0550  * and disabled closer to root in the hierarchy, each wb keeps track of
0551  * both the memcg and blkcg associated with it and verifies the blkcg on
0552  * each lookup.  On mismatch, the existing wb is discarded and a new one is
0553  * created.
0554  */
0555 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
0556                     struct cgroup_subsys_state *memcg_css)
0557 {
0558     struct bdi_writeback *wb;
0559 
0560     if (!memcg_css->parent)
0561         return &bdi->wb;
0562 
0563     rcu_read_lock();
0564     wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
0565     if (wb) {
0566         struct cgroup_subsys_state *blkcg_css;
0567 
0568         /* see whether the blkcg association has changed */
0569         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
0570         if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
0571             wb = NULL;
0572         css_put(blkcg_css);
0573     }
0574     rcu_read_unlock();
0575 
0576     return wb;
0577 }
0578 
0579 /**
0580  * wb_get_create - get wb for a given memcg, create if necessary
0581  * @bdi: target bdi
0582  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
0583  * @gfp: allocation mask to use
0584  *
0585  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
0586  * create one.  See wb_get_lookup() for more details.
0587  */
0588 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
0589                     struct cgroup_subsys_state *memcg_css,
0590                     gfp_t gfp)
0591 {
0592     struct bdi_writeback *wb;
0593 
0594     might_alloc(gfp);
0595 
0596     if (!memcg_css->parent)
0597         return &bdi->wb;
0598 
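    /*
     * cgwb_create() folds a creation race (-EEXIST) into 0, so keep
     * retrying the lookup until it succeeds or creation fails for real.
     */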
0599     do {
0600         wb = wb_get_lookup(bdi, memcg_css);
0601     } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
0602 
0603     return wb;
0604 }
0605 
0606 static int cgwb_bdi_init(struct backing_dev_info *bdi)
0607 {
0608     int ret;
0609 
0610     INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
0611     mutex_init(&bdi->cgwb_release_mutex);
0612     init_rwsem(&bdi->wb_switch_rwsem);
0613 
0614     ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
0615     if (!ret) {
0616         bdi->wb.memcg_css = &root_mem_cgroup->css;
0617         bdi->wb.blkcg_css = blkcg_root_css;
0618     }
0619     return ret;
0620 }
0621 
0622 static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
0623 {
0624     struct radix_tree_iter iter;
0625     void **slot;
0626     struct bdi_writeback *wb;
0627 
0628     WARN_ON(test_bit(WB_registered, &bdi->wb.state));
0629 
0630     spin_lock_irq(&cgwb_lock);
0631     radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
0632         cgwb_kill(*slot);
0633     spin_unlock_irq(&cgwb_lock);
0634 
0635     mutex_lock(&bdi->cgwb_release_mutex);
0636     spin_lock_irq(&cgwb_lock);
0637     while (!list_empty(&bdi->wb_list)) {
0638         wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
0639                       bdi_node);
0640         spin_unlock_irq(&cgwb_lock);
0641         wb_shutdown(wb);
0642         spin_lock_irq(&cgwb_lock);
0643     }
0644     spin_unlock_irq(&cgwb_lock);
0645     mutex_unlock(&bdi->cgwb_release_mutex);
0646 }
0647 
0648 /*
0649  * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
0650  *
0651  * Try to release dying cgwbs by switching attached inodes to the nearest
0652  * living ancestor's writeback. Processed wbs are placed at the end
0653  * of the list to guarantee forward progress.
0654  */
0655 static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
0656 {
0657     struct bdi_writeback *wb;
0658     LIST_HEAD(processed);
0659 
0660     spin_lock_irq(&cgwb_lock);
0661 
0662     while (!list_empty(&offline_cgwbs)) {
0663         wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
0664                       offline_node);
0665         list_move(&wb->offline_node, &processed);
0666 
0667         /*
0668          * If wb is dirty, cleaning up the writeback by switching
0669          * attached inodes will result in an effective removal of any
0670          * bandwidth restrictions, which isn't the goal.  Instead,
0671          * the cleanup can be postponed until the next run, by which time
0672          * all I/O is likely to have completed.  If some inodes are
0673          * re-dirtied in the meantime, they will eventually be switched to
0674          * a new cgwb.
0675          */
0676         if (wb_has_dirty_io(wb))
0677             continue;
0678 
0679         if (!wb_tryget(wb))
0680             continue;
0681 
0682         spin_unlock_irq(&cgwb_lock);
0683         while (cleanup_offline_cgwb(wb))
0684             cond_resched();
0685         spin_lock_irq(&cgwb_lock);
0686 
0687         wb_put(wb);
0688     }
0689 
0690     if (!list_empty(&processed))
0691         list_splice_tail(&processed, &offline_cgwbs);
0692 
0693     spin_unlock_irq(&cgwb_lock);
0694 }
0695 
0696 /**
0697  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
0698  * @memcg: memcg being offlined
0699  *
0700  * Also prevents creation of any new wb's associated with @memcg.
0701  */
0702 void wb_memcg_offline(struct mem_cgroup *memcg)
0703 {
0704     struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
0705     struct bdi_writeback *wb, *next;
0706 
0707     spin_lock_irq(&cgwb_lock);
0708     list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
0709         cgwb_kill(wb);
0710     memcg_cgwb_list->next = NULL;   /* prevent new wb's */
0711     spin_unlock_irq(&cgwb_lock);
0712 
0713     queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
0714 }
0715 
0716 /**
0717  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
0718  * @css: blkcg being offlined
0719  *
0720  * Also prevents creation of any new wb's associated with @css.
0721  */
0722 void wb_blkcg_offline(struct cgroup_subsys_state *css)
0723 {
0724     struct bdi_writeback *wb, *next;
0725     struct list_head *list = blkcg_get_cgwb_list(css);
0726 
0727     spin_lock_irq(&cgwb_lock);
0728     list_for_each_entry_safe(wb, next, list, blkcg_node)
0729         cgwb_kill(wb);
0730     list->next = NULL;  /* prevent new wb's */
0731     spin_unlock_irq(&cgwb_lock);
0732 }
0733 
0734 static void cgwb_bdi_register(struct backing_dev_info *bdi)
0735 {
0736     spin_lock_irq(&cgwb_lock);
0737     list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
0738     spin_unlock_irq(&cgwb_lock);
0739 }
0740 
0741 static int __init cgwb_init(void)
0742 {
0743     /*
0744      * There can be many concurrent release work items overwhelming
0745      * system_wq.  Put them in a separate wq and limit concurrency.
0746      * There's no point in executing many of these in parallel.
0747      */
0748     cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
0749     if (!cgwb_release_wq)
0750         return -ENOMEM;
0751 
0752     return 0;
0753 }
0754 subsys_initcall(cgwb_init);
0755 
0756 #else   /* CONFIG_CGROUP_WRITEBACK */
0757 
0758 static int cgwb_bdi_init(struct backing_dev_info *bdi)
0759 {
0760     return wb_init(&bdi->wb, bdi, GFP_KERNEL);
0761 }
0762 
0763 static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
0764 
0765 static void cgwb_bdi_register(struct backing_dev_info *bdi)
0766 {
0767     list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
0768 }
0769 
0770 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
0771 {
0772     list_del_rcu(&wb->bdi_node);
0773 }
0774 
0775 #endif  /* CONFIG_CGROUP_WRITEBACK */
0776 
0777 int bdi_init(struct backing_dev_info *bdi)
0778 {
0779     int ret;
0780 
0781     bdi->dev = NULL;
0782 
0783     kref_init(&bdi->refcnt);
0784     bdi->min_ratio = 0;
0785     bdi->max_ratio = 100;
0786     bdi->max_prop_frac = FPROP_FRAC_BASE;
0787     INIT_LIST_HEAD(&bdi->bdi_list);
0788     INIT_LIST_HEAD(&bdi->wb_list);
0789     init_waitqueue_head(&bdi->wb_waitq);
0790 
0791     ret = cgwb_bdi_init(bdi);
0792 
0793     return ret;
0794 }
0795 
0796 struct backing_dev_info *bdi_alloc(int node_id)
0797 {
0798     struct backing_dev_info *bdi;
0799 
0800     bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
0801     if (!bdi)
0802         return NULL;
0803 
0804     if (bdi_init(bdi)) {
0805         kfree(bdi);
0806         return NULL;
0807     }
0808     bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
0809     bdi->ra_pages = VM_READAHEAD_PAGES;
0810     bdi->io_pages = VM_READAHEAD_PAGES;
0811     timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
0812     return bdi;
0813 }
0814 EXPORT_SYMBOL(bdi_alloc);
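/*
 * A minimal lifecycle sketch for a driver-owned bdi (the names "my_bdi",
 * "myfs%d" and "instance" are illustrative, not taken from this file):
 *
 *     struct backing_dev_info *my_bdi;
 *     int err;
 *
 *     my_bdi = bdi_alloc(NUMA_NO_NODE);
 *     if (!my_bdi)
 *         return -ENOMEM;
 *     err = bdi_register(my_bdi, "myfs%d", instance);
 *     if (err) {
 *         bdi_put(my_bdi);
 *         return err;
 *     }
 *     ...
 *     bdi_unregister(my_bdi);
 *     bdi_put(my_bdi);
 */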
0815 
0816 static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
0817 {
0818     struct rb_node **p = &bdi_tree.rb_node;
0819     struct rb_node *parent = NULL;
0820     struct backing_dev_info *bdi;
0821 
0822     lockdep_assert_held(&bdi_lock);
0823 
0824     while (*p) {
0825         parent = *p;
0826         bdi = rb_entry(parent, struct backing_dev_info, rb_node);
0827 
0828         if (bdi->id > id)
0829             p = &(*p)->rb_left;
0830         else if (bdi->id < id)
0831             p = &(*p)->rb_right;
0832         else
0833             break;
0834     }
0835 
0836     if (parentp)
0837         *parentp = parent;
0838     return p;
0839 }
0840 
0841 /**
0842  * bdi_get_by_id - lookup and get bdi from its id
0843  * @id: bdi id to lookup
0844  *
0845  * Find bdi matching @id and get it.  Returns NULL if the matching bdi
0846  * doesn't exist or is already unregistered.
0847  */
0848 struct backing_dev_info *bdi_get_by_id(u64 id)
0849 {
0850     struct backing_dev_info *bdi = NULL;
0851     struct rb_node **p;
0852 
0853     spin_lock_bh(&bdi_lock);
0854     p = bdi_lookup_rb_node(id, NULL);
0855     if (*p) {
0856         bdi = rb_entry(*p, struct backing_dev_info, rb_node);
0857         bdi_get(bdi);
0858     }
0859     spin_unlock_bh(&bdi_lock);
0860 
0861     return bdi;
0862 }
0863 
0864 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
0865 {
0866     struct device *dev;
0867     struct rb_node *parent, **p;
0868 
0869     if (bdi->dev)   /* The driver needs to use separate queues per device */
0870         return 0;
0871 
0872     vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
0873     dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
0874     if (IS_ERR(dev))
0875         return PTR_ERR(dev);
0876 
0877     cgwb_bdi_register(bdi);
0878     bdi->dev = dev;
0879 
0880     bdi_debug_register(bdi, dev_name(dev));
0881     set_bit(WB_registered, &bdi->wb.state);
0882 
0883     spin_lock_bh(&bdi_lock);
0884 
0885     bdi->id = ++bdi_id_cursor;
0886 
0887     p = bdi_lookup_rb_node(bdi->id, &parent);
0888     rb_link_node(&bdi->rb_node, parent, p);
0889     rb_insert_color(&bdi->rb_node, &bdi_tree);
0890 
0891     list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
0892 
0893     spin_unlock_bh(&bdi_lock);
0894 
0895     trace_writeback_bdi_register(bdi);
0896     return 0;
0897 }
0898 
0899 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
0900 {
0901     va_list args;
0902     int ret;
0903 
0904     va_start(args, fmt);
0905     ret = bdi_register_va(bdi, fmt, args);
0906     va_end(args);
0907     return ret;
0908 }
0909 EXPORT_SYMBOL(bdi_register);
0910 
0911 void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
0912 {
0913     WARN_ON_ONCE(bdi->owner);
0914     bdi->owner = owner;
0915     get_device(owner);
0916 }
0917 
0918 /*
0919  * Remove bdi from bdi_list, and ensure that it is no longer visible
0920  */
0921 static void bdi_remove_from_list(struct backing_dev_info *bdi)
0922 {
0923     spin_lock_bh(&bdi_lock);
0924     rb_erase(&bdi->rb_node, &bdi_tree);
0925     list_del_rcu(&bdi->bdi_list);
0926     spin_unlock_bh(&bdi_lock);
0927 
0928     synchronize_rcu_expedited();
0929 }
0930 
0931 void bdi_unregister(struct backing_dev_info *bdi)
0932 {
0933     del_timer_sync(&bdi->laptop_mode_wb_timer);
0934 
0935     /* make sure nobody finds us on the bdi_list anymore */
0936     bdi_remove_from_list(bdi);
0937     wb_shutdown(&bdi->wb);
0938     cgwb_bdi_unregister(bdi);
0939 
0940     /*
0941      * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
0942      * update the global bdi_min_ratio.
0943      */
0944     if (bdi->min_ratio)
0945         bdi_set_min_ratio(bdi, 0);
0946 
0947     if (bdi->dev) {
0948         bdi_debug_unregister(bdi);
0949         device_unregister(bdi->dev);
0950         bdi->dev = NULL;
0951     }
0952 
0953     if (bdi->owner) {
0954         put_device(bdi->owner);
0955         bdi->owner = NULL;
0956     }
0957 }
0958 EXPORT_SYMBOL(bdi_unregister);
0959 
0960 static void release_bdi(struct kref *ref)
0961 {
0962     struct backing_dev_info *bdi =
0963             container_of(ref, struct backing_dev_info, refcnt);
0964 
0965     WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
0966     WARN_ON_ONCE(bdi->dev);
0967     wb_exit(&bdi->wb);
0968     kfree(bdi);
0969 }
0970 
0971 void bdi_put(struct backing_dev_info *bdi)
0972 {
0973     kref_put(&bdi->refcnt, release_bdi);
0974 }
0975 EXPORT_SYMBOL(bdi_put);
0976 
0977 struct backing_dev_info *inode_to_bdi(struct inode *inode)
0978 {
0979     struct super_block *sb;
0980 
0981     if (!inode)
0982         return &noop_backing_dev_info;
0983 
0984     sb = inode->i_sb;
0985 #ifdef CONFIG_BLOCK
0986     if (sb_is_blkdev_sb(sb))
0987         return I_BDEV(inode)->bd_disk->bdi;
0988 #endif
0989     return sb->s_bdi;
0990 }
0991 EXPORT_SYMBOL(inode_to_bdi);
0992 
0993 const char *bdi_dev_name(struct backing_dev_info *bdi)
0994 {
0995     if (!bdi || !bdi->dev)
0996         return bdi_unknown_name;
0997     return bdi->dev_name;
0998 }
0999 EXPORT_SYMBOL_GPL(bdi_dev_name);