/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>        /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include "internal.h"


static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);

static char *sb_writers_name[SB_FREEZE_LEVELS] = {
    "sb_writers",
    "sb_pagefaults",
    "sb_internal",
};

/*
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
 * take a passive reference to the superblock to avoid this from occurring.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
                      struct shrink_control *sc)
{
    struct super_block *sb;
    long    fs_objects = 0;
    long    total_objects;
    long    freed = 0;
    long    dentries;
    long    inodes;

    sb = container_of(shrink, struct super_block, s_shrink);

    /*
     * Deadlock avoidance.  We may hold various FS locks, and we don't want
     * to recurse into the FS that called us in clear_inode() and friends..
     */
    if (!(sc->gfp_mask & __GFP_FS))
        return SHRINK_STOP;

    if (!trylock_super(sb))
        return SHRINK_STOP;

    if (sb->s_op->nr_cached_objects)
        fs_objects = sb->s_op->nr_cached_objects(sb, sc);

    inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
    dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
    total_objects = dentries + inodes + fs_objects + 1;
    if (!total_objects)
        total_objects = 1;

    /* proportion the scan between the caches */
    dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
    inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
    fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

    /*
     * prune the dcache first as the icache is pinned by it, then
     * prune the icache, followed by the filesystem specific caches
     *
     * Ensure that we always scan at least one object - memcg kmem
     * accounting uses this to fully empty the caches.
     */
    sc->nr_to_scan = dentries + 1;
    freed = prune_dcache_sb(sb, sc);
    sc->nr_to_scan = inodes + 1;
    freed += prune_icache_sb(sb, sc);

    if (fs_objects) {
        sc->nr_to_scan = fs_objects + 1;
        freed += sb->s_op->free_cached_objects(sb, sc);
    }

    up_read(&sb->s_umount);
    return freed;
}

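/*
 * Worked example of the proportional split above (illustrative numbers,
 * not from the source): with sc->nr_to_scan = 128, dentries = 600,
 * inodes = 300 and fs_objects = 100, total_objects = 1001, so the scan
 * is split as mult_frac(128, 600, 1001) = 76 dentries,
 * mult_frac(128, 300, 1001) = 38 inodes and mult_frac(128, 100, 1001)
 * = 12 fs objects; each is then bumped by one so every cache sees at
 * least one scanned object.
 */
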
static unsigned long super_cache_count(struct shrinker *shrink,
                       struct shrink_control *sc)
{
    struct super_block *sb;
    long    total_objects = 0;

    sb = container_of(shrink, struct super_block, s_shrink);

    /*
     * Don't call trylock_super as it is a potential
     * scalability bottleneck. The counts could get updated
     * between super_cache_count and super_cache_scan anyway.
     * A call to super_cache_count with shrinker_rwsem held
     * ensures the safety of the calls to list_lru_shrink_count()
     * and s_op->nr_cached_objects().
     */
    if (sb->s_op && sb->s_op->nr_cached_objects)
        total_objects = sb->s_op->nr_cached_objects(sb, sc);

    total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
    total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

    total_objects = vfs_pressure_ratio(total_objects);
    return total_objects;
}

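/*
 * A filesystem opts into the fs-specific part of this shrinker by filling
 * in the two hooks used above. A minimal sketch, with hypothetical
 * "examplefs" names that are not part of this file:
 *
 *	static long examplefs_nr_cached_objects(struct super_block *sb,
 *						struct shrink_control *sc)
 *	{
 *		return examplefs_count_reclaimable(sb);	// hypothetical helper
 *	}
 *
 *	static long examplefs_free_cached_objects(struct super_block *sb,
 *						  struct shrink_control *sc)
 *	{
 *		return examplefs_reclaim(sb, sc->nr_to_scan); // hypothetical
 *	}
 *
 *	static const struct super_operations examplefs_sops = {
 *		.nr_cached_objects	= examplefs_nr_cached_objects,
 *		.free_cached_objects	= examplefs_free_cached_objects,
 *	};
 *
 * super_cache_scan() only calls ->free_cached_objects() when the scaled
 * fs_objects count is non-zero, so the two hooks should be provided as a
 * pair.
 */
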
static void destroy_super_work(struct work_struct *work)
{
    struct super_block *s = container_of(work, struct super_block,
                            destroy_work);
    int i;

    for (i = 0; i < SB_FREEZE_LEVELS; i++)
        percpu_free_rwsem(&s->s_writers.rw_sem[i]);
    kfree(s);
}

static void destroy_super_rcu(struct rcu_head *head)
{
    struct super_block *s = container_of(head, struct super_block, rcu);
    INIT_WORK(&s->destroy_work, destroy_super_work);
    schedule_work(&s->destroy_work);
}

/**
 *  destroy_super   -   frees a superblock
 *  @s: superblock to free
 *
 *  Frees a superblock.
 */
static void destroy_super(struct super_block *s)
{
    list_lru_destroy(&s->s_dentry_lru);
    list_lru_destroy(&s->s_inode_lru);
    security_sb_free(s);
    WARN_ON(!list_empty(&s->s_mounts));
    put_user_ns(s->s_user_ns);
    kfree(s->s_subtype);
    kfree(s->s_options);
    call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 *  alloc_super -   create new superblock
 *  @type:  filesystem type superblock should belong to
 *  @flags: the mount flags
 *  @user_ns: User namespace for the super_block
 *
 *  Allocates and initializes a new &struct super_block.  alloc_super()
 *  returns a pointer to a new superblock or %NULL if allocation failed.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
                       struct user_namespace *user_ns)
{
    struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
    static const struct super_operations default_op;
    int i;

    if (!s)
        return NULL;

    INIT_LIST_HEAD(&s->s_mounts);
    s->s_user_ns = get_user_ns(user_ns);

    if (security_sb_alloc(s))
        goto fail;

    for (i = 0; i < SB_FREEZE_LEVELS; i++) {
        if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                    sb_writers_name[i],
                    &type->s_writers_key[i]))
            goto fail;
    }
    init_waitqueue_head(&s->s_writers.wait_unfrozen);
    s->s_bdi = &noop_backing_dev_info;
    s->s_flags = flags;
    if (s->s_user_ns != &init_user_ns)
        s->s_iflags |= SB_I_NODEV;
    INIT_HLIST_NODE(&s->s_instances);
    INIT_HLIST_BL_HEAD(&s->s_anon);
    mutex_init(&s->s_sync_lock);
    INIT_LIST_HEAD(&s->s_inodes);
    spin_lock_init(&s->s_inode_list_lock);
    INIT_LIST_HEAD(&s->s_inodes_wb);
    spin_lock_init(&s->s_inode_wblist_lock);

    if (list_lru_init_memcg(&s->s_dentry_lru))
        goto fail;
    if (list_lru_init_memcg(&s->s_inode_lru))
        goto fail;

    init_rwsem(&s->s_umount);
    lockdep_set_class(&s->s_umount, &type->s_umount_key);
    /*
     * sget() can have s_umount recursion.
     *
     * When it cannot find a suitable sb, it allocates a new
     * one (this one), and tries again to find a suitable old
     * one.
     *
     * In case that succeeds, it will acquire the s_umount
     * lock of the old one. Since these are clearly distinct
     * locks, and this object isn't exposed yet, there's no
     * risk of deadlocks.
     *
     * Annotate this by putting this lock in a different
     * subclass.
     */
    down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
    s->s_count = 1;
    atomic_set(&s->s_active, 1);
    mutex_init(&s->s_vfs_rename_mutex);
    lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
    mutex_init(&s->s_dquot.dqio_mutex);
    s->s_maxbytes = MAX_NON_LFS;
    s->s_op = &default_op;
    s->s_time_gran = 1000000000;
    s->cleancache_poolid = CLEANCACHE_NO_POOL;

    s->s_shrink.seeks = DEFAULT_SEEKS;
    s->s_shrink.scan_objects = super_cache_scan;
    s->s_shrink.count_objects = super_cache_count;
    s->s_shrink.batch = 1024;
    s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
    return s;

fail:
    destroy_super(s);
    return NULL;
}

/* Superblock refcounting */

/*
 * Drop a superblock's refcount.  The caller must hold sb_lock.
 */
static void __put_super(struct super_block *sb)
{
    if (!--sb->s_count) {
        list_del_init(&sb->s_list);
        destroy_super(sb);
    }
}

/**
 *  put_super   -   drop a temporary reference to superblock
 *  @sb: superblock in question
 *
 *  Drops a temporary reference, frees the superblock if there are no
 *  references left.
 */
static void put_super(struct super_block *sb)
{
    spin_lock(&sb_lock);
    __put_super(sb);
    spin_unlock(&sb_lock);
}


/**
 *  deactivate_locked_super -   drop an active reference to superblock
 *  @s: superblock to deactivate
 *
 *  Drops an active reference to superblock, converting it into a temporary
 *  one if there are no other active references left.  In that case we
 *  tell the fs driver to shut it down and drop the temporary reference we
 *  had just acquired.
 *
 *  Caller holds exclusive lock on superblock; that lock is released.
 */
void deactivate_locked_super(struct super_block *s)
{
    struct file_system_type *fs = s->s_type;
    if (atomic_dec_and_test(&s->s_active)) {
        cleancache_invalidate_fs(s);
        unregister_shrinker(&s->s_shrink);
        fs->kill_sb(s);

        /*
         * Since list_lru_destroy() may sleep, we cannot call it from
         * put_super(), where we hold the sb_lock. Therefore we destroy
         * the lru lists right now.
         */
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);

        put_filesystem(fs);
        put_super(s);
    } else {
        up_write(&s->s_umount);
    }
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 *  deactivate_super    -   drop an active reference to superblock
 *  @s: superblock to deactivate
 *
 *  Variant of deactivate_locked_super(), except that superblock is *not*
 *  locked by caller.  If we are going to drop the final active reference,
 *  lock will be acquired prior to that.
 */
void deactivate_super(struct super_block *s)
{
    if (!atomic_add_unless(&s->s_active, -1, 1)) {
        down_write(&s->s_umount);
        deactivate_locked_super(s);
    }
}

EXPORT_SYMBOL(deactivate_super);

/**
 *  grab_super - acquire an active reference
 *  @s: reference we are trying to make active
 *
 *  Tries to acquire an active reference.  grab_super() is used when we
 *  had just found a superblock in super_blocks or fs_type->fs_supers
 *  and want to turn it into a full-blown active reference.  grab_super()
 *  is called with sb_lock held and drops it.  Returns 1 in case of
 *  success, 0 if we failed (superblock contents were already dead or
 *  dying when grab_super() was called).  Note that this is only
 *  called for superblocks not in rundown mode (== ones still on ->fs_supers
 *  of their type), so increment of ->s_count is OK here.
 */
static int grab_super(struct super_block *s) __releases(sb_lock)
{
    s->s_count++;
    spin_unlock(&sb_lock);
    down_write(&s->s_umount);
    if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
        put_super(s);
        return 1;
    }
    up_write(&s->s_umount);
    put_super(s);
    return 0;
}

/*
 *  trylock_super - try to grab ->s_umount shared
 *  @sb: reference we are trying to grab
 *
 *  Try to prevent fs shutdown.  This is used in places where we
 *  cannot take an active reference but we need to ensure that the
 *  filesystem is not shut down while we are working on it. It returns
 *  false if we cannot acquire s_umount or if we lose the race and the
 *  filesystem already got into shutdown, and returns true with the s_umount
 *  lock held in read mode in case of success. On successful return,
 *  the caller must drop the s_umount lock when done.
 *
 *  Note that unlike get_super() et al. this one does *not* bump ->s_count.
 *  The reason why it's safe is that we are OK with doing trylock instead
 *  of down_read().  There are a couple of places that are OK with that, but
 *  it's very much not a general-purpose interface.
 */
bool trylock_super(struct super_block *sb)
{
    if (down_read_trylock(&sb->s_umount)) {
        if (!hlist_unhashed(&sb->s_instances) &&
            sb->s_root && (sb->s_flags & MS_BORN))
            return true;
        up_read(&sb->s_umount);
    }

    return false;
}

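/*
 * Example use (illustrative sketch, not part of this file): code that
 * walks per-sb caches without holding an active reference, such as the
 * shrinker above, brackets its work like this:
 *
 *	if (!trylock_super(sb))
 *		return SHRINK_STOP;	// fs going away or lock contended
 *	...work on sb's caches...
 *	up_read(&sb->s_umount);
 */
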
/**
 *  generic_shutdown_super  -   common helper for ->kill_sb()
 *  @sb: superblock to kill
 *
 *  generic_shutdown_super() does all fs-independent work on superblock
 *  shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *  that need destruction out of the superblock, call
 *  generic_shutdown_super() and release the aforementioned objects.
 *  Note: dentries and inodes _are_ taken care of and do not need
 *  specific handling.
 *
 *  Upon calling this function, the filesystem may no longer alter or
 *  rearrange the set of dentries belonging to this super_block, nor may it
 *  change the attachments of dentries to inodes.
 */
void generic_shutdown_super(struct super_block *sb)
{
    const struct super_operations *sop = sb->s_op;

    if (sb->s_root) {
        shrink_dcache_for_umount(sb);
        sync_filesystem(sb);
        sb->s_flags &= ~MS_ACTIVE;

        fsnotify_unmount_inodes(sb);
        cgroup_writeback_umount();

        evict_inodes(sb);

        if (sb->s_dio_done_wq) {
            destroy_workqueue(sb->s_dio_done_wq);
            sb->s_dio_done_wq = NULL;
        }

        if (sop->put_super)
            sop->put_super(sb);

        if (!list_empty(&sb->s_inodes)) {
            printk("VFS: Busy inodes after unmount of %s. "
               "Self-destruct in 5 seconds.  Have a nice day...\n",
               sb->s_id);
        }
    }
    spin_lock(&sb_lock);
    /* should be initialized for __put_super_and_need_restart() */
    hlist_del_init(&sb->s_instances);
    spin_unlock(&sb_lock);
    up_write(&sb->s_umount);
}

EXPORT_SYMBOL(generic_shutdown_super);

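/*
 * A typical ->kill_sb() built on generic_shutdown_super() (illustrative
 * sketch with hypothetical "examplefs" names, not part of this file):
 *
 *	static void examplefs_kill_sb(struct super_block *sb)
 *	{
 *		struct examplefs_info *info = sb->s_fs_info; // hypothetical
 *
 *		generic_shutdown_super(sb);
 *		kfree(info);	// fs-private data is freed after shutdown
 *	}
 *
 * Block-device based filesystems usually use kill_block_super() below,
 * which wraps generic_shutdown_super() and releases the bdev as well.
 */
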
/**
 *  sget_userns -   find or create a superblock
 *  @type:  filesystem type superblock should belong to
 *  @test:  comparison callback
 *  @set:   setup callback
 *  @flags: mount flags
 *  @user_ns: User namespace for the super_block
 *  @data:  argument to each of them
 */
struct super_block *sget_userns(struct file_system_type *type,
            int (*test)(struct super_block *,void *),
            int (*set)(struct super_block *,void *),
            int flags, struct user_namespace *user_ns,
            void *data)
{
    struct super_block *s = NULL;
    struct super_block *old;
    int err;

    if (!(flags & MS_KERNMOUNT) &&
        !(type->fs_flags & FS_USERNS_MOUNT) &&
        !capable(CAP_SYS_ADMIN))
        return ERR_PTR(-EPERM);
retry:
    spin_lock(&sb_lock);
    if (test) {
        hlist_for_each_entry(old, &type->fs_supers, s_instances) {
            if (!test(old, data))
                continue;
            if (user_ns != old->s_user_ns) {
                spin_unlock(&sb_lock);
                if (s) {
                    up_write(&s->s_umount);
                    destroy_super(s);
                }
                return ERR_PTR(-EBUSY);
            }
            if (!grab_super(old))
                goto retry;
            if (s) {
                up_write(&s->s_umount);
                destroy_super(s);
                s = NULL;
            }
            return old;
        }
    }
    if (!s) {
        spin_unlock(&sb_lock);
        s = alloc_super(type, flags, user_ns);
        if (!s)
            return ERR_PTR(-ENOMEM);
        goto retry;
    }

    err = set(s, data);
    if (err) {
        spin_unlock(&sb_lock);
        up_write(&s->s_umount);
        destroy_super(s);
        return ERR_PTR(err);
    }
    s->s_type = type;
    strlcpy(s->s_id, type->name, sizeof(s->s_id));
    list_add_tail(&s->s_list, &super_blocks);
    hlist_add_head(&s->s_instances, &type->fs_supers);
    spin_unlock(&sb_lock);
    get_filesystem(type);
    register_shrinker(&s->s_shrink);
    return s;
}

EXPORT_SYMBOL(sget_userns);

/**
 *  sget    -   find or create a superblock
 *  @type:    filesystem type superblock should belong to
 *  @test:    comparison callback
 *  @set:     setup callback
 *  @flags:   mount flags
 *  @data:    argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
            int (*test)(struct super_block *,void *),
            int (*set)(struct super_block *,void *),
            int flags,
            void *data)
{
    struct user_namespace *user_ns = current_user_ns();

    /* Ensure the requestor has permissions over the target filesystem */
    if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
        return ERR_PTR(-EPERM);

    return sget_userns(type, test, set, flags, user_ns, data);
}

EXPORT_SYMBOL(sget);

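/*
 * The @test/@set callback pair is how sget() decides whether two mounts
 * share one superblock: @test compares an existing sb against @data, and
 * @set initializes a freshly allocated sb from @data.  See
 * test_bdev_super()/set_bdev_super() and ns_test_super()/ns_set_super()
 * later in this file for the canonical pairs. A minimal sketch keyed on
 * a private pointer (hypothetical names, not part of this file):
 *
 *	static int examplefs_test_super(struct super_block *s, void *data)
 *	{
 *		return s->s_fs_info == data;	// same backing object?
 *	}
 *
 *	static int examplefs_set_super(struct super_block *s, void *data)
 *	{
 *		s->s_fs_info = data;		// first mount: claim it
 *		return set_anon_super(s, NULL);
 *	}
 */
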
void drop_super(struct super_block *sb)
{
    up_read(&sb->s_umount);
    put_super(sb);
}

EXPORT_SYMBOL(drop_super);

void drop_super_exclusive(struct super_block *sb)
{
    up_write(&sb->s_umount);
    put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

/**
 *  iterate_supers - call function for all active superblocks
 *  @f: function to call
 *  @arg: argument to pass to it
 *
 *  Scans the superblock list and calls the given function, passing it a
 *  locked superblock and the given argument.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
    struct super_block *sb, *p = NULL;

    spin_lock(&sb_lock);
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (hlist_unhashed(&sb->s_instances))
            continue;
        sb->s_count++;
        spin_unlock(&sb_lock);

        down_read(&sb->s_umount);
        if (sb->s_root && (sb->s_flags & MS_BORN))
            f(sb, arg);
        up_read(&sb->s_umount);

        spin_lock(&sb_lock);
        if (p)
            __put_super(p);
        p = sb;
    }
    if (p)
        __put_super(p);
    spin_unlock(&sb_lock);
}

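/*
 * Example callback (illustrative sketch, not part of this file):
 * emergency sync walks every superblock roughly like this; the sb is
 * pinned and read-locked when the callback runs:
 *
 *	static void sync_one_sb(struct super_block *sb, void *arg)
 *	{
 *		if (!(sb->s_flags & MS_RDONLY))
 *			sync_filesystem(sb);
 *	}
 *
 *	iterate_supers(sync_one_sb, NULL);
 */
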
/**
 *  iterate_supers_type - call function for superblocks of given type
 *  @type: fs type
 *  @f: function to call
 *  @arg: argument to pass to it
 *
 *  Scans the superblock list and calls the given function, passing it a
 *  locked superblock and the given argument.
 */
void iterate_supers_type(struct file_system_type *type,
    void (*f)(struct super_block *, void *), void *arg)
{
    struct super_block *sb, *p = NULL;

    spin_lock(&sb_lock);
    hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
        sb->s_count++;
        spin_unlock(&sb_lock);

        down_read(&sb->s_umount);
        if (sb->s_root && (sb->s_flags & MS_BORN))
            f(sb, arg);
        up_read(&sb->s_umount);

        spin_lock(&sb_lock);
        if (p)
            __put_super(p);
        p = sb;
    }
    if (p)
        __put_super(p);
    spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

static struct super_block *__get_super(struct block_device *bdev, bool excl)
{
    struct super_block *sb;

    if (!bdev)
        return NULL;

    spin_lock(&sb_lock);
rescan:
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (hlist_unhashed(&sb->s_instances))
            continue;
        if (sb->s_bdev == bdev) {
            sb->s_count++;
            spin_unlock(&sb_lock);
            if (!excl)
                down_read(&sb->s_umount);
            else
                down_write(&sb->s_umount);
            /* still alive? */
            if (sb->s_root && (sb->s_flags & MS_BORN))
                return sb;
            if (!excl)
                up_read(&sb->s_umount);
            else
                up_write(&sb->s_umount);
            /* nope, got unmounted */
            spin_lock(&sb_lock);
            __put_super(sb);
            goto rescan;
        }
    }
    spin_unlock(&sb_lock);
    return NULL;
}

/**
 *  get_super - get the superblock of a device
 *  @bdev: device to get the superblock for
 *
 *  Scans the superblock list and finds the superblock of the file system
 *  mounted on the device given. %NULL is returned if no match is found.
 */
struct super_block *get_super(struct block_device *bdev)
{
    return __get_super(bdev, false);
}
EXPORT_SYMBOL(get_super);

static struct super_block *__get_super_thawed(struct block_device *bdev,
                          bool excl)
{
    while (1) {
        struct super_block *s = __get_super(bdev, excl);
        if (!s || s->s_writers.frozen == SB_UNFROZEN)
            return s;
        if (!excl)
            up_read(&s->s_umount);
        else
            up_write(&s->s_umount);
        wait_event(s->s_writers.wait_unfrozen,
               s->s_writers.frozen == SB_UNFROZEN);
        put_super(s);
    }
}

/**
 *  get_super_thawed - get thawed superblock of a device
 *  @bdev: device to get the superblock for
 *
 *  Scans the superblock list and finds the superblock of the file system
 *  mounted on the device. The superblock is returned once it is thawed
 *  (or immediately if it was not frozen). %NULL is returned if no match
 *  is found.
 */
struct super_block *get_super_thawed(struct block_device *bdev)
{
    return __get_super_thawed(bdev, false);
}
EXPORT_SYMBOL(get_super_thawed);

/**
 *  get_super_exclusive_thawed - get thawed superblock of a device
 *  @bdev: device to get the superblock for
 *
 *  Scans the superblock list and finds the superblock of the file system
 *  mounted on the device. The superblock is returned once it is thawed
 *  (or immediately if it was not frozen) and the s_umount semaphore is
 *  held in exclusive mode. %NULL is returned if no match is found.
 */
struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
{
    return __get_super_thawed(bdev, true);
}
EXPORT_SYMBOL(get_super_exclusive_thawed);

/**
 * get_active_super - get an active reference to the superblock of a device
 * @bdev: device to get the superblock for
 *
 * Scans the superblock list and finds the superblock of the file system
 * mounted on the device given.  Returns the superblock with an active
 * reference or %NULL if none was found.
 */
struct super_block *get_active_super(struct block_device *bdev)
{
    struct super_block *sb;

    if (!bdev)
        return NULL;

restart:
    spin_lock(&sb_lock);
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (hlist_unhashed(&sb->s_instances))
            continue;
        if (sb->s_bdev == bdev) {
            if (!grab_super(sb))
                goto restart;
            up_write(&sb->s_umount);
            return sb;
        }
    }
    spin_unlock(&sb_lock);
    return NULL;
}

struct super_block *user_get_super(dev_t dev)
{
    struct super_block *sb;

    spin_lock(&sb_lock);
rescan:
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (hlist_unhashed(&sb->s_instances))
            continue;
        if (sb->s_dev == dev) {
            sb->s_count++;
            spin_unlock(&sb_lock);
            down_read(&sb->s_umount);
            /* still alive? */
            if (sb->s_root && (sb->s_flags & MS_BORN))
                return sb;
            up_read(&sb->s_umount);
            /* nope, got unmounted */
            spin_lock(&sb_lock);
            __put_super(sb);
            goto rescan;
        }
    }
    spin_unlock(&sb_lock);
    return NULL;
}

/**
 *  do_remount_sb - asks filesystem to change mount options.
 *  @sb:    superblock in question
 *  @flags: numeric part of options
 *  @data:  the rest of options
 *  @force: whether or not to force the change
 *
 *  Alters the mount options of a mounted file system.
 */
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
    int retval;
    int remount_ro;

    if (sb->s_writers.frozen != SB_UNFROZEN)
        return -EBUSY;

#ifdef CONFIG_BLOCK
    if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
        return -EACCES;
#endif

    remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);

    if (remount_ro) {
        if (!hlist_empty(&sb->s_pins)) {
            up_write(&sb->s_umount);
            group_pin_kill(&sb->s_pins);
            down_write(&sb->s_umount);
            if (!sb->s_root)
                return 0;
            if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;
            remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
        }
    }
    shrink_dcache_sb(sb);

    /* If we are remounting RDONLY and current sb is read/write,
       make sure there are no rw files opened */
    if (remount_ro) {
        if (force) {
            sb->s_readonly_remount = 1;
            smp_wmb();
        } else {
            retval = sb_prepare_remount_readonly(sb);
            if (retval)
                return retval;
        }
    }

    if (sb->s_op->remount_fs) {
        retval = sb->s_op->remount_fs(sb, &flags, data);
        if (retval) {
            if (!force)
                goto cancel_readonly;
            /* If forced remount, go ahead despite any errors */
            WARN(1, "forced remount of a %s fs returned %i\n",
                 sb->s_type->name, retval);
        }
    }
    sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
    /* Needs to be ordered wrt mnt_is_readonly() */
    smp_wmb();
    sb->s_readonly_remount = 0;

    /*
     * Some filesystems modify their metadata via some other path than the
     * bdev buffer cache (eg. use a private mapping, or directories in
     * pagecache, etc). Also file data modifications go via their own
     * mappings. So if we try to mount readonly then copy the filesystem
     * from bdev, we could get stale data, so invalidate it to give a best
     * effort at coherency.
     */
    if (remount_ro && sb->s_bdev)
        invalidate_bdev(sb->s_bdev);
    return 0;

cancel_readonly:
    sb->s_readonly_remount = 0;
    return retval;
}

static void do_emergency_remount(struct work_struct *work)
{
    struct super_block *sb, *p = NULL;

    spin_lock(&sb_lock);
    list_for_each_entry(sb, &super_blocks, s_list) {
        if (hlist_unhashed(&sb->s_instances))
            continue;
        sb->s_count++;
        spin_unlock(&sb_lock);
        down_write(&sb->s_umount);
        if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
            !(sb->s_flags & MS_RDONLY)) {
            /*
             * What lock protects sb->s_flags??
             */
            do_remount_sb(sb, MS_RDONLY, NULL, 1);
        }
        up_write(&sb->s_umount);
        spin_lock(&sb_lock);
        if (p)
            __put_super(p);
        p = sb;
    }
    if (p)
        __put_super(p);
    spin_unlock(&sb_lock);
    kfree(work);
    printk("Emergency Remount complete\n");
}

void emergency_remount(void)
{
    struct work_struct *work;

    work = kmalloc(sizeof(*work), GFP_ATOMIC);
    if (work) {
        INIT_WORK(work, do_emergency_remount);
        schedule_work(work);
    }
}

/*
 * Unnamed block devices are dummy devices used by virtual
 * filesystems which don't use real block devices.  -- jrs
 */

static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock); /* protects the above */
/* Many userspace utilities consider an FSID of 0 invalid.
 * Always return at least 1 from get_anon_bdev.
 */
static int unnamed_dev_start = 1;

int get_anon_bdev(dev_t *p)
{
    int dev;
    int error;

 retry:
    if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
        return -ENOMEM;
    spin_lock(&unnamed_dev_lock);
    error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
    if (!error)
        unnamed_dev_start = dev + 1;
    spin_unlock(&unnamed_dev_lock);
    if (error == -EAGAIN)
        /* We raced and lost with another CPU. */
        goto retry;
    else if (error)
        return -EAGAIN;

    if (dev >= (1 << MINORBITS)) {
        spin_lock(&unnamed_dev_lock);
        ida_remove(&unnamed_dev_ida, dev);
        if (unnamed_dev_start > dev)
            unnamed_dev_start = dev;
        spin_unlock(&unnamed_dev_lock);
        return -EMFILE;
    }
    *p = MKDEV(0, dev & MINORMASK);
    return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

void free_anon_bdev(dev_t dev)
{
    int slot = MINOR(dev);
    spin_lock(&unnamed_dev_lock);
    ida_remove(&unnamed_dev_ida, slot);
    if (slot < unnamed_dev_start)
        unnamed_dev_start = slot;
    spin_unlock(&unnamed_dev_lock);
}
EXPORT_SYMBOL(free_anon_bdev);

int set_anon_super(struct super_block *s, void *data)
{
    return get_anon_bdev(&s->s_dev);
}

EXPORT_SYMBOL(set_anon_super);

void kill_anon_super(struct super_block *sb)
{
    dev_t dev = sb->s_dev;
    generic_shutdown_super(sb);
    free_anon_bdev(dev);
}

EXPORT_SYMBOL(kill_anon_super);

void kill_litter_super(struct super_block *sb)
{
    if (sb->s_root)
        d_genocide(sb->s_root);
    kill_anon_super(sb);
}

EXPORT_SYMBOL(kill_litter_super);

static int ns_test_super(struct super_block *sb, void *data)
{
    return sb->s_fs_info == data;
}

static int ns_set_super(struct super_block *sb, void *data)
{
    sb->s_fs_info = data;
    return set_anon_super(sb, NULL);
}

struct dentry *mount_ns(struct file_system_type *fs_type,
    int flags, void *data, void *ns, struct user_namespace *user_ns,
    int (*fill_super)(struct super_block *, void *, int))
{
    struct super_block *sb;

    /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
     * over the namespace.
     */
    if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
        return ERR_PTR(-EPERM);

    sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
             user_ns, ns);
    if (IS_ERR(sb))
        return ERR_CAST(sb);

    if (!sb->s_root) {
        int err;
        err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
        if (err) {
            deactivate_locked_super(sb);
            return ERR_PTR(err);
        }

        sb->s_flags |= MS_ACTIVE;
    }

    return dget(sb->s_root);
}

EXPORT_SYMBOL(mount_ns);

#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
    s->s_bdev = data;
    s->s_dev = s->s_bdev->bd_dev;

    /*
     * We set the bdi here to the queue backing; file systems can
     * overwrite this in ->fill_super()
     */
    s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
    return 0;
}

static int test_bdev_super(struct super_block *s, void *data)
{
    return (void *)s->s_bdev == data;
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *data,
    int (*fill_super)(struct super_block *, void *, int))
{
    struct block_device *bdev;
    struct super_block *s;
    fmode_t mode = FMODE_READ | FMODE_EXCL;
    int error = 0;

    if (!(flags & MS_RDONLY))
        mode |= FMODE_WRITE;

    bdev = blkdev_get_by_path(dev_name, mode, fs_type);
    if (IS_ERR(bdev))
        return ERR_CAST(bdev);

    /*
     * once the super is inserted into the list by sget, s_umount
     * will protect the lockfs code from trying to start a snapshot
     * while we are mounting
     */
    mutex_lock(&bdev->bd_fsfreeze_mutex);
    if (bdev->bd_fsfreeze_count > 0) {
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        error = -EBUSY;
        goto error_bdev;
    }
    s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
         bdev);
    mutex_unlock(&bdev->bd_fsfreeze_mutex);
    if (IS_ERR(s))
        goto error_s;

    if (s->s_root) {
        if ((flags ^ s->s_flags) & MS_RDONLY) {
            deactivate_locked_super(s);
            error = -EBUSY;
            goto error_bdev;
        }

        /*
         * s_umount nests inside bd_mutex during
         * __invalidate_device().  blkdev_put() acquires
         * bd_mutex and can't be called under s_umount.  Drop
         * s_umount temporarily.  This is safe as we're
         * holding an active reference.
         */
        up_write(&s->s_umount);
        blkdev_put(bdev, mode);
        down_write(&s->s_umount);
    } else {
        s->s_mode = mode;
        snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
        sb_set_blocksize(s, block_size(bdev));
        error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
        if (error) {
            deactivate_locked_super(s);
            goto error;
        }

        s->s_flags |= MS_ACTIVE;
        bdev->bd_super = s;
    }

    return dget(s->s_root);

error_s:
    error = PTR_ERR(s);
error_bdev:
    blkdev_put(bdev, mode);
error:
    return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);

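/*
 * A block-device filesystem plugs mount_bdev() into its file_system_type
 * (illustrative sketch with hypothetical "examplefs" names, not part of
 * this file):
 *
 *	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
 *			int flags, const char *dev_name, void *data)
 *	{
 *		return mount_bdev(fs_type, flags, dev_name, data,
 *				  examplefs_fill_super);	// hypothetical
 *	}
 *
 *	static struct file_system_type examplefs_type = {
 *		.owner		= THIS_MODULE,
 *		.name		= "examplefs",
 *		.mount		= examplefs_mount,
 *		.kill_sb	= kill_block_super,
 *		.fs_flags	= FS_REQUIRES_DEV,
 *	};
 */
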
void kill_block_super(struct super_block *sb)
{
    struct block_device *bdev = sb->s_bdev;
    fmode_t mode = sb->s_mode;

    bdev->bd_super = NULL;
    generic_shutdown_super(sb);
    sync_blockdev(bdev);
    WARN_ON_ONCE(!(mode & FMODE_EXCL));
    blkdev_put(bdev, mode | FMODE_EXCL);
}

EXPORT_SYMBOL(kill_block_super);
#endif

struct dentry *mount_nodev(struct file_system_type *fs_type,
    int flags, void *data,
    int (*fill_super)(struct super_block *, void *, int))
{
    int error;
    struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

    if (IS_ERR(s))
        return ERR_CAST(s);

    error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
    if (error) {
        deactivate_locked_super(s);
        return ERR_PTR(error);
    }
    s->s_flags |= MS_ACTIVE;
    return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);

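/*
 * mount_nodev() passes a NULL @test to sget(), so every mount gets a
 * fresh superblock backed by an anonymous dev_t; compare mount_single()
 * below, which shares one superblock across all mounts. A caller looks
 * like this (illustrative sketch, hypothetical names):
 *
 *	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
 *			int flags, const char *dev_name, void *data)
 *	{
 *		return mount_nodev(fs_type, flags, data, examplefs_fill_super);
 *	}
 */
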
static int compare_single(struct super_block *s, void *p)
{
    return 1;
}

struct dentry *mount_single(struct file_system_type *fs_type,
    int flags, void *data,
    int (*fill_super)(struct super_block *, void *, int))
{
    struct super_block *s;
    int error;

    s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
    if (IS_ERR(s))
        return ERR_CAST(s);
    if (!s->s_root) {
        error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
        if (error) {
            deactivate_locked_super(s);
            return ERR_PTR(error);
        }
        s->s_flags |= MS_ACTIVE;
    } else {
        do_remount_sb(s, flags, data, 0);
    }
    return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);

struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
    struct dentry *root;
    struct super_block *sb;
    char *secdata = NULL;
    int error = -ENOMEM;

    if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
        secdata = alloc_secdata();
        if (!secdata)
            goto out;

        error = security_sb_copy_data(data, secdata);
        if (error)
            goto out_free_secdata;
    }

    root = type->mount(type, flags, name, data);
    if (IS_ERR(root)) {
        error = PTR_ERR(root);
        goto out_free_secdata;
    }
    sb = root->d_sb;
    BUG_ON(!sb);
    WARN_ON(!sb->s_bdi);
    sb->s_flags |= MS_BORN;

    error = security_sb_kern_mount(sb, flags, secdata);
    if (error)
        goto out_sb;

    /*
     * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
     * but s_maxbytes was an unsigned long long for many releases. Throw
     * this warning for a little while to try and catch filesystems that
     * violate this rule.
     */
    WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
        "negative value (%lld)\n", type->name, sb->s_maxbytes);

    up_write(&sb->s_umount);
    free_secdata(secdata);
    return root;
out_sb:
    dput(root);
    deactivate_locked_super(sb);
out_free_secdata:
    free_secdata(secdata);
out:
    return ERR_PTR(error);
}

/*
 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
 * instead.
 */
void __sb_end_write(struct super_block *sb, int level)
{
    percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);

/*
 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
    bool force_trylock = false;
    int ret = 1;

#ifdef CONFIG_LOCKDEP
    /*
     * We want lockdep to tell us about possible deadlocks with freezing
     * but it's a bit tricky to properly instrument it. Getting freeze
     * protection works like getting a read lock, but there are subtle
     * problems. XFS for example gets freeze protection on internal level
     * twice in some cases, which is OK only because we already hold a
     * freeze protection also on higher level. Due to these cases we have
     * to use wait == false (trylock mode), which must not fail.
     */
    if (wait) {
        int i;

        for (i = 0; i < level - 1; i++)
            if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
                force_trylock = true;
                break;
            }
    }
#endif
    if (wait && !force_trylock)
        percpu_down_read(sb->s_writers.rw_sem + level-1);
    else
        ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);

    WARN_ON(force_trylock && !ret);
    return ret;
}
EXPORT_SYMBOL(__sb_start_write);

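/*
 * Typical use of freeze protection (illustrative sketch, not part of
 * this file): a write path brackets the operation so it cannot race with
 * freeze_super():
 *
 *	sb_start_write(sb);		// wrapper around __sb_start_write()
 *	...modify the filesystem...
 *	sb_end_write(sb);
 *
 * sb_start_pagefault()/sb_end_pagefault() and sb_start_intwrite()/
 * sb_end_intwrite() do the same for the other two freeze levels.
 */
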
/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
 * system.
 */
static void sb_wait_write(struct super_block *sb, int level)
{
    percpu_down_write(sb->s_writers.rw_sem + level-1);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
    int level;

    for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
        percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
    int level;

    for (level = 0; level < SB_FREEZE_LEVELS; ++level)
        percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

static void sb_freeze_unlock(struct super_block *sb)
{
    int level;

    for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
        percpu_up_write(sb->s_writers.rw_sem + level);
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 */
int freeze_super(struct super_block *sb)
{
    int ret;

    atomic_inc(&sb->s_active);
    down_write(&sb->s_umount);
    if (sb->s_writers.frozen != SB_UNFROZEN) {
        deactivate_locked_super(sb);
        return -EBUSY;
    }

    if (!(sb->s_flags & MS_BORN)) {
        up_write(&sb->s_umount);
        return 0;   /* sic - it's "nothing to do" */
    }

    if (sb->s_flags & MS_RDONLY) {
        /* Nothing to do really... */
        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        up_write(&sb->s_umount);
        return 0;
    }

    sb->s_writers.frozen = SB_FREEZE_WRITE;
    /* Release s_umount to preserve sb_start_write -> s_umount ordering */
    up_write(&sb->s_umount);
    sb_wait_write(sb, SB_FREEZE_WRITE);
    down_write(&sb->s_umount);

    /* Now we go and block page faults... */
    sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
    sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

    /* All writers are done so after syncing there won't be dirty data */
    sync_filesystem(sb);

    /* Now wait for internal filesystem counter */
    sb->s_writers.frozen = SB_FREEZE_FS;
    sb_wait_write(sb, SB_FREEZE_FS);

    if (sb->s_op->freeze_fs) {
        ret = sb->s_op->freeze_fs(sb);
        if (ret) {
            printk(KERN_ERR
                "VFS: Filesystem freeze failed\n");
            sb->s_writers.frozen = SB_UNFROZEN;
            sb_freeze_unlock(sb);
            wake_up(&sb->s_writers.wait_unfrozen);
            deactivate_locked_super(sb);
            return ret;
        }
    }
    /*
     * For debugging purposes so that fs can warn if it sees write activity
     * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
     */
    sb->s_writers.frozen = SB_FREEZE_COMPLETE;
    lockdep_sb_freeze_release(sb);
    up_write(&sb->s_umount);
    return 0;
}
EXPORT_SYMBOL(freeze_super);

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super().
 */
int thaw_super(struct super_block *sb)
{
    int error;

    down_write(&sb->s_umount);
    if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
        up_write(&sb->s_umount);
        return -EINVAL;
    }

    if (sb->s_flags & MS_RDONLY) {
        sb->s_writers.frozen = SB_UNFROZEN;
        goto out;
    }

    lockdep_sb_freeze_acquire(sb);

    if (sb->s_op->unfreeze_fs) {
        error = sb->s_op->unfreeze_fs(sb);
        if (error) {
            printk(KERN_ERR
                "VFS: Filesystem thaw failed\n");
            lockdep_sb_freeze_release(sb);
            up_write(&sb->s_umount);
            return error;
        }
    }

    sb->s_writers.frozen = SB_UNFROZEN;
    sb_freeze_unlock(sb);
out:
    wake_up(&sb->s_writers.wait_unfrozen);
    deactivate_locked_super(sb);
    return 0;
}
EXPORT_SYMBOL(thaw_super);
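
/*
 * Freeze/thaw pairing (illustrative sketch, not part of this file): the
 * FIFREEZE/FITHAW ioctls drive these entry points roughly as:
 *
 *	error = freeze_super(sb);	// sb now in SB_FREEZE_COMPLETE
 *	...take a stable snapshot of the block device...
 *	error = thaw_super(sb);		// writers are unblocked again
 *
 * freeze_super() takes an extra active reference which is held while the
 * fs stays frozen; thaw_super() drops it via deactivate_locked_super(),
 * so the pair leaves the superblock's refcounting balanced.
 */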