// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/* sysctl tunables... */
static struct files_stat_struct files_stat = {
    .max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;

static struct percpu_counter nr_files __cacheline_aligned_in_smp;

static void file_free_rcu(struct rcu_head *head)
{
    struct file *f = container_of(head, struct file, f_rcuhead);

    put_cred(f->f_cred);
    kmem_cache_free(filp_cachep, f);
}

static inline void file_free(struct file *f)
{
    security_file_free(f);
    if (!(f->f_mode & FMODE_NOACCOUNT))
        percpu_counter_dec(&nr_files);
    call_rcu(&f->f_rcuhead, file_free_rcu);
}
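
/*
 * Freeing is split in two steps: file_free() runs the LSM hook and, for
 * files accounted in nr_files (i.e. without FMODE_NOACCOUNT), decrements
 * the percpu counter immediately, while the actual kmem_cache_free() is
 * deferred through call_rcu().  That grace period is what lets lockless
 * fd-table lookups done under rcu_read_lock() dereference a struct file
 * that is concurrently being released without touching freed memory.
 */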

/*
 * Return the total number of open files in the system
 */
static long get_nr_files(void)
{
    return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
    return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 */
static int proc_nr_files(struct ctl_table *table, int write, void *buffer,
             size_t *lenp, loff_t *ppos)
{
    files_stat.nr_files = get_nr_files();
    return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
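
/*
 * proc_nr_files() refreshes files_stat.nr_files from the percpu counter
 * just before proc_doulongvec_minmax() dumps the table, so reads of
 * /proc/sys/fs/file-nr return a current (if approximate) count.  The
 * entry is registered read-only below (mode 0444); the middle of the
 * three reported values, historically the number of free file handles,
 * stays 0 in current kernels.
 */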

static struct ctl_table fs_stat_sysctls[] = {
    {
        .procname   = "file-nr",
        .data       = &files_stat,
        .maxlen     = sizeof(files_stat),
        .mode       = 0444,
        .proc_handler   = proc_nr_files,
    },
    {
        .procname   = "file-max",
        .data       = &files_stat.max_files,
        .maxlen     = sizeof(files_stat.max_files),
        .mode       = 0644,
        .proc_handler   = proc_doulongvec_minmax,
        .extra1     = SYSCTL_LONG_ZERO,
        .extra2     = SYSCTL_LONG_MAX,
    },
    {
        .procname   = "nr_open",
        .data       = &sysctl_nr_open,
        .maxlen     = sizeof(unsigned int),
        .mode       = 0644,
        .proc_handler   = proc_dointvec_minmax,
        .extra1     = &sysctl_nr_open_min,
        .extra2     = &sysctl_nr_open_max,
    },
    { }
};

static int __init init_fs_stat_sysctls(void)
{
    register_sysctl_init("fs", fs_stat_sysctls);
    if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
        struct ctl_table_header *hdr;
        hdr = register_sysctl_mount_point("fs/binfmt_misc");
        kmemleak_not_leak(hdr);
    }
    return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

static struct file *__alloc_file(int flags, const struct cred *cred)
{
    struct file *f;
    int error;

    f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
    if (unlikely(!f))
        return ERR_PTR(-ENOMEM);

    f->f_cred = get_cred(cred);
    error = security_file_alloc(f);
    if (unlikely(error)) {
        file_free_rcu(&f->f_rcuhead);
        return ERR_PTR(error);
    }

    atomic_long_set(&f->f_count, 1);
    rwlock_init(&f->f_owner.lock);
    spin_lock_init(&f->f_lock);
    mutex_init(&f->f_pos_lock);
    f->f_flags = flags;
    f->f_mode = OPEN_FMODE(flags);
    /* f->f_version: 0 */

    return f;
}
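
/*
 * __alloc_file() hands back a fully formed but not yet "opened" file:
 * the object comes zeroed from the slab, the caller's credentials are
 * pinned, the LSM blob is set up, the refcount starts at 1 and f_mode
 * is derived from the O_* flags via OPEN_FMODE().  If security_file_alloc()
 * fails, the object is torn down with file_free_rcu() directly - no
 * grace period is needed because nothing else can see it yet.  Note
 * that nr_files accounting is left entirely to the callers below.
 */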

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened, e.g. we are over the
 * limit on the number of file structures, ran out of memory, or the
 * operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will get an imbalance in the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
    static long old_max;
    struct file *f;

    /*
     * Privileged users can go above max_files
     */
    if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
        /*
         * percpu_counters are inaccurate.  Do an expensive check before
         * we go and fail.
         */
        if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
            goto over;
    }

    f = __alloc_file(flags, cred);
    if (!IS_ERR(f))
        percpu_counter_inc(&nr_files);

    return f;

over:
    /* Ran out of filps - report that */
    if (get_nr_files() > old_max) {
        pr_info("VFS: file-max limit %lu reached\n", get_max_files());
        old_max = get_nr_files();
    }
    return ERR_PTR(-ENFILE);
}
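
/*
 * The limit check above is deliberately two-staged: the cheap, racy
 * percpu_counter_read_positive() result is trusted while we are clearly
 * under file-max, and the expensive exact percpu_counter_sum_positive()
 * is only computed once the approximation claims the limit was hit.
 * CAP_SYS_ADMIN bypasses the limit entirely, and the pr_info() is
 * throttled by old_max so the message is emitted only when a new
 * high-water mark is reached, not on every failed allocation.
 */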

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * Should not be used unless there's a very good reason to do so.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
    struct file *f = __alloc_file(flags, cred);

    if (!IS_ERR(f))
        f->f_mode |= FMODE_NOACCOUNT;

    return f;
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 */
static struct file *alloc_file(const struct path *path, int flags,
        const struct file_operations *fop)
{
    struct file *file;

    file = alloc_empty_file(flags, current_cred());
    if (IS_ERR(file))
        return file;

    file->f_path = *path;
    file->f_inode = path->dentry->d_inode;
    file->f_mapping = path->dentry->d_inode->i_mapping;
    file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
    file->f_sb_err = file_sample_sb_err(file);
    if (fop->llseek)
        file->f_mode |= FMODE_LSEEK;
    if ((file->f_mode & FMODE_READ) &&
         likely(fop->read || fop->read_iter))
        file->f_mode |= FMODE_CAN_READ;
    if ((file->f_mode & FMODE_WRITE) &&
         likely(fop->write || fop->write_iter))
        file->f_mode |= FMODE_CAN_WRITE;
    file->f_iocb_flags = iocb_flags(file);
    file->f_mode |= FMODE_OPENED;
    file->f_op = fop;
    if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
        i_readcount_inc(path->dentry->d_inode);
    return file;
}
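
/*
 * Besides wiring up the path, inode and f_op, alloc_file() precomputes
 * several capability bits so the hot I/O paths can test f_mode instead
 * of chasing f_op pointers: FMODE_LSEEK if the ops provide ->llseek, and
 * FMODE_CAN_READ/FMODE_CAN_WRITE only when the requested access mode is
 * actually backed by ->read/->read_iter or ->write/->write_iter.  The
 * writeback error cursors (f_wb_err, f_sb_err) are sampled here so a
 * later fsync() on this file reports only errors that happened after
 * the open, and read-only opens bump the inode's i_readcount.
 */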

struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                const char *name, int flags,
                const struct file_operations *fops)
{
    static const struct dentry_operations anon_ops = {
        .d_dname = simple_dname
    };
    struct qstr this = QSTR_INIT(name, strlen(name));
    struct path path;
    struct file *file;

    path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
    if (!path.dentry)
        return ERR_PTR(-ENOMEM);
    if (!mnt->mnt_sb->s_d_op)
        d_set_d_op(path.dentry, &anon_ops);
    path.mnt = mntget(mnt);
    d_instantiate(path.dentry, inode);
    file = alloc_file(&path, flags, fops);
    if (IS_ERR(file)) {
        ihold(inode);
        path_put(&path);
    }
    return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);
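
/*
 * alloc_file_pseudo() is for files that live on a kernel-internal mount
 * and never appear in the namespace: it fabricates an unhashed pseudo
 * dentry for @inode, uses the superblock's d_op if it has one (falling
 * back to simple_dname for /proc/<pid>/fd display), and feeds the result
 * to alloc_file().  By convention the caller's inode reference is
 * consumed on success; the ihold() on the error path compensates for
 * the reference the dentry took in d_instantiate(), so the caller still
 * owns its reference when an error pointer is returned.  Illustrative
 * use (hypothetical helper names, not part of this file):
 *
 *      inode = my_subsys_new_inode();
 *      file = alloc_file_pseudo(inode, my_internal_mnt, "[my-obj]",
 *                               O_RDWR, &my_file_ops);
 *      if (IS_ERR(file))
 *              iput(inode);    // reference is still ours on failure
 */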

struct file *alloc_file_clone(struct file *base, int flags,
                const struct file_operations *fops)
{
    struct file *f = alloc_file(&base->f_path, flags, fops);
    if (!IS_ERR(f)) {
        path_get(&f->f_path);
        f->f_mapping = base->f_mapping;
    }
    return f;
}

/* the real guts of fput() - releasing the last reference to file
 */
static void __fput(struct file *file)
{
    struct dentry *dentry = file->f_path.dentry;
    struct vfsmount *mnt = file->f_path.mnt;
    struct inode *inode = file->f_inode;
    fmode_t mode = file->f_mode;

    if (unlikely(!(file->f_mode & FMODE_OPENED)))
        goto out;

    might_sleep();

    fsnotify_close(file);
    /*
     * The function eventpoll_release() should be the first called
     * in the file cleanup chain.
     */
    eventpoll_release(file);
    locks_remove_file(file);

    ima_file_free(file);
    if (unlikely(file->f_flags & FASYNC)) {
        if (file->f_op->fasync)
            file->f_op->fasync(-1, file, 0);
    }
    if (file->f_op->release)
        file->f_op->release(inode, file);
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
             !(mode & FMODE_PATH))) {
        cdev_put(inode->i_cdev);
    }
    fops_put(file->f_op);
    put_pid(file->f_owner.pid);
    if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
        i_readcount_dec(inode);
    if (mode & FMODE_WRITER) {
        put_write_access(inode);
        __mnt_drop_write(mnt);
    }
    dput(dentry);
    if (unlikely(mode & FMODE_NEED_UNMOUNT))
        dissolve_on_fput(mnt);
    mntput(mnt);
out:
    file_free(file);
}
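
/*
 * Teardown order matters here.  A file that never reached FMODE_OPENED
 * skips straight to file_free().  Otherwise fsnotify, eventpoll, file
 * lock and FASYNC state are cleaned up before the driver's ->release()
 * is called, and only then are the open-time charges unwound: write
 * access on the inode and the mount's writer count for FMODE_WRITER
 * files (the imbalance warned about above alloc_empty_file()), the
 * i_readcount for read-only opens, and finally the dentry and mount
 * references, before the struct file itself is handed to file_free()
 * for RCU-deferred freeing.
 */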

static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
    struct llist_node *node = llist_del_all(&delayed_fput_list);
    struct file *f, *t;

    llist_for_each_entry_safe(f, t, node, f_llist)
        __fput(f);
}

static void ____fput(struct callback_head *work)
{
    __fput(container_of(work, struct file, f_rcuhead));
}

/*
 * If a kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs have
 * not left us with opened struct files waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
    delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

void fput(struct file *file)
{
    if (atomic_long_dec_and_test(&file->f_count)) {
        struct task_struct *task = current;

        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
            init_task_work(&file->f_rcuhead, ____fput);
            if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
                return;
            /*
             * After this task has run exit_task_work(),
             * task_work_add() will fail.  Fall through to delayed
             * fput to avoid leaking *file.
             */
        }

        if (llist_add(&file->f_llist, &delayed_fput_list))
            schedule_delayed_work(&delayed_fput_work, 1);
    }
}
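
/*
 * Dropping the last reference must not run __fput() directly from
 * arbitrary context, since __fput() may sleep.  From ordinary process
 * context the final put is queued as task_work and runs when the task
 * returns to userspace; interrupt context, kernel threads and tasks
 * that are already past exit_task_work() fall back to the global
 * llist, which a workqueue drains via delayed_fput().  llist_add()
 * returning true means the list was previously empty, so the delayed
 * work is scheduled only once per batch.  Typical pairing elsewhere in
 * the kernel (illustrative only):
 *
 *      struct file *f = fget(fd);
 *      if (!f)
 *              return -EBADF;
 *      // ... operate on f ...
 *      fput(f);
 */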

/*
 * Synchronous analog of fput(); for kernel threads that might be needed
 * during some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), that need to wait for __fput() to complete, and
 * that know this specific struct file won't involve anything that would
 * need them.  Use only if you really need it - at the very least, don't
 * blindly convert fput() calls in kernel threads to this.
 */
void __fput_sync(struct file *file)
{
    if (atomic_long_dec_and_test(&file->f_count)) {
        struct task_struct *task = current;
        BUG_ON(!(task->flags & PF_KTHREAD));
        __fput(file);
    }
}
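
/*
 * Unlike fput(), __fput_sync() runs __fput() immediately in the caller's
 * context when the last reference goes away.  The BUG_ON() restricts it
 * to kernel threads (PF_KTHREAD): user tasks must always take the
 * deferred task_work path above.
 */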

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

void __init files_init(void)
{
    filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
            SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
    percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
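
/*
 * The "filp" cache is created with SLAB_PANIC, so the kernel refuses to
 * boot if it cannot be set up, and with SLAB_ACCOUNT, so struct file
 * allocations are charged to the opener's memory cgroup.  The percpu
 * open-file counter starts at zero and is adjusted only by
 * alloc_empty_file() and file_free() above.
 */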

/*
 * One file with associated inode and dcache is very roughly 1K. By default
 * do not use more than 10% of our memory for files.
 */
void __init files_maxfiles_init(void)
{
    unsigned long n;
    unsigned long nr_pages = totalram_pages();
    unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

    memreserve = min(memreserve, nr_pages - 1);
    n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

    files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
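
/*
 * Rough arithmetic behind the default limit: memreserve is 1.5x the
 * pages already in use at this point of boot, the remaining pages are
 * converted to KiB (PAGE_SIZE / 1024 KiB each) and divided by 10, i.e.
 * about 10% of usable memory at ~1 KiB per open file.  As an
 * illustration (assumed numbers): with 16 GiB of RAM and 4 KiB pages,
 * nr_pages is 4,194,304; if memreserve works out to ~194,304 pages,
 * n = (4,000,000 * 4) / 10 = 1,600,000, so file-max defaults to about
 * 1.6 million, and never drops below NR_FILE.
 */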