// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/file.c - kernfs file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>

#include "kernfs-internal.h"

struct kernfs_open_node {
	struct rcu_head		rcu_head;
	atomic_t		event;
	wait_queue_head_t	poll;
	struct list_head	files; /* goes through kernfs_open_file.list */
};

/*
 * kernfs_notify() may be called from any context and bounces notifications
 * through a work item.  To minimize space overhead in kernfs_node, the
 * pending queue is implemented as a singly linked list of kernfs_nodes.
 * The list is terminated with a sentinel, KERNFS_NOTIFY_EOL (the address
 * of kernfs_notify_list itself), rather than NULL, so that whether a
 * kernfs_node is queued can be determined by testing its notify_next
 * pointer for NULL.
 */
#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)

static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;

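#if 0
/*
 * Illustrative sketch, not kernfs code: the sentinel trick above in
 * miniature.  A node whose ->next is NULL is not queued; the tail of the
 * queue points at the sentinel instead of NULL, so membership can be
 * tested without a separate flag.  "knode" and "head" are stand-ins.
 */
struct knode { struct knode *next; };
static struct knode *head = (struct knode *)&head;
#define EOL		((struct knode *)&head)

static bool knode_queued(struct knode *n)
{
	return n->next != NULL;		/* NULL == not on the queue */
}

static void knode_push(struct knode *n)
{
	n->next = head;			/* tail ends up pointing at EOL */
	head = n;
}

static struct knode *knode_pop(void)
{
	struct knode *n = head;

	if (n == EOL)			/* empty queue */
		return NULL;
	head = n->next;
	n->next = NULL;			/* mark as no longer queued */
	return n;
}
#endif
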
static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
	int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);

	return &kernfs_locks->open_file_mutex[idx];
}

static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
{
	struct mutex *lock;

	lock = kernfs_open_file_mutex_ptr(kn);

	mutex_lock(lock);

	return lock;
}

/**
 * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
 *
 * @of: associated kernfs_open_file instance.
 * @kn: target kernfs_node.
 *
 * Fetch and return ->attr.open of @kn if @of->list is non-empty.
 * If @of->list is non-empty, @of must be on the @kn->attr.open->files
 * list, which guarantees that @kn->attr.open will not vanish, i.e.
 * dereferencing it outside an RCU read-side critical section is safe
 * here.
 *
 * The caller needs to make sure that @of->list is not empty.
 */
static struct kernfs_open_node *
kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
{
	struct kernfs_open_node *on;

	on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));

	return on;
}

/**
 * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn
 *
 * @kn: target kernfs_node.
 *
 * Fetch and return ->attr.open of @kn when the caller holds the mutex
 * returned by kernfs_open_file_mutex_ptr(kn).
 *
 * Updates of ->attr.open happen under kernfs_open_file_mutex_ptr(kn), so
 * as long as the caller holds this mutex, other updaters can't change
 * ->attr.open, which means that we can safely dereference ->attr.open
 * outside an RCU read-side critical section.
 *
 * The caller needs to make sure that kernfs_open_file_mutex is held.
 */
static struct kernfs_open_node *
kernfs_deref_open_node_protected(struct kernfs_node *kn)
{
	return rcu_dereference_protected(kn->attr.open,
				lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
}

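#if 0
/*
 * Illustrative sketch, not kernfs code: the two dereference flavours used
 * above, applied to a generic RCU-protected pointer.  Readers supply a
 * condition proving the object is pinned; updaters supply the lock that
 * serializes all updates.  "struct foo", "gp" and "g_lock" are stand-ins
 * for this sketch.
 */
struct foo {
	struct rcu_head rcu_head;
};

static struct foo __rcu *gp;
static DEFINE_MUTEX(g_lock);

/* reader: safe iff @pinned proves @gp can't be freed under us */
static struct foo *foo_deref(bool pinned)
{
	return rcu_dereference_check(gp, pinned);
}

/* updater: g_lock serializes all writers, so no RCU read lock is needed */
static void foo_replace(struct foo *newp)
{
	struct foo *old;

	mutex_lock(&g_lock);
	old = rcu_dereference_protected(gp, lockdep_is_held(&g_lock));
	rcu_assign_pointer(gp, newp);
	mutex_unlock(&g_lock);
	if (old)
		kfree_rcu(old, rcu_head);	/* free after a grace period */
}
#endif
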
static struct kernfs_open_file *kernfs_of(struct file *file)
{
	return ((struct seq_file *)file->private_data)->private;
}

/*
 * Determine the kernfs_ops for the given kernfs_node.  This function must
 * be called while holding an active reference.
 */
static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
{
	if (kn->flags & KERNFS_LOCKDEP)
		lockdep_assert_held(kn);
	return kn->attr.ops;
}

/*
 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
 * a seq_file iteration which is fully initialized with an active reference
 * or an aborted kernfs_seq_start() due to get_active failure.  The
 * position pointer is the only context for each seq_file iteration and
 * thus the stop condition should be encoded in it.  As the return value is
 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
 * choice to indicate get_active failure.
 *
 * Unfortunately, this is complicated by the optional custom seq_file
 * operations, which may return ERR_PTR(-ENODEV) too.  kernfs_seq_stop()
 * can't distinguish whether ERR_PTR(-ENODEV) came from get_active failure
 * or from a custom seq_file operation, and thus can't decide from
 * ERR_PTR(-ENODEV) alone whether put_active should be performed.
 *
 * This is worked around by factoring the custom seq_stop() and put_active
 * part out into kernfs_seq_stop_active(): kernfs_seq_stop() skips it on
 * ERR_PTR(-ENODEV), while kernfs_seq_start() and kernfs_seq_next() invoke
 * it directly when a custom seq_file operation fails with
 * ERR_PTR(-ENODEV).  This ensures that kernfs_seq_stop_active() is
 * skipped only after a get_active failure.
 */
static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops = kernfs_ops(of->kn);

	if (ops->seq_stop)
		ops->seq_stop(sf, v);
	kernfs_put_active(of->kn);
}

static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops;

	/*
	 * @of->mutex nests outside active ref and is primarily to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn))
		return ERR_PTR(-ENODEV);

	ops = kernfs_ops(of->kn);
	if (ops->seq_start) {
		void *next = ops->seq_start(sf, ppos);
		/* see the comment above kernfs_seq_stop_active() */
		if (next == ERR_PTR(-ENODEV))
			kernfs_seq_stop_active(sf, next);
		return next;
	}
	return single_start(sf, ppos);
}

static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops = kernfs_ops(of->kn);

	if (ops->seq_next) {
		void *next = ops->seq_next(sf, v, ppos);
		/* see the comment above kernfs_seq_stop_active() */
		if (next == ERR_PTR(-ENODEV))
			kernfs_seq_stop_active(sf, next);
		return next;
	} else {
		/*
		 * Same behavior as single_open(): always terminate after
		 * the initial read.
		 */
		++*ppos;
		return NULL;
	}
}

static void kernfs_seq_stop(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;

	if (v != ERR_PTR(-ENODEV))
		kernfs_seq_stop_active(sf, v);
	mutex_unlock(&of->mutex);
}

static int kernfs_seq_show(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;
	struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn);

	if (!on)
		return -EINVAL;

	of->event = atomic_read(&on->event);

	return of->kn->attr.ops->seq_show(sf, v);
}

static const struct seq_operations kernfs_seq_ops = {
	.start = kernfs_seq_start,
	.next = kernfs_seq_next,
	.stop = kernfs_seq_stop,
	.show = kernfs_seq_show,
};

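#if 0
/*
 * Illustrative sketch, not kernfs code: a minimal read-only attribute
 * served by the seq_file machinery above.  Because only ->seq_show is
 * set, kernfs_seq_start()/kernfs_seq_next() fall back to the
 * single_open()-style single-record iteration.  sf->private is the
 * kernfs_open_file, as wired up in kernfs_fop_open() below.
 */
static int example_seq_show(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;

	/* assumes ->priv was set to a NUL-terminated string at creation */
	seq_printf(sf, "%s\n", (const char *)of->kn->priv);
	return 0;
}

static const struct kernfs_ops example_ro_ops = {
	.seq_show	= example_seq_show,
};
#endif
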
/*
 * As reading a bin file can have side effects, the exact offset and bytes
 * specified in the read(2) call should be passed to the read callback,
 * making it difficult to use seq_file.  Implement simplistic custom
 * buffering for bin files instead.
 */
static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
	ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
	const struct kernfs_ops *ops;
	struct kernfs_open_node *on;
	char *buf;

	buf = of->prealloc_buf;
	if (buf)
		mutex_lock(&of->prealloc_mutex);
	else
		buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/*
	 * @of->mutex nests outside active ref and is used to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn)) {
		len = -ENODEV;
		mutex_unlock(&of->mutex);
		goto out_free;
	}

	on = kernfs_deref_open_node(of, of->kn);
	if (!on) {
		len = -EINVAL;
		mutex_unlock(&of->mutex);
		goto out_free;
	}

	of->event = atomic_read(&on->event);

	ops = kernfs_ops(of->kn);
	if (ops->read)
		len = ops->read(of, buf, len, iocb->ki_pos);
	else
		len = -EINVAL;

	kernfs_put_active(of->kn);
	mutex_unlock(&of->mutex);

	if (len < 0)
		goto out_free;

	if (copy_to_iter(buf, len, iter) != len) {
		len = -EFAULT;
		goto out_free;
	}

	iocb->ki_pos += len;

 out_free:
	if (buf == of->prealloc_buf)
		mutex_unlock(&of->prealloc_mutex);
	else
		kfree(buf);
	return len;
}

static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
		return seq_read_iter(iocb, iter);
	return kernfs_file_read_iter(iocb, iter);
}

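#if 0
/*
 * Illustrative sketch, not kernfs code: a ->read callback as consumed by
 * kernfs_file_read_iter() above.  @buf is the kernel buffer kernfs
 * allocated (at most PAGE_SIZE, or ->prealloc_buf), @off is the exact
 * read(2) offset, and the return value becomes the read(2) return value.
 * "struct example_blob" is a hypothetical backing object hung off
 * kn->priv.
 */
struct example_blob {
	size_t size;
	char data[];
};

static ssize_t example_bin_read(struct kernfs_open_file *of, char *buf,
				size_t bytes, loff_t off)
{
	struct example_blob *blob = of->kn->priv;

	if (off >= blob->size)
		return 0;			/* EOF */
	bytes = min_t(size_t, bytes, blob->size - off);
	memcpy(buf, blob->data + off, bytes);
	return bytes;
}
#endif
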
/*
 * Copy data in from userland and pass it to the matching kernfs write
 * operation.
 *
 * There is no easy way for us to know if userspace is only doing a partial
 * write, so we don't support partial writes.  We expect the entire buffer
 * to come on the first write.  Hint: if you're writing a value, first read
 * the file, modify only the value you're changing, then write the entire
 * buffer back.
 */
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
	ssize_t len = iov_iter_count(iter);
	const struct kernfs_ops *ops;
	char *buf;

	if (of->atomic_write_len) {
		if (len > of->atomic_write_len)
			return -E2BIG;
	} else {
		len = min_t(size_t, len, PAGE_SIZE);
	}

	buf = of->prealloc_buf;
	if (buf)
		mutex_lock(&of->prealloc_mutex);
	else
		buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (copy_from_iter(buf, len, iter) != len) {
		len = -EFAULT;
		goto out_free;
	}
	buf[len] = '\0';	/* guarantee string termination */

	/*
	 * @of->mutex nests outside active ref and is used to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn)) {
		mutex_unlock(&of->mutex);
		len = -ENODEV;
		goto out_free;
	}

	ops = kernfs_ops(of->kn);
	if (ops->write)
		len = ops->write(of, buf, len, iocb->ki_pos);
	else
		len = -EINVAL;

	kernfs_put_active(of->kn);
	mutex_unlock(&of->mutex);

	if (len > 0)
		iocb->ki_pos += len;

out_free:
	if (buf == of->prealloc_buf)
		mutex_unlock(&of->prealloc_mutex);
	else
		kfree(buf);
	return len;
}

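#if 0
/*
 * Userspace sketch (illustrative, not kernel code) of the read-modify-write
 * pattern recommended in the comment above kernfs_fop_write_iter(): fetch
 * the whole value, edit only the field of interest, then write the whole
 * buffer back in a single write(2).  "path" is a placeholder.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int update_attr(const char *path)
{
	char buf[4096];
	ssize_t len;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;
	len = read(fd, buf, sizeof(buf) - 1);	/* read the whole value */
	if (len < 0)
		goto fail;
	buf[len] = '\0';
	/* ... modify only the part of buf you're changing ... */
	lseek(fd, 0, SEEK_SET);
	if (write(fd, buf, strlen(buf)) < 0)	/* one full write back */
		goto fail;
	return close(fd);
fail:
	close(fd);
	return -1;
}
#endif
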
static void kernfs_vma_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);

	if (!of->vm_ops)
		return;

	if (!kernfs_get_active(of->kn))
		return;

	if (of->vm_ops->open)
		of->vm_ops->open(vma);

	kernfs_put_active(of->kn);
}

static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	vm_fault_t ret;

	if (!of->vm_ops)
		return VM_FAULT_SIGBUS;

	if (!kernfs_get_active(of->kn))
		return VM_FAULT_SIGBUS;

	ret = VM_FAULT_SIGBUS;
	if (of->vm_ops->fault)
		ret = of->vm_ops->fault(vmf);

	kernfs_put_active(of->kn);
	return ret;
}

static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	vm_fault_t ret;

	if (!of->vm_ops)
		return VM_FAULT_SIGBUS;

	if (!kernfs_get_active(of->kn))
		return VM_FAULT_SIGBUS;

	ret = 0;
	if (of->vm_ops->page_mkwrite)
		ret = of->vm_ops->page_mkwrite(vmf);
	else
		file_update_time(file);

	kernfs_put_active(of->kn);
	return ret;
}

static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
			     void *buf, int len, int write)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	int ret;

	if (!of->vm_ops)
		return -EINVAL;

	if (!kernfs_get_active(of->kn))
		return -EINVAL;

	ret = -EINVAL;
	if (of->vm_ops->access)
		ret = of->vm_ops->access(vma, addr, buf, len, write);

	kernfs_put_active(of->kn);
	return ret;
}

#ifdef CONFIG_NUMA
static int kernfs_vma_set_policy(struct vm_area_struct *vma,
				 struct mempolicy *new)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	int ret;

	if (!of->vm_ops)
		return 0;

	if (!kernfs_get_active(of->kn))
		return -EINVAL;

	ret = 0;
	if (of->vm_ops->set_policy)
		ret = of->vm_ops->set_policy(vma, new);

	kernfs_put_active(of->kn);
	return ret;
}

static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
					       unsigned long addr)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	struct mempolicy *pol;

	if (!of->vm_ops)
		return vma->vm_policy;

	if (!kernfs_get_active(of->kn))
		return vma->vm_policy;

	pol = vma->vm_policy;
	if (of->vm_ops->get_policy)
		pol = of->vm_ops->get_policy(vma, addr);

	kernfs_put_active(of->kn);
	return pol;
}

#endif

static const struct vm_operations_struct kernfs_vm_ops = {
	.open		= kernfs_vma_open,
	.fault		= kernfs_vma_fault,
	.page_mkwrite	= kernfs_vma_page_mkwrite,
	.access		= kernfs_vma_access,
#ifdef CONFIG_NUMA
	.set_policy	= kernfs_vma_set_policy,
	.get_policy	= kernfs_vma_get_policy,
#endif
};

static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct kernfs_open_file *of = kernfs_of(file);
	const struct kernfs_ops *ops;
	int rc;

	/*
	 * The mmap path and of->mutex are prone to triggering spurious
	 * lockdep warnings and we don't want to add a spurious locking
	 * dependency between the two.  Check whether mmap is actually
	 * implemented without grabbing @of->mutex by testing the HAS_MMAP
	 * flag.  See the comment in kernfs_fop_open() for more details.
	 */
	if (!(of->kn->flags & KERNFS_HAS_MMAP))
		return -ENODEV;

	mutex_lock(&of->mutex);

	rc = -ENODEV;
	if (!kernfs_get_active(of->kn))
		goto out_unlock;

	ops = kernfs_ops(of->kn);
	rc = ops->mmap(of, vma);
	if (rc)
		goto out_put;

	/*
	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
	 * to satisfy versions of X which crash if the mmap fails: that
	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
	 */
	if (vma->vm_file != file)
		goto out_put;

	rc = -EINVAL;
	if (of->mmapped && of->vm_ops != vma->vm_ops)
		goto out_put;

	/*
	 * It is not possible to successfully wrap close, so error out if
	 * someone is trying to use close.
	 */
	if (vma->vm_ops && vma->vm_ops->close)
		goto out_put;

	rc = 0;
	of->mmapped = true;
	of->vm_ops = vma->vm_ops;
	vma->vm_ops = &kernfs_vm_ops;
out_put:
	kernfs_put_active(of->kn);
out_unlock:
	mutex_unlock(&of->mutex);

	return rc;
}

/**
 *	kernfs_get_open_node - get or create kernfs_open_node
 *	@kn: target kernfs_node
 *	@of: kernfs_open_file for this instance of open
 *
 *	If @kn->attr.open exists, reuse it; otherwise, create a new one.
 *	@of is chained to the files list.
 *
 *	LOCKING:
 *	Kernel thread context (may sleep).
 *
 *	RETURNS:
 *	0 on success, -errno on failure.
 */
static int kernfs_get_open_node(struct kernfs_node *kn,
				struct kernfs_open_file *of)
{
	struct kernfs_open_node *on, *new_on = NULL;
	struct mutex *mutex = NULL;

	mutex = kernfs_open_file_mutex_lock(kn);
	on = kernfs_deref_open_node_protected(kn);

	if (on) {
		list_add_tail(&of->list, &on->files);
		mutex_unlock(mutex);
		return 0;
	} else {
		/* not there, initialize a new one */
		new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
		if (!new_on) {
			mutex_unlock(mutex);
			return -ENOMEM;
		}
		atomic_set(&new_on->event, 1);
		init_waitqueue_head(&new_on->poll);
		INIT_LIST_HEAD(&new_on->files);
		list_add_tail(&of->list, &new_on->files);
		rcu_assign_pointer(kn->attr.open, new_on);
	}
	mutex_unlock(mutex);

	return 0;
}

/**
 *	kernfs_unlink_open_file - Unlink @of from @kn.
 *
 *	@kn: target kernfs_node
 *	@of: associated kernfs_open_file
 *
 *	Unlink @of from the list of @kn's associated open files.  If the
 *	list of associated open files becomes empty, disassociate and free
 *	the kernfs_open_node.
 *
 *	LOCKING:
 *	None.
 */
static void kernfs_unlink_open_file(struct kernfs_node *kn,
				    struct kernfs_open_file *of)
{
	struct kernfs_open_node *on;
	struct mutex *mutex = NULL;

	mutex = kernfs_open_file_mutex_lock(kn);

	on = kernfs_deref_open_node_protected(kn);
	if (!on) {
		mutex_unlock(mutex);
		return;
	}

	if (of)
		list_del(&of->list);

	if (list_empty(&on->files)) {
		rcu_assign_pointer(kn->attr.open, NULL);
		kfree_rcu(on, rcu_head);
	}

	mutex_unlock(mutex);
}

static int kernfs_fop_open(struct inode *inode, struct file *file)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_root *root = kernfs_root(kn);
	const struct kernfs_ops *ops;
	struct kernfs_open_file *of;
	bool has_read, has_write, has_mmap;
	int error = -EACCES;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	ops = kernfs_ops(kn);

	has_read = ops->seq_show || ops->read || ops->mmap;
	has_write = ops->write || ops->mmap;
	has_mmap = ops->mmap;

	/* see the flag definition for details */
	if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
		if ((file->f_mode & FMODE_WRITE) &&
		    (!(inode->i_mode & S_IWUGO) || !has_write))
			goto err_out;

		if ((file->f_mode & FMODE_READ) &&
		    (!(inode->i_mode & S_IRUGO) || !has_read))
			goto err_out;
	}

	/* allocate a kernfs_open_file for the file */
	error = -ENOMEM;
	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
	if (!of)
		goto err_out;

	/*
	 * The following is done to give a different lockdep key to
	 * @of->mutex for files which implement mmap.  This is a rather
	 * crude way to avoid false positive lockdep warning around
	 * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
	 * which mm->mmap_lock nests, while holding @of->mutex.  As each
	 * open file has a separate mutex, it's okay as long as those don't
	 * happen on the same file.  At this point, we can't easily give
	 * each file a separate locking class.  Let's differentiate on
	 * whether the file has mmap or not for now.
	 *
	 * Both paths of the branch look the same.  They're supposed to
	 * look that way and give @of->mutex different static lockdep keys.
	 */
	if (has_mmap)
		mutex_init(&of->mutex);
	else
		mutex_init(&of->mutex);

	of->kn = kn;
	of->file = file;

	/*
	 * The write path needs to access atomic_write_len outside the
	 * active reference.  Cache it in the open_file.  See
	 * kernfs_fop_write_iter() for details.
	 */
	of->atomic_write_len = ops->atomic_write_len;

	error = -EINVAL;
	/*
	 * ->seq_show is incompatible with ->prealloc,
	 * as seq_read does its own allocation.
	 * ->read must be used instead.
	 */
	if (ops->prealloc && ops->seq_show)
		goto err_free;
	if (ops->prealloc) {
		int len = of->atomic_write_len ?: PAGE_SIZE;
		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
		error = -ENOMEM;
		if (!of->prealloc_buf)
			goto err_free;
		mutex_init(&of->prealloc_mutex);
	}

	/*
	 * Always instantiate seq_file even if read access doesn't use
	 * seq_file or is not requested.  This unifies private data access
	 * and readable regular files are the vast majority anyway.
	 */
	if (ops->seq_show)
		error = seq_open(file, &kernfs_seq_ops);
	else
		error = seq_open(file, NULL);
	if (error)
		goto err_free;

	of->seq_file = file->private_data;
	of->seq_file->private = of;

	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
	if (file->f_mode & FMODE_WRITE)
		file->f_mode |= FMODE_PWRITE;

	/* make sure we have open node struct */
	error = kernfs_get_open_node(kn, of);
	if (error)
		goto err_seq_release;

	if (ops->open) {
		/* nobody has access to @of yet, skip @of->mutex */
		error = ops->open(of);
		if (error)
			goto err_put_node;
	}

	/* open succeeded, put the active reference */
	kernfs_put_active(kn);
	return 0;

err_put_node:
	kernfs_unlink_open_file(kn, of);
err_seq_release:
	seq_release(inode, file);
err_free:
	kfree(of->prealloc_buf);
	kfree(of);
err_out:
	kernfs_put_active(kn);
	return error;
}

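#if 0
/*
 * Why the two identical-looking mutex_init() calls above get different
 * lockdep keys: mutex_init() is a macro that defines a static
 * lock_class_key at each textual call site, roughly (simplified sketch of
 * the include/linux/mutex.h definition):
 */
#define mutex_init(mutex)					\
do {								\
	static struct lock_class_key __key;			\
								\
	__mutex_init((mutex), #mutex, &__key);			\
} while (0)
/*
 * Two call sites therefore mean two distinct keys, so lockdep tracks the
 * mmap and non-mmap @of->mutex instances as separate lock classes.
 */
#endif
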
/* used from release/drain to ensure that ->release() is called exactly once */
static void kernfs_release_file(struct kernfs_node *kn,
				struct kernfs_open_file *of)
{
	/*
	 * @of is guaranteed to have no other file operations in flight and
	 * we just want to synchronize release and drain paths.
	 * kernfs_open_file_mutex_ptr(kn) is enough.  @of->mutex can't be
	 * used here because the drain path may be called from places which
	 * can cause circular dependency.
	 */
	lockdep_assert_held(kernfs_open_file_mutex_ptr(kn));

	if (!of->released) {
		/*
		 * A file is never detached without being released and we
		 * need to be able to release files which are deactivated
		 * and being drained.  Don't use kernfs_ops().
		 */
		kn->attr.ops->release(of);
		of->released = true;
	}
}

static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_open_file *of = kernfs_of(filp);
	struct mutex *mutex = NULL;

	if (kn->flags & KERNFS_HAS_RELEASE) {
		mutex = kernfs_open_file_mutex_lock(kn);
		kernfs_release_file(kn, of);
		mutex_unlock(mutex);
	}

	kernfs_unlink_open_file(kn, of);
	seq_release(inode, filp);
	kfree(of->prealloc_buf);
	kfree(of);

	return 0;
}

void kernfs_drain_open_files(struct kernfs_node *kn)
{
	struct kernfs_open_node *on;
	struct kernfs_open_file *of;
	struct mutex *mutex = NULL;

	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
		return;

	/*
	 * The lockless opportunistic check below is safe because nothing
	 * can be added to ->attr.open at this point; it allows an early
	 * bail-out when ->attr.open is already NULL.
	 * kernfs_unlink_open_file() sets ->attr.open to NULL only while
	 * holding kernfs_open_file_mutex, so the re-check under
	 * kernfs_open_file_mutex_ptr(kn) below catches the case where
	 * ->attr.open became NULL while we were waiting for the mutex.
	 */
	if (!rcu_access_pointer(kn->attr.open))
		return;

	mutex = kernfs_open_file_mutex_lock(kn);
	on = kernfs_deref_open_node_protected(kn);
	if (!on) {
		mutex_unlock(mutex);
		return;
	}

	list_for_each_entry(of, &on->files, list) {
		struct inode *inode = file_inode(of->file);

		if (kn->flags & KERNFS_HAS_MMAP)
			unmap_mapping_range(inode->i_mapping, 0, 0, 1);

		if (kn->flags & KERNFS_HAS_RELEASE)
			kernfs_release_file(kn, of);
	}

	mutex_unlock(mutex);
}

/*
 * Kernfs attribute files are pollable.  The idea is that you read
 * the content and then you use 'poll' or 'select' to wait for
 * the content to change.  When the content changes (assuming the
 * manager for the kobject supports notification), poll will
 * return EPOLLERR|EPOLLPRI, and select will return the fd whether
 * it is waiting for read, write, or exceptions.
 * Once poll/select indicates that the value has changed, you
 * need to close and re-open the file, or seek to 0 and read again.
 * Reminder: this only works for attributes which actively support
 * it, and it is not possible to test an attribute from userspace
 * to see if it supports poll (neither 'poll' nor 'select' returns
 * an appropriate error code).  When in doubt, set a suitable timeout
 * value.
 */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
	struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
	struct kernfs_open_node *on = kernfs_deref_open_node(of, kn);

	if (!on)
		return EPOLLERR;

	poll_wait(of->file, &on->poll, wait);

	if (of->event != atomic_read(&on->event))
		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;

	return DEFAULT_POLLMASK;
}

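#if 0
/*
 * Userspace sketch (illustrative, not kernel code) of the poll protocol
 * described above kernfs_generic_poll(): read the current value, wait for
 * POLLPRI, then seek back to 0 and read again.  "path" names a pollable
 * sysfs-style attribute; error handling is trimmed for brevity.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static void watch_attr(const char *path)
{
	char buf[4096];
	struct pollfd pfd;

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0)
		return;
	pfd.events = POLLPRI;

	for (;;) {
		read(pfd.fd, buf, sizeof(buf));	/* consume current value */
		poll(&pfd, 1, -1);		/* sleep until it changes */
		lseek(pfd.fd, 0, SEEK_SET);	/* rewind before rereading */
	}
}
#endif
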
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
	struct kernfs_open_file *of = kernfs_of(filp);
	struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
	__poll_t ret;

	if (!kernfs_get_active(kn))
		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;

	if (kn->attr.ops->poll)
		ret = kn->attr.ops->poll(of, wait);
	else
		ret = kernfs_generic_poll(of, wait);

	kernfs_put_active(kn);
	return ret;
}

static void kernfs_notify_workfn(struct work_struct *work)
{
	struct kernfs_node *kn;
	struct kernfs_super_info *info;
	struct kernfs_root *root;
repeat:
	/* pop one off the notify_list */
	spin_lock_irq(&kernfs_notify_lock);
	kn = kernfs_notify_list;
	if (kn == KERNFS_NOTIFY_EOL) {
		spin_unlock_irq(&kernfs_notify_lock);
		return;
	}
	kernfs_notify_list = kn->attr.notify_next;
	kn->attr.notify_next = NULL;
	spin_unlock_irq(&kernfs_notify_lock);

	root = kernfs_root(kn);
	/* kick fsnotify */
	down_write(&root->kernfs_rwsem);

	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
		struct kernfs_node *parent;
		struct inode *p_inode = NULL;
		struct inode *inode;
		struct qstr name;

		/*
		 * We want fsnotify_modify() on @kn but as the
		 * modifications aren't originating from userland, we
		 * don't have the matching @file available.  Look up the
		 * inodes and generate the events manually.
		 */
		inode = ilookup(info->sb, kernfs_ino(kn));
		if (!inode)
			continue;

		name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
		parent = kernfs_get_parent(kn);
		if (parent) {
			p_inode = ilookup(info->sb, kernfs_ino(parent));
			if (p_inode) {
				fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
					 inode, FSNOTIFY_EVENT_INODE,
					 p_inode, &name, inode, 0);
				iput(p_inode);
			}

			kernfs_put(parent);
		}

		if (!p_inode)
			fsnotify_inode(inode, FS_MODIFY);

		iput(inode);
	}

	up_write(&root->kernfs_rwsem);
	kernfs_put(kn);
	goto repeat;
}

/**
 * kernfs_notify - notify a kernfs file
 * @kn: file to notify
 *
 * Notify @kn such that poll(2) on @kn wakes up.  May be called from any
 * context.
 */
void kernfs_notify(struct kernfs_node *kn)
{
	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
	unsigned long flags;
	struct kernfs_open_node *on;

	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
		return;

	/* kick poll immediately */
	rcu_read_lock();
	on = rcu_dereference(kn->attr.open);
	if (on) {
		atomic_inc(&on->event);
		wake_up_interruptible(&on->poll);
	}
	rcu_read_unlock();

	/* schedule work to kick fsnotify */
	spin_lock_irqsave(&kernfs_notify_lock, flags);
	if (!kn->attr.notify_next) {
		kernfs_get(kn);
		kn->attr.notify_next = kernfs_notify_list;
		kernfs_notify_list = kn;
		schedule_work(&kernfs_notify_work);
	}
	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);

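#if 0
/*
 * Kernel-side sketch (illustrative): how an attribute owner typically
 * reaches kernfs_notify().  sysfs_notify() looks up the attribute's
 * kernfs_node under @kobj and calls kernfs_notify() on it, waking any
 * pollers blocked in kernfs_generic_poll() above.  "state" is a
 * hypothetical attribute name.
 */
static void example_state_changed(struct kobject *kobj)
{
	/* ... update the data that the attribute's show() reports ... */
	sysfs_notify(kobj, NULL, "state");
}
#endif
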
const struct file_operations kernfs_file_fops = {
	.read_iter	= kernfs_fop_read_iter,
	.write_iter	= kernfs_fop_write_iter,
	.llseek		= generic_file_llseek,
	.mmap		= kernfs_fop_mmap,
	.open		= kernfs_fop_open,
	.release	= kernfs_fop_release,
	.poll		= kernfs_fop_poll,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @uid: uid of the file
 * @gid: gid of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
					 const char *name,
					 umode_t mode, kuid_t uid, kgid_t gid,
					 loff_t size,
					 const struct kernfs_ops *ops,
					 void *priv, const void *ns,
					 struct lock_class_key *key)
{
	struct kernfs_node *kn;
	unsigned flags;
	int rc;

	flags = KERNFS_FILE;

	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
			     uid, gid, flags);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	kn->attr.ops = ops;
	kn->attr.size = size;
	kn->ns = ns;
	kn->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (key) {
		lockdep_init_map(&kn->dep_map, "kn->active", key, 0);
		kn->flags |= KERNFS_LOCKDEP;
	}
#endif

	/*
	 * kn->attr.ops is accessible only while holding active ref.  We
	 * need to know whether some ops are implemented outside active
	 * ref.  Cache their existence in flags.
	 */
	if (ops->seq_show)
		kn->flags |= KERNFS_HAS_SEQ_SHOW;
	if (ops->mmap)
		kn->flags |= KERNFS_HAS_MMAP;
	if (ops->release)
		kn->flags |= KERNFS_HAS_RELEASE;

	rc = kernfs_add_one(kn);
	if (rc) {
		kernfs_put(kn);
		return ERR_PTR(rc);
	}
	return kn;
}
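
#if 0
/*
 * Illustrative sketch: creating a file with __kernfs_create_file().
 * "example_ro_ops" refers to the hypothetical ops from the earlier
 * seq_show sketch and "parent" is assumed to be an existing kernfs
 * directory; most callers go through wrappers such as sysfs instead of
 * calling this function directly.
 */
static struct lock_class_key example_key;

static struct kernfs_node *example_add_file(struct kernfs_node *parent,
					    void *priv)
{
	return __kernfs_create_file(parent, "example", 0444,
				    GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
				    PAGE_SIZE, &example_ro_ops, priv,
				    NULL, &example_key);
}
#endif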