Back to home page

LXR

 
 

    


0001 /*
0002  *  fs/eventfd.c
0003  *
0004  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
0005  *
0006  */
0007 
0008 #include <linux/file.h>
0009 #include <linux/poll.h>
0010 #include <linux/init.h>
0011 #include <linux/fs.h>
0012 #include <linux/sched.h>
0013 #include <linux/kernel.h>
0014 #include <linux/slab.h>
0015 #include <linux/list.h>
0016 #include <linux/spinlock.h>
0017 #include <linux/anon_inodes.h>
0018 #include <linux/syscalls.h>
0019 #include <linux/export.h>
0020 #include <linux/kref.h>
0021 #include <linux/eventfd.h>
0022 #include <linux/proc_fs.h>
0023 #include <linux/seq_file.h>
0024 
/* Per-eventfd state, shared by the file and any kernel-side references. */
struct eventfd_ctx {
	struct kref kref;	/* refcount: file + eventfd_ctx_get() users */
	wait_queue_head_t wqh;	/* readers/writers/pollers wait here */
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	/* EFD_* creation flags; EFD_SEMAPHORE alters read(2) semantics. */
	unsigned int flags;
};
0039 
0040 /**
0041  * eventfd_signal - Adds @n to the eventfd counter.
0042  * @ctx: [in] Pointer to the eventfd context.
0043  * @n: [in] Value of the counter to be added to the eventfd internal counter.
0044  *          The value cannot be negative.
0045  *
0046  * This function is supposed to be called by the kernel in paths that do not
0047  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
0048  * value, and we signal this as overflow condition by returning a POLLERR
0049  * to poll(2).
0050  *
0051  * Returns the amount by which the counter was incremented.  This will be less
0052  * than @n if the counter has overflowed.
0053  */
0054 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
0055 {
0056     unsigned long flags;
0057 
0058     spin_lock_irqsave(&ctx->wqh.lock, flags);
0059     if (ULLONG_MAX - ctx->count < n)
0060         n = ULLONG_MAX - ctx->count;
0061     ctx->count += n;
0062     if (waitqueue_active(&ctx->wqh))
0063         wake_up_locked_poll(&ctx->wqh, POLLIN);
0064     spin_unlock_irqrestore(&ctx->wqh.lock, flags);
0065 
0066     return n;
0067 }
0068 EXPORT_SYMBOL_GPL(eventfd_signal);
0069 
/* Release the context storage; called once the last reference is gone. */
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	kfree(ctx);
}
0074 
0075 static void eventfd_free(struct kref *kref)
0076 {
0077     struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
0078 
0079     eventfd_free_ctx(ctx);
0080 }
0081 
/**
 * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
 * @ctx: [in] Pointer to the eventfd context.
 *
 * Returns: In case of success, returns a pointer to the eventfd context.
 *
 * The reference must be released with eventfd_ctx_put().
 */
struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
{
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_get);
0094 
/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_get() or eventfd_ctx_fdget(). When the last reference
 * is dropped, the context is freed (see eventfd_free()).
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);
0107 
/*
 * Called when the last file-table reference to the eventfd file is
 * dropped: notify poll(2) waiters with POLLHUP, then drop the file's
 * reference on the context. Kernel users holding their own reference
 * (via eventfd_ctx_get/fdget) keep the context alive past this point.
 */
static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, POLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}
0116 
/* poll(2) callback: report readable/writable/overflow state of the counter. */
static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	unsigned int events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read be ordered properly
	 * against the writes.  The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	/* Readable whenever at least one event is pending. */
	if (count > 0)
		events |= POLLIN;
	/* Saturated counter signals overflow (see eventfd_signal()). */
	if (count == ULLONG_MAX)
		events |= POLLERR;
	/* Writable while a write of at least 1 can still succeed. */
	if (ULLONG_MAX - 1 > count)
		events |= POLLOUT;

	return events;
}
0174 
0175 static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
0176 {
0177     *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
0178     ctx->count -= *cnt;
0179 }
0180 
/**
 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.  The lock is taken here, so
 * the caller must NOT hold ctx->wqh.lock.
 *
 * NOTE(review): in EFD_SEMAPHORE mode with count == 0, eventfd_ctx_do_read()
 * still sets *cnt = 1 and decrements, wrapping count to ULLONG_MAX — confirm
 * callers only invoke this after an actual wakeup.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	/* Counter was consumed: tell blocked writers there is room again. */
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, POLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
0209 
/**
 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
 * @ctx: [in] Pointer to eventfd context.
 * @no_wait: [in] Different from zero if the operation should not block.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
 * -ERESTARTSYS : A signal interrupted the wait operation.
 *
 * If @no_wait is zero, the function might sleep until the eventfd internal
 * counter becomes greater than zero.
 */
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{
	ssize_t res;
	DECLARE_WAITQUEUE(wait, current);

	spin_lock_irq(&ctx->wqh.lock);
	*cnt = 0;
	res = -EAGAIN;
	if (ctx->count > 0)
		res = 0;
	else if (!no_wait) {
		/* Counter is zero and we may block: wait for a writer/signal. */
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Re-check under the lock after every wakeup. */
			if (ctx->count > 0) {
				res = 0;
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			/* Drop the lock while sleeping so writers can run. */
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res == 0)) {
		/* Consume the counter and let blocked writers make progress. */
		eventfd_ctx_do_read(ctx, cnt);
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, POLLOUT);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_read);
0263 
0264 static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
0265                 loff_t *ppos)
0266 {
0267     struct eventfd_ctx *ctx = file->private_data;
0268     ssize_t res;
0269     __u64 cnt;
0270 
0271     if (count < sizeof(cnt))
0272         return -EINVAL;
0273     res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
0274     if (res < 0)
0275         return res;
0276 
0277     return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
0278 }
0279 
/*
 * write(2) entry point: add the supplied 64-bit value to the counter,
 * blocking (unless O_NONBLOCK) while the addition would push the counter
 * to or past ULLONG_MAX.  Writing ULLONG_MAX itself is rejected since
 * that value is reserved to signal overflow.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	/* Room for ucnt without saturating the counter? */
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		/* No room and we may block: wait for readers to drain. */
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Re-check under the lock after every wakeup. */
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			/* Drop the lock while sleeping so readers can run. */
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		/* Counter went up: wake readers/pollers waiting for POLLIN. */
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, POLLIN);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}
0326 
#ifdef CONFIG_PROC_FS
/* /proc/<pid>/fdinfo/<fd>: expose the current counter value (hex). */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	/* Take the lock so we print a consistent snapshot of count. */
	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
}
#endif
0338 
/* File operations backing every eventfd file (see eventfd_file_create()). */
static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read		= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};
0349 
0350 /**
0351  * eventfd_fget - Acquire a reference of an eventfd file descriptor.
0352  * @fd: [in] Eventfd file descriptor.
0353  *
0354  * Returns a pointer to the eventfd file structure in case of success, or the
0355  * following error pointer:
0356  *
0357  * -EBADF    : Invalid @fd file descriptor.
0358  * -EINVAL   : The @fd file descriptor is not an eventfd file.
0359  */
0360 struct file *eventfd_fget(int fd)
0361 {
0362     struct file *file;
0363 
0364     file = fget(fd);
0365     if (!file)
0366         return ERR_PTR(-EBADF);
0367     if (file->f_op != &eventfd_fops) {
0368         fput(file);
0369         return ERR_PTR(-EINVAL);
0370     }
0371 
0372     return file;
0373 }
0374 EXPORT_SYMBOL_GPL(eventfd_fget);
0375 
/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 *
 * The context reference is taken before the fd reference is dropped, so the
 * returned context stays valid and must be released with eventfd_ctx_put().
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);
	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
0396 
/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file is not an eventfd file.
 *
 * The reference must be released with eventfd_ctx_put().
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	/* Reject files that are not backed by eventfd_fops. */
	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	return eventfd_ctx_get(file->private_data);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
0414 
/**
 * eventfd_file_create - Creates an eventfd file pointer.
 * @count: Initial eventfd counter value.
 * @flags: Flags for the eventfd file.
 *
 * This function creates an eventfd file pointer, w/out installing it into
 * the fd table. This is useful when the eventfd file is used during the
 * initialization of data structures that require extra setup after the eventfd
 * creation. So the eventfd creation is split into the file pointer creation
 * phase, and the file descriptor installation phase.
 * In this way races with userspace closing the newly installed file descriptor
 * can be avoided.
 * Returns an eventfd file pointer, or a proper error pointer.
 */
struct file *eventfd_file_create(unsigned int count, int flags)
{
	struct file *file;
	struct eventfd_ctx *ctx;

	/* Check the EFD_* constants for consistency.  */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	/* Reject any flag bits outside the supported EFD_* set. */
	if (flags & ~EFD_FLAGS_SET)
		return ERR_PTR(-EINVAL);

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;

	/* O_CLOEXEC/O_NONBLOCK propagate to the anon inode file directly. */
	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
				  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
	if (IS_ERR(file))
		eventfd_free_ctx(ctx);

	return file;
}
0457 
0458 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
0459 {
0460     int fd, error;
0461     struct file *file;
0462 
0463     error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
0464     if (error < 0)
0465         return error;
0466     fd = error;
0467 
0468     file = eventfd_file_create(count, flags);
0469     if (IS_ERR(file)) {
0470         error = PTR_ERR(file);
0471         goto err_put_unused_fd;
0472     }
0473     fd_install(fd, file);
0474 
0475     return fd;
0476 
0477 err_put_unused_fd:
0478     put_unused_fd(fd);
0479 
0480     return error;
0481 }
0482 
/* Legacy eventfd(2): equivalent to eventfd2(2) with no flags. */
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return sys_eventfd2(count, 0);
}
0487