Back to home page

LXR

 
 

    


0001 /*
0002  *  fs/eventpoll.c (Efficient event retrieval implementation)
0003  *  Copyright (C) 2001,...,2009  Davide Libenzi
0004  *
0005  *  This program is free software; you can redistribute it and/or modify
0006  *  it under the terms of the GNU General Public License as published by
0007  *  the Free Software Foundation; either version 2 of the License, or
0008  *  (at your option) any later version.
0009  *
0010  *  Davide Libenzi <davidel@xmailserver.org>
0011  *
0012  */
0013 
0014 #include <linux/init.h>
0015 #include <linux/kernel.h>
0016 #include <linux/sched.h>
0017 #include <linux/fs.h>
0018 #include <linux/file.h>
0019 #include <linux/signal.h>
0020 #include <linux/errno.h>
0021 #include <linux/mm.h>
0022 #include <linux/slab.h>
0023 #include <linux/poll.h>
0024 #include <linux/string.h>
0025 #include <linux/list.h>
0026 #include <linux/hash.h>
0027 #include <linux/spinlock.h>
0028 #include <linux/syscalls.h>
0029 #include <linux/rbtree.h>
0030 #include <linux/wait.h>
0031 #include <linux/eventpoll.h>
0032 #include <linux/mount.h>
0033 #include <linux/bitops.h>
0034 #include <linux/mutex.h>
0035 #include <linux/anon_inodes.h>
0036 #include <linux/device.h>
0037 #include <linux/uaccess.h>
0038 #include <asm/io.h>
0039 #include <asm/mman.h>
0040 #include <linux/atomic.h>
0041 #include <linux/proc_fs.h>
0042 #include <linux/seq_file.h>
0043 #include <linux/compat.h>
0044 #include <linux/rculist.h>
0045 
0046 /*
0047  * LOCKING:
0048  * There are three level of locking required by epoll :
0049  *
0050  * 1) epmutex (mutex)
0051  * 2) ep->mtx (mutex)
0052  * 3) ep->lock (spinlock)
0053  *
0054  * The acquire order is the one listed above, from 1 to 3.
0055  * We need a spinlock (ep->lock) because we manipulate objects
0056  * from inside the poll callback, that might be triggered from
0057  * a wake_up() that in turn might be called from IRQ context.
0058  * So we can't sleep inside the poll callback and hence we need
0059  * a spinlock. During the event transfer loop (from kernel to
0060  * user space) we could end up sleeping due a copy_to_user(), so
0061  * we need a lock that will allow us to sleep. This lock is a
0062  * mutex (ep->mtx). It is acquired during the event transfer loop,
0063  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
0064  * Then we also need a global mutex to serialize eventpoll_release_file()
0065  * and ep_free().
0066  * This mutex is acquired by ep_free() during the epoll file
0067  * cleanup path and it is also acquired by eventpoll_release_file()
0068  * if a file has been pushed inside an epoll set and it is then
0069  * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
0070  * It is also acquired when inserting an epoll fd onto another epoll
0071  * fd. We do this so that we walk the epoll tree and ensure that this
0072  * insertion does not create a cycle of epoll file descriptors, which
0073  * could lead to deadlock. We need a global mutex to prevent two
0074  * simultaneous inserts (A into B and B into A) from racing and
0075  * constructing a cycle without either insert observing that it is
0076  * going to.
0077  * It is necessary to acquire multiple "ep->mtx"es at once in the
0078  * case when one epoll fd is added to another. In this case, we
0079  * always acquire the locks in the order of nesting (i.e. after
0080  * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
0081  * before e2->mtx). Since we disallow cycles of epoll file
0082  * descriptors, this ensures that the mutexes are well-ordered. In
0083  * order to communicate this nesting to lockdep, when walking a tree
0084  * of epoll file descriptors, we use the current recursion depth as
0085  * the lockdep subkey.
0086  * It is possible to drop the "ep->mtx" and to use the global
0087  * mutex "epmutex" (together with "ep->lock") to have it working,
0088  * but having "ep->mtx" will make the interface more scalable.
0089  * Events that require holding "epmutex" are very rare, while for
0090  * normal operations the epoll private "ep->mtx" will guarantee
0091  * a better scalability.
0092  */
0093 
0094 /* Epoll private bits inside the event mask */
0095 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
0096 
0097 #define EPOLLINOUT_BITS (POLLIN | POLLOUT)
0098 
0099 #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
0100                 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
0101 
0102 /* Maximum number of nesting allowed inside epoll sets */
0103 #define EP_MAX_NESTS 4
0104 
0105 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
0106 
0107 #define EP_UNACTIVE_PTR ((void *) -1L)
0108 
0109 #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
0110 
0111 struct epoll_filefd {
0112     struct file *file;
0113     int fd;
0114 } __packed;
0115 
0116 /*
0117  * Structure used to track possible nested calls, for too deep recursions
0118  * and loop cycles.
0119  */
0120 struct nested_call_node {
0121     struct list_head llink;
0122     void *cookie;
0123     void *ctx;
0124 };
0125 
0126 /*
0127  * This structure is used as collector for nested calls, to check for
0128  * maximum recursion dept and loop cycles.
0129  */
0130 struct nested_calls {
0131     struct list_head tasks_call_list;
0132     spinlock_t lock;
0133 };
0134 
0135 /*
0136  * Each file descriptor added to the eventpoll interface will
0137  * have an entry of this type linked to the "rbr" RB tree.
0138  * Avoid increasing the size of this struct, there can be many thousands
0139  * of these on a server and we do not want this to take another cache line.
0140  */
0141 struct epitem {
0142     union {
0143         /* RB tree node links this structure to the eventpoll RB tree */
0144         struct rb_node rbn;
0145         /* Used to free the struct epitem */
0146         struct rcu_head rcu;
0147     };
0148 
0149     /* List header used to link this structure to the eventpoll ready list */
0150     struct list_head rdllink;
0151 
0152     /*
0153      * Works together "struct eventpoll"->ovflist in keeping the
0154      * single linked chain of items.
0155      */
0156     struct epitem *next;
0157 
0158     /* The file descriptor information this item refers to */
0159     struct epoll_filefd ffd;
0160 
0161     /* Number of active wait queue attached to poll operations */
0162     int nwait;
0163 
0164     /* List containing poll wait queues */
0165     struct list_head pwqlist;
0166 
0167     /* The "container" of this item */
0168     struct eventpoll *ep;
0169 
0170     /* List header used to link this item to the "struct file" items list */
0171     struct list_head fllink;
0172 
0173     /* wakeup_source used when EPOLLWAKEUP is set */
0174     struct wakeup_source __rcu *ws;
0175 
0176     /* The structure that describe the interested events and the source fd */
0177     struct epoll_event event;
0178 };
0179 
0180 /*
0181  * This structure is stored inside the "private_data" member of the file
0182  * structure and represents the main data structure for the eventpoll
0183  * interface.
0184  */
0185 struct eventpoll {
0186     /* Protect the access to this structure */
0187     spinlock_t lock;
0188 
0189     /*
0190      * This mutex is used to ensure that files are not removed
0191      * while epoll is using them. This is held during the event
0192      * collection loop, the file cleanup path, the epoll file exit
0193      * code and the ctl operations.
0194      */
0195     struct mutex mtx;
0196 
0197     /* Wait queue used by sys_epoll_wait() */
0198     wait_queue_head_t wq;
0199 
0200     /* Wait queue used by file->poll() */
0201     wait_queue_head_t poll_wait;
0202 
0203     /* List of ready file descriptors */
0204     struct list_head rdllist;
0205 
0206     /* RB tree root used to store monitored fd structs */
0207     struct rb_root rbr;
0208 
0209     /*
0210      * This is a single linked list that chains all the "struct epitem" that
0211      * happened while transferring ready events to userspace w/out
0212      * holding ->lock.
0213      */
0214     struct epitem *ovflist;
0215 
0216     /* wakeup_source used when ep_scan_ready_list is running */
0217     struct wakeup_source *ws;
0218 
0219     /* The user that created the eventpoll descriptor */
0220     struct user_struct *user;
0221 
0222     struct file *file;
0223 
0224     /* used to optimize loop detection check */
0225     int visited;
0226     struct list_head visited_list_link;
0227 };
0228 
0229 /* Wait structure used by the poll hooks */
0230 struct eppoll_entry {
0231     /* List header used to link this structure to the "struct epitem" */
0232     struct list_head llink;
0233 
0234     /* The "base" pointer is set to the container "struct epitem" */
0235     struct epitem *base;
0236 
0237     /*
0238      * Wait queue item that will be linked to the target file wait
0239      * queue head.
0240      */
0241     wait_queue_t wait;
0242 
0243     /* The wait queue head that linked the "wait" wait queue item */
0244     wait_queue_head_t *whead;
0245 };
0246 
0247 /* Wrapper struct used by poll queueing */
0248 struct ep_pqueue {
0249     poll_table pt;
0250     struct epitem *epi;
0251 };
0252 
0253 /* Used by the ep_send_events() function as callback private data */
0254 struct ep_send_events_data {
0255     int maxevents;
0256     struct epoll_event __user *events;
0257 };
0258 
0259 /*
0260  * Configuration options available inside /proc/sys/fs/epoll/
0261  */
0262 /* Maximum number of epoll watched descriptors, per user */
0263 static long max_user_watches __read_mostly;
0264 
0265 /*
0266  * This mutex is used to serialize ep_free() and eventpoll_release_file().
0267  */
0268 static DEFINE_MUTEX(epmutex);
0269 
0270 /* Used to check for epoll file descriptor inclusion loops */
0271 static struct nested_calls poll_loop_ncalls;
0272 
0273 /* Used for safe wake up implementation */
0274 static struct nested_calls poll_safewake_ncalls;
0275 
0276 /* Used to call file's f_op->poll() under the nested calls boundaries */
0277 static struct nested_calls poll_readywalk_ncalls;
0278 
0279 /* Slab cache used to allocate "struct epitem" */
0280 static struct kmem_cache *epi_cache __read_mostly;
0281 
0282 /* Slab cache used to allocate "struct eppoll_entry" */
0283 static struct kmem_cache *pwq_cache __read_mostly;
0284 
0285 /* Visited nodes during ep_loop_check(), so we can unset them when we finish */
0286 static LIST_HEAD(visited_list);
0287 
0288 /*
0289  * List of files with newly added links, where we may need to limit the number
0290  * of emanating paths. Protected by the epmutex.
0291  */
0292 static LIST_HEAD(tfile_check_list);
0293 
0294 #ifdef CONFIG_SYSCTL
0295 
0296 #include <linux/sysctl.h>
0297 
0298 static long zero;
0299 static long long_max = LONG_MAX;
0300 
0301 struct ctl_table epoll_table[] = {
0302     {
0303         .procname   = "max_user_watches",
0304         .data       = &max_user_watches,
0305         .maxlen     = sizeof(max_user_watches),
0306         .mode       = 0644,
0307         .proc_handler   = proc_doulongvec_minmax,
0308         .extra1     = &zero,
0309         .extra2     = &long_max,
0310     },
0311     { }
0312 };
0313 #endif /* CONFIG_SYSCTL */
0314 
0315 static const struct file_operations eventpoll_fops;
0316 
0317 static inline int is_file_epoll(struct file *f)
0318 {
0319     return f->f_op == &eventpoll_fops;
0320 }
0321 
0322 /* Setup the structure that is used as key for the RB tree */
0323 static inline void ep_set_ffd(struct epoll_filefd *ffd,
0324                   struct file *file, int fd)
0325 {
0326     ffd->file = file;
0327     ffd->fd = fd;
0328 }
0329 
0330 /* Compare RB tree keys */
0331 static inline int ep_cmp_ffd(struct epoll_filefd *p1,
0332                  struct epoll_filefd *p2)
0333 {
0334     return (p1->file > p2->file ? +1:
0335             (p1->file < p2->file ? -1 : p1->fd - p2->fd));
0336 }
0337 
0338 /* Tells us if the item is currently linked */
0339 static inline int ep_is_linked(struct list_head *p)
0340 {
0341     return !list_empty(p);
0342 }
0343 
0344 static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
0345 {
0346     return container_of(p, struct eppoll_entry, wait);
0347 }
0348 
0349 /* Get the "struct epitem" from a wait queue pointer */
0350 static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
0351 {
0352     return container_of(p, struct eppoll_entry, wait)->base;
0353 }
0354 
0355 /* Get the "struct epitem" from an epoll queue wrapper */
0356 static inline struct epitem *ep_item_from_epqueue(poll_table *p)
0357 {
0358     return container_of(p, struct ep_pqueue, pt)->epi;
0359 }
0360 
0361 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
0362 static inline int ep_op_has_event(int op)
0363 {
0364     return op != EPOLL_CTL_DEL;
0365 }
0366 
0367 /* Initialize the poll safe wake up structure */
0368 static void ep_nested_calls_init(struct nested_calls *ncalls)
0369 {
0370     INIT_LIST_HEAD(&ncalls->tasks_call_list);
0371     spin_lock_init(&ncalls->lock);
0372 }
0373 
0374 /**
0375  * ep_events_available - Checks if ready events might be available.
0376  *
0377  * @ep: Pointer to the eventpoll context.
0378  *
0379  * Returns: Returns a value different than zero if ready events are available,
0380  *          or zero otherwise.
0381  */
0382 static inline int ep_events_available(struct eventpoll *ep)
0383 {
0384     return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
0385 }
0386 
0387 /**
0388  * ep_call_nested - Perform a bound (possibly) nested call, by checking
0389  *                  that the recursion limit is not exceeded, and that
0390  *                  the same nested call (by the meaning of same cookie) is
0391  *                  no re-entered.
0392  *
0393  * @ncalls: Pointer to the nested_calls structure to be used for this call.
0394  * @max_nests: Maximum number of allowed nesting calls.
0395  * @nproc: Nested call core function pointer.
0396  * @priv: Opaque data to be passed to the @nproc callback.
0397  * @cookie: Cookie to be used to identify this nested call.
0398  * @ctx: This instance context.
0399  *
0400  * Returns: Returns the code returned by the @nproc callback, or -1 if
0401  *          the maximum recursion limit has been exceeded.
0402  */
0403 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
0404               int (*nproc)(void *, void *, int), void *priv,
0405               void *cookie, void *ctx)
0406 {
0407     int error, call_nests = 0;
0408     unsigned long flags;
0409     struct list_head *lsthead = &ncalls->tasks_call_list;
0410     struct nested_call_node *tncur;
0411     struct nested_call_node tnode;
0412 
0413     spin_lock_irqsave(&ncalls->lock, flags);
0414 
0415     /*
0416      * Try to see if the current task is already inside this wakeup call.
0417      * We use a list here, since the population inside this set is always
0418      * very much limited.
0419      */
0420     list_for_each_entry(tncur, lsthead, llink) {
0421         if (tncur->ctx == ctx &&
0422             (tncur->cookie == cookie || ++call_nests > max_nests)) {
0423             /*
0424              * Ops ... loop detected or maximum nest level reached.
0425              * We abort this wake by breaking the cycle itself.
0426              */
0427             error = -1;
0428             goto out_unlock;
0429         }
0430     }
0431 
0432     /* Add the current task and cookie to the list */
0433     tnode.ctx = ctx;
0434     tnode.cookie = cookie;
0435     list_add(&tnode.llink, lsthead);
0436 
0437     spin_unlock_irqrestore(&ncalls->lock, flags);
0438 
0439     /* Call the nested function */
0440     error = (*nproc)(priv, cookie, call_nests);
0441 
0442     /* Remove the current task from the list */
0443     spin_lock_irqsave(&ncalls->lock, flags);
0444     list_del(&tnode.llink);
0445 out_unlock:
0446     spin_unlock_irqrestore(&ncalls->lock, flags);
0447 
0448     return error;
0449 }
0450 
0451 /*
0452  * As described in commit 0ccf831cb lockdep: annotate epoll
0453  * the use of wait queues used by epoll is done in a very controlled
0454  * manner. Wake ups can nest inside each other, but are never done
0455  * with the same locking. For example:
0456  *
0457  *   dfd = socket(...);
0458  *   efd1 = epoll_create();
0459  *   efd2 = epoll_create();
0460  *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
0461  *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
0462  *
0463  * When a packet arrives to the device underneath "dfd", the net code will
0464  * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
0465  * callback wakeup entry on that queue, and the wake_up() performed by the
0466  * "dfd" net code will end up in ep_poll_callback(). At this point epoll
0467  * (efd1) notices that it may have some event ready, so it needs to wake up
0468  * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
0469  * that ends up in another wake_up(), after having checked about the
0470  * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
0471  * avoid stack blasting.
0472  *
0473  * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
0474  * this special case of epoll.
0475  */
0476 #ifdef CONFIG_DEBUG_LOCK_ALLOC
0477 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
0478                      unsigned long events, int subclass)
0479 {
0480     unsigned long flags;
0481 
0482     spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
0483     wake_up_locked_poll(wqueue, events);
0484     spin_unlock_irqrestore(&wqueue->lock, flags);
0485 }
0486 #else
0487 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
0488                      unsigned long events, int subclass)
0489 {
0490     wake_up_poll(wqueue, events);
0491 }
0492 #endif
0493 
0494 static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
0495 {
0496     ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
0497               1 + call_nests);
0498     return 0;
0499 }
0500 
0501 /*
0502  * Perform a safe wake up of the poll wait list. The problem is that
0503  * with the new callback'd wake up system, it is possible that the
0504  * poll callback is reentered from inside the call to wake_up() done
0505  * on the poll wait queue head. The rule is that we cannot reenter the
0506  * wake up code from the same task more than EP_MAX_NESTS times,
0507  * and we cannot reenter the same wait queue head at all. This will
0508  * enable to have a hierarchy of epoll file descriptor of no more than
0509  * EP_MAX_NESTS deep.
0510  */
0511 static void ep_poll_safewake(wait_queue_head_t *wq)
0512 {
0513     int this_cpu = get_cpu();
0514 
0515     ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
0516                ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
0517 
0518     put_cpu();
0519 }
0520 
0521 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
0522 {
0523     wait_queue_head_t *whead;
0524 
0525     rcu_read_lock();
0526     /* If it is cleared by POLLFREE, it should be rcu-safe */
0527     whead = rcu_dereference(pwq->whead);
0528     if (whead)
0529         remove_wait_queue(whead, &pwq->wait);
0530     rcu_read_unlock();
0531 }
0532 
0533 /*
0534  * This function unregisters poll callbacks from the associated file
0535  * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
0536  * ep_free).
0537  */
0538 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
0539 {
0540     struct list_head *lsthead = &epi->pwqlist;
0541     struct eppoll_entry *pwq;
0542 
0543     while (!list_empty(lsthead)) {
0544         pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
0545 
0546         list_del(&pwq->llink);
0547         ep_remove_wait_queue(pwq);
0548         kmem_cache_free(pwq_cache, pwq);
0549     }
0550 }
0551 
0552 /* call only when ep->mtx is held */
0553 static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
0554 {
0555     return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
0556 }
0557 
0558 /* call only when ep->mtx is held */
0559 static inline void ep_pm_stay_awake(struct epitem *epi)
0560 {
0561     struct wakeup_source *ws = ep_wakeup_source(epi);
0562 
0563     if (ws)
0564         __pm_stay_awake(ws);
0565 }
0566 
0567 static inline bool ep_has_wakeup_source(struct epitem *epi)
0568 {
0569     return rcu_access_pointer(epi->ws) ? true : false;
0570 }
0571 
0572 /* call when ep->mtx cannot be held (ep_poll_callback) */
0573 static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
0574 {
0575     struct wakeup_source *ws;
0576 
0577     rcu_read_lock();
0578     ws = rcu_dereference(epi->ws);
0579     if (ws)
0580         __pm_stay_awake(ws);
0581     rcu_read_unlock();
0582 }
0583 
0584 /**
0585  * ep_scan_ready_list - Scans the ready list in a way that makes possible for
0586  *                      the scan code, to call f_op->poll(). Also allows for
0587  *                      O(NumReady) performance.
0588  *
0589  * @ep: Pointer to the epoll private data structure.
0590  * @sproc: Pointer to the scan callback.
0591  * @priv: Private opaque data passed to the @sproc callback.
0592  * @depth: The current depth of recursive f_op->poll calls.
0593  * @ep_locked: caller already holds ep->mtx
0594  *
0595  * Returns: The same integer error code returned by the @sproc callback.
0596  */
0597 static int ep_scan_ready_list(struct eventpoll *ep,
0598                   int (*sproc)(struct eventpoll *,
0599                        struct list_head *, void *),
0600                   void *priv, int depth, bool ep_locked)
0601 {
0602     int error, pwake = 0;
0603     unsigned long flags;
0604     struct epitem *epi, *nepi;
0605     LIST_HEAD(txlist);
0606 
0607     /*
0608      * We need to lock this because we could be hit by
0609      * eventpoll_release_file() and epoll_ctl().
0610      */
0611 
0612     if (!ep_locked)
0613         mutex_lock_nested(&ep->mtx, depth);
0614 
0615     /*
0616      * Steal the ready list, and re-init the original one to the
0617      * empty list. Also, set ep->ovflist to NULL so that events
0618      * happening while looping w/out locks, are not lost. We cannot
0619      * have the poll callback to queue directly on ep->rdllist,
0620      * because we want the "sproc" callback to be able to do it
0621      * in a lockless way.
0622      */
0623     spin_lock_irqsave(&ep->lock, flags);
0624     list_splice_init(&ep->rdllist, &txlist);
0625     ep->ovflist = NULL;
0626     spin_unlock_irqrestore(&ep->lock, flags);
0627 
0628     /*
0629      * Now call the callback function.
0630      */
0631     error = (*sproc)(ep, &txlist, priv);
0632 
0633     spin_lock_irqsave(&ep->lock, flags);
0634     /*
0635      * During the time we spent inside the "sproc" callback, some
0636      * other events might have been queued by the poll callback.
0637      * We re-insert them inside the main ready-list here.
0638      */
0639     for (nepi = ep->ovflist; (epi = nepi) != NULL;
0640          nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
0641         /*
0642          * We need to check if the item is already in the list.
0643          * During the "sproc" callback execution time, items are
0644          * queued into ->ovflist but the "txlist" might already
0645          * contain them, and the list_splice() below takes care of them.
0646          */
0647         if (!ep_is_linked(&epi->rdllink)) {
0648             list_add_tail(&epi->rdllink, &ep->rdllist);
0649             ep_pm_stay_awake(epi);
0650         }
0651     }
0652     /*
0653      * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
0654      * releasing the lock, events will be queued in the normal way inside
0655      * ep->rdllist.
0656      */
0657     ep->ovflist = EP_UNACTIVE_PTR;
0658 
0659     /*
0660      * Quickly re-inject items left on "txlist".
0661      */
0662     list_splice(&txlist, &ep->rdllist);
0663     __pm_relax(ep->ws);
0664 
0665     if (!list_empty(&ep->rdllist)) {
0666         /*
0667          * Wake up (if active) both the eventpoll wait list and
0668          * the ->poll() wait list (delayed after we release the lock).
0669          */
0670         if (waitqueue_active(&ep->wq))
0671             wake_up_locked(&ep->wq);
0672         if (waitqueue_active(&ep->poll_wait))
0673             pwake++;
0674     }
0675     spin_unlock_irqrestore(&ep->lock, flags);
0676 
0677     if (!ep_locked)
0678         mutex_unlock(&ep->mtx);
0679 
0680     /* We have to call this outside the lock */
0681     if (pwake)
0682         ep_poll_safewake(&ep->poll_wait);
0683 
0684     return error;
0685 }
0686 
0687 static void epi_rcu_free(struct rcu_head *head)
0688 {
0689     struct epitem *epi = container_of(head, struct epitem, rcu);
0690     kmem_cache_free(epi_cache, epi);
0691 }
0692 
0693 /*
0694  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
0695  * all the associated resources. Must be called with "mtx" held.
0696  */
0697 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
0698 {
0699     unsigned long flags;
0700     struct file *file = epi->ffd.file;
0701 
0702     /*
0703      * Removes poll wait queue hooks. We _have_ to do this without holding
0704      * the "ep->lock" otherwise a deadlock might occur. This because of the
0705      * sequence of the lock acquisition. Here we do "ep->lock" then the wait
0706      * queue head lock when unregistering the wait queue. The wakeup callback
0707      * will run by holding the wait queue head lock and will call our callback
0708      * that will try to get "ep->lock".
0709      */
0710     ep_unregister_pollwait(ep, epi);
0711 
0712     /* Remove the current item from the list of epoll hooks */
0713     spin_lock(&file->f_lock);
0714     list_del_rcu(&epi->fllink);
0715     spin_unlock(&file->f_lock);
0716 
0717     rb_erase(&epi->rbn, &ep->rbr);
0718 
0719     spin_lock_irqsave(&ep->lock, flags);
0720     if (ep_is_linked(&epi->rdllink))
0721         list_del_init(&epi->rdllink);
0722     spin_unlock_irqrestore(&ep->lock, flags);
0723 
0724     wakeup_source_unregister(ep_wakeup_source(epi));
0725     /*
0726      * At this point it is safe to free the eventpoll item. Use the union
0727      * field epi->rcu, since we are trying to minimize the size of
0728      * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
0729      * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
0730      * use of the rbn field.
0731      */
0732     call_rcu(&epi->rcu, epi_rcu_free);
0733 
0734     atomic_long_dec(&ep->user->epoll_watches);
0735 
0736     return 0;
0737 }
0738 
0739 static void ep_free(struct eventpoll *ep)
0740 {
0741     struct rb_node *rbp;
0742     struct epitem *epi;
0743 
0744     /* We need to release all tasks waiting for these file */
0745     if (waitqueue_active(&ep->poll_wait))
0746         ep_poll_safewake(&ep->poll_wait);
0747 
0748     /*
0749      * We need to lock this because we could be hit by
0750      * eventpoll_release_file() while we're freeing the "struct eventpoll".
0751      * We do not need to hold "ep->mtx" here because the epoll file
0752      * is on the way to be removed and no one has references to it
0753      * anymore. The only hit might come from eventpoll_release_file() but
0754      * holding "epmutex" is sufficient here.
0755      */
0756     mutex_lock(&epmutex);
0757 
0758     /*
0759      * Walks through the whole tree by unregistering poll callbacks.
0760      */
0761     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
0762         epi = rb_entry(rbp, struct epitem, rbn);
0763 
0764         ep_unregister_pollwait(ep, epi);
0765         cond_resched();
0766     }
0767 
0768     /*
0769      * Walks through the whole tree by freeing each "struct epitem". At this
0770      * point we are sure no poll callbacks will be lingering around, and also by
0771      * holding "epmutex" we can be sure that no file cleanup code will hit
0772      * us during this operation. So we can avoid the lock on "ep->lock".
0773      * We do not need to lock ep->mtx, either, we only do it to prevent
0774      * a lockdep warning.
0775      */
0776     mutex_lock(&ep->mtx);
0777     while ((rbp = rb_first(&ep->rbr)) != NULL) {
0778         epi = rb_entry(rbp, struct epitem, rbn);
0779         ep_remove(ep, epi);
0780         cond_resched();
0781     }
0782     mutex_unlock(&ep->mtx);
0783 
0784     mutex_unlock(&epmutex);
0785     mutex_destroy(&ep->mtx);
0786     free_uid(ep->user);
0787     wakeup_source_unregister(ep->ws);
0788     kfree(ep);
0789 }
0790 
0791 static int ep_eventpoll_release(struct inode *inode, struct file *file)
0792 {
0793     struct eventpoll *ep = file->private_data;
0794 
0795     if (ep)
0796         ep_free(ep);
0797 
0798     return 0;
0799 }
0800 
0801 static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
0802 {
0803     pt->_key = epi->event.events;
0804 
0805     return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
0806 }
0807 
0808 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
0809                    void *priv)
0810 {
0811     struct epitem *epi, *tmp;
0812     poll_table pt;
0813 
0814     init_poll_funcptr(&pt, NULL);
0815 
0816     list_for_each_entry_safe(epi, tmp, head, rdllink) {
0817         if (ep_item_poll(epi, &pt))
0818             return POLLIN | POLLRDNORM;
0819         else {
0820             /*
0821              * Item has been dropped into the ready list by the poll
0822              * callback, but it's not actually ready, as far as
0823              * caller requested events goes. We can remove it here.
0824              */
0825             __pm_relax(ep_wakeup_source(epi));
0826             list_del_init(&epi->rdllink);
0827         }
0828     }
0829 
0830     return 0;
0831 }
0832 
0833 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
0834                  poll_table *pt);
0835 
0836 struct readyevents_arg {
0837     struct eventpoll *ep;
0838     bool locked;
0839 };
0840 
0841 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
0842 {
0843     struct readyevents_arg *arg = priv;
0844 
0845     return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
0846                   call_nests + 1, arg->locked);
0847 }
0848 
0849 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
0850 {
0851     int pollflags;
0852     struct eventpoll *ep = file->private_data;
0853     struct readyevents_arg arg;
0854 
0855     /*
0856      * During ep_insert() we already hold the ep->mtx for the tfile.
0857      * Prevent re-aquisition.
0858      */
0859     arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
0860     arg.ep = ep;
0861 
0862     /* Insert inside our poll wait queue */
0863     poll_wait(file, &ep->poll_wait, wait);
0864 
0865     /*
0866      * Proceed to find out if wanted events are really available inside
0867      * the ready list. This need to be done under ep_call_nested()
0868      * supervision, since the call to f_op->poll() done on listed files
0869      * could re-enter here.
0870      */
0871     pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
0872                    ep_poll_readyevents_proc, &arg, ep, current);
0873 
0874     return pollflags != -1 ? pollflags : 0;
0875 }
0876 
0877 #ifdef CONFIG_PROC_FS
0878 static void ep_show_fdinfo(struct seq_file *m, struct file *f)
0879 {
0880     struct eventpoll *ep = f->private_data;
0881     struct rb_node *rbp;
0882 
0883     mutex_lock(&ep->mtx);
0884     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
0885         struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
0886 
0887         seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
0888                epi->ffd.fd, epi->event.events,
0889                (long long)epi->event.data);
0890         if (seq_has_overflowed(m))
0891             break;
0892     }
0893     mutex_unlock(&ep->mtx);
0894 }
0895 #endif
0896 
0897 /* File callbacks that implement the eventpoll file behaviour */
0898 static const struct file_operations eventpoll_fops = {
0899 #ifdef CONFIG_PROC_FS
0900     .show_fdinfo    = ep_show_fdinfo,
0901 #endif
0902     .release    = ep_eventpoll_release,
0903     .poll       = ep_eventpoll_poll,
0904     .llseek     = noop_llseek,
0905 };
0906 
0907 /*
0908  * This is called from eventpoll_release() to unlink files from the eventpoll
0909  * interface. We need to have this facility to cleanup correctly files that are
0910  * closed without being removed from the eventpoll interface.
0911  */
0912 void eventpoll_release_file(struct file *file)
0913 {
0914     struct eventpoll *ep;
0915     struct epitem *epi, *next;
0916 
0917     /*
0918      * We don't want to get "file->f_lock" because it is not
0919      * necessary. It is not necessary because we're in the "struct file"
0920      * cleanup path, and this means that no one is using this file anymore.
0921      * So, for example, epoll_ctl() cannot hit here since if we reach this
0922      * point, the file counter already went to zero and fget() would fail.
0923      * The only hit might come from ep_free() but by holding the mutex
0924      * will correctly serialize the operation. We do need to acquire
0925      * "ep->mtx" after "epmutex" because ep_remove() requires it when called
0926      * from anywhere but ep_free().
0927      *
0928      * Besides, ep_remove() acquires the lock, so we can't hold it here.
0929      */
0930     mutex_lock(&epmutex);
0931     list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
0932         ep = epi->ep;
0933         mutex_lock_nested(&ep->mtx, 0);
0934         ep_remove(ep, epi);
0935         mutex_unlock(&ep->mtx);
0936     }
0937     mutex_unlock(&epmutex);
0938 }
0939 
0940 static int ep_alloc(struct eventpoll **pep)
0941 {
0942     int error;
0943     struct user_struct *user;
0944     struct eventpoll *ep;
0945 
0946     user = get_current_user();
0947     error = -ENOMEM;
0948     ep = kzalloc(sizeof(*ep), GFP_KERNEL);
0949     if (unlikely(!ep))
0950         goto free_uid;
0951 
0952     spin_lock_init(&ep->lock);
0953     mutex_init(&ep->mtx);
0954     init_waitqueue_head(&ep->wq);
0955     init_waitqueue_head(&ep->poll_wait);
0956     INIT_LIST_HEAD(&ep->rdllist);
0957     ep->rbr = RB_ROOT;
0958     ep->ovflist = EP_UNACTIVE_PTR;
0959     ep->user = user;
0960 
0961     *pep = ep;
0962 
0963     return 0;
0964 
0965 free_uid:
0966     free_uid(user);
0967     return error;
0968 }
0969 
0970 /*
0971  * Search the file inside the eventpoll tree. The RB tree operations
0972  * are protected by the "mtx" mutex, and ep_find() must be called with
0973  * "mtx" held.
0974  */
0975 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
0976 {
0977     int kcmp;
0978     struct rb_node *rbp;
0979     struct epitem *epi, *epir = NULL;
0980     struct epoll_filefd ffd;
0981 
0982     ep_set_ffd(&ffd, file, fd);
0983     for (rbp = ep->rbr.rb_node; rbp; ) {
0984         epi = rb_entry(rbp, struct epitem, rbn);
0985         kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
0986         if (kcmp > 0)
0987             rbp = rbp->rb_right;
0988         else if (kcmp < 0)
0989             rbp = rbp->rb_left;
0990         else {
0991             epir = epi;
0992             break;
0993         }
0994     }
0995 
0996     return epir;
0997 }
0998 
0999 /*
1000  * This is the callback that is passed to the wait queue wakeup
1001  * mechanism. It is called by the stored file descriptors when they
1002  * have events to report.
1003  */
1004 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
1005 {
1006     int pwake = 0;
1007     unsigned long flags;
1008     struct epitem *epi = ep_item_from_wait(wait);
1009     struct eventpoll *ep = epi->ep;
1010     int ewake = 0;
1011 
1012     if ((unsigned long)key & POLLFREE) {
1013         ep_pwq_from_wait(wait)->whead = NULL;
1014         /*
1015          * whead = NULL above can race with ep_remove_wait_queue()
1016          * which can do another remove_wait_queue() after us, so we
1017          * can't use __remove_wait_queue(). whead->lock is held by
1018          * the caller.
1019          */
1020         list_del_init(&wait->task_list);
1021     }
1022 
1023     spin_lock_irqsave(&ep->lock, flags);
1024 
1025     /*
1026      * If the event mask does not contain any poll(2) event, we consider the
1027      * descriptor to be disabled. This condition is likely the effect of the
1028      * EPOLLONESHOT bit that disables the descriptor when an event is received,
1029      * until the next EPOLL_CTL_MOD will be issued.
1030      */
1031     if (!(epi->event.events & ~EP_PRIVATE_BITS))
1032         goto out_unlock;
1033 
1034     /*
1035      * Check the events coming with the callback. At this stage, not
1036      * every device reports the events in the "key" parameter of the
1037      * callback. We need to be able to handle both cases here, hence the
1038      * test for "key" != NULL before the event match test.
1039      */
1040     if (key && !((unsigned long) key & epi->event.events))
1041         goto out_unlock;
1042 
1043     /*
1044      * If we are transferring events to userspace, we can hold no locks
1045      * (because we're accessing user memory, and because of linux f_op->poll()
1046      * semantics). All the events that happen during that period of time are
1047      * chained in ep->ovflist and requeued later on.
1048      */
1049     if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
1050         if (epi->next == EP_UNACTIVE_PTR) {
1051             epi->next = ep->ovflist;
1052             ep->ovflist = epi;
1053             if (epi->ws) {
1054                 /*
1055                  * Activate ep->ws since epi->ws may get
1056                  * deactivated at any time.
1057                  */
1058                 __pm_stay_awake(ep->ws);
1059             }
1060 
1061         }
1062         goto out_unlock;
1063     }
1064 
1065     /* If this file is already in the ready list we exit soon */
1066     if (!ep_is_linked(&epi->rdllink)) {
1067         list_add_tail(&epi->rdllink, &ep->rdllist);
1068         ep_pm_stay_awake_rcu(epi);
1069     }
1070 
1071     /*
1072      * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1073      * wait list.
1074      */
1075     if (waitqueue_active(&ep->wq)) {
1076         if ((epi->event.events & EPOLLEXCLUSIVE) &&
1077                     !((unsigned long)key & POLLFREE)) {
1078             switch ((unsigned long)key & EPOLLINOUT_BITS) {
1079             case POLLIN:
1080                 if (epi->event.events & POLLIN)
1081                     ewake = 1;
1082                 break;
1083             case POLLOUT:
1084                 if (epi->event.events & POLLOUT)
1085                     ewake = 1;
1086                 break;
1087             case 0:
1088                 ewake = 1;
1089                 break;
1090             }
1091         }
1092         wake_up_locked(&ep->wq);
1093     }
1094     if (waitqueue_active(&ep->poll_wait))
1095         pwake++;
1096 
1097 out_unlock:
1098     spin_unlock_irqrestore(&ep->lock, flags);
1099 
1100     /* We have to call this outside the lock */
1101     if (pwake)
1102         ep_poll_safewake(&ep->poll_wait);
1103 
1104     if (epi->event.events & EPOLLEXCLUSIVE)
1105         return ewake;
1106 
1107     return 1;
1108 }
1109 
1110 /*
1111  * This is the callback that is used to add our wait queue to the
1112  * target file wakeup lists.
1113  */
1114 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1115                  poll_table *pt)
1116 {
1117     struct epitem *epi = ep_item_from_epqueue(pt);
1118     struct eppoll_entry *pwq;
1119 
1120     if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1121         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1122         pwq->whead = whead;
1123         pwq->base = epi;
1124         if (epi->event.events & EPOLLEXCLUSIVE)
1125             add_wait_queue_exclusive(whead, &pwq->wait);
1126         else
1127             add_wait_queue(whead, &pwq->wait);
1128         list_add_tail(&pwq->llink, &epi->pwqlist);
1129         epi->nwait++;
1130     } else {
1131         /* We have to signal that an error occurred */
1132         epi->nwait = -1;
1133     }
1134 }
1135 
1136 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1137 {
1138     int kcmp;
1139     struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
1140     struct epitem *epic;
1141 
1142     while (*p) {
1143         parent = *p;
1144         epic = rb_entry(parent, struct epitem, rbn);
1145         kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1146         if (kcmp > 0)
1147             p = &parent->rb_right;
1148         else
1149             p = &parent->rb_left;
1150     }
1151     rb_link_node(&epi->rbn, parent, p);
1152     rb_insert_color(&epi->rbn, &ep->rbr);
1153 }
1154 
1155 
1156 
1157 #define PATH_ARR_SIZE 5
1158 /*
1159  * These are the number paths of length 1 to 5, that we are allowing to emanate
1160  * from a single file of interest. For example, we allow 1000 paths of length
1161  * 1, to emanate from each file of interest. This essentially represents the
1162  * potential wakeup paths, which need to be limited in order to avoid massive
1163  * uncontrolled wakeup storms. The common use case should be a single ep which
1164  * is connected to n file sources. In this case each file source has 1 path
1165  * of length 1. Thus, the numbers below should be more than sufficient. These
1166  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
1167  * and delete can't add additional paths. Protected by the epmutex.
1168  */
1169 static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1170 static int path_count[PATH_ARR_SIZE];
1171 
1172 static int path_count_inc(int nests)
1173 {
1174     /* Allow an arbitrary number of depth 1 paths */
1175     if (nests == 0)
1176         return 0;
1177 
1178     if (++path_count[nests] > path_limits[nests])
1179         return -1;
1180     return 0;
1181 }
1182 
1183 static void path_count_init(void)
1184 {
1185     int i;
1186 
1187     for (i = 0; i < PATH_ARR_SIZE; i++)
1188         path_count[i] = 0;
1189 }
1190 
1191 static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1192 {
1193     int error = 0;
1194     struct file *file = priv;
1195     struct file *child_file;
1196     struct epitem *epi;
1197 
1198     /* CTL_DEL can remove links here, but that can't increase our count */
1199     rcu_read_lock();
1200     list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
1201         child_file = epi->ep->file;
1202         if (is_file_epoll(child_file)) {
1203             if (list_empty(&child_file->f_ep_links)) {
1204                 if (path_count_inc(call_nests)) {
1205                     error = -1;
1206                     break;
1207                 }
1208             } else {
1209                 error = ep_call_nested(&poll_loop_ncalls,
1210                             EP_MAX_NESTS,
1211                             reverse_path_check_proc,
1212                             child_file, child_file,
1213                             current);
1214             }
1215             if (error != 0)
1216                 break;
1217         } else {
1218             printk(KERN_ERR "reverse_path_check_proc: "
1219                 "file is not an ep!\n");
1220         }
1221     }
1222     rcu_read_unlock();
1223     return error;
1224 }
1225 
1226 /**
1227  * reverse_path_check - The tfile_check_list is list of file *, which have
1228  *                      links that are proposed to be newly added. We need to
1229  *                      make sure that those added links don't add too many
1230  *                      paths such that we will spend all our time waking up
1231  *                      eventpoll objects.
1232  *
1233  * Returns: Returns zero if the proposed links don't create too many paths,
1234  *      -1 otherwise.
1235  */
1236 static int reverse_path_check(void)
1237 {
1238     int error = 0;
1239     struct file *current_file;
1240 
1241     /* let's call this for all tfiles */
1242     list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1243         path_count_init();
1244         error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1245                     reverse_path_check_proc, current_file,
1246                     current_file, current);
1247         if (error)
1248             break;
1249     }
1250     return error;
1251 }
1252 
1253 static int ep_create_wakeup_source(struct epitem *epi)
1254 {
1255     const char *name;
1256     struct wakeup_source *ws;
1257 
1258     if (!epi->ep->ws) {
1259         epi->ep->ws = wakeup_source_register("eventpoll");
1260         if (!epi->ep->ws)
1261             return -ENOMEM;
1262     }
1263 
1264     name = epi->ffd.file->f_path.dentry->d_name.name;
1265     ws = wakeup_source_register(name);
1266 
1267     if (!ws)
1268         return -ENOMEM;
1269     rcu_assign_pointer(epi->ws, ws);
1270 
1271     return 0;
1272 }
1273 
1274 /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
1275 static noinline void ep_destroy_wakeup_source(struct epitem *epi)
1276 {
1277     struct wakeup_source *ws = ep_wakeup_source(epi);
1278 
1279     RCU_INIT_POINTER(epi->ws, NULL);
1280 
1281     /*
1282      * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
1283      * used internally by wakeup_source_remove, too (called by
1284      * wakeup_source_unregister), so we cannot use call_rcu
1285      */
1286     synchronize_rcu();
1287     wakeup_source_unregister(ws);
1288 }
1289 
1290 /*
1291  * Must be called with "mtx" held.
1292  */
1293 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1294              struct file *tfile, int fd, int full_check)
1295 {
1296     int error, revents, pwake = 0;
1297     unsigned long flags;
1298     long user_watches;
1299     struct epitem *epi;
1300     struct ep_pqueue epq;
1301 
1302     user_watches = atomic_long_read(&ep->user->epoll_watches);
1303     if (unlikely(user_watches >= max_user_watches))
1304         return -ENOSPC;
1305     if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1306         return -ENOMEM;
1307 
1308     /* Item initialization follow here ... */
1309     INIT_LIST_HEAD(&epi->rdllink);
1310     INIT_LIST_HEAD(&epi->fllink);
1311     INIT_LIST_HEAD(&epi->pwqlist);
1312     epi->ep = ep;
1313     ep_set_ffd(&epi->ffd, tfile, fd);
1314     epi->event = *event;
1315     epi->nwait = 0;
1316     epi->next = EP_UNACTIVE_PTR;
1317     if (epi->event.events & EPOLLWAKEUP) {
1318         error = ep_create_wakeup_source(epi);
1319         if (error)
1320             goto error_create_wakeup_source;
1321     } else {
1322         RCU_INIT_POINTER(epi->ws, NULL);
1323     }
1324 
1325     /* Initialize the poll table using the queue callback */
1326     epq.epi = epi;
1327     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1328 
1329     /*
1330      * Attach the item to the poll hooks and get current event bits.
1331      * We can safely use the file* here because its usage count has
1332      * been increased by the caller of this function. Note that after
1333      * this operation completes, the poll callback can start hitting
1334      * the new item.
1335      */
1336     revents = ep_item_poll(epi, &epq.pt);
1337 
1338     /*
1339      * We have to check if something went wrong during the poll wait queue
1340      * install process. Namely an allocation for a wait queue failed due
1341      * high memory pressure.
1342      */
1343     error = -ENOMEM;
1344     if (epi->nwait < 0)
1345         goto error_unregister;
1346 
1347     /* Add the current item to the list of active epoll hook for this file */
1348     spin_lock(&tfile->f_lock);
1349     list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
1350     spin_unlock(&tfile->f_lock);
1351 
1352     /*
1353      * Add the current item to the RB tree. All RB tree operations are
1354      * protected by "mtx", and ep_insert() is called with "mtx" held.
1355      */
1356     ep_rbtree_insert(ep, epi);
1357 
1358     /* now check if we've created too many backpaths */
1359     error = -EINVAL;
1360     if (full_check && reverse_path_check())
1361         goto error_remove_epi;
1362 
1363     /* We have to drop the new item inside our item list to keep track of it */
1364     spin_lock_irqsave(&ep->lock, flags);
1365 
1366     /* If the file is already "ready" we drop it inside the ready list */
1367     if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1368         list_add_tail(&epi->rdllink, &ep->rdllist);
1369         ep_pm_stay_awake(epi);
1370 
1371         /* Notify waiting tasks that events are available */
1372         if (waitqueue_active(&ep->wq))
1373             wake_up_locked(&ep->wq);
1374         if (waitqueue_active(&ep->poll_wait))
1375             pwake++;
1376     }
1377 
1378     spin_unlock_irqrestore(&ep->lock, flags);
1379 
1380     atomic_long_inc(&ep->user->epoll_watches);
1381 
1382     /* We have to call this outside the lock */
1383     if (pwake)
1384         ep_poll_safewake(&ep->poll_wait);
1385 
1386     return 0;
1387 
1388 error_remove_epi:
1389     spin_lock(&tfile->f_lock);
1390     list_del_rcu(&epi->fllink);
1391     spin_unlock(&tfile->f_lock);
1392 
1393     rb_erase(&epi->rbn, &ep->rbr);
1394 
1395 error_unregister:
1396     ep_unregister_pollwait(ep, epi);
1397 
1398     /*
1399      * We need to do this because an event could have been arrived on some
1400      * allocated wait queue. Note that we don't care about the ep->ovflist
1401      * list, since that is used/cleaned only inside a section bound by "mtx".
1402      * And ep_insert() is called with "mtx" held.
1403      */
1404     spin_lock_irqsave(&ep->lock, flags);
1405     if (ep_is_linked(&epi->rdllink))
1406         list_del_init(&epi->rdllink);
1407     spin_unlock_irqrestore(&ep->lock, flags);
1408 
1409     wakeup_source_unregister(ep_wakeup_source(epi));
1410 
1411 error_create_wakeup_source:
1412     kmem_cache_free(epi_cache, epi);
1413 
1414     return error;
1415 }
1416 
1417 /*
1418  * Modify the interest event mask by dropping an event if the new mask
1419  * has a match in the current file status. Must be called with "mtx" held.
1420  */
1421 static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
1422 {
1423     int pwake = 0;
1424     unsigned int revents;
1425     poll_table pt;
1426 
1427     init_poll_funcptr(&pt, NULL);
1428 
1429     /*
1430      * Set the new event interest mask before calling f_op->poll();
1431      * otherwise we might miss an event that happens between the
1432      * f_op->poll() call and the new event set registering.
1433      */
1434     epi->event.events = event->events; /* need barrier below */
1435     epi->event.data = event->data; /* protected by mtx */
1436     if (epi->event.events & EPOLLWAKEUP) {
1437         if (!ep_has_wakeup_source(epi))
1438             ep_create_wakeup_source(epi);
1439     } else if (ep_has_wakeup_source(epi)) {
1440         ep_destroy_wakeup_source(epi);
1441     }
1442 
1443     /*
1444      * The following barrier has two effects:
1445      *
1446      * 1) Flush epi changes above to other CPUs.  This ensures
1447      *    we do not miss events from ep_poll_callback if an
1448      *    event occurs immediately after we call f_op->poll().
1449      *    We need this because we did not take ep->lock while
1450      *    changing epi above (but ep_poll_callback does take
1451      *    ep->lock).
1452      *
1453      * 2) We also need to ensure we do not miss _past_ events
1454      *    when calling f_op->poll().  This barrier also
1455      *    pairs with the barrier in wq_has_sleeper (see
1456      *    comments for wq_has_sleeper).
1457      *
1458      * This barrier will now guarantee ep_poll_callback or f_op->poll
1459      * (or both) will notice the readiness of an item.
1460      */
1461     smp_mb();
1462 
1463     /*
1464      * Get current event bits. We can safely use the file* here because
1465      * its usage count has been increased by the caller of this function.
1466      */
1467     revents = ep_item_poll(epi, &pt);
1468 
1469     /*
1470      * If the item is "hot" and it is not registered inside the ready
1471      * list, push it inside.
1472      */
1473     if (revents & event->events) {
1474         spin_lock_irq(&ep->lock);
1475         if (!ep_is_linked(&epi->rdllink)) {
1476             list_add_tail(&epi->rdllink, &ep->rdllist);
1477             ep_pm_stay_awake(epi);
1478 
1479             /* Notify waiting tasks that events are available */
1480             if (waitqueue_active(&ep->wq))
1481                 wake_up_locked(&ep->wq);
1482             if (waitqueue_active(&ep->poll_wait))
1483                 pwake++;
1484         }
1485         spin_unlock_irq(&ep->lock);
1486     }
1487 
1488     /* We have to call this outside the lock */
1489     if (pwake)
1490         ep_poll_safewake(&ep->poll_wait);
1491 
1492     return 0;
1493 }
1494 
1495 static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1496                    void *priv)
1497 {
1498     struct ep_send_events_data *esed = priv;
1499     int eventcnt;
1500     unsigned int revents;
1501     struct epitem *epi;
1502     struct epoll_event __user *uevent;
1503     struct wakeup_source *ws;
1504     poll_table pt;
1505 
1506     init_poll_funcptr(&pt, NULL);
1507 
1508     /*
1509      * We can loop without lock because we are passed a task private list.
1510      * Items cannot vanish during the loop because ep_scan_ready_list() is
1511      * holding "mtx" during this call.
1512      */
1513     for (eventcnt = 0, uevent = esed->events;
1514          !list_empty(head) && eventcnt < esed->maxevents;) {
1515         epi = list_first_entry(head, struct epitem, rdllink);
1516 
1517         /*
1518          * Activate ep->ws before deactivating epi->ws to prevent
1519          * triggering auto-suspend here (in case we reactive epi->ws
1520          * below).
1521          *
1522          * This could be rearranged to delay the deactivation of epi->ws
1523          * instead, but then epi->ws would temporarily be out of sync
1524          * with ep_is_linked().
1525          */
1526         ws = ep_wakeup_source(epi);
1527         if (ws) {
1528             if (ws->active)
1529                 __pm_stay_awake(ep->ws);
1530             __pm_relax(ws);
1531         }
1532 
1533         list_del_init(&epi->rdllink);
1534 
1535         revents = ep_item_poll(epi, &pt);
1536 
1537         /*
1538          * If the event mask intersect the caller-requested one,
1539          * deliver the event to userspace. Again, ep_scan_ready_list()
1540          * is holding "mtx", so no operations coming from userspace
1541          * can change the item.
1542          */
1543         if (revents) {
1544             if (__put_user(revents, &uevent->events) ||
1545                 __put_user(epi->event.data, &uevent->data)) {
1546                 list_add(&epi->rdllink, head);
1547                 ep_pm_stay_awake(epi);
1548                 return eventcnt ? eventcnt : -EFAULT;
1549             }
1550             eventcnt++;
1551             uevent++;
1552             if (epi->event.events & EPOLLONESHOT)
1553                 epi->event.events &= EP_PRIVATE_BITS;
1554             else if (!(epi->event.events & EPOLLET)) {
1555                 /*
1556                  * If this file has been added with Level
1557                  * Trigger mode, we need to insert back inside
1558                  * the ready list, so that the next call to
1559                  * epoll_wait() will check again the events
1560                  * availability. At this point, no one can insert
1561                  * into ep->rdllist besides us. The epoll_ctl()
1562                  * callers are locked out by
1563                  * ep_scan_ready_list() holding "mtx" and the
1564                  * poll callback will queue them in ep->ovflist.
1565                  */
1566                 list_add_tail(&epi->rdllink, &ep->rdllist);
1567                 ep_pm_stay_awake(epi);
1568             }
1569         }
1570     }
1571 
1572     return eventcnt;
1573 }
1574 
1575 static int ep_send_events(struct eventpoll *ep,
1576               struct epoll_event __user *events, int maxevents)
1577 {
1578     struct ep_send_events_data esed;
1579 
1580     esed.maxevents = maxevents;
1581     esed.events = events;
1582 
1583     return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1584 }
1585 
1586 static inline struct timespec64 ep_set_mstimeout(long ms)
1587 {
1588     struct timespec64 now, ts = {
1589         .tv_sec = ms / MSEC_PER_SEC,
1590         .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1591     };
1592 
1593     ktime_get_ts64(&now);
1594     return timespec64_add_safe(now, ts);
1595 }
1596 
1597 /**
1598  * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1599  *           event buffer.
1600  *
1601  * @ep: Pointer to the eventpoll context.
1602  * @events: Pointer to the userspace buffer where the ready events should be
1603  *          stored.
1604  * @maxevents: Size (in terms of number of events) of the caller event buffer.
1605  * @timeout: Maximum timeout for the ready events fetch operation, in
1606  *           milliseconds. If the @timeout is zero, the function will not block,
1607  *           while if the @timeout is less than zero, the function will block
1608  *           until at least one event has been retrieved (or an error
1609  *           occurred).
1610  *
1611  * Returns: Returns the number of ready events which have been fetched, or an
1612  *          error code, in case of error.
1613  */
1614 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1615            int maxevents, long timeout)
1616 {
1617     int res = 0, eavail, timed_out = 0;
1618     unsigned long flags;
1619     u64 slack = 0;
1620     wait_queue_t wait;
1621     ktime_t expires, *to = NULL;
1622 
1623     if (timeout > 0) {
1624         struct timespec64 end_time = ep_set_mstimeout(timeout);
1625 
1626         slack = select_estimate_accuracy(&end_time);
1627         to = &expires;
1628         *to = timespec64_to_ktime(end_time);
1629     } else if (timeout == 0) {
1630         /*
1631          * Avoid the unnecessary trip to the wait queue loop, if the
1632          * caller specified a non blocking operation.
1633          */
1634         timed_out = 1;
1635         spin_lock_irqsave(&ep->lock, flags);
1636         goto check_events;
1637     }
1638 
1639 fetch_events:
1640     spin_lock_irqsave(&ep->lock, flags);
1641 
1642     if (!ep_events_available(ep)) {
1643         /*
1644          * We don't have any available event to return to the caller.
1645          * We need to sleep here, and we will be wake up by
1646          * ep_poll_callback() when events will become available.
1647          */
1648         init_waitqueue_entry(&wait, current);
1649         __add_wait_queue_exclusive(&ep->wq, &wait);
1650 
1651         for (;;) {
1652             /*
1653              * We don't want to sleep if the ep_poll_callback() sends us
1654              * a wakeup in between. That's why we set the task state
1655              * to TASK_INTERRUPTIBLE before doing the checks.
1656              */
1657             set_current_state(TASK_INTERRUPTIBLE);
1658             if (ep_events_available(ep) || timed_out)
1659                 break;
1660             if (signal_pending(current)) {
1661                 res = -EINTR;
1662                 break;
1663             }
1664 
1665             spin_unlock_irqrestore(&ep->lock, flags);
1666             if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1667                 timed_out = 1;
1668 
1669             spin_lock_irqsave(&ep->lock, flags);
1670         }
1671 
1672         __remove_wait_queue(&ep->wq, &wait);
1673         __set_current_state(TASK_RUNNING);
1674     }
1675 check_events:
1676     /* Is it worth to try to dig for events ? */
1677     eavail = ep_events_available(ep);
1678 
1679     spin_unlock_irqrestore(&ep->lock, flags);
1680 
1681     /*
1682      * Try to transfer events to user space. In case we get 0 events and
1683      * there's still timeout left over, we go trying again in search of
1684      * more luck.
1685      */
1686     if (!res && eavail &&
1687         !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1688         goto fetch_events;
1689 
1690     return res;
1691 }
1692 
1693 /**
1694  * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1695  *                      API, to verify that adding an epoll file inside another
1696  *                      epoll structure, does not violate the constraints, in
1697  *                      terms of closed loops, or too deep chains (which can
1698  *                      result in excessive stack usage).
1699  *
1700  * @priv: Pointer to the epoll file to be currently checked.
1701  * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1702  *          data structure pointer.
1703  * @call_nests: Current dept of the @ep_call_nested() call stack.
1704  *
1705  * Returns: Returns zero if adding the epoll @file inside current epoll
1706  *          structure @ep does not violate the constraints, or -1 otherwise.
1707  */
1708 static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1709 {
1710     int error = 0;
1711     struct file *file = priv;
1712     struct eventpoll *ep = file->private_data;
1713     struct eventpoll *ep_tovisit;
1714     struct rb_node *rbp;
1715     struct epitem *epi;
1716 
1717     mutex_lock_nested(&ep->mtx, call_nests + 1);
1718     ep->visited = 1;
1719     list_add(&ep->visited_list_link, &visited_list);
1720     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1721         epi = rb_entry(rbp, struct epitem, rbn);
1722         if (unlikely(is_file_epoll(epi->ffd.file))) {
1723             ep_tovisit = epi->ffd.file->private_data;
1724             if (ep_tovisit->visited)
1725                 continue;
1726             error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1727                     ep_loop_check_proc, epi->ffd.file,
1728                     ep_tovisit, current);
1729             if (error != 0)
1730                 break;
1731         } else {
1732             /*
1733              * If we've reached a file that is not associated with
1734              * an ep, then we need to check if the newly added
1735              * links are going to add too many wakeup paths. We do
1736              * this by adding it to the tfile_check_list, if it's
1737              * not already there, and calling reverse_path_check()
1738              * during ep_insert().
1739              */
1740             if (list_empty(&epi->ffd.file->f_tfile_llink))
1741                 list_add(&epi->ffd.file->f_tfile_llink,
1742                      &tfile_check_list);
1743         }
1744     }
1745     mutex_unlock(&ep->mtx);
1746 
1747     return error;
1748 }
1749 
1750 /**
1751  * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1752  *                 another epoll file (represented by @ep) does not create
1753  *                 closed loops or too deep chains.
1754  *
1755  * @ep: Pointer to the epoll private data structure.
1756  * @file: Pointer to the epoll file to be checked.
1757  *
1758  * Returns: Returns zero if adding the epoll @file inside current epoll
1759  *          structure @ep does not violate the constraints, or -1 otherwise.
1760  */
1761 static int ep_loop_check(struct eventpoll *ep, struct file *file)
1762 {
1763     int ret;
1764     struct eventpoll *ep_cur, *ep_next;
1765 
1766     ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1767                   ep_loop_check_proc, file, ep, current);
1768     /* clear visited list */
1769     list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1770                             visited_list_link) {
1771         ep_cur->visited = 0;
1772         list_del(&ep_cur->visited_list_link);
1773     }
1774     return ret;
1775 }
1776 
1777 static void clear_tfile_check_list(void)
1778 {
1779     struct file *file;
1780 
1781     /* first clear the tfile_check_list */
1782     while (!list_empty(&tfile_check_list)) {
1783         file = list_first_entry(&tfile_check_list, struct file,
1784                     f_tfile_llink);
1785         list_del_init(&file->f_tfile_llink);
1786     }
1787     INIT_LIST_HEAD(&tfile_check_list);
1788 }
1789 
1790 /*
1791  * Open an eventpoll file descriptor.
1792  */
1793 SYSCALL_DEFINE1(epoll_create1, int, flags)
1794 {
1795     int error, fd;
1796     struct eventpoll *ep = NULL;
1797     struct file *file;
1798 
1799     /* Check the EPOLL_* constant for consistency.  */
1800     BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1801 
1802     if (flags & ~EPOLL_CLOEXEC)
1803         return -EINVAL;
1804     /*
1805      * Create the internal data structure ("struct eventpoll").
1806      */
1807     error = ep_alloc(&ep);
1808     if (error < 0)
1809         return error;
1810     /*
1811      * Creates all the items needed to setup an eventpoll file. That is,
1812      * a file structure and a free file descriptor.
1813      */
1814     fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1815     if (fd < 0) {
1816         error = fd;
1817         goto out_free_ep;
1818     }
1819     file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1820                  O_RDWR | (flags & O_CLOEXEC));
1821     if (IS_ERR(file)) {
1822         error = PTR_ERR(file);
1823         goto out_free_fd;
1824     }
1825     ep->file = file;
1826     fd_install(fd, file);
1827     return fd;
1828 
1829 out_free_fd:
1830     put_unused_fd(fd);
1831 out_free_ep:
1832     ep_free(ep);
1833     return error;
1834 }
1835 
1836 SYSCALL_DEFINE1(epoll_create, int, size)
1837 {
1838     if (size <= 0)
1839         return -EINVAL;
1840 
1841     return sys_epoll_create1(0);
1842 }
1843 
1844 /*
1845  * The following function implements the controller interface for
1846  * the eventpoll file that enables the insertion/removal/change of
1847  * file descriptors inside the interest set.
1848  */
1849 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1850         struct epoll_event __user *, event)
1851 {
1852     int error;
1853     int full_check = 0;
1854     struct fd f, tf;
1855     struct eventpoll *ep;
1856     struct epitem *epi;
1857     struct epoll_event epds;
1858     struct eventpoll *tep = NULL;
1859 
1860     error = -EFAULT;
1861     if (ep_op_has_event(op) &&
1862         copy_from_user(&epds, event, sizeof(struct epoll_event)))
1863         goto error_return;
1864 
1865     error = -EBADF;
1866     f = fdget(epfd);
1867     if (!f.file)
1868         goto error_return;
1869 
1870     /* Get the "struct file *" for the target file */
1871     tf = fdget(fd);
1872     if (!tf.file)
1873         goto error_fput;
1874 
1875     /* The target file descriptor must support poll */
1876     error = -EPERM;
1877     if (!tf.file->f_op->poll)
1878         goto error_tgt_fput;
1879 
1880     /* Check if EPOLLWAKEUP is allowed */
1881     if (ep_op_has_event(op))
1882         ep_take_care_of_epollwakeup(&epds);
1883 
1884     /*
1885      * We have to check that the file structure underneath the file descriptor
1886      * the user passed to us _is_ an eventpoll file. And also we do not permit
1887      * adding an epoll file descriptor inside itself.
1888      */
1889     error = -EINVAL;
1890     if (f.file == tf.file || !is_file_epoll(f.file))
1891         goto error_tgt_fput;
1892 
1893     /*
1894      * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
1895      * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
1896      * Also, we do not currently supported nested exclusive wakeups.
1897      */
1898     if (epds.events & EPOLLEXCLUSIVE) {
1899         if (op == EPOLL_CTL_MOD)
1900             goto error_tgt_fput;
1901         if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
1902                 (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
1903             goto error_tgt_fput;
1904     }
1905 
1906     /*
1907      * At this point it is safe to assume that the "private_data" contains
1908      * our own data structure.
1909      */
1910     ep = f.file->private_data;
1911 
1912     /*
1913      * When we insert an epoll file descriptor, inside another epoll file
1914      * descriptor, there is the change of creating closed loops, which are
1915      * better be handled here, than in more critical paths. While we are
1916      * checking for loops we also determine the list of files reachable
1917      * and hang them on the tfile_check_list, so we can check that we
1918      * haven't created too many possible wakeup paths.
1919      *
1920      * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
1921      * the epoll file descriptor is attaching directly to a wakeup source,
1922      * unless the epoll file descriptor is nested. The purpose of taking the
1923      * 'epmutex' on add is to prevent complex toplogies such as loops and
1924      * deep wakeup paths from forming in parallel through multiple
1925      * EPOLL_CTL_ADD operations.
1926      */
1927     mutex_lock_nested(&ep->mtx, 0);
1928     if (op == EPOLL_CTL_ADD) {
1929         if (!list_empty(&f.file->f_ep_links) ||
1930                         is_file_epoll(tf.file)) {
1931             full_check = 1;
1932             mutex_unlock(&ep->mtx);
1933             mutex_lock(&epmutex);
1934             if (is_file_epoll(tf.file)) {
1935                 error = -ELOOP;
1936                 if (ep_loop_check(ep, tf.file) != 0) {
1937                     clear_tfile_check_list();
1938                     goto error_tgt_fput;
1939                 }
1940             } else
1941                 list_add(&tf.file->f_tfile_llink,
1942                             &tfile_check_list);
1943             mutex_lock_nested(&ep->mtx, 0);
1944             if (is_file_epoll(tf.file)) {
1945                 tep = tf.file->private_data;
1946                 mutex_lock_nested(&tep->mtx, 1);
1947             }
1948         }
1949     }
1950 
1951     /*
1952      * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1953      * above, we can be sure to be able to use the item looked up by
1954      * ep_find() till we release the mutex.
1955      */
1956     epi = ep_find(ep, tf.file, fd);
1957 
1958     error = -EINVAL;
1959     switch (op) {
1960     case EPOLL_CTL_ADD:
1961         if (!epi) {
1962             epds.events |= POLLERR | POLLHUP;
1963             error = ep_insert(ep, &epds, tf.file, fd, full_check);
1964         } else
1965             error = -EEXIST;
1966         if (full_check)
1967             clear_tfile_check_list();
1968         break;
1969     case EPOLL_CTL_DEL:
1970         if (epi)
1971             error = ep_remove(ep, epi);
1972         else
1973             error = -ENOENT;
1974         break;
1975     case EPOLL_CTL_MOD:
1976         if (epi) {
1977             if (!(epi->event.events & EPOLLEXCLUSIVE)) {
1978                 epds.events |= POLLERR | POLLHUP;
1979                 error = ep_modify(ep, epi, &epds);
1980             }
1981         } else
1982             error = -ENOENT;
1983         break;
1984     }
1985     if (tep != NULL)
1986         mutex_unlock(&tep->mtx);
1987     mutex_unlock(&ep->mtx);
1988 
1989 error_tgt_fput:
1990     if (full_check)
1991         mutex_unlock(&epmutex);
1992 
1993     fdput(tf);
1994 error_fput:
1995     fdput(f);
1996 error_return:
1997 
1998     return error;
1999 }
2000 
2001 /*
2002  * Implement the event wait interface for the eventpoll file. It is the kernel
2003  * part of the user space epoll_wait(2).
2004  */
2005 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
2006         int, maxevents, int, timeout)
2007 {
2008     int error;
2009     struct fd f;
2010     struct eventpoll *ep;
2011 
2012     /* The maximum number of event must be greater than zero */
2013     if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2014         return -EINVAL;
2015 
2016     /* Verify that the area passed by the user is writeable */
2017     if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
2018         return -EFAULT;
2019 
2020     /* Get the "struct file *" for the eventpoll file */
2021     f = fdget(epfd);
2022     if (!f.file)
2023         return -EBADF;
2024 
2025     /*
2026      * We have to check that the file structure underneath the fd
2027      * the user passed to us _is_ an eventpoll file.
2028      */
2029     error = -EINVAL;
2030     if (!is_file_epoll(f.file))
2031         goto error_fput;
2032 
2033     /*
2034      * At this point it is safe to assume that the "private_data" contains
2035      * our own data structure.
2036      */
2037     ep = f.file->private_data;
2038 
2039     /* Time to fish for events ... */
2040     error = ep_poll(ep, events, maxevents, timeout);
2041 
2042 error_fput:
2043     fdput(f);
2044     return error;
2045 }
2046 
2047 /*
2048  * Implement the event wait interface for the eventpoll file. It is the kernel
2049  * part of the user space epoll_pwait(2).
2050  */
2051 SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2052         int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2053         size_t, sigsetsize)
2054 {
2055     int error;
2056     sigset_t ksigmask, sigsaved;
2057 
2058     /*
2059      * If the caller wants a certain signal mask to be set during the wait,
2060      * we apply it here.
2061      */
2062     if (sigmask) {
2063         if (sigsetsize != sizeof(sigset_t))
2064             return -EINVAL;
2065         if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
2066             return -EFAULT;
2067         sigsaved = current->blocked;
2068         set_current_blocked(&ksigmask);
2069     }
2070 
2071     error = sys_epoll_wait(epfd, events, maxevents, timeout);
2072 
2073     /*
2074      * If we changed the signal mask, we need to restore the original one.
2075      * In case we've got a signal while waiting, we do not restore the
2076      * signal mask yet, and we allow do_signal() to deliver the signal on
2077      * the way back to userspace, before the signal mask is restored.
2078      */
2079     if (sigmask) {
2080         if (error == -EINTR) {
2081             memcpy(&current->saved_sigmask, &sigsaved,
2082                    sizeof(sigsaved));
2083             set_restore_sigmask();
2084         } else
2085             set_current_blocked(&sigsaved);
2086     }
2087 
2088     return error;
2089 }
2090 
2091 #ifdef CONFIG_COMPAT
2092 COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2093             struct epoll_event __user *, events,
2094             int, maxevents, int, timeout,
2095             const compat_sigset_t __user *, sigmask,
2096             compat_size_t, sigsetsize)
2097 {
2098     long err;
2099     compat_sigset_t csigmask;
2100     sigset_t ksigmask, sigsaved;
2101 
2102     /*
2103      * If the caller wants a certain signal mask to be set during the wait,
2104      * we apply it here.
2105      */
2106     if (sigmask) {
2107         if (sigsetsize != sizeof(compat_sigset_t))
2108             return -EINVAL;
2109         if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
2110             return -EFAULT;
2111         sigset_from_compat(&ksigmask, &csigmask);
2112         sigsaved = current->blocked;
2113         set_current_blocked(&ksigmask);
2114     }
2115 
2116     err = sys_epoll_wait(epfd, events, maxevents, timeout);
2117 
2118     /*
2119      * If we changed the signal mask, we need to restore the original one.
2120      * In case we've got a signal while waiting, we do not restore the
2121      * signal mask yet, and we allow do_signal() to deliver the signal on
2122      * the way back to userspace, before the signal mask is restored.
2123      */
2124     if (sigmask) {
2125         if (err == -EINTR) {
2126             memcpy(&current->saved_sigmask, &sigsaved,
2127                    sizeof(sigsaved));
2128             set_restore_sigmask();
2129         } else
2130             set_current_blocked(&sigsaved);
2131     }
2132 
2133     return err;
2134 }
2135 #endif
2136 
2137 static int __init eventpoll_init(void)
2138 {
2139     struct sysinfo si;
2140 
2141     si_meminfo(&si);
2142     /*
2143      * Allows top 4% of lomem to be allocated for epoll watches (per user).
2144      */
2145     max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2146         EP_ITEM_COST;
2147     BUG_ON(max_user_watches < 0);
2148 
2149     /*
2150      * Initialize the structure used to perform epoll file descriptor
2151      * inclusion loops checks.
2152      */
2153     ep_nested_calls_init(&poll_loop_ncalls);
2154 
2155     /* Initialize the structure used to perform safe poll wait head wake ups */
2156     ep_nested_calls_init(&poll_safewake_ncalls);
2157 
2158     /* Initialize the structure used to perform file's f_op->poll() calls */
2159     ep_nested_calls_init(&poll_readywalk_ncalls);
2160 
2161     /*
2162      * We can have many thousands of epitems, so prevent this from
2163      * using an extra cache line on 64-bit (and smaller) CPUs
2164      */
2165     BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
2166 
2167     /* Allocates slab cache used to allocate "struct epitem" items */
2168     epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2169             0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
2170 
2171     /* Allocates slab cache used to allocate "struct eppoll_entry" */
2172     pwq_cache = kmem_cache_create("eventpoll_pwq",
2173             sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
2174 
2175     return 0;
2176 }
2177 fs_initcall(eventpoll_init);