/*
 * fs/eventpoll.c - efficient event notification (epoll) implementation
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>
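
/*
 * Locking overview, as used by the code below:
 *
 * 1) epmutex  - global mutex, taken by the rarer operations that walk or
 *               tear down epoll <-> file relationships (ep_free(),
 *               eventpoll_release_file(), loop/path checking on
 *               EPOLL_CTL_ADD).
 * 2) ep->mtx  - per-eventpoll mutex serializing epoll_ctl() against event
 *               delivery (ep_send_events()) and the fdinfo dump.
 * 3) ep->lock - per-eventpoll rwlock protecting ->rdllist and ->ovflist;
 *               it is the only lock taken from the wake-up callback
 *               (ep_poll_callback()), possibly from interrupt context.
 */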
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct eppoll_entry *next;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* List containing poll wait queues */
	struct eppoll_entry *pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct hlist_node fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. It is held across the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem"
	 * that happened while transferring ready events to userspace
	 * without holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_send_events is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize the loop detection check */
	u64 gen;
	struct hlist_head refs;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Maximum number of epoll watches, settable via the fs.epoll.max_user_watches sysctl */
static long max_user_watches __read_mostly;

/* Used to serialize ep_free() and eventpoll_release_file() */
static DEFINE_MUTEX(epmutex);

static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
struct epitems_head {
	struct hlist_head epitems;
	struct epitems_head *next;
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

static struct kmem_cache *ephead_cache __read_mostly;

static inline void free_ephead(struct epitems_head *head)
{
	if (head)
		kmem_cache_free(ephead_cache, head);
}

static void list_file(struct file *file)
{
	struct epitems_head *head;

	head = container_of(file->f_ep, struct epitems_head, epitems);
	if (!head->next) {
		head->next = tfile_check_list;
		tfile_check_list = head;
	}
}

static void unlist_file(struct epitems_head *head)
{
	struct epitems_head *to_free = head;
	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));

	if (p) {
		struct epitem *epi = container_of(p, struct epitem, fllink);

		spin_lock(&epi->ffd.file->f_lock);
		if (!hlist_empty(&head->epitems))
			to_free = NULL;
		head->next = NULL;
		spin_unlock(&epi->ffd.file->f_lock);
	}
	free_ephead(to_free);
}

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

static struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};

static void __init epoll_sysctls_init(void)
{
	register_sysctl("fs/epoll", epoll_table);
}
#else
#define epoll_sysctls_init() do { } while (0)
#endif

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
{
	return !list_empty(&epi->rdllink);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/*
 * ep_events_available - Checks if ready events might be available.
 *
 * Return: a value different than %zero if ready events are available,
 *         or %zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty_careful(&ep->rdllist) ||
		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
	struct eventpoll *ep = p;

	return ep_events_available(ep) || busy_loop_timeout(start_time);
}

/*
 * Busy poll if globally on and the supporting socket was found and there
 * are no events; the busy loop returns when the loop-end callback above
 * reports available events or a busy-loop timeout.
 */
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
			       BUSY_POLL_BUDGET);
		if (ep_events_available(ep))
			return true;
		/*
		 * Busy poll timed out. Drop the NAPI ID for now; it can be
		 * set again when a socket with a valid NAPI ID is moved
		 * onto the ready list.
		 */
		ep->napi_id = 0;
		return false;
	}
	return false;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;

	if (!net_busy_loop_on())
		return;

	sock = sock_from_file(epi->ffd.file);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);
	ep = epi->ep;

	/* Non-NAPI IDs are rejected, and an already recorded ID is a no-op */
	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in next busy poll */
	ep->napi_id = napi_id;
}

#else

static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	return false;
}

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */
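
/*
 * When an epoll file descriptor is nested inside another one, waking the
 * outer set from the callback path acquires poll_wait.lock while a
 * poll_wait.lock may already be held further up the chain.  With lockdep
 * enabled, ep_poll_safewake() therefore passes an explicit nesting level
 * (the chain depth is bounded by the loop checks done at EPOLL_CTL_ADD
 * time) so that the nested acquisition is annotated rather than reported
 * as a deadlock.  Without lockdep a plain wake_up_poll() is sufficient.
 */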

#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
	struct eventpoll *ep_src;
	unsigned long flags;
	u8 nests = 0;

	if (epi) {
		if ((is_file_epoll(epi->ffd.file))) {
			ep_src = epi->ffd.file->private_data;
			nests = ep_src->nests;
		} else {
			nests = 1;
		}
	}
	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
	ep->nests = nests + 1;
	wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
	ep->nests = 0;
	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
	wake_up_poll(&ep->poll_wait, EPOLLIN);
}

#endif

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * ->whead may be cleared concurrently on POLLFREE (see
	 * ep_poll_callback()); the acquire pairs with the
	 * smp_store_release() done there.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor. Must be called with "mtx" held.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct eppoll_entry **p = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while ((pwq = *p) != NULL) {
		*p = pwq->next;
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}
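
/*
 * ep_start_scan() and ep_done_scan() bracket a ready-list scan: the ready
 * list is spliced onto a private "txlist" and ->ovflist is switched from
 * EP_UNACTIVE_PTR to NULL, so that events reported by ep_poll_callback()
 * while the scan is in progress are collected on ->ovflist and re-injected
 * into ->rdllist by ep_done_scan() instead of being lost.
 */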
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping without locks are not lost.
	 */
	lockdep_assert_irqs_enabled();
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);
}

static void ep_done_scan(struct eventpoll *ep,
			 struct list_head *txlist)
{
	struct epitem *epi, *nepi;

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the scan, some other events
	 * might have been queued by the poll callback. We re-insert
	 * them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * The item might already be on "txlist"; if so, the
		 * list_splice() below takes care of it.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in
			 * order to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * Set ep->ovflist back to EP_UNACTIVE_PTR, so that after releasing
	 * the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
	}

	write_unlock_irq(&ep->lock);
}

static void epi_rcu_free(struct rcu_head *head)
{
	struct epitem *epi = container_of(head, struct epitem, rcu);

	kmem_cache_free(epi_cache, epi);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	struct file *file = epi->ffd.file;
	struct epitems_head *to_free;
	struct hlist_head *head;

	lockdep_assert_irqs_enabled();

	/*
	 * Removes poll wait queue hooks.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	to_free = NULL;
	head = file->f_ep;
	if (head->first == &epi->fllink && !epi->fllink.next) {
		file->f_ep = NULL;
		if (!is_file_epoll(file)) {
			struct epitems_head *v;

			v = container_of(head, struct epitems_head, epitems);
			if (!smp_load_acquire(&v->next))
				to_free = v;
		}
	}
	hlist_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * "struct epitem". The "container_of" is done in the RCU callback.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	percpu_counter_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for these files */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(ep, NULL);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 */
	mutex_lock(&epmutex);

	/*
	 * Walk through the whole tree, unregistering poll callbacks.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walk through the whole tree, freeing each "struct epitem".
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
	struct eventpoll *ep = file->private_data;
	LIST_HEAD(txlist);
	struct epitem *epi, *tmp;
	poll_table pt;
	__poll_t res = 0;

	init_poll_funcptr(&pt, NULL);

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	mutex_lock_nested(&ep->mtx, depth);
	ep_start_scan(ep, &txlist);
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		if (ep_item_poll(epi, &pt, depth + 1)) {
			res = EPOLLIN | EPOLLRDNORM;
			break;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * the caller requested events go. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);
	return res;
}

/*
 * Differs from ep_eventpoll_poll() in that internal callers already hold
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
			     int depth)
{
	struct file *file = epi->ffd.file;
	__poll_t res;

	pt->_key = epi->event.events;
	if (!is_file_epoll(file))
		res = vfs_poll(file, pt);
	else
		res = __ep_eventpoll_poll(file, pt, depth);
	return res & epi->event.events;
}

static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%lx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};
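
/*
 * This is called from eventpoll_release() when a file is being closed, so
 * that any epitem still referencing the file can be unlinked from the
 * eventpoll sets watching it.  It walks file->f_ep under epmutex and
 * removes each item under the owning ep->mtx.
 */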
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi;
	struct hlist_node *next;

	mutex_lock(&epmutex);
	if (unlikely(!file->f_ep)) {
		mutex_unlock(&epmutex);
		return;
	}
	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
		ep = epi->ep;
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}
	mutex_unlock(&epmutex);
}

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

#ifdef CONFIG_KCMP
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
	struct rb_node *rbp;
	struct epitem *epi;

	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (epi->ffd.fd == tfd) {
			if (toff == 0)
				return epi;
			else
				toff--;
		}
		cond_resched();
	}

	return NULL;
}

struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
				     unsigned long toff)
{
	struct file *file_raw;
	struct eventpoll *ep;
	struct epitem *epi;

	if (!is_file_epoll(file))
		return ERR_PTR(-EINVAL);

	ep = file->private_data;

	mutex_lock(&ep->mtx);
	epi = ep_find_tfd(ep, tfd, toff);
	if (epi)
		file_raw = epi->ffd.file;
	else
		file_raw = ERR_PTR(-ENOENT);
	mutex_unlock(&ep->mtx);

	return file_raw;
}
#endif
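
/*
 * The ready list and ->ovflist are only ever appended to from the wake-up
 * callback, which runs under read_lock(&ep->lock) and may therefore execute
 * concurrently on several CPUs.  The two helpers below implement those
 * lockless tail insertions; the write_lock(&ep->lock) taken by the scan and
 * wait paths acts as the barrier that excludes them.
 */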

/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Concurrent list_add_tail_lockless() calls must be protected with a read
 * lock, where the write lock acts as a barrier which makes sure all calls
 * are fully completed.  An element can be locklessly added only in one
 * direction (here: to the tail), otherwise concurrent access corrupts the
 * list.
 *
 * Return: %false if the element has already been added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
					  struct list_head *head)
{
	struct list_head *prev;

	/*
	 * This is a simple 'new->next = head' operation, but cmpxchg()
	 * is used in order to detect that the same element has just been
	 * added to the list from another CPU: the winner observes
	 * new->next == new.
	 */
	if (cmpxchg(&new->next, new, head) != new)
		return false;

	/*
	 * Initially ->next of a new element must be updated with the head
	 * (we are inserting to the tail) and only then pointers are
	 * atomically exchanged.  xchg() guarantees memory ordering, thus
	 * ->next is updated before pointers are actually swapped and
	 * pointers are swapped before prev->next is updated.
	 */
	prev = xchg(&head->prev, new);

	/*
	 * It is safe to modify prev->next and new->prev, because a new
	 * element is added only to the tail and new->next is updated
	 * before the xchg().
	 */
	prev->next = new;
	new->prev = prev;

	return true;
}

/*
 * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
 * i.e. multiple CPUs are allowed to call this function concurrently.
 *
 * Return: %false if the epi element has already been chained, %true otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
	struct eventpoll *ep = epi->ep;

	/* Fast preliminary check */
	if (epi->next != EP_UNACTIVE_PTR)
		return false;

	/* Check that the same epi has not just been chained from another CPU */
	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
		return false;

	/* Atomically exchange tail */
	epi->next = xchg(&ep->ovflist, epi);

	return true;
}
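
/*
 * This is the callback that is passed to the wait queue wakeup mechanism.
 * It is called by the stored file descriptors when they have events to
 * report.  It runs under read_lock(&ep->lock) (possibly in interrupt
 * context), queues the item on ->rdllist or, while a scan is in progress,
 * on ->ovflist, and wakes up the epoll_wait() waiters and the nested
 * ->poll() waiters.
 */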
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	read_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider
	 * the descriptor to be disabled. This condition is likely the effect
	 * of the EPOLLONESHOT bit that disables the descriptor when an event
	 * is received, until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback, so we test for a non-zero "key" before the event match.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks,
	 * so events that happen during that period are chained in
	 * ep->ovflist and requeued later on.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
	} else if (!ep_is_linked(epi)) {
		/* In the usual case, add event to ready list. */
		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
			ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up (if active) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	read_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, epi);

	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with
		 * ep_remove_wait_queue(), which takes whead->lock held by
		 * the caller. Once we nullify it, nothing protects ep/epi
		 * or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
	struct epitem *epi = epq->epi;
	struct eppoll_entry *pwq;

	if (unlikely(!epi))	/* an earlier allocation has failed */
		return;

	pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
	if (unlikely(!pwq)) {
		epq->epi = NULL;
		return;
	}

	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
	pwq->whead = whead;
	pwq->base = epi;
	if (epi->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &pwq->wait);
	else
		add_wait_queue(whead, &pwq->wait);
	pwq->next = epi->pwqlist;
	epi->pwqlist = pwq;
}

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
	struct epitem *epic;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0) {
			p = &parent->rb_right;
			leftmost = false;
		} else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}

#define PATH_ARR_SIZE 5
/*
 * These are the number of paths of length 1 to 5 that we are allowing to
 * emanate from a single file of interest. For example, we allow 1000 paths
 * of length 1 to emanate from each file of interest. This essentially
 * limits the potential wakeup paths, in order to avoid massive uncontrolled
 * wakeup storms. The common case, a single path, is fine.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

static int path_count_inc(int nests)
{
	/* Allow an arbitrary number of depth-0 paths */
	if (nests == 0)
		return 0;

	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

static void path_count_init(void)
{
	int i;

	for (i = 0; i < PATH_ARR_SIZE; i++)
		path_count[i] = 0;
}

static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
	int error = 0;
	struct epitem *epi;

	if (depth > EP_MAX_NESTS) /* too deep nesting */
		return -1;

	/* CTL_DEL can remove links here, but that can't increase our count */
	hlist_for_each_entry_rcu(epi, refs, fllink) {
		struct hlist_head *refs = &epi->ep->refs;

		if (hlist_empty(refs))
			error = path_count_inc(depth);
		else
			error = reverse_path_check_proc(refs, depth + 1);
		if (error != 0)
			break;
	}
	return error;
}

/*
 * reverse_path_check - The tfile_check_list is a list of epitems_head whose
 *                      links are proposed to be newly added. Make sure those
 *                      added links do not create too many paths, such that
 *                      we would spend all our time waking up eventpoll
 *                      objects.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *	   %-1 otherwise.
 */
static int reverse_path_check(void)
{
	struct epitems_head *p;

	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
		int error;

		path_count_init();
		rcu_read_lock();
		error = reverse_path_check_proc(&p->epitems, 0);
		rcu_read_unlock();
		if (error)
			return error;
	}
	return 0;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
	struct name_snapshot n;
	struct wakeup_source *ws;

	if (!epi->ep->ws) {
		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
		if (!epi->ep->ws)
			return -ENOMEM;
	}

	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
	ws = wakeup_source_register(NULL, n.name.name);
	release_dentry_name_snapshot(&n);

	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}

/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * Wait for ep_pm_stay_awake_rcu() to finish. synchronize_rcu() is
	 * used internally by wakeup_source_remove() too (called by
	 * wakeup_source_unregister()), so we cannot use call_rcu() here.
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}

static int attach_epitem(struct file *file, struct epitem *epi)
{
	struct epitems_head *to_free = NULL;
	struct hlist_head *head = NULL;
	struct eventpoll *ep = NULL;

	if (is_file_epoll(file))
		ep = file->private_data;

	if (ep) {
		head = &ep->refs;
	} else if (!READ_ONCE(file->f_ep)) {
allocate:
		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
		if (!to_free)
			return -ENOMEM;
		head = &to_free->epitems;
	}
	spin_lock(&file->f_lock);
	if (!file->f_ep) {
		if (unlikely(!head)) {
			spin_unlock(&file->f_lock);
			goto allocate;
		}
		file->f_ep = head;
		to_free = NULL;
	}
	hlist_add_head_rcu(&epi->fllink, file->f_ep);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);
	return 0;
}

/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	struct epitem *epi;
	struct ep_pqueue epq;
	struct eventpoll *tep = NULL;

	if (is_file_epoll(tfile))
		tep = tfile->private_data;

	lockdep_assert_irqs_enabled();

	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
					    max_user_watches) >= 0))
		return -ENOSPC;
	percpu_counter_inc(&ep->user->epoll_watches);

	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
		percpu_counter_dec(&ep->user->epoll_watches);
		return -ENOMEM;
	}

	/* Item initialization follows here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->next = EP_UNACTIVE_PTR;

	if (tep)
		mutex_lock_nested(&tep->mtx, 1);
	/* Add the current item to the list of active epoll hooks for this file */
	if (unlikely(attach_epitem(tfile, epi) < 0)) {
		if (tep)
			mutex_unlock(&tep->mtx);
		kmem_cache_free(epi_cache, epi);
		percpu_counter_dec(&ep->user->epoll_watches);
		return -ENOMEM;
	}

	if (full_check && !tep)
		list_file(tfile);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);
	if (tep)
		mutex_unlock(&tep->mtx);

	/* now check if we've created too many backpaths */
	if (unlikely(full_check && reverse_path_check())) {
		ep_remove(ep, epi);
		return -EINVAL;
	}

	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error) {
			ep_remove(ep, epi);
			return error;
		}
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait
	 * queue install process, namely a wait queue allocation failing
	 * under high memory pressure.
	 */
	if (unlikely(!epq.epi)) {
		ep_remove(ep, epi);
		return -ENOMEM;
	}

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL);

	return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     const struct epoll_event *event)
{
	int pwake = 0;
	poll_table pt;

	lockdep_assert_irqs_enabled();

	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data; /* protected by mtx */
	if (epi->event.events & EPOLLWAKEUP) {
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs. This ensures we do
	 *    not miss events from ep_poll_callback if an event occurs
	 *    immediately after we call f_op->poll().
	 *
	 * 2) We also need to ensure we do not miss _past_ events when
	 *    calling f_op->poll(). This barrier also pairs with the
	 *    barrier in wq_has_sleeper().
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (ep_item_poll(epi, &pt, 1)) {
		write_lock_irq(&ep->lock);
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		write_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL);

	return 0;
}
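
/*
 * Transfers ready events to the user-supplied buffer.  Runs under ep->mtx,
 * with the ready list temporarily stolen via ep_start_scan(); each item is
 * re-polled so that only events still pending (and matching the requested
 * mask) are reported, and level-triggered items are put back on ->rdllist.
 */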
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct epitem *epi, *tmp;
	LIST_HEAD(txlist);
	poll_table pt;
	int res = 0;

	/*
	 * Always short-circuit for fatal signals to allow threads to make a
	 * timely exit without the chance of finding more events available
	 * and fetching repeatedly.
	 */
	if (fatal_signal_pending(current))
		return -EINTR;

	init_poll_funcptr(&pt, NULL);

	mutex_lock(&ep->mtx);
	ep_start_scan(ep, &txlist);

	/*
	 * We can loop without lock because we are passed a task private
	 * list. Items cannot vanish during the loop because we are holding
	 * "mtx".
	 */
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		struct wakeup_source *ws;
		__poll_t revents;

		if (res >= maxevents)
			break;

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate
		 * epi->ws below).
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, we are holding
		 * "mtx", so no operations coming from userspace can change
		 * the item.
		 */
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		events = epoll_put_uevent(revents, epi->event.data, events);
		if (!events) {
			list_add(&epi->rdllink, &txlist);
			ep_pm_stay_awake(epi);
			if (!res)
				res = -EFAULT;
			break;
		}
		res++;
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			/*
			 * If this file has been added with Level Trigger
			 * mode, we need to insert it back inside the ready
			 * list, so that the next call to epoll_wait() will
			 * check the event availability again. At this point
			 * nobody else can insert into ep->rdllist: epoll_ctl()
			 * callers are locked out by us holding "mtx", and the
			 * poll callback queues into ep->ovflist.
			 */
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);

	return res;
}

static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
	struct timespec64 now;

	if (ms < 0)
		return NULL;

	if (!ms) {
		to->tv_sec = 0;
		to->tv_nsec = 0;
		return to;
	}

	to->tv_sec = ms / MSEC_PER_SEC;
	to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);

	ktime_get_ts64(&now);
	*to = timespec64_add_safe(now, *to);
	return to;
}

/*
 * autoremove_wake_function, but remove even on failure to wake up, because
 * we know that default_wake_function/ttwu will only fail if the thread is
 * already woken, and in that case the ep_poll loop will remove the entry
 * anyway, not try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wq_entry, mode, sync, key);

	list_del_init(&wq_entry->entry);
	return ret;
}
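
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, or an
 *         error code, in case of error.
 */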
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, struct timespec64 *timeout)
{
	int res, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
		slack = select_estimate_accuracy(timeout);
		to = &expires;
		*to = timespec64_to_ktime(*timeout);
	} else if (timeout) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
	}

	/*
	 * This call is racy: we may or may not see events that are being
	 * added to the ready list under the lock (e.g. in IRQ callbacks).
	 * For a non-zero timeout, this thread will recheck the ready list
	 * under the lock before adding itself to the wait queue. For a
	 * zero timeout, the caller by definition has to recheck anyway.
	 */
	eavail = ep_events_available(ep);

	while (1) {
		if (eavail) {
			/*
			 * Try to transfer events to user space. In case we
			 * get 0 events and there is still timeout left over,
			 * we go trying again in search of more luck.
			 */
			res = ep_send_events(ep, events, maxevents);
			if (res)
				return res;
		}

		if (timed_out)
			return 0;

		eavail = ep_busy_loop(ep, timed_out);
		if (eavail)
			continue;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * The wait entry is removed on every wakeup (see
		 * ep_autoremove_wake_function above), so with several
		 * waiters each new wakeup hits the next waiter, giving it
		 * the chance to harvest new events. This also avoids taking
		 * ep->lock on the normal wakeup path.
		 */
		init_wait(&wait);
		wait.func = ep_autoremove_wake_function;

		write_lock_irq(&ep->lock);
		/*
		 * Barrierless variant: waitqueue_active() is called under
		 * the same lock on the wakeup side in ep_poll_callback(),
		 * so it is safe to avoid an explicit barrier.
		 */
		__set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Do the final check under the lock. ep_start/done_scan()
		 * plays with two lists (->rdllist and ->ovflist) and there
		 * is always a short window when both lists are empty even
		 * though events are pending, so the lock is important.
		 */
		eavail = ep_events_available(ep);
		if (!eavail)
			__add_wait_queue_exclusive(&ep->wq, &wait);

		write_unlock_irq(&ep->lock);

		if (!eavail)
			timed_out = !schedule_hrtimeout_range(to, slack,
							      HRTIMER_MODE_ABS);
		__set_current_state(TASK_RUNNING);

		/*
		 * We were woken up, thus go and try to harvest some events.
		 * If timed out and still on the wait queue, recheck eavail
		 * carefully under lock, below.
		 */
		eavail = 1;

		if (!list_empty_careful(&wait.entry)) {
			write_lock_irq(&ep->lock);
			/*
			 * If the thread timed out and is not on the wait
			 * queue, it means that the thread was woken up after
			 * its timeout expired before it could reacquire the
			 * lock. Thus, when wait.entry is empty, it needs to
			 * harvest events.
			 */
			if (timed_out)
				eavail = list_empty(&wait.entry);
			__remove_wait_queue(&ep->wq, &wait);
			write_unlock_irq(&ep->lock);
		}
	}
}

/*
 * ep_loop_check_proc - verify that adding an epoll file inside another epoll
 *                      structure does not violate the constraints, in terms
 *                      of closed loops, or too deep chains.
 *
 * Return: %zero if adding the epoll file inside the current epoll structure
 *         @ep does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
	int error = 0;
	struct rb_node *rbp;
	struct epitem *epi;

	mutex_lock_nested(&ep->mtx, depth + 1);
	ep->gen = loop_check_gen;
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			struct eventpoll *ep_tovisit;

			ep_tovisit = epi->ffd.file->private_data;
			if (ep_tovisit->gen == loop_check_gen)
				continue;
			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
				error = -1;
			else
				error = ep_loop_check_proc(ep_tovisit, depth + 1);
			if (error != 0)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated
			 * with an ep, then we need to check if the newly
			 * added links are going to add too many wakeup
			 * paths. We do this by adding it to the
			 * tfile_check_list, if it's not already there, and
			 * calling reverse_path_check() during ep_insert().
			 */
			list_file(epi->ffd.file);
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}

/*
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not
 *                 create closed loops or too deep chains.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep does not
 *         violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
	inserting_into = ep;
	return ep_loop_check_proc(to, 0);
}

static void clear_tfile_check_list(void)
{
	rcu_read_lock();
	while (tfile_check_list != EP_UNACTIVE_PTR) {
		struct epitems_head *head = tfile_check_list;

		tfile_check_list = head->next;
		unlist_file(head);
	}
	rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Create all the items needed to setup an eventpoll file: a file
	 * structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}

static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (!nonblock) {
		mutex_lock_nested(mutex, depth);
		return 0;
	}
	if (mutex_trylock(mutex))
		return 0;
	return -EAGAIN;
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file
	 * descriptor the user passed to us _is_ an eventpoll file. And also
	 * we do not permit adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops or too
	 * many wakeup paths. Those are checked below, under the global
	 * epmutex, but only when the insertion can actually create such
	 * chains (nested epoll files or files already linked to an epoll
	 * set); the common case takes only ep->mtx.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(tf.file)) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epmutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epmutex);
	}

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, to);

error_fput:
	fdput(f);
	return error;
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct timespec64 to;

	return do_epoll_wait(epfd, events, maxevents,
			     ep_timeout_to_timespec(&to, timeout));
}

/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_pwait(2).
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
			  int maxevents, struct timespec64 *to,
			  const sigset_t __user *sigmask, size_t sigsetsize)
{
	int error;

	/*
	 * If the caller wants a certain signal mask to be set during the
	 * wait, we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;

	error = do_epoll_wait(epfd, events, maxevents, to);

	restore_saved_sigmask_unless(error == -EINTR);

	return error;
}

SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 to;

	return do_epoll_pwait(epfd, events, maxevents,
			      ep_timeout_to_timespec(&to, timeout),
			      sigmask, sigsetsize);
}

SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
		int, maxevents, const struct __kernel_timespec __user *, timeout,
		const sigset_t __user *, sigmask, size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_epoll_pwait(epfd, events, maxevents, to,
			      sigmask, sigsetsize);
}

#ifdef CONFIG_COMPAT
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
				 int maxevents, struct timespec64 *timeout,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize)
{
	long err;

	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = do_epoll_wait(epfd, events, maxevents, timeout);

	restore_saved_sigmask_unless(err == -EINTR);

	return err;
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents, int, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 to;

	return do_compat_epoll_pwait(epfd, events, maxevents,
				     ep_timeout_to_timespec(&to, timeout),
				     sigmask, sigsetsize);
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents,
		       const struct __kernel_timespec __user *, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_compat_epoll_pwait(epfd, events, maxevents, to,
				     sigmask, sigsetsize);
}

#endif

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allow the top 4% of lomem to be allocated for epoll watches
	 * (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs.
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
	epoll_sysctls_init();

	ephead_cache = kmem_cache_create("ep_head",
		sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);