/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
#include <linux/vmalloc.h>

#include <linux/uaccess.h>


/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */
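
/*
 * Worked example: a 2 second timeout gets 2 ms of slack (0.1%), or 10 ms
 * for a task with a positive nice value (0.5%).  Timeouts of 100 seconds
 * or more (20 seconds for "nice" tasks) are capped at MAX_SLACK.
 */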

#define MAX_SLACK   (100 * NSEC_PER_MSEC)

static long __estimate_accuracy(struct timespec64 *tv)
{
    long slack;
    int divfactor = 1000;

    if (tv->tv_sec < 0)
        return 0;

    if (task_nice(current) > 0)
        divfactor = divfactor / 5;

    if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
        return MAX_SLACK;

    slack = tv->tv_nsec / divfactor;
    slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

    if (slack > MAX_SLACK)
        return MAX_SLACK;

    return slack;
}

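/*
 * Combine the heuristic above with the per-task timer slack (settable via
 * prctl(PR_SET_TIMERSLACK)): whichever of the two is larger wins.
 */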
u64 select_estimate_accuracy(struct timespec64 *tv)
{
    u64 ret;
    struct timespec64 now;

    /*
     * Realtime tasks get a slack of 0 for obvious reasons.
     */

    if (rt_task(current))
        return 0;

    ktime_get_ts64(&now);
    now = timespec64_sub(*tv, now);
    ret = __estimate_accuracy(&now);
    if (ret < current->timer_slack_ns)
        return current->timer_slack_ns;
    return ret;
}



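/*
 * A poll_table_page is one page worth of poll_table_entry slots, used once
 * the few inline entries in struct poll_wqueues are exhausted.  Pages are
 * chained through ->next and ->entry points at the first unused slot, so
 * poll_get_entry() below can hand out entries until POLL_TABLE_FULL().
 */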
struct poll_table_page {
    struct poll_table_page * next;
    struct poll_table_entry * entry;
    struct poll_table_entry entries[0];
};

#define POLL_TABLE_FULL(table) \
    ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
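
/*
 * Illustrative sketch (not from this file) of the pattern a hypothetical
 * driver's ->poll() method follows: register on its waitqueue via
 * poll_wait(), then report whatever is currently ready.
 *
 *    static unsigned int foo_poll(struct file *file, poll_table *wait)
 *    {
 *        struct foo *foo = file->private_data;
 *        unsigned int mask = 0;
 *
 *        poll_wait(file, &foo->read_wq, wait);
 *        if (foo->data_available)
 *            mask |= POLLIN | POLLRDNORM;
 *        return mask;
 *    }
 */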
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
               poll_table *p);

void poll_initwait(struct poll_wqueues *pwq)
{
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
    remove_wait_queue(entry->wait_address, &entry->wait);
    fput(entry->filp);
}

void poll_freewait(struct poll_wqueues *pwq)
{
    struct poll_table_page * p = pwq->table;
    int i;
    for (i = 0; i < pwq->inline_index; i++)
        free_poll_entry(pwq->inline_entries + i);
    while (p) {
        struct poll_table_entry * entry;
        struct poll_table_page *old;

        entry = p->entry;
        do {
            entry--;
            free_poll_entry(entry);
        } while (entry > p->entries);
        old = p;
        p = p->next;
        free_page((unsigned long) old);
    }
}
EXPORT_SYMBOL(poll_freewait);

static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
    struct poll_table_page *table = p->table;

    if (p->inline_index < N_INLINE_POLL_ENTRIES)
        return p->inline_entries + p->inline_index++;

    if (!table || POLL_TABLE_FULL(table)) {
        struct poll_table_page *new_table;

        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            return NULL;
        }
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }

    return table->entry++;
}

static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions.  The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with smp_store_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync.  @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;

    entry = container_of(wait, struct poll_table_entry, wait);
    if (key && !((unsigned long)key & entry->key))
        return 0;
    return __pollwake(wait, mode, sync, key);
}

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    entry->filp = get_file(filp);
    entry->wait_address = wait_address;
    entry->key = p->_key;
    init_waitqueue_func_entry(&entry->wait, pollwake);
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
              ktime_t *expires, unsigned long slack)
{
    int rc = -EINTR;

    set_current_state(state);
    if (!pwq->triggered)
        rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
    __set_current_state(TASK_RUNNING);

    /*
     * Prepare for the next iteration.
     *
     * The following smp_store_mb() serves two purposes.  First, it's
     * the counterpart rmb of the wmb in pollwake() such that data
     * written before wake up is always visible after wake up.
     * Second, the full barrier guarantees that triggered clearing
     * doesn't pass event check of the next iteration.  Note that
     * this problem doesn't exist for the first iteration as
     * add_wait_queue() has full barrier semantics.
     */
    smp_store_mb(pwq->triggered, 0);

    return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:     pointer to timespec64 variable for the final timeout
 * @sec:    seconds (from user space)
 * @nsec:   nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
    struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};

    if (!timespec64_valid(&ts))
        return -EINVAL;

    /* Optimize for the zero timeout value here */
    if (!sec && !nsec) {
        to->tv_sec = to->tv_nsec = 0;
    } else {
        ktime_get_ts64(to);
        *to = timespec64_add_safe(*to, ts);
    }
    return 0;
}

static int poll_select_copy_remaining(struct timespec64 *end_time,
                      void __user *p,
                      int timeval, int ret)
{
    struct timespec64 rts64;
    struct timespec rts;
    struct timeval rtv;

    if (!p)
        return ret;

    if (current->personality & STICKY_TIMEOUTS)
        goto sticky;

    /* No update for zero timeout */
    if (!end_time->tv_sec && !end_time->tv_nsec)
        return ret;

    ktime_get_ts64(&rts64);
    rts64 = timespec64_sub(*end_time, rts64);
    if (rts64.tv_sec < 0)
        rts64.tv_sec = rts64.tv_nsec = 0;

    rts = timespec64_to_timespec(rts64);

    if (timeval) {
        if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
            memset(&rtv, 0, sizeof(rtv));
        rtv.tv_sec = rts64.tv_sec;
        rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;

        if (!copy_to_user(p, &rtv, sizeof(rtv)))
            return ret;

    } else if (!copy_to_user(p, &rts, sizeof(rts)))
        return ret;

    /*
     * If an application puts its timeval in read-only memory, we
     * don't want the Linux-specific update to the timeval to
     * cause a fault after the select has completed
     * successfully. However, because we're not updating the
     * timeval, we can't restart the system call.
     */

sticky:
    if (ret == -ERESTARTNOHAND)
        ret = -EINTR;
    return ret;
}

#define FDS_IN(fds, n)      (fds->in + n)
#define FDS_OUT(fds, n)     (fds->out + n)
#define FDS_EX(fds, n)      (fds->ex + n)

#define BITS(fds, n)    (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

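/*
 * Scan the three input bitmaps together, verify that every bit the caller
 * set refers to a currently open descriptor (otherwise -EBADF), and return
 * the highest interesting fd plus one so do_select() can clamp its loop.
 */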
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
    unsigned long *open_fds;
    unsigned long set;
    int max;
    struct fdtable *fdt;

    /* handle the last incomplete long-word first */
    set = ~(~0UL << (n & (BITS_PER_LONG-1)));
    n /= BITS_PER_LONG;
    fdt = files_fdtable(current->files);
    open_fds = fdt->open_fds + n;
    max = 0;
    if (set) {
        set &= BITS(fds, n);
        if (set) {
            if (!(set & ~*open_fds))
                goto get_max;
            return -EBADF;
        }
    }
    while (n) {
        open_fds--;
        n--;
        set = BITS(fds, n);
        if (!set)
            continue;
        if (set & ~*open_fds)
            return -EBADF;
        if (max)
            continue;
get_max:
        do {
            max++;
            set >>= 1;
        } while (set);
        max += n * BITS_PER_LONG;
    }

    return max;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

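/*
 * These masks translate select()'s three fd_set categories into poll events:
 * readers also wake on hangup or error, writers on error, and the exception
 * set maps to urgent data (POLLPRI).  wait_key_set() always includes
 * POLLEX_SET in the key, so exceptional conditions are never filtered out.
 */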
static inline void wait_key_set(poll_table *wait, unsigned long in,
                unsigned long out, unsigned long bit,
                unsigned int ll_flag)
{
    wait->_key = POLLEX_SET | ll_flag;
    if (in & bit)
        wait->_key |= POLLIN_SET;
    if (out & bit)
        wait->_key |= POLLOUT_SET;
}

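/*
 * The core select loop: walk the in/out/ex bitmaps one long-word at a time,
 * call ->poll() on every descriptor that appears in any of them, and record
 * ready bits in the res_* bitmaps.  Once a result is found (or everything is
 * registered on its waitqueue) _qproc is cleared so further ->poll() calls
 * don't re-add wait entries; otherwise we sleep in poll_schedule_timeout()
 * until an event, a signal or the timeout.
 */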
int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    ktime_t expire, *to = NULL;
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i, timed_out = 0;
    u64 slack = 0;
    unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
    unsigned long busy_end = 0;

    rcu_read_lock();
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
        bool can_busy_loop = false;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += BITS_PER_LONG;
                continue;
            }

            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                struct fd f;
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                f = fdget(i);
                if (f.file) {
                    const struct file_operations *f_op;
                    f_op = f.file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op->poll) {
                        wait_key_set(wait, in, out,
                                 bit, busy_flag);
                        mask = (*f_op->poll)(f.file, wait);
                    }
                    fdput(f);
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    /* got something, stop busy polling */
                    if (retval) {
                        can_busy_loop = false;
                        busy_flag = 0;

                    /*
                     * only remember a returned
                     * POLL_BUSY_LOOP if we asked for it
                     */
                    } else if (busy_flag & mask)
                        can_busy_loop = true;

                }
            }
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
            cond_resched();
        }
        wait->_qproc = NULL;
        if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }

        /* only if found POLL_BUSY_LOOP sockets && not out of time */
        if (can_busy_loop && !need_resched()) {
            if (!busy_end) {
                busy_end = busy_loop_end_time();
                continue;
            }
            if (!busy_loop_timeout(busy_end))
                continue;
        }
        busy_flag = 0;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }

        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    }

    poll_freewait(&table);

    return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restarts only when you want to.
 */
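
/*
 * core_sys_select() copies the user fd_sets into a single kernel buffer
 * (on the stack when small enough, otherwise kmalloc with a vmalloc
 * fallback), runs do_select(), and copies the three result sets back to
 * user space.
 */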
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    size_t size, alloc_size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words.
     */
    size = FDS_BYTES(n);
    bits = stack_fds;
    if (size > sizeof(stack_fds) / 6) {
        /* Not enough space in on-stack array; must use kmalloc */
        ret = -ENOMEM;
        if (size > (SIZE_MAX / 6))
            goto out_nofds;

        alloc_size = 6 * size;
        bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
        if (!bits && alloc_size > PAGE_SIZE)
            bits = vmalloc(alloc_size);

        if (!bits)
            goto out_nofds;
    }
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;

    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    if (ret < 0)
        goto out;
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    if (bits != stack_fds)
        kvfree(bits);
out_nofds:
    return ret;
}

SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct timeval __user *, tvp)
{
    struct timespec64 end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

    return ret;
}

static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec __user *tsp,
               const sigset_t __user *sigmask, size_t sigsetsize)
{
    sigset_t ksigmask, sigsaved;
    struct timespec ts;
    struct timespec64 ts64, end_time, *to = NULL;
    int ret;

    if (tsp) {
        if (copy_from_user(&ts, tsp, sizeof(ts)))
            return -EFAULT;
        ts64 = timespec_to_timespec64(ts);

        to = &end_time;
        if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
            return -EINVAL;
    }

    if (sigmask) {
        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
            return -EINVAL;
        if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
            return -EFAULT;

        sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
        sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

    if (ret == -ERESTARTNOHAND) {
        /*
         * Don't restore the signal mask yet. Let do_signal() deliver
         * the signal on the way back to userspace, before the signal
         * mask is restored.
         */
        if (sigmask) {
            memcpy(&current->saved_sigmask, &sigsaved,
                    sizeof(sigsaved));
            set_restore_sigmask();
        }
    } else if (sigmask)
        sigprocmask(SIG_SETMASK, &sigsaved, NULL);

    return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
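
/*
 * Illustrative layout of that structure as it is read below (the struct
 * name is hypothetical, the kernel never sees it as a type):
 *
 *    struct pselect6_sig {
 *        const sigset_t *sigmask;    pointer, may be NULL
 *        size_t sigsetsize;          checked against the kernel's
 *                                    sizeof(sigset_t) in do_pselect()
 *    };
 */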
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct timespec __user *, tsp,
        void __user *, sig)
{
    size_t sigsetsize = 0;
    sigset_t __user *up = NULL;

    if (sig) {
        if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
            || __get_user(up, (sigset_t __user * __user *)sig)
            || __get_user(sigsetsize,
                (size_t __user *)(sig+sizeof(void *))))
            return -EFAULT;
    }

    return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
}

#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
    unsigned long n;
    fd_set __user *inp, *outp, *exp;
    struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
    struct sel_arg_struct a;

    if (copy_from_user(&a, arg, sizeof(a)))
        return -EFAULT;
    return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

struct poll_list {
    struct poll_list *next;
    int len;
    struct pollfd entries[0];
};

#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
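
/*
 * With 4 KiB pages, an 8-byte struct pollfd and a 16-byte poll_list header
 * (a typical 64-bit configuration) this works out to 510 pollfds per chunk.
 */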

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                     bool *can_busy_poll,
                     unsigned int busy_flag)
{
    unsigned int mask;
    int fd;

    mask = 0;
    fd = pollfd->fd;
    if (fd >= 0) {
        struct fd f = fdget(fd);
        mask = POLLNVAL;
        if (f.file) {
            mask = DEFAULT_POLLMASK;
            if (f.file->f_op->poll) {
                pwait->_key = pollfd->events|POLLERR|POLLHUP;
                pwait->_key |= busy_flag;
                mask = f.file->f_op->poll(f.file, pwait);
                if (mask & busy_flag)
                    *can_busy_poll = true;
            }
            /* Mask out unneeded events. */
            mask &= pollfd->events | POLLERR | POLLHUP;
            fdput(f);
        }
    }
    pollfd->revents = mask;

    return mask;
}

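/*
 * The poll counterpart of do_select(): walk every pollfd in the chained
 * poll_list, fill in ->revents via do_pollfd(), and sleep between passes
 * until at least one descriptor is ready, the timeout expires or a signal
 * arrives.
 */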
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
           struct timespec64 *end_time)
{
    poll_table* pt = &wait->pt;
    ktime_t expire, *to = NULL;
    int timed_out = 0, count = 0;
    u64 slack = 0;
    unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
    unsigned long busy_end = 0;

    /* Optimise the no-wait case */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        pt->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    for (;;) {
        struct poll_list *walk;
        bool can_busy_loop = false;

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt, &can_busy_loop,
                          busy_flag)) {
                    count++;
                    pt->_qproc = NULL;
                    /* found something, stop busy polling */
                    busy_flag = 0;
                    can_busy_loop = false;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table->_qproc to them on the next loop iteration.
         */
        pt->_qproc = NULL;
        if (!count) {
            count = wait->error;
            if (signal_pending(current))
                count = -EINTR;
        }
        if (count || timed_out)
            break;

        /* only if found POLL_BUSY_LOOP sockets && not out of time */
        if (can_busy_loop && !need_resched()) {
            if (!busy_end) {
                busy_end = busy_loop_end_time();
                continue;
            }
            if (!busy_loop_timeout(busy_end))
                continue;
        }
        busy_flag = 0;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }

        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}

#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
            sizeof(struct pollfd))
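
/*
 * Assuming the usual 256-byte POLL_STACK_ALLOC, that is roughly 30 pollfds
 * handled without any allocation on a 64-bit kernel; larger arrays spill
 * into the kmalloc'ed poll_list chunks built below.
 */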

int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec64 *end_time)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
    /* Allocate small arguments on the stack to save memory and be
       faster - use long to make sure the buffer is aligned properly
       on 64 bit archs to avoid unaligned access */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;

    if (nfds > rlimit(RLIMIT_NOFILE))
        return -EINVAL;

    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;

        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;

        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

    poll_initwait(&table);
    fdcount = do_poll(head, &table, end_time);
    poll_freewait(&table);

    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
    }

    err = fdcount;
out_fds:
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;
        walk = walk->next;
        kfree(pos);
    }

    return err;
}

static long do_restart_poll(struct restart_block *restart_block)
{
    struct pollfd __user *ufds = restart_block->poll.ufds;
    int nfds = restart_block->poll.nfds;
    struct timespec64 *to = NULL, end_time;
    int ret;

    if (restart_block->poll.has_timeout) {
        end_time.tv_sec = restart_block->poll.tv_sec;
        end_time.tv_nsec = restart_block->poll.tv_nsec;
        to = &end_time;
    }

    ret = do_sys_poll(ufds, nfds, to);

    if (ret == -EINTR) {
        restart_block->fn = do_restart_poll;
        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec64 end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);

    if (ret == -EINTR) {
        struct restart_block *restart_block;

        restart_block = &current->restart_block;
        restart_block->fn = do_restart_poll;
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;

        if (timeout_msecs >= 0) {
            restart_block->poll.tv_sec = end_time.tv_sec;
            restart_block->poll.tv_nsec = end_time.tv_nsec;
            restart_block->poll.has_timeout = 1;
        } else
            restart_block->poll.has_timeout = 0;

        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}

SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
        struct timespec __user *, tsp, const sigset_t __user *, sigmask,
        size_t, sigsetsize)
{
    sigset_t ksigmask, sigsaved;
    struct timespec ts;
    struct timespec64 end_time, *to = NULL;
    int ret;

    if (tsp) {
        if (copy_from_user(&ts, tsp, sizeof(ts)))
            return -EFAULT;

        to = &end_time;
        if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
            return -EINVAL;
    }

    if (sigmask) {
        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
            return -EINVAL;
        if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
            return -EFAULT;

        sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
        sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
    }

    ret = do_sys_poll(ufds, nfds, to);

    /* We can restart this syscall, usually */
    if (ret == -EINTR) {
        /*
         * Don't restore the signal mask yet. Let do_signal() deliver
         * the signal on the way back to userspace, before the signal
         * mask is restored.
         */
        if (sigmask) {
            memcpy(&current->saved_sigmask, &sigsaved,
                    sizeof(sigsaved));
            set_restore_sigmask();
        }
        ret = -ERESTARTNOHAND;
    } else if (sigmask)
        sigprocmask(SIG_SETMASK, &sigsaved, NULL);

    ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

    return ret;
}