0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Implement CPU time clocks for the POSIX clock interface.
0004  */
0005 
0006 #include <linux/sched/signal.h>
0007 #include <linux/sched/cputime.h>
0008 #include <linux/posix-timers.h>
0009 #include <linux/errno.h>
0010 #include <linux/math64.h>
0011 #include <linux/uaccess.h>
0012 #include <linux/kernel_stat.h>
0013 #include <trace/events/timer.h>
0014 #include <linux/tick.h>
0015 #include <linux/workqueue.h>
0016 #include <linux/compat.h>
0017 #include <linux/sched/deadline.h>
0018 #include <linux/task_work.h>
0019 
0020 #include "posix-timers.h"
0021 
0022 static void posix_cpu_timer_rearm(struct k_itimer *timer);
0023 
0024 void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
0025 {
0026     posix_cputimers_init(pct);
0027     if (cpu_limit != RLIM_INFINITY) {
0028         pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
0029         pct->timers_active = true;
0030     }
0031 }
0032 
0033 /*
0034  * Called after updating RLIMIT_CPU to run cpu timer and update
0035  * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
0036  * necessary. Needs siglock protection since other code may update the
0037  * expiration cache as well.
0038  *
0039  * Returns 0 on success, -ESRCH on failure.  Can fail if the task is exiting and
0040  * we cannot lock_task_sighand.  Cannot fail if task is current.
0041  */
0042 int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
0043 {
0044     u64 nsecs = rlim_new * NSEC_PER_SEC;
0045     unsigned long irq_fl;
0046 
0047     if (!lock_task_sighand(task, &irq_fl))
0048         return -ESRCH;
0049     set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
0050     unlock_task_sighand(task, &irq_fl);
0051     return 0;
0052 }
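/*
 * Illustrative user-space sketch (not part of this file): update_rlimit_cpu()
 * is normally reached via setrlimit(RLIMIT_CPU), which the rlimit code
 * forwards here so the CPUCLOCK_PROF expiry cache picks up the new limit.
 * The helper name and the 10 second value below are made up for illustration.
 *
 *    #include <sys/resource.h>
 *
 *    static int limit_cpu_seconds(rlim_t secs)
 *    {
 *        struct rlimit rl = { .rlim_cur = secs, .rlim_max = secs };
 *
 *        // The kernel stores this as secs * NSEC_PER_SEC.
 *        return setrlimit(RLIMIT_CPU, &rl);
 *    }
 *
 *    // e.g. limit_cpu_seconds(10);
 */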
0053 
0054 /*
0055  * Functions for validating access to tasks.
0056  */
0057 static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
0058 {
0059     const bool thread = !!CPUCLOCK_PERTHREAD(clock);
0060     const pid_t upid = CPUCLOCK_PID(clock);
0061     struct pid *pid;
0062 
0063     if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
0064         return NULL;
0065 
0066     /*
0067      * If the encoded PID is 0, then the timer is targeted at current
0068      * or the process to which current belongs.
0069      */
0070     if (upid == 0)
0071         return thread ? task_pid(current) : task_tgid(current);
0072 
0073     pid = find_vpid(upid);
0074     if (!pid)
0075         return NULL;
0076 
0077     if (thread) {
0078         struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
0079         return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
0080     }
0081 
0082     /*
0083      * For clock_gettime(PROCESS) allow finding the process with
0084      * the pid of the current task.  The code needs the tgid
0085      * of the process so that pid_task(pid, PIDTYPE_TGID) can be
0086      * used to find the process.
0087      */
0088     if (gettime && (pid == task_pid(current)))
0089         return task_tgid(current);
0090 
0091     /*
0092      * For process clocks, require that the pid identifies a process.
0093      */
0094     return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
0095 }
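/*
 * Illustrative user-space sketch (assumption, not part of this file): rather
 * than hand-encoding the PID bits that pid_for_clock() decodes, applications
 * usually obtain a CPU clock id via clock_getcpuclockid() (or
 * pthread_getcpuclockid()) and pass it to clock_gettime():
 *
 *    #include <stdio.h>
 *    #include <time.h>
 *    #include <unistd.h>
 *
 *    int main(void)
 *    {
 *        clockid_t cid;
 *        struct timespec ts;
 *
 *        if (clock_getcpuclockid(getpid(), &cid))
 *            return 1;
 *        if (clock_gettime(cid, &ts))
 *            return 1;
 *        printf("process CPU time: %lld.%09ld\n",
 *               (long long)ts.tv_sec, ts.tv_nsec);
 *        return 0;
 *    }
 */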
0096 
0097 static inline int validate_clock_permissions(const clockid_t clock)
0098 {
0099     int ret;
0100 
0101     rcu_read_lock();
0102     ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
0103     rcu_read_unlock();
0104 
0105     return ret;
0106 }
0107 
0108 static inline enum pid_type clock_pid_type(const clockid_t clock)
0109 {
0110     return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
0111 }
0112 
0113 static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
0114 {
0115     return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
0116 }
0117 
0118 /*
0119  * Update expiry time from increment, and increase overrun count,
0120  * given the current clock sample.
0121  */
0122 static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
0123 {
0124     u64 delta, incr, expires = timer->it.cpu.node.expires;
0125     int i;
0126 
0127     if (!timer->it_interval)
0128         return expires;
0129 
0130     if (now < expires)
0131         return expires;
0132 
0133     incr = timer->it_interval;
0134     delta = now + incr - expires;
0135 
0136     /* Don't use (incr*2 < delta), incr*2 might overflow. */
0137     for (i = 0; incr < delta - incr; i++)
0138         incr = incr << 1;
0139 
0140     for (; i >= 0; incr >>= 1, i--) {
0141         if (delta < incr)
0142             continue;
0143 
0144         timer->it.cpu.node.expires += incr;
0145         timer->it_overrun += 1LL << i;
0146         delta -= incr;
0147     }
0148     return timer->it.cpu.node.expires;
0149 }
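/*
 * Worked example of the doubling scheme above (illustrative, standalone
 * user-space sketch, not kernel code). With expires = 100, interval = 30 and
 * now = 250: delta = now + incr - expires = 180, the loop advances expires by
 * 6 * 30 = 180 to 280 (> now) and accounts an overrun of 6, without a division:
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    int main(void)
 *    {
 *        uint64_t expires = 100, interval = 30, now = 250;
 *        uint64_t delta = now + interval - expires;
 *        uint64_t incr = interval, overrun = 0;
 *        int i;
 *
 *        // Scale incr up by powers of two, avoiding an incr * 2 overflow.
 *        for (i = 0; incr < delta - incr; i++)
 *            incr <<= 1;
 *
 *        // Subtract the largest fitting chunks back down.
 *        for (; i >= 0; incr >>= 1, i--) {
 *            if (delta < incr)
 *                continue;
 *            expires += incr;
 *            overrun += 1ULL << i;
 *            delta -= incr;
 *        }
 *        printf("expires=%llu overrun=%llu\n",
 *               (unsigned long long)expires, (unsigned long long)overrun);
 *        return 0;
 *    }
 */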
0150 
0151 /* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
0152 static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
0153 {
0154     return !(~pct->bases[CPUCLOCK_PROF].nextevt |
0155          ~pct->bases[CPUCLOCK_VIRT].nextevt |
0156          ~pct->bases[CPUCLOCK_SCHED].nextevt);
0157 }
0158 
0159 static int
0160 posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
0161 {
0162     int error = validate_clock_permissions(which_clock);
0163 
0164     if (!error) {
0165         tp->tv_sec = 0;
0166         tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
0167         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
0168             /*
0169              * If sched_clock is using a cycle counter, we
0170              * have no idea of its true resolution to export,
0171              * but it is much finer than 1s/HZ.
0172              */
0173             tp->tv_nsec = 1;
0174         }
0175     }
0176     return error;
0177 }
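/*
 * Illustrative user-space sketch (not part of this file): querying the
 * resolution reported above. CLOCK_PROCESS_CPUTIME_ID maps to the
 * CPUCLOCK_SCHED based PROCESS_CLOCK defined near the end of this file, so
 * this should print a 1 ns resolution, while the PROF/VIRT variants report
 * 1s/HZ:
 *
 *    #include <stdio.h>
 *    #include <time.h>
 *
 *    int main(void)
 *    {
 *        struct timespec res;
 *
 *        if (clock_getres(CLOCK_PROCESS_CPUTIME_ID, &res))
 *            return 1;
 *        printf("resolution: %ld ns\n", res.tv_nsec);
 *        return 0;
 *    }
 */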
0178 
0179 static int
0180 posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
0181 {
0182     int error = validate_clock_permissions(clock);
0183 
0184     /*
0185      * You can never reset a CPU clock, but we check for other errors
0186      * in the call before failing with EPERM.
0187      */
0188     return error ? : -EPERM;
0189 }
0190 
0191 /*
0192  * Sample a per-thread clock for the given task. clkid is validated.
0193  */
0194 static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
0195 {
0196     u64 utime, stime;
0197 
0198     if (clkid == CPUCLOCK_SCHED)
0199         return task_sched_runtime(p);
0200 
0201     task_cputime(p, &utime, &stime);
0202 
0203     switch (clkid) {
0204     case CPUCLOCK_PROF:
0205         return utime + stime;
0206     case CPUCLOCK_VIRT:
0207         return utime;
0208     default:
0209         WARN_ON_ONCE(1);
0210     }
0211     return 0;
0212 }
0213 
0214 static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
0215 {
0216     samples[CPUCLOCK_PROF] = stime + utime;
0217     samples[CPUCLOCK_VIRT] = utime;
0218     samples[CPUCLOCK_SCHED] = rtime;
0219 }
0220 
0221 static void task_sample_cputime(struct task_struct *p, u64 *samples)
0222 {
0223     u64 stime, utime;
0224 
0225     task_cputime(p, &utime, &stime);
0226     store_samples(samples, stime, utime, p->se.sum_exec_runtime);
0227 }
0228 
0229 static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
0230                        u64 *samples)
0231 {
0232     u64 stime, utime, rtime;
0233 
0234     utime = atomic64_read(&at->utime);
0235     stime = atomic64_read(&at->stime);
0236     rtime = atomic64_read(&at->sum_exec_runtime);
0237     store_samples(samples, stime, utime, rtime);
0238 }
0239 
0240 /*
0241  * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
0242  * to avoid race conditions with concurrent updates to cputime.
0243  */
0244 static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
0245 {
0246     u64 curr_cputime;
0247 retry:
0248     curr_cputime = atomic64_read(cputime);
0249     if (sum_cputime > curr_cputime) {
0250         if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
0251             goto retry;
0252     }
0253 }
0254 
0255 static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
0256                   struct task_cputime *sum)
0257 {
0258     __update_gt_cputime(&cputime_atomic->utime, sum->utime);
0259     __update_gt_cputime(&cputime_atomic->stime, sum->stime);
0260     __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
0261 }
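/*
 * Minimal standalone sketch of the same lock-free "monotonic max" pattern
 * using C11 atomics (an assumption for illustration; the kernel code above
 * uses atomic64_cmpxchg()). Concurrent callers can only ever move the value
 * forward, never backward:
 *
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *
 *    static void update_gt(_Atomic uint64_t *val, uint64_t sum)
 *    {
 *        uint64_t cur = atomic_load(val);
 *
 *        // Retry until sum is no longer ahead or our store won the race.
 *        while (sum > cur &&
 *               !atomic_compare_exchange_weak(val, &cur, sum))
 *            ;
 *    }
 */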
0262 
0263 /**
0264  * thread_group_sample_cputime - Sample cputime for a given task
0265  * @tsk:    Task for which cputime needs to be sampled
0266  * @samples:    Storage for time samples
0267  *
0268  * Called from sys_getitimer() to calculate the expiry time of an active
0269  * timer. That means group cputime accounting is already active. Called
0270  * with task sighand lock held.
0271  *
0272  * Updates @samples with an up-to-date sample of the thread group cputimes.
0273  */
0274 void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
0275 {
0276     struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
0277     struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
0278 
0279     WARN_ON_ONCE(!pct->timers_active);
0280 
0281     proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
0282 }
0283 
0284 /**
0285  * thread_group_start_cputime - Start cputime and return a sample
0286  * @tsk:    Task for which cputime needs to be started
0287  * @samples:    Storage for time samples
0288  *
0289  * The thread group cputime accounting is avoided when there are no posix
0290  * CPU timers armed. Before starting a timer it's required to check whether
0291  * the time accounting is active. If not, a full update of the atomic
0292  * accounting store needs to be done and the accounting enabled.
0293  *
0294  * Updates @samples with an up-to-date sample of the thread group cputimes.
0295  */
0296 static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
0297 {
0298     struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
0299     struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
0300 
0301     lockdep_assert_task_sighand_held(tsk);
0302 
0303     /* Check if cputimer isn't running. This is accessed without locking. */
0304     if (!READ_ONCE(pct->timers_active)) {
0305         struct task_cputime sum;
0306 
0307         /*
0308          * The POSIX timer interface allows for absolute time expiry
0309          * values through the TIMER_ABSTIME flag, therefore we have
0310          * to synchronize the timer to the clock every time we start it.
0311          */
0312         thread_group_cputime(tsk, &sum);
0313         update_gt_cputime(&cputimer->cputime_atomic, &sum);
0314 
0315         /*
0316          * We're setting timers_active without a lock. Ensure this
0317          * only gets written to in one operation. We set it after
0318          * update_gt_cputime() as a small optimization, but
0319          * barriers are not required because update_gt_cputime()
0320          * can handle concurrent updates.
0321          */
0322         WRITE_ONCE(pct->timers_active, true);
0323     }
0324     proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
0325 }
0326 
0327 static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
0328 {
0329     struct task_cputime ct;
0330 
0331     thread_group_cputime(tsk, &ct);
0332     store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
0333 }
0334 
0335 /*
0336  * Sample a process (thread group) clock for the given task clkid. If the
0337  * group's cputime accounting is already enabled, read the atomic
0338  * store. Otherwise a full update is required.  clkid is already validated.
0339  */
0340 static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
0341                   bool start)
0342 {
0343     struct thread_group_cputimer *cputimer = &p->signal->cputimer;
0344     struct posix_cputimers *pct = &p->signal->posix_cputimers;
0345     u64 samples[CPUCLOCK_MAX];
0346 
0347     if (!READ_ONCE(pct->timers_active)) {
0348         if (start)
0349             thread_group_start_cputime(p, samples);
0350         else
0351             __thread_group_cputime(p, samples);
0352     } else {
0353         proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
0354     }
0355 
0356     return samples[clkid];
0357 }
0358 
0359 static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
0360 {
0361     const clockid_t clkid = CPUCLOCK_WHICH(clock);
0362     struct task_struct *tsk;
0363     u64 t;
0364 
0365     rcu_read_lock();
0366     tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
0367     if (!tsk) {
0368         rcu_read_unlock();
0369         return -EINVAL;
0370     }
0371 
0372     if (CPUCLOCK_PERTHREAD(clock))
0373         t = cpu_clock_sample(clkid, tsk);
0374     else
0375         t = cpu_clock_sample_group(clkid, tsk, false);
0376     rcu_read_unlock();
0377 
0378     *tp = ns_to_timespec64(t);
0379     return 0;
0380 }
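/*
 * Illustrative user-space sketch (not part of this file): posix_cpu_clock_get()
 * is what ultimately services clock_gettime() on CLOCK_THREAD_CPUTIME_ID /
 * CLOCK_PROCESS_CPUTIME_ID (see the k_clock tables at the end of this file).
 * A typical use is measuring the CPU time consumed by a stretch of code:
 *
 *    #include <stdio.h>
 *    #include <time.h>
 *
 *    static double cpu_seconds(void)
 *    {
 *        struct timespec ts;
 *
 *        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
 *        return ts.tv_sec + ts.tv_nsec / 1e9;
 *    }
 *
 *    int main(void)
 *    {
 *        double start = cpu_seconds();
 *        volatile unsigned long i, x = 0;
 *
 *        for (i = 0; i < 100000000UL; i++)
 *            x += i;
 *        printf("burned %.3f s of CPU time\n", cpu_seconds() - start);
 *        return 0;
 *    }
 */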
0381 
0382 /*
0383  * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
0384  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
0385  * new timer already all-zeros initialized.
0386  */
0387 static int posix_cpu_timer_create(struct k_itimer *new_timer)
0388 {
0389     static struct lock_class_key posix_cpu_timers_key;
0390     struct pid *pid;
0391 
0392     rcu_read_lock();
0393     pid = pid_for_clock(new_timer->it_clock, false);
0394     if (!pid) {
0395         rcu_read_unlock();
0396         return -EINVAL;
0397     }
0398 
0399     /*
0400      * If posix timer expiry is handled in task work context then
0401      * timer::it_lock can be taken without disabling interrupts as all
0402      * other locking happens in task context. This requires a separate
0403      * lock class key otherwise regular posix timer expiry would record
0404      * the lock class being taken in interrupt context and generate a
0405      * false positive warning.
0406      */
0407     if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
0408         lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
0409 
0410     new_timer->kclock = &clock_posix_cpu;
0411     timerqueue_init(&new_timer->it.cpu.node);
0412     new_timer->it.cpu.pid = get_pid(pid);
0413     rcu_read_unlock();
0414     return 0;
0415 }
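/*
 * Illustrative user-space sketch (not part of this file): a CPU-time timer as
 * created here typically comes from timer_create() on one of the CPU clocks.
 * SIGRTMIN delivery is just one possible choice; link with -lrt on older glibc:
 *
 *    #include <signal.h>
 *    #include <stdio.h>
 *    #include <time.h>
 *
 *    int main(void)
 *    {
 *        struct sigevent sev = {
 *            .sigev_notify = SIGEV_SIGNAL,
 *            .sigev_signo  = SIGRTMIN,
 *        };
 *        timer_t tid;
 *
 *        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid)) {
 *            perror("timer_create");
 *            return 1;
 *        }
 *        // ... arm it with timer_settime(), see the sketch further below ...
 *        timer_delete(tid);
 *        return 0;
 *    }
 */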
0416 
0417 static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
0418                           struct task_struct *tsk)
0419 {
0420     int clkidx = CPUCLOCK_WHICH(timer->it_clock);
0421 
0422     if (CPUCLOCK_PERTHREAD(timer->it_clock))
0423         return tsk->posix_cputimers.bases + clkidx;
0424     else
0425         return tsk->signal->posix_cputimers.bases + clkidx;
0426 }
0427 
0428 /*
0429  * Force recalculating the base earliest expiration on the next tick.
0430  * This will also re-evaluate the need to keep around the process wide
0431  * cputime counter and tick dependency and eventually shut these down
0432  * if necessary.
0433  */
0434 static void trigger_base_recalc_expires(struct k_itimer *timer,
0435                     struct task_struct *tsk)
0436 {
0437     struct posix_cputimer_base *base = timer_base(timer, tsk);
0438 
0439     base->nextevt = 0;
0440 }
0441 
0442 /*
0443  * Dequeue the timer and reset the base if it was its earliest expiration.
0444  * It makes sure the next tick recalculates the base next expiration so we
0445  * don't keep the costly process wide cputime counter around for a random
0446  * amount of time, along with the tick dependency.
0447  *
0448  * If another timer gets queued between this and the next tick, its
0449  * expiration will update the base next event if necessary on the next
0450  * tick.
0451  */
0452 static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
0453 {
0454     struct cpu_timer *ctmr = &timer->it.cpu;
0455     struct posix_cputimer_base *base;
0456 
0457     if (!cpu_timer_dequeue(ctmr))
0458         return;
0459 
0460     base = timer_base(timer, p);
0461     if (cpu_timer_getexpires(ctmr) == base->nextevt)
0462         trigger_base_recalc_expires(timer, p);
0463 }
0464 
0465 
0466 /*
0467  * Clean up a CPU-clock timer that is about to be destroyed.
0468  * This is called from timer deletion with the timer already locked.
0469  * If we return TIMER_RETRY, it's necessary to release the timer's lock
0470  * and try again.  (This happens when the timer is in the middle of firing.)
0471  */
0472 static int posix_cpu_timer_del(struct k_itimer *timer)
0473 {
0474     struct cpu_timer *ctmr = &timer->it.cpu;
0475     struct sighand_struct *sighand;
0476     struct task_struct *p;
0477     unsigned long flags;
0478     int ret = 0;
0479 
0480     rcu_read_lock();
0481     p = cpu_timer_task_rcu(timer);
0482     if (!p)
0483         goto out;
0484 
0485     /*
0486      * Protect against sighand release/switch in exit/exec and process/
0487      * thread timer list entry concurrent read/writes.
0488      */
0489     sighand = lock_task_sighand(p, &flags);
0490     if (unlikely(sighand == NULL)) {
0491         /*
0492          * This raced with the reaping of the task. The exit cleanup
0493          * should have removed this timer from the timer queue.
0494          */
0495         WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
0496     } else {
0497         if (timer->it.cpu.firing)
0498             ret = TIMER_RETRY;
0499         else
0500             disarm_timer(timer, p);
0501 
0502         unlock_task_sighand(p, &flags);
0503     }
0504 
0505 out:
0506     rcu_read_unlock();
0507     if (!ret)
0508         put_pid(ctmr->pid);
0509 
0510     return ret;
0511 }
0512 
0513 static void cleanup_timerqueue(struct timerqueue_head *head)
0514 {
0515     struct timerqueue_node *node;
0516     struct cpu_timer *ctmr;
0517 
0518     while ((node = timerqueue_getnext(head))) {
0519         timerqueue_del(head, node);
0520         ctmr = container_of(node, struct cpu_timer, node);
0521         ctmr->head = NULL;
0522     }
0523 }
0524 
0525 /*
0526  * Clean out CPU timers which are still armed when a thread exits. The
0527  * timers are only removed from the list. No other updates are done. The
0528  * corresponding posix timers are still accessible, but cannot be rearmed.
0529  *
0530  * This must be called with the siglock held.
0531  */
0532 static void cleanup_timers(struct posix_cputimers *pct)
0533 {
0534     cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
0535     cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
0536     cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
0537 }
0538 
0539 /*
0540  * These are both called with the siglock held, when the current thread
0541  * is being reaped.  When the final (leader) thread in the group is reaped,
0542  * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
0543  */
0544 void posix_cpu_timers_exit(struct task_struct *tsk)
0545 {
0546     cleanup_timers(&tsk->posix_cputimers);
0547 }
0548 void posix_cpu_timers_exit_group(struct task_struct *tsk)
0549 {
0550     cleanup_timers(&tsk->signal->posix_cputimers);
0551 }
0552 
0553 /*
0554  * Insert the timer on the appropriate list before any timers that
0555  * expire later.  This must be called with the sighand lock held.
0556  */
0557 static void arm_timer(struct k_itimer *timer, struct task_struct *p)
0558 {
0559     struct posix_cputimer_base *base = timer_base(timer, p);
0560     struct cpu_timer *ctmr = &timer->it.cpu;
0561     u64 newexp = cpu_timer_getexpires(ctmr);
0562 
0563     if (!cpu_timer_enqueue(&base->tqhead, ctmr))
0564         return;
0565 
0566     /*
0567      * We are the new earliest-expiring POSIX 1.b timer, hence
0568      * need to update expiration cache. Take into account that
0569      * for process timers we share expiration cache with itimers
0570      * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
0571      */
0572     if (newexp < base->nextevt)
0573         base->nextevt = newexp;
0574 
0575     if (CPUCLOCK_PERTHREAD(timer->it_clock))
0576         tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
0577     else
0578         tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
0579 }
0580 
0581 /*
0582  * The timer is locked, fire it and arrange for its reload.
0583  */
0584 static void cpu_timer_fire(struct k_itimer *timer)
0585 {
0586     struct cpu_timer *ctmr = &timer->it.cpu;
0587 
0588     if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
0589         /*
0590          * The user doesn't want any signal.
0591          */
0592         cpu_timer_setexpires(ctmr, 0);
0593     } else if (unlikely(timer->sigq == NULL)) {
0594         /*
0595          * This is a special case for clock_nanosleep,
0596          * not a normal timer from sys_timer_create.
0597          */
0598         wake_up_process(timer->it_process);
0599         cpu_timer_setexpires(ctmr, 0);
0600     } else if (!timer->it_interval) {
0601         /*
0602          * One-shot timer.  Clear it as soon as it's fired.
0603          */
0604         posix_timer_event(timer, 0);
0605         cpu_timer_setexpires(ctmr, 0);
0606     } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
0607         /*
0608          * The signal did not get queued because the signal
0609          * was ignored, so we won't get any callback to
0610          * reload the timer.  But we need to keep it
0611          * ticking in case the signal is deliverable next time.
0612          */
0613         posix_cpu_timer_rearm(timer);
0614         ++timer->it_requeue_pending;
0615     }
0616 }
0617 
0618 /*
0619  * Guts of sys_timer_settime for CPU timers.
0620  * This is called with the timer locked and interrupts disabled.
0621  * If we return TIMER_RETRY, it's necessary to release the timer's lock
0622  * and try again.  (This happens when the timer is in the middle of firing.)
0623  */
0624 static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
0625                    struct itimerspec64 *new, struct itimerspec64 *old)
0626 {
0627     clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
0628     u64 old_expires, new_expires, old_incr, val;
0629     struct cpu_timer *ctmr = &timer->it.cpu;
0630     struct sighand_struct *sighand;
0631     struct task_struct *p;
0632     unsigned long flags;
0633     int ret = 0;
0634 
0635     rcu_read_lock();
0636     p = cpu_timer_task_rcu(timer);
0637     if (!p) {
0638         /*
0639          * If p has just been reaped, we can no
0640          * longer get any information about it at all.
0641          */
0642         rcu_read_unlock();
0643         return -ESRCH;
0644     }
0645 
0646     /*
0647      * Use the to_ktime conversion because that clamps the maximum
0648      * value to KTIME_MAX and avoids multiplication overflows.
0649      */
0650     new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));
0651 
0652     /*
0653      * Protect against sighand release/switch in exit/exec and p->cpu_timers
0654      * and p->signal->cpu_timers read/write in arm_timer()
0655      */
0656     sighand = lock_task_sighand(p, &flags);
0657     /*
0658      * If p has just been reaped, we can no
0659      * longer get any information about it at all.
0660      */
0661     if (unlikely(sighand == NULL)) {
0662         rcu_read_unlock();
0663         return -ESRCH;
0664     }
0665 
0666     /*
0667      * Disarm any old timer after extracting its expiry time.
0668      */
0669     old_incr = timer->it_interval;
0670     old_expires = cpu_timer_getexpires(ctmr);
0671 
0672     if (unlikely(timer->it.cpu.firing)) {
0673         timer->it.cpu.firing = -1;
0674         ret = TIMER_RETRY;
0675     } else {
0676         cpu_timer_dequeue(ctmr);
0677     }
0678 
0679     /*
0680      * We need to sample the current clock value to convert the new
0681      * value from relative to absolute, and to convert the
0682      * old value from absolute to relative.  To set a process
0683      * timer, we need a sample to balance the thread expiry
0684      * times (in arm_timer).  With an absolute time, we must
0685      * check if it's already passed.  In short, we need a sample.
0686      */
0687     if (CPUCLOCK_PERTHREAD(timer->it_clock))
0688         val = cpu_clock_sample(clkid, p);
0689     else
0690         val = cpu_clock_sample_group(clkid, p, true);
0691 
0692     if (old) {
0693         if (old_expires == 0) {
0694             old->it_value.tv_sec = 0;
0695             old->it_value.tv_nsec = 0;
0696         } else {
0697             /*
0698              * Update the timer in case it has overrun already.
0699              * If it has, we'll report it as having overrun and
0700              * with the next reloaded timer already ticking,
0701              * though we are swallowing that pending
0702              * notification here to install the new setting.
0703              */
0704             u64 exp = bump_cpu_timer(timer, val);
0705 
0706             if (val < exp) {
0707                 old_expires = exp - val;
0708                 old->it_value = ns_to_timespec64(old_expires);
0709             } else {
0710                 old->it_value.tv_nsec = 1;
0711                 old->it_value.tv_sec = 0;
0712             }
0713         }
0714     }
0715 
0716     if (unlikely(ret)) {
0717         /*
0718          * We are colliding with the timer actually firing.
0719          * Punt after filling in the timer's old value, and
0720          * disable this firing since we are already reporting
0721          * it as an overrun (thanks to bump_cpu_timer above).
0722          */
0723         unlock_task_sighand(p, &flags);
0724         goto out;
0725     }
0726 
0727     if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
0728         new_expires += val;
0729     }
0730 
0731     /*
0732      * Install the new expiry time (or zero).
0733      * For a timer with no notification action, we don't actually
0734      * arm the timer (we'll just fake it for timer_gettime).
0735      */
0736     cpu_timer_setexpires(ctmr, new_expires);
0737     if (new_expires != 0 && val < new_expires) {
0738         arm_timer(timer, p);
0739     }
0740 
0741     unlock_task_sighand(p, &flags);
0742     /*
0743      * Install the new reload setting, and
0744      * set up the signal and overrun bookkeeping.
0745      */
0746     timer->it_interval = timespec64_to_ktime(new->it_interval);
0747 
0748     /*
0749      * This acts as a modification timestamp for the timer,
0750      * so any automatic reload attempt will punt on seeing
0751      * that we have reset the timer manually.
0752      */
0753     timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
0754         ~REQUEUE_PENDING;
0755     timer->it_overrun_last = 0;
0756     timer->it_overrun = -1;
0757 
0758     if (val >= new_expires) {
0759         if (new_expires != 0) {
0760             /*
0761              * The designated time already passed, so we notify
0762              * immediately, even if the thread never runs to
0763              * accumulate more time on this clock.
0764              */
0765             cpu_timer_fire(timer);
0766         }
0767 
0768         /*
0769          * Make sure we don't keep around the process wide cputime
0770          * counter or the tick dependency if they are not necessary.
0771          */
0772         sighand = lock_task_sighand(p, &flags);
0773         if (!sighand)
0774             goto out;
0775 
0776         if (!cpu_timer_queued(ctmr))
0777             trigger_base_recalc_expires(timer, p);
0778 
0779         unlock_task_sighand(p, &flags);
0780     }
0781  out:
0782     rcu_read_unlock();
0783     if (old)
0784         old->it_interval = ns_to_timespec64(old_incr);
0785 
0786     return ret;
0787 }
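/*
 * Illustrative user-space sketch (not part of this file): timer_settime() is
 * what ends up here, and timer_gettime() in posix_cpu_timer_get() below. The
 * 2 s / 1 s values are arbitrary; 'tid' is assumed to have been created on a
 * CPU clock as in the timer_create() sketch above:
 *
 *    #include <stdio.h>
 *    #include <time.h>
 *
 *    static int arm_cpu_timer(timer_t tid)
 *    {
 *        struct itimerspec its = {
 *            .it_value    = { .tv_sec = 2 },  // first expiry: 2 s of CPU time
 *            .it_interval = { .tv_sec = 1 },  // then every additional second
 *        };
 *        struct itimerspec cur;
 *
 *        if (timer_settime(tid, 0, &its, NULL))
 *            return -1;
 *        if (timer_gettime(tid, &cur))
 *            return -1;
 *        printf("remaining: %lld.%09ld\n",
 *               (long long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
 *        return 0;
 *    }
 */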
0788 
0789 static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
0790 {
0791     clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
0792     struct cpu_timer *ctmr = &timer->it.cpu;
0793     u64 now, expires = cpu_timer_getexpires(ctmr);
0794     struct task_struct *p;
0795 
0796     rcu_read_lock();
0797     p = cpu_timer_task_rcu(timer);
0798     if (!p)
0799         goto out;
0800 
0801     /*
0802      * Easy part: convert the reload time.
0803      */
0804     itp->it_interval = ktime_to_timespec64(timer->it_interval);
0805 
0806     if (!expires)
0807         goto out;
0808 
0809     /*
0810      * Sample the clock to take the difference with the expiry time.
0811      */
0812     if (CPUCLOCK_PERTHREAD(timer->it_clock))
0813         now = cpu_clock_sample(clkid, p);
0814     else
0815         now = cpu_clock_sample_group(clkid, p, false);
0816 
0817     if (now < expires) {
0818         itp->it_value = ns_to_timespec64(expires - now);
0819     } else {
0820         /*
0821          * The timer should have expired already, but the firing
0822          * hasn't taken place yet.  Say it's just about to expire.
0823          */
0824         itp->it_value.tv_nsec = 1;
0825         itp->it_value.tv_sec = 0;
0826     }
0827 out:
0828     rcu_read_unlock();
0829 }
0830 
0831 #define MAX_COLLECTED   20
0832 
0833 static u64 collect_timerqueue(struct timerqueue_head *head,
0834                   struct list_head *firing, u64 now)
0835 {
0836     struct timerqueue_node *next;
0837     int i = 0;
0838 
0839     while ((next = timerqueue_getnext(head))) {
0840         struct cpu_timer *ctmr;
0841         u64 expires;
0842 
0843         ctmr = container_of(next, struct cpu_timer, node);
0844         expires = cpu_timer_getexpires(ctmr);
0845         /* Limit the number of timers to expire at once */
0846         if (++i == MAX_COLLECTED || now < expires)
0847             return expires;
0848 
0849         ctmr->firing = 1;
0850         cpu_timer_dequeue(ctmr);
0851         list_add_tail(&ctmr->elist, firing);
0852     }
0853 
0854     return U64_MAX;
0855 }
0856 
0857 static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
0858                     struct list_head *firing)
0859 {
0860     struct posix_cputimer_base *base = pct->bases;
0861     int i;
0862 
0863     for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
0864         base->nextevt = collect_timerqueue(&base->tqhead, firing,
0865                             samples[i]);
0866     }
0867 }
0868 
0869 static inline void check_dl_overrun(struct task_struct *tsk)
0870 {
0871     if (tsk->dl.dl_overrun) {
0872         tsk->dl.dl_overrun = 0;
0873         send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
0874     }
0875 }
0876 
0877 static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
0878 {
0879     if (time < limit)
0880         return false;
0881 
0882     if (print_fatal_signals) {
0883         pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
0884             rt ? "RT" : "CPU", hard ? "hard" : "soft",
0885             current->comm, task_pid_nr(current));
0886     }
0887     send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
0888     return true;
0889 }
0890 
0891 /*
0892  * Check for any per-thread CPU timers that have fired and move them off
0893  * the task's timer queues onto the firing list.  Here we update the
0894  * expiry cache to reflect the remaining thread CPU timers.
0895  */
0896 static void check_thread_timers(struct task_struct *tsk,
0897                 struct list_head *firing)
0898 {
0899     struct posix_cputimers *pct = &tsk->posix_cputimers;
0900     u64 samples[CPUCLOCK_MAX];
0901     unsigned long soft;
0902 
0903     if (dl_task(tsk))
0904         check_dl_overrun(tsk);
0905 
0906     if (expiry_cache_is_inactive(pct))
0907         return;
0908 
0909     task_sample_cputime(tsk, samples);
0910     collect_posix_cputimers(pct, samples, firing);
0911 
0912     /*
0913      * Check for the special case thread timers.
0914      */
0915     soft = task_rlimit(tsk, RLIMIT_RTTIME);
0916     if (soft != RLIM_INFINITY) {
0917         /* Task RT timeout is accounted in jiffies. RTTIME is usec */
0918         unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
0919         unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
0920 
0921         /* At the hard limit, send SIGKILL. No further action. */
0922         if (hard != RLIM_INFINITY &&
0923             check_rlimit(rttime, hard, SIGKILL, true, true))
0924             return;
0925 
0926         /* At the soft limit, send a SIGXCPU every second */
0927         if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
0928             soft += USEC_PER_SEC;
0929             tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
0930         }
0931     }
0932 
0933     if (expiry_cache_is_inactive(pct))
0934         tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
0935 }
0936 
0937 static inline void stop_process_timers(struct signal_struct *sig)
0938 {
0939     struct posix_cputimers *pct = &sig->posix_cputimers;
0940 
0941     /* Turn off the active flag. This is done without locking. */
0942     WRITE_ONCE(pct->timers_active, false);
0943     tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
0944 }
0945 
0946 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
0947                  u64 *expires, u64 cur_time, int signo)
0948 {
0949     if (!it->expires)
0950         return;
0951 
0952     if (cur_time >= it->expires) {
0953         if (it->incr)
0954             it->expires += it->incr;
0955         else
0956             it->expires = 0;
0957 
0958         trace_itimer_expire(signo == SIGPROF ?
0959                     ITIMER_PROF : ITIMER_VIRTUAL,
0960                     task_tgid(tsk), cur_time);
0961         send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
0962     }
0963 
0964     if (it->expires && it->expires < *expires)
0965         *expires = it->expires;
0966 }
0967 
0968 /*
0969  * Check for any process-wide (thread group) CPU timers that have fired
0970  * and move them off the timer queues onto the firing list.  Per-thread
0971  * timers have already been taken off.
0972  */
0973 static void check_process_timers(struct task_struct *tsk,
0974                  struct list_head *firing)
0975 {
0976     struct signal_struct *const sig = tsk->signal;
0977     struct posix_cputimers *pct = &sig->posix_cputimers;
0978     u64 samples[CPUCLOCK_MAX];
0979     unsigned long soft;
0980 
0981     /*
0982      * If there are no active process wide timers (POSIX 1.b, itimers,
0983      * RLIMIT_CPU) nothing to check. Also skip the process wide timer
0984      * processing when there is already another task handling them.
0985      */
0986     if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
0987         return;
0988 
0989     /*
0990      * Signify that a thread is checking for process timers.
0991      * Write access to this field is protected by the sighand lock.
0992      */
0993     pct->expiry_active = true;
0994 
0995     /*
0996      * Collect the current process totals. Group accounting is active
0997      * so the sample can be taken directly.
0998      */
0999     proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
1000     collect_posix_cputimers(pct, samples, firing);
1001 
1002     /*
1003      * Check for the special case process timers.
1004      */
1005     check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
1006              &pct->bases[CPUCLOCK_PROF].nextevt,
1007              samples[CPUCLOCK_PROF], SIGPROF);
1008     check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
1009              &pct->bases[CPUCLOCK_VIRT].nextevt,
1010              samples[CPUCLOCK_VIRT], SIGVTALRM);
1011 
1012     soft = task_rlimit(tsk, RLIMIT_CPU);
1013     if (soft != RLIM_INFINITY) {
1014         /* RLIMIT_CPU is in seconds. Samples are nanoseconds */
1015         unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
1016         u64 ptime = samples[CPUCLOCK_PROF];
1017         u64 softns = (u64)soft * NSEC_PER_SEC;
1018         u64 hardns = (u64)hard * NSEC_PER_SEC;
1019 
1020         /* At the hard limit, send SIGKILL. No further action. */
1021         if (hard != RLIM_INFINITY &&
1022             check_rlimit(ptime, hardns, SIGKILL, false, true))
1023             return;
1024 
1025         /* At the soft limit, send a SIGXCPU every second */
1026         if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
1027             sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
1028             softns += NSEC_PER_SEC;
1029         }
1030 
1031         /* Update the expiry cache */
1032         if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
1033             pct->bases[CPUCLOCK_PROF].nextevt = softns;
1034     }
1035 
1036     if (expiry_cache_is_inactive(pct))
1037         stop_process_timers(sig);
1038 
1039     pct->expiry_active = false;
1040 }
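/*
 * Illustrative user-space sketch (not part of this file): the RLIMIT_CPU
 * handling above is what produces a SIGXCPU once per second past the soft
 * limit and a SIGKILL at the hard limit. The 2 s / 4 s limits are arbitrary:
 *
 *    #include <signal.h>
 *    #include <stdio.h>
 *    #include <sys/resource.h>
 *    #include <unistd.h>
 *
 *    static void on_xcpu(int sig)
 *    {
 *        // async-signal-safe notification
 *        write(STDERR_FILENO, "soft RLIMIT_CPU hit\n", 20);
 *    }
 *
 *    int main(void)
 *    {
 *        struct rlimit rl = { .rlim_cur = 2, .rlim_max = 4 };
 *
 *        signal(SIGXCPU, on_xcpu);
 *        setrlimit(RLIMIT_CPU, &rl);
 *        for (;;)
 *            ;    // burn CPU: SIGXCPU at ~2 s, SIGKILL at ~4 s
 *    }
 */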
1041 
1042 /*
1043  * This is called from the signal code (via posixtimer_rearm)
1044  * when the last timer signal was delivered and we have to reload the timer.
1045  */
1046 static void posix_cpu_timer_rearm(struct k_itimer *timer)
1047 {
1048     clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
1049     struct task_struct *p;
1050     struct sighand_struct *sighand;
1051     unsigned long flags;
1052     u64 now;
1053 
1054     rcu_read_lock();
1055     p = cpu_timer_task_rcu(timer);
1056     if (!p)
1057         goto out;
1058 
1059     /* Protect timer list r/w in arm_timer() */
1060     sighand = lock_task_sighand(p, &flags);
1061     if (unlikely(sighand == NULL))
1062         goto out;
1063 
1064     /*
1065      * Fetch the current sample and update the timer's expiry time.
1066      */
1067     if (CPUCLOCK_PERTHREAD(timer->it_clock))
1068         now = cpu_clock_sample(clkid, p);
1069     else
1070         now = cpu_clock_sample_group(clkid, p, true);
1071 
1072     bump_cpu_timer(timer, now);
1073 
1074     /*
1075      * Now re-arm for the new expiry time.
1076      */
1077     arm_timer(timer, p);
1078     unlock_task_sighand(p, &flags);
1079 out:
1080     rcu_read_unlock();
1081 }
1082 
1083 /**
1084  * task_cputimers_expired - Check whether posix CPU timers are expired
1085  *
1086  * @samples:    Array of current samples for the CPUCLOCK clocks
1087  * @pct:    Pointer to a posix_cputimers container
1088  *
1089  * Returns true if any member of @samples is greater than or equal to the
1090  * corresponding member of @pct->bases[CLK].nextevt. False otherwise.
1091  */
1092 static inline bool
1093 task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
1094 {
1095     int i;
1096 
1097     for (i = 0; i < CPUCLOCK_MAX; i++) {
1098         if (samples[i] >= pct->bases[i].nextevt)
1099             return true;
1100     }
1101     return false;
1102 }
1103 
1104 /**
1105  * fastpath_timer_check - POSIX CPU timers fast path.
1106  *
1107  * @tsk:    The task (thread) being checked.
1108  *
1109  * Check the task and thread group timers.  If both are zero (there are no
1110  * timers set) return false.  Otherwise snapshot the task and thread group
1111  * timers and compare them with the corresponding expiration times.  Return
1112  * true if a timer has expired, else return false.
1113  */
1114 static inline bool fastpath_timer_check(struct task_struct *tsk)
1115 {
1116     struct posix_cputimers *pct = &tsk->posix_cputimers;
1117     struct signal_struct *sig;
1118 
1119     if (!expiry_cache_is_inactive(pct)) {
1120         u64 samples[CPUCLOCK_MAX];
1121 
1122         task_sample_cputime(tsk, samples);
1123         if (task_cputimers_expired(samples, pct))
1124             return true;
1125     }
1126 
1127     sig = tsk->signal;
1128     pct = &sig->posix_cputimers;
1129     /*
1130      * Check if thread group timers expired when timers are active and
1131      * no other thread in the group is already handling expiry for
1132      * thread group cputimers. These fields are read without the
1133      * sighand lock. However, this is fine because this is meant to be
1134      * a fastpath heuristic to determine whether we should try to
1135      * acquire the sighand lock to handle timer expiry.
1136      *
1137      * In the worst case scenario, if concurrently timers_active is set
1138      * or expiry_active is cleared, but the current thread doesn't see
1139      * the change yet, the timer checks are delayed until the next
1140      * thread in the group gets a scheduler interrupt to handle the
1141      * timer. This isn't an issue in practice because these types of
1142      * delays with signals actually getting sent are expected.
1143      */
1144     if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
1145         u64 samples[CPUCLOCK_MAX];
1146 
1147         proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
1148                        samples);
1149 
1150         if (task_cputimers_expired(samples, pct))
1151             return true;
1152     }
1153 
1154     if (dl_task(tsk) && tsk->dl.dl_overrun)
1155         return true;
1156 
1157     return false;
1158 }
1159 
1160 static void handle_posix_cpu_timers(struct task_struct *tsk);
1161 
1162 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1163 static void posix_cpu_timers_work(struct callback_head *work)
1164 {
1165     handle_posix_cpu_timers(current);
1166 }
1167 
1168 /*
1169  * Clear existing posix CPU timers task work.
1170  */
1171 void clear_posix_cputimers_work(struct task_struct *p)
1172 {
1173     /*
1174      * A copied work entry from the old task is not meaningful, clear it.
1175      * N.B. init_task_work will not do this.
1176      */
1177     memset(&p->posix_cputimers_work.work, 0,
1178            sizeof(p->posix_cputimers_work.work));
1179     init_task_work(&p->posix_cputimers_work.work,
1180                posix_cpu_timers_work);
1181     p->posix_cputimers_work.scheduled = false;
1182 }
1183 
1184 /*
1185  * Initialize posix CPU timers task work in init task. Out of line to
1186  * keep the callback static and to avoid header recursion hell.
1187  */
1188 void __init posix_cputimers_init_work(void)
1189 {
1190     clear_posix_cputimers_work(current);
1191 }
1192 
1193 /*
1194  * Note: All operations on tsk->posix_cputimers_work.scheduled happen either
1195  * in hard interrupt context or in task context with interrupts
1196  * disabled. Aside from that, the writer/reader interaction is always in
1197  * the context of the current task, which means they are strictly per CPU.
1198  */
1199 static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
1200 {
1201     return tsk->posix_cputimers_work.scheduled;
1202 }
1203 
1204 static inline void __run_posix_cpu_timers(struct task_struct *tsk)
1205 {
1206     if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
1207         return;
1208 
1209     /* Schedule task work to actually expire the timers */
1210     tsk->posix_cputimers_work.scheduled = true;
1211     task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
1212 }
1213 
1214 static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
1215                         unsigned long start)
1216 {
1217     bool ret = true;
1218 
1219     /*
1220      * On !RT kernels interrupts are disabled while collecting expired
1221      * timers, so no tick can happen and the fast path check can be
1222      * reenabled without further checks.
1223      */
1224     if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
1225         tsk->posix_cputimers_work.scheduled = false;
1226         return true;
1227     }
1228 
1229     /*
1230      * On RT enabled kernels ticks can happen while the expired timers
1231      * are collected under sighand lock. But any tick which observes
1232      * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
1233      * checks. So reenabling the tick work has to be done carefully:
1234      *
1235      * Disable interrupts and run the fast path check if jiffies have
1236      * advanced since the collecting of expired timers started. If
1237      * jiffies have not advanced or the fast path check did not find
1238      * newly expired timers, reenable the fast path check in the timer
1239      * interrupt. If there are newly expired timers, return false and
1240      * let the collection loop repeat.
1241      */
1242     local_irq_disable();
1243     if (start != jiffies && fastpath_timer_check(tsk))
1244         ret = false;
1245     else
1246         tsk->posix_cputimers_work.scheduled = false;
1247     local_irq_enable();
1248 
1249     return ret;
1250 }
1251 #else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1252 static inline void __run_posix_cpu_timers(struct task_struct *tsk)
1253 {
1254     lockdep_posixtimer_enter();
1255     handle_posix_cpu_timers(tsk);
1256     lockdep_posixtimer_exit();
1257 }
1258 
1259 static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
1260 {
1261     return false;
1262 }
1263 
1264 static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
1265                         unsigned long start)
1266 {
1267     return true;
1268 }
1269 #endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1270 
1271 static void handle_posix_cpu_timers(struct task_struct *tsk)
1272 {
1273     struct k_itimer *timer, *next;
1274     unsigned long flags, start;
1275     LIST_HEAD(firing);
1276 
1277     if (!lock_task_sighand(tsk, &flags))
1278         return;
1279 
1280     do {
1281         /*
1282          * On RT, locking the sighand lock does not disable interrupts,
1283          * so this needs to be careful vs. ticks. Store the current
1284          * jiffies value.
1285          */
1286         start = READ_ONCE(jiffies);
1287         barrier();
1288 
1289         /*
1290          * Here we take all the timers that are firing off the
1291          * per-thread and process-wide timer queues and put them
1292          * on the firing list.
1293          */
1294         check_thread_timers(tsk, &firing);
1295 
1296         check_process_timers(tsk, &firing);
1297 
1298         /*
1299          * The above timer checks have updated the expiry cache and
1300          * because nothing can have queued or modified timers after
1301          * sighand lock was taken above it is guaranteed to be
1302          * consistent. So the next timer interrupt fastpath check
1303          * will find valid data.
1304          *
1305          * If timer expiry runs in the timer interrupt context then
1306          * the loop is not relevant as timers will be directly
1307          * expired in interrupt context. The stub function below
1308          * always returns true, which allows the compiler to
1309          * optimize the loop out.
1310          *
1311          * If timer expiry is deferred to task work context then
1312          * the following rules apply:
1313          *
1314          * - On !RT kernels no tick can have happened on this CPU
1315          *   after sighand lock was acquired because interrupts are
1316          *   disabled. So reenabling task work before dropping
1317          *   sighand lock and reenabling interrupts is race free.
1318          *
1319          * - On RT kernels ticks might have happened but the tick
1320          *   work ignored posix CPU timer handling because the
1321          *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
1322          *   must be done very carefully including a check whether
1323          *   ticks have happened since the start of the timer
1324          *   expiry checks. posix_cpu_timers_enable_work() takes
1325          *   care of that and eventually lets the expiry checks
1326          *   run again.
1327          */
1328     } while (!posix_cpu_timers_enable_work(tsk, start));
1329 
1330     /*
1331      * We must release sighand lock before taking any timer's lock.
1332      * There is a potential race with timer deletion here, as the
1333      * siglock now protects our private firing list.  We have set
1334      * the firing flag in each timer, so that a deletion attempt
1335      * that gets the timer lock before we do will give it up and
1336      * spin until we've taken care of that timer below.
1337      */
1338     unlock_task_sighand(tsk, &flags);
1339 
1340     /*
1341      * Now that all the timers on our list have the firing flag,
1342      * no one will touch their list entries but us.  We'll take
1343      * each timer's lock before clearing its firing flag, so no
1344      * timer call will interfere.
1345      */
1346     list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
1347         int cpu_firing;
1348 
1349         /*
1350          * spin_lock() is sufficient here even independent of the
1351          * expiry context. If expiry happens in hard interrupt
1352          * context it's obvious. For task work context it's safe
1353          * because all other operations on timer::it_lock happen in
1354          * task context (syscall or exit).
1355          */
1356         spin_lock(&timer->it_lock);
1357         list_del_init(&timer->it.cpu.elist);
1358         cpu_firing = timer->it.cpu.firing;
1359         timer->it.cpu.firing = 0;
1360         /*
1361          * The firing flag is -1 if we collided with a reset
1362          * of the timer, which already reported this
1363          * almost-firing as an overrun.  So don't generate an event.
1364          */
1365         if (likely(cpu_firing >= 0))
1366             cpu_timer_fire(timer);
1367         spin_unlock(&timer->it_lock);
1368     }
1369 }
1370 
1371 /*
1372  * This is called from the timer interrupt handler.  The irq handler has
1373  * already updated our counts.  We need to check if any timers fire now.
1374  * Interrupts are disabled.
1375  */
1376 void run_posix_cpu_timers(void)
1377 {
1378     struct task_struct *tsk = current;
1379 
1380     lockdep_assert_irqs_disabled();
1381 
1382     /*
1383      * If the actual expiry is deferred to task work context and the
1384      * work is already scheduled there is no point to do anything here.
1385      */
1386     if (posix_cpu_timers_work_scheduled(tsk))
1387         return;
1388 
1389     /*
1390      * The fast path checks that there are no expired thread or thread
1391      * group timers.  If that's so, just return.
1392      */
1393     if (!fastpath_timer_check(tsk))
1394         return;
1395 
1396     __run_posix_cpu_timers(tsk);
1397 }
1398 
1399 /*
1400  * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1401  * The tsk->sighand->siglock must be held by the caller.
1402  */
1403 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
1404                u64 *newval, u64 *oldval)
1405 {
1406     u64 now, *nextevt;
1407 
1408     if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
1409         return;
1410 
1411     nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
1412     now = cpu_clock_sample_group(clkid, tsk, true);
1413 
1414     if (oldval) {
1415         /*
1416          * We are setting an itimer. The *oldval is absolute and we update
1417          * it to be relative; the *newval argument is relative and we
1418          * update it to be absolute.
1419          */
1420         if (*oldval) {
1421             if (*oldval <= now) {
1422                 /* Just about to fire. */
1423                 *oldval = TICK_NSEC;
1424             } else {
1425                 *oldval -= now;
1426             }
1427         }
1428 
1429         if (*newval)
1430             *newval += now;
1431     }
1432 
1433     /*
1434      * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF
1435      * expiry cache is also used by RLIMIT_CPU!
1436      */
1437     if (*newval < *nextevt)
1438         *nextevt = *newval;
1439 
1440     tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
1441 }
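/*
 * Illustrative user-space sketch (not part of this file): the itimer path
 * (setitimer(ITIMER_PROF) / ITIMER_VIRTUAL) is one caller of
 * set_process_cpu_timer(), as the "setting an itimer" comment above suggests.
 * The 500 ms period is arbitrary:
 *
 *    #include <signal.h>
 *    #include <stdio.h>
 *    #include <sys/time.h>
 *
 *    static volatile sig_atomic_t ticks;
 *
 *    static void on_prof(int sig)
 *    {
 *        ticks++;
 *    }
 *
 *    int main(void)
 *    {
 *        struct itimerval itv = {
 *            .it_value    = { .tv_sec = 0, .tv_usec = 500000 },
 *            .it_interval = { .tv_sec = 0, .tv_usec = 500000 },
 *        };
 *
 *        signal(SIGPROF, on_prof);
 *        setitimer(ITIMER_PROF, &itv, NULL);
 *        while (ticks < 4)
 *            ;    // busy wait so CPU time actually accrues
 *        printf("got %d SIGPROF ticks\n", (int)ticks);
 *        return 0;
 *    }
 */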
1442 
1443 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1444                 const struct timespec64 *rqtp)
1445 {
1446     struct itimerspec64 it;
1447     struct k_itimer timer;
1448     u64 expires;
1449     int error;
1450 
1451     /*
1452      * Set up a temporary timer and then wait for it to go off.
1453      */
1454     memset(&timer, 0, sizeof timer);
1455     spin_lock_init(&timer.it_lock);
1456     timer.it_clock = which_clock;
1457     timer.it_overrun = -1;
1458     error = posix_cpu_timer_create(&timer);
1459     timer.it_process = current;
1460 
1461     if (!error) {
1462         static struct itimerspec64 zero_it;
1463         struct restart_block *restart;
1464 
1465         memset(&it, 0, sizeof(it));
1466         it.it_value = *rqtp;
1467 
1468         spin_lock_irq(&timer.it_lock);
1469         error = posix_cpu_timer_set(&timer, flags, &it, NULL);
1470         if (error) {
1471             spin_unlock_irq(&timer.it_lock);
1472             return error;
1473         }
1474 
1475         while (!signal_pending(current)) {
1476             if (!cpu_timer_getexpires(&timer.it.cpu)) {
1477                 /*
1478                  * Our timer fired and was reset; the
1479                  * deletion below cannot fail.
1480                  */
1481                 posix_cpu_timer_del(&timer);
1482                 spin_unlock_irq(&timer.it_lock);
1483                 return 0;
1484             }
1485 
1486             /*
1487              * Block until cpu_timer_fire (or a signal) wakes us.
1488              */
1489             __set_current_state(TASK_INTERRUPTIBLE);
1490             spin_unlock_irq(&timer.it_lock);
1491             schedule();
1492             spin_lock_irq(&timer.it_lock);
1493         }
1494 
1495         /*
1496          * We were interrupted by a signal.
1497          */
1498         expires = cpu_timer_getexpires(&timer.it.cpu);
1499         error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
1500         if (!error) {
1501             /*
1502              * Timer is now unarmed, deletion cannot fail.
1503              */
1504             posix_cpu_timer_del(&timer);
1505         }
1506         spin_unlock_irq(&timer.it_lock);
1507 
1508         while (error == TIMER_RETRY) {
1509             /*
1510              * We need to handle the case when the timer was or is in
1511              * the middle of firing. In other cases we have already
1512              * freed resources.
1513              */
1514             spin_lock_irq(&timer.it_lock);
1515             error = posix_cpu_timer_del(&timer);
1516             spin_unlock_irq(&timer.it_lock);
1517         }
1518 
1519         if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
1520             /*
1521              * It actually did fire already.
1522              */
1523             return 0;
1524         }
1525 
1526         error = -ERESTART_RESTARTBLOCK;
1527         /*
1528          * Report back to the user the time still remaining.
1529          */
1530         restart = &current->restart_block;
1531         restart->nanosleep.expires = expires;
1532         if (restart->nanosleep.type != TT_NONE)
1533             error = nanosleep_copyout(restart, &it.it_value);
1534     }
1535 
1536     return error;
1537 }
1538 
1539 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1540 
1541 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1542                 const struct timespec64 *rqtp)
1543 {
1544     struct restart_block *restart_block = &current->restart_block;
1545     int error;
1546 
1547     /*
1548      * Diagnose required errors first.
1549      */
1550     if (CPUCLOCK_PERTHREAD(which_clock) &&
1551         (CPUCLOCK_PID(which_clock) == 0 ||
1552          CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1553         return -EINVAL;
1554 
1555     error = do_cpu_nanosleep(which_clock, flags, rqtp);
1556 
1557     if (error == -ERESTART_RESTARTBLOCK) {
1558 
1559         if (flags & TIMER_ABSTIME)
1560             return -ERESTARTNOHAND;
1561 
1562         restart_block->nanosleep.clockid = which_clock;
1563         set_restart_fn(restart_block, posix_cpu_nsleep_restart);
1564     }
1565     return error;
1566 }
1567 
1568 static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1569 {
1570     clockid_t which_clock = restart_block->nanosleep.clockid;
1571     struct timespec64 t;
1572 
1573     t = ns_to_timespec64(restart_block->nanosleep.expires);
1574 
1575     return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
1576 }
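/*
 * Illustrative user-space sketch (not part of this file, compile with
 * -pthread): do_cpu_nanosleep() backs clock_nanosleep() on the process CPU
 * clock (sleeping on the caller's own thread clock is rejected with EINVAL
 * above). The sleep only makes progress while some thread consumes CPU time,
 * hence the busy helper thread:
 *
 *    #include <pthread.h>
 *    #include <stdio.h>
 *    #include <time.h>
 *
 *    static void *burn(void *arg)
 *    {
 *        for (;;)
 *            ;    // keep the process CPU clock advancing
 *        return NULL;
 *    }
 *
 *    int main(void)
 *    {
 *        struct timespec req = { .tv_sec = 1, .tv_nsec = 0 };
 *        pthread_t t;
 *
 *        pthread_create(&t, NULL, burn, NULL);
 *        // Relative sleep in process CPU time, not wall-clock time:
 *        // returns once the process has consumed one more CPU second.
 *        clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &req, NULL);
 *        puts("process consumed 1 more CPU second");
 *        return 0;
 *    }
 */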
1577 
1578 #define PROCESS_CLOCK   make_process_cpuclock(0, CPUCLOCK_SCHED)
1579 #define THREAD_CLOCK    make_thread_cpuclock(0, CPUCLOCK_SCHED)
1580 
1581 static int process_cpu_clock_getres(const clockid_t which_clock,
1582                     struct timespec64 *tp)
1583 {
1584     return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1585 }
1586 static int process_cpu_clock_get(const clockid_t which_clock,
1587                  struct timespec64 *tp)
1588 {
1589     return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1590 }
1591 static int process_cpu_timer_create(struct k_itimer *timer)
1592 {
1593     timer->it_clock = PROCESS_CLOCK;
1594     return posix_cpu_timer_create(timer);
1595 }
1596 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1597                   const struct timespec64 *rqtp)
1598 {
1599     return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
1600 }
1601 static int thread_cpu_clock_getres(const clockid_t which_clock,
1602                    struct timespec64 *tp)
1603 {
1604     return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1605 }
1606 static int thread_cpu_clock_get(const clockid_t which_clock,
1607                 struct timespec64 *tp)
1608 {
1609     return posix_cpu_clock_get(THREAD_CLOCK, tp);
1610 }
1611 static int thread_cpu_timer_create(struct k_itimer *timer)
1612 {
1613     timer->it_clock = THREAD_CLOCK;
1614     return posix_cpu_timer_create(timer);
1615 }
1616 
1617 const struct k_clock clock_posix_cpu = {
1618     .clock_getres       = posix_cpu_clock_getres,
1619     .clock_set      = posix_cpu_clock_set,
1620     .clock_get_timespec = posix_cpu_clock_get,
1621     .timer_create       = posix_cpu_timer_create,
1622     .nsleep         = posix_cpu_nsleep,
1623     .timer_set      = posix_cpu_timer_set,
1624     .timer_del      = posix_cpu_timer_del,
1625     .timer_get      = posix_cpu_timer_get,
1626     .timer_rearm        = posix_cpu_timer_rearm,
1627 };
1628 
1629 const struct k_clock clock_process = {
1630     .clock_getres       = process_cpu_clock_getres,
1631     .clock_get_timespec = process_cpu_clock_get,
1632     .timer_create       = process_cpu_timer_create,
1633     .nsleep         = process_cpu_nsleep,
1634 };
1635 
1636 const struct k_clock clock_thread = {
1637     .clock_getres       = thread_cpu_clock_getres,
1638     .clock_get_timespec = thread_cpu_clock_get,
1639     .timer_create       = thread_cpu_timer_create,
1640 };