kernel/sched/deadline.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Deadline Scheduling Class (SCHED_DEADLINE)
0004  *
0005  * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
0006  *
0007  * Tasks that periodically executes their instances for less than their
0008  * runtime won't miss any of their deadlines.
0009  * Tasks that are not periodic or sporadic or that tries to execute more
0010  * than their reserved bandwidth will be slowed down (and may potentially
0011  * miss some of their deadlines), and won't affect any other task.
0012  *
0013  * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
0014  *                    Juri Lelli <juri.lelli@gmail.com>,
0015  *                    Michael Trimarchi <michael@amarulasolutions.com>,
0016  *                    Fabio Checconi <fchecconi@gmail.com>
0017  */
0018
0019 /*
0020  * Default limits for DL period; on the top end we guard against small util
0021  * tasks still getting ridiculously long effective runtimes, on the bottom end we
0022  * guard against timer DoS.
0023  */
0024 static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
0025 static unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
0026 #ifdef CONFIG_SYSCTL
0027 static struct ctl_table sched_dl_sysctls[] = {
0028     {
0029         .procname       = "sched_deadline_period_max_us",
0030         .data           = &sysctl_sched_dl_period_max,
0031         .maxlen         = sizeof(unsigned int),
0032         .mode           = 0644,
0033         .proc_handler   = proc_douintvec_minmax,
0034         .extra1         = (void *)&sysctl_sched_dl_period_min,
0035     },
0036     {
0037         .procname       = "sched_deadline_period_min_us",
0038         .data           = &sysctl_sched_dl_period_min,
0039         .maxlen         = sizeof(unsigned int),
0040         .mode           = 0644,
0041         .proc_handler   = proc_douintvec_minmax,
0042         .extra2         = (void *)&sysctl_sched_dl_period_max,
0043     },
0044     {}
0045 };
0046
0047 static int __init sched_dl_sysctl_init(void)
0048 {
0049     register_sysctl_init("kernel", sched_dl_sysctls);
0050     return 0;
0051 }
0052 late_initcall(sched_dl_sysctl_init);
0053 #endif
0054
0055 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
0056 {
0057     return container_of(dl_se, struct task_struct, dl);
0058 }
0059
0060 static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
0061 {
0062     return container_of(dl_rq, struct rq, dl);
0063 }
0064
0065 static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
0066 {
0067     struct task_struct *p = dl_task_of(dl_se);
0068     struct rq *rq = task_rq(p);
0069
0070     return &rq->dl;
0071 }
0072
0073 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
0074 {
0075     return !RB_EMPTY_NODE(&dl_se->rb_node);
0076 }
0077
0078 #ifdef CONFIG_RT_MUTEXES
0079 static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
0080 {
0081     return dl_se->pi_se;
0082 }
0083
0084 static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
0085 {
0086     return pi_of(dl_se) != dl_se;
0087 }
0088 #else
0089 static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
0090 {
0091     return dl_se;
0092 }
0093
0094 static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
0095 {
0096     return false;
0097 }
0098 #endif
0099
0100 #ifdef CONFIG_SMP
0101 static inline struct dl_bw *dl_bw_of(int i)
0102 {
0103     RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
0104              "sched RCU must be held");
0105     return &cpu_rq(i)->rd->dl_bw;
0106 }
0107
0108 static inline int dl_bw_cpus(int i)
0109 {
0110     struct root_domain *rd = cpu_rq(i)->rd;
0111     int cpus;
0112
0113     RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
0114              "sched RCU must be held");
0115
0116     if (cpumask_subset(rd->span, cpu_active_mask))
0117         return cpumask_weight(rd->span);
0118
0119     cpus = 0;
0120
0121     for_each_cpu_and(i, rd->span, cpu_active_mask)
0122         cpus++;
0123
0124     return cpus;
0125 }
0126
0127 static inline unsigned long __dl_bw_capacity(int i)
0128 {
0129     struct root_domain *rd = cpu_rq(i)->rd;
0130     unsigned long cap = 0;
0131
0132     RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
0133              "sched RCU must be held");
0134
0135     for_each_cpu_and(i, rd->span, cpu_active_mask)
0136         cap += capacity_orig_of(i);
0137
0138     return cap;
0139 }
0140
0141 /*
0142  * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
0143  * of the CPU the task is running on rather rd's \Sum CPU capacity.
0144  */
0145 static inline unsigned long dl_bw_capacity(int i)
0146 {
0147     if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
0148         capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
0149         return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
0150     } else {
0151         return __dl_bw_capacity(i);
0152     }
0153 }
0154
0155 static inline bool dl_bw_visited(int cpu, u64 gen)
0156 {
0157     struct root_domain *rd = cpu_rq(cpu)->rd;
0158
0159     if (rd->visit_gen == gen)
0160         return true;
0161
0162     rd->visit_gen = gen;
0163     return false;
0164 }
0165
0166 static inline
0167 void __dl_update(struct dl_bw *dl_b, s64 bw)
0168 {
0169     struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
0170     int i;
0171
0172     RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
0173              "sched RCU must be held");
0174     for_each_cpu_and(i, rd->span, cpu_active_mask) {
0175         struct rq *rq = cpu_rq(i);
0176
0177         rq->dl.extra_bw += bw;
0178     }
0179 }
0180 #else
0181 static inline struct dl_bw *dl_bw_of(int i)
0182 {
0183     return &cpu_rq(i)->dl.dl_bw;
0184 }
0185
0186 static inline int dl_bw_cpus(int i)
0187 {
0188     return 1;
0189 }
0190
0191 static inline unsigned long dl_bw_capacity(int i)
0192 {
0193     return SCHED_CAPACITY_SCALE;
0194 }
0195
0196 static inline bool dl_bw_visited(int cpu, u64 gen)
0197 {
0198     return false;
0199 }
0200
0201 static inline
0202 void __dl_update(struct dl_bw *dl_b, s64 bw)
0203 {
0204     struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
0205
0206     dl->extra_bw += bw;
0207 }
0208 #endif
0209
0210 static inline
0211 void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
0212 {
0213     dl_b->total_bw -= tsk_bw;
0214     __dl_update(dl_b, (s32)tsk_bw / cpus);
0215 }
0216
0217 static inline
0218 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
0219 {
0220     dl_b->total_bw += tsk_bw;
0221     __dl_update(dl_b, -((s32)tsk_bw / cpus));
0222 }
0223
0224 static inline bool
0225 __dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
0226 {
0227     return dl_b->bw != -1 &&
0228            cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
0229 }
0230
0231 static inline
0232 void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
0233 {
0234     u64 old = dl_rq->running_bw;
0235
0236     lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
0237     dl_rq->running_bw += dl_bw;
0238     SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
0239     SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
0240     /* kick cpufreq (see the comment in kernel/sched/sched.h). */
0241     cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
0242 }
0243
0244 static inline
0245 void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
0246 {
0247     u64 old = dl_rq->running_bw;
0248
0249     lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
0250     dl_rq->running_bw -= dl_bw;
0251     SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
0252     if (dl_rq->running_bw > old)
0253         dl_rq->running_bw = 0;
0254     /* kick cpufreq (see the comment in kernel/sched/sched.h). */
0255     cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
0256 }
0257
0258 static inline
0259 void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
0260 {
0261     u64 old = dl_rq->this_bw;
0262
0263     lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
0264     dl_rq->this_bw += dl_bw;
0265     SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
0266 }
0267
0268 static inline
0269 void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
0270 {
0271     u64 old = dl_rq->this_bw;
0272
0273     lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
0274     dl_rq->this_bw -= dl_bw;
0275     SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
0276     if (dl_rq->this_bw > old)
0277         dl_rq->this_bw = 0;
0278     SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
0279 }
0280
0281 static inline
0282 void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0283 {
0284     if (!dl_entity_is_special(dl_se))
0285         __add_rq_bw(dl_se->dl_bw, dl_rq);
0286 }
0287
0288 static inline
0289 void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0290 {
0291     if (!dl_entity_is_special(dl_se))
0292         __sub_rq_bw(dl_se->dl_bw, dl_rq);
0293 }
0294
0295 static inline
0296 void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0297 {
0298     if (!dl_entity_is_special(dl_se))
0299         __add_running_bw(dl_se->dl_bw, dl_rq);
0300 }
0301
0302 static inline
0303 void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0304 {
0305     if (!dl_entity_is_special(dl_se))
0306         __sub_running_bw(dl_se->dl_bw, dl_rq);
0307 }
0308
0309 static void dl_change_utilization(struct task_struct *p, u64 new_bw)
0310 {
0311     struct rq *rq;
0312
0313     BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
0314
0315     if (task_on_rq_queued(p))
0316         return;
0317
0318     rq = task_rq(p);
0319     if (p->dl.dl_non_contending) {
0320         sub_running_bw(&p->dl, &rq->dl);
0321         p->dl.dl_non_contending = 0;
0322         /*
0323          * If the timer handler is currently running and the
0324          * timer cannot be canceled, inactive_task_timer()
0325          * will see that dl_not_contending is not set, and
0326          * will not touch the rq's active utilization,
0327          * so we are still safe.
0328          */
0329         if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
0330             put_task_struct(p);
0331     }
0332     __sub_rq_bw(p->dl.dl_bw, &rq->dl);
0333     __add_rq_bw(new_bw, &rq->dl);
0334 }
0335
0336 /*
0337  * The utilization of a task cannot be immediately removed from
0338  * the rq active utilization (running_bw) when the task blocks.
0339  * Instead, we have to wait for the so called "0-lag time".
0340  *
0341  * If a task blocks before the "0-lag time", a timer (the inactive
0342  * timer) is armed, and running_bw is decreased when the timer
0343  * fires.
0344  *
0345  * If the task wakes up again before the inactive timer fires,
0346  * the timer is canceled, whereas if the task wakes up after the
0347  * inactive timer fired (and running_bw has been decreased) the
0348  * task's utilization has to be added to running_bw again.
0349  * A flag in the deadline scheduling entity (dl_non_contending)
0350  * is used to avoid race conditions between the inactive timer handler
0351  * and task wakeups.
0352  *
0353  * The following diagram shows how running_bw is updated. A task is
0354  * "ACTIVE" when its utilization contributes to running_bw; an
0355  * "ACTIVE contending" task is in the TASK_RUNNING state, while an
0356  * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
0357  * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
0358  * time already passed, which does not contribute to running_bw anymore.
0359  *                              +------------------+
0360  *             wakeup           |    ACTIVE        |
0361  *          +------------------>+   contending     |
0362  *          | add_running_bw    |                  |
0363  *          |                   +----+------+------+
0364  *          |                        |      ^
0365  *          |                dequeue |      |
0366  * +--------+-------+                |      |
0367  * |                |   t >= 0-lag   |      | wakeup
0368  * |    INACTIVE    |<---------------+      |
0369  * |                | sub_running_bw |      |
0370  * +--------+-------+                |      |
0371  *          ^                        |      |
0372  *          |              t < 0-lag |      |
0373  *          |                        |      |
0374  *          |                        V      |
0375  *          |                   +----+------+------+
0376  *          | sub_running_bw    |    ACTIVE        |
0377  *          +-------------------+                  |
0378  *            inactive timer    |  non contending  |
0379  *            fired             +------------------+
0380  *
0381  * The task_non_contending() function is invoked when a task
0382  * blocks, and checks if the 0-lag time already passed or
0383  * not (in the first case, it directly updates running_bw;
0384  * in the second case, it arms the inactive timer).
0385  *
0386  * The task_contending() function is invoked when a task wakes
0387  * up, and checks if the task is still in the "ACTIVE non contending"
0388  * state or not (in the second case, it updates running_bw).
0389  */
0390 static void task_non_contending(struct task_struct *p)
0391 {
0392     struct sched_dl_entity *dl_se = &p->dl;
0393     struct hrtimer *timer = &dl_se->inactive_timer;
0394     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
0395     struct rq *rq = rq_of_dl_rq(dl_rq);
0396     s64 zerolag_time;
0397
0398     /*
0399      * If this is a non-deadline task that has been boosted,
0400      * do nothing
0401      */
0402     if (dl_se->dl_runtime == 0)
0403         return;
0404
0405     if (dl_entity_is_special(dl_se))
0406         return;
0407
0408     WARN_ON(dl_se->dl_non_contending);
0409
0410     zerolag_time = dl_se->deadline -
0411          div64_long((dl_se->runtime * dl_se->dl_period),
0412             dl_se->dl_runtime);
0413
0414     /*
0415      * Using relative times instead of the absolute "0-lag time"
0416      * allows to simplify the code
0417      */
0418     zerolag_time -= rq_clock(rq);
0419
0420     /*
0421      * If the "0-lag time" already passed, decrease the active
0422      * utilization now, instead of starting a timer
0423      */
0424     if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
0425         if (dl_task(p))
0426             sub_running_bw(dl_se, dl_rq);
0427         if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
0428             struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
0429
0430             if (READ_ONCE(p->__state) == TASK_DEAD)
0431                 sub_rq_bw(&p->dl, &rq->dl);
0432             raw_spin_lock(&dl_b->lock);
0433             __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
0434             __dl_clear_params(p);
0435             raw_spin_unlock(&dl_b->lock);
0436         }
0437
0438         return;
0439     }
0440
0441     dl_se->dl_non_contending = 1;
0442     get_task_struct(p);
0443     hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
0444 }
0445
0446 static void task_contending(struct sched_dl_entity *dl_se, int flags)
0447 {
0448     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
0449
0450     /*
0451      * If this is a non-deadline task that has been boosted,
0452      * do nothing
0453      */
0454     if (dl_se->dl_runtime == 0)
0455         return;
0456
0457     if (flags & ENQUEUE_MIGRATED)
0458         add_rq_bw(dl_se, dl_rq);
0459
0460     if (dl_se->dl_non_contending) {
0461         dl_se->dl_non_contending = 0;
0462         /*
0463          * If the timer handler is currently running and the
0464          * timer cannot be canceled, inactive_task_timer()
0465          * will see that dl_not_contending is not set, and
0466          * will not touch the rq's active utilization,
0467          * so we are still safe.
0468          */
0469         if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
0470             put_task_struct(dl_task_of(dl_se));
0471     } else {
0472         /*
0473          * Since "dl_non_contending" is not set, the
0474          * task's utilization has already been removed from
0475          * active utilization (either when the task blocked,
0476          * when the "inactive timer" fired).
0477          * So, add it back.
0478          */
0479         add_running_bw(dl_se, dl_rq);
0480     }
0481 }
0482
0483 static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
0484 {
0485     struct sched_dl_entity *dl_se = &p->dl;
0486
0487     return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
0488 }
0489
0490 static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
0491
0492 void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
0493 {
0494     raw_spin_lock_init(&dl_b->dl_runtime_lock);
0495     dl_b->dl_period = period;
0496     dl_b->dl_runtime = runtime;
0497 }
0498
0499 void init_dl_bw(struct dl_bw *dl_b)
0500 {
0501     raw_spin_lock_init(&dl_b->lock);
0502     if (global_rt_runtime() == RUNTIME_INF)
0503         dl_b->bw = -1;
0504     else
0505         dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
0506     dl_b->total_bw = 0;
0507 }
0508
0509 void init_dl_rq(struct dl_rq *dl_rq)
0510 {
0511     dl_rq->root = RB_ROOT_CACHED;
0512
0513 #ifdef CONFIG_SMP
0514     /* zero means no -deadline tasks */
0515     dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
0516
0517     dl_rq->dl_nr_migratory = 0;
0518     dl_rq->overloaded = 0;
0519     dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
0520 #else
0521     init_dl_bw(&dl_rq->dl_bw);
0522 #endif
0523
0524     dl_rq->running_bw = 0;
0525     dl_rq->this_bw = 0;
0526     init_dl_rq_bw_ratio(dl_rq);
0527 }
0528
0529 #ifdef CONFIG_SMP
0530
0531 static inline int dl_overloaded(struct rq *rq)
0532 {
0533     return atomic_read(&rq->rd->dlo_count);
0534 }
0535
0536 static inline void dl_set_overload(struct rq *rq)
0537 {
0538     if (!rq->online)
0539         return;
0540
0541     cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
0542     /*
0543      * Must be visible before the overload count is
0544      * set (as in sched_rt.c).
0545      *
0546      * Matched by the barrier in pull_dl_task().
0547      */
0548     smp_wmb();
0549     atomic_inc(&rq->rd->dlo_count);
0550 }
0551
0552 static inline void dl_clear_overload(struct rq *rq)
0553 {
0554     if (!rq->online)
0555         return;
0556
0557     atomic_dec(&rq->rd->dlo_count);
0558     cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
0559 }
0560
0561 static void update_dl_migration(struct dl_rq *dl_rq)
0562 {
0563     if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
0564         if (!dl_rq->overloaded) {
0565             dl_set_overload(rq_of_dl_rq(dl_rq));
0566             dl_rq->overloaded = 1;
0567         }
0568     } else if (dl_rq->overloaded) {
0569         dl_clear_overload(rq_of_dl_rq(dl_rq));
0570         dl_rq->overloaded = 0;
0571     }
0572 }
0573
0574 static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0575 {
0576     struct task_struct *p = dl_task_of(dl_se);
0577
0578     if (p->nr_cpus_allowed > 1)
0579         dl_rq->dl_nr_migratory++;
0580
0581     update_dl_migration(dl_rq);
0582 }
0583
0584 static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0585 {
0586     struct task_struct *p = dl_task_of(dl_se);
0587
0588     if (p->nr_cpus_allowed > 1)
0589         dl_rq->dl_nr_migratory--;
0590
0591     update_dl_migration(dl_rq);
0592 }
0593
0594 #define __node_2_pdl(node) \
0595     rb_entry((node), struct task_struct, pushable_dl_tasks)
0596
0597 static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
0598 {
0599     return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
0600 }
0601
0602 /*
0603  * The list of pushable -deadline task is not a plist, like in
0604  * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
0605  */
0606 static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
0607 {
0608     struct rb_node *leftmost;
0609
0610     BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
0611
0612     leftmost = rb_add_cached(&p->pushable_dl_tasks,
0613                  &rq->dl.pushable_dl_tasks_root,
0614                  __pushable_less);
0615     if (leftmost)
0616         rq->dl.earliest_dl.next = p->dl.deadline;
0617 }
0618
0619 static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
0620 {
0621     struct dl_rq *dl_rq = &rq->dl;
0622     struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
0623     struct rb_node *leftmost;
0624
0625     if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
0626         return;
0627
0628     leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
0629     if (leftmost)
0630         dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
0631
0632     RB_CLEAR_NODE(&p->pushable_dl_tasks);
0633 }
0634
0635 static inline int has_pushable_dl_tasks(struct rq *rq)
0636 {
0637     return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
0638 }
0639
0640 static int push_dl_task(struct rq *rq);
0641
0642 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
0643 {
0644     return rq->online && dl_task(prev);
0645 }
0646
0647 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
0648 static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
0649
0650 static void push_dl_tasks(struct rq *);
0651 static void pull_dl_task(struct rq *);
0652
0653 static inline void deadline_queue_push_tasks(struct rq *rq)
0654 {
0655     if (!has_pushable_dl_tasks(rq))
0656         return;
0657
0658     queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
0659 }
0660
0661 static inline void deadline_queue_pull_task(struct rq *rq)
0662 {
0663     queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
0664 }
0665
0666 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
0667
0668 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
0669 {
0670     struct rq *later_rq = NULL;
0671     struct dl_bw *dl_b;
0672
0673     later_rq = find_lock_later_rq(p, rq);
0674     if (!later_rq) {
0675         int cpu;
0676
0677         /*
0678          * If we cannot preempt any rq, fall back to pick any
0679          * online CPU:
0680          */
0681         cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
0682         if (cpu >= nr_cpu_ids) {
0683             /*
0684              * Failed to find any suitable CPU.
0685              * The task will never come back!
0686              */
0687             BUG_ON(dl_bandwidth_enabled());
0688
0689             /*
0690              * If admission control is disabled we
0691              * try a little harder to let the task
0692              * run.
0693              */
0694             cpu = cpumask_any(cpu_active_mask);
0695         }
0696         later_rq = cpu_rq(cpu);
0697         double_lock_balance(rq, later_rq);
0698     }
0699
0700     if (p->dl.dl_non_contending || p->dl.dl_throttled) {
0701         /*
0702          * Inactive timer is armed (or callback is running, but
0703          * waiting for us to release rq locks). In any case, when it
0704          * will fire (or continue), it will see running_bw of this
0705          * task migrated to later_rq (and correctly handle it).
0706          */
0707         sub_running_bw(&p->dl, &rq->dl);
0708         sub_rq_bw(&p->dl, &rq->dl);
0709
0710         add_rq_bw(&p->dl, &later_rq->dl);
0711         add_running_bw(&p->dl, &later_rq->dl);
0712     } else {
0713         sub_rq_bw(&p->dl, &rq->dl);
0714         add_rq_bw(&p->dl, &later_rq->dl);
0715     }
0716
0717     /*
0718      * And we finally need to fixup root_domain(s) bandwidth accounting,
0719      * since p is still hanging out in the old (now moved to default) root
0720      * domain.
0721      */
0722     dl_b = &rq->rd->dl_bw;
0723     raw_spin_lock(&dl_b->lock);
0724     __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
0725     raw_spin_unlock(&dl_b->lock);
0726
0727     dl_b = &later_rq->rd->dl_bw;
0728     raw_spin_lock(&dl_b->lock);
0729     __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
0730     raw_spin_unlock(&dl_b->lock);
0731
0732     set_task_cpu(p, later_rq->cpu);
0733     double_unlock_balance(later_rq, rq);
0734
0735     return later_rq;
0736 }
0737
0738 #else
0739
0740 static inline
0741 void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
0742 {
0743 }
0744
0745 static inline
0746 void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
0747 {
0748 }
0749
0750 static inline
0751 void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0752 {
0753 }
0754
0755 static inline
0756 void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
0757 {
0758 }
0759
0760 static inline void deadline_queue_push_tasks(struct rq *rq)
0761 {
0762 }
0763
0764 static inline void deadline_queue_pull_task(struct rq *rq)
0765 {
0766 }
0767 #endif /* CONFIG_SMP */
0768
0769 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
0770 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
0771 static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
0772
0773 /*
0774  * We are being explicitly informed that a new instance is starting,
0775  * and this means that:
0776  *  - the absolute deadline of the entity has to be placed at
0777  *    current time + relative deadline;
0778  *  - the runtime of the entity has to be set to the maximum value.
0779  *
0780  * The capability of specifying such event is useful whenever a -deadline
0781  * entity wants to (try to!) synchronize its behaviour with the scheduler's
0782  * one, and to (try to!) reconcile itself with its own scheduling
0783  * parameters.
0784  */
0785 static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
0786 {
0787     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
0788     struct rq *rq = rq_of_dl_rq(dl_rq);
0789
0790     WARN_ON(is_dl_boosted(dl_se));
0791     WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
0792
0793     /*
0794      * We are racing with the deadline timer. So, do nothing because
0795      * the deadline timer handler will take care of properly recharging
0796      * the runtime and postponing the deadline
0797      */
0798     if (dl_se->dl_throttled)
0799         return;
0800
0801     /*
0802      * We use the regular wall clock time to set deadlines in the
0803      * future; in fact, we must consider execution overheads (time
0804      * spent on hardirq context, etc.).
0805      */
0806     dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
0807     dl_se->runtime = dl_se->dl_runtime;
0808 }
0809
0810 /*
0811  * Pure Earliest Deadline First (EDF) scheduling does not deal with the
0812  * possibility of a entity lasting more than what it declared, and thus
0813  * exhausting its runtime.
0814  *
0815  * Here we are interested in making runtime overrun possible, but we do
0816  * not want a entity which is misbehaving to affect the scheduling of all
0817  * other entities.
0818  * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
0819  * is used, in order to confine each entity within its own bandwidth.
0820  *
0821  * This function deals exactly with that, and ensures that when the runtime
0822  * of a entity is replenished, its deadline is also postponed. That ensures
0823  * the overrunning entity can't interfere with other entity in the system and
0824  * can't make them miss their deadlines. Reasons why this kind of overruns
0825  * could happen are, typically, a entity voluntarily trying to overcome its
0826  * runtime, or it just underestimated it during sched_setattr().
0827  */
0828 static void replenish_dl_entity(struct sched_dl_entity *dl_se)
0829 {
0830     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
0831     struct rq *rq = rq_of_dl_rq(dl_rq);
0832
0833     BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
0834
0835     /*
0836      * This could be the case for a !-dl task that is boosted.
0837      * Just go with full inherited parameters.
0838      */
0839     if (dl_se->dl_deadline == 0) {
0840         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
0841         dl_se->runtime = pi_of(dl_se)->dl_runtime;
0842     }
0843
0844     if (dl_se->dl_yielded && dl_se->runtime > 0)
0845         dl_se->runtime = 0;
0846
0847     /*
0848      * We keep moving the deadline away until we get some
0849      * available runtime for the entity. This ensures correct
0850      * handling of situations where the runtime overrun is
0851      * arbitrary large.
0852      */
0853     while (dl_se->runtime <= 0) {
0854         dl_se->deadline += pi_of(dl_se)->dl_period;
0855         dl_se->runtime += pi_of(dl_se)->dl_runtime;
0856     }
0857
0858     /*
0859      * At this point, the deadline really should be "in
0860      * the future" with respect to rq->clock. If it's
0861      * not, we are, for some reason, lagging too much!
0862      * Anyway, after having warn userspace abut that,
0863      * we still try to keep the things running by
0864      * resetting the deadline and the budget of the
0865      * entity.
0866      */
0867     if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
0868         printk_deferred_once("sched: DL replenish lagged too much\n");
0869         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
0870         dl_se->runtime = pi_of(dl_se)->dl_runtime;
0871     }
0872
0873     if (dl_se->dl_yielded)
0874         dl_se->dl_yielded = 0;
0875     if (dl_se->dl_throttled)
0876         dl_se->dl_throttled = 0;
0877 }
0878
0879 /*
0880  * Here we check if --at time t-- an entity (which is probably being
0881  * [re]activated or, in general, enqueued) can use its remaining runtime
0882  * and its current deadline _without_ exceeding the bandwidth it is
0883  * assigned (function returns true if it can't). We are in fact applying
0884  * one of the CBS rules: when a task wakes up, if the residual runtime
0885  * over residual deadline fits within the allocated bandwidth, then we
0886  * can keep the current (absolute) deadline and residual budget without
0887  * disrupting the schedulability of the system. Otherwise, we should
0888  * refill the runtime and set the deadline a period in the future,
0889  * because keeping the current (absolute) deadline of the task would
0890  * result in breaking guarantees promised to other tasks (refer to
0891  * Documentation/scheduler/sched-deadline.rst for more information).
0892  *
0893  * This function returns true if:
0894  *
0895  *   runtime / (deadline - t) > dl_runtime / dl_deadline ,
0896  *
0897  * IOW we can't recycle current parameters.
0898  *
0899  * Notice that the bandwidth check is done against the deadline. For
0900  * task with deadline equal to period this is the same of using
0901  * dl_period instead of dl_deadline in the equation above.
0902  */
0903 static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
0904 {
0905     u64 left, right;
0906
0907     /*
0908      * left and right are the two sides of the equation above,
0909      * after a bit of shuffling to use multiplications instead
0910      * of divisions.
0911      *
0912      * Note that none of the time values involved in the two
0913      * multiplications are absolute: dl_deadline and dl_runtime
0914      * are the relative deadline and the maximum runtime of each
0915      * instance, runtime is the runtime left for the last instance
0916      * and (deadline - t), since t is rq->clock, is the time left
0917      * to the (absolute) deadline. Even if overflowing the u64 type
0918      * is very unlikely to occur in both cases, here we scale down
0919      * as we want to avoid that risk at all. Scaling down by 10
0920      * means that we reduce granularity to 1us. We are fine with it,
0921      * since this is only a true/false check and, anyway, thinking
0922      * of anything below microseconds resolution is actually fiction
0923      * (but still we want to give the user that illusion >;).
0924      */
0925     left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
0926     right = ((dl_se->deadline - t) >> DL_SCALE) *
0927         (pi_of(dl_se)->dl_runtime >> DL_SCALE);
0928
0929     return dl_time_before(right, left);
0930 }
0931
0932 /*
0933  * Revised wakeup rule [1]: For self-suspending tasks, rather then
0934  * re-initializing task's runtime and deadline, the revised wakeup
0935  * rule adjusts the task's runtime to avoid the task to overrun its
0936  * density.
0937  *
0938  * Reasoning: a task may overrun the density if:
0939  *    runtime / (deadline - t) > dl_runtime / dl_deadline
0940  *
0941  * Therefore, runtime can be adjusted to:
0942  *     runtime = (dl_runtime / dl_deadline) * (deadline - t)
0943  *
0944  * In such way that runtime will be equal to the maximum density
0945  * the task can use without breaking any rule.
0946  *
0947  * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
0948  * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
0949  */
0950 static void
0951 update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
0952 {
0953     u64 laxity = dl_se->deadline - rq_clock(rq);
0954
0955     /*
0956      * If the task has deadline < period, and the deadline is in the past,
0957      * it should already be throttled before this check.
0958      *
0959      * See update_dl_entity() comments for further details.
0960      */
0961     WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
0962
0963     dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
0964 }
0965
0966 /*
0967  * Regarding the deadline, a task with implicit deadline has a relative
0968  * deadline == relative period. A task with constrained deadline has a
0969  * relative deadline <= relative period.
0970  *
0971  * We support constrained deadline tasks. However, there are some restrictions
0972  * applied only for tasks which do not have an implicit deadline. See
0973  * update_dl_entity() to know more about such restrictions.
0974  *
0975  * The dl_is_implicit() returns true if the task has an implicit deadline.
0976  */
0977 static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
0978 {
0979     return dl_se->dl_deadline == dl_se->dl_period;
0980 }
0981
0982 /*
0983  * When a deadline entity is placed in the runqueue, its runtime and deadline
0984  * might need to be updated. This is done by a CBS wake up rule. There are two
0985  * different rules: 1) the original CBS; and 2) the Revisited CBS.
0986  *
0987  * When the task is starting a new period, the Original CBS is used. In this
0988  * case, the runtime is replenished and a new absolute deadline is set.
0989  *
0990  * When a task is queued before the begin of the next period, using the
0991  * remaining runtime and deadline could make the entity to overflow, see
0992  * dl_entity_overflow() to find more about runtime overflow. When such case
0993  * is detected, the runtime and deadline need to be updated.
0994  *
0995  * If the task has an implicit deadline, i.e., deadline == period, the Original
0996  * CBS is applied. the runtime is replenished and a new absolute deadline is
0997  * set, as in the previous cases.
0998  *
0999  * However, the Original CBS does not work properly for tasks with
1000  * deadline < period, which are said to have a constrained deadline. By
1001  * applying the Original CBS, a constrained deadline task would be able to run
1002  * runtime/deadline in a period. With deadline < period, the task would
1003  * overrun the runtime/period allowed bandwidth, breaking the admission test.
1004  *
1005  * In order to prevent this misbehave, the Revisited CBS is used for
1006  * constrained deadline tasks when a runtime overflow is detected. In the
1007  * Revisited CBS, rather than replenishing & setting a new absolute deadline,
1008  * the remaining runtime of the task is reduced to avoid runtime overflow.
1009  * Please refer to the comments update_dl_revised_wakeup() function to find
1010  * more about the Revised CBS rule.
1011  */
1012 static void update_dl_entity(struct sched_dl_entity *dl_se)
1013 {
1014     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
1015     struct rq *rq = rq_of_dl_rq(dl_rq);
1016
1017     if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
1018         dl_entity_overflow(dl_se, rq_clock(rq))) {
1019
1020         if (unlikely(!dl_is_implicit(dl_se) &&
1021                  !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
1022                  !is_dl_boosted(dl_se))) {
1023             update_dl_revised_wakeup(dl_se, rq);
1024             return;
1025         }
1026
1027         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
1028         dl_se->runtime = pi_of(dl_se)->dl_runtime;
1029     }
1030 }
1031
1032 static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
1033 {
1034     return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
1035 }
1036
1037 /*
1038  * If the entity depleted all its runtime, and if we want it to sleep
1039  * while waiting for some new execution time to become available, we
1040  * set the bandwidth replenishment timer to the replenishment instant
1041  * and try to activate it.
1042  *
1043  * Notice that it is important for the caller to know if the timer
1044  * actually started or not (i.e., the replenishment instant is in
1045  * the future or in the past).
1046  */
1047 static int start_dl_timer(struct task_struct *p)
1048 {
1049     struct sched_dl_entity *dl_se = &p->dl;
1050     struct hrtimer *timer = &dl_se->dl_timer;
1051     struct rq *rq = task_rq(p);
1052     ktime_t now, act;
1053     s64 delta;
1054
1055     lockdep_assert_rq_held(rq);
1056
1057     /*
1058      * We want the timer to fire at the deadline, but considering
1059      * that it is actually coming from rq->clock and not from
1060      * hrtimer's time base reading.
1061      */
1062     act = ns_to_ktime(dl_next_period(dl_se));
1063     now = hrtimer_cb_get_time(timer);
1064     delta = ktime_to_ns(now) - rq_clock(rq);
1065     act = ktime_add_ns(act, delta);
1066
1067     /*
1068      * If the expiry time already passed, e.g., because the value
1069      * chosen as the deadline is too small, don't even try to
1070      * start the timer in the past!
1071      */
1072     if (ktime_us_delta(act, now) < 0)
1073         return 0;
1074
1075     /*
1076      * !enqueued will guarantee another callback; even if one is already in
1077      * progress. This ensures a balanced {get,put}_task_struct().
1078      *
1079      * The race against __run_timer() clearing the enqueued state is
1080      * harmless because we're holding task_rq()->lock, therefore the timer
1081      * expiring after we've done the check will wait on its task_rq_lock()
1082      * and observe our state.
1083      */
1084     if (!hrtimer_is_queued(timer)) {
1085         get_task_struct(p);
1086         hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
1087     }
1088
1089     return 1;
1090 }
1091
1092 /*
1093  * This is the bandwidth enforcement timer callback. If here, we know
1094  * a task is not on its dl_rq, since the fact that the timer was running
1095  * means the task is throttled and needs a runtime replenishment.
1096  *
1097  * However, what we actually do depends on the fact the task is active,
1098  * (it is on its rq) or has been removed from there by a call to
1099  * dequeue_task_dl(). In the former case we must issue the runtime
1100  * replenishment and add the task back to the dl_rq; in the latter, we just
1101  * do nothing but clearing dl_throttled, so that runtime and deadline
1102  * updating (and the queueing back to dl_rq) will be done by the
1103  * next call to enqueue_task_dl().
1104  */
1105 static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
1106 {
1107     struct sched_dl_entity *dl_se = container_of(timer,
1108                              struct sched_dl_entity,
1109                              dl_timer);
1110     struct task_struct *p = dl_task_of(dl_se);
1111     struct rq_flags rf;
1112     struct rq *rq;
1113
1114     rq = task_rq_lock(p, &rf);
1115
1116     /*
1117      * The task might have changed its scheduling policy to something
1118      * different than SCHED_DEADLINE (through switched_from_dl()).
1119      */
1120     if (!dl_task(p))
1121         goto unlock;
1122
1123     /*
1124      * The task might have been boosted by someone else and might be in the
1125      * boosting/deboosting path, its not throttled.
1126      */
1127     if (is_dl_boosted(dl_se))
1128         goto unlock;
1129
1130     /*
1131      * Spurious timer due to start_dl_timer() race; or we already received
1132      * a replenishment from rt_mutex_setprio().
1133      */
1134     if (!dl_se->dl_throttled)
1135         goto unlock;
1136
1137     sched_clock_tick();
1138     update_rq_clock(rq);
1139
1140     /*
1141      * If the throttle happened during sched-out; like:
1142      *
1143      *   schedule()
1144      *     deactivate_task()
1145      *       dequeue_task_dl()
1146      *         update_curr_dl()
1147      *           start_dl_timer()
1148      *         __dequeue_task_dl()
1149      *     prev->on_rq = 0;
1150      *
1151      * We can be both throttled and !queued. Replenish the counter
1152      * but do not enqueue -- wait for our wakeup to do that.
1153      */
1154     if (!task_on_rq_queued(p)) {
1155         replenish_dl_entity(dl_se);
1156         goto unlock;
1157     }
1158
1159 #ifdef CONFIG_SMP
1160     if (unlikely(!rq->online)) {
1161         /*
1162          * If the runqueue is no longer available, migrate the
1163          * task elsewhere. This necessarily changes rq.
1164          */
1165         lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
1166         rq = dl_task_offline_migration(rq, p);
1167         rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
1168         update_rq_clock(rq);
1169
1170         /*
1171          * Now that the task has been migrated to the new RQ and we
1172          * have that locked, proceed as normal and enqueue the task
1173          * there.
1174          */
1175     }
1176 #endif
1177
1178     enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
1179     if (dl_task(rq->curr))
1180         check_preempt_curr_dl(rq, p, 0);
1181     else
1182         resched_curr(rq);
1183
1184 #ifdef CONFIG_SMP
1185     /*
1186      * Queueing this task back might have overloaded rq, check if we need
1187      * to kick someone away.
1188      */
1189     if (has_pushable_dl_tasks(rq)) {
1190         /*
1191          * Nothing relies on rq->lock after this, so its safe to drop
1192          * rq->lock.
1193          */
1194         rq_unpin_lock(rq, &rf);
1195         push_dl_task(rq);
1196         rq_repin_lock(rq, &rf);
1197     }
1198 #endif
1199
1200 unlock:
1201     task_rq_unlock(rq, p, &rf);
1202
1203     /*
1204      * This can free the task_struct, including this hrtimer, do not touch
1205      * anything related to that after this.
1206      */
1207     put_task_struct(p);
1208
1209     return HRTIMER_NORESTART;
1210 }
1211
1212 void init_dl_task_timer(struct sched_dl_entity *dl_se)
1213 {
1214     struct hrtimer *timer = &dl_se->dl_timer;
1215
1216     hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
1217     timer->function = dl_task_timer;
1218 }
1219
1220 /*
1221  * During the activation, CBS checks if it can reuse the current task's
1222  * runtime and period. If the deadline of the task is in the past, CBS
1223  * cannot use the runtime, and so it replenishes the task. This rule
1224  * works fine for implicit deadline tasks (deadline == period), and the
1225  * CBS was designed for implicit deadline tasks. However, a task with
1226  * constrained deadline (deadline < period) might be awakened after the
1227  * deadline, but before the next period. In this case, replenishing the
1228  * task would allow it to run for runtime / deadline. As in this case
1229  * deadline < period, CBS enables a task to run for more than the
1230  * runtime / period. In a very loaded system, this can cause a domino
1231  * effect, making other tasks miss their deadlines.
1232  *
1233  * To avoid this problem, in the activation of a constrained deadline
1234  * task after the deadline but before the next period, throttle the
1235  * task and set the replenishing timer to the begin of the next period,
1236  * unless it is boosted.
1237  */
1238 static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
1239 {
1240     struct task_struct *p = dl_task_of(dl_se);
1241     struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
1242
1243     if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
1244         dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
1245         if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
1246             return;
1247         dl_se->dl_throttled = 1;
1248         if (dl_se->runtime > 0)
1249             dl_se->runtime = 0;
1250     }
1251 }
1252
1253 static
1254 int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
1255 {
1256     return (dl_se->runtime <= 0);
1257 }
1258
1259 /*
1260  * This function implements the GRUB accounting rule:
1261  * according to the GRUB reclaiming algorithm, the runtime is
1262  * not decreased as "dq = -dt", but as
1263  * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
1264  * where u is the utilization of the task, Umax is the maximum reclaimable
1265  * utilization, Uinact is the (per-runqueue) inactive utilization, computed
1266  * as the difference between the "total runqueue utilization" and the
1267  * runqueue active utilization, and Uextra is the (per runqueue) extra
1268  * reclaimable utilization.
1269  * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
1270  * multiplied by 2^BW_SHIFT, the result has to be shifted right by
1271  * BW_SHIFT.
1272  * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
1273  * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
1274  * Since delta is a 64 bit variable, to have an overflow its value
1275  * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
1276  * So, overflow is not an issue here.
1277  */
1278 static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
1279 {
1280     u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
1281     u64 u_act;
1282     u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
1283
1284     /*
1285      * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
1286      * we compare u_inact + rq->dl.extra_bw with
1287      * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
1288      * u_inact + rq->dl.extra_bw can be larger than
1289      * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
1290      * leading to wrong results)
1291      */
1292     if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
1293         u_act = u_act_min;
1294     else
1295         u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
1296
1297     return (delta * u_act) >> BW_SHIFT;
1298 }
1299
1300 /*
1301  * Update the current task's runtime statistics (provided it is still
1302  * a -deadline task and has not been removed from the dl_rq).
1303  */
1304 static void update_curr_dl(struct rq *rq)
1305 {
1306     struct task_struct *curr = rq->curr;
1307     struct sched_dl_entity *dl_se = &curr->dl;
1308     u64 delta_exec, scaled_delta_exec;
1309     int cpu = cpu_of(rq);
1310     u64 now;
1311
1312     if (!dl_task(curr) || !on_dl_rq(dl_se))
1313         return;
1314
1315     /*
1316      * Consumed budget is computed considering the time as
1317      * observed by schedulable tasks (excluding time spent
1318      * in hardirq context, etc.). Deadlines are instead
1319      * computed using hard walltime. This seems to be the more
1320      * natural solution, but the full ramifications of this
1321      * approach need further study.
1322      */
1323     now = rq_clock_task(rq);
1324     delta_exec = now - curr->se.exec_start;
1325     if (unlikely((s64)delta_exec <= 0)) {
1326         if (unlikely(dl_se->dl_yielded))
1327             goto throttle;
1328         return;
1329     }
1330
1331     schedstat_set(curr->stats.exec_max,
1332               max(curr->stats.exec_max, delta_exec));
1333
1334     trace_sched_stat_runtime(curr, delta_exec, 0);
1335
1336     curr->se.sum_exec_runtime += delta_exec;
1337     account_group_exec_runtime(curr, delta_exec);
1338
1339     curr->se.exec_start = now;
1340     cgroup_account_cputime(curr, delta_exec);
1341
1342     if (dl_entity_is_special(dl_se))
1343         return;
1344
1345     /*
1346      * For tasks that participate in GRUB, we implement GRUB-PA: the
1347      * spare reclaimed bandwidth is used to clock down frequency.
1348      *
1349      * For the others, we still need to scale reservation parameters
1350      * according to current frequency and CPU maximum capacity.
1351      */
1352     if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
1353         scaled_delta_exec = grub_reclaim(delta_exec,
1354                          rq,
1355                          &curr->dl);
1356     } else {
1357         unsigned long scale_freq = arch_scale_freq_capacity(cpu);
1358         unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
1359
1360         scaled_delta_exec = cap_scale(delta_exec, scale_freq);
1361         scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
1362     }
1363
1364     dl_se->runtime -= scaled_delta_exec;
1365
1366 throttle:
1367     if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
1368         dl_se->dl_throttled = 1;
1369
1370         /* If requested, inform the user about runtime overruns. */
1371         if (dl_runtime_exceeded(dl_se) &&
1372             (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
1373             dl_se->dl_overrun = 1;
1374
1375         __dequeue_task_dl(rq, curr, 0);
1376         if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
1377             enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
1378
1379         if (!is_leftmost(curr, &rq->dl))
1380             resched_curr(rq);
1381     }
1382
1383     /*
1384      * Because -- for now -- we share the rt bandwidth, we need to
1385      * account our runtime there too, otherwise actual rt tasks
1386      * would be able to exceed the shared quota.
1387      *
1388      * Account to the root rt group for now.
1389      *
1390      * The solution we're working towards is having the RT groups scheduled
1391      * using deadline servers -- however there's a few nasties to figure
1392      * out before that can happen.
1393      */
1394     if (rt_bandwidth_enabled()) {
1395         struct rt_rq *rt_rq = &rq->rt;
1396
1397         raw_spin_lock(&rt_rq->rt_runtime_lock);
1398         /*
1399          * We'll let actual RT tasks worry about the overflow here, we
1400          * have our own CBS to keep us inline; only account when RT
1401          * bandwidth is relevant.
1402          */
1403         if (sched_rt_bandwidth_account(rt_rq))
1404             rt_rq->rt_time += delta_exec;
1405         raw_spin_unlock(&rt_rq->rt_runtime_lock);
1406     }
1407 }
1408
1409 static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
1410 {
1411     struct sched_dl_entity *dl_se = container_of(timer,
1412                              struct sched_dl_entity,
1413                              inactive_timer);
1414     struct task_struct *p = dl_task_of(dl_se);
1415     struct rq_flags rf;
1416     struct rq *rq;
1417
1418     rq = task_rq_lock(p, &rf);
1419
1420     sched_clock_tick();
1421     update_rq_clock(rq);
1422
1423     if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
1424         struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1425
1426         if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
1427             sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
1428             sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
1429             dl_se->dl_non_contending = 0;
1430         }
1431
1432         raw_spin_lock(&dl_b->lock);
1433         __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
1434         raw_spin_unlock(&dl_b->lock);
1435         __dl_clear_params(p);
1436
1437         goto unlock;
1438     }
1439     if (dl_se->dl_non_contending == 0)
1440         goto unlock;
1441
1442     sub_running_bw(dl_se, &rq->dl);
1443     dl_se->dl_non_contending = 0;
1444 unlock:
1445     task_rq_unlock(rq, p, &rf);
1446     put_task_struct(p);
1447
1448     return HRTIMER_NORESTART;
1449 }
1450
1451 void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
1452 {
1453     struct hrtimer *timer = &dl_se->inactive_timer;
1454
1455     hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
1456     timer->function = inactive_task_timer;
1457 }
1458
1459 #define __node_2_dle(node) \
1460     rb_entry((node), struct sched_dl_entity, rb_node)
1461
1462 #ifdef CONFIG_SMP
1463
1464 static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
1465 {
1466     struct rq *rq = rq_of_dl_rq(dl_rq);
1467
1468     if (dl_rq->earliest_dl.curr == 0 ||
1469         dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
1470         if (dl_rq->earliest_dl.curr == 0)
1471             cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
1472         dl_rq->earliest_dl.curr = deadline;
1473         cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
1474     }
1475 }
1476
1477 static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
1478 {
1479     struct rq *rq = rq_of_dl_rq(dl_rq);
1480
1481     /*
1482      * Since we may have removed our earliest (and/or next earliest)
1483      * task we must recompute them.
1484      */
1485     if (!dl_rq->dl_nr_running) {
1486         dl_rq->earliest_dl.curr = 0;
1487         dl_rq->earliest_dl.next = 0;
1488         cpudl_clear(&rq->rd->cpudl, rq->cpu);
1489         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1490     } else {
1491         struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
1492         struct sched_dl_entity *entry = __node_2_dle(leftmost);
1493
1494         dl_rq->earliest_dl.curr = entry->deadline;
1495         cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
1496     }
1497 }
1498
1499 #else
1500
1501 static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
1502 static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
1503
1504 #endif /* CONFIG_SMP */
1505
1506 static inline
1507 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
1508 {
1509     int prio = dl_task_of(dl_se)->prio;
1510     u64 deadline = dl_se->deadline;
1511
1512     WARN_ON(!dl_prio(prio));
1513     dl_rq->dl_nr_running++;
1514     add_nr_running(rq_of_dl_rq(dl_rq), 1);
1515
1516     inc_dl_deadline(dl_rq, deadline);
1517     inc_dl_migration(dl_se, dl_rq);
1518 }
1519
1520 static inline
1521 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
1522 {
1523     int prio = dl_task_of(dl_se)->prio;
1524
1525     WARN_ON(!dl_prio(prio));
1526     WARN_ON(!dl_rq->dl_nr_running);
1527     dl_rq->dl_nr_running--;
1528     sub_nr_running(rq_of_dl_rq(dl_rq), 1);
1529
1530     dec_dl_deadline(dl_rq, dl_se->deadline);
1531     dec_dl_migration(dl_se, dl_rq);
1532 }
1533
1534 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
1535 {
1536     return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
1537 }
1538
1539 static inline struct sched_statistics *
1540 __schedstats_from_dl_se(struct sched_dl_entity *dl_se)
1541 {
1542     return &dl_task_of(dl_se)->stats;
1543 }
1544
1545 static inline void
1546 update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
1547 {
1548     struct sched_statistics *stats;
1549
1550     if (!schedstat_enabled())
1551         return;
1552
1553     stats = __schedstats_from_dl_se(dl_se);
1554     __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
1555 }
1556
1557 static inline void
1558 update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
1559 {
1560     struct sched_statistics *stats;
1561
1562     if (!schedstat_enabled())
1563         return;
1564
1565     stats = __schedstats_from_dl_se(dl_se);
1566     __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
1567 }
1568
1569 static inline void
1570 update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
1571 {
1572     struct sched_statistics *stats;
1573
1574     if (!schedstat_enabled())
1575         return;
1576
1577     stats = __schedstats_from_dl_se(dl_se);
1578     __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
1579 }
1580
1581 static inline void
1582 update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
1583             int flags)
1584 {
1585     if (!schedstat_enabled())
1586         return;
1587
1588     if (flags & ENQUEUE_WAKEUP)
1589         update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
1590 }
1591
1592 static inline void
1593 update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
1594             int flags)
1595 {
1596     struct task_struct *p = dl_task_of(dl_se);
1597
1598     if (!schedstat_enabled())
1599         return;
1600
1601     if ((flags & DEQUEUE_SLEEP)) {
1602         unsigned int state;
1603
1604         state = READ_ONCE(p->__state);
1605         if (state & TASK_INTERRUPTIBLE)
1606             __schedstat_set(p->stats.sleep_start,
1607                     rq_clock(rq_of_dl_rq(dl_rq)));
1608
1609         if (state & TASK_UNINTERRUPTIBLE)
1610             __schedstat_set(p->stats.block_start,
1611                     rq_clock(rq_of_dl_rq(dl_rq)));
1612     }
1613 }
1614
1615 static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
1616 {
1617     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
1618
1619     BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
1620
1621     rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
1622
1623     inc_dl_tasks(dl_se, dl_rq);
1624 }
1625
1626 static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
1627 {
1628     struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
1629
1630     if (RB_EMPTY_NODE(&dl_se->rb_node))
1631         return;
1632
1633     rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
1634
1635     RB_CLEAR_NODE(&dl_se->rb_node);
1636
1637     dec_dl_tasks(dl_se, dl_rq);
1638 }
1639
1640 static void
1641 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
1642 {
1643     BUG_ON(on_dl_rq(dl_se));
1644
1645     update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
1646
1647     /*
1648      * If this is a wakeup or a new instance, the scheduling
1649      * parameters of the task might need updating. Otherwise,
1650      * we want a replenishment of its runtime.
1651      */
1652     if (flags & ENQUEUE_WAKEUP) {
1653         task_contending(dl_se, flags);
1654         update_dl_entity(dl_se);
1655     } else if (flags & ENQUEUE_REPLENISH) {
1656         replenish_dl_entity(dl_se);
1657     } else if ((flags & ENQUEUE_RESTORE) &&
1658           dl_time_before(dl_se->deadline,
1659                  rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
1660         setup_new_dl_entity(dl_se);
1661     }
1662
1663     __enqueue_dl_entity(dl_se);
1664 }
1665
1666 static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
1667 {
1668     __dequeue_dl_entity(dl_se);
1669 }
1670
1671 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
1672 {
1673     if (is_dl_boosted(&p->dl)) {
1674         /*
1675          * Because of delays in the detection of the overrun of a
1676          * thread's runtime, it might be the case that a thread
1677          * goes to sleep in a rt mutex with negative runtime. As
1678          * a consequence, the thread will be throttled.
1679          *
1680          * While waiting for the mutex, this thread can also be
1681          * boosted via PI, resulting in a thread that is throttled
1682          * and boosted at the same time.
1683          *
1684          * In this case, the boost overrides the throttle.
1685          */
1686         if (p->dl.dl_throttled) {
1687             /*
1688              * The replenish timer needs to be canceled. No
1689              * problem if it fires concurrently: boosted threads
1690              * are ignored in dl_task_timer().
1691              */
1692             hrtimer_try_to_cancel(&p->dl.dl_timer);
1693             p->dl.dl_throttled = 0;
1694         }
1695     } else if (!dl_prio(p->normal_prio)) {
1696         /*
1697          * Special case in which we have a !SCHED_DEADLINE task that is going
1698          * to be deboosted, but exceeds its runtime while doing so. No point in
1699          * replenishing it, as it's going to return back to its original
1700          * scheduling class after this. If it has been throttled, we need to
1701          * clear the flag, otherwise the task may wake up as throttled after
1702          * being boosted again with no means to replenish the runtime and clear
1703          * the throttle.
1704          */
1705         p->dl.dl_throttled = 0;
1706         if (!(flags & ENQUEUE_REPLENISH))
1707             printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
1708                          task_pid_nr(p));
1709
1710         return;
1711     }
1712
1713     /*
1714      * Check if a constrained deadline task was activated
1715      * after the deadline but before the next period.
1716      * If that is the case, the task will be throttled and
1717      * the replenishment timer will be set to the next period.
1718      */
1719     if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
1720         dl_check_constrained_dl(&p->dl);
1721
1722     if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
1723         add_rq_bw(&p->dl, &rq->dl);
1724         add_running_bw(&p->dl, &rq->dl);
1725     }
1726
1727     /*
1728      * If p is throttled, we do not enqueue it. In fact, if it exhausted
1729      * its budget it needs a replenishment and, since it now is on
1730      * its rq, the bandwidth timer callback (which clearly has not
1731      * run yet) will take care of this.
1732      * However, the active utilization does not depend on the fact
1733      * that the task is on the runqueue or not (but depends on the
1734      * task's state - in GRUB parlance, "inactive" vs "active contending").
1735      * In other words, even if a task is throttled its utilization must
1736      * be counted in the active utilization; hence, we need to call
1737      * add_running_bw().
1738      */
1739     if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
1740         if (flags & ENQUEUE_WAKEUP)
1741             task_contending(&p->dl, flags);
1742
1743         return;
1744     }
1745
1746     check_schedstat_required();
1747     update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
1748
1749     enqueue_dl_entity(&p->dl, flags);
1750
1751     if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1752         enqueue_pushable_dl_task(rq, p);
1753 }
1754
1755 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
1756 {
1757     update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
1758     dequeue_dl_entity(&p->dl);
1759     dequeue_pushable_dl_task(rq, p);
1760 }
1761
1762 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
1763 {
1764     update_curr_dl(rq);
1765     __dequeue_task_dl(rq, p, flags);
1766
1767     if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
1768         sub_running_bw(&p->dl, &rq->dl);
1769         sub_rq_bw(&p->dl, &rq->dl);
1770     }
1771
1772     /*
1773      * This check allows to start the inactive timer (or to immediately
1774      * decrease the active utilization, if needed) in two cases:
1775      * when the task blocks and when it is terminating
1776      * (p->state == TASK_DEAD). We can handle the two cases in the same
1777      * way, because from GRUB's point of view the same thing is happening
1778      * (the task moves from "active contending" to "active non contending"
1779      * or "inactive")
1780      */
1781     if (flags & DEQUEUE_SLEEP)
1782         task_non_contending(p);
1783 }
1784
1785 /*
1786  * Yield task semantic for -deadline tasks is:
1787  *
1788  *   get off from the CPU until our next instance, with
1789  *   a new runtime. This is of little use now, since we
1790  *   don't have a bandwidth reclaiming mechanism. Anyway,
1791  *   bandwidth reclaiming is planned for the future, and
1792  *   yield_task_dl will indicate that some spare budget
1793  *   is available for other task instances to use it.
1794  */
1795 static void yield_task_dl(struct rq *rq)
1796 {
1797     /*
1798      * We make the task go to sleep until its current deadline by
1799      * forcing its runtime to zero. This way, update_curr_dl() stops
1800      * it and the bandwidth timer will wake it up and will give it
1801      * new scheduling parameters (thanks to dl_yielded=1).
1802      */
1803     rq->curr->dl.dl_yielded = 1;
1804
1805     update_rq_clock(rq);
1806     update_curr_dl(rq);
1807     /*
1808      * Tell update_rq_clock() that we've just updated,
1809      * so we don't do microscopic update in schedule()
1810      * and double the fastpath cost.
1811      */
1812     rq_clock_skip_update(rq);
1813 }
1814
1815 #ifdef CONFIG_SMP
1816
1817 static int find_later_rq(struct task_struct *task);
1818
1819 static int
1820 select_task_rq_dl(struct task_struct *p, int cpu, int flags)
1821 {
1822     struct task_struct *curr;
1823     bool select_rq;
1824     struct rq *rq;
1825
1826     if (!(flags & WF_TTWU))
1827         goto out;
1828
1829     rq = cpu_rq(cpu);
1830
1831     rcu_read_lock();
1832     curr = READ_ONCE(rq->curr); /* unlocked access */
1833
1834     /*
1835      * If we are dealing with a -deadline task, we must
1836      * decide where to wake it up.
1837      * If it has a later deadline and the current task
1838      * on this rq can't move (provided the waking task
1839      * can!) we prefer to send it somewhere else. On the
1840      * other hand, if it has a shorter deadline, we
1841      * try to make it stay here, it might be important.
1842      */
1843     select_rq = unlikely(dl_task(curr)) &&
1844             (curr->nr_cpus_allowed < 2 ||
1845              !dl_entity_preempt(&p->dl, &curr->dl)) &&
1846             p->nr_cpus_allowed > 1;
1847
1848     /*
1849      * Take the capacity of the CPU into account to
1850      * ensure it fits the requirement of the task.
1851      */
1852     if (static_branch_unlikely(&sched_asym_cpucapacity))
1853         select_rq |= !dl_task_fits_capacity(p, cpu);
1854
1855     if (select_rq) {
1856         int target = find_later_rq(p);
1857
1858         if (target != -1 &&
1859                 (dl_time_before(p->dl.deadline,
1860                     cpu_rq(target)->dl.earliest_dl.curr) ||
1861                 (cpu_rq(target)->dl.dl_nr_running == 0)))
1862             cpu = target;
1863     }
1864     rcu_read_unlock();
1865
1866 out:
1867     return cpu;
1868 }
1869
1870 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
1871 {
1872     struct rq_flags rf;
1873     struct rq *rq;
1874
1875     if (READ_ONCE(p->__state) != TASK_WAKING)
1876         return;
1877
1878     rq = task_rq(p);
1879     /*
1880      * Since p->state == TASK_WAKING, set_task_cpu() has been called
1881      * from try_to_wake_up(). Hence, p->pi_lock is locked, but
1882      * rq->lock is not... So, lock it
1883      */
1884     rq_lock(rq, &rf);
1885     if (p->dl.dl_non_contending) {
1886         update_rq_clock(rq);
1887         sub_running_bw(&p->dl, &rq->dl);
1888         p->dl.dl_non_contending = 0;
1889         /*
1890          * If the timer handler is currently running and the
1891          * timer cannot be canceled, inactive_task_timer()
1892          * will see that dl_not_contending is not set, and
1893          * will not touch the rq's active utilization,
1894          * so we are still safe.
1895          */
1896         if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
1897             put_task_struct(p);
1898     }
1899     sub_rq_bw(&p->dl, &rq->dl);
1900     rq_unlock(rq, &rf);
1901 }
1902
1903 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
1904 {
1905     /*
1906      * Current can't be migrated, useless to reschedule,
1907      * let's hope p can move out.
1908      */
1909     if (rq->curr->nr_cpus_allowed == 1 ||
1910         !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
1911         return;
1912
1913     /*
1914      * p is migratable, so let's not schedule it and
1915      * see if it is pushed or pulled somewhere else.
1916      */
1917     if (p->nr_cpus_allowed != 1 &&
1918         cpudl_find(&rq->rd->cpudl, p, NULL))
1919         return;
1920
1921     resched_curr(rq);
1922 }
1923
1924 static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1925 {
1926     if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
1927         /*
1928          * This is OK, because current is on_cpu, which avoids it being
1929          * picked for load-balance and preemption/IRQs are still
1930          * disabled avoiding further scheduler activity on it and we've
1931          * not yet started the picking loop.
1932          */
1933         rq_unpin_lock(rq, rf);
1934         pull_dl_task(rq);
1935         rq_repin_lock(rq, rf);
1936     }
1937
1938     return sched_stop_runnable(rq) || sched_dl_runnable(rq);
1939 }
1940 #endif /* CONFIG_SMP */
1941
1942 /*
1943  * Only called when both the current and waking task are -deadline
1944  * tasks.
1945  */
1946 static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
1947                   int flags)
1948 {
1949     if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
1950         resched_curr(rq);
1951         return;
1952     }
1953
1954 #ifdef CONFIG_SMP
1955     /*
1956      * In the unlikely case current and p have the same deadline
1957      * let us try to decide what's the best thing to do...
1958      */
1959     if ((p->dl.deadline == rq->curr->dl.deadline) &&
1960         !test_tsk_need_resched(rq->curr))
1961         check_preempt_equal_dl(rq, p);
1962 #endif /* CONFIG_SMP */
1963 }
1964
1965 #ifdef CONFIG_SCHED_HRTICK
1966 static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1967 {
1968     hrtick_start(rq, p->dl.runtime);
1969 }
1970 #else /* !CONFIG_SCHED_HRTICK */
1971 static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1972 {
1973 }
1974 #endif
1975
1976 static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
1977 {
1978     struct sched_dl_entity *dl_se = &p->dl;
1979     struct dl_rq *dl_rq = &rq->dl;
1980
1981     p->se.exec_start = rq_clock_task(rq);
1982     if (on_dl_rq(&p->dl))
1983         update_stats_wait_end_dl(dl_rq, dl_se);
1984
1985     /* You can't push away the running task */
1986     dequeue_pushable_dl_task(rq, p);
1987
1988     if (!first)
1989         return;
1990
1991     if (hrtick_enabled_dl(rq))
1992         start_hrtick_dl(rq, p);
1993
1994     if (rq->curr->sched_class != &dl_sched_class)
1995         update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1996
1997     deadline_queue_push_tasks(rq);
1998 }
1999
2000 static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
2001 {
2002     struct rb_node *left = rb_first_cached(&dl_rq->root);
2003
2004     if (!left)
2005         return NULL;
2006
2007     return __node_2_dle(left);
2008 }
2009
2010 static struct task_struct *pick_task_dl(struct rq *rq)
2011 {
2012     struct sched_dl_entity *dl_se;
2013     struct dl_rq *dl_rq = &rq->dl;
2014     struct task_struct *p;
2015
2016     if (!sched_dl_runnable(rq))
2017         return NULL;
2018
2019     dl_se = pick_next_dl_entity(dl_rq);
2020     BUG_ON(!dl_se);
2021     p = dl_task_of(dl_se);
2022
2023     return p;
2024 }
2025
2026 static struct task_struct *pick_next_task_dl(struct rq *rq)
2027 {
2028     struct task_struct *p;
2029
2030     p = pick_task_dl(rq);
2031     if (p)
2032         set_next_task_dl(rq, p, true);
2033
2034     return p;
2035 }
2036
2037 static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
2038 {
2039     struct sched_dl_entity *dl_se = &p->dl;
2040     struct dl_rq *dl_rq = &rq->dl;
2041
2042     if (on_dl_rq(&p->dl))
2043         update_stats_wait_start_dl(dl_rq, dl_se);
2044
2045     update_curr_dl(rq);
2046
2047     update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2048     if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
2049         enqueue_pushable_dl_task(rq, p);
2050 }
2051
2052 /*
2053  * scheduler tick hitting a task of our scheduling class.
2054  *
2055  * NOTE: This function can be called remotely by the tick offload that
2056  * goes along full dynticks. Therefore no local assumption can be made
2057  * and everything must be accessed through the @rq and @curr passed in
2058  * parameters.
2059  */
2060 static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
2061 {
2062     update_curr_dl(rq);
2063
2064     update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2065     /*
2066      * Even when we have runtime, update_curr_dl() might have resulted in us
2067      * not being the leftmost task anymore. In that case NEED_RESCHED will
2068      * be set and schedule() will start a new hrtick for the next task.
2069      */
2070     if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
2071         is_leftmost(p, &rq->dl))
2072         start_hrtick_dl(rq, p);
2073 }
2074
2075 static void task_fork_dl(struct task_struct *p)
2076 {
2077     /*
2078      * SCHED_DEADLINE tasks cannot fork and this is achieved through
2079      * sched_fork()
2080      */
2081 }
2082
2083 #ifdef CONFIG_SMP
2084
2085 /* Only try algorithms three times */
2086 #define DL_MAX_TRIES 3
2087
2088 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
2089 {
2090     if (!task_running(rq, p) &&
2091         cpumask_test_cpu(cpu, &p->cpus_mask))
2092         return 1;
2093     return 0;
2094 }
2095
2096 /*
2097  * Return the earliest pushable rq's task, which is suitable to be executed
2098  * on the CPU, NULL otherwise:
2099  */
2100 static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
2101 {
2102     struct task_struct *p = NULL;
2103     struct rb_node *next_node;
2104
2105     if (!has_pushable_dl_tasks(rq))
2106         return NULL;
2107
2108     next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
2109
2110 next_node:
2111     if (next_node) {
2112         p = __node_2_pdl(next_node);
2113
2114         if (pick_dl_task(rq, p, cpu))
2115             return p;
2116
2117         next_node = rb_next(next_node);
2118         goto next_node;
2119     }
2120
2121     return NULL;
2122 }
2123
2124 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
2125
2126 static int find_later_rq(struct task_struct *task)
2127 {
2128     struct sched_domain *sd;
2129     struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
2130     int this_cpu = smp_processor_id();
2131     int cpu = task_cpu(task);
2132
2133     /* Make sure the mask is initialized first */
2134     if (unlikely(!later_mask))
2135         return -1;
2136
2137     if (task->nr_cpus_allowed == 1)
2138         return -1;
2139
2140     /*
2141      * We have to consider system topology and task affinity
2142      * first, then we can look for a suitable CPU.
2143      */
2144     if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
2145         return -1;
2146
2147     /*
2148      * If we are here, some targets have been found, including
2149      * the most suitable which is, among the runqueues where the
2150      * current tasks have later deadlines than the task's one, the
2151      * rq with the latest possible one.
2152      *
2153      * Now we check how well this matches with task's
2154      * affinity and system topology.
2155      *
2156      * The last CPU where the task run is our first
2157      * guess, since it is most likely cache-hot there.
2158      */
2159     if (cpumask_test_cpu(cpu, later_mask))
2160         return cpu;
2161     /*
2162      * Check if this_cpu is to be skipped (i.e., it is
2163      * not in the mask) or not.
2164      */
2165     if (!cpumask_test_cpu(this_cpu, later_mask))
2166         this_cpu = -1;
2167
2168     rcu_read_lock();
2169     for_each_domain(cpu, sd) {
2170         if (sd->flags & SD_WAKE_AFFINE) {
2171             int best_cpu;
2172
2173             /*
2174              * If possible, preempting this_cpu is
2175              * cheaper than migrating.
2176              */
2177             if (this_cpu != -1 &&
2178                 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
2179                 rcu_read_unlock();
2180                 return this_cpu;
2181             }
2182
2183             best_cpu = cpumask_any_and_distribute(later_mask,
2184                                   sched_domain_span(sd));
2185             /*
2186              * Last chance: if a CPU being in both later_mask
2187              * and current sd span is valid, that becomes our
2188              * choice. Of course, the latest possible CPU is
2189              * already under consideration through later_mask.
2190              */
2191             if (best_cpu < nr_cpu_ids) {
2192                 rcu_read_unlock();
2193                 return best_cpu;
2194             }
2195         }
2196     }
2197     rcu_read_unlock();
2198
2199     /*
2200      * At this point, all our guesses failed, we just return
2201      * 'something', and let the caller sort the things out.
2202      */
2203     if (this_cpu != -1)
2204         return this_cpu;
2205
2206     cpu = cpumask_any_distribute(later_mask);
2207     if (cpu < nr_cpu_ids)
2208         return cpu;
2209
2210     return -1;
2211 }
2212
2213 /* Locks the rq it finds */
2214 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
2215 {
2216     struct rq *later_rq = NULL;
2217     int tries;
2218     int cpu;
2219
2220     for (tries = 0; tries < DL_MAX_TRIES; tries++) {
2221         cpu = find_later_rq(task);
2222
2223         if ((cpu == -1) || (cpu == rq->cpu))
2224             break;
2225
2226         later_rq = cpu_rq(cpu);
2227
2228         if (later_rq->dl.dl_nr_running &&
2229             !dl_time_before(task->dl.deadline,
2230                     later_rq->dl.earliest_dl.curr)) {
2231             /*
2232              * Target rq has tasks of equal or earlier deadline,
2233              * retrying does not release any lock and is unlikely
2234              * to yield a different result.
2235              */
2236             later_rq = NULL;
2237             break;
2238         }
2239
2240         /* Retry if something changed. */
2241         if (double_lock_balance(rq, later_rq)) {
2242             if (unlikely(task_rq(task) != rq ||
2243                      !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
2244                      task_running(rq, task) ||
2245                      !dl_task(task) ||
2246                      !task_on_rq_queued(task))) {
2247                 double_unlock_balance(rq, later_rq);
2248                 later_rq = NULL;
2249                 break;
2250             }
2251         }
2252
2253         /*
2254          * If the rq we found has no -deadline task, or
2255          * its earliest one has a later deadline than our
2256          * task, the rq is a good one.
2257          */
2258         if (!later_rq->dl.dl_nr_running ||
2259             dl_time_before(task->dl.deadline,
2260                    later_rq->dl.earliest_dl.curr))
2261             break;
2262
2263         /* Otherwise we try again. */
2264         double_unlock_balance(rq, later_rq);
2265         later_rq = NULL;
2266     }
2267
2268     return later_rq;
2269 }
2270
2271 static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
2272 {
2273     struct task_struct *p;
2274
2275     if (!has_pushable_dl_tasks(rq))
2276         return NULL;
2277
2278     p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
2279
2280     BUG_ON(rq->cpu != task_cpu(p));
2281     BUG_ON(task_current(rq, p));
2282     BUG_ON(p->nr_cpus_allowed <= 1);
2283
2284     BUG_ON(!task_on_rq_queued(p));
2285     BUG_ON(!dl_task(p));
2286
2287     return p;
2288 }
2289
2290 /*
2291  * See if the non running -deadline tasks on this rq
2292  * can be sent to some other CPU where they can preempt
2293  * and start executing.
2294  */
2295 static int push_dl_task(struct rq *rq)
2296 {
2297     struct task_struct *next_task;
2298     struct rq *later_rq;
2299     int ret = 0;
2300
2301     if (!rq->dl.overloaded)
2302         return 0;
2303
2304     next_task = pick_next_pushable_dl_task(rq);
2305     if (!next_task)
2306         return 0;
2307
2308 retry:
2309     /*
2310      * If next_task preempts rq->curr, and rq->curr
2311      * can move away, it makes sense to just reschedule
2312      * without going further in pushing next_task.
2313      */
2314     if (dl_task(rq->curr) &&
2315         dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
2316         rq->curr->nr_cpus_allowed > 1) {
2317         resched_curr(rq);
2318         return 0;
2319     }
2320
2321     if (is_migration_disabled(next_task))
2322         return 0;
2323
2324     if (WARN_ON(next_task == rq->curr))
2325         return 0;
2326
2327     /* We might release rq lock */
2328     get_task_struct(next_task);
2329
2330     /* Will lock the rq it'll find */
2331     later_rq = find_lock_later_rq(next_task, rq);
2332     if (!later_rq) {
2333         struct task_struct *task;
2334
2335         /*
2336          * We must check all this again, since
2337          * find_lock_later_rq releases rq->lock and it is
2338          * then possible that next_task has migrated.
2339          */
2340         task = pick_next_pushable_dl_task(rq);
2341         if (task == next_task) {
2342             /*
2343              * The task is still there. We don't try
2344              * again, some other CPU will pull it when ready.
2345              */
2346             goto out;
2347         }
2348
2349         if (!task)
2350             /* No more tasks */
2351             goto out;
2352
2353         put_task_struct(next_task);
2354         next_task = task;
2355         goto retry;
2356     }
2357
2358     deactivate_task(rq, next_task, 0);
2359     set_task_cpu(next_task, later_rq->cpu);
2360     activate_task(later_rq, next_task, 0);
2361     ret = 1;
2362
2363     resched_curr(later_rq);
2364
2365     double_unlock_balance(rq, later_rq);
2366
2367 out:
2368     put_task_struct(next_task);
2369
2370     return ret;
2371 }
2372
2373 static void push_dl_tasks(struct rq *rq)
2374 {
2375     /* push_dl_task() will return true if it moved a -deadline task */
2376     while (push_dl_task(rq))
2377         ;
2378 }
2379
2380 static void pull_dl_task(struct rq *this_rq)
2381 {
2382     int this_cpu = this_rq->cpu, cpu;
2383     struct task_struct *p, *push_task;
2384     bool resched = false;
2385     struct rq *src_rq;
2386     u64 dmin = LONG_MAX;
2387
2388     if (likely(!dl_overloaded(this_rq)))
2389         return;
2390
2391     /*
2392      * Match the barrier from dl_set_overloaded; this guarantees that if we
2393      * see overloaded we must also see the dlo_mask bit.
2394      */
2395     smp_rmb();
2396
2397     for_each_cpu(cpu, this_rq->rd->dlo_mask) {
2398         if (this_cpu == cpu)
2399             continue;
2400
2401         src_rq = cpu_rq(cpu);
2402
2403         /*
2404          * It looks racy, abd it is! However, as in sched_rt.c,
2405          * we are fine with this.
2406          */
2407         if (this_rq->dl.dl_nr_running &&
2408             dl_time_before(this_rq->dl.earliest_dl.curr,
2409                    src_rq->dl.earliest_dl.next))
2410             continue;
2411
2412         /* Might drop this_rq->lock */
2413         push_task = NULL;
2414         double_lock_balance(this_rq, src_rq);
2415
2416         /*
2417          * If there are no more pullable tasks on the
2418          * rq, we're done with it.
2419          */
2420         if (src_rq->dl.dl_nr_running <= 1)
2421             goto skip;
2422
2423         p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
2424
2425         /*
2426          * We found a task to be pulled if:
2427          *  - it preempts our current (if there's one),
2428          *  - it will preempt the last one we pulled (if any).
2429          */
2430         if (p && dl_time_before(p->dl.deadline, dmin) &&
2431             (!this_rq->dl.dl_nr_running ||
2432              dl_time_before(p->dl.deadline,
2433                     this_rq->dl.earliest_dl.curr))) {
2434             WARN_ON(p == src_rq->curr);
2435             WARN_ON(!task_on_rq_queued(p));
2436
2437             /*
2438              * Then we pull iff p has actually an earlier
2439              * deadline than the current task of its runqueue.
2440              */
2441             if (dl_time_before(p->dl.deadline,
2442                        src_rq->curr->dl.deadline))
2443                 goto skip;
2444
2445             if (is_migration_disabled(p)) {
2446                 push_task = get_push_task(src_rq);
2447             } else {
2448                 deactivate_task(src_rq, p, 0);
2449                 set_task_cpu(p, this_cpu);
2450                 activate_task(this_rq, p, 0);
2451                 dmin = p->dl.deadline;
2452                 resched = true;
2453             }
2454
2455             /* Is there any other task even earlier? */
2456         }
2457 skip:
2458         double_unlock_balance(this_rq, src_rq);
2459
2460         if (push_task) {
2461             raw_spin_rq_unlock(this_rq);
2462             stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2463                         push_task, &src_rq->push_work);
2464             raw_spin_rq_lock(this_rq);
2465         }
2466     }
2467
2468     if (resched)
2469         resched_curr(this_rq);
2470 }
2471
2472 /*
2473  * Since the task is not running and a reschedule is not going to happen
2474  * anytime soon on its runqueue, we try pushing it away now.
2475  */
2476 static void task_woken_dl(struct rq *rq, struct task_struct *p)
2477 {
2478     if (!task_running(rq, p) &&
2479         !test_tsk_need_resched(rq->curr) &&
2480         p->nr_cpus_allowed > 1 &&
2481         dl_task(rq->curr) &&
2482         (rq->curr->nr_cpus_allowed < 2 ||
2483          !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
2484         push_dl_tasks(rq);
2485     }
2486 }
2487
2488 static void set_cpus_allowed_dl(struct task_struct *p,
2489                 const struct cpumask *new_mask,
2490                 u32 flags)
2491 {
2492     struct root_domain *src_rd;
2493     struct rq *rq;
2494
2495     BUG_ON(!dl_task(p));
2496
2497     rq = task_rq(p);
2498     src_rd = rq->rd;
2499     /*
2500      * Migrating a SCHED_DEADLINE task between exclusive
2501      * cpusets (different root_domains) entails a bandwidth
2502      * update. We already made space for us in the destination
2503      * domain (see cpuset_can_attach()).
2504      */
2505     if (!cpumask_intersects(src_rd->span, new_mask)) {
2506         struct dl_bw *src_dl_b;
2507
2508         src_dl_b = dl_bw_of(cpu_of(rq));
2509         /*
2510          * We now free resources of the root_domain we are migrating
2511          * off. In the worst case, sched_setattr() may temporary fail
2512          * until we complete the update.
2513          */
2514         raw_spin_lock(&src_dl_b->lock);
2515         __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
2516         raw_spin_unlock(&src_dl_b->lock);
2517     }
2518
2519     set_cpus_allowed_common(p, new_mask, flags);
2520 }
2521
2522 /* Assumes rq->lock is held */
2523 static void rq_online_dl(struct rq *rq)
2524 {
2525     if (rq->dl.overloaded)
2526         dl_set_overload(rq);
2527
2528     cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
2529     if (rq->dl.dl_nr_running > 0)
2530         cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
2531 }
2532
2533 /* Assumes rq->lock is held */
2534 static void rq_offline_dl(struct rq *rq)
2535 {
2536     if (rq->dl.overloaded)
2537         dl_clear_overload(rq);
2538
2539     cpudl_clear(&rq->rd->cpudl, rq->cpu);
2540     cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
2541 }
2542
2543 void __init init_sched_dl_class(void)
2544 {
2545     unsigned int i;
2546
2547     for_each_possible_cpu(i)
2548         zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
2549                     GFP_KERNEL, cpu_to_node(i));
2550 }
2551
2552 void dl_add_task_root_domain(struct task_struct *p)
2553 {
2554     struct rq_flags rf;
2555     struct rq *rq;
2556     struct dl_bw *dl_b;
2557
2558     raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2559     if (!dl_task(p)) {
2560         raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
2561         return;
2562     }
2563
2564     rq = __task_rq_lock(p, &rf);
2565
2566     dl_b = &rq->rd->dl_bw;
2567     raw_spin_lock(&dl_b->lock);
2568
2569     __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
2570
2571     raw_spin_unlock(&dl_b->lock);
2572
2573     task_rq_unlock(rq, p, &rf);
2574 }
2575
2576 void dl_clear_root_domain(struct root_domain *rd)
2577 {
2578     unsigned long flags;
2579
2580     raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
2581     rd->dl_bw.total_bw = 0;
2582     raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
2583 }
2584
2585 #endif /* CONFIG_SMP */
2586
2587 static void switched_from_dl(struct rq *rq, struct task_struct *p)
2588 {
2589     /*
2590      * task_non_contending() can start the "inactive timer" (if the 0-lag
2591      * time is in the future). If the task switches back to dl before
2592      * the "inactive timer" fires, it can continue to consume its current
2593      * runtime using its current deadline. If it stays outside of
2594      * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
2595      * will reset the task parameters.
2596      */
2597     if (task_on_rq_queued(p) && p->dl.dl_runtime)
2598         task_non_contending(p);
2599
2600     if (!task_on_rq_queued(p)) {
2601         /*
2602          * Inactive timer is armed. However, p is leaving DEADLINE and
2603          * might migrate away from this rq while continuing to run on
2604          * some other class. We need to remove its contribution from
2605          * this rq running_bw now, or sub_rq_bw (below) will complain.
2606          */
2607         if (p->dl.dl_non_contending)
2608             sub_running_bw(&p->dl, &rq->dl);
2609         sub_rq_bw(&p->dl, &rq->dl);
2610     }
2611
2612     /*
2613      * We cannot use inactive_task_timer() to invoke sub_running_bw()
2614      * at the 0-lag time, because the task could have been migrated
2615      * while SCHED_OTHER in the meanwhile.
2616      */
2617     if (p->dl.dl_non_contending)
2618         p->dl.dl_non_contending = 0;
2619
2620     /*
2621      * Since this might be the only -deadline task on the rq,
2622      * this is the right place to try to pull some other one
2623      * from an overloaded CPU, if any.
2624      */
2625     if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2626         return;
2627
2628     deadline_queue_pull_task(rq);
2629 }
2630
2631 /*
2632  * When switching to -deadline, we may overload the rq, then
2633  * we try to push someone off, if possible.
2634  */
2635 static void switched_to_dl(struct rq *rq, struct task_struct *p)
2636 {
2637     if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
2638         put_task_struct(p);
2639
2640     /* If p is not queued we will update its parameters at next wakeup. */
2641     if (!task_on_rq_queued(p)) {
2642         add_rq_bw(&p->dl, &rq->dl);
2643
2644         return;
2645     }
2646
2647     if (rq->curr != p) {
2648 #ifdef CONFIG_SMP
2649         if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2650             deadline_queue_push_tasks(rq);
2651 #endif
2652         if (dl_task(rq->curr))
2653             check_preempt_curr_dl(rq, p, 0);
2654         else
2655             resched_curr(rq);
2656     } else {
2657         update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2658     }
2659 }
2660
2661 /*
2662  * If the scheduling parameters of a -deadline task changed,
2663  * a push or pull operation might be needed.
2664  */
2665 static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2666                 int oldprio)
2667 {
2668     if (task_on_rq_queued(p) || task_current(rq, p)) {
2669 #ifdef CONFIG_SMP
2670         /*
2671          * This might be too much, but unfortunately
2672          * we don't have the old deadline value, and
2673          * we can't argue if the task is increasing
2674          * or lowering its prio, so...
2675          */
2676         if (!rq->dl.overloaded)
2677             deadline_queue_pull_task(rq);
2678
2679         /*
2680          * If we now have a earlier deadline task than p,
2681          * then reschedule, provided p is still on this
2682          * runqueue.
2683          */
2684         if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
2685             resched_curr(rq);
2686 #else
2687         /*
2688          * Again, we don't know if p has a earlier
2689          * or later deadline, so let's blindly set a
2690          * (maybe not needed) rescheduling point.
2691          */
2692         resched_curr(rq);
2693 #endif /* CONFIG_SMP */
2694     }
2695 }
2696
2697 DEFINE_SCHED_CLASS(dl) = {
2698
2699     .enqueue_task       = enqueue_task_dl,
2700     .dequeue_task       = dequeue_task_dl,
2701     .yield_task     = yield_task_dl,
2702
2703     .check_preempt_curr = check_preempt_curr_dl,
2704
2705     .pick_next_task     = pick_next_task_dl,
2706     .put_prev_task      = put_prev_task_dl,
2707     .set_next_task      = set_next_task_dl,
2708
2709 #ifdef CONFIG_SMP
2710     .balance        = balance_dl,
2711     .pick_task      = pick_task_dl,
2712     .select_task_rq     = select_task_rq_dl,
2713     .migrate_task_rq    = migrate_task_rq_dl,
2714     .set_cpus_allowed       = set_cpus_allowed_dl,
2715     .rq_online              = rq_online_dl,
2716     .rq_offline             = rq_offline_dl,
2717     .task_woken     = task_woken_dl,
2718     .find_lock_rq       = find_lock_later_rq,
2719 #endif
2720
2721     .task_tick      = task_tick_dl,
2722     .task_fork              = task_fork_dl,
2723
2724     .prio_changed           = prio_changed_dl,
2725     .switched_from      = switched_from_dl,
2726     .switched_to        = switched_to_dl,
2727
2728     .update_curr        = update_curr_dl,
2729 };
2730
2731 /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
2732 static u64 dl_generation;
2733
2734 int sched_dl_global_validate(void)
2735 {
2736     u64 runtime = global_rt_runtime();
2737     u64 period = global_rt_period();
2738     u64 new_bw = to_ratio(period, runtime);
2739     u64 gen = ++dl_generation;
2740     struct dl_bw *dl_b;
2741     int cpu, cpus, ret = 0;
2742     unsigned long flags;
2743
2744     /*
2745      * Here we want to check the bandwidth not being set to some
2746      * value smaller than the currently allocated bandwidth in
2747      * any of the root_domains.
2748      */
2749     for_each_possible_cpu(cpu) {
2750         rcu_read_lock_sched();
2751
2752         if (dl_bw_visited(cpu, gen))
2753             goto next;
2754
2755         dl_b = dl_bw_of(cpu);
2756         cpus = dl_bw_cpus(cpu);
2757
2758         raw_spin_lock_irqsave(&dl_b->lock, flags);
2759         if (new_bw * cpus < dl_b->total_bw)
2760             ret = -EBUSY;
2761         raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2762
2763 next:
2764         rcu_read_unlock_sched();
2765
2766         if (ret)
2767             break;
2768     }
2769
2770     return ret;
2771 }
2772
2773 static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
2774 {
2775     if (global_rt_runtime() == RUNTIME_INF) {
2776         dl_rq->bw_ratio = 1 << RATIO_SHIFT;
2777         dl_rq->extra_bw = 1 << BW_SHIFT;
2778     } else {
2779         dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
2780               global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
2781         dl_rq->extra_bw = to_ratio(global_rt_period(),
2782                             global_rt_runtime());
2783     }
2784 }
2785
2786 void sched_dl_do_global(void)
2787 {
2788     u64 new_bw = -1;
2789     u64 gen = ++dl_generation;
2790     struct dl_bw *dl_b;
2791     int cpu;
2792     unsigned long flags;
2793
2794     if (global_rt_runtime() != RUNTIME_INF)
2795         new_bw = to_ratio(global_rt_period(), global_rt_runtime());
2796
2797     for_each_possible_cpu(cpu) {
2798         rcu_read_lock_sched();
2799
2800         if (dl_bw_visited(cpu, gen)) {
2801             rcu_read_unlock_sched();
2802             continue;
2803         }
2804
2805         dl_b = dl_bw_of(cpu);
2806
2807         raw_spin_lock_irqsave(&dl_b->lock, flags);
2808         dl_b->bw = new_bw;
2809         raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2810
2811         rcu_read_unlock_sched();
2812         init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
2813     }
2814 }
2815
2816 /*
2817  * We must be sure that accepting a new task (or allowing changing the
2818  * parameters of an existing one) is consistent with the bandwidth
2819  * constraints. If yes, this function also accordingly updates the currently
2820  * allocated bandwidth to reflect the new situation.
2821  *
2822  * This function is called while holding p's rq->lock.
2823  */
2824 int sched_dl_overflow(struct task_struct *p, int policy,
2825               const struct sched_attr *attr)
2826 {
2827     u64 period = attr->sched_period ?: attr->sched_deadline;
2828     u64 runtime = attr->sched_runtime;
2829     u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2830     int cpus, err = -1, cpu = task_cpu(p);
2831     struct dl_bw *dl_b = dl_bw_of(cpu);
2832     unsigned long cap;
2833
2834     if (attr->sched_flags & SCHED_FLAG_SUGOV)
2835         return 0;
2836
2837     /* !deadline task may carry old deadline bandwidth */
2838     if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2839         return 0;
2840
2841     /*
2842      * Either if a task, enters, leave, or stays -deadline but changes
2843      * its parameters, we may need to update accordingly the total
2844      * allocated bandwidth of the container.
2845      */
2846     raw_spin_lock(&dl_b->lock);
2847     cpus = dl_bw_cpus(cpu);
2848     cap = dl_bw_capacity(cpu);
2849
2850     if (dl_policy(policy) && !task_has_dl_policy(p) &&
2851         !__dl_overflow(dl_b, cap, 0, new_bw)) {
2852         if (hrtimer_active(&p->dl.inactive_timer))
2853             __dl_sub(dl_b, p->dl.dl_bw, cpus);
2854         __dl_add(dl_b, new_bw, cpus);
2855         err = 0;
2856     } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2857            !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
2858         /*
2859          * XXX this is slightly incorrect: when the task
2860          * utilization decreases, we should delay the total
2861          * utilization change until the task's 0-lag point.
2862          * But this would require to set the task's "inactive
2863          * timer" when the task is not inactive.
2864          */
2865         __dl_sub(dl_b, p->dl.dl_bw, cpus);
2866         __dl_add(dl_b, new_bw, cpus);
2867         dl_change_utilization(p, new_bw);
2868         err = 0;
2869     } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2870         /*
2871          * Do not decrease the total deadline utilization here,
2872          * switched_from_dl() will take care to do it at the correct
2873          * (0-lag) time.
2874          */
2875         err = 0;
2876     }
2877     raw_spin_unlock(&dl_b->lock);
2878
2879     return err;
2880 }
2881
2882 /*
2883  * This function initializes the sched_dl_entity of a newly becoming
2884  * SCHED_DEADLINE task.
2885  *
2886  * Only the static values are considered here, the actual runtime and the
2887  * absolute deadline will be properly calculated when the task is enqueued
2888  * for the first time with its new policy.
2889  */
2890 void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
2891 {
2892     struct sched_dl_entity *dl_se = &p->dl;
2893
2894     dl_se->dl_runtime = attr->sched_runtime;
2895     dl_se->dl_deadline = attr->sched_deadline;
2896     dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
2897     dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
2898     dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
2899     dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
2900 }
2901
2902 void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
2903 {
2904     struct sched_dl_entity *dl_se = &p->dl;
2905
2906     attr->sched_priority = p->rt_priority;
2907     attr->sched_runtime = dl_se->dl_runtime;
2908     attr->sched_deadline = dl_se->dl_deadline;
2909     attr->sched_period = dl_se->dl_period;
2910     attr->sched_flags &= ~SCHED_DL_FLAGS;
2911     attr->sched_flags |= dl_se->flags;
2912 }
2913
2914 /*
2915  * This function validates the new parameters of a -deadline task.
2916  * We ask for the deadline not being zero, and greater or equal
2917  * than the runtime, as well as the period of being zero or
2918  * greater than deadline. Furthermore, we have to be sure that
2919  * user parameters are above the internal resolution of 1us (we
2920  * check sched_runtime only since it is always the smaller one) and
2921  * below 2^63 ns (we have to check both sched_deadline and
2922  * sched_period, as the latter can be zero).
2923  */
2924 bool __checkparam_dl(const struct sched_attr *attr)
2925 {
2926     u64 period, max, min;
2927
2928     /* special dl tasks don't actually use any parameter */
2929     if (attr->sched_flags & SCHED_FLAG_SUGOV)
2930         return true;
2931
2932     /* deadline != 0 */
2933     if (attr->sched_deadline == 0)
2934         return false;
2935
2936     /*
2937      * Since we truncate DL_SCALE bits, make sure we're at least
2938      * that big.
2939      */
2940     if (attr->sched_runtime < (1ULL << DL_SCALE))
2941         return false;
2942
2943     /*
2944      * Since we use the MSB for wrap-around and sign issues, make
2945      * sure it's not set (mind that period can be equal to zero).
2946      */
2947     if (attr->sched_deadline & (1ULL << 63) ||
2948         attr->sched_period & (1ULL << 63))
2949         return false;
2950
2951     period = attr->sched_period;
2952     if (!period)
2953         period = attr->sched_deadline;
2954
2955     /* runtime <= deadline <= period (if period != 0) */
2956     if (period < attr->sched_deadline ||
2957         attr->sched_deadline < attr->sched_runtime)
2958         return false;
2959
2960     max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
2961     min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
2962
2963     if (period < min || period > max)
2964         return false;
2965
2966     return true;
2967 }
2968
2969 /*
2970  * This function clears the sched_dl_entity static params.
2971  */
2972 void __dl_clear_params(struct task_struct *p)
2973 {
2974     struct sched_dl_entity *dl_se = &p->dl;
2975
2976     dl_se->dl_runtime       = 0;
2977     dl_se->dl_deadline      = 0;
2978     dl_se->dl_period        = 0;
2979     dl_se->flags            = 0;
2980     dl_se->dl_bw            = 0;
2981     dl_se->dl_density       = 0;
2982
2983     dl_se->dl_throttled     = 0;
2984     dl_se->dl_yielded       = 0;
2985     dl_se->dl_non_contending    = 0;
2986     dl_se->dl_overrun       = 0;
2987
2988 #ifdef CONFIG_RT_MUTEXES
2989     dl_se->pi_se            = dl_se;
2990 #endif
2991 }
2992
2993 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2994 {
2995     struct sched_dl_entity *dl_se = &p->dl;
2996
2997     if (dl_se->dl_runtime != attr->sched_runtime ||
2998         dl_se->dl_deadline != attr->sched_deadline ||
2999         dl_se->dl_period != attr->sched_period ||
3000         dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
3001         return true;
3002
3003     return false;
3004 }
3005
3006 #ifdef CONFIG_SMP
3007 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
3008                  const struct cpumask *trial)
3009 {
3010     int ret = 1, trial_cpus;
3011     struct dl_bw *cur_dl_b;
3012     unsigned long flags;
3013
3014     rcu_read_lock_sched();
3015     cur_dl_b = dl_bw_of(cpumask_any(cur));
3016     trial_cpus = cpumask_weight(trial);
3017
3018     raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
3019     if (cur_dl_b->bw != -1 &&
3020         cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
3021         ret = 0;
3022     raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
3023     rcu_read_unlock_sched();
3024
3025     return ret;
3026 }
3027
3028 int dl_cpu_busy(int cpu, struct task_struct *p)
3029 {
3030     unsigned long flags, cap;
3031     struct dl_bw *dl_b;
3032     bool overflow;
3033
3034     rcu_read_lock_sched();
3035     dl_b = dl_bw_of(cpu);
3036     raw_spin_lock_irqsave(&dl_b->lock, flags);
3037     cap = dl_bw_capacity(cpu);
3038     overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
3039
3040     if (!overflow && p) {
3041         /*
3042          * We reserve space for this task in the destination
3043          * root_domain, as we can't fail after this point.
3044          * We will free resources in the source root_domain
3045          * later on (see set_cpus_allowed_dl()).
3046          */
3047         __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
3048     }
3049
3050     raw_spin_unlock_irqrestore(&dl_b->lock, flags);
3051     rcu_read_unlock_sched();
3052
3053     return overflow ? -EBUSY : 0;
3054 }
3055 #endif
3056
3057 #ifdef CONFIG_SCHED_DEBUG
3058 void print_dl_stats(struct seq_file *m, int cpu)
3059 {
3060     print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
3061 }
3062 #endif /* CONFIG_SCHED_DEBUG */