0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
0004  * policies)
0005  */
0006 
0007 int sched_rr_timeslice = RR_TIMESLICE;
0008 /* More than 4 hours if BW_SHIFT equals 20. */
0009 static const u64 max_rt_runtime = MAX_BW;
0010 
0011 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
0012 
0013 struct rt_bandwidth def_rt_bandwidth;
0014 
0015 /*
0016  * period over which we measure -rt task CPU usage in us.
0017  * default: 1s
0018  */
0019 unsigned int sysctl_sched_rt_period = 1000000;
0020 
0021 /*
0022  * part of the period that we allow rt tasks to run in us.
0023  * default: 0.95s
0024  */
0025 int sysctl_sched_rt_runtime = 950000;
0026 
0027 #ifdef CONFIG_SYSCTL
0028 static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
0029 static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
0030         size_t *lenp, loff_t *ppos);
0031 static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
0032         size_t *lenp, loff_t *ppos);
0033 static struct ctl_table sched_rt_sysctls[] = {
0034     {
0035         .procname       = "sched_rt_period_us",
0036         .data           = &sysctl_sched_rt_period,
0037         .maxlen         = sizeof(unsigned int),
0038         .mode           = 0644,
0039         .proc_handler   = sched_rt_handler,
0040     },
0041     {
0042         .procname       = "sched_rt_runtime_us",
0043         .data           = &sysctl_sched_rt_runtime,
0044         .maxlen         = sizeof(int),
0045         .mode           = 0644,
0046         .proc_handler   = sched_rt_handler,
0047     },
0048     {
0049         .procname       = "sched_rr_timeslice_ms",
0050         .data           = &sysctl_sched_rr_timeslice,
0051         .maxlen         = sizeof(int),
0052         .mode           = 0644,
0053         .proc_handler   = sched_rr_handler,
0054     },
0055     {}
0056 };
0057 
0058 static int __init sched_rt_sysctl_init(void)
0059 {
0060     register_sysctl_init("kernel", sched_rt_sysctls);
0061     return 0;
0062 }
0063 late_initcall(sched_rt_sysctl_init);
0064 #endif
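/*
 * For illustration: the knobs above appear under /proc/sys/kernel/ as
 * sched_rt_period_us, sched_rt_runtime_us and sched_rr_timeslice_ms.
 * With the defaults, RT tasks may consume at most 950000 us of CPU time
 * in every 1000000 us period before being throttled, i.e. 95% of each
 * CPU is the ceiling for SCHED_FIFO/SCHED_RR work:
 *
 *     # cat /proc/sys/kernel/sched_rt_runtime_us
 *     950000
 *     # echo -1 > /proc/sys/kernel/sched_rt_runtime_us
 *
 * Writing -1 is the conventional way to disable throttling entirely; the
 * exact mapping to RUNTIME_INF happens in sched_rt_handler(), which is
 * not part of this excerpt.
 */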
0065 
0066 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
0067 {
0068     struct rt_bandwidth *rt_b =
0069         container_of(timer, struct rt_bandwidth, rt_period_timer);
0070     int idle = 0;
0071     int overrun;
0072 
0073     raw_spin_lock(&rt_b->rt_runtime_lock);
0074     for (;;) {
0075         overrun = hrtimer_forward_now(timer, rt_b->rt_period);
0076         if (!overrun)
0077             break;
0078 
0079         raw_spin_unlock(&rt_b->rt_runtime_lock);
0080         idle = do_sched_rt_period_timer(rt_b, overrun);
0081         raw_spin_lock(&rt_b->rt_runtime_lock);
0082     }
0083     if (idle)
0084         rt_b->rt_period_active = 0;
0085     raw_spin_unlock(&rt_b->rt_runtime_lock);
0086 
0087     return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
0088 }
0089 
0090 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
0091 {
0092     rt_b->rt_period = ns_to_ktime(period);
0093     rt_b->rt_runtime = runtime;
0094 
0095     raw_spin_lock_init(&rt_b->rt_runtime_lock);
0096 
0097     hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
0098              HRTIMER_MODE_REL_HARD);
0099     rt_b->rt_period_timer.function = sched_rt_period_timer;
0100 }
0101 
0102 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
0103 {
0104     raw_spin_lock(&rt_b->rt_runtime_lock);
0105     if (!rt_b->rt_period_active) {
0106         rt_b->rt_period_active = 1;
0107         /*
0108          * SCHED_DEADLINE updates the bandwidth, as a runaway
0109          * RT task with a DL task could hog a CPU. But DL does
0110          * not reset the period. If a deadline task was running
0111          * without an RT task running, it can cause RT tasks to
0112          * throttle when they start up. Kick the timer right away
0113          * to update the period.
0114          */
0115         hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
0116         hrtimer_start_expires(&rt_b->rt_period_timer,
0117                       HRTIMER_MODE_ABS_PINNED_HARD);
0118     }
0119     raw_spin_unlock(&rt_b->rt_runtime_lock);
0120 }
0121 
0122 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
0123 {
0124     if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
0125         return;
0126 
0127     do_start_rt_bandwidth(rt_b);
0128 }
0129 
0130 void init_rt_rq(struct rt_rq *rt_rq)
0131 {
0132     struct rt_prio_array *array;
0133     int i;
0134 
0135     array = &rt_rq->active;
0136     for (i = 0; i < MAX_RT_PRIO; i++) {
0137         INIT_LIST_HEAD(array->queue + i);
0138         __clear_bit(i, array->bitmap);
0139     }
0140     /* delimiter for bitsearch: */
0141     __set_bit(MAX_RT_PRIO, array->bitmap);
0142 
0143 #if defined CONFIG_SMP
0144     rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
0145     rt_rq->highest_prio.next = MAX_RT_PRIO-1;
0146     rt_rq->rt_nr_migratory = 0;
0147     rt_rq->overloaded = 0;
0148     plist_head_init(&rt_rq->pushable_tasks);
0149 #endif /* CONFIG_SMP */
0150     /* We start in dequeued state, because no RT tasks are queued */
0151     rt_rq->rt_queued = 0;
0152 
0153     rt_rq->rt_time = 0;
0154     rt_rq->rt_throttled = 0;
0155     rt_rq->rt_runtime = 0;
0156     raw_spin_lock_init(&rt_rq->rt_runtime_lock);
0157 }
0158 
0159 #ifdef CONFIG_RT_GROUP_SCHED
0160 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
0161 {
0162     hrtimer_cancel(&rt_b->rt_period_timer);
0163 }
0164 
0165 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
0166 
0167 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
0168 {
0169 #ifdef CONFIG_SCHED_DEBUG
0170     WARN_ON_ONCE(!rt_entity_is_task(rt_se));
0171 #endif
0172     return container_of(rt_se, struct task_struct, rt);
0173 }
0174 
0175 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
0176 {
0177     return rt_rq->rq;
0178 }
0179 
0180 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
0181 {
0182     return rt_se->rt_rq;
0183 }
0184 
0185 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
0186 {
0187     struct rt_rq *rt_rq = rt_se->rt_rq;
0188 
0189     return rt_rq->rq;
0190 }
0191 
0192 void unregister_rt_sched_group(struct task_group *tg)
0193 {
0194     if (tg->rt_se)
0195         destroy_rt_bandwidth(&tg->rt_bandwidth);
0196 
0197 }
0198 
0199 void free_rt_sched_group(struct task_group *tg)
0200 {
0201     int i;
0202 
0203     for_each_possible_cpu(i) {
0204         if (tg->rt_rq)
0205             kfree(tg->rt_rq[i]);
0206         if (tg->rt_se)
0207             kfree(tg->rt_se[i]);
0208     }
0209 
0210     kfree(tg->rt_rq);
0211     kfree(tg->rt_se);
0212 }
0213 
0214 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
0215         struct sched_rt_entity *rt_se, int cpu,
0216         struct sched_rt_entity *parent)
0217 {
0218     struct rq *rq = cpu_rq(cpu);
0219 
0220     rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
0221     rt_rq->rt_nr_boosted = 0;
0222     rt_rq->rq = rq;
0223     rt_rq->tg = tg;
0224 
0225     tg->rt_rq[cpu] = rt_rq;
0226     tg->rt_se[cpu] = rt_se;
0227 
0228     if (!rt_se)
0229         return;
0230 
0231     if (!parent)
0232         rt_se->rt_rq = &rq->rt;
0233     else
0234         rt_se->rt_rq = parent->my_q;
0235 
0236     rt_se->my_q = rt_rq;
0237     rt_se->parent = parent;
0238     INIT_LIST_HEAD(&rt_se->run_list);
0239 }
0240 
0241 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
0242 {
0243     struct rt_rq *rt_rq;
0244     struct sched_rt_entity *rt_se;
0245     int i;
0246 
0247     tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
0248     if (!tg->rt_rq)
0249         goto err;
0250     tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
0251     if (!tg->rt_se)
0252         goto err;
0253 
0254     init_rt_bandwidth(&tg->rt_bandwidth,
0255             ktime_to_ns(def_rt_bandwidth.rt_period), 0);
0256 
0257     for_each_possible_cpu(i) {
0258         rt_rq = kzalloc_node(sizeof(struct rt_rq),
0259                      GFP_KERNEL, cpu_to_node(i));
0260         if (!rt_rq)
0261             goto err;
0262 
0263         rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
0264                      GFP_KERNEL, cpu_to_node(i));
0265         if (!rt_se)
0266             goto err_free_rq;
0267 
0268         init_rt_rq(rt_rq);
0269         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
0270         init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
0271     }
0272 
0273     return 1;
0274 
0275 err_free_rq:
0276     kfree(rt_rq);
0277 err:
0278     return 0;
0279 }
0280 
0281 #else /* CONFIG_RT_GROUP_SCHED */
0282 
0283 #define rt_entity_is_task(rt_se) (1)
0284 
0285 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
0286 {
0287     return container_of(rt_se, struct task_struct, rt);
0288 }
0289 
0290 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
0291 {
0292     return container_of(rt_rq, struct rq, rt);
0293 }
0294 
0295 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
0296 {
0297     struct task_struct *p = rt_task_of(rt_se);
0298 
0299     return task_rq(p);
0300 }
0301 
0302 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
0303 {
0304     struct rq *rq = rq_of_rt_se(rt_se);
0305 
0306     return &rq->rt;
0307 }
0308 
0309 void unregister_rt_sched_group(struct task_group *tg) { }
0310 
0311 void free_rt_sched_group(struct task_group *tg) { }
0312 
0313 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
0314 {
0315     return 1;
0316 }
0317 #endif /* CONFIG_RT_GROUP_SCHED */
0318 
0319 #ifdef CONFIG_SMP
0320 
0321 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
0322 {
0323     /* Try to pull RT tasks here if we lower this rq's prio */
0324     return rq->online && rq->rt.highest_prio.curr > prev->prio;
0325 }
0326 
0327 static inline int rt_overloaded(struct rq *rq)
0328 {
0329     return atomic_read(&rq->rd->rto_count);
0330 }
0331 
0332 static inline void rt_set_overload(struct rq *rq)
0333 {
0334     if (!rq->online)
0335         return;
0336 
0337     cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
0338     /*
0339      * Make sure the mask is visible before we set
0340      * the overload count. That is checked to determine
0341      * if we should look at the mask. It would be a shame
0342      * if we looked at the mask, but the mask was not
0343      * updated yet.
0344      *
0345      * Matched by the barrier in pull_rt_task().
0346      */
0347     smp_wmb();
0348     atomic_inc(&rq->rd->rto_count);
0349 }
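/*
 * A sketch of the pairing (the reader side lives in pull_rt_task(), which
 * is outside this excerpt, so treat its exact shape as an approximation):
 *
 *     writer (rt_set_overload)            reader (pull_rt_task)
 *     cpumask_set_cpu(cpu, rd->rto_mask); if (!atomic_read(&rd->rto_count))
 *     smp_wmb();                                  return;
 *     atomic_inc(&rd->rto_count);         smp_rmb();
 *                                         ... scan rd->rto_mask ...
 *
 * The write barrier guarantees that any CPU observing the incremented
 * rto_count also observes the bit already set in rto_mask.
 */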
0350 
0351 static inline void rt_clear_overload(struct rq *rq)
0352 {
0353     if (!rq->online)
0354         return;
0355 
0356     /* the order here really doesn't matter */
0357     atomic_dec(&rq->rd->rto_count);
0358     cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
0359 }
0360 
0361 static void update_rt_migration(struct rt_rq *rt_rq)
0362 {
0363     if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
0364         if (!rt_rq->overloaded) {
0365             rt_set_overload(rq_of_rt_rq(rt_rq));
0366             rt_rq->overloaded = 1;
0367         }
0368     } else if (rt_rq->overloaded) {
0369         rt_clear_overload(rq_of_rt_rq(rt_rq));
0370         rt_rq->overloaded = 0;
0371     }
0372 }
0373 
0374 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
0375 {
0376     struct task_struct *p;
0377 
0378     if (!rt_entity_is_task(rt_se))
0379         return;
0380 
0381     p = rt_task_of(rt_se);
0382     rt_rq = &rq_of_rt_rq(rt_rq)->rt;
0383 
0384     rt_rq->rt_nr_total++;
0385     if (p->nr_cpus_allowed > 1)
0386         rt_rq->rt_nr_migratory++;
0387 
0388     update_rt_migration(rt_rq);
0389 }
0390 
0391 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
0392 {
0393     struct task_struct *p;
0394 
0395     if (!rt_entity_is_task(rt_se))
0396         return;
0397 
0398     p = rt_task_of(rt_se);
0399     rt_rq = &rq_of_rt_rq(rt_rq)->rt;
0400 
0401     rt_rq->rt_nr_total--;
0402     if (p->nr_cpus_allowed > 1)
0403         rt_rq->rt_nr_migratory--;
0404 
0405     update_rt_migration(rt_rq);
0406 }
0407 
0408 static inline int has_pushable_tasks(struct rq *rq)
0409 {
0410     return !plist_head_empty(&rq->rt.pushable_tasks);
0411 }
0412 
0413 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
0414 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
0415 
0416 static void push_rt_tasks(struct rq *);
0417 static void pull_rt_task(struct rq *);
0418 
0419 static inline void rt_queue_push_tasks(struct rq *rq)
0420 {
0421     if (!has_pushable_tasks(rq))
0422         return;
0423 
0424     queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
0425 }
0426 
0427 static inline void rt_queue_pull_task(struct rq *rq)
0428 {
0429     queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
0430 }
0431 
0432 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
0433 {
0434     plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
0435     plist_node_init(&p->pushable_tasks, p->prio);
0436     plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
0437 
0438     /* Update the highest prio pushable task */
0439     if (p->prio < rq->rt.highest_prio.next)
0440         rq->rt.highest_prio.next = p->prio;
0441 }
0442 
0443 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
0444 {
0445     plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
0446 
0447     /* Update the new highest prio pushable task */
0448     if (has_pushable_tasks(rq)) {
0449         p = plist_first_entry(&rq->rt.pushable_tasks,
0450                       struct task_struct, pushable_tasks);
0451         rq->rt.highest_prio.next = p->prio;
0452     } else {
0453         rq->rt.highest_prio.next = MAX_RT_PRIO-1;
0454     }
0455 }
0456 
0457 #else
0458 
0459 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
0460 {
0461 }
0462 
0463 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
0464 {
0465 }
0466 
0467 static inline
0468 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
0469 {
0470 }
0471 
0472 static inline
0473 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
0474 {
0475 }
0476 
0477 static inline void rt_queue_push_tasks(struct rq *rq)
0478 {
0479 }
0480 #endif /* CONFIG_SMP */
0481 
0482 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
0483 static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
0484 
0485 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
0486 {
0487     return rt_se->on_rq;
0488 }
0489 
0490 #ifdef CONFIG_UCLAMP_TASK
0491 /*
0492  * Verify the fitness of task @p to run on @cpu taking into account the uclamp
0493  * settings.
0494  *
0495  * This check is only important for heterogeneous systems where the uclamp_min
0496  * value is higher than the capacity of a @cpu. For non-heterogeneous systems
0497  * this function will always return true.
0498  *
0499  * The function will return true if the capacity of the @cpu is >= the
0500  * uclamp_min and false otherwise.
0501  *
0502  * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
0503  * > uclamp_max.
0504  */
0505 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
0506 {
0507     unsigned int min_cap;
0508     unsigned int max_cap;
0509     unsigned int cpu_cap;
0510 
0511     /* Only heterogeneous systems can benefit from this check */
0512     if (!static_branch_unlikely(&sched_asym_cpucapacity))
0513         return true;
0514 
0515     min_cap = uclamp_eff_value(p, UCLAMP_MIN);
0516     max_cap = uclamp_eff_value(p, UCLAMP_MAX);
0517 
0518     cpu_cap = capacity_orig_of(cpu);
0519 
0520     return cpu_cap >= min(min_cap, max_cap);
0521 }
0522 #else
0523 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
0524 {
0525     return true;
0526 }
0527 #endif
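/*
 * Worked example of rt_task_fits_capacity() (numbers are illustrative):
 * on an asymmetric system with a little CPU of capacity 446 and a big CPU
 * of capacity 1024, an RT task with uclamp_min = 512 and uclamp_max = 1024
 * yields min(min_cap, max_cap) = 512, so the task fits the big CPU but not
 * the little one.  If uclamp_min were misconfigured above uclamp_max, say
 * 768 vs 512, the min() clamps the effective requirement down to 512.
 */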
0528 
0529 #ifdef CONFIG_RT_GROUP_SCHED
0530 
0531 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
0532 {
0533     if (!rt_rq->tg)
0534         return RUNTIME_INF;
0535 
0536     return rt_rq->rt_runtime;
0537 }
0538 
0539 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
0540 {
0541     return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
0542 }
0543 
0544 typedef struct task_group *rt_rq_iter_t;
0545 
0546 static inline struct task_group *next_task_group(struct task_group *tg)
0547 {
0548     do {
0549         tg = list_entry_rcu(tg->list.next,
0550             typeof(struct task_group), list);
0551     } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
0552 
0553     if (&tg->list == &task_groups)
0554         tg = NULL;
0555 
0556     return tg;
0557 }
0558 
0559 #define for_each_rt_rq(rt_rq, iter, rq)                 \
0560     for (iter = container_of(&task_groups, typeof(*iter), list);    \
0561         (iter = next_task_group(iter)) &&           \
0562         (rt_rq = iter->rt_rq[cpu_of(rq)]);)
0563 
0564 #define for_each_sched_rt_entity(rt_se) \
0565     for (; rt_se; rt_se = rt_se->parent)
0566 
0567 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
0568 {
0569     return rt_se->my_q;
0570 }
0571 
0572 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
0573 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
0574 
0575 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
0576 {
0577     struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
0578     struct rq *rq = rq_of_rt_rq(rt_rq);
0579     struct sched_rt_entity *rt_se;
0580 
0581     int cpu = cpu_of(rq);
0582 
0583     rt_se = rt_rq->tg->rt_se[cpu];
0584 
0585     if (rt_rq->rt_nr_running) {
0586         if (!rt_se)
0587             enqueue_top_rt_rq(rt_rq);
0588         else if (!on_rt_rq(rt_se))
0589             enqueue_rt_entity(rt_se, 0);
0590 
0591         if (rt_rq->highest_prio.curr < curr->prio)
0592             resched_curr(rq);
0593     }
0594 }
0595 
0596 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
0597 {
0598     struct sched_rt_entity *rt_se;
0599     int cpu = cpu_of(rq_of_rt_rq(rt_rq));
0600 
0601     rt_se = rt_rq->tg->rt_se[cpu];
0602 
0603     if (!rt_se) {
0604         dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
0605         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
0606         cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
0607     }
0608     else if (on_rt_rq(rt_se))
0609         dequeue_rt_entity(rt_se, 0);
0610 }
0611 
0612 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
0613 {
0614     return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
0615 }
0616 
0617 static int rt_se_boosted(struct sched_rt_entity *rt_se)
0618 {
0619     struct rt_rq *rt_rq = group_rt_rq(rt_se);
0620     struct task_struct *p;
0621 
0622     if (rt_rq)
0623         return !!rt_rq->rt_nr_boosted;
0624 
0625     p = rt_task_of(rt_se);
0626     return p->prio != p->normal_prio;
0627 }
0628 
0629 #ifdef CONFIG_SMP
0630 static inline const struct cpumask *sched_rt_period_mask(void)
0631 {
0632     return this_rq()->rd->span;
0633 }
0634 #else
0635 static inline const struct cpumask *sched_rt_period_mask(void)
0636 {
0637     return cpu_online_mask;
0638 }
0639 #endif
0640 
0641 static inline
0642 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
0643 {
0644     return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
0645 }
0646 
0647 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
0648 {
0649     return &rt_rq->tg->rt_bandwidth;
0650 }
0651 
0652 #else /* !CONFIG_RT_GROUP_SCHED */
0653 
0654 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
0655 {
0656     return rt_rq->rt_runtime;
0657 }
0658 
0659 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
0660 {
0661     return ktime_to_ns(def_rt_bandwidth.rt_period);
0662 }
0663 
0664 typedef struct rt_rq *rt_rq_iter_t;
0665 
0666 #define for_each_rt_rq(rt_rq, iter, rq) \
0667     for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
0668 
0669 #define for_each_sched_rt_entity(rt_se) \
0670     for (; rt_se; rt_se = NULL)
0671 
0672 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
0673 {
0674     return NULL;
0675 }
0676 
0677 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
0678 {
0679     struct rq *rq = rq_of_rt_rq(rt_rq);
0680 
0681     if (!rt_rq->rt_nr_running)
0682         return;
0683 
0684     enqueue_top_rt_rq(rt_rq);
0685     resched_curr(rq);
0686 }
0687 
0688 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
0689 {
0690     dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
0691 }
0692 
0693 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
0694 {
0695     return rt_rq->rt_throttled;
0696 }
0697 
0698 static inline const struct cpumask *sched_rt_period_mask(void)
0699 {
0700     return cpu_online_mask;
0701 }
0702 
0703 static inline
0704 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
0705 {
0706     return &cpu_rq(cpu)->rt;
0707 }
0708 
0709 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
0710 {
0711     return &def_rt_bandwidth;
0712 }
0713 
0714 #endif /* CONFIG_RT_GROUP_SCHED */
0715 
0716 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
0717 {
0718     struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
0719 
0720     return (hrtimer_active(&rt_b->rt_period_timer) ||
0721         rt_rq->rt_time < rt_b->rt_runtime);
0722 }
0723 
0724 #ifdef CONFIG_SMP
0725 /*
0726  * We ran out of runtime, see if we can borrow some from our neighbours.
0727  */
0728 static void do_balance_runtime(struct rt_rq *rt_rq)
0729 {
0730     struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
0731     struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
0732     int i, weight;
0733     u64 rt_period;
0734 
0735     weight = cpumask_weight(rd->span);
0736 
0737     raw_spin_lock(&rt_b->rt_runtime_lock);
0738     rt_period = ktime_to_ns(rt_b->rt_period);
0739     for_each_cpu(i, rd->span) {
0740         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
0741         s64 diff;
0742 
0743         if (iter == rt_rq)
0744             continue;
0745 
0746         raw_spin_lock(&iter->rt_runtime_lock);
0747         /*
0748          * Either all rqs have inf runtime and there's nothing to steal
0749          * or __disable_runtime() below sets a specific rq to inf to
0750          * indicate it's been disabled and disallow stealing.
0751          */
0752         if (iter->rt_runtime == RUNTIME_INF)
0753             goto next;
0754 
0755         /*
0756          * From runqueues with spare time, take 1/n part of their
0757          * spare time, but no more than our period.
0758          */
0759         diff = iter->rt_runtime - iter->rt_time;
0760         if (diff > 0) {
0761             diff = div_u64((u64)diff, weight);
0762             if (rt_rq->rt_runtime + diff > rt_period)
0763                 diff = rt_period - rt_rq->rt_runtime;
0764             iter->rt_runtime -= diff;
0765             rt_rq->rt_runtime += diff;
0766             if (rt_rq->rt_runtime == rt_period) {
0767                 raw_spin_unlock(&iter->rt_runtime_lock);
0768                 break;
0769             }
0770         }
0771 next:
0772         raw_spin_unlock(&iter->rt_runtime_lock);
0773     }
0774     raw_spin_unlock(&rt_b->rt_runtime_lock);
0775 }
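/*
 * Worked example for the borrowing above, assuming the default 950000 us
 * runtime and 1000000 us period on a 4-CPU root domain (weight == 4): if a
 * neighbour has consumed only 150000 us, its spare time is 800000 us and
 * 1/4 of that is 200000 us; however, since our rt_runtime may never exceed
 * the period, the transfer is capped at 1000000 - 950000 = 50000 us, at
 * which point the loop terminates early.
 */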
0776 
0777 /*
0778  * Ensure this RQ takes back all the runtime it lent to its neighbours.
0779  */
0780 static void __disable_runtime(struct rq *rq)
0781 {
0782     struct root_domain *rd = rq->rd;
0783     rt_rq_iter_t iter;
0784     struct rt_rq *rt_rq;
0785 
0786     if (unlikely(!scheduler_running))
0787         return;
0788 
0789     for_each_rt_rq(rt_rq, iter, rq) {
0790         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
0791         s64 want;
0792         int i;
0793 
0794         raw_spin_lock(&rt_b->rt_runtime_lock);
0795         raw_spin_lock(&rt_rq->rt_runtime_lock);
0796         /*
0797          * Either we're all inf and nobody needs to borrow, or we're
0798          * already disabled and thus have nothing to do, or we have
0799          * exactly the right amount of runtime to take out.
0800          */
0801         if (rt_rq->rt_runtime == RUNTIME_INF ||
0802                 rt_rq->rt_runtime == rt_b->rt_runtime)
0803             goto balanced;
0804         raw_spin_unlock(&rt_rq->rt_runtime_lock);
0805 
0806         /*
0807          * Calculate the difference between what we started out with
0808          * and what we currently have; that's the amount of runtime
0809          * we lent out and now have to reclaim.
0810          */
0811         want = rt_b->rt_runtime - rt_rq->rt_runtime;
0812 
0813         /*
0814          * Greedy reclaim, take back as much as we can.
0815          */
0816         for_each_cpu(i, rd->span) {
0817             struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
0818             s64 diff;
0819 
0820             /*
0821              * Can't reclaim from ourselves or disabled runqueues.
0822              */
0823             if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
0824                 continue;
0825 
0826             raw_spin_lock(&iter->rt_runtime_lock);
0827             if (want > 0) {
0828                 diff = min_t(s64, iter->rt_runtime, want);
0829                 iter->rt_runtime -= diff;
0830                 want -= diff;
0831             } else {
0832                 iter->rt_runtime -= want;
0833                 want -= want;
0834             }
0835             raw_spin_unlock(&iter->rt_runtime_lock);
0836 
0837             if (!want)
0838                 break;
0839         }
0840 
0841         raw_spin_lock(&rt_rq->rt_runtime_lock);
0842         /*
0843          * We cannot be left wanting - that would mean some runtime
0844          * leaked out of the system.
0845          */
0846         BUG_ON(want);
0847 balanced:
0848         /*
0849          * Disable all the borrow logic by pretending we have inf
0850          * runtime - in which case borrowing doesn't make sense.
0851          */
0852         rt_rq->rt_runtime = RUNTIME_INF;
0853         rt_rq->rt_throttled = 0;
0854         raw_spin_unlock(&rt_rq->rt_runtime_lock);
0855         raw_spin_unlock(&rt_b->rt_runtime_lock);
0856 
0857         /* Make rt_rq available for pick_next_task() */
0858         sched_rt_rq_enqueue(rt_rq);
0859     }
0860 }
0861 
0862 static void __enable_runtime(struct rq *rq)
0863 {
0864     rt_rq_iter_t iter;
0865     struct rt_rq *rt_rq;
0866 
0867     if (unlikely(!scheduler_running))
0868         return;
0869 
0870     /*
0871      * Reset each runqueue's bandwidth settings
0872      */
0873     for_each_rt_rq(rt_rq, iter, rq) {
0874         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
0875 
0876         raw_spin_lock(&rt_b->rt_runtime_lock);
0877         raw_spin_lock(&rt_rq->rt_runtime_lock);
0878         rt_rq->rt_runtime = rt_b->rt_runtime;
0879         rt_rq->rt_time = 0;
0880         rt_rq->rt_throttled = 0;
0881         raw_spin_unlock(&rt_rq->rt_runtime_lock);
0882         raw_spin_unlock(&rt_b->rt_runtime_lock);
0883     }
0884 }
0885 
0886 static void balance_runtime(struct rt_rq *rt_rq)
0887 {
0888     if (!sched_feat(RT_RUNTIME_SHARE))
0889         return;
0890 
0891     if (rt_rq->rt_time > rt_rq->rt_runtime) {
0892         raw_spin_unlock(&rt_rq->rt_runtime_lock);
0893         do_balance_runtime(rt_rq);
0894         raw_spin_lock(&rt_rq->rt_runtime_lock);
0895     }
0896 }
0897 #else /* !CONFIG_SMP */
0898 static inline void balance_runtime(struct rt_rq *rt_rq) {}
0899 #endif /* CONFIG_SMP */
0900 
0901 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
0902 {
0903     int i, idle = 1, throttled = 0;
0904     const struct cpumask *span;
0905 
0906     span = sched_rt_period_mask();
0907 #ifdef CONFIG_RT_GROUP_SCHED
0908     /*
0909      * FIXME: isolated CPUs should really leave the root task group,
0910      * whether they are isolcpus or were isolated via cpusets, lest
0911      * the timer run on a CPU which does not service all runqueues,
0912      * potentially leaving other CPUs indefinitely throttled.  If
0913      * isolation is really required, the user will turn the throttle
0914      * off to kill the perturbations it causes anyway.  Meanwhile,
0915      * this maintains functionality for boot and/or troubleshooting.
0916      */
0917     if (rt_b == &root_task_group.rt_bandwidth)
0918         span = cpu_online_mask;
0919 #endif
0920     for_each_cpu(i, span) {
0921         int enqueue = 0;
0922         struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
0923         struct rq *rq = rq_of_rt_rq(rt_rq);
0924         struct rq_flags rf;
0925         int skip;
0926 
0927         /*
0928          * When span == cpu_online_mask, taking each rq->lock
0929          * can be time-consuming. Try to avoid it when possible.
0930          */
0931         raw_spin_lock(&rt_rq->rt_runtime_lock);
0932         if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
0933             rt_rq->rt_runtime = rt_b->rt_runtime;
0934         skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
0935         raw_spin_unlock(&rt_rq->rt_runtime_lock);
0936         if (skip)
0937             continue;
0938 
0939         rq_lock(rq, &rf);
0940         update_rq_clock(rq);
0941 
0942         if (rt_rq->rt_time) {
0943             u64 runtime;
0944 
0945             raw_spin_lock(&rt_rq->rt_runtime_lock);
0946             if (rt_rq->rt_throttled)
0947                 balance_runtime(rt_rq);
0948             runtime = rt_rq->rt_runtime;
0949             rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
0950             if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
0951                 rt_rq->rt_throttled = 0;
0952                 enqueue = 1;
0953 
0954                 /*
0955                  * When we're idle and a woken (rt) task is
0956                  * throttled check_preempt_curr() will set
0957                  * skip_update and the time between the wakeup
0958                  * and this unthrottle will get accounted as
0959                  * 'runtime'.
0960                  */
0961                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
0962                     rq_clock_cancel_skipupdate(rq);
0963             }
0964             if (rt_rq->rt_time || rt_rq->rt_nr_running)
0965                 idle = 0;
0966             raw_spin_unlock(&rt_rq->rt_runtime_lock);
0967         } else if (rt_rq->rt_nr_running) {
0968             idle = 0;
0969             if (!rt_rq_throttled(rt_rq))
0970                 enqueue = 1;
0971         }
0972         if (rt_rq->rt_throttled)
0973             throttled = 1;
0974 
0975         if (enqueue)
0976             sched_rt_rq_enqueue(rt_rq);
0977         rq_unlock(rq, &rf);
0978     }
0979 
0980     if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
0981         return 1;
0982 
0983     return idle;
0984 }
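/*
 * Concretely, with the 950000 us / 1000000 us defaults: an rt_rq that was
 * throttled after accumulating rt_time = 1000000 us is handled above as
 * rt_time -= min(rt_time, overrun * runtime); with overrun == 1 that
 * leaves 50000 us, which is below the runtime, so rt_throttled is cleared
 * and the queue is re-enqueued via sched_rt_rq_enqueue().  overrun is the
 * number of whole periods the timer has been forwarded over, so a late
 * timer replenishes several periods' worth of runtime in one call.
 */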
0985 
0986 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
0987 {
0988 #ifdef CONFIG_RT_GROUP_SCHED
0989     struct rt_rq *rt_rq = group_rt_rq(rt_se);
0990 
0991     if (rt_rq)
0992         return rt_rq->highest_prio.curr;
0993 #endif
0994 
0995     return rt_task_of(rt_se)->prio;
0996 }
0997 
0998 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
0999 {
1000     u64 runtime = sched_rt_runtime(rt_rq);
1001 
1002     if (rt_rq->rt_throttled)
1003         return rt_rq_throttled(rt_rq);
1004 
1005     if (runtime >= sched_rt_period(rt_rq))
1006         return 0;
1007 
1008     balance_runtime(rt_rq);
1009     runtime = sched_rt_runtime(rt_rq);
1010     if (runtime == RUNTIME_INF)
1011         return 0;
1012 
1013     if (rt_rq->rt_time > runtime) {
1014         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1015 
1016         /*
1017          * Don't actually throttle groups that have no runtime assigned
1018          * but accrue some time due to boosting.
1019          */
1020         if (likely(rt_b->rt_runtime)) {
1021             rt_rq->rt_throttled = 1;
1022             printk_deferred_once("sched: RT throttling activated\n");
1023         } else {
1024              * In case we did anyway, make it go away;
1025              * In case we did anyway, make it go away,
1026              * replenishment is a joke, since it will replenish us
1027              * with exactly 0 ns.
1028              */
1029             rt_rq->rt_time = 0;
1030         }
1031 
1032         if (rt_rq_throttled(rt_rq)) {
1033             sched_rt_rq_dequeue(rt_rq);
1034             return 1;
1035         }
1036     }
1037 
1038     return 0;
1039 }
1040 
1041 /*
1042  * Update the current task's runtime statistics. Skip current tasks that
1043  * are not in our scheduling class.
1044  */
1045 static void update_curr_rt(struct rq *rq)
1046 {
1047     struct task_struct *curr = rq->curr;
1048     struct sched_rt_entity *rt_se = &curr->rt;
1049     u64 delta_exec;
1050     u64 now;
1051 
1052     if (curr->sched_class != &rt_sched_class)
1053         return;
1054 
1055     now = rq_clock_task(rq);
1056     delta_exec = now - curr->se.exec_start;
1057     if (unlikely((s64)delta_exec <= 0))
1058         return;
1059 
1060     schedstat_set(curr->stats.exec_max,
1061               max(curr->stats.exec_max, delta_exec));
1062 
1063     trace_sched_stat_runtime(curr, delta_exec, 0);
1064 
1065     curr->se.sum_exec_runtime += delta_exec;
1066     account_group_exec_runtime(curr, delta_exec);
1067 
1068     curr->se.exec_start = now;
1069     cgroup_account_cputime(curr, delta_exec);
1070 
1071     if (!rt_bandwidth_enabled())
1072         return;
1073 
1074     for_each_sched_rt_entity(rt_se) {
1075         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1076         int exceeded;
1077 
1078         if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1079             raw_spin_lock(&rt_rq->rt_runtime_lock);
1080             rt_rq->rt_time += delta_exec;
1081             exceeded = sched_rt_runtime_exceeded(rt_rq);
1082             if (exceeded)
1083                 resched_curr(rq);
1084             raw_spin_unlock(&rt_rq->rt_runtime_lock);
1085             if (exceeded)
1086                 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1087         }
1088     }
1089 }
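/*
 * Note on the loop above: for_each_sched_rt_entity() walks from the task's
 * own sched_rt_entity up through its ->parent chain (one entity per task
 * group level with CONFIG_RT_GROUP_SCHED, just the task itself without it),
 * so the same delta_exec is charged to every rt_rq in the hierarchy and
 * each level can trip sched_rt_runtime_exceeded() and throttle on its own.
 */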
1090 
1091 static void
1092 dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1093 {
1094     struct rq *rq = rq_of_rt_rq(rt_rq);
1095 
1096     BUG_ON(&rq->rt != rt_rq);
1097 
1098     if (!rt_rq->rt_queued)
1099         return;
1100 
1101     BUG_ON(!rq->nr_running);
1102 
1103     sub_nr_running(rq, count);
1104     rt_rq->rt_queued = 0;
1105 
1106 }
1107 
1108 static void
1109 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1110 {
1111     struct rq *rq = rq_of_rt_rq(rt_rq);
1112 
1113     BUG_ON(&rq->rt != rt_rq);
1114 
1115     if (rt_rq->rt_queued)
1116         return;
1117 
1118     if (rt_rq_throttled(rt_rq))
1119         return;
1120 
1121     if (rt_rq->rt_nr_running) {
1122         add_nr_running(rq, rt_rq->rt_nr_running);
1123         rt_rq->rt_queued = 1;
1124     }
1125 
1126     /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1127     cpufreq_update_util(rq, 0);
1128 }
1129 
1130 #if defined CONFIG_SMP
1131 
1132 static void
1133 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1134 {
1135     struct rq *rq = rq_of_rt_rq(rt_rq);
1136 
1137 #ifdef CONFIG_RT_GROUP_SCHED
1138     /*
1139      * Change rq's cpupri only if rt_rq is the top queue.
1140      */
1141     if (&rq->rt != rt_rq)
1142         return;
1143 #endif
1144     if (rq->online && prio < prev_prio)
1145         cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1146 }
1147 
1148 static void
1149 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1150 {
1151     struct rq *rq = rq_of_rt_rq(rt_rq);
1152 
1153 #ifdef CONFIG_RT_GROUP_SCHED
1154     /*
1155      * Change rq's cpupri only if rt_rq is the top queue.
1156      */
1157     if (&rq->rt != rt_rq)
1158         return;
1159 #endif
1160     if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1161         cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1162 }
1163 
1164 #else /* CONFIG_SMP */
1165 
1166 static inline
1167 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1168 static inline
1169 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1170 
1171 #endif /* CONFIG_SMP */
1172 
1173 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1174 static void
1175 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1176 {
1177     int prev_prio = rt_rq->highest_prio.curr;
1178 
1179     if (prio < prev_prio)
1180         rt_rq->highest_prio.curr = prio;
1181 
1182     inc_rt_prio_smp(rt_rq, prio, prev_prio);
1183 }
1184 
1185 static void
1186 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1187 {
1188     int prev_prio = rt_rq->highest_prio.curr;
1189 
1190     if (rt_rq->rt_nr_running) {
1191 
1192         WARN_ON(prio < prev_prio);
1193 
1194         /*
1195          * This may have been our highest task, and therefore
1196          * we may have some recomputation to do
1197          */
1198         if (prio == prev_prio) {
1199             struct rt_prio_array *array = &rt_rq->active;
1200 
1201             rt_rq->highest_prio.curr =
1202                 sched_find_first_bit(array->bitmap);
1203         }
1204 
1205     } else {
1206         rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1207     }
1208 
1209     dec_rt_prio_smp(rt_rq, prio, prev_prio);
1210 }
1211 
1212 #else
1213 
1214 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1215 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1216 
1217 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1218 
1219 #ifdef CONFIG_RT_GROUP_SCHED
1220 
1221 static void
1222 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1223 {
1224     if (rt_se_boosted(rt_se))
1225         rt_rq->rt_nr_boosted++;
1226 
1227     if (rt_rq->tg)
1228         start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1229 }
1230 
1231 static void
1232 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1233 {
1234     if (rt_se_boosted(rt_se))
1235         rt_rq->rt_nr_boosted--;
1236 
1237     WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1238 }
1239 
1240 #else /* CONFIG_RT_GROUP_SCHED */
1241 
1242 static void
1243 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1244 {
1245     start_rt_bandwidth(&def_rt_bandwidth);
1246 }
1247 
1248 static inline
1249 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1250 
1251 #endif /* CONFIG_RT_GROUP_SCHED */
1252 
1253 static inline
1254 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1255 {
1256     struct rt_rq *group_rq = group_rt_rq(rt_se);
1257 
1258     if (group_rq)
1259         return group_rq->rt_nr_running;
1260     else
1261         return 1;
1262 }
1263 
1264 static inline
1265 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1266 {
1267     struct rt_rq *group_rq = group_rt_rq(rt_se);
1268     struct task_struct *tsk;
1269 
1270     if (group_rq)
1271         return group_rq->rr_nr_running;
1272 
1273     tsk = rt_task_of(rt_se);
1274 
1275     return (tsk->policy == SCHED_RR) ? 1 : 0;
1276 }
1277 
1278 static inline
1279 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1280 {
1281     int prio = rt_se_prio(rt_se);
1282 
1283     WARN_ON(!rt_prio(prio));
1284     rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1285     rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1286 
1287     inc_rt_prio(rt_rq, prio);
1288     inc_rt_migration(rt_se, rt_rq);
1289     inc_rt_group(rt_se, rt_rq);
1290 }
1291 
1292 static inline
1293 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1294 {
1295     WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1296     WARN_ON(!rt_rq->rt_nr_running);
1297     rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1298     rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1299 
1300     dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1301     dec_rt_migration(rt_se, rt_rq);
1302     dec_rt_group(rt_se, rt_rq);
1303 }
1304 
1305 /*
1306  * Change rt_se->run_list location unless SAVE && !MOVE
1307  *
1308  * assumes ENQUEUE/DEQUEUE flags match
1309  */
1310 static inline bool move_entity(unsigned int flags)
1311 {
1312     if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1313         return false;
1314 
1315     return true;
1316 }
1317 
1318 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1319 {
1320     list_del_init(&rt_se->run_list);
1321 
1322     if (list_empty(array->queue + rt_se_prio(rt_se)))
1323         __clear_bit(rt_se_prio(rt_se), array->bitmap);
1324 
1325     rt_se->on_list = 0;
1326 }
1327 
1328 static inline struct sched_statistics *
1329 __schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1330 {
1331 #ifdef CONFIG_RT_GROUP_SCHED
1332     /* schedstats is not supported for rt group. */
1333     if (!rt_entity_is_task(rt_se))
1334         return NULL;
1335 #endif
1336 
1337     return &rt_task_of(rt_se)->stats;
1338 }
1339 
1340 static inline void
1341 update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1342 {
1343     struct sched_statistics *stats;
1344     struct task_struct *p = NULL;
1345 
1346     if (!schedstat_enabled())
1347         return;
1348 
1349     if (rt_entity_is_task(rt_se))
1350         p = rt_task_of(rt_se);
1351 
1352     stats = __schedstats_from_rt_se(rt_se);
1353     if (!stats)
1354         return;
1355 
1356     __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
1357 }
1358 
1359 static inline void
1360 update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1361 {
1362     struct sched_statistics *stats;
1363     struct task_struct *p = NULL;
1364 
1365     if (!schedstat_enabled())
1366         return;
1367 
1368     if (rt_entity_is_task(rt_se))
1369         p = rt_task_of(rt_se);
1370 
1371     stats = __schedstats_from_rt_se(rt_se);
1372     if (!stats)
1373         return;
1374 
1375     __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
1376 }
1377 
1378 static inline void
1379 update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1380             int flags)
1381 {
1382     if (!schedstat_enabled())
1383         return;
1384 
1385     if (flags & ENQUEUE_WAKEUP)
1386         update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1387 }
1388 
1389 static inline void
1390 update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1391 {
1392     struct sched_statistics *stats;
1393     struct task_struct *p = NULL;
1394 
1395     if (!schedstat_enabled())
1396         return;
1397 
1398     if (rt_entity_is_task(rt_se))
1399         p = rt_task_of(rt_se);
1400 
1401     stats = __schedstats_from_rt_se(rt_se);
1402     if (!stats)
1403         return;
1404 
1405     __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
1406 }
1407 
1408 static inline void
1409 update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1410             int flags)
1411 {
1412     struct task_struct *p = NULL;
1413 
1414     if (!schedstat_enabled())
1415         return;
1416 
1417     if (rt_entity_is_task(rt_se))
1418         p = rt_task_of(rt_se);
1419 
1420     if ((flags & DEQUEUE_SLEEP) && p) {
1421         unsigned int state;
1422 
1423         state = READ_ONCE(p->__state);
1424         if (state & TASK_INTERRUPTIBLE)
1425             __schedstat_set(p->stats.sleep_start,
1426                     rq_clock(rq_of_rt_rq(rt_rq)));
1427 
1428         if (state & TASK_UNINTERRUPTIBLE)
1429             __schedstat_set(p->stats.block_start,
1430                     rq_clock(rq_of_rt_rq(rt_rq)));
1431     }
1432 }
1433 
1434 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1435 {
1436     struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1437     struct rt_prio_array *array = &rt_rq->active;
1438     struct rt_rq *group_rq = group_rt_rq(rt_se);
1439     struct list_head *queue = array->queue + rt_se_prio(rt_se);
1440 
1441     /*
1442      * Don't enqueue the group if it's throttled, or when empty.
1443      * The latter is a consequence of the former when a child group
1444      * gets throttled and the current group doesn't have any other
1445      * active members.
1446      */
1447     if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1448         if (rt_se->on_list)
1449             __delist_rt_entity(rt_se, array);
1450         return;
1451     }
1452 
1453     if (move_entity(flags)) {
1454         WARN_ON_ONCE(rt_se->on_list);
1455         if (flags & ENQUEUE_HEAD)
1456             list_add(&rt_se->run_list, queue);
1457         else
1458             list_add_tail(&rt_se->run_list, queue);
1459 
1460         __set_bit(rt_se_prio(rt_se), array->bitmap);
1461         rt_se->on_list = 1;
1462     }
1463     rt_se->on_rq = 1;
1464 
1465     inc_rt_tasks(rt_se, rt_rq);
1466 }
1467 
1468 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1469 {
1470     struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1471     struct rt_prio_array *array = &rt_rq->active;
1472 
1473     if (move_entity(flags)) {
1474         WARN_ON_ONCE(!rt_se->on_list);
1475         __delist_rt_entity(rt_se, array);
1476     }
1477     rt_se->on_rq = 0;
1478 
1479     dec_rt_tasks(rt_se, rt_rq);
1480 }
1481 
1482 /*
1483  * Because the prio of an upper entry depends on the lower
1484  * entries, we must remove entries top-down.
1485  */
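/*
 * The first loop below links the entities leaf-to-root through ->back
 * ('back' ends up as the topmost entity and each ->back points one level
 * toward the task), and the second loop walks that chain so the dequeue
 * happens from the top-level group down to the task entity itself.
 */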
1486 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1487 {
1488     struct sched_rt_entity *back = NULL;
1489     unsigned int rt_nr_running;
1490 
1491     for_each_sched_rt_entity(rt_se) {
1492         rt_se->back = back;
1493         back = rt_se;
1494     }
1495 
1496     rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1497 
1498     for (rt_se = back; rt_se; rt_se = rt_se->back) {
1499         if (on_rt_rq(rt_se))
1500             __dequeue_rt_entity(rt_se, flags);
1501     }
1502 
1503     dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1504 }
1505 
1506 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1507 {
1508     struct rq *rq = rq_of_rt_se(rt_se);
1509 
1510     update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1511 
1512     dequeue_rt_stack(rt_se, flags);
1513     for_each_sched_rt_entity(rt_se)
1514         __enqueue_rt_entity(rt_se, flags);
1515     enqueue_top_rt_rq(&rq->rt);
1516 }
1517 
1518 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1519 {
1520     struct rq *rq = rq_of_rt_se(rt_se);
1521 
1522     update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1523 
1524     dequeue_rt_stack(rt_se, flags);
1525 
1526     for_each_sched_rt_entity(rt_se) {
1527         struct rt_rq *rt_rq = group_rt_rq(rt_se);
1528 
1529         if (rt_rq && rt_rq->rt_nr_running)
1530             __enqueue_rt_entity(rt_se, flags);
1531     }
1532     enqueue_top_rt_rq(&rq->rt);
1533 }
1534 
1535 /*
1536  * Adding/removing a task to/from a priority array:
1537  */
1538 static void
1539 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1540 {
1541     struct sched_rt_entity *rt_se = &p->rt;
1542 
1543     if (flags & ENQUEUE_WAKEUP)
1544         rt_se->timeout = 0;
1545 
1546     check_schedstat_required();
1547     update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
1548 
1549     enqueue_rt_entity(rt_se, flags);
1550 
1551     if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1552         enqueue_pushable_task(rq, p);
1553 }
1554 
1555 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1556 {
1557     struct sched_rt_entity *rt_se = &p->rt;
1558 
1559     update_curr_rt(rq);
1560     dequeue_rt_entity(rt_se, flags);
1561 
1562     dequeue_pushable_task(rq, p);
1563 }
1564 
1565 /*
1566  * Put task to the head or the end of the run list without the overhead of
1567  * dequeue followed by enqueue.
1568  */
1569 static void
1570 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1571 {
1572     if (on_rt_rq(rt_se)) {
1573         struct rt_prio_array *array = &rt_rq->active;
1574         struct list_head *queue = array->queue + rt_se_prio(rt_se);
1575 
1576         if (head)
1577             list_move(&rt_se->run_list, queue);
1578         else
1579             list_move_tail(&rt_se->run_list, queue);
1580     }
1581 }
1582 
1583 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1584 {
1585     struct sched_rt_entity *rt_se = &p->rt;
1586     struct rt_rq *rt_rq;
1587 
1588     for_each_sched_rt_entity(rt_se) {
1589         rt_rq = rt_rq_of_se(rt_se);
1590         requeue_rt_entity(rt_rq, rt_se, head);
1591     }
1592 }
1593 
1594 static void yield_task_rt(struct rq *rq)
1595 {
1596     requeue_task_rt(rq, rq->curr, 0);
1597 }
1598 
1599 #ifdef CONFIG_SMP
1600 static int find_lowest_rq(struct task_struct *task);
1601 
1602 static int
1603 select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1604 {
1605     struct task_struct *curr;
1606     struct rq *rq;
1607     bool test;
1608 
1609     /* For anything but wake ups, just return the task_cpu */
1610     if (!(flags & (WF_TTWU | WF_FORK)))
1611         goto out;
1612 
1613     rq = cpu_rq(cpu);
1614 
1615     rcu_read_lock();
1616     curr = READ_ONCE(rq->curr); /* unlocked access */
1617 
1618     /*
1619      * If the current task on @p's runqueue is an RT task, then
1620      * try to see if we can wake this RT task up on another
1621      * runqueue. Otherwise simply start this RT task
1622      * on its current runqueue.
1623      *
1624      * We want to avoid overloading runqueues. If the woken
1625      * task is a higher priority, then it will stay on this CPU
1626      * and the lower prio task should be moved to another CPU.
1627      * Even though this will probably make the lower prio task
1628      * lose its cache, we do not want to bounce a higher-priority task
1629      * around just because it gave up its CPU, perhaps for a
1630      * lock?
1631      *
1632      * For equal prio tasks, we just let the scheduler sort it out.
1633      *
1634      * Otherwise, just let it ride on the affined RQ and the
1635      * post-schedule router will push the preempted task away
1636      *
1637      * This test is optimistic, if we get it wrong the load-balancer
1638      * will have to sort it out.
1639      *
1640      * We take into account the capacity of the CPU to ensure it fits the
1641      * requirement of the task - which is only important on heterogeneous
1642      * systems like big.LITTLE.
1643      */
1644     test = curr &&
1645            unlikely(rt_task(curr)) &&
1646            (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1647 
1648     if (test || !rt_task_fits_capacity(p, cpu)) {
1649         int target = find_lowest_rq(p);
1650 
1651         /*
1652          * Bail out if we were forcing a migration to find a better
1653          * fitting CPU but our search failed.
1654          */
1655         if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1656             goto out_unlock;
1657 
1658         /*
1659          * Don't bother moving it if the destination CPU is
1660          * not running a lower priority task.
1661          */
1662         if (target != -1 &&
1663             p->prio < cpu_rq(target)->rt.highest_prio.curr)
1664             cpu = target;
1665     }
1666 
1667 out_unlock:
1668     rcu_read_unlock();
1669 
1670 out:
1671     return cpu;
1672 }
1673 
1674 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1675 {
1676     /*
1677      * Current can't be migrated, useless to reschedule,
1678      * let's hope p can move out.
1679      */
1680     if (rq->curr->nr_cpus_allowed == 1 ||
1681         !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1682         return;
1683 
1684     /*
1685      * p is migratable, so let's not schedule it and
1686      * see if it is pushed or pulled somewhere else.
1687      */
1688     if (p->nr_cpus_allowed != 1 &&
1689         cpupri_find(&rq->rd->cpupri, p, NULL))
1690         return;
1691 
1692     /*
1693      * There appear to be other CPUs that can accept
1694      * the current task but none can run 'p', so let's reschedule
1695      * to try and push the current task away:
1696      */
1697     requeue_task_rt(rq, p, 1);
1698     resched_curr(rq);
1699 }
1700 
1701 static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1702 {
1703     if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1704         /*
1705          * This is OK, because current is on_cpu, which avoids it being
1706          * picked for load-balance and preemption/IRQs are still
1707          * disabled avoiding further scheduler activity on it and we've
1708          * not yet started the picking loop.
1709          */
1710         rq_unpin_lock(rq, rf);
1711         pull_rt_task(rq);
1712         rq_repin_lock(rq, rf);
1713     }
1714 
1715     return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1716 }
1717 #endif /* CONFIG_SMP */
1718 
1719 /*
1720  * Preempt the current task with a newly woken task if needed:
1721  */
1722 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1723 {
1724     if (p->prio < rq->curr->prio) {
1725         resched_curr(rq);
1726         return;
1727     }
1728 
1729 #ifdef CONFIG_SMP
1730     /*
1731      * If:
1732      *
1733      * - the newly woken task is of equal priority to the current task
1734      * - the newly woken task is non-migratable while current is migratable
1735      * - current will be preempted on the next reschedule
1736      *
1737      * we should check to see if current can readily move to a different
1738      * cpu.  If so, we will reschedule to allow the push logic to try
1739      * to move current somewhere else, making room for our non-migratable
1740      * task.
1741      */
1742     if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1743         check_preempt_equal_prio(rq, p);
1744 #endif
1745 }
1746 
1747 static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1748 {
1749     struct sched_rt_entity *rt_se = &p->rt;
1750     struct rt_rq *rt_rq = &rq->rt;
1751 
1752     p->se.exec_start = rq_clock_task(rq);
1753     if (on_rt_rq(&p->rt))
1754         update_stats_wait_end_rt(rt_rq, rt_se);
1755 
1756     /* The running task is never eligible for pushing */
1757     dequeue_pushable_task(rq, p);
1758 
1759     if (!first)
1760         return;
1761 
1762     /*
1763      * If prev task was rt, put_prev_task() has already updated the
1764      * utilization. We only care about the case where we start to schedule
1765      * an rt task.
1766      */
1767     if (rq->curr->sched_class != &rt_sched_class)
1768         update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1769 
1770     rt_queue_push_tasks(rq);
1771 }
1772 
1773 static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
1774 {
1775     struct rt_prio_array *array = &rt_rq->active;
1776     struct sched_rt_entity *next = NULL;
1777     struct list_head *queue;
1778     int idx;
1779 
1780     idx = sched_find_first_bit(array->bitmap);
1781     BUG_ON(idx >= MAX_RT_PRIO);
1782 
1783     queue = array->queue + idx;
1784     next = list_entry(queue->next, struct sched_rt_entity, run_list);
1785 
1786     return next;
1787 }
1788 
1789 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1790 {
1791     struct sched_rt_entity *rt_se;
1792     struct rt_rq *rt_rq  = &rq->rt;
1793 
1794     do {
1795         rt_se = pick_next_rt_entity(rt_rq);
1796         BUG_ON(!rt_se);
1797         rt_rq = group_rt_rq(rt_se);
1798     } while (rt_rq);
1799 
1800     return rt_task_of(rt_se);
1801 }
1802 
1803 static struct task_struct *pick_task_rt(struct rq *rq)
1804 {
1805     struct task_struct *p;
1806 
1807     if (!sched_rt_runnable(rq))
1808         return NULL;
1809 
1810     p = _pick_next_task_rt(rq);
1811 
1812     return p;
1813 }
1814 
1815 static struct task_struct *pick_next_task_rt(struct rq *rq)
1816 {
1817     struct task_struct *p = pick_task_rt(rq);
1818 
1819     if (p)
1820         set_next_task_rt(rq, p, true);
1821 
1822     return p;
1823 }
1824 
1825 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1826 {
1827     struct sched_rt_entity *rt_se = &p->rt;
1828     struct rt_rq *rt_rq = &rq->rt;
1829 
1830     if (on_rt_rq(&p->rt))
1831         update_stats_wait_start_rt(rt_rq, rt_se);
1832 
1833     update_curr_rt(rq);
1834 
1835     update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1836 
1837     /*
1838      * The previous task needs to be made eligible for pushing
1839      * if it is still active
1840      */
1841     if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1842         enqueue_pushable_task(rq, p);
1843 }
1844 
1845 #ifdef CONFIG_SMP
1846 
1847 /* Only try algorithms three times */
1848 #define RT_MAX_TRIES 3
1849 
1850 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1851 {
1852     if (!task_running(rq, p) &&
1853         cpumask_test_cpu(cpu, &p->cpus_mask))
1854         return 1;
1855 
1856     return 0;
1857 }
1858 
1859 /*
1860  * Return the rq's highest-priority pushable task that is suitable to be
1861  * executed on the given CPU, or NULL if there is none.
1862  */
1863 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1864 {
1865     struct plist_head *head = &rq->rt.pushable_tasks;
1866     struct task_struct *p;
1867 
1868     if (!has_pushable_tasks(rq))
1869         return NULL;
1870 
1871     plist_for_each_entry(p, head, pushable_tasks) {
1872         if (pick_rt_task(rq, p, cpu))
1873             return p;
1874     }
1875 
1876     return NULL;
1877 }
1878 
1879 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1880 
1881 static int find_lowest_rq(struct task_struct *task)
1882 {
1883     struct sched_domain *sd;
1884     struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1885     int this_cpu = smp_processor_id();
1886     int cpu      = task_cpu(task);
1887     int ret;
1888 
1889     /* Make sure the mask is initialized first */
1890     if (unlikely(!lowest_mask))
1891         return -1;
1892 
1893     if (task->nr_cpus_allowed == 1)
1894         return -1; /* No other targets possible */
1895 
1896     /*
1897      * If we're on an asymmetric-capacity system, ensure we consider the
1898      * different capacities of the CPUs when searching for the lowest_mask.
1899      */
1900     if (static_branch_unlikely(&sched_asym_cpucapacity)) {
1901 
1902         ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1903                       task, lowest_mask,
1904                       rt_task_fits_capacity);
1905     } else {
1906 
1907         ret = cpupri_find(&task_rq(task)->rd->cpupri,
1908                   task, lowest_mask);
1909     }
1910 
1911     if (!ret)
1912         return -1; /* No targets found */
1913 
1914     /*
1915      * At this point we have built a mask of CPUs representing the
1916      * lowest priority tasks in the system.  Now we want to elect
1917      * the best one based on our affinity and topology.
1918      *
1919      * We prioritize the last CPU that the task executed on since
1920      * it is most likely cache-hot in that location.
1921      */
1922     if (cpumask_test_cpu(cpu, lowest_mask))
1923         return cpu;
1924 
1925     /*
1926      * Otherwise, we consult the sched_domains span maps to figure
1927      * out which CPU is logically closest to our hot cache data.
1928      */
1929     if (!cpumask_test_cpu(this_cpu, lowest_mask))
1930         this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1931 
1932     rcu_read_lock();
1933     for_each_domain(cpu, sd) {
1934         if (sd->flags & SD_WAKE_AFFINE) {
1935             int best_cpu;
1936 
1937             /*
1938              * "this_cpu" is cheaper to preempt than a
1939              * remote processor.
1940              */
1941             if (this_cpu != -1 &&
1942                 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1943                 rcu_read_unlock();
1944                 return this_cpu;
1945             }
1946 
1947             best_cpu = cpumask_any_and_distribute(lowest_mask,
1948                                   sched_domain_span(sd));
1949             if (best_cpu < nr_cpu_ids) {
1950                 rcu_read_unlock();
1951                 return best_cpu;
1952             }
1953         }
1954     }
1955     rcu_read_unlock();
1956 
1957     /*
1958      * And finally, if there were no matches within the domains
1959      * just give the caller *something* to work with from the compatible
1960      * locations.
1961      */
1962     if (this_cpu != -1)
1963         return this_cpu;
1964 
1965     cpu = cpumask_any_distribute(lowest_mask);
1966     if (cpu < nr_cpu_ids)
1967         return cpu;
1968 
1969     return -1;
1970 }
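
/*
 * Worked example (editor's addition, hypothetical 4-CPU system): the task
 * last ran on CPU2, we are called on CPU0 and cpupri reports
 * lowest_mask = { CPU1, CPU3 }. CPU2 is not in the mask, so the cache-hot
 * shortcut fails; CPU0 is not in the mask either, so this_cpu becomes -1.
 * Walking CPU2's SD_WAKE_AFFINE domains, the first domain whose span
 * intersects lowest_mask yields one of CPU1/CPU3; only if no domain matches
 * do we fall back to an arbitrary CPU from the mask, or -1 if it is empty.
 */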
1971 
1972 /* Will lock the rq it finds */
1973 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1974 {
1975     struct rq *lowest_rq = NULL;
1976     int tries;
1977     int cpu;
1978 
1979     for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1980         cpu = find_lowest_rq(task);
1981 
1982         if ((cpu == -1) || (cpu == rq->cpu))
1983             break;
1984 
1985         lowest_rq = cpu_rq(cpu);
1986 
1987         if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1988             /*
1989              * Target rq has tasks of equal or higher priority;
1990              * retrying does not release any lock and is unlikely
1991              * to yield a different result.
1992              */
1993             lowest_rq = NULL;
1994             break;
1995         }
1996 
1997         /* if the prio of this runqueue changed, try again */
1998         if (double_lock_balance(rq, lowest_rq)) {
1999             /*
2000              * We had to unlock the run queue. In the
2001              * meantime, the task could have migrated
2002              * already or had its affinity changed.
2003              * Also make sure that it hasn't started running on its rq.
2004              */
2005             if (unlikely(task_rq(task) != rq ||
2006                      !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
2007                      task_running(rq, task) ||
2008                      !rt_task(task) ||
2009                      !task_on_rq_queued(task))) {
2010 
2011                 double_unlock_balance(rq, lowest_rq);
2012                 lowest_rq = NULL;
2013                 break;
2014             }
2015         }
2016 
2017         /* If this rq is still suitable use it. */
2018         if (lowest_rq->rt.highest_prio.curr > task->prio)
2019             break;
2020 
2021         /* try again */
2022         double_unlock_balance(rq, lowest_rq);
2023         lowest_rq = NULL;
2024     }
2025 
2026     return lowest_rq;
2027 }
2028 
2029 static struct task_struct *pick_next_pushable_task(struct rq *rq)
2030 {
2031     struct task_struct *p;
2032 
2033     if (!has_pushable_tasks(rq))
2034         return NULL;
2035 
2036     p = plist_first_entry(&rq->rt.pushable_tasks,
2037                   struct task_struct, pushable_tasks);
2038 
2039     BUG_ON(rq->cpu != task_cpu(p));
2040     BUG_ON(task_current(rq, p));
2041     BUG_ON(p->nr_cpus_allowed <= 1);
2042 
2043     BUG_ON(!task_on_rq_queued(p));
2044     BUG_ON(!rt_task(p));
2045 
2046     return p;
2047 }
2048 
2049 /*
2050  * If the current CPU has more than one RT task, see if the non-running
2051  * task can migrate over to a CPU that is running a task
2052  * of lesser priority.
2053  */
2054 static int push_rt_task(struct rq *rq, bool pull)
2055 {
2056     struct task_struct *next_task;
2057     struct rq *lowest_rq;
2058     int ret = 0;
2059 
2060     if (!rq->rt.overloaded)
2061         return 0;
2062 
2063     next_task = pick_next_pushable_task(rq);
2064     if (!next_task)
2065         return 0;
2066 
2067 retry:
2068     /*
2069      * It's possible that the next_task slipped in with a
2070      * higher priority than current. If that's the case,
2071      * just reschedule current.
2072      */
2073     if (unlikely(next_task->prio < rq->curr->prio)) {
2074         resched_curr(rq);
2075         return 0;
2076     }
2077 
2078     if (is_migration_disabled(next_task)) {
2079         struct task_struct *push_task = NULL;
2080         int cpu;
2081 
2082         if (!pull || rq->push_busy)
2083             return 0;
2084 
2085         /*
2086          * Invoking find_lowest_rq() on anything but an RT task doesn't
2087          * make sense. Per the above priority check, curr has to
2088          * be of higher priority than next_task, so no need to
2089          * reschedule when bailing out.
2090          *
2091          * Note that the stoppers are masqueraded as SCHED_FIFO
2092          * (cf. sched_set_stop_task()), so we can't rely on rt_task().
2093          */
2094         if (rq->curr->sched_class != &rt_sched_class)
2095             return 0;
2096 
2097         cpu = find_lowest_rq(rq->curr);
2098         if (cpu == -1 || cpu == rq->cpu)
2099             return 0;
2100 
2101         /*
2102          * We found a CPU running at a lower priority than @next_task,
2103          * so @next_task ought to be running. However, it is
2104          * migration-disabled and cannot be migrated there; instead
2105          * attempt to push away the task currently running on this CPU.
2106          */
2107         push_task = get_push_task(rq);
2108         if (push_task) {
2109             raw_spin_rq_unlock(rq);
2110             stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2111                         push_task, &rq->push_work);
2112             raw_spin_rq_lock(rq);
2113         }
2114 
2115         return 0;
2116     }
2117 
2118     if (WARN_ON(next_task == rq->curr))
2119         return 0;
2120 
2121     /* We might release rq lock */
2122     get_task_struct(next_task);
2123 
2124     /* find_lock_lowest_rq locks the rq if found */
2125     lowest_rq = find_lock_lowest_rq(next_task, rq);
2126     if (!lowest_rq) {
2127         struct task_struct *task;
2128         /*
2129          * find_lock_lowest_rq releases rq->lock
2130          * so it is possible that next_task has migrated.
2131          *
2132          * We need to make sure that the task is still on the same
2133          * run-queue and is also still the next task eligible for
2134          * pushing.
2135          */
2136         task = pick_next_pushable_task(rq);
2137         if (task == next_task) {
2138             /*
2139              * The task hasn't migrated, and is still the next
2140              * eligible task, but we failed to find a run-queue
2141              * to push it to.  Do not retry in this case, since
2142              * other CPUs will pull from us when ready.
2143              */
2144             goto out;
2145         }
2146 
2147         if (!task)
2148             /* No more tasks, just exit */
2149             goto out;
2150 
2151         /*
2152          * Something has shifted, try again.
2153          */
2154         put_task_struct(next_task);
2155         next_task = task;
2156         goto retry;
2157     }
2158 
2159     deactivate_task(rq, next_task, 0);
2160     set_task_cpu(next_task, lowest_rq->cpu);
2161     activate_task(lowest_rq, next_task, 0);
2162     resched_curr(lowest_rq);
2163     ret = 1;
2164 
2165     double_unlock_balance(rq, lowest_rq);
2166 out:
2167     put_task_struct(next_task);
2168 
2169     return ret;
2170 }
2171 
2172 static void push_rt_tasks(struct rq *rq)
2173 {
2174     /* push_rt_task will return true if it moved an RT */
2175     while (push_rt_task(rq, false))
2176         ;
2177 }
2178 
2179 #ifdef HAVE_RT_PUSH_IPI
2180 
2181 /*
2182  * When a high priority task schedules out from a CPU and a lower priority
2183  * task is scheduled in, a check is made to see if there are any RT tasks
2184  * on other CPUs that are waiting to run because a higher priority RT task
2185  * is currently running on their CPU. In this case, the CPU with multiple RT
2186  * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2187  * up that may be able to run one of its non-running queued RT tasks.
2188  *
2189  * All CPUs with overloaded RT tasks need to be notified as there is currently
2190  * no way to know which of these CPUs has the highest priority task waiting
2191  * to run. Instead of trying to take a spinlock on each of these CPUs,
2192  * which has been shown to cause large latencies when done on machines with
2193  * many CPUs, an IPI is sent to the CPUs to have them push off their
2194  * overloaded RT tasks waiting to run.
2195  *
2196  * Just sending an IPI to each of the CPUs is also an issue, as on machines
2197  * with a large CPU count this can cause an IPI storm on a CPU, especially
2198  * if it's the only CPU with multiple RT tasks queued and a large number
2199  * of CPUs are scheduling a lower priority task at the same time.
2200  *
2201  * Each root domain has its own irq work function that can iterate over
2202  * all CPUs with RT overloaded tasks. Since all RT-overloaded CPUs must be
2203  * checked regardless of whether one or many CPUs are lowering their
2204  * priority, there's a single irq work iterator that will try to push off
2205  * the RT tasks that are waiting to run.
2206  *
2207  * When a CPU schedules a lower priority task, it will kick off the
2208  * irq work iterator that will jump to each CPU with overloaded RT tasks.
2209  * As it only takes the first CPU that schedules a lower priority task
2210  * to start the process, the rto_start variable is incremented and if
2211  * the atomic result is one, then that CPU will try to take the rto_lock.
2212  * This prevents high contention on the lock as the process handles all
2213  * CPUs scheduling lower priority tasks.
2214  *
2215  * All CPUs that are scheduling a lower priority task will increment the
2216  * rto_loop_next variable. This will make sure that the irq work iterator
2217  * checks all RT-overloaded CPUs whenever a CPU schedules a new lower
2218  * priority task, even if the iterator is in the middle of a scan. Incrementing
2219  * rto_loop_next will cause the iterator to perform another scan.
2220  *
2221  */
2222 static int rto_next_cpu(struct root_domain *rd)
2223 {
2224     int next;
2225     int cpu;
2226 
2227     /*
2228      * When starting the IPI RT pushing, the rto_cpu is set to -1, so
2229      * rto_next_cpu() will simply return the first CPU found in
2230      * the rto_mask.
2231      *
2232      * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
2233      * will return the next CPU found in the rto_mask.
2234      *
2235      * If there are no more CPUs left in the rto_mask, then a check is made
2236      * against rto_loop and rto_loop_next. rto_loop is only updated with
2237      * the rto_lock held, but any CPU may increment the rto_loop_next
2238      * without any locking.
2239      */
2240     for (;;) {
2241 
2242         /* When rto_cpu is -1 this acts like cpumask_first() */
2243         cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2244 
2245         rd->rto_cpu = cpu;
2246 
2247         if (cpu < nr_cpu_ids)
2248             return cpu;
2249 
2250         rd->rto_cpu = -1;
2251 
2252         /*
2253          * ACQUIRE ensures we see the @rto_mask changes
2254          * made prior to the @next value observed.
2255          *
2256          * Matches WMB in rt_set_overload().
2257          */
2258         next = atomic_read_acquire(&rd->rto_loop_next);
2259 
2260         if (rd->rto_loop == next)
2261             break;
2262 
2263         rd->rto_loop = next;
2264     }
2265 
2266     return -1;
2267 }
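
/*
 * Illustrative walk-through (editor's addition): with rto_mask = { 1, 3 } and
 * rto_cpu == -1, successive calls to rto_next_cpu() return 1, then 3, and
 * then, with no CPUs left, compare rto_loop against rto_loop_next. If another
 * CPU bumped rto_loop_next in the meantime the scan restarts from CPU1;
 * otherwise -1 is returned and the IPI chain stops.
 */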
2268 
2269 static inline bool rto_start_trylock(atomic_t *v)
2270 {
2271     return !atomic_cmpxchg_acquire(v, 0, 1);
2272 }
2273 
2274 static inline void rto_start_unlock(atomic_t *v)
2275 {
2276     atomic_set_release(v, 0);
2277 }
2278 
2279 static void tell_cpu_to_push(struct rq *rq)
2280 {
2281     int cpu = -1;
2282 
2283     /* Keep the loop going if the IPI is currently active */
2284     atomic_inc(&rq->rd->rto_loop_next);
2285 
2286     /* Only one CPU can initiate a loop at a time */
2287     if (!rto_start_trylock(&rq->rd->rto_loop_start))
2288         return;
2289 
2290     raw_spin_lock(&rq->rd->rto_lock);
2291 
2292     /*
2293      * The rto_cpu is updated under the lock; if it holds a valid CPU
2294      * then the IPI is still running and will continue due to the
2295      * update to loop_next, and nothing needs to be done here.
2296      * Otherwise it is finishing up and an IPI needs to be sent.
2297      */
2298     if (rq->rd->rto_cpu < 0)
2299         cpu = rto_next_cpu(rq->rd);
2300 
2301     raw_spin_unlock(&rq->rd->rto_lock);
2302 
2303     rto_start_unlock(&rq->rd->rto_loop_start);
2304 
2305     if (cpu >= 0) {
2306         /* Make sure the rd does not get freed while pushing */
2307         sched_get_rd(rq->rd);
2308         irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2309     }
2310 }
2311 
2312 /* Called from hardirq context */
2313 void rto_push_irq_work_func(struct irq_work *work)
2314 {
2315     struct root_domain *rd =
2316         container_of(work, struct root_domain, rto_push_work);
2317     struct rq *rq;
2318     int cpu;
2319 
2320     rq = this_rq();
2321 
2322     /*
2323      * We do not need to grab the lock to check for has_pushable_tasks.
2324      * When it gets updated, a check is made to see if a push is possible.
2325      */
2326     if (has_pushable_tasks(rq)) {
2327         raw_spin_rq_lock(rq);
2328         while (push_rt_task(rq, true))
2329             ;
2330         raw_spin_rq_unlock(rq);
2331     }
2332 
2333     raw_spin_lock(&rd->rto_lock);
2334 
2335     /* Pass the IPI to the next rt overloaded queue */
2336     cpu = rto_next_cpu(rd);
2337 
2338     raw_spin_unlock(&rd->rto_lock);
2339 
2340     if (cpu < 0) {
2341         sched_put_rd(rd);
2342         return;
2343     }
2344 
2345     /* Try the next RT overloaded CPU */
2346     irq_work_queue_on(&rd->rto_push_work, cpu);
2347 }
2348 #endif /* HAVE_RT_PUSH_IPI */
2349 
2350 static void pull_rt_task(struct rq *this_rq)
2351 {
2352     int this_cpu = this_rq->cpu, cpu;
2353     bool resched = false;
2354     struct task_struct *p, *push_task;
2355     struct rq *src_rq;
2356     int rt_overload_count = rt_overloaded(this_rq);
2357 
2358     if (likely(!rt_overload_count))
2359         return;
2360 
2361     /*
2362      * Match the barrier from rt_set_overload(); this guarantees that if we
2363      * see overloaded we must also see the rto_mask bit.
2364      */
2365     smp_rmb();
2366 
2367     /* If we are the only overloaded CPU do nothing */
2368     if (rt_overload_count == 1 &&
2369         cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2370         return;
2371 
2372 #ifdef HAVE_RT_PUSH_IPI
2373     if (sched_feat(RT_PUSH_IPI)) {
2374         tell_cpu_to_push(this_rq);
2375         return;
2376     }
2377 #endif
2378 
2379     for_each_cpu(cpu, this_rq->rd->rto_mask) {
2380         if (this_cpu == cpu)
2381             continue;
2382 
2383         src_rq = cpu_rq(cpu);
2384 
2385         /*
2386          * Don't bother taking the src_rq->lock if the next highest
2387          * task is known to be lower-priority than our current task.
2388          * This may look racy, but if this value is about to go
2389          * logically higher, the src_rq will push this task away.
2390          * And if its going logically lower, we do not care
2391          * And if it's going logically lower, we do not care.
2392         if (src_rq->rt.highest_prio.next >=
2393             this_rq->rt.highest_prio.curr)
2394             continue;
2395 
2396         /*
2397          * We can potentially drop this_rq's lock in
2398          * double_lock_balance, and another CPU could
2399          * alter this_rq
2400          */
2401         push_task = NULL;
2402         double_lock_balance(this_rq, src_rq);
2403 
2404         /*
2405          * We can only pull a task that is pushable
2406          * on its rq, and no others.
2407          */
2408         p = pick_highest_pushable_task(src_rq, this_cpu);
2409 
2410         /*
2411          * Do we have an RT task that preempts
2412          * the to-be-scheduled task?
2413          */
2414         if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2415             WARN_ON(p == src_rq->curr);
2416             WARN_ON(!task_on_rq_queued(p));
2417 
2418             /*
2419              * There's a chance that p is higher in priority
2420              * than what's currently running on its CPU.
2421              * This is just because p is waking up and hasn't
2422              * had a chance to schedule yet. We only pull
2423              * p if it is lower in priority than the
2424              * current task on the run queue.
2425              */
2426             if (p->prio < src_rq->curr->prio)
2427                 goto skip;
2428 
2429             if (is_migration_disabled(p)) {
2430                 push_task = get_push_task(src_rq);
2431             } else {
2432                 deactivate_task(src_rq, p, 0);
2433                 set_task_cpu(p, this_cpu);
2434                 activate_task(this_rq, p, 0);
2435                 resched = true;
2436             }
2437             /*
2438              * We continue with the search, just in
2439              * case there's an even higher prio task
2440              * in another runqueue. (low likelihood
2441              * but possible)
2442              */
2443         }
2444 skip:
2445         double_unlock_balance(this_rq, src_rq);
2446 
2447         if (push_task) {
2448             raw_spin_rq_unlock(this_rq);
2449             stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2450                         push_task, &src_rq->push_work);
2451             raw_spin_rq_lock(this_rq);
2452         }
2453     }
2454 
2455     if (resched)
2456         resched_curr(this_rq);
2457 }
2458 
2459 /*
2460  * If we are not running and we are not going to reschedule soon, we should
2461  * try to push tasks away now
2462  */
2463 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2464 {
2465     bool need_to_push = !task_running(rq, p) &&
2466                 !test_tsk_need_resched(rq->curr) &&
2467                 p->nr_cpus_allowed > 1 &&
2468                 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2469                 (rq->curr->nr_cpus_allowed < 2 ||
2470                  rq->curr->prio <= p->prio);
2471 
2472     if (need_to_push)
2473         push_rt_tasks(rq);
2474 }
2475 
2476 /* Assumes rq->lock is held */
2477 static void rq_online_rt(struct rq *rq)
2478 {
2479     if (rq->rt.overloaded)
2480         rt_set_overload(rq);
2481 
2482     __enable_runtime(rq);
2483 
2484     cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2485 }
2486 
2487 /* Assumes rq->lock is held */
2488 static void rq_offline_rt(struct rq *rq)
2489 {
2490     if (rq->rt.overloaded)
2491         rt_clear_overload(rq);
2492 
2493     __disable_runtime(rq);
2494 
2495     cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2496 }
2497 
2498 /*
2499  * When switching from the RT queue, we bring ourselves to a position
2500  * where we might want to pull RT tasks from other runqueues.
2501  */
2502 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2503 {
2504     /*
2505      * If there are other RT tasks then we will reschedule
2506      * and the scheduling of the other RT tasks will handle
2507      * the balancing. But if we are the last RT task
2508      * we may need to handle the pulling of RT tasks
2509      * now.
2510      */
2511     if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2512         return;
2513 
2514     rt_queue_pull_task(rq);
2515 }
2516 
2517 void __init init_sched_rt_class(void)
2518 {
2519     unsigned int i;
2520 
2521     for_each_possible_cpu(i) {
2522         zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2523                     GFP_KERNEL, cpu_to_node(i));
2524     }
2525 }
2526 #endif /* CONFIG_SMP */
2527 
2528 /*
2529  * When switching a task to RT, we may overload the runqueue
2530  * with RT tasks. In this case we try to push them off to
2531  * other runqueues.
2532  */
2533 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2534 {
2535     /*
2536      * If we are running, update the avg_rt tracking, as the running time
2537      * will from now on be accounted to it.
2538      */
2539     if (task_current(rq, p)) {
2540         update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2541         return;
2542     }
2543 
2544     /*
2545      * If we are not running, we may need to preempt the currently
2546      * running task. If that task is also an RT task,
2547      * then see if we can move to another run queue.
2548      */
2549     if (task_on_rq_queued(p)) {
2550 #ifdef CONFIG_SMP
2551         if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2552             rt_queue_push_tasks(rq);
2553 #endif /* CONFIG_SMP */
2554         if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2555             resched_curr(rq);
2556     }
2557 }
2558 
2559 /*
2560  * Priority of the task has changed. This may cause
2561  * us to initiate a push or pull.
2562  */
2563 static void
2564 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2565 {
2566     if (!task_on_rq_queued(p))
2567         return;
2568 
2569     if (task_current(rq, p)) {
2570 #ifdef CONFIG_SMP
2571         /*
2572          * If our priority decreases while running, we
2573          * may need to pull tasks to this runqueue.
2574          */
2575         if (oldprio < p->prio)
2576             rt_queue_pull_task(rq);
2577 
2578         /*
2579          * If there's a higher priority task waiting to run
2580          * then reschedule.
2581          */
2582         if (p->prio > rq->rt.highest_prio.curr)
2583             resched_curr(rq);
2584 #else
2585         /* For UP simply resched on drop of prio */
2586         if (oldprio < p->prio)
2587             resched_curr(rq);
2588 #endif /* CONFIG_SMP */
2589     } else {
2590         /*
2591          * This task is not running, but if its priority is
2592          * higher than that of the currently running task,
2593          * then reschedule.
2594          */
2595         if (p->prio < rq->curr->prio)
2596             resched_curr(rq);
2597     }
2598 }
2599 
2600 #ifdef CONFIG_POSIX_TIMERS
2601 static void watchdog(struct rq *rq, struct task_struct *p)
2602 {
2603     unsigned long soft, hard;
2604 
2605     /* max may change after cur was read; this will be fixed up on the next tick */
2606     soft = task_rlimit(p, RLIMIT_RTTIME);
2607     hard = task_rlimit_max(p, RLIMIT_RTTIME);
2608 
2609     if (soft != RLIM_INFINITY) {
2610         unsigned long next;
2611 
2612         if (p->rt.watchdog_stamp != jiffies) {
2613             p->rt.timeout++;
2614             p->rt.watchdog_stamp = jiffies;
2615         }
2616 
2617         next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2618         if (p->rt.timeout > next) {
2619             posix_cputimers_rt_watchdog(&p->posix_cputimers,
2620                             p->se.sum_exec_runtime);
2621         }
2622     }
2623 }
2624 #else
2625 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2626 #endif
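
/*
 * Editor's sketch of the user-space side (illustrative only): the watchdog
 * above only has work to do once a task sets RLIMIT_RTTIME, e.g.:
 *
 *	#include <sys/resource.h>
 *
 *	struct rlimit rl = {
 *		.rlim_cur = 500000,	// soft limit: 500 ms of RT CPU time
 *		.rlim_max = 1000000,	// hard limit: 1 s
 *	};
 *	setrlimit(RLIMIT_RTTIME, &rl);
 *
 * The limits are in microseconds of CPU time consumed without blocking.
 * Assuming HZ == 1000, the 500000 us soft limit maps to
 * DIV_ROUND_UP(500000, 1000) == 500 ticks above, after which the POSIX
 * CPU-timer code delivers SIGXCPU (and eventually SIGKILL at the hard limit).
 */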
2627 
2628 /*
2629  * scheduler tick hitting a task of our scheduling class.
2630  *
2631  * NOTE: This function can be called remotely by the tick offload that
2632  * goes along with full dynticks. Therefore no local assumption can be made
2633  * and everything must be accessed through the @rq and @curr passed in
2634  * parameters.
2635  */
2636 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2637 {
2638     struct sched_rt_entity *rt_se = &p->rt;
2639 
2640     update_curr_rt(rq);
2641     update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2642 
2643     watchdog(rq, p);
2644 
2645     /*
2646      * RR tasks need a special form of timeslice management.
2647      * FIFO tasks have no timeslices.
2648      */
2649     if (p->policy != SCHED_RR)
2650         return;
2651 
2652     if (--p->rt.time_slice)
2653         return;
2654 
2655     p->rt.time_slice = sched_rr_timeslice;
2656 
2657     /*
2658      * Requeue to the end of the queue if we (and all of our ancestors) are
2659      * not the only element on the queue.
2660      */
2661     for_each_sched_rt_entity(rt_se) {
2662         if (rt_se->run_list.prev != rt_se->run_list.next) {
2663             requeue_task_rt(rq, p, 0);
2664             resched_curr(rq);
2665             return;
2666         }
2667     }
2668 }
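
/*
 * Worked example (editor's addition): sched_rr_timeslice defaults to
 * RR_TIMESLICE, roughly 100 ms expressed in jiffies. Assuming HZ == 250 that
 * is 25 ticks; time_slice is decremented once per tick (every 4 ms), so a
 * SCHED_RR task runs for about 100 ms before being requeued behind other
 * runnable tasks of the same priority. A task that is alone at its priority
 * level is not requeued and simply gets a fresh slice.
 */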
2669 
2670 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2671 {
2672     /*
2673      * Time slice is 0 for SCHED_FIFO tasks
2674      */
2675     if (task->policy == SCHED_RR)
2676         return sched_rr_timeslice;
2677     else
2678         return 0;
2679 }
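
/*
 * Editor's sketch of the user-space view (illustrative only): this hook backs
 * sched_rr_get_interval(2), e.g.:
 *
 *	#include <sched.h>
 *	#include <time.h>
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(0, &ts);	// RR quantum of the calling task;
 *					// a SCHED_FIFO caller gets 0 back
 */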
2680 
2681 DEFINE_SCHED_CLASS(rt) = {
2682 
2683     .enqueue_task       = enqueue_task_rt,
2684     .dequeue_task       = dequeue_task_rt,
2685     .yield_task     = yield_task_rt,
2686 
2687     .check_preempt_curr = check_preempt_curr_rt,
2688 
2689     .pick_next_task     = pick_next_task_rt,
2690     .put_prev_task      = put_prev_task_rt,
2691     .set_next_task          = set_next_task_rt,
2692 
2693 #ifdef CONFIG_SMP
2694     .balance        = balance_rt,
2695     .pick_task      = pick_task_rt,
2696     .select_task_rq     = select_task_rq_rt,
2697     .set_cpus_allowed       = set_cpus_allowed_common,
2698     .rq_online              = rq_online_rt,
2699     .rq_offline             = rq_offline_rt,
2700     .task_woken     = task_woken_rt,
2701     .switched_from      = switched_from_rt,
2702     .find_lock_rq       = find_lock_lowest_rq,
2703 #endif
2704 
2705     .task_tick      = task_tick_rt,
2706 
2707     .get_rr_interval    = get_rr_interval_rt,
2708 
2709     .prio_changed       = prio_changed_rt,
2710     .switched_to        = switched_to_rt,
2711 
2712     .update_curr        = update_curr_rt,
2713 
2714 #ifdef CONFIG_UCLAMP_TASK
2715     .uclamp_enabled     = 1,
2716 #endif
2717 };
2718 
2719 #ifdef CONFIG_RT_GROUP_SCHED
2720 /*
2721  * Ensure that the real time constraints are schedulable.
2722  */
2723 static DEFINE_MUTEX(rt_constraints_mutex);
2724 
2725 static inline int tg_has_rt_tasks(struct task_group *tg)
2726 {
2727     struct task_struct *task;
2728     struct css_task_iter it;
2729     int ret = 0;
2730 
2731     /*
2732      * Autogroups do not have RT tasks; see autogroup_create().
2733      */
2734     if (task_group_is_autogroup(tg))
2735         return 0;
2736 
2737     css_task_iter_start(&tg->css, 0, &it);
2738     while (!ret && (task = css_task_iter_next(&it)))
2739         ret |= rt_task(task);
2740     css_task_iter_end(&it);
2741 
2742     return ret;
2743 }
2744 
2745 struct rt_schedulable_data {
2746     struct task_group *tg;
2747     u64 rt_period;
2748     u64 rt_runtime;
2749 };
2750 
2751 static int tg_rt_schedulable(struct task_group *tg, void *data)
2752 {
2753     struct rt_schedulable_data *d = data;
2754     struct task_group *child;
2755     unsigned long total, sum = 0;
2756     u64 period, runtime;
2757 
2758     period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2759     runtime = tg->rt_bandwidth.rt_runtime;
2760 
2761     if (tg == d->tg) {
2762         period = d->rt_period;
2763         runtime = d->rt_runtime;
2764     }
2765 
2766     /*
2767      * Cannot have more runtime than the period.
2768      */
2769     if (runtime > period && runtime != RUNTIME_INF)
2770         return -EINVAL;
2771 
2772     /*
2773      * Ensure we don't starve existing RT tasks if runtime turns zero.
2774      */
2775     if (rt_bandwidth_enabled() && !runtime &&
2776         tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2777         return -EBUSY;
2778 
2779     total = to_ratio(period, runtime);
2780 
2781     /*
2782      * Nobody can have more than the global setting allows.
2783      */
2784     if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2785         return -EINVAL;
2786 
2787     /*
2788      * The sum of our children's runtime should not exceed our own.
2789      */
2790     list_for_each_entry_rcu(child, &tg->children, siblings) {
2791         period = ktime_to_ns(child->rt_bandwidth.rt_period);
2792         runtime = child->rt_bandwidth.rt_runtime;
2793 
2794         if (child == d->tg) {
2795             period = d->rt_period;
2796             runtime = d->rt_runtime;
2797         }
2798 
2799         sum += to_ratio(period, runtime);
2800     }
2801 
2802     if (sum > total)
2803         return -EINVAL;
2804 
2805     return 0;
2806 }
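
/*
 * Worked example (editor's addition, assuming to_ratio() scales
 * runtime/period by 2^BW_SHIFT): with the global default of 950000 us runtime
 * per 1000000 us period, the total is ~0.95 of a CPU. A group requesting
 * 300000/1000000 (0.30) fits; requesting 1200000/1000000 fails the
 * runtime > period check above; and two sibling groups each requesting 0.60
 * would sum to 1.20, exceeding their parent's total, so the second write is
 * rejected with -EINVAL.
 */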
2807 
2808 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2809 {
2810     int ret;
2811 
2812     struct rt_schedulable_data data = {
2813         .tg = tg,
2814         .rt_period = period,
2815         .rt_runtime = runtime,
2816     };
2817 
2818     rcu_read_lock();
2819     ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2820     rcu_read_unlock();
2821 
2822     return ret;
2823 }
2824 
2825 static int tg_set_rt_bandwidth(struct task_group *tg,
2826         u64 rt_period, u64 rt_runtime)
2827 {
2828     int i, err = 0;
2829 
2830     /*
2831      * Disallowing the root group RT runtime is BAD; it would disallow the
2832      * kernel from creating (and/or operating) RT threads.
2833      */
2834     if (tg == &root_task_group && rt_runtime == 0)
2835         return -EINVAL;
2836 
2837     /* A zero period doesn't make any sense. */
2838     if (rt_period == 0)
2839         return -EINVAL;
2840 
2841     /*
2842      * Bound the quota to guard against overflow during the bandwidth shift.
2843      */
2844     if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2845         return -EINVAL;
2846 
2847     mutex_lock(&rt_constraints_mutex);
2848     err = __rt_schedulable(tg, rt_period, rt_runtime);
2849     if (err)
2850         goto unlock;
2851 
2852     raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2853     tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2854     tg->rt_bandwidth.rt_runtime = rt_runtime;
2855 
2856     for_each_possible_cpu(i) {
2857         struct rt_rq *rt_rq = tg->rt_rq[i];
2858 
2859         raw_spin_lock(&rt_rq->rt_runtime_lock);
2860         rt_rq->rt_runtime = rt_runtime;
2861         raw_spin_unlock(&rt_rq->rt_runtime_lock);
2862     }
2863     raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2864 unlock:
2865     mutex_unlock(&rt_constraints_mutex);
2866 
2867     return err;
2868 }
2869 
2870 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2871 {
2872     u64 rt_runtime, rt_period;
2873 
2874     rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2875     rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2876     if (rt_runtime_us < 0)
2877         rt_runtime = RUNTIME_INF;
2878     else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2879         return -EINVAL;
2880 
2881     return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2882 }
2883 
2884 long sched_group_rt_runtime(struct task_group *tg)
2885 {
2886     u64 rt_runtime_us;
2887 
2888     if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2889         return -1;
2890 
2891     rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2892     do_div(rt_runtime_us, NSEC_PER_USEC);
2893     return rt_runtime_us;
2894 }
2895 
2896 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2897 {
2898     u64 rt_runtime, rt_period;
2899 
2900     if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2901         return -EINVAL;
2902 
2903     rt_period = rt_period_us * NSEC_PER_USEC;
2904     rt_runtime = tg->rt_bandwidth.rt_runtime;
2905 
2906     return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2907 }
2908 
2909 long sched_group_rt_period(struct task_group *tg)
2910 {
2911     u64 rt_period_us;
2912 
2913     rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2914     do_div(rt_period_us, NSEC_PER_USEC);
2915     return rt_period_us;
2916 }
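
/*
 * Editor's sketch of the cgroup v1 interface these helpers back (requires
 * CONFIG_RT_GROUP_SCHED and the "cpu" controller mounted in its documented
 * default location):
 *
 *	mkdir /sys/fs/cgroup/cpu/rtgrp
 *	echo 1000000 > /sys/fs/cgroup/cpu/rtgrp/cpu.rt_period_us
 *	echo 300000  > /sys/fs/cgroup/cpu/rtgrp/cpu.rt_runtime_us
 *
 * A group left at rt_runtime_us == 0 cannot have RT tasks attached to it,
 * see sched_rt_can_attach() below.
 */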
2917 
2918 #ifdef CONFIG_SYSCTL
2919 static int sched_rt_global_constraints(void)
2920 {
2921     int ret = 0;
2922 
2923     mutex_lock(&rt_constraints_mutex);
2924     ret = __rt_schedulable(NULL, 0, 0);
2925     mutex_unlock(&rt_constraints_mutex);
2926 
2927     return ret;
2928 }
2929 #endif /* CONFIG_SYSCTL */
2930 
2931 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2932 {
2933     /* Don't accept realtime tasks when there is no way for them to run */
2934     if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2935         return 0;
2936 
2937     return 1;
2938 }
2939 
2940 #else /* !CONFIG_RT_GROUP_SCHED */
2941 
2942 #ifdef CONFIG_SYSCTL
2943 static int sched_rt_global_constraints(void)
2944 {
2945     unsigned long flags;
2946     int i;
2947 
2948     raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2949     for_each_possible_cpu(i) {
2950         struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2951 
2952         raw_spin_lock(&rt_rq->rt_runtime_lock);
2953         rt_rq->rt_runtime = global_rt_runtime();
2954         raw_spin_unlock(&rt_rq->rt_runtime_lock);
2955     }
2956     raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2957 
2958     return 0;
2959 }
2960 #endif /* CONFIG_SYSCTL */
2961 #endif /* CONFIG_RT_GROUP_SCHED */
2962 
2963 #ifdef CONFIG_SYSCTL
2964 static int sched_rt_global_validate(void)
2965 {
2966     if (sysctl_sched_rt_period <= 0)
2967         return -EINVAL;
2968 
2969     if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2970         ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2971          ((u64)sysctl_sched_rt_runtime *
2972             NSEC_PER_USEC > max_rt_runtime)))
2973         return -EINVAL;
2974 
2975     return 0;
2976 }
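
/*
 * Examples (editor's addition) of what the validation above accepts:
 *
 *	period = 1000000, runtime = 950000	-> OK (the default 95% budget)
 *	period = 1000000, runtime = -1		-> OK (RUNTIME_INF, throttling off)
 *	period = 1000000, runtime = 1200000	-> -EINVAL (runtime > period)
 *	period = 0				-> -EINVAL
 */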
2977 
2978 static void sched_rt_do_global(void)
2979 {
2980     unsigned long flags;
2981 
2982     raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2983     def_rt_bandwidth.rt_runtime = global_rt_runtime();
2984     def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2985     raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2986 }
2987 
2988 static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
2989         size_t *lenp, loff_t *ppos)
2990 {
2991     int old_period, old_runtime;
2992     static DEFINE_MUTEX(mutex);
2993     int ret;
2994 
2995     mutex_lock(&mutex);
2996     old_period = sysctl_sched_rt_period;
2997     old_runtime = sysctl_sched_rt_runtime;
2998 
2999     ret = proc_dointvec(table, write, buffer, lenp, ppos);
3000 
3001     if (!ret && write) {
3002         ret = sched_rt_global_validate();
3003         if (ret)
3004             goto undo;
3005 
3006         ret = sched_dl_global_validate();
3007         if (ret)
3008             goto undo;
3009 
3010         ret = sched_rt_global_constraints();
3011         if (ret)
3012             goto undo;
3013 
3014         sched_rt_do_global();
3015         sched_dl_do_global();
3016     }
3017     if (0) {
3018 undo:
3019         sysctl_sched_rt_period = old_period;
3020         sysctl_sched_rt_runtime = old_runtime;
3021     }
3022     mutex_unlock(&mutex);
3023 
3024     return ret;
3025 }
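
/*
 * Editor's usage sketch (the documented sysctl interface backing this
 * handler):
 *
 *	# disable RT throttling entirely
 *	sysctl -w kernel.sched_rt_runtime_us=-1
 *
 *	# restore the default 95% budget per 1 s period
 *	sysctl -w kernel.sched_rt_period_us=1000000
 *	sysctl -w kernel.sched_rt_runtime_us=950000
 *
 * Invalid combinations are rolled back to the previous values via the undo
 * path above.
 */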
3026 
3027 static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
3028         size_t *lenp, loff_t *ppos)
3029 {
3030     int ret;
3031     static DEFINE_MUTEX(mutex);
3032 
3033     mutex_lock(&mutex);
3034     ret = proc_dointvec(table, write, buffer, lenp, ppos);
3035     /*
3036      * Make sure that internally we keep jiffies.
3037      * Also, writing zero (or a negative value) resets the timeslice to the default:
3038      */
3039     if (!ret && write) {
3040         sched_rr_timeslice =
3041             sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3042             msecs_to_jiffies(sysctl_sched_rr_timeslice);
3043     }
3044     mutex_unlock(&mutex);
3045 
3046     return ret;
3047 }
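
/*
 * Editor's usage sketch (the documented sysctl interface backing this
 * handler):
 *
 *	sysctl -w kernel.sched_rr_timeslice_ms=30	# 30 ms RR quantum
 *	sysctl -w kernel.sched_rr_timeslice_ms=0	# back to the default
 *
 * The value is exposed in milliseconds but kept internally in jiffies
 * (sched_rr_timeslice), hence the msecs_to_jiffies() conversion above.
 */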
3048 #endif /* CONFIG_SYSCTL */
3049 
3050 #ifdef CONFIG_SCHED_DEBUG
3051 void print_rt_stats(struct seq_file *m, int cpu)
3052 {
3053     rt_rq_iter_t iter;
3054     struct rt_rq *rt_rq;
3055 
3056     rcu_read_lock();
3057     for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3058         print_rt_rq(m, cpu, rt_rq);
3059     rcu_read_unlock();
3060 }
3061 #endif /* CONFIG_SCHED_DEBUG */