0001
0002
0003
0004
0005
0006
0007
0008
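// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */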
0009 #include <linux/highmem.h>
0010 #include <linux/hrtimer_api.h>
0011 #include <linux/ktime_api.h>
0012 #include <linux/sched/signal.h>
0013 #include <linux/syscalls_api.h>
0014 #include <linux/debug_locks.h>
0015 #include <linux/prefetch.h>
0016 #include <linux/capability.h>
0017 #include <linux/pgtable_api.h>
0018 #include <linux/wait_bit.h>
0019 #include <linux/jiffies.h>
0020 #include <linux/spinlock_api.h>
0021 #include <linux/cpumask_api.h>
0022 #include <linux/lockdep_api.h>
0023 #include <linux/hardirq.h>
0024 #include <linux/softirq.h>
0025 #include <linux/refcount_api.h>
0026 #include <linux/topology.h>
0027 #include <linux/sched/clock.h>
0028 #include <linux/sched/cond_resched.h>
0029 #include <linux/sched/cputime.h>
0030 #include <linux/sched/debug.h>
0031 #include <linux/sched/hotplug.h>
0032 #include <linux/sched/init.h>
0033 #include <linux/sched/isolation.h>
0034 #include <linux/sched/loadavg.h>
0035 #include <linux/sched/mm.h>
0036 #include <linux/sched/nohz.h>
0037 #include <linux/sched/rseq_api.h>
0038 #include <linux/sched/rt.h>
0039
0040 #include <linux/blkdev.h>
0041 #include <linux/context_tracking.h>
0042 #include <linux/cpuset.h>
0043 #include <linux/delayacct.h>
0044 #include <linux/init_task.h>
0045 #include <linux/interrupt.h>
0046 #include <linux/ioprio.h>
0047 #include <linux/kallsyms.h>
0048 #include <linux/kcov.h>
0049 #include <linux/kprobes.h>
0050 #include <linux/llist_api.h>
0051 #include <linux/mmu_context.h>
0052 #include <linux/mmzone.h>
0053 #include <linux/mutex_api.h>
0054 #include <linux/nmi.h>
0055 #include <linux/nospec.h>
0056 #include <linux/perf_event_api.h>
0057 #include <linux/profile.h>
0058 #include <linux/psi.h>
0059 #include <linux/rcuwait_api.h>
0060 #include <linux/sched/wake_q.h>
0061 #include <linux/scs.h>
0062 #include <linux/slab.h>
0063 #include <linux/syscalls.h>
0064 #include <linux/vtime.h>
0065 #include <linux/wait_api.h>
0066 #include <linux/workqueue_api.h>
0067
0068 #ifdef CONFIG_PREEMPT_DYNAMIC
0069 # ifdef CONFIG_GENERIC_ENTRY
0070 # include <linux/entry-common.h>
0071 # endif
0072 #endif
0073
0074 #include <uapi/linux/sched/types.h>
0075
0076 #include <asm/switch_to.h>
0077 #include <asm/tlb.h>
0078
0079 #define CREATE_TRACE_POINTS
0080 #include <linux/sched/rseq_api.h>
0081 #include <trace/events/sched.h>
0082 #undef CREATE_TRACE_POINTS
0083
0084 #include "sched.h"
0085 #include "stats.h"
0086 #include "autogroup.h"
0087
0089 #include "pelt.h"
0090 #include "smp.h"
0092
0093 #include "../workqueue_internal.h"
0094 #include "../../io_uring/io-wq.h"
0095 #include "../smpboot.h"
0096
0097
0098
0099
0100
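/*
 * Export tracepoints that act as bare tracehooks (i.e. have no trace event
 * associated with them) so that external modules can attach probes to them.
 */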
0101 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
0102 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
0103 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
0104 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
0105 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
0106 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
0107 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
0108 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
0109 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
0110 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
0111 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
0112
0113 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
0114
0115 #ifdef CONFIG_SCHED_DEBUG
0116
0117
0118
0119
0120
0121
0122
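/*
 * Debugging: various feature bits.
 *
 * Each SCHED_FEAT(name, enabled) entry in "features.h" expands to
 * "(1UL << __SCHED_FEAT_name) * enabled |", so the initializer below ORs
 * the default-enabled feature bits together into sysctl_sched_features.
 */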
0123 #define SCHED_FEAT(name, enabled) \
0124 (1UL << __SCHED_FEAT_##name) * enabled |
0125 const_debug unsigned int sysctl_sched_features =
0126 #include "features.h"
0127 0;
0128 #undef SCHED_FEAT
0129
0130
0131
0132
0133
0134
0135
0136
0137 __read_mostly int sysctl_resched_latency_warn_ms = 100;
0138 __read_mostly int sysctl_resched_latency_warn_once = 1;
0139 #endif
0140
0141
0142
0143
0144
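/*
 * Number of tasks to iterate in a single balance run.  Kept small because
 * the walk is done with IRQs disabled; PREEMPT_RT uses a lower default to
 * bound the IRQs-off latency.
 */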
0145 #ifdef CONFIG_PREEMPT_RT
0146 const_debug unsigned int sysctl_sched_nr_migrate = 8;
0147 #else
0148 const_debug unsigned int sysctl_sched_nr_migrate = 32;
0149 #endif
0150
0151 __read_mostly int scheduler_running;
0152
0153 #ifdef CONFIG_SCHED_CORE
0154
0155 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
0156
0157
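/*
 * Map a task to a single comparable "kernel priority" for core scheduling;
 * lower is higher priority: -2 for the stop class, p->prio (in [-1, 99])
 * for deadline and RT tasks, one common value for all fair tasks
 * (MAX_RT_PRIO + MAX_NICE) and an even larger value, i.e. an even lower
 * priority, for the idle class (MAX_RT_PRIO + NICE_WIDTH).
 */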
0158 static inline int __task_prio(struct task_struct *p)
0159 {
0160 if (p->sched_class == &stop_sched_class)
0161 return -2;
0162
0163 if (rt_prio(p->prio))
0164 return p->prio;
0165
0166 if (p->sched_class == &idle_sched_class)
0167 return MAX_RT_PRIO + NICE_WIDTH;
0168
0169 return MAX_RT_PRIO + MAX_NICE;
0170 }
0171
0172
0173
0174
0175
0176
0177
0178
0179
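/*
 * Core-scheduling priority comparison: return true if @a should be
 * considered lower priority than @b.  Because __task_prio() is "less is
 * more", the values are negated for the initial comparison.  Ties inside
 * the deadline band are broken by the earlier deadline, and fair-vs-fair is
 * delegated to cfs_prio_less(); @in_fi tells the fair path whether the core
 * is currently in forced idle.
 */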
0180 static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
0181 {
0182
0183 int pa = __task_prio(a), pb = __task_prio(b);
0184
0185 if (-pa < -pb)
0186 return true;
0187
0188 if (-pb < -pa)
0189 return false;
0190
0191 if (pa == -1)
0192 return !dl_time_before(a->dl.deadline, b->dl.deadline);
0193
0194 if (pa == MAX_RT_PRIO + MAX_NICE)
0195 return cfs_prio_less(a, b, in_fi);
0196
0197 return false;
0198 }
0199
0200 static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
0201 {
0202 if (a->core_cookie < b->core_cookie)
0203 return true;
0204
0205 if (a->core_cookie > b->core_cookie)
0206 return false;
0207
0208
0209 if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
0210 return true;
0211
0212 return false;
0213 }
0214
0215 #define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
0216
0217 static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
0218 {
0219 return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
0220 }
0221
0222 static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
0223 {
0224 const struct task_struct *p = __node_2_sc(node);
0225 unsigned long cookie = (unsigned long)key;
0226
0227 if (cookie < p->core_cookie)
0228 return -1;
0229
0230 if (cookie > p->core_cookie)
0231 return 1;
0232
0233 return 0;
0234 }
0235
0236 void sched_core_enqueue(struct rq *rq, struct task_struct *p)
0237 {
0238 rq->core->core_task_seq++;
0239
0240 if (!p->core_cookie)
0241 return;
0242
0243 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
0244 }
0245
0246 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
0247 {
0248 rq->core->core_task_seq++;
0249
0250 if (sched_core_enqueued(p)) {
0251 rb_erase(&p->core_node, &rq->core_tree);
0252 RB_CLEAR_NODE(&p->core_node);
0253 }
0254
0255
0256
0257
0258
0259
0260 if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
0261 rq->core->core_forceidle_count && rq->curr == rq->idle)
0262 resched_curr(rq);
0263 }
0264
0265
0266
0267
0268 static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
0269 {
0270 struct rb_node *node;
0271
0272 node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
0273
0274
0275
0276 if (!node)
0277 return idle_sched_class.pick_task(rq);
0278
0279 return __node_2_sc(node);
0280 }
0281
0282 static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
0283 {
0284 struct rb_node *node = &p->core_node;
0285
0286 node = rb_next(node);
0287 if (!node)
0288 return NULL;
0289
0290 p = container_of(node, struct task_struct, core_node);
0291 if (p->core_cookie != cookie)
0292 return NULL;
0293
0294 return p;
0295 }
0296
0297
0298
0299
0300
0301
0302
0303
0304
0305
0306
0307
0308
0309
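/*
 * Core-scheduling enable/disable machinery.
 *
 * sched_core_count is a reference count taken by users of core scheduling
 * (e.g. the PR_SCHED_CORE prctl()).  The first reference enables the static
 * key and toggles each rq's core_enabled (under the rq locks of each SMT
 * group for online CPUs); the last reference, dropped from a workqueue,
 * disables it again.  sched_core_mutex serializes these transitions.
 */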
0310 static DEFINE_MUTEX(sched_core_mutex);
0311 static atomic_t sched_core_count;
0312 static struct cpumask sched_core_mask;
0313
0314 static void sched_core_lock(int cpu, unsigned long *flags)
0315 {
0316 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
0317 int t, i = 0;
0318
0319 local_irq_save(*flags);
0320 for_each_cpu(t, smt_mask)
0321 raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
0322 }
0323
0324 static void sched_core_unlock(int cpu, unsigned long *flags)
0325 {
0326 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
0327 int t;
0328
0329 for_each_cpu(t, smt_mask)
0330 raw_spin_unlock(&cpu_rq(t)->__lock);
0331 local_irq_restore(*flags);
0332 }
0333
0334 static void __sched_core_flip(bool enabled)
0335 {
0336 unsigned long flags;
0337 int cpu, t;
0338
0339 cpus_read_lock();
0340
0341
0342
0343
0344 cpumask_copy(&sched_core_mask, cpu_online_mask);
0345 for_each_cpu(cpu, &sched_core_mask) {
0346 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
0347
0348 sched_core_lock(cpu, &flags);
0349
0350 for_each_cpu(t, smt_mask)
0351 cpu_rq(t)->core_enabled = enabled;
0352
0353 cpu_rq(cpu)->core->core_forceidle_start = 0;
0354
0355 sched_core_unlock(cpu, &flags);
0356
0357 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
0358 }
0359
0360
0361
0362
0363 cpumask_copy(&sched_core_mask, cpu_possible_mask);
0364 cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
0365
0366 for_each_cpu(cpu, &sched_core_mask)
0367 cpu_rq(cpu)->core_enabled = enabled;
0368
0369 cpus_read_unlock();
0370 }
0371
0372 static void sched_core_assert_empty(void)
0373 {
0374 int cpu;
0375
0376 for_each_possible_cpu(cpu)
0377 WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
0378 }
0379
0380 static void __sched_core_enable(void)
0381 {
0382 static_branch_enable(&__sched_core_enabled);
0383
0384
0385
0386
0387 synchronize_rcu();
0388 __sched_core_flip(true);
0389 sched_core_assert_empty();
0390 }
0391
0392 static void __sched_core_disable(void)
0393 {
0394 sched_core_assert_empty();
0395 __sched_core_flip(false);
0396 static_branch_disable(&__sched_core_enabled);
0397 }
0398
0399 void sched_core_get(void)
0400 {
0401 if (atomic_inc_not_zero(&sched_core_count))
0402 return;
0403
0404 mutex_lock(&sched_core_mutex);
0405 if (!atomic_read(&sched_core_count))
0406 __sched_core_enable();
0407
0408 smp_mb__before_atomic();
0409 atomic_inc(&sched_core_count);
0410 mutex_unlock(&sched_core_mutex);
0411 }
0412
0413 static void __sched_core_put(struct work_struct *work)
0414 {
0415 if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
0416 __sched_core_disable();
0417 mutex_unlock(&sched_core_mutex);
0418 }
0419 }
0420
0421 void sched_core_put(void)
0422 {
0423 static DECLARE_WORK(_work, __sched_core_put);
0424
0425
0426
0427
0428
0429
0430
0431
0432 if (!atomic_add_unless(&sched_core_count, -1, 1))
0433 schedule_work(&_work);
0434 }
0435
0436 #else
0437
0438 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
0439 static inline void
0440 sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
0441
0442 #endif
0443
0444
0445
0446
0447
0448
0449
0450
0451
0452
0453
0454
0455
0456
0457
0458
0459
0460
0461
0462
0463
0464
0465
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
0478
0479
0480
0481
0482
0483
0484
0485
0486
0487
0488
0489
0490
0491
0492
0493
0494
0495
0496
0497
0498
0499
0500
0501
0502
0503
0504
0505
0506
0507
0508
0509
0510
0511
0512
0513
0514
0515
0516
0517
0518
0519
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532
0533
0534
0535
0536
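/*
 * With core scheduling, the lock returned by __rq_lockp() for a runqueue
 * can change (between the per-rq lock and the core's lock) while we are
 * spinning on it.  Acquire the lock, re-check that it is still the lock
 * covering @rq, and retry otherwise.  The preempt_disable() section pairs
 * with the synchronize_rcu() in __sched_core_enable().
 */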
0537 void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
0538 {
0539 raw_spinlock_t *lock;
0540
0541
0542 preempt_disable();
0543 if (sched_core_disabled()) {
0544 raw_spin_lock_nested(&rq->__lock, subclass);
0545
0546 preempt_enable_no_resched();
0547 return;
0548 }
0549
0550 for (;;) {
0551 lock = __rq_lockp(rq);
0552 raw_spin_lock_nested(lock, subclass);
0553 if (likely(lock == __rq_lockp(rq))) {
0554
0555 preempt_enable_no_resched();
0556 return;
0557 }
0558 raw_spin_unlock(lock);
0559 }
0560 }
0561
0562 bool raw_spin_rq_trylock(struct rq *rq)
0563 {
0564 raw_spinlock_t *lock;
0565 bool ret;
0566
0567
0568 preempt_disable();
0569 if (sched_core_disabled()) {
0570 ret = raw_spin_trylock(&rq->__lock);
0571 preempt_enable();
0572 return ret;
0573 }
0574
0575 for (;;) {
0576 lock = __rq_lockp(rq);
0577 ret = raw_spin_trylock(lock);
0578 if (!ret || (likely(lock == __rq_lockp(rq)))) {
0579 preempt_enable();
0580 return ret;
0581 }
0582 raw_spin_unlock(lock);
0583 }
0584 }
0585
0586 void raw_spin_rq_unlock(struct rq *rq)
0587 {
0588 raw_spin_unlock(rq_lockp(rq));
0589 }
0590
0591 #ifdef CONFIG_SMP
0592
0593
0594
0595 void double_rq_lock(struct rq *rq1, struct rq *rq2)
0596 {
0597 lockdep_assert_irqs_disabled();
0598
0599 if (rq_order_less(rq2, rq1))
0600 swap(rq1, rq2);
0601
0602 raw_spin_rq_lock(rq1);
0603 if (__rq_lockp(rq1) != __rq_lockp(rq2))
0604 raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
0605
0606 double_rq_clock_clear_update(rq1, rq2);
0607 }
0608 #endif
0609
0610
0611
0612
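/*
 * __task_rq_lock - lock the rq @p resides on.  The caller must already hold
 * p->pi_lock; the loop handles @p being migrated between the task_rq()
 * lookup and the lock acquisition.
 */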
0613 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
0614 __acquires(rq->lock)
0615 {
0616 struct rq *rq;
0617
0618 lockdep_assert_held(&p->pi_lock);
0619
0620 for (;;) {
0621 rq = task_rq(p);
0622 raw_spin_rq_lock(rq);
0623 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
0624 rq_pin_lock(rq, rf);
0625 return rq;
0626 }
0627 raw_spin_rq_unlock(rq);
0628
0629 while (unlikely(task_on_rq_migrating(p)))
0630 cpu_relax();
0631 }
0632 }
0633
0634
0635
0636
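/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * Illustrative usage (the pattern used throughout this file):
 *
 *	struct rq_flags rf;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &rf);
 *	... @p cannot migrate and its scheduling state is stable here ...
 *	task_rq_unlock(rq, p, &rf);
 */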
0637 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
0638 __acquires(p->pi_lock)
0639 __acquires(rq->lock)
0640 {
0641 struct rq *rq;
0642
0643 for (;;) {
0644 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
0645 rq = task_rq(p);
0646 raw_spin_rq_lock(rq);
0647
0648
0649
0650
0651
0652
0653
0654
0655
0656
0657
0658
0659
0660
0661
0662
0663
0664 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
0665 rq_pin_lock(rq, rf);
0666 return rq;
0667 }
0668 raw_spin_rq_unlock(rq);
0669 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
0670
0671 while (unlikely(task_on_rq_migrating(p)))
0672 cpu_relax();
0673 }
0674 }
0675
0676
0677
0678
0679
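/*
 * Advance rq->clock_task by @delta, minus any time accounted to hard IRQ
 * processing (CONFIG_IRQ_TIME_ACCOUNTING) and to paravirt steal time
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING), and feed the removed IRQ/steal time
 * into the IRQ PELT signal where supported.
 */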
0680 static void update_rq_clock_task(struct rq *rq, s64 delta)
0681 {
0682
0683
0684
0685
0686 s64 __maybe_unused steal = 0, irq_delta = 0;
0687
0688 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
0689 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
0690
0691
0692
0693
0694
0695
0696
0697
0698
0699
0700
0701
0702
0703
0704
0705
0706 if (irq_delta > delta)
0707 irq_delta = delta;
0708
0709 rq->prev_irq_time += irq_delta;
0710 delta -= irq_delta;
0711 #endif
0712 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
0713 if (static_key_false((&paravirt_steal_rq_enabled))) {
0714 steal = paravirt_steal_clock(cpu_of(rq));
0715 steal -= rq->prev_steal_time_rq;
0716
0717 if (unlikely(steal > delta))
0718 steal = delta;
0719
0720 rq->prev_steal_time_rq += steal;
0721 delta -= steal;
0722 }
0723 #endif
0724
0725 rq->clock_task += delta;
0726
0727 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
0728 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
0729 update_irq_load_avg(rq, irq_delta + steal);
0730 #endif
0731 update_rq_clock_pelt(rq, delta);
0732 }
0733
0734 void update_rq_clock(struct rq *rq)
0735 {
0736 s64 delta;
0737
0738 lockdep_assert_rq_held(rq);
0739
0740 if (rq->clock_update_flags & RQCF_ACT_SKIP)
0741 return;
0742
0743 #ifdef CONFIG_SCHED_DEBUG
0744 if (sched_feat(WARN_DOUBLE_CLOCK))
0745 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
0746 rq->clock_update_flags |= RQCF_UPDATED;
0747 #endif
0748
0749 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
0750 if (delta < 0)
0751 return;
0752 rq->clock += delta;
0753 update_rq_clock_task(rq, delta);
0754 }
0755
0756 #ifdef CONFIG_SCHED_HRTICK
0757
0758
0759
0760
0761 static void hrtick_clear(struct rq *rq)
0762 {
0763 if (hrtimer_active(&rq->hrtick_timer))
0764 hrtimer_cancel(&rq->hrtick_timer);
0765 }
0766
0767
0768
0769
0770
0771 static enum hrtimer_restart hrtick(struct hrtimer *timer)
0772 {
0773 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
0774 struct rq_flags rf;
0775
0776 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
0777
0778 rq_lock(rq, &rf);
0779 update_rq_clock(rq);
0780 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
0781 rq_unlock(rq, &rf);
0782
0783 return HRTIMER_NORESTART;
0784 }
0785
0786 #ifdef CONFIG_SMP
0787
0788 static void __hrtick_restart(struct rq *rq)
0789 {
0790 struct hrtimer *timer = &rq->hrtick_timer;
0791 ktime_t time = rq->hrtick_time;
0792
0793 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
0794 }
0795
0796
0797
0798
0799 static void __hrtick_start(void *arg)
0800 {
0801 struct rq *rq = arg;
0802 struct rq_flags rf;
0803
0804 rq_lock(rq, &rf);
0805 __hrtick_restart(rq);
0806 rq_unlock(rq, &rf);
0807 }
0808
0809
0810
0811
0812
0813
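/*
 * Called to program the high-resolution preemption tick; runs with the rq
 * lock held and IRQs disabled.  The delay is clamped to at least 10us, and
 * the timer is armed directly when @rq is the local runqueue, otherwise via
 * an async CSD call on the rq's CPU.
 */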
0814 void hrtick_start(struct rq *rq, u64 delay)
0815 {
0816 struct hrtimer *timer = &rq->hrtick_timer;
0817 s64 delta;
0818
0819
0820
0821
0822
0823 delta = max_t(s64, delay, 10000LL);
0824 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
0825
0826 if (rq == this_rq())
0827 __hrtick_restart(rq);
0828 else
0829 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
0830 }
0831
0832 #else
0833
0834
0835
0836
0837
0838 void hrtick_start(struct rq *rq, u64 delay)
0839 {
0840
0841
0842
0843
0844 delay = max_t(u64, delay, 10000LL);
0845 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
0846 HRTIMER_MODE_REL_PINNED_HARD);
0847 }
0848
0849 #endif
0850
0851 static void hrtick_rq_init(struct rq *rq)
0852 {
0853 #ifdef CONFIG_SMP
0854 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
0855 #endif
0856 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
0857 rq->hrtick_timer.function = hrtick;
0858 }
0859 #else
0860 static inline void hrtick_clear(struct rq *rq)
0861 {
0862 }
0863
0864 static inline void hrtick_rq_init(struct rq *rq)
0865 {
0866 }
0867 #endif
0868
0869
0870
0871
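/*
 * cmpxchg-based fetch_or(), a macro so it works for different integer
 * types.  Used below to atomically set TIF_NEED_RESCHED while observing
 * whether TIF_POLLING_NRFLAG was set.
 */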
0872 #define fetch_or(ptr, mask) \
0873 ({ \
0874 typeof(ptr) _ptr = (ptr); \
0875 typeof(mask) _mask = (mask); \
0876 typeof(*_ptr) _val = *_ptr; \
0877 \
0878 do { \
0879 } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
0880 _val; \
0881 })
0882
0883 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
0884
0885
0886
0887
0888
0889 static inline bool set_nr_and_not_polling(struct task_struct *p)
0890 {
0891 struct thread_info *ti = task_thread_info(p);
0892 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
0893 }
0894
0895
0896
0897
0898
0899
0900
0901 static bool set_nr_if_polling(struct task_struct *p)
0902 {
0903 struct thread_info *ti = task_thread_info(p);
0904 typeof(ti->flags) val = READ_ONCE(ti->flags);
0905
0906 for (;;) {
0907 if (!(val & _TIF_POLLING_NRFLAG))
0908 return false;
0909 if (val & _TIF_NEED_RESCHED)
0910 return true;
0911 if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
0912 break;
0913 }
0914 return true;
0915 }
0916
0917 #else
0918 static inline bool set_nr_and_not_polling(struct task_struct *p)
0919 {
0920 set_tsk_need_resched(p);
0921 return true;
0922 }
0923
0924 #ifdef CONFIG_SMP
0925 static inline bool set_nr_if_polling(struct task_struct *p)
0926 {
0927 return false;
0928 }
0929 #endif
0930 #endif
0931
0932 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
0933 {
0934 struct wake_q_node *node = &task->wake_q;
0935
0936
0937
0938
0939
0940
0941
0942
0943
0944 smp_mb__before_atomic();
0945 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
0946 return false;
0947
0948
0949
0950
0951 *head->lastp = node;
0952 head->lastp = &node->next;
0953 return true;
0954 }
0955
0956
0957
0958
0959
0960
0961
0962
0963
0964
0965
0966
0967
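/*
 * wake_q_add() - queue @task for a later wakeup via wake_up_q().
 *
 * Takes a task reference on success; if the task is already queued on some
 * wake_q (its wake_q.next is non-NULL) it is left alone and whoever queued
 * it first is responsible for the wakeup.
 *
 * Illustrative usage (sketch of the usual pattern, not code from this file;
 * "some_lock" is a placeholder):
 *
 *	DEFINE_WAKE_Q(wakeq);
 *
 *	raw_spin_lock(&some_lock);
 *	wake_q_add(&wakeq, p);
 *	raw_spin_unlock(&some_lock);
 *
 *	wake_up_q(&wakeq);
 */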
0968 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
0969 {
0970 if (__wake_q_add(head, task))
0971 get_task_struct(task);
0972 }
0973
0974
0975
0976
0977
0978
0979
0980
0981
0982
0983
0984
0985
0986
0987
0988
0989
0990
0991 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
0992 {
0993 if (!__wake_q_add(head, task))
0994 put_task_struct(task);
0995 }
0996
0997 void wake_up_q(struct wake_q_head *head)
0998 {
0999 struct wake_q_node *node = head->first;
1000
1001 while (node != WAKE_Q_TAIL) {
1002 struct task_struct *task;
1003
1004 task = container_of(node, struct task_struct, wake_q);
1005
1006 node = node->next;
1007 task->wake_q.next = NULL;
1008
1009
1010
1011
1012
1013 wake_up_process(task);
1014 put_task_struct(task);
1015 }
1016 }
1017
1018
1019
1020
1021
1022
1023
1024
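/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On the local CPU this just sets TIF_NEED_RESCHED and the preempt flag;
 * for a remote CPU it sends a reschedule IPI, unless the remote task is
 * already polling on TIF_NEED_RESCHED (idle loop), in which case setting
 * the flag is enough.
 */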
1025 void resched_curr(struct rq *rq)
1026 {
1027 struct task_struct *curr = rq->curr;
1028 int cpu;
1029
1030 lockdep_assert_rq_held(rq);
1031
1032 if (test_tsk_need_resched(curr))
1033 return;
1034
1035 cpu = cpu_of(rq);
1036
1037 if (cpu == smp_processor_id()) {
1038 set_tsk_need_resched(curr);
1039 set_preempt_need_resched();
1040 return;
1041 }
1042
1043 if (set_nr_and_not_polling(curr))
1044 smp_send_reschedule(cpu);
1045 else
1046 trace_sched_wake_idle_without_ipi(cpu);
1047 }
1048
1049 void resched_cpu(int cpu)
1050 {
1051 struct rq *rq = cpu_rq(cpu);
1052 unsigned long flags;
1053
1054 raw_spin_rq_lock_irqsave(rq, flags);
1055 if (cpu_online(cpu) || cpu == smp_processor_id())
1056 resched_curr(rq);
1057 raw_spin_rq_unlock_irqrestore(rq, flags);
1058 }
1059
1060 #ifdef CONFIG_SMP
1061 #ifdef CONFIG_NO_HZ_COMMON
1062
1063
1064
1065
1066
1067
1068
1069
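/*
 * In the semi-idle case, use the nearest busy housekeeping CPU for
 * migrating timers away from an idle CPU; this is good for power savings.
 * The search walks the scheduling domains outwards from the current CPU
 * and falls back to any HK_TYPE_TIMER housekeeping CPU if everything is
 * idle.
 */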
1070 int get_nohz_timer_target(void)
1071 {
1072 int i, cpu = smp_processor_id(), default_cpu = -1;
1073 struct sched_domain *sd;
1074 const struct cpumask *hk_mask;
1075
1076 if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
1077 if (!idle_cpu(cpu))
1078 return cpu;
1079 default_cpu = cpu;
1080 }
1081
1082 hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
1083
1084 rcu_read_lock();
1085 for_each_domain(cpu, sd) {
1086 for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
1087 if (cpu == i)
1088 continue;
1089
1090 if (!idle_cpu(i)) {
1091 cpu = i;
1092 goto unlock;
1093 }
1094 }
1095 }
1096
1097 if (default_cpu == -1)
1098 default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
1099 cpu = default_cpu;
1100 unlock:
1101 rcu_read_unlock();
1102 return cpu;
1103 }
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115 static void wake_up_idle_cpu(int cpu)
1116 {
1117 struct rq *rq = cpu_rq(cpu);
1118
1119 if (cpu == smp_processor_id())
1120 return;
1121
1122 if (set_nr_and_not_polling(rq->idle))
1123 smp_send_reschedule(cpu);
1124 else
1125 trace_sched_wake_idle_without_ipi(cpu);
1126 }
1127
1128 static bool wake_up_full_nohz_cpu(int cpu)
1129 {
1130
1131
1132
1133
1134
1135
1136 if (cpu_is_offline(cpu))
1137 return true;
1138 if (tick_nohz_full_cpu(cpu)) {
1139 if (cpu != smp_processor_id() ||
1140 tick_nohz_tick_stopped())
1141 tick_nohz_full_kick_cpu(cpu);
1142 return true;
1143 }
1144
1145 return false;
1146 }
1147
1148
1149
1150
1151
1152
1153 void wake_up_nohz_cpu(int cpu)
1154 {
1155 if (!wake_up_full_nohz_cpu(cpu))
1156 wake_up_idle_cpu(cpu);
1157 }
1158
1159 static void nohz_csd_func(void *info)
1160 {
1161 struct rq *rq = info;
1162 int cpu = cpu_of(rq);
1163 unsigned int flags;
1164
1165
1166
1167
1168 flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1169 WARN_ON(!(flags & NOHZ_KICK_MASK));
1170
1171 rq->idle_balance = idle_cpu(cpu);
1172 if (rq->idle_balance && !need_resched()) {
1173 rq->nohz_idle_balance = flags;
1174 raise_softirq_irqoff(SCHED_SOFTIRQ);
1175 }
1176 }
1177
1178 #endif
1179
1180 #ifdef CONFIG_NO_HZ_FULL
1181 bool sched_can_stop_tick(struct rq *rq)
1182 {
1183 int fifo_nr_running;
1184
1185
1186 if (rq->dl.dl_nr_running)
1187 return false;
1188
1189
1190
1191
1192
1193 if (rq->rt.rr_nr_running) {
1194 if (rq->rt.rr_nr_running == 1)
1195 return true;
1196 else
1197 return false;
1198 }
1199
1200
1201
1202
1203
1204 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1205 if (fifo_nr_running)
1206 return true;
1207
1208
1209
1210
1211
1212
1213 if (rq->nr_running > 1)
1214 return false;
1215
1216 return true;
1217 }
1218 #endif
1219 #endif
1220
1221 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1222 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1223
1224
1225
1226
1227
1228
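/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * entering a node and @up when leaving it for the final time.  A non-zero
 * return from either callback aborts the walk.  The caller must hold RCU
 * (or an equivalent) to protect the children list walk.
 */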
1229 int walk_tg_tree_from(struct task_group *from,
1230 tg_visitor down, tg_visitor up, void *data)
1231 {
1232 struct task_group *parent, *child;
1233 int ret;
1234
1235 parent = from;
1236
1237 down:
1238 ret = (*down)(parent, data);
1239 if (ret)
1240 goto out;
1241 list_for_each_entry_rcu(child, &parent->children, siblings) {
1242 parent = child;
1243 goto down;
1244
1245 up:
1246 continue;
1247 }
1248 ret = (*up)(parent, data);
1249 if (ret || parent == from)
1250 goto out;
1251
1252 child = parent;
1253 parent = parent->parent;
1254 if (parent)
1255 goto up;
1256 out:
1257 return ret;
1258 }
1259
1260 int tg_nop(struct task_group *tg, void *data)
1261 {
1262 return 0;
1263 }
1264 #endif
1265
1266 static void set_load_weight(struct task_struct *p, bool update_load)
1267 {
1268 int prio = p->static_prio - MAX_RT_PRIO;
1269 struct load_weight *load = &p->se.load;
1270
1271
1272
1273
1274 if (task_has_idle_policy(p)) {
1275 load->weight = scale_load(WEIGHT_IDLEPRIO);
1276 load->inv_weight = WMULT_IDLEPRIO;
1277 return;
1278 }
1279
1280
1281
1282
1283
1284 if (update_load && p->sched_class == &fair_sched_class) {
1285 reweight_task(p, prio);
1286 } else {
1287 load->weight = scale_load(sched_prio_to_weight[prio]);
1288 load->inv_weight = sched_prio_to_wmult[prio];
1289 }
1290 }
1291
1292 #ifdef CONFIG_UCLAMP_TASK
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303 static DEFINE_MUTEX(uclamp_mutex);
1304
1305
1306 static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1307
1308
1309 static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326 static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1327
1328
1329 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349 DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1350
1351
1352 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1353
1354 #define for_each_clamp_id(clamp_id) \
1355 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1356
1357 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1358 {
1359 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1360 }
1361
1362 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1363 {
1364 if (clamp_id == UCLAMP_MIN)
1365 return 0;
1366 return SCHED_CAPACITY_SCALE;
1367 }
1368
1369 static inline void uclamp_se_set(struct uclamp_se *uc_se,
1370 unsigned int value, bool user_defined)
1371 {
1372 uc_se->value = value;
1373 uc_se->bucket_id = uclamp_bucket_id(value);
1374 uc_se->user_defined = user_defined;
1375 }
1376
1377 static inline unsigned int
1378 uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1379 unsigned int clamp_value)
1380 {
1381
1382
1383
1384
1385
1386 if (clamp_id == UCLAMP_MAX) {
1387 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1388 return clamp_value;
1389 }
1390
1391 return uclamp_none(UCLAMP_MIN);
1392 }
1393
1394 static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1395 unsigned int clamp_value)
1396 {
1397
1398 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1399 return;
1400
1401 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1402 }
1403
1404 static inline
1405 unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1406 unsigned int clamp_value)
1407 {
1408 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1409 int bucket_id = UCLAMP_BUCKETS - 1;
1410
1411
1412
1413
1414
1415 for ( ; bucket_id >= 0; bucket_id--) {
1416 if (!bucket[bucket_id].tasks)
1417 continue;
1418 return bucket[bucket_id].value;
1419 }
1420
1421
1422 return uclamp_idle_value(rq, clamp_id, clamp_value);
1423 }
1424
1425 static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1426 {
1427 unsigned int default_util_min;
1428 struct uclamp_se *uc_se;
1429
1430 lockdep_assert_held(&p->pi_lock);
1431
1432 uc_se = &p->uclamp_req[UCLAMP_MIN];
1433
1434
1435 if (uc_se->user_defined)
1436 return;
1437
1438 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1439 uclamp_se_set(uc_se, default_util_min, false);
1440 }
1441
1442 static void uclamp_update_util_min_rt_default(struct task_struct *p)
1443 {
1444 struct rq_flags rf;
1445 struct rq *rq;
1446
1447 if (!rt_task(p))
1448 return;
1449
1450
1451 rq = task_rq_lock(p, &rf);
1452 __uclamp_update_util_min_rt_default(p);
1453 task_rq_unlock(rq, p, &rf);
1454 }
1455
1456 static inline struct uclamp_se
1457 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1458 {
1459
1460 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1461 #ifdef CONFIG_UCLAMP_TASK_GROUP
1462 unsigned int tg_min, tg_max, value;
1463
1464
1465
1466
1467
1468 if (task_group_is_autogroup(task_group(p)))
1469 return uc_req;
1470 if (task_group(p) == &root_task_group)
1471 return uc_req;
1472
1473 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1474 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1475 value = uc_req.value;
1476 value = clamp(value, tg_min, tg_max);
1477 uclamp_se_set(&uc_req, value, false);
1478 #endif
1479
1480 return uc_req;
1481 }
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491 static inline struct uclamp_se
1492 uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1493 {
1494 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1495 struct uclamp_se uc_max = uclamp_default[clamp_id];
1496
1497
1498 if (unlikely(uc_req.value > uc_max.value))
1499 return uc_max;
1500
1501 return uc_req;
1502 }
1503
1504 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1505 {
1506 struct uclamp_se uc_eff;
1507
1508
1509 if (p->uclamp[clamp_id].active)
1510 return (unsigned long)p->uclamp[clamp_id].value;
1511
1512 uc_eff = uclamp_eff_get(p, clamp_id);
1513
1514 return (unsigned long)uc_eff.value;
1515 }
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
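/*
 * When a task is enqueued on a rq, the clamp bucket selected by the task's
 * effective uclamp value is refcounted on that rq: the bucket's task count
 * is incremented, the bucket value is (re)initialized or raised to the
 * task's clamp value, and the rq-wide max clamp is raised if needed.
 */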
1527 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1528 enum uclamp_id clamp_id)
1529 {
1530 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1531 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1532 struct uclamp_bucket *bucket;
1533
1534 lockdep_assert_rq_held(rq);
1535
1536
1537 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1538
1539 bucket = &uc_rq->bucket[uc_se->bucket_id];
1540 bucket->tasks++;
1541 uc_se->active = true;
1542
1543 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1544
1545
1546
1547
1548
1549 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1550 bucket->value = uc_se->value;
1551
1552 if (uc_se->value > READ_ONCE(uc_rq->value))
1553 WRITE_ONCE(uc_rq->value, uc_se->value);
1554 }
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
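/*
 * When a task is dequeued, drop its reference on the clamp bucket it was
 * accounted in.  If that was the last task in the bucket and the bucket was
 * defining the rq-wide clamp, recompute the rq clamp as the max of the
 * remaining non-empty buckets (or the idle default).
 */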
1565 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1566 enum uclamp_id clamp_id)
1567 {
1568 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1569 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1570 struct uclamp_bucket *bucket;
1571 unsigned int bkt_clamp;
1572 unsigned int rq_clamp;
1573
1574 lockdep_assert_rq_held(rq);
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599 if (unlikely(!uc_se->active))
1600 return;
1601
1602 bucket = &uc_rq->bucket[uc_se->bucket_id];
1603
1604 SCHED_WARN_ON(!bucket->tasks);
1605 if (likely(bucket->tasks))
1606 bucket->tasks--;
1607
1608 uc_se->active = false;
1609
1610
1611
1612
1613
1614
1615
1616 if (likely(bucket->tasks))
1617 return;
1618
1619 rq_clamp = READ_ONCE(uc_rq->value);
1620
1621
1622
1623
1624 SCHED_WARN_ON(bucket->value > rq_clamp);
1625 if (bucket->value >= rq_clamp) {
1626 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1627 WRITE_ONCE(uc_rq->value, bkt_clamp);
1628 }
1629 }
1630
1631 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1632 {
1633 enum uclamp_id clamp_id;
1634
1635
1636
1637
1638
1639
1640
1641 if (!static_branch_unlikely(&sched_uclamp_used))
1642 return;
1643
1644 if (unlikely(!p->sched_class->uclamp_enabled))
1645 return;
1646
1647 for_each_clamp_id(clamp_id)
1648 uclamp_rq_inc_id(rq, p, clamp_id);
1649
1650
1651 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1652 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1653 }
1654
1655 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1656 {
1657 enum uclamp_id clamp_id;
1658
1659
1660
1661
1662
1663
1664
1665 if (!static_branch_unlikely(&sched_uclamp_used))
1666 return;
1667
1668 if (unlikely(!p->sched_class->uclamp_enabled))
1669 return;
1670
1671 for_each_clamp_id(clamp_id)
1672 uclamp_rq_dec_id(rq, p, clamp_id);
1673 }
1674
1675 static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1676 enum uclamp_id clamp_id)
1677 {
1678 if (!p->uclamp[clamp_id].active)
1679 return;
1680
1681 uclamp_rq_dec_id(rq, p, clamp_id);
1682 uclamp_rq_inc_id(rq, p, clamp_id);
1683
1684
1685
1686
1687
1688 if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1689 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1690 }
1691
1692 static inline void
1693 uclamp_update_active(struct task_struct *p)
1694 {
1695 enum uclamp_id clamp_id;
1696 struct rq_flags rf;
1697 struct rq *rq;
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707 rq = task_rq_lock(p, &rf);
1708
1709
1710
1711
1712
1713
1714
1715 for_each_clamp_id(clamp_id)
1716 uclamp_rq_reinc_id(rq, p, clamp_id);
1717
1718 task_rq_unlock(rq, p, &rf);
1719 }
1720
1721 #ifdef CONFIG_UCLAMP_TASK_GROUP
1722 static inline void
1723 uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1724 {
1725 struct css_task_iter it;
1726 struct task_struct *p;
1727
1728 css_task_iter_start(css, 0, &it);
1729 while ((p = css_task_iter_next(&it)))
1730 uclamp_update_active(p);
1731 css_task_iter_end(&it);
1732 }
1733
1734 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1735 #endif
1736
1737 #ifdef CONFIG_SYSCTL
1738 #ifdef CONFIG_UCLAMP_TASK
1739 #ifdef CONFIG_UCLAMP_TASK_GROUP
1740 static void uclamp_update_root_tg(void)
1741 {
1742 struct task_group *tg = &root_task_group;
1743
1744 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1745 sysctl_sched_uclamp_util_min, false);
1746 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1747 sysctl_sched_uclamp_util_max, false);
1748
1749 rcu_read_lock();
1750 cpu_util_update_eff(&root_task_group.css);
1751 rcu_read_unlock();
1752 }
1753 #else
1754 static void uclamp_update_root_tg(void) { }
1755 #endif
1756
1757 static void uclamp_sync_util_min_rt_default(void)
1758 {
1759 struct task_struct *g, *p;
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774 read_lock(&tasklist_lock);
1775 smp_mb__after_spinlock();
1776 read_unlock(&tasklist_lock);
1777
1778 rcu_read_lock();
1779 for_each_process_thread(g, p)
1780 uclamp_update_util_min_rt_default(p);
1781 rcu_read_unlock();
1782 }
1783
1784 static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1785 void *buffer, size_t *lenp, loff_t *ppos)
1786 {
1787 bool update_root_tg = false;
1788 int old_min, old_max, old_min_rt;
1789 int result;
1790
1791 mutex_lock(&uclamp_mutex);
1792 old_min = sysctl_sched_uclamp_util_min;
1793 old_max = sysctl_sched_uclamp_util_max;
1794 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1795
1796 result = proc_dointvec(table, write, buffer, lenp, ppos);
1797 if (result)
1798 goto undo;
1799 if (!write)
1800 goto done;
1801
1802 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1803 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1804 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1805
1806 result = -EINVAL;
1807 goto undo;
1808 }
1809
1810 if (old_min != sysctl_sched_uclamp_util_min) {
1811 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1812 sysctl_sched_uclamp_util_min, false);
1813 update_root_tg = true;
1814 }
1815 if (old_max != sysctl_sched_uclamp_util_max) {
1816 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1817 sysctl_sched_uclamp_util_max, false);
1818 update_root_tg = true;
1819 }
1820
1821 if (update_root_tg) {
1822 static_branch_enable(&sched_uclamp_used);
1823 uclamp_update_root_tg();
1824 }
1825
1826 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1827 static_branch_enable(&sched_uclamp_used);
1828 uclamp_sync_util_min_rt_default();
1829 }
1830
1831
1832
1833
1834
1835
1836
1837 goto done;
1838
1839 undo:
1840 sysctl_sched_uclamp_util_min = old_min;
1841 sysctl_sched_uclamp_util_max = old_max;
1842 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1843 done:
1844 mutex_unlock(&uclamp_mutex);
1845
1846 return result;
1847 }
1848 #endif
1849 #endif
1850
1851 static int uclamp_validate(struct task_struct *p,
1852 const struct sched_attr *attr)
1853 {
1854 int util_min = p->uclamp_req[UCLAMP_MIN].value;
1855 int util_max = p->uclamp_req[UCLAMP_MAX].value;
1856
1857 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1858 util_min = attr->sched_util_min;
1859
1860 if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1861 return -EINVAL;
1862 }
1863
1864 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1865 util_max = attr->sched_util_max;
1866
1867 if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1868 return -EINVAL;
1869 }
1870
1871 if (util_min != -1 && util_max != -1 && util_min > util_max)
1872 return -EINVAL;
1873
1874
1875
1876
1877
1878
1879
1880
1881 static_branch_enable(&sched_uclamp_used);
1882
1883 return 0;
1884 }
1885
1886 static bool uclamp_reset(const struct sched_attr *attr,
1887 enum uclamp_id clamp_id,
1888 struct uclamp_se *uc_se)
1889 {
1890
1891 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1892 !uc_se->user_defined)
1893 return true;
1894
1895
1896 if (clamp_id == UCLAMP_MIN &&
1897 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1898 attr->sched_util_min == -1) {
1899 return true;
1900 }
1901
1902 if (clamp_id == UCLAMP_MAX &&
1903 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1904 attr->sched_util_max == -1) {
1905 return true;
1906 }
1907
1908 return false;
1909 }
1910
1911 static void __setscheduler_uclamp(struct task_struct *p,
1912 const struct sched_attr *attr)
1913 {
1914 enum uclamp_id clamp_id;
1915
1916 for_each_clamp_id(clamp_id) {
1917 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1918 unsigned int value;
1919
1920 if (!uclamp_reset(attr, clamp_id, uc_se))
1921 continue;
1922
1923
1924
1925
1926
1927 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1928 value = sysctl_sched_uclamp_util_min_rt_default;
1929 else
1930 value = uclamp_none(clamp_id);
1931
1932 uclamp_se_set(uc_se, value, false);
1933
1934 }
1935
1936 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1937 return;
1938
1939 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1940 attr->sched_util_min != -1) {
1941 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1942 attr->sched_util_min, true);
1943 }
1944
1945 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1946 attr->sched_util_max != -1) {
1947 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1948 attr->sched_util_max, true);
1949 }
1950 }
1951
1952 static void uclamp_fork(struct task_struct *p)
1953 {
1954 enum uclamp_id clamp_id;
1955
1956
1957
1958
1959
1960 for_each_clamp_id(clamp_id)
1961 p->uclamp[clamp_id].active = false;
1962
1963 if (likely(!p->sched_reset_on_fork))
1964 return;
1965
1966 for_each_clamp_id(clamp_id) {
1967 uclamp_se_set(&p->uclamp_req[clamp_id],
1968 uclamp_none(clamp_id), false);
1969 }
1970 }
1971
1972 static void uclamp_post_fork(struct task_struct *p)
1973 {
1974 uclamp_update_util_min_rt_default(p);
1975 }
1976
1977 static void __init init_uclamp_rq(struct rq *rq)
1978 {
1979 enum uclamp_id clamp_id;
1980 struct uclamp_rq *uc_rq = rq->uclamp;
1981
1982 for_each_clamp_id(clamp_id) {
1983 uc_rq[clamp_id] = (struct uclamp_rq) {
1984 .value = uclamp_none(clamp_id)
1985 };
1986 }
1987
1988 rq->uclamp_flags = UCLAMP_FLAG_IDLE;
1989 }
1990
1991 static void __init init_uclamp(void)
1992 {
1993 struct uclamp_se uc_max = {};
1994 enum uclamp_id clamp_id;
1995 int cpu;
1996
1997 for_each_possible_cpu(cpu)
1998 init_uclamp_rq(cpu_rq(cpu));
1999
2000 for_each_clamp_id(clamp_id) {
2001 uclamp_se_set(&init_task.uclamp_req[clamp_id],
2002 uclamp_none(clamp_id), false);
2003 }
2004
2005
2006 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
2007 for_each_clamp_id(clamp_id) {
2008 uclamp_default[clamp_id] = uc_max;
2009 #ifdef CONFIG_UCLAMP_TASK_GROUP
2010 root_task_group.uclamp_req[clamp_id] = uc_max;
2011 root_task_group.uclamp[clamp_id] = uc_max;
2012 #endif
2013 }
2014 }
2015
2016 #else
2017 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
2018 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
2019 static inline int uclamp_validate(struct task_struct *p,
2020 const struct sched_attr *attr)
2021 {
2022 return -EOPNOTSUPP;
2023 }
2024 static void __setscheduler_uclamp(struct task_struct *p,
2025 const struct sched_attr *attr) { }
2026 static inline void uclamp_fork(struct task_struct *p) { }
2027 static inline void uclamp_post_fork(struct task_struct *p) { }
2028 static inline void init_uclamp(void) { }
2029 #endif
2030
2031 bool sched_task_on_rq(struct task_struct *p)
2032 {
2033 return task_on_rq_queued(p);
2034 }
2035
2036 unsigned long get_wchan(struct task_struct *p)
2037 {
2038 unsigned long ip = 0;
2039 unsigned int state;
2040
2041 if (!p || p == current)
2042 return 0;
2043
2044
2045 raw_spin_lock_irq(&p->pi_lock);
2046 state = READ_ONCE(p->__state);
2047 smp_rmb();
2048 if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
2049 ip = __get_wchan(p);
2050 raw_spin_unlock_irq(&p->pi_lock);
2051
2052 return ip;
2053 }
2054
2055 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
2056 {
2057 if (!(flags & ENQUEUE_NOCLOCK))
2058 update_rq_clock(rq);
2059
2060 if (!(flags & ENQUEUE_RESTORE)) {
2061 sched_info_enqueue(rq, p);
2062 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
2063 }
2064
2065 uclamp_rq_inc(rq, p);
2066 p->sched_class->enqueue_task(rq, p, flags);
2067
2068 if (sched_core_enabled(rq))
2069 sched_core_enqueue(rq, p);
2070 }
2071
2072 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
2073 {
2074 if (sched_core_enabled(rq))
2075 sched_core_dequeue(rq, p, flags);
2076
2077 if (!(flags & DEQUEUE_NOCLOCK))
2078 update_rq_clock(rq);
2079
2080 if (!(flags & DEQUEUE_SAVE)) {
2081 sched_info_dequeue(rq, p);
2082 psi_dequeue(p, flags & DEQUEUE_SLEEP);
2083 }
2084
2085 uclamp_rq_dec(rq, p);
2086 p->sched_class->dequeue_task(rq, p, flags);
2087 }
2088
2089 void activate_task(struct rq *rq, struct task_struct *p, int flags)
2090 {
2091 enqueue_task(rq, p, flags);
2092
2093 p->on_rq = TASK_ON_RQ_QUEUED;
2094 }
2095
2096 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
2097 {
2098 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
2099
2100 dequeue_task(rq, p, flags);
2101 }
2102
2103 static inline int __normal_prio(int policy, int rt_prio, int nice)
2104 {
2105 int prio;
2106
2107 if (dl_policy(policy))
2108 prio = MAX_DL_PRIO - 1;
2109 else if (rt_policy(policy))
2110 prio = MAX_RT_PRIO - 1 - rt_prio;
2111 else
2112 prio = NICE_TO_PRIO(nice);
2113
2114 return prio;
2115 }
2116
2117
2118
2119
2120
2121
2122
2123
2124 static inline int normal_prio(struct task_struct *p)
2125 {
2126 return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2127 }
2128
2129
2130
2131
2132
2133
2134
2135
2136 static int effective_prio(struct task_struct *p)
2137 {
2138 p->normal_prio = normal_prio(p);
2139
2140
2141
2142
2143
2144 if (!rt_prio(p->prio))
2145 return p->normal_prio;
2146 return p->prio;
2147 }
2148
2149
2150
2151
2152
2153
2154
2155 inline int task_curr(const struct task_struct *p)
2156 {
2157 return cpu_curr(task_cpu(p)) == p;
2158 }
2159
2160
2161
2162
2163
2164
2165
2166
2167 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2168 const struct sched_class *prev_class,
2169 int oldprio)
2170 {
2171 if (prev_class != p->sched_class) {
2172 if (prev_class->switched_from)
2173 prev_class->switched_from(rq, p);
2174
2175 p->sched_class->switched_to(rq, p);
2176 } else if (oldprio != p->prio || dl_task(p))
2177 p->sched_class->prio_changed(rq, p, oldprio);
2178 }
2179
2180 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2181 {
2182 if (p->sched_class == rq->curr->sched_class)
2183 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2184 else if (sched_class_above(p->sched_class, rq->curr->sched_class))
2185 resched_curr(rq);
2186
2187
2188
2189
2190
2191 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2192 rq_clock_skip_update(rq);
2193 }
2194
2195 #ifdef CONFIG_SMP
2196
2197 static void
2198 __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2199
2200 static int __set_cpus_allowed_ptr(struct task_struct *p,
2201 const struct cpumask *new_mask,
2202 u32 flags);
2203
2204 static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2205 {
2206 if (likely(!p->migration_disabled))
2207 return;
2208
2209 if (p->cpus_ptr != &p->cpus_mask)
2210 return;
2211
2212
2213
2214
2215 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2216 }
2217
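/*
 * migrate_disable()/migrate_enable() - temporarily pin the current task to
 * its CPU.  The calls nest; only the outermost pair actually pins/unpins.
 * The cpumask switch to cpumask_of(cpu) is deferred until the task
 * schedules out (see migrate_disable_switch() above, called from the
 * context-switch path).
 *
 * Illustrative usage (sketch, not code from this file):
 *
 *	migrate_disable();
 *	cpu = smp_processor_id();
 *	... the task may still be preempted, but it stays on @cpu ...
 *	migrate_enable();
 */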
2218 void migrate_disable(void)
2219 {
2220 struct task_struct *p = current;
2221
2222 if (p->migration_disabled) {
2223 p->migration_disabled++;
2224 return;
2225 }
2226
2227 preempt_disable();
2228 this_rq()->nr_pinned++;
2229 p->migration_disabled = 1;
2230 preempt_enable();
2231 }
2232 EXPORT_SYMBOL_GPL(migrate_disable);
2233
2234 void migrate_enable(void)
2235 {
2236 struct task_struct *p = current;
2237
2238 if (p->migration_disabled > 1) {
2239 p->migration_disabled--;
2240 return;
2241 }
2242
2243 if (WARN_ON_ONCE(!p->migration_disabled))
2244 return;
2245
2246
2247
2248
2249
2250 preempt_disable();
2251 if (p->cpus_ptr != &p->cpus_mask)
2252 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2253
2254
2255
2256
2257
2258 barrier();
2259 p->migration_disabled = 0;
2260 this_rq()->nr_pinned--;
2261 preempt_enable();
2262 }
2263 EXPORT_SYMBOL_GPL(migrate_enable);
2264
2265 static inline bool rq_has_pinned_tasks(struct rq *rq)
2266 {
2267 return rq->nr_pinned;
2268 }
2269
2270
2271
2272
2273
2274 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2275 {
2276
2277 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2278 return false;
2279
2280
2281 if (is_migration_disabled(p))
2282 return cpu_online(cpu);
2283
2284
2285 if (!(p->flags & PF_KTHREAD))
2286 return cpu_active(cpu) && task_cpu_possible(cpu, p);
2287
2288
2289 if (kthread_is_per_cpu(p))
2290 return cpu_online(cpu);
2291
2292
2293 if (cpu_dying(cpu))
2294 return false;
2295
2296
2297 return cpu_online(cpu);
2298 }
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2320 struct task_struct *p, int new_cpu)
2321 {
2322 lockdep_assert_rq_held(rq);
2323
2324 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2325 set_task_cpu(p, new_cpu);
2326 rq_unlock(rq, rf);
2327
2328 rq = cpu_rq(new_cpu);
2329
2330 rq_lock(rq, rf);
2331 BUG_ON(task_cpu(p) != new_cpu);
2332 activate_task(rq, p, 0);
2333 check_preempt_curr(rq, p, 0);
2334
2335 return rq;
2336 }
2337
2338 struct migration_arg {
2339 struct task_struct *task;
2340 int dest_cpu;
2341 struct set_affinity_pending *pending;
2342 };
2343
2344
2345
2346
2347
2348 struct set_affinity_pending {
2349 refcount_t refs;
2350 unsigned int stop_pending;
2351 struct completion done;
2352 struct cpu_stop_work stop_work;
2353 struct migration_arg arg;
2354 };
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2366 struct task_struct *p, int dest_cpu)
2367 {
2368
2369 if (!is_cpu_allowed(p, dest_cpu))
2370 return rq;
2371
2372 update_rq_clock(rq);
2373 rq = move_queued_task(rq, rf, p, dest_cpu);
2374
2375 return rq;
2376 }
2377
2378
2379
2380
2381
2382
2383 static int migration_cpu_stop(void *data)
2384 {
2385 struct migration_arg *arg = data;
2386 struct set_affinity_pending *pending = arg->pending;
2387 struct task_struct *p = arg->task;
2388 struct rq *rq = this_rq();
2389 bool complete = false;
2390 struct rq_flags rf;
2391
2392
2393
2394
2395
2396 local_irq_save(rf.flags);
2397
2398
2399
2400
2401
2402 flush_smp_call_function_queue();
2403
2404 raw_spin_lock(&p->pi_lock);
2405 rq_lock(rq, &rf);
2406
2407
2408
2409
2410
2411 WARN_ON_ONCE(pending && pending != p->migration_pending);
2412
2413
2414
2415
2416
2417
2418 if (task_rq(p) == rq) {
2419 if (is_migration_disabled(p))
2420 goto out;
2421
2422 if (pending) {
2423 p->migration_pending = NULL;
2424 complete = true;
2425
2426 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2427 goto out;
2428 }
2429
2430 if (task_on_rq_queued(p))
2431 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2432 else
2433 p->wake_cpu = arg->dest_cpu;
2434
2435
2436
2437
2438
2439
2440
2441
2442 } else if (pending) {
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2458 p->migration_pending = NULL;
2459 complete = true;
2460 goto out;
2461 }
2462
2463
2464
2465
2466
2467
2468 WARN_ON_ONCE(!pending->stop_pending);
2469 task_rq_unlock(rq, p, &rf);
2470 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2471 &pending->arg, &pending->stop_work);
2472 return 0;
2473 }
2474 out:
2475 if (pending)
2476 pending->stop_pending = false;
2477 task_rq_unlock(rq, p, &rf);
2478
2479 if (complete)
2480 complete_all(&pending->done);
2481
2482 return 0;
2483 }
2484
2485 int push_cpu_stop(void *arg)
2486 {
2487 struct rq *lowest_rq = NULL, *rq = this_rq();
2488 struct task_struct *p = arg;
2489
2490 raw_spin_lock_irq(&p->pi_lock);
2491 raw_spin_rq_lock(rq);
2492
2493 if (task_rq(p) != rq)
2494 goto out_unlock;
2495
2496 if (is_migration_disabled(p)) {
2497 p->migration_flags |= MDF_PUSH;
2498 goto out_unlock;
2499 }
2500
2501 p->migration_flags &= ~MDF_PUSH;
2502
2503 if (p->sched_class->find_lock_rq)
2504 lowest_rq = p->sched_class->find_lock_rq(p, rq);
2505
2506 if (!lowest_rq)
2507 goto out_unlock;
2508
2509
2510 if (task_rq(p) == rq) {
2511 deactivate_task(rq, p, 0);
2512 set_task_cpu(p, lowest_rq->cpu);
2513 activate_task(lowest_rq, p, 0);
2514 resched_curr(lowest_rq);
2515 }
2516
2517 double_unlock_balance(rq, lowest_rq);
2518
2519 out_unlock:
2520 rq->push_busy = false;
2521 raw_spin_rq_unlock(rq);
2522 raw_spin_unlock_irq(&p->pi_lock);
2523
2524 put_task_struct(p);
2525 return 0;
2526 }
2527
2528
2529
2530
2531
2532 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2533 {
2534 if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2535 p->cpus_ptr = new_mask;
2536 return;
2537 }
2538
2539 cpumask_copy(&p->cpus_mask, new_mask);
2540 p->nr_cpus_allowed = cpumask_weight(new_mask);
2541 }
2542
2543 static void
2544 __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2545 {
2546 struct rq *rq = task_rq(p);
2547 bool queued, running;
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561 if (flags & SCA_MIGRATE_DISABLE)
2562 SCHED_WARN_ON(!p->on_cpu);
2563 else
2564 lockdep_assert_held(&p->pi_lock);
2565
2566 queued = task_on_rq_queued(p);
2567 running = task_current(rq, p);
2568
2569 if (queued) {
2570
2571
2572
2573
2574 lockdep_assert_rq_held(rq);
2575 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2576 }
2577 if (running)
2578 put_prev_task(rq, p);
2579
2580 p->sched_class->set_cpus_allowed(p, new_mask, flags);
2581
2582 if (queued)
2583 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2584 if (running)
2585 set_next_task(rq, p);
2586 }
2587
2588 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2589 {
2590 __do_set_cpus_allowed(p, new_mask, 0);
2591 }
2592
2593 int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
2594 int node)
2595 {
2596 if (!src->user_cpus_ptr)
2597 return 0;
2598
2599 dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
2600 if (!dst->user_cpus_ptr)
2601 return -ENOMEM;
2602
2603 cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
2604 return 0;
2605 }
2606
2607 static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
2608 {
2609 struct cpumask *user_mask = NULL;
2610
2611 swap(p->user_cpus_ptr, user_mask);
2612
2613 return user_mask;
2614 }
2615
2616 void release_user_cpus_ptr(struct task_struct *p)
2617 {
2618 kfree(clear_user_cpus_ptr(p));
2619 }
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
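/*
 * affine_move_task() - called from __set_cpus_allowed_ptr_locked() with
 * both p->pi_lock and the rq lock held (it drops them via task_rq_unlock()).
 *
 * Condensed summary of the protocol below: if the task already runs on a
 * CPU inside the new mask, any stale migration_pending is completed and,
 * for MIGRATE_ENABLE, a pending push is kicked off.  Otherwise a
 * set_affinity_pending is installed (or an existing one reused), a
 * migration_cpu_stop stopper is queued on the task's CPU when needed, and
 * the caller then waits for the migration to complete (except in the
 * MIGRATE_ENABLE case, which returns without waiting).
 */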
2697 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2698 int dest_cpu, unsigned int flags)
2699 {
2700 struct set_affinity_pending my_pending = { }, *pending = NULL;
2701 bool stop_pending, complete = false;
2702
2703
2704 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2705 struct task_struct *push_task = NULL;
2706
2707 if ((flags & SCA_MIGRATE_ENABLE) &&
2708 (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2709 rq->push_busy = true;
2710 push_task = get_task_struct(p);
2711 }
2712
2713
2714
2715
2716
2717 pending = p->migration_pending;
2718 if (pending && !pending->stop_pending) {
2719 p->migration_pending = NULL;
2720 complete = true;
2721 }
2722
2723 task_rq_unlock(rq, p, rf);
2724
2725 if (push_task) {
2726 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2727 p, &rq->push_work);
2728 }
2729
2730 if (complete)
2731 complete_all(&pending->done);
2732
2733 return 0;
2734 }
2735
2736 if (!(flags & SCA_MIGRATE_ENABLE)) {
2737
2738 if (!p->migration_pending) {
2739
2740 refcount_set(&my_pending.refs, 1);
2741 init_completion(&my_pending.done);
2742 my_pending.arg = (struct migration_arg) {
2743 .task = p,
2744 .dest_cpu = dest_cpu,
2745 .pending = &my_pending,
2746 };
2747
2748 p->migration_pending = &my_pending;
2749 } else {
2750 pending = p->migration_pending;
2751 refcount_inc(&pending->refs);
2752
2753
2754
2755
2756
2757
2758
2759
2760 pending->arg.dest_cpu = dest_cpu;
2761 }
2762 }
2763 pending = p->migration_pending;
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776 if (WARN_ON_ONCE(!pending)) {
2777 task_rq_unlock(rq, p, rf);
2778 return -EINVAL;
2779 }
2780
2781 if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2782
2783
2784
2785
2786
2787 stop_pending = pending->stop_pending;
2788 if (!stop_pending)
2789 pending->stop_pending = true;
2790
2791 if (flags & SCA_MIGRATE_ENABLE)
2792 p->migration_flags &= ~MDF_PUSH;
2793
2794 task_rq_unlock(rq, p, rf);
2795
2796 if (!stop_pending) {
2797 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2798 &pending->arg, &pending->stop_work);
2799 }
2800
2801 if (flags & SCA_MIGRATE_ENABLE)
2802 return 0;
2803 } else {
2804
2805 if (!is_migration_disabled(p)) {
2806 if (task_on_rq_queued(p))
2807 rq = move_queued_task(rq, rf, p, dest_cpu);
2808
2809 if (!pending->stop_pending) {
2810 p->migration_pending = NULL;
2811 complete = true;
2812 }
2813 }
2814 task_rq_unlock(rq, p, rf);
2815
2816 if (complete)
2817 complete_all(&pending->done);
2818 }
2819
2820 wait_for_completion(&pending->done);
2821
2822 if (refcount_dec_and_test(&pending->refs))
2823 wake_up_var(&pending->refs);
2824
2825
2826
2827
2828
2829 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2830
2831
2832 WARN_ON_ONCE(my_pending.stop_pending);
2833
2834 return 0;
2835 }
2836
2837
2838
2839
2840 static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2841 const struct cpumask *new_mask,
2842 u32 flags,
2843 struct rq *rq,
2844 struct rq_flags *rf)
2845 __releases(rq->lock)
2846 __releases(p->pi_lock)
2847 {
2848 const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2849 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2850 bool kthread = p->flags & PF_KTHREAD;
2851 struct cpumask *user_mask = NULL;
2852 unsigned int dest_cpu;
2853 int ret = 0;
2854
2855 update_rq_clock(rq);
2856
2857 if (kthread || is_migration_disabled(p)) {
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868 cpu_valid_mask = cpu_online_mask;
2869 }
2870
2871 if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
2872 ret = -EINVAL;
2873 goto out;
2874 }
2875
2876
2877
2878
2879
2880 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2881 ret = -EINVAL;
2882 goto out;
2883 }
2884
2885 if (!(flags & SCA_MIGRATE_ENABLE)) {
2886 if (cpumask_equal(&p->cpus_mask, new_mask))
2887 goto out;
2888
2889 if (WARN_ON_ONCE(p == current &&
2890 is_migration_disabled(p) &&
2891 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2892 ret = -EBUSY;
2893 goto out;
2894 }
2895 }
2896
2897
2898
2899
2900
2901
2902 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2903 if (dest_cpu >= nr_cpu_ids) {
2904 ret = -EINVAL;
2905 goto out;
2906 }
2907
2908 __do_set_cpus_allowed(p, new_mask, flags);
2909
2910 if (flags & SCA_USER)
2911 user_mask = clear_user_cpus_ptr(p);
2912
2913 ret = affine_move_task(rq, p, rf, dest_cpu, flags);
2914
2915 kfree(user_mask);
2916
2917 return ret;
2918
2919 out:
2920 task_rq_unlock(rq, p, rf);
2921
2922 return ret;
2923 }
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
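/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */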
2934 static int __set_cpus_allowed_ptr(struct task_struct *p,
2935 const struct cpumask *new_mask, u32 flags)
2936 {
2937 struct rq_flags rf;
2938 struct rq *rq;
2939
2940 rq = task_rq_lock(p, &rf);
2941 return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
2942 }
2943
2944 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2945 {
2946 return __set_cpus_allowed_ptr(p, new_mask, 0);
2947 }
2948 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
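/*
 * Illustrative usage only, not taken from this file: a caller holding a
 * reference to a kernel thread could pin it to a single CPU with
 *
 *	set_cpus_allowed_ptr(worker, cpumask_of(target_cpu));
 *
 * where "worker" and "target_cpu" are hypothetical names used purely for
 * this sketch.
 */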
2949
2950
2951
2952
2953
2954
2955
2956
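/*
 * Change a given task's CPU affinity to the intersection of its current
 * affinity mask and @subset_mask, writing the resulting mask to @new_mask
 * and pointing @p->user_cpus_ptr to a copy of the old mask.
 * If the intersection is empty, or the task has a deadline policy with
 * bandwidth control enabled, an error is returned instead.
 */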
2957 static int restrict_cpus_allowed_ptr(struct task_struct *p,
2958 struct cpumask *new_mask,
2959 const struct cpumask *subset_mask)
2960 {
2961 struct cpumask *user_mask = NULL;
2962 struct rq_flags rf;
2963 struct rq *rq;
2964 int err;
2965
2966 if (!p->user_cpus_ptr) {
2967 user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
2968 if (!user_mask)
2969 return -ENOMEM;
2970 }
2971
2972 rq = task_rq_lock(p, &rf);
2973
2974
2975
2976
2977
2978
2979 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
2980 err = -EPERM;
2981 goto err_unlock;
2982 }
2983
2984 if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2985 err = -EINVAL;
2986 goto err_unlock;
2987 }
2988
2989
2990
2991
2992
2993 if (user_mask) {
2994 cpumask_copy(user_mask, p->cpus_ptr);
2995 p->user_cpus_ptr = user_mask;
2996 }
2997
2998 return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
2999
3000 err_unlock:
3001 task_rq_unlock(rq, p, &rf);
3002 kfree(user_mask);
3003 return err;
3004 }
3005
3006
3007
3008
3009
3010
3011
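/*
 * Restrict the CPU affinity of task @p so that it is a subset of
 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
 * old affinity mask. If the restriction fails (e.g. the intersection is
 * empty), fall back to the cpuset-allowed CPUs and warn about the
 * affinity override.
 */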
3012 void force_compatible_cpus_allowed_ptr(struct task_struct *p)
3013 {
3014 cpumask_var_t new_mask;
3015 const struct cpumask *override_mask = task_cpu_possible_mask(p);
3016
3017 alloc_cpumask_var(&new_mask, GFP_KERNEL);
3018
3019
3020
3021
3022
3023
3024 cpus_read_lock();
3025 if (!cpumask_available(new_mask))
3026 goto out_set_mask;
3027
3028 if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
3029 goto out_free_mask;
3030
3031
3032
3033
3034
3035 cpuset_cpus_allowed(p, new_mask);
3036 override_mask = new_mask;
3037
3038 out_set_mask:
3039 if (printk_ratelimit()) {
3040 printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
3041 task_pid_nr(p), p->comm,
3042 cpumask_pr_args(override_mask));
3043 }
3044
3045 WARN_ON(set_cpus_allowed_ptr(p, override_mask));
3046 out_free_mask:
3047 cpus_read_unlock();
3048 free_cpumask_var(new_mask);
3049 }
3050
3051 static int
3052 __sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
3053
3054
3055
3056
3057
3058
3059
3060
3061
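/*
 * Restore the affinity of a task @p which was previously restricted by a
 * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
 * @p->user_cpus_ptr.
 *
 * It is the caller's responsibility to serialise this with any calls to
 * force_compatible_cpus_allowed_ptr(@p).
 */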
3062 void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
3063 {
3064 struct cpumask *user_mask = p->user_cpus_ptr;
3065 unsigned long flags;
3066
3067
3068
3069
3070
3071
3072 if (!user_mask || !__sched_setaffinity(p, user_mask))
3073 return;
3074
3075 raw_spin_lock_irqsave(&p->pi_lock, flags);
3076 user_mask = clear_user_cpus_ptr(p);
3077 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3078
3079 kfree(user_mask);
3080 }
3081
3082 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
3083 {
3084 #ifdef CONFIG_SCHED_DEBUG
3085 unsigned int state = READ_ONCE(p->__state);
3086
3087
3088
3089
3090
3091 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
3092
3093
3094
3095
3096
3097
3098 WARN_ON_ONCE(state == TASK_RUNNING &&
3099 p->sched_class == &fair_sched_class &&
3100 (p->on_rq && !task_on_rq_migrating(p)));
3101
3102 #ifdef CONFIG_LOCKDEP
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
3114 lockdep_is_held(__rq_lockp(task_rq(p)))));
3115 #endif
3116
3117
3118
3119 WARN_ON_ONCE(!cpu_online(new_cpu));
3120
3121 WARN_ON_ONCE(is_migration_disabled(p));
3122 #endif
3123
3124 trace_sched_migrate_task(p, new_cpu);
3125
3126 if (task_cpu(p) != new_cpu) {
3127 if (p->sched_class->migrate_task_rq)
3128 p->sched_class->migrate_task_rq(p, new_cpu);
3129 p->se.nr_migrations++;
3130 rseq_migrate(p);
3131 perf_event_task_migrate(p);
3132 }
3133
3134 __set_task_cpu(p, new_cpu);
3135 }
3136
3137 #ifdef CONFIG_NUMA_BALANCING
3138 static void __migrate_swap_task(struct task_struct *p, int cpu)
3139 {
3140 if (task_on_rq_queued(p)) {
3141 struct rq *src_rq, *dst_rq;
3142 struct rq_flags srf, drf;
3143
3144 src_rq = task_rq(p);
3145 dst_rq = cpu_rq(cpu);
3146
3147 rq_pin_lock(src_rq, &srf);
3148 rq_pin_lock(dst_rq, &drf);
3149
3150 deactivate_task(src_rq, p, 0);
3151 set_task_cpu(p, cpu);
3152 activate_task(dst_rq, p, 0);
3153 check_preempt_curr(dst_rq, p, 0);
3154
3155 rq_unpin_lock(dst_rq, &drf);
3156 rq_unpin_lock(src_rq, &srf);
3157
3158 } else {
3159
3160
3161
3162
3163
3164 p->wake_cpu = cpu;
3165 }
3166 }
3167
3168 struct migration_swap_arg {
3169 struct task_struct *src_task, *dst_task;
3170 int src_cpu, dst_cpu;
3171 };
3172
3173 static int migrate_swap_stop(void *data)
3174 {
3175 struct migration_swap_arg *arg = data;
3176 struct rq *src_rq, *dst_rq;
3177 int ret = -EAGAIN;
3178
3179 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
3180 return -EAGAIN;
3181
3182 src_rq = cpu_rq(arg->src_cpu);
3183 dst_rq = cpu_rq(arg->dst_cpu);
3184
3185 double_raw_lock(&arg->src_task->pi_lock,
3186 &arg->dst_task->pi_lock);
3187 double_rq_lock(src_rq, dst_rq);
3188
3189 if (task_cpu(arg->dst_task) != arg->dst_cpu)
3190 goto unlock;
3191
3192 if (task_cpu(arg->src_task) != arg->src_cpu)
3193 goto unlock;
3194
3195 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
3196 goto unlock;
3197
3198 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
3199 goto unlock;
3200
3201 __migrate_swap_task(arg->src_task, arg->dst_cpu);
3202 __migrate_swap_task(arg->dst_task, arg->src_cpu);
3203
3204 ret = 0;
3205
3206 unlock:
3207 double_rq_unlock(src_rq, dst_rq);
3208 raw_spin_unlock(&arg->dst_task->pi_lock);
3209 raw_spin_unlock(&arg->src_task->pi_lock);
3210
3211 return ret;
3212 }
3213
3214
3215
3216
3217 int migrate_swap(struct task_struct *cur, struct task_struct *p,
3218 int target_cpu, int curr_cpu)
3219 {
3220 struct migration_swap_arg arg;
3221 int ret = -EINVAL;
3222
3223 arg = (struct migration_swap_arg){
3224 .src_task = cur,
3225 .src_cpu = curr_cpu,
3226 .dst_task = p,
3227 .dst_cpu = target_cpu,
3228 };
3229
3230 if (arg.src_cpu == arg.dst_cpu)
3231 goto out;
3232
3233
3234
3235
3236
3237 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
3238 goto out;
3239
3240 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
3241 goto out;
3242
3243 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
3244 goto out;
3245
3246 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
3247 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
3248
3249 out:
3250 return ret;
3251 }
3252 #endif
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
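/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->__state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a non-zero switch-count cookie. If a second call a short
 * while later returns the same number, the caller can be sure that @p
 * has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time.
 */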
3270 unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
3271 {
3272 int running, queued;
3273 struct rq_flags rf;
3274 unsigned long ncsw;
3275 struct rq *rq;
3276
3277 for (;;) {
3278
3279
3280
3281
3282
3283
3284 rq = task_rq(p);
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297 while (task_running(rq, p)) {
3298 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3299 return 0;
3300 cpu_relax();
3301 }
3302
3303
3304
3305
3306
3307
3308 rq = task_rq_lock(p, &rf);
3309 trace_sched_wait_task(p);
3310 running = task_running(rq, p);
3311 queued = task_on_rq_queued(p);
3312 ncsw = 0;
3313 if (!match_state || READ_ONCE(p->__state) == match_state)
3314 ncsw = p->nvcsw | LONG_MIN;
3315 task_rq_unlock(rq, p, &rf);
3316
3317
3318
3319
3320 if (unlikely(!ncsw))
3321 break;
3322
3323
3324
3325
3326
3327
3328
3329 if (unlikely(running)) {
3330 cpu_relax();
3331 continue;
3332 }
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343 if (unlikely(queued)) {
3344 ktime_t to = NSEC_PER_SEC / HZ;
3345
3346 set_current_state(TASK_UNINTERRUPTIBLE);
3347 schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
3348 continue;
3349 }
3350
3351
3352
3353
3354
3355
3356 break;
3357 }
3358
3359 return ncsw;
3360 }
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
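/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter kernel-mode,
 * without any delay (e.g. to get signals handled).
 */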
3375 void kick_process(struct task_struct *p)
3376 {
3377 int cpu;
3378
3379 preempt_disable();
3380 cpu = task_cpu(p);
3381 if ((cpu != smp_processor_id()) && task_curr(p))
3382 smp_send_reschedule(cpu);
3383 preempt_enable();
3384 }
3385 EXPORT_SYMBOL_GPL(kick_process);
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
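/*
 * Called when @cpu is not usable for @p (offline or not in p->cpus_ptr).
 * Pick a replacement: prefer an allowed CPU on the same NUMA node, then
 * any allowed CPU, and if none exists widen the affinity via the cpuset
 * fallback and finally the task's possible mask.
 */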
3409 static int select_fallback_rq(int cpu, struct task_struct *p)
3410 {
3411 int nid = cpu_to_node(cpu);
3412 const struct cpumask *nodemask = NULL;
3413 enum { cpuset, possible, fail } state = cpuset;
3414 int dest_cpu;
3415
3416
3417
3418
3419
3420
3421 if (nid != -1) {
3422 nodemask = cpumask_of_node(nid);
3423
3424
3425 for_each_cpu(dest_cpu, nodemask) {
3426 if (is_cpu_allowed(p, dest_cpu))
3427 return dest_cpu;
3428 }
3429 }
3430
3431 for (;;) {
3432
3433 for_each_cpu(dest_cpu, p->cpus_ptr) {
3434 if (!is_cpu_allowed(p, dest_cpu))
3435 continue;
3436
3437 goto out;
3438 }
3439
3440
3441 switch (state) {
3442 case cpuset:
3443 if (cpuset_cpus_allowed_fallback(p)) {
3444 state = possible;
3445 break;
3446 }
3447 fallthrough;
3448 case possible:
3449
3450
3451
3452
3453
3454
3455 do_set_cpus_allowed(p, task_cpu_possible_mask(p));
3456 state = fail;
3457 break;
3458 case fail:
3459 BUG();
3460 break;
3461 }
3462 }
3463
3464 out:
3465 if (state != cpuset) {
3466
3467
3468
3469
3470
3471 if (p->mm && printk_ratelimit()) {
3472 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3473 task_pid_nr(p), p->comm, cpu);
3474 }
3475 }
3476
3477 return dest_cpu;
3478 }
3479
3480
3481
3482
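/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */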
3483 static inline
3484 int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3485 {
3486 lockdep_assert_held(&p->pi_lock);
3487
3488 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3489 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3490 else
3491 cpu = cpumask_any(p->cpus_ptr);
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503 if (unlikely(!is_cpu_allowed(p, cpu)))
3504 cpu = select_fallback_rq(task_cpu(p), p);
3505
3506 return cpu;
3507 }
3508
3509 void sched_set_stop_task(int cpu, struct task_struct *stop)
3510 {
3511 static struct lock_class_key stop_pi_lock;
3512 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3513 struct task_struct *old_stop = cpu_rq(cpu)->stop;
3514
3515 if (stop) {
3516
3517
3518
3519
3520
3521
3522
3523
3524 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3525
3526 stop->sched_class = &stop_sched_class;
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3541 }
3542
3543 cpu_rq(cpu)->stop = stop;
3544
3545 if (old_stop) {
3546
3547
3548
3549
3550 old_stop->sched_class = &rt_sched_class;
3551 }
3552 }
3553
3554 #else
3555
3556 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3557 const struct cpumask *new_mask,
3558 u32 flags)
3559 {
3560 return set_cpus_allowed_ptr(p, new_mask);
3561 }
3562
3563 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3564
3565 static inline bool rq_has_pinned_tasks(struct rq *rq)
3566 {
3567 return false;
3568 }
3569
3570 #endif
3571
3572 static void
3573 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3574 {
3575 struct rq *rq;
3576
3577 if (!schedstat_enabled())
3578 return;
3579
3580 rq = this_rq();
3581
3582 #ifdef CONFIG_SMP
3583 if (cpu == rq->cpu) {
3584 __schedstat_inc(rq->ttwu_local);
3585 __schedstat_inc(p->stats.nr_wakeups_local);
3586 } else {
3587 struct sched_domain *sd;
3588
3589 __schedstat_inc(p->stats.nr_wakeups_remote);
3590 rcu_read_lock();
3591 for_each_domain(rq->cpu, sd) {
3592 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3593 __schedstat_inc(sd->ttwu_wake_remote);
3594 break;
3595 }
3596 }
3597 rcu_read_unlock();
3598 }
3599
3600 if (wake_flags & WF_MIGRATED)
3601 __schedstat_inc(p->stats.nr_wakeups_migrate);
3602 #endif
3603
3604 __schedstat_inc(rq->ttwu_count);
3605 __schedstat_inc(p->stats.nr_wakeups);
3606
3607 if (wake_flags & WF_SYNC)
3608 __schedstat_inc(p->stats.nr_wakeups_sync);
3609 }
3610
3611
3612
3613
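/*
 * Mark the task runnable and perform wakeup-preemption.
 */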
3614 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3615 struct rq_flags *rf)
3616 {
3617 check_preempt_curr(rq, p, wake_flags);
3618 WRITE_ONCE(p->__state, TASK_RUNNING);
3619 trace_sched_wakeup(p);
3620
3621 #ifdef CONFIG_SMP
3622 if (p->sched_class->task_woken) {
3623
3624
3625
3626
3627 rq_unpin_lock(rq, rf);
3628 p->sched_class->task_woken(rq, p);
3629 rq_repin_lock(rq, rf);
3630 }
3631
3632 if (rq->idle_stamp) {
3633 u64 delta = rq_clock(rq) - rq->idle_stamp;
3634 u64 max = 2*rq->max_idle_balance_cost;
3635
3636 update_avg(&rq->avg_idle, delta);
3637
3638 if (rq->avg_idle > max)
3639 rq->avg_idle = max;
3640
3641 rq->wake_stamp = jiffies;
3642 rq->wake_avg_idle = rq->avg_idle / 2;
3643
3644 rq->idle_stamp = 0;
3645 }
3646 #endif
3647 }
3648
3649 static void
3650 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3651 struct rq_flags *rf)
3652 {
3653 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3654
3655 lockdep_assert_rq_held(rq);
3656
3657 if (p->sched_contributes_to_load)
3658 rq->nr_uninterruptible--;
3659
3660 #ifdef CONFIG_SMP
3661 if (wake_flags & WF_MIGRATED)
3662 en_flags |= ENQUEUE_MIGRATED;
3663 else
3664 #endif
3665 if (p->in_iowait) {
3666 delayacct_blkio_end(p);
3667 atomic_dec(&task_rq(p)->nr_iowait);
3668 }
3669
3670 activate_task(rq, p, en_flags);
3671 ttwu_do_wakeup(rq, p, wake_flags, rf);
3672 }
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
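/*
 * Wakeup fast path: if @p is still queued on its runqueue we only need to
 * flip its state back to TASK_RUNNING; no full (de)activation is needed.
 * Returns 1 if the task was queued and has been woken this way, 0 otherwise.
 */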
3699 static int ttwu_runnable(struct task_struct *p, int wake_flags)
3700 {
3701 struct rq_flags rf;
3702 struct rq *rq;
3703 int ret = 0;
3704
3705 rq = __task_rq_lock(p, &rf);
3706 if (task_on_rq_queued(p)) {
3707
3708 update_rq_clock(rq);
3709 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3710 ret = 1;
3711 }
3712 __task_rq_unlock(rq, &rf);
3713
3714 return ret;
3715 }
3716
3717 #ifdef CONFIG_SMP
3718 void sched_ttwu_pending(void *arg)
3719 {
3720 struct llist_node *llist = arg;
3721 struct rq *rq = this_rq();
3722 struct task_struct *p, *t;
3723 struct rq_flags rf;
3724
3725 if (!llist)
3726 return;
3727
3728
3729
3730
3731
3732
3733 WRITE_ONCE(rq->ttwu_pending, 0);
3734
3735 rq_lock_irqsave(rq, &rf);
3736 update_rq_clock(rq);
3737
3738 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3739 if (WARN_ON_ONCE(p->on_cpu))
3740 smp_cond_load_acquire(&p->on_cpu, !VAL);
3741
3742 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3743 set_task_cpu(p, cpu_of(rq));
3744
3745 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3746 }
3747
3748 rq_unlock_irqrestore(rq, &rf);
3749 }
3750
3751 void send_call_function_single_ipi(int cpu)
3752 {
3753 struct rq *rq = cpu_rq(cpu);
3754
3755 if (!set_nr_if_polling(rq->idle))
3756 arch_send_call_function_single_ipi(cpu);
3757 else
3758 trace_sched_wake_idle_without_ipi(cpu);
3759 }
3760
3761
3762
3763
3764
3765
3766
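/*
 * Queue @p on the remote CPU's wakeup llist and mark ttwu_pending; the
 * target CPU is then poked (with an IPI unless it is polling) and performs
 * the actual activation in sched_ttwu_pending().
 */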
3767 static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3768 {
3769 struct rq *rq = cpu_rq(cpu);
3770
3771 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3772
3773 WRITE_ONCE(rq->ttwu_pending, 1);
3774 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3775 }
3776
3777 void wake_up_if_idle(int cpu)
3778 {
3779 struct rq *rq = cpu_rq(cpu);
3780 struct rq_flags rf;
3781
3782 rcu_read_lock();
3783
3784 if (!is_idle_task(rcu_dereference(rq->curr)))
3785 goto out;
3786
3787 rq_lock_irqsave(rq, &rf);
3788 if (is_idle_task(rq->curr))
3789 resched_curr(rq);
3790
3791 rq_unlock_irqrestore(rq, &rf);
3792
3793 out:
3794 rcu_read_unlock();
3795 }
3796
3797 bool cpus_share_cache(int this_cpu, int that_cpu)
3798 {
3799 if (this_cpu == that_cpu)
3800 return true;
3801
3802 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3803 }
3804
3805 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
3806 {
3807
3808
3809
3810
3811 if (!cpu_active(cpu))
3812 return false;
3813
3814
3815 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
3816 return false;
3817
3818
3819
3820
3821
3822 if (!cpus_share_cache(smp_processor_id(), cpu))
3823 return true;
3824
3825 if (cpu == smp_processor_id())
3826 return false;
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839 if (!cpu_rq(cpu)->nr_running)
3840 return true;
3841
3842 return false;
3843 }
3844
3845 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3846 {
3847 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
3848 sched_clock_cpu(cpu);
3849 __ttwu_queue_wakelist(p, cpu, wake_flags);
3850 return true;
3851 }
3852
3853 return false;
3854 }
3855
3856 #else
3857
3858 static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3859 {
3860 return false;
3861 }
3862
3863 #endif
3864
3865 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3866 {
3867 struct rq *rq = cpu_rq(cpu);
3868 struct rq_flags rf;
3869
3870 if (ttwu_queue_wakelist(p, cpu, wake_flags))
3871 return;
3872
3873 rq_lock(rq, &rf);
3874 update_rq_clock(rq);
3875 ttwu_do_activate(rq, p, wake_flags, &rf);
3876 rq_unlock(rq, &rf);
3877 }
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
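/*
 * Invoked from try_to_wake_up() to check whether the task state matches
 * @state and, if so, record the wakeup as successful. On PREEMPT_RT a
 * task blocked on an rtmutex has its "real" sleeping state stashed in
 * p->saved_state, which is matched and cleared here as well.
 */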
3893 static __always_inline
3894 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3895 {
3896 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3897 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3898 state != TASK_RTLOCK_WAIT);
3899 }
3900
3901 if (READ_ONCE(p->__state) & state) {
3902 *success = 1;
3903 return true;
3904 }
3905
3906 #ifdef CONFIG_PREEMPT_RT
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920 if (p->saved_state & state) {
3921 p->saved_state = TASK_RUNNING;
3922 *success = 1;
3923 }
3924 #endif
3925 return false;
3926 }
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
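/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Conceptually does:
 *
 *   If (@state & @p->state) @p->state = TASK_RUNNING.
 *
 * If the task was not queued/runnable, also place it back on a runqueue.
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */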
4048 static int
4049 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
4050 {
4051 unsigned long flags;
4052 int cpu, success = 0;
4053
4054 preempt_disable();
4055 if (p == current) {
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067 if (!ttwu_state_match(p, state, &success))
4068 goto out;
4069
4070 trace_sched_waking(p);
4071 WRITE_ONCE(p->__state, TASK_RUNNING);
4072 trace_sched_wakeup(p);
4073 goto out;
4074 }
4075
4076
4077
4078
4079
4080
4081
4082 raw_spin_lock_irqsave(&p->pi_lock, flags);
4083 smp_mb__after_spinlock();
4084 if (!ttwu_state_match(p, state, &success))
4085 goto unlock;
4086
4087 trace_sched_waking(p);
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111 smp_rmb();
4112 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
4113 goto unlock;
4114
4115 #ifdef CONFIG_SMP
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139 smp_acquire__after_ctrl_dep();
4140
4141
4142
4143
4144
4145
4146
4147 WRITE_ONCE(p->__state, TASK_WAKING);
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168 if (smp_load_acquire(&p->on_cpu) &&
4169 ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
4170 goto unlock;
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181 smp_cond_load_acquire(&p->on_cpu, !VAL);
4182
4183 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
4184 if (task_cpu(p) != cpu) {
4185 if (p->in_iowait) {
4186 delayacct_blkio_end(p);
4187 atomic_dec(&task_rq(p)->nr_iowait);
4188 }
4189
4190 wake_flags |= WF_MIGRATED;
4191 psi_ttwu_dequeue(p);
4192 set_task_cpu(p, cpu);
4193 }
4194 #else
4195 cpu = task_cpu(p);
4196 #endif
4197
4198 ttwu_queue(p, cpu, wake_flags);
4199 unlock:
4200 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4201 out:
4202 if (success)
4203 ttwu_stat(p, task_cpu(p), wake_flags);
4204 preempt_enable();
4205
4206 return success;
4207 }
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
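/**
 * task_call_func - Invoke a function on task in fixed state
 * @p: Process for which the function is to be invoked, can be @current.
 * @func: Function to invoke.
 * @arg: Argument to function.
 *
 * Fix the task in its current state by avoiding wakeups and/or rq
 * operations and call @func(@arg) on it. Since @func can be invoked with
 * a runqueue lock held, it had better be quite lightweight.
 *
 * Returns whatever @func returns.
 */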
4223 int task_call_func(struct task_struct *p, task_call_f func, void *arg)
4224 {
4225 struct rq *rq = NULL;
4226 unsigned int state;
4227 struct rq_flags rf;
4228 int ret;
4229
4230 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4231
4232 state = READ_ONCE(p->__state);
4233
4234
4235
4236
4237
4238
4239
4240 smp_rmb();
4241
4242
4243
4244
4245
4246
4247 if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
4248 rq = __task_rq_lock(p, &rf);
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260 ret = func(p, arg);
4261
4262 if (rq)
4263 rq_unlock(rq, &rf);
4264
4265 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
4266 return ret;
4267 }
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
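/*
 * cpu_curr_snapshot - return a snapshot of the task currently running on
 * @cpu. The full memory barriers order the racy read against the caller's
 * preceding and subsequent accesses; the result is only a snapshot and the
 * task may have switched out by the time the pointer is used.
 */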
4291 struct task_struct *cpu_curr_snapshot(int cpu)
4292 {
4293 struct task_struct *t;
4294
4295 smp_mb();
4296 t = rcu_dereference(cpu_curr(cpu));
4297 smp_mb();
4298 return t;
4299 }
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
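/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of
 * runnable processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * This function executes a full memory barrier before accessing the task
 * state.
 */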
4312 int wake_up_process(struct task_struct *p)
4313 {
4314 return try_to_wake_up(p, TASK_NORMAL, 0);
4315 }
4316 EXPORT_SYMBOL(wake_up_process);
4317
4318 int wake_up_state(struct task_struct *p, unsigned int state)
4319 {
4320 return try_to_wake_up(p, state, 0);
4321 }
4322
4323
4324
4325
4326
4327
4328
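/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup also used by init_idle().
 */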
4329 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
4330 {
4331 p->on_rq = 0;
4332
4333 p->se.on_rq = 0;
4334 p->se.exec_start = 0;
4335 p->se.sum_exec_runtime = 0;
4336 p->se.prev_sum_exec_runtime = 0;
4337 p->se.nr_migrations = 0;
4338 p->se.vruntime = 0;
4339 INIT_LIST_HEAD(&p->se.group_node);
4340
4341 #ifdef CONFIG_FAIR_GROUP_SCHED
4342 p->se.cfs_rq = NULL;
4343 #endif
4344
4345 #ifdef CONFIG_SCHEDSTATS
4346
4347 memset(&p->stats, 0, sizeof(p->stats));
4348 #endif
4349
4350 RB_CLEAR_NODE(&p->dl.rb_node);
4351 init_dl_task_timer(&p->dl);
4352 init_dl_inactive_task_timer(&p->dl);
4353 __dl_clear_params(p);
4354
4355 INIT_LIST_HEAD(&p->rt.run_list);
4356 p->rt.timeout = 0;
4357 p->rt.time_slice = sched_rr_timeslice;
4358 p->rt.on_rq = 0;
4359 p->rt.on_list = 0;
4360
4361 #ifdef CONFIG_PREEMPT_NOTIFIERS
4362 INIT_HLIST_HEAD(&p->preempt_notifiers);
4363 #endif
4364
4365 #ifdef CONFIG_COMPACTION
4366 p->capture_control = NULL;
4367 #endif
4368 init_numa_balancing(clone_flags, p);
4369 #ifdef CONFIG_SMP
4370 p->wake_entry.u_flags = CSD_TYPE_TTWU;
4371 p->migration_pending = NULL;
4372 #endif
4373 }
4374
4375 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
4376
4377 #ifdef CONFIG_NUMA_BALANCING
4378
4379 int sysctl_numa_balancing_mode;
4380
4381 static void __set_numabalancing_state(bool enabled)
4382 {
4383 if (enabled)
4384 static_branch_enable(&sched_numa_balancing);
4385 else
4386 static_branch_disable(&sched_numa_balancing);
4387 }
4388
4389 void set_numabalancing_state(bool enabled)
4390 {
4391 if (enabled)
4392 sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
4393 else
4394 sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
4395 __set_numabalancing_state(enabled);
4396 }
4397
4398 #ifdef CONFIG_PROC_SYSCTL
4399 int sysctl_numa_balancing(struct ctl_table *table, int write,
4400 void *buffer, size_t *lenp, loff_t *ppos)
4401 {
4402 struct ctl_table t;
4403 int err;
4404 int state = sysctl_numa_balancing_mode;
4405
4406 if (write && !capable(CAP_SYS_ADMIN))
4407 return -EPERM;
4408
4409 t = *table;
4410 t.data = &state;
4411 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4412 if (err < 0)
4413 return err;
4414 if (write) {
4415 sysctl_numa_balancing_mode = state;
4416 __set_numabalancing_state(state);
4417 }
4418 return err;
4419 }
4420 #endif
4421 #endif
4422
4423 #ifdef CONFIG_SCHEDSTATS
4424
4425 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4426
4427 static void set_schedstats(bool enabled)
4428 {
4429 if (enabled)
4430 static_branch_enable(&sched_schedstats);
4431 else
4432 static_branch_disable(&sched_schedstats);
4433 }
4434
4435 void force_schedstat_enabled(void)
4436 {
4437 if (!schedstat_enabled()) {
4438 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4439 static_branch_enable(&sched_schedstats);
4440 }
4441 }
4442
4443 static int __init setup_schedstats(char *str)
4444 {
4445 int ret = 0;
4446 if (!str)
4447 goto out;
4448
4449 if (!strcmp(str, "enable")) {
4450 set_schedstats(true);
4451 ret = 1;
4452 } else if (!strcmp(str, "disable")) {
4453 set_schedstats(false);
4454 ret = 1;
4455 }
4456 out:
4457 if (!ret)
4458 pr_warn("Unable to parse schedstats=\n");
4459
4460 return ret;
4461 }
4462 __setup("schedstats=", setup_schedstats);
4463
4464 #ifdef CONFIG_PROC_SYSCTL
4465 static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4466 size_t *lenp, loff_t *ppos)
4467 {
4468 struct ctl_table t;
4469 int err;
4470 int state = static_branch_likely(&sched_schedstats);
4471
4472 if (write && !capable(CAP_SYS_ADMIN))
4473 return -EPERM;
4474
4475 t = *table;
4476 t.data = &state;
4477 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4478 if (err < 0)
4479 return err;
4480 if (write)
4481 set_schedstats(state);
4482 return err;
4483 }
4484 #endif
4485 #endif
4486
4487 #ifdef CONFIG_SYSCTL
4488 static struct ctl_table sched_core_sysctls[] = {
4489 #ifdef CONFIG_SCHEDSTATS
4490 {
4491 .procname = "sched_schedstats",
4492 .data = NULL,
4493 .maxlen = sizeof(unsigned int),
4494 .mode = 0644,
4495 .proc_handler = sysctl_schedstats,
4496 .extra1 = SYSCTL_ZERO,
4497 .extra2 = SYSCTL_ONE,
4498 },
4499 #endif
4500 #ifdef CONFIG_UCLAMP_TASK
4501 {
4502 .procname = "sched_util_clamp_min",
4503 .data = &sysctl_sched_uclamp_util_min,
4504 .maxlen = sizeof(unsigned int),
4505 .mode = 0644,
4506 .proc_handler = sysctl_sched_uclamp_handler,
4507 },
4508 {
4509 .procname = "sched_util_clamp_max",
4510 .data = &sysctl_sched_uclamp_util_max,
4511 .maxlen = sizeof(unsigned int),
4512 .mode = 0644,
4513 .proc_handler = sysctl_sched_uclamp_handler,
4514 },
4515 {
4516 .procname = "sched_util_clamp_min_rt_default",
4517 .data = &sysctl_sched_uclamp_util_min_rt_default,
4518 .maxlen = sizeof(unsigned int),
4519 .mode = 0644,
4520 .proc_handler = sysctl_sched_uclamp_handler,
4521 },
4522 #endif
4523 {}
4524 };
4525 static int __init sched_core_sysctl_init(void)
4526 {
4527 register_sysctl_init("kernel", sched_core_sysctls);
4528 return 0;
4529 }
4530 late_initcall(sched_core_sysctl_init);
4531 #endif
4532
4533
4534
4535
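/*
 * fork()/clone()-time setup:
 */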
4536 int sched_fork(unsigned long clone_flags, struct task_struct *p)
4537 {
4538 __sched_fork(clone_flags, p);
4539
4540
4541
4542
4543
4544 p->__state = TASK_NEW;
4545
4546
4547
4548
4549 p->prio = current->normal_prio;
4550
4551 uclamp_fork(p);
4552
4553
4554
4555
4556 if (unlikely(p->sched_reset_on_fork)) {
4557 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4558 p->policy = SCHED_NORMAL;
4559 p->static_prio = NICE_TO_PRIO(0);
4560 p->rt_priority = 0;
4561 } else if (PRIO_TO_NICE(p->static_prio) < 0)
4562 p->static_prio = NICE_TO_PRIO(0);
4563
4564 p->prio = p->normal_prio = p->static_prio;
4565 set_load_weight(p, false);
4566
4567
4568
4569
4570
4571 p->sched_reset_on_fork = 0;
4572 }
4573
4574 if (dl_prio(p->prio))
4575 return -EAGAIN;
4576 else if (rt_prio(p->prio))
4577 p->sched_class = &rt_sched_class;
4578 else
4579 p->sched_class = &fair_sched_class;
4580
4581 init_entity_runnable_average(&p->se);
4582
4583
4584 #ifdef CONFIG_SCHED_INFO
4585 if (likely(sched_info_on()))
4586 memset(&p->sched_info, 0, sizeof(p->sched_info));
4587 #endif
4588 #if defined(CONFIG_SMP)
4589 p->on_cpu = 0;
4590 #endif
4591 init_task_preempt_count(p);
4592 #ifdef CONFIG_SMP
4593 plist_node_init(&p->pushable_tasks, MAX_PRIO);
4594 RB_CLEAR_NODE(&p->pushable_dl_tasks);
4595 #endif
4596 return 0;
4597 }
4598
4599 void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
4600 {
4601 unsigned long flags;
4602
4603
4604
4605
4606
4607 raw_spin_lock_irqsave(&p->pi_lock, flags);
4608 #ifdef CONFIG_CGROUP_SCHED
4609 if (1) {
4610 struct task_group *tg;
4611 tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4612 struct task_group, css);
4613 tg = autogroup_task_group(p, tg);
4614 p->sched_task_group = tg;
4615 }
4616 #endif
4617 rseq_migrate(p);
4618
4619
4620
4621
4622 __set_task_cpu(p, smp_processor_id());
4623 if (p->sched_class->task_fork)
4624 p->sched_class->task_fork(p);
4625 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4626 }
4627
4628 void sched_post_fork(struct task_struct *p)
4629 {
4630 uclamp_post_fork(p);
4631 }
4632
4633 unsigned long to_ratio(u64 period, u64 runtime)
4634 {
4635 if (runtime == RUNTIME_INF)
4636 return BW_UNIT;
4637
4638
4639
4640
4641
4642
4643 if (period == 0)
4644 return 0;
4645
4646 return div64_u64(runtime << BW_SHIFT, period);
4647 }
4648
4649
4650
4651
4652
4653
4654
4655
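/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the run-queue and wakes it.
 */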
4656 void wake_up_new_task(struct task_struct *p)
4657 {
4658 struct rq_flags rf;
4659 struct rq *rq;
4660
4661 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4662 WRITE_ONCE(p->__state, TASK_RUNNING);
4663 #ifdef CONFIG_SMP
4664
4665
4666
4667
4668
4669
4670
4671
4672 p->recent_used_cpu = task_cpu(p);
4673 rseq_migrate(p);
4674 __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4675 #endif
4676 rq = __task_rq_lock(p, &rf);
4677 update_rq_clock(rq);
4678 post_init_entity_util_avg(p);
4679
4680 activate_task(rq, p, ENQUEUE_NOCLOCK);
4681 trace_sched_wakeup_new(p);
4682 check_preempt_curr(rq, p, WF_FORK);
4683 #ifdef CONFIG_SMP
4684 if (p->sched_class->task_woken) {
4685
4686
4687
4688
4689 rq_unpin_lock(rq, &rf);
4690 p->sched_class->task_woken(rq, p);
4691 rq_repin_lock(rq, &rf);
4692 }
4693 #endif
4694 task_rq_unlock(rq, p, &rf);
4695 }
4696
4697 #ifdef CONFIG_PREEMPT_NOTIFIERS
4698
4699 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4700
4701 void preempt_notifier_inc(void)
4702 {
4703 static_branch_inc(&preempt_notifier_key);
4704 }
4705 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4706
4707 void preempt_notifier_dec(void)
4708 {
4709 static_branch_dec(&preempt_notifier_key);
4710 }
4711 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4712
4713
4714
4715
4716
4717 void preempt_notifier_register(struct preempt_notifier *notifier)
4718 {
4719 if (!static_branch_unlikely(&preempt_notifier_key))
4720 WARN(1, "registering preempt_notifier while notifiers disabled\n");
4721
4722 hlist_add_head(&notifier->link, &current->preempt_notifiers);
4723 }
4724 EXPORT_SYMBOL_GPL(preempt_notifier_register);
4725
4726
4727
4728
4729
4730
4731
4732 void preempt_notifier_unregister(struct preempt_notifier *notifier)
4733 {
4734 hlist_del(&notifier->link);
4735 }
4736 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4737
4738 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4739 {
4740 struct preempt_notifier *notifier;
4741
4742 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4743 notifier->ops->sched_in(notifier, raw_smp_processor_id());
4744 }
4745
4746 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4747 {
4748 if (static_branch_unlikely(&preempt_notifier_key))
4749 __fire_sched_in_preempt_notifiers(curr);
4750 }
4751
4752 static void
4753 __fire_sched_out_preempt_notifiers(struct task_struct *curr,
4754 struct task_struct *next)
4755 {
4756 struct preempt_notifier *notifier;
4757
4758 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4759 notifier->ops->sched_out(notifier, next);
4760 }
4761
4762 static __always_inline void
4763 fire_sched_out_preempt_notifiers(struct task_struct *curr,
4764 struct task_struct *next)
4765 {
4766 if (static_branch_unlikely(&preempt_notifier_key))
4767 __fire_sched_out_preempt_notifiers(curr, next);
4768 }
4769
4770 #else
4771
4772 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4773 {
4774 }
4775
4776 static inline void
4777 fire_sched_out_preempt_notifiers(struct task_struct *curr,
4778 struct task_struct *next)
4779 {
4780 }
4781
4782 #endif
4783
4784 static inline void prepare_task(struct task_struct *next)
4785 {
4786 #ifdef CONFIG_SMP
4787
4788
4789
4790
4791
4792
4793
4794 WRITE_ONCE(next->on_cpu, 1);
4795 #endif
4796 }
4797
4798 static inline void finish_task(struct task_struct *prev)
4799 {
4800 #ifdef CONFIG_SMP
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812 smp_store_release(&prev->on_cpu, 0);
4813 #endif
4814 }
4815
4816 #ifdef CONFIG_SMP
4817
4818 static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4819 {
4820 void (*func)(struct rq *rq);
4821 struct callback_head *next;
4822
4823 lockdep_assert_rq_held(rq);
4824
4825 while (head) {
4826 func = (void (*)(struct rq *))head->func;
4827 next = head->next;
4828 head->next = NULL;
4829 head = next;
4830
4831 func(rq);
4832 }
4833 }
4834
4835 static void balance_push(struct rq *rq);
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848 struct callback_head balance_push_callback = {
4849 .next = NULL,
4850 .func = (void (*)(struct callback_head *))balance_push,
4851 };
4852
4853 static inline struct callback_head *
4854 __splice_balance_callbacks(struct rq *rq, bool split)
4855 {
4856 struct callback_head *head = rq->balance_callback;
4857
4858 if (likely(!head))
4859 return NULL;
4860
4861 lockdep_assert_rq_held(rq);
4862
4863
4864
4865
4866
4867
4868
4869
4870 if (split && head == &balance_push_callback)
4871 head = NULL;
4872 else
4873 rq->balance_callback = NULL;
4874
4875 return head;
4876 }
4877
4878 static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4879 {
4880 return __splice_balance_callbacks(rq, true);
4881 }
4882
4883 static void __balance_callbacks(struct rq *rq)
4884 {
4885 do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
4886 }
4887
4888 static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4889 {
4890 unsigned long flags;
4891
4892 if (unlikely(head)) {
4893 raw_spin_rq_lock_irqsave(rq, flags);
4894 do_balance_callbacks(rq, head);
4895 raw_spin_rq_unlock_irqrestore(rq, flags);
4896 }
4897 }
4898
4899 #else
4900
4901 static inline void __balance_callbacks(struct rq *rq)
4902 {
4903 }
4904
4905 static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4906 {
4907 return NULL;
4908 }
4909
4910 static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4911 {
4912 }
4913
4914 #endif
4915
4916 static inline void
4917 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4918 {
4919
4920
4921
4922
4923
4924
4925 rq_unpin_lock(rq, rf);
4926 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4927 #ifdef CONFIG_DEBUG_SPINLOCK
4928
4929 rq_lockp(rq)->owner = next;
4930 #endif
4931 }
4932
4933 static inline void finish_lock_switch(struct rq *rq)
4934 {
4935
4936
4937
4938
4939
4940 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4941 __balance_callbacks(rq);
4942 raw_spin_rq_unlock_irq(rq);
4943 }
4944
4945
4946
4947
4948
4949 #ifndef prepare_arch_switch
4950 # define prepare_arch_switch(next) do { } while (0)
4951 #endif
4952
4953 #ifndef finish_arch_post_lock_switch
4954 # define finish_arch_post_lock_switch() do { } while (0)
4955 #endif
4956
4957 static inline void kmap_local_sched_out(void)
4958 {
4959 #ifdef CONFIG_KMAP_LOCAL
4960 if (unlikely(current->kmap_ctrl.idx))
4961 __kmap_local_sched_out();
4962 #endif
4963 }
4964
4965 static inline void kmap_local_sched_in(void)
4966 {
4967 #ifdef CONFIG_KMAP_LOCAL
4968 if (unlikely(current->kmap_ctrl.idx))
4969 __kmap_local_sched_in();
4970 #endif
4971 }
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986 static inline void
4987 prepare_task_switch(struct rq *rq, struct task_struct *prev,
4988 struct task_struct *next)
4989 {
4990 kcov_prepare_switch(prev);
4991 sched_info_switch(rq, prev, next);
4992 perf_event_task_sched_out(prev, next);
4993 rseq_preempt(prev);
4994 fire_sched_out_preempt_notifiers(prev, next);
4995 kmap_local_sched_out();
4996 prepare_task(next);
4997 prepare_arch_switch(next);
4998 }
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
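/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * Must be called after the context switch, paired with a
 * prepare_task_switch() call before the context switch. It reconciles the
 * locking set up by prepare_task_switch() and does any other
 * architecture-specific cleanup actions.
 *
 * The context switch has flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */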
5019 static struct rq *finish_task_switch(struct task_struct *prev)
5020 __releases(rq->lock)
5021 {
5022 struct rq *rq = this_rq();
5023 struct mm_struct *mm = rq->prev_mm;
5024 unsigned int prev_state;
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
5038 "corrupted preempt_count: %s/%d/0x%x\n",
5039 current->comm, current->pid, preempt_count()))
5040 preempt_count_set(FORK_PREEMPT_COUNT);
5041
5042 rq->prev_mm = NULL;
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055 prev_state = READ_ONCE(prev->__state);
5056 vtime_task_switch(prev);
5057 perf_event_task_sched_in(prev, current);
5058 finish_task(prev);
5059 tick_nohz_task_switch();
5060 finish_lock_switch(rq);
5061 finish_arch_post_lock_switch();
5062 kcov_finish_switch(current);
5063
5064
5065
5066
5067
5068
5069
5070 kmap_local_sched_in();
5071
5072 fire_sched_in_preempt_notifiers(current);
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085 if (mm) {
5086 membarrier_mm_sync_core_before_usermode(mm);
5087 mmdrop_sched(mm);
5088 }
5089 if (unlikely(prev_state == TASK_DEAD)) {
5090 if (prev->sched_class->task_dead)
5091 prev->sched_class->task_dead(prev);
5092
5093
5094 put_task_stack(prev);
5095
5096 put_task_struct_rcu_user(prev);
5097 }
5098
5099 return rq;
5100 }
5101
5102
5103
5104
5105
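/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */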
5106 asmlinkage __visible void schedule_tail(struct task_struct *prev)
5107 __releases(rq->lock)
5108 {
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118 finish_task_switch(prev);
5119 preempt_enable();
5120
5121 if (current->set_child_tid)
5122 put_user(task_pid_vnr(current), current->set_child_tid);
5123
5124 calculate_sigpending();
5125 }
5126
5127
5128
5129
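/*
 * context_switch - switch to the new MM and the new thread's register state.
 */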
5130 static __always_inline struct rq *
5131 context_switch(struct rq *rq, struct task_struct *prev,
5132 struct task_struct *next, struct rq_flags *rf)
5133 {
5134 prepare_task_switch(rq, prev, next);
5135
5136
5137
5138
5139
5140
5141 arch_start_context_switch(prev);
5142
5143
5144
5145
5146
5147
5148
5149
5150 if (!next->mm) {
5151 enter_lazy_tlb(prev->active_mm, next);
5152
5153 next->active_mm = prev->active_mm;
5154 if (prev->mm)
5155 mmgrab(prev->active_mm);
5156 else
5157 prev->active_mm = NULL;
5158 } else {
5159 membarrier_switch_mm(rq, prev->active_mm, next->mm);
5160
5161
5162
5163
5164
5165
5166
5167
5168 switch_mm_irqs_off(prev->active_mm, next->mm, next);
5169
5170 if (!prev->mm) {
5171
5172 rq->prev_mm = prev->active_mm;
5173 prev->active_mm = NULL;
5174 }
5175 }
5176
5177 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
5178
5179 prepare_lock_switch(rq, next, rf);
5180
5181
5182 switch_to(prev, next, prev);
5183 barrier();
5184
5185 return finish_task_switch(prev);
5186 }
5187
5188
5189
5190
5191
5192
5193
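/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */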
5194 unsigned int nr_running(void)
5195 {
5196 unsigned int i, sum = 0;
5197
5198 for_each_online_cpu(i)
5199 sum += cpu_rq(i)->nr_running;
5200
5201 return sum;
5202 }
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217 bool single_task_running(void)
5218 {
5219 return raw_rq()->nr_running == 1;
5220 }
5221 EXPORT_SYMBOL(single_task_running);
5222
5223 unsigned long long nr_context_switches(void)
5224 {
5225 int i;
5226 unsigned long long sum = 0;
5227
5228 for_each_possible_cpu(i)
5229 sum += cpu_rq(i)->nr_switches;
5230
5231 return sum;
5232 }
5233
5234
5235
5236
5237
5238
5239
5240
5241 unsigned int nr_iowait_cpu(int cpu)
5242 {
5243 return atomic_read(&cpu_rq(cpu)->nr_iowait);
5244 }
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276 unsigned int nr_iowait(void)
5277 {
5278 unsigned int i, sum = 0;
5279
5280 for_each_possible_cpu(i)
5281 sum += nr_iowait_cpu(i);
5282
5283 return sum;
5284 }
5285
5286 #ifdef CONFIG_SMP
5287
5288
5289
5290
5291
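/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */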
5292 void sched_exec(void)
5293 {
5294 struct task_struct *p = current;
5295 unsigned long flags;
5296 int dest_cpu;
5297
5298 raw_spin_lock_irqsave(&p->pi_lock, flags);
5299 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
5300 if (dest_cpu == smp_processor_id())
5301 goto unlock;
5302
5303 if (likely(cpu_active(dest_cpu))) {
5304 struct migration_arg arg = { p, dest_cpu };
5305
5306 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5307 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
5308 return;
5309 }
5310 unlock:
5311 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5312 }
5313
5314 #endif
5315
5316 DEFINE_PER_CPU(struct kernel_stat, kstat);
5317 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
5318
5319 EXPORT_PER_CPU_SYMBOL(kstat);
5320 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
5321
5322
5323
5324
5325
5326
5327
5328 static inline void prefetch_curr_exec_start(struct task_struct *p)
5329 {
5330 #ifdef CONFIG_FAIR_GROUP_SCHED
5331 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
5332 #else
5333 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
5334 #endif
5335 prefetch(curr);
5336 prefetch(&curr->exec_start);
5337 }
5338
5339
5340
5341
5342
5343
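/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that has not been accounted yet.
 */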
5344 unsigned long long task_sched_runtime(struct task_struct *p)
5345 {
5346 struct rq_flags rf;
5347 struct rq *rq;
5348 u64 ns;
5349
5350 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362 if (!p->on_cpu || !task_on_rq_queued(p))
5363 return p->se.sum_exec_runtime;
5364 #endif
5365
5366 rq = task_rq_lock(p, &rf);
5367
5368
5369
5370
5371
5372 if (task_current(rq, p) && task_on_rq_queued(p)) {
5373 prefetch_curr_exec_start(p);
5374 update_rq_clock(rq);
5375 p->sched_class->update_curr(rq);
5376 }
5377 ns = p->se.sum_exec_runtime;
5378 task_rq_unlock(rq, p, &rf);
5379
5380 return ns;
5381 }
5382
5383 #ifdef CONFIG_SCHED_DEBUG
5384 static u64 cpu_resched_latency(struct rq *rq)
5385 {
5386 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
5387 u64 resched_latency, now = rq_clock(rq);
5388 static bool warned_once;
5389
5390 if (sysctl_resched_latency_warn_once && warned_once)
5391 return 0;
5392
5393 if (!need_resched() || !latency_warn_ms)
5394 return 0;
5395
5396 if (system_state == SYSTEM_BOOTING)
5397 return 0;
5398
5399 if (!rq->last_seen_need_resched_ns) {
5400 rq->last_seen_need_resched_ns = now;
5401 rq->ticks_without_resched = 0;
5402 return 0;
5403 }
5404
5405 rq->ticks_without_resched++;
5406 resched_latency = now - rq->last_seen_need_resched_ns;
5407 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
5408 return 0;
5409
5410 warned_once = true;
5411
5412 return resched_latency;
5413 }
5414
5415 static int __init setup_resched_latency_warn_ms(char *str)
5416 {
5417 long val;
5418
5419 if ((kstrtol(str, 0, &val))) {
5420 pr_warn("Unable to set resched_latency_warn_ms\n");
5421 return 1;
5422 }
5423
5424 sysctl_resched_latency_warn_ms = val;
5425 return 1;
5426 }
5427 __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
5428 #else
5429 static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
5430 #endif
5431
5432
5433
5434
5435
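/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */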
5436 void scheduler_tick(void)
5437 {
5438 int cpu = smp_processor_id();
5439 struct rq *rq = cpu_rq(cpu);
5440 struct task_struct *curr = rq->curr;
5441 struct rq_flags rf;
5442 unsigned long thermal_pressure;
5443 u64 resched_latency;
5444
5445 arch_scale_freq_tick();
5446 sched_clock_tick();
5447
5448 rq_lock(rq, &rf);
5449
5450 update_rq_clock(rq);
5451 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
5452 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
5453 curr->sched_class->task_tick(rq, curr, 0);
5454 if (sched_feat(LATENCY_WARN))
5455 resched_latency = cpu_resched_latency(rq);
5456 calc_global_load_tick(rq);
5457 sched_core_tick(rq);
5458
5459 rq_unlock(rq, &rf);
5460
5461 if (sched_feat(LATENCY_WARN) && resched_latency)
5462 resched_latency_warn(cpu, resched_latency);
5463
5464 perf_event_task_tick();
5465
5466 #ifdef CONFIG_SMP
5467 rq->idle_balance = idle_cpu(cpu);
5468 trigger_load_balance(rq);
5469 #endif
5470 }
5471
5472 #ifdef CONFIG_NO_HZ_FULL
5473
5474 struct tick_work {
5475 int cpu;
5476 atomic_t state;
5477 struct delayed_work work;
5478 };
5479
5480 #define TICK_SCHED_REMOTE_OFFLINE 0
5481 #define TICK_SCHED_REMOTE_OFFLINING 1
5482 #define TICK_SCHED_REMOTE_RUNNING 2
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507 static struct tick_work __percpu *tick_work_cpu;
5508
5509 static void sched_tick_remote(struct work_struct *work)
5510 {
5511 struct delayed_work *dwork = to_delayed_work(work);
5512 struct tick_work *twork = container_of(dwork, struct tick_work, work);
5513 int cpu = twork->cpu;
5514 struct rq *rq = cpu_rq(cpu);
5515 struct task_struct *curr;
5516 struct rq_flags rf;
5517 u64 delta;
5518 int os;
5519
5520
5521
5522
5523
5524
5525
5526
5527 if (!tick_nohz_tick_stopped_cpu(cpu))
5528 goto out_requeue;
5529
5530 rq_lock_irq(rq, &rf);
5531 curr = rq->curr;
5532 if (cpu_is_offline(cpu))
5533 goto out_unlock;
5534
5535 update_rq_clock(rq);
5536
5537 if (!is_idle_task(curr)) {
5538
5539
5540
5541
5542 delta = rq_clock_task(rq) - curr->se.exec_start;
5543 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5544 }
5545 curr->sched_class->task_tick(rq, curr, 0);
5546
5547 calc_load_nohz_remote(rq);
5548 out_unlock:
5549 rq_unlock_irq(rq, &rf);
5550 out_requeue:
5551
5552
5553
5554
5555
5556
5557
5558 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5559 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5560 if (os == TICK_SCHED_REMOTE_RUNNING)
5561 queue_delayed_work(system_unbound_wq, dwork, HZ);
5562 }
5563
5564 static void sched_tick_start(int cpu)
5565 {
5566 int os;
5567 struct tick_work *twork;
5568
5569 if (housekeeping_cpu(cpu, HK_TYPE_TICK))
5570 return;
5571
5572 WARN_ON_ONCE(!tick_work_cpu);
5573
5574 twork = per_cpu_ptr(tick_work_cpu, cpu);
5575 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5576 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5577 if (os == TICK_SCHED_REMOTE_OFFLINE) {
5578 twork->cpu = cpu;
5579 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5580 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5581 }
5582 }
5583
5584 #ifdef CONFIG_HOTPLUG_CPU
5585 static void sched_tick_stop(int cpu)
5586 {
5587 struct tick_work *twork;
5588 int os;
5589
5590 if (housekeeping_cpu(cpu, HK_TYPE_TICK))
5591 return;
5592
5593 WARN_ON_ONCE(!tick_work_cpu);
5594
5595 twork = per_cpu_ptr(tick_work_cpu, cpu);
5596
5597 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5598 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5599
5600 }
5601 #endif
5602
5603 int __init sched_tick_offload_init(void)
5604 {
5605 tick_work_cpu = alloc_percpu(struct tick_work);
5606 BUG_ON(!tick_work_cpu);
5607 return 0;
5608 }
5609
5610 #else
5611 static inline void sched_tick_start(int cpu) { }
5612 static inline void sched_tick_stop(int cpu) { }
5613 #endif
5614
5615 #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5616 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5617
5618
5619
5620
5621 static inline void preempt_latency_start(int val)
5622 {
5623 if (preempt_count() == val) {
5624 unsigned long ip = get_lock_parent_ip();
5625 #ifdef CONFIG_DEBUG_PREEMPT
5626 current->preempt_disable_ip = ip;
5627 #endif
5628 trace_preempt_off(CALLER_ADDR0, ip);
5629 }
5630 }
5631
5632 void preempt_count_add(int val)
5633 {
5634 #ifdef CONFIG_DEBUG_PREEMPT
5635
5636
5637
5638 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5639 return;
5640 #endif
5641 __preempt_count_add(val);
5642 #ifdef CONFIG_DEBUG_PREEMPT
5643
5644
5645
5646 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5647 PREEMPT_MASK - 10);
5648 #endif
5649 preempt_latency_start(val);
5650 }
5651 EXPORT_SYMBOL(preempt_count_add);
5652 NOKPROBE_SYMBOL(preempt_count_add);
5653
5654
5655
5656
5657
5658 static inline void preempt_latency_stop(int val)
5659 {
5660 if (preempt_count() == val)
5661 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5662 }
5663
5664 void preempt_count_sub(int val)
5665 {
5666 #ifdef CONFIG_DEBUG_PREEMPT
5667
5668
5669
5670 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5671 return;
5672
5673
5674
5675 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5676 !(preempt_count() & PREEMPT_MASK)))
5677 return;
5678 #endif
5679
5680 preempt_latency_stop(val);
5681 __preempt_count_sub(val);
5682 }
5683 EXPORT_SYMBOL(preempt_count_sub);
5684 NOKPROBE_SYMBOL(preempt_count_sub);
5685
5686 #else
5687 static inline void preempt_latency_start(int val) { }
5688 static inline void preempt_latency_stop(int val) { }
5689 #endif
5690
5691 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5692 {
5693 #ifdef CONFIG_DEBUG_PREEMPT
5694 return p->preempt_disable_ip;
5695 #else
5696 return 0;
5697 #endif
5698 }
5699
5700
5701
5702
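/*
 * Print scheduling while atomic bug:
 */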
5703 static noinline void __schedule_bug(struct task_struct *prev)
5704 {
5705
5706 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5707
5708 if (oops_in_progress)
5709 return;
5710
5711 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5712 prev->comm, prev->pid, preempt_count());
5713
5714 debug_show_held_locks(prev);
5715 print_modules();
5716 if (irqs_disabled())
5717 print_irqtrace_events(prev);
5718 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5719 && in_atomic_preempt_off()) {
5720 pr_err("Preemption disabled at:");
5721 print_ip_sym(KERN_ERR, preempt_disable_ip);
5722 }
5723 if (panic_on_warn)
5724 panic("scheduling while atomic\n");
5725
5726 dump_stack();
5727 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5728 }
5729
5730
5731
5732
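/*
 * Various schedule()-time debugging checks and statistics:
 */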
5733 static inline void schedule_debug(struct task_struct *prev, bool preempt)
5734 {
5735 #ifdef CONFIG_SCHED_STACK_END_CHECK
5736 if (task_stack_end_corrupted(prev))
5737 panic("corrupted stack end detected inside scheduler\n");
5738
5739 if (task_scs_end_corrupted(prev))
5740 panic("corrupted shadow stack detected inside scheduler\n");
5741 #endif
5742
5743 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5744 if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5745 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5746 prev->comm, prev->pid, prev->non_block_count);
5747 dump_stack();
5748 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5749 }
5750 #endif
5751
5752 if (unlikely(in_atomic_preempt_off())) {
5753 __schedule_bug(prev);
5754 preempt_count_set(PREEMPT_DISABLED);
5755 }
5756 rcu_sleep_check();
5757 SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5758
5759 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5760
5761 schedstat_inc(this_rq()->sched_count);
5762 }
5763
5764 static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5765 struct rq_flags *rf)
5766 {
5767 #ifdef CONFIG_SMP
5768 const struct sched_class *class;
5769
5770
5771
5772
5773
5774
5775
5776
5777 for_class_range(class, prev->sched_class, &idle_sched_class) {
5778 if (class->balance(rq, prev, rf))
5779 break;
5780 }
5781 #endif
5782
5783 put_prev_task(rq, prev);
5784 }
5785
5786
5787
5788
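/*
 * Pick up the highest-prio task:
 */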
5789 static inline struct task_struct *
5790 __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5791 {
5792 const struct sched_class *class;
5793 struct task_struct *p;
5794
5795
5796
5797
5798
5799
5800
5801 if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
5802 rq->nr_running == rq->cfs.h_nr_running)) {
5803
5804 p = pick_next_task_fair(rq, prev, rf);
5805 if (unlikely(p == RETRY_TASK))
5806 goto restart;
5807
5808
5809 if (!p) {
5810 put_prev_task(rq, prev);
5811 p = pick_next_task_idle(rq);
5812 }
5813
5814 return p;
5815 }
5816
5817 restart:
5818 put_prev_task_balance(rq, prev, rf);
5819
5820 for_each_class(class) {
5821 p = class->pick_next_task(rq);
5822 if (p)
5823 return p;
5824 }
5825
5826 BUG();
5827 }
5828
5829 #ifdef CONFIG_SCHED_CORE
5830 static inline bool is_task_rq_idle(struct task_struct *t)
5831 {
5832 return (task_rq(t)->idle == t);
5833 }
5834
5835 static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
5836 {
5837 return is_task_rq_idle(a) || (a->core_cookie == cookie);
5838 }
5839
5840 static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
5841 {
5842 if (is_task_rq_idle(a) || is_task_rq_idle(b))
5843 return true;
5844
5845 return a->core_cookie == b->core_cookie;
5846 }
5847
5848 static inline struct task_struct *pick_task(struct rq *rq)
5849 {
5850 const struct sched_class *class;
5851 struct task_struct *p;
5852
5853 for_each_class(class) {
5854 p = class->pick_task(rq);
5855 if (p)
5856 return p;
5857 }
5858
5859 BUG();
5860 }
5861
5862 extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
5863
5864 static void queue_core_balance(struct rq *rq);
5865
5866 static struct task_struct *
5867 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5868 {
5869 struct task_struct *next, *p, *max = NULL;
5870 const struct cpumask *smt_mask;
5871 bool fi_before = false;
5872 bool core_clock_updated = (rq == rq->core);
5873 unsigned long cookie;
5874 int i, cpu, occ = 0;
5875 struct rq *rq_i;
5876 bool need_sync;
5877
5878 if (!sched_core_enabled(rq))
5879 return __pick_next_task(rq, prev, rf);
5880
5881 cpu = cpu_of(rq);
5882
5883
5884 if (cpu_is_offline(cpu)) {
5885
5886
5887
5888
5889
5890 rq->core_pick = NULL;
5891 return __pick_next_task(rq, prev, rf);
5892 }
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903 if (rq->core->core_pick_seq == rq->core->core_task_seq &&
5904 rq->core->core_pick_seq != rq->core_sched_seq &&
5905 rq->core_pick) {
5906 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
5907
5908 next = rq->core_pick;
5909 if (next != prev) {
5910 put_prev_task(rq, prev);
5911 set_next_task(rq, next);
5912 }
5913
5914 rq->core_pick = NULL;
5915 goto out;
5916 }
5917
5918 put_prev_task_balance(rq, prev, rf);
5919
5920 smt_mask = cpu_smt_mask(cpu);
5921 need_sync = !!rq->core->core_cookie;
5922
5923
5924 rq->core->core_cookie = 0UL;
5925 if (rq->core->core_forceidle_count) {
5926 if (!core_clock_updated) {
5927 update_rq_clock(rq->core);
5928 core_clock_updated = true;
5929 }
5930 sched_core_account_forceidle(rq);
5931
5932 rq->core->core_forceidle_start = 0;
5933 rq->core->core_forceidle_count = 0;
5934 rq->core->core_forceidle_occupation = 0;
5935 need_sync = true;
5936 fi_before = true;
5937 }
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949 rq->core->core_task_seq++;
5950
5951
5952
5953
5954
5955 if (!need_sync) {
5956 next = pick_task(rq);
5957 if (!next->core_cookie) {
5958 rq->core_pick = NULL;
5959
5960
5961
5962
5963 WARN_ON_ONCE(fi_before);
5964 task_vruntime_update(rq, next, false);
5965 goto out_set_next;
5966 }
5967 }
5968
5969
5970
5971
5972
5973
5974
5975 for_each_cpu_wrap(i, smt_mask, cpu) {
5976 rq_i = cpu_rq(i);
5977
5978
5979
5980
5981
5982
5983 if (i != cpu && (rq_i != rq->core || !core_clock_updated))
5984 update_rq_clock(rq_i);
5985
5986 p = rq_i->core_pick = pick_task(rq_i);
5987 if (!max || prio_less(max, p, fi_before))
5988 max = p;
5989 }
5990
5991 cookie = rq->core->core_cookie = max->core_cookie;
5992
5993
5994
5995
5996
5997 for_each_cpu(i, smt_mask) {
5998 rq_i = cpu_rq(i);
5999 p = rq_i->core_pick;
6000
6001 if (!cookie_equals(p, cookie)) {
6002 p = NULL;
6003 if (cookie)
6004 p = sched_core_find(rq_i, cookie);
6005 if (!p)
6006 p = idle_sched_class.pick_task(rq_i);
6007 }
6008
6009 rq_i->core_pick = p;
6010
6011 if (p == rq_i->idle) {
6012 if (rq_i->nr_running) {
6013 rq->core->core_forceidle_count++;
6014 if (!fi_before)
6015 rq->core->core_forceidle_seq++;
6016 }
6017 } else {
6018 occ++;
6019 }
6020 }
6021
6022 if (schedstat_enabled() && rq->core->core_forceidle_count) {
6023 rq->core->core_forceidle_start = rq_clock(rq->core);
6024 rq->core->core_forceidle_occupation = occ;
6025 }
6026
6027 rq->core->core_pick_seq = rq->core->core_task_seq;
6028 next = rq->core_pick;
6029 rq->core_sched_seq = rq->core->core_pick_seq;
6030
6031
6032 WARN_ON_ONCE(!next);
6033
6034
6035
6036
6037
6038
6039
6040
6041
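/*
 * Third pass: commit the selections. Update the core-wide vruntime
 * accounting, record the occupation, and kick any sibling whose
 * current task differs from its pick with resched_curr(); the local
 * CPU's pick is returned directly below.
 */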
6042 for_each_cpu(i, smt_mask) {
6043 rq_i = cpu_rq(i);
6044
6045
6046
6047
6048
6049
6050
6051
6052 if (!rq_i->core_pick)
6053 continue;
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063 if (!(fi_before && rq->core->core_forceidle_count))
6064 task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
6065
6066 rq_i->core_pick->core_occupation = occ;
6067
6068 if (i == cpu) {
6069 rq_i->core_pick = NULL;
6070 continue;
6071 }
6072
6073
6074 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
6075
6076 if (rq_i->curr == rq_i->core_pick) {
6077 rq_i->core_pick = NULL;
6078 continue;
6079 }
6080
6081 resched_curr(rq_i);
6082 }
6083
6084 out_set_next:
6085 set_next_task(rq, next);
6086 out:
6087 if (rq->core->core_forceidle_count && next == rq->idle)
6088 queue_core_balance(rq);
6089
6090 return next;
6091 }
6092
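/*
 * try_steal_cookie - try to pull a task matching the core cookie from
 * CPU @that onto the (forced-idle) CPU @this. Only tasks that are not
 * currently running or picked on the source, are allowed on @this, and
 * whose core_occupation does not exceed that of the destination are
 * migrated.
 */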
6093 static bool try_steal_cookie(int this, int that)
6094 {
6095 struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
6096 struct task_struct *p;
6097 unsigned long cookie;
6098 bool success = false;
6099
6100 local_irq_disable();
6101 double_rq_lock(dst, src);
6102
6103 cookie = dst->core->core_cookie;
6104 if (!cookie)
6105 goto unlock;
6106
6107 if (dst->curr != dst->idle)
6108 goto unlock;
6109
6110 p = sched_core_find(src, cookie);
6111 if (p == src->idle)
6112 goto unlock;
6113
6114 do {
6115 if (p == src->core_pick || p == src->curr)
6116 goto next;
6117
6118 if (!is_cpu_allowed(p, this))
6119 goto next;
6120
6121 if (p->core_occupation > dst->idle->core_occupation)
6122 goto next;
6123
6124 deactivate_task(src, p, 0);
6125 set_task_cpu(p, this);
6126 activate_task(dst, p, 0);
6127
6128 resched_curr(dst);
6129
6130 success = true;
6131 break;
6132
6133 next:
6134 p = sched_core_next(p, cookie);
6135 } while (p);
6136
6137 unlock:
6138 double_rq_unlock(dst, src);
6139 local_irq_enable();
6140
6141 return success;
6142 }
6143
6144 static bool steal_cookie_task(int cpu, struct sched_domain *sd)
6145 {
6146 int i;
6147
6148 for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
6149 if (i == cpu)
6150 continue;
6151
6152 if (need_resched())
6153 break;
6154
6155 if (try_steal_cookie(cpu, i))
6156 return true;
6157 }
6158
6159 return false;
6160 }
6161
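/*
 * sched_core_balance - run from a balance callback on a forced-idle
 * CPU: walk its sched domains outwards and try to steal a
 * cookie-matched task, dropping the runqueue lock while searching.
 */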
6162 static void sched_core_balance(struct rq *rq)
6163 {
6164 struct sched_domain *sd;
6165 int cpu = cpu_of(rq);
6166
6167 preempt_disable();
6168 rcu_read_lock();
6169 raw_spin_rq_unlock_irq(rq);
6170 for_each_domain(cpu, sd) {
6171 if (need_resched())
6172 break;
6173
6174 if (steal_cookie_task(cpu, sd))
6175 break;
6176 }
6177 raw_spin_rq_lock_irq(rq);
6178 rcu_read_unlock();
6179 preempt_enable();
6180 }
6181
6182 static DEFINE_PER_CPU(struct callback_head, core_balance_head);
6183
6184 static void queue_core_balance(struct rq *rq)
6185 {
6186 if (!sched_core_enabled(rq))
6187 return;
6188
6189 if (!rq->core->core_cookie)
6190 return;
6191
6192 if (!rq->nr_running)
6193 return;
6194
6195 queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
6196 }
6197
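/*
 * Core-scheduling CPU hotplug handling:
 *
 *  - sched_core_cpu_starting(): a sibling coming online adopts the
 *    rq->core of whichever sibling currently owns the core state.
 *  - sched_core_cpu_deactivate(): if the departing CPU owns the core
 *    state, migrate it to another sibling and repoint all siblings.
 *  - sched_core_cpu_dying(): reset rq->core back to the CPU's own rq.
 */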
6198 static void sched_core_cpu_starting(unsigned int cpu)
6199 {
6200 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6201 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6202 unsigned long flags;
6203 int t;
6204
6205 sched_core_lock(cpu, &flags);
6206
6207 WARN_ON_ONCE(rq->core != rq);
6208
6209
6210 if (cpumask_weight(smt_mask) == 1)
6211 goto unlock;
6212
6213
6214 for_each_cpu(t, smt_mask) {
6215 if (t == cpu)
6216 continue;
6217 rq = cpu_rq(t);
6218 if (rq->core == rq) {
6219 core_rq = rq;
6220 break;
6221 }
6222 }
6223
6224 if (WARN_ON_ONCE(!core_rq))
6225 goto unlock;
6226
6227
6228 for_each_cpu(t, smt_mask) {
6229 rq = cpu_rq(t);
6230
6231 if (t == cpu)
6232 rq->core = core_rq;
6233
6234 WARN_ON_ONCE(rq->core != core_rq);
6235 }
6236
6237 unlock:
6238 sched_core_unlock(cpu, &flags);
6239 }
6240
6241 static void sched_core_cpu_deactivate(unsigned int cpu)
6242 {
6243 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6244 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6245 unsigned long flags;
6246 int t;
6247
6248 sched_core_lock(cpu, &flags);
6249
6250
6251 if (cpumask_weight(smt_mask) == 1) {
6252 WARN_ON_ONCE(rq->core != rq);
6253 goto unlock;
6254 }
6255
6256
6257 if (rq->core != rq)
6258 goto unlock;
6259
6260
6261 for_each_cpu(t, smt_mask) {
6262 if (t == cpu)
6263 continue;
6264 core_rq = cpu_rq(t);
6265 break;
6266 }
6267
6268 if (WARN_ON_ONCE(!core_rq))
6269 goto unlock;
6270
6271
6272 core_rq->core_task_seq = rq->core_task_seq;
6273 core_rq->core_pick_seq = rq->core_pick_seq;
6274 core_rq->core_cookie = rq->core_cookie;
6275 core_rq->core_forceidle_count = rq->core_forceidle_count;
6276 core_rq->core_forceidle_seq = rq->core_forceidle_seq;
6277 core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
6278
6279
6280
6281
6282
6283
6284 core_rq->core_forceidle_start = 0;
6285
6286
6287 for_each_cpu(t, smt_mask) {
6288 rq = cpu_rq(t);
6289 rq->core = core_rq;
6290 }
6291
6292 unlock:
6293 sched_core_unlock(cpu, &flags);
6294 }
6295
6296 static inline void sched_core_cpu_dying(unsigned int cpu)
6297 {
6298 struct rq *rq = cpu_rq(cpu);
6299
6300 if (rq->core != rq)
6301 rq->core = rq;
6302 }
6303
6304 #else
6305
6306 static inline void sched_core_cpu_starting(unsigned int cpu) {}
6307 static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
6308 static inline void sched_core_cpu_dying(unsigned int cpu) {}
6309
6310 static struct task_struct *
6311 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6312 {
6313 return __pick_next_task(rq, prev, rf);
6314 }
6315
6316 #endif
6317
6318
6319
6320
6321
6322
6323
6324
6325
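/*
 * Scheduling modes passed to __schedule(): SM_NONE for a plain
 * schedule(), SM_PREEMPT for preemption, SM_RTLOCK_WAIT for blocking
 * on an rtlock. On PREEMPT_RT the latter is excluded from
 * SM_MASK_PREEMPT so the blocking task is dequeued like any other
 * sleeper.
 */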
6326 #define SM_NONE 0x0
6327 #define SM_PREEMPT 0x1
6328 #define SM_RTLOCK_WAIT 0x2
6329
6330 #ifndef CONFIG_PREEMPT_RT
6331 # define SM_MASK_PREEMPT (~0U)
6332 #else
6333 # define SM_MASK_PREEMPT SM_PREEMPT
6334 #endif
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
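/*
 * __schedule() is the main scheduler function.
 *
 * It is entered voluntarily (explicit blocking: mutexes, waitqueues,
 * etc.), from the preemption paths when TIF_NEED_RESCHED is noticed on
 * interrupt or userspace return, or via wakeups that set that flag.
 * @sched_mode tells the voluntary and preemption cases apart; only
 * non-preemption calls deactivate a prev task that has stopped being
 * TASK_RUNNING.
 *
 * WARNING: must be called with preemption disabled!
 */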
6375 static void __sched notrace __schedule(unsigned int sched_mode)
6376 {
6377 struct task_struct *prev, *next;
6378 unsigned long *switch_count;
6379 unsigned long prev_state;
6380 struct rq_flags rf;
6381 struct rq *rq;
6382 int cpu;
6383
6384 cpu = smp_processor_id();
6385 rq = cpu_rq(cpu);
6386 prev = rq->curr;
6387
6388 schedule_debug(prev, !!sched_mode);
6389
6390 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
6391 hrtick_clear(rq);
6392
6393 local_irq_disable();
6394 rcu_note_context_switch(!!sched_mode);
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411 rq_lock(rq, &rf);
6412 smp_mb__after_spinlock();
6413
6414
6415 rq->clock_update_flags <<= 1;
6416 update_rq_clock(rq);
6417
6418 switch_count = &prev->nivcsw;
6419
6420
6421
6422
6423
6424 prev_state = READ_ONCE(prev->__state);
6425 if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
6426 if (signal_pending_state(prev_state, prev)) {
6427 WRITE_ONCE(prev->__state, TASK_RUNNING);
6428 } else {
6429 prev->sched_contributes_to_load =
6430 (prev_state & TASK_UNINTERRUPTIBLE) &&
6431 !(prev_state & TASK_NOLOAD) &&
6432 !(prev->flags & PF_FROZEN);
6433
6434 if (prev->sched_contributes_to_load)
6435 rq->nr_uninterruptible++;
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
6449
6450 if (prev->in_iowait) {
6451 atomic_inc(&rq->nr_iowait);
6452 delayacct_blkio_start();
6453 }
6454 }
6455 switch_count = &prev->nvcsw;
6456 }
6457
6458 next = pick_next_task(rq, prev, &rf);
6459 clear_tsk_need_resched(prev);
6460 clear_preempt_need_resched();
6461 #ifdef CONFIG_SCHED_DEBUG
6462 rq->last_seen_need_resched_ns = 0;
6463 #endif
6464
6465 if (likely(prev != next)) {
6466 rq->nr_switches++;
6467
6468
6469
6470
6471 RCU_INIT_POINTER(rq->curr, next);
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486 ++*switch_count;
6487
6488 migrate_disable_switch(rq, prev);
6489 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
6490
6491 trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
6492
6493
6494 rq = context_switch(rq, prev, next, &rf);
6495 } else {
6496 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
6497
6498 rq_unpin_lock(rq, &rf);
6499 __balance_callbacks(rq);
6500 raw_spin_rq_unlock_irq(rq);
6501 }
6502 }
6503
6504 void __noreturn do_task_dead(void)
6505 {
6506
6507 set_special_state(TASK_DEAD);
6508
6509
6510 current->flags |= PF_NOFREEZE;
6511
6512 __schedule(SM_NONE);
6513 BUG();
6514
6515
6516 for (;;)
6517 cpu_relax();
6518 }
6519
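/*
 * sched_submit_work - if the task is about to sleep, notify the
 * workqueue / io_wq infrastructure so a replacement worker can be
 * woken, and flush any plugged block I/O so it is not held up behind
 * the context switch.
 */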
6520 static inline void sched_submit_work(struct task_struct *tsk)
6521 {
6522 unsigned int task_flags;
6523
6524 if (task_is_running(tsk))
6525 return;
6526
6527 task_flags = tsk->flags;
6528
6529
6530
6531
6532 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6533 if (task_flags & PF_WQ_WORKER)
6534 wq_worker_sleeping(tsk);
6535 else
6536 io_wq_worker_sleeping(tsk);
6537 }
6538
6539
6540
6541
6542
6543
6544 SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
6545
6546
6547
6548
6549
6550 blk_flush_plug(tsk->plug, true);
6551 }
6552
6553 static void sched_update_worker(struct task_struct *tsk)
6554 {
6555 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6556 if (tsk->flags & PF_WQ_WORKER)
6557 wq_worker_running(tsk);
6558 else
6559 io_wq_worker_running(tsk);
6560 }
6561 }
6562
6563 asmlinkage __visible void __sched schedule(void)
6564 {
6565 struct task_struct *tsk = current;
6566
6567 sched_submit_work(tsk);
6568 do {
6569 preempt_disable();
6570 __schedule(SM_NONE);
6571 sched_preempt_enable_no_resched();
6572 } while (need_resched());
6573 sched_update_worker(tsk);
6574 }
6575 EXPORT_SYMBOL(schedule);
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587 void __sched schedule_idle(void)
6588 {
6589
6590
6591
6592
6593
6594
6595
6596 WARN_ON_ONCE(current->__state);
6597 do {
6598 __schedule(SM_NONE);
6599 } while (need_resched());
6600 }
6601
6602 #if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
6603 asmlinkage __visible void __sched schedule_user(void)
6604 {
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615 enum ctx_state prev_state = exception_enter();
6616 schedule();
6617 exception_exit(prev_state);
6618 }
6619 #endif
6620
6621
6622
6623
6624
6625
6626 void __sched schedule_preempt_disabled(void)
6627 {
6628 sched_preempt_enable_no_resched();
6629 schedule();
6630 preempt_disable();
6631 }
6632
6633 #ifdef CONFIG_PREEMPT_RT
6634 void __sched notrace schedule_rtlock(void)
6635 {
6636 do {
6637 preempt_disable();
6638 __schedule(SM_RTLOCK_WAIT);
6639 sched_preempt_enable_no_resched();
6640 } while (need_resched());
6641 }
6642 NOKPROBE_SYMBOL(schedule_rtlock);
6643 #endif
6644
6645 static void __sched notrace preempt_schedule_common(void)
6646 {
6647 do {
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661 preempt_disable_notrace();
6662 preempt_latency_start(1);
6663 __schedule(SM_PREEMPT);
6664 preempt_latency_stop(1);
6665 preempt_enable_no_resched_notrace();
6666
6667
6668
6669
6670
6671 } while (need_resched());
6672 }
6673
6674 #ifdef CONFIG_PREEMPTION
6675
6676
6677
6678
6679 asmlinkage __visible void __sched notrace preempt_schedule(void)
6680 {
6681
6682
6683
6684
6685 if (likely(!preemptible()))
6686 return;
6687 preempt_schedule_common();
6688 }
6689 NOKPROBE_SYMBOL(preempt_schedule);
6690 EXPORT_SYMBOL(preempt_schedule);
6691
6692 #ifdef CONFIG_PREEMPT_DYNAMIC
6693 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
6694 #ifndef preempt_schedule_dynamic_enabled
6695 #define preempt_schedule_dynamic_enabled preempt_schedule
6696 #define preempt_schedule_dynamic_disabled NULL
6697 #endif
6698 DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
6699 EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
6700 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
6701 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
6702 void __sched notrace dynamic_preempt_schedule(void)
6703 {
6704 if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
6705 return;
6706 preempt_schedule();
6707 }
6708 NOKPROBE_SYMBOL(dynamic_preempt_schedule);
6709 EXPORT_SYMBOL(dynamic_preempt_schedule);
6710 #endif
6711 #endif
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
6728 {
6729 enum ctx_state prev_ctx;
6730
6731 if (likely(!preemptible()))
6732 return;
6733
6734 do {
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748 preempt_disable_notrace();
6749 preempt_latency_start(1);
6750
6751
6752
6753
6754
6755 prev_ctx = exception_enter();
6756 __schedule(SM_PREEMPT);
6757 exception_exit(prev_ctx);
6758
6759 preempt_latency_stop(1);
6760 preempt_enable_no_resched_notrace();
6761 } while (need_resched());
6762 }
6763 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
6764
6765 #ifdef CONFIG_PREEMPT_DYNAMIC
6766 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
6767 #ifndef preempt_schedule_notrace_dynamic_enabled
6768 #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
6769 #define preempt_schedule_notrace_dynamic_disabled NULL
6770 #endif
6771 DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
6772 EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
6773 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
6774 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
6775 void __sched notrace dynamic_preempt_schedule_notrace(void)
6776 {
6777 if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
6778 return;
6779 preempt_schedule_notrace();
6780 }
6781 NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
6782 EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
6783 #endif
6784 #endif
6785
6786 #endif
6787
6788
6789
6790
6791
6792
6793
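/*
 * preempt_schedule_irq - entry point for preemption from the
 * interrupt-return path. Called with interrupts disabled and a zero
 * preempt count; interrupts are re-enabled around each
 * __schedule(SM_PREEMPT) pass while need_resched() remains set.
 */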
6794 asmlinkage __visible void __sched preempt_schedule_irq(void)
6795 {
6796 enum ctx_state prev_state;
6797
6798
6799 BUG_ON(preempt_count() || !irqs_disabled());
6800
6801 prev_state = exception_enter();
6802
6803 do {
6804 preempt_disable();
6805 local_irq_enable();
6806 __schedule(SM_PREEMPT);
6807 local_irq_disable();
6808 sched_preempt_enable_no_resched();
6809 } while (need_resched());
6810
6811 exception_exit(prev_state);
6812 }
6813
6814 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
6815 void *key)
6816 {
6817 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
6818 return try_to_wake_up(curr->private, mode, wake_flags);
6819 }
6820 EXPORT_SYMBOL(default_wake_function);
6821
6822 static void __setscheduler_prio(struct task_struct *p, int prio)
6823 {
6824 if (dl_prio(prio))
6825 p->sched_class = &dl_sched_class;
6826 else if (rt_prio(prio))
6827 p->sched_class = &rt_sched_class;
6828 else
6829 p->sched_class = &fair_sched_class;
6830
6831 p->prio = prio;
6832 }
6833
6834 #ifdef CONFIG_RT_MUTEXES
6835
6836 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
6837 {
6838 if (pi_task)
6839 prio = min(prio, pi_task->prio);
6840
6841 return prio;
6842 }
6843
6844 static inline int rt_effective_prio(struct task_struct *p, int prio)
6845 {
6846 struct task_struct *pi_task = rt_mutex_get_top_task(p);
6847
6848 return __rt_effective_prio(pi_task, prio);
6849 }
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
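/*
 * rt_mutex_setprio - apply priority inheritance to @p.
 *
 * The effective priority is derived from @pi_task, the highest
 * priority task boosting @p (NULL means no boost). Only p->prio and
 * the scheduling class change, not ->normal_prio, and @p is requeued
 * if necessary so the new priority takes effect immediately.
 */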
6862 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
6863 {
6864 int prio, oldprio, queued, running, queue_flag =
6865 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6866 const struct sched_class *prev_class;
6867 struct rq_flags rf;
6868 struct rq *rq;
6869
6870
6871 prio = __rt_effective_prio(pi_task, p->normal_prio);
6872
6873
6874
6875
6876 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
6877 return;
6878
6879 rq = __task_rq_lock(p, &rf);
6880 update_rq_clock(rq);
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891 p->pi_top_task = pi_task;
6892
6893
6894
6895
6896 if (prio == p->prio && !dl_prio(prio))
6897 goto out_unlock;
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911 if (unlikely(p == rq->idle)) {
6912 WARN_ON(p != rq->curr);
6913 WARN_ON(p->pi_blocked_on);
6914 goto out_unlock;
6915 }
6916
6917 trace_sched_pi_setprio(p, pi_task);
6918 oldprio = p->prio;
6919
6920 if (oldprio == prio)
6921 queue_flag &= ~DEQUEUE_MOVE;
6922
6923 prev_class = p->sched_class;
6924 queued = task_on_rq_queued(p);
6925 running = task_current(rq, p);
6926 if (queued)
6927 dequeue_task(rq, p, queue_flag);
6928 if (running)
6929 put_prev_task(rq, p);
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940 if (dl_prio(prio)) {
6941 if (!dl_prio(p->normal_prio) ||
6942 (pi_task && dl_prio(pi_task->prio) &&
6943 dl_entity_preempt(&pi_task->dl, &p->dl))) {
6944 p->dl.pi_se = pi_task->dl.pi_se;
6945 queue_flag |= ENQUEUE_REPLENISH;
6946 } else {
6947 p->dl.pi_se = &p->dl;
6948 }
6949 } else if (rt_prio(prio)) {
6950 if (dl_prio(oldprio))
6951 p->dl.pi_se = &p->dl;
6952 if (oldprio < prio)
6953 queue_flag |= ENQUEUE_HEAD;
6954 } else {
6955 if (dl_prio(oldprio))
6956 p->dl.pi_se = &p->dl;
6957 if (rt_prio(oldprio))
6958 p->rt.timeout = 0;
6959 }
6960
6961 __setscheduler_prio(p, prio);
6962
6963 if (queued)
6964 enqueue_task(rq, p, queue_flag);
6965 if (running)
6966 set_next_task(rq, p);
6967
6968 check_class_changed(rq, p, prev_class, oldprio);
6969 out_unlock:
6970
6971 preempt_disable();
6972
6973 rq_unpin_lock(rq, &rf);
6974 __balance_callbacks(rq);
6975 raw_spin_rq_unlock(rq);
6976
6977 preempt_enable();
6978 }
6979 #else
6980 static inline int rt_effective_prio(struct task_struct *p, int prio)
6981 {
6982 return prio;
6983 }
6984 #endif
6985
6986 void set_user_nice(struct task_struct *p, long nice)
6987 {
6988 bool queued, running;
6989 int old_prio;
6990 struct rq_flags rf;
6991 struct rq *rq;
6992
6993 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
6994 return;
6995
6996
6997
6998
6999 rq = task_rq_lock(p, &rf);
7000 update_rq_clock(rq);
7001
7002
7003
7004
7005
7006
7007
7008 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
7009 p->static_prio = NICE_TO_PRIO(nice);
7010 goto out_unlock;
7011 }
7012 queued = task_on_rq_queued(p);
7013 running = task_current(rq, p);
7014 if (queued)
7015 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
7016 if (running)
7017 put_prev_task(rq, p);
7018
7019 p->static_prio = NICE_TO_PRIO(nice);
7020 set_load_weight(p, true);
7021 old_prio = p->prio;
7022 p->prio = effective_prio(p);
7023
7024 if (queued)
7025 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
7026 if (running)
7027 set_next_task(rq, p);
7028
7029
7030
7031
7032
7033 p->sched_class->prio_changed(rq, p, old_prio);
7034
7035 out_unlock:
7036 task_rq_unlock(rq, p, &rf);
7037 }
7038 EXPORT_SYMBOL(set_user_nice);
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048 static bool is_nice_reduction(const struct task_struct *p, const int nice)
7049 {
7050
7051 int nice_rlim = nice_to_rlimit(nice);
7052
7053 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
7054 }
7055
7056
7057
7058
7059
7060
7061 int can_nice(const struct task_struct *p, const int nice)
7062 {
7063 return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
7064 }
7065
7066 #ifdef __ARCH_WANT_SYS_NICE
7067
7068
7069
7070
7071
7072
7073
7074
7075 SYSCALL_DEFINE1(nice, int, increment)
7076 {
7077 long nice, retval;
7078
7079
7080
7081
7082
7083
7084 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
7085 nice = task_nice(current) + increment;
7086
7087 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
7088 if (increment < 0 && !can_nice(current, nice))
7089 return -EPERM;
7090
7091 retval = security_task_setnice(current, nice);
7092 if (retval)
7093 return retval;
7094
7095 set_user_nice(current, nice);
7096 return 0;
7097 }
7098
7099 #endif
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113 int task_prio(const struct task_struct *p)
7114 {
7115 return p->prio - MAX_RT_PRIO;
7116 }
7117
7118
7119
7120
7121
7122
7123
7124 int idle_cpu(int cpu)
7125 {
7126 struct rq *rq = cpu_rq(cpu);
7127
7128 if (rq->curr != rq->idle)
7129 return 0;
7130
7131 if (rq->nr_running)
7132 return 0;
7133
7134 #ifdef CONFIG_SMP
7135 if (rq->ttwu_pending)
7136 return 0;
7137 #endif
7138
7139 return 1;
7140 }
7141
7142
7143
7144
7145
7146
7147
7148 int available_idle_cpu(int cpu)
7149 {
7150 if (!idle_cpu(cpu))
7151 return 0;
7152
7153 if (vcpu_is_preempted(cpu))
7154 return 0;
7155
7156 return 1;
7157 }
7158
7159
7160
7161
7162
7163
7164
7165 struct task_struct *idle_task(int cpu)
7166 {
7167 return cpu_rq(cpu)->idle;
7168 }
7169
7170 #ifdef CONFIG_SMP
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
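/*
 * effective_cpu_util - compose a CPU's utilization from its CFS, RT,
 * DL and IRQ components.
 *
 * FREQUENCY_UTIL is meant for frequency selection: runnable RT pushes
 * the result to max capacity unless uclamp is in use, uclamp
 * constraints are applied, and the DL bandwidth is added as headroom.
 * ENERGY_UTIL is meant for energy estimation and instead adds the DL
 * utilization that actually ran. The result is scaled for time stolen
 * by IRQ and capped at the CPU's capacity.
 */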
7191 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
7192 enum cpu_util_type type,
7193 struct task_struct *p)
7194 {
7195 unsigned long dl_util, util, irq, max;
7196 struct rq *rq = cpu_rq(cpu);
7197
7198 max = arch_scale_cpu_capacity(cpu);
7199
7200 if (!uclamp_is_used() &&
7201 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
7202 return max;
7203 }
7204
7205
7206
7207
7208
7209
7210 irq = cpu_util_irq(rq);
7211 if (unlikely(irq >= max))
7212 return max;
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226 util = util_cfs + cpu_util_rt(rq);
7227 if (type == FREQUENCY_UTIL)
7228 util = uclamp_rq_util_with(rq, util, p);
7229
7230 dl_util = cpu_util_dl(rq);
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241 if (util + dl_util >= max)
7242 return max;
7243
7244
7245
7246
7247
7248 if (type == ENERGY_UTIL)
7249 util += dl_util;
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260 util = scale_irq_capacity(util, irq, max);
7261 util += irq;
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273 if (type == FREQUENCY_UTIL)
7274 util += cpu_bw_dl(rq);
7275
7276 return min(max, util);
7277 }
7278
7279 unsigned long sched_cpu_util(int cpu)
7280 {
7281 return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
7282 }
7283 #endif
7284
7285
7286
7287
7288
7289
7290
7291 static struct task_struct *find_process_by_pid(pid_t pid)
7292 {
7293 return pid ? find_task_by_vpid(pid) : current;
7294 }
7295
7296
7297
7298
7299
7300 #define SETPARAM_POLICY -1
7301
7302 static void __setscheduler_params(struct task_struct *p,
7303 const struct sched_attr *attr)
7304 {
7305 int policy = attr->sched_policy;
7306
7307 if (policy == SETPARAM_POLICY)
7308 policy = p->policy;
7309
7310 p->policy = policy;
7311
7312 if (dl_policy(policy))
7313 __setparam_dl(p, attr);
7314 else if (fair_policy(policy))
7315 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
7316
7317
7318
7319
7320
7321
7322 p->rt_priority = attr->sched_priority;
7323 p->normal_prio = normal_prio(p);
7324 set_load_weight(p, true);
7325 }
7326
7327
7328
7329
7330 static bool check_same_owner(struct task_struct *p)
7331 {
7332 const struct cred *cred = current_cred(), *pcred;
7333 bool match;
7334
7335 rcu_read_lock();
7336 pcred = __task_cred(p);
7337 match = (uid_eq(cred->euid, pcred->euid) ||
7338 uid_eq(cred->euid, pcred->uid));
7339 rcu_read_unlock();
7340 return match;
7341 }
7342
7343
7344
7345
7346
7347
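/*
 * Permission checks for an unprivileged sched_setscheduler() /
 * sched_setattr() caller: without CAP_SYS_NICE the caller may not
 * lower the nice value or raise the RT priority beyond RLIMIT_NICE /
 * RLIMIT_RTPRIO, use SCHED_DEADLINE, move a SCHED_IDLE task to another
 * policy, clear reset-on-fork, or touch a task owned by someone else.
 */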
7348 static int user_check_sched_setscheduler(struct task_struct *p,
7349 const struct sched_attr *attr,
7350 int policy, int reset_on_fork)
7351 {
7352 if (fair_policy(policy)) {
7353 if (attr->sched_nice < task_nice(p) &&
7354 !is_nice_reduction(p, attr->sched_nice))
7355 goto req_priv;
7356 }
7357
7358 if (rt_policy(policy)) {
7359 unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
7360
7361
7362 if (policy != p->policy && !rlim_rtprio)
7363 goto req_priv;
7364
7365
7366 if (attr->sched_priority > p->rt_priority &&
7367 attr->sched_priority > rlim_rtprio)
7368 goto req_priv;
7369 }
7370
7371
7372
7373
7374
7375
7376
7377 if (dl_policy(policy))
7378 goto req_priv;
7379
7380
7381
7382
7383
7384 if (task_has_idle_policy(p) && !idle_policy(policy)) {
7385 if (!is_nice_reduction(p, task_nice(p)))
7386 goto req_priv;
7387 }
7388
7389
7390 if (!check_same_owner(p))
7391 goto req_priv;
7392
7393
7394 if (p->sched_reset_on_fork && !reset_on_fork)
7395 goto req_priv;
7396
7397 return 0;
7398
7399 req_priv:
7400 if (!capable(CAP_SYS_NICE))
7401 return -EPERM;
7402
7403 return 0;
7404 }
7405
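/*
 * __sched_setscheduler - common backend for all policy and priority
 * changes. @user selects the permission and security checks applied to
 * user-space callers; @pi makes the change take cpuset_read_lock() and
 * re-evaluate rt-mutex priority inheritance (rt_mutex_adjust_pi())
 * around it.
 */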
7406 static int __sched_setscheduler(struct task_struct *p,
7407 const struct sched_attr *attr,
7408 bool user, bool pi)
7409 {
7410 int oldpolicy = -1, policy = attr->sched_policy;
7411 int retval, oldprio, newprio, queued, running;
7412 const struct sched_class *prev_class;
7413 struct callback_head *head;
7414 struct rq_flags rf;
7415 int reset_on_fork;
7416 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
7417 struct rq *rq;
7418
7419
7420 BUG_ON(pi && in_interrupt());
7421 recheck:
7422
7423 if (policy < 0) {
7424 reset_on_fork = p->sched_reset_on_fork;
7425 policy = oldpolicy = p->policy;
7426 } else {
7427 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
7428
7429 if (!valid_policy(policy))
7430 return -EINVAL;
7431 }
7432
7433 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
7434 return -EINVAL;
7435
7436
7437
7438
7439
7440
7441 if (attr->sched_priority > MAX_RT_PRIO-1)
7442 return -EINVAL;
7443 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
7444 (rt_policy(policy) != (attr->sched_priority != 0)))
7445 return -EINVAL;
7446
7447 if (user) {
7448 retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
7449 if (retval)
7450 return retval;
7451
7452 if (attr->sched_flags & SCHED_FLAG_SUGOV)
7453 return -EINVAL;
7454
7455 retval = security_task_setscheduler(p);
7456 if (retval)
7457 return retval;
7458 }
7459
7460
7461 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
7462 retval = uclamp_validate(p, attr);
7463 if (retval)
7464 return retval;
7465 }
7466
7467 if (pi)
7468 cpuset_read_lock();
7469
7470
7471
7472
7473
7474
7475
7476
7477 rq = task_rq_lock(p, &rf);
7478 update_rq_clock(rq);
7479
7480
7481
7482
7483 if (p == rq->stop) {
7484 retval = -EINVAL;
7485 goto unlock;
7486 }
7487
7488
7489
7490
7491
7492 if (unlikely(policy == p->policy)) {
7493 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
7494 goto change;
7495 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
7496 goto change;
7497 if (dl_policy(policy) && dl_param_changed(p, attr))
7498 goto change;
7499 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
7500 goto change;
7501
7502 p->sched_reset_on_fork = reset_on_fork;
7503 retval = 0;
7504 goto unlock;
7505 }
7506 change:
7507
7508 if (user) {
7509 #ifdef CONFIG_RT_GROUP_SCHED
7510
7511
7512
7513
7514 if (rt_bandwidth_enabled() && rt_policy(policy) &&
7515 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
7516 !task_group_is_autogroup(task_group(p))) {
7517 retval = -EPERM;
7518 goto unlock;
7519 }
7520 #endif
7521 #ifdef CONFIG_SMP
7522 if (dl_bandwidth_enabled() && dl_policy(policy) &&
7523 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
7524 cpumask_t *span = rq->rd->span;
7525
7526
7527
7528
7529
7530
7531 if (!cpumask_subset(span, p->cpus_ptr) ||
7532 rq->rd->dl_bw.bw == 0) {
7533 retval = -EPERM;
7534 goto unlock;
7535 }
7536 }
7537 #endif
7538 }
7539
7540
7541 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
7542 policy = oldpolicy = -1;
7543 task_rq_unlock(rq, p, &rf);
7544 if (pi)
7545 cpuset_read_unlock();
7546 goto recheck;
7547 }
7548
7549
7550
7551
7552
7553
7554 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
7555 retval = -EBUSY;
7556 goto unlock;
7557 }
7558
7559 p->sched_reset_on_fork = reset_on_fork;
7560 oldprio = p->prio;
7561
7562 newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
7563 if (pi) {
7564
7565
7566
7567
7568
7569
7570
7571 newprio = rt_effective_prio(p, newprio);
7572 if (newprio == oldprio)
7573 queue_flags &= ~DEQUEUE_MOVE;
7574 }
7575
7576 queued = task_on_rq_queued(p);
7577 running = task_current(rq, p);
7578 if (queued)
7579 dequeue_task(rq, p, queue_flags);
7580 if (running)
7581 put_prev_task(rq, p);
7582
7583 prev_class = p->sched_class;
7584
7585 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
7586 __setscheduler_params(p, attr);
7587 __setscheduler_prio(p, newprio);
7588 }
7589 __setscheduler_uclamp(p, attr);
7590
7591 if (queued) {
7592
7593
7594
7595
7596 if (oldprio < p->prio)
7597 queue_flags |= ENQUEUE_HEAD;
7598
7599 enqueue_task(rq, p, queue_flags);
7600 }
7601 if (running)
7602 set_next_task(rq, p);
7603
7604 check_class_changed(rq, p, prev_class, oldprio);
7605
7606
7607 preempt_disable();
7608 head = splice_balance_callbacks(rq);
7609 task_rq_unlock(rq, p, &rf);
7610
7611 if (pi) {
7612 cpuset_read_unlock();
7613 rt_mutex_adjust_pi(p);
7614 }
7615
7616
7617 balance_callbacks(rq, head);
7618 preempt_enable();
7619
7620 return 0;
7621
7622 unlock:
7623 task_rq_unlock(rq, p, &rf);
7624 if (pi)
7625 cpuset_read_unlock();
7626 return retval;
7627 }
7628
7629 static int _sched_setscheduler(struct task_struct *p, int policy,
7630 const struct sched_param *param, bool check)
7631 {
7632 struct sched_attr attr = {
7633 .sched_policy = policy,
7634 .sched_priority = param->sched_priority,
7635 .sched_nice = PRIO_TO_NICE(p->static_prio),
7636 };
7637
7638
7639 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
7640 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7641 policy &= ~SCHED_RESET_ON_FORK;
7642 attr.sched_policy = policy;
7643 }
7644
7645 return __sched_setscheduler(p, &attr, check, true);
7646 }
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659 int sched_setscheduler(struct task_struct *p, int policy,
7660 const struct sched_param *param)
7661 {
7662 return _sched_setscheduler(p, policy, param, true);
7663 }
7664
7665 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
7666 {
7667 return __sched_setscheduler(p, attr, true, true);
7668 }
7669
7670 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
7671 {
7672 return __sched_setscheduler(p, attr, false, true);
7673 }
7674 EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
7690 const struct sched_param *param)
7691 {
7692 return _sched_setscheduler(p, policy, param, false);
7693 }
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
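/*
 * sched_set_fifo - switch a kernel thread to SCHED_FIFO at a mid-range
 * priority (MAX_RT_PRIO / 2); sched_set_fifo_low() below uses the
 * lowest FIFO priority (1). In-kernel users are expected to use these
 * helpers instead of choosing arbitrary RT priorities.
 *
 * Illustrative usage:
 *
 *	sched_set_fifo(tsk);
 *	...
 *	sched_set_normal(tsk, 0);
 */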
7713 void sched_set_fifo(struct task_struct *p)
7714 {
7715 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
7716 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7717 }
7718 EXPORT_SYMBOL_GPL(sched_set_fifo);
7719
7720
7721
7722
7723 void sched_set_fifo_low(struct task_struct *p)
7724 {
7725 struct sched_param sp = { .sched_priority = 1 };
7726 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7727 }
7728 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
7729
7730 void sched_set_normal(struct task_struct *p, int nice)
7731 {
7732 struct sched_attr attr = {
7733 .sched_policy = SCHED_NORMAL,
7734 .sched_nice = nice,
7735 };
7736 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
7737 }
7738 EXPORT_SYMBOL_GPL(sched_set_normal);
7739
7740 static int
7741 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
7742 {
7743 struct sched_param lparam;
7744 struct task_struct *p;
7745 int retval;
7746
7747 if (!param || pid < 0)
7748 return -EINVAL;
7749 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
7750 return -EFAULT;
7751
7752 rcu_read_lock();
7753 retval = -ESRCH;
7754 p = find_process_by_pid(pid);
7755 if (likely(p))
7756 get_task_struct(p);
7757 rcu_read_unlock();
7758
7759 if (likely(p)) {
7760 retval = sched_setscheduler(p, policy, &lparam);
7761 put_task_struct(p);
7762 }
7763
7764 return retval;
7765 }
7766
7767
7768
7769
7770 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
7771 {
7772 u32 size;
7773 int ret;
7774
7775
7776 memset(attr, 0, sizeof(*attr));
7777
7778 ret = get_user(size, &uattr->size);
7779 if (ret)
7780 return ret;
7781
7782
7783 if (!size)
7784 size = SCHED_ATTR_SIZE_VER0;
7785 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
7786 goto err_size;
7787
7788 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
7789 if (ret) {
7790 if (ret == -E2BIG)
7791 goto err_size;
7792 return ret;
7793 }
7794
7795 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
7796 size < SCHED_ATTR_SIZE_VER1)
7797 return -EINVAL;
7798
7799
7800
7801
7802
7803 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
7804
7805 return 0;
7806
7807 err_size:
7808 put_user(sizeof(*attr), &uattr->size);
7809 return -E2BIG;
7810 }
7811
7812 static void get_params(struct task_struct *p, struct sched_attr *attr)
7813 {
7814 if (task_has_dl_policy(p))
7815 __getparam_dl(p, attr);
7816 else if (task_has_rt_policy(p))
7817 attr->sched_priority = p->rt_priority;
7818 else
7819 attr->sched_nice = task_nice(p);
7820 }
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
7831 {
7832 if (policy < 0)
7833 return -EINVAL;
7834
7835 return do_sched_setscheduler(pid, policy, param);
7836 }
7837
7838
7839
7840
7841
7842
7843
7844
7845 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
7846 {
7847 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
7848 }
7849
7850
7851
7852
7853
7854
7855
7856 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
7857 unsigned int, flags)
7858 {
7859 struct sched_attr attr;
7860 struct task_struct *p;
7861 int retval;
7862
7863 if (!uattr || pid < 0 || flags)
7864 return -EINVAL;
7865
7866 retval = sched_copy_attr(uattr, &attr);
7867 if (retval)
7868 return retval;
7869
7870 if ((int)attr.sched_policy < 0)
7871 return -EINVAL;
7872 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
7873 attr.sched_policy = SETPARAM_POLICY;
7874
7875 rcu_read_lock();
7876 retval = -ESRCH;
7877 p = find_process_by_pid(pid);
7878 if (likely(p))
7879 get_task_struct(p);
7880 rcu_read_unlock();
7881
7882 if (likely(p)) {
7883 if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
7884 get_params(p, &attr);
7885 retval = sched_setattr(p, &attr);
7886 put_task_struct(p);
7887 }
7888
7889 return retval;
7890 }
7891
7892
7893
7894
7895
7896
7897
7898
7899 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
7900 {
7901 struct task_struct *p;
7902 int retval;
7903
7904 if (pid < 0)
7905 return -EINVAL;
7906
7907 retval = -ESRCH;
7908 rcu_read_lock();
7909 p = find_process_by_pid(pid);
7910 if (p) {
7911 retval = security_task_getscheduler(p);
7912 if (!retval)
7913 retval = p->policy
7914 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
7915 }
7916 rcu_read_unlock();
7917 return retval;
7918 }
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
7929 {
7930 struct sched_param lp = { .sched_priority = 0 };
7931 struct task_struct *p;
7932 int retval;
7933
7934 if (!param || pid < 0)
7935 return -EINVAL;
7936
7937 rcu_read_lock();
7938 p = find_process_by_pid(pid);
7939 retval = -ESRCH;
7940 if (!p)
7941 goto out_unlock;
7942
7943 retval = security_task_getscheduler(p);
7944 if (retval)
7945 goto out_unlock;
7946
7947 if (task_has_rt_policy(p))
7948 lp.sched_priority = p->rt_priority;
7949 rcu_read_unlock();
7950
7951
7952
7953
7954 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
7955
7956 return retval;
7957
7958 out_unlock:
7959 rcu_read_unlock();
7960 return retval;
7961 }
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971 static int
7972 sched_attr_copy_to_user(struct sched_attr __user *uattr,
7973 struct sched_attr *kattr,
7974 unsigned int usize)
7975 {
7976 unsigned int ksize = sizeof(*kattr);
7977
7978 if (!access_ok(uattr, usize))
7979 return -EFAULT;
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994 kattr->size = min(usize, ksize);
7995
7996 if (copy_to_user(uattr, kattr, kattr->size))
7997 return -EFAULT;
7998
7999 return 0;
8000 }
8001
8002
8003
8004
8005
8006
8007
8008
8009 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
8010 unsigned int, usize, unsigned int, flags)
8011 {
8012 struct sched_attr kattr = { };
8013 struct task_struct *p;
8014 int retval;
8015
8016 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
8017 usize < SCHED_ATTR_SIZE_VER0 || flags)
8018 return -EINVAL;
8019
8020 rcu_read_lock();
8021 p = find_process_by_pid(pid);
8022 retval = -ESRCH;
8023 if (!p)
8024 goto out_unlock;
8025
8026 retval = security_task_getscheduler(p);
8027 if (retval)
8028 goto out_unlock;
8029
8030 kattr.sched_policy = p->policy;
8031 if (p->sched_reset_on_fork)
8032 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
8033 get_params(p, &kattr);
8034 kattr.sched_flags &= SCHED_FLAG_ALL;
8035
8036 #ifdef CONFIG_UCLAMP_TASK
8037
8038
8039
8040
8041
8042 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
8043 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
8044 #endif
8045
8046 rcu_read_unlock();
8047
8048 return sched_attr_copy_to_user(uattr, &kattr, usize);
8049
8050 out_unlock:
8051 rcu_read_unlock();
8052 return retval;
8053 }
8054
8055 #ifdef CONFIG_SMP
8056 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
8057 {
8058 int ret = 0;
8059
8060
8061
8062
8063
8064 if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
8065 return 0;
8066
8067
8068
8069
8070
8071
8072
8073 rcu_read_lock();
8074 if (!cpumask_subset(task_rq(p)->rd->span, mask))
8075 ret = -EBUSY;
8076 rcu_read_unlock();
8077 return ret;
8078 }
8079 #endif
8080
8081 static int
8082 __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
8083 {
8084 int retval;
8085 cpumask_var_t cpus_allowed, new_mask;
8086
8087 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
8088 return -ENOMEM;
8089
8090 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
8091 retval = -ENOMEM;
8092 goto out_free_cpus_allowed;
8093 }
8094
8095 cpuset_cpus_allowed(p, cpus_allowed);
8096 cpumask_and(new_mask, mask, cpus_allowed);
8097
8098 retval = dl_task_check_affinity(p, new_mask);
8099 if (retval)
8100 goto out_free_new_mask;
8101 again:
8102 retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
8103 if (retval)
8104 goto out_free_new_mask;
8105
8106 cpuset_cpus_allowed(p, cpus_allowed);
8107 if (!cpumask_subset(new_mask, cpus_allowed)) {
8108
8109
8110
8111
8112 cpumask_copy(new_mask, cpus_allowed);
8113 goto again;
8114 }
8115
8116 out_free_new_mask:
8117 free_cpumask_var(new_mask);
8118 out_free_cpus_allowed:
8119 free_cpumask_var(cpus_allowed);
8120 return retval;
8121 }
8122
8123 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
8124 {
8125 struct task_struct *p;
8126 int retval;
8127
8128 rcu_read_lock();
8129
8130 p = find_process_by_pid(pid);
8131 if (!p) {
8132 rcu_read_unlock();
8133 return -ESRCH;
8134 }
8135
8136
8137 get_task_struct(p);
8138 rcu_read_unlock();
8139
8140 if (p->flags & PF_NO_SETAFFINITY) {
8141 retval = -EINVAL;
8142 goto out_put_task;
8143 }
8144
8145 if (!check_same_owner(p)) {
8146 rcu_read_lock();
8147 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
8148 rcu_read_unlock();
8149 retval = -EPERM;
8150 goto out_put_task;
8151 }
8152 rcu_read_unlock();
8153 }
8154
8155 retval = security_task_setscheduler(p);
8156 if (retval)
8157 goto out_put_task;
8158
8159 retval = __sched_setaffinity(p, in_mask);
8160 out_put_task:
8161 put_task_struct(p);
8162 return retval;
8163 }
8164
8165 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
8166 struct cpumask *new_mask)
8167 {
8168 if (len < cpumask_size())
8169 cpumask_clear(new_mask);
8170 else if (len > cpumask_size())
8171 len = cpumask_size();
8172
8173 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
8174 }
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
8185 unsigned long __user *, user_mask_ptr)
8186 {
8187 cpumask_var_t new_mask;
8188 int retval;
8189
8190 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
8191 return -ENOMEM;
8192
8193 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
8194 if (retval == 0)
8195 retval = sched_setaffinity(pid, new_mask);
8196 free_cpumask_var(new_mask);
8197 return retval;
8198 }
8199
8200 long sched_getaffinity(pid_t pid, struct cpumask *mask)
8201 {
8202 struct task_struct *p;
8203 unsigned long flags;
8204 int retval;
8205
8206 rcu_read_lock();
8207
8208 retval = -ESRCH;
8209 p = find_process_by_pid(pid);
8210 if (!p)
8211 goto out_unlock;
8212
8213 retval = security_task_getscheduler(p);
8214 if (retval)
8215 goto out_unlock;
8216
8217 raw_spin_lock_irqsave(&p->pi_lock, flags);
8218 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
8219 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
8220
8221 out_unlock:
8222 rcu_read_unlock();
8223
8224 return retval;
8225 }
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
8237 unsigned long __user *, user_mask_ptr)
8238 {
8239 int ret;
8240 cpumask_var_t mask;
8241
8242 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
8243 return -EINVAL;
8244 if (len & (sizeof(unsigned long)-1))
8245 return -EINVAL;
8246
8247 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
8248 return -ENOMEM;
8249
8250 ret = sched_getaffinity(pid, mask);
8251 if (ret == 0) {
8252 unsigned int retlen = min(len, cpumask_size());
8253
8254 if (copy_to_user(user_mask_ptr, mask, retlen))
8255 ret = -EFAULT;
8256 else
8257 ret = retlen;
8258 }
8259 free_cpumask_var(mask);
8260
8261 return ret;
8262 }
8263
8264 static void do_sched_yield(void)
8265 {
8266 struct rq_flags rf;
8267 struct rq *rq;
8268
8269 rq = this_rq_lock_irq(&rf);
8270
8271 schedstat_inc(rq->yld_count);
8272 current->sched_class->yield_task(rq);
8273
8274 preempt_disable();
8275 rq_unlock_irq(rq, &rf);
8276 sched_preempt_enable_no_resched();
8277
8278 schedule();
8279 }
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289 SYSCALL_DEFINE0(sched_yield)
8290 {
8291 do_sched_yield();
8292 return 0;
8293 }
8294
8295 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
8296 int __sched __cond_resched(void)
8297 {
8298 if (should_resched(0)) {
8299 preempt_schedule_common();
8300 return 1;
8301 }
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313 #ifndef CONFIG_PREEMPT_RCU
8314 rcu_all_qs();
8315 #endif
8316 return 0;
8317 }
8318 EXPORT_SYMBOL(__cond_resched);
8319 #endif
8320
8321 #ifdef CONFIG_PREEMPT_DYNAMIC
8322 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
8323 #define cond_resched_dynamic_enabled __cond_resched
8324 #define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
8325 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
8326 EXPORT_STATIC_CALL_TRAMP(cond_resched);
8327
8328 #define might_resched_dynamic_enabled __cond_resched
8329 #define might_resched_dynamic_disabled ((void *)&__static_call_return0)
8330 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
8331 EXPORT_STATIC_CALL_TRAMP(might_resched);
8332 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
8333 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
8334 int __sched dynamic_cond_resched(void)
8335 {
8336 if (!static_branch_unlikely(&sk_dynamic_cond_resched))
8337 return 0;
8338 return __cond_resched();
8339 }
8340 EXPORT_SYMBOL(dynamic_cond_resched);
8341
8342 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
8343 int __sched dynamic_might_resched(void)
8344 {
8345 if (!static_branch_unlikely(&sk_dynamic_might_resched))
8346 return 0;
8347 return __cond_resched();
8348 }
8349 EXPORT_SYMBOL(dynamic_might_resched);
8350 #endif
8351 #endif
8352
8353
8354
8355
8356
8357
8358
8359
8360
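/*
 * __cond_resched_lock() and the rwlock variants below drop the given
 * lock when a reschedule is due or another CPU is contending it,
 * reschedule (or at least cpu_relax()), then re-acquire the lock.
 * They return 1 if the lock was dropped. Illustrative pattern:
 *
 *	spin_lock(&lock);
 *	while (more_work()) {
 *		do_chunk();
 *		cond_resched_lock(&lock);
 *	}
 *	spin_unlock(&lock);
 */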
8361 int __cond_resched_lock(spinlock_t *lock)
8362 {
8363 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8364 int ret = 0;
8365
8366 lockdep_assert_held(lock);
8367
8368 if (spin_needbreak(lock) || resched) {
8369 spin_unlock(lock);
8370 if (!_cond_resched())
8371 cpu_relax();
8372 ret = 1;
8373 spin_lock(lock);
8374 }
8375 return ret;
8376 }
8377 EXPORT_SYMBOL(__cond_resched_lock);
8378
8379 int __cond_resched_rwlock_read(rwlock_t *lock)
8380 {
8381 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8382 int ret = 0;
8383
8384 lockdep_assert_held_read(lock);
8385
8386 if (rwlock_needbreak(lock) || resched) {
8387 read_unlock(lock);
8388 if (!_cond_resched())
8389 cpu_relax();
8390 ret = 1;
8391 read_lock(lock);
8392 }
8393 return ret;
8394 }
8395 EXPORT_SYMBOL(__cond_resched_rwlock_read);
8396
8397 int __cond_resched_rwlock_write(rwlock_t *lock)
8398 {
8399 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8400 int ret = 0;
8401
8402 lockdep_assert_held_write(lock);
8403
8404 if (rwlock_needbreak(lock) || resched) {
8405 write_unlock(lock);
8406 if (!_cond_resched())
8407 cpu_relax();
8408 ret = 1;
8409 write_lock(lock);
8410 }
8411 return ret;
8412 }
8413 EXPORT_SYMBOL(__cond_resched_rwlock_write);
8414
8415 #ifdef CONFIG_PREEMPT_DYNAMIC
8416
8417 #ifdef CONFIG_GENERIC_ENTRY
8418 #include <linux/entry-common.h>
8419 #endif
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
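/*
 * PREEMPT_DYNAMIC allows the preemption model to be selected at boot
 * with the "preempt=" parameter (none, voluntary, full). The modes
 * differ in which of cond_resched(), might_resched(),
 * preempt_schedule*() and irqentry_exit_cond_resched() are patched in;
 * see sched_dynamic_update() below.
 */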
8451 enum {
8452 preempt_dynamic_undefined = -1,
8453 preempt_dynamic_none,
8454 preempt_dynamic_voluntary,
8455 preempt_dynamic_full,
8456 };
8457
8458 int preempt_dynamic_mode = preempt_dynamic_undefined;
8459
8460 int sched_dynamic_mode(const char *str)
8461 {
8462 if (!strcmp(str, "none"))
8463 return preempt_dynamic_none;
8464
8465 if (!strcmp(str, "voluntary"))
8466 return preempt_dynamic_voluntary;
8467
8468 if (!strcmp(str, "full"))
8469 return preempt_dynamic_full;
8470
8471 return -EINVAL;
8472 }
8473
8474 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
8475 #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
8476 #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
8477 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
8478 #define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
8479 #define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
8480 #else
8481 #error "Unsupported PREEMPT_DYNAMIC mechanism"
8482 #endif
8483
8484 void sched_dynamic_update(int mode)
8485 {
8486
8487
8488
8489
8490 preempt_dynamic_enable(cond_resched);
8491 preempt_dynamic_enable(might_resched);
8492 preempt_dynamic_enable(preempt_schedule);
8493 preempt_dynamic_enable(preempt_schedule_notrace);
8494 preempt_dynamic_enable(irqentry_exit_cond_resched);
8495
8496 switch (mode) {
8497 case preempt_dynamic_none:
8498 preempt_dynamic_enable(cond_resched);
8499 preempt_dynamic_disable(might_resched);
8500 preempt_dynamic_disable(preempt_schedule);
8501 preempt_dynamic_disable(preempt_schedule_notrace);
8502 preempt_dynamic_disable(irqentry_exit_cond_resched);
8503 pr_info("Dynamic Preempt: none\n");
8504 break;
8505
8506 case preempt_dynamic_voluntary:
8507 preempt_dynamic_enable(cond_resched);
8508 preempt_dynamic_enable(might_resched);
8509 preempt_dynamic_disable(preempt_schedule);
8510 preempt_dynamic_disable(preempt_schedule_notrace);
8511 preempt_dynamic_disable(irqentry_exit_cond_resched);
8512 pr_info("Dynamic Preempt: voluntary\n");
8513 break;
8514
8515 case preempt_dynamic_full:
8516 preempt_dynamic_disable(cond_resched);
8517 preempt_dynamic_disable(might_resched);
8518 preempt_dynamic_enable(preempt_schedule);
8519 preempt_dynamic_enable(preempt_schedule_notrace);
8520 preempt_dynamic_enable(irqentry_exit_cond_resched);
8521 pr_info("Dynamic Preempt: full\n");
8522 break;
8523 }
8524
8525 preempt_dynamic_mode = mode;
8526 }
8527
8528 static int __init setup_preempt_mode(char *str)
8529 {
8530 int mode = sched_dynamic_mode(str);
8531 if (mode < 0) {
8532 pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
8533 return 0;
8534 }
8535
8536 sched_dynamic_update(mode);
8537 return 1;
8538 }
8539 __setup("preempt=", setup_preempt_mode);
8540
8541 static void __init preempt_dynamic_init(void)
8542 {
8543 if (preempt_dynamic_mode == preempt_dynamic_undefined) {
8544 if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
8545 sched_dynamic_update(preempt_dynamic_none);
8546 } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
8547 sched_dynamic_update(preempt_dynamic_voluntary);
8548 } else {
8549
8550 WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
8551 preempt_dynamic_mode = preempt_dynamic_full;
8552 pr_info("Dynamic Preempt: full\n");
8553 }
8554 }
8555 }
8556
8557 #define PREEMPT_MODEL_ACCESSOR(mode) \
8558 bool preempt_model_##mode(void) \
8559 { \
8560 WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
8561 return preempt_dynamic_mode == preempt_dynamic_##mode; \
8562 } \
8563 EXPORT_SYMBOL_GPL(preempt_model_##mode)
8564
8565 PREEMPT_MODEL_ACCESSOR(none);
8566 PREEMPT_MODEL_ACCESSOR(voluntary);
8567 PREEMPT_MODEL_ACCESSOR(full);
8568
8569 #else
8570
8571 static inline void preempt_dynamic_init(void) { }
8572
8573 #endif
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
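/*
 * yield - yield the current CPU to other runnable tasks of the same
 * scheduling class. It is rarely the right tool: code that waits for
 * something to happen should sleep on an explicit event instead of
 * spinning in a yield loop.
 */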
8597 void __sched yield(void)
8598 {
8599 set_current_state(TASK_RUNNING);
8600 do_sched_yield();
8601 }
8602 EXPORT_SYMBOL(yield);
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
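/*
 * yield_to - yield the current CPU, preferring that @p runs next on
 * its runqueue. Returns a positive value if the yield happened, 0 if
 * it was declined, and -ESRCH when both runqueues have only a single
 * runnable task so yielding is pointless.
 */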
8619 int __sched yield_to(struct task_struct *p, bool preempt)
8620 {
8621 struct task_struct *curr = current;
8622 struct rq *rq, *p_rq;
8623 unsigned long flags;
8624 int yielded = 0;
8625
8626 local_irq_save(flags);
8627 rq = this_rq();
8628
8629 again:
8630 p_rq = task_rq(p);
8631
8632
8633
8634
8635 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
8636 yielded = -ESRCH;
8637 goto out_irq;
8638 }
8639
8640 double_rq_lock(rq, p_rq);
8641 if (task_rq(p) != p_rq) {
8642 double_rq_unlock(rq, p_rq);
8643 goto again;
8644 }
8645
8646 if (!curr->sched_class->yield_to_task)
8647 goto out_unlock;
8648
8649 if (curr->sched_class != p->sched_class)
8650 goto out_unlock;
8651
8652 if (task_running(p_rq, p) || !task_is_running(p))
8653 goto out_unlock;
8654
8655 yielded = curr->sched_class->yield_to_task(rq, p);
8656 if (yielded) {
8657 schedstat_inc(rq->yld_count);
8658
8659
8660
8661
8662 if (preempt && rq != p_rq)
8663 resched_curr(p_rq);
8664 }
8665
8666 out_unlock:
8667 double_rq_unlock(rq, p_rq);
8668 out_irq:
8669 local_irq_restore(flags);
8670
8671 if (yielded > 0)
8672 schedule();
8673
8674 return yielded;
8675 }
8676 EXPORT_SYMBOL_GPL(yield_to);
8677
8678 int io_schedule_prepare(void)
8679 {
8680 int old_iowait = current->in_iowait;
8681
8682 current->in_iowait = 1;
8683 blk_flush_plug(current->plug, true);
8684 return old_iowait;
8685 }
8686
8687 void io_schedule_finish(int token)
8688 {
8689 current->in_iowait = token;
8690 }
8691
8692
8693
8694
8695
8696 long __sched io_schedule_timeout(long timeout)
8697 {
8698 int token;
8699 long ret;
8700
8701 token = io_schedule_prepare();
8702 ret = schedule_timeout(timeout);
8703 io_schedule_finish(token);
8704
8705 return ret;
8706 }
8707 EXPORT_SYMBOL(io_schedule_timeout);
8708
8709 void __sched io_schedule(void)
8710 {
8711 int token;
8712
8713 token = io_schedule_prepare();
8714 schedule();
8715 io_schedule_finish(token);
8716 }
8717 EXPORT_SYMBOL(io_schedule);
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
8728 {
8729 int ret = -EINVAL;
8730
8731 switch (policy) {
8732 case SCHED_FIFO:
8733 case SCHED_RR:
8734 ret = MAX_RT_PRIO-1;
8735 break;
8736 case SCHED_DEADLINE:
8737 case SCHED_NORMAL:
8738 case SCHED_BATCH:
8739 case SCHED_IDLE:
8740 ret = 0;
8741 break;
8742 }
8743 return ret;
8744 }
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
8755 {
8756 int ret = -EINVAL;
8757
8758 switch (policy) {
8759 case SCHED_FIFO:
8760 case SCHED_RR:
8761 ret = 1;
8762 break;
8763 case SCHED_DEADLINE:
8764 case SCHED_NORMAL:
8765 case SCHED_BATCH:
8766 case SCHED_IDLE:
8767 ret = 0;
8768 }
8769 return ret;
8770 }
8771
8772 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
8773 {
8774 struct task_struct *p;
8775 unsigned int time_slice;
8776 struct rq_flags rf;
8777 struct rq *rq;
8778 int retval;
8779
8780 if (pid < 0)
8781 return -EINVAL;
8782
8783 retval = -ESRCH;
8784 rcu_read_lock();
8785 p = find_process_by_pid(pid);
8786 if (!p)
8787 goto out_unlock;
8788
8789 retval = security_task_getscheduler(p);
8790 if (retval)
8791 goto out_unlock;
8792
8793 rq = task_rq_lock(p, &rf);
8794 time_slice = 0;
8795 if (p->sched_class->get_rr_interval)
8796 time_slice = p->sched_class->get_rr_interval(rq, p);
8797 task_rq_unlock(rq, p, &rf);
8798
8799 rcu_read_unlock();
8800 jiffies_to_timespec64(time_slice, t);
8801 return 0;
8802
8803 out_unlock:
8804 rcu_read_unlock();
8805 return retval;
8806 }
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
8820 struct __kernel_timespec __user *, interval)
8821 {
8822 struct timespec64 t;
8823 int retval = sched_rr_get_interval(pid, &t);
8824
8825 if (retval == 0)
8826 retval = put_timespec64(&t, interval);
8827
8828 return retval;
8829 }
8830
8831 #ifdef CONFIG_COMPAT_32BIT_TIME
8832 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
8833 struct old_timespec32 __user *, interval)
8834 {
8835 struct timespec64 t;
8836 int retval = sched_rr_get_interval(pid, &t);
8837
8838 if (retval == 0)
8839 retval = put_old_timespec32(&t, interval);
8840 return retval;
8841 }
8842 #endif
8843
8844 void sched_show_task(struct task_struct *p)
8845 {
8846 unsigned long free = 0;
8847 int ppid;
8848
8849 if (!try_get_task_stack(p))
8850 return;
8851
8852 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
8853
8854 if (task_is_running(p))
8855 pr_cont(" running task ");
8856 #ifdef CONFIG_DEBUG_STACK_USAGE
8857 free = stack_not_used(p);
8858 #endif
8859 ppid = 0;
8860 rcu_read_lock();
8861 if (pid_alive(p))
8862 ppid = task_pid_nr(rcu_dereference(p->real_parent));
8863 rcu_read_unlock();
8864 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
8865 free, task_pid_nr(p), ppid,
8866 read_task_thread_flags(p));
8867
8868 print_worker_info(KERN_INFO, p);
8869 print_stop_info(KERN_INFO, p);
8870 show_stack(p, NULL, KERN_INFO);
8871 put_task_stack(p);
8872 }
8873 EXPORT_SYMBOL_GPL(sched_show_task);
8874
8875 static inline bool
8876 state_filter_match(unsigned long state_filter, struct task_struct *p)
8877 {
8878 unsigned int state = READ_ONCE(p->__state);
8879
8880
8881 if (!state_filter)
8882 return true;
8883
8884
8885 if (!(state & state_filter))
8886 return false;
8887
8888
8889
8890
8891
8892 if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
8893 return false;
8894
8895 return true;
8896 }
8897
8898
8899 void show_state_filter(unsigned int state_filter)
8900 {
8901 struct task_struct *g, *p;
8902
8903 rcu_read_lock();
8904 for_each_process_thread(g, p) {
8905
8906
8907
8908
8909
8910
8911
8912 touch_nmi_watchdog();
8913 touch_all_softlockup_watchdogs();
8914 if (state_filter_match(state_filter, p))
8915 sched_show_task(p);
8916 }
8917
8918 #ifdef CONFIG_SCHED_DEBUG
8919 if (!state_filter)
8920 sysrq_sched_debug_show();
8921 #endif
8922 rcu_read_unlock();
8923
8924
8925
8926 if (!state_filter)
8927 debug_show_all_locks();
8928 }
8929
8930
8931
8932
8933
8934
8935
8936
8937
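/*
 * init_idle - set up the idle task for @cpu: mark it TASK_RUNNING, pin
 * it to the CPU, install it as rq->idle and rq->curr, and switch it to
 * the idle scheduling class.
 */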
8938 void __init init_idle(struct task_struct *idle, int cpu)
8939 {
8940 struct rq *rq = cpu_rq(cpu);
8941 unsigned long flags;
8942
8943 __sched_fork(0, idle);
8944
8945 raw_spin_lock_irqsave(&idle->pi_lock, flags);
8946 raw_spin_rq_lock(rq);
8947
8948 idle->__state = TASK_RUNNING;
8949 idle->se.exec_start = sched_clock();
8950
8951
8952
8953
8954 idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
8955 kthread_set_per_cpu(idle, cpu);
8956
8957 #ifdef CONFIG_SMP
8958
8959
8960
8961
8962
8963
8964 set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
8965 #endif
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976 rcu_read_lock();
8977 __set_task_cpu(idle, cpu);
8978 rcu_read_unlock();
8979
8980 rq->idle = idle;
8981 rcu_assign_pointer(rq->curr, idle);
8982 idle->on_rq = TASK_ON_RQ_QUEUED;
8983 #ifdef CONFIG_SMP
8984 idle->on_cpu = 1;
8985 #endif
8986 raw_spin_rq_unlock(rq);
8987 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
8988
8989
8990 init_idle_preempt_count(idle, cpu);
8991
8992
8993
8994
8995 idle->sched_class = &idle_sched_class;
8996 ftrace_graph_init_idle_task(idle, cpu);
8997 vtime_init_idle(idle, cpu);
8998 #ifdef CONFIG_SMP
8999 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
9000 #endif
9001 }
9002
9003 #ifdef CONFIG_SMP
9004
9005 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
9006 const struct cpumask *trial)
9007 {
9008 int ret = 1;
9009
9010 if (cpumask_empty(cur))
9011 return ret;
9012
9013 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
9014
9015 return ret;
9016 }
9017
9018 int task_can_attach(struct task_struct *p,
9019 const struct cpumask *cs_effective_cpus)
9020 {
9021 int ret = 0;
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032 if (p->flags & PF_NO_SETAFFINITY) {
9033 ret = -EINVAL;
9034 goto out;
9035 }
9036
9037 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
9038 cs_effective_cpus)) {
9039 int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
9040
9041 if (unlikely(cpu >= nr_cpu_ids))
9042 return -EINVAL;
9043 ret = dl_cpu_busy(cpu, p);
9044 }
9045
9046 out:
9047 return ret;
9048 }
9049
9050 bool sched_smp_initialized __read_mostly;
9051
9052 #ifdef CONFIG_NUMA_BALANCING
9053
9054 int migrate_task_to(struct task_struct *p, int target_cpu)
9055 {
9056 struct migration_arg arg = { p, target_cpu };
9057 int curr_cpu = task_cpu(p);
9058
9059 if (curr_cpu == target_cpu)
9060 return 0;
9061
9062 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
9063 return -EINVAL;
9064
9065
9066
9067 trace_sched_move_numa(p, curr_cpu, target_cpu);
9068 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
9069 }
9070
9071
9072
9073
9074
9075 void sched_setnuma(struct task_struct *p, int nid)
9076 {
9077 bool queued, running;
9078 struct rq_flags rf;
9079 struct rq *rq;
9080
9081 rq = task_rq_lock(p, &rf);
9082 queued = task_on_rq_queued(p);
9083 running = task_current(rq, p);
9084
9085 if (queued)
9086 dequeue_task(rq, p, DEQUEUE_SAVE);
9087 if (running)
9088 put_prev_task(rq, p);
9089
9090 p->numa_preferred_nid = nid;
9091
9092 if (queued)
9093 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
9094 if (running)
9095 set_next_task(rq, p);
9096 task_rq_unlock(rq, p, &rf);
9097 }
9098 #endif
9099
9100 #ifdef CONFIG_HOTPLUG_CPU
9101
9102
9103
9104
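/*
 * Ensure that the idle task is using init_mm right before its CPU goes
 * offline.
 */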
9105 void idle_task_exit(void)
9106 {
9107 struct mm_struct *mm = current->active_mm;
9108
9109 BUG_ON(cpu_online(smp_processor_id()));
9110 BUG_ON(current != this_rq()->idle);
9111
9112 if (mm != &init_mm) {
9113 switch_mm(mm, &init_mm, current);
9114 finish_arch_post_lock_switch();
9115 }
9116
9117
9118 }
9119
9120 static int __balance_push_cpu_stop(void *arg)
9121 {
9122 struct task_struct *p = arg;
9123 struct rq *rq = this_rq();
9124 struct rq_flags rf;
9125 int cpu;
9126
9127 raw_spin_lock_irq(&p->pi_lock);
9128 rq_lock(rq, &rf);
9129
9130 update_rq_clock(rq);
9131
9132 if (task_rq(p) == rq && task_on_rq_queued(p)) {
9133 cpu = select_fallback_rq(rq->cpu, p);
9134 rq = __migrate_task(rq, &rf, p, cpu);
9135 }
9136
9137 rq_unlock(rq, &rf);
9138 raw_spin_unlock_irq(&p->pi_lock);
9139
9140 put_task_struct(p);
9141
9142 return 0;
9143 }
9144
9145 static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
9146
9147
9148
9149
9150
9151
9152
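/*
 * Push the current task away from a CPU that is going down. Only per-CPU
 * kthreads and migration-disabled tasks may remain; everything else is
 * migrated to a fallback runqueue via the CPU stopper.
 */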
9153 static void balance_push(struct rq *rq)
9154 {
9155 struct task_struct *push_task = rq->curr;
9156
9157 lockdep_assert_rq_held(rq);
9158
9159
9160
9161
9162 rq->balance_callback = &balance_push_callback;
9163
9164
9165
9166
9167
9168 if (!cpu_dying(rq->cpu) || rq != this_rq())
9169 return;
9170
9171
9172
9173
9174
9175 if (kthread_is_per_cpu(push_task) ||
9176 is_migration_disabled(push_task)) {
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189 if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
9190 rcuwait_active(&rq->hotplug_wait)) {
9191 raw_spin_rq_unlock(rq);
9192 rcuwait_wake_up(&rq->hotplug_wait);
9193 raw_spin_rq_lock(rq);
9194 }
9195 return;
9196 }
9197
9198 get_task_struct(push_task);
9199
9200
9201
9202
9203 raw_spin_rq_unlock(rq);
9204 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
9205 this_cpu_ptr(&push_work));
9206
9207
9208
9209
9210
9211 raw_spin_rq_lock(rq);
9212 }
9213
9214 static void balance_push_set(int cpu, bool on)
9215 {
9216 struct rq *rq = cpu_rq(cpu);
9217 struct rq_flags rf;
9218
9219 rq_lock_irqsave(rq, &rf);
9220 if (on) {
9221 WARN_ON_ONCE(rq->balance_callback);
9222 rq->balance_callback = &balance_push_callback;
9223 } else if (rq->balance_callback == &balance_push_callback) {
9224 rq->balance_callback = NULL;
9225 }
9226 rq_unlock_irqrestore(rq, &rf);
9227 }
9228
9229
9230
9231
9232
9233
9234
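/*
 * Called while the CPU is going down: wait until all tasks that are not
 * per-CPU kernel threads have been pushed off this runqueue and the CPU
 * is quiescent.
 */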
9235 static void balance_hotplug_wait(void)
9236 {
9237 struct rq *rq = this_rq();
9238
9239 rcuwait_wait_event(&rq->hotplug_wait,
9240 rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
9241 TASK_UNINTERRUPTIBLE);
9242 }
9243
9244 #else
9245
9246 static inline void balance_push(struct rq *rq)
9247 {
9248 }
9249
9250 static inline void balance_push_set(int cpu, bool on)
9251 {
9252 }
9253
9254 static inline void balance_hotplug_wait(void)
9255 {
9256 }
9257
9258 #endif
9259
9260 void set_rq_online(struct rq *rq)
9261 {
9262 if (!rq->online) {
9263 const struct sched_class *class;
9264
9265 cpumask_set_cpu(rq->cpu, rq->rd->online);
9266 rq->online = 1;
9267
9268 for_each_class(class) {
9269 if (class->rq_online)
9270 class->rq_online(rq);
9271 }
9272 }
9273 }
9274
9275 void set_rq_offline(struct rq *rq)
9276 {
9277 if (rq->online) {
9278 const struct sched_class *class;
9279
9280 for_each_class(class) {
9281 if (class->rq_offline)
9282 class->rq_offline(rq);
9283 }
9284
9285 cpumask_clear_cpu(rq->cpu, rq->rd->online);
9286 rq->online = 0;
9287 }
9288 }
9289
9290
9291
9292
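/* Number of CPUs taken offline while tasks were frozen (suspend/resume). */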
9293 static int num_cpus_frozen;
9294
9295
9296
9297
9298
9299
9300
9301
9302
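/*
 * Update cpusets to match the new cpu_active mask. During resume from
 * suspend (cpuhp_tasks_frozen) only a single scheduler domain is kept
 * until the last frozen CPU is back online, at which point the cpuset
 * hierarchy is forcibly rebuilt.
 */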
9303 static void cpuset_cpu_active(void)
9304 {
9305 if (cpuhp_tasks_frozen) {
9306
9307
9308
9309
9310
9311
9312 partition_sched_domains(1, NULL, NULL);
9313 if (--num_cpus_frozen)
9314 return;
9315
9316
9317
9318
9319
9320 cpuset_force_rebuild();
9321 }
9322 cpuset_update_active_cpus();
9323 }
9324
9325 static int cpuset_cpu_inactive(unsigned int cpu)
9326 {
9327 if (!cpuhp_tasks_frozen) {
9328 int ret = dl_cpu_busy(cpu, NULL);
9329
9330 if (ret)
9331 return ret;
9332 cpuset_update_active_cpus();
9333 } else {
9334 num_cpus_frozen++;
9335 partition_sched_domains(1, NULL, NULL);
9336 }
9337 return 0;
9338 }
9339
9340 int sched_cpu_activate(unsigned int cpu)
9341 {
9342 struct rq *rq = cpu_rq(cpu);
9343 struct rq_flags rf;
9344
9345
9346
9347
9348
9349 balance_push_set(cpu, false);
9350
9351 #ifdef CONFIG_SCHED_SMT
9352
9353
9354
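/*
 * When going up, increment the number of cores with SMT present.
 */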
9355 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9356 static_branch_inc_cpuslocked(&sched_smt_present);
9357 #endif
9358 set_cpu_active(cpu, true);
9359
9360 if (sched_smp_initialized) {
9361 sched_update_numa(cpu, true);
9362 sched_domains_numa_masks_set(cpu);
9363 cpuset_cpu_active();
9364 }
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375 rq_lock_irqsave(rq, &rf);
9376 if (rq->rd) {
9377 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9378 set_rq_online(rq);
9379 }
9380 rq_unlock_irqrestore(rq, &rf);
9381
9382 return 0;
9383 }
9384
9385 int sched_cpu_deactivate(unsigned int cpu)
9386 {
9387 struct rq *rq = cpu_rq(cpu);
9388 struct rq_flags rf;
9389 int ret;
9390
9391
9392
9393
9394
9395 nohz_balance_exit_idle(rq);
9396
9397 set_cpu_active(cpu, false);
9398
9399
9400
9401
9402
9403
9404
9405 balance_push_set(cpu, true);
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417 synchronize_rcu();
9418
9419 rq_lock_irqsave(rq, &rf);
9420 if (rq->rd) {
9421 update_rq_clock(rq);
9422 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9423 set_rq_offline(rq);
9424 }
9425 rq_unlock_irqrestore(rq, &rf);
9426
9427 #ifdef CONFIG_SCHED_SMT
9428
9429
9430
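/*
 * When going down, decrement the number of cores with SMT present.
 */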
9431 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9432 static_branch_dec_cpuslocked(&sched_smt_present);
9433
9434 sched_core_cpu_deactivate(cpu);
9435 #endif
9436
9437 if (!sched_smp_initialized)
9438 return 0;
9439
9440 sched_update_numa(cpu, false);
9441 ret = cpuset_cpu_inactive(cpu);
9442 if (ret) {
9443 balance_push_set(cpu, false);
9444 set_cpu_active(cpu, true);
9445 sched_update_numa(cpu, true);
9446 return ret;
9447 }
9448 sched_domains_numa_masks_clear(cpu);
9449 return 0;
9450 }
9451
9452 static void sched_rq_cpu_starting(unsigned int cpu)
9453 {
9454 struct rq *rq = cpu_rq(cpu);
9455
9456 rq->calc_load_update = calc_load_update;
9457 update_max_interval();
9458 }
9459
9460 int sched_cpu_starting(unsigned int cpu)
9461 {
9462 sched_core_cpu_starting(cpu);
9463 sched_rq_cpu_starting(cpu);
9464 sched_tick_start(cpu);
9465 return 0;
9466 }
9467
9468 #ifdef CONFIG_HOTPLUG_CPU
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
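/*
 * Invoked just before the stopper thread takes the CPU down completely:
 * wait until the outgoing CPU's runqueue has been emptied of everything
 * but per-CPU kernel threads.
 */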
9481 int sched_cpu_wait_empty(unsigned int cpu)
9482 {
9483 balance_hotplug_wait();
9484 return 0;
9485 }
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
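/*
 * The outgoing CPU is going away for a while; fold any nr_active delta it
 * still holds into the global calc_load_tasks counter.
 */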
9496 static void calc_load_migrate(struct rq *rq)
9497 {
9498 long delta = calc_load_fold_active(rq, 1);
9499
9500 if (delta)
9501 atomic_long_add(delta, &calc_load_tasks);
9502 }
9503
9504 static void dump_rq_tasks(struct rq *rq, const char *loglvl)
9505 {
9506 struct task_struct *g, *p;
9507 int cpu = cpu_of(rq);
9508
9509 lockdep_assert_rq_held(rq);
9510
9511 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
9512 for_each_process_thread(g, p) {
9513 if (task_cpu(p) != cpu)
9514 continue;
9515
9516 if (!task_on_rq_queued(p))
9517 continue;
9518
9519 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
9520 }
9521 }
9522
9523 int sched_cpu_dying(unsigned int cpu)
9524 {
9525 struct rq *rq = cpu_rq(cpu);
9526 struct rq_flags rf;
9527
9528
9529 sched_tick_stop(cpu);
9530
9531 rq_lock_irqsave(rq, &rf);
9532 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
9533 WARN(true, "Dying CPU not properly vacated!");
9534 dump_rq_tasks(rq, KERN_WARNING);
9535 }
9536 rq_unlock_irqrestore(rq, &rf);
9537
9538 calc_load_migrate(rq);
9539 update_max_interval();
9540 hrtick_clear(rq);
9541 sched_core_cpu_dying(cpu);
9542 return 0;
9543 }
9544 #endif
9545
9546 void __init sched_init_smp(void)
9547 {
9548 sched_init_numa(NUMA_NO_NODE);
9549
9550
9551
9552
9553
9554
9555 mutex_lock(&sched_domains_mutex);
9556 sched_init_domains(cpu_active_mask);
9557 mutex_unlock(&sched_domains_mutex);
9558
9559
9560 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
9561 BUG();
9562 current->flags &= ~PF_NO_SETAFFINITY;
9563 sched_init_granularity();
9564
9565 init_sched_rt_class();
9566 init_sched_dl_class();
9567
9568 sched_smp_initialized = true;
9569 }
9570
9571 static int __init migration_init(void)
9572 {
9573 sched_cpu_starting(smp_processor_id());
9574 return 0;
9575 }
9576 early_initcall(migration_init);
9577
9578 #else
9579 void __init sched_init_smp(void)
9580 {
9581 sched_init_granularity();
9582 }
9583 #endif
9584
9585 int in_sched_functions(unsigned long addr)
9586 {
9587 return in_lock_functions(addr) ||
9588 (addr >= (unsigned long)__sched_text_start
9589 && addr < (unsigned long)__sched_text_end);
9590 }
9591
9592 #ifdef CONFIG_CGROUP_SCHED
9593
9594
9595
9596
9597 struct task_group root_task_group;
9598 LIST_HEAD(task_groups);
9599
9600
9601 static struct kmem_cache *task_group_cache __read_mostly;
9602 #endif
9603
9604 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
9605 DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
9606
9607 void __init sched_init(void)
9608 {
9609 unsigned long ptr = 0;
9610 int i;
9611
9612
9613 BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
9614 &fair_sched_class != &rt_sched_class + 1 ||
9615 &rt_sched_class != &dl_sched_class + 1);
9616 #ifdef CONFIG_SMP
9617 BUG_ON(&dl_sched_class != &stop_sched_class + 1);
9618 #endif
9619
9620 wait_bit_init();
9621
9622 #ifdef CONFIG_FAIR_GROUP_SCHED
9623 ptr += 2 * nr_cpu_ids * sizeof(void **);
9624 #endif
9625 #ifdef CONFIG_RT_GROUP_SCHED
9626 ptr += 2 * nr_cpu_ids * sizeof(void **);
9627 #endif
9628 if (ptr) {
9629 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
9630
9631 #ifdef CONFIG_FAIR_GROUP_SCHED
9632 root_task_group.se = (struct sched_entity **)ptr;
9633 ptr += nr_cpu_ids * sizeof(void **);
9634
9635 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9636 ptr += nr_cpu_ids * sizeof(void **);
9637
9638 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
9639 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
9640 #endif
9641 #ifdef CONFIG_RT_GROUP_SCHED
9642 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9643 ptr += nr_cpu_ids * sizeof(void **);
9644
9645 root_task_group.rt_rq = (struct rt_rq **)ptr;
9646 ptr += nr_cpu_ids * sizeof(void **);
9647
9648 #endif
9649 }
9650 #ifdef CONFIG_CPUMASK_OFFSTACK
9651 for_each_possible_cpu(i) {
9652 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
9653 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9654 per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
9655 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9656 }
9657 #endif
9658
9659 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
9660
9661 #ifdef CONFIG_SMP
9662 init_defrootdomain();
9663 #endif
9664
9665 #ifdef CONFIG_RT_GROUP_SCHED
9666 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9667 global_rt_period(), global_rt_runtime());
9668 #endif
9669
9670 #ifdef CONFIG_CGROUP_SCHED
9671 task_group_cache = KMEM_CACHE(task_group, 0);
9672
9673 list_add(&root_task_group.list, &task_groups);
9674 INIT_LIST_HEAD(&root_task_group.children);
9675 INIT_LIST_HEAD(&root_task_group.siblings);
9676 autogroup_init(&init_task);
9677 #endif
9678
9679 for_each_possible_cpu(i) {
9680 struct rq *rq;
9681
9682 rq = cpu_rq(i);
9683 raw_spin_lock_init(&rq->__lock);
9684 rq->nr_running = 0;
9685 rq->calc_load_active = 0;
9686 rq->calc_load_update = jiffies + LOAD_FREQ;
9687 init_cfs_rq(&rq->cfs);
9688 init_rt_rq(&rq->rt);
9689 init_dl_rq(&rq->dl);
9690 #ifdef CONFIG_FAIR_GROUP_SCHED
9691 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9692 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
9713 #endif
9714
9715 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9716 #ifdef CONFIG_RT_GROUP_SCHED
9717 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
9718 #endif
9719 #ifdef CONFIG_SMP
9720 rq->sd = NULL;
9721 rq->rd = NULL;
9722 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
9723 rq->balance_callback = &balance_push_callback;
9724 rq->active_balance = 0;
9725 rq->next_balance = jiffies;
9726 rq->push_cpu = 0;
9727 rq->cpu = i;
9728 rq->online = 0;
9729 rq->idle_stamp = 0;
9730 rq->avg_idle = 2*sysctl_sched_migration_cost;
9731 rq->wake_stamp = jiffies;
9732 rq->wake_avg_idle = rq->avg_idle;
9733 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
9734
9735 INIT_LIST_HEAD(&rq->cfs_tasks);
9736
9737 rq_attach_root(rq, &def_root_domain);
9738 #ifdef CONFIG_NO_HZ_COMMON
9739 rq->last_blocked_load_update_tick = jiffies;
9740 atomic_set(&rq->nohz_flags, 0);
9741
9742 INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
9743 #endif
9744 #ifdef CONFIG_HOTPLUG_CPU
9745 rcuwait_init(&rq->hotplug_wait);
9746 #endif
9747 #endif
9748 hrtick_rq_init(rq);
9749 atomic_set(&rq->nr_iowait, 0);
9750
9751 #ifdef CONFIG_SCHED_CORE
9752 rq->core = rq;
9753 rq->core_pick = NULL;
9754 rq->core_enabled = 0;
9755 rq->core_tree = RB_ROOT;
9756 rq->core_forceidle_count = 0;
9757 rq->core_forceidle_occupation = 0;
9758 rq->core_forceidle_start = 0;
9759
9760 rq->core_cookie = 0UL;
9761 #endif
9762 }
9763
9764 set_load_weight(&init_task, false);
9765
9766
9767
9768
9769 mmgrab(&init_mm);
9770 enter_lazy_tlb(&init_mm, current);
9771
9772
9773
9774
9775
9776
9777
9778 WARN_ON(!set_kthread_struct(current));
9779
9780
9781
9782
9783
9784
9785
9786 init_idle(current, smp_processor_id());
9787
9788 calc_load_update = jiffies + LOAD_FREQ;
9789
9790 #ifdef CONFIG_SMP
9791 idle_thread_set_boot_cpu();
9792 balance_push_set(smp_processor_id(), false);
9793 #endif
9794 init_sched_fair_class();
9795
9796 psi_init();
9797
9798 init_uclamp();
9799
9800 preempt_dynamic_init();
9801
9802 scheduler_running = 1;
9803 }
9804
9805 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9806
9807 void __might_sleep(const char *file, int line)
9808 {
9809 unsigned int state = get_current_state();
9810
9811
9812
9813
9814
9815 WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
9816 "do not call blocking ops when !TASK_RUNNING; "
9817 "state=%x set at [<%p>] %pS\n", state,
9818 (void *)current->task_state_change,
9819 (void *)current->task_state_change);
9820
9821 __might_resched(file, line, 0);
9822 }
9823 EXPORT_SYMBOL(__might_sleep);
9824
9825 static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
9826 {
9827 if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
9828 return;
9829
9830 if (preempt_count() == preempt_offset)
9831 return;
9832
9833 pr_err("Preemption disabled at:");
9834 print_ip_sym(KERN_ERR, ip);
9835 }
9836
9837 static inline bool resched_offsets_ok(unsigned int offsets)
9838 {
9839 unsigned int nested = preempt_count();
9840
9841 nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
9842
9843 return nested == offsets;
9844 }
9845
9846 void __might_resched(const char *file, int line, unsigned int offsets)
9847 {
9848
9849 static unsigned long prev_jiffy;
9850
9851 unsigned long preempt_disable_ip;
9852
9853
9854 rcu_sleep_check();
9855
9856 if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
9857 !is_idle_task(current) && !current->non_block_count) ||
9858 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
9859 oops_in_progress)
9860 return;
9861
9862 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9863 return;
9864 prev_jiffy = jiffies;
9865
9866
9867 preempt_disable_ip = get_preempt_disable_ip(current);
9868
9869 pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
9870 file, line);
9871 pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
9872 in_atomic(), irqs_disabled(), current->non_block_count,
9873 current->pid, current->comm);
9874 pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
9875 offsets & MIGHT_RESCHED_PREEMPT_MASK);
9876
9877 if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
9878 pr_err("RCU nest depth: %d, expected: %u\n",
9879 rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
9880 }
9881
9882 if (task_stack_end_corrupted(current))
9883 pr_emerg("Thread overran stack, or stack corrupted\n");
9884
9885 debug_show_held_locks(current);
9886 if (irqs_disabled())
9887 print_irqtrace_events(current);
9888
9889 print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
9890 preempt_disable_ip);
9891
9892 dump_stack();
9893 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9894 }
9895 EXPORT_SYMBOL(__might_resched);
9896
9897 void __cant_sleep(const char *file, int line, int preempt_offset)
9898 {
9899 static unsigned long prev_jiffy;
9900
9901 if (irqs_disabled())
9902 return;
9903
9904 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9905 return;
9906
9907 if (preempt_count() > preempt_offset)
9908 return;
9909
9910 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9911 return;
9912 prev_jiffy = jiffies;
9913
9914 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
9915 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9916 in_atomic(), irqs_disabled(),
9917 current->pid, current->comm);
9918
9919 debug_show_held_locks(current);
9920 dump_stack();
9921 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9922 }
9923 EXPORT_SYMBOL_GPL(__cant_sleep);
9924
9925 #ifdef CONFIG_SMP
9926 void __cant_migrate(const char *file, int line)
9927 {
9928 static unsigned long prev_jiffy;
9929
9930 if (irqs_disabled())
9931 return;
9932
9933 if (is_migration_disabled(current))
9934 return;
9935
9936 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9937 return;
9938
9939 if (preempt_count() > 0)
9940 return;
9941
9942 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9943 return;
9944 prev_jiffy = jiffies;
9945
9946 pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
9947 pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
9948 in_atomic(), irqs_disabled(), is_migration_disabled(current),
9949 current->pid, current->comm);
9950
9951 debug_show_held_locks(current);
9952 dump_stack();
9953 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9954 }
9955 EXPORT_SYMBOL_GPL(__cant_migrate);
9956 #endif
9957 #endif
9958
9959 #ifdef CONFIG_MAGIC_SYSRQ
9960 void normalize_rt_tasks(void)
9961 {
9962 struct task_struct *g, *p;
9963 struct sched_attr attr = {
9964 .sched_policy = SCHED_NORMAL,
9965 };
9966
9967 read_lock(&tasklist_lock);
9968 for_each_process_thread(g, p) {
9969
9970
9971
9972 if (p->flags & PF_KTHREAD)
9973 continue;
9974
9975 p->se.exec_start = 0;
9976 schedstat_set(p->stats.wait_start, 0);
9977 schedstat_set(p->stats.sleep_start, 0);
9978 schedstat_set(p->stats.block_start, 0);
9979
9980 if (!dl_task(p) && !rt_task(p)) {
9981
9982
9983
9984
9985 if (task_nice(p) < 0)
9986 set_user_nice(p, 0);
9987 continue;
9988 }
9989
9990 __sched_setscheduler(p, &attr, false, false);
9991 }
9992 read_unlock(&tasklist_lock);
9993 }
9994
9995 #endif
9996
9997 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
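/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */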
10016 struct task_struct *curr_task(int cpu)
10017 {
10018 return cpu_curr(cpu);
10019 }
10020
10021 #endif
10022
10023 #ifdef CONFIG_IA64
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
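/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */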
10039 void ia64_set_curr_task(int cpu, struct task_struct *p)
10040 {
10041 cpu_curr(cpu) = p;
10042 }
10043
10044 #endif
10045
10046 #ifdef CONFIG_CGROUP_SCHED
10047
10048 static DEFINE_SPINLOCK(task_group_lock);
10049
10050 static inline void alloc_uclamp_sched_group(struct task_group *tg,
10051 struct task_group *parent)
10052 {
10053 #ifdef CONFIG_UCLAMP_TASK_GROUP
10054 enum uclamp_id clamp_id;
10055
10056 for_each_clamp_id(clamp_id) {
10057 uclamp_se_set(&tg->uclamp_req[clamp_id],
10058 uclamp_none(clamp_id), false);
10059 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
10060 }
10061 #endif
10062 }
10063
10064 static void sched_free_group(struct task_group *tg)
10065 {
10066 free_fair_sched_group(tg);
10067 free_rt_sched_group(tg);
10068 autogroup_free(tg);
10069 kmem_cache_free(task_group_cache, tg);
10070 }
10071
10072 static void sched_free_group_rcu(struct rcu_head *rcu)
10073 {
10074 sched_free_group(container_of(rcu, struct task_group, rcu));
10075 }
10076
10077 static void sched_unregister_group(struct task_group *tg)
10078 {
10079 unregister_fair_sched_group(tg);
10080 unregister_rt_sched_group(tg);
10081
10082
10083
10084
10085 call_rcu(&tg->rcu, sched_free_group_rcu);
10086 }
10087
10088
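/* Allocate runqueue structures etc. for a new task group. */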
10089 struct task_group *sched_create_group(struct task_group *parent)
10090 {
10091 struct task_group *tg;
10092
10093 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
10094 if (!tg)
10095 return ERR_PTR(-ENOMEM);
10096
10097 if (!alloc_fair_sched_group(tg, parent))
10098 goto err;
10099
10100 if (!alloc_rt_sched_group(tg, parent))
10101 goto err;
10102
10103 alloc_uclamp_sched_group(tg, parent);
10104
10105 return tg;
10106
10107 err:
10108 sched_free_group(tg);
10109 return ERR_PTR(-ENOMEM);
10110 }
10111
10112 void sched_online_group(struct task_group *tg, struct task_group *parent)
10113 {
10114 unsigned long flags;
10115
10116 spin_lock_irqsave(&task_group_lock, flags);
10117 list_add_rcu(&tg->list, &task_groups);
10118
10119
10120 WARN_ON(!parent);
10121
10122 tg->parent = parent;
10123 INIT_LIST_HEAD(&tg->children);
10124 list_add_rcu(&tg->siblings, &parent->children);
10125 spin_unlock_irqrestore(&task_group_lock, flags);
10126
10127 online_fair_sched_group(tg);
10128 }
10129
10130
10131 static void sched_unregister_group_rcu(struct rcu_head *rhp)
10132 {
10133
10134 sched_unregister_group(container_of(rhp, struct task_group, rcu));
10135 }
10136
10137 void sched_destroy_group(struct task_group *tg)
10138 {
10139
10140 call_rcu(&tg->rcu, sched_unregister_group_rcu);
10141 }
10142
10143 void sched_release_group(struct task_group *tg)
10144 {
10145 unsigned long flags;
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160 spin_lock_irqsave(&task_group_lock, flags);
10161 list_del_rcu(&tg->list);
10162 list_del_rcu(&tg->siblings);
10163 spin_unlock_irqrestore(&task_group_lock, flags);
10164 }
10165
10166 static void sched_change_group(struct task_struct *tsk, int type)
10167 {
10168 struct task_group *tg;
10169
10170
10171
10172
10173
10174
10175 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
10176 struct task_group, css);
10177 tg = autogroup_task_group(tsk, tg);
10178 tsk->sched_task_group = tg;
10179
10180 #ifdef CONFIG_FAIR_GROUP_SCHED
10181 if (tsk->sched_class->task_change_group)
10182 tsk->sched_class->task_change_group(tsk, type);
10183 else
10184 #endif
10185 set_task_rq(tsk, task_cpu(tsk));
10186 }
10187
10188
10189
10190
10191
10192
10193
10194
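/*
 * Change a task's runqueue/cgroup association. The caller must already
 * have placed the task in its new group; this dequeues and re-queues it
 * so the per-class group state (cfs_rq/rt_rq) matches.
 */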
10195 void sched_move_task(struct task_struct *tsk)
10196 {
10197 int queued, running, queue_flags =
10198 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
10199 struct rq_flags rf;
10200 struct rq *rq;
10201
10202 rq = task_rq_lock(tsk, &rf);
10203 update_rq_clock(rq);
10204
10205 running = task_current(rq, tsk);
10206 queued = task_on_rq_queued(tsk);
10207
10208 if (queued)
10209 dequeue_task(rq, tsk, queue_flags);
10210 if (running)
10211 put_prev_task(rq, tsk);
10212
10213 sched_change_group(tsk, TASK_MOVE_GROUP);
10214
10215 if (queued)
10216 enqueue_task(rq, tsk, queue_flags);
10217 if (running) {
10218 set_next_task(rq, tsk);
10219
10220
10221
10222
10223
10224 resched_curr(rq);
10225 }
10226
10227 task_rq_unlock(rq, tsk, &rf);
10228 }
10229
10230 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
10231 {
10232 return css ? container_of(css, struct task_group, css) : NULL;
10233 }
10234
10235 static struct cgroup_subsys_state *
10236 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
10237 {
10238 struct task_group *parent = css_tg(parent_css);
10239 struct task_group *tg;
10240
10241 if (!parent) {
10242
10243 return &root_task_group.css;
10244 }
10245
10246 tg = sched_create_group(parent);
10247 if (IS_ERR(tg))
10248 return ERR_PTR(-ENOMEM);
10249
10250 return &tg->css;
10251 }
10252
10253
10254 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
10255 {
10256 struct task_group *tg = css_tg(css);
10257 struct task_group *parent = css_tg(css->parent);
10258
10259 if (parent)
10260 sched_online_group(tg, parent);
10261
10262 #ifdef CONFIG_UCLAMP_TASK_GROUP
10263
10264 mutex_lock(&uclamp_mutex);
10265 rcu_read_lock();
10266 cpu_util_update_eff(css);
10267 rcu_read_unlock();
10268 mutex_unlock(&uclamp_mutex);
10269 #endif
10270
10271 return 0;
10272 }
10273
10274 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
10275 {
10276 struct task_group *tg = css_tg(css);
10277
10278 sched_release_group(tg);
10279 }
10280
10281 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
10282 {
10283 struct task_group *tg = css_tg(css);
10284
10285
10286
10287
10288 sched_unregister_group(tg);
10289 }
10290
10291
10292
10293
10294
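/*
 * Runs for each new task before it can be woken for the first time, so
 * only the group association needs to be set up here.
 */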
10295 static void cpu_cgroup_fork(struct task_struct *task)
10296 {
10297 struct rq_flags rf;
10298 struct rq *rq;
10299
10300 rq = task_rq_lock(task, &rf);
10301
10302 update_rq_clock(rq);
10303 sched_change_group(task, TASK_SET_GROUP);
10304
10305 task_rq_unlock(rq, task, &rf);
10306 }
10307
10308 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
10309 {
10310 struct task_struct *task;
10311 struct cgroup_subsys_state *css;
10312 int ret = 0;
10313
10314 cgroup_taskset_for_each(task, css, tset) {
10315 #ifdef CONFIG_RT_GROUP_SCHED
10316 if (!sched_rt_can_attach(css_tg(css), task))
10317 return -EINVAL;
10318 #endif
10319
10320
10321
10322
10323 raw_spin_lock_irq(&task->pi_lock);
10324
10325
10326
10327
10328
10329 if (READ_ONCE(task->__state) == TASK_NEW)
10330 ret = -EINVAL;
10331 raw_spin_unlock_irq(&task->pi_lock);
10332
10333 if (ret)
10334 break;
10335 }
10336 return ret;
10337 }
10338
10339 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
10340 {
10341 struct task_struct *task;
10342 struct cgroup_subsys_state *css;
10343
10344 cgroup_taskset_for_each(task, css, tset)
10345 sched_move_task(task);
10346 }
10347
10348 #ifdef CONFIG_UCLAMP_TASK_GROUP
10349 static void cpu_util_update_eff(struct cgroup_subsys_state *css)
10350 {
10351 struct cgroup_subsys_state *top_css = css;
10352 struct uclamp_se *uc_parent = NULL;
10353 struct uclamp_se *uc_se = NULL;
10354 unsigned int eff[UCLAMP_CNT];
10355 enum uclamp_id clamp_id;
10356 unsigned int clamps;
10357
10358 lockdep_assert_held(&uclamp_mutex);
10359 SCHED_WARN_ON(!rcu_read_lock_held());
10360
10361 css_for_each_descendant_pre(css, top_css) {
10362 uc_parent = css_tg(css)->parent
10363 ? css_tg(css)->parent->uclamp : NULL;
10364
10365 for_each_clamp_id(clamp_id) {
10366
10367 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
10368
10369 if (uc_parent &&
10370 eff[clamp_id] > uc_parent[clamp_id].value) {
10371 eff[clamp_id] = uc_parent[clamp_id].value;
10372 }
10373 }
10374
10375 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
10376
10377
10378 clamps = 0x0;
10379 uc_se = css_tg(css)->uclamp;
10380 for_each_clamp_id(clamp_id) {
10381 if (eff[clamp_id] == uc_se[clamp_id].value)
10382 continue;
10383 uc_se[clamp_id].value = eff[clamp_id];
10384 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
10385 clamps |= (0x1 << clamp_id);
10386 }
10387 if (!clamps) {
10388 css = css_rightmost_descendant(css);
10389 continue;
10390 }
10391
10392
10393 uclamp_update_active_tasks(css);
10394 }
10395 }
10396
10397
10398
10399
10400
10401
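/*
 * Integer 10^N, built by casting the literal "1eN" to an integer. Two
 * macro levels are needed so that the exponent argument is expanded
 * before token pasting.
 */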
10402 #define _POW10(exp) ((unsigned int)1e##exp)
10403 #define POW10(exp) _POW10(exp)
10404
10405 struct uclamp_request {
10406 #define UCLAMP_PERCENT_SHIFT 2
10407 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
10408 s64 percent;
10409 u64 util;
10410 int ret;
10411 };
10412
10413 static inline struct uclamp_request
10414 capacity_from_percent(char *buf)
10415 {
10416 struct uclamp_request req = {
10417 .percent = UCLAMP_PERCENT_SCALE,
10418 .util = SCHED_CAPACITY_SCALE,
10419 .ret = 0,
10420 };
10421
10422 buf = strim(buf);
10423 if (strcmp(buf, "max")) {
10424 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
10425 &req.percent);
10426 if (req.ret)
10427 return req;
10428 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
10429 req.ret = -ERANGE;
10430 return req;
10431 }
10432
10433 req.util = req.percent << SCHED_CAPACITY_SHIFT;
10434 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
10435 }
10436
10437 return req;
10438 }
10439
10440 static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
10441 size_t nbytes, loff_t off,
10442 enum uclamp_id clamp_id)
10443 {
10444 struct uclamp_request req;
10445 struct task_group *tg;
10446
10447 req = capacity_from_percent(buf);
10448 if (req.ret)
10449 return req.ret;
10450
10451 static_branch_enable(&sched_uclamp_used);
10452
10453 mutex_lock(&uclamp_mutex);
10454 rcu_read_lock();
10455
10456 tg = css_tg(of_css(of));
10457 if (tg->uclamp_req[clamp_id].value != req.util)
10458 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
10459
10460
10461
10462
10463
10464 tg->uclamp_pct[clamp_id] = req.percent;
10465
10466
10467 cpu_util_update_eff(of_css(of));
10468
10469 rcu_read_unlock();
10470 mutex_unlock(&uclamp_mutex);
10471
10472 return nbytes;
10473 }
10474
10475 static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
10476 char *buf, size_t nbytes,
10477 loff_t off)
10478 {
10479 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
10480 }
10481
10482 static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
10483 char *buf, size_t nbytes,
10484 loff_t off)
10485 {
10486 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
10487 }
10488
10489 static inline void cpu_uclamp_print(struct seq_file *sf,
10490 enum uclamp_id clamp_id)
10491 {
10492 struct task_group *tg;
10493 u64 util_clamp;
10494 u64 percent;
10495 u32 rem;
10496
10497 rcu_read_lock();
10498 tg = css_tg(seq_css(sf));
10499 util_clamp = tg->uclamp_req[clamp_id].value;
10500 rcu_read_unlock();
10501
10502 if (util_clamp == SCHED_CAPACITY_SCALE) {
10503 seq_puts(sf, "max\n");
10504 return;
10505 }
10506
10507 percent = tg->uclamp_pct[clamp_id];
10508 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
10509 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
10510 }
10511
10512 static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
10513 {
10514 cpu_uclamp_print(sf, UCLAMP_MIN);
10515 return 0;
10516 }
10517
10518 static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
10519 {
10520 cpu_uclamp_print(sf, UCLAMP_MAX);
10521 return 0;
10522 }
10523 #endif
10524
10525 #ifdef CONFIG_FAIR_GROUP_SCHED
10526 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
10527 struct cftype *cftype, u64 shareval)
10528 {
10529 if (shareval > scale_load_down(ULONG_MAX))
10530 shareval = MAX_SHARES;
10531 return sched_group_set_shares(css_tg(css), scale_load(shareval));
10532 }
10533
10534 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
10535 struct cftype *cft)
10536 {
10537 struct task_group *tg = css_tg(css);
10538
10539 return (u64) scale_load_down(tg->shares);
10540 }
10541
10542 #ifdef CONFIG_CFS_BANDWIDTH
10543 static DEFINE_MUTEX(cfs_constraints_mutex);
10544
10545 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
10546 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
10547
10548 static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
10549
10550 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
10551
10552 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
10553 u64 burst)
10554 {
10555 int i, ret = 0, runtime_enabled, runtime_was_enabled;
10556 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10557
10558 if (tg == &root_task_group)
10559 return -EINVAL;
10560
10561
10562
10563
10564
10565
10566 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
10567 return -EINVAL;
10568
10569
10570
10571
10572
10573
10574 if (period > max_cfs_quota_period)
10575 return -EINVAL;
10576
10577
10578
10579
10580 if (quota != RUNTIME_INF && quota > max_cfs_runtime)
10581 return -EINVAL;
10582
10583 if (quota != RUNTIME_INF && (burst > quota ||
10584 burst + quota > max_cfs_runtime))
10585 return -EINVAL;
10586
10587
10588
10589
10590
10591 cpus_read_lock();
10592 mutex_lock(&cfs_constraints_mutex);
10593 ret = __cfs_schedulable(tg, period, quota);
10594 if (ret)
10595 goto out_unlock;
10596
10597 runtime_enabled = quota != RUNTIME_INF;
10598 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
10599
10600
10601
10602
10603 if (runtime_enabled && !runtime_was_enabled)
10604 cfs_bandwidth_usage_inc();
10605 raw_spin_lock_irq(&cfs_b->lock);
10606 cfs_b->period = ns_to_ktime(period);
10607 cfs_b->quota = quota;
10608 cfs_b->burst = burst;
10609
10610 __refill_cfs_bandwidth_runtime(cfs_b);
10611
10612
10613 if (runtime_enabled)
10614 start_cfs_bandwidth(cfs_b);
10615
10616 raw_spin_unlock_irq(&cfs_b->lock);
10617
10618 for_each_online_cpu(i) {
10619 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
10620 struct rq *rq = cfs_rq->rq;
10621 struct rq_flags rf;
10622
10623 rq_lock_irq(rq, &rf);
10624 cfs_rq->runtime_enabled = runtime_enabled;
10625 cfs_rq->runtime_remaining = 0;
10626
10627 if (cfs_rq->throttled)
10628 unthrottle_cfs_rq(cfs_rq);
10629 rq_unlock_irq(rq, &rf);
10630 }
10631 if (runtime_was_enabled && !runtime_enabled)
10632 cfs_bandwidth_usage_dec();
10633 out_unlock:
10634 mutex_unlock(&cfs_constraints_mutex);
10635 cpus_read_unlock();
10636
10637 return ret;
10638 }
10639
10640 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
10641 {
10642 u64 quota, period, burst;
10643
10644 period = ktime_to_ns(tg->cfs_bandwidth.period);
10645 burst = tg->cfs_bandwidth.burst;
10646 if (cfs_quota_us < 0)
10647 quota = RUNTIME_INF;
10648 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
10649 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
10650 else
10651 return -EINVAL;
10652
10653 return tg_set_cfs_bandwidth(tg, period, quota, burst);
10654 }
10655
10656 static long tg_get_cfs_quota(struct task_group *tg)
10657 {
10658 u64 quota_us;
10659
10660 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
10661 return -1;
10662
10663 quota_us = tg->cfs_bandwidth.quota;
10664 do_div(quota_us, NSEC_PER_USEC);
10665
10666 return quota_us;
10667 }
10668
10669 static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
10670 {
10671 u64 quota, period, burst;
10672
10673 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
10674 return -EINVAL;
10675
10676 period = (u64)cfs_period_us * NSEC_PER_USEC;
10677 quota = tg->cfs_bandwidth.quota;
10678 burst = tg->cfs_bandwidth.burst;
10679
10680 return tg_set_cfs_bandwidth(tg, period, quota, burst);
10681 }
10682
10683 static long tg_get_cfs_period(struct task_group *tg)
10684 {
10685 u64 cfs_period_us;
10686
10687 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
10688 do_div(cfs_period_us, NSEC_PER_USEC);
10689
10690 return cfs_period_us;
10691 }
10692
10693 static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
10694 {
10695 u64 quota, period, burst;
10696
10697 if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
10698 return -EINVAL;
10699
10700 burst = (u64)cfs_burst_us * NSEC_PER_USEC;
10701 period = ktime_to_ns(tg->cfs_bandwidth.period);
10702 quota = tg->cfs_bandwidth.quota;
10703
10704 return tg_set_cfs_bandwidth(tg, period, quota, burst);
10705 }
10706
10707 static long tg_get_cfs_burst(struct task_group *tg)
10708 {
10709 u64 burst_us;
10710
10711 burst_us = tg->cfs_bandwidth.burst;
10712 do_div(burst_us, NSEC_PER_USEC);
10713
10714 return burst_us;
10715 }
10716
10717 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
10718 struct cftype *cft)
10719 {
10720 return tg_get_cfs_quota(css_tg(css));
10721 }
10722
10723 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
10724 struct cftype *cftype, s64 cfs_quota_us)
10725 {
10726 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
10727 }
10728
10729 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
10730 struct cftype *cft)
10731 {
10732 return tg_get_cfs_period(css_tg(css));
10733 }
10734
10735 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
10736 struct cftype *cftype, u64 cfs_period_us)
10737 {
10738 return tg_set_cfs_period(css_tg(css), cfs_period_us);
10739 }
10740
10741 static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
10742 struct cftype *cft)
10743 {
10744 return tg_get_cfs_burst(css_tg(css));
10745 }
10746
10747 static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
10748 struct cftype *cftype, u64 cfs_burst_us)
10749 {
10750 return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
10751 }
10752
10753 struct cfs_schedulable_data {
10754 struct task_group *tg;
10755 u64 period, quota;
10756 };
10757
10758
10759
10760
10761
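/*
 * Normalize a group's quota/period pair to quota per max_period;
 * note: units are usecs.
 */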
10762 static u64 normalize_cfs_quota(struct task_group *tg,
10763 struct cfs_schedulable_data *d)
10764 {
10765 u64 quota, period;
10766
10767 if (tg == d->tg) {
10768 period = d->period;
10769 quota = d->quota;
10770 } else {
10771 period = tg_get_cfs_period(tg);
10772 quota = tg_get_cfs_quota(tg);
10773 }
10774
10775
10776 if (quota == RUNTIME_INF || quota == -1)
10777 return RUNTIME_INF;
10778
10779 return to_ratio(period, quota);
10780 }
10781
10782 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
10783 {
10784 struct cfs_schedulable_data *d = data;
10785 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10786 s64 quota = 0, parent_quota = -1;
10787
10788 if (!tg->parent) {
10789 quota = RUNTIME_INF;
10790 } else {
10791 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
10792
10793 quota = normalize_cfs_quota(tg, d);
10794 parent_quota = parent_b->hierarchical_quota;
10795
10796
10797
10798
10799
10800
10801 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
10802 quota = min(quota, parent_quota);
10803 } else {
10804 if (quota == RUNTIME_INF)
10805 quota = parent_quota;
10806 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
10807 return -EINVAL;
10808 }
10809 }
10810 cfs_b->hierarchical_quota = quota;
10811
10812 return 0;
10813 }
10814
10815 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
10816 {
10817 int ret;
10818 struct cfs_schedulable_data data = {
10819 .tg = tg,
10820 .period = period,
10821 .quota = quota,
10822 };
10823
10824 if (quota != RUNTIME_INF) {
10825 do_div(data.period, NSEC_PER_USEC);
10826 do_div(data.quota, NSEC_PER_USEC);
10827 }
10828
10829 rcu_read_lock();
10830 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
10831 rcu_read_unlock();
10832
10833 return ret;
10834 }
10835
10836 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
10837 {
10838 struct task_group *tg = css_tg(seq_css(sf));
10839 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10840
10841 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
10842 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
10843 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
10844
10845 if (schedstat_enabled() && tg != &root_task_group) {
10846 struct sched_statistics *stats;
10847 u64 ws = 0;
10848 int i;
10849
10850 for_each_possible_cpu(i) {
10851 stats = __schedstats_from_se(tg->se[i]);
10852 ws += schedstat_val(stats->wait_sum);
10853 }
10854
10855 seq_printf(sf, "wait_sum %llu\n", ws);
10856 }
10857
10858 seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
10859 seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
10860
10861 return 0;
10862 }
10863 #endif
10864 #endif
10865
10866 #ifdef CONFIG_RT_GROUP_SCHED
10867 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
10868 struct cftype *cft, s64 val)
10869 {
10870 return sched_group_set_rt_runtime(css_tg(css), val);
10871 }
10872
10873 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
10874 struct cftype *cft)
10875 {
10876 return sched_group_rt_runtime(css_tg(css));
10877 }
10878
10879 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
10880 struct cftype *cftype, u64 rt_period_us)
10881 {
10882 return sched_group_set_rt_period(css_tg(css), rt_period_us);
10883 }
10884
10885 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
10886 struct cftype *cft)
10887 {
10888 return sched_group_rt_period(css_tg(css));
10889 }
10890 #endif
10891
10892 #ifdef CONFIG_FAIR_GROUP_SCHED
10893 static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
10894 struct cftype *cft)
10895 {
10896 return css_tg(css)->idle;
10897 }
10898
10899 static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
10900 struct cftype *cft, s64 idle)
10901 {
10902 return sched_group_set_idle(css_tg(css), idle);
10903 }
10904 #endif
10905
10906 static struct cftype cpu_legacy_files[] = {
10907 #ifdef CONFIG_FAIR_GROUP_SCHED
10908 {
10909 .name = "shares",
10910 .read_u64 = cpu_shares_read_u64,
10911 .write_u64 = cpu_shares_write_u64,
10912 },
10913 {
10914 .name = "idle",
10915 .read_s64 = cpu_idle_read_s64,
10916 .write_s64 = cpu_idle_write_s64,
10917 },
10918 #endif
10919 #ifdef CONFIG_CFS_BANDWIDTH
10920 {
10921 .name = "cfs_quota_us",
10922 .read_s64 = cpu_cfs_quota_read_s64,
10923 .write_s64 = cpu_cfs_quota_write_s64,
10924 },
10925 {
10926 .name = "cfs_period_us",
10927 .read_u64 = cpu_cfs_period_read_u64,
10928 .write_u64 = cpu_cfs_period_write_u64,
10929 },
10930 {
10931 .name = "cfs_burst_us",
10932 .read_u64 = cpu_cfs_burst_read_u64,
10933 .write_u64 = cpu_cfs_burst_write_u64,
10934 },
10935 {
10936 .name = "stat",
10937 .seq_show = cpu_cfs_stat_show,
10938 },
10939 #endif
10940 #ifdef CONFIG_RT_GROUP_SCHED
10941 {
10942 .name = "rt_runtime_us",
10943 .read_s64 = cpu_rt_runtime_read,
10944 .write_s64 = cpu_rt_runtime_write,
10945 },
10946 {
10947 .name = "rt_period_us",
10948 .read_u64 = cpu_rt_period_read_uint,
10949 .write_u64 = cpu_rt_period_write_uint,
10950 },
10951 #endif
10952 #ifdef CONFIG_UCLAMP_TASK_GROUP
10953 {
10954 .name = "uclamp.min",
10955 .flags = CFTYPE_NOT_ON_ROOT,
10956 .seq_show = cpu_uclamp_min_show,
10957 .write = cpu_uclamp_min_write,
10958 },
10959 {
10960 .name = "uclamp.max",
10961 .flags = CFTYPE_NOT_ON_ROOT,
10962 .seq_show = cpu_uclamp_max_show,
10963 .write = cpu_uclamp_max_write,
10964 },
10965 #endif
10966 { }
10967 };
10968
10969 static int cpu_extra_stat_show(struct seq_file *sf,
10970 struct cgroup_subsys_state *css)
10971 {
10972 #ifdef CONFIG_CFS_BANDWIDTH
10973 {
10974 struct task_group *tg = css_tg(css);
10975 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10976 u64 throttled_usec, burst_usec;
10977
10978 throttled_usec = cfs_b->throttled_time;
10979 do_div(throttled_usec, NSEC_PER_USEC);
10980 burst_usec = cfs_b->burst_time;
10981 do_div(burst_usec, NSEC_PER_USEC);
10982
10983 seq_printf(sf, "nr_periods %d\n"
10984 "nr_throttled %d\n"
10985 "throttled_usec %llu\n"
10986 "nr_bursts %d\n"
10987 "burst_usec %llu\n",
10988 cfs_b->nr_periods, cfs_b->nr_throttled,
10989 throttled_usec, cfs_b->nr_burst, burst_usec);
10990 }
10991 #endif
10992 return 0;
10993 }
10994
10995 #ifdef CONFIG_FAIR_GROUP_SCHED
10996 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
10997 struct cftype *cft)
10998 {
10999 struct task_group *tg = css_tg(css);
11000 u64 weight = scale_load_down(tg->shares);
11001
11002 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
11003 }
11004
11005 static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
11006 struct cftype *cft, u64 weight)
11007 {
11008
11009
11010
11011
11012
11013
11014
11015 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
11016 return -ERANGE;
11017
11018 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
11019
11020 return sched_group_set_shares(css_tg(css), scale_load(weight));
11021 }
11022
11023 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
11024 struct cftype *cft)
11025 {
11026 unsigned long weight = scale_load_down(css_tg(css)->shares);
11027 int last_delta = INT_MAX;
11028 int prio, delta;
11029
11030
11031 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
11032 delta = abs(sched_prio_to_weight[prio] - weight);
11033 if (delta >= last_delta)
11034 break;
11035 last_delta = delta;
11036 }
11037
11038 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
11039 }
11040
11041 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
11042 struct cftype *cft, s64 nice)
11043 {
11044 unsigned long weight;
11045 int idx;
11046
11047 if (nice < MIN_NICE || nice > MAX_NICE)
11048 return -ERANGE;
11049
11050 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
11051 idx = array_index_nospec(idx, 40);
11052 weight = sched_prio_to_weight[idx];
11053
11054 return sched_group_set_shares(css_tg(css), scale_load(weight));
11055 }
11056 #endif
11057
11058 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
11059 long period, long quota)
11060 {
11061 if (quota < 0)
11062 seq_puts(sf, "max");
11063 else
11064 seq_printf(sf, "%ld", quota);
11065
11066 seq_printf(sf, " %ld\n", period);
11067 }
11068
11069
11070 static int __maybe_unused cpu_period_quota_parse(char *buf,
11071 u64 *periodp, u64 *quotap)
11072 {
11073 char tok[21];
11074
11075 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
11076 return -EINVAL;
11077
11078 *periodp *= NSEC_PER_USEC;
11079
11080 if (sscanf(tok, "%llu", quotap))
11081 *quotap *= NSEC_PER_USEC;
11082 else if (!strcmp(tok, "max"))
11083 *quotap = RUNTIME_INF;
11084 else
11085 return -EINVAL;
11086
11087 return 0;
11088 }
11089
11090 #ifdef CONFIG_CFS_BANDWIDTH
11091 static int cpu_max_show(struct seq_file *sf, void *v)
11092 {
11093 struct task_group *tg = css_tg(seq_css(sf));
11094
11095 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
11096 return 0;
11097 }
11098
11099 static ssize_t cpu_max_write(struct kernfs_open_file *of,
11100 char *buf, size_t nbytes, loff_t off)
11101 {
11102 struct task_group *tg = css_tg(of_css(of));
11103 u64 period = tg_get_cfs_period(tg);
11104 u64 burst = tg_get_cfs_burst(tg);
11105 u64 quota;
11106 int ret;
11107
11108 ret = cpu_period_quota_parse(buf, &period, &quota);
11109 if (!ret)
11110 ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
11111 return ret ?: nbytes;
11112 }
11113 #endif
11114
11115 static struct cftype cpu_files[] = {
11116 #ifdef CONFIG_FAIR_GROUP_SCHED
11117 {
11118 .name = "weight",
11119 .flags = CFTYPE_NOT_ON_ROOT,
11120 .read_u64 = cpu_weight_read_u64,
11121 .write_u64 = cpu_weight_write_u64,
11122 },
11123 {
11124 .name = "weight.nice",
11125 .flags = CFTYPE_NOT_ON_ROOT,
11126 .read_s64 = cpu_weight_nice_read_s64,
11127 .write_s64 = cpu_weight_nice_write_s64,
11128 },
11129 {
11130 .name = "idle",
11131 .flags = CFTYPE_NOT_ON_ROOT,
11132 .read_s64 = cpu_idle_read_s64,
11133 .write_s64 = cpu_idle_write_s64,
11134 },
11135 #endif
11136 #ifdef CONFIG_CFS_BANDWIDTH
11137 {
11138 .name = "max",
11139 .flags = CFTYPE_NOT_ON_ROOT,
11140 .seq_show = cpu_max_show,
11141 .write = cpu_max_write,
11142 },
11143 {
11144 .name = "max.burst",
11145 .flags = CFTYPE_NOT_ON_ROOT,
11146 .read_u64 = cpu_cfs_burst_read_u64,
11147 .write_u64 = cpu_cfs_burst_write_u64,
11148 },
11149 #endif
11150 #ifdef CONFIG_UCLAMP_TASK_GROUP
11151 {
11152 .name = "uclamp.min",
11153 .flags = CFTYPE_NOT_ON_ROOT,
11154 .seq_show = cpu_uclamp_min_show,
11155 .write = cpu_uclamp_min_write,
11156 },
11157 {
11158 .name = "uclamp.max",
11159 .flags = CFTYPE_NOT_ON_ROOT,
11160 .seq_show = cpu_uclamp_max_show,
11161 .write = cpu_uclamp_max_write,
11162 },
11163 #endif
11164 { }
11165 };
11166
11167 struct cgroup_subsys cpu_cgrp_subsys = {
11168 .css_alloc = cpu_cgroup_css_alloc,
11169 .css_online = cpu_cgroup_css_online,
11170 .css_released = cpu_cgroup_css_released,
11171 .css_free = cpu_cgroup_css_free,
11172 .css_extra_stat_show = cpu_extra_stat_show,
11173 .fork = cpu_cgroup_fork,
11174 .can_attach = cpu_cgroup_can_attach,
11175 .attach = cpu_cgroup_attach,
11176 .legacy_cftypes = cpu_legacy_files,
11177 .dfl_cftypes = cpu_files,
11178 .early_init = true,
11179 .threaded = true,
11180 };
11181
11182 #endif
11183
11184 void dump_cpu_task(int cpu)
11185 {
11186 pr_info("Task dump for CPU %d:\n", cpu);
11187 sched_show_task(cpu_curr(cpu));
11188 }
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
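/*
 * Nice levels are multiplicative, with a gentle 10% change for every nice
 * level changed. I.e. when a CPU-bound task goes from nice 0 to nice 1 it
 * gets ~10% less CPU time than a task that stays at nice 0. Each step in
 * the table below therefore differs by a factor of ~1.25.
 */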
11202 const int sched_prio_to_weight[40] = {
11203 88761, 71755, 56483, 46273, 36291,
11204 29154, 23254, 18705, 14949, 11916,
11205 9548, 7620, 6100, 4904, 3906,
11206 3121, 2501, 1991, 1586, 1277,
11207 1024, 820, 655, 526, 423,
11208 335, 272, 215, 172, 137,
11209 110, 87, 70, 56, 45,
11210 36, 29, 23, 18, 15,
11211 };
11212
11213
11214
11215
11216
11217
11218
11219
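/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated
 * so that weight divisions can be turned into multiplications.
 */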
11220 const u32 sched_prio_to_wmult[40] = {
11221 48388, 59856, 76040, 92818, 118348,
11222 147320, 184698, 229616, 287308, 360437,
11223 449829, 563644, 704093, 875809, 1099582,
11224 1376151, 1717300, 2157191, 2708050, 3363326,
11225 4194304, 5237765, 6557202, 8165337, 10153587,
11226 12820798, 15790321, 19976592, 24970740, 31350126,
11227 39045157, 49367440, 61356676, 76695844, 95443717,
11228 119304647, 148102320, 186737708, 238609294, 286331153,
11229 };
11230
11231 void call_trace_sched_update_nr_running(struct rq *rq, int count)
11232 {
11233 trace_sched_update_nr_running_tp(rq, count);
11234 }