0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023 #include <linux/energy_model.h>
0024 #include <linux/mmap_lock.h>
0025 #include <linux/hugetlb_inline.h>
0026 #include <linux/jiffies.h>
0027 #include <linux/mm_api.h>
0028 #include <linux/highmem.h>
0029 #include <linux/spinlock_api.h>
0030 #include <linux/cpumask_api.h>
0031 #include <linux/lockdep_api.h>
0032 #include <linux/softirq.h>
0033 #include <linux/refcount_api.h>
0034 #include <linux/topology.h>
0035 #include <linux/sched/clock.h>
0036 #include <linux/sched/cond_resched.h>
0037 #include <linux/sched/cputime.h>
0038 #include <linux/sched/isolation.h>
0039 #include <linux/sched/nohz.h>
0040
0041 #include <linux/cpuidle.h>
0042 #include <linux/interrupt.h>
0043 #include <linux/mempolicy.h>
0044 #include <linux/mutex_api.h>
0045 #include <linux/profile.h>
0046 #include <linux/psi.h>
0047 #include <linux/ratelimit.h>
0048 #include <linux/task_work.h>
0049
0050 #include <asm/switch_to.h>
0051
0052 #include <linux/sched/cond_resched.h>
0053
0054 #include "sched.h"
0055 #include "stats.h"
0056 #include "autogroup.h"
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071 unsigned int sysctl_sched_latency = 6000000ULL;
0072 static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085 unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
0086
0087
0088
0089
0090
0091
0092 unsigned int sysctl_sched_min_granularity = 750000ULL;
0093 static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
0094
0095
0096
0097
0098
0099
0100
0101 unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
0102
0103
0104
0105
0106 static unsigned int sched_nr_latency = 8;
0107
0108
0109
0110
0111
0112 unsigned int sysctl_sched_child_runs_first __read_mostly;
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
0124 static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
0125
0126 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
0127
0128 int sched_thermal_decay_shift;
0129 static int __init setup_sched_thermal_decay_shift(char *str)
0130 {
0131 int _shift = 0;
0132
0133 if (kstrtoint(str, 0, &_shift))
0134 pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
0135
0136 sched_thermal_decay_shift = clamp(_shift, 0, 10);
0137 return 1;
0138 }
0139 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
0140
0141 #ifdef CONFIG_SMP
0142
0143
0144
0145 int __weak arch_asym_cpu_priority(int cpu)
0146 {
0147 return -cpu;
0148 }
0149
0150
0151
0152
0153
0154
0155 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
0156
0157
0158
0159
0160
0161
0162
0163 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
0164 #endif
0165
0166 #ifdef CONFIG_CFS_BANDWIDTH
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177 static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
0178 #endif
0179
0180 #ifdef CONFIG_SYSCTL
0181 static struct ctl_table sched_fair_sysctls[] = {
0182 {
0183 .procname = "sched_child_runs_first",
0184 .data = &sysctl_sched_child_runs_first,
0185 .maxlen = sizeof(unsigned int),
0186 .mode = 0644,
0187 .proc_handler = proc_dointvec,
0188 },
0189 #ifdef CONFIG_CFS_BANDWIDTH
0190 {
0191 .procname = "sched_cfs_bandwidth_slice_us",
0192 .data = &sysctl_sched_cfs_bandwidth_slice,
0193 .maxlen = sizeof(unsigned int),
0194 .mode = 0644,
0195 .proc_handler = proc_dointvec_minmax,
0196 .extra1 = SYSCTL_ONE,
0197 },
0198 #endif
0199 {}
0200 };
0201
0202 static int __init sched_fair_sysctl_init(void)
0203 {
0204 register_sysctl_init("kernel", sched_fair_sysctls);
0205 return 0;
0206 }
0207 late_initcall(sched_fair_sysctl_init);
0208 #endif
0209
0210 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
0211 {
0212 lw->weight += inc;
0213 lw->inv_weight = 0;
0214 }
0215
0216 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
0217 {
0218 lw->weight -= dec;
0219 lw->inv_weight = 0;
0220 }
0221
0222 static inline void update_load_set(struct load_weight *lw, unsigned long w)
0223 {
0224 lw->weight = w;
0225 lw->inv_weight = 0;
0226 }
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236
0237 static unsigned int get_update_sysctl_factor(void)
0238 {
0239 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
0240 unsigned int factor;
0241
0242 switch (sysctl_sched_tunable_scaling) {
0243 case SCHED_TUNABLESCALING_NONE:
0244 factor = 1;
0245 break;
0246 case SCHED_TUNABLESCALING_LINEAR:
0247 factor = cpus;
0248 break;
0249 case SCHED_TUNABLESCALING_LOG:
0250 default:
0251 factor = 1 + ilog2(cpus);
0252 break;
0253 }
0254
0255 return factor;
0256 }
0257
0258 static void update_sysctl(void)
0259 {
0260 unsigned int factor = get_update_sysctl_factor();
0261
0262 #define SET_SYSCTL(name) \
0263 (sysctl_##name = (factor) * normalized_sysctl_##name)
0264 SET_SYSCTL(sched_min_granularity);
0265 SET_SYSCTL(sched_latency);
0266 SET_SYSCTL(sched_wakeup_granularity);
0267 #undef SET_SYSCTL
0268 }
0269
0270 void __init sched_init_granularity(void)
0271 {
0272 update_sysctl();
0273 }
0274
0275 #define WMULT_CONST (~0U)
0276 #define WMULT_SHIFT 32
0277
0278 static void __update_inv_weight(struct load_weight *lw)
0279 {
0280 unsigned long w;
0281
0282 if (likely(lw->inv_weight))
0283 return;
0284
0285 w = scale_load_down(lw->weight);
0286
0287 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
0288 lw->inv_weight = 1;
0289 else if (unlikely(!w))
0290 lw->inv_weight = WMULT_CONST;
0291 else
0292 lw->inv_weight = WMULT_CONST / w;
0293 }
0294
0295
0296
0297
0298
0299
0300
0301
0302
0303
0304
0305
0306
0307 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
0308 {
0309 u64 fact = scale_load_down(weight);
0310 u32 fact_hi = (u32)(fact >> 32);
0311 int shift = WMULT_SHIFT;
0312 int fs;
0313
0314 __update_inv_weight(lw);
0315
0316 if (unlikely(fact_hi)) {
0317 fs = fls(fact_hi);
0318 shift -= fs;
0319 fact >>= fs;
0320 }
0321
0322 fact = mul_u32_u32(fact, lw->inv_weight);
0323
0324 fact_hi = (u32)(fact >> 32);
0325 if (fact_hi) {
0326 fs = fls(fact_hi);
0327 shift -= fs;
0328 fact >>= fs;
0329 }
0330
0331 return mul_u64_u32_shr(delta_exec, fact, shift);
0332 }
0333
0334
0335 const struct sched_class fair_sched_class;
0336
0337
0338
0339
0340
0341 #ifdef CONFIG_FAIR_GROUP_SCHED
0342
0343
0344 #define for_each_sched_entity(se) \
0345 for (; se; se = se->parent)
0346
0347 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
0348 {
0349 struct rq *rq = rq_of(cfs_rq);
0350 int cpu = cpu_of(rq);
0351
0352 if (cfs_rq->on_list)
0353 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
0354
0355 cfs_rq->on_list = 1;
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366 if (cfs_rq->tg->parent &&
0367 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
0368
0369
0370
0371
0372
0373
0374 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
0375 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
0376
0377
0378
0379
0380
0381 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
0382 return true;
0383 }
0384
0385 if (!cfs_rq->tg->parent) {
0386
0387
0388
0389
0390 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
0391 &rq->leaf_cfs_rq_list);
0392
0393
0394
0395
0396 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
0397 return true;
0398 }
0399
0400
0401
0402
0403
0404
0405
0406 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
0407
0408
0409
0410
0411 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
0412 return false;
0413 }
0414
0415 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
0416 {
0417 if (cfs_rq->on_list) {
0418 struct rq *rq = rq_of(cfs_rq);
0419
0420
0421
0422
0423
0424
0425
0426
0427 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
0428 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
0429
0430 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
0431 cfs_rq->on_list = 0;
0432 }
0433 }
0434
0435 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
0436 {
0437 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
0438 }
0439
0440
0441 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
0442 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
0443 leaf_cfs_rq_list)
0444
0445
0446 static inline struct cfs_rq *
0447 is_same_group(struct sched_entity *se, struct sched_entity *pse)
0448 {
0449 if (se->cfs_rq == pse->cfs_rq)
0450 return se->cfs_rq;
0451
0452 return NULL;
0453 }
0454
0455 static inline struct sched_entity *parent_entity(struct sched_entity *se)
0456 {
0457 return se->parent;
0458 }
0459
0460 static void
0461 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
0462 {
0463 int se_depth, pse_depth;
0464
0465
0466
0467
0468
0469
0470
0471
0472
0473 se_depth = (*se)->depth;
0474 pse_depth = (*pse)->depth;
0475
0476 while (se_depth > pse_depth) {
0477 se_depth--;
0478 *se = parent_entity(*se);
0479 }
0480
0481 while (pse_depth > se_depth) {
0482 pse_depth--;
0483 *pse = parent_entity(*pse);
0484 }
0485
0486 while (!is_same_group(*se, *pse)) {
0487 *se = parent_entity(*se);
0488 *pse = parent_entity(*pse);
0489 }
0490 }
0491
0492 static int tg_is_idle(struct task_group *tg)
0493 {
0494 return tg->idle > 0;
0495 }
0496
0497 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
0498 {
0499 return cfs_rq->idle > 0;
0500 }
0501
0502 static int se_is_idle(struct sched_entity *se)
0503 {
0504 if (entity_is_task(se))
0505 return task_has_idle_policy(task_of(se));
0506 return cfs_rq_is_idle(group_cfs_rq(se));
0507 }
0508
0509 #else
0510
0511 #define for_each_sched_entity(se) \
0512 for (; se; se = NULL)
0513
0514 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
0515 {
0516 return true;
0517 }
0518
0519 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
0520 {
0521 }
0522
0523 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
0524 {
0525 }
0526
0527 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
0528 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
0529
0530 static inline struct sched_entity *parent_entity(struct sched_entity *se)
0531 {
0532 return NULL;
0533 }
0534
0535 static inline void
0536 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
0537 {
0538 }
0539
0540 static inline int tg_is_idle(struct task_group *tg)
0541 {
0542 return 0;
0543 }
0544
0545 static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
0546 {
0547 return 0;
0548 }
0549
0550 static int se_is_idle(struct sched_entity *se)
0551 {
0552 return 0;
0553 }
0554
0555 #endif
0556
0557 static __always_inline
0558 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
0559
0560
0561
0562
0563
0564 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
0565 {
0566 s64 delta = (s64)(vruntime - max_vruntime);
0567 if (delta > 0)
0568 max_vruntime = vruntime;
0569
0570 return max_vruntime;
0571 }
0572
0573 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
0574 {
0575 s64 delta = (s64)(vruntime - min_vruntime);
0576 if (delta < 0)
0577 min_vruntime = vruntime;
0578
0579 return min_vruntime;
0580 }
0581
0582 static inline bool entity_before(struct sched_entity *a,
0583 struct sched_entity *b)
0584 {
0585 return (s64)(a->vruntime - b->vruntime) < 0;
0586 }
0587
0588 #define __node_2_se(node) \
0589 rb_entry((node), struct sched_entity, run_node)
0590
0591 static void update_min_vruntime(struct cfs_rq *cfs_rq)
0592 {
0593 struct sched_entity *curr = cfs_rq->curr;
0594 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
0595
0596 u64 vruntime = cfs_rq->min_vruntime;
0597
0598 if (curr) {
0599 if (curr->on_rq)
0600 vruntime = curr->vruntime;
0601 else
0602 curr = NULL;
0603 }
0604
0605 if (leftmost) {
0606 struct sched_entity *se = __node_2_se(leftmost);
0607
0608 if (!curr)
0609 vruntime = se->vruntime;
0610 else
0611 vruntime = min_vruntime(vruntime, se->vruntime);
0612 }
0613
0614
0615 u64_u32_store(cfs_rq->min_vruntime,
0616 max_vruntime(cfs_rq->min_vruntime, vruntime));
0617 }
0618
0619 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
0620 {
0621 return entity_before(__node_2_se(a), __node_2_se(b));
0622 }
0623
0624
0625
0626
0627 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
0628 {
0629 rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
0630 }
0631
0632 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
0633 {
0634 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
0635 }
0636
0637 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
0638 {
0639 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
0640
0641 if (!left)
0642 return NULL;
0643
0644 return __node_2_se(left);
0645 }
0646
0647 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
0648 {
0649 struct rb_node *next = rb_next(&se->run_node);
0650
0651 if (!next)
0652 return NULL;
0653
0654 return __node_2_se(next);
0655 }
0656
0657 #ifdef CONFIG_SCHED_DEBUG
0658 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
0659 {
0660 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
0661
0662 if (!last)
0663 return NULL;
0664
0665 return __node_2_se(last);
0666 }
0667
0668
0669
0670
0671
0672 int sched_update_scaling(void)
0673 {
0674 unsigned int factor = get_update_sysctl_factor();
0675
0676 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
0677 sysctl_sched_min_granularity);
0678
0679 #define WRT_SYSCTL(name) \
0680 (normalized_sysctl_##name = sysctl_##name / (factor))
0681 WRT_SYSCTL(sched_min_granularity);
0682 WRT_SYSCTL(sched_latency);
0683 WRT_SYSCTL(sched_wakeup_granularity);
0684 #undef WRT_SYSCTL
0685
0686 return 0;
0687 }
0688 #endif
0689
0690
0691
0692
0693 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
0694 {
0695 if (unlikely(se->load.weight != NICE_0_LOAD))
0696 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
0697
0698 return delta;
0699 }
0700
0701
0702
0703
0704
0705
0706
0707
0708
0709 static u64 __sched_period(unsigned long nr_running)
0710 {
0711 if (unlikely(nr_running > sched_nr_latency))
0712 return nr_running * sysctl_sched_min_granularity;
0713 else
0714 return sysctl_sched_latency;
0715 }
0716
0717 static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
0718
0719
0720
0721
0722
0723
0724
0725 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
0726 {
0727 unsigned int nr_running = cfs_rq->nr_running;
0728 struct sched_entity *init_se = se;
0729 unsigned int min_gran;
0730 u64 slice;
0731
0732 if (sched_feat(ALT_PERIOD))
0733 nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
0734
0735 slice = __sched_period(nr_running + !se->on_rq);
0736
0737 for_each_sched_entity(se) {
0738 struct load_weight *load;
0739 struct load_weight lw;
0740 struct cfs_rq *qcfs_rq;
0741
0742 qcfs_rq = cfs_rq_of(se);
0743 load = &qcfs_rq->load;
0744
0745 if (unlikely(!se->on_rq)) {
0746 lw = qcfs_rq->load;
0747
0748 update_load_add(&lw, se->load.weight);
0749 load = &lw;
0750 }
0751 slice = __calc_delta(slice, se->load.weight, load);
0752 }
0753
0754 if (sched_feat(BASE_SLICE)) {
0755 if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
0756 min_gran = sysctl_sched_idle_min_granularity;
0757 else
0758 min_gran = sysctl_sched_min_granularity;
0759
0760 slice = max_t(u64, slice, min_gran);
0761 }
0762
0763 return slice;
0764 }
0765
0766
0767
0768
0769
0770
0771 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
0772 {
0773 return calc_delta_fair(sched_slice(cfs_rq, se), se);
0774 }
0775
0776 #include "pelt.h"
0777 #ifdef CONFIG_SMP
0778
0779 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
0780 static unsigned long task_h_load(struct task_struct *p);
0781 static unsigned long capacity_of(int cpu);
0782
0783
0784 void init_entity_runnable_average(struct sched_entity *se)
0785 {
0786 struct sched_avg *sa = &se->avg;
0787
0788 memset(sa, 0, sizeof(*sa));
0789
0790
0791
0792
0793
0794
0795
0796 if (entity_is_task(se))
0797 sa->load_avg = scale_load_down(se->load.weight);
0798
0799
0800 }
0801
0802 static void attach_entity_cfs_rq(struct sched_entity *se);
0803
0804
0805
0806
0807
0808
0809
0810
0811
0812
0813
0814
0815
0816
0817
0818
0819
0820
0821
0822
0823
0824
0825
0826
0827
0828
0829
0830 void post_init_entity_util_avg(struct task_struct *p)
0831 {
0832 struct sched_entity *se = &p->se;
0833 struct cfs_rq *cfs_rq = cfs_rq_of(se);
0834 struct sched_avg *sa = &se->avg;
0835 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
0836 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
0837
0838 if (cap > 0) {
0839 if (cfs_rq->avg.util_avg != 0) {
0840 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
0841 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
0842
0843 if (sa->util_avg > cap)
0844 sa->util_avg = cap;
0845 } else {
0846 sa->util_avg = cap;
0847 }
0848 }
0849
0850 sa->runnable_avg = sa->util_avg;
0851
0852 if (p->sched_class != &fair_sched_class) {
0853
0854
0855
0856
0857
0858
0859
0860
0861
0862
0863 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
0864 return;
0865 }
0866
0867 attach_entity_cfs_rq(se);
0868 }
0869
0870 #else
0871 void init_entity_runnable_average(struct sched_entity *se)
0872 {
0873 }
0874 void post_init_entity_util_avg(struct task_struct *p)
0875 {
0876 }
0877 static void update_tg_load_avg(struct cfs_rq *cfs_rq)
0878 {
0879 }
0880 #endif
0881
0882
0883
0884
0885 static void update_curr(struct cfs_rq *cfs_rq)
0886 {
0887 struct sched_entity *curr = cfs_rq->curr;
0888 u64 now = rq_clock_task(rq_of(cfs_rq));
0889 u64 delta_exec;
0890
0891 if (unlikely(!curr))
0892 return;
0893
0894 delta_exec = now - curr->exec_start;
0895 if (unlikely((s64)delta_exec <= 0))
0896 return;
0897
0898 curr->exec_start = now;
0899
0900 if (schedstat_enabled()) {
0901 struct sched_statistics *stats;
0902
0903 stats = __schedstats_from_se(curr);
0904 __schedstat_set(stats->exec_max,
0905 max(delta_exec, stats->exec_max));
0906 }
0907
0908 curr->sum_exec_runtime += delta_exec;
0909 schedstat_add(cfs_rq->exec_clock, delta_exec);
0910
0911 curr->vruntime += calc_delta_fair(delta_exec, curr);
0912 update_min_vruntime(cfs_rq);
0913
0914 if (entity_is_task(curr)) {
0915 struct task_struct *curtask = task_of(curr);
0916
0917 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
0918 cgroup_account_cputime(curtask, delta_exec);
0919 account_group_exec_runtime(curtask, delta_exec);
0920 }
0921
0922 account_cfs_rq_runtime(cfs_rq, delta_exec);
0923 }
0924
0925 static void update_curr_fair(struct rq *rq)
0926 {
0927 update_curr(cfs_rq_of(&rq->curr->se));
0928 }
0929
0930 static inline void
0931 update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
0932 {
0933 struct sched_statistics *stats;
0934 struct task_struct *p = NULL;
0935
0936 if (!schedstat_enabled())
0937 return;
0938
0939 stats = __schedstats_from_se(se);
0940
0941 if (entity_is_task(se))
0942 p = task_of(se);
0943
0944 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
0945 }
0946
0947 static inline void
0948 update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
0949 {
0950 struct sched_statistics *stats;
0951 struct task_struct *p = NULL;
0952
0953 if (!schedstat_enabled())
0954 return;
0955
0956 stats = __schedstats_from_se(se);
0957
0958
0959
0960
0961
0962
0963
0964 if (unlikely(!schedstat_val(stats->wait_start)))
0965 return;
0966
0967 if (entity_is_task(se))
0968 p = task_of(se);
0969
0970 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
0971 }
0972
0973 static inline void
0974 update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
0975 {
0976 struct sched_statistics *stats;
0977 struct task_struct *tsk = NULL;
0978
0979 if (!schedstat_enabled())
0980 return;
0981
0982 stats = __schedstats_from_se(se);
0983
0984 if (entity_is_task(se))
0985 tsk = task_of(se);
0986
0987 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
0988 }
0989
0990
0991
0992
0993 static inline void
0994 update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
0995 {
0996 if (!schedstat_enabled())
0997 return;
0998
0999
1000
1001
1002
1003 if (se != cfs_rq->curr)
1004 update_stats_wait_start_fair(cfs_rq, se);
1005
1006 if (flags & ENQUEUE_WAKEUP)
1007 update_stats_enqueue_sleeper_fair(cfs_rq, se);
1008 }
1009
1010 static inline void
1011 update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1012 {
1013
1014 if (!schedstat_enabled())
1015 return;
1016
1017
1018
1019
1020
1021 if (se != cfs_rq->curr)
1022 update_stats_wait_end_fair(cfs_rq, se);
1023
1024 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1025 struct task_struct *tsk = task_of(se);
1026 unsigned int state;
1027
1028
1029 state = READ_ONCE(tsk->__state);
1030 if (state & TASK_INTERRUPTIBLE)
1031 __schedstat_set(tsk->stats.sleep_start,
1032 rq_clock(rq_of(cfs_rq)));
1033 if (state & TASK_UNINTERRUPTIBLE)
1034 __schedstat_set(tsk->stats.block_start,
1035 rq_clock(rq_of(cfs_rq)));
1036 }
1037 }
1038
1039
1040
1041
1042 static inline void
1043 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1044 {
1045
1046
1047
1048 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1049 }
1050
1051
1052
1053
1054
1055 #ifdef CONFIG_NUMA
1056 #define NUMA_IMBALANCE_MIN 2
1057
1058 static inline long
1059 adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1060 {
1061
1062
1063
1064
1065
1066
1067
1068 if (dst_running > imb_numa_nr)
1069 return imbalance;
1070
1071
1072
1073
1074
1075 if (imbalance <= NUMA_IMBALANCE_MIN)
1076 return 0;
1077
1078 return imbalance;
1079 }
1080 #endif
1081
1082 #ifdef CONFIG_NUMA_BALANCING
1083
1084
1085
1086
1087
1088 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1089 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1090
1091
1092 unsigned int sysctl_numa_balancing_scan_size = 256;
1093
1094
1095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1096
1097 struct numa_group {
1098 refcount_t refcount;
1099
1100 spinlock_t lock;
1101 int nr_tasks;
1102 pid_t gid;
1103 int active_nodes;
1104
1105 struct rcu_head rcu;
1106 unsigned long total_faults;
1107 unsigned long max_faults_cpu;
1108
1109
1110
1111
1112
1113
1114
1115 unsigned long faults[];
1116 };
1117
1118
1119
1120
1121
1122 static struct numa_group *deref_task_numa_group(struct task_struct *p)
1123 {
1124 return rcu_dereference_check(p->numa_group, p == current ||
1125 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1126 }
1127
1128 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1129 {
1130 return rcu_dereference_protected(p->numa_group, p == current);
1131 }
1132
1133 static inline unsigned long group_faults_priv(struct numa_group *ng);
1134 static inline unsigned long group_faults_shared(struct numa_group *ng);
1135
1136 static unsigned int task_nr_scan_windows(struct task_struct *p)
1137 {
1138 unsigned long rss = 0;
1139 unsigned long nr_scan_pages;
1140
1141
1142
1143
1144
1145
1146 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1147 rss = get_mm_rss(p->mm);
1148 if (!rss)
1149 rss = nr_scan_pages;
1150
1151 rss = round_up(rss, nr_scan_pages);
1152 return rss / nr_scan_pages;
1153 }
1154
1155
1156 #define MAX_SCAN_WINDOW 2560
1157
1158 static unsigned int task_scan_min(struct task_struct *p)
1159 {
1160 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1161 unsigned int scan, floor;
1162 unsigned int windows = 1;
1163
1164 if (scan_size < MAX_SCAN_WINDOW)
1165 windows = MAX_SCAN_WINDOW / scan_size;
1166 floor = 1000 / windows;
1167
1168 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1169 return max_t(unsigned int, floor, scan);
1170 }
1171
1172 static unsigned int task_scan_start(struct task_struct *p)
1173 {
1174 unsigned long smin = task_scan_min(p);
1175 unsigned long period = smin;
1176 struct numa_group *ng;
1177
1178
1179 rcu_read_lock();
1180 ng = rcu_dereference(p->numa_group);
1181 if (ng) {
1182 unsigned long shared = group_faults_shared(ng);
1183 unsigned long private = group_faults_priv(ng);
1184
1185 period *= refcount_read(&ng->refcount);
1186 period *= shared + 1;
1187 period /= private + shared + 1;
1188 }
1189 rcu_read_unlock();
1190
1191 return max(smin, period);
1192 }
1193
1194 static unsigned int task_scan_max(struct task_struct *p)
1195 {
1196 unsigned long smin = task_scan_min(p);
1197 unsigned long smax;
1198 struct numa_group *ng;
1199
1200
1201 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1202
1203
1204 ng = deref_curr_numa_group(p);
1205 if (ng) {
1206 unsigned long shared = group_faults_shared(ng);
1207 unsigned long private = group_faults_priv(ng);
1208 unsigned long period = smax;
1209
1210 period *= refcount_read(&ng->refcount);
1211 period *= shared + 1;
1212 period /= private + shared + 1;
1213
1214 smax = max(smax, period);
1215 }
1216
1217 return max(smin, smax);
1218 }
1219
1220 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1221 {
1222 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1223 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1224 }
1225
1226 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1227 {
1228 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1229 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1230 }
1231
1232
1233 #define NR_NUMA_HINT_FAULT_TYPES 2
1234
1235
1236 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1237
1238
1239 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1240
1241 pid_t task_numa_group_id(struct task_struct *p)
1242 {
1243 struct numa_group *ng;
1244 pid_t gid = 0;
1245
1246 rcu_read_lock();
1247 ng = rcu_dereference(p->numa_group);
1248 if (ng)
1249 gid = ng->gid;
1250 rcu_read_unlock();
1251
1252 return gid;
1253 }
1254
1255
1256
1257
1258
1259
1260
1261 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1262 {
1263 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1264 }
1265
1266 static inline unsigned long task_faults(struct task_struct *p, int nid)
1267 {
1268 if (!p->numa_faults)
1269 return 0;
1270
1271 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1272 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1273 }
1274
1275 static inline unsigned long group_faults(struct task_struct *p, int nid)
1276 {
1277 struct numa_group *ng = deref_task_numa_group(p);
1278
1279 if (!ng)
1280 return 0;
1281
1282 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1283 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1284 }
1285
1286 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1287 {
1288 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1289 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1290 }
1291
1292 static inline unsigned long group_faults_priv(struct numa_group *ng)
1293 {
1294 unsigned long faults = 0;
1295 int node;
1296
1297 for_each_online_node(node) {
1298 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1299 }
1300
1301 return faults;
1302 }
1303
1304 static inline unsigned long group_faults_shared(struct numa_group *ng)
1305 {
1306 unsigned long faults = 0;
1307 int node;
1308
1309 for_each_online_node(node) {
1310 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1311 }
1312
1313 return faults;
1314 }
1315
1316
1317
1318
1319
1320
1321 #define ACTIVE_NODE_FRACTION 3
1322
1323 static bool numa_is_active_node(int nid, struct numa_group *ng)
1324 {
1325 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1326 }
1327
1328
1329 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1330 int lim_dist, bool task)
1331 {
1332 unsigned long score = 0;
1333 int node, max_dist;
1334
1335
1336
1337
1338
1339 if (sched_numa_topology_type == NUMA_DIRECT)
1340 return 0;
1341
1342
1343 max_dist = READ_ONCE(sched_max_numa_distance);
1344
1345
1346
1347
1348 for_each_online_node(node) {
1349 unsigned long faults;
1350 int dist = node_distance(nid, node);
1351
1352
1353
1354
1355
1356 if (dist >= max_dist || node == nid)
1357 continue;
1358
1359
1360
1361
1362
1363
1364
1365
1366 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1367 continue;
1368
1369
1370 if (task)
1371 faults = task_faults(p, node);
1372 else
1373 faults = group_faults(p, node);
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1384 faults *= (max_dist - dist);
1385 faults /= (max_dist - LOCAL_DISTANCE);
1386 }
1387
1388 score += faults;
1389 }
1390
1391 return score;
1392 }
1393
1394
1395
1396
1397
1398
1399
1400 static inline unsigned long task_weight(struct task_struct *p, int nid,
1401 int dist)
1402 {
1403 unsigned long faults, total_faults;
1404
1405 if (!p->numa_faults)
1406 return 0;
1407
1408 total_faults = p->total_numa_faults;
1409
1410 if (!total_faults)
1411 return 0;
1412
1413 faults = task_faults(p, nid);
1414 faults += score_nearby_nodes(p, nid, dist, true);
1415
1416 return 1000 * faults / total_faults;
1417 }
1418
1419 static inline unsigned long group_weight(struct task_struct *p, int nid,
1420 int dist)
1421 {
1422 struct numa_group *ng = deref_task_numa_group(p);
1423 unsigned long faults, total_faults;
1424
1425 if (!ng)
1426 return 0;
1427
1428 total_faults = ng->total_faults;
1429
1430 if (!total_faults)
1431 return 0;
1432
1433 faults = group_faults(p, nid);
1434 faults += score_nearby_nodes(p, nid, dist, false);
1435
1436 return 1000 * faults / total_faults;
1437 }
1438
1439 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1440 int src_nid, int dst_cpu)
1441 {
1442 struct numa_group *ng = deref_curr_numa_group(p);
1443 int dst_nid = cpu_to_node(dst_cpu);
1444 int last_cpupid, this_cpupid;
1445
1446 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1447 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1448
1449
1450
1451
1452
1453
1454
1455 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1456 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1457 return true;
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476 if (!cpupid_pid_unset(last_cpupid) &&
1477 cpupid_to_nid(last_cpupid) != dst_nid)
1478 return false;
1479
1480
1481 if (cpupid_match_pid(p, last_cpupid))
1482 return true;
1483
1484
1485 if (!ng)
1486 return true;
1487
1488
1489
1490
1491
1492 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1493 ACTIVE_NODE_FRACTION)
1494 return true;
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1505 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1506 }
1507
1508
1509
1510
1511 enum numa_type {
1512
1513 node_has_spare = 0,
1514
1515
1516
1517
1518 node_fully_busy,
1519
1520
1521
1522
1523 node_overloaded
1524 };
1525
1526
1527 struct numa_stats {
1528 unsigned long load;
1529 unsigned long runnable;
1530 unsigned long util;
1531
1532 unsigned long compute_capacity;
1533 unsigned int nr_running;
1534 unsigned int weight;
1535 enum numa_type node_type;
1536 int idle_cpu;
1537 };
1538
1539 static inline bool is_core_idle(int cpu)
1540 {
1541 #ifdef CONFIG_SCHED_SMT
1542 int sibling;
1543
1544 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1545 if (cpu == sibling)
1546 continue;
1547
1548 if (!idle_cpu(sibling))
1549 return false;
1550 }
1551 #endif
1552
1553 return true;
1554 }
1555
1556 struct task_numa_env {
1557 struct task_struct *p;
1558
1559 int src_cpu, src_nid;
1560 int dst_cpu, dst_nid;
1561 int imb_numa_nr;
1562
1563 struct numa_stats src_stats, dst_stats;
1564
1565 int imbalance_pct;
1566 int dist;
1567
1568 struct task_struct *best_task;
1569 long best_imp;
1570 int best_cpu;
1571 };
1572
1573 static unsigned long cpu_load(struct rq *rq);
1574 static unsigned long cpu_runnable(struct rq *rq);
1575
1576 static inline enum
1577 numa_type numa_classify(unsigned int imbalance_pct,
1578 struct numa_stats *ns)
1579 {
1580 if ((ns->nr_running > ns->weight) &&
1581 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1582 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1583 return node_overloaded;
1584
1585 if ((ns->nr_running < ns->weight) ||
1586 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1587 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1588 return node_has_spare;
1589
1590 return node_fully_busy;
1591 }
1592
1593 #ifdef CONFIG_SCHED_SMT
1594
1595 static inline bool test_idle_cores(int cpu, bool def);
1596 static inline int numa_idle_core(int idle_core, int cpu)
1597 {
1598 if (!static_branch_likely(&sched_smt_present) ||
1599 idle_core >= 0 || !test_idle_cores(cpu, false))
1600 return idle_core;
1601
1602
1603
1604
1605
1606 if (is_core_idle(cpu))
1607 idle_core = cpu;
1608
1609 return idle_core;
1610 }
1611 #else
1612 static inline int numa_idle_core(int idle_core, int cpu)
1613 {
1614 return idle_core;
1615 }
1616 #endif
1617
1618
1619
1620
1621
1622
1623
1624 static void update_numa_stats(struct task_numa_env *env,
1625 struct numa_stats *ns, int nid,
1626 bool find_idle)
1627 {
1628 int cpu, idle_core = -1;
1629
1630 memset(ns, 0, sizeof(*ns));
1631 ns->idle_cpu = -1;
1632
1633 rcu_read_lock();
1634 for_each_cpu(cpu, cpumask_of_node(nid)) {
1635 struct rq *rq = cpu_rq(cpu);
1636
1637 ns->load += cpu_load(rq);
1638 ns->runnable += cpu_runnable(rq);
1639 ns->util += cpu_util_cfs(cpu);
1640 ns->nr_running += rq->cfs.h_nr_running;
1641 ns->compute_capacity += capacity_of(cpu);
1642
1643 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1644 if (READ_ONCE(rq->numa_migrate_on) ||
1645 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1646 continue;
1647
1648 if (ns->idle_cpu == -1)
1649 ns->idle_cpu = cpu;
1650
1651 idle_core = numa_idle_core(idle_core, cpu);
1652 }
1653 }
1654 rcu_read_unlock();
1655
1656 ns->weight = cpumask_weight(cpumask_of_node(nid));
1657
1658 ns->node_type = numa_classify(env->imbalance_pct, ns);
1659
1660 if (idle_core >= 0)
1661 ns->idle_cpu = idle_core;
1662 }
1663
1664 static void task_numa_assign(struct task_numa_env *env,
1665 struct task_struct *p, long imp)
1666 {
1667 struct rq *rq = cpu_rq(env->dst_cpu);
1668
1669
1670 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1671 int cpu;
1672 int start = env->dst_cpu;
1673
1674
1675 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1676 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1677 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1678 continue;
1679 }
1680
1681 env->dst_cpu = cpu;
1682 rq = cpu_rq(env->dst_cpu);
1683 if (!xchg(&rq->numa_migrate_on, 1))
1684 goto assign;
1685 }
1686
1687
1688 return;
1689 }
1690
1691 assign:
1692
1693
1694
1695
1696 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1697 rq = cpu_rq(env->best_cpu);
1698 WRITE_ONCE(rq->numa_migrate_on, 0);
1699 }
1700
1701 if (env->best_task)
1702 put_task_struct(env->best_task);
1703 if (p)
1704 get_task_struct(p);
1705
1706 env->best_task = p;
1707 env->best_imp = imp;
1708 env->best_cpu = env->dst_cpu;
1709 }
1710
1711 static bool load_too_imbalanced(long src_load, long dst_load,
1712 struct task_numa_env *env)
1713 {
1714 long imb, old_imb;
1715 long orig_src_load, orig_dst_load;
1716 long src_capacity, dst_capacity;
1717
1718
1719
1720
1721
1722
1723
1724
1725 src_capacity = env->src_stats.compute_capacity;
1726 dst_capacity = env->dst_stats.compute_capacity;
1727
1728 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1729
1730 orig_src_load = env->src_stats.load;
1731 orig_dst_load = env->dst_stats.load;
1732
1733 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1734
1735
1736 return (imb > old_imb);
1737 }
1738
1739
1740
1741
1742
1743
1744 #define SMALLIMP 30
1745
1746
1747
1748
1749
1750
1751
1752 static bool task_numa_compare(struct task_numa_env *env,
1753 long taskimp, long groupimp, bool maymove)
1754 {
1755 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1756 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1757 long imp = p_ng ? groupimp : taskimp;
1758 struct task_struct *cur;
1759 long src_load, dst_load;
1760 int dist = env->dist;
1761 long moveimp = imp;
1762 long load;
1763 bool stopsearch = false;
1764
1765 if (READ_ONCE(dst_rq->numa_migrate_on))
1766 return false;
1767
1768 rcu_read_lock();
1769 cur = rcu_dereference(dst_rq->curr);
1770 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1771 cur = NULL;
1772
1773
1774
1775
1776
1777 if (cur == env->p) {
1778 stopsearch = true;
1779 goto unlock;
1780 }
1781
1782 if (!cur) {
1783 if (maymove && moveimp >= env->best_imp)
1784 goto assign;
1785 else
1786 goto unlock;
1787 }
1788
1789
1790 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1791 goto unlock;
1792
1793
1794
1795
1796
1797 if (env->best_task &&
1798 env->best_task->numa_preferred_nid == env->src_nid &&
1799 cur->numa_preferred_nid != env->src_nid) {
1800 goto unlock;
1801 }
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813 cur_ng = rcu_dereference(cur->numa_group);
1814 if (cur_ng == p_ng) {
1815
1816
1817
1818
1819
1820
1821 if (env->dst_stats.node_type == node_has_spare)
1822 goto unlock;
1823
1824 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1825 task_weight(cur, env->dst_nid, dist);
1826
1827
1828
1829
1830 if (cur_ng)
1831 imp -= imp / 16;
1832 } else {
1833
1834
1835
1836
1837 if (cur_ng && p_ng)
1838 imp += group_weight(cur, env->src_nid, dist) -
1839 group_weight(cur, env->dst_nid, dist);
1840 else
1841 imp += task_weight(cur, env->src_nid, dist) -
1842 task_weight(cur, env->dst_nid, dist);
1843 }
1844
1845
1846 if (cur->numa_preferred_nid == env->dst_nid)
1847 imp -= imp / 16;
1848
1849
1850
1851
1852
1853
1854
1855 if (cur->numa_preferred_nid == env->src_nid)
1856 imp += imp / 8;
1857
1858 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1859 imp = moveimp;
1860 cur = NULL;
1861 goto assign;
1862 }
1863
1864
1865
1866
1867
1868 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1869 env->best_task->numa_preferred_nid != env->src_nid) {
1870 goto assign;
1871 }
1872
1873
1874
1875
1876
1877
1878
1879 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1880 goto unlock;
1881
1882
1883
1884
1885 load = task_h_load(env->p) - task_h_load(cur);
1886 if (!load)
1887 goto assign;
1888
1889 dst_load = env->dst_stats.load + load;
1890 src_load = env->src_stats.load - load;
1891
1892 if (load_too_imbalanced(src_load, dst_load, env))
1893 goto unlock;
1894
1895 assign:
1896
1897 if (!cur) {
1898 int cpu = env->dst_stats.idle_cpu;
1899
1900
1901 if (cpu < 0)
1902 cpu = env->dst_cpu;
1903
1904
1905
1906
1907
1908 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1909 idle_cpu(env->best_cpu)) {
1910 cpu = env->best_cpu;
1911 }
1912
1913 env->dst_cpu = cpu;
1914 }
1915
1916 task_numa_assign(env, cur, imp);
1917
1918
1919
1920
1921
1922
1923 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1924 stopsearch = true;
1925
1926
1927
1928
1929
1930 if (!maymove && env->best_task &&
1931 env->best_task->numa_preferred_nid == env->src_nid) {
1932 stopsearch = true;
1933 }
1934 unlock:
1935 rcu_read_unlock();
1936
1937 return stopsearch;
1938 }
1939
1940 static void task_numa_find_cpu(struct task_numa_env *env,
1941 long taskimp, long groupimp)
1942 {
1943 bool maymove = false;
1944 int cpu;
1945
1946
1947
1948
1949
1950 if (env->dst_stats.node_type == node_has_spare) {
1951 unsigned int imbalance;
1952 int src_running, dst_running;
1953
1954
1955
1956
1957
1958
1959
1960 src_running = env->src_stats.nr_running - 1;
1961 dst_running = env->dst_stats.nr_running + 1;
1962 imbalance = max(0, dst_running - src_running);
1963 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1964 env->imb_numa_nr);
1965
1966
1967 if (!imbalance) {
1968 maymove = true;
1969 if (env->dst_stats.idle_cpu >= 0) {
1970 env->dst_cpu = env->dst_stats.idle_cpu;
1971 task_numa_assign(env, NULL, 0);
1972 return;
1973 }
1974 }
1975 } else {
1976 long src_load, dst_load, load;
1977
1978
1979
1980
1981 load = task_h_load(env->p);
1982 dst_load = env->dst_stats.load + load;
1983 src_load = env->src_stats.load - load;
1984 maymove = !load_too_imbalanced(src_load, dst_load, env);
1985 }
1986
1987 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1988
1989 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1990 continue;
1991
1992 env->dst_cpu = cpu;
1993 if (task_numa_compare(env, taskimp, groupimp, maymove))
1994 break;
1995 }
1996 }
1997
1998 static int task_numa_migrate(struct task_struct *p)
1999 {
2000 struct task_numa_env env = {
2001 .p = p,
2002
2003 .src_cpu = task_cpu(p),
2004 .src_nid = task_node(p),
2005
2006 .imbalance_pct = 112,
2007
2008 .best_task = NULL,
2009 .best_imp = 0,
2010 .best_cpu = -1,
2011 };
2012 unsigned long taskweight, groupweight;
2013 struct sched_domain *sd;
2014 long taskimp, groupimp;
2015 struct numa_group *ng;
2016 struct rq *best_rq;
2017 int nid, ret, dist;
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027 rcu_read_lock();
2028 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2029 if (sd) {
2030 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2031 env.imb_numa_nr = sd->imb_numa_nr;
2032 }
2033 rcu_read_unlock();
2034
2035
2036
2037
2038
2039
2040
2041 if (unlikely(!sd)) {
2042 sched_setnuma(p, task_node(p));
2043 return -EINVAL;
2044 }
2045
2046 env.dst_nid = p->numa_preferred_nid;
2047 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2048 taskweight = task_weight(p, env.src_nid, dist);
2049 groupweight = group_weight(p, env.src_nid, dist);
2050 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2051 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2052 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2053 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2054
2055
2056 task_numa_find_cpu(&env, taskimp, groupimp);
2057
2058
2059
2060
2061
2062
2063
2064
2065 ng = deref_curr_numa_group(p);
2066 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2067 for_each_node_state(nid, N_CPU) {
2068 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2069 continue;
2070
2071 dist = node_distance(env.src_nid, env.dst_nid);
2072 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2073 dist != env.dist) {
2074 taskweight = task_weight(p, env.src_nid, dist);
2075 groupweight = group_weight(p, env.src_nid, dist);
2076 }
2077
2078
2079 taskimp = task_weight(p, nid, dist) - taskweight;
2080 groupimp = group_weight(p, nid, dist) - groupweight;
2081 if (taskimp < 0 && groupimp < 0)
2082 continue;
2083
2084 env.dist = dist;
2085 env.dst_nid = nid;
2086 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2087 task_numa_find_cpu(&env, taskimp, groupimp);
2088 }
2089 }
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099 if (ng) {
2100 if (env.best_cpu == -1)
2101 nid = env.src_nid;
2102 else
2103 nid = cpu_to_node(env.best_cpu);
2104
2105 if (nid != p->numa_preferred_nid)
2106 sched_setnuma(p, nid);
2107 }
2108
2109
2110 if (env.best_cpu == -1) {
2111 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2112 return -EAGAIN;
2113 }
2114
2115 best_rq = cpu_rq(env.best_cpu);
2116 if (env.best_task == NULL) {
2117 ret = migrate_task_to(p, env.best_cpu);
2118 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2119 if (ret != 0)
2120 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2121 return ret;
2122 }
2123
2124 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2125 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2126
2127 if (ret != 0)
2128 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2129 put_task_struct(env.best_task);
2130 return ret;
2131 }
2132
2133
2134 static void numa_migrate_preferred(struct task_struct *p)
2135 {
2136 unsigned long interval = HZ;
2137
2138
2139 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2140 return;
2141
2142
2143 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2144 p->numa_migrate_retry = jiffies + interval;
2145
2146
2147 if (task_node(p) == p->numa_preferred_nid)
2148 return;
2149
2150
2151 task_numa_migrate(p);
2152 }
2153
2154
2155
2156
2157
2158
2159
2160 static void numa_group_count_active_nodes(struct numa_group *numa_group)
2161 {
2162 unsigned long faults, max_faults = 0;
2163 int nid, active_nodes = 0;
2164
2165 for_each_node_state(nid, N_CPU) {
2166 faults = group_faults_cpu(numa_group, nid);
2167 if (faults > max_faults)
2168 max_faults = faults;
2169 }
2170
2171 for_each_node_state(nid, N_CPU) {
2172 faults = group_faults_cpu(numa_group, nid);
2173 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2174 active_nodes++;
2175 }
2176
2177 numa_group->max_faults_cpu = max_faults;
2178 numa_group->active_nodes = active_nodes;
2179 }
2180
2181
2182
2183
2184
2185
2186
2187
2188 #define NUMA_PERIOD_SLOTS 10
2189 #define NUMA_PERIOD_THRESHOLD 7
2190
2191
2192
2193
2194
2195
2196
2197 static void update_task_scan_period(struct task_struct *p,
2198 unsigned long shared, unsigned long private)
2199 {
2200 unsigned int period_slot;
2201 int lr_ratio, ps_ratio;
2202 int diff;
2203
2204 unsigned long remote = p->numa_faults_locality[0];
2205 unsigned long local = p->numa_faults_locality[1];
2206
2207
2208
2209
2210
2211
2212
2213
2214 if (local + shared == 0 || p->numa_faults_locality[2]) {
2215 p->numa_scan_period = min(p->numa_scan_period_max,
2216 p->numa_scan_period << 1);
2217
2218 p->mm->numa_next_scan = jiffies +
2219 msecs_to_jiffies(p->numa_scan_period);
2220
2221 return;
2222 }
2223
2224
2225
2226
2227
2228
2229
2230 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2231 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2232 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2233
2234 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2235
2236
2237
2238
2239 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2240 if (!slot)
2241 slot = 1;
2242 diff = slot * period_slot;
2243 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2244
2245
2246
2247
2248
2249 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2250 if (!slot)
2251 slot = 1;
2252 diff = slot * period_slot;
2253 } else {
2254
2255
2256
2257
2258
2259 int ratio = max(lr_ratio, ps_ratio);
2260 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2261 }
2262
2263 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2264 task_scan_min(p), task_scan_max(p));
2265 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2266 }
2267
2268
2269
2270
2271
2272
2273
2274
2275 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2276 {
2277 u64 runtime, delta, now;
2278
2279 now = p->se.exec_start;
2280 runtime = p->se.sum_exec_runtime;
2281
2282 if (p->last_task_numa_placement) {
2283 delta = runtime - p->last_sum_exec_runtime;
2284 *period = now - p->last_task_numa_placement;
2285
2286
2287 if (unlikely((s64)*period < 0))
2288 *period = 0;
2289 } else {
2290 delta = p->se.avg.load_sum;
2291 *period = LOAD_AVG_MAX;
2292 }
2293
2294 p->last_sum_exec_runtime = runtime;
2295 p->last_task_numa_placement = now;
2296
2297 return delta;
2298 }
2299
2300
2301
2302
2303
2304
2305 static int preferred_group_nid(struct task_struct *p, int nid)
2306 {
2307 nodemask_t nodes;
2308 int dist;
2309
2310
2311 if (sched_numa_topology_type == NUMA_DIRECT)
2312 return nid;
2313
2314
2315
2316
2317
2318
2319 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2320 unsigned long score, max_score = 0;
2321 int node, max_node = nid;
2322
2323 dist = sched_max_numa_distance;
2324
2325 for_each_node_state(node, N_CPU) {
2326 score = group_weight(p, node, dist);
2327 if (score > max_score) {
2328 max_score = score;
2329 max_node = node;
2330 }
2331 }
2332 return max_node;
2333 }
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344 nodes = node_states[N_CPU];
2345 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2346 unsigned long max_faults = 0;
2347 nodemask_t max_group = NODE_MASK_NONE;
2348 int a, b;
2349
2350
2351 if (!find_numa_distance(dist))
2352 continue;
2353
2354 for_each_node_mask(a, nodes) {
2355 unsigned long faults = 0;
2356 nodemask_t this_group;
2357 nodes_clear(this_group);
2358
2359
2360 for_each_node_mask(b, nodes) {
2361 if (node_distance(a, b) < dist) {
2362 faults += group_faults(p, b);
2363 node_set(b, this_group);
2364 node_clear(b, nodes);
2365 }
2366 }
2367
2368
2369 if (faults > max_faults) {
2370 max_faults = faults;
2371 max_group = this_group;
2372
2373
2374
2375
2376
2377 nid = a;
2378 }
2379 }
2380
2381 if (!max_faults)
2382 break;
2383 nodes = max_group;
2384 }
2385 return nid;
2386 }
2387
2388 static void task_numa_placement(struct task_struct *p)
2389 {
2390 int seq, nid, max_nid = NUMA_NO_NODE;
2391 unsigned long max_faults = 0;
2392 unsigned long fault_types[2] = { 0, 0 };
2393 unsigned long total_faults;
2394 u64 runtime, period;
2395 spinlock_t *group_lock = NULL;
2396 struct numa_group *ng;
2397
2398
2399
2400
2401
2402
2403 seq = READ_ONCE(p->mm->numa_scan_seq);
2404 if (p->numa_scan_seq == seq)
2405 return;
2406 p->numa_scan_seq = seq;
2407 p->numa_scan_period_max = task_scan_max(p);
2408
2409 total_faults = p->numa_faults_locality[0] +
2410 p->numa_faults_locality[1];
2411 runtime = numa_get_avg_runtime(p, &period);
2412
2413
2414 ng = deref_curr_numa_group(p);
2415 if (ng) {
2416 group_lock = &ng->lock;
2417 spin_lock_irq(group_lock);
2418 }
2419
2420
2421 for_each_online_node(nid) {
2422
2423 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2424 unsigned long faults = 0, group_faults = 0;
2425 int priv;
2426
2427 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2428 long diff, f_diff, f_weight;
2429
2430 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2431 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2432 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2433 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2434
2435
2436 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2437 fault_types[priv] += p->numa_faults[membuf_idx];
2438 p->numa_faults[membuf_idx] = 0;
2439
2440
2441
2442
2443
2444
2445
2446
2447 f_weight = div64_u64(runtime << 16, period + 1);
2448 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2449 (total_faults + 1);
2450 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2451 p->numa_faults[cpubuf_idx] = 0;
2452
2453 p->numa_faults[mem_idx] += diff;
2454 p->numa_faults[cpu_idx] += f_diff;
2455 faults += p->numa_faults[mem_idx];
2456 p->total_numa_faults += diff;
2457 if (ng) {
2458
2459
2460
2461
2462
2463
2464
2465 ng->faults[mem_idx] += diff;
2466 ng->faults[cpu_idx] += f_diff;
2467 ng->total_faults += diff;
2468 group_faults += ng->faults[mem_idx];
2469 }
2470 }
2471
2472 if (!ng) {
2473 if (faults > max_faults) {
2474 max_faults = faults;
2475 max_nid = nid;
2476 }
2477 } else if (group_faults > max_faults) {
2478 max_faults = group_faults;
2479 max_nid = nid;
2480 }
2481 }
2482
2483
2484 if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
2485 int near_nid = max_nid;
2486 int distance, near_distance = INT_MAX;
2487
2488 for_each_node_state(nid, N_CPU) {
2489 distance = node_distance(max_nid, nid);
2490 if (distance < near_distance) {
2491 near_nid = nid;
2492 near_distance = distance;
2493 }
2494 }
2495 max_nid = near_nid;
2496 }
2497
2498 if (ng) {
2499 numa_group_count_active_nodes(ng);
2500 spin_unlock_irq(group_lock);
2501 max_nid = preferred_group_nid(p, max_nid);
2502 }
2503
2504 if (max_faults) {
2505
2506 if (max_nid != p->numa_preferred_nid)
2507 sched_setnuma(p, max_nid);
2508 }
2509
2510 update_task_scan_period(p, fault_types[0], fault_types[1]);
2511 }
2512
2513 static inline int get_numa_group(struct numa_group *grp)
2514 {
2515 return refcount_inc_not_zero(&grp->refcount);
2516 }
2517
2518 static inline void put_numa_group(struct numa_group *grp)
2519 {
2520 if (refcount_dec_and_test(&grp->refcount))
2521 kfree_rcu(grp, rcu);
2522 }
2523
2524 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2525 int *priv)
2526 {
2527 struct numa_group *grp, *my_grp;
2528 struct task_struct *tsk;
2529 bool join = false;
2530 int cpu = cpupid_to_cpu(cpupid);
2531 int i;
2532
2533 if (unlikely(!deref_curr_numa_group(p))) {
2534 unsigned int size = sizeof(struct numa_group) +
2535 NR_NUMA_HINT_FAULT_STATS *
2536 nr_node_ids * sizeof(unsigned long);
2537
2538 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2539 if (!grp)
2540 return;
2541
2542 refcount_set(&grp->refcount, 1);
2543 grp->active_nodes = 1;
2544 grp->max_faults_cpu = 0;
2545 spin_lock_init(&grp->lock);
2546 grp->gid = p->pid;
2547
2548 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2549 grp->faults[i] = p->numa_faults[i];
2550
2551 grp->total_faults = p->total_numa_faults;
2552
2553 grp->nr_tasks++;
2554 rcu_assign_pointer(p->numa_group, grp);
2555 }
2556
2557 rcu_read_lock();
2558 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2559
2560 if (!cpupid_match_pid(tsk, cpupid))
2561 goto no_join;
2562
2563 grp = rcu_dereference(tsk->numa_group);
2564 if (!grp)
2565 goto no_join;
2566
2567 my_grp = deref_curr_numa_group(p);
2568 if (grp == my_grp)
2569 goto no_join;
2570
2571
2572
2573
2574
2575 if (my_grp->nr_tasks > grp->nr_tasks)
2576 goto no_join;
2577
2578
2579
2580
2581 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2582 goto no_join;
2583
2584
2585 if (tsk->mm == current->mm)
2586 join = true;
2587
2588
2589 if (flags & TNF_SHARED)
2590 join = true;
2591
2592
2593 *priv = !join;
2594
2595 if (join && !get_numa_group(grp))
2596 goto no_join;
2597
2598 rcu_read_unlock();
2599
2600 if (!join)
2601 return;
2602
2603 BUG_ON(irqs_disabled());
2604 double_lock_irq(&my_grp->lock, &grp->lock);
2605
2606 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2607 my_grp->faults[i] -= p->numa_faults[i];
2608 grp->faults[i] += p->numa_faults[i];
2609 }
2610 my_grp->total_faults -= p->total_numa_faults;
2611 grp->total_faults += p->total_numa_faults;
2612
2613 my_grp->nr_tasks--;
2614 grp->nr_tasks++;
2615
2616 spin_unlock(&my_grp->lock);
2617 spin_unlock_irq(&grp->lock);
2618
2619 rcu_assign_pointer(p->numa_group, grp);
2620
2621 put_numa_group(my_grp);
2622 return;
2623
2624 no_join:
2625 rcu_read_unlock();
2626 return;
2627 }
2628
2629
2630
2631
2632
2633
2634
2635
2636 void task_numa_free(struct task_struct *p, bool final)
2637 {
2638
2639 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2640 unsigned long *numa_faults = p->numa_faults;
2641 unsigned long flags;
2642 int i;
2643
2644 if (!numa_faults)
2645 return;
2646
2647 if (grp) {
2648 spin_lock_irqsave(&grp->lock, flags);
2649 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2650 grp->faults[i] -= p->numa_faults[i];
2651 grp->total_faults -= p->total_numa_faults;
2652
2653 grp->nr_tasks--;
2654 spin_unlock_irqrestore(&grp->lock, flags);
2655 RCU_INIT_POINTER(p->numa_group, NULL);
2656 put_numa_group(grp);
2657 }
2658
2659 if (final) {
2660 p->numa_faults = NULL;
2661 kfree(numa_faults);
2662 } else {
2663 p->total_numa_faults = 0;
2664 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2665 numa_faults[i] = 0;
2666 }
2667 }
2668
2669
2670
2671
2672 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2673 {
2674 struct task_struct *p = current;
2675 bool migrated = flags & TNF_MIGRATED;
2676 int cpu_node = task_node(current);
2677 int local = !!(flags & TNF_FAULT_LOCAL);
2678 struct numa_group *ng;
2679 int priv;
2680
2681 if (!static_branch_likely(&sched_numa_balancing))
2682 return;
2683
2684
2685 if (!p->mm)
2686 return;
2687
2688
2689 if (unlikely(!p->numa_faults)) {
2690 int size = sizeof(*p->numa_faults) *
2691 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2692
2693 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2694 if (!p->numa_faults)
2695 return;
2696
2697 p->total_numa_faults = 0;
2698 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2699 }
2700
2701
2702
2703
2704
2705 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2706 priv = 1;
2707 } else {
2708 priv = cpupid_match_pid(p, last_cpupid);
2709 if (!priv && !(flags & TNF_NO_GROUP))
2710 task_numa_group(p, last_cpupid, flags, &priv);
2711 }
2712
2713
2714
2715
2716
2717
2718
2719 ng = deref_curr_numa_group(p);
2720 if (!priv && !local && ng && ng->active_nodes > 1 &&
2721 numa_is_active_node(cpu_node, ng) &&
2722 numa_is_active_node(mem_node, ng))
2723 local = 1;
2724
2725
2726
2727
2728
2729 if (time_after(jiffies, p->numa_migrate_retry)) {
2730 task_numa_placement(p);
2731 numa_migrate_preferred(p);
2732 }
2733
2734 if (migrated)
2735 p->numa_pages_migrated += pages;
2736 if (flags & TNF_MIGRATE_FAIL)
2737 p->numa_faults_locality[2] += pages;
2738
2739 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2740 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2741 p->numa_faults_locality[local] += pages;
2742 }
2743
2744 static void reset_ptenuma_scan(struct task_struct *p)
2745 {
2746
2747
2748
2749
2750
2751
2752
2753
2754 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2755 p->mm->numa_scan_offset = 0;
2756 }
2757
2758
2759
2760
2761
2762 static void task_numa_work(struct callback_head *work)
2763 {
2764 unsigned long migrate, next_scan, now = jiffies;
2765 struct task_struct *p = current;
2766 struct mm_struct *mm = p->mm;
2767 u64 runtime = p->se.sum_exec_runtime;
2768 struct vm_area_struct *vma;
2769 unsigned long start, end;
2770 unsigned long nr_pte_updates = 0;
2771 long pages, virtpages;
2772
2773 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2774
2775 work->next = work;
2776
2777
2778
2779
2780
2781
2782
2783
2784 if (p->flags & PF_EXITING)
2785 return;
2786
2787 if (!mm->numa_next_scan) {
2788 mm->numa_next_scan = now +
2789 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2790 }
2791
2792
2793
2794
2795 migrate = mm->numa_next_scan;
2796 if (time_before(now, migrate))
2797 return;
2798
2799 if (p->numa_scan_period == 0) {
2800 p->numa_scan_period_max = task_scan_max(p);
2801 p->numa_scan_period = task_scan_start(p);
2802 }
2803
2804 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2805 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2806 return;
2807
2808
2809
2810
2811
2812 p->node_stamp += 2 * TICK_NSEC;
2813
2814 start = mm->numa_scan_offset;
2815 pages = sysctl_numa_balancing_scan_size;
2816 pages <<= 20 - PAGE_SHIFT;
2817 virtpages = pages * 8;
2818 if (!pages)
2819 return;
2820
2821
2822 if (!mmap_read_trylock(mm))
2823 return;
2824 vma = find_vma(mm, start);
2825 if (!vma) {
2826 reset_ptenuma_scan(p);
2827 start = 0;
2828 vma = mm->mmap;
2829 }
2830 for (; vma; vma = vma->vm_next) {
2831 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2832 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2833 continue;
2834 }
2835
2836
2837
2838
2839
2840
2841
2842 if (!vma->vm_mm ||
2843 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2844 continue;
2845
2846
2847
2848
2849
2850 if (!vma_is_accessible(vma))
2851 continue;
2852
2853 do {
2854 start = max(start, vma->vm_start);
2855 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2856 end = min(end, vma->vm_end);
2857 nr_pte_updates = change_prot_numa(vma, start, end);
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867 if (nr_pte_updates)
2868 pages -= (end - start) >> PAGE_SHIFT;
2869 virtpages -= (end - start) >> PAGE_SHIFT;
2870
2871 start = end;
2872 if (pages <= 0 || virtpages <= 0)
2873 goto out;
2874
2875 cond_resched();
2876 } while (end != vma->vm_end);
2877 }
2878
2879 out:
2880
2881
2882
2883
2884
2885
2886 if (vma)
2887 mm->numa_scan_offset = start;
2888 else
2889 reset_ptenuma_scan(p);
2890 mmap_read_unlock(mm);
2891
2892
2893
2894
2895
2896
2897
2898 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2899 u64 diff = p->se.sum_exec_runtime - runtime;
2900 p->node_stamp += 32 * diff;
2901 }
2902 }
2903
2904 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2905 {
2906 int mm_users = 0;
2907 struct mm_struct *mm = p->mm;
2908
2909 if (mm) {
2910 mm_users = atomic_read(&mm->mm_users);
2911 if (mm_users == 1) {
2912 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2913 mm->numa_scan_seq = 0;
2914 }
2915 }
2916 p->node_stamp = 0;
2917 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2918 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2919 p->numa_migrate_retry = 0;
2920
2921 p->numa_work.next = &p->numa_work;
2922 p->numa_faults = NULL;
2923 p->numa_pages_migrated = 0;
2924 p->total_numa_faults = 0;
2925 RCU_INIT_POINTER(p->numa_group, NULL);
2926 p->last_task_numa_placement = 0;
2927 p->last_sum_exec_runtime = 0;
2928
2929 init_task_work(&p->numa_work, task_numa_work);
2930
2931
2932 if (!(clone_flags & CLONE_VM)) {
2933 p->numa_preferred_nid = NUMA_NO_NODE;
2934 return;
2935 }
2936
2937
2938
2939
2940
2941 if (mm) {
2942 unsigned int delay;
2943
2944 delay = min_t(unsigned int, task_scan_max(current),
2945 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2946 delay += 2 * TICK_NSEC;
2947 p->node_stamp = delay;
2948 }
2949 }
2950
2951
2952
2953
2954 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2955 {
2956 struct callback_head *work = &curr->numa_work;
2957 u64 period, now;
2958
2959
2960
2961
2962 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2963 return;
2964
2965
2966
2967
2968
2969
2970
2971 now = curr->se.sum_exec_runtime;
2972 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2973
2974 if (now > curr->node_stamp + period) {
2975 if (!curr->node_stamp)
2976 curr->numa_scan_period = task_scan_start(curr);
2977 curr->node_stamp += period;
2978
2979 if (!time_before(jiffies, curr->mm->numa_next_scan))
2980 task_work_add(curr, work, TWA_RESUME);
2981 }
2982 }
2983
2984 static void update_scan_period(struct task_struct *p, int new_cpu)
2985 {
2986 int src_nid = cpu_to_node(task_cpu(p));
2987 int dst_nid = cpu_to_node(new_cpu);
2988
2989 if (!static_branch_likely(&sched_numa_balancing))
2990 return;
2991
2992 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2993 return;
2994
2995 if (src_nid == dst_nid)
2996 return;
2997
2998
2999
3000
3001
3002
3003 if (p->numa_scan_seq) {
3004
3005
3006
3007
3008
3009 if (dst_nid == p->numa_preferred_nid ||
3010 (p->numa_preferred_nid != NUMA_NO_NODE &&
3011 src_nid != p->numa_preferred_nid))
3012 return;
3013 }
3014
3015 p->numa_scan_period = task_scan_start(p);
3016 }
3017
3018 #else
3019 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3020 {
3021 }
3022
3023 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3024 {
3025 }
3026
3027 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3028 {
3029 }
3030
3031 static inline void update_scan_period(struct task_struct *p, int new_cpu)
3032 {
3033 }
3034
3035 #endif
3036
3037 static void
3038 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3039 {
3040 update_load_add(&cfs_rq->load, se->load.weight);
3041 #ifdef CONFIG_SMP
3042 if (entity_is_task(se)) {
3043 struct rq *rq = rq_of(cfs_rq);
3044
3045 account_numa_enqueue(rq, task_of(se));
3046 list_add(&se->group_node, &rq->cfs_tasks);
3047 }
3048 #endif
3049 cfs_rq->nr_running++;
3050 if (se_is_idle(se))
3051 cfs_rq->idle_nr_running++;
3052 }
3053
3054 static void
3055 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3056 {
3057 update_load_sub(&cfs_rq->load, se->load.weight);
3058 #ifdef CONFIG_SMP
3059 if (entity_is_task(se)) {
3060 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3061 list_del_init(&se->group_node);
3062 }
3063 #endif
3064 cfs_rq->nr_running--;
3065 if (se_is_idle(se))
3066 cfs_rq->idle_nr_running--;
3067 }
3068
3069
3070
3071
3072
3073
3074
3075
3076 #define add_positive(_ptr, _val) do { \
3077 typeof(_ptr) ptr = (_ptr); \
3078 typeof(_val) val = (_val); \
3079 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3080 \
3081 res = var + val; \
3082 \
3083 if (val < 0 && res > var) \
3084 res = 0; \
3085 \
3086 WRITE_ONCE(*ptr, res); \
3087 } while (0)
3088
3089
3090
3091
3092
3093
3094
3095
3096 #define sub_positive(_ptr, _val) do { \
3097 typeof(_ptr) ptr = (_ptr); \
3098 typeof(*ptr) val = (_val); \
3099 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3100 res = var - val; \
3101 if (res > var) \
3102 res = 0; \
3103 WRITE_ONCE(*ptr, res); \
3104 } while (0)
3105
3106
3107
3108
3109
3110
3111
3112 #define lsub_positive(_ptr, _val) do { \
3113 typeof(_ptr) ptr = (_ptr); \
3114 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3115 } while (0)
3116
3117 #ifdef CONFIG_SMP
3118 static inline void
3119 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3120 {
3121 cfs_rq->avg.load_avg += se->avg.load_avg;
3122 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3123 }
3124
3125 static inline void
3126 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3127 {
3128 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3129 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3130
3131 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3132 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3133 }
3134 #else
3135 static inline void
3136 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3137 static inline void
3138 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3139 #endif
3140
3141 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3142 unsigned long weight)
3143 {
3144 if (se->on_rq) {
3145
3146 if (cfs_rq->curr == se)
3147 update_curr(cfs_rq);
3148 update_load_sub(&cfs_rq->load, se->load.weight);
3149 }
3150 dequeue_load_avg(cfs_rq, se);
3151
3152 update_load_set(&se->load, weight);
3153
3154 #ifdef CONFIG_SMP
3155 do {
3156 u32 divider = get_pelt_divider(&se->avg);
3157
3158 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3159 } while (0);
3160 #endif
3161
3162 enqueue_load_avg(cfs_rq, se);
3163 if (se->on_rq)
3164 update_load_add(&cfs_rq->load, se->load.weight);
3165
3166 }
3167
3168 void reweight_task(struct task_struct *p, int prio)
3169 {
3170 struct sched_entity *se = &p->se;
3171 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3172 struct load_weight *load = &se->load;
3173 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3174
3175 reweight_entity(cfs_rq, se, weight);
3176 load->inv_weight = sched_prio_to_wmult[prio];
3177 }
3178
3179 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3180
3181 #ifdef CONFIG_FAIR_GROUP_SCHED
3182 #ifdef CONFIG_SMP
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256 static long calc_group_shares(struct cfs_rq *cfs_rq)
3257 {
3258 long tg_weight, tg_shares, load, shares;
3259 struct task_group *tg = cfs_rq->tg;
3260
3261 tg_shares = READ_ONCE(tg->shares);
3262
3263 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3264
3265 tg_weight = atomic_long_read(&tg->load_avg);
3266
3267
3268 tg_weight -= cfs_rq->tg_load_avg_contrib;
3269 tg_weight += load;
3270
3271 shares = (tg_shares * load);
3272 if (tg_weight)
3273 shares /= tg_weight;
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3288 }
3289 #endif
3290
3291
3292
3293
3294
3295 static void update_cfs_group(struct sched_entity *se)
3296 {
3297 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3298 long shares;
3299
3300 if (!gcfs_rq)
3301 return;
3302
3303 if (throttled_hierarchy(gcfs_rq))
3304 return;
3305
3306 #ifndef CONFIG_SMP
3307 shares = READ_ONCE(gcfs_rq->tg->shares);
3308
3309 if (likely(se->load.weight == shares))
3310 return;
3311 #else
3312 shares = calc_group_shares(gcfs_rq);
3313 #endif
3314
3315 reweight_entity(cfs_rq_of(se), se, shares);
3316 }
3317
3318 #else
3319 static inline void update_cfs_group(struct sched_entity *se)
3320 {
3321 }
3322 #endif
3323
3324 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3325 {
3326 struct rq *rq = rq_of(cfs_rq);
3327
3328 if (&rq->cfs == cfs_rq) {
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343 cpufreq_update_util(rq, flags);
3344 }
3345 }
3346
3347 #ifdef CONFIG_SMP
3348 static inline bool load_avg_is_decayed(struct sched_avg *sa)
3349 {
3350 if (sa->load_sum)
3351 return false;
3352
3353 if (sa->util_sum)
3354 return false;
3355
3356 if (sa->runnable_sum)
3357 return false;
3358
3359
3360
3361
3362
3363
3364 SCHED_WARN_ON(sa->load_avg ||
3365 sa->util_avg ||
3366 sa->runnable_avg);
3367
3368 return true;
3369 }
3370
3371 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3372 {
3373 return u64_u32_load_copy(cfs_rq->avg.last_update_time,
3374 cfs_rq->last_update_time_copy);
3375 }
3376 #ifdef CONFIG_FAIR_GROUP_SCHED
3377
3378
3379
3380
3381
3382
3383
3384
3385 static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3386 {
3387 struct cfs_rq *prev_cfs_rq;
3388 struct list_head *prev;
3389
3390 if (cfs_rq->on_list) {
3391 prev = cfs_rq->leaf_cfs_rq_list.prev;
3392 } else {
3393 struct rq *rq = rq_of(cfs_rq);
3394
3395 prev = rq->tmp_alone_branch;
3396 }
3397
3398 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3399
3400 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3401 }
3402
3403 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3404 {
3405 if (cfs_rq->load.weight)
3406 return false;
3407
3408 if (!load_avg_is_decayed(&cfs_rq->avg))
3409 return false;
3410
3411 if (child_cfs_rq_on_list(cfs_rq))
3412 return false;
3413
3414 return true;
3415 }
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3432 {
3433 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3434
3435
3436
3437
3438 if (cfs_rq->tg == &root_task_group)
3439 return;
3440
3441 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3442 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3443 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3444 }
3445 }
3446
3447
3448
3449
3450
3451
3452 void set_task_rq_fair(struct sched_entity *se,
3453 struct cfs_rq *prev, struct cfs_rq *next)
3454 {
3455 u64 p_last_update_time;
3456 u64 n_last_update_time;
3457
3458 if (!sched_feat(ATTACH_AGE_LOAD))
3459 return;
3460
3461
3462
3463
3464
3465
3466
3467
3468 if (!(se->avg.last_update_time && prev))
3469 return;
3470
3471 p_last_update_time = cfs_rq_last_update_time(prev);
3472 n_last_update_time = cfs_rq_last_update_time(next);
3473
3474 __update_load_avg_blocked_se(p_last_update_time, se);
3475 se->avg.last_update_time = n_last_update_time;
3476 }
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545 static inline void
3546 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3547 {
3548 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
3549 u32 new_sum, divider;
3550
3551
3552 if (!delta_avg)
3553 return;
3554
3555
3556
3557
3558
3559 divider = get_pelt_divider(&cfs_rq->avg);
3560
3561
3562
3563 se->avg.util_avg = gcfs_rq->avg.util_avg;
3564 new_sum = se->avg.util_avg * divider;
3565 delta_sum = (long)new_sum - (long)se->avg.util_sum;
3566 se->avg.util_sum = new_sum;
3567
3568
3569 add_positive(&cfs_rq->avg.util_avg, delta_avg);
3570 add_positive(&cfs_rq->avg.util_sum, delta_sum);
3571
3572
3573 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3574 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3575 }
3576
3577 static inline void
3578 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3579 {
3580 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3581 u32 new_sum, divider;
3582
3583
3584 if (!delta_avg)
3585 return;
3586
3587
3588
3589
3590
3591 divider = get_pelt_divider(&cfs_rq->avg);
3592
3593
3594 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3595 new_sum = se->avg.runnable_avg * divider;
3596 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
3597 se->avg.runnable_sum = new_sum;
3598
3599
3600 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
3601 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
3602
3603 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3604 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
3605 }
3606
3607 static inline void
3608 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3609 {
3610 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3611 unsigned long load_avg;
3612 u64 load_sum = 0;
3613 s64 delta_sum;
3614 u32 divider;
3615
3616 if (!runnable_sum)
3617 return;
3618
3619 gcfs_rq->prop_runnable_sum = 0;
3620
3621
3622
3623
3624
3625 divider = get_pelt_divider(&cfs_rq->avg);
3626
3627 if (runnable_sum >= 0) {
3628
3629
3630
3631
3632 runnable_sum += se->avg.load_sum;
3633 runnable_sum = min_t(long, runnable_sum, divider);
3634 } else {
3635
3636
3637
3638
3639 if (scale_load_down(gcfs_rq->load.weight)) {
3640 load_sum = div_u64(gcfs_rq->avg.load_sum,
3641 scale_load_down(gcfs_rq->load.weight));
3642 }
3643
3644
3645 runnable_sum = min(se->avg.load_sum, load_sum);
3646 }
3647
3648
3649
3650
3651
3652
3653
3654 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3655 runnable_sum = max(runnable_sum, running_sum);
3656
3657 load_sum = se_weight(se) * runnable_sum;
3658 load_avg = div_u64(load_sum, divider);
3659
3660 delta_avg = load_avg - se->avg.load_avg;
3661 if (!delta_avg)
3662 return;
3663
3664 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3665
3666 se->avg.load_sum = runnable_sum;
3667 se->avg.load_avg = load_avg;
3668 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3669 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3670
3671 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3672 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3673 }
3674
3675 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3676 {
3677 cfs_rq->propagate = 1;
3678 cfs_rq->prop_runnable_sum += runnable_sum;
3679 }
3680
3681
3682 static inline int propagate_entity_load_avg(struct sched_entity *se)
3683 {
3684 struct cfs_rq *cfs_rq, *gcfs_rq;
3685
3686 if (entity_is_task(se))
3687 return 0;
3688
3689 gcfs_rq = group_cfs_rq(se);
3690 if (!gcfs_rq->propagate)
3691 return 0;
3692
3693 gcfs_rq->propagate = 0;
3694
3695 cfs_rq = cfs_rq_of(se);
3696
3697 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3698
3699 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3700 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3701 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3702
3703 trace_pelt_cfs_tp(cfs_rq);
3704 trace_pelt_se_tp(se);
3705
3706 return 1;
3707 }
3708
3709
3710
3711
3712
3713 static inline bool skip_blocked_update(struct sched_entity *se)
3714 {
3715 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3716
3717
3718
3719
3720
3721 if (se->avg.load_avg || se->avg.util_avg)
3722 return false;
3723
3724
3725
3726
3727
3728 if (gcfs_rq->propagate)
3729 return false;
3730
3731
3732
3733
3734
3735
3736 return true;
3737 }
3738
3739 #else
3740
3741 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3742
3743 static inline int propagate_entity_load_avg(struct sched_entity *se)
3744 {
3745 return 0;
3746 }
3747
3748 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3749
3750 #endif
3751
3752 #ifdef CONFIG_NO_HZ_COMMON
3753 static inline void migrate_se_pelt_lag(struct sched_entity *se)
3754 {
3755 u64 throttled = 0, now, lut;
3756 struct cfs_rq *cfs_rq;
3757 struct rq *rq;
3758 bool is_idle;
3759
3760 if (load_avg_is_decayed(&se->avg))
3761 return;
3762
3763 cfs_rq = cfs_rq_of(se);
3764 rq = rq_of(cfs_rq);
3765
3766 rcu_read_lock();
3767 is_idle = is_idle_task(rcu_dereference(rq->curr));
3768 rcu_read_unlock();
3769
3770
3771
3772
3773
3774
3775 if (!is_idle)
3776 return;
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803 #ifdef CONFIG_CFS_BANDWIDTH
3804 throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
3805
3806 if (throttled == U64_MAX)
3807 return;
3808 #endif
3809 now = u64_u32_load(rq->clock_pelt_idle);
3810
3811
3812
3813
3814
3815
3816 smp_rmb();
3817 lut = cfs_rq_last_update_time(cfs_rq);
3818
3819 now -= throttled;
3820 if (now < lut)
3821
3822
3823
3824
3825 now = lut;
3826 else
3827 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
3828
3829 __update_load_avg_blocked_se(now, se);
3830 }
3831 #else
3832 static void migrate_se_pelt_lag(struct sched_entity *se) {}
3833 #endif
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851 static inline int
3852 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3853 {
3854 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3855 struct sched_avg *sa = &cfs_rq->avg;
3856 int decayed = 0;
3857
3858 if (cfs_rq->removed.nr) {
3859 unsigned long r;
3860 u32 divider = get_pelt_divider(&cfs_rq->avg);
3861
3862 raw_spin_lock(&cfs_rq->removed.lock);
3863 swap(cfs_rq->removed.util_avg, removed_util);
3864 swap(cfs_rq->removed.load_avg, removed_load);
3865 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3866 cfs_rq->removed.nr = 0;
3867 raw_spin_unlock(&cfs_rq->removed.lock);
3868
3869 r = removed_load;
3870 sub_positive(&sa->load_avg, r);
3871 sub_positive(&sa->load_sum, r * divider);
3872
3873 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
3874
3875 r = removed_util;
3876 sub_positive(&sa->util_avg, r);
3877 sub_positive(&sa->util_sum, r * divider);
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
3890
3891 r = removed_runnable;
3892 sub_positive(&sa->runnable_avg, r);
3893 sub_positive(&sa->runnable_sum, r * divider);
3894
3895 sa->runnable_sum = max_t(u32, sa->runnable_sum,
3896 sa->runnable_avg * PELT_MIN_DIVIDER);
3897
3898
3899
3900
3901
3902 add_tg_cfs_propagate(cfs_rq,
3903 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3904
3905 decayed = 1;
3906 }
3907
3908 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3909 u64_u32_store_copy(sa->last_update_time,
3910 cfs_rq->last_update_time_copy,
3911 sa->last_update_time);
3912 return decayed;
3913 }
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3924 {
3925
3926
3927
3928
3929 u32 divider = get_pelt_divider(&cfs_rq->avg);
3930
3931
3932
3933
3934
3935
3936
3937
3938 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3939 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3940
3941
3942
3943
3944
3945
3946
3947 se->avg.util_sum = se->avg.util_avg * divider;
3948
3949 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3950
3951 se->avg.load_sum = se->avg.load_avg * divider;
3952 if (se_weight(se) < se->avg.load_sum)
3953 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3954 else
3955 se->avg.load_sum = 1;
3956
3957 enqueue_load_avg(cfs_rq, se);
3958 cfs_rq->avg.util_avg += se->avg.util_avg;
3959 cfs_rq->avg.util_sum += se->avg.util_sum;
3960 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3961 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3962
3963 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3964
3965 cfs_rq_util_change(cfs_rq, 0);
3966
3967 trace_pelt_cfs_tp(cfs_rq);
3968 }
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3979 {
3980 dequeue_load_avg(cfs_rq, se);
3981 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3982 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3983
3984 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3985 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3986
3987 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3988 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
3989
3990 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3991 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
3992
3993 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3994
3995 cfs_rq_util_change(cfs_rq, 0);
3996
3997 trace_pelt_cfs_tp(cfs_rq);
3998 }
3999
4000
4001
4002
4003 #define UPDATE_TG 0x1
4004 #define SKIP_AGE_LOAD 0x2
4005 #define DO_ATTACH 0x4
4006
4007
4008 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4009 {
4010 u64 now = cfs_rq_clock_pelt(cfs_rq);
4011 int decayed;
4012
4013
4014
4015
4016
4017 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4018 __update_load_avg_se(now, cfs_rq, se);
4019
4020 decayed = update_cfs_rq_load_avg(now, cfs_rq);
4021 decayed |= propagate_entity_load_avg(se);
4022
4023 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4024
4025
4026
4027
4028
4029
4030
4031
4032 attach_entity_load_avg(cfs_rq, se);
4033 update_tg_load_avg(cfs_rq);
4034
4035 } else if (decayed) {
4036 cfs_rq_util_change(cfs_rq, 0);
4037
4038 if (flags & UPDATE_TG)
4039 update_tg_load_avg(cfs_rq);
4040 }
4041 }
4042
4043
4044
4045
4046
4047 static void sync_entity_load_avg(struct sched_entity *se)
4048 {
4049 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4050 u64 last_update_time;
4051
4052 last_update_time = cfs_rq_last_update_time(cfs_rq);
4053 __update_load_avg_blocked_se(last_update_time, se);
4054 }
4055
4056
4057
4058
4059
4060 static void remove_entity_load_avg(struct sched_entity *se)
4061 {
4062 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4063 unsigned long flags;
4064
4065
4066
4067
4068
4069
4070
4071 sync_entity_load_avg(se);
4072
4073 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4074 ++cfs_rq->removed.nr;
4075 cfs_rq->removed.util_avg += se->avg.util_avg;
4076 cfs_rq->removed.load_avg += se->avg.load_avg;
4077 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4078 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4079 }
4080
4081 static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4082 {
4083 return cfs_rq->avg.runnable_avg;
4084 }
4085
4086 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4087 {
4088 return cfs_rq->avg.load_avg;
4089 }
4090
4091 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
4092
4093 static inline unsigned long task_util(struct task_struct *p)
4094 {
4095 return READ_ONCE(p->se.avg.util_avg);
4096 }
4097
4098 static inline unsigned long _task_util_est(struct task_struct *p)
4099 {
4100 struct util_est ue = READ_ONCE(p->se.avg.util_est);
4101
4102 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
4103 }
4104
4105 static inline unsigned long task_util_est(struct task_struct *p)
4106 {
4107 return max(task_util(p), _task_util_est(p));
4108 }
4109
4110 #ifdef CONFIG_UCLAMP_TASK
4111 static inline unsigned long uclamp_task_util(struct task_struct *p)
4112 {
4113 return clamp(task_util_est(p),
4114 uclamp_eff_value(p, UCLAMP_MIN),
4115 uclamp_eff_value(p, UCLAMP_MAX));
4116 }
4117 #else
4118 static inline unsigned long uclamp_task_util(struct task_struct *p)
4119 {
4120 return task_util_est(p);
4121 }
4122 #endif
4123
4124 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4125 struct task_struct *p)
4126 {
4127 unsigned int enqueued;
4128
4129 if (!sched_feat(UTIL_EST))
4130 return;
4131
4132
4133 enqueued = cfs_rq->avg.util_est.enqueued;
4134 enqueued += _task_util_est(p);
4135 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4136
4137 trace_sched_util_est_cfs_tp(cfs_rq);
4138 }
4139
4140 static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4141 struct task_struct *p)
4142 {
4143 unsigned int enqueued;
4144
4145 if (!sched_feat(UTIL_EST))
4146 return;
4147
4148
4149 enqueued = cfs_rq->avg.util_est.enqueued;
4150 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4151 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4152
4153 trace_sched_util_est_cfs_tp(cfs_rq);
4154 }
4155
4156 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166 static inline bool within_margin(int value, int margin)
4167 {
4168 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
4169 }
4170
4171 static inline void util_est_update(struct cfs_rq *cfs_rq,
4172 struct task_struct *p,
4173 bool task_sleep)
4174 {
4175 long last_ewma_diff, last_enqueued_diff;
4176 struct util_est ue;
4177
4178 if (!sched_feat(UTIL_EST))
4179 return;
4180
4181
4182
4183
4184
4185 if (!task_sleep)
4186 return;
4187
4188
4189
4190
4191
4192 ue = p->se.avg.util_est;
4193 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4194 return;
4195
4196 last_enqueued_diff = ue.enqueued;
4197
4198
4199
4200
4201
4202 ue.enqueued = task_util(p);
4203 if (sched_feat(UTIL_EST_FASTUP)) {
4204 if (ue.ewma < ue.enqueued) {
4205 ue.ewma = ue.enqueued;
4206 goto done;
4207 }
4208 }
4209
4210
4211
4212
4213
4214 last_ewma_diff = ue.enqueued - ue.ewma;
4215 last_enqueued_diff -= ue.enqueued;
4216 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4217 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4218 goto done;
4219
4220 return;
4221 }
4222
4223
4224
4225
4226
4227 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4228 return;
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4248 ue.ewma += last_ewma_diff;
4249 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4250 done:
4251 ue.enqueued |= UTIL_AVG_UNCHANGED;
4252 WRITE_ONCE(p->se.avg.util_est, ue);
4253
4254 trace_sched_util_est_se_tp(&p->se);
4255 }
4256
4257 static inline int task_fits_capacity(struct task_struct *p,
4258 unsigned long capacity)
4259 {
4260 return fits_capacity(uclamp_task_util(p), capacity);
4261 }
4262
4263 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4264 {
4265 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4266 return;
4267
4268 if (!p || p->nr_cpus_allowed == 1) {
4269 rq->misfit_task_load = 0;
4270 return;
4271 }
4272
4273 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4274 rq->misfit_task_load = 0;
4275 return;
4276 }
4277
4278
4279
4280
4281
4282 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4283 }
4284
4285 #else
4286
4287 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4288 {
4289 return true;
4290 }
4291
4292 #define UPDATE_TG 0x0
4293 #define SKIP_AGE_LOAD 0x0
4294 #define DO_ATTACH 0x0
4295
4296 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4297 {
4298 cfs_rq_util_change(cfs_rq, 0);
4299 }
4300
4301 static inline void remove_entity_load_avg(struct sched_entity *se) {}
4302
4303 static inline void
4304 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4305 static inline void
4306 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4307
4308 static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4309 {
4310 return 0;
4311 }
4312
4313 static inline void
4314 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4315
4316 static inline void
4317 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4318
4319 static inline void
4320 util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4321 bool task_sleep) {}
4322 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4323
4324 #endif
4325
4326 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4327 {
4328 #ifdef CONFIG_SCHED_DEBUG
4329 s64 d = se->vruntime - cfs_rq->min_vruntime;
4330
4331 if (d < 0)
4332 d = -d;
4333
4334 if (d > 3*sysctl_sched_latency)
4335 schedstat_inc(cfs_rq->nr_spread_over);
4336 #endif
4337 }
4338
4339 static void
4340 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4341 {
4342 u64 vruntime = cfs_rq->min_vruntime;
4343
4344
4345
4346
4347
4348
4349
4350 if (initial && sched_feat(START_DEBIT))
4351 vruntime += sched_vslice(cfs_rq, se);
4352
4353
4354 if (!initial) {
4355 unsigned long thresh;
4356
4357 if (se_is_idle(se))
4358 thresh = sysctl_sched_min_granularity;
4359 else
4360 thresh = sysctl_sched_latency;
4361
4362
4363
4364
4365
4366 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4367 thresh >>= 1;
4368
4369 vruntime -= thresh;
4370 }
4371
4372
4373 se->vruntime = max_vruntime(se->vruntime, vruntime);
4374 }
4375
4376 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4377
4378 static inline bool cfs_bandwidth_used(void);
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410 static void
4411 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4412 {
4413 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4414 bool curr = cfs_rq->curr == se;
4415
4416
4417
4418
4419
4420 if (renorm && curr)
4421 se->vruntime += cfs_rq->min_vruntime;
4422
4423 update_curr(cfs_rq);
4424
4425
4426
4427
4428
4429
4430
4431 if (renorm && !curr)
4432 se->vruntime += cfs_rq->min_vruntime;
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4443 se_update_runnable(se);
4444 update_cfs_group(se);
4445 account_entity_enqueue(cfs_rq, se);
4446
4447 if (flags & ENQUEUE_WAKEUP)
4448 place_entity(cfs_rq, se, 0);
4449
4450 check_schedstat_required();
4451 update_stats_enqueue_fair(cfs_rq, se, flags);
4452 check_spread(cfs_rq, se);
4453 if (!curr)
4454 __enqueue_entity(cfs_rq, se);
4455 se->on_rq = 1;
4456
4457 if (cfs_rq->nr_running == 1) {
4458 check_enqueue_throttle(cfs_rq);
4459 if (!throttled_hierarchy(cfs_rq))
4460 list_add_leaf_cfs_rq(cfs_rq);
4461 }
4462 }
4463
4464 static void __clear_buddies_last(struct sched_entity *se)
4465 {
4466 for_each_sched_entity(se) {
4467 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4468 if (cfs_rq->last != se)
4469 break;
4470
4471 cfs_rq->last = NULL;
4472 }
4473 }
4474
4475 static void __clear_buddies_next(struct sched_entity *se)
4476 {
4477 for_each_sched_entity(se) {
4478 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4479 if (cfs_rq->next != se)
4480 break;
4481
4482 cfs_rq->next = NULL;
4483 }
4484 }
4485
4486 static void __clear_buddies_skip(struct sched_entity *se)
4487 {
4488 for_each_sched_entity(se) {
4489 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4490 if (cfs_rq->skip != se)
4491 break;
4492
4493 cfs_rq->skip = NULL;
4494 }
4495 }
4496
4497 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4498 {
4499 if (cfs_rq->last == se)
4500 __clear_buddies_last(se);
4501
4502 if (cfs_rq->next == se)
4503 __clear_buddies_next(se);
4504
4505 if (cfs_rq->skip == se)
4506 __clear_buddies_skip(se);
4507 }
4508
4509 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4510
4511 static void
4512 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4513 {
4514
4515
4516
4517 update_curr(cfs_rq);
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527 update_load_avg(cfs_rq, se, UPDATE_TG);
4528 se_update_runnable(se);
4529
4530 update_stats_dequeue_fair(cfs_rq, se, flags);
4531
4532 clear_buddies(cfs_rq, se);
4533
4534 if (se != cfs_rq->curr)
4535 __dequeue_entity(cfs_rq, se);
4536 se->on_rq = 0;
4537 account_entity_dequeue(cfs_rq, se);
4538
4539
4540
4541
4542
4543
4544
4545 if (!(flags & DEQUEUE_SLEEP))
4546 se->vruntime -= cfs_rq->min_vruntime;
4547
4548
4549 return_cfs_rq_runtime(cfs_rq);
4550
4551 update_cfs_group(se);
4552
4553
4554
4555
4556
4557
4558
4559 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4560 update_min_vruntime(cfs_rq);
4561
4562 if (cfs_rq->nr_running == 0)
4563 update_idle_cfs_rq_clock_pelt(cfs_rq);
4564 }
4565
4566
4567
4568
4569 static void
4570 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4571 {
4572 unsigned long ideal_runtime, delta_exec;
4573 struct sched_entity *se;
4574 s64 delta;
4575
4576 ideal_runtime = sched_slice(cfs_rq, curr);
4577 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4578 if (delta_exec > ideal_runtime) {
4579 resched_curr(rq_of(cfs_rq));
4580
4581
4582
4583
4584 clear_buddies(cfs_rq, curr);
4585 return;
4586 }
4587
4588
4589
4590
4591
4592
4593 if (delta_exec < sysctl_sched_min_granularity)
4594 return;
4595
4596 se = __pick_first_entity(cfs_rq);
4597 delta = curr->vruntime - se->vruntime;
4598
4599 if (delta < 0)
4600 return;
4601
4602 if (delta > ideal_runtime)
4603 resched_curr(rq_of(cfs_rq));
4604 }
4605
4606 static void
4607 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4608 {
4609 clear_buddies(cfs_rq, se);
4610
4611
4612 if (se->on_rq) {
4613
4614
4615
4616
4617
4618 update_stats_wait_end_fair(cfs_rq, se);
4619 __dequeue_entity(cfs_rq, se);
4620 update_load_avg(cfs_rq, se, UPDATE_TG);
4621 }
4622
4623 update_stats_curr_start(cfs_rq, se);
4624 cfs_rq->curr = se;
4625
4626
4627
4628
4629
4630
4631 if (schedstat_enabled() &&
4632 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4633 struct sched_statistics *stats;
4634
4635 stats = __schedstats_from_se(se);
4636 __schedstat_set(stats->slice_max,
4637 max((u64)stats->slice_max,
4638 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4639 }
4640
4641 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4642 }
4643
4644 static int
4645 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4646
4647
4648
4649
4650
4651
4652
4653
4654 static struct sched_entity *
4655 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4656 {
4657 struct sched_entity *left = __pick_first_entity(cfs_rq);
4658 struct sched_entity *se;
4659
4660
4661
4662
4663
4664 if (!left || (curr && entity_before(curr, left)))
4665 left = curr;
4666
4667 se = left;
4668
4669
4670
4671
4672
4673 if (cfs_rq->skip && cfs_rq->skip == se) {
4674 struct sched_entity *second;
4675
4676 if (se == curr) {
4677 second = __pick_first_entity(cfs_rq);
4678 } else {
4679 second = __pick_next_entity(se);
4680 if (!second || (curr && entity_before(curr, second)))
4681 second = curr;
4682 }
4683
4684 if (second && wakeup_preempt_entity(second, left) < 1)
4685 se = second;
4686 }
4687
4688 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4689
4690
4691
4692 se = cfs_rq->next;
4693 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4694
4695
4696
4697 se = cfs_rq->last;
4698 }
4699
4700 return se;
4701 }
4702
4703 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4704
4705 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4706 {
4707
4708
4709
4710
4711 if (prev->on_rq)
4712 update_curr(cfs_rq);
4713
4714
4715 check_cfs_rq_runtime(cfs_rq);
4716
4717 check_spread(cfs_rq, prev);
4718
4719 if (prev->on_rq) {
4720 update_stats_wait_start_fair(cfs_rq, prev);
4721
4722 __enqueue_entity(cfs_rq, prev);
4723
4724 update_load_avg(cfs_rq, prev, 0);
4725 }
4726 cfs_rq->curr = NULL;
4727 }
4728
4729 static void
4730 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4731 {
4732
4733
4734
4735 update_curr(cfs_rq);
4736
4737
4738
4739
4740 update_load_avg(cfs_rq, curr, UPDATE_TG);
4741 update_cfs_group(curr);
4742
4743 #ifdef CONFIG_SCHED_HRTICK
4744
4745
4746
4747
4748 if (queued) {
4749 resched_curr(rq_of(cfs_rq));
4750 return;
4751 }
4752
4753
4754
4755 if (!sched_feat(DOUBLE_TICK) &&
4756 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4757 return;
4758 #endif
4759
4760 if (cfs_rq->nr_running > 1)
4761 check_preempt_tick(cfs_rq, curr);
4762 }
4763
4764
4765
4766
4767
4768
4769 #ifdef CONFIG_CFS_BANDWIDTH
4770
4771 #ifdef CONFIG_JUMP_LABEL
4772 static struct static_key __cfs_bandwidth_used;
4773
4774 static inline bool cfs_bandwidth_used(void)
4775 {
4776 return static_key_false(&__cfs_bandwidth_used);
4777 }
4778
4779 void cfs_bandwidth_usage_inc(void)
4780 {
4781 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4782 }
4783
4784 void cfs_bandwidth_usage_dec(void)
4785 {
4786 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4787 }
4788 #else
4789 static bool cfs_bandwidth_used(void)
4790 {
4791 return true;
4792 }
4793
4794 void cfs_bandwidth_usage_inc(void) {}
4795 void cfs_bandwidth_usage_dec(void) {}
4796 #endif
4797
4798
4799
4800
4801
4802 static inline u64 default_cfs_period(void)
4803 {
4804 return 100000000ULL;
4805 }
4806
4807 static inline u64 sched_cfs_bandwidth_slice(void)
4808 {
4809 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4810 }
4811
4812
4813
4814
4815
4816
4817
4818
4819 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4820 {
4821 s64 runtime;
4822
4823 if (unlikely(cfs_b->quota == RUNTIME_INF))
4824 return;
4825
4826 cfs_b->runtime += cfs_b->quota;
4827 runtime = cfs_b->runtime_snap - cfs_b->runtime;
4828 if (runtime > 0) {
4829 cfs_b->burst_time += runtime;
4830 cfs_b->nr_burst++;
4831 }
4832
4833 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
4834 cfs_b->runtime_snap = cfs_b->runtime;
4835 }
4836
4837 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4838 {
4839 return &tg->cfs_bandwidth;
4840 }
4841
4842
4843 static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4844 struct cfs_rq *cfs_rq, u64 target_runtime)
4845 {
4846 u64 min_amount, amount = 0;
4847
4848 lockdep_assert_held(&cfs_b->lock);
4849
4850
4851 min_amount = target_runtime - cfs_rq->runtime_remaining;
4852
4853 if (cfs_b->quota == RUNTIME_INF)
4854 amount = min_amount;
4855 else {
4856 start_cfs_bandwidth(cfs_b);
4857
4858 if (cfs_b->runtime > 0) {
4859 amount = min(cfs_b->runtime, min_amount);
4860 cfs_b->runtime -= amount;
4861 cfs_b->idle = 0;
4862 }
4863 }
4864
4865 cfs_rq->runtime_remaining += amount;
4866
4867 return cfs_rq->runtime_remaining > 0;
4868 }
4869
4870
4871 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4872 {
4873 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4874 int ret;
4875
4876 raw_spin_lock(&cfs_b->lock);
4877 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4878 raw_spin_unlock(&cfs_b->lock);
4879
4880 return ret;
4881 }
4882
4883 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4884 {
4885
4886 cfs_rq->runtime_remaining -= delta_exec;
4887
4888 if (likely(cfs_rq->runtime_remaining > 0))
4889 return;
4890
4891 if (cfs_rq->throttled)
4892 return;
4893
4894
4895
4896
4897 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4898 resched_curr(rq_of(cfs_rq));
4899 }
4900
4901 static __always_inline
4902 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4903 {
4904 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4905 return;
4906
4907 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4908 }
4909
4910 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4911 {
4912 return cfs_bandwidth_used() && cfs_rq->throttled;
4913 }
4914
4915
4916 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4917 {
4918 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4919 }
4920
4921
4922
4923
4924
4925
4926 static inline int throttled_lb_pair(struct task_group *tg,
4927 int src_cpu, int dest_cpu)
4928 {
4929 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4930
4931 src_cfs_rq = tg->cfs_rq[src_cpu];
4932 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4933
4934 return throttled_hierarchy(src_cfs_rq) ||
4935 throttled_hierarchy(dest_cfs_rq);
4936 }
4937
4938 static int tg_unthrottle_up(struct task_group *tg, void *data)
4939 {
4940 struct rq *rq = data;
4941 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4942
4943 cfs_rq->throttle_count--;
4944 if (!cfs_rq->throttle_count) {
4945 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
4946 cfs_rq->throttled_clock_pelt;
4947
4948
4949 if (!cfs_rq_is_decayed(cfs_rq))
4950 list_add_leaf_cfs_rq(cfs_rq);
4951 }
4952
4953 return 0;
4954 }
4955
4956 static int tg_throttle_down(struct task_group *tg, void *data)
4957 {
4958 struct rq *rq = data;
4959 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4960
4961
4962 if (!cfs_rq->throttle_count) {
4963 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
4964 list_del_leaf_cfs_rq(cfs_rq);
4965 }
4966 cfs_rq->throttle_count++;
4967
4968 return 0;
4969 }
4970
4971 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
4972 {
4973 struct rq *rq = rq_of(cfs_rq);
4974 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4975 struct sched_entity *se;
4976 long task_delta, idle_task_delta, dequeue = 1;
4977
4978 raw_spin_lock(&cfs_b->lock);
4979
4980 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4981
4982
4983
4984
4985
4986
4987
4988
4989 dequeue = 0;
4990 } else {
4991 list_add_tail_rcu(&cfs_rq->throttled_list,
4992 &cfs_b->throttled_cfs_rq);
4993 }
4994 raw_spin_unlock(&cfs_b->lock);
4995
4996 if (!dequeue)
4997 return false;
4998
4999 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5000
5001
5002 rcu_read_lock();
5003 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5004 rcu_read_unlock();
5005
5006 task_delta = cfs_rq->h_nr_running;
5007 idle_task_delta = cfs_rq->idle_h_nr_running;
5008 for_each_sched_entity(se) {
5009 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5010
5011 if (!se->on_rq)
5012 goto done;
5013
5014 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5015
5016 if (cfs_rq_is_idle(group_cfs_rq(se)))
5017 idle_task_delta = cfs_rq->h_nr_running;
5018
5019 qcfs_rq->h_nr_running -= task_delta;
5020 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5021
5022 if (qcfs_rq->load.weight) {
5023
5024 se = parent_entity(se);
5025 break;
5026 }
5027 }
5028
5029 for_each_sched_entity(se) {
5030 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5031
5032 if (!se->on_rq)
5033 goto done;
5034
5035 update_load_avg(qcfs_rq, se, 0);
5036 se_update_runnable(se);
5037
5038 if (cfs_rq_is_idle(group_cfs_rq(se)))
5039 idle_task_delta = cfs_rq->h_nr_running;
5040
5041 qcfs_rq->h_nr_running -= task_delta;
5042 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5043 }
5044
5045
5046 sub_nr_running(rq, task_delta);
5047
5048 done:
5049
5050
5051
5052
5053 cfs_rq->throttled = 1;
5054 cfs_rq->throttled_clock = rq_clock(rq);
5055 return true;
5056 }
5057
5058 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5059 {
5060 struct rq *rq = rq_of(cfs_rq);
5061 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5062 struct sched_entity *se;
5063 long task_delta, idle_task_delta;
5064
5065 se = cfs_rq->tg->se[cpu_of(rq)];
5066
5067 cfs_rq->throttled = 0;
5068
5069 update_rq_clock(rq);
5070
5071 raw_spin_lock(&cfs_b->lock);
5072 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5073 list_del_rcu(&cfs_rq->throttled_list);
5074 raw_spin_unlock(&cfs_b->lock);
5075
5076
5077 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5078
5079 if (!cfs_rq->load.weight) {
5080 if (!cfs_rq->on_list)
5081 return;
5082
5083
5084
5085
5086 for_each_sched_entity(se) {
5087 if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5088 break;
5089 }
5090 goto unthrottle_throttle;
5091 }
5092
5093 task_delta = cfs_rq->h_nr_running;
5094 idle_task_delta = cfs_rq->idle_h_nr_running;
5095 for_each_sched_entity(se) {
5096 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5097
5098 if (se->on_rq)
5099 break;
5100 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
5101
5102 if (cfs_rq_is_idle(group_cfs_rq(se)))
5103 idle_task_delta = cfs_rq->h_nr_running;
5104
5105 qcfs_rq->h_nr_running += task_delta;
5106 qcfs_rq->idle_h_nr_running += idle_task_delta;
5107
5108
5109 if (cfs_rq_throttled(qcfs_rq))
5110 goto unthrottle_throttle;
5111 }
5112
5113 for_each_sched_entity(se) {
5114 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5115
5116 update_load_avg(qcfs_rq, se, UPDATE_TG);
5117 se_update_runnable(se);
5118
5119 if (cfs_rq_is_idle(group_cfs_rq(se)))
5120 idle_task_delta = cfs_rq->h_nr_running;
5121
5122 qcfs_rq->h_nr_running += task_delta;
5123 qcfs_rq->idle_h_nr_running += idle_task_delta;
5124
5125
5126 if (cfs_rq_throttled(qcfs_rq))
5127 goto unthrottle_throttle;
5128 }
5129
5130
5131 add_nr_running(rq, task_delta);
5132
5133 unthrottle_throttle:
5134 assert_list_leaf_cfs_rq(rq);
5135
5136
5137 if (rq->curr == rq->idle && rq->cfs.nr_running)
5138 resched_curr(rq);
5139 }
5140
5141 static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
5142 {
5143 struct cfs_rq *cfs_rq;
5144 u64 runtime, remaining = 1;
5145
5146 rcu_read_lock();
5147 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5148 throttled_list) {
5149 struct rq *rq = rq_of(cfs_rq);
5150 struct rq_flags rf;
5151
5152 rq_lock_irqsave(rq, &rf);
5153 if (!cfs_rq_throttled(cfs_rq))
5154 goto next;
5155
5156
5157 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
5158
5159 raw_spin_lock(&cfs_b->lock);
5160 runtime = -cfs_rq->runtime_remaining + 1;
5161 if (runtime > cfs_b->runtime)
5162 runtime = cfs_b->runtime;
5163 cfs_b->runtime -= runtime;
5164 remaining = cfs_b->runtime;
5165 raw_spin_unlock(&cfs_b->lock);
5166
5167 cfs_rq->runtime_remaining += runtime;
5168
5169
5170 if (cfs_rq->runtime_remaining > 0)
5171 unthrottle_cfs_rq(cfs_rq);
5172
5173 next:
5174 rq_unlock_irqrestore(rq, &rf);
5175
5176 if (!remaining)
5177 break;
5178 }
5179 rcu_read_unlock();
5180 }
5181
5182
5183
5184
5185
5186
5187
5188 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
5189 {
5190 int throttled;
5191
5192
5193 if (cfs_b->quota == RUNTIME_INF)
5194 goto out_deactivate;
5195
5196 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5197 cfs_b->nr_periods += overrun;
5198
5199
5200 __refill_cfs_bandwidth_runtime(cfs_b);
5201
5202
5203
5204
5205
5206 if (cfs_b->idle && !throttled)
5207 goto out_deactivate;
5208
5209 if (!throttled) {
5210
5211 cfs_b->idle = 1;
5212 return 0;
5213 }
5214
5215
5216 cfs_b->nr_throttled += overrun;
5217
5218
5219
5220
5221 while (throttled && cfs_b->runtime > 0) {
5222 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5223
5224 distribute_cfs_runtime(cfs_b);
5225 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5226
5227 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5228 }
5229
5230
5231
5232
5233
5234
5235
5236 cfs_b->idle = 0;
5237
5238 return 0;
5239
5240 out_deactivate:
5241 return 1;
5242 }
5243
5244
5245 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5246
5247 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5248
5249 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5250
5251
5252
5253
5254
5255
5256
5257
5258 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5259 {
5260 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5261 s64 remaining;
5262
5263
5264 if (hrtimer_callback_running(refresh_timer))
5265 return 1;
5266
5267
5268 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5269 if (remaining < (s64)min_expire)
5270 return 1;
5271
5272 return 0;
5273 }
5274
5275 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5276 {
5277 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5278
5279
5280 if (runtime_refresh_within(cfs_b, min_left))
5281 return;
5282
5283
5284 if (cfs_b->slack_started)
5285 return;
5286 cfs_b->slack_started = true;
5287
5288 hrtimer_start(&cfs_b->slack_timer,
5289 ns_to_ktime(cfs_bandwidth_slack_period),
5290 HRTIMER_MODE_REL);
5291 }
5292
5293
5294 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5295 {
5296 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5297 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5298
5299 if (slack_runtime <= 0)
5300 return;
5301
5302 raw_spin_lock(&cfs_b->lock);
5303 if (cfs_b->quota != RUNTIME_INF) {
5304 cfs_b->runtime += slack_runtime;
5305
5306
5307 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5308 !list_empty(&cfs_b->throttled_cfs_rq))
5309 start_cfs_slack_bandwidth(cfs_b);
5310 }
5311 raw_spin_unlock(&cfs_b->lock);
5312
5313
5314 cfs_rq->runtime_remaining -= slack_runtime;
5315 }
5316
5317 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5318 {
5319 if (!cfs_bandwidth_used())
5320 return;
5321
5322 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5323 return;
5324
5325 __return_cfs_rq_runtime(cfs_rq);
5326 }
5327
5328
5329
5330
5331
5332 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5333 {
5334 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5335 unsigned long flags;
5336
5337
5338 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5339 cfs_b->slack_started = false;
5340
5341 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5342 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5343 return;
5344 }
5345
5346 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5347 runtime = cfs_b->runtime;
5348
5349 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5350
5351 if (!runtime)
5352 return;
5353
5354 distribute_cfs_runtime(cfs_b);
5355 }
5356
5357
5358
5359
5360
5361
5362 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5363 {
5364 if (!cfs_bandwidth_used())
5365 return;
5366
5367
5368 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5369 return;
5370
5371
5372 if (cfs_rq_throttled(cfs_rq))
5373 return;
5374
5375
5376 account_cfs_rq_runtime(cfs_rq, 0);
5377 if (cfs_rq->runtime_remaining <= 0)
5378 throttle_cfs_rq(cfs_rq);
5379 }
5380
5381 static void sync_throttle(struct task_group *tg, int cpu)
5382 {
5383 struct cfs_rq *pcfs_rq, *cfs_rq;
5384
5385 if (!cfs_bandwidth_used())
5386 return;
5387
5388 if (!tg->parent)
5389 return;
5390
5391 cfs_rq = tg->cfs_rq[cpu];
5392 pcfs_rq = tg->parent->cfs_rq[cpu];
5393
5394 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5395 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
5396 }
5397
5398
5399 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5400 {
5401 if (!cfs_bandwidth_used())
5402 return false;
5403
5404 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5405 return false;
5406
5407
5408
5409
5410
5411 if (cfs_rq_throttled(cfs_rq))
5412 return true;
5413
5414 return throttle_cfs_rq(cfs_rq);
5415 }
5416
5417 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5418 {
5419 struct cfs_bandwidth *cfs_b =
5420 container_of(timer, struct cfs_bandwidth, slack_timer);
5421
5422 do_sched_cfs_slack_timer(cfs_b);
5423
5424 return HRTIMER_NORESTART;
5425 }
5426
5427 extern const u64 max_cfs_quota_period;
5428
5429 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5430 {
5431 struct cfs_bandwidth *cfs_b =
5432 container_of(timer, struct cfs_bandwidth, period_timer);
5433 unsigned long flags;
5434 int overrun;
5435 int idle = 0;
5436 int count = 0;
5437
5438 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5439 for (;;) {
5440 overrun = hrtimer_forward_now(timer, cfs_b->period);
5441 if (!overrun)
5442 break;
5443
5444 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5445
5446 if (++count > 3) {
5447 u64 new, old = ktime_to_ns(cfs_b->period);
5448
5449
5450
5451
5452
5453
5454 new = old * 2;
5455 if (new < max_cfs_quota_period) {
5456 cfs_b->period = ns_to_ktime(new);
5457 cfs_b->quota *= 2;
5458 cfs_b->burst *= 2;
5459
5460 pr_warn_ratelimited(
5461 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5462 smp_processor_id(),
5463 div_u64(new, NSEC_PER_USEC),
5464 div_u64(cfs_b->quota, NSEC_PER_USEC));
5465 } else {
5466 pr_warn_ratelimited(
5467 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5468 smp_processor_id(),
5469 div_u64(old, NSEC_PER_USEC),
5470 div_u64(cfs_b->quota, NSEC_PER_USEC));
5471 }
5472
5473
5474 count = 0;
5475 }
5476 }
5477 if (idle)
5478 cfs_b->period_active = 0;
5479 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5480
5481 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5482 }
5483
5484 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5485 {
5486 raw_spin_lock_init(&cfs_b->lock);
5487 cfs_b->runtime = 0;
5488 cfs_b->quota = RUNTIME_INF;
5489 cfs_b->period = ns_to_ktime(default_cfs_period());
5490 cfs_b->burst = 0;
5491
5492 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5493 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5494 cfs_b->period_timer.function = sched_cfs_period_timer;
5495 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5496 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5497 cfs_b->slack_started = false;
5498 }
5499
5500 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5501 {
5502 cfs_rq->runtime_enabled = 0;
5503 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5504 }
5505
5506 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5507 {
5508 lockdep_assert_held(&cfs_b->lock);
5509
5510 if (cfs_b->period_active)
5511 return;
5512
5513 cfs_b->period_active = 1;
5514 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5515 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5516 }
5517
5518 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5519 {
5520
5521 if (!cfs_b->throttled_cfs_rq.next)
5522 return;
5523
5524 hrtimer_cancel(&cfs_b->period_timer);
5525 hrtimer_cancel(&cfs_b->slack_timer);
5526 }
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5537 {
5538 struct task_group *tg;
5539
5540 lockdep_assert_rq_held(rq);
5541
5542 rcu_read_lock();
5543 list_for_each_entry_rcu(tg, &task_groups, list) {
5544 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5545 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5546
5547 raw_spin_lock(&cfs_b->lock);
5548 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5549 raw_spin_unlock(&cfs_b->lock);
5550 }
5551 rcu_read_unlock();
5552 }
5553
5554
5555 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5556 {
5557 struct task_group *tg;
5558
5559 lockdep_assert_rq_held(rq);
5560
5561 rcu_read_lock();
5562 list_for_each_entry_rcu(tg, &task_groups, list) {
5563 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5564
5565 if (!cfs_rq->runtime_enabled)
5566 continue;
5567
5568
5569
5570
5571
5572 cfs_rq->runtime_remaining = 1;
5573
5574
5575
5576
5577 cfs_rq->runtime_enabled = 0;
5578
5579 if (cfs_rq_throttled(cfs_rq))
5580 unthrottle_cfs_rq(cfs_rq);
5581 }
5582 rcu_read_unlock();
5583 }
5584
5585 #else
5586
5587 static inline bool cfs_bandwidth_used(void)
5588 {
5589 return false;
5590 }
5591
5592 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5593 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5594 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5595 static inline void sync_throttle(struct task_group *tg, int cpu) {}
5596 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5597
5598 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5599 {
5600 return 0;
5601 }
5602
5603 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5604 {
5605 return 0;
5606 }
5607
5608 static inline int throttled_lb_pair(struct task_group *tg,
5609 int src_cpu, int dest_cpu)
5610 {
5611 return 0;
5612 }
5613
5614 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5615
5616 #ifdef CONFIG_FAIR_GROUP_SCHED
5617 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5618 #endif
5619
5620 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5621 {
5622 return NULL;
5623 }
5624 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5625 static inline void update_runtime_enabled(struct rq *rq) {}
5626 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5627
5628 #endif
5629
5630
5631
5632
5633
5634 #ifdef CONFIG_SCHED_HRTICK
5635 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5636 {
5637 struct sched_entity *se = &p->se;
5638 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5639
5640 SCHED_WARN_ON(task_rq(p) != rq);
5641
5642 if (rq->cfs.h_nr_running > 1) {
5643 u64 slice = sched_slice(cfs_rq, se);
5644 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5645 s64 delta = slice - ran;
5646
5647 if (delta < 0) {
5648 if (task_current(rq, p))
5649 resched_curr(rq);
5650 return;
5651 }
5652 hrtick_start(rq, delta);
5653 }
5654 }
5655
5656
5657
5658
5659
5660
5661 static void hrtick_update(struct rq *rq)
5662 {
5663 struct task_struct *curr = rq->curr;
5664
5665 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
5666 return;
5667
5668 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5669 hrtick_start_fair(rq, curr);
5670 }
5671 #else
5672 static inline void
5673 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5674 {
5675 }
5676
5677 static inline void hrtick_update(struct rq *rq)
5678 {
5679 }
5680 #endif
5681
5682 #ifdef CONFIG_SMP
5683 static inline bool cpu_overutilized(int cpu)
5684 {
5685 return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
5686 }
5687
5688 static inline void update_overutilized_status(struct rq *rq)
5689 {
5690 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5691 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5692 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5693 }
5694 }
5695 #else
5696 static inline void update_overutilized_status(struct rq *rq) { }
5697 #endif
5698
5699
5700 static int sched_idle_rq(struct rq *rq)
5701 {
5702 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5703 rq->nr_running);
5704 }
5705
5706
5707
5708
5709
5710
5711 static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
5712 {
5713 return cfs_rq->nr_running &&
5714 cfs_rq->nr_running == cfs_rq->idle_nr_running;
5715 }
5716
5717 #ifdef CONFIG_SMP
5718 static int sched_idle_cpu(int cpu)
5719 {
5720 return sched_idle_rq(cpu_rq(cpu));
5721 }
5722 #endif
5723
5724
5725
5726
5727
5728
5729 static void
5730 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5731 {
5732 struct cfs_rq *cfs_rq;
5733 struct sched_entity *se = &p->se;
5734 int idle_h_nr_running = task_has_idle_policy(p);
5735 int task_new = !(flags & ENQUEUE_WAKEUP);
5736
5737
5738
5739
5740
5741
5742
5743 util_est_enqueue(&rq->cfs, p);
5744
5745
5746
5747
5748
5749
5750 if (p->in_iowait)
5751 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5752
5753 for_each_sched_entity(se) {
5754 if (se->on_rq)
5755 break;
5756 cfs_rq = cfs_rq_of(se);
5757 enqueue_entity(cfs_rq, se, flags);
5758
5759 cfs_rq->h_nr_running++;
5760 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5761
5762 if (cfs_rq_is_idle(cfs_rq))
5763 idle_h_nr_running = 1;
5764
5765
5766 if (cfs_rq_throttled(cfs_rq))
5767 goto enqueue_throttle;
5768
5769 flags = ENQUEUE_WAKEUP;
5770 }
5771
5772 for_each_sched_entity(se) {
5773 cfs_rq = cfs_rq_of(se);
5774
5775 update_load_avg(cfs_rq, se, UPDATE_TG);
5776 se_update_runnable(se);
5777 update_cfs_group(se);
5778
5779 cfs_rq->h_nr_running++;
5780 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5781
5782 if (cfs_rq_is_idle(cfs_rq))
5783 idle_h_nr_running = 1;
5784
5785
5786 if (cfs_rq_throttled(cfs_rq))
5787 goto enqueue_throttle;
5788 }
5789
5790
5791 add_nr_running(rq, 1);
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807 if (!task_new)
5808 update_overutilized_status(rq);
5809
5810 enqueue_throttle:
5811 assert_list_leaf_cfs_rq(rq);
5812
5813 hrtick_update(rq);
5814 }
5815
5816 static void set_next_buddy(struct sched_entity *se);
5817
5818
5819
5820
5821
5822
5823 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5824 {
5825 struct cfs_rq *cfs_rq;
5826 struct sched_entity *se = &p->se;
5827 int task_sleep = flags & DEQUEUE_SLEEP;
5828 int idle_h_nr_running = task_has_idle_policy(p);
5829 bool was_sched_idle = sched_idle_rq(rq);
5830
5831 util_est_dequeue(&rq->cfs, p);
5832
5833 for_each_sched_entity(se) {
5834 cfs_rq = cfs_rq_of(se);
5835 dequeue_entity(cfs_rq, se, flags);
5836
5837 cfs_rq->h_nr_running--;
5838 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5839
5840 if (cfs_rq_is_idle(cfs_rq))
5841 idle_h_nr_running = 1;
5842
5843
5844 if (cfs_rq_throttled(cfs_rq))
5845 goto dequeue_throttle;
5846
5847
5848 if (cfs_rq->load.weight) {
5849
5850 se = parent_entity(se);
5851
5852
5853
5854
5855 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5856 set_next_buddy(se);
5857 break;
5858 }
5859 flags |= DEQUEUE_SLEEP;
5860 }
5861
5862 for_each_sched_entity(se) {
5863 cfs_rq = cfs_rq_of(se);
5864
5865 update_load_avg(cfs_rq, se, UPDATE_TG);
5866 se_update_runnable(se);
5867 update_cfs_group(se);
5868
5869 cfs_rq->h_nr_running--;
5870 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5871
5872 if (cfs_rq_is_idle(cfs_rq))
5873 idle_h_nr_running = 1;
5874
5875
5876 if (cfs_rq_throttled(cfs_rq))
5877 goto dequeue_throttle;
5878
5879 }
5880
5881
5882 sub_nr_running(rq, 1);
5883
5884
5885 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5886 rq->next_balance = jiffies;
5887
5888 dequeue_throttle:
5889 util_est_update(&rq->cfs, p, task_sleep);
5890 hrtick_update(rq);
5891 }
5892
5893 #ifdef CONFIG_SMP
5894
5895
5896 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5897 DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
5898
5899 #ifdef CONFIG_NO_HZ_COMMON
5900
5901 static struct {
5902 cpumask_var_t idle_cpus_mask;
5903 atomic_t nr_cpus;
5904 int has_blocked;
5905 int needs_update;
5906 unsigned long next_balance;
5907 unsigned long next_blocked;
5908 } nohz ____cacheline_aligned;
5909
5910 #endif
5911
5912 static unsigned long cpu_load(struct rq *rq)
5913 {
5914 return cfs_rq_load_avg(&rq->cfs);
5915 }
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5931 {
5932 struct cfs_rq *cfs_rq;
5933 unsigned int load;
5934
5935
5936 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5937 return cpu_load(rq);
5938
5939 cfs_rq = &rq->cfs;
5940 load = READ_ONCE(cfs_rq->avg.load_avg);
5941
5942
5943 lsub_positive(&load, task_h_load(p));
5944
5945 return load;
5946 }
5947
5948 static unsigned long cpu_runnable(struct rq *rq)
5949 {
5950 return cfs_rq_runnable_avg(&rq->cfs);
5951 }
5952
5953 static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5954 {
5955 struct cfs_rq *cfs_rq;
5956 unsigned int runnable;
5957
5958
5959 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5960 return cpu_runnable(rq);
5961
5962 cfs_rq = &rq->cfs;
5963 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5964
5965
5966 lsub_positive(&runnable, p->se.avg.runnable_avg);
5967
5968 return runnable;
5969 }
5970
5971 static unsigned long capacity_of(int cpu)
5972 {
5973 return cpu_rq(cpu)->cpu_capacity;
5974 }
5975
5976 static void record_wakee(struct task_struct *p)
5977 {
5978
5979
5980
5981
5982 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5983 current->wakee_flips >>= 1;
5984 current->wakee_flip_decay_ts = jiffies;
5985 }
5986
5987 if (current->last_wakee != p) {
5988 current->last_wakee = p;
5989 current->wakee_flips++;
5990 }
5991 }
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010 static int wake_wide(struct task_struct *p)
6011 {
6012 unsigned int master = current->wakee_flips;
6013 unsigned int slave = p->wakee_flips;
6014 int factor = __this_cpu_read(sd_llc_size);
6015
6016 if (master < slave)
6017 swap(master, slave);
6018 if (slave < factor || master < slave * factor)
6019 return 0;
6020 return 1;
6021 }
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035 static int
6036 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
6037 {
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
6051 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
6052
6053 if (sync && cpu_rq(this_cpu)->nr_running == 1)
6054 return this_cpu;
6055
6056 if (available_idle_cpu(prev_cpu))
6057 return prev_cpu;
6058
6059 return nr_cpumask_bits;
6060 }
6061
6062 static int
6063 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
6064 int this_cpu, int prev_cpu, int sync)
6065 {
6066 s64 this_eff_load, prev_eff_load;
6067 unsigned long task_load;
6068
6069 this_eff_load = cpu_load(cpu_rq(this_cpu));
6070
6071 if (sync) {
6072 unsigned long current_load = task_h_load(current);
6073
6074 if (current_load > this_eff_load)
6075 return this_cpu;
6076
6077 this_eff_load -= current_load;
6078 }
6079
6080 task_load = task_h_load(p);
6081
6082 this_eff_load += task_load;
6083 if (sched_feat(WA_BIAS))
6084 this_eff_load *= 100;
6085 this_eff_load *= capacity_of(prev_cpu);
6086
6087 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
6088 prev_eff_load -= task_load;
6089 if (sched_feat(WA_BIAS))
6090 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
6091 prev_eff_load *= capacity_of(this_cpu);
6092
6093
6094
6095
6096
6097
6098
6099 if (sync)
6100 prev_eff_load += 1;
6101
6102 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
6103 }
6104
6105 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6106 int this_cpu, int prev_cpu, int sync)
6107 {
6108 int target = nr_cpumask_bits;
6109
6110 if (sched_feat(WA_IDLE))
6111 target = wake_affine_idle(this_cpu, prev_cpu, sync);
6112
6113 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
6114 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
6115
6116 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
6117 if (target == nr_cpumask_bits)
6118 return prev_cpu;
6119
6120 schedstat_inc(sd->ttwu_move_affine);
6121 schedstat_inc(p->stats.nr_wakeups_affine);
6122 return target;
6123 }
6124
6125 static struct sched_group *
6126 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
6127
6128
6129
6130
6131 static int
6132 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6133 {
6134 unsigned long load, min_load = ULONG_MAX;
6135 unsigned int min_exit_latency = UINT_MAX;
6136 u64 latest_idle_timestamp = 0;
6137 int least_loaded_cpu = this_cpu;
6138 int shallowest_idle_cpu = -1;
6139 int i;
6140
6141
6142 if (group->group_weight == 1)
6143 return cpumask_first(sched_group_span(group));
6144
6145
6146 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
6147 struct rq *rq = cpu_rq(i);
6148
6149 if (!sched_core_cookie_match(rq, p))
6150 continue;
6151
6152 if (sched_idle_cpu(i))
6153 return i;
6154
6155 if (available_idle_cpu(i)) {
6156 struct cpuidle_state *idle = idle_get_state(rq);
6157 if (idle && idle->exit_latency < min_exit_latency) {
6158
6159
6160
6161
6162
6163 min_exit_latency = idle->exit_latency;
6164 latest_idle_timestamp = rq->idle_stamp;
6165 shallowest_idle_cpu = i;
6166 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6167 rq->idle_stamp > latest_idle_timestamp) {
6168
6169
6170
6171
6172
6173 latest_idle_timestamp = rq->idle_stamp;
6174 shallowest_idle_cpu = i;
6175 }
6176 } else if (shallowest_idle_cpu == -1) {
6177 load = cpu_load(cpu_rq(i));
6178 if (load < min_load) {
6179 min_load = load;
6180 least_loaded_cpu = i;
6181 }
6182 }
6183 }
6184
6185 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6186 }
6187
6188 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6189 int cpu, int prev_cpu, int sd_flag)
6190 {
6191 int new_cpu = cpu;
6192
6193 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6194 return prev_cpu;
6195
6196
6197
6198
6199
6200 if (!(sd_flag & SD_BALANCE_FORK))
6201 sync_entity_load_avg(&p->se);
6202
6203 while (sd) {
6204 struct sched_group *group;
6205 struct sched_domain *tmp;
6206 int weight;
6207
6208 if (!(sd->flags & sd_flag)) {
6209 sd = sd->child;
6210 continue;
6211 }
6212
6213 group = find_idlest_group(sd, p, cpu);
6214 if (!group) {
6215 sd = sd->child;
6216 continue;
6217 }
6218
6219 new_cpu = find_idlest_group_cpu(group, p, cpu);
6220 if (new_cpu == cpu) {
6221
6222 sd = sd->child;
6223 continue;
6224 }
6225
6226
6227 cpu = new_cpu;
6228 weight = sd->span_weight;
6229 sd = NULL;
6230 for_each_domain(cpu, tmp) {
6231 if (weight <= tmp->span_weight)
6232 break;
6233 if (tmp->flags & sd_flag)
6234 sd = tmp;
6235 }
6236 }
6237
6238 return new_cpu;
6239 }
6240
6241 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
6242 {
6243 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
6244 sched_cpu_cookie_match(cpu_rq(cpu), p))
6245 return cpu;
6246
6247 return -1;
6248 }
6249
6250 #ifdef CONFIG_SCHED_SMT
6251 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
6252 EXPORT_SYMBOL_GPL(sched_smt_present);
6253
6254 static inline void set_idle_cores(int cpu, int val)
6255 {
6256 struct sched_domain_shared *sds;
6257
6258 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6259 if (sds)
6260 WRITE_ONCE(sds->has_idle_cores, val);
6261 }
6262
6263 static inline bool test_idle_cores(int cpu, bool def)
6264 {
6265 struct sched_domain_shared *sds;
6266
6267 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6268 if (sds)
6269 return READ_ONCE(sds->has_idle_cores);
6270
6271 return def;
6272 }
6273
6274
6275
6276
6277
6278
6279
6280
6281 void __update_idle_core(struct rq *rq)
6282 {
6283 int core = cpu_of(rq);
6284 int cpu;
6285
6286 rcu_read_lock();
6287 if (test_idle_cores(core, true))
6288 goto unlock;
6289
6290 for_each_cpu(cpu, cpu_smt_mask(core)) {
6291 if (cpu == core)
6292 continue;
6293
6294 if (!available_idle_cpu(cpu))
6295 goto unlock;
6296 }
6297
6298 set_idle_cores(core, 1);
6299 unlock:
6300 rcu_read_unlock();
6301 }
6302
6303
6304
6305
6306
6307
6308 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6309 {
6310 bool idle = true;
6311 int cpu;
6312
6313 if (!static_branch_likely(&sched_smt_present))
6314 return __select_idle_cpu(core, p);
6315
6316 for_each_cpu(cpu, cpu_smt_mask(core)) {
6317 if (!available_idle_cpu(cpu)) {
6318 idle = false;
6319 if (*idle_cpu == -1) {
6320 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6321 *idle_cpu = cpu;
6322 break;
6323 }
6324 continue;
6325 }
6326 break;
6327 }
6328 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6329 *idle_cpu = cpu;
6330 }
6331
6332 if (idle)
6333 return core;
6334
6335 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
6336 return -1;
6337 }
6338
6339
6340
6341
6342 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6343 {
6344 int cpu;
6345
6346 for_each_cpu(cpu, cpu_smt_mask(target)) {
6347 if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6348 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
6349 continue;
6350 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6351 return cpu;
6352 }
6353
6354 return -1;
6355 }
6356
6357 #else
6358
6359 static inline void set_idle_cores(int cpu, int val)
6360 {
6361 }
6362
6363 static inline bool test_idle_cores(int cpu, bool def)
6364 {
6365 return def;
6366 }
6367
6368 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6369 {
6370 return __select_idle_cpu(core, p);
6371 }
6372
6373 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6374 {
6375 return -1;
6376 }
6377
6378 #endif
6379
6380
6381
6382
6383
6384
6385 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
6386 {
6387 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
6388 int i, cpu, idle_cpu = -1, nr = INT_MAX;
6389 struct sched_domain_shared *sd_share;
6390 struct rq *this_rq = this_rq();
6391 int this = smp_processor_id();
6392 struct sched_domain *this_sd;
6393 u64 time = 0;
6394
6395 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6396 if (!this_sd)
6397 return -1;
6398
6399 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6400
6401 if (sched_feat(SIS_PROP) && !has_idle_core) {
6402 u64 avg_cost, avg_idle, span_avg;
6403 unsigned long now = jiffies;
6404
6405
6406
6407
6408
6409
6410 if (unlikely(this_rq->wake_stamp < now)) {
6411 while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
6412 this_rq->wake_stamp++;
6413 this_rq->wake_avg_idle >>= 1;
6414 }
6415 }
6416
6417 avg_idle = this_rq->wake_avg_idle;
6418 avg_cost = this_sd->avg_scan_cost + 1;
6419
6420 span_avg = sd->span_weight * avg_idle;
6421 if (span_avg > 4*avg_cost)
6422 nr = div_u64(span_avg, avg_cost);
6423 else
6424 nr = 4;
6425
6426 time = cpu_clock(this);
6427 }
6428
6429 if (sched_feat(SIS_UTIL)) {
6430 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
6431 if (sd_share) {
6432
6433 nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
6434
6435 if (nr == 1)
6436 return -1;
6437 }
6438 }
6439
6440 for_each_cpu_wrap(cpu, cpus, target + 1) {
6441 if (has_idle_core) {
6442 i = select_idle_core(p, cpu, cpus, &idle_cpu);
6443 if ((unsigned int)i < nr_cpumask_bits)
6444 return i;
6445
6446 } else {
6447 if (!--nr)
6448 return -1;
6449 idle_cpu = __select_idle_cpu(cpu, p);
6450 if ((unsigned int)idle_cpu < nr_cpumask_bits)
6451 break;
6452 }
6453 }
6454
6455 if (has_idle_core)
6456 set_idle_cores(target, false);
6457
6458 if (sched_feat(SIS_PROP) && !has_idle_core) {
6459 time = cpu_clock(this) - time;
6460
6461
6462
6463
6464
6465 this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
6466
6467 update_avg(&this_sd->avg_scan_cost, time);
6468 }
6469
6470 return idle_cpu;
6471 }
6472
6473
6474
6475
6476
6477
6478 static int
6479 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6480 {
6481 unsigned long task_util, best_cap = 0;
6482 int cpu, best_cpu = -1;
6483 struct cpumask *cpus;
6484
6485 cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
6486 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6487
6488 task_util = uclamp_task_util(p);
6489
6490 for_each_cpu_wrap(cpu, cpus, target) {
6491 unsigned long cpu_cap = capacity_of(cpu);
6492
6493 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6494 continue;
6495 if (fits_capacity(task_util, cpu_cap))
6496 return cpu;
6497
6498 if (cpu_cap > best_cap) {
6499 best_cap = cpu_cap;
6500 best_cpu = cpu;
6501 }
6502 }
6503
6504 return best_cpu;
6505 }
6506
6507 static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
6508 {
6509 if (static_branch_unlikely(&sched_asym_cpucapacity))
6510 return fits_capacity(task_util, capacity_of(cpu));
6511
6512 return true;
6513 }
6514
6515
6516
6517
6518 static int select_idle_sibling(struct task_struct *p, int prev, int target)
6519 {
6520 bool has_idle_core = false;
6521 struct sched_domain *sd;
6522 unsigned long task_util;
6523 int i, recent_used_cpu;
6524
6525
6526
6527
6528
6529 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6530 sync_entity_load_avg(&p->se);
6531 task_util = uclamp_task_util(p);
6532 }
6533
6534
6535
6536
6537 lockdep_assert_irqs_disabled();
6538
6539 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6540 asym_fits_capacity(task_util, target))
6541 return target;
6542
6543
6544
6545
6546 if (prev != target && cpus_share_cache(prev, target) &&
6547 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6548 asym_fits_capacity(task_util, prev))
6549 return prev;
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559 if (is_per_cpu_kthread(current) &&
6560 in_task() &&
6561 prev == smp_processor_id() &&
6562 this_rq()->nr_running <= 1 &&
6563 asym_fits_capacity(task_util, prev)) {
6564 return prev;
6565 }
6566
6567
6568 recent_used_cpu = p->recent_used_cpu;
6569 p->recent_used_cpu = prev;
6570 if (recent_used_cpu != prev &&
6571 recent_used_cpu != target &&
6572 cpus_share_cache(recent_used_cpu, target) &&
6573 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6574 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6575 asym_fits_capacity(task_util, recent_used_cpu)) {
6576 return recent_used_cpu;
6577 }
6578
6579
6580
6581
6582
6583 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6584 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6585
6586
6587
6588
6589
6590
6591
6592
6593 if (sd) {
6594 i = select_idle_capacity(p, sd, target);
6595 return ((unsigned)i < nr_cpumask_bits) ? i : target;
6596 }
6597 }
6598
6599 sd = rcu_dereference(per_cpu(sd_llc, target));
6600 if (!sd)
6601 return target;
6602
6603 if (sched_smt_active()) {
6604 has_idle_core = test_idle_cores(target, false);
6605
6606 if (!has_idle_core && cpus_share_cache(prev, target)) {
6607 i = select_idle_smt(p, sd, prev);
6608 if ((unsigned int)i < nr_cpumask_bits)
6609 return i;
6610 }
6611 }
6612
6613 i = select_idle_cpu(p, sd, has_idle_core, target);
6614 if ((unsigned)i < nr_cpumask_bits)
6615 return i;
6616
6617 return target;
6618 }
6619
6620
6621
6622
6623
6624 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6625 {
6626 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6627 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
6628
6629
6630
6631
6632
6633
6634
6635 if (task_cpu(p) == cpu && dst_cpu != cpu)
6636 lsub_positive(&util, task_util(p));
6637 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6638 util += task_util(p);
6639
6640 if (sched_feat(UTIL_EST)) {
6641 unsigned long util_est;
6642
6643 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671 if (dst_cpu == cpu)
6672 util_est += _task_util_est(p);
6673 else if (unlikely(task_on_rq_queued(p) || current == p))
6674 lsub_positive(&util_est, _task_util_est(p));
6675
6676 util = max(util, util_est);
6677 }
6678
6679 return min(util, capacity_orig_of(cpu));
6680 }
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6696 {
6697
6698 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6699 return cpu_util_cfs(cpu);
6700
6701 return cpu_util_next(cpu, p, -1);
6702 }
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713 struct energy_env {
6714 unsigned long task_busy_time;
6715 unsigned long pd_busy_time;
6716 unsigned long cpu_cap;
6717 unsigned long pd_cap;
6718 };
6719
6720
6721
6722
6723
6724
6725
6726 static inline void eenv_task_busy_time(struct energy_env *eenv,
6727 struct task_struct *p, int prev_cpu)
6728 {
6729 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
6730 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
6731
6732 if (unlikely(irq >= max_cap))
6733 busy_time = max_cap;
6734 else
6735 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
6736
6737 eenv->task_busy_time = busy_time;
6738 }
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761 static inline void eenv_pd_busy_time(struct energy_env *eenv,
6762 struct cpumask *pd_cpus,
6763 struct task_struct *p)
6764 {
6765 unsigned long busy_time = 0;
6766 int cpu;
6767
6768 for_each_cpu(cpu, pd_cpus) {
6769 unsigned long util = cpu_util_next(cpu, p, -1);
6770
6771 busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
6772 }
6773
6774 eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
6775 }
6776
6777
6778
6779
6780
6781
6782
6783
6784 static inline unsigned long
6785 eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
6786 struct task_struct *p, int dst_cpu)
6787 {
6788 unsigned long max_util = 0;
6789 int cpu;
6790
6791 for_each_cpu(cpu, pd_cpus) {
6792 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
6793 unsigned long util = cpu_util_next(cpu, p, dst_cpu);
6794 unsigned long cpu_util;
6795
6796
6797
6798
6799
6800
6801
6802
6803 cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
6804 max_util = max(max_util, cpu_util);
6805 }
6806
6807 return min(max_util, eenv->cpu_cap);
6808 }
6809
6810
6811
6812
6813
6814
6815 static inline unsigned long
6816 compute_energy(struct energy_env *eenv, struct perf_domain *pd,
6817 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
6818 {
6819 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
6820 unsigned long busy_time = eenv->pd_busy_time;
6821
6822 if (dst_cpu >= 0)
6823 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
6824
6825 return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
6826 }
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6868 {
6869 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
6870 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6871 struct root_domain *rd = this_rq()->rd;
6872 int cpu, best_energy_cpu, target = -1;
6873 struct sched_domain *sd;
6874 struct perf_domain *pd;
6875 struct energy_env eenv;
6876
6877 rcu_read_lock();
6878 pd = rcu_dereference(rd->pd);
6879 if (!pd || READ_ONCE(rd->overutilized))
6880 goto unlock;
6881
6882
6883
6884
6885
6886 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6887 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6888 sd = sd->parent;
6889 if (!sd)
6890 goto unlock;
6891
6892 target = prev_cpu;
6893
6894 sync_entity_load_avg(&p->se);
6895 if (!task_util_est(p))
6896 goto unlock;
6897
6898 eenv_task_busy_time(&eenv, p, prev_cpu);
6899
6900 for (; pd; pd = pd->next) {
6901 unsigned long cpu_cap, cpu_thermal_cap, util;
6902 unsigned long cur_delta, max_spare_cap = 0;
6903 bool compute_prev_delta = false;
6904 int max_spare_cap_cpu = -1;
6905 unsigned long base_energy;
6906
6907 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
6908
6909 if (cpumask_empty(cpus))
6910 continue;
6911
6912
6913 cpu = cpumask_first(cpus);
6914 cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
6915 cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
6916
6917 eenv.cpu_cap = cpu_thermal_cap;
6918 eenv.pd_cap = 0;
6919
6920 for_each_cpu(cpu, cpus) {
6921 eenv.pd_cap += cpu_thermal_cap;
6922
6923 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
6924 continue;
6925
6926 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6927 continue;
6928
6929 util = cpu_util_next(cpu, p, cpu);
6930 cpu_cap = capacity_of(cpu);
6931
6932
6933
6934
6935
6936
6937
6938
6939 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6940 if (!fits_capacity(util, cpu_cap))
6941 continue;
6942
6943 lsub_positive(&cpu_cap, util);
6944
6945 if (cpu == prev_cpu) {
6946
6947 compute_prev_delta = true;
6948 } else if (cpu_cap > max_spare_cap) {
6949
6950
6951
6952
6953 max_spare_cap = cpu_cap;
6954 max_spare_cap_cpu = cpu;
6955 }
6956 }
6957
6958 if (max_spare_cap_cpu < 0 && !compute_prev_delta)
6959 continue;
6960
6961 eenv_pd_busy_time(&eenv, cpus, p);
6962
6963 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
6964
6965
6966 if (compute_prev_delta) {
6967 prev_delta = compute_energy(&eenv, pd, cpus, p,
6968 prev_cpu);
6969
6970 if (prev_delta < base_energy)
6971 goto unlock;
6972 prev_delta -= base_energy;
6973 best_delta = min(best_delta, prev_delta);
6974 }
6975
6976
6977 if (max_spare_cap_cpu >= 0) {
6978 cur_delta = compute_energy(&eenv, pd, cpus, p,
6979 max_spare_cap_cpu);
6980
6981 if (cur_delta < base_energy)
6982 goto unlock;
6983 cur_delta -= base_energy;
6984 if (cur_delta < best_delta) {
6985 best_delta = cur_delta;
6986 best_energy_cpu = max_spare_cap_cpu;
6987 }
6988 }
6989 }
6990 rcu_read_unlock();
6991
6992 if (best_delta < prev_delta)
6993 target = best_energy_cpu;
6994
6995 return target;
6996
6997 unlock:
6998 rcu_read_unlock();
6999
7000 return target;
7001 }
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013 static int
7014 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
7015 {
7016 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
7017 struct sched_domain *tmp, *sd = NULL;
7018 int cpu = smp_processor_id();
7019 int new_cpu = prev_cpu;
7020 int want_affine = 0;
7021
7022 int sd_flag = wake_flags & 0xF;
7023
7024
7025
7026
7027 lockdep_assert_held(&p->pi_lock);
7028 if (wake_flags & WF_TTWU) {
7029 record_wakee(p);
7030
7031 if (sched_energy_enabled()) {
7032 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
7033 if (new_cpu >= 0)
7034 return new_cpu;
7035 new_cpu = prev_cpu;
7036 }
7037
7038 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
7039 }
7040
7041 rcu_read_lock();
7042 for_each_domain(cpu, tmp) {
7043
7044
7045
7046
7047 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7048 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
7049 if (cpu != prev_cpu)
7050 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
7051
7052 sd = NULL;
7053 break;
7054 }
7055
7056
7057
7058
7059
7060
7061 if (tmp->flags & sd_flag)
7062 sd = tmp;
7063 else if (!want_affine)
7064 break;
7065 }
7066
7067 if (unlikely(sd)) {
7068
7069 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
7070 } else if (wake_flags & WF_TTWU) {
7071
7072 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
7073 }
7074 rcu_read_unlock();
7075
7076 return new_cpu;
7077 }
7078
7079 static void detach_entity_cfs_rq(struct sched_entity *se);
7080
7081
7082
7083
7084
7085
7086 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
7087 {
7088 struct sched_entity *se = &p->se;
7089
7090
7091
7092
7093
7094
7095
7096 if (READ_ONCE(p->__state) == TASK_WAKING) {
7097 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7098
7099 se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
7100 }
7101
7102 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
7103
7104
7105
7106
7107 lockdep_assert_rq_held(task_rq(p));
7108 detach_entity_cfs_rq(se);
7109
7110 } else {
7111 remove_entity_load_avg(se);
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123 migrate_se_pelt_lag(se);
7124 }
7125
7126
7127 se->avg.last_update_time = 0;
7128
7129
7130 se->exec_start = 0;
7131
7132 update_scan_period(p, new_cpu);
7133 }
7134
7135 static void task_dead_fair(struct task_struct *p)
7136 {
7137 remove_entity_load_avg(&p->se);
7138 }
7139
7140 static int
7141 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7142 {
7143 if (rq->nr_running)
7144 return 1;
7145
7146 return newidle_balance(rq, rf) != 0;
7147 }
7148 #endif
7149
7150 static unsigned long wakeup_gran(struct sched_entity *se)
7151 {
7152 unsigned long gran = sysctl_sched_wakeup_granularity;
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167 return calc_delta_fair(gran, se);
7168 }
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184 static int
7185 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7186 {
7187 s64 gran, vdiff = curr->vruntime - se->vruntime;
7188
7189 if (vdiff <= 0)
7190 return -1;
7191
7192 gran = wakeup_gran(se);
7193 if (vdiff > gran)
7194 return 1;
7195
7196 return 0;
7197 }
7198
7199 static void set_last_buddy(struct sched_entity *se)
7200 {
7201 for_each_sched_entity(se) {
7202 if (SCHED_WARN_ON(!se->on_rq))
7203 return;
7204 if (se_is_idle(se))
7205 return;
7206 cfs_rq_of(se)->last = se;
7207 }
7208 }
7209
7210 static void set_next_buddy(struct sched_entity *se)
7211 {
7212 for_each_sched_entity(se) {
7213 if (SCHED_WARN_ON(!se->on_rq))
7214 return;
7215 if (se_is_idle(se))
7216 return;
7217 cfs_rq_of(se)->next = se;
7218 }
7219 }
7220
7221 static void set_skip_buddy(struct sched_entity *se)
7222 {
7223 for_each_sched_entity(se)
7224 cfs_rq_of(se)->skip = se;
7225 }
7226
7227
7228
7229
7230 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
7231 {
7232 struct task_struct *curr = rq->curr;
7233 struct sched_entity *se = &curr->se, *pse = &p->se;
7234 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7235 int scale = cfs_rq->nr_running >= sched_nr_latency;
7236 int next_buddy_marked = 0;
7237 int cse_is_idle, pse_is_idle;
7238
7239 if (unlikely(se == pse))
7240 return;
7241
7242
7243
7244
7245
7246
7247
7248 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7249 return;
7250
7251 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
7252 set_next_buddy(pse);
7253 next_buddy_marked = 1;
7254 }
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266 if (test_tsk_need_resched(curr))
7267 return;
7268
7269
7270 if (unlikely(task_has_idle_policy(curr)) &&
7271 likely(!task_has_idle_policy(p)))
7272 goto preempt;
7273
7274
7275
7276
7277
7278 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
7279 return;
7280
7281 find_matching_se(&se, &pse);
7282 BUG_ON(!pse);
7283
7284 cse_is_idle = se_is_idle(se);
7285 pse_is_idle = se_is_idle(pse);
7286
7287
7288
7289
7290
7291 if (cse_is_idle && !pse_is_idle)
7292 goto preempt;
7293 if (cse_is_idle != pse_is_idle)
7294 return;
7295
7296 update_curr(cfs_rq_of(se));
7297 if (wakeup_preempt_entity(se, pse) == 1) {
7298
7299
7300
7301
7302 if (!next_buddy_marked)
7303 set_next_buddy(pse);
7304 goto preempt;
7305 }
7306
7307 return;
7308
7309 preempt:
7310 resched_curr(rq);
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320 if (unlikely(!se->on_rq || curr == rq->idle))
7321 return;
7322
7323 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7324 set_last_buddy(se);
7325 }
7326
7327 #ifdef CONFIG_SMP
7328 static struct task_struct *pick_task_fair(struct rq *rq)
7329 {
7330 struct sched_entity *se;
7331 struct cfs_rq *cfs_rq;
7332
7333 again:
7334 cfs_rq = &rq->cfs;
7335 if (!cfs_rq->nr_running)
7336 return NULL;
7337
7338 do {
7339 struct sched_entity *curr = cfs_rq->curr;
7340
7341
7342 if (curr) {
7343 if (curr->on_rq)
7344 update_curr(cfs_rq);
7345 else
7346 curr = NULL;
7347
7348 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7349 goto again;
7350 }
7351
7352 se = pick_next_entity(cfs_rq, curr);
7353 cfs_rq = group_cfs_rq(se);
7354 } while (cfs_rq);
7355
7356 return task_of(se);
7357 }
7358 #endif
7359
7360 struct task_struct *
7361 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7362 {
7363 struct cfs_rq *cfs_rq = &rq->cfs;
7364 struct sched_entity *se;
7365 struct task_struct *p;
7366 int new_tasks;
7367
7368 again:
7369 if (!sched_fair_runnable(rq))
7370 goto idle;
7371
7372 #ifdef CONFIG_FAIR_GROUP_SCHED
7373 if (!prev || prev->sched_class != &fair_sched_class)
7374 goto simple;
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384 do {
7385 struct sched_entity *curr = cfs_rq->curr;
7386
7387
7388
7389
7390
7391
7392
7393 if (curr) {
7394 if (curr->on_rq)
7395 update_curr(cfs_rq);
7396 else
7397 curr = NULL;
7398
7399
7400
7401
7402
7403
7404
7405 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7406 cfs_rq = &rq->cfs;
7407
7408 if (!cfs_rq->nr_running)
7409 goto idle;
7410
7411 goto simple;
7412 }
7413 }
7414
7415 se = pick_next_entity(cfs_rq, curr);
7416 cfs_rq = group_cfs_rq(se);
7417 } while (cfs_rq);
7418
7419 p = task_of(se);
7420
7421
7422
7423
7424
7425
7426 if (prev != p) {
7427 struct sched_entity *pse = &prev->se;
7428
7429 while (!(cfs_rq = is_same_group(se, pse))) {
7430 int se_depth = se->depth;
7431 int pse_depth = pse->depth;
7432
7433 if (se_depth <= pse_depth) {
7434 put_prev_entity(cfs_rq_of(pse), pse);
7435 pse = parent_entity(pse);
7436 }
7437 if (se_depth >= pse_depth) {
7438 set_next_entity(cfs_rq_of(se), se);
7439 se = parent_entity(se);
7440 }
7441 }
7442
7443 put_prev_entity(cfs_rq, pse);
7444 set_next_entity(cfs_rq, se);
7445 }
7446
7447 goto done;
7448 simple:
7449 #endif
7450 if (prev)
7451 put_prev_task(rq, prev);
7452
7453 do {
7454 se = pick_next_entity(cfs_rq, NULL);
7455 set_next_entity(cfs_rq, se);
7456 cfs_rq = group_cfs_rq(se);
7457 } while (cfs_rq);
7458
7459 p = task_of(se);
7460
7461 done: __maybe_unused;
7462 #ifdef CONFIG_SMP
7463
7464
7465
7466
7467
7468 list_move(&p->se.group_node, &rq->cfs_tasks);
7469 #endif
7470
7471 if (hrtick_enabled_fair(rq))
7472 hrtick_start_fair(rq, p);
7473
7474 update_misfit_status(p, rq);
7475
7476 return p;
7477
7478 idle:
7479 if (!rf)
7480 return NULL;
7481
7482 new_tasks = newidle_balance(rq, rf);
7483
7484
7485
7486
7487
7488
7489 if (new_tasks < 0)
7490 return RETRY_TASK;
7491
7492 if (new_tasks > 0)
7493 goto again;
7494
7495
7496
7497
7498
7499 update_idle_rq_clock_pelt(rq);
7500
7501 return NULL;
7502 }
7503
7504 static struct task_struct *__pick_next_task_fair(struct rq *rq)
7505 {
7506 return pick_next_task_fair(rq, NULL, NULL);
7507 }
7508
7509
7510
7511
7512 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7513 {
7514 struct sched_entity *se = &prev->se;
7515 struct cfs_rq *cfs_rq;
7516
7517 for_each_sched_entity(se) {
7518 cfs_rq = cfs_rq_of(se);
7519 put_prev_entity(cfs_rq, se);
7520 }
7521 }
7522
7523
7524
7525
7526
7527
7528 static void yield_task_fair(struct rq *rq)
7529 {
7530 struct task_struct *curr = rq->curr;
7531 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7532 struct sched_entity *se = &curr->se;
7533
7534
7535
7536
7537 if (unlikely(rq->nr_running == 1))
7538 return;
7539
7540 clear_buddies(cfs_rq, se);
7541
7542 if (curr->policy != SCHED_BATCH) {
7543 update_rq_clock(rq);
7544
7545
7546
7547 update_curr(cfs_rq);
7548
7549
7550
7551
7552
7553 rq_clock_skip_update(rq);
7554 }
7555
7556 set_skip_buddy(se);
7557 }
7558
7559 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
7560 {
7561 struct sched_entity *se = &p->se;
7562
7563
7564 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7565 return false;
7566
7567
7568 set_next_buddy(se);
7569
7570 yield_task_fair(rq);
7571
7572 return true;
7573 }
7574
7575 #ifdef CONFIG_SMP
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7695
7696 enum fbq_type { regular, remote, all };
7697
7698
7699
7700
7701
7702
7703
7704
7705 enum group_type {
7706
7707 group_has_spare = 0,
7708
7709
7710
7711
7712 group_fully_busy,
7713
7714
7715
7716
7717 group_misfit_task,
7718
7719
7720
7721
7722
7723 group_asym_packing,
7724
7725
7726
7727
7728 group_imbalanced,
7729
7730
7731
7732
7733 group_overloaded
7734 };
7735
7736 enum migration_type {
7737 migrate_load = 0,
7738 migrate_util,
7739 migrate_task,
7740 migrate_misfit
7741 };
7742
7743 #define LBF_ALL_PINNED 0x01
7744 #define LBF_NEED_BREAK 0x02
7745 #define LBF_DST_PINNED 0x04
7746 #define LBF_SOME_PINNED 0x08
7747 #define LBF_ACTIVE_LB 0x10
7748
7749 struct lb_env {
7750 struct sched_domain *sd;
7751
7752 struct rq *src_rq;
7753 int src_cpu;
7754
7755 int dst_cpu;
7756 struct rq *dst_rq;
7757
7758 struct cpumask *dst_grpmask;
7759 int new_dst_cpu;
7760 enum cpu_idle_type idle;
7761 long imbalance;
7762
7763 struct cpumask *cpus;
7764
7765 unsigned int flags;
7766
7767 unsigned int loop;
7768 unsigned int loop_break;
7769 unsigned int loop_max;
7770
7771 enum fbq_type fbq_type;
7772 enum migration_type migration_type;
7773 struct list_head tasks;
7774 };
7775
7776
7777
7778
7779 static int task_hot(struct task_struct *p, struct lb_env *env)
7780 {
7781 s64 delta;
7782
7783 lockdep_assert_rq_held(env->src_rq);
7784
7785 if (p->sched_class != &fair_sched_class)
7786 return 0;
7787
7788 if (unlikely(task_has_idle_policy(p)))
7789 return 0;
7790
7791
7792 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7793 return 0;
7794
7795
7796
7797
7798 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7799 (&p->se == cfs_rq_of(&p->se)->next ||
7800 &p->se == cfs_rq_of(&p->se)->last))
7801 return 1;
7802
7803 if (sysctl_sched_migration_cost == -1)
7804 return 1;
7805
7806
7807
7808
7809
7810 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
7811 return 1;
7812
7813 if (sysctl_sched_migration_cost == 0)
7814 return 0;
7815
7816 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7817
7818 return delta < (s64)sysctl_sched_migration_cost;
7819 }
7820
7821 #ifdef CONFIG_NUMA_BALANCING
7822
7823
7824
7825
7826
7827 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7828 {
7829 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7830 unsigned long src_weight, dst_weight;
7831 int src_nid, dst_nid, dist;
7832
7833 if (!static_branch_likely(&sched_numa_balancing))
7834 return -1;
7835
7836 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7837 return -1;
7838
7839 src_nid = cpu_to_node(env->src_cpu);
7840 dst_nid = cpu_to_node(env->dst_cpu);
7841
7842 if (src_nid == dst_nid)
7843 return -1;
7844
7845
7846 if (src_nid == p->numa_preferred_nid) {
7847 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7848 return 1;
7849 else
7850 return -1;
7851 }
7852
7853
7854 if (dst_nid == p->numa_preferred_nid)
7855 return 0;
7856
7857
7858 if (env->idle == CPU_IDLE)
7859 return -1;
7860
7861 dist = node_distance(src_nid, dst_nid);
7862 if (numa_group) {
7863 src_weight = group_weight(p, src_nid, dist);
7864 dst_weight = group_weight(p, dst_nid, dist);
7865 } else {
7866 src_weight = task_weight(p, src_nid, dist);
7867 dst_weight = task_weight(p, dst_nid, dist);
7868 }
7869
7870 return dst_weight < src_weight;
7871 }
7872
7873 #else
7874 static inline int migrate_degrades_locality(struct task_struct *p,
7875 struct lb_env *env)
7876 {
7877 return -1;
7878 }
7879 #endif
7880
7881
7882
7883
7884 static
7885 int can_migrate_task(struct task_struct *p, struct lb_env *env)
7886 {
7887 int tsk_cache_hot;
7888
7889 lockdep_assert_rq_held(env->src_rq);
7890
7891
7892
7893
7894
7895
7896
7897
7898 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7899 return 0;
7900
7901
7902 if (kthread_is_per_cpu(p))
7903 return 0;
7904
7905 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7906 int cpu;
7907
7908 schedstat_inc(p->stats.nr_failed_migrations_affine);
7909
7910 env->flags |= LBF_SOME_PINNED;
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922 if (env->idle == CPU_NEWLY_IDLE ||
7923 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
7924 return 0;
7925
7926
7927 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7928 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7929 env->flags |= LBF_DST_PINNED;
7930 env->new_dst_cpu = cpu;
7931 break;
7932 }
7933 }
7934
7935 return 0;
7936 }
7937
7938
7939 env->flags &= ~LBF_ALL_PINNED;
7940
7941 if (task_running(env->src_rq, p)) {
7942 schedstat_inc(p->stats.nr_failed_migrations_running);
7943 return 0;
7944 }
7945
7946
7947
7948
7949
7950
7951
7952
7953 if (env->flags & LBF_ACTIVE_LB)
7954 return 1;
7955
7956 tsk_cache_hot = migrate_degrades_locality(p, env);
7957 if (tsk_cache_hot == -1)
7958 tsk_cache_hot = task_hot(p, env);
7959
7960 if (tsk_cache_hot <= 0 ||
7961 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7962 if (tsk_cache_hot == 1) {
7963 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7964 schedstat_inc(p->stats.nr_forced_migrations);
7965 }
7966 return 1;
7967 }
7968
7969 schedstat_inc(p->stats.nr_failed_migrations_hot);
7970 return 0;
7971 }
7972
7973
7974
7975
7976 static void detach_task(struct task_struct *p, struct lb_env *env)
7977 {
7978 lockdep_assert_rq_held(env->src_rq);
7979
7980 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7981 set_task_cpu(p, env->dst_cpu);
7982 }
7983
7984
7985
7986
7987
7988
7989
7990 static struct task_struct *detach_one_task(struct lb_env *env)
7991 {
7992 struct task_struct *p;
7993
7994 lockdep_assert_rq_held(env->src_rq);
7995
7996 list_for_each_entry_reverse(p,
7997 &env->src_rq->cfs_tasks, se.group_node) {
7998 if (!can_migrate_task(p, env))
7999 continue;
8000
8001 detach_task(p, env);
8002
8003
8004
8005
8006
8007
8008
8009 schedstat_inc(env->sd->lb_gained[env->idle]);
8010 return p;
8011 }
8012 return NULL;
8013 }
8014
8015 static const unsigned int sched_nr_migrate_break = 32;
8016
8017
8018
8019
8020
8021
8022
8023 static int detach_tasks(struct lb_env *env)
8024 {
8025 struct list_head *tasks = &env->src_rq->cfs_tasks;
8026 unsigned long util, load;
8027 struct task_struct *p;
8028 int detached = 0;
8029
8030 lockdep_assert_rq_held(env->src_rq);
8031
8032
8033
8034
8035
8036 if (env->src_rq->nr_running <= 1) {
8037 env->flags &= ~LBF_ALL_PINNED;
8038 return 0;
8039 }
8040
8041 if (env->imbalance <= 0)
8042 return 0;
8043
8044 while (!list_empty(tasks)) {
8045
8046
8047
8048
8049 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8050 break;
8051
8052 p = list_last_entry(tasks, struct task_struct, se.group_node);
8053
8054 env->loop++;
8055
8056 if (env->loop > env->loop_max)
8057 break;
8058
8059
8060 if (env->loop > env->loop_break) {
8061 env->loop_break += sched_nr_migrate_break;
8062 env->flags |= LBF_NEED_BREAK;
8063 break;
8064 }
8065
8066 if (!can_migrate_task(p, env))
8067 goto next;
8068
8069 switch (env->migration_type) {
8070 case migrate_load:
8071
8072
8073
8074
8075
8076
8077
8078 load = max_t(unsigned long, task_h_load(p), 1);
8079
8080 if (sched_feat(LB_MIN) &&
8081 load < 16 && !env->sd->nr_balance_failed)
8082 goto next;
8083
8084
8085
8086
8087
8088
8089
8090 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
8091 goto next;
8092
8093 env->imbalance -= load;
8094 break;
8095
8096 case migrate_util:
8097 util = task_util_est(p);
8098
8099 if (util > env->imbalance)
8100 goto next;
8101
8102 env->imbalance -= util;
8103 break;
8104
8105 case migrate_task:
8106 env->imbalance--;
8107 break;
8108
8109 case migrate_misfit:
8110
8111 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
8112 goto next;
8113
8114 env->imbalance = 0;
8115 break;
8116 }
8117
8118 detach_task(p, env);
8119 list_add(&p->se.group_node, &env->tasks);
8120
8121 detached++;
8122
8123 #ifdef CONFIG_PREEMPTION
8124
8125
8126
8127
8128
8129 if (env->idle == CPU_NEWLY_IDLE)
8130 break;
8131 #endif
8132
8133
8134
8135
8136
8137 if (env->imbalance <= 0)
8138 break;
8139
8140 continue;
8141 next:
8142 list_move(&p->se.group_node, tasks);
8143 }
8144
8145
8146
8147
8148
8149
8150 schedstat_add(env->sd->lb_gained[env->idle], detached);
8151
8152 return detached;
8153 }
8154
8155
8156
8157
8158 static void attach_task(struct rq *rq, struct task_struct *p)
8159 {
8160 lockdep_assert_rq_held(rq);
8161
8162 BUG_ON(task_rq(p) != rq);
8163 activate_task(rq, p, ENQUEUE_NOCLOCK);
8164 check_preempt_curr(rq, p, 0);
8165 }
8166
8167
8168
8169
8170
8171 static void attach_one_task(struct rq *rq, struct task_struct *p)
8172 {
8173 struct rq_flags rf;
8174
8175 rq_lock(rq, &rf);
8176 update_rq_clock(rq);
8177 attach_task(rq, p);
8178 rq_unlock(rq, &rf);
8179 }
8180
8181
8182
8183
8184
8185 static void attach_tasks(struct lb_env *env)
8186 {
8187 struct list_head *tasks = &env->tasks;
8188 struct task_struct *p;
8189 struct rq_flags rf;
8190
8191 rq_lock(env->dst_rq, &rf);
8192 update_rq_clock(env->dst_rq);
8193
8194 while (!list_empty(tasks)) {
8195 p = list_first_entry(tasks, struct task_struct, se.group_node);
8196 list_del_init(&p->se.group_node);
8197
8198 attach_task(env->dst_rq, p);
8199 }
8200
8201 rq_unlock(env->dst_rq, &rf);
8202 }
8203
8204 #ifdef CONFIG_NO_HZ_COMMON
8205 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
8206 {
8207 if (cfs_rq->avg.load_avg)
8208 return true;
8209
8210 if (cfs_rq->avg.util_avg)
8211 return true;
8212
8213 return false;
8214 }
8215
8216 static inline bool others_have_blocked(struct rq *rq)
8217 {
8218 if (READ_ONCE(rq->avg_rt.util_avg))
8219 return true;
8220
8221 if (READ_ONCE(rq->avg_dl.util_avg))
8222 return true;
8223
8224 if (thermal_load_avg(rq))
8225 return true;
8226
8227 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8228 if (READ_ONCE(rq->avg_irq.util_avg))
8229 return true;
8230 #endif
8231
8232 return false;
8233 }
8234
8235 static inline void update_blocked_load_tick(struct rq *rq)
8236 {
8237 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
8238 }
8239
8240 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8241 {
8242 if (!has_blocked)
8243 rq->has_blocked_load = 0;
8244 }
8245 #else
8246 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8247 static inline bool others_have_blocked(struct rq *rq) { return false; }
8248 static inline void update_blocked_load_tick(struct rq *rq) {}
8249 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8250 #endif
8251
8252 static bool __update_blocked_others(struct rq *rq, bool *done)
8253 {
8254 const struct sched_class *curr_class;
8255 u64 now = rq_clock_pelt(rq);
8256 unsigned long thermal_pressure;
8257 bool decayed;
8258
8259
8260
8261
8262
8263 curr_class = rq->curr->sched_class;
8264
8265 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8266
8267 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8268 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8269 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8270 update_irq_load_avg(rq, 0);
8271
8272 if (others_have_blocked(rq))
8273 *done = false;
8274
8275 return decayed;
8276 }
8277
8278 #ifdef CONFIG_FAIR_GROUP_SCHED
8279
8280 static bool __update_blocked_fair(struct rq *rq, bool *done)
8281 {
8282 struct cfs_rq *cfs_rq, *pos;
8283 bool decayed = false;
8284 int cpu = cpu_of(rq);
8285
8286
8287
8288
8289
8290 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
8291 struct sched_entity *se;
8292
8293 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8294 update_tg_load_avg(cfs_rq);
8295
8296 if (cfs_rq->nr_running == 0)
8297 update_idle_cfs_rq_clock_pelt(cfs_rq);
8298
8299 if (cfs_rq == &rq->cfs)
8300 decayed = true;
8301 }
8302
8303
8304 se = cfs_rq->tg->se[cpu];
8305 if (se && !skip_blocked_update(se))
8306 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
8307
8308
8309
8310
8311
8312 if (cfs_rq_is_decayed(cfs_rq))
8313 list_del_leaf_cfs_rq(cfs_rq);
8314
8315
8316 if (cfs_rq_has_blocked(cfs_rq))
8317 *done = false;
8318 }
8319
8320 return decayed;
8321 }
8322
8323
8324
8325
8326
8327
8328 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
8329 {
8330 struct rq *rq = rq_of(cfs_rq);
8331 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
8332 unsigned long now = jiffies;
8333 unsigned long load;
8334
8335 if (cfs_rq->last_h_load_update == now)
8336 return;
8337
8338 WRITE_ONCE(cfs_rq->h_load_next, NULL);
8339 for_each_sched_entity(se) {
8340 cfs_rq = cfs_rq_of(se);
8341 WRITE_ONCE(cfs_rq->h_load_next, se);
8342 if (cfs_rq->last_h_load_update == now)
8343 break;
8344 }
8345
8346 if (!se) {
8347 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
8348 cfs_rq->last_h_load_update = now;
8349 }
8350
8351 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
8352 load = cfs_rq->h_load;
8353 load = div64_ul(load * se->avg.load_avg,
8354 cfs_rq_load_avg(cfs_rq) + 1);
8355 cfs_rq = group_cfs_rq(se);
8356 cfs_rq->h_load = load;
8357 cfs_rq->last_h_load_update = now;
8358 }
8359 }
8360
8361 static unsigned long task_h_load(struct task_struct *p)
8362 {
8363 struct cfs_rq *cfs_rq = task_cfs_rq(p);
8364
8365 update_cfs_rq_h_load(cfs_rq);
8366 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
8367 cfs_rq_load_avg(cfs_rq) + 1);
8368 }
8369 #else
8370 static bool __update_blocked_fair(struct rq *rq, bool *done)
8371 {
8372 struct cfs_rq *cfs_rq = &rq->cfs;
8373 bool decayed;
8374
8375 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8376 if (cfs_rq_has_blocked(cfs_rq))
8377 *done = false;
8378
8379 return decayed;
8380 }
8381
8382 static unsigned long task_h_load(struct task_struct *p)
8383 {
8384 return p->se.avg.load_avg;
8385 }
8386 #endif
8387
8388 static void update_blocked_averages(int cpu)
8389 {
8390 bool decayed = false, done = true;
8391 struct rq *rq = cpu_rq(cpu);
8392 struct rq_flags rf;
8393
8394 rq_lock_irqsave(rq, &rf);
8395 update_blocked_load_tick(rq);
8396 update_rq_clock(rq);
8397
8398 decayed |= __update_blocked_others(rq, &done);
8399 decayed |= __update_blocked_fair(rq, &done);
8400
8401 update_blocked_load_status(rq, !done);
8402 if (decayed)
8403 cpufreq_update_util(rq, 0);
8404 rq_unlock_irqrestore(rq, &rf);
8405 }
8406
8407
8408
8409
8410
8411
8412 struct sg_lb_stats {
8413 unsigned long avg_load;
8414 unsigned long group_load;
8415 unsigned long group_capacity;
8416 unsigned long group_util;
8417 unsigned long group_runnable;
8418 unsigned int sum_nr_running;
8419 unsigned int sum_h_nr_running;
8420 unsigned int idle_cpus;
8421 unsigned int group_weight;
8422 enum group_type group_type;
8423 unsigned int group_asym_packing;
8424 unsigned long group_misfit_task_load;
8425 #ifdef CONFIG_NUMA_BALANCING
8426 unsigned int nr_numa_running;
8427 unsigned int nr_preferred_running;
8428 #endif
8429 };
8430
8431
8432
8433
8434
8435 struct sd_lb_stats {
8436 struct sched_group *busiest;
8437 struct sched_group *local;
8438 unsigned long total_load;
8439 unsigned long total_capacity;
8440 unsigned long avg_load;
8441 unsigned int prefer_sibling;
8442
8443 struct sg_lb_stats busiest_stat;
8444 struct sg_lb_stats local_stat;
8445 };
8446
8447 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8448 {
8449
8450
8451
8452
8453
8454
8455
8456 *sds = (struct sd_lb_stats){
8457 .busiest = NULL,
8458 .local = NULL,
8459 .total_load = 0UL,
8460 .total_capacity = 0UL,
8461 .busiest_stat = {
8462 .idle_cpus = UINT_MAX,
8463 .group_type = group_has_spare,
8464 },
8465 };
8466 }
8467
8468 static unsigned long scale_rt_capacity(int cpu)
8469 {
8470 struct rq *rq = cpu_rq(cpu);
8471 unsigned long max = arch_scale_cpu_capacity(cpu);
8472 unsigned long used, free;
8473 unsigned long irq;
8474
8475 irq = cpu_util_irq(rq);
8476
8477 if (unlikely(irq >= max))
8478 return 1;
8479
8480
8481
8482
8483
8484
8485
8486 used = READ_ONCE(rq->avg_rt.util_avg);
8487 used += READ_ONCE(rq->avg_dl.util_avg);
8488 used += thermal_load_avg(rq);
8489
8490 if (unlikely(used >= max))
8491 return 1;
8492
8493 free = max - used;
8494
8495 return scale_irq_capacity(free, irq, max);
8496 }
8497
8498 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8499 {
8500 unsigned long capacity = scale_rt_capacity(cpu);
8501 struct sched_group *sdg = sd->groups;
8502
8503 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
8504
8505 if (!capacity)
8506 capacity = 1;
8507
8508 cpu_rq(cpu)->cpu_capacity = capacity;
8509 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8510
8511 sdg->sgc->capacity = capacity;
8512 sdg->sgc->min_capacity = capacity;
8513 sdg->sgc->max_capacity = capacity;
8514 }
8515
8516 void update_group_capacity(struct sched_domain *sd, int cpu)
8517 {
8518 struct sched_domain *child = sd->child;
8519 struct sched_group *group, *sdg = sd->groups;
8520 unsigned long capacity, min_capacity, max_capacity;
8521 unsigned long interval;
8522
8523 interval = msecs_to_jiffies(sd->balance_interval);
8524 interval = clamp(interval, 1UL, max_load_balance_interval);
8525 sdg->sgc->next_update = jiffies + interval;
8526
8527 if (!child) {
8528 update_cpu_capacity(sd, cpu);
8529 return;
8530 }
8531
8532 capacity = 0;
8533 min_capacity = ULONG_MAX;
8534 max_capacity = 0;
8535
8536 if (child->flags & SD_OVERLAP) {
8537
8538
8539
8540
8541
8542 for_each_cpu(cpu, sched_group_span(sdg)) {
8543 unsigned long cpu_cap = capacity_of(cpu);
8544
8545 capacity += cpu_cap;
8546 min_capacity = min(cpu_cap, min_capacity);
8547 max_capacity = max(cpu_cap, max_capacity);
8548 }
8549 } else {
8550
8551
8552
8553
8554
8555 group = child->groups;
8556 do {
8557 struct sched_group_capacity *sgc = group->sgc;
8558
8559 capacity += sgc->capacity;
8560 min_capacity = min(sgc->min_capacity, min_capacity);
8561 max_capacity = max(sgc->max_capacity, max_capacity);
8562 group = group->next;
8563 } while (group != child->groups);
8564 }
8565
8566 sdg->sgc->capacity = capacity;
8567 sdg->sgc->min_capacity = min_capacity;
8568 sdg->sgc->max_capacity = max_capacity;
8569 }
8570
8571
8572
8573
8574
8575
8576 static inline int
8577 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8578 {
8579 return ((rq->cpu_capacity * sd->imbalance_pct) <
8580 (rq->cpu_capacity_orig * 100));
8581 }
8582
8583
8584
8585
8586
8587
8588 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8589 {
8590 return rq->misfit_task_load &&
8591 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8592 check_cpu_capacity(rq, sd));
8593 }
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624 static inline int sg_imbalanced(struct sched_group *group)
8625 {
8626 return group->sgc->imbalance;
8627 }
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641 static inline bool
8642 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8643 {
8644 if (sgs->sum_nr_running < sgs->group_weight)
8645 return true;
8646
8647 if ((sgs->group_capacity * imbalance_pct) <
8648 (sgs->group_runnable * 100))
8649 return false;
8650
8651 if ((sgs->group_capacity * 100) >
8652 (sgs->group_util * imbalance_pct))
8653 return true;
8654
8655 return false;
8656 }
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666 static inline bool
8667 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8668 {
8669 if (sgs->sum_nr_running <= sgs->group_weight)
8670 return false;
8671
8672 if ((sgs->group_capacity * 100) <
8673 (sgs->group_util * imbalance_pct))
8674 return true;
8675
8676 if ((sgs->group_capacity * imbalance_pct) <
8677 (sgs->group_runnable * 100))
8678 return true;
8679
8680 return false;
8681 }
8682
8683 static inline enum
8684 group_type group_classify(unsigned int imbalance_pct,
8685 struct sched_group *group,
8686 struct sg_lb_stats *sgs)
8687 {
8688 if (group_is_overloaded(imbalance_pct, sgs))
8689 return group_overloaded;
8690
8691 if (sg_imbalanced(group))
8692 return group_imbalanced;
8693
8694 if (sgs->group_asym_packing)
8695 return group_asym_packing;
8696
8697 if (sgs->group_misfit_task_load)
8698 return group_misfit_task;
8699
8700 if (!group_has_capacity(imbalance_pct, sgs))
8701 return group_fully_busy;
8702
8703 return group_has_spare;
8704 }
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730 static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
8731 struct sg_lb_stats *sgs,
8732 struct sched_group *sg)
8733 {
8734 #ifdef CONFIG_SCHED_SMT
8735 bool local_is_smt, sg_is_smt;
8736 int sg_busy_cpus;
8737
8738 local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
8739 sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
8740
8741 sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
8742
8743 if (!local_is_smt) {
8744
8745
8746
8747
8748
8749 if (sg_busy_cpus >= 2)
8750 return true;
8751
8752
8753
8754
8755
8756
8757
8758 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8759 }
8760
8761
8762
8763 if (sg_is_smt) {
8764 int local_busy_cpus = sds->local->group_weight -
8765 sds->local_stat.idle_cpus;
8766 int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
8767
8768 if (busy_cpus_delta == 1)
8769 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8770
8771 return false;
8772 }
8773
8774
8775
8776
8777
8778
8779 if (!sds->local_stat.sum_nr_running)
8780 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8781
8782 return false;
8783 #else
8784
8785 return false;
8786 #endif
8787 }
8788
8789 static inline bool
8790 sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
8791 struct sched_group *group)
8792 {
8793
8794 if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
8795 (group->flags & SD_SHARE_CPUCAPACITY))
8796 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
8797
8798 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
8799 }
8800
8801 static inline bool
8802 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
8803 {
8804
8805
8806
8807
8808 if (rq->cfs.h_nr_running != 1)
8809 return false;
8810
8811 return check_cpu_capacity(rq, sd);
8812 }
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822 static inline void update_sg_lb_stats(struct lb_env *env,
8823 struct sd_lb_stats *sds,
8824 struct sched_group *group,
8825 struct sg_lb_stats *sgs,
8826 int *sg_status)
8827 {
8828 int i, nr_running, local_group;
8829
8830 memset(sgs, 0, sizeof(*sgs));
8831
8832 local_group = group == sds->local;
8833
8834 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8835 struct rq *rq = cpu_rq(i);
8836 unsigned long load = cpu_load(rq);
8837
8838 sgs->group_load += load;
8839 sgs->group_util += cpu_util_cfs(i);
8840 sgs->group_runnable += cpu_runnable(rq);
8841 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
8842
8843 nr_running = rq->nr_running;
8844 sgs->sum_nr_running += nr_running;
8845
8846 if (nr_running > 1)
8847 *sg_status |= SG_OVERLOAD;
8848
8849 if (cpu_overutilized(i))
8850 *sg_status |= SG_OVERUTILIZED;
8851
8852 #ifdef CONFIG_NUMA_BALANCING
8853 sgs->nr_numa_running += rq->nr_numa_running;
8854 sgs->nr_preferred_running += rq->nr_preferred_running;
8855 #endif
8856
8857
8858
8859 if (!nr_running && idle_cpu(i)) {
8860 sgs->idle_cpus++;
8861
8862 continue;
8863 }
8864
8865 if (local_group)
8866 continue;
8867
8868 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
8869
8870 if (sgs->group_misfit_task_load < rq->misfit_task_load) {
8871 sgs->group_misfit_task_load = rq->misfit_task_load;
8872 *sg_status |= SG_OVERLOAD;
8873 }
8874 } else if ((env->idle != CPU_NOT_IDLE) &&
8875 sched_reduced_capacity(rq, env->sd)) {
8876
8877 if (sgs->group_misfit_task_load < load)
8878 sgs->group_misfit_task_load = load;
8879 }
8880 }
8881
8882 sgs->group_capacity = group->sgc->capacity;
8883
8884 sgs->group_weight = group->group_weight;
8885
8886
8887 if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
8888 env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
8889 sched_asym(env, sds, sgs, group)) {
8890 sgs->group_asym_packing = 1;
8891 }
8892
8893 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8894
8895
8896 if (sgs->group_type == group_overloaded)
8897 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8898 sgs->group_capacity;
8899 }
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914 static bool update_sd_pick_busiest(struct lb_env *env,
8915 struct sd_lb_stats *sds,
8916 struct sched_group *sg,
8917 struct sg_lb_stats *sgs)
8918 {
8919 struct sg_lb_stats *busiest = &sds->busiest_stat;
8920
8921
8922 if (!sgs->sum_h_nr_running)
8923 return false;
8924
8925
8926
8927
8928
8929
8930
8931 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8932 (sgs->group_type == group_misfit_task) &&
8933 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
8934 sds->local_stat.group_type != group_has_spare))
8935 return false;
8936
8937 if (sgs->group_type > busiest->group_type)
8938 return true;
8939
8940 if (sgs->group_type < busiest->group_type)
8941 return false;
8942
8943
8944
8945
8946
8947
8948 switch (sgs->group_type) {
8949 case group_overloaded:
8950
8951 if (sgs->avg_load <= busiest->avg_load)
8952 return false;
8953 break;
8954
8955 case group_imbalanced:
8956
8957
8958
8959
8960 return false;
8961
8962 case group_asym_packing:
8963
8964 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8965 return false;
8966 break;
8967
8968 case group_misfit_task:
8969
8970
8971
8972
8973 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8974 return false;
8975 break;
8976
8977 case group_fully_busy:
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988 if (sgs->avg_load <= busiest->avg_load)
8989 return false;
8990 break;
8991
8992 case group_has_spare:
8993
8994
8995
8996
8997
8998
8999
9000 if (sgs->idle_cpus > busiest->idle_cpus)
9001 return false;
9002 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
9003 (sgs->sum_nr_running <= busiest->sum_nr_running))
9004 return false;
9005
9006 break;
9007 }
9008
9009
9010
9011
9012
9013
9014
9015 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
9016 (sgs->group_type <= group_fully_busy) &&
9017 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
9018 return false;
9019
9020 return true;
9021 }
9022
9023 #ifdef CONFIG_NUMA_BALANCING
9024 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9025 {
9026 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
9027 return regular;
9028 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
9029 return remote;
9030 return all;
9031 }
9032
9033 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9034 {
9035 if (rq->nr_running > rq->nr_numa_running)
9036 return regular;
9037 if (rq->nr_running > rq->nr_preferred_running)
9038 return remote;
9039 return all;
9040 }
9041 #else
9042 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9043 {
9044 return all;
9045 }
9046
9047 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9048 {
9049 return regular;
9050 }
9051 #endif
9052
9053
9054 struct sg_lb_stats;
9055
9056
9057
9058
9059
9060 static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
9061 {
9062
9063 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
9064 return 0;
9065
9066 if (task_on_rq_queued(p))
9067 return 1;
9068
9069 return 0;
9070 }
9071
9072
9073
9074
9075
9076
9077
9078
9079 static int idle_cpu_without(int cpu, struct task_struct *p)
9080 {
9081 struct rq *rq = cpu_rq(cpu);
9082
9083 if (rq->curr != rq->idle && rq->curr != p)
9084 return 0;
9085
9086
9087
9088
9089
9090
9091
9092 #ifdef CONFIG_SMP
9093 if (rq->ttwu_pending)
9094 return 0;
9095 #endif
9096
9097 return 1;
9098 }
9099
9100
9101
9102
9103
9104
9105
9106
9107 static inline void update_sg_wakeup_stats(struct sched_domain *sd,
9108 struct sched_group *group,
9109 struct sg_lb_stats *sgs,
9110 struct task_struct *p)
9111 {
9112 int i, nr_running;
9113
9114 memset(sgs, 0, sizeof(*sgs));
9115
9116 for_each_cpu(i, sched_group_span(group)) {
9117 struct rq *rq = cpu_rq(i);
9118 unsigned int local;
9119
9120 sgs->group_load += cpu_load_without(rq, p);
9121 sgs->group_util += cpu_util_without(i, p);
9122 sgs->group_runnable += cpu_runnable_without(rq, p);
9123 local = task_running_on_cpu(i, p);
9124 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
9125
9126 nr_running = rq->nr_running - local;
9127 sgs->sum_nr_running += nr_running;
9128
9129
9130
9131
9132 if (!nr_running && idle_cpu_without(i, p))
9133 sgs->idle_cpus++;
9134
9135 }
9136
9137
9138 if (sd->flags & SD_ASYM_CPUCAPACITY &&
9139 !task_fits_capacity(p, group->sgc->max_capacity)) {
9140 sgs->group_misfit_task_load = 1;
9141 }
9142
9143 sgs->group_capacity = group->sgc->capacity;
9144
9145 sgs->group_weight = group->group_weight;
9146
9147 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
9148
9149
9150
9151
9152
9153 if (sgs->group_type == group_fully_busy ||
9154 sgs->group_type == group_overloaded)
9155 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9156 sgs->group_capacity;
9157 }
9158
9159 static bool update_pick_idlest(struct sched_group *idlest,
9160 struct sg_lb_stats *idlest_sgs,
9161 struct sched_group *group,
9162 struct sg_lb_stats *sgs)
9163 {
9164 if (sgs->group_type < idlest_sgs->group_type)
9165 return true;
9166
9167 if (sgs->group_type > idlest_sgs->group_type)
9168 return false;
9169
9170
9171
9172
9173
9174
9175 switch (sgs->group_type) {
9176 case group_overloaded:
9177 case group_fully_busy:
9178
9179 if (idlest_sgs->avg_load <= sgs->avg_load)
9180 return false;
9181 break;
9182
9183 case group_imbalanced:
9184 case group_asym_packing:
9185
9186 return false;
9187
9188 case group_misfit_task:
9189
9190 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9191 return false;
9192 break;
9193
9194 case group_has_spare:
9195
9196 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9197 return false;
9198
9199
9200 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9201 idlest_sgs->group_util <= sgs->group_util)
9202 return false;
9203
9204 break;
9205 }
9206
9207 return true;
9208 }
9209
9210
9211
9212
9213
9214
9215
9216 static struct sched_group *
9217 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9218 {
9219 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9220 struct sg_lb_stats local_sgs, tmp_sgs;
9221 struct sg_lb_stats *sgs;
9222 unsigned long imbalance;
9223 struct sg_lb_stats idlest_sgs = {
9224 .avg_load = UINT_MAX,
9225 .group_type = group_overloaded,
9226 };
9227
9228 do {
9229 int local_group;
9230
9231
9232 if (!cpumask_intersects(sched_group_span(group),
9233 p->cpus_ptr))
9234 continue;
9235
9236
9237 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
9238 continue;
9239
9240 local_group = cpumask_test_cpu(this_cpu,
9241 sched_group_span(group));
9242
9243 if (local_group) {
9244 sgs = &local_sgs;
9245 local = group;
9246 } else {
9247 sgs = &tmp_sgs;
9248 }
9249
9250 update_sg_wakeup_stats(sd, group, sgs, p);
9251
9252 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9253 idlest = group;
9254 idlest_sgs = *sgs;
9255 }
9256
9257 } while (group = group->next, group != sd->groups);
9258
9259
9260
9261 if (!idlest)
9262 return NULL;
9263
9264
9265 if (!local)
9266 return idlest;
9267
9268
9269
9270
9271
9272 if (local_sgs.group_type < idlest_sgs.group_type)
9273 return NULL;
9274
9275
9276
9277
9278
9279 if (local_sgs.group_type > idlest_sgs.group_type)
9280 return idlest;
9281
9282 switch (local_sgs.group_type) {
9283 case group_overloaded:
9284 case group_fully_busy:
9285
9286
9287 imbalance = scale_load_down(NICE_0_LOAD) *
9288 (sd->imbalance_pct-100) / 100;
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299 if ((sd->flags & SD_NUMA) &&
9300 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9301 return NULL;
9302
9303
9304
9305
9306
9307 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9308 return NULL;
9309
9310 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9311 return NULL;
9312 break;
9313
9314 case group_imbalanced:
9315 case group_asym_packing:
9316
9317 return NULL;
9318
9319 case group_misfit_task:
9320
9321 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9322 return NULL;
9323 break;
9324
9325 case group_has_spare:
9326 #ifdef CONFIG_NUMA
9327 if (sd->flags & SD_NUMA) {
9328 int imb_numa_nr = sd->imb_numa_nr;
9329 #ifdef CONFIG_NUMA_BALANCING
9330 int idlest_cpu;
9331
9332
9333
9334
9335 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9336 return NULL;
9337
9338 idlest_cpu = cpumask_first(sched_group_span(idlest));
9339 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9340 return idlest;
9341 #endif
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351 if (p->nr_cpus_allowed != NR_CPUS) {
9352 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
9353
9354 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
9355 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
9356 }
9357
9358 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
9359 if (!adjust_numa_imbalance(imbalance,
9360 local_sgs.sum_nr_running + 1,
9361 imb_numa_nr)) {
9362 return NULL;
9363 }
9364 }
9365 #endif
9366
9367
9368
9369
9370
9371
9372
9373 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9374 return NULL;
9375 break;
9376 }
9377
9378 return idlest;
9379 }
9380
9381 static void update_idle_cpu_scan(struct lb_env *env,
9382 unsigned long sum_util)
9383 {
9384 struct sched_domain_shared *sd_share;
9385 int llc_weight, pct;
9386 u64 x, y, tmp;
9387
9388
9389
9390
9391
9392
9393
9394
9395 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
9396 return;
9397
9398 llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
9399 if (env->sd->span_weight != llc_weight)
9400 return;
9401
9402 sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
9403 if (!sd_share)
9404 return;
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435 x = sum_util;
9436 do_div(x, llc_weight);
9437
9438
9439 pct = env->sd->imbalance_pct;
9440 tmp = x * x * pct * pct;
9441 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
9442 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
9443 y = SCHED_CAPACITY_SCALE - tmp;
9444
9445
9446 y *= llc_weight;
9447 do_div(y, SCHED_CAPACITY_SCALE);
9448 if ((int)y != sd_share->nr_idle_scan)
9449 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
9450 }
9451
9452
9453
9454
9455
9456
9457
9458 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9459 {
9460 struct sched_domain *child = env->sd->child;
9461 struct sched_group *sg = env->sd->groups;
9462 struct sg_lb_stats *local = &sds->local_stat;
9463 struct sg_lb_stats tmp_sgs;
9464 unsigned long sum_util = 0;
9465 int sg_status = 0;
9466
9467 do {
9468 struct sg_lb_stats *sgs = &tmp_sgs;
9469 int local_group;
9470
9471 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
9472 if (local_group) {
9473 sds->local = sg;
9474 sgs = local;
9475
9476 if (env->idle != CPU_NEWLY_IDLE ||
9477 time_after_eq(jiffies, sg->sgc->next_update))
9478 update_group_capacity(env->sd, env->dst_cpu);
9479 }
9480
9481 update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
9482
9483 if (local_group)
9484 goto next_group;
9485
9486
9487 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9488 sds->busiest = sg;
9489 sds->busiest_stat = *sgs;
9490 }
9491
9492 next_group:
9493
9494 sds->total_load += sgs->group_load;
9495 sds->total_capacity += sgs->group_capacity;
9496
9497 sum_util += sgs->group_util;
9498 sg = sg->next;
9499 } while (sg != env->sd->groups);
9500
9501
9502 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
9503
9504
9505 if (env->sd->flags & SD_NUMA)
9506 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9507
9508 if (!env->sd->parent) {
9509 struct root_domain *rd = env->dst_rq->rd;
9510
9511
9512 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9513
9514
9515 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9516 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
9517 } else if (sg_status & SG_OVERUTILIZED) {
9518 struct root_domain *rd = env->dst_rq->rd;
9519
9520 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9521 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9522 }
9523
9524 update_idle_cpu_scan(env, sum_util);
9525 }
9526
9527
9528
9529
9530
9531
9532
9533 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9534 {
9535 struct sg_lb_stats *local, *busiest;
9536
9537 local = &sds->local_stat;
9538 busiest = &sds->busiest_stat;
9539
9540 if (busiest->group_type == group_misfit_task) {
9541 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
9542
9543 env->migration_type = migrate_misfit;
9544 env->imbalance = 1;
9545 } else {
9546
9547
9548
9549
9550 env->migration_type = migrate_load;
9551 env->imbalance = busiest->group_misfit_task_load;
9552 }
9553 return;
9554 }
9555
9556 if (busiest->group_type == group_asym_packing) {
9557
9558
9559
9560
9561 env->migration_type = migrate_task;
9562 env->imbalance = busiest->sum_h_nr_running;
9563 return;
9564 }
9565
9566 if (busiest->group_type == group_imbalanced) {
9567
9568
9569
9570
9571
9572
9573 env->migration_type = migrate_task;
9574 env->imbalance = 1;
9575 return;
9576 }
9577
9578
9579
9580
9581
9582 if (local->group_type == group_has_spare) {
9583 if ((busiest->group_type > group_fully_busy) &&
9584 !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9585
9586
9587
9588
9589
9590
9591
9592
9593 env->migration_type = migrate_util;
9594 env->imbalance = max(local->group_capacity, local->group_util) -
9595 local->group_util;
9596
9597
9598
9599
9600
9601
9602
9603
9604 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9605 env->migration_type = migrate_task;
9606 env->imbalance = 1;
9607 }
9608
9609 return;
9610 }
9611
9612 if (busiest->group_weight == 1 || sds->prefer_sibling) {
9613 unsigned int nr_diff = busiest->sum_nr_running;
9614
9615
9616
9617
9618 env->migration_type = migrate_task;
9619 lsub_positive(&nr_diff, local->sum_nr_running);
9620 env->imbalance = nr_diff;
9621 } else {
9622
9623
9624
9625
9626
9627 env->migration_type = migrate_task;
9628 env->imbalance = max_t(long, 0,
9629 (local->idle_cpus - busiest->idle_cpus));
9630 }
9631
9632 #ifdef CONFIG_NUMA
9633
9634 if (env->sd->flags & SD_NUMA) {
9635 env->imbalance = adjust_numa_imbalance(env->imbalance,
9636 local->sum_nr_running + 1,
9637 env->sd->imb_numa_nr);
9638 }
9639 #endif
9640
9641
9642 env->imbalance >>= 1;
9643
9644 return;
9645 }
9646
9647
9648
9649
9650
9651 if (local->group_type < group_overloaded) {
9652
9653
9654
9655
9656
9657 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9658 local->group_capacity;
9659
9660
9661
9662
9663
9664 if (local->avg_load >= busiest->avg_load) {
9665 env->imbalance = 0;
9666 return;
9667 }
9668
9669 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9670 sds->total_capacity;
9671 }
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681 env->migration_type = migrate_load;
9682 env->imbalance = min(
9683 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
9684 (sds->avg_load - local->avg_load) * local->group_capacity
9685 ) / SCHED_CAPACITY_SCALE;
9686 }
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720 static struct sched_group *find_busiest_group(struct lb_env *env)
9721 {
9722 struct sg_lb_stats *local, *busiest;
9723 struct sd_lb_stats sds;
9724
9725 init_sd_lb_stats(&sds);
9726
9727
9728
9729
9730
9731 update_sd_lb_stats(env, &sds);
9732
9733 if (sched_energy_enabled()) {
9734 struct root_domain *rd = env->dst_rq->rd;
9735
9736 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9737 goto out_balanced;
9738 }
9739
9740 local = &sds.local_stat;
9741 busiest = &sds.busiest_stat;
9742
9743
9744 if (!sds.busiest)
9745 goto out_balanced;
9746
9747
9748 if (busiest->group_type == group_misfit_task)
9749 goto force_balance;
9750
9751
9752 if (busiest->group_type == group_asym_packing)
9753 goto force_balance;
9754
9755
9756
9757
9758
9759
9760 if (busiest->group_type == group_imbalanced)
9761 goto force_balance;
9762
9763
9764
9765
9766
9767 if (local->group_type > busiest->group_type)
9768 goto out_balanced;
9769
9770
9771
9772
9773
9774 if (local->group_type == group_overloaded) {
9775
9776
9777
9778
9779 if (local->avg_load >= busiest->avg_load)
9780 goto out_balanced;
9781
9782
9783 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9784 sds.total_capacity;
9785
9786
9787
9788
9789
9790 if (local->avg_load >= sds.avg_load)
9791 goto out_balanced;
9792
9793
9794
9795
9796
9797 if (100 * busiest->avg_load <=
9798 env->sd->imbalance_pct * local->avg_load)
9799 goto out_balanced;
9800 }
9801
9802
9803 if (sds.prefer_sibling && local->group_type == group_has_spare &&
9804 busiest->sum_nr_running > local->sum_nr_running + 1)
9805 goto force_balance;
9806
9807 if (busiest->group_type != group_overloaded) {
9808 if (env->idle == CPU_NOT_IDLE)
9809
9810
9811
9812
9813
9814 goto out_balanced;
9815
9816 if (busiest->group_weight > 1 &&
9817 local->idle_cpus <= (busiest->idle_cpus + 1))
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827 goto out_balanced;
9828
9829 if (busiest->sum_h_nr_running == 1)
9830
9831
9832
9833 goto out_balanced;
9834 }
9835
9836 force_balance:
9837
9838 calculate_imbalance(env, &sds);
9839 return env->imbalance ? sds.busiest : NULL;
9840
9841 out_balanced:
9842 env->imbalance = 0;
9843 return NULL;
9844 }
9845
9846
9847
9848
9849 static struct rq *find_busiest_queue(struct lb_env *env,
9850 struct sched_group *group)
9851 {
9852 struct rq *busiest = NULL, *rq;
9853 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9854 unsigned int busiest_nr = 0;
9855 int i;
9856
9857 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9858 unsigned long capacity, load, util;
9859 unsigned int nr_running;
9860 enum fbq_type rt;
9861
9862 rq = cpu_rq(i);
9863 rt = fbq_classify_rq(rq);
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884 if (rt > env->fbq_type)
9885 continue;
9886
9887 nr_running = rq->cfs.h_nr_running;
9888 if (!nr_running)
9889 continue;
9890
9891 capacity = capacity_of(i);
9892
9893
9894
9895
9896
9897
9898
9899 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9900 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
9901 nr_running == 1)
9902 continue;
9903
9904
9905 if ((env->sd->flags & SD_ASYM_PACKING) &&
9906 sched_asym_prefer(i, env->dst_cpu) &&
9907 nr_running == 1)
9908 continue;
9909
9910 switch (env->migration_type) {
9911 case migrate_load:
9912
9913
9914
9915
9916 load = cpu_load(rq);
9917
9918 if (nr_running == 1 && load > env->imbalance &&
9919 !check_cpu_capacity(rq, env->sd))
9920 break;
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935 if (load * busiest_capacity > busiest_load * capacity) {
9936 busiest_load = load;
9937 busiest_capacity = capacity;
9938 busiest = rq;
9939 }
9940 break;
9941
9942 case migrate_util:
9943 util = cpu_util_cfs(i);
9944
9945
9946
9947
9948
9949
9950 if (nr_running <= 1)
9951 continue;
9952
9953 if (busiest_util < util) {
9954 busiest_util = util;
9955 busiest = rq;
9956 }
9957 break;
9958
9959 case migrate_task:
9960 if (busiest_nr < nr_running) {
9961 busiest_nr = nr_running;
9962 busiest = rq;
9963 }
9964 break;
9965
9966 case migrate_misfit:
9967
9968
9969
9970
9971 if (rq->misfit_task_load > busiest_load) {
9972 busiest_load = rq->misfit_task_load;
9973 busiest = rq;
9974 }
9975
9976 break;
9977
9978 }
9979 }
9980
9981 return busiest;
9982 }
9983
9984
9985
9986
9987
9988 #define MAX_PINNED_INTERVAL 512
9989
9990 static inline bool
9991 asym_active_balance(struct lb_env *env)
9992 {
9993
9994
9995
9996
9997
9998 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9999 sched_asym_prefer(env->dst_cpu, env->src_cpu);
10000 }
10001
10002 static inline bool
10003 imbalanced_active_balance(struct lb_env *env)
10004 {
10005 struct sched_domain *sd = env->sd;
10006
10007
10008
10009
10010
10011
10012 if ((env->migration_type == migrate_task) &&
10013 (sd->nr_balance_failed > sd->cache_nice_tries+2))
10014 return 1;
10015
10016 return 0;
10017 }
10018
10019 static int need_active_balance(struct lb_env *env)
10020 {
10021 struct sched_domain *sd = env->sd;
10022
10023 if (asym_active_balance(env))
10024 return 1;
10025
10026 if (imbalanced_active_balance(env))
10027 return 1;
10028
10029
10030
10031
10032
10033
10034
10035 if ((env->idle != CPU_NOT_IDLE) &&
10036 (env->src_rq->cfs.h_nr_running == 1)) {
10037 if ((check_cpu_capacity(env->src_rq, sd)) &&
10038 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10039 return 1;
10040 }
10041
10042 if (env->migration_type == migrate_misfit)
10043 return 1;
10044
10045 return 0;
10046 }
10047
10048 static int active_load_balance_cpu_stop(void *data);
10049
10050 static int should_we_balance(struct lb_env *env)
10051 {
10052 struct sched_group *sg = env->sd->groups;
10053 int cpu;
10054
10055
10056
10057
10058
10059 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
10060 return 0;
10061
10062
10063
10064
10065
10066
10067
10068
10069 if (env->idle == CPU_NEWLY_IDLE) {
10070 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
10071 return 0;
10072 return 1;
10073 }
10074
10075
10076 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
10077 if (!idle_cpu(cpu))
10078 continue;
10079
10080
10081 return cpu == env->dst_cpu;
10082 }
10083
10084
10085 return group_balance_cpu(sg) == env->dst_cpu;
10086 }
10087
10088
10089
10090
10091
10092 static int load_balance(int this_cpu, struct rq *this_rq,
10093 struct sched_domain *sd, enum cpu_idle_type idle,
10094 int *continue_balancing)
10095 {
10096 int ld_moved, cur_ld_moved, active_balance = 0;
10097 struct sched_domain *sd_parent = sd->parent;
10098 struct sched_group *group;
10099 struct rq *busiest;
10100 struct rq_flags rf;
10101 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
10102
10103 struct lb_env env = {
10104 .sd = sd,
10105 .dst_cpu = this_cpu,
10106 .dst_rq = this_rq,
10107 .dst_grpmask = sched_group_span(sd->groups),
10108 .idle = idle,
10109 .loop_break = sched_nr_migrate_break,
10110 .cpus = cpus,
10111 .fbq_type = all,
10112 .tasks = LIST_HEAD_INIT(env.tasks),
10113 };
10114
10115 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
10116
10117 schedstat_inc(sd->lb_count[idle]);
10118
10119 redo:
10120 if (!should_we_balance(&env)) {
10121 *continue_balancing = 0;
10122 goto out_balanced;
10123 }
10124
10125 group = find_busiest_group(&env);
10126 if (!group) {
10127 schedstat_inc(sd->lb_nobusyg[idle]);
10128 goto out_balanced;
10129 }
10130
10131 busiest = find_busiest_queue(&env, group);
10132 if (!busiest) {
10133 schedstat_inc(sd->lb_nobusyq[idle]);
10134 goto out_balanced;
10135 }
10136
10137 BUG_ON(busiest == env.dst_rq);
10138
10139 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
10140
10141 env.src_cpu = busiest->cpu;
10142 env.src_rq = busiest;
10143
10144 ld_moved = 0;
10145
10146 env.flags |= LBF_ALL_PINNED;
10147 if (busiest->nr_running > 1) {
10148
10149
10150
10151
10152
10153
10154 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
10155
10156 more_balance:
10157 rq_lock_irqsave(busiest, &rf);
10158 update_rq_clock(busiest);
10159
10160
10161
10162
10163
10164 cur_ld_moved = detach_tasks(&env);
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174 rq_unlock(busiest, &rf);
10175
10176 if (cur_ld_moved) {
10177 attach_tasks(&env);
10178 ld_moved += cur_ld_moved;
10179 }
10180
10181 local_irq_restore(rf.flags);
10182
10183 if (env.flags & LBF_NEED_BREAK) {
10184 env.flags &= ~LBF_NEED_BREAK;
10185 goto more_balance;
10186 }
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
10208
10209
10210 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
10211
10212 env.dst_rq = cpu_rq(env.new_dst_cpu);
10213 env.dst_cpu = env.new_dst_cpu;
10214 env.flags &= ~LBF_DST_PINNED;
10215 env.loop = 0;
10216 env.loop_break = sched_nr_migrate_break;
10217
10218
10219
10220
10221
10222 goto more_balance;
10223 }
10224
10225
10226
10227
10228 if (sd_parent) {
10229 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10230
10231 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
10232 *group_imbalance = 1;
10233 }
10234
10235
10236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
10237 __cpumask_clear_cpu(cpu_of(busiest), cpus);
10238
10239
10240
10241
10242
10243
10244
10245
10246 if (!cpumask_subset(cpus, env.dst_grpmask)) {
10247 env.loop = 0;
10248 env.loop_break = sched_nr_migrate_break;
10249 goto redo;
10250 }
10251 goto out_all_pinned;
10252 }
10253 }
10254
10255 if (!ld_moved) {
10256 schedstat_inc(sd->lb_failed[idle]);
10257
10258
10259
10260
10261
10262
10263 if (idle != CPU_NEWLY_IDLE)
10264 sd->nr_balance_failed++;
10265
10266 if (need_active_balance(&env)) {
10267 unsigned long flags;
10268
10269 raw_spin_rq_lock_irqsave(busiest, flags);
10270
10271
10272
10273
10274
10275
10276 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
10277 raw_spin_rq_unlock_irqrestore(busiest, flags);
10278 goto out_one_pinned;
10279 }
10280
10281
10282 env.flags &= ~LBF_ALL_PINNED;
10283
10284
10285
10286
10287
10288
10289 if (!busiest->active_balance) {
10290 busiest->active_balance = 1;
10291 busiest->push_cpu = this_cpu;
10292 active_balance = 1;
10293 }
10294 raw_spin_rq_unlock_irqrestore(busiest, flags);
10295
10296 if (active_balance) {
10297 stop_one_cpu_nowait(cpu_of(busiest),
10298 active_load_balance_cpu_stop, busiest,
10299 &busiest->active_balance_work);
10300 }
10301 }
10302 } else {
10303 sd->nr_balance_failed = 0;
10304 }
10305
10306 if (likely(!active_balance) || need_active_balance(&env)) {
10307
10308 sd->balance_interval = sd->min_interval;
10309 }
10310
10311 goto out;
10312
10313 out_balanced:
10314
10315
10316
10317
10318
10319 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
10320 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10321
10322 if (*group_imbalance)
10323 *group_imbalance = 0;
10324 }
10325
10326 out_all_pinned:
10327
10328
10329
10330
10331
10332 schedstat_inc(sd->lb_balanced[idle]);
10333
10334 sd->nr_balance_failed = 0;
10335
10336 out_one_pinned:
10337 ld_moved = 0;
10338
10339
10340
10341
10342
10343
10344
10345 if (env.idle == CPU_NEWLY_IDLE)
10346 goto out;
10347
10348
10349 if ((env.flags & LBF_ALL_PINNED &&
10350 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10351 sd->balance_interval < sd->max_interval)
10352 sd->balance_interval *= 2;
10353 out:
10354 return ld_moved;
10355 }
10356
10357 static inline unsigned long
10358 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10359 {
10360 unsigned long interval = sd->balance_interval;
10361
10362 if (cpu_busy)
10363 interval *= sd->busy_factor;
10364
10365
10366 interval = msecs_to_jiffies(interval);
10367
10368
10369
10370
10371
10372
10373 if (cpu_busy)
10374 interval -= 1;
10375
10376 interval = clamp(interval, 1UL, max_load_balance_interval);
10377
10378 return interval;
10379 }
10380
10381 static inline void
10382 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
10383 {
10384 unsigned long interval, next;
10385
10386
10387 interval = get_sd_balance_interval(sd, 0);
10388 next = sd->last_balance + interval;
10389
10390 if (time_after(*next_balance, next))
10391 *next_balance = next;
10392 }
10393
10394
10395
10396
10397
10398
10399
10400 static int active_load_balance_cpu_stop(void *data)
10401 {
10402 struct rq *busiest_rq = data;
10403 int busiest_cpu = cpu_of(busiest_rq);
10404 int target_cpu = busiest_rq->push_cpu;
10405 struct rq *target_rq = cpu_rq(target_cpu);
10406 struct sched_domain *sd;
10407 struct task_struct *p = NULL;
10408 struct rq_flags rf;
10409
10410 rq_lock_irq(busiest_rq, &rf);
10411
10412
10413
10414
10415
10416 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
10417 goto out_unlock;
10418
10419
10420 if (unlikely(busiest_cpu != smp_processor_id() ||
10421 !busiest_rq->active_balance))
10422 goto out_unlock;
10423
10424
10425 if (busiest_rq->nr_running <= 1)
10426 goto out_unlock;
10427
10428
10429
10430
10431
10432
10433 BUG_ON(busiest_rq == target_rq);
10434
10435
10436 rcu_read_lock();
10437 for_each_domain(target_cpu, sd) {
10438 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10439 break;
10440 }
10441
10442 if (likely(sd)) {
10443 struct lb_env env = {
10444 .sd = sd,
10445 .dst_cpu = target_cpu,
10446 .dst_rq = target_rq,
10447 .src_cpu = busiest_rq->cpu,
10448 .src_rq = busiest_rq,
10449 .idle = CPU_IDLE,
10450 .flags = LBF_ACTIVE_LB,
10451 };
10452
10453 schedstat_inc(sd->alb_count);
10454 update_rq_clock(busiest_rq);
10455
10456 p = detach_one_task(&env);
10457 if (p) {
10458 schedstat_inc(sd->alb_pushed);
10459
10460 sd->nr_balance_failed = 0;
10461 } else {
10462 schedstat_inc(sd->alb_failed);
10463 }
10464 }
10465 rcu_read_unlock();
10466 out_unlock:
10467 busiest_rq->active_balance = 0;
10468 rq_unlock(busiest_rq, &rf);
10469
10470 if (p)
10471 attach_one_task(target_rq, p);
10472
10473 local_irq_enable();
10474
10475 return 0;
10476 }
10477
10478 static DEFINE_SPINLOCK(balancing);
10479
10480
10481
10482
10483
10484 void update_max_interval(void)
10485 {
10486 max_load_balance_interval = HZ*num_online_cpus()/10;
10487 }
10488
10489 static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
10490 {
10491 if (cost > sd->max_newidle_lb_cost) {
10492
10493
10494
10495
10496 sd->max_newidle_lb_cost = cost;
10497 sd->last_decay_max_lb_cost = jiffies;
10498 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
10499
10500
10501
10502
10503
10504 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
10505 sd->last_decay_max_lb_cost = jiffies;
10506
10507 return true;
10508 }
10509
10510 return false;
10511 }
10512
10513
10514
10515
10516
10517
10518
10519 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
10520 {
10521 int continue_balancing = 1;
10522 int cpu = rq->cpu;
10523 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10524 unsigned long interval;
10525 struct sched_domain *sd;
10526
10527 unsigned long next_balance = jiffies + 60*HZ;
10528 int update_next_balance = 0;
10529 int need_serialize, need_decay = 0;
10530 u64 max_cost = 0;
10531
10532 rcu_read_lock();
10533 for_each_domain(cpu, sd) {
10534
10535
10536
10537
10538 need_decay = update_newidle_cost(sd, 0);
10539 max_cost += sd->max_newidle_lb_cost;
10540
10541
10542
10543
10544
10545
10546 if (!continue_balancing) {
10547 if (need_decay)
10548 continue;
10549 break;
10550 }
10551
10552 interval = get_sd_balance_interval(sd, busy);
10553
10554 need_serialize = sd->flags & SD_SERIALIZE;
10555 if (need_serialize) {
10556 if (!spin_trylock(&balancing))
10557 goto out;
10558 }
10559
10560 if (time_after_eq(jiffies, sd->last_balance + interval)) {
10561 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10562
10563
10564
10565
10566
10567 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10568 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10569 }
10570 sd->last_balance = jiffies;
10571 interval = get_sd_balance_interval(sd, busy);
10572 }
10573 if (need_serialize)
10574 spin_unlock(&balancing);
10575 out:
10576 if (time_after(next_balance, sd->last_balance + interval)) {
10577 next_balance = sd->last_balance + interval;
10578 update_next_balance = 1;
10579 }
10580 }
10581 if (need_decay) {
10582
10583
10584
10585
10586 rq->max_idle_balance_cost =
10587 max((u64)sysctl_sched_migration_cost, max_cost);
10588 }
10589 rcu_read_unlock();
10590
10591
10592
10593
10594
10595
10596 if (likely(update_next_balance))
10597 rq->next_balance = next_balance;
10598
10599 }
10600
10601 static inline int on_null_domain(struct rq *rq)
10602 {
10603 return unlikely(!rcu_dereference_sched(rq->sd));
10604 }
10605
10606 #ifdef CONFIG_NO_HZ_COMMON
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616 static inline int find_new_ilb(void)
10617 {
10618 int ilb;
10619 const struct cpumask *hk_mask;
10620
10621 hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
10622
10623 for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
10624
10625 if (ilb == smp_processor_id())
10626 continue;
10627
10628 if (idle_cpu(ilb))
10629 return ilb;
10630 }
10631
10632 return nr_cpu_ids;
10633 }
10634
10635
10636
10637
10638
10639 static void kick_ilb(unsigned int flags)
10640 {
10641 int ilb_cpu;
10642
10643
10644
10645
10646
10647 if (flags & NOHZ_BALANCE_KICK)
10648 nohz.next_balance = jiffies+1;
10649
10650 ilb_cpu = find_new_ilb();
10651
10652 if (ilb_cpu >= nr_cpu_ids)
10653 return;
10654
10655
10656
10657
10658
10659 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
10660 if (flags & NOHZ_KICK_MASK)
10661 return;
10662
10663
10664
10665
10666
10667
10668 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
10669 }
10670
10671
10672
10673
10674
10675 static void nohz_balancer_kick(struct rq *rq)
10676 {
10677 unsigned long now = jiffies;
10678 struct sched_domain_shared *sds;
10679 struct sched_domain *sd;
10680 int nr_busy, i, cpu = rq->cpu;
10681 unsigned int flags = 0;
10682
10683 if (unlikely(rq->idle_balance))
10684 return;
10685
10686
10687
10688
10689
10690 nohz_balance_exit_idle(rq);
10691
10692
10693
10694
10695
10696 if (likely(!atomic_read(&nohz.nr_cpus)))
10697 return;
10698
10699 if (READ_ONCE(nohz.has_blocked) &&
10700 time_after(now, READ_ONCE(nohz.next_blocked)))
10701 flags = NOHZ_STATS_KICK;
10702
10703 if (time_before(now, nohz.next_balance))
10704 goto out;
10705
10706 if (rq->nr_running >= 2) {
10707 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
10708 goto out;
10709 }
10710
10711 rcu_read_lock();
10712
10713 sd = rcu_dereference(rq->sd);
10714 if (sd) {
10715
10716
10717
10718
10719
10720 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
10721 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
10722 goto unlock;
10723 }
10724 }
10725
10726 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
10727 if (sd) {
10728
10729
10730
10731
10732
10733 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
10734 if (sched_asym_prefer(i, cpu)) {
10735 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
10736 goto unlock;
10737 }
10738 }
10739 }
10740
10741 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10742 if (sd) {
10743
10744
10745
10746
10747 if (check_misfit_status(rq, sd)) {
10748 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
10749 goto unlock;
10750 }
10751
10752
10753
10754
10755
10756
10757
10758
10759 goto unlock;
10760 }
10761
10762 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10763 if (sds) {
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773 nr_busy = atomic_read(&sds->nr_busy_cpus);
10774 if (nr_busy > 1) {
10775 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
10776 goto unlock;
10777 }
10778 }
10779 unlock:
10780 rcu_read_unlock();
10781 out:
10782 if (READ_ONCE(nohz.needs_update))
10783 flags |= NOHZ_NEXT_KICK;
10784
10785 if (flags)
10786 kick_ilb(flags);
10787 }
10788
10789 static void set_cpu_sd_state_busy(int cpu)
10790 {
10791 struct sched_domain *sd;
10792
10793 rcu_read_lock();
10794 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10795
10796 if (!sd || !sd->nohz_idle)
10797 goto unlock;
10798 sd->nohz_idle = 0;
10799
10800 atomic_inc(&sd->shared->nr_busy_cpus);
10801 unlock:
10802 rcu_read_unlock();
10803 }
10804
10805 void nohz_balance_exit_idle(struct rq *rq)
10806 {
10807 SCHED_WARN_ON(rq != this_rq());
10808
10809 if (likely(!rq->nohz_tick_stopped))
10810 return;
10811
10812 rq->nohz_tick_stopped = 0;
10813 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10814 atomic_dec(&nohz.nr_cpus);
10815
10816 set_cpu_sd_state_busy(rq->cpu);
10817 }
10818
10819 static void set_cpu_sd_state_idle(int cpu)
10820 {
10821 struct sched_domain *sd;
10822
10823 rcu_read_lock();
10824 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10825
10826 if (!sd || sd->nohz_idle)
10827 goto unlock;
10828 sd->nohz_idle = 1;
10829
10830 atomic_dec(&sd->shared->nr_busy_cpus);
10831 unlock:
10832 rcu_read_unlock();
10833 }
10834
10835
10836
10837
10838
10839 void nohz_balance_enter_idle(int cpu)
10840 {
10841 struct rq *rq = cpu_rq(cpu);
10842
10843 SCHED_WARN_ON(cpu != smp_processor_id());
10844
10845
10846 if (!cpu_active(cpu))
10847 return;
10848
10849
10850 if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
10851 return;
10852
10853
10854
10855
10856
10857
10858 rq->has_blocked_load = 1;
10859
10860
10861
10862
10863
10864
10865
10866 if (rq->nohz_tick_stopped)
10867 goto out;
10868
10869
10870 if (on_null_domain(rq))
10871 return;
10872
10873 rq->nohz_tick_stopped = 1;
10874
10875 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10876 atomic_inc(&nohz.nr_cpus);
10877
10878
10879
10880
10881
10882
10883 smp_mb__after_atomic();
10884
10885 set_cpu_sd_state_idle(cpu);
10886
10887 WRITE_ONCE(nohz.needs_update, 1);
10888 out:
10889
10890
10891
10892
10893 WRITE_ONCE(nohz.has_blocked, 1);
10894 }
10895
10896 static bool update_nohz_stats(struct rq *rq)
10897 {
10898 unsigned int cpu = rq->cpu;
10899
10900 if (!rq->has_blocked_load)
10901 return false;
10902
10903 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
10904 return false;
10905
10906 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
10907 return true;
10908
10909 update_blocked_averages(cpu);
10910
10911 return rq->has_blocked_load;
10912 }
10913
10914
10915
10916
10917
10918
10919 static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
10920 enum cpu_idle_type idle)
10921 {
10922
10923 unsigned long now = jiffies;
10924 unsigned long next_balance = now + 60*HZ;
10925 bool has_blocked_load = false;
10926 int update_next_balance = 0;
10927 int this_cpu = this_rq->cpu;
10928 int balance_cpu;
10929 struct rq *rq;
10930
10931 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943 if (flags & NOHZ_STATS_KICK)
10944 WRITE_ONCE(nohz.has_blocked, 0);
10945 if (flags & NOHZ_NEXT_KICK)
10946 WRITE_ONCE(nohz.needs_update, 0);
10947
10948
10949
10950
10951
10952 smp_mb();
10953
10954
10955
10956
10957
10958 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
10959 if (!idle_cpu(balance_cpu))
10960 continue;
10961
10962
10963
10964
10965
10966
10967 if (need_resched()) {
10968 if (flags & NOHZ_STATS_KICK)
10969 has_blocked_load = true;
10970 if (flags & NOHZ_NEXT_KICK)
10971 WRITE_ONCE(nohz.needs_update, 1);
10972 goto abort;
10973 }
10974
10975 rq = cpu_rq(balance_cpu);
10976
10977 if (flags & NOHZ_STATS_KICK)
10978 has_blocked_load |= update_nohz_stats(rq);
10979
10980
10981
10982
10983
10984 if (time_after_eq(jiffies, rq->next_balance)) {
10985 struct rq_flags rf;
10986
10987 rq_lock_irqsave(rq, &rf);
10988 update_rq_clock(rq);
10989 rq_unlock_irqrestore(rq, &rf);
10990
10991 if (flags & NOHZ_BALANCE_KICK)
10992 rebalance_domains(rq, CPU_IDLE);
10993 }
10994
10995 if (time_after(next_balance, rq->next_balance)) {
10996 next_balance = rq->next_balance;
10997 update_next_balance = 1;
10998 }
10999 }
11000
11001
11002
11003
11004
11005
11006 if (likely(update_next_balance))
11007 nohz.next_balance = next_balance;
11008
11009 if (flags & NOHZ_STATS_KICK)
11010 WRITE_ONCE(nohz.next_blocked,
11011 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
11012
11013 abort:
11014
11015 if (has_blocked_load)
11016 WRITE_ONCE(nohz.has_blocked, 1);
11017 }
11018
11019
11020
11021
11022
11023 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11024 {
11025 unsigned int flags = this_rq->nohz_idle_balance;
11026
11027 if (!flags)
11028 return false;
11029
11030 this_rq->nohz_idle_balance = 0;
11031
11032 if (idle != CPU_IDLE)
11033 return false;
11034
11035 _nohz_idle_balance(this_rq, flags, idle);
11036
11037 return true;
11038 }
11039
11040
11041
11042
11043
11044 void nohz_run_idle_balance(int cpu)
11045 {
11046 unsigned int flags;
11047
11048 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
11049
11050
11051
11052
11053
11054 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
11055 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
11056 }
11057
11058 static void nohz_newidle_balance(struct rq *this_rq)
11059 {
11060 int this_cpu = this_rq->cpu;
11061
11062
11063
11064
11065
11066 if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
11067 return;
11068
11069
11070 if (this_rq->avg_idle < sysctl_sched_migration_cost)
11071 return;
11072
11073
11074 if (!READ_ONCE(nohz.has_blocked) ||
11075 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
11076 return;
11077
11078
11079
11080
11081
11082 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
11083 }
11084
11085 #else
11086 static inline void nohz_balancer_kick(struct rq *rq) { }
11087
11088 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
11089 {
11090 return false;
11091 }
11092
11093 static inline void nohz_newidle_balance(struct rq *this_rq) { }
11094 #endif
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
11106 {
11107 unsigned long next_balance = jiffies + HZ;
11108 int this_cpu = this_rq->cpu;
11109 u64 t0, t1, curr_cost = 0;
11110 struct sched_domain *sd;
11111 int pulled_task = 0;
11112
11113 update_misfit_status(NULL, this_rq);
11114
11115
11116
11117
11118
11119 if (this_rq->ttwu_pending)
11120 return 0;
11121
11122
11123
11124
11125
11126 this_rq->idle_stamp = rq_clock(this_rq);
11127
11128
11129
11130
11131 if (!cpu_active(this_cpu))
11132 return 0;
11133
11134
11135
11136
11137
11138
11139
11140 rq_unpin_lock(this_rq, rf);
11141
11142 rcu_read_lock();
11143 sd = rcu_dereference_check_sched_domain(this_rq->sd);
11144
11145 if (!READ_ONCE(this_rq->rd->overload) ||
11146 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
11147
11148 if (sd)
11149 update_next_balance(sd, &next_balance);
11150 rcu_read_unlock();
11151
11152 goto out;
11153 }
11154 rcu_read_unlock();
11155
11156 raw_spin_rq_unlock(this_rq);
11157
11158 t0 = sched_clock_cpu(this_cpu);
11159 update_blocked_averages(this_cpu);
11160
11161 rcu_read_lock();
11162 for_each_domain(this_cpu, sd) {
11163 int continue_balancing = 1;
11164 u64 domain_cost;
11165
11166 update_next_balance(sd, &next_balance);
11167
11168 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
11169 break;
11170
11171 if (sd->flags & SD_BALANCE_NEWIDLE) {
11172
11173 pulled_task = load_balance(this_cpu, this_rq,
11174 sd, CPU_NEWLY_IDLE,
11175 &continue_balancing);
11176
11177 t1 = sched_clock_cpu(this_cpu);
11178 domain_cost = t1 - t0;
11179 update_newidle_cost(sd, domain_cost);
11180
11181 curr_cost += domain_cost;
11182 t0 = t1;
11183 }
11184
11185
11186
11187
11188
11189 if (pulled_task || this_rq->nr_running > 0 ||
11190 this_rq->ttwu_pending)
11191 break;
11192 }
11193 rcu_read_unlock();
11194
11195 raw_spin_rq_lock(this_rq);
11196
11197 if (curr_cost > this_rq->max_idle_balance_cost)
11198 this_rq->max_idle_balance_cost = curr_cost;
11199
11200
11201
11202
11203
11204
11205 if (this_rq->cfs.h_nr_running && !pulled_task)
11206 pulled_task = 1;
11207
11208
11209 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
11210 pulled_task = -1;
11211
11212 out:
11213
11214 if (time_after(this_rq->next_balance, next_balance))
11215 this_rq->next_balance = next_balance;
11216
11217 if (pulled_task)
11218 this_rq->idle_stamp = 0;
11219 else
11220 nohz_newidle_balance(this_rq);
11221
11222 rq_repin_lock(this_rq, rf);
11223
11224 return pulled_task;
11225 }
11226
11227
11228
11229
11230
11231 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
11232 {
11233 struct rq *this_rq = this_rq();
11234 enum cpu_idle_type idle = this_rq->idle_balance ?
11235 CPU_IDLE : CPU_NOT_IDLE;
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245 if (nohz_idle_balance(this_rq, idle))
11246 return;
11247
11248
11249 update_blocked_averages(this_rq->cpu);
11250 rebalance_domains(this_rq, idle);
11251 }
11252
11253
11254
11255
11256 void trigger_load_balance(struct rq *rq)
11257 {
11258
11259
11260
11261
11262 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
11263 return;
11264
11265 if (time_after_eq(jiffies, rq->next_balance))
11266 raise_softirq(SCHED_SOFTIRQ);
11267
11268 nohz_balancer_kick(rq);
11269 }
11270
11271 static void rq_online_fair(struct rq *rq)
11272 {
11273 update_sysctl();
11274
11275 update_runtime_enabled(rq);
11276 }
11277
11278 static void rq_offline_fair(struct rq *rq)
11279 {
11280 update_sysctl();
11281
11282
11283 unthrottle_offline_cfs_rqs(rq);
11284 }
11285
11286 #endif
11287
11288 #ifdef CONFIG_SCHED_CORE
11289 static inline bool
11290 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
11291 {
11292 u64 slice = sched_slice(cfs_rq_of(se), se);
11293 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
11294
11295 return (rtime * min_nr_tasks > slice);
11296 }
11297
11298 #define MIN_NR_TASKS_DURING_FORCEIDLE 2
11299 static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
11300 {
11301 if (!sched_core_enabled(rq))
11302 return;
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
11319 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
11320 resched_curr(rq);
11321 }
11322
11323
11324
11325
11326 static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
11327 {
11328 for_each_sched_entity(se) {
11329 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11330
11331 if (forceidle) {
11332 if (cfs_rq->forceidle_seq == fi_seq)
11333 break;
11334 cfs_rq->forceidle_seq = fi_seq;
11335 }
11336
11337 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
11338 }
11339 }
11340
11341 void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
11342 {
11343 struct sched_entity *se = &p->se;
11344
11345 if (p->sched_class != &fair_sched_class)
11346 return;
11347
11348 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
11349 }
11350
11351 bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
11352 {
11353 struct rq *rq = task_rq(a);
11354 struct sched_entity *sea = &a->se;
11355 struct sched_entity *seb = &b->se;
11356 struct cfs_rq *cfs_rqa;
11357 struct cfs_rq *cfs_rqb;
11358 s64 delta;
11359
11360 SCHED_WARN_ON(task_rq(b)->core != rq->core);
11361
11362 #ifdef CONFIG_FAIR_GROUP_SCHED
11363
11364
11365
11366
11367 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
11368 int sea_depth = sea->depth;
11369 int seb_depth = seb->depth;
11370
11371 if (sea_depth >= seb_depth)
11372 sea = parent_entity(sea);
11373 if (sea_depth <= seb_depth)
11374 seb = parent_entity(seb);
11375 }
11376
11377 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
11378 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
11379
11380 cfs_rqa = sea->cfs_rq;
11381 cfs_rqb = seb->cfs_rq;
11382 #else
11383 cfs_rqa = &task_rq(a)->cfs;
11384 cfs_rqb = &task_rq(b)->cfs;
11385 #endif
11386
11387
11388
11389
11390
11391
11392 delta = (s64)(sea->vruntime - seb->vruntime) +
11393 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
11394
11395 return delta > 0;
11396 }
11397 #else
11398 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
11399 #endif
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11410 {
11411 struct cfs_rq *cfs_rq;
11412 struct sched_entity *se = &curr->se;
11413
11414 for_each_sched_entity(se) {
11415 cfs_rq = cfs_rq_of(se);
11416 entity_tick(cfs_rq, se, queued);
11417 }
11418
11419 if (static_branch_unlikely(&sched_numa_balancing))
11420 task_tick_numa(rq, curr);
11421
11422 update_misfit_status(curr, rq);
11423 update_overutilized_status(task_rq(curr));
11424
11425 task_tick_core(rq, curr);
11426 }
11427
11428
11429
11430
11431
11432
11433 static void task_fork_fair(struct task_struct *p)
11434 {
11435 struct cfs_rq *cfs_rq;
11436 struct sched_entity *se = &p->se, *curr;
11437 struct rq *rq = this_rq();
11438 struct rq_flags rf;
11439
11440 rq_lock(rq, &rf);
11441 update_rq_clock(rq);
11442
11443 cfs_rq = task_cfs_rq(current);
11444 curr = cfs_rq->curr;
11445 if (curr) {
11446 update_curr(cfs_rq);
11447 se->vruntime = curr->vruntime;
11448 }
11449 place_entity(cfs_rq, se, 1);
11450
11451 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11452
11453
11454
11455
11456 swap(curr->vruntime, se->vruntime);
11457 resched_curr(rq);
11458 }
11459
11460 se->vruntime -= cfs_rq->min_vruntime;
11461 rq_unlock(rq, &rf);
11462 }
11463
11464
11465
11466
11467
11468 static void
11469 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11470 {
11471 if (!task_on_rq_queued(p))
11472 return;
11473
11474 if (rq->cfs.nr_running == 1)
11475 return;
11476
11477
11478
11479
11480
11481
11482 if (task_current(rq, p)) {
11483 if (p->prio > oldprio)
11484 resched_curr(rq);
11485 } else
11486 check_preempt_curr(rq, p, 0);
11487 }
11488
11489 static inline bool vruntime_normalized(struct task_struct *p)
11490 {
11491 struct sched_entity *se = &p->se;
11492
11493
11494
11495
11496
11497
11498 if (p->on_rq)
11499 return true;
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510 if (!se->sum_exec_runtime ||
11511 (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
11512 return true;
11513
11514 return false;
11515 }
11516
11517 #ifdef CONFIG_FAIR_GROUP_SCHED
11518
11519
11520
11521
11522 static void propagate_entity_cfs_rq(struct sched_entity *se)
11523 {
11524 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11525
11526 if (cfs_rq_throttled(cfs_rq))
11527 return;
11528
11529 if (!throttled_hierarchy(cfs_rq))
11530 list_add_leaf_cfs_rq(cfs_rq);
11531
11532
11533 se = se->parent;
11534
11535 for_each_sched_entity(se) {
11536 cfs_rq = cfs_rq_of(se);
11537
11538 update_load_avg(cfs_rq, se, UPDATE_TG);
11539
11540 if (cfs_rq_throttled(cfs_rq))
11541 break;
11542
11543 if (!throttled_hierarchy(cfs_rq))
11544 list_add_leaf_cfs_rq(cfs_rq);
11545 }
11546 }
11547 #else
11548 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11549 #endif
11550
11551 static void detach_entity_cfs_rq(struct sched_entity *se)
11552 {
11553 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11554
11555
11556 update_load_avg(cfs_rq, se, 0);
11557 detach_entity_load_avg(cfs_rq, se);
11558 update_tg_load_avg(cfs_rq);
11559 propagate_entity_cfs_rq(se);
11560 }
11561
11562 static void attach_entity_cfs_rq(struct sched_entity *se)
11563 {
11564 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11565
11566 #ifdef CONFIG_FAIR_GROUP_SCHED
11567
11568
11569
11570
11571 se->depth = se->parent ? se->parent->depth + 1 : 0;
11572 #endif
11573
11574
11575 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11576 attach_entity_load_avg(cfs_rq, se);
11577 update_tg_load_avg(cfs_rq);
11578 propagate_entity_cfs_rq(se);
11579 }
11580
11581 static void detach_task_cfs_rq(struct task_struct *p)
11582 {
11583 struct sched_entity *se = &p->se;
11584 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11585
11586 if (!vruntime_normalized(p)) {
11587
11588
11589
11590
11591 place_entity(cfs_rq, se, 0);
11592 se->vruntime -= cfs_rq->min_vruntime;
11593 }
11594
11595 detach_entity_cfs_rq(se);
11596 }
11597
11598 static void attach_task_cfs_rq(struct task_struct *p)
11599 {
11600 struct sched_entity *se = &p->se;
11601 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11602
11603 attach_entity_cfs_rq(se);
11604
11605 if (!vruntime_normalized(p))
11606 se->vruntime += cfs_rq->min_vruntime;
11607 }
11608
11609 static void switched_from_fair(struct rq *rq, struct task_struct *p)
11610 {
11611 detach_task_cfs_rq(p);
11612 }
11613
11614 static void switched_to_fair(struct rq *rq, struct task_struct *p)
11615 {
11616 attach_task_cfs_rq(p);
11617
11618 if (task_on_rq_queued(p)) {
11619
11620
11621
11622
11623
11624 if (task_current(rq, p))
11625 resched_curr(rq);
11626 else
11627 check_preempt_curr(rq, p, 0);
11628 }
11629 }
11630
11631
11632
11633
11634
11635
11636 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
11637 {
11638 struct sched_entity *se = &p->se;
11639
11640 #ifdef CONFIG_SMP
11641 if (task_on_rq_queued(p)) {
11642
11643
11644
11645
11646 list_move(&se->group_node, &rq->cfs_tasks);
11647 }
11648 #endif
11649
11650 for_each_sched_entity(se) {
11651 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11652
11653 set_next_entity(cfs_rq, se);
11654
11655 account_cfs_rq_runtime(cfs_rq, 0);
11656 }
11657 }
11658
11659 void init_cfs_rq(struct cfs_rq *cfs_rq)
11660 {
11661 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
11662 u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
11663 #ifdef CONFIG_SMP
11664 raw_spin_lock_init(&cfs_rq->removed.lock);
11665 #endif
11666 }
11667
11668 #ifdef CONFIG_FAIR_GROUP_SCHED
11669 static void task_set_group_fair(struct task_struct *p)
11670 {
11671 struct sched_entity *se = &p->se;
11672
11673 set_task_rq(p, task_cpu(p));
11674 se->depth = se->parent ? se->parent->depth + 1 : 0;
11675 }
11676
11677 static void task_move_group_fair(struct task_struct *p)
11678 {
11679 detach_task_cfs_rq(p);
11680 set_task_rq(p, task_cpu(p));
11681
11682 #ifdef CONFIG_SMP
11683
11684 p->se.avg.last_update_time = 0;
11685 #endif
11686 attach_task_cfs_rq(p);
11687 }
11688
11689 static void task_change_group_fair(struct task_struct *p, int type)
11690 {
11691 switch (type) {
11692 case TASK_SET_GROUP:
11693 task_set_group_fair(p);
11694 break;
11695
11696 case TASK_MOVE_GROUP:
11697 task_move_group_fair(p);
11698 break;
11699 }
11700 }
11701
11702 void free_fair_sched_group(struct task_group *tg)
11703 {
11704 int i;
11705
11706 for_each_possible_cpu(i) {
11707 if (tg->cfs_rq)
11708 kfree(tg->cfs_rq[i]);
11709 if (tg->se)
11710 kfree(tg->se[i]);
11711 }
11712
11713 kfree(tg->cfs_rq);
11714 kfree(tg->se);
11715 }
11716
11717 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11718 {
11719 struct sched_entity *se;
11720 struct cfs_rq *cfs_rq;
11721 int i;
11722
11723 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
11724 if (!tg->cfs_rq)
11725 goto err;
11726 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
11727 if (!tg->se)
11728 goto err;
11729
11730 tg->shares = NICE_0_LOAD;
11731
11732 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11733
11734 for_each_possible_cpu(i) {
11735 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11736 GFP_KERNEL, cpu_to_node(i));
11737 if (!cfs_rq)
11738 goto err;
11739
11740 se = kzalloc_node(sizeof(struct sched_entity_stats),
11741 GFP_KERNEL, cpu_to_node(i));
11742 if (!se)
11743 goto err_free_rq;
11744
11745 init_cfs_rq(cfs_rq);
11746 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11747 init_entity_runnable_average(se);
11748 }
11749
11750 return 1;
11751
11752 err_free_rq:
11753 kfree(cfs_rq);
11754 err:
11755 return 0;
11756 }
11757
11758 void online_fair_sched_group(struct task_group *tg)
11759 {
11760 struct sched_entity *se;
11761 struct rq_flags rf;
11762 struct rq *rq;
11763 int i;
11764
11765 for_each_possible_cpu(i) {
11766 rq = cpu_rq(i);
11767 se = tg->se[i];
11768 rq_lock_irq(rq, &rf);
11769 update_rq_clock(rq);
11770 attach_entity_cfs_rq(se);
11771 sync_throttle(tg, i);
11772 rq_unlock_irq(rq, &rf);
11773 }
11774 }
11775
11776 void unregister_fair_sched_group(struct task_group *tg)
11777 {
11778 unsigned long flags;
11779 struct rq *rq;
11780 int cpu;
11781
11782 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11783
11784 for_each_possible_cpu(cpu) {
11785 if (tg->se[cpu])
11786 remove_entity_load_avg(tg->se[cpu]);
11787
11788
11789
11790
11791
11792 if (!tg->cfs_rq[cpu]->on_list)
11793 continue;
11794
11795 rq = cpu_rq(cpu);
11796
11797 raw_spin_rq_lock_irqsave(rq, flags);
11798 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11799 raw_spin_rq_unlock_irqrestore(rq, flags);
11800 }
11801 }
11802
11803 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11804 struct sched_entity *se, int cpu,
11805 struct sched_entity *parent)
11806 {
11807 struct rq *rq = cpu_rq(cpu);
11808
11809 cfs_rq->tg = tg;
11810 cfs_rq->rq = rq;
11811 init_cfs_rq_runtime(cfs_rq);
11812
11813 tg->cfs_rq[cpu] = cfs_rq;
11814 tg->se[cpu] = se;
11815
11816
11817 if (!se)
11818 return;
11819
11820 if (!parent) {
11821 se->cfs_rq = &rq->cfs;
11822 se->depth = 0;
11823 } else {
11824 se->cfs_rq = parent->my_q;
11825 se->depth = parent->depth + 1;
11826 }
11827
11828 se->my_q = cfs_rq;
11829
11830 update_load_set(&se->load, NICE_0_LOAD);
11831 se->parent = parent;
11832 }
11833
11834 static DEFINE_MUTEX(shares_mutex);
11835
11836 static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
11837 {
11838 int i;
11839
11840 lockdep_assert_held(&shares_mutex);
11841
11842
11843
11844
11845 if (!tg->se[0])
11846 return -EINVAL;
11847
11848 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11849
11850 if (tg->shares == shares)
11851 return 0;
11852
11853 tg->shares = shares;
11854 for_each_possible_cpu(i) {
11855 struct rq *rq = cpu_rq(i);
11856 struct sched_entity *se = tg->se[i];
11857 struct rq_flags rf;
11858
11859
11860 rq_lock_irqsave(rq, &rf);
11861 update_rq_clock(rq);
11862 for_each_sched_entity(se) {
11863 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
11864 update_cfs_group(se);
11865 }
11866 rq_unlock_irqrestore(rq, &rf);
11867 }
11868
11869 return 0;
11870 }
11871
11872 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11873 {
11874 int ret;
11875
11876 mutex_lock(&shares_mutex);
11877 if (tg_is_idle(tg))
11878 ret = -EINVAL;
11879 else
11880 ret = __sched_group_set_shares(tg, shares);
11881 mutex_unlock(&shares_mutex);
11882
11883 return ret;
11884 }
11885
11886 int sched_group_set_idle(struct task_group *tg, long idle)
11887 {
11888 int i;
11889
11890 if (tg == &root_task_group)
11891 return -EINVAL;
11892
11893 if (idle < 0 || idle > 1)
11894 return -EINVAL;
11895
11896 mutex_lock(&shares_mutex);
11897
11898 if (tg->idle == idle) {
11899 mutex_unlock(&shares_mutex);
11900 return 0;
11901 }
11902
11903 tg->idle = idle;
11904
11905 for_each_possible_cpu(i) {
11906 struct rq *rq = cpu_rq(i);
11907 struct sched_entity *se = tg->se[i];
11908 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
11909 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
11910 long idle_task_delta;
11911 struct rq_flags rf;
11912
11913 rq_lock_irqsave(rq, &rf);
11914
11915 grp_cfs_rq->idle = idle;
11916 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
11917 goto next_cpu;
11918
11919 if (se->on_rq) {
11920 parent_cfs_rq = cfs_rq_of(se);
11921 if (cfs_rq_is_idle(grp_cfs_rq))
11922 parent_cfs_rq->idle_nr_running++;
11923 else
11924 parent_cfs_rq->idle_nr_running--;
11925 }
11926
11927 idle_task_delta = grp_cfs_rq->h_nr_running -
11928 grp_cfs_rq->idle_h_nr_running;
11929 if (!cfs_rq_is_idle(grp_cfs_rq))
11930 idle_task_delta *= -1;
11931
11932 for_each_sched_entity(se) {
11933 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11934
11935 if (!se->on_rq)
11936 break;
11937
11938 cfs_rq->idle_h_nr_running += idle_task_delta;
11939
11940
11941 if (cfs_rq_is_idle(cfs_rq))
11942 break;
11943 }
11944
11945 next_cpu:
11946 rq_unlock_irqrestore(rq, &rf);
11947 }
11948
11949
11950 if (tg_is_idle(tg))
11951 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
11952 else
11953 __sched_group_set_shares(tg, NICE_0_LOAD);
11954
11955 mutex_unlock(&shares_mutex);
11956 return 0;
11957 }
11958
11959 #else
11960
11961 void free_fair_sched_group(struct task_group *tg) { }
11962
11963 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11964 {
11965 return 1;
11966 }
11967
11968 void online_fair_sched_group(struct task_group *tg) { }
11969
11970 void unregister_fair_sched_group(struct task_group *tg) { }
11971
11972 #endif
11973
11974
11975 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
11976 {
11977 struct sched_entity *se = &task->se;
11978 unsigned int rr_interval = 0;
11979
11980
11981
11982
11983
11984 if (rq->cfs.load.weight)
11985 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
11986
11987 return rr_interval;
11988 }
11989
11990
11991
11992
11993 DEFINE_SCHED_CLASS(fair) = {
11994
11995 .enqueue_task = enqueue_task_fair,
11996 .dequeue_task = dequeue_task_fair,
11997 .yield_task = yield_task_fair,
11998 .yield_to_task = yield_to_task_fair,
11999
12000 .check_preempt_curr = check_preempt_wakeup,
12001
12002 .pick_next_task = __pick_next_task_fair,
12003 .put_prev_task = put_prev_task_fair,
12004 .set_next_task = set_next_task_fair,
12005
12006 #ifdef CONFIG_SMP
12007 .balance = balance_fair,
12008 .pick_task = pick_task_fair,
12009 .select_task_rq = select_task_rq_fair,
12010 .migrate_task_rq = migrate_task_rq_fair,
12011
12012 .rq_online = rq_online_fair,
12013 .rq_offline = rq_offline_fair,
12014
12015 .task_dead = task_dead_fair,
12016 .set_cpus_allowed = set_cpus_allowed_common,
12017 #endif
12018
12019 .task_tick = task_tick_fair,
12020 .task_fork = task_fork_fair,
12021
12022 .prio_changed = prio_changed_fair,
12023 .switched_from = switched_from_fair,
12024 .switched_to = switched_to_fair,
12025
12026 .get_rr_interval = get_rr_interval_fair,
12027
12028 .update_curr = update_curr_fair,
12029
12030 #ifdef CONFIG_FAIR_GROUP_SCHED
12031 .task_change_group = task_change_group_fair,
12032 #endif
12033
12034 #ifdef CONFIG_UCLAMP_TASK
12035 .uclamp_enabled = 1,
12036 #endif
12037 };
12038
12039 #ifdef CONFIG_SCHED_DEBUG
12040 void print_cfs_stats(struct seq_file *m, int cpu)
12041 {
12042 struct cfs_rq *cfs_rq, *pos;
12043
12044 rcu_read_lock();
12045 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
12046 print_cfs_rq(m, cpu, cfs_rq);
12047 rcu_read_unlock();
12048 }
12049
12050 #ifdef CONFIG_NUMA_BALANCING
12051 void show_numa_stats(struct task_struct *p, struct seq_file *m)
12052 {
12053 int node;
12054 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
12055 struct numa_group *ng;
12056
12057 rcu_read_lock();
12058 ng = rcu_dereference(p->numa_group);
12059 for_each_online_node(node) {
12060 if (p->numa_faults) {
12061 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
12062 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
12063 }
12064 if (ng) {
12065 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
12066 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
12067 }
12068 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
12069 }
12070 rcu_read_unlock();
12071 }
12072 #endif
12073 #endif
12074
12075 __init void init_sched_fair_class(void)
12076 {
12077 #ifdef CONFIG_SMP
12078 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
12079
12080 #ifdef CONFIG_NO_HZ_COMMON
12081 nohz.next_balance = jiffies;
12082 nohz.next_blocked = jiffies;
12083 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
12084 #endif
12085 #endif
12086
12087 }