// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * kernel/stop_machine.c
 *
 * Copyright (C) 2008, 2005	IBM Corporation.
 * Copyright (C) 2008, 2005	Rusty Russell rusty@rustcorp.com.au
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 */
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <linux/sched/wake_q.h>

/*
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
struct cpu_stop_done {
	atomic_t		nr_todo;	/* nr left to execute */
	int			ret;		/* collected return value */
	struct completion	completion;	/* fired if nr_todo reaches 0 */
};

/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
	struct task_struct	*thread;

	raw_spinlock_t		lock;
	bool			enabled;	/* is this stopper enabled? */
	struct list_head	works;		/* list of pending works */

	struct cpu_stop_work	stop_work;	/* for stop_cpus */
	unsigned long		caller;
	cpu_stop_fn_t		fn;
};

static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;

void print_stop_info(const char *log_lvl, struct task_struct *task)
{
	/*
	 * If @task is a stopper task, it cannot migrate and task_cpu() is
	 * stable.
	 */
	struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));

	if (task != stopper->thread)
		return;

	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
}

/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;

static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
	memset(done, 0, sizeof(*done));
	atomic_set(&done->nr_todo, nr_todo);
	init_completion(&done->completion);
}

/* signal completion unless @done is NULL */
static void cpu_stop_signal_done(struct cpu_stop_done *done)
{
	if (atomic_dec_and_test(&done->nr_todo))
		complete(&done->completion);
}

static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
				  struct cpu_stop_work *work,
				  struct wake_q_head *wakeq)
{
	list_add_tail(&work->list, &stopper->works);
	wake_q_add(wakeq, stopper->thread);
}

/* queue @work to @stopper.  if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	DEFINE_WAKE_Q(wakeq);
	unsigned long flags;
	bool enabled;

	preempt_disable();
	raw_spin_lock_irqsave(&stopper->lock, flags);
	enabled = stopper->enabled;
	if (enabled)
		__cpu_stop_queue_work(stopper, work, &wakeq);
	else if (work->done)
		cpu_stop_signal_done(work->done);
	raw_spin_unlock_irqrestore(&stopper->lock, flags);

	wake_up_q(&wakeq);
	preempt_enable();

	return enabled;
}

/**
 * stop_one_cpu - stop a cpu
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
 * the highest priority preempting any task on the cpu and
 * monopolizing it.  This function returns after the execution is
 * complete.
 *
 * This function doesn't guarantee @cpu stays online till @fn
 * completes.  If @cpu goes down in the middle, execution may happen
 * partially or fully on different cpus.  @fn should either be ready
 * for that or the caller should ensure that @cpu stays online until
 * this function completes.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
 * otherwise, the return value of @fn.
 */
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };

	cpu_stop_init_done(&done, 1);
	if (!cpu_stop_queue_work(cpu, &work))
		return -ENOENT;
	/*
	 * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
	 * cycle by doing a preemption:
	 */
	cond_resched();
	wait_for_completion(&done.completion);
	return done.ret;
}
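
/*
 * Example usage (an illustrative sketch, not from the kernel tree; the
 * callback and counter helper are hypothetical).  The callback runs from
 * CPU 3's stopper thread at the highest priority and must not sleep:
 *
 *	static int read_counter_on_cpu(void *arg)
 *	{
 *		u64 *val = arg;
 *
 *		*val = my_read_local_counter();	// hypothetical helper
 *		return 0;
 *	}
 *
 *	u64 val;
 *	int err = stop_one_cpu(3, read_counter_on_cpu, &val);
 *	// err is -ENOENT if CPU 3 was offline, else the callback's return.
 */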

/* This controls the threads on each CPU. */
enum multi_stop_state {
	/* Dummy starting state for thread. */
	MULTI_STOP_NONE,
	/* Awaiting everyone to be scheduled. */
	MULTI_STOP_PREPARE,
	/* Disable interrupts. */
	MULTI_STOP_DISABLE_IRQ,
	/* Run the function */
	MULTI_STOP_RUN,
	/* Exit */
	MULTI_STOP_EXIT,
};

struct multi_stop_data {
	cpu_stop_fn_t		fn;
	void			*data;
	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
	unsigned int		num_threads;
	const struct cpumask	*active_cpus;

	enum multi_stop_state	state;
	atomic_t		thread_ack;
};

static void set_state(struct multi_stop_data *msdata,
		      enum multi_stop_state newstate)
{
	/* Reset ack counter. */
	atomic_set(&msdata->thread_ack, msdata->num_threads);
	smp_wmb();
	WRITE_ONCE(msdata->state, newstate);
}

/* Last one to ack a state moves to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
	if (atomic_dec_and_test(&msdata->thread_ack))
		set_state(msdata, msdata->state + 1);
}

notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
{
	cpu_relax();
}
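
/*
 * Handshake sketch (illustrative): every participating CPU spins in
 * multi_cpu_stop() below, and the group advances one state at a time,
 * only after all CPUs have ack'ed the current state via ack_state():
 *
 *	PREPARE -----> DISABLE_IRQ -----> RUN -------> EXIT
 *	all CPUs       all CPUs spin      active       everyone restores
 *	reach the      with IRQs off      CPU(s)       flags and returns
 *	loop                              run @fn
 */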

/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
	struct multi_stop_data *msdata = data;
	enum multi_stop_state newstate, curstate = MULTI_STOP_NONE;
	int cpu = smp_processor_id(), err = 0;
	const struct cpumask *cpumask;
	unsigned long flags;
	bool is_active;

	/*
	 * When called from stop_machine_from_inactive_cpu(), irq might
	 * already be disabled.  Save the state and restore it on exit.
	 */
	local_save_flags(flags);

	if (!msdata->active_cpus) {
		cpumask = cpu_online_mask;
		is_active = cpu == cpumask_first(cpumask);
	} else {
		cpumask = msdata->active_cpus;
		is_active = cpumask_test_cpu(cpu, cpumask);
	}

	/* Simple state machine */
	do {
		/* Chill out and ensure we re-read multi_stop_state. */
		stop_machine_yield(cpumask);
		newstate = READ_ONCE(msdata->state);
		if (newstate != curstate) {
			curstate = newstate;
			switch (curstate) {
			case MULTI_STOP_DISABLE_IRQ:
				local_irq_disable();
				hard_irq_disable();
				break;
			case MULTI_STOP_RUN:
				if (is_active)
					err = msdata->fn(msdata->data);
				break;
			default:
				break;
			}
			ack_state(msdata);
		} else if (curstate > MULTI_STOP_PREPARE) {
			/*
			 * At this stage all other CPUs we depend on must spin
			 * in the same loop. Any reason for hard-lockup should
			 * be detected and reported on their side.
			 */
			touch_nmi_watchdog();
		}
		rcu_momentary_dyntick_idle();
	} while (curstate != MULTI_STOP_EXIT);

	local_irq_restore(flags);
	return err;
}

static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
				    int cpu2, struct cpu_stop_work *work2)
{
	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
	DEFINE_WAKE_Q(wakeq);
	int err;

retry:
	/*
	 * The waking up of stopper threads has to happen in the same
	 * scheduling context as the queueing.  Otherwise, there is a
	 * possibility of one of the above stoppers being woken up by another
	 * CPU, and preempting us. This will cause us to not wake up the other
	 * stopper forever.
	 */
	preempt_disable();
	raw_spin_lock_irq(&stopper1->lock);
	raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);

	if (!stopper1->enabled || !stopper2->enabled) {
		err = -ENOENT;
		goto unlock;
	}

	/*
	 * Ensure that if we race with __stop_cpus() the stoppers won't get
	 * queued up in reverse order leading to system deadlock.
	 *
	 * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
	 * queued a work on cpu1 but not on cpu2, we hold both locks.
	 *
	 * It can be falsely true but it is safe to spin until it is cleared,
	 * queue_stop_cpus_work() does everything under preempt_disable().
	 */
	if (unlikely(stop_cpus_in_progress)) {
		err = -EDEADLK;
		goto unlock;
	}

	err = 0;
	__cpu_stop_queue_work(stopper1, work1, &wakeq);
	__cpu_stop_queue_work(stopper2, work2, &wakeq);

unlock:
	raw_spin_unlock(&stopper2->lock);
	raw_spin_unlock_irq(&stopper1->lock);

	if (unlikely(err == -EDEADLK)) {
		preempt_enable();

		while (stop_cpus_in_progress)
			cpu_relax();

		goto retry;
	}

	wake_up_q(&wakeq);
	preempt_enable();

	return err;
}

/**
 * stop_two_cpus - stops two cpus
 * @cpu1: the cpu to stop
 * @cpu2: the other cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Stops both the current and specified CPU and runs @fn on one of them.
 *
 * returns when both are completed.
 */
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work1, work2;
	struct multi_stop_data msdata;

	msdata = (struct multi_stop_data){
		.fn = fn,
		.data = arg,
		.num_threads = 2,
		.active_cpus = cpumask_of(cpu1),
	};

	work1 = work2 = (struct cpu_stop_work){
		.fn = multi_cpu_stop,
		.arg = &msdata,
		.done = &done,
		.caller = _RET_IP_,
	};

	cpu_stop_init_done(&done, 2);
	set_state(&msdata, MULTI_STOP_PREPARE);

	if (cpu1 > cpu2)
		swap(cpu1, cpu2);
	if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
		return -ENOENT;

	wait_for_completion(&done.completion);
	return done.ret;
}
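
/*
 * Example usage (an illustrative sketch with hypothetical names).  Both
 * CPUs are stopped in lock-step and @fn runs on exactly one of them, so
 * the callback can safely touch state belonging to either CPU:
 *
 *	static int swap_percpu_state(void *arg)
 *	{
 *		struct my_swap_args *args = arg;	// hypothetical type
 *
 *		// Neither args->src_cpu nor args->dst_cpu is running
 *		// anything else at this point.
 *		return 0;
 *	}
 *
 *	err = stop_two_cpus(src_cpu, dst_cpu, swap_percpu_state, &args);
 */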

/**
 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 * @work_buf: pointer to cpu_stop_work structure
 *
 * Similar to stop_one_cpu() but doesn't wait for completion.  The
 * caller is responsible for ensuring @work_buf is currently unused
 * and will remain untouched until stopper starts executing @fn.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * true if cpu_stop_work was queued successfully and @fn will be called,
 * false otherwise.
 */
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			 struct cpu_stop_work *work_buf)
{
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
	return cpu_stop_queue_work(cpu, work_buf);
}
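
/*
 * Example usage (an illustrative sketch with hypothetical names).  Because
 * this variant does not wait, @work_buf must outlive the request, so
 * callers typically embed it in longer-lived (e.g. per-CPU) storage:
 *
 *	static DEFINE_PER_CPU(struct cpu_stop_work, my_stop_work);
 *
 *	static int my_stop_fn(void *arg)
 *	{
 *		return 0;
 *	}
 *
 *	stop_one_cpu_nowait(cpu, my_stop_fn, NULL,
 *			    &per_cpu(my_stop_work, cpu));
 */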

static bool queue_stop_cpus_work(const struct cpumask *cpumask,
				 cpu_stop_fn_t fn, void *arg,
				 struct cpu_stop_done *done)
{
	struct cpu_stop_work *work;
	unsigned int cpu;
	bool queued = false;

	/*
	 * Disable preemption while queueing to avoid getting
	 * preempted by a stopper which might wait for other stoppers
	 * to enter @fn which can lead to deadlock.
	 */
	preempt_disable();
	stop_cpus_in_progress = true;
	barrier();
	for_each_cpu(cpu, cpumask) {
		work = &per_cpu(cpu_stopper.stop_work, cpu);
		work->fn = fn;
		work->arg = arg;
		work->done = done;
		work->caller = _RET_IP_;
		if (cpu_stop_queue_work(cpu, work))
			queued = true;
	}
	barrier();
	stop_cpus_in_progress = false;
	preempt_enable();

	return queued;
}

static int __stop_cpus(const struct cpumask *cpumask,
		       cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;

	cpu_stop_init_done(&done, cpumask_weight(cpumask));
	if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
		return -ENOENT;
	wait_for_completion(&done.completion);
	return done.ret;
}

/**
 * stop_cpus - stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
 * @fn is run in a process context with the highest priority
 * preempting any task on the cpu and monopolizing it.  This function
 * returns after all executions are complete.
 *
 * This function doesn't guarantee the cpus in @cpumask stay online
 * till @fn completes.  If some cpus go down in the middle, execution
 * on the cpu may happen partially or fully on different cpus.  @fn
 * should either be ready for that or the caller should ensure that
 * the cpus stay online until this function completes.
 *
 * All stop_cpus() calls are serialized making it safe for @fn to wait
 * for all cpu stoppers to enter @fn before single cpu stopper
 * executes its function.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
 * @cpumask were offline; otherwise, 0 if all executions of @fn
 * returned 0, any non zero return value if any returned non zero.
 */
static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
	int ret;

	/* static works are used, process one request at a time */
	mutex_lock(&stop_cpus_mutex);
	ret = __stop_cpus(cpumask, fn, arg);
	mutex_unlock(&stop_cpus_mutex);
	return ret;
}

static int cpu_stop_should_run(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	unsigned long flags;
	int run;

	raw_spin_lock_irqsave(&stopper->lock, flags);
	run = !list_empty(&stopper->works);
	raw_spin_unlock_irqrestore(&stopper->lock, flags);
	return run;
}

static void cpu_stopper_thread(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct cpu_stop_work *work;

repeat:
	work = NULL;
	raw_spin_lock_irq(&stopper->lock);
	if (!list_empty(&stopper->works)) {
		work = list_first_entry(&stopper->works,
					struct cpu_stop_work, list);
		list_del_init(&work->list);
	}
	raw_spin_unlock_irq(&stopper->lock);

	if (work) {
		cpu_stop_fn_t fn = work->fn;
		void *arg = work->arg;
		struct cpu_stop_done *done = work->done;
		int ret;

		/* cpu stop callbacks must not sleep, make in_atomic() == T */
		stopper->caller = work->caller;
		stopper->fn = fn;
		preempt_count_inc();
		ret = fn(arg);
		if (done) {
			if (ret)
				done->ret = ret;
			cpu_stop_signal_done(done);
		}
		preempt_count_dec();
		stopper->fn = NULL;
		stopper->caller = 0;
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
		goto repeat;
	}
}

void stop_machine_park(int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	/*
	 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
	 * the pending works before it parks, until then it is fine to queue
	 * the new works.
	 */
	stopper->enabled = false;
	kthread_park(stopper->thread);
}

static void cpu_stop_create(unsigned int cpu)
{
	sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
}

static void cpu_stop_park(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	WARN_ON(!list_empty(&stopper->works));
}

void stop_machine_unpark(int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	stopper->enabled = true;
	kthread_unpark(stopper->thread);
}

static struct smp_hotplug_thread cpu_stop_threads = {
	.store			= &cpu_stopper.thread,
	.thread_should_run	= cpu_stop_should_run,
	.thread_fn		= cpu_stopper_thread,
	.thread_comm		= "migration/%u",
	.create			= cpu_stop_create,
	.park			= cpu_stop_park,
	.selfparking		= true,
};

static int __init cpu_stop_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

		raw_spin_lock_init(&stopper->lock);
		INIT_LIST_HEAD(&stopper->works);
	}

	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
	stop_machine_unpark(raw_smp_processor_id());
	stop_machine_initialized = true;
	return 0;
}
early_initcall(cpu_stop_init);

int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
			    const struct cpumask *cpus)
{
	struct multi_stop_data msdata = {
		.fn = fn,
		.data = data,
		.num_threads = num_online_cpus(),
		.active_cpus = cpus,
	};

	lockdep_assert_cpus_held();

	if (!stop_machine_initialized) {
		/*
		 * Handle the case where stop_machine() is called
		 * early in boot before stop_machine() has been
		 * initialized.
		 */
		unsigned long flags;
		int ret;

		WARN_ON_ONCE(msdata.num_threads != 1);

		local_irq_save(flags);
		hard_irq_disable();
		ret = (*fn)(data);
		local_irq_restore(flags);

		return ret;
	}

	/* Set the initial state and stop all online cpus. */
	set_state(&msdata, MULTI_STOP_PREPARE);
	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}

int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
	int ret;

	/* No CPUs can come up or down during this. */
	cpus_read_lock();
	ret = stop_machine_cpuslocked(fn, data, cpus);
	cpus_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
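
/*
 * Example usage (an illustrative sketch with hypothetical names).
 * stop_machine() parks every online CPU in multi_cpu_stop() with
 * interrupts disabled and runs @fn on one CPU, so it suits updates
 * that no CPU may observe half-way (e.g. code patching):
 *
 *	static int apply_update(void *arg)
 *	{
 *		struct my_update *u = arg;	// hypothetical type
 *
 *		// All other online CPUs spin with IRQs off here.
 *		return do_apply(u);		// hypothetical helper
 *	}
 *
 *	err = stop_machine(apply_update, &update, NULL);
 */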

#ifdef CONFIG_SCHED_SMT
int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);

	struct multi_stop_data msdata = {
		.fn = fn,
		.data = data,
		.num_threads = cpumask_weight(smt_mask),
		.active_cpus = smt_mask,
	};

	lockdep_assert_cpus_held();

	/* Set the initial state and stop all online cpus. */
	set_state(&msdata, MULTI_STOP_PREPARE);
	return stop_cpus(smt_mask, multi_cpu_stop, &msdata);
}
EXPORT_SYMBOL_GPL(stop_core_cpuslocked);
#endif

/**
 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
 * @fn: the function to run
 * @data: the data ptr for the @fn()
 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
 *
 * This is identical to stop_machine() but can be called from a CPU which
 * is not active.  The local CPU is in the process of hotplug (so no other
 * CPU hotplug can start) and not marked active and doesn't have enough
 * context to sleep.
 *
 * This function provides stop_machine() functionality for such state by
 * using busy-wait for synchronization and executing @fn directly for local
 * CPU.
 *
 * CONTEXT:
 * Local CPU is inactive.  Temporarily stops all active CPUs.
 *
 * RETURNS:
 * 0 if all executions of @fn returned 0, any non zero return value if any
 * returned non zero.
 */
int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
				   const struct cpumask *cpus)
{
	struct multi_stop_data msdata = { .fn = fn, .data = data,
					  .active_cpus = cpus };
	struct cpu_stop_done done;
	int ret;

	/* Local CPU must be inactive and CPU hotplug in progress. */
	BUG_ON(cpu_active(raw_smp_processor_id()));
	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */

	/* No proper task established and can't sleep - busy wait for lock. */
	while (!mutex_trylock(&stop_cpus_mutex))
		cpu_relax();

	/* Schedule work on other CPUs and execute directly for local CPU */
	set_state(&msdata, MULTI_STOP_PREPARE);
	cpu_stop_init_done(&done, num_active_cpus());
	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
			     &done);
	ret = multi_cpu_stop(&msdata);

	/* Busy wait for completion. */
	while (!completion_done(&done.completion))
		cpu_relax();

	mutex_unlock(&stop_cpus_mutex);
	return ret ?: done.ret;
}