0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #define pr_fmt(fmt) "mobility: " fmt
0011
0012 #include <linux/cpu.h>
0013 #include <linux/kernel.h>
0014 #include <linux/kobject.h>
0015 #include <linux/nmi.h>
0016 #include <linux/sched.h>
0017 #include <linux/smp.h>
0018 #include <linux/stat.h>
0019 #include <linux/stop_machine.h>
0020 #include <linux/completion.h>
0021 #include <linux/device.h>
0022 #include <linux/delay.h>
0023 #include <linux/slab.h>
0024 #include <linux/stringify.h>
0025
0026 #include <asm/machdep.h>
0027 #include <asm/rtas.h>
0028 #include "pseries.h"
0029 #include "vas.h" /* vas_migration_handler() */
0030 #include "../../kernel/cacheinfo.h"
0031
/* Parent kobject for the /sys/kernel/mobility files created in mobility_sysfs_init(). */
static struct kobject *mobility_kobj;
0033
/*
 * Header of the work area filled in by the ibm,update-properties RTAS
 * call; all fields are big-endian as supplied by firmware. The packed
 * property entries follow this header in the buffer (see
 * update_dt_node() for the walking code).
 */
struct update_props_workarea {
	__be32 phandle;		/* phandle of the node being updated */
	__be32 state;
	__be64 reserved;
	__be32 nprops;		/* number of property entries that follow */
} __packed;
0040
/* Each node descriptor word packs an action (high byte) and a count. */
#define NODE_ACTION_MASK 0xff000000
#define NODE_COUNT_MASK 0x00ffffff

/* Action values matched in pseries_devicetree_update(). */
#define DELETE_DT_NODE 0x01000000
#define UPDATE_DT_NODE 0x02000000
#define ADD_DT_NODE 0x03000000

/* Scope arguments for ibm,update-nodes / ibm,update-properties. */
#define MIGRATION_SCOPE (1)
#define PRRN_SCOPE -2
0050
#ifdef CONFIG_PPC_WATCHDOG
/*
 * Percentage applied to the NMI watchdog timeout around a live
 * migration (see pseries_migrate_partition()); tunable through
 * /proc/sys/kernel/nmi_wd_lpm_factor.
 */
static unsigned int nmi_wd_lpm_factor = 200;

#ifdef CONFIG_SYSCTL
/* sysctl table describing the nmi_wd_lpm_factor entry. */
static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = {
	{
		.procname = "nmi_wd_lpm_factor",
		.data = &nmi_wd_lpm_factor,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_douintvec_minmax,
	},
	{}
};

/* Parent "kernel" directory entry for the table above. */
static struct ctl_table nmi_wd_lpm_factor_sysctl_root[] = {
	{
		.procname = "kernel",
		.mode = 0555,
		.child = nmi_wd_lpm_factor_ctl_table,
	},
	{}
};

/* Register /proc/sys/kernel/nmi_wd_lpm_factor at device-initcall time. */
static int __init register_nmi_wd_lpm_factor_sysctl(void)
{
	register_sysctl_table(nmi_wd_lpm_factor_sysctl_root);

	return 0;
}
device_initcall(register_nmi_wd_lpm_factor_sysctl);
#endif
#endif
0083
/*
 * Issue an RTAS call that communicates through the global RTAS data
 * buffer. @buf must be RTAS_DATA_BUF_SIZE bytes; it is copied into
 * rtas_data_buf before the call and back out afterwards, all under
 * rtas_data_buf_lock. Returns the RTAS call status.
 */
static int mobility_rtas_call(int token, char *buf, s32 scope)
{
	int rc;

	spin_lock(&rtas_data_buf_lock);

	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);

	spin_unlock(&rtas_data_buf_lock);
	return rc;
}
0097
/*
 * Handle a DELETE_DT_NODE directive from firmware: detach @dn from the
 * live device tree — except for nodes in the ibm,platform-facilities
 * hierarchy, which are deliberately left in place. Always returns 0.
 */
static int delete_dt_node(struct device_node *dn)
{
	struct device_node *pdn;
	bool is_platfac;

	pdn = of_get_parent(dn);
	is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||
		     of_node_is_type(pdn, "ibm,platform-facilities");
	of_node_put(pdn);

	/*
	 * Remove and add operations for platform-facilities nodes are
	 * both ignored (see add_dt_node()), keeping the pre-migration
	 * subtree intact. NOTE(review): presumably the drivers bound to
	 * these nodes do not support node removal/re-add — confirm.
	 */
	if (is_platfac) {
		pr_notice("ignoring remove operation for %pOFfp\n", dn);
		return 0;
	}

	pr_debug("removing node %pOFfp\n", dn);
	dlpar_detach_node(dn);
	return 0;
}
0125
/*
 * Stage @vd bytes of property @name's value from @value for node @dn,
 * publishing the property via of_update_property() once the final
 * chunk has arrived.
 *
 * A set high bit in @vd (a "negative" length) indicates that only part
 * of the value fit in the current RTAS buffer and more chunks follow;
 * in that case the data accumulates in *@prop across calls, and *@prop
 * stays non-NULL for the caller to pass back in. Once complete, the
 * property is published and *@prop is reset to NULL.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int update_dt_property(struct device_node *dn, struct property **prop,
			      const char *name, u32 vd, char *value)
{
	struct property *new_prop = *prop;
	int more = 0;

	/* High bit set: partial value, more data follows in a later call. */
	if (vd & 0x80000000) {
		vd = ~vd + 1;	/* recover the magnitude (two's complement) */
		more = 1;
	}

	if (new_prop) {
		/* Partially assembled already: append this chunk. */
		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
		if (!new_data)
			return -ENOMEM;

		memcpy(new_data, new_prop->value, new_prop->length);
		memcpy(new_data + new_prop->length, value, vd);

		kfree(new_prop->value);
		new_prop->value = new_data;
		new_prop->length += vd;
	} else {
		/* First (possibly only) chunk: allocate the property. */
		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
		if (!new_prop)
			return -ENOMEM;

		new_prop->name = kstrdup(name, GFP_KERNEL);
		if (!new_prop->name) {
			kfree(new_prop);
			return -ENOMEM;
		}

		new_prop->length = vd;
		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
		if (!new_prop->value) {
			kfree(new_prop->name);
			kfree(new_prop);
			return -ENOMEM;
		}

		memcpy(new_prop->value, value, vd);
		*prop = new_prop;
	}

	if (!more) {
		/* Value complete: publish it in the live device tree. */
		pr_debug("updating node %pOF property %s\n", dn, name);
		of_update_property(dn, new_prop);
		*prop = NULL;
	}

	return 0;
}
0186
/*
 * Handle an UPDATE_DT_NODE directive: fetch the changed properties for
 * @dn via repeated ibm,update-properties RTAS calls and apply them
 * (add/update/delete) to the live device tree.
 */
static int update_dt_node(struct device_node *dn, s32 scope)
{
	struct update_props_workarea *upwa;
	struct property *prop = NULL;
	int i, rc, rtas_rc;
	char *prop_data;
	char *rtas_buf;
	int update_properties_token;
	u32 nprops;
	u32 vd;

	update_properties_token = rtas_token("ibm,update-properties");
	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
		return -EINVAL;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;

	/* The work area header sits at the start of the buffer. */
	upwa = (struct update_props_workarea *)&rtas_buf[0];
	upwa->phandle = cpu_to_be32(dn->phandle);

	do {
		/* Status 1 means more property data remains; loop again. */
		rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
					scope);
		if (rtas_rc < 0)
			break;

		prop_data = rtas_buf + sizeof(*upwa);
		nprops = be32_to_cpu(upwa->nprops);

		/*
		 * A leading entry with an empty (zero-length) name carries
		 * the node's path as its value; skip over it rather than
		 * treating it as a property update. NOTE(review):
		 * presumably sent on the first call only — confirm
		 * against PAPR.
		 */
		if (*prop_data == 0) {
			prop_data++;
			vd = be32_to_cpu(*(__be32 *)prop_data);
			prop_data += vd + sizeof(vd);
			nprops--;
		}

		for (i = 0; i < nprops; i++) {
			char *prop_name;

			/* Entry layout: NUL-terminated name, __be32 length, value. */
			prop_name = prop_data;
			prop_data += strlen(prop_name) + 1;
			vd = be32_to_cpu(*(__be32 *)prop_data);
			prop_data += sizeof(vd);

			switch (vd) {
			case 0x00000000:
				/* Name-only entry: nothing to do. */
				break;

			case 0x80000000:
				/* Property deleted by firmware. */
				of_remove_property(dn, of_find_property(dn,
						prop_name, NULL));
				prop = NULL;
				break;

			default:
				/* New or changed value (possibly chunked). */
				rc = update_dt_property(dn, &prop, prop_name,
							vd, prop_data);
				if (rc) {
					pr_err("updating %s property failed: %d\n",
					       prop_name, rc);
				}

				prop_data += vd;
				break;
			}

			cond_resched();
		}

		cond_resched();
	} while (rtas_rc == 1);

	kfree(rtas_buf);
	return 0;
}
0270
/*
 * Handle an ADD_DT_NODE directive: fetch the new node's contents from
 * firmware via configure-connector and attach it under @parent_dn.
 * Additions in the ibm,platform-facilities hierarchy are ignored,
 * mirroring delete_dt_node(). Returns 0 on success or a negative errno.
 */
static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
{
	struct device_node *dn;
	int rc;

	dn = dlpar_configure_connector(drc_index, parent_dn);
	if (!dn)
		return -ENOENT;

	/*
	 * Skipped to match the corresponding ignored remove in
	 * delete_dt_node(): the pre-migration platform-facilities
	 * subtree is kept as-is.
	 */
	if (of_node_is_type(dn, "ibm,platform-facilities")) {
		pr_notice("ignoring add operation for %pOF\n", dn);
		dlpar_free_cc_nodes(dn);
		return 0;
	}

	rc = dlpar_attach_node(dn, parent_dn);
	if (rc)
		dlpar_free_cc_nodes(dn);

	pr_debug("added node %pOFfp\n", dn);

	return rc;
}
0301
/*
 * Drive the ibm,update-nodes RTAS call and dispatch the delete/update/
 * add directives it returns so the kernel's device tree matches the
 * platform's post-event view. Returns 0 (also when the RTAS service is
 * absent) or a negative RTAS/kernel error.
 */
static int pseries_devicetree_update(s32 scope)
{
	char *rtas_buf;
	__be32 *data;
	int update_nodes_token;
	int rc;

	update_nodes_token = rtas_token("ibm,update-nodes");
	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
		return 0;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;

	do {
		/* rc == 1 means more node data remains; loop again. */
		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
		if (rc && rc != 1)
			break;

		/* Skip the 16-byte buffer header to reach the node list. */
		data = (__be32 *)rtas_buf + 4;
		while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
			int i;
			u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
			u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;

			data++;

			for (i = 0; i < node_count; i++) {
				struct device_node *np;
				__be32 phandle = *data++;
				__be32 drc_index;

				np = of_find_node_by_phandle(be32_to_cpu(phandle));
				if (!np) {
					pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
						be32_to_cpu(phandle), action);
					continue;
				}

				switch (action) {
				case DELETE_DT_NODE:
					delete_dt_node(np);
					break;
				case UPDATE_DT_NODE:
					update_dt_node(np, scope);
					break;
				case ADD_DT_NODE:
					/* ADD entries carry an extra drc index word. */
					drc_index = *data++;
					add_dt_node(np, drc_index);
					break;
				}

				of_node_put(np);
				cond_resched();
			}
		}

		cond_resched();
	} while (rc == 1);

	kfree(rtas_buf);
	return rc;
}
0366
0367 void post_mobility_fixup(void)
0368 {
0369 int rc;
0370
0371 rtas_activate_firmware();
0372
0373
0374
0375
0376
0377 cpus_read_lock();
0378
0379
0380
0381
0382
0383
0384 cacheinfo_teardown();
0385
0386 rc = pseries_devicetree_update(MIGRATION_SCOPE);
0387 if (rc)
0388 pr_err("device tree update failed: %d\n", rc);
0389
0390 cacheinfo_rebuild();
0391
0392 cpus_read_unlock();
0393
0394
0395 pseries_setup_security_mitigations();
0396
0397
0398 read_24x7_sys_info();
0399
0400 return;
0401 }
0402
0403 static int poll_vasi_state(u64 handle, unsigned long *res)
0404 {
0405 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
0406 long hvrc;
0407 int ret;
0408
0409 hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
0410 switch (hvrc) {
0411 case H_SUCCESS:
0412 ret = 0;
0413 *res = retbuf[0];
0414 break;
0415 case H_PARAMETER:
0416 ret = -EINVAL;
0417 break;
0418 case H_FUNCTION:
0419 ret = -EOPNOTSUPP;
0420 break;
0421 case H_HARDWARE:
0422 default:
0423 pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
0424 ret = -EIO;
0425 break;
0426 }
0427 return ret;
0428 }
0429
/*
 * Poll the stream's VASI state until it leaves Enabled for Suspending,
 * sleeping 1s between polls. Returns 0 once Suspending is observed (or
 * if H_VASI_STATE is unimplemented), otherwise a negative errno.
 */
static int wait_for_vasi_session_suspending(u64 handle)
{
	unsigned long state;
	int ret;

	/*
	 * Only Enabled -> Suspending is an expected transition here;
	 * any other reported state ends the wait with -EIO.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		if (ret != 0 || state == H_VASI_SUSPENDING) {
			break;
		} else if (state == H_VASI_ENABLED) {
			ssleep(1);
		} else {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			ret = -EIO;
			break;
		}
	}

	/*
	 * Proceed even if H_VASI_STATE is unavailable: the suspend path
	 * itself will report an error if the platform truly cannot
	 * migrate.
	 */
	if (ret == -EOPNOTSUPP)
		ret = 0;

	return ret;
}
0462
/*
 * After a successful resume, poll until the stream's memory transfer
 * finishes (Resumed -> Completed), sleeping 500ms between polls.
 * Errors and unexpected states terminate the wait; this is best-effort
 * and returns nothing.
 */
static void wait_for_vasi_session_completed(u64 handle)
{
	unsigned long state = 0;
	int ret;

	pr_info("waiting for memory transfer to complete...\n");

	/*
	 * Wait for the transition from H_VASI_RESUMED to
	 * H_VASI_COMPLETED.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		/*
		 * -EINVAL (H_PARAMETER) is treated as success here:
		 * NOTE(review) presumably the hypervisor has already
		 * torn down the completed stream, making the handle
		 * invalid — confirm.
		 */
		if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) {
			pr_info("memory transfer completed.\n");
			break;
		}

		if (ret) {
			pr_err("H_VASI_STATE return error (%d)\n", ret);
			break;
		}

		if (state != H_VASI_RESUMED) {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			break;
		}

		msleep(500);
	}
}
0499
0500 static void prod_single(unsigned int target_cpu)
0501 {
0502 long hvrc;
0503 int hwid;
0504
0505 hwid = get_hard_smp_processor_id(target_cpu);
0506 hvrc = plpar_hcall_norets(H_PROD, hwid);
0507 if (hvrc == H_SUCCESS)
0508 return;
0509 pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
0510 target_cpu, hwid, hvrc);
0511 }
0512
0513 static void prod_others(void)
0514 {
0515 unsigned int cpu;
0516
0517 for_each_online_cpu(cpu) {
0518 if (cpu != smp_processor_id())
0519 prod_single(cpu);
0520 }
0521 }
0522
/*
 * Shrink the SLB to SLB_MIN_SIZE and return the previous size so the
 * caller can restore it on a failed suspend. Without the hash MMU
 * (!CONFIG_PPC_64S_HASH_MMU) this is a no-op returning 0.
 */
static u16 clamp_slb_size(void)
{
#ifdef CONFIG_PPC_64S_HASH_MMU
	u16 prev = mmu_slb_size;

	slb_set_size(SLB_MIN_SIZE);

	return prev;
#else
	return 0;
#endif
}
0535
/*
 * Perform the actual ibm,suspend-me on this (the last joining) CPU.
 * Returns 0 on success, otherwise the rtas_ibm_suspend_me() error.
 */
static int do_suspend(void)
{
	u16 saved_slb_size;
	int status;
	int ret;

	pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());

	/*
	 * Clamp the SLB to its minimum size before suspending and only
	 * restore the saved size on failure. NOTE(review): presumably
	 * the destination may support fewer SLB entries than the
	 * source, and on success the post-resume device tree update
	 * provides the correct size — confirm.
	 */
	saved_slb_size = clamp_slb_size();

	ret = rtas_ibm_suspend_me(&status);
	if (ret != 0) {
		pr_err("ibm,suspend-me error: %d\n", status);
		slb_set_size(saved_slb_size);
	}

	return ret;
}
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572
0573 struct pseries_suspend_info {
0574 atomic_t counter;
0575 bool done;
0576 };
0577
/*
 * Run on every online CPU via stop_machine(). Each CPU calls H_JOIN;
 * the hypervisor returns H_CONTINUE on exactly one CPU, which then
 * performs the suspend. The first CPU to fall out of the switch marks
 * the operation done and prods the others out of H_JOIN.
 */
static int do_join(void *arg)
{
	struct pseries_suspend_info *info = arg;
	atomic_t *counter = &info->counter;
	long hvrc;
	int ret;

retry:
	/* Interrupts must be hard-disabled before calling H_JOIN. */
	hard_irq_disable();
	hvrc = plpar_hcall_norets(H_JOIN);

	switch (hvrc) {
	case H_CONTINUE:
		/*
		 * All other CPUs are held in H_JOIN; this CPU attempts
		 * the suspend.
		 */
		ret = do_suspend();
		break;
	case H_SUCCESS:
		/*
		 * Either the suspend completed and we've been prodded
		 * by the winning CPU, or this was a spurious/unrelated
		 * prod. Distinguish via info->done and re-join if the
		 * operation isn't actually finished.
		 */
		smp_mb(); /* pairs with the barrier after WRITE_ONCE(done) below */
		if (READ_ONCE(info->done) == false) {
			pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",
					    smp_processor_id());
			goto retry;
		}
		ret = 0;
		break;
	case H_BAD_MODE:
	case H_HARDWARE:
	default:
		ret = -EIO;
		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
				   hvrc, smp_processor_id());
		break;
	}

	/* First CPU out is responsible for waking all the others. */
	if (atomic_inc_return(counter) == 1) {
		pr_info("CPU %u waking all threads\n", smp_processor_id());
		WRITE_ONCE(info->done, true);
		/*
		 * Order the done store before prodding, so a prodded CPU
		 * that re-checks done above observes true.
		 */
		smp_mb();
		prod_others();
	}
	/*
	 * Reset the watchdog: execution may have been held in
	 * H_JOIN/suspend for a long time.
	 */
	touch_nmi_watchdog();
	return ret;
}
0644
0645
0646
0647
0648 enum vasi_aborting_entity {
0649 ORCHESTRATOR = 1,
0650 VSP_SOURCE = 2,
0651 PARTITION_FIRMWARE = 3,
0652 PLATFORM_FIRMWARE = 4,
0653 VSP_TARGET = 5,
0654 MIGRATING_PARTITION = 6,
0655 };
0656
0657 static void pseries_cancel_migration(u64 handle, int err)
0658 {
0659 u32 reason_code;
0660 u32 detail;
0661 u8 entity;
0662 long hvrc;
0663
0664 entity = MIGRATING_PARTITION;
0665 detail = abs(err) & 0xffffff;
0666 reason_code = (entity << 24) | detail;
0667
0668 hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
0669 H_VASI_SIGNAL_CANCEL, reason_code);
0670 if (hvrc)
0671 pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
0672 }
0673
/*
 * Run the join/suspend sequence, retrying with exponential backoff
 * (1ms, 10ms, ... for up to 5 attempts) while the VASI stream stays in
 * the Suspending state. Returns 0 on a successful suspend/resume, or
 * the last stop_machine(do_join) error.
 */
static int pseries_suspend(u64 handle)
{
	const unsigned int max_attempts = 5;
	unsigned int retry_interval_ms = 1;
	unsigned int attempt = 1;
	int ret;

	while (true) {
		struct pseries_suspend_info info;
		unsigned long vasi_state;
		int vasi_err;

		/* Fresh per-attempt state shared by all CPUs in do_join(). */
		info = (struct pseries_suspend_info) {
			.counter = ATOMIC_INIT(0),
			.done = false,
		};

		ret = stop_machine(do_join, &info, cpu_online_mask);
		if (ret == 0)
			break;

		/*
		 * The attempt failed. If the stream is still Suspending,
		 * the failure is likely a transient condition, so retry
		 * after a delay in the hope that it has cleared.
		 */
		pr_notice("Partition suspend attempt %u of %u error: %d\n",
			  attempt, max_attempts, ret);

		if (attempt == max_attempts)
			break;

		vasi_err = poll_vasi_state(handle, &vasi_state);
		if (vasi_err == 0) {
			if (vasi_state != H_VASI_SUSPENDING) {
				/* Stream left Suspending: retrying is pointless. */
				pr_notice("VASI state %lu after failed suspend\n",
					  vasi_state);
				break;
			}
		} else if (vasi_err != -EOPNOTSUPP) {
			/* H_VASI_STATE being unimplemented is tolerated. */
			pr_err("VASI state poll error: %d", vasi_err);
			break;
		}

		pr_notice("Will retry partition suspend after %u ms\n",
			  retry_interval_ms);

		msleep(retry_interval_ms);
		retry_interval_ms *= 10;
		attempt++;
	}

	return ret;
}
0734
/*
 * Top-level live partition migration driver for stream @handle: wait
 * for the stream to reach Suspending, quiesce VAS, suspend/resume the
 * partition, then either run post-migration fixups or cancel the
 * stream on failure. Returns 0 on success or a negative errno.
 */
static int pseries_migrate_partition(u64 handle)
{
	int ret;
	unsigned int factor = 0;

#ifdef CONFIG_PPC_WATCHDOG
	factor = nmi_wd_lpm_factor;
#endif
	ret = wait_for_vasi_session_suspending(handle);
	if (ret)
		return ret;

	vas_migration_handler(VAS_SUSPEND);

	/* Stretch the NMI watchdog timeout for the duration of the suspend. */
	if (factor)
		watchdog_nmi_set_timeout_pct(factor);

	ret = pseries_suspend(handle);
	if (ret == 0) {
		post_mobility_fixup();
		/*
		 * Wait until the memory transfer is complete, so that
		 * the caller (e.g. the user-space process behind the
		 * syscall) returns only after the migration has truly
		 * finished.
		 */
		wait_for_vasi_session_completed(handle);
	} else
		pseries_cancel_migration(handle, ret);

	/* Restore the normal watchdog timeout. */
	if (factor)
		watchdog_nmi_set_timeout_pct(0);

	vas_migration_handler(VAS_RESUME);

	return ret;
}
0772
/*
 * Entry point used when user space invokes ibm,suspend-me through the
 * RTAS syscall: run the migration for the given VASI stream @handle.
 */
int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
{
	return pseries_migrate_partition(handle);
}
0777
0778 static ssize_t migration_store(struct class *class,
0779 struct class_attribute *attr, const char *buf,
0780 size_t count)
0781 {
0782 u64 streamid;
0783 int rc;
0784
0785 rc = kstrtou64(buf, 0, &streamid);
0786 if (rc)
0787 return rc;
0788
0789 rc = pseries_migrate_partition(streamid);
0790 if (rc)
0791 return rc;
0792
0793 return count;
0794 }
0795
0796
0797
0798
0799
0800
0801
/* Version of the sysfs ABI below, reported via /sys/kernel/mobility/api_version. */
#define MIGRATION_API_VERSION 1

static CLASS_ATTR_WO(migration);
static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));
0806
/*
 * Create /sys/kernel/mobility with the migration and api_version
 * files. File-creation failures are only logged; the function still
 * returns 0 so the initcall does not fail the boot.
 */
static int __init mobility_sysfs_init(void)
{
	int rc;

	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
	if (!mobility_kobj)
		return -ENOMEM;

	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
	if (rc)
		pr_err("unable to create migration sysfs file (%d)\n", rc);

	rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
	if (rc)
		pr_err("unable to create api_version sysfs file (%d)\n", rc);

	return 0;
}
machine_device_initcall(pseries, mobility_sysfs_init);