// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support for Partition Mobility/Migration
 *
 * Copyright (C) 2010 Nathan Fontenot
 * Copyright (C) 2010 IBM Corporation
 */


#define pr_fmt(fmt) "mobility: " fmt

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/stop_machine.h>
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/stringify.h>

#include <asm/machdep.h>
#include <asm/rtas.h>
#include "pseries.h"
#include "vas.h"    /* vas_migration_handler() */
#include "../../kernel/cacheinfo.h"

static struct kobject *mobility_kobj;

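/*
 * Header of the ibm,update-properties work area. @phandle is supplied
 * by the OS; @nprops comes back from firmware and gives the number of
 * property descriptors that follow this header in the buffer. @state
 * and @reserved are not consumed here.
 */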
struct update_props_workarea {
    __be32 phandle;
    __be32 state;
    __be64 reserved;
    __be32 nprops;
} __packed;

#define NODE_ACTION_MASK    0xff000000
#define NODE_COUNT_MASK     0x00ffffff

#define DELETE_DT_NODE  0x01000000
#define UPDATE_DT_NODE  0x02000000
#define ADD_DT_NODE 0x03000000

#define MIGRATION_SCOPE (1)
#define PRRN_SCOPE (-2)

#ifdef CONFIG_PPC_WATCHDOG
static unsigned int nmi_wd_lpm_factor = 200;

#ifdef CONFIG_SYSCTL
static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = {
    {
        .procname   = "nmi_wd_lpm_factor",
        .data       = &nmi_wd_lpm_factor,
        .maxlen     = sizeof(int),
        .mode       = 0644,
        .proc_handler   = proc_douintvec_minmax,
    },
    {}
};
static struct ctl_table nmi_wd_lpm_factor_sysctl_root[] = {
    {
        .procname       = "kernel",
        .mode           = 0555,
        .child          = nmi_wd_lpm_factor_ctl_table,
    },
    {}
};

static int __init register_nmi_wd_lpm_factor_sysctl(void)
{
    register_sysctl_table(nmi_wd_lpm_factor_sysctl_root);

    return 0;
}
device_initcall(register_nmi_wd_lpm_factor_sysctl);
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_PPC_WATCHDOG */

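/*
 * RTAS arguments must live in the single shared rtas_data_buf, so
 * serialize on rtas_data_buf_lock and copy the caller's buffer in and
 * out around the call.
 */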
static int mobility_rtas_call(int token, char *buf, s32 scope)
{
    int rc;

    spin_lock(&rtas_data_buf_lock);

    memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
    rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
    memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);

    spin_unlock(&rtas_data_buf_lock);
    return rc;
}

static int delete_dt_node(struct device_node *dn)
{
    struct device_node *pdn;
    bool is_platfac;

    pdn = of_get_parent(dn);
    is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||
             of_node_is_type(pdn, "ibm,platform-facilities");
    of_node_put(pdn);

    /*
     * The drivers that bind to nodes in the platform-facilities
     * hierarchy don't support node removal, and the removal directive
     * from firmware is always followed by an add of an equivalent
     * node. The capability (e.g. RNG, encryption, compression)
     * represented by the node is never interrupted by the migration.
     * So ignore changes to this part of the tree.
     */
    if (is_platfac) {
        pr_notice("ignoring remove operation for %pOFfp\n", dn);
        return 0;
    }

    pr_debug("removing node %pOFfp\n", dn);
    dlpar_detach_node(dn);
    return 0;
}

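/*
 * Apply one property update reported by ibm,update-properties. A value
 * larger than the work area is delivered in chunks across successive
 * calls; partial data accumulates in *@prop, and the device tree is
 * only touched via of_update_property() once the value is complete.
 */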
static int update_dt_property(struct device_node *dn, struct property **prop,
                  const char *name, u32 vd, char *value)
{
    struct property *new_prop = *prop;
    int more = 0;

    /* A negative 'vd' value indicates that only part of the new property
     * value is contained in the buffer and we need to call
     * ibm,update-properties again to get the rest of the value.
     *
     * A negative value is also the two's complement of the actual value.
     */
    if (vd & 0x80000000) {
        vd = ~vd + 1;
        more = 1;
    }

    if (new_prop) {
        /* partial property fixup */
        char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
        if (!new_data)
            return -ENOMEM;

        memcpy(new_data, new_prop->value, new_prop->length);
        memcpy(new_data + new_prop->length, value, vd);

        kfree(new_prop->value);
        new_prop->value = new_data;
        new_prop->length += vd;
    } else {
        new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
        if (!new_prop)
            return -ENOMEM;

        new_prop->name = kstrdup(name, GFP_KERNEL);
        if (!new_prop->name) {
            kfree(new_prop);
            return -ENOMEM;
        }

        new_prop->length = vd;
        new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
        if (!new_prop->value) {
            kfree(new_prop->name);
            kfree(new_prop);
            return -ENOMEM;
        }

        memcpy(new_prop->value, value, vd);
        *prop = new_prop;
    }

    if (!more) {
        pr_debug("updating node %pOF property %s\n", dn, name);
        of_update_property(dn, new_prop);
        *prop = NULL;
    }

    return 0;
}

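/*
 * Refresh @dn's properties by looping on ibm,update-properties, which
 * returns 1 while more data remains. Each descriptor in the work area
 * either names the property alone (vd == 0), marks it for deletion
 * (vd == 0x80000000), or carries some or all of a new value.
 */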
static int update_dt_node(struct device_node *dn, s32 scope)
{
    struct update_props_workarea *upwa;
    struct property *prop = NULL;
    int i, rc, rtas_rc;
    char *prop_data;
    char *rtas_buf;
    int update_properties_token;
    u32 nprops;
    u32 vd;

    update_properties_token = rtas_token("ibm,update-properties");
    if (update_properties_token == RTAS_UNKNOWN_SERVICE)
        return -EINVAL;

    rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
    if (!rtas_buf)
        return -ENOMEM;

    upwa = (struct update_props_workarea *)&rtas_buf[0];
    upwa->phandle = cpu_to_be32(dn->phandle);

    do {
        rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
                    scope);
        if (rtas_rc < 0)
            break;

        prop_data = rtas_buf + sizeof(*upwa);
        nprops = be32_to_cpu(upwa->nprops);

        /* On the first call to ibm,update-properties for a node, the
         * first property value descriptor contains an empty property
         * name, the property value length encoded as a u32, and the
         * property value, which is the node path being updated.
         */
        if (*prop_data == 0) {
            prop_data++;
            vd = be32_to_cpu(*(__be32 *)prop_data);
            prop_data += vd + sizeof(vd);
            nprops--;
        }

        for (i = 0; i < nprops; i++) {
            char *prop_name;

            prop_name = prop_data;
            prop_data += strlen(prop_name) + 1;
            vd = be32_to_cpu(*(__be32 *)prop_data);
            prop_data += sizeof(vd);

            switch (vd) {
            case 0x00000000:
                /* name only property, nothing to do */
                break;

            case 0x80000000:
                of_remove_property(dn, of_find_property(dn,
                            prop_name, NULL));
                prop = NULL;
                break;

            default:
                rc = update_dt_property(dn, &prop, prop_name,
                            vd, prop_data);
                if (rc) {
                    pr_err("updating %s property failed: %d\n",
                           prop_name, rc);
                }

                prop_data += vd;
                break;
            }

            cond_resched();
        }

        cond_resched();
    } while (rtas_rc == 1);

    kfree(rtas_buf);
    return 0;
}

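/*
 * Pull in the device tree for the node at @drc_index via the
 * configure-connector sequence and graft it beneath @parent_dn.
 */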
static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
{
    struct device_node *dn;
    int rc;

    dn = dlpar_configure_connector(drc_index, parent_dn);
    if (!dn)
        return -ENOENT;

    /*
     * Since delete_dt_node() ignores this node type, this is the
     * necessary counterpart. We also know that a platform-facilities
     * node returned from dlpar_configure_connector() has children
     * attached, and dlpar_attach_node() only adds the parent, leaking
     * the children. So ignore these on the add side for now.
     */
    if (of_node_is_type(dn, "ibm,platform-facilities")) {
        pr_notice("ignoring add operation for %pOF\n", dn);
        dlpar_free_cc_nodes(dn);
        return 0;
    }

    rc = dlpar_attach_node(dn, parent_dn);
    if (rc)
        dlpar_free_cc_nodes(dn);
    else
        pr_debug("added node %pOFfp\n", dn);

    return rc;
}

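/*
 * Walk the update stream reported by ibm,update-nodes. Past its
 * 16-byte header, the work area holds action words (operation in the
 * top byte, node count in the low 24 bits), each followed by the
 * phandles of the affected nodes; a return code of 1 means firmware
 * has more updates to report.
 */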
static int pseries_devicetree_update(s32 scope)
{
    char *rtas_buf;
    __be32 *data;
    int update_nodes_token;
    int rc;

    update_nodes_token = rtas_token("ibm,update-nodes");
    if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
        return 0;

    rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
    if (!rtas_buf)
        return -ENOMEM;

    do {
        rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
        if (rc && rc != 1)
            break;

        data = (__be32 *)rtas_buf + 4;
        while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
            int i;
            u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
            u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;

            data++;

            for (i = 0; i < node_count; i++) {
                struct device_node *np;
                __be32 phandle = *data++;
                __be32 drc_index;

                np = of_find_node_by_phandle(be32_to_cpu(phandle));
                if (!np) {
                    pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
                        be32_to_cpu(phandle), action);
                    continue;
                }

                switch (action) {
                case DELETE_DT_NODE:
                    delete_dt_node(np);
                    break;
                case UPDATE_DT_NODE:
                    update_dt_node(np, scope);
                    break;
                case ADD_DT_NODE:
                    drc_index = *data++;
                    add_dt_node(np, drc_index);
                    break;
                }

                of_node_put(np);
                cond_resched();
            }
        }

        cond_resched();
    } while (rc == 1);

    kfree(rtas_buf);
    return rc;
}

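/*
 * Bring kernel state back in line with the platform after the
 * partition resumes on the destination system.
 */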
void post_mobility_fixup(void)
{
    int rc;

    rtas_activate_firmware();

    /*
     * We don't want CPUs to go online/offline while the device
     * tree is being updated.
     */
    cpus_read_lock();

    /*
     * It's common for the destination firmware to replace cache
     * nodes.  Release all of the cacheinfo hierarchy's references
     * before updating the device tree.
     */
    cacheinfo_teardown();

    rc = pseries_devicetree_update(MIGRATION_SCOPE);
    if (rc)
        pr_err("device tree update failed: %d\n", rc);

    cacheinfo_rebuild();

    cpus_read_unlock();

    /* Possibly switch to a new L1 flush type */
    pseries_setup_security_mitigations();

    /* Reinitialise system information for hv-24x7 */
    read_24x7_sys_info();
}

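/*
 * Query the migration stream's state with the H_VASI_STATE hcall and
 * translate the hypervisor's status codes into errnos.
 */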
static int poll_vasi_state(u64 handle, unsigned long *res)
{
    unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
    long hvrc;
    int ret;

    hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
    switch (hvrc) {
    case H_SUCCESS:
        ret = 0;
        *res = retbuf[0];
        break;
    case H_PARAMETER:
        ret = -EINVAL;
        break;
    case H_FUNCTION:
        ret = -EOPNOTSUPP;
        break;
    case H_HARDWARE:
    default:
        pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
        ret = -EIO;
        break;
    }
    return ret;
}

static int wait_for_vasi_session_suspending(u64 handle)
{
    unsigned long state;
    int ret;

    /*
     * Wait for transition from H_VASI_ENABLED to
     * H_VASI_SUSPENDING. Treat anything else as an error.
     */
    while (true) {
        ret = poll_vasi_state(handle, &state);

        if (ret != 0 || state == H_VASI_SUSPENDING) {
            break;
        } else if (state == H_VASI_ENABLED) {
            ssleep(1);
        } else {
            pr_err("unexpected H_VASI_STATE result %lu\n", state);
            ret = -EIO;
            break;
        }
    }

    /*
     * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
     * ibm,suspend-me are also unimplemented, we'll recover then.
     */
    if (ret == -EOPNOTSUPP)
        ret = 0;

    return ret;
}

static void wait_for_vasi_session_completed(u64 handle)
{
    unsigned long state = 0;
    int ret;

    pr_info("waiting for memory transfer to complete...\n");

    /*
     * Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED.
     */
    while (true) {
        ret = poll_vasi_state(handle, &state);

        /*
         * If the memory transfer is already complete and the migration
         * has been cleaned up by the hypervisor, H_PARAMETER is
         * returned, which poll_vasi_state() translates to -EINVAL.
         */
        if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) {
            pr_info("memory transfer completed.\n");
            break;
        }

        if (ret) {
            pr_err("H_VASI_STATE poll returned an error (%d)\n", ret);
            break;
        }

        if (state != H_VASI_RESUMED) {
            pr_err("unexpected H_VASI_STATE result %lu\n", state);
            break;
        }

        msleep(500);
    }
}

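/* Wake one CPU that is sleeping in H_JOIN. */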
static void prod_single(unsigned int target_cpu)
{
    long hvrc;
    int hwid;

    hwid = get_hard_smp_processor_id(target_cpu);
    hvrc = plpar_hcall_norets(H_PROD, hwid);
    if (hvrc == H_SUCCESS)
        return;
    pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
               target_cpu, hwid, hvrc);
}

static void prod_others(void)
{
    unsigned int cpu;

    for_each_online_cpu(cpu) {
        if (cpu != smp_processor_id())
            prod_single(cpu);
    }
}

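/*
 * Shrink the SLB to SLB_MIN_SIZE, returning the previous size so that
 * a failed suspend can restore it; see the comment in do_suspend().
 */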
static u16 clamp_slb_size(void)
{
#ifdef CONFIG_PPC_64S_HASH_MMU
    u16 prev = mmu_slb_size;

    slb_set_size(SLB_MIN_SIZE);

    return prev;
#else
    return 0;
#endif
}

static int do_suspend(void)
{
    u16 saved_slb_size;
    int status;
    int ret;

    pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());

    /*
     * The destination processor model may have fewer SLB entries
     * than the source. We reduce mmu_slb_size to a safe minimum
     * before suspending in order to minimize the possibility of
     * programming non-existent entries on the destination. If
     * suspend fails, we restore it before returning. On success
     * the OF reconfig path will update it from the new device
     * tree after resuming on the destination.
     */
    saved_slb_size = clamp_slb_size();

    ret = rtas_ibm_suspend_me(&status);
    if (ret != 0) {
        pr_err("ibm,suspend-me error: %d\n", status);
        slb_set_size(saved_slb_size);
    }

    return ret;
}

/**
 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
 * @counter: Threads are to increment this upon resuming from suspend
 *           or if an error is received from H_JOIN. The thread which performs
 *           the first increment (i.e. sets it to 1) is responsible for
 *           waking the other threads.
 * @done: False if join/suspend is in progress. True if the operation is
 *        complete (successful or not).
 */
struct pseries_suspend_info {
    atomic_t counter;
    bool done;
};

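/*
 * Runs on every online CPU under stop_machine() (see pseries_suspend()).
 * Each CPU parks in H_JOIN except the last to join, which gets
 * H_CONTINUE back and carries out the suspend itself.
 */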
static int do_join(void *arg)
{
    struct pseries_suspend_info *info = arg;
    atomic_t *counter = &info->counter;
    long hvrc;
    int ret;

retry:
    /* Must ensure MSR.EE off for H_JOIN. */
    hard_irq_disable();
    hvrc = plpar_hcall_norets(H_JOIN);

    switch (hvrc) {
    case H_CONTINUE:
        /*
         * All other CPUs are offline or in H_JOIN. This CPU
         * attempts the suspend.
         */
        ret = do_suspend();
        break;
    case H_SUCCESS:
        /*
         * The suspend is complete and this CPU has received a
         * prod, or we've received a stray prod from unrelated
         * code (e.g. paravirt spinlocks) and we need to join
         * again.
         *
         * This barrier orders the return from H_JOIN above vs
         * the load of info->done. It pairs with the barrier
         * in the wakeup/prod path below.
         */
        smp_mb();
        if (READ_ONCE(info->done) == false) {
            pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying\n",
                        smp_processor_id());
            goto retry;
        }
        ret = 0;
        break;
    case H_BAD_MODE:
    case H_HARDWARE:
    default:
        ret = -EIO;
        pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
                   hvrc, smp_processor_id());
        break;
    }

    if (atomic_inc_return(counter) == 1) {
        pr_info("CPU %u waking all threads\n", smp_processor_id());
        WRITE_ONCE(info->done, true);
        /*
         * This barrier orders the store to info->done vs subsequent
         * H_PRODs to wake the other CPUs. It pairs with the barrier
         * in the H_SUCCESS case above.
         */
        smp_mb();
        prod_others();
    }
    /*
     * Execution may have been suspended for several seconds, so
     * reset the watchdog.
     */
    touch_nmi_watchdog();
    return ret;
}

/*
 * Abort reason code byte 0. We use only the 'Migrating partition' value.
 */
enum vasi_aborting_entity {
    ORCHESTRATOR        = 1,
    VSP_SOURCE          = 2,
    PARTITION_FIRMWARE  = 3,
    PLATFORM_FIRMWARE   = 4,
    VSP_TARGET          = 5,
    MIGRATING_PARTITION = 6,
};

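/*
 * Ask the platform to abort the migration: H_VASI_SIGNAL with a cancel
 * reason code built from MIGRATING_PARTITION in the entity byte and
 * the magnitude of the error in the low 24 detail bits.
 */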
static void pseries_cancel_migration(u64 handle, int err)
{
    u32 reason_code;
    u32 detail;
    u8 entity;
    long hvrc;

    entity = MIGRATING_PARTITION;
    detail = abs(err) & 0xffffff;
    reason_code = (entity << 24) | detail;

    hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
                  H_VASI_SIGNAL_CANCEL, reason_code);
    if (hvrc)
        pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
}

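/*
 * Run the join/suspend under stop_machine(), retrying up to
 * max_attempts times with an exponentially growing delay as long as
 * the VASI stream still reports the Suspending state.
 */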
static int pseries_suspend(u64 handle)
{
    const unsigned int max_attempts = 5;
    unsigned int retry_interval_ms = 1;
    unsigned int attempt = 1;
    int ret;

    while (true) {
        struct pseries_suspend_info info;
        unsigned long vasi_state;
        int vasi_err;

        info = (struct pseries_suspend_info) {
            .counter = ATOMIC_INIT(0),
            .done = false,
        };

        ret = stop_machine(do_join, &info, cpu_online_mask);
        if (ret == 0)
            break;
        /*
         * Encountered an error. If the VASI stream is still
         * in Suspending state, it's likely a transient
         * condition related to some device in the partition
         * and we can retry in the hope that the cause has
         * cleared after some delay.
         *
         * A better design would allow drivers etc to prepare
         * for the suspend and avoid conditions which prevent
         * the suspend from succeeding. For now, we have this
         * mitigation.
         */
        pr_notice("Partition suspend attempt %u of %u error: %d\n",
              attempt, max_attempts, ret);

        if (attempt == max_attempts)
            break;

        vasi_err = poll_vasi_state(handle, &vasi_state);
        if (vasi_err == 0) {
            if (vasi_state != H_VASI_SUSPENDING) {
                pr_notice("VASI state %lu after failed suspend\n",
                      vasi_state);
                break;
            }
        } else if (vasi_err != -EOPNOTSUPP) {
            pr_err("VASI state poll error: %d\n", vasi_err);
            break;
        }

        pr_notice("Will retry partition suspend after %u ms\n",
              retry_interval_ms);

        msleep(retry_interval_ms);
        retry_interval_ms *= 10;
        attempt++;
    }

    return ret;
}

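/*
 * Top-level migration sequence: wait for the stream to reach the
 * Suspending state, quiesce VAS, suspend the partition, then either
 * perform the post-resume fixups and wait out the memory transfer or
 * cancel the stream on failure.
 */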
static int pseries_migrate_partition(u64 handle)
{
    int ret;
    unsigned int factor = 0;

#ifdef CONFIG_PPC_WATCHDOG
    factor = nmi_wd_lpm_factor;
#endif
    ret = wait_for_vasi_session_suspending(handle);
    if (ret)
        return ret;

    vas_migration_handler(VAS_SUSPEND);

    if (factor)
        watchdog_nmi_set_timeout_pct(factor);

    ret = pseries_suspend(handle);
    if (ret == 0) {
        post_mobility_fixup();
        /*
         * Wait until the memory transfer is complete, so that the user
         * space process returns from the syscall after the transfer is
         * complete. This allows the user hooks to be executed at the
         * right time.
         */
        wait_for_vasi_session_completed(handle);
    } else {
        pseries_cancel_migration(handle, ret);
    }

    if (factor)
        watchdog_nmi_set_timeout_pct(0);

    vas_migration_handler(VAS_RESUME);

    return ret;
}

int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
{
    return pseries_migrate_partition(handle);
}

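/*
 * Writing a VASI stream id to /sys/kernel/mobility/migration initiates
 * the migration; the write completes only once the operation has
 * finished or failed.
 */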
static ssize_t migration_store(struct class *class,
                   struct class_attribute *attr, const char *buf,
                   size_t count)
{
    u64 streamid;
    int rc;

    rc = kstrtou64(buf, 0, &streamid);
    if (rc)
        return rc;

    rc = pseries_migrate_partition(streamid);
    if (rc)
        return rc;

    return count;
}

/*
 * Used by drmgr to determine the kernel behavior of the migration interface.
 *
 * Version 1: Performs all PAPR requirements for migration including
 *  firmware activation and device tree update.
 */
#define MIGRATION_API_VERSION   1

static CLASS_ATTR_WO(migration);
static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));

static int __init mobility_sysfs_init(void)
{
    int rc;

    mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
    if (!mobility_kobj)
        return -ENOMEM;

    rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
    if (rc)
        pr_err("unable to create migration sysfs file (%d)\n", rc);

    rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
    if (rc)
        pr_err("unable to create api_version sysfs file (%d)\n", rc);

    return 0;
}
machine_device_initcall(pseries, mobility_sysfs_init);