0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * kvm nested virtualization support for s390x
0004  *
0005  * Copyright IBM Corp. 2016, 2018
0006  *
0007  *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
0008  */
0009 #include <linux/vmalloc.h>
0010 #include <linux/kvm_host.h>
0011 #include <linux/bug.h>
0012 #include <linux/list.h>
0013 #include <linux/bitmap.h>
0014 #include <linux/sched/signal.h>
0015 
0016 #include <asm/gmap.h>
0017 #include <asm/mmu_context.h>
0018 #include <asm/sclp.h>
0019 #include <asm/nmi.h>
0020 #include <asm/dis.h>
0021 #include <asm/fpu/api.h>
0022 #include "kvm-s390.h"
0023 #include "gaccess.h"
0024 
0025 struct vsie_page {
0026     struct kvm_s390_sie_block scb_s;    /* 0x0000 */
0027     /*
0028      * the backup info for machine check. ensure it's at
0029      * the same offset as that in struct sie_page!
0030      */
0031     struct mcck_volatile_info mcck_info;    /* 0x0200 */
0032     /*
0033      * The pinned original scb. Be aware that other VCPUs can modify
0034      * it while we read from it. Values that are used for conditions or
0035      * are reused conditionally, should be accessed via READ_ONCE.
0036      */
0037     struct kvm_s390_sie_block *scb_o;   /* 0x0218 */
0038     /* the shadow gmap in use by the vsie_page */
0039     struct gmap *gmap;          /* 0x0220 */
0040     /* address of the last reported fault to guest2 */
0041     unsigned long fault_addr;       /* 0x0228 */
0042     /* calculated guest addresses of satellite control blocks */
0043     gpa_t sca_gpa;              /* 0x0230 */
0044     gpa_t itdba_gpa;            /* 0x0238 */
0045     gpa_t gvrd_gpa;             /* 0x0240 */
0046     gpa_t riccbd_gpa;           /* 0x0248 */
0047     gpa_t sdnx_gpa;             /* 0x0250 */
0048     __u8 reserved[0x0700 - 0x0258];     /* 0x0258 */
0049     struct kvm_s390_crypto_cb crycb;    /* 0x0700 */
0050     __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
0051 };
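
/*
 * Layout notes for struct vsie_page above (summary, derived from the code
 * below): the struct is expected to occupy exactly one page (see the
 * BUILD_BUG_ON() in kvm_s390_handle_vsie()). scb_s and mcck_info sit at the
 * same offsets as in struct sie_page so that machine check information
 * stored for a regular sie_page also lands correctly here. crycb and fac are
 * handed to SIE via 31-bit addresses (crycbd/fac in the shadow scb), which
 * presumably is safe because vsie pages are allocated with GFP_DMA, i.e.
 * below 2 GB.
 */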
0052 
0053 /* trigger a validity icpt for the given scb */
0054 static int set_validity_icpt(struct kvm_s390_sie_block *scb,
0055                  __u16 reason_code)
0056 {
0057     scb->ipa = 0x1000;
0058     scb->ipb = ((__u32) reason_code) << 16;
0059     scb->icptcode = ICPT_VALIDITY;
0060     return 1;
0061 }
0062 
0063 /* mark the prefix as unmapped, this will block the VSIE */
0064 static void prefix_unmapped(struct vsie_page *vsie_page)
0065 {
0066     atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
0067 }
0068 
0069 /* mark the prefix as unmapped and wait until the VSIE has been left */
0070 static void prefix_unmapped_sync(struct vsie_page *vsie_page)
0071 {
0072     prefix_unmapped(vsie_page);
0073     if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
0074         atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
0075     while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
0076         cpu_relax();
0077 }
0078 
0079 /* mark the prefix as mapped, this will allow the VSIE to run */
0080 static void prefix_mapped(struct vsie_page *vsie_page)
0081 {
0082     atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
0083 }
0084 
0085 /* test if the prefix is mapped into the gmap shadow */
0086 static int prefix_is_mapped(struct vsie_page *vsie_page)
0087 {
0088     return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
0089 }
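
/*
 * Note on the prefix mapping protocol: PROG_REQUEST in prog20 doubles as the
 * "prefix is not mapped" marker. The low-level SIE entry code is expected to
 * refuse (re)entering SIE for a control block whose prog20 request bits are
 * set and to flag PROG_IN_SIE in prog0c while the CPU actually runs it; this
 * is what prefix_unmapped_sync() relies on when it kicks the CPU with
 * CPUSTAT_STOP_INT and busy-waits for the vSIE to be left.
 */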
0090 
0091 /* copy the updated intervention request bits into the shadow scb */
0092 static void update_intervention_requests(struct vsie_page *vsie_page)
0093 {
0094     const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
0095     int cpuflags;
0096 
0097     cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
0098     atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
0099     atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
0100 }
0101 
0102 /* shadow (filter and validate) the cpuflags  */
0103 static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0104 {
0105     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0106     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0107     int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
0108 
0109     /* we don't allow ESA/390 guests */
0110     if (!(cpuflags & CPUSTAT_ZARCH))
0111         return set_validity_icpt(scb_s, 0x0001U);
0112 
0113     if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
0114         return set_validity_icpt(scb_s, 0x0001U);
0115     else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
0116         return set_validity_icpt(scb_s, 0x0007U);
0117 
0118     /* intervention requests will be set later */
0119     newflags = CPUSTAT_ZARCH;
0120     if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
0121         newflags |= CPUSTAT_GED;
0122     if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
0123         if (cpuflags & CPUSTAT_GED)
0124             return set_validity_icpt(scb_s, 0x0001U);
0125         newflags |= CPUSTAT_GED2;
0126     }
0127     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
0128         newflags |= cpuflags & CPUSTAT_P;
0129     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
0130         newflags |= cpuflags & CPUSTAT_SM;
0131     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
0132         newflags |= cpuflags & CPUSTAT_IBS;
0133     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS))
0134         newflags |= cpuflags & CPUSTAT_KSS;
0135 
0136     atomic_set(&scb_s->cpuflags, newflags);
0137     return 0;
0138 }
0139 /* Copy to APCB FORMAT1 from APCB FORMAT0 */
0140 static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
0141             unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
0142 {
0143     struct kvm_s390_apcb0 tmp;
0144 
0145     if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
0146         return -EFAULT;
0147 
0148     apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
0149     apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL;
0150     apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL;
0151 
0152     return 0;
0153 
0154 }
0155 
0156 /**
0157  * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
0158  * @vcpu: pointer to the virtual CPU
0159  * @apcb_s: pointer to start of apcb in the shadow crycb
0160  * @apcb_o: pointer to start of original apcb in the guest2
0161  * @apcb_h: pointer to start of apcb in the guest1
0162  *
0163  * Returns 0 on success or -EFAULT on error reading guest apcb
0164  */
0165 static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
0166             unsigned long apcb_o, unsigned long *apcb_h)
0167 {
0168     if (read_guest_real(vcpu, apcb_o, apcb_s,
0169                 sizeof(struct kvm_s390_apcb0)))
0170         return -EFAULT;
0171 
0172     bitmap_and(apcb_s, apcb_s, apcb_h, BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0));
0173 
0174     return 0;
0175 }
0176 
0177 /**
0178  * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
0179  * @vcpu: pointer to the virtual CPU
0180  * @apcb_s: pointer to start of apcb in the shadow crycb
0181  * @apcb_o: pointer to start of original guest apcb
0182  * @apcb_h: pointer to start of apcb in the host
0183  *
0184  * Returns 0 on success or -EFAULT on error reading guest apcb
0185  */
0186 static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
0187             unsigned long apcb_o,
0188             unsigned long *apcb_h)
0189 {
0190     if (read_guest_real(vcpu, apcb_o, apcb_s,
0191                 sizeof(struct kvm_s390_apcb1)))
0192         return -EFAULT;
0193 
0194     bitmap_and(apcb_s, apcb_s, apcb_h, BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1));
0195 
0196     return 0;
0197 }
0198 
0199 /**
0200  * setup_apcb - Create a shadow copy of the apcb.
0201  * @vcpu: pointer to the virtual CPU
0202  * @crycb_s: pointer to shadow crycb
0203  * @crycb_o: pointer to original guest crycb
0204  * @crycb_h: pointer to the host crycb
0205  * @fmt_o: format of the original guest crycb.
0206  * @fmt_h: format of the host crycb.
0207  *
0208  * Checks the compatibility between the guest and host crycb and calls the
0209  * appropriate copy function.
0210  *
0211  * Return 0 or an error number if the guest and host crycb are incompatible.
0212  */
0213 static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
0214            const u32 crycb_o,
0215            struct kvm_s390_crypto_cb *crycb_h,
0216            int fmt_o, int fmt_h)
0217 {
0218     struct kvm_s390_crypto_cb *crycb;
0219 
0220     crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
0221 
0222     switch (fmt_o) {
0223     case CRYCB_FORMAT2:
0224         if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
0225             return -EACCES;
0226         if (fmt_h != CRYCB_FORMAT2)
0227             return -EINVAL;
0228         return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
0229                     (unsigned long) &crycb->apcb1,
0230                     (unsigned long *)&crycb_h->apcb1);
0231     case CRYCB_FORMAT1:
0232         switch (fmt_h) {
0233         case CRYCB_FORMAT2:
0234             return setup_apcb10(vcpu, &crycb_s->apcb1,
0235                         (unsigned long) &crycb->apcb0,
0236                         &crycb_h->apcb1);
0237         case CRYCB_FORMAT1:
0238             return setup_apcb00(vcpu,
0239                         (unsigned long *) &crycb_s->apcb0,
0240                         (unsigned long) &crycb->apcb0,
0241                         (unsigned long *) &crycb_h->apcb0);
0242         }
0243         break;
0244     case CRYCB_FORMAT0:
0245         if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
0246             return -EACCES;
0247 
0248         switch (fmt_h) {
0249         case CRYCB_FORMAT2:
0250             return setup_apcb10(vcpu, &crycb_s->apcb1,
0251                         (unsigned long) &crycb->apcb0,
0252                         &crycb_h->apcb1);
0253         case CRYCB_FORMAT1:
0254         case CRYCB_FORMAT0:
0255             return setup_apcb00(vcpu,
0256                         (unsigned long *) &crycb_s->apcb0,
0257                         (unsigned long) &crycb->apcb0,
0258                         (unsigned long *) &crycb_h->apcb0);
0259         }
0260     }
0261     return -EINVAL;
0262 }
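
/*
 * Summary of the format dispatch above:
 *   guest FORMAT2 -> host must also be FORMAT2, shadow via setup_apcb11()
 *   guest FORMAT1 -> host FORMAT2: setup_apcb10(), host FORMAT1: setup_apcb00()
 *   guest FORMAT0 -> host FORMAT2: setup_apcb10(), host FORMAT1/0: setup_apcb00()
 * FORMAT2 and FORMAT0 guest crycbs additionally must not cross a page
 * boundary (256 resp. 32 bytes); any other combination is rejected with
 * -EINVAL, which shadow_crycb() turns into a validity intercept.
 */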
0263 
0264 /**
0265  * shadow_crycb - Create a shadow copy of the crycb block
0266  * @vcpu: a pointer to the virtual CPU
0267  * @vsie_page: a pointer to internal data used for the vSIE
0268  *
0269  * Create a shadow copy of the crycb block and setup key wrapping, if
0270  * requested for guest 3 and enabled for guest 2.
0271  *
0272  * We accept format-1 or format-2, but we convert format-1 into format-2
0273  * in the shadow CRYCB.
0274  * Using format-2 enables the firmware to choose the right format when
0275  * scheduling the SIE.
0276  * There is nothing to do for format-0.
0277  *
0278  * This function centralizes the issuing of set_validity_icpt() for all
0279  * the subfunctions working on the crycb.
0280  *
0281  * Returns: - 0 if shadowed or nothing to do
0282  *          - > 0 if control has to be given to guest 2
0283  */
0284 static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0285 {
0286     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0287     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0288     const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd);
0289     const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
0290     unsigned long *b1, *b2;
0291     u8 ecb3_flags;
0292     u32 ecd_flags;
0293     int apie_h;
0294     int apie_s;
0295     int key_msk = test_kvm_facility(vcpu->kvm, 76);
0296     int fmt_o = crycbd_o & CRYCB_FORMAT_MASK;
0297     int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK;
0298     int ret = 0;
0299 
0300     scb_s->crycbd = 0;
0301 
0302     apie_h = vcpu->arch.sie_block->eca & ECA_APIE;
0303     apie_s = apie_h & scb_o->eca;
0304     if (!apie_s && (!key_msk || (fmt_o == CRYCB_FORMAT0)))
0305         return 0;
0306 
0307     if (!crycb_addr)
0308         return set_validity_icpt(scb_s, 0x0039U);
0309 
0310     if (fmt_o == CRYCB_FORMAT1)
0311         if ((crycb_addr & PAGE_MASK) !=
0312             ((crycb_addr + 128) & PAGE_MASK))
0313             return set_validity_icpt(scb_s, 0x003CU);
0314 
0315     if (apie_s) {
0316         ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr,
0317                  vcpu->kvm->arch.crypto.crycb,
0318                  fmt_o, fmt_h);
0319         if (ret)
0320             goto end;
0321         scb_s->eca |= scb_o->eca & ECA_APIE;
0322     }
0323 
0324     /* we may only allow it if enabled for guest 2 */
0325     ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
0326              (ECB3_AES | ECB3_DEA);
0327     ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC;
0328     if (!ecb3_flags && !ecd_flags)
0329         goto end;
0330 
0331     /* copy only the wrapping keys */
0332     if (read_guest_real(vcpu, crycb_addr + 72,
0333                 vsie_page->crycb.dea_wrapping_key_mask, 56))
0334         return set_validity_icpt(scb_s, 0x0035U);
0335 
0336     scb_s->ecb3 |= ecb3_flags;
0337     scb_s->ecd |= ecd_flags;
0338 
0339     /* xor both blocks in one run */
0340     b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
0341     b2 = (unsigned long *)
0342                 vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
0343     /* as 56%8 == 0, bitmap_xor won't overwrite any data */
0344     bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
0345 end:
0346     switch (ret) {
0347     case -EINVAL:
0348         return set_validity_icpt(scb_s, 0x0022U);
0349     case -EFAULT:
0350         return set_validity_icpt(scb_s, 0x0035U);
0351     case -EACCES:
0352         return set_validity_icpt(scb_s, 0x003CU);
0353     }
0354     scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
0355     return 0;
0356 }
0357 
0358 /* shadow (round up/down) the ibc to avoid validity icpt */
0359 static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0360 {
0361     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0362     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0363     /* READ_ONCE does not work on bitfields - use a temporary variable */
0364     const uint32_t __new_ibc = scb_o->ibc;
0365     const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU;
0366     __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
0367 
0368     scb_s->ibc = 0;
0369     /* ibc installed in g2 and requested for g3 */
0370     if (vcpu->kvm->arch.model.ibc && new_ibc) {
0371         scb_s->ibc = new_ibc;
0372         /* take care of the minimum ibc level of the machine */
0373         if (scb_s->ibc < min_ibc)
0374             scb_s->ibc = min_ibc;
0375         /* take care of the maximum ibc level set for the guest */
0376         if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
0377             scb_s->ibc = vcpu->kvm->arch.model.ibc;
0378     }
0379 }
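
/*
 * In effect: if g2 has an ibc configured and g3 requests one, the shadow ibc
 * is clamp(new_ibc, min_ibc, vcpu->kvm->arch.model.ibc), with min_ibc taken
 * from the machine (sclp.ibc); otherwise it stays 0 and no ibc is used.
 */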
0380 
0381 /* unshadow the scb, copying parameters back to the real scb */
0382 static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0383 {
0384     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0385     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0386 
0387     /* interception */
0388     scb_o->icptcode = scb_s->icptcode;
0389     scb_o->icptstatus = scb_s->icptstatus;
0390     scb_o->ipa = scb_s->ipa;
0391     scb_o->ipb = scb_s->ipb;
0392     scb_o->gbea = scb_s->gbea;
0393 
0394     /* timer */
0395     scb_o->cputm = scb_s->cputm;
0396     scb_o->ckc = scb_s->ckc;
0397     scb_o->todpr = scb_s->todpr;
0398 
0399     /* guest state */
0400     scb_o->gpsw = scb_s->gpsw;
0401     scb_o->gg14 = scb_s->gg14;
0402     scb_o->gg15 = scb_s->gg15;
0403     memcpy(scb_o->gcr, scb_s->gcr, 128);
0404     scb_o->pp = scb_s->pp;
0405 
0406     /* branch prediction */
0407     if (test_kvm_facility(vcpu->kvm, 82)) {
0408         scb_o->fpf &= ~FPF_BPBC;
0409         scb_o->fpf |= scb_s->fpf & FPF_BPBC;
0410     }
0411 
0412     /* interrupt intercept */
0413     switch (scb_s->icptcode) {
0414     case ICPT_PROGI:
0415     case ICPT_INSTPROGI:
0416     case ICPT_EXTINT:
0417         memcpy((void *)((u64)scb_o + 0xc0),
0418                (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
0419         break;
0420     }
0421 
0422     if (scb_s->ihcpu != 0xffffU)
0423         scb_o->ihcpu = scb_s->ihcpu;
0424 }
0425 
0426 /*
0427  * Setup the shadow scb by copying and checking the relevant parts of the g2
0428  * provided scb.
0429  *
0430  * Returns: - 0 if the scb has been shadowed
0431  *          - > 0 if control has to be given to guest 2
0432  */
0433 static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0434 {
0435     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0436     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0437     /* READ_ONCE does not work on bitfields - use a temporary variable */
0438     const uint32_t __new_prefix = scb_o->prefix;
0439     const uint32_t new_prefix = READ_ONCE(__new_prefix);
0440     const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE;
0441     bool had_tx = scb_s->ecb & ECB_TE;
0442     unsigned long new_mso = 0;
0443     int rc;
0444 
0445     /* make sure we don't have any leftovers when reusing the scb */
0446     scb_s->icptcode = 0;
0447     scb_s->eca = 0;
0448     scb_s->ecb = 0;
0449     scb_s->ecb2 = 0;
0450     scb_s->ecb3 = 0;
0451     scb_s->ecd = 0;
0452     scb_s->fac = 0;
0453     scb_s->fpf = 0;
0454 
0455     rc = prepare_cpuflags(vcpu, vsie_page);
0456     if (rc)
0457         goto out;
0458 
0459     /* timer */
0460     scb_s->cputm = scb_o->cputm;
0461     scb_s->ckc = scb_o->ckc;
0462     scb_s->todpr = scb_o->todpr;
0463     scb_s->epoch = scb_o->epoch;
0464 
0465     /* guest state */
0466     scb_s->gpsw = scb_o->gpsw;
0467     scb_s->gg14 = scb_o->gg14;
0468     scb_s->gg15 = scb_o->gg15;
0469     memcpy(scb_s->gcr, scb_o->gcr, 128);
0470     scb_s->pp = scb_o->pp;
0471 
0472     /* interception / execution handling */
0473     scb_s->gbea = scb_o->gbea;
0474     scb_s->lctl = scb_o->lctl;
0475     scb_s->svcc = scb_o->svcc;
0476     scb_s->ictl = scb_o->ictl;
0477     /*
0478      * SKEY handling functions can't deal with false setting of PTE invalid
0479      * bits. Therefore we cannot provide interpretation and would later
0480      * have to provide our own emulation handlers.
0481      */
0482     if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS))
0483         scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
0484 
0485     scb_s->icpua = scb_o->icpua;
0486 
0487     if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
0488         new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
0489     /* if the hva of the prefix changes, we have to remap the prefix */
0490     if (scb_s->mso != new_mso || scb_s->prefix != new_prefix)
0491         prefix_unmapped(vsie_page);
0492     /* SIE will do mso/msl validity and exception checks for us */
0493     scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
0494     scb_s->mso = new_mso;
0495     scb_s->prefix = new_prefix;
0496 
0497     /* We have to definitely flush the tlb if this scb never ran */
0498     if (scb_s->ihcpu != 0xffffU)
0499         scb_s->ihcpu = scb_o->ihcpu;
0500 
0501     /* MVPG and Protection Exception Interpretation are always available */
0502     scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI);
0503     /* Host-protection-interruption introduced with ESOP */
0504     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
0505         scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
0506     /*
0507      * CPU Topology
0508      * This facility only uses the utility field of the SCA and none of
0509      * the cpu entries that are problematic with the other interpretation
0510      * facilities so we can pass it through
0511      */
0512     if (test_kvm_facility(vcpu->kvm, 11))
0513         scb_s->ecb |= scb_o->ecb & ECB_PTF;
0514     /* transactional execution */
0515     if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
0516         /* remap the prefix if tx is toggled on */
0517         if (!had_tx)
0518             prefix_unmapped(vsie_page);
0519         scb_s->ecb |= ECB_TE;
0520     }
0521     /* specification exception interpretation */
0522     scb_s->ecb |= scb_o->ecb & ECB_SPECI;
0523     /* branch prediction */
0524     if (test_kvm_facility(vcpu->kvm, 82))
0525         scb_s->fpf |= scb_o->fpf & FPF_BPBC;
0526     /* SIMD */
0527     if (test_kvm_facility(vcpu->kvm, 129)) {
0528         scb_s->eca |= scb_o->eca & ECA_VX;
0529         scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
0530     }
0531     /* Run-time-Instrumentation */
0532     if (test_kvm_facility(vcpu->kvm, 64))
0533         scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI;
0534     /* Instruction Execution Prevention */
0535     if (test_kvm_facility(vcpu->kvm, 130))
0536         scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP;
0537     /* Guarded Storage */
0538     if (test_kvm_facility(vcpu->kvm, 133)) {
0539         scb_s->ecb |= scb_o->ecb & ECB_GS;
0540         scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
0541     }
0542     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
0543         scb_s->eca |= scb_o->eca & ECA_SII;
0544     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
0545         scb_s->eca |= scb_o->eca & ECA_IB;
0546     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
0547         scb_s->eca |= scb_o->eca & ECA_CEI;
0548     /* Epoch Extension */
0549     if (test_kvm_facility(vcpu->kvm, 139))
0550         scb_s->ecd |= scb_o->ecd & ECD_MEF;
0551 
0552     /* etoken */
0553     if (test_kvm_facility(vcpu->kvm, 156))
0554         scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
0555 
0556     scb_s->hpid = HPID_VSIE;
0557     scb_s->cpnc = scb_o->cpnc;
0558 
0559     prepare_ibc(vcpu, vsie_page);
0560     rc = shadow_crycb(vcpu, vsie_page);
0561 out:
0562     if (rc)
0563         unshadow_scb(vcpu, vsie_page);
0564     return rc;
0565 }
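
/*
 * Note: the shadow control fields (eca/ecb/ecb2/ecb3/ecd/fpf) are zeroed at
 * the top of shadow_scb() and individual bits are only copied over when the
 * corresponding facility or cpu feature is available to guest 2. Guest 3 can
 * therefore never enable an interpretation facility that guest 2 itself does
 * not have.
 */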
0566 
0567 void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
0568                  unsigned long end)
0569 {
0570     struct kvm *kvm = gmap->private;
0571     struct vsie_page *cur;
0572     unsigned long prefix;
0573     struct page *page;
0574     int i;
0575 
0576     if (!gmap_is_shadow(gmap))
0577         return;
0578     if (start >= 1UL << 31)
0579         /* We are only interested in prefix pages */
0580         return;
0581 
0582     /*
0583      * Only new shadow blocks are added to the list during runtime,
0584      * therefore we can safely reference them all the time.
0585      */
0586     for (i = 0; i < kvm->arch.vsie.page_count; i++) {
0587         page = READ_ONCE(kvm->arch.vsie.pages[i]);
0588         if (!page)
0589             continue;
0590         cur = page_to_virt(page);
0591         if (READ_ONCE(cur->gmap) != gmap)
0592             continue;
0593         prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
0594         /* with mso/msl, the prefix lies at an offset */
0595         prefix += cur->scb_s.mso;
0596         if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
0597             prefix_unmapped_sync(cur);
0598     }
0599 }
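
/*
 * The interval check above treats the prefix area as two pages (prefix and
 * prefix + PAGE_SIZE), matching what map_prefix() maps into the shadow gmap.
 * prefix_unmapped_sync() both blocks re-entry and waits for a currently
 * running vSIE to leave, so the notifier only returns once the stale mapping
 * can no longer be used.
 */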
0600 
0601 /*
0602  * Map the first prefix page and if tx is enabled also the second prefix page.
0603  *
0604  * The prefix will be protected, a gmap notifier will inform about unmaps.
0605  * The shadow scb must not be executed until the prefix is remapped, this is
0606  * guaranteed by properly handling PROG_REQUEST.
0607  *
0608  * Returns: - 0 if successfully mapped or already mapped
0609  *          - > 0 if control has to be given to guest 2
0610  *          - -EAGAIN if the caller can retry immediately
0611  *          - -ENOMEM if out of memory
0612  */
0613 static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0614 {
0615     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0616     u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
0617     int rc;
0618 
0619     if (prefix_is_mapped(vsie_page))
0620         return 0;
0621 
0622     /* mark it as mapped so we can catch any concurrent unmappers */
0623     prefix_mapped(vsie_page);
0624 
0625     /* with mso/msl, the prefix lies at offset *mso* */
0626     prefix += scb_s->mso;
0627 
0628     rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
0629     if (!rc && (scb_s->ecb & ECB_TE))
0630         rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
0631                        prefix + PAGE_SIZE, NULL);
0632     /*
0633      * We don't have to mprotect, we will be called for all unshadows.
0634      * SIE will detect if protection applies and trigger a validity.
0635      */
0636     if (rc)
0637         prefix_unmapped(vsie_page);
0638     if (rc > 0 || rc == -EFAULT)
0639         rc = set_validity_icpt(scb_s, 0x0037U);
0640     return rc;
0641 }
0642 
0643 /*
0644  * Pin the guest page given by gpa and set hpa to the pinned host address.
0645  * Will always be pinned writable.
0646  *
0647  * Returns: - 0 on success
0648  *          - -EINVAL if the gpa is not valid guest storage
0649  */
0650 static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
0651 {
0652     struct page *page;
0653 
0654     page = gfn_to_page(kvm, gpa_to_gfn(gpa));
0655     if (is_error_page(page))
0656         return -EINVAL;
0657     *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
0658     return 0;
0659 }
0660 
0661 /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
0662 static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
0663 {
0664     kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
0665     /* mark the page always as dirty for migration */
0666     mark_page_dirty(kvm, gpa_to_gfn(gpa));
0667 }
0668 
0669 /* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
0670 static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0671 {
0672     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0673     hpa_t hpa;
0674 
0675     hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
0676     if (hpa) {
0677         unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa);
0678         vsie_page->sca_gpa = 0;
0679         scb_s->scaol = 0;
0680         scb_s->scaoh = 0;
0681     }
0682 
0683     hpa = scb_s->itdba;
0684     if (hpa) {
0685         unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa);
0686         vsie_page->itdba_gpa = 0;
0687         scb_s->itdba = 0;
0688     }
0689 
0690     hpa = scb_s->gvrd;
0691     if (hpa) {
0692         unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa);
0693         vsie_page->gvrd_gpa = 0;
0694         scb_s->gvrd = 0;
0695     }
0696 
0697     hpa = scb_s->riccbd;
0698     if (hpa) {
0699         unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa);
0700         vsie_page->riccbd_gpa = 0;
0701         scb_s->riccbd = 0;
0702     }
0703 
0704     hpa = scb_s->sdnxo;
0705     if (hpa) {
0706         unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa);
0707         vsie_page->sdnx_gpa = 0;
0708         scb_s->sdnxo = 0;
0709     }
0710 }
0711 
0712 /*
0713  * Instead of shadowing some blocks, we can simply forward them because the
0714  * addresses in the scb are 64 bit long.
0715  *
0716  * This works as long as the data lies in one page. If blocks ever exceed one
0717  * page, we have to fall back to shadowing.
0718  *
0719  * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
0720  * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
0721  *
0722  * Returns: - 0 if all blocks were pinned.
0723  *          - > 0 if control has to be given to guest 2
0724  *          - -ENOMEM if out of memory
0725  */
0726 static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0727 {
0728     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
0729     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0730     hpa_t hpa;
0731     gpa_t gpa;
0732     int rc = 0;
0733 
0734     gpa = READ_ONCE(scb_o->scaol) & ~0xfUL;
0735     if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
0736         gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32;
0737     if (gpa) {
0738         if (gpa < 2 * PAGE_SIZE)
0739             rc = set_validity_icpt(scb_s, 0x0038U);
0740         else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
0741             rc = set_validity_icpt(scb_s, 0x0011U);
0742         else if ((gpa & PAGE_MASK) !=
0743              ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
0744             rc = set_validity_icpt(scb_s, 0x003bU);
0745         if (!rc) {
0746             rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0747             if (rc)
0748                 rc = set_validity_icpt(scb_s, 0x0034U);
0749         }
0750         if (rc)
0751             goto unpin;
0752         vsie_page->sca_gpa = gpa;
0753         scb_s->scaoh = (u32)((u64)hpa >> 32);
0754         scb_s->scaol = (u32)(u64)hpa;
0755     }
0756 
0757     gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
0758     if (gpa && (scb_s->ecb & ECB_TE)) {
0759         if (gpa < 2 * PAGE_SIZE) {
0760             rc = set_validity_icpt(scb_s, 0x0080U);
0761             goto unpin;
0762         }
0763         /* 256 bytes cannot cross page boundaries */
0764         rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0765         if (rc) {
0766             rc = set_validity_icpt(scb_s, 0x0080U);
0767             goto unpin;
0768         }
0769         vsie_page->itdba_gpa = gpa;
0770         scb_s->itdba = hpa;
0771     }
0772 
0773     gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL;
0774     if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
0775         if (gpa < 2 * PAGE_SIZE) {
0776             rc = set_validity_icpt(scb_s, 0x1310U);
0777             goto unpin;
0778         }
0779         /*
0780          * 512 bytes of vector registers cannot cross page boundaries;
0781          * if this block gets bigger, we have to shadow it.
0782          */
0783         rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0784         if (rc) {
0785             rc = set_validity_icpt(scb_s, 0x1310U);
0786             goto unpin;
0787         }
0788         vsie_page->gvrd_gpa = gpa;
0789         scb_s->gvrd = hpa;
0790     }
0791 
0792     gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL;
0793     if (gpa && (scb_s->ecb3 & ECB3_RI)) {
0794         if (gpa < 2 * PAGE_SIZE) {
0795             rc = set_validity_icpt(scb_s, 0x0043U);
0796             goto unpin;
0797         }
0798         /* 64 bytes cannot cross page boundaries */
0799         rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0800         if (rc) {
0801             rc = set_validity_icpt(scb_s, 0x0043U);
0802             goto unpin;
0803         }
0804         /* Validity 0x0044 will be checked by SIE */
0805         vsie_page->riccbd_gpa = gpa;
0806         scb_s->riccbd = hpa;
0807     }
0808     if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) ||
0809         (scb_s->ecd & ECD_ETOKENF)) {
0810         unsigned long sdnxc;
0811 
0812         gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
0813         sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL;
0814         if (!gpa || gpa < 2 * PAGE_SIZE) {
0815             rc = set_validity_icpt(scb_s, 0x10b0U);
0816             goto unpin;
0817         }
0818         if (sdnxc < 6 || sdnxc > 12) {
0819             rc = set_validity_icpt(scb_s, 0x10b1U);
0820             goto unpin;
0821         }
0822         if (gpa & ((1 << sdnxc) - 1)) {
0823             rc = set_validity_icpt(scb_s, 0x10b2U);
0824             goto unpin;
0825         }
0826         /* Due to alignment rules (checked above) this cannot
0827          * cross page boundaries
0828          */
0829         rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0830         if (rc) {
0831             rc = set_validity_icpt(scb_s, 0x10b0U);
0832             goto unpin;
0833         }
0834         vsie_page->sdnx_gpa = gpa;
0835         scb_s->sdnxo = hpa | sdnxc;
0836     }
0837     return 0;
0838 unpin:
0839     unpin_blocks(vcpu, vsie_page);
0840     return rc;
0841 }
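
/*
 * Note: every satellite block above fits into (and, due to the size and
 * alignment checks, cannot cross) a single page, so it is simply pinned
 * writable and its host address is written into the shadow scb instead of
 * being shadowed. On any failure, everything pinned so far is released again
 * via unpin_blocks() before the validity intercept is reported.
 */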
0842 
0843 /* unpin the scb provided by guest 2, marking it as dirty */
0844 static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
0845               gpa_t gpa)
0846 {
0847     hpa_t hpa = (hpa_t) vsie_page->scb_o;
0848 
0849     if (hpa)
0850         unpin_guest_page(vcpu->kvm, gpa, hpa);
0851     vsie_page->scb_o = NULL;
0852 }
0853 
0854 /*
0855  * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
0856  *
0857  * Returns: - 0 if the scb was pinned.
0858  *          - > 0 if control has to be given to guest 2
0859  */
0860 static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
0861            gpa_t gpa)
0862 {
0863     hpa_t hpa;
0864     int rc;
0865 
0866     rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
0867     if (rc) {
0868         rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
0869         WARN_ON_ONCE(rc);
0870         return 1;
0871     }
0872     vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
0873     return 0;
0874 }
0875 
0876 /*
0877  * Inject a fault into guest 2.
0878  *
0879  * Returns: - > 0 if control has to be given to guest 2
0880  *            < 0 if an error occurred during injection.
0881  */
0882 static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
0883             bool write_flag)
0884 {
0885     struct kvm_s390_pgm_info pgm = {
0886         .code = code,
0887         .trans_exc_code =
0888             /* 0-51: virtual address */
0889             (vaddr & 0xfffffffffffff000UL) |
0890             /* 52-53: store / fetch */
0891             (((unsigned int) !write_flag) + 1) << 10,
0892             /* 62-63: asce id (always primary == 0) */
0893         .exc_access_id = 0, /* always primary */
0894         .op_access_id = 0, /* not MVPG */
0895     };
0896     int rc;
0897 
0898     if (code == PGM_PROTECTION)
0899         pgm.trans_exc_code |= 0x4UL;
0900 
0901     rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
0902     return rc ? rc : 1;
0903 }
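
/*
 * For reference, the trans_exc_code built above: bits 0-51 carry the page
 * address of vaddr; the (!write_flag + 1) << 10 term yields 0x400 for a
 * store and 0x800 for a fetch (the "52-53: store / fetch" indication noted
 * inline); protection exceptions additionally get bit 0x4 set.
 */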
0904 
0905 /*
0906  * Handle a fault during vsie execution on a gmap shadow.
0907  *
0908  * Returns: - 0 if the fault was resolved
0909  *          - > 0 if control has to be given to guest 2
0910  *          - < 0 if an error occurred
0911  */
0912 static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0913 {
0914     int rc;
0915 
0916     if (current->thread.gmap_int_code == PGM_PROTECTION)
0917         /* we can directly forward all protection exceptions */
0918         return inject_fault(vcpu, PGM_PROTECTION,
0919                     current->thread.gmap_addr, 1);
0920 
0921     rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
0922                    current->thread.gmap_addr, NULL);
0923     if (rc > 0) {
0924         rc = inject_fault(vcpu, rc,
0925                   current->thread.gmap_addr,
0926                   current->thread.gmap_write_flag);
0927         if (rc >= 0)
0928             vsie_page->fault_addr = current->thread.gmap_addr;
0929     }
0930     return rc;
0931 }
0932 
0933 /*
0934  * Retry the previous fault that required guest 2 intervention. This avoids
0935  * one superfluous SIE re-entry and direct exit.
0936  *
0937  * Will ignore any errors. The next SIE fault will do proper fault handling.
0938  */
0939 static void handle_last_fault(struct kvm_vcpu *vcpu,
0940                   struct vsie_page *vsie_page)
0941 {
0942     if (vsie_page->fault_addr)
0943         kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
0944                       vsie_page->fault_addr, NULL);
0945     vsie_page->fault_addr = 0;
0946 }
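
/*
 * Fault handling summary: protection exceptions on the shadow gmap are
 * forwarded to guest 2 as-is; all other faults are first resolved through
 * kvm_s390_shadow_fault(), and only a positive return value (a program
 * interruption code) is injected into guest 2. The injected address is
 * remembered in fault_addr so that handle_last_fault() can try to resolve it
 * up front on the next vSIE entry, avoiding one extra SIE exit.
 */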
0947 
0948 static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
0949 {
0950     vsie_page->scb_s.icptcode = 0;
0951 }
0952 
0953 /* rewind the psw and clear the vsie icpt, so we can retry execution */
0954 static void retry_vsie_icpt(struct vsie_page *vsie_page)
0955 {
0956     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0957     int ilen = insn_length(scb_s->ipa >> 8);
0958 
0959     /* take care of EXECUTE instructions */
0960     if (scb_s->icptstatus & 1) {
0961         ilen = (scb_s->icptstatus >> 4) & 0x6;
0962         if (!ilen)
0963             ilen = 4;
0964     }
0965     scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
0966     clear_vsie_icpt(vsie_page);
0967 }
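
/*
 * The instruction length used for rewinding is normally derived from the
 * opcode (insn_length(ipa >> 8)); if icptstatus bit 0x01 indicates the
 * instruction was the target of an EXECUTE, the length comes from
 * icptstatus bits (>> 4) & 0x6, with 0 treated as 4.
 */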
0968 
0969 /*
0970  * Try to shadow + enable the guest 2 provided facility list.
0971  * Retry instruction execution if enabled for and provided by guest 2.
0972  *
0973  * Returns: - 0 if handled (retry or guest 2 icpt)
0974  *          - > 0 if control has to be given to guest 2
0975  */
0976 static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
0977 {
0978     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
0979     __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
0980 
0981     if (fac && test_kvm_facility(vcpu->kvm, 7)) {
0982         retry_vsie_icpt(vsie_page);
0983         if (read_guest_real(vcpu, fac, &vsie_page->fac,
0984                     sizeof(vsie_page->fac)))
0985             return set_validity_icpt(scb_s, 0x1090U);
0986         scb_s->fac = (__u32)(__u64) &vsie_page->fac;
0987     }
0988     return 0;
0989 }
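
/*
 * With facility 7 available and a facility list provided by guest 2 in the
 * original scb, that list is copied into vsie_page->fac and its (31-bit)
 * address placed into the shadow scb after rewinding the PSW, so the STFLE
 * is retried and handled by SIE using the shadowed list.
 */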
0990 
0991 /*
0992  * Get a register for a nested guest.
0993  * @vcpu the vcpu of the guest
0994  * @vsie_page the vsie_page for the nested guest
0995  * @reg the register number, the upper 4 bits are ignored.
0996  * returns: the value of the register.
0997  */
0998 static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
0999 {
1000     /* no need to validate the parameter and/or perform error handling */
1001     reg &= 0xf;
1002     switch (reg) {
1003     case 15:
1004         return vsie_page->scb_s.gg15;
1005     case 14:
1006         return vsie_page->scb_s.gg14;
1007     default:
1008         return vcpu->run->s.regs.gprs[reg];
1009     }
1010 }
1011 
1012 static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
1013 {
1014     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
1015     unsigned long pei_dest, pei_src, src, dest, mask, prefix;
1016     u64 *pei_block = &vsie_page->scb_o->mcic;
1017     int edat, rc_dest, rc_src;
1018     union ctlreg0 cr0;
1019 
1020     cr0.val = vcpu->arch.sie_block->gcr[0];
1021     edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
1022     mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
1023     prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
1024 
1025     dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
1026     dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
1027     src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
1028     src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
1029 
1030     rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
1031     rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
1032     /*
1033      * Either everything went well, or something non-critical went wrong
1034      * e.g. because of a race. In either case, simply retry.
1035      */
1036     if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
1037         retry_vsie_icpt(vsie_page);
1038         return -EAGAIN;
1039     }
1040     /* Something more serious went wrong, propagate the error */
1041     if (rc_dest < 0)
1042         return rc_dest;
1043     if (rc_src < 0)
1044         return rc_src;
1045 
1046     /* The only possible suppressing exception: just deliver it */
1047     if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
1048         clear_vsie_icpt(vsie_page);
1049         rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
1050         WARN_ON_ONCE(rc_dest);
1051         return 1;
1052     }
1053 
1054     /*
1055      * Forward the PEI intercept to the guest if it was a page fault, or
1056      * also for segment and region table faults if EDAT applies.
1057      */
1058     if (edat) {
1059         rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
1060         rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
1061     } else {
1062         rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
1063         rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
1064     }
1065     if (!rc_dest && !rc_src) {
1066         pei_block[0] = pei_dest;
1067         pei_block[1] = pei_src;
1068         return 1;
1069     }
1070 
1071     retry_vsie_icpt(vsie_page);
1072 
1073     /*
1074      * The host has edat, and the guest does not, or it was an ASCE type
1075      * exception. The host needs to inject the appropriate DAT interrupts
1076      * into the guest.
1077      */
1078     if (rc_dest)
1079         return inject_fault(vcpu, rc_dest, dest, 1);
1080     return inject_fault(vcpu, rc_src, src, 0);
1081 }
1082 
1083 /*
1084  * Run the vsie on a shadow scb and a shadow gmap, without any further
1085  * sanity checks, handling SIE faults.
1086  *
1087  * Returns: - 0 everything went fine
1088  *          - > 0 if control has to be given to guest 2
1089  *          - < 0 if an error occurred
1090  */
1091 static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
1092     __releases(vcpu->kvm->srcu)
1093     __acquires(vcpu->kvm->srcu)
1094 {
1095     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
1096     struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
1097     int guest_bp_isolation;
1098     int rc = 0;
1099 
1100     handle_last_fault(vcpu, vsie_page);
1101 
1102     kvm_vcpu_srcu_read_unlock(vcpu);
1103 
1104     /* save current guest state of bp isolation override */
1105     guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
1106 
1107     /*
1108      * The guest is running with BPBC, so we have to force it on for our
1109      * nested guest. This is done by enabling BPBC globally, so the BPBC
1110      * control in the SCB (which the nested guest can modify) is simply
1111      * ignored.
1112      */
1113     if (test_kvm_facility(vcpu->kvm, 82) &&
1114         vcpu->arch.sie_block->fpf & FPF_BPBC)
1115         set_thread_flag(TIF_ISOLATE_BP_GUEST);
1116 
1117     local_irq_disable();
1118     guest_enter_irqoff();
1119     local_irq_enable();
1120 
1121     /*
1122      * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
1123      * and VCPU requests also hinder the vSIE from running and lead
1124      * to an immediate exit. kvm_s390_vsie_kick() has to be used to
1125      * also kick the vSIE.
1126      */
1127     vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
1128     barrier();
1129     if (test_cpu_flag(CIF_FPU))
1130         load_fpu_regs();
1131     if (!kvm_s390_vcpu_sie_inhibited(vcpu))
1132         rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
1133     barrier();
1134     vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
1135 
1136     local_irq_disable();
1137     guest_exit_irqoff();
1138     local_irq_enable();
1139 
1140     /* restore guest state for bp isolation override */
1141     if (!guest_bp_isolation)
1142         clear_thread_flag(TIF_ISOLATE_BP_GUEST);
1143 
1144     kvm_vcpu_srcu_read_lock(vcpu);
1145 
1146     if (rc == -EINTR) {
1147         VCPU_EVENT(vcpu, 3, "%s", "machine check");
1148         kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
1149         return 0;
1150     }
1151 
1152     if (rc > 0)
1153         rc = 0; /* we could still have an icpt */
1154     else if (rc == -EFAULT)
1155         return handle_fault(vcpu, vsie_page);
1156 
1157     switch (scb_s->icptcode) {
1158     case ICPT_INST:
1159         if (scb_s->ipa == 0xb2b0)
1160             rc = handle_stfle(vcpu, vsie_page);
1161         break;
1162     case ICPT_STOP:
1163         /* stop not requested by g2 - must have been a kick */
1164         if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
1165             clear_vsie_icpt(vsie_page);
1166         break;
1167     case ICPT_VALIDITY:
1168         if ((scb_s->ipa & 0xf000) != 0xf000)
1169             scb_s->ipa += 0x1000;
1170         break;
1171     case ICPT_PARTEXEC:
1172         if (scb_s->ipa == 0xb254)
1173             rc = vsie_handle_mvpg(vcpu, vsie_page);
1174         break;
1175     }
1176     return rc;
1177 }
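
/*
 * Interception handling above in short: 0xb2b0 is STFLE (facility list gets
 * shadowed and the instruction retried), 0xb254 is MVPG handled as a partial
 * execution intercept, and a STOP that guest 2 did not request must have
 * been a kvm_s390_vsie_kick() and is therefore swallowed rather than
 * forwarded.
 */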
1178 
1179 static void release_gmap_shadow(struct vsie_page *vsie_page)
1180 {
1181     if (vsie_page->gmap)
1182         gmap_put(vsie_page->gmap);
1183     WRITE_ONCE(vsie_page->gmap, NULL);
1184     prefix_unmapped(vsie_page);
1185 }
1186 
1187 static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
1188                    struct vsie_page *vsie_page)
1189 {
1190     unsigned long asce;
1191     union ctlreg0 cr0;
1192     struct gmap *gmap;
1193     int edat;
1194 
1195     asce = vcpu->arch.sie_block->gcr[1];
1196     cr0.val = vcpu->arch.sie_block->gcr[0];
1197     edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
1198     edat += edat && test_kvm_facility(vcpu->kvm, 78);
1199 
1200     /*
1201      * ASCE or EDAT could have changed since last icpt, or the gmap
1202      * we're holding has been unshadowed. If the gmap is still valid,
1203      * we can safely reuse it.
1204      */
1205     if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
1206         return 0;
1207 
1208     /* release the old shadow - if any, and mark the prefix as unmapped */
1209     release_gmap_shadow(vsie_page);
1210     gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
1211     if (IS_ERR(gmap))
1212         return PTR_ERR(gmap);
1213     gmap->private = vcpu->kvm;
1214     WRITE_ONCE(vsie_page->gmap, gmap);
1215     return 0;
1216 }
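
/*
 * The shadow gmap is keyed by the guest 2 ASCE (cr1) and the effective edat
 * level (0, 1 or 2, depending on cr0.edat and facilities 8/78). As long as
 * gmap_shadow_valid() still matches, the cached shadow is reused; otherwise
 * it is dropped and a fresh one created, published with WRITE_ONCE() so the
 * gmap notifier sees a consistent pointer.
 */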
1217 
1218 /*
1219  * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
1220  */
1221 static void register_shadow_scb(struct kvm_vcpu *vcpu,
1222                 struct vsie_page *vsie_page)
1223 {
1224     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
1225 
1226     WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
1227     /*
1228      * External calls have to lead to a kick of the vcpu and
1229      * therefore the vsie -> Simulate Wait state.
1230      */
1231     kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
1232     /*
1233      * We have to adjust the g3 epoch by the g2 epoch. The epoch will
1234      * automatically be adjusted on tod clock changes via kvm_sync_clock.
1235      */
1236     preempt_disable();
1237     scb_s->epoch += vcpu->kvm->arch.epoch;
1238 
1239     if (scb_s->ecd & ECD_MEF) {
1240         scb_s->epdx += vcpu->kvm->arch.epdx;
1241         if (scb_s->epoch < vcpu->kvm->arch.epoch)
1242             scb_s->epdx += 1;
1243     }
1244 
1245     preempt_enable();
1246 }
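
/*
 * Epoch math above: the epoch in the shadow scb is relative to guest 2, so
 * guest 2's epoch is added in; with the multi-epoch facility, epdx is added
 * as the high part and incremented by one more if the 64-bit addition
 * wrapped (detected by scb_s->epoch < vcpu->kvm->arch.epoch). Preemption is
 * disabled so a concurrent TOD clock change cannot adjust the values in the
 * middle of the calculation.
 */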
1247 
1248 /*
1249  * Unregister a shadow scb from a VCPU.
1250  */
1251 static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
1252 {
1253     kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
1254     WRITE_ONCE(vcpu->arch.vsie_block, NULL);
1255 }
1256 
1257 /*
1258  * Run the vsie on a shadowed scb, managing the gmap shadow, handling
1259  * prefix pages and faults.
1260  *
1261  * Returns: - 0 if no errors occurred
1262  *          - > 0 if control has to be given to guest 2
1263  *          - -ENOMEM if out of memory
1264  */
1265 static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
1266 {
1267     struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
1268     int rc = 0;
1269 
1270     while (1) {
1271         rc = acquire_gmap_shadow(vcpu, vsie_page);
1272         if (!rc)
1273             rc = map_prefix(vcpu, vsie_page);
1274         if (!rc) {
1275             gmap_enable(vsie_page->gmap);
1276             update_intervention_requests(vsie_page);
1277             rc = do_vsie_run(vcpu, vsie_page);
1278             gmap_enable(vcpu->arch.gmap);
1279         }
1280         atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
1281 
1282         if (rc == -EAGAIN)
1283             rc = 0;
1284         if (rc || scb_s->icptcode || signal_pending(current) ||
1285             kvm_s390_vcpu_has_irq(vcpu, 0) ||
1286             kvm_s390_vcpu_sie_inhibited(vcpu))
1287             break;
1288         cond_resched();
1289     }
1290 
1291     if (rc == -EFAULT) {
1292         /*
1293          * Addressing exceptions are always presented as intercepts.
1294          * As addressing exceptions are suppressing and our guest 3 PSW
1295          * points at the responsible instruction, we have to
1296          * forward the PSW and set the ilc. If we can't read guest 3
1297          * instruction, we can use an arbitrary ilc. Let's always use
1298          * ilen = 4 for now, so we can avoid reading in guest 3 virtual
1299          * memory. (we could also fake the shadow so the hardware
1300          * handles it).
1301          */
1302         scb_s->icptcode = ICPT_PROGI;
1303         scb_s->iprcc = PGM_ADDRESSING;
1304         scb_s->pgmilc = 4;
1305         scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
1306         rc = 1;
1307     }
1308     return rc;
1309 }
1310 
1311 /*
1312  * Get or create a vsie page for a scb address.
1313  *
1314  * Returns: - address of a vsie page (cached or new one)
1315  *          - NULL if the same scb address is already used by another VCPU
1316  *          - ERR_PTR(-ENOMEM) if out of memory
1317  */
1318 static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
1319 {
1320     struct vsie_page *vsie_page;
1321     struct page *page;
1322     int nr_vcpus;
1323 
1324     rcu_read_lock();
1325     page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
1326     rcu_read_unlock();
1327     if (page) {
1328         if (page_ref_inc_return(page) == 2)
1329             return page_to_virt(page);
1330         page_ref_dec(page);
1331     }
1332 
1333     /*
1334      * We want at least #online_vcpus shadows, so every VCPU can execute
1335      * the VSIE in parallel.
1336      */
1337     nr_vcpus = atomic_read(&kvm->online_vcpus);
1338 
1339     mutex_lock(&kvm->arch.vsie.mutex);
1340     if (kvm->arch.vsie.page_count < nr_vcpus) {
1341         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
1342         if (!page) {
1343             mutex_unlock(&kvm->arch.vsie.mutex);
1344             return ERR_PTR(-ENOMEM);
1345         }
1346         page_ref_inc(page);
1347         kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
1348         kvm->arch.vsie.page_count++;
1349     } else {
1350         /* reuse an existing entry that belongs to nobody */
1351         while (true) {
1352             page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
1353             if (page_ref_inc_return(page) == 2)
1354                 break;
1355             page_ref_dec(page);
1356             kvm->arch.vsie.next++;
1357             kvm->arch.vsie.next %= nr_vcpus;
1358         }
1359         radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
1360     }
1361     page->index = addr;
1362     /* double use of the same address */
1363     if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
1364         page_ref_dec(page);
1365         mutex_unlock(&kvm->arch.vsie.mutex);
1366         return NULL;
1367     }
1368     mutex_unlock(&kvm->arch.vsie.mutex);
1369 
1370     vsie_page = page_to_virt(page);
1371     memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
1372     release_gmap_shadow(vsie_page);
1373     vsie_page->fault_addr = 0;
1374     vsie_page->scb_s.ihcpu = 0xffffU;
1375     return vsie_page;
1376 }
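
/*
 * Caching scheme: vsie pages are indexed in a radix tree by scb address >> 9
 * (scbs are 512-byte aligned). A page reference count of 1 means "cached but
 * unused"; page_ref_inc_return() == 2 claims it for the current VCPU. At
 * most one page per online VCPU is allocated, after that existing entries
 * are reused round-robin via vsie.next. A reused page gets its shadow scb
 * cleared, its gmap shadow released and ihcpu set to 0xffff so the first run
 * does not rely on stale TLB contents.
 */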
1377 
1378 /* put a vsie page acquired via get_vsie_page */
1379 static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
1380 {
1381     struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
1382 
1383     page_ref_dec(page);
1384 }
1385 
1386 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
1387 {
1388     struct vsie_page *vsie_page;
1389     unsigned long scb_addr;
1390     int rc;
1391 
1392     vcpu->stat.instruction_sie++;
1393     if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
1394         return -EOPNOTSUPP;
1395     if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
1396         return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
1397 
1398     BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE);
1399     scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
1400 
1401     /* 512 byte alignment */
1402     if (unlikely(scb_addr & 0x1ffUL))
1403         return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
1404 
1405     if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
1406         kvm_s390_vcpu_sie_inhibited(vcpu))
1407         return 0;
1408 
1409     vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
1410     if (IS_ERR(vsie_page))
1411         return PTR_ERR(vsie_page);
1412     else if (!vsie_page)
1413         /* double use of sie control block - simply do nothing */
1414         return 0;
1415 
1416     rc = pin_scb(vcpu, vsie_page, scb_addr);
1417     if (rc)
1418         goto out_put;
1419     rc = shadow_scb(vcpu, vsie_page);
1420     if (rc)
1421         goto out_unpin_scb;
1422     rc = pin_blocks(vcpu, vsie_page);
1423     if (rc)
1424         goto out_unshadow;
1425     register_shadow_scb(vcpu, vsie_page);
1426     rc = vsie_run(vcpu, vsie_page);
1427     unregister_shadow_scb(vcpu);
1428     unpin_blocks(vcpu, vsie_page);
1429 out_unshadow:
1430     unshadow_scb(vcpu, vsie_page);
1431 out_unpin_scb:
1432     unpin_scb(vcpu, vsie_page, scb_addr);
1433 out_put:
1434     put_vsie_page(vcpu->kvm, vsie_page);
1435 
1436     return rc < 0 ? rc : 0;
1437 }
1438 
1439 /* Init the vsie data structures. To be called when a vm is initialized. */
1440 void kvm_s390_vsie_init(struct kvm *kvm)
1441 {
1442     mutex_init(&kvm->arch.vsie.mutex);
1443     INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
1444 }
1445 
1446 /* Destroy the vsie data structures. To be called when a vm is destroyed. */
1447 void kvm_s390_vsie_destroy(struct kvm *kvm)
1448 {
1449     struct vsie_page *vsie_page;
1450     struct page *page;
1451     int i;
1452 
1453     mutex_lock(&kvm->arch.vsie.mutex);
1454     for (i = 0; i < kvm->arch.vsie.page_count; i++) {
1455         page = kvm->arch.vsie.pages[i];
1456         kvm->arch.vsie.pages[i] = NULL;
1457         vsie_page = page_to_virt(page);
1458         release_gmap_shadow(vsie_page);
1459         /* free the radix tree entry */
1460         radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
1461         __free_page(page);
1462     }
1463     kvm->arch.vsie.page_count = 0;
1464     mutex_unlock(&kvm->arch.vsie.mutex);
1465 }
1466 
1467 void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
1468 {
1469     struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
1470 
1471     /*
1472      * Even if the VCPU lets go of the shadow sie block reference, it is
1473      * still valid in the cache. So we can safely kick it.
1474      */
1475     if (scb) {
1476         atomic_or(PROG_BLOCK_SIE, &scb->prog20);
1477         if (scb->prog0c & PROG_IN_SIE)
1478             atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
1479     }
1480 }