0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Core of Xen paravirt_ops implementation.
0004  *
0005  * This file contains the xen_paravirt_ops structure itself, and the
0006  * implementations for:
0007  * - privileged instructions
0008  * - interrupt flags
0009  * - segment operations
0010  * - booting and setup
0011  *
0012  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
0013  */
0014 
0015 #include <linux/cpu.h>
0016 #include <linux/kernel.h>
0017 #include <linux/init.h>
0018 #include <linux/smp.h>
0019 #include <linux/preempt.h>
0020 #include <linux/hardirq.h>
0021 #include <linux/percpu.h>
0022 #include <linux/delay.h>
0023 #include <linux/start_kernel.h>
0024 #include <linux/sched.h>
0025 #include <linux/kprobes.h>
0026 #include <linux/memblock.h>
0027 #include <linux/export.h>
0028 #include <linux/mm.h>
0029 #include <linux/page-flags.h>
0030 #include <linux/pci.h>
0031 #include <linux/gfp.h>
0032 #include <linux/edd.h>
0033 #include <linux/reboot.h>
0034 #include <linux/virtio_anchor.h>
0035 
0036 #include <xen/xen.h>
0037 #include <xen/events.h>
0038 #include <xen/interface/xen.h>
0039 #include <xen/interface/version.h>
0040 #include <xen/interface/physdev.h>
0041 #include <xen/interface/vcpu.h>
0042 #include <xen/interface/memory.h>
0043 #include <xen/interface/nmi.h>
0044 #include <xen/interface/xen-mca.h>
0045 #include <xen/features.h>
0046 #include <xen/page.h>
0047 #include <xen/hvc-console.h>
0048 #include <xen/acpi.h>
0049 
0050 #include <asm/paravirt.h>
0051 #include <asm/apic.h>
0052 #include <asm/page.h>
0053 #include <asm/xen/pci.h>
0054 #include <asm/xen/hypercall.h>
0055 #include <asm/xen/hypervisor.h>
0056 #include <asm/xen/cpuid.h>
0057 #include <asm/fixmap.h>
0058 #include <asm/processor.h>
0059 #include <asm/proto.h>
0060 #include <asm/msr-index.h>
0061 #include <asm/traps.h>
0062 #include <asm/setup.h>
0063 #include <asm/desc.h>
0064 #include <asm/pgalloc.h>
0065 #include <asm/tlbflush.h>
0066 #include <asm/reboot.h>
0067 #include <asm/stackprotector.h>
0068 #include <asm/hypervisor.h>
0069 #include <asm/mach_traps.h>
0070 #include <asm/mwait.h>
0071 #include <asm/pci_x86.h>
0072 #include <asm/cpu.h>
0073 #ifdef CONFIG_X86_IOPL_IOPERM
0074 #include <asm/io_bitmap.h>
0075 #endif
0076 
0077 #ifdef CONFIG_ACPI
0078 #include <linux/acpi.h>
0079 #include <asm/acpi.h>
0080 #include <acpi/pdc_intel.h>
0081 #include <acpi/processor.h>
0082 #include <xen/interface/platform.h>
0083 #endif
0084 
0085 #include "xen-ops.h"
0086 #include "mmu.h"
0087 #include "smp.h"
0088 #include "multicalls.h"
0089 #include "pmu.h"
0090 
0091 #include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
0092 
0093 void *xen_initial_gdt;
0094 
0095 static int xen_cpu_up_prepare_pv(unsigned int cpu);
0096 static int xen_cpu_dead_pv(unsigned int cpu);
0097 
0098 struct tls_descs {
0099     struct desc_struct desc[3];
0100 };
0101 
0102 /*
0103  * Updating the 3 TLS descriptors in the GDT on every task switch is
0104  * surprisingly expensive, so we avoid updating them if they haven't
0105  * changed.  Since Xen writes different descriptors than the ones
0106  * passed in the update_descriptor hypercall, we keep shadow copies to
0107  * compare against.
0108  */
0109 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
0110 
0111 static void __init xen_pv_init_platform(void)
0112 {
0113     /* PV guests can't operate virtio devices without grants. */
0114     if (IS_ENABLED(CONFIG_XEN_VIRTIO))
0115         virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
0116 
0117     populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
0118 
0119     set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
0120     HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
0121 
0122     /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */
0123     xen_vcpu_info_reset(0);
0124 
0125     /* pvclock is in shared info area */
0126     xen_init_time_ops();
0127 }
0128 
0129 static void __init xen_pv_guest_late_init(void)
0130 {
0131 #ifndef CONFIG_SMP
0132     /* Setup shared vcpu info for non-smp configurations */
0133     xen_setup_vcpu_info_placement();
0134 #endif
0135 }
0136 
0137 static __read_mostly unsigned int cpuid_leaf5_ecx_val;
0138 static __read_mostly unsigned int cpuid_leaf5_edx_val;
0139 
0140 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
0141               unsigned int *cx, unsigned int *dx)
0142 {
0143     unsigned maskebx = ~0;
0144 
0145     /*
0146      * Mask out inconvenient features, to try to disable as many
0147      * unsupported kernel subsystems as possible.
0148      */
0149     switch (*ax) {
0150     case CPUID_MWAIT_LEAF:
0151         /* Synthesize the values.. */
0152         *ax = 0;
0153         *bx = 0;
0154         *cx = cpuid_leaf5_ecx_val;
0155         *dx = cpuid_leaf5_edx_val;
0156         return;
0157 
0158     case 0xb:
0159         /* Suppress extended topology stuff */
0160         maskebx = 0;
0161         break;
0162     }
0163 
0164     asm(XEN_EMULATE_PREFIX "cpuid"
0165         : "=a" (*ax),
0166           "=b" (*bx),
0167           "=c" (*cx),
0168           "=d" (*dx)
0169         : "0" (*ax), "2" (*cx));
0170 
0171     *bx &= maskebx;
0172 }
0173 
0174 static bool __init xen_check_mwait(void)
0175 {
0176 #ifdef CONFIG_ACPI
0177     struct xen_platform_op op = {
0178         .cmd            = XENPF_set_processor_pminfo,
0179         .u.set_pminfo.id    = -1,
0180         .u.set_pminfo.type  = XEN_PM_PDC,
0181     };
0182     uint32_t buf[3];
0183     unsigned int ax, bx, cx, dx;
0184     unsigned int mwait_mask;
0185 
0186     /* We need to determine whether it is OK to expose the MWAIT
0187      * capability to the kernel to harvest deeper than C3 states from ACPI
0188      * _CST using the processor_harvest_xen.c module. For this to work, we
0189      * need to gather the MWAIT_LEAF values (which the cstate.c code
0190      * checks against). The hypervisor won't expose the MWAIT flag because
0191      * it would break backwards compatibility; so we will find out directly
0192      * from the hardware and hypercall.
0193      */
0194     if (!xen_initial_domain())
0195         return false;
0196 
0197     /*
0198      * When running under a platform earlier than Xen 4.2, do not expose
0199      * MWAIT, to avoid the risk of loading the native ACPI PAD driver.
0200      */
0201     if (!xen_running_on_version_or_later(4, 2))
0202         return false;
0203 
0204     ax = 1;
0205     cx = 0;
0206 
0207     native_cpuid(&ax, &bx, &cx, &dx);
0208 
0209     mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
0210              (1 << (X86_FEATURE_MWAIT % 32));
0211 
0212     if ((cx & mwait_mask) != mwait_mask)
0213         return false;
0214 
0215     /* We need to emulate the MWAIT_LEAF and for that we need both
0216      * ecx and edx. The hypercall provides only partial information.
0217      */
0218 
0219     ax = CPUID_MWAIT_LEAF;
0220     bx = 0;
0221     cx = 0;
0222     dx = 0;
0223 
0224     native_cpuid(&ax, &bx, &cx, &dx);
0225 
0226     /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
0227      * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
0228      */
0229     buf[0] = ACPI_PDC_REVISION_ID;
0230     buf[1] = 1;
0231     buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
0232 
0233     set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
0234 
0235     if ((HYPERVISOR_platform_op(&op) == 0) &&
0236         (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
0237         cpuid_leaf5_ecx_val = cx;
0238         cpuid_leaf5_edx_val = dx;
0239     }
0240     return true;
0241 #else
0242     return false;
0243 #endif
0244 }
0245 
0246 static bool __init xen_check_xsave(void)
0247 {
0248     unsigned int cx, xsave_mask;
0249 
0250     cx = cpuid_ecx(1);
0251 
0252     xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
0253              (1 << (X86_FEATURE_OSXSAVE % 32));
0254 
0255     /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
0256     return (cx & xsave_mask) == xsave_mask;
0257 }
0258 
0259 static void __init xen_init_capabilities(void)
0260 {
0261     setup_force_cpu_cap(X86_FEATURE_XENPV);
0262     setup_clear_cpu_cap(X86_FEATURE_DCA);
0263     setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
0264     setup_clear_cpu_cap(X86_FEATURE_MTRR);
0265     setup_clear_cpu_cap(X86_FEATURE_ACC);
0266     setup_clear_cpu_cap(X86_FEATURE_X2APIC);
0267     setup_clear_cpu_cap(X86_FEATURE_SME);
0268 
0269     /*
0270      * Xen PV would need some work to support PCID: CR3 handling as well
0271      * as xen_flush_tlb_others() would need updating.
0272      */
0273     setup_clear_cpu_cap(X86_FEATURE_PCID);
0274 
0275     if (!xen_initial_domain())
0276         setup_clear_cpu_cap(X86_FEATURE_ACPI);
0277 
0278     if (xen_check_mwait())
0279         setup_force_cpu_cap(X86_FEATURE_MWAIT);
0280     else
0281         setup_clear_cpu_cap(X86_FEATURE_MWAIT);
0282 
0283     if (!xen_check_xsave()) {
0284         setup_clear_cpu_cap(X86_FEATURE_XSAVE);
0285         setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
0286     }
0287 }
0288 
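     /*
      * Debug register accesses are privileged, so a PV guest has to go
      * through the hypervisor instead of touching the registers directly.
      */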
0289 static noinstr void xen_set_debugreg(int reg, unsigned long val)
0290 {
0291     HYPERVISOR_set_debugreg(reg, val);
0292 }
0293 
0294 static noinstr unsigned long xen_get_debugreg(int reg)
0295 {
0296     return HYPERVISOR_get_debugreg(reg);
0297 }
0298 
0299 static void xen_end_context_switch(struct task_struct *next)
0300 {
0301     xen_mc_flush();
0302     paravirt_end_context_switch(next);
0303 }
0304 
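     /*
      * Xen manages the TSS for PV guests, so there is no task register
      * selector worth reporting; just return 0.
      */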
0305 static unsigned long xen_store_tr(void)
0306 {
0307     return 0;
0308 }
0309 
0310 /*
0311  * Set the page permissions for a particular virtual address.  If the
0312  * address is a vmalloc mapping (or other non-linear mapping), then
0313  * find the linear mapping of the page and also set its protections to
0314  * match.
0315  */
0316 static void set_aliased_prot(void *v, pgprot_t prot)
0317 {
0318     int level;
0319     pte_t *ptep;
0320     pte_t pte;
0321     unsigned long pfn;
0322     unsigned char dummy;
0323     void *va;
0324 
0325     ptep = lookup_address((unsigned long)v, &level);
0326     BUG_ON(ptep == NULL);
0327 
0328     pfn = pte_pfn(*ptep);
0329     pte = pfn_pte(pfn, prot);
0330 
0331     /*
0332      * Careful: update_va_mapping() will fail if the virtual address
0333      * we're poking isn't populated in the page tables.  We don't
0334      * need to worry about the direct map (that's always in the page
0335      * tables), but we need to be careful about vmap space.  In
0336      * particular, the top level page table can lazily propagate
0337      * entries between processes, so if we've switched mms since we
0338      * vmapped the target in the first place, we might not have the
0339      * top-level page table entry populated.
0340      *
0341      * We disable preemption because we want the same mm active when
0342      * we probe the target and when we issue the hypercall.  We'll
0343      * have the same nominal mm, but if we're a kernel thread, lazy
0344      * mm dropping could change our pgd.
0345      *
0346      * Out of an abundance of caution, this uses __get_user() to fault
0347      * in the target address just in case there's some obscure case
0348      * in which the target address isn't readable.
0349      */
0350 
0351     preempt_disable();
0352 
0353     copy_from_kernel_nofault(&dummy, v, 1);
0354 
0355     if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
0356         BUG();
0357 
0358     va = __va(PFN_PHYS(pfn));
0359 
0360     if (va != v && HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
0361         BUG();
0362 
0363     preempt_enable();
0364 }
0365 
0366 static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
0367 {
0368     const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
0369     int i;
0370 
0371     /*
0372      * We need to mark all aliases of the LDT pages RO.  We
0373      * don't need to call vm_flush_aliases(), though, since that's
0374      * only responsible for flushing aliases out of the TLBs, not the
0375      * page tables, and Xen will flush the TLB for us if needed.
0376      *
0377      * To avoid confusing future readers: none of this is necessary
0378      * to load the LDT.  The hypervisor only checks this when the
0379      * LDT is faulted in due to subsequent descriptor access.
0380      */
0381 
0382     for (i = 0; i < entries; i += entries_per_page)
0383         set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
0384 }
0385 
0386 static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
0387 {
0388     const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
0389     int i;
0390 
0391     for (i = 0; i < entries; i += entries_per_page)
0392         set_aliased_prot(ldt + i, PAGE_KERNEL);
0393 }
0394 
0395 static void xen_set_ldt(const void *addr, unsigned entries)
0396 {
0397     struct mmuext_op *op;
0398     struct multicall_space mcs = xen_mc_entry(sizeof(*op));
0399 
0400     trace_xen_cpu_set_ldt(addr, entries);
0401 
0402     op = mcs.args;
0403     op->cmd = MMUEXT_SET_LDT;
0404     op->arg1.linear_addr = (unsigned long)addr;
0405     op->arg2.nr_ents = entries;
0406 
0407     MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
0408 
0409     xen_mc_issue(PARAVIRT_LAZY_CPU);
0410 }
0411 
0412 static void xen_load_gdt(const struct desc_ptr *dtr)
0413 {
0414     unsigned long va = dtr->address;
0415     unsigned int size = dtr->size + 1;
0416     unsigned long pfn, mfn;
0417     int level;
0418     pte_t *ptep;
0419     void *virt;
0420 
0421     /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
0422     BUG_ON(size > PAGE_SIZE);
0423     BUG_ON(va & ~PAGE_MASK);
0424 
0425     /*
0426      * The GDT is per-cpu and is in the percpu data area.
0427      * That can be virtually mapped, so we need to do a
0428      * page-walk to get the underlying MFN for the
0429      * hypercall.  The page can also be in the kernel's
0430      * linear range, so we need to make that mapping RO too.
0431      */
0432     ptep = lookup_address(va, &level);
0433     BUG_ON(ptep == NULL);
0434 
0435     pfn = pte_pfn(*ptep);
0436     mfn = pfn_to_mfn(pfn);
0437     virt = __va(PFN_PHYS(pfn));
0438 
0439     make_lowmem_page_readonly((void *)va);
0440     make_lowmem_page_readonly(virt);
0441 
0442     if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
0443         BUG();
0444 }
0445 
0446 /*
0447  * load_gdt for early boot, when the gdt is only mapped once
0448  */
0449 static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
0450 {
0451     unsigned long va = dtr->address;
0452     unsigned int size = dtr->size + 1;
0453     unsigned long pfn, mfn;
0454     pte_t pte;
0455 
0456     /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
0457     BUG_ON(size > PAGE_SIZE);
0458     BUG_ON(va & ~PAGE_MASK);
0459 
0460     pfn = virt_to_pfn(va);
0461     mfn = pfn_to_mfn(pfn);
0462 
0463     pte = pfn_pte(pfn, PAGE_KERNEL_RO);
0464 
0465     if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
0466         BUG();
0467 
0468     if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
0469         BUG();
0470 }
0471 
0472 static inline bool desc_equal(const struct desc_struct *d1,
0473                   const struct desc_struct *d2)
0474 {
0475     return !memcmp(d1, d2, sizeof(*d1));
0476 }
0477 
0478 static void load_TLS_descriptor(struct thread_struct *t,
0479                 unsigned int cpu, unsigned int i)
0480 {
0481     struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
0482     struct desc_struct *gdt;
0483     xmaddr_t maddr;
0484     struct multicall_space mc;
0485 
0486     if (desc_equal(shadow, &t->tls_array[i]))
0487         return;
0488 
0489     *shadow = t->tls_array[i];
0490 
0491     gdt = get_cpu_gdt_rw(cpu);
0492     maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
0493     mc = __xen_mc_entry(0);
0494 
0495     MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
0496 }
0497 
0498 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
0499 {
0500     /*
0501      * In lazy mode we need to zero %fs, otherwise we may get an
0502      * exception between the new %fs descriptor being loaded and
0503      * %fs being effectively cleared at __switch_to().
0504      */
0505     if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
0506         loadsegment(fs, 0);
0507 
0508     xen_mc_batch();
0509 
0510     load_TLS_descriptor(t, cpu, 0);
0511     load_TLS_descriptor(t, cpu, 1);
0512     load_TLS_descriptor(t, cpu, 2);
0513 
0514     xen_mc_issue(PARAVIRT_LAZY_CPU);
0515 }
0516 
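     /* The user %gs selector can only be changed via hypercall on Xen PV. */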
0517 static void xen_load_gs_index(unsigned int idx)
0518 {
0519     if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
0520         BUG();
0521 }
0522 
0523 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
0524                 const void *ptr)
0525 {
0526     xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
0527     u64 entry = *(u64 *)ptr;
0528 
0529     trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
0530 
0531     preempt_disable();
0532 
0533     xen_mc_flush();
0534     if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
0535         BUG();
0536 
0537     preempt_enable();
0538 }
0539 
0540 void noist_exc_debug(struct pt_regs *regs);
0541 
0542 DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
0543 {
0544     /* On Xen PV, NMI doesn't use IST.  The C part is the same as native. */
0545     exc_nmi(regs);
0546 }
0547 
0548 DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
0549 {
0550     /* On Xen PV, DF doesn't use IST.  The C part is the same as native. */
0551     exc_double_fault(regs, error_code);
0552 }
0553 
0554 DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
0555 {
0556     /*
0557      * There's no IST on Xen PV, but we still need to dispatch
0558      * to the correct handler.
0559      */
0560     if (user_mode(regs))
0561         noist_exc_debug(regs);
0562     else
0563         exc_debug(regs);
0564 }
0565 
0566 DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
0567 {
0568     /* This should never happen and there is no way to handle it. */
0569     instrumentation_begin();
0570     pr_err("Unknown trap in Xen PV mode.");
0571     BUG();
0572     instrumentation_end();
0573 }
0574 
0575 #ifdef CONFIG_X86_MCE
0576 DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
0577 {
0578     /*
0579      * There's no IST on Xen PV, but we still need to dispatch
0580      * to the correct handler.
0581      */
0582     if (user_mode(regs))
0583         noist_exc_machine_check(regs);
0584     else
0585         exc_machine_check(regs);
0586 }
0587 #endif
0588 
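     /*
      * Each entry maps a native asm exception entry point to its Xen PV
      * replacement; ist_okay marks handlers that legitimately use an IST
      * stack natively.
      */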
0589 struct trap_array_entry {
0590     void (*orig)(void);
0591     void (*xen)(void);
0592     bool ist_okay;
0593 };
0594 
0595 #define TRAP_ENTRY(func, ist_ok) {          \
0596     .orig       = asm_##func,           \
0597     .xen        = xen_asm_##func,       \
0598     .ist_okay   = ist_ok }
0599 
0600 #define TRAP_ENTRY_REDIR(func, ist_ok) {        \
0601     .orig       = asm_##func,           \
0602     .xen        = xen_asm_xenpv_##func,     \
0603     .ist_okay   = ist_ok }
0604 
0605 static struct trap_array_entry trap_array[] = {
0606     TRAP_ENTRY_REDIR(exc_debug,         true  ),
0607     TRAP_ENTRY_REDIR(exc_double_fault,      true  ),
0608 #ifdef CONFIG_X86_MCE
0609     TRAP_ENTRY_REDIR(exc_machine_check,     true  ),
0610 #endif
0611     TRAP_ENTRY_REDIR(exc_nmi,           true  ),
0612     TRAP_ENTRY(exc_int3,                false ),
0613     TRAP_ENTRY(exc_overflow,            false ),
0614 #ifdef CONFIG_IA32_EMULATION
0615     { entry_INT80_compat,          xen_entry_INT80_compat,          false },
0616 #endif
0617     TRAP_ENTRY(exc_page_fault,          false ),
0618     TRAP_ENTRY(exc_divide_error,            false ),
0619     TRAP_ENTRY(exc_bounds,              false ),
0620     TRAP_ENTRY(exc_invalid_op,          false ),
0621     TRAP_ENTRY(exc_device_not_available,        false ),
0622     TRAP_ENTRY(exc_coproc_segment_overrun,      false ),
0623     TRAP_ENTRY(exc_invalid_tss,         false ),
0624     TRAP_ENTRY(exc_segment_not_present,     false ),
0625     TRAP_ENTRY(exc_stack_segment,           false ),
0626     TRAP_ENTRY(exc_general_protection,      false ),
0627     TRAP_ENTRY(exc_spurious_interrupt_bug,      false ),
0628     TRAP_ENTRY(exc_coprocessor_error,       false ),
0629     TRAP_ENTRY(exc_alignment_check,         false ),
0630     TRAP_ENTRY(exc_simd_coprocessor_error,      false ),
0631 #ifdef CONFIG_X86_KERNEL_IBT
0632     TRAP_ENTRY(exc_control_protection,      false ),
0633 #endif
0634 };
0635 
0636 static bool __ref get_trap_addr(void **addr, unsigned int ist)
0637 {
0638     unsigned int nr;
0639     bool ist_okay = false;
0640     bool found = false;
0641 
0642     /*
0643      * Replace trap handler addresses by Xen specific ones.
0644      * Check for known traps using IST and whitelist them.
0645      * The debugger ones are the only ones we care about.
0646      * Xen will handle faults like double_fault, so we should never see
0647      * them.  Warn if there's an unexpected IST-using fault handler.
0648      */
0649     for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
0650         struct trap_array_entry *entry = trap_array + nr;
0651 
0652         if (*addr == entry->orig) {
0653             *addr = entry->xen;
0654             ist_okay = entry->ist_okay;
0655             found = true;
0656             break;
0657         }
0658     }
0659 
0660     if (nr == ARRAY_SIZE(trap_array) &&
0661         *addr >= (void *)early_idt_handler_array[0] &&
0662         *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) {
0663         nr = (*addr - (void *)early_idt_handler_array[0]) /
0664              EARLY_IDT_HANDLER_SIZE;
0665         *addr = (void *)xen_early_idt_handler_array[nr];
0666         found = true;
0667     }
0668 
0669     if (!found)
0670         *addr = (void *)xen_asm_exc_xen_unknown_trap;
0671 
0672     if (WARN_ON(found && ist != 0 && !ist_okay))
0673         return false;
0674 
0675     return true;
0676 }
0677 
0678 static int cvt_gate_to_trap(int vector, const gate_desc *val,
0679                 struct trap_info *info)
0680 {
0681     unsigned long addr;
0682 
0683     if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
0684         return 0;
0685 
0686     info->vector = vector;
0687 
0688     addr = gate_offset(val);
0689     if (!get_trap_addr((void **)&addr, val->bits.ist))
0690         return 0;
0691     info->address = addr;
0692 
0693     info->cs = gate_segment(val);
0694     info->flags = val->bits.dpl;
0695     /* interrupt gates clear IF */
0696     if (val->bits.type == GATE_INTERRUPT)
0697         info->flags |= 1 << 2;
0698 
0699     return 1;
0700 }
0701 
0702 /* Locations of each CPU's IDT */
0703 static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
0704 
0705 /* Set an IDT entry.  If the entry is part of the current IDT, then
0706    also update Xen. */
0707 static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
0708 {
0709     unsigned long p = (unsigned long)&dt[entrynum];
0710     unsigned long start, end;
0711 
0712     trace_xen_cpu_write_idt_entry(dt, entrynum, g);
0713 
0714     preempt_disable();
0715 
0716     start = __this_cpu_read(idt_desc.address);
0717     end = start + __this_cpu_read(idt_desc.size) + 1;
0718 
0719     xen_mc_flush();
0720 
0721     native_write_idt_entry(dt, entrynum, g);
0722 
0723     if (p >= start && (p + 8) <= end) {
0724         struct trap_info info[2];
0725 
0726         info[1].address = 0;
0727 
0728         if (cvt_gate_to_trap(entrynum, g, &info[0]))
0729             if (HYPERVISOR_set_trap_table(info))
0730                 BUG();
0731     }
0732 
0733     preempt_enable();
0734 }
0735 
0736 static unsigned xen_convert_trap_info(const struct desc_ptr *desc,
0737                       struct trap_info *traps, bool full)
0738 {
0739     unsigned in, out, count;
0740 
0741     count = (desc->size+1) / sizeof(gate_desc);
0742     BUG_ON(count > 256);
0743 
0744     for (in = out = 0; in < count; in++) {
0745         gate_desc *entry = (gate_desc *)(desc->address) + in;
0746 
0747         if (cvt_gate_to_trap(in, entry, &traps[out]) || full)
0748             out++;
0749     }
0750 
0751     return out;
0752 }
0753 
0754 void xen_copy_trap_info(struct trap_info *traps)
0755 {
0756     const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
0757 
0758     xen_convert_trap_info(desc, traps, true);
0759 }
0760 
0761 /* Load a new IDT into Xen.  In principle this can be per-CPU, so we
0762    hold a spinlock to protect the static traps[] array (static because
0763    it avoids allocation, and saves stack space). */
0764 static void xen_load_idt(const struct desc_ptr *desc)
0765 {
0766     static DEFINE_SPINLOCK(lock);
0767     static struct trap_info traps[257];
0768     unsigned out;
0769 
0770     trace_xen_cpu_load_idt(desc);
0771 
0772     spin_lock(&lock);
0773 
0774     memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
0775 
0776     out = xen_convert_trap_info(desc, traps, false);
0777     memset(&traps[out], 0, sizeof(traps[0]));
0778 
0779     xen_mc_flush();
0780     if (HYPERVISOR_set_trap_table(traps))
0781         BUG();
0782 
0783     spin_unlock(&lock);
0784 }
0785 
0786 /* Write a GDT descriptor entry.  Ignore LDT descriptors, since
0787    they're handled differently. */
0788 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
0789                 const void *desc, int type)
0790 {
0791     trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
0792 
0793     preempt_disable();
0794 
0795     switch (type) {
0796     case DESC_LDT:
0797     case DESC_TSS:
0798         /* ignore */
0799         break;
0800 
0801     default: {
0802         xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
0803 
0804         xen_mc_flush();
0805         if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
0806             BUG();
0807     }
0808 
0809     }
0810 
0811     preempt_enable();
0812 }
0813 
0814 /*
0815  * Version of write_gdt_entry for use at early boot time, when we need to
0816  * update an entry as simply as possible.
0817  */
0818 static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
0819                         const void *desc, int type)
0820 {
0821     trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
0822 
0823     switch (type) {
0824     case DESC_LDT:
0825     case DESC_TSS:
0826         /* ignore */
0827         break;
0828 
0829     default: {
0830         xmaddr_t maddr = virt_to_machine(&dt[entry]);
0831 
0832         if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
0833             dt[entry] = *(struct desc_struct *)desc;
0834     }
0835 
0836     }
0837 }
0838 
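     /*
      * Tell the hypervisor about the new kernel stack pointer and mirror it
      * into the local copy of the TSS.
      */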
0839 static void xen_load_sp0(unsigned long sp0)
0840 {
0841     struct multicall_space mcs;
0842 
0843     mcs = xen_mc_entry(0);
0844     MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
0845     xen_mc_issue(PARAVIRT_LAZY_CPU);
0846     this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
0847 }
0848 
0849 #ifdef CONFIG_X86_IOPL_IOPERM
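     /*
      * Keep the hypervisor's copy of the I/O bitmap in sync with the native
      * TSS state; Xen does the actual I/O permission checks for PV guests.
      */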
0850 static void xen_invalidate_io_bitmap(void)
0851 {
0852     struct physdev_set_iobitmap iobitmap = {
0853         .bitmap = NULL,
0854         .nr_ports = 0,
0855     };
0856 
0857     native_tss_invalidate_io_bitmap();
0858     HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
0859 }
0860 
0861 static void xen_update_io_bitmap(void)
0862 {
0863     struct physdev_set_iobitmap iobitmap;
0864     struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
0865 
0866     native_tss_update_io_bitmap();
0867 
0868     iobitmap.bitmap = (uint8_t *)(&tss->x86_tss) +
0869               tss->x86_tss.io_bitmap_base;
0870     if (tss->x86_tss.io_bitmap_base == IO_BITMAP_OFFSET_INVALID)
0871         iobitmap.nr_ports = 0;
0872     else
0873         iobitmap.nr_ports = IO_BITMAP_BITS;
0874 
0875     HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
0876 }
0877 #endif
0878 
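     /*
      * The traditional port 0x80 delay is pointless for a PV guest, so
      * io_delay is simply a no-op.
      */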
0879 static void xen_io_delay(void)
0880 {
0881 }
0882 
0883 static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
0884 
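     /*
      * Reading %cr0 traps to the hypervisor, so cache the last value written
      * per CPU and only do a real read when the cache is still empty.
      */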
0885 static unsigned long xen_read_cr0(void)
0886 {
0887     unsigned long cr0 = this_cpu_read(xen_cr0_value);
0888 
0889     if (unlikely(cr0 == 0)) {
0890         cr0 = native_read_cr0();
0891         this_cpu_write(xen_cr0_value, cr0);
0892     }
0893 
0894     return cr0;
0895 }
0896 
0897 static void xen_write_cr0(unsigned long cr0)
0898 {
0899     struct multicall_space mcs;
0900 
0901     this_cpu_write(xen_cr0_value, cr0);
0902 
0903     /* Only pay attention to cr0.TS; everything else is
0904        ignored. */
0905     mcs = xen_mc_entry(0);
0906 
0907     MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
0908 
0909     xen_mc_issue(PARAVIRT_LAZY_CPU);
0910 }
0911 
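     /*
      * PGE, PSE and PCE are managed by the hypervisor for PV guests, so
      * mask them out before handing the rest of CR4 to the native write.
      */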
0912 static void xen_write_cr4(unsigned long cr4)
0913 {
0914     cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
0915 
0916     native_write_cr4(cr4);
0917 }
0918 
0919 static u64 xen_read_msr_safe(unsigned int msr, int *err)
0920 {
0921     u64 val;
0922 
0923     if (pmu_msr_read(msr, &val, err))
0924         return val;
0925 
0926     val = native_read_msr_safe(msr, err);
0927     switch (msr) {
0928     case MSR_IA32_APICBASE:
0929         val &= ~X2APIC_ENABLE;
0930         break;
0931     }
0932     return val;
0933 }
0934 
0935 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
0936 {
0937     int ret;
0938     unsigned int which;
0939     u64 base;
0940 
0941     ret = 0;
0942 
0943     switch (msr) {
0944     case MSR_FS_BASE:       which = SEGBASE_FS; goto set;
0945     case MSR_KERNEL_GS_BASE:    which = SEGBASE_GS_USER; goto set;
0946     case MSR_GS_BASE:       which = SEGBASE_GS_KERNEL; goto set;
0947 
0948     set:
0949         base = ((u64)high << 32) | low;
0950         if (HYPERVISOR_set_segment_base(which, base) != 0)
0951             ret = -EIO;
0952         break;
0953 
0954     case MSR_STAR:
0955     case MSR_CSTAR:
0956     case MSR_LSTAR:
0957     case MSR_SYSCALL_MASK:
0958     case MSR_IA32_SYSENTER_CS:
0959     case MSR_IA32_SYSENTER_ESP:
0960     case MSR_IA32_SYSENTER_EIP:
0961         /* Fast syscall setup is all done in hypercalls, so
0962            these are all ignored.  Stub them out here to stop
0963            Xen console noise. */
0964         break;
0965 
0966     default:
0967         if (!pmu_msr_write(msr, low, high, &ret))
0968             ret = native_write_msr_safe(msr, low, high);
0969     }
0970 
0971     return ret;
0972 }
0973 
0974 static u64 xen_read_msr(unsigned int msr)
0975 {
0976     /*
0977      * This will silently swallow a #GP from RDMSR.  It may be worth
0978      * changing that.
0979      */
0980     int err;
0981 
0982     return xen_read_msr_safe(msr, &err);
0983 }
0984 
0985 static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
0986 {
0987     /*
0988      * This will silently swallow a #GP from WRMSR.  It may be worth
0989      * changing that.
0990      */
0991     xen_write_msr_safe(msr, low, high);
0992 }
0993 
0994 /* This is called once we have the cpu_possible_mask */
0995 void __init xen_setup_vcpu_info_placement(void)
0996 {
0997     int cpu;
0998 
0999     for_each_possible_cpu(cpu) {
1000         /* Set up direct vCPU id mapping for PV guests. */
1001         per_cpu(xen_vcpu_id, cpu) = cpu;
1002         xen_vcpu_setup(cpu);
1003     }
1004 
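     /*
      * With vcpu_info placed in per-cpu memory, the faster *_direct variants
      * that access it without a hypercall can be installed.
      */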
1005     pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1006     pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1007     pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1008     pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
1009 }
1010 
1011 static const struct pv_info xen_info __initconst = {
1012     .extra_user_64bit_cs = FLAT_USER_CS64,
1013     .name = "Xen",
1014 };
1015 
1016 static const typeof(pv_ops) xen_cpu_ops __initconst = {
1017     .cpu = {
1018         .cpuid = xen_cpuid,
1019 
1020         .set_debugreg = xen_set_debugreg,
1021         .get_debugreg = xen_get_debugreg,
1022 
1023         .read_cr0 = xen_read_cr0,
1024         .write_cr0 = xen_write_cr0,
1025 
1026         .write_cr4 = xen_write_cr4,
1027 
1028         .wbinvd = native_wbinvd,
1029 
1030         .read_msr = xen_read_msr,
1031         .write_msr = xen_write_msr,
1032 
1033         .read_msr_safe = xen_read_msr_safe,
1034         .write_msr_safe = xen_write_msr_safe,
1035 
1036         .read_pmc = xen_read_pmc,
1037 
1038         .load_tr_desc = paravirt_nop,
1039         .set_ldt = xen_set_ldt,
1040         .load_gdt = xen_load_gdt,
1041         .load_idt = xen_load_idt,
1042         .load_tls = xen_load_tls,
1043         .load_gs_index = xen_load_gs_index,
1044 
1045         .alloc_ldt = xen_alloc_ldt,
1046         .free_ldt = xen_free_ldt,
1047 
1048         .store_tr = xen_store_tr,
1049 
1050         .write_ldt_entry = xen_write_ldt_entry,
1051         .write_gdt_entry = xen_write_gdt_entry,
1052         .write_idt_entry = xen_write_idt_entry,
1053         .load_sp0 = xen_load_sp0,
1054 
1055 #ifdef CONFIG_X86_IOPL_IOPERM
1056         .invalidate_io_bitmap = xen_invalidate_io_bitmap,
1057         .update_io_bitmap = xen_update_io_bitmap,
1058 #endif
1059         .io_delay = xen_io_delay,
1060 
1061         .start_context_switch = paravirt_start_context_switch,
1062         .end_context_switch = xen_end_context_switch,
1063     },
1064 };
1065 
1066 static void xen_restart(char *msg)
1067 {
1068     xen_reboot(SHUTDOWN_reboot);
1069 }
1070 
1071 static void xen_machine_halt(void)
1072 {
1073     xen_reboot(SHUTDOWN_poweroff);
1074 }
1075 
1076 static void xen_machine_power_off(void)
1077 {
1078     do_kernel_power_off();
1079     xen_reboot(SHUTDOWN_poweroff);
1080 }
1081 
1082 static void xen_crash_shutdown(struct pt_regs *regs)
1083 {
1084     xen_reboot(SHUTDOWN_crash);
1085 }
1086 
1087 static const struct machine_ops xen_machine_ops __initconst = {
1088     .restart = xen_restart,
1089     .halt = xen_machine_halt,
1090     .power_off = xen_machine_power_off,
1091     .shutdown = xen_machine_halt,
1092     .crash_shutdown = xen_crash_shutdown,
1093     .emergency_restart = xen_emergency_restart,
1094 };
1095 
1096 static unsigned char xen_get_nmi_reason(void)
1097 {
1098     unsigned char reason = 0;
1099 
1100     /* Construct a value which looks like it came from port 0x61. */
1101     if (test_bit(_XEN_NMIREASON_io_error,
1102              &HYPERVISOR_shared_info->arch.nmi_reason))
1103         reason |= NMI_REASON_IOCHK;
1104     if (test_bit(_XEN_NMIREASON_pci_serr,
1105              &HYPERVISOR_shared_info->arch.nmi_reason))
1106         reason |= NMI_REASON_SERR;
1107 
1108     return reason;
1109 }
1110 
1111 static void __init xen_boot_params_init_edd(void)
1112 {
1113 #if IS_ENABLED(CONFIG_EDD)
1114     struct xen_platform_op op;
1115     struct edd_info *edd_info;
1116     u32 *mbr_signature;
1117     unsigned nr;
1118     int ret;
1119 
1120     edd_info = boot_params.eddbuf;
1121     mbr_signature = boot_params.edd_mbr_sig_buffer;
1122 
1123     op.cmd = XENPF_firmware_info;
1124 
1125     op.u.firmware_info.type = XEN_FW_DISK_INFO;
1126     for (nr = 0; nr < EDDMAXNR; nr++) {
1127         struct edd_info *info = edd_info + nr;
1128 
1129         op.u.firmware_info.index = nr;
1130         info->params.length = sizeof(info->params);
1131         set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
1132                      &info->params);
1133         ret = HYPERVISOR_platform_op(&op);
1134         if (ret)
1135             break;
1136 
1137 #define C(x) info->x = op.u.firmware_info.u.disk_info.x
1138         C(device);
1139         C(version);
1140         C(interface_support);
1141         C(legacy_max_cylinder);
1142         C(legacy_max_head);
1143         C(legacy_sectors_per_track);
1144 #undef C
1145     }
1146     boot_params.eddbuf_entries = nr;
1147 
1148     op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
1149     for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
1150         op.u.firmware_info.index = nr;
1151         ret = HYPERVISOR_platform_op(&op);
1152         if (ret)
1153             break;
1154         mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
1155     }
1156     boot_params.edd_mbr_sig_buf_entries = nr;
1157 #endif
1158 }
1159 
1160 /*
1161  * Set up the GDT and segment registers for -fstack-protector.  Until
1162  * we do this, we have to be careful not to call any stack-protected
1163  * function, which is most of the kernel.
1164  */
1165 static void __init xen_setup_gdt(int cpu)
1166 {
1167     pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
1168     pv_ops.cpu.load_gdt = xen_load_gdt_boot;
1169 
1170     switch_to_new_gdt(cpu);
1171 
1172     pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
1173     pv_ops.cpu.load_gdt = xen_load_gdt;
1174 }
1175 
1176 static void __init xen_dom0_set_legacy_features(void)
1177 {
1178     x86_platform.legacy.rtc = 1;
1179 }
1180 
1181 static void __init xen_domu_set_legacy_features(void)
1182 {
1183     x86_platform.legacy.rtc = 0;
1184 }
1185 
1186 extern void early_xen_iret_patch(void);
1187 
1188 /* First C function to be called on Xen boot */
1189 asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
1190 {
1191     struct physdev_set_iopl set_iopl;
1192     unsigned long initrd_start = 0;
1193     int rc;
1194 
1195     if (!si)
1196         return;
1197 
1198     clear_bss();
1199 
1200     xen_start_info = si;
1201 
1202     __text_gen_insn(&early_xen_iret_patch,
1203             JMP32_INSN_OPCODE, &early_xen_iret_patch, &xen_iret,
1204             JMP32_INSN_SIZE);
1205 
1206     xen_domain_type = XEN_PV_DOMAIN;
1207     xen_start_flags = xen_start_info->flags;
1208 
1209     xen_setup_features();
1210 
1211     /* Install Xen paravirt ops */
1212     pv_info = xen_info;
1213     pv_ops.cpu = xen_cpu_ops.cpu;
1214     xen_init_irq_ops();
1215 
1216     /*
1217      * Setup xen_vcpu early because it is needed for
1218      * local_irq_disable(), irqs_disabled(), e.g. in printk().
1219      *
1220      * Don't do the full vcpu_info placement stuff until we have
1221      * the cpu_possible_mask and a non-dummy shared_info.
1222      */
1223     xen_vcpu_info_reset(0);
1224 
1225     x86_platform.get_nmi_reason = xen_get_nmi_reason;
1226 
1227     x86_init.resources.memory_setup = xen_memory_setup;
1228     x86_init.irqs.intr_mode_select  = x86_init_noop;
1229     x86_init.irqs.intr_mode_init    = x86_init_noop;
1230     x86_init.oem.arch_setup = xen_arch_setup;
1231     x86_init.oem.banner = xen_banner;
1232     x86_init.hyper.init_platform = xen_pv_init_platform;
1233     x86_init.hyper.guest_late_init = xen_pv_guest_late_init;
1234 
1235     /*
1236      * Set up some pagetable state before starting to set any ptes.
1237      */
1238 
1239     xen_setup_machphys_mapping();
1240     xen_init_mmu_ops();
1241 
1242     /* Prevent unwanted bits from being set in PTEs. */
1243     __supported_pte_mask &= ~_PAGE_GLOBAL;
1244     __default_kernel_pte_mask &= ~_PAGE_GLOBAL;
1245 
1246     /* Get mfn list */
1247     xen_build_dynamic_phys_to_machine();
1248 
1249     /* Work out if we support NX */
1250     get_cpu_cap(&boot_cpu_data);
1251     x86_configure_nx();
1252 
1253     /*
1254      * Set up kernel GDT and segment registers, mainly so that
1255      * -fstack-protector code can be executed.
1256      */
1257     xen_setup_gdt(0);
1258 
1259     /* Determine virtual and physical address sizes */
1260     get_cpu_address_sizes(&boot_cpu_data);
1261 
1262     /* Let's presume PV guests always boot on vCPU with id 0. */
1263     per_cpu(xen_vcpu_id, 0) = 0;
1264 
1265     idt_setup_early_handler();
1266 
1267     xen_init_capabilities();
1268 
1269 #ifdef CONFIG_X86_LOCAL_APIC
1270     /*
1271      * set up the basic apic ops.
1272      */
1273     xen_init_apic();
1274 #endif
1275 
1276     machine_ops = xen_machine_ops;
1277 
1278     /*
1279      * The only reliable way to retain the initial address of the
1280      * percpu gdt_page is to remember it here, so we can go and
1281      * mark it RW later, when the initial percpu area is freed.
1282      */
1283     xen_initial_gdt = &per_cpu(gdt_page, 0);
1284 
1285     xen_smp_init();
1286 
1287 #ifdef CONFIG_ACPI_NUMA
1288     /*
1289      * The pages we get from Xen are not related to machine pages, so
1290      * any NUMA information the kernel tries to get from ACPI will
1291      * be meaningless.  Prevent it from trying.
1292      */
1293     disable_srat();
1294 #endif
1295     WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
1296 
1297     local_irq_disable();
1298     early_boot_irqs_disabled = true;
1299 
1300     xen_raw_console_write("mapping kernel into physical memory\n");
1301     xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
1302                    xen_start_info->nr_pages);
1303     xen_reserve_special_pages();
1304 
1305     /*
1306      * We used to do this in xen_arch_setup, but that is too late
1307      * on AMD where early_cpu_init (run before ->arch_setup()) calls
1308      * early_amd_init which pokes the 0xcf8 port.
1309      */
1310     set_iopl.iopl = 1;
1311     rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1312     if (rc != 0)
1313         xen_raw_printk("physdev_op failed %d\n", rc);
1314 
1315 
1316     if (xen_start_info->mod_start) {
1317         if (xen_start_info->flags & SIF_MOD_START_PFN)
1318             initrd_start = PFN_PHYS(xen_start_info->mod_start);
1319         else
1320             initrd_start = __pa(xen_start_info->mod_start);
1321     }
1322 
1323     /* Poke various useful things into boot_params */
1324     boot_params.hdr.type_of_loader = (9 << 4) | 0;
1325     boot_params.hdr.ramdisk_image = initrd_start;
1326     boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1327     boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1328     boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
1329 
1330     if (!xen_initial_domain()) {
1331         if (pci_xen)
1332             x86_init.pci.arch_init = pci_xen_init;
1333         x86_platform.set_legacy_features =
1334                 xen_domu_set_legacy_features;
1335     } else {
1336         const struct dom0_vga_console_info *info =
1337             (void *)((char *)xen_start_info +
1338                  xen_start_info->console.dom0.info_off);
1339         struct xen_platform_op op = {
1340             .cmd = XENPF_firmware_info,
1341             .interface_version = XENPF_INTERFACE_VERSION,
1342             .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
1343         };
1344 
1345         x86_platform.set_legacy_features =
1346                 xen_dom0_set_legacy_features;
1347         xen_init_vga(info, xen_start_info->console.dom0.info_size);
1348         xen_start_info->console.domU.mfn = 0;
1349         xen_start_info->console.domU.evtchn = 0;
1350 
1351         if (HYPERVISOR_platform_op(&op) == 0)
1352             boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1353 
1354         /* Make sure ACS will be enabled */
1355         pci_request_acs();
1356 
1357         xen_acpi_sleep_register();
1358 
1359         xen_boot_params_init_edd();
1360 
1361 #ifdef CONFIG_ACPI
1362         /*
1363          * Disable selecting "Firmware First mode" for correctable
1364          * memory errors, as deciding this is up to the
1365          * hypervisor.
1366          */
1367         acpi_disable_cmcff = 1;
1368 #endif
1369     }
1370 
1371     xen_add_preferred_consoles();
1372 
1373 #ifdef CONFIG_PCI
1374     /* PCI BIOS service won't work from a PV guest. */
1375     pci_probe &= ~PCI_PROBE_BIOS;
1376 #endif
1377     xen_raw_console_write("about to get started...\n");
1378 
1379     /* We need this for printk timestamps */
1380     xen_setup_runstate_info(0);
1381 
1382     xen_efi_init(&boot_params);
1383 
1384     /* Start the world */
1385     cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
1386     x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1387 }
1388 
1389 static int xen_cpu_up_prepare_pv(unsigned int cpu)
1390 {
1391     int rc;
1392 
1393     if (per_cpu(xen_vcpu, cpu) == NULL)
1394         return -ENODEV;
1395 
1396     xen_setup_timer(cpu);
1397 
1398     rc = xen_smp_intr_init(cpu);
1399     if (rc) {
1400         WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
1401              cpu, rc);
1402         return rc;
1403     }
1404 
1405     rc = xen_smp_intr_init_pv(cpu);
1406     if (rc) {
1407         WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
1408              cpu, rc);
1409         return rc;
1410     }
1411 
1412     return 0;
1413 }
1414 
1415 static int xen_cpu_dead_pv(unsigned int cpu)
1416 {
1417     xen_smp_intr_free(cpu);
1418     xen_smp_intr_free_pv(cpu);
1419 
1420     xen_teardown_timer(cpu);
1421 
1422     return 0;
1423 }
1424 
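     /*
      * Hypervisor detection hook: report the Xen CPUID base only when
      * actually running as a PV guest.
      */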
1425 static uint32_t __init xen_platform_pv(void)
1426 {
1427     if (xen_pv_domain())
1428         return xen_cpuid_base();
1429 
1430     return 0;
1431 }
1432 
1433 const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
1434     .name                   = "Xen PV",
1435     .detect                 = xen_platform_pv,
1436     .type           = X86_HYPER_XEN_PV,
1437     .runtime.pin_vcpu       = xen_pin_vcpu,
1438     .ignore_nopv        = true,
1439 };