// SPDX-License-Identifier: GPL-2.0-only
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#define pr_fmt(fmt) "kexec: " fmt

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
#include <linux/cc_platform.h>

#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>

#ifdef CONFIG_ACPI
/*
 * Used while adding mappings for ACPI tables.
 * Can be reused when other iomem regions need to be mapped.
 */
struct init_pgtable_data {
    struct x86_mapping_info *info;
    pgd_t *level4p;
};

static int mem_region_callback(struct resource *res, void *arg)
{
    struct init_pgtable_data *data = arg;
    unsigned long mstart, mend;

    mstart = res->start;
    mend = mstart + resource_size(res) - 1;

    return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
}

static int
map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
    struct init_pgtable_data data;
    unsigned long flags;
    int ret;

    data.info = info;
    data.level4p = level4p;
    flags = IORESOURCE_MEM | IORESOURCE_BUSY;

    ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
                  &data, mem_region_callback);
    if (ret && ret != -EINVAL)
        return ret;

    /* ACPI tables could be located in the ACPI Non-volatile Storage region */
    ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
                  &data, mem_region_callback);
    if (ret && ret != -EINVAL)
        return ret;

    return 0;
}
#else
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif

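/* bzImage64 is the only kexec_file_load() image loader wired up on x86_64. */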
#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
        &kexec_bzImage64_ops,
        NULL
};
#endif

static int
map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
    unsigned long mstart, mend;

    if (!efi_enabled(EFI_BOOT))
        return 0;

    mstart = (boot_params.efi_info.efi_systab |
            ((u64)boot_params.efi_info.efi_systab_hi<<32));

    if (efi_enabled(EFI_64BIT))
        mend = mstart + sizeof(efi_system_table_64_t);
    else
        mend = mstart + sizeof(efi_system_table_32_t);

    if (!mstart)
        return 0;

    return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
    return 0;
}

static void free_transition_pgtable(struct kimage *image)
{
    free_page((unsigned long)image->arch.p4d);
    image->arch.p4d = NULL;
    free_page((unsigned long)image->arch.pud);
    image->arch.pud = NULL;
    free_page((unsigned long)image->arch.pmd);
    image->arch.pmd = NULL;
    free_page((unsigned long)image->arch.pte);
    image->arch.pte = NULL;
}

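/*
 * The "transition" page table maps the kernel virtual address of
 * relocate_kernel() to the physical address of the control page copy,
 * so execution can continue at the same virtual address after the
 * switch to the identity-mapped page tables.
 */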
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
    pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
    unsigned long vaddr, paddr;
    int result = -ENOMEM;
    p4d_t *p4d;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    vaddr = (unsigned long)relocate_kernel;
    paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
    pgd += pgd_index(vaddr);
    if (!pgd_present(*pgd)) {
        p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
        if (!p4d)
            goto err;
        image->arch.p4d = p4d;
        set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
    }
    p4d = p4d_offset(pgd, vaddr);
    if (!p4d_present(*p4d)) {
        pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
        if (!pud)
            goto err;
        image->arch.pud = pud;
        set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
    }
    pud = pud_offset(p4d, vaddr);
    if (!pud_present(*pud)) {
        pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
        if (!pmd)
            goto err;
        image->arch.pmd = pmd;
        set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
    }
    pmd = pmd_offset(pud, vaddr);
    if (!pmd_present(*pmd)) {
        pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
        if (!pte)
            goto err;
        image->arch.pte = pte;
        set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
    }
    pte = pte_offset_kernel(pmd, vaddr);

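    /*
     * In a confidential (SEV) guest all memory is accessed through the
     * encrypted mapping, so the control page must be mapped encrypted;
     * the _NOENC variant is only wanted for bare-metal SME.
     */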
    if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
        prot = PAGE_KERNEL_EXEC;

    set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
    return 0;
err:
    return result;
}

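/*
 * Page-table pages for the identity map come from the image's control
 * pages, which are allocated so they do not overlap any destination
 * segment and therefore stay intact while the image is copied into place.
 */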
static void *alloc_pgt_page(void *data)
{
    struct kimage *image = (struct kimage *)data;
    struct page *page;
    void *p = NULL;

    page = kimage_alloc_control_pages(image, 0);
    if (page) {
        p = page_address(page);
        clear_page(p);
    }

    return p;
}

static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
    struct x86_mapping_info info = {
        .alloc_pgt_page = alloc_pgt_page,
        .context    = image,
        .page_flag  = __PAGE_KERNEL_LARGE_EXEC,
        .kernpg_flag    = _KERNPG_TABLE_NOENC,
    };
    unsigned long mstart, mend;
    pgd_t *level4p;
    int result;
    int i;

    level4p = (pgd_t *)__va(start_pgtable);
    clear_page(level4p);

    if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
        info.page_flag   |= _PAGE_ENC;
        info.kernpg_flag |= _PAGE_ENC;
    }

    if (direct_gbpages)
        info.direct_gbpages = true;

    for (i = 0; i < nr_pfn_mapped; i++) {
        mstart = pfn_mapped[i].start << PAGE_SHIFT;
        mend   = pfn_mapped[i].end << PAGE_SHIFT;

        result = kernel_ident_mapping_init(&info,
                         level4p, mstart, mend);
        if (result)
            return result;
    }

    /*
     * A segment's memory range can lie outside 0..max_pfn, for example
     * when jumping back to the original kernel from a kexec'ed kernel,
     * or when the first kernel was booted with a user-supplied memory
     * map and the second kernel is loaded outside that range.
     */
    for (i = 0; i < image->nr_segments; i++) {
        mstart = image->segment[i].mem;
        mend   = mstart + image->segment[i].memsz;

        result = kernel_ident_mapping_init(&info,
                         level4p, mstart, mend);

        if (result)
            return result;
    }

    /*
     * Prepare the EFI systab and ACPI tables for the kexec kernel,
     * since they are not covered by pfn_mapped.
     */
    result = map_efi_systab(&info, level4p);
    if (result)
        return result;

    result = map_acpi_tables(&info, level4p);
    if (result)
        return result;

    return init_transition_pgtable(image, level4p);
}

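/*
 * Force the data segment registers to __KERNEL_DS so their hidden
 * descriptor caches hold sane values before the GDT is invalidated.
 */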
static void load_segments(void)
{
    __asm__ __volatile__ (
        "\tmovl %0,%%ds\n"
        "\tmovl %0,%%es\n"
        "\tmovl %0,%%ss\n"
        "\tmovl %0,%%fs\n"
        "\tmovl %0,%%gs\n"
        : : "a" (__KERNEL_DS) : "memory"
        );
}

int machine_kexec_prepare(struct kimage *image)
{
    unsigned long start_pgtable;
    int result;

    /* Calculate the offsets */
    start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

    /* Set up the identity-mapped 64-bit page table */
    result = init_pgtable(image, start_pgtable);
    if (result)
        return result;

    return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
    free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
    unsigned long page_list[PAGES_NR];
    void *control_page;
    int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
    if (image->preserve_context)
        save_processor_state();
#endif

    save_ftrace_enabled = __ftrace_enabled_save();

    /* Interrupts aren't acceptable while we reboot */
    local_irq_disable();
    hw_breakpoint_disable();
    cet_disable();

    if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
        /*
         * We need to put the APICs in legacy mode so that we can get
         * timer interrupts in the second kernel. The kexec/kdump paths
         * already have calls to restore_boot_irq_mode() in one form or
         * another; the kexec jump path needs one as well.
         */
        clear_IO_APIC();
        restore_boot_irq_mode();
#endif
    }

    control_page = page_address(image->control_code_page) + PAGE_SIZE;
    __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

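    /*
     * page_list hands the relocation stub the addresses it needs once
     * the normal kernel mappings are gone: the control page (physical
     * and virtual) and the root of the identity-mapped page table.
     */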
    page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
    page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
    page_list[PA_TABLE_PAGE] =
      (unsigned long)__pa(page_address(image->control_code_page));

    if (image->type == KEXEC_TYPE_DEFAULT)
        page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                        << PAGE_SHIFT);

    /*
     * The segment registers are funny things: they have both a visible
     * and an invisible part. Whenever the visible part is set to a
     * specific selector, the invisible part is loaded from a table in
     * memory. At no other time is the descriptor table in memory
     * accessed.
     *
     * Take advantage of this here by force-loading the segments before
     * zapping the GDT with an invalid value.
     */
    load_segments();
    /*
     * The GDT & IDT are now invalid.
     * If you want to load them you must set up your own IDT & GDT.
     */
    native_idt_invalidate();
    native_gdt_invalidate();

    /* Now call it; with preserve_context, relocate_kernel() returns. */
    image->start = relocate_kernel((unsigned long)image->head,
                       (unsigned long)page_list,
                       image->start,
                       image->preserve_context,
                       cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));

#ifdef CONFIG_KEXEC_JUMP
    if (image->preserve_context)
        restore_processor_state();
#endif

    __ftrace_enabled_restore(save_ftrace_enabled);
}

/* arch-dependent functionality related to the kexec file-based syscall */

#ifdef CONFIG_KEXEC_FILE
void *arch_kexec_kernel_image_load(struct kimage *image)
{
    if (!image->fops || !image->fops->load)
        return ERR_PTR(-ENOEXEC);

    return image->fops->load(image, image->kernel_buf,
                 image->kernel_buf_len, image->initrd_buf,
                 image->initrd_buf_len, image->cmdline_buf,
                 image->cmdline_buf_len);
}

/*
 * Apply purgatory relocations.
 *
 * @pi:     Purgatory to be relocated.
 * @section:    Section the relocations apply to.
 * @relsec: Section containing RELAs.
 * @symtabsec:  Corresponding symtab.
 *
 * TODO: Some of this code belongs in generic code; move it to kexec.c.
 */
int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
                     Elf_Shdr *section, const Elf_Shdr *relsec,
                     const Elf_Shdr *symtabsec)
{
    unsigned int i;
    Elf64_Rela *rel;
    Elf64_Sym *sym;
    void *location;
    unsigned long address, sec_base, value;
    const char *strtab, *name, *shstrtab;
    const Elf_Shdr *sechdrs;

    /* String & section header string table */
    sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
    strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
    shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;

    rel = (void *)pi->ehdr + relsec->sh_offset;

    pr_debug("Applying relocate section %s to %u\n",
         shstrtab + relsec->sh_name, relsec->sh_info);

    for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {

        /*
         * rel[i].r_offset contains the byte offset from the beginning
         * of the section to the storage unit affected.
         *
         * This is the location to update, inside the temporary buffer
         * where the section is currently loaded. The section will
         * finally be loaded to a different address, pointed to by
         * ->sh_addr; kexec takes care of moving it
         * (kexec_load_segment()).
         */
        location = pi->purgatory_buf;
        location += section->sh_offset;
        location += rel[i].r_offset;

        /* Final address of the location */
        address = section->sh_addr + rel[i].r_offset;

        /*
         * rel[i].r_info contains the symbol table index the relocation
         * is made against and the type of relocation to apply;
         * ELF64_R_SYM() and ELF64_R_TYPE() extract these respectively.
         */
        sym = (void *)pi->ehdr + symtabsec->sh_offset;
        sym += ELF64_R_SYM(rel[i].r_info);

        if (sym->st_name)
            name = strtab + sym->st_name;
        else
            name = shstrtab + sechdrs[sym->st_shndx].sh_name;

        pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
             name, sym->st_info, sym->st_shndx, sym->st_value,
             sym->st_size);

        if (sym->st_shndx == SHN_UNDEF) {
            pr_err("Undefined symbol: %s\n", name);
            return -ENOEXEC;
        }

        if (sym->st_shndx == SHN_COMMON) {
            pr_err("symbol '%s' in common section\n", name);
            return -ENOEXEC;
        }

        if (sym->st_shndx == SHN_ABS)
            sec_base = 0;
        else if (sym->st_shndx >= pi->ehdr->e_shnum) {
            pr_err("Invalid section %d for symbol %s\n",
                   sym->st_shndx, name);
            return -ENOEXEC;
        } else
            sec_base = pi->sechdrs[sym->st_shndx].sh_addr;

        value = sym->st_value;
        value += sec_base;
        value += rel[i].r_addend;

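        /*
         * value now holds S + A (symbol value plus addend); the
         * PC-relative cases below subtract the place P, giving S + A - P.
         */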
        switch (ELF64_R_TYPE(rel[i].r_info)) {
        case R_X86_64_NONE:
            break;
        case R_X86_64_64:
            *(u64 *)location = value;
            break;
        case R_X86_64_32:
            *(u32 *)location = value;
            if (value != *(u32 *)location)
                goto overflow;
            break;
        case R_X86_64_32S:
            *(s32 *)location = value;
            if ((s64)value != *(s32 *)location)
                goto overflow;
            break;
        case R_X86_64_PC32:
        case R_X86_64_PLT32:
            value -= (u64)address;
            *(u32 *)location = value;
            break;
        default:
            pr_err("Unknown rela relocation: %llu\n",
                   ELF64_R_TYPE(rel[i].r_info));
            return -ENOEXEC;
        }
    }
    return 0;

overflow:
    pr_err("Overflow in relocation type %d value 0x%lx\n",
           (int)ELF64_R_TYPE(rel[i].r_info), value);
    return -ENOEXEC;
}

int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
    vfree(image->elf_headers);
    image->elf_headers = NULL;
    image->elf_headers_sz = 0;

    return kexec_image_post_load_cleanup_default(image);
}
#endif /* CONFIG_KEXEC_FILE */

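/*
 * The loaded crash image sits idle in reserved memory until a panic,
 * so it is kept mapped read-only to guard against stray writes from
 * the running kernel.
 */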
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
    struct page *page;
    unsigned int nr_pages;

    /*
     * For the physical range [start, end]: skip an unassigned crashk
     * resource, recognizable by its zero-valued "end" member.
     */
    if (!end || start > end)
        return 0;

    page = pfn_to_page(start >> PAGE_SHIFT);
    nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
    if (protect)
        return set_pages_ro(page, nr_pages);
    else
        return set_pages_rw(page, nr_pages);
}

static void kexec_mark_crashkres(bool protect)
{
    unsigned long control;

    kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

    /* Don't touch the control code page used in crash_kexec(). */
    control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
    /* Control code page is located in the 2nd page. */
    kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
    control += KEXEC_CONTROL_PAGE_SIZE;
    kexec_mark_range(control, crashk_res.end, protect);
}

void arch_kexec_protect_crashkres(void)
{
    kexec_mark_crashkres(true);
}

void arch_kexec_unprotect_crashkres(void)
{
    kexec_mark_crashkres(false);
}

/*
 * During a traditional boot under SME, SME will encrypt the kernel,
 * so the SME kexec kernel also needs to be un-encrypted in order to
 * replicate a normal SME boot.
 *
 * During a traditional boot under SEV, the kernel has already been
 * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
 * order to replicate a normal SEV boot.
 */
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
    if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
        return 0;

    /*
     * If host memory encryption is active we need to be sure that kexec
     * pages are not encrypted because when we boot to the new kernel the
     * pages won't be accessed encrypted (initially).
     */
    return set_memory_decrypted((unsigned long)vaddr, pages);
}

void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
    if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
        return;

    /*
     * If host memory encryption is active we need to reset the pages back
     * to being an encrypted mapping before freeing them.
     */
    set_memory_encrypted((unsigned long)vaddr, pages);
}