// SPDX-License-Identifier: GPL-2.0-only
/*
 * handle transition of Linux booting another kernel
 *
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */
#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
#include <linux/cc_platform.h>

#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>

#ifdef CONFIG_ACPI
/*
 * Used while adding mapping for ACPI tables.
 * Can be called unconditionally as we are not modifying the tables,
 * only creating identity mappings so the kexec'd kernel can reach them.
 */
struct init_pgtable_data {
	struct x86_mapping_info *info;
	pgd_t *level4p;
};

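/*
 * Callback for walk_iomem_res_desc(): identity-map one matching resource
 * range into the page tables described by @arg (a struct init_pgtable_data).
 */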
static int mem_region_callback(struct resource *res, void *arg)
{
	struct init_pgtable_data *data = arg;
	unsigned long mstart, mend;

	mstart = res->start;
	mend = mstart + resource_size(res) - 1;

	return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
}

static int
map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
	struct init_pgtable_data data;
	unsigned long flags;
	int ret;

	data.info = info;
	data.level4p = level4p;
	flags = IORESOURCE_MEM | IORESOURCE_BUSY;

	ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
				  &data, mem_region_callback);
	if (ret && ret != -EINVAL)
		return ret;

	/* ACPI tables could be located in ACPI Non-volatile Storage area */
	ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
				  &data, mem_region_callback);
	if (ret && ret != -EINVAL)
		return ret;

	return 0;
}
#else
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif

#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
		&kexec_bzImage64_ops,
		NULL
};
#endif

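/*
 * Identity-map the EFI system table so the kexec'd kernel can consume the
 * EFI boot data handed over via boot_params.
 */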
static int
map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
	unsigned long mstart, mend;

	if (!efi_enabled(EFI_BOOT))
		return 0;

	mstart = (boot_params.efi_info.efi_systab |
			((u64)boot_params.efi_info.efi_systab_hi << 32));

	if (efi_enabled(EFI_64BIT))
		mend = mstart + sizeof(efi_system_table_64_t);
	else
		mend = mstart + sizeof(efi_system_table_32_t);

	if (!mstart)
		return 0;

	return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
	return 0;
}

static void free_transition_pgtable(struct kimage *image)
{
	free_page((unsigned long)image->arch.p4d);
	image->arch.p4d = NULL;
	free_page((unsigned long)image->arch.pud);
	image->arch.pud = NULL;
	free_page((unsigned long)image->arch.pmd);
	image->arch.pmd = NULL;
	free_page((unsigned long)image->arch.pte);
	image->arch.pte = NULL;
}

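/*
 * Build the page-table entries that map the kernel virtual address of
 * relocate_kernel() to the physical control page. This "transition" mapping,
 * installed into the identity-mapped page tables, is what lets the CPU keep
 * fetching instructions while %cr3 is switched from the kernel page tables
 * to the identity-mapped ones.
 */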
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	int result = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = (unsigned long)relocate_kernel;
	paddr = __pa(page_address(image->control_code_page) + PAGE_SIZE);
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
		if (!p4d)
			goto err;
		image->arch.p4d = p4d;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			goto err;
		image->arch.pud = pud;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
		if (!pmd)
			goto err;
		image->arch.pmd = pmd;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!pte)
			goto err;
		image->arch.pte = pte;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
		prot = PAGE_KERNEL_EXEC;

	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
	return 0;
err:
	return result;
}

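/*
 * Page allocator for kernel_ident_mapping_init(): hands out zeroed pages
 * from the kimage control-page pool, so the page tables cannot overlap the
 * destination segments of the new kernel.
 */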
static void *alloc_pgt_page(void *data)
{
	struct kimage *image = (struct kimage *)data;
	struct page *page;
	void *p = NULL;

	page = kimage_alloc_control_pages(image, 0);
	if (page) {
		p = page_address(page);
		clear_page(p);
	}

	return p;
}

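/*
 * Build the identity-mapped page tables used while relocating the new
 * kernel: all currently mapped RAM, every kexec segment, and the firmware
 * regions (EFI systab, ACPI tables) the next kernel will need early on.
 */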
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
		.context	= image,
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	unsigned long mstart, mend;
	pgd_t *level4p;
	int result;
	int i;

	level4p = (pgd_t *)__va(start_pgtable);
	clear_page(level4p);

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		info.page_flag   |= _PAGE_ENC;
		info.kernpg_flag |= _PAGE_ENC;
	}

	if (direct_gbpages)
		info.direct_gbpages = true;

	for (i = 0; i < nr_pfn_mapped; i++) {
		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend   = pfn_mapped[i].end << PAGE_SHIFT;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);
		if (result)
			return result;
	}

	/*
	 * A segment's memory range can lie outside 0 ~ max_pfn, for example
	 * when jumping back to the original kernel from a kexec'd kernel, or
	 * when the first kernel was booted with a user-supplied memory map so
	 * the second kernel is loaded outside the originally mapped region.
	 */
	for (i = 0; i < image->nr_segments; i++) {
		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);

		if (result)
			return result;
	}

	/*
	 * Prepare EFI systab and ACPI tables for the kexec kernel since they
	 * are not covered by pfn_mapped.
	 */
	result = map_efi_systab(&info, level4p);
	if (result)
		return result;

	result = map_acpi_tables(&info, level4p);
	if (result)
		return result;

	return init_transition_pgtable(image, level4p);
}

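/* Force-reload the data segment registers with the kernel data selector. */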
static void load_segments(void)
{
	__asm__ __volatile__ (
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
		: : "a" (__KERNEL_DS) : "memory"
		);
}

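/*
 * The first page of the control-code allocation holds the root of the
 * identity page table; the copy of relocate_kernel() goes into the second
 * page (see machine_kexec()).
 */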
int machine_kexec_prepare(struct kimage *image)
{
	unsigned long start_pgtable;
	int result;

	/* Calculate the offsets */
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

	/* Setup the identity mapped 64bit page table */
	result = init_pgtable(image, start_pgtable);
	if (result)
		return result;

	return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();
	cet_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in the second kernel. The
		 * kexec/kdump paths already call restore_boot_irq_mode()
		 * in one form or another; the kexec jump path needs one
		 * as well.
		 */
		clear_IO_APIC();
		restore_boot_irq_mode();
#endif
	}

	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));

	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things, they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * from a descriptor table in memory.  At no other time is the
	 * descriptor table in memory accessed.
	 *
	 * Take advantage of this here by force loading the segments,
	 * before the GDT is zapped with an invalid value.
	 */
	load_segments();

	/*
	 * The GDT & IDT are now invalid.
	 * If you want to load them you must set up your own IDT & GDT.
	 */
	native_idt_invalidate();
	native_gdt_invalidate();

	/* now call it */
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context,
				       cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}

/* arch-dependent functionality related to kexec file-based syscall */

#ifdef CONFIG_KEXEC_FILE
void *arch_kexec_kernel_image_load(struct kimage *image)
{
	if (!image->fops || !image->fops->load)
		return ERR_PTR(-ENOEXEC);

	return image->fops->load(image, image->kernel_buf,
				 image->kernel_buf_len, image->initrd_buf,
				 image->initrd_buf_len, image->cmdline_buf,
				 image->cmdline_buf_len);
}

/*
 * Apply purgatory relocations.
 *
 * @pi:		Purgatory to be relocated.
 * @section:	Section relocations applying to.
 * @relsec:	Section containing RELAs.
 * @symtabsec:	Corresponding symtab.
 *
 * TODO: Some of the code belongs to generic code. Move that in kexec.c.
 */
int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
				     Elf_Shdr *section, const Elf_Shdr *relsec,
				     const Elf_Shdr *symtabsec)
{
	unsigned int i;
	Elf64_Rela *rel;
	Elf64_Sym *sym;
	void *location;
	unsigned long address, sec_base, value;
	const char *strtab, *name, *shstrtab;
	const Elf_Shdr *sechdrs;

	/* String & section header string table */
	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
	strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
	shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;

	rel = (void *)pi->ehdr + relsec->sh_offset;

	pr_debug("Applying relocate section %s to %u\n",
		 shstrtab + relsec->sh_name, relsec->sh_info);

	for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {

		/*
		 * rel[i].r_offset contains the byte offset from the beginning
		 * of the section to the storage unit affected.
		 *
		 * This is the location to update. It sits in the temporary
		 * buffer where the section is currently loaded; the section
		 * is finally loaded to a different address later, pointed to
		 * by ->sh_addr. kexec takes care of moving it
		 * (kimage_load_segment()).
		 */
		location = pi->purgatory_buf;
		location += section->sh_offset;
		location += rel[i].r_offset;

		/* Final address of the location */
		address = section->sh_addr + rel[i].r_offset;

		/*
		 * rel[i].r_info contains the symbol table index w.r.t. which
		 * the relocation must be made and the type of relocation to
		 * apply. ELF64_R_SYM() and ELF64_R_TYPE() extract these
		 * respectively.
		 */
		sym = (void *)pi->ehdr + symtabsec->sh_offset;
		sym += ELF64_R_SYM(rel[i].r_info);

		if (sym->st_name)
			name = strtab + sym->st_name;
		else
			name = shstrtab + sechdrs[sym->st_shndx].sh_name;

		pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
			 name, sym->st_info, sym->st_shndx, sym->st_value,
			 sym->st_size);

		if (sym->st_shndx == SHN_UNDEF) {
			pr_err("Undefined symbol: %s\n", name);
			return -ENOEXEC;
		}

		if (sym->st_shndx == SHN_COMMON) {
			pr_err("symbol '%s' in common section\n", name);
			return -ENOEXEC;
		}

		if (sym->st_shndx == SHN_ABS)
			sec_base = 0;
		else if (sym->st_shndx >= pi->ehdr->e_shnum) {
			pr_err("Invalid section %d for symbol %s\n",
			       sym->st_shndx, name);
			return -ENOEXEC;
		} else
			sec_base = pi->sechdrs[sym->st_shndx].sh_addr;

		value = sym->st_value;
		value += sec_base;
		value += rel[i].r_addend;

		switch (ELF64_R_TYPE(rel[i].r_info)) {
		case R_X86_64_NONE:
			break;
		case R_X86_64_64:
			*(u64 *)location = value;
			break;
		case R_X86_64_32:
			*(u32 *)location = value;
			if (value != *(u32 *)location)
				goto overflow;
			break;
		case R_X86_64_32S:
			*(s32 *)location = value;
			if ((s64)value != *(s32 *)location)
				goto overflow;
			break;
		case R_X86_64_PC32:
		case R_X86_64_PLT32:
			value -= (u64)address;
			*(u32 *)location = value;
			break;
		default:
			pr_err("Unknown rela relocation: %llu\n",
			       ELF64_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
	}
	return 0;

overflow:
	pr_err("Overflow in relocation type %d value 0x%lx\n",
	       (int)ELF64_R_TYPE(rel[i].r_info), value);
	return -ENOEXEC;
}

int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
	vfree(image->elf_headers);
	image->elf_headers = NULL;
	image->elf_headers_sz = 0;

	return kexec_image_post_load_cleanup_default(image);
}
#endif /* CONFIG_KEXEC_FILE */

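/*
 * The crashkernel region is mapped read-only while idle so that stray
 * writes from the running kernel cannot corrupt the loaded crash image;
 * it is flipped back to read-write around loading and unloading.
 */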
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
	struct page *page;
	unsigned int nr_pages;

	/*
	 * For a physical range [start, end]: we must skip the unassigned
	 * crashk resource, whose "end" member is zero.
	 */
	if (!end || start > end)
		return 0;

	page = pfn_to_page(start >> PAGE_SHIFT);
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	if (protect)
		return set_pages_ro(page, nr_pages);
	else
		return set_pages_rw(page, nr_pages);
}

static void kexec_mark_crashkres(bool protect)
{
	unsigned long control;

	kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

	/* Don't touch the control code page used in crash_kexec(). */
	control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
	/* Control code page is located in the 2nd page. */
	kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
	control += KEXEC_CONTROL_PAGE_SIZE;
	kexec_mark_range(control, crashk_res.end, protect);
}

void arch_kexec_protect_crashkres(void)
{
	kexec_mark_crashkres(true);
}

void arch_kexec_unprotect_crashkres(void)
{
	kexec_mark_crashkres(false);
}

/*
 * During a traditional boot under SME, SME will encrypt the kernel,
 * so the SME kexec kernel also needs to be un-encrypted in order to
 * replicate a normal SME boot.
 *
 * During a traditional boot under SEV, the kernel has already been
 * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
 * order to replicate a normal SEV boot.
 */
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
	if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
		return 0;

	/*
	 * If host memory encryption is active we need to be sure that kexec
	 * pages are not encrypted because when we boot to the new kernel the
	 * pages won't be accessed encrypted (initially).
	 */
	return set_memory_decrypted((unsigned long)vaddr, pages);
}

void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
	if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
		return;

	/*
	 * If host memory encryption is active we need to reset the pages back
	 * to being an encrypted mapping before freeing them.
	 */
	set_memory_encrypted((unsigned long)vaddr, pages);
}