Back to home page

LXR

 
 

    


0001 /*
0002  * handle transition of Linux booting another kernel
0003  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
0004  *
0005  * This source code is licensed under the GNU General Public License,
0006  * Version 2.  See the file COPYING for more details.
0007  */
0008 
0009 #define pr_fmt(fmt) "kexec: " fmt
0010 
0011 #include <linux/mm.h>
0012 #include <linux/kexec.h>
0013 #include <linux/string.h>
0014 #include <linux/gfp.h>
0015 #include <linux/reboot.h>
0016 #include <linux/numa.h>
0017 #include <linux/ftrace.h>
0018 #include <linux/io.h>
0019 #include <linux/suspend.h>
0020 #include <linux/vmalloc.h>
0021 
0022 #include <asm/init.h>
0023 #include <asm/pgtable.h>
0024 #include <asm/tlbflush.h>
0025 #include <asm/mmu_context.h>
0026 #include <asm/io_apic.h>
0027 #include <asm/debugreg.h>
0028 #include <asm/kexec-bzimage64.h>
0029 #include <asm/setup.h>
0030 
#ifdef CONFIG_KEXEC_FILE
/* File-format loaders tried in order by arch_kexec_kernel_image_probe(). */
static struct kexec_file_ops *kexec_file_loaders[] = {
	&kexec_bzImage64_ops,
};
#endif
0036 
0037 static void free_transition_pgtable(struct kimage *image)
0038 {
0039     free_page((unsigned long)image->arch.pud);
0040     free_page((unsigned long)image->arch.pmd);
0041     free_page((unsigned long)image->arch.pte);
0042 }
0043 
/*
 * Map the control page (where the copy of relocate_kernel() will
 * execute) into the new identity-mapped page tables rooted at @pgd, at
 * the same virtual address relocate_kernel() has in the running kernel.
 * This lets the trampoline survive the CR3 switch.
 *
 * Intermediate table pages are allocated on demand and cached in
 * image->arch so free_transition_pgtable() can release them later.
 *
 * Returns 0 on success, -ENOMEM if a table page cannot be allocated
 * (already-allocated table pages are freed on that path).
 */
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long vaddr, paddr;
	int result = -ENOMEM;

	/* Virtual address the relocation code runs at today... */
	vaddr = (unsigned long)relocate_kernel;
	/* ...and the physical address of its copy in the 2nd control page. */
	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			goto err;
		image->arch.pud = pud;	/* cached for later freeing */
		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(pgd, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
		if (!pmd)
			goto err;
		image->arch.pmd = pmd;	/* cached for later freeing */
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!pte)
			goto err;
		image->arch.pte = pte;	/* cached for later freeing */
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
	return 0;
err:
	free_transition_pgtable(image);
	return result;
}
0085 
0086 static void *alloc_pgt_page(void *data)
0087 {
0088     struct kimage *image = (struct kimage *)data;
0089     struct page *page;
0090     void *p = NULL;
0091 
0092     page = kimage_alloc_control_pages(image, 0);
0093     if (page) {
0094         p = page_address(page);
0095         clear_page(p);
0096     }
0097 
0098     return p;
0099 }
0100 
/*
 * Build identity-mapped (virt == phys) page tables, rooted in the
 * control page at physical address @start_pgtable, that cover all
 * currently mapped RAM plus every segment of the new image, and then
 * map the transition page.  Table pages come from the image's control
 * pages via alloc_pgt_page(), so nothing here outlives the image.
 *
 * Returns 0 on success or a negative errno from the mapping helpers.
 */
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.context    = image,
		.pmd_flag   = __PAGE_KERNEL_LARGE_EXEC,
	};
	unsigned long mstart, mend;
	pgd_t *level4p;
	int result;
	int i;

	level4p = (pgd_t *)__va(start_pgtable);
	clear_page(level4p);
	/* Identity-map every range the current kernel has mapped. */
	for (i = 0; i < nr_pfn_mapped; i++) {
		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend   = pfn_mapped[i].end << PAGE_SHIFT;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);
		if (result)
			return result;
	}

	/*
	 * segments's mem ranges could be outside 0 ~ max_pfn,
	 * for example when jump back to original kernel from kexeced kernel.
	 * or first kernel is booted with user mem map, and second kernel
	 * could be loaded out of that range.
	 */
	for (i = 0; i < image->nr_segments; i++) {
		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);

		if (result)
			return result;
	}

	/* Finally map the relocation trampoline itself. */
	return init_transition_pgtable(image, level4p);
}
0144 
0145 static void set_idt(void *newidt, u16 limit)
0146 {
0147     struct desc_ptr curidt;
0148 
0149     /* x86-64 supports unaliged loads & stores */
0150     curidt.size    = limit;
0151     curidt.address = (unsigned long)newidt;
0152 
0153     __asm__ __volatile__ (
0154         "lidtq %0\n"
0155         : : "m" (curidt)
0156         );
0157 };
0158 
0159 
0160 static void set_gdt(void *newgdt, u16 limit)
0161 {
0162     struct desc_ptr curgdt;
0163 
0164     /* x86-64 supports unaligned loads & stores */
0165     curgdt.size    = limit;
0166     curgdt.address = (unsigned long)newgdt;
0167 
0168     __asm__ __volatile__ (
0169         "lgdtq %0\n"
0170         : : "m" (curgdt)
0171         );
0172 };
0173 
/*
 * Reload all data segment registers with __KERNEL_DS.  This refreshes
 * each register's hidden descriptor cache from the still-valid GDT, so
 * the segments remain usable after the GDT is subsequently zapped in
 * machine_kexec().
 */
static void load_segments(void)
{
	__asm__ __volatile__ (
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
		: : "a" (__KERNEL_DS) : "memory"
		);
}
0185 
#ifdef CONFIG_KEXEC_FILE
/* Update purgatory as needed after various image segments have been prepared */
static int arch_update_purgatory(struct kimage *image)
{
	int ret = 0;

	/* Only kexec_file_load() images carry a purgatory to patch. */
	if (!image->file_mode)
		return 0;

	/* Setup copying of backup region */
	if (image->type == KEXEC_TYPE_CRASH) {
		/* Destination the backup region is copied to... */
		ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
				&image->arch.backup_load_addr,
				sizeof(image->arch.backup_load_addr), 0);
		if (ret)
			return ret;

		/* ...the source start of the copy... */
		ret = kexec_purgatory_get_set_symbol(image, "backup_src",
				&image->arch.backup_src_start,
				sizeof(image->arch.backup_src_start), 0);
		if (ret)
			return ret;

		/* ...and the number of bytes to copy. */
		ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
				&image->arch.backup_src_sz,
				sizeof(image->arch.backup_src_sz), 0);
		if (ret)
			return ret;
	}

	return ret;
}
#else /* !CONFIG_KEXEC_FILE */
/* Without CONFIG_KEXEC_FILE there is no purgatory; nothing to update. */
static inline int arch_update_purgatory(struct kimage *image)
{
	return 0;
}
#endif /* CONFIG_KEXEC_FILE */
0224 
0225 int machine_kexec_prepare(struct kimage *image)
0226 {
0227     unsigned long start_pgtable;
0228     int result;
0229 
0230     /* Calculate the offsets */
0231     start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
0232 
0233     /* Setup the identity mapped 64bit page table */
0234     result = init_pgtable(image, start_pgtable);
0235     if (result)
0236         return result;
0237 
0238     /* update purgatory as needed */
0239     result = arch_update_purgatory(image);
0240     if (result)
0241         return result;
0242 
0243     return 0;
0244 }
0245 
/* Arch hook run when an image is torn down: drop the transition tables. */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
0250 
/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
	/* Save CPU state so we can resume here after the kexec jump. */
	if (image->preserve_context)
		save_processor_state();
#endif

	/* Ftrace must be quiesced while kernel text is being bypassed. */
	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in second kernel. kexec/kdump
		 * paths already have calls to disable_IO_APIC() in
		 * one form or other. kexec jump path also need
		 * one.
		 */
		disable_IO_APIC();
#endif
	}

	/* Copy the relocation trampoline into the second control page. */
	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	/* Addresses handed to the trampoline (see relocate_kernel_64.S). */
	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));

	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things, they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * with from a table in memory.  At no other time is the
	 * descriptor table in memory accessed.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/*
	 * The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0), 0);
	set_idt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
	/* Only reached after a successful kexec-jump round trip. */
	if (image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}
0328 
/*
 * Record x86-64 specific values (physical load offset, top-level page
 * table symbol, NUMA node data, KASLR offset, kernel image size) in the
 * vmcoreinfo note so dump tools can interpret this kernel's memory.
 */
void arch_crash_save_vmcoreinfo(void)
{
	VMCOREINFO_NUMBER(phys_base);
	VMCOREINFO_SYMBOL(init_level4_pgt);

#ifdef CONFIG_NUMA
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
	/* Lets tools undo KASLR when translating symbol addresses. */
	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
			      kaslr_offset());
	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
}
0342 
0343 /* arch-dependent functionality related to kexec file-based syscall */
0344 
0345 #ifdef CONFIG_KEXEC_FILE
0346 int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
0347                   unsigned long buf_len)
0348 {
0349     int i, ret = -ENOEXEC;
0350     struct kexec_file_ops *fops;
0351 
0352     for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
0353         fops = kexec_file_loaders[i];
0354         if (!fops || !fops->probe)
0355             continue;
0356 
0357         ret = fops->probe(buf, buf_len);
0358         if (!ret) {
0359             image->fops = fops;
0360             return ret;
0361         }
0362     }
0363 
0364     return ret;
0365 }
0366 
0367 void *arch_kexec_kernel_image_load(struct kimage *image)
0368 {
0369     vfree(image->arch.elf_headers);
0370     image->arch.elf_headers = NULL;
0371 
0372     if (!image->fops || !image->fops->load)
0373         return ERR_PTR(-ENOEXEC);
0374 
0375     return image->fops->load(image, image->kernel_buf,
0376                  image->kernel_buf_len, image->initrd_buf,
0377                  image->initrd_buf_len, image->cmdline_buf,
0378                  image->cmdline_buf_len);
0379 }
0380 
0381 int arch_kimage_file_post_load_cleanup(struct kimage *image)
0382 {
0383     if (!image->fops || !image->fops->cleanup)
0384         return 0;
0385 
0386     return image->fops->cleanup(image->image_loader_data);
0387 }
0388 
#ifdef CONFIG_KEXEC_VERIFY_SIG
/*
 * Verify the signature of a file-based kexec kernel image by delegating
 * to the loader chosen at probe time.  An image whose loader cannot
 * verify signatures is rejected outright (-EKEYREJECTED), so unverified
 * kernels are never accepted when verification is configured.
 */
int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
				 unsigned long kernel_len)
{
	if (!image->fops || !image->fops->verify_sig) {
		/* printk messages must be newline terminated. */
		pr_debug("kernel loader does not support signature verification.\n");
		return -EKEYREJECTED;
	}

	return image->fops->verify_sig(kernel, kernel_len);
}
#endif
0401 
/*
 * Apply purgatory relocations.
 *
 * ehdr: Pointer to elf headers
 * sechdrs: Pointer to section headers.
 * relsec: section index of SHT_RELA section.
 *
 * Walks every Elf64_Rela entry in @relsec, resolves the referenced
 * symbol's final address and patches the in-memory copy of the target
 * section.  Returns 0 on success, -ENOEXEC on malformed input,
 * undefined symbols, overflow or unsupported relocation types.
 *
 * TODO: Some of the code belongs to generic code. Move that in kexec.c.
 */
int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
				     Elf64_Shdr *sechdrs, unsigned int relsec)
{
	unsigned int i;
	Elf64_Rela *rel;
	Elf64_Sym *sym;
	void *location;
	Elf64_Shdr *section, *symtabsec;
	unsigned long address, sec_base, value;
	const char *strtab, *name, *shstrtab;

	/*
	 * ->sh_offset has been modified to keep the pointer to section
	 * contents in memory
	 */
	rel = (void *)sechdrs[relsec].sh_offset;

	/* Section to which relocations apply */
	section = &sechdrs[sechdrs[relsec].sh_info];

	pr_debug("Applying relocate section %u to %u\n", relsec,
		 sechdrs[relsec].sh_info);

	/* Associated symbol table */
	symtabsec = &sechdrs[sechdrs[relsec].sh_link];

	/* String table */
	if (symtabsec->sh_link >= ehdr->e_shnum) {
		/* Invalid strtab section number */
		pr_err("Invalid string table section index %d\n",
		       symtabsec->sh_link);
		return -ENOEXEC;
	}

	strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;

	/* section header string table */
	shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;

	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {

		/*
		 * rel[i].r_offset contains byte offset from beginning
		 * of section to the storage unit affected.
		 *
		 * This is location to update (->sh_offset). This is temporary
		 * buffer where section is currently loaded. This will finally
		 * be loaded to a different address later, pointed to by
		 * ->sh_addr. kexec takes care of moving it
		 *  (kexec_load_segment()).
		 */
		location = (void *)(section->sh_offset + rel[i].r_offset);

		/* Final address of the location */
		address = section->sh_addr + rel[i].r_offset;

		/*
		 * rel[i].r_info contains information about symbol table index
		 * w.r.t which relocation must be made and type of relocation
		 * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
		 * these respectively.
		 */
		sym = (Elf64_Sym *)symtabsec->sh_offset +
				ELF64_R_SYM(rel[i].r_info);

		/* Unnamed symbols fall back to their section's name. */
		if (sym->st_name)
			name = strtab + sym->st_name;
		else
			name = shstrtab + sechdrs[sym->st_shndx].sh_name;

		pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
			 name, sym->st_info, sym->st_shndx, sym->st_value,
			 sym->st_size);

		/* Purgatory must be fully linked; no undefined symbols. */
		if (sym->st_shndx == SHN_UNDEF) {
			pr_err("Undefined symbol: %s\n", name);
			return -ENOEXEC;
		}

		if (sym->st_shndx == SHN_COMMON) {
			pr_err("symbol '%s' in common section\n", name);
			return -ENOEXEC;
		}

		/* Absolute symbols need no section base added. */
		if (sym->st_shndx == SHN_ABS)
			sec_base = 0;
		else if (sym->st_shndx >= ehdr->e_shnum) {
			pr_err("Invalid section %d for symbol %s\n",
			       sym->st_shndx, name);
			return -ENOEXEC;
		} else
			sec_base = sechdrs[sym->st_shndx].sh_addr;

		/* S + A: symbol value, section base, explicit addend. */
		value = sym->st_value;
		value += sec_base;
		value += rel[i].r_addend;

		switch (ELF64_R_TYPE(rel[i].r_info)) {
		case R_X86_64_NONE:
			break;
		case R_X86_64_64:
			*(u64 *)location = value;
			break;
		case R_X86_64_32:
			/* Value must fit in 32 bits zero-extended. */
			*(u32 *)location = value;
			if (value != *(u32 *)location)
				goto overflow;
			break;
		case R_X86_64_32S:
			/* Value must fit in 32 bits sign-extended. */
			*(s32 *)location = value;
			if ((s64)value != *(s32 *)location)
				goto overflow;
			break;
		case R_X86_64_PC32:
			/* S + A - P: PC-relative. */
			value -= (u64)address;
			*(u32 *)location = value;
			break;
		default:
			pr_err("Unknown rela relocation: %llu\n",
			       ELF64_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
	}
	return 0;

overflow:
	pr_err("Overflow in relocation type %d value 0x%lx\n",
	       (int)ELF64_R_TYPE(rel[i].r_info), value);
	return -ENOEXEC;
}
0541 #endif /* CONFIG_KEXEC_FILE */
0542 
0543 static int
0544 kexec_mark_range(unsigned long start, unsigned long end, bool protect)
0545 {
0546     struct page *page;
0547     unsigned int nr_pages;
0548 
0549     /*
0550      * For physical range: [start, end]. We must skip the unassigned
0551      * crashk resource with zero-valued "end" member.
0552      */
0553     if (!end || start > end)
0554         return 0;
0555 
0556     page = pfn_to_page(start >> PAGE_SHIFT);
0557     nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
0558     if (protect)
0559         return set_pages_ro(page, nr_pages);
0560     else
0561         return set_pages_rw(page, nr_pages);
0562 }
0563 
/*
 * Toggle protection of the whole crash kernel reservation (both the
 * low and high resources), leaving a writable hole for the control
 * code that crash_kexec() must still be able to use.
 */
static void kexec_mark_crashkres(bool protect)
{
	unsigned long control;

	kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

	/* Don't touch the control code page used in crash_kexec().*/
	control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
	/* Control code page is located in the 2nd page. */
	kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
	control += KEXEC_CONTROL_PAGE_SIZE;
	kexec_mark_range(control, crashk_res.end, protect);
}
0577 
/* Make the crash kernel reservation read-only while it is not in use. */
void arch_kexec_protect_crashkres(void)
{
	kexec_mark_crashkres(true);
}
0582 
/* Make the crash kernel reservation writable again (e.g. for reload). */
void arch_kexec_unprotect_crashkres(void)
{
	kexec_mark_crashkres(false);
}