Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Debug helper to dump the current kernel pagetables of the system
0004  * so that we can see what the various memory ranges are set to.
0005  *
0006  * (C) Copyright 2008 Intel Corporation
0007  *
0008  * Author: Arjan van de Ven <arjan@linux.intel.com>
0009  */
0010 
0011 #include <linux/debugfs.h>
0012 #include <linux/kasan.h>
0013 #include <linux/mm.h>
0014 #include <linux/init.h>
0015 #include <linux/sched.h>
0016 #include <linux/seq_file.h>
0017 #include <linux/highmem.h>
0018 #include <linux/pci.h>
0019 #include <linux/ptdump.h>
0020 
0021 #include <asm/e820/types.h>
0022 
0023 /*
0024  * The dumper groups pagetable entries of the same type into one, and for
0025  * that it needs to keep some state when walking, and flush this state
0026  * when a "break" in the continuity is found.
0027  */
0028 struct pg_state {
0029     struct ptdump_state ptdump;
0030     int level;
0031     pgprotval_t current_prot;
0032     pgprotval_t effective_prot;
0033     pgprotval_t prot_levels[5];
0034     unsigned long start_address;
0035     const struct addr_marker *marker;
0036     unsigned long lines;
0037     bool to_dmesg;
0038     bool check_wx;
0039     unsigned long wx_pages;
0040     struct seq_file *seq;
0041 };
0042 
/* One named region of the virtual address space. */
struct addr_marker {
	unsigned long start_address;	/* first address belonging to the region */
	const char *name;		/* label printed as "---[ name ]---" */
	unsigned long max_lines;	/* cap on printed lines; 0 means no cap */
};
0048 
0049 /* Address space markers hints */
0050 
0051 #ifdef CONFIG_X86_64
0052 
/*
 * Indices into address_markers[] (64-bit layout).  A region that depends
 * on a config option only receives an index when that option is enabled.
 */
enum address_markers_idx {
	USER_SPACE_NR,		/* implicitly 0 */
	KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,	/* index of the terminating table entry */
};
0079 
0080 static struct addr_marker address_markers[] = {
0081     [USER_SPACE_NR]     = { 0,          "User Space" },
0082     [KERNEL_SPACE_NR]   = { (1UL << 63),    "Kernel Space" },
0083     [LOW_KERNEL_NR]     = { 0UL,        "Low Kernel Mapping" },
0084     [VMALLOC_START_NR]  = { 0UL,        "vmalloc() Area" },
0085     [VMEMMAP_START_NR]  = { 0UL,        "Vmemmap" },
0086 #ifdef CONFIG_KASAN
0087     /*
0088      * These fields get initialized with the (dynamic)
0089      * KASAN_SHADOW_{START,END} values in pt_dump_init().
0090      */
0091     [KASAN_SHADOW_START_NR] = { 0UL,        "KASAN shadow" },
0092     [KASAN_SHADOW_END_NR]   = { 0UL,        "KASAN shadow end" },
0093 #endif
0094 #ifdef CONFIG_MODIFY_LDT_SYSCALL
0095     [LDT_NR]        = { 0UL,        "LDT remap" },
0096 #endif
0097     [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
0098 #ifdef CONFIG_X86_ESPFIX64
0099     [ESPFIX_START_NR]   = { ESPFIX_BASE_ADDR,   "ESPfix Area", 16 },
0100 #endif
0101 #ifdef CONFIG_EFI
0102     [EFI_END_NR]        = { EFI_VA_END,     "EFI Runtime Services" },
0103 #endif
0104     [HIGH_KERNEL_NR]    = { __START_KERNEL_map, "High Kernel Mapping" },
0105     [MODULES_VADDR_NR]  = { MODULES_VADDR,  "Modules" },
0106     [MODULES_END_NR]    = { MODULES_END,    "End Modules" },
0107     [FIXADDR_START_NR]  = { FIXADDR_START,  "Fixmap Area" },
0108     [END_OF_SPACE_NR]   = { -1,         NULL }
0109 };
0110 
/* Top-level page table walked by the W+X checks (64-bit variant) */
#define INIT_PGD	((pgd_t *)&init_top_pgt)
0112 
0113 #else /* CONFIG_X86_64 */
0114 
/*
 * Indices into address_markers[] (32-bit layout).  A region that depends
 * on a config option only receives an index when that option is enabled.
 */
enum address_markers_idx {
	USER_SPACE_NR,		/* implicitly 0 */
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,	/* index of the terminating table entry */
};
0130 
0131 static struct addr_marker address_markers[] = {
0132     [USER_SPACE_NR]     = { 0,          "User Space" },
0133     [KERNEL_SPACE_NR]   = { PAGE_OFFSET,    "Kernel Mapping" },
0134     [VMALLOC_START_NR]  = { 0UL,        "vmalloc() Area" },
0135     [VMALLOC_END_NR]    = { 0UL,        "vmalloc() End" },
0136 #ifdef CONFIG_HIGHMEM
0137     [PKMAP_BASE_NR]     = { 0UL,        "Persistent kmap() Area" },
0138 #endif
0139 #ifdef CONFIG_MODIFY_LDT_SYSCALL
0140     [LDT_NR]        = { 0UL,        "LDT remap" },
0141 #endif
0142     [CPU_ENTRY_AREA_NR] = { 0UL,        "CPU entry area" },
0143     [FIXADDR_START_NR]  = { 0UL,        "Fixmap area" },
0144     [END_OF_SPACE_NR]   = { -1,         NULL }
0145 };
0146 
/* Top-level page table walked by the W+X checks (32-bit variant) */
#define INIT_PGD	(swapper_pg_dir)
0148 
0149 #endif /* !CONFIG_X86_64 */
0150 
/*
 * Address range spanned by one entry at each page table level.  Each
 * level's span is the entry count of the table below it multiplied by
 * that lower level's span.
 */
#define PTE_LEVEL_MULT	(PAGE_SIZE)
#define PMD_LEVEL_MULT	(PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT	(PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT	(PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT	(PTRS_PER_P4D * P4D_LEVEL_MULT)
0157 
/*
 * Emit the start of an output line.  With @to_dmesg set the text goes to
 * the kernel log at KERN_INFO; otherwise it is written to seq_file @m,
 * and silently dropped when @m is NULL.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_INFO fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
0166 
/*
 * Continue the current output line.  With @to_dmesg set the text goes to
 * the kernel log as a KERN_CONT fragment; otherwise it is written to
 * seq_file @m, and silently dropped when @m is NULL.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_CONT fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
0175 
0176 /*
0177  * Print a readable form of a pgprot_t to the seq_file
0178  */
0179 static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
0180 {
0181     static const char * const level_name[] =
0182         { "pgd", "p4d", "pud", "pmd", "pte" };
0183 
0184     if (!(pr & _PAGE_PRESENT)) {
0185         /* Not present */
0186         pt_dump_cont_printf(m, dmsg, "                              ");
0187     } else {
0188         if (pr & _PAGE_USER)
0189             pt_dump_cont_printf(m, dmsg, "USR ");
0190         else
0191             pt_dump_cont_printf(m, dmsg, "    ");
0192         if (pr & _PAGE_RW)
0193             pt_dump_cont_printf(m, dmsg, "RW ");
0194         else
0195             pt_dump_cont_printf(m, dmsg, "ro ");
0196         if (pr & _PAGE_PWT)
0197             pt_dump_cont_printf(m, dmsg, "PWT ");
0198         else
0199             pt_dump_cont_printf(m, dmsg, "    ");
0200         if (pr & _PAGE_PCD)
0201             pt_dump_cont_printf(m, dmsg, "PCD ");
0202         else
0203             pt_dump_cont_printf(m, dmsg, "    ");
0204 
0205         /* Bit 7 has a different meaning on level 3 vs 4 */
0206         if (level <= 3 && pr & _PAGE_PSE)
0207             pt_dump_cont_printf(m, dmsg, "PSE ");
0208         else
0209             pt_dump_cont_printf(m, dmsg, "    ");
0210         if ((level == 4 && pr & _PAGE_PAT) ||
0211             ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
0212             pt_dump_cont_printf(m, dmsg, "PAT ");
0213         else
0214             pt_dump_cont_printf(m, dmsg, "    ");
0215         if (pr & _PAGE_GLOBAL)
0216             pt_dump_cont_printf(m, dmsg, "GLB ");
0217         else
0218             pt_dump_cont_printf(m, dmsg, "    ");
0219         if (pr & _PAGE_NX)
0220             pt_dump_cont_printf(m, dmsg, "NX ");
0221         else
0222             pt_dump_cont_printf(m, dmsg, "x  ");
0223     }
0224     pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
0225 }
0226 
0227 static void note_wx(struct pg_state *st, unsigned long addr)
0228 {
0229     unsigned long npages;
0230 
0231     npages = (addr - st->start_address) / PAGE_SIZE;
0232 
0233 #ifdef CONFIG_PCI_BIOS
0234     /*
0235      * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
0236      * Inform about it, but avoid the warning.
0237      */
0238     if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
0239         addr <= PAGE_OFFSET + BIOS_END) {
0240         pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
0241         return;
0242     }
0243 #endif
0244     /* Account the WX pages */
0245     st->wx_pages += npages;
0246     WARN_ONCE(__supported_pte_mask & _PAGE_NX,
0247           "x86/mm: Found insecure W+X mapping at address %pS\n",
0248           (void *)st->start_address);
0249 }
0250 
0251 static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
0252 {
0253     struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
0254     pgprotval_t prot = val & PTE_FLAGS_MASK;
0255     pgprotval_t effective;
0256 
0257     if (level > 0) {
0258         pgprotval_t higher_prot = st->prot_levels[level - 1];
0259 
0260         effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
0261                 ((higher_prot | prot) & _PAGE_NX);
0262     } else {
0263         effective = prot;
0264     }
0265 
0266     st->prot_levels[level] = effective;
0267 }
0268 
0269 /*
0270  * This function gets called on a break in a continuous series
0271  * of PTE entries; the next one is different so we need to
0272  * print what we collected so far.
0273  */
0274 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
0275               u64 val)
0276 {
0277     struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
0278     pgprotval_t new_prot, new_eff;
0279     pgprotval_t cur, eff;
0280     static const char units[] = "BKMGTPE";
0281     struct seq_file *m = st->seq;
0282 
0283     new_prot = val & PTE_FLAGS_MASK;
0284     if (!val)
0285         new_eff = 0;
0286     else
0287         new_eff = st->prot_levels[level];
0288 
0289     /*
0290      * If we have a "break" in the series, we need to flush the state that
0291      * we have now. "break" is either changing perms, levels or
0292      * address space marker.
0293      */
0294     cur = st->current_prot;
0295     eff = st->effective_prot;
0296 
0297     if (st->level == -1) {
0298         /* First entry */
0299         st->current_prot = new_prot;
0300         st->effective_prot = new_eff;
0301         st->level = level;
0302         st->marker = address_markers;
0303         st->lines = 0;
0304         pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
0305                    st->marker->name);
0306     } else if (new_prot != cur || new_eff != eff || level != st->level ||
0307            addr >= st->marker[1].start_address) {
0308         const char *unit = units;
0309         unsigned long delta;
0310         int width = sizeof(unsigned long) * 2;
0311 
0312         if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
0313             note_wx(st, addr);
0314 
0315         /*
0316          * Now print the actual finished series
0317          */
0318         if (!st->marker->max_lines ||
0319             st->lines < st->marker->max_lines) {
0320             pt_dump_seq_printf(m, st->to_dmesg,
0321                        "0x%0*lx-0x%0*lx   ",
0322                        width, st->start_address,
0323                        width, addr);
0324 
0325             delta = addr - st->start_address;
0326             while (!(delta & 1023) && unit[1]) {
0327                 delta >>= 10;
0328                 unit++;
0329             }
0330             pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
0331                         delta, *unit);
0332             printk_prot(m, st->current_prot, st->level,
0333                     st->to_dmesg);
0334         }
0335         st->lines++;
0336 
0337         /*
0338          * We print markers for special areas of address space,
0339          * such as the start of vmalloc space etc.
0340          * This helps in the interpretation.
0341          */
0342         if (addr >= st->marker[1].start_address) {
0343             if (st->marker->max_lines &&
0344                 st->lines > st->marker->max_lines) {
0345                 unsigned long nskip =
0346                     st->lines - st->marker->max_lines;
0347                 pt_dump_seq_printf(m, st->to_dmesg,
0348                            "... %lu entr%s skipped ... \n",
0349                            nskip,
0350                            nskip == 1 ? "y" : "ies");
0351             }
0352             st->marker++;
0353             st->lines = 0;
0354             pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
0355                        st->marker->name);
0356         }
0357 
0358         st->start_address = addr;
0359         st->current_prot = new_prot;
0360         st->effective_prot = new_eff;
0361         st->level = level;
0362     }
0363 }
0364 
0365 static void ptdump_walk_pgd_level_core(struct seq_file *m,
0366                        struct mm_struct *mm, pgd_t *pgd,
0367                        bool checkwx, bool dmesg)
0368 {
0369     const struct ptdump_range ptdump_ranges[] = {
0370 #ifdef CONFIG_X86_64
0371     {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
0372     {GUARD_HOLE_END_ADDR, ~0UL},
0373 #else
0374     {0, ~0UL},
0375 #endif
0376     {0, 0}
0377 };
0378 
0379     struct pg_state st = {
0380         .ptdump = {
0381             .note_page  = note_page,
0382             .effective_prot = effective_prot,
0383             .range      = ptdump_ranges
0384         },
0385         .level = -1,
0386         .to_dmesg   = dmesg,
0387         .check_wx   = checkwx,
0388         .seq        = m
0389     };
0390 
0391     ptdump_walk_pgd(&st.ptdump, mm, pgd);
0392 
0393     if (!checkwx)
0394         return;
0395     if (st.wx_pages)
0396         pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
0397             st.wx_pages);
0398     else
0399         pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
0400 }
0401 
0402 void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
0403 {
0404     ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
0405 }
0406 
0407 void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
0408                    bool user)
0409 {
0410     pgd_t *pgd = mm->pgd;
0411 #ifdef CONFIG_PAGE_TABLE_ISOLATION
0412     if (user && boot_cpu_has(X86_FEATURE_PTI))
0413         pgd = kernel_to_user_pgdp(pgd);
0414 #endif
0415     ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
0416 }
0417 EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
0418 
/*
 * Run the W+X check on the table that kernel_to_user_pgdp() returns for
 * INIT_PGD.  Nothing is dumped; only the summary is logged.
 *
 * This is a no-op without CONFIG_PAGE_TABLE_ISOLATION, and also when NX
 * is not in __supported_pte_mask or X86_FEATURE_PTI is not set.
 */
void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *user_pgd;

	if ((__supported_pte_mask & _PAGE_NX) &&
	    boot_cpu_has(X86_FEATURE_PTI)) {
		pr_info("x86/mm: Checking user space page tables\n");
		user_pgd = kernel_to_user_pgdp(INIT_PGD);
		ptdump_walk_pgd_level_core(NULL, &init_mm, user_pgd,
					   true, false);
	}
#endif
}
0433 
0434 void ptdump_walk_pgd_level_checkwx(void)
0435 {
0436     ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
0437 }
0438 
0439 static int __init pt_dump_init(void)
0440 {
0441     /*
0442      * Various markers are not compile-time constants, so assign them
0443      * here.
0444      */
0445 #ifdef CONFIG_X86_64
0446     address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
0447     address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
0448     address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
0449 #ifdef CONFIG_MODIFY_LDT_SYSCALL
0450     address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
0451 #endif
0452 #ifdef CONFIG_KASAN
0453     address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
0454     address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
0455 #endif
0456 #endif
0457 #ifdef CONFIG_X86_32
0458     address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
0459     address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
0460 # ifdef CONFIG_HIGHMEM
0461     address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
0462 # endif
0463     address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
0464     address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
0465 # ifdef CONFIG_MODIFY_LDT_SYSCALL
0466     address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
0467 # endif
0468 #endif
0469     return 0;
0470 }
0471 __initcall(pt_dump_init);