Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright 2016, Rashmica Gupta, IBM Corp.
0004  *
0005  * This traverses the kernel pagetables and dumps the
0006  * information about the used sections of memory to
0007  * /sys/kernel/debug/kernel_page_tables.
0008  *
0009  * Derived from the arm64 implementation:
0010  * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
0011  * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
0012  */
0013 #include <linux/debugfs.h>
0014 #include <linux/fs.h>
0015 #include <linux/hugetlb.h>
0016 #include <linux/io.h>
0017 #include <linux/mm.h>
0018 #include <linux/highmem.h>
0019 #include <linux/ptdump.h>
0020 #include <linux/sched.h>
0021 #include <linux/seq_file.h>
0022 #include <asm/fixmap.h>
0023 #include <linux/const.h>
0024 #include <linux/kasan.h>
0025 #include <asm/page.h>
0026 #include <asm/hugetlb.h>
0027 
0028 #include <mm/mmu_decl.h>
0029 
0030 #include "ptdump.h"
0031 
0032 /*
0033  * To visualise what is happening,
0034  *
0035  *  - PTRS_PER_P** = how many entries there are in the corresponding P**
0036  *  - P**_SHIFT = how many bits of the address we use to index into the
0037  * corresponding P**
0038  *  - P**_SIZE is how much memory we can access through the table - not the
0039  * size of the table itself.
0040  * P**={PGD, PUD, PMD, PTE}
0041  *
0042  *
0043  * Each entry of the PGD points to a PUD. Each entry of a PUD points to a
0044  * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to
0045  * a page.
0046  *
0047  * In the case where there are only 3 levels, the PUD is folded into the
0048  * PGD: every PUD has only one entry which points to the PMD.
0049  *
0050  * The page dumper groups page table entries of the same type into a single
0051  * description. It uses pg_state to track the range information while
0052  * iterating over the PTE entries. When the continuity is broken it then
0053  * dumps out a description of the range - ie PTEs that are virtually contiguous
0054  * with the same PTE flags are chunked together. This is to make it clear how
0055  * different areas of the kernel virtual memory are used.
0056  *
0057  */
0058 struct pg_state {
0059     struct ptdump_state ptdump;
0060     struct seq_file *seq;
0061     const struct addr_marker *marker;
0062     unsigned long start_address;
0063     unsigned long start_pa;
0064     int level;
0065     u64 current_flags;
0066     bool check_wx;
0067     unsigned long wx_pages;
0068 };
0069 
/* Names the region of kernel virtual memory that begins at start_address. */
struct addr_marker {
    unsigned long start_address;    /* first address of the region */
    const char *name;               /* label printed as "---[ name ]---" */
};
0074 
0075 static struct addr_marker address_markers[] = {
0076     { 0,    "Start of kernel VM" },
0077 #ifdef MODULES_VADDR
0078     { 0,    "modules start" },
0079     { 0,    "modules end" },
0080 #endif
0081     { 0,    "vmalloc() Area" },
0082     { 0,    "vmalloc() End" },
0083 #ifdef CONFIG_PPC64
0084     { 0,    "isa I/O start" },
0085     { 0,    "isa I/O end" },
0086     { 0,    "phb I/O start" },
0087     { 0,    "phb I/O end" },
0088     { 0,    "I/O remap start" },
0089     { 0,    "I/O remap end" },
0090     { 0,    "vmemmap start" },
0091 #else
0092     { 0,    "Early I/O remap start" },
0093     { 0,    "Early I/O remap end" },
0094 #ifdef CONFIG_HIGHMEM
0095     { 0,    "Highmem PTEs start" },
0096     { 0,    "Highmem PTEs end" },
0097 #endif
0098     { 0,    "Fixmap start" },
0099     { 0,    "Fixmap end" },
0100 #endif
0101 #ifdef CONFIG_KASAN
0102     { 0,    "kasan shadow mem start" },
0103     { 0,    "kasan shadow mem end" },
0104 #endif
0105     { -1,   NULL },
0106 };
0107 
0108 static struct ptdump_range ptdump_range[] __ro_after_init = {
0109     {TASK_SIZE_MAX, ~0UL},
0110     {0, 0}
0111 };
0112 
/*
 * seq_printf() wrapper that does nothing when @m is NULL, so the same
 * dumping code can run with output disabled (ptdump_check_wx() walks with a
 * NULL seq_file). @m is evaluated exactly once.
 */
#define pt_dump_seq_printf(m, fmt, args...)     \
({                                              \
    struct seq_file *pt_m_ = (m);               \
                                                \
    if (pt_m_)                                  \
        seq_printf(pt_m_, fmt, ##args);         \
})
0118 
/*
 * seq_putc() wrapper that does nothing when @m is NULL.
 * @m is evaluated exactly once.
 */
#define pt_dump_seq_putc(m, c)                  \
({                                              \
    struct seq_file *pt_m_ = (m);               \
                                                \
    if (pt_m_)                                  \
        seq_putc(pt_m_, c);                     \
})
0124 
/*
 * Print @size to @m scaled to the largest binary unit (K, M, G, T, P, E)
 * that divides it exactly, right-aligned in a 9-character field and followed
 * by the unit character and a space. Nothing is printed when @m is NULL.
 */
void pt_dump_size(struct seq_file *m, unsigned long size)
{
    static const char units[] = " KMGTPE";
    const char *unit = units;

    /*
     * Step up one unit for every exact factor of 1024. Zero is divisible
     * by everything, so stop on it explicitly; otherwise a zero size
     * would run to the end of the table and be printed as "0E".
     */
    while (size && !(size & 1023) && unit[1]) {
        size >>= 10;
        unit++;
    }
    pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
}
0137 
0138 static void dump_flag_info(struct pg_state *st, const struct flag_info
0139         *flag, u64 pte, int num)
0140 {
0141     unsigned int i;
0142 
0143     for (i = 0; i < num; i++, flag++) {
0144         const char *s = NULL;
0145         u64 val;
0146 
0147         /* flag not defined so don't check it */
0148         if (flag->mask == 0)
0149             continue;
0150         /* Some 'flags' are actually values */
0151         if (flag->is_val) {
0152             val = pte & flag->val;
0153             if (flag->shift)
0154                 val = val >> flag->shift;
0155             pt_dump_seq_printf(st->seq, "  %s:%llx", flag->set, val);
0156         } else {
0157             if ((pte & flag->mask) == flag->val)
0158                 s = flag->set;
0159             else
0160                 s = flag->clear;
0161             if (s)
0162                 pt_dump_seq_printf(st->seq, "  %s", s);
0163         }
0164         st->current_flags &= ~flag->mask;
0165     }
0166     if (st->current_flags != 0)
0167         pt_dump_seq_printf(st->seq, "  unknown flags:%llx", st->current_flags);
0168 }
0169 
0170 static void dump_addr(struct pg_state *st, unsigned long addr)
0171 {
0172 #ifdef CONFIG_PPC64
0173 #define REG     "0x%016lx"
0174 #else
0175 #define REG     "0x%08lx"
0176 #endif
0177 
0178     pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
0179     pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
0180     pt_dump_size(st->seq, addr - st->start_address);
0181 }
0182 
0183 static void note_prot_wx(struct pg_state *st, unsigned long addr)
0184 {
0185     pte_t pte = __pte(st->current_flags);
0186 
0187     if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx)
0188         return;
0189 
0190     if (!pte_write(pte) || !pte_exec(pte))
0191         return;
0192 
0193     WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
0194           (void *)st->start_address, (void *)st->start_address);
0195 
0196     st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
0197 }
0198 
0199 static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val)
0200 {
0201     u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
0202     u64 pa = val & PTE_RPN_MASK;
0203 
0204     st->level = level;
0205     st->current_flags = flag;
0206     st->start_address = addr;
0207     st->start_pa = pa;
0208 
0209     while (addr >= st->marker[1].start_address) {
0210         st->marker++;
0211         pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
0212     }
0213 }
0214 
0215 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
0216 {
0217     u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
0218     struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
0219 
0220     /* At first no level is set */
0221     if (st->level == -1) {
0222         pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
0223         note_page_update_state(st, addr, level, val);
0224     /*
0225      * Dump the section of virtual memory when:
0226      *   - the PTE flags from one entry to the next differs.
0227      *   - we change levels in the tree.
0228      *   - the address is in a different section of memory and is thus
0229      *   used for a different purpose, regardless of the flags.
0230      */
0231     } else if (flag != st->current_flags || level != st->level ||
0232            addr >= st->marker[1].start_address) {
0233 
0234         /* Check the PTE flags */
0235         if (st->current_flags) {
0236             note_prot_wx(st, addr);
0237             dump_addr(st, addr);
0238 
0239             /* Dump all the flags */
0240             if (pg_level[st->level].flag)
0241                 dump_flag_info(st, pg_level[st->level].flag,
0242                       st->current_flags,
0243                       pg_level[st->level].num);
0244 
0245             pt_dump_seq_putc(st->seq, '\n');
0246         }
0247 
0248         /*
0249          * Address indicates we have passed the end of the
0250          * current section of virtual memory
0251          */
0252         note_page_update_state(st, addr, level, val);
0253     }
0254 }
0255 
0256 static void populate_markers(void)
0257 {
0258     int i = 0;
0259 
0260 #ifdef CONFIG_PPC64
0261     address_markers[i++].start_address = PAGE_OFFSET;
0262 #else
0263     address_markers[i++].start_address = TASK_SIZE;
0264 #endif
0265 #ifdef MODULES_VADDR
0266     address_markers[i++].start_address = MODULES_VADDR;
0267     address_markers[i++].start_address = MODULES_END;
0268 #endif
0269     address_markers[i++].start_address = VMALLOC_START;
0270     address_markers[i++].start_address = VMALLOC_END;
0271 #ifdef CONFIG_PPC64
0272     address_markers[i++].start_address = ISA_IO_BASE;
0273     address_markers[i++].start_address = ISA_IO_END;
0274     address_markers[i++].start_address = PHB_IO_BASE;
0275     address_markers[i++].start_address = PHB_IO_END;
0276     address_markers[i++].start_address = IOREMAP_BASE;
0277     address_markers[i++].start_address = IOREMAP_END;
0278     /* What is the ifdef about? */
0279 #ifdef CONFIG_PPC_BOOK3S_64
0280     address_markers[i++].start_address =  H_VMEMMAP_START;
0281 #else
0282     address_markers[i++].start_address =  VMEMMAP_BASE;
0283 #endif
0284 #else /* !CONFIG_PPC64 */
0285     address_markers[i++].start_address = ioremap_bot;
0286     address_markers[i++].start_address = IOREMAP_TOP;
0287 #ifdef CONFIG_HIGHMEM
0288     address_markers[i++].start_address = PKMAP_BASE;
0289     address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
0290 #endif
0291     address_markers[i++].start_address = FIXADDR_START;
0292     address_markers[i++].start_address = FIXADDR_TOP;
0293 #endif /* CONFIG_PPC64 */
0294 #ifdef CONFIG_KASAN
0295     address_markers[i++].start_address = KASAN_SHADOW_START;
0296     address_markers[i++].start_address = KASAN_SHADOW_END;
0297 #endif
0298 }
0299 
0300 static int ptdump_show(struct seq_file *m, void *v)
0301 {
0302     struct pg_state st = {
0303         .seq = m,
0304         .marker = address_markers,
0305         .level = -1,
0306         .ptdump = {
0307             .note_page = note_page,
0308             .range = ptdump_range,
0309         }
0310     };
0311 
0312     /* Traverse kernel page tables */
0313     ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
0314     return 0;
0315 }
0316 
0317 DEFINE_SHOW_ATTRIBUTE(ptdump);
0318 
0319 static void __init build_pgtable_complete_mask(void)
0320 {
0321     unsigned int i, j;
0322 
0323     for (i = 0; i < ARRAY_SIZE(pg_level); i++)
0324         if (pg_level[i].flag)
0325             for (j = 0; j < pg_level[i].num; j++)
0326                 pg_level[i].mask |= pg_level[i].flag[j].mask;
0327 }
0328 
#ifdef CONFIG_DEBUG_WX
/*
 * Walk init_mm's page tables without producing a dump, counting pages in
 * writable+executable runs, and log whether any were found.
 */
void ptdump_check_wx(void)
{
    struct pg_state st = {
        .ptdump = {
            .note_page = note_page,
            .range = ptdump_range,
        },
        /* No seq_file: the pt_dump_seq_*() helpers print nothing. */
        .seq = NULL,
        /* Single unnamed region followed by the terminator. */
        .marker = (struct addr_marker[]) {
            { 0, NULL},
            { -1, NULL},
        },
        .level = -1,
        .check_wx = true,
    };

    ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);

    if (!st.wx_pages) {
        pr_info("Checked W+X mappings: passed, no W+X pages found\n");
        return;
    }

    pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
            st.wx_pages);
}
#endif
0355 
0356 static int __init ptdump_init(void)
0357 {
0358 #ifdef CONFIG_PPC64
0359     if (!radix_enabled())
0360         ptdump_range[0].start = KERN_VIRT_START;
0361     else
0362         ptdump_range[0].start = PAGE_OFFSET;
0363 
0364     ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD);
0365 #endif
0366 
0367     populate_markers();
0368     build_pgtable_complete_mask();
0369 
0370     if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
0371         debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
0372 
0373     return 0;
0374 }
0375 device_initcall(ptdump_init);