Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * page-types: Tool for querying page flags
0004  *
0005  * Copyright (C) 2009 Intel corporation
0006  *
0007  * Authors: Wu Fengguang <fengguang.wu@intel.com>
0008  */
0009 
0010 #define _FILE_OFFSET_BITS 64
0011 #define _GNU_SOURCE
0012 #include <stdio.h>
0013 #include <stdlib.h>
0014 #include <unistd.h>
0015 #include <stdint.h>
0016 #include <stdarg.h>
0017 #include <string.h>
0018 #include <getopt.h>
0019 #include <limits.h>
0020 #include <assert.h>
0021 #include <ftw.h>
0022 #include <time.h>
0023 #include <setjmp.h>
0024 #include <signal.h>
0025 #include <sys/types.h>
0026 #include <sys/errno.h>
0027 #include <sys/fcntl.h>
0028 #include <sys/mount.h>
0029 #include <sys/statfs.h>
0030 #include <sys/mman.h>
0031 #include "../../include/uapi/linux/magic.h"
0032 #include "../../include/uapi/linux/kernel-page-flags.h"
0033 #include <api/fs/fs.h>
0034 
0035 #ifndef MAX_PATH
0036 # define MAX_PATH 256
0037 #endif
0038 
0039 #ifndef STR
0040 # define _STR(x) #x
0041 # define STR(x) _STR(x)
0042 #endif
0043 
0044 /*
0045  * pagemap kernel ABI bits
0046  */
0047 
0048 #define PM_ENTRY_BYTES      8	/* size of one pagemap entry in bytes */
0049 #define PM_PFRAME_BITS      55	/* width of the low field extracted by PM_PFRAME() */
0050 #define PM_PFRAME_MASK      ((1LL << PM_PFRAME_BITS) - 1)	/* mask selecting bits [0, PM_PFRAME_BITS) */
0051 #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)	/* low field of an entry; read as the PFN by pagemap_pfn() when PM_PRESENT is set */
0052 #define MAX_SWAPFILES_SHIFT 5	/* low bits dropped by PM_SWAP_OFFSET() */
0053 #define PM_SWAP_OFFSET(x)   (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)	/* low field shifted right by MAX_SWAPFILES_SHIFT */
0054 #define PM_SOFT_DIRTY       (1ULL << 55)	/* mapped to KPF_SOFTDIRTY in raw mode */
0055 #define PM_MMAP_EXCLUSIVE   (1ULL << 56)	/* mapped to KPF_MMAP_EXCLUSIVE in raw mode */
0056 #define PM_FILE         (1ULL << 61)	/* mapped to KPF_FILE in raw mode */
0057 #define PM_SWAP         (1ULL << 62)	/* entry is handled by walk_swap(); mapped to KPF_SWAP in raw mode */
0058 #define PM_PRESENT      (1ULL << 63)	/* pagemap_pfn() returns 0 unless this bit is set */
0059 
0060 /*
0061  * kernel page flags
0062  */
0063 
0064 #define KPF_BYTES       8
0065 #define PROC_KPAGEFLAGS     "/proc/kpageflags"
0066 #define PROC_KPAGECOUNT     "/proc/kpagecount"
0067 #define PROC_KPAGECGROUP    "/proc/kpagecgroup"
0068 
0069 #define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
0070 
0071 /* [32-] kernel hacking assistances */
0072 #define KPF_RESERVED        32
0073 #define KPF_MLOCKED     33
0074 #define KPF_MAPPEDTODISK    34
0075 #define KPF_PRIVATE     35
0076 #define KPF_PRIVATE_2       36
0077 #define KPF_OWNER_PRIVATE   37
0078 #define KPF_ARCH        38
0079 #define KPF_UNCACHED        39
0080 #define KPF_SOFTDIRTY       40
0081 #define KPF_ARCH_2      41
0082 
0083 /* [47-] take some arbitrary free slots for expanding overloaded flags
0084  * not part of kernel API
0085  */
0086 #define KPF_ANON_EXCLUSIVE  47
0087 #define KPF_READAHEAD       48
0088 #define KPF_SLOB_FREE       49
0089 #define KPF_SLUB_FROZEN     50
0090 #define KPF_SLUB_DEBUG      51
0091 #define KPF_FILE        61
0092 #define KPF_SWAP        62
0093 #define KPF_MMAP_EXCLUSIVE  63
0094 
0095 #define KPF_ALL_BITS        ((uint64_t)~0ULL)
0096 #define KPF_HACKERS_BITS    (0xffffULL << 32)
0097 #define KPF_OVERLOADED_BITS (0xffffULL << 48)
0098 #define BIT(name)       (1ULL << KPF_##name)
0099 #define BITS_COMPOUND       (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
0100 
0101 static const char * const page_flag_names[] = {
0102     [KPF_LOCKED]        = "L:locked",
0103     [KPF_ERROR]     = "E:error",
0104     [KPF_REFERENCED]    = "R:referenced",
0105     [KPF_UPTODATE]      = "U:uptodate",
0106     [KPF_DIRTY]     = "D:dirty",
0107     [KPF_LRU]       = "l:lru",
0108     [KPF_ACTIVE]        = "A:active",
0109     [KPF_SLAB]      = "S:slab",
0110     [KPF_WRITEBACK]     = "W:writeback",
0111     [KPF_RECLAIM]       = "I:reclaim",
0112     [KPF_BUDDY]     = "B:buddy",
0113 
0114     [KPF_MMAP]      = "M:mmap",
0115     [KPF_ANON]      = "a:anonymous",
0116     [KPF_SWAPCACHE]     = "s:swapcache",
0117     [KPF_SWAPBACKED]    = "b:swapbacked",
0118     [KPF_COMPOUND_HEAD] = "H:compound_head",
0119     [KPF_COMPOUND_TAIL] = "T:compound_tail",
0120     [KPF_HUGE]      = "G:huge",
0121     [KPF_UNEVICTABLE]   = "u:unevictable",
0122     [KPF_HWPOISON]      = "X:hwpoison",
0123     [KPF_NOPAGE]        = "n:nopage",
0124     [KPF_KSM]       = "x:ksm",
0125     [KPF_THP]       = "t:thp",
0126     [KPF_OFFLINE]       = "o:offline",
0127     [KPF_PGTABLE]       = "g:pgtable",
0128     [KPF_ZERO_PAGE]     = "z:zero_page",
0129     [KPF_IDLE]              = "i:idle_page",
0130 
0131     [KPF_RESERVED]      = "r:reserved",
0132     [KPF_MLOCKED]       = "m:mlocked",
0133     [KPF_MAPPEDTODISK]  = "d:mappedtodisk",
0134     [KPF_PRIVATE]       = "P:private",
0135     [KPF_PRIVATE_2]     = "p:private_2",
0136     [KPF_OWNER_PRIVATE] = "O:owner_private",
0137     [KPF_ARCH]      = "h:arch",
0138     [KPF_UNCACHED]      = "c:uncached",
0139     [KPF_SOFTDIRTY]     = "f:softdirty",
0140     [KPF_ARCH_2]        = "H:arch_2",
0141 
0142     [KPF_ANON_EXCLUSIVE]    = "d:anon_exclusive",
0143     [KPF_READAHEAD]     = "I:readahead",
0144     [KPF_SLOB_FREE]     = "P:slob_free",
0145     [KPF_SLUB_FROZEN]   = "A:slub_frozen",
0146     [KPF_SLUB_DEBUG]    = "E:slub_debug",
0147 
0148     [KPF_FILE]      = "F:file",
0149     [KPF_SWAP]      = "w:swap",
0150     [KPF_MMAP_EXCLUSIVE]    = "1:mmap_exclusive",
0151 };
0152 
0153 
0154 /*
0155  * data structures
0156  */
0157 
0158 static int      opt_raw;    /* for kernel developers */
0159 static int      opt_list;   /* list pages (in ranges) */
0160 static int      opt_mark_idle;  /* set accessed bit */
0161 static int      opt_no_summary; /* don't show summary */
0162 static pid_t        opt_pid;    /* process to walk */
0163 const char      *opt_file;  /* file or directory path */
0164 static uint64_t     opt_cgroup; /* cgroup inode */
0165 static int      opt_list_cgroup;/* list page cgroup */
0166 static int      opt_list_mapcnt;/* list page map count */
0167 static const char   *opt_kpageflags;/* kpageflags file to parse */
0168 
0169 #define MAX_ADDR_RANGES 1024
0170 static int      nr_addr_ranges;
0171 static unsigned long    opt_offset[MAX_ADDR_RANGES];
0172 static unsigned long    opt_size[MAX_ADDR_RANGES];
0173 
0174 #define MAX_VMAS    10240
0175 static int      nr_vmas;
0176 static unsigned long    pg_start[MAX_VMAS];
0177 static unsigned long    pg_end[MAX_VMAS];
0178 
0179 #define MAX_BIT_FILTERS 64
0180 static int      nr_bit_filters;
0181 static uint64_t     opt_mask[MAX_BIT_FILTERS];
0182 static uint64_t     opt_bits[MAX_BIT_FILTERS];
0183 
0184 static int      page_size;
0185 
0186 static int      pagemap_fd;
0187 static int      kpageflags_fd;
0188 static int      kpagecount_fd = -1;
0189 static int      kpagecgroup_fd = -1;
0190 static int      page_idle_fd = -1;
0191 
0192 static int      opt_hwpoison;
0193 static int      opt_unpoison;
0194 
0195 static const char   *hwpoison_debug_fs;
0196 static int      hwpoison_inject_fd;
0197 static int      hwpoison_forget_fd;
0198 
/*
 * Parameters of the open-addressed table that groups pages by their
 * flag word (see page_flags[] / nr_pages[]).
 *
 * HASH_KEY() yields the initial probe position: the low HASH_SHIFT bits
 * of the flag word.  The argument is parenthesized so that expression
 * arguments (e.g. "a | b") are masked as a whole.
 */
#define HASH_SHIFT	13
#define HASH_SIZE	(1 << HASH_SHIFT)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_KEY(flags)	((flags) & HASH_MASK)
0203 
0204 static unsigned long    total_pages;
0205 static unsigned long    nr_pages[HASH_SIZE];
0206 static uint64_t     page_flags[HASH_SIZE];
0207 
0208 
0209 /*
0210  * helper functions
0211  */
0212 
0213 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))	/* element count; only meaningful for true arrays, not pointers */
0214 
/*
 * min_t(type, x, y): smaller of x and y after both are converted to
 * 'type'.  Written as a statement expression so each argument is
 * evaluated exactly once.
 */
0215 #define min_t(type, x, y) ({            \
0216     type __min1 = (x);          \
0217     type __min2 = (y);          \
0218     __min1 < __min2 ? __min1 : __min2; })
0219 
/*
 * max_t(type, x, y): larger of x and y after both are converted to
 * 'type'; same single-evaluation property as min_t().
 */
0220 #define max_t(type, x, y) ({            \
0221     type __max1 = (x);          \
0222     type __max2 = (y);          \
0223     __max1 > __max2 ? __max1 : __max2; })
0224 
0225 static unsigned long pages2mb(unsigned long pages)
0226 {
0227     return (pages * page_size) >> 20;
0228 }
0229 
/*
 * Emit a printf-style diagnostic on stderr, then terminate the process
 * with EXIT_FAILURE.  This function does not return.
 *
 * @x:   printf format string
 * @...: arguments consumed by the format
 */
static void fatal(const char *x, ...)
{
	va_list args;

	va_start(args, x);
	(void)vfprintf(stderr, x, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
0239 
/*
 * open(2) wrapper that never returns a failure: on error the reason is
 * reported through perror() (prefixed by the path) and the process
 * exits with EXIT_FAILURE.
 *
 * Returns the new file descriptor.
 */
static int checked_open(const char *pathname, int flags)
{
	const int fd = open(pathname, flags);

	if (fd >= 0)
		return fd;

	perror(pathname);
	exit(EXIT_FAILURE);
}
0251 
0252 /*
0253  * pagemap/kpageflags routines
0254  */
0255 
/*
 * Read up to @count 64-bit entries, starting at entry @index, from @fd
 * into @buf.
 *
 * @name is only used to label the perror() message on a read error.
 * Any failure (index too large to convert to a byte offset, read error,
 * or a byte count that is not a multiple of 8) terminates the process.
 *
 * Returns the number of whole entries actually read, which may be less
 * than @count (0 at end of file).
 */
static unsigned long do_u64_read(int fd, const char *name,
				 uint64_t *buf,
				 unsigned long index,
				 unsigned long count)
{
	long bytes;

	/* index * 8 below must not wrap */
	if (index > ULONG_MAX / 8)
		fatal("index overflow: %lu\n", index);

	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
	if (bytes < 0) {
		perror(name);
		exit(EXIT_FAILURE);
	}
	if (bytes % 8)
		/* bytes is a signed long: print it with %ld, not %lu */
		fatal("partial read: %ld bytes\n", bytes);

	return bytes / 8;
}
0276 
0277 static unsigned long kpageflags_read(uint64_t *buf,
0278                      unsigned long index,
0279                      unsigned long pages)
0280 {
0281     return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
0282 }
0283 
0284 static unsigned long kpagecgroup_read(uint64_t *buf,
0285                       unsigned long index,
0286                       unsigned long pages)
0287 {
0288     if (kpagecgroup_fd < 0)
0289         return pages;
0290 
0291     return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
0292 }
0293 
0294 static unsigned long kpagecount_read(uint64_t *buf,
0295                      unsigned long index,
0296                      unsigned long pages)
0297 {
0298     return kpagecount_fd < 0 ? pages :
0299         do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
0300                 buf, index, pages);
0301 }
0302 
0303 static unsigned long pagemap_read(uint64_t *buf,
0304                   unsigned long index,
0305                   unsigned long pages)
0306 {
0307     return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
0308 }
0309 
0310 static unsigned long pagemap_pfn(uint64_t val)
0311 {
0312     unsigned long pfn;
0313 
0314     if (val & PM_PRESENT)
0315         pfn = PM_PFRAME(val);
0316     else
0317         pfn = 0;
0318 
0319     return pfn;
0320 }
0321 
0322 static unsigned long pagemap_swap_offset(uint64_t val)
0323 {
0324     return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
0325 }
0326 
0327 /*
0328  * page flag names
0329  */
0330 
0331 static char *page_flag_name(uint64_t flags)
0332 {
0333     static char buf[65];
0334     int present;
0335     size_t i, j;
0336 
0337     for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
0338         present = (flags >> i) & 1;
0339         if (!page_flag_names[i]) {
0340             if (present)
0341                 fatal("unknown flag bit %d\n", i);
0342             continue;
0343         }
0344         buf[j++] = present ? page_flag_names[i][0] : '_';
0345     }
0346 
0347     return buf;
0348 }
0349 
0350 static char *page_flag_longname(uint64_t flags)
0351 {
0352     static char buf[1024];
0353     size_t i, n;
0354 
0355     for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
0356         if (!page_flag_names[i])
0357             continue;
0358         if ((flags >> i) & 1)
0359             n += snprintf(buf + n, sizeof(buf) - n, "%s,",
0360                     page_flag_names[i] + 2);
0361     }
0362     if (n)
0363         n--;
0364     buf[n] = '\0';
0365 
0366     return buf;
0367 }
0368 
0369 
0370 /*
0371  * page list and summary
0372  */
0373 
0374 static void show_page_range(unsigned long voffset, unsigned long offset,
0375                 unsigned long size, uint64_t flags,
0376                 uint64_t cgroup, uint64_t mapcnt)
0377 {
0378     static uint64_t      flags0;
0379     static uint64_t      cgroup0;
0380     static uint64_t      mapcnt0;
0381     static unsigned long voff;
0382     static unsigned long index;
0383     static unsigned long count;
0384 
0385     if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
0386         offset == index + count && size && voffset == voff + count) {
0387         count += size;
0388         return;
0389     }
0390 
0391     if (count) {
0392         if (opt_pid)
0393             printf("%lx\t", voff);
0394         if (opt_file)
0395             printf("%lx\t", voff);
0396         if (opt_list_cgroup)
0397             printf("@%llu\t", (unsigned long long)cgroup0);
0398         if (opt_list_mapcnt)
0399             printf("%lu\t", mapcnt0);
0400         printf("%lx\t%lx\t%s\n",
0401                 index, count, page_flag_name(flags0));
0402     }
0403 
0404     flags0 = flags;
0405     cgroup0 = cgroup;
0406     mapcnt0 = mapcnt;
0407     index  = offset;
0408     voff   = voffset;
0409     count  = size;
0410 }
0411 
/*
 * Force out whatever range show_page_range() is still holding by
 * feeding it an all-zero, zero-length range.
 */
static void flush_page_range(void)
{
	const unsigned long none = 0;

	show_page_range(none, none, none, 0, 0, 0);
}
0416 
0417 static void show_page(unsigned long voffset, unsigned long offset,
0418               uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
0419 {
0420     if (opt_pid)
0421         printf("%lx\t", voffset);
0422     if (opt_file)
0423         printf("%lx\t", voffset);
0424     if (opt_list_cgroup)
0425         printf("@%llu\t", (unsigned long long)cgroup);
0426     if (opt_list_mapcnt)
0427         printf("%lu\t", mapcnt);
0428 
0429     printf("%lx\t%s\n", offset, page_flag_name(flags));
0430 }
0431 
0432 static void show_summary(void)
0433 {
0434     size_t i;
0435 
0436     printf("             flags\tpage-count       MB"
0437         "  symbolic-flags\t\t\tlong-symbolic-flags\n");
0438 
0439     for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
0440         if (nr_pages[i])
0441             printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
0442                 (unsigned long long)page_flags[i],
0443                 nr_pages[i],
0444                 pages2mb(nr_pages[i]),
0445                 page_flag_name(page_flags[i]),
0446                 page_flag_longname(page_flags[i]));
0447     }
0448 
0449     printf("             total\t%10lu %8lu\n",
0450             total_pages, pages2mb(total_pages));
0451 }
0452 
0453 
0454 /*
0455  * page flag filters
0456  */
0457 
0458 static int bit_mask_ok(uint64_t flags)
0459 {
0460     int i;
0461 
0462     for (i = 0; i < nr_bit_filters; i++) {
0463         if (opt_bits[i] == KPF_ALL_BITS) {
0464             if ((flags & opt_mask[i]) == 0)
0465                 return 0;
0466         } else {
0467             if ((flags & opt_mask[i]) != opt_bits[i])
0468                 return 0;
0469         }
0470     }
0471 
0472     return 1;
0473 }
0474 
0475 static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
0476 {
0477     /* Anonymous pages overload PG_mappedtodisk */
0478     if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
0479         flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
0480 
0481     /* SLOB/SLUB overload several page flags */
0482     if (flags & BIT(SLAB)) {
0483         if (flags & BIT(PRIVATE))
0484             flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
0485         if (flags & BIT(ACTIVE))
0486             flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
0487         if (flags & BIT(ERROR))
0488             flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
0489     }
0490 
0491     /* PG_reclaim is overloaded as PG_readahead in the read path */
0492     if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
0493         flags ^= BIT(RECLAIM) | BIT(READAHEAD);
0494 
0495     if (pme & PM_SOFT_DIRTY)
0496         flags |= BIT(SOFTDIRTY);
0497     if (pme & PM_FILE)
0498         flags |= BIT(FILE);
0499     if (pme & PM_SWAP)
0500         flags |= BIT(SWAP);
0501     if (pme & PM_MMAP_EXCLUSIVE)
0502         flags |= BIT(MMAP_EXCLUSIVE);
0503 
0504     return flags;
0505 }
0506 
0507 static uint64_t well_known_flags(uint64_t flags)
0508 {
0509     /* hide flags intended only for kernel hacker */
0510     flags &= ~KPF_HACKERS_BITS;
0511 
0512     /* hide non-hugeTLB compound pages */
0513     if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
0514         flags &= ~BITS_COMPOUND;
0515 
0516     return flags;
0517 }
0518 
0519 static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
0520 {
0521     if (opt_raw)
0522         flags = expand_overloaded_flags(flags, pme);
0523     else
0524         flags = well_known_flags(flags);
0525 
0526     return flags;
0527 }
0528 
0529 /*
0530  * page actions
0531  */
0532 
0533 static void prepare_hwpoison_fd(void)
0534 {
0535     char buf[MAX_PATH + 1];
0536 
0537     hwpoison_debug_fs = debugfs__mount();
0538     if (!hwpoison_debug_fs) {
0539         perror("mount debugfs");
0540         exit(EXIT_FAILURE);
0541     }
0542 
0543     if (opt_hwpoison && !hwpoison_inject_fd) {
0544         snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
0545             hwpoison_debug_fs);
0546         hwpoison_inject_fd = checked_open(buf, O_WRONLY);
0547     }
0548 
0549     if (opt_unpoison && !hwpoison_forget_fd) {
0550         snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
0551             hwpoison_debug_fs);
0552         hwpoison_forget_fd = checked_open(buf, O_WRONLY);
0553     }
0554 }
0555 
0556 static int hwpoison_page(unsigned long offset)
0557 {
0558     char buf[100];
0559     int len;
0560 
0561     len = sprintf(buf, "0x%lx\n", offset);
0562     len = write(hwpoison_inject_fd, buf, len);
0563     if (len < 0) {
0564         perror("hwpoison inject");
0565         return len;
0566     }
0567     return 0;
0568 }
0569 
0570 static int unpoison_page(unsigned long offset)
0571 {
0572     char buf[100];
0573     int len;
0574 
0575     len = sprintf(buf, "0x%lx\n", offset);
0576     len = write(hwpoison_forget_fd, buf, len);
0577     if (len < 0) {
0578         perror("hwpoison forget");
0579         return len;
0580     }
0581     return 0;
0582 }
0583 
0584 static int mark_page_idle(unsigned long offset)
0585 {
0586     static unsigned long off;
0587     static uint64_t buf;
0588     int len;
0589 
0590     if ((offset / 64 == off / 64) || buf == 0) {
0591         buf |= 1UL << (offset % 64);
0592         off = offset;
0593         return 0;
0594     }
0595 
0596     len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
0597     if (len < 0) {
0598         perror("mark page idle");
0599         return len;
0600     }
0601 
0602     buf = 1UL << (offset % 64);
0603     off = offset;
0604 
0605     return 0;
0606 }
0607 
0608 /*
0609  * page frame walker
0610  */
0611 
0612 static size_t hash_slot(uint64_t flags)
0613 {
0614     size_t k = HASH_KEY(flags);
0615     size_t i;
0616 
0617     /* Explicitly reserve slot 0 for flags 0: the following logic
0618      * cannot distinguish an unoccupied slot from slot (flags==0).
0619      */
0620     if (flags == 0)
0621         return 0;
0622 
0623     /* search through the remaining (HASH_SIZE-1) slots */
0624     for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
0625         if (!k || k >= ARRAY_SIZE(page_flags))
0626             k = 1;
0627         if (page_flags[k] == 0) {
0628             page_flags[k] = flags;
0629             return k;
0630         }
0631         if (page_flags[k] == flags)
0632             return k;
0633     }
0634 
0635     fatal("hash table full: bump up HASH_SHIFT?\n");
0636     exit(EXIT_FAILURE);
0637 }
0638 
0639 static void add_page(unsigned long voffset, unsigned long offset,
0640              uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
0641              uint64_t pme)
0642 {
0643     flags = kpageflags_flags(flags, pme);
0644 
0645     if (!bit_mask_ok(flags))
0646         return;
0647 
0648     if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
0649         return;
0650 
0651     if (opt_hwpoison)
0652         hwpoison_page(offset);
0653     if (opt_unpoison)
0654         unpoison_page(offset);
0655 
0656     if (opt_mark_idle)
0657         mark_page_idle(offset);
0658 
0659     if (opt_list == 1)
0660         show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
0661     else if (opt_list == 2)
0662         show_page(voffset, offset, flags, cgroup, mapcnt);
0663 
0664     nr_pages[hash_slot(flags)]++;
0665     total_pages++;
0666 }
0667 
0668 #define KPAGEFLAGS_BATCH    (64 << 10)  /* 64k pages */
0669 static void walk_pfn(unsigned long voffset,
0670              unsigned long index,
0671              unsigned long count,
0672              uint64_t pme)
0673 {
0674     uint64_t buf[KPAGEFLAGS_BATCH];
0675     uint64_t cgi[KPAGEFLAGS_BATCH];
0676     uint64_t cnt[KPAGEFLAGS_BATCH];
0677     unsigned long batch;
0678     unsigned long pages;
0679     unsigned long i;
0680 
0681     /*
0682      * kpagecgroup_read() reads only if kpagecgroup were opened, but
0683      * /proc/kpagecgroup might even not exist, so it's better to fill
0684      * them with zeros here.
0685      */
0686     if (count == 1)
0687         cgi[0] = 0;
0688     else
0689         memset(cgi, 0, sizeof cgi);
0690 
0691     while (count) {
0692         batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
0693         pages = kpageflags_read(buf, index, batch);
0694         if (pages == 0)
0695             break;
0696 
0697         if (kpagecgroup_read(cgi, index, pages) != pages)
0698             fatal("kpagecgroup returned fewer pages than expected");
0699 
0700         if (kpagecount_read(cnt, index, pages) != pages)
0701             fatal("kpagecount returned fewer pages than expected");
0702 
0703         for (i = 0; i < pages; i++)
0704             add_page(voffset + i, index + i,
0705                  buf[i], cgi[i], cnt[i], pme);
0706 
0707         index += pages;
0708         count -= pages;
0709     }
0710 }
0711 
0712 static void walk_swap(unsigned long voffset, uint64_t pme)
0713 {
0714     uint64_t flags = kpageflags_flags(0, pme);
0715 
0716     if (!bit_mask_ok(flags))
0717         return;
0718 
0719     if (opt_cgroup)
0720         return;
0721 
0722     if (opt_list == 1)
0723         show_page_range(voffset, pagemap_swap_offset(pme),
0724                 1, flags, 0, 0);
0725     else if (opt_list == 2)
0726         show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
0727 
0728     nr_pages[hash_slot(flags)]++;
0729     total_pages++;
0730 }
0731 
0732 #define PAGEMAP_BATCH   (64 << 10)
0733 static void walk_vma(unsigned long index, unsigned long count)
0734 {
0735     uint64_t buf[PAGEMAP_BATCH];
0736     unsigned long batch;
0737     unsigned long pages;
0738     unsigned long pfn;
0739     unsigned long i;
0740 
0741     while (count) {
0742         batch = min_t(unsigned long, count, PAGEMAP_BATCH);
0743         pages = pagemap_read(buf, index, batch);
0744         if (pages == 0)
0745             break;
0746 
0747         for (i = 0; i < pages; i++) {
0748             pfn = pagemap_pfn(buf[i]);
0749             if (pfn)
0750                 walk_pfn(index + i, pfn, 1, buf[i]);
0751             if (buf[i] & PM_SWAP)
0752                 walk_swap(index + i, buf[i]);
0753         }
0754 
0755         index += pages;
0756         count -= pages;
0757     }
0758 }
0759 
0760 static void walk_task(unsigned long index, unsigned long count)
0761 {
0762     const unsigned long end = index + count;
0763     unsigned long start;
0764     int i = 0;
0765 
0766     while (index < end) {
0767 
0768         while (pg_end[i] <= index)
0769             if (++i >= nr_vmas)
0770                 return;
0771         if (pg_start[i] >= end)
0772             return;
0773 
0774         start = max_t(unsigned long, pg_start[i], index);
0775         index = min_t(unsigned long, pg_end[i], end);
0776 
0777         assert(start < index);
0778         walk_vma(start, index - start);
0779     }
0780 }
0781 
0782 static void add_addr_range(unsigned long offset, unsigned long size)
0783 {
0784     if (nr_addr_ranges >= MAX_ADDR_RANGES)
0785         fatal("too many addr ranges\n");
0786 
0787     opt_offset[nr_addr_ranges] = offset;
0788     opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
0789     nr_addr_ranges++;
0790 }
0791 
0792 static void walk_addr_ranges(void)
0793 {
0794     int i;
0795 
0796     kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
0797 
0798     if (!nr_addr_ranges)
0799         add_addr_range(0, ULONG_MAX);
0800 
0801     for (i = 0; i < nr_addr_ranges; i++)
0802         if (!opt_pid)
0803             walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
0804         else
0805             walk_task(opt_offset[i], opt_size[i]);
0806 
0807     if (opt_mark_idle)
0808         mark_page_idle(0);
0809 
0810     close(kpageflags_fd);
0811 }
0812 
0813 
0814 /*
0815  * user interface
0816  */
0817 
0818 static const char *page_flag_type(uint64_t flag)
0819 {
0820     if (flag & KPF_HACKERS_BITS)
0821         return "(r)";
0822     if (flag & KPF_OVERLOADED_BITS)
0823         return "(o)";
0824     return "   ";
0825 }
0826 
0827 static void usage(void)
0828 {
0829     size_t i, j;
0830 
0831     printf(
0832 "page-types [options]\n"
0833 "            -r|--raw                   Raw mode, for kernel developers\n"
0834 "            -d|--describe flags        Describe flags\n"
0835 "            -a|--addr    addr-spec     Walk a range of pages\n"
0836 "            -b|--bits    bits-spec     Walk pages with specified bits\n"
0837 "            -c|--cgroup  path|@inode   Walk pages within memory cgroup\n"
0838 "            -p|--pid     pid           Walk process address space\n"
0839 "            -f|--file    filename      Walk file address space\n"
0840 "            -i|--mark-idle             Mark pages idle\n"
0841 "            -l|--list                  Show page details in ranges\n"
0842 "            -L|--list-each             Show page details one by one\n"
0843 "            -C|--list-cgroup           Show cgroup inode for pages\n"
0844 "            -M|--list-mapcnt           Show page map count\n"
0845 "            -N|--no-summary            Don't show summary info\n"
0846 "            -X|--hwpoison              hwpoison pages\n"
0847 "            -x|--unpoison              unpoison pages\n"
0848 "            -F|--kpageflags filename   kpageflags file to parse\n"
0849 "            -h|--help                  Show this usage message\n"
0850 "flags:\n"
0851 "            0x10                       bitfield format, e.g.\n"
0852 "            anon                       bit-name, e.g.\n"
0853 "            0x10,anon                  comma-separated list, e.g.\n"
0854 "addr-spec:\n"
0855 "            N                          one page at offset N (unit: pages)\n"
0856 "            N+M                        pages range from N to N+M-1\n"
0857 "            N,M                        pages range from N to M-1\n"
0858 "            N,                         pages range from N to end\n"
0859 "            ,M                         pages range from 0 to M-1\n"
0860 "bits-spec:\n"
0861 "            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
0862 "            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
0863 "            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
0864 "            =bit1,bit2                 flags == (bit1|bit2)\n"
0865 "bit-names:\n"
0866     );
0867 
0868     for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
0869         if (!page_flag_names[i])
0870             continue;
0871         printf("%16s%s", page_flag_names[i] + 2,
0872                  page_flag_type(1ULL << i));
0873         if (++j > 3) {
0874             j = 0;
0875             putchar('\n');
0876         }
0877     }
0878     printf("\n                                   "
0879         "(r) raw mode bits  (o) overloaded bits\n");
0880 }
0881 
/*
 * Parse the leading number of @str (decimal, octal with a leading 0,
 * or hex with a leading 0x).  Parsing stops at the first character that
 * is not part of the number, so callers may pass strings with trailing
 * separators.
 *
 * strtoull() is used rather than strtoll() so that the full unsigned
 * 64-bit range is accepted; strtoll() saturates at LLONG_MAX, which
 * makes a mask with bit 63 set impossible to enter.
 *
 * A result of 0 from a string that does not start with '0' is treated
 * as "not a number" and terminates the process.
 */
static unsigned long long parse_number(const char *str)
{
	unsigned long long n;

	n = strtoull(str, NULL, 0);

	if (n == 0 && str[0] != '0')
		fatal("invalid name or number: %s\n", str);

	return n;
}
0893 
0894 static void parse_pid(const char *str)
0895 {
0896     FILE *file;
0897     char buf[5000];
0898 
0899     opt_pid = parse_number(str);
0900 
0901     sprintf(buf, "/proc/%d/pagemap", opt_pid);
0902     pagemap_fd = checked_open(buf, O_RDONLY);
0903 
0904     sprintf(buf, "/proc/%d/maps", opt_pid);
0905     file = fopen(buf, "r");
0906     if (!file) {
0907         perror(buf);
0908         exit(EXIT_FAILURE);
0909     }
0910 
0911     while (fgets(buf, sizeof(buf), file) != NULL) {
0912         unsigned long vm_start;
0913         unsigned long vm_end;
0914         unsigned long long pgoff;
0915         int major, minor;
0916         char r, w, x, s;
0917         unsigned long ino;
0918         int n;
0919 
0920         n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
0921                &vm_start,
0922                &vm_end,
0923                &r, &w, &x, &s,
0924                &pgoff,
0925                &major, &minor,
0926                &ino);
0927         if (n < 10) {
0928             fprintf(stderr, "unexpected line: %s\n", buf);
0929             continue;
0930         }
0931         pg_start[nr_vmas] = vm_start / page_size;
0932         pg_end[nr_vmas] = vm_end / page_size;
0933         if (++nr_vmas >= MAX_VMAS) {
0934             fprintf(stderr, "too many VMAs\n");
0935             break;
0936         }
0937     }
0938     fclose(file);
0939 }
0940 
0941 static void show_file(const char *name, const struct stat *st)
0942 {
0943     unsigned long long size = st->st_size;
0944     char atime[64], mtime[64];
0945     long now = time(NULL);
0946 
0947     printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
0948             name, (unsigned)st->st_ino,
0949             size, (size + page_size - 1) / page_size);
0950 
0951     strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
0952     strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
0953 
0954     printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
0955             mtime, now - st->st_mtime,
0956             atime, now - st->st_atime);
0957 }
0958 
0959 static sigjmp_buf sigbus_jmp;
0960 
0961 static void * volatile sigbus_addr;
0962 
0963 static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
0964 {
0965     (void)sig;
0966     (void)ucontex;
0967     sigbus_addr = info ? info->si_addr : NULL;
0968     siglongjmp(sigbus_jmp, 1);
0969 }
0970 
0971 static struct sigaction sigbus_action = {
0972     .sa_sigaction = sigbus_handler,
0973     .sa_flags = SA_SIGINFO,
0974 };
0975 
0976 static void walk_file_range(const char *name, int fd,
0977                 unsigned long off, unsigned long end)
0978 {
0979     uint8_t vec[PAGEMAP_BATCH];
0980     uint64_t buf[PAGEMAP_BATCH], flags;
0981     uint64_t cgroup = 0;
0982     uint64_t mapcnt = 0;
0983     unsigned long nr_pages, pfn, i;
0984     ssize_t len;
0985     void *ptr;
0986     int first = 1;
0987 
0988     for (; off < end; off += len) {
0989         nr_pages = (end - off + page_size - 1) / page_size;
0990         if (nr_pages > PAGEMAP_BATCH)
0991             nr_pages = PAGEMAP_BATCH;
0992         len = nr_pages * page_size;
0993 
0994         ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
0995         if (ptr == MAP_FAILED)
0996             fatal("mmap failed: %s", name);
0997 
0998         /* determine cached pages */
0999         if (mincore(ptr, len, vec))
1000             fatal("mincore failed: %s", name);
1001 
1002         /* turn off readahead */
1003         if (madvise(ptr, len, MADV_RANDOM))
1004             fatal("madvice failed: %s", name);
1005 
1006         if (sigsetjmp(sigbus_jmp, 1)) {
1007             end = off + sigbus_addr ? sigbus_addr - ptr : 0;
1008             fprintf(stderr, "got sigbus at offset %lld: %s\n",
1009                     (long long)end, name);
1010             goto got_sigbus;
1011         }
1012 
1013         /* populate ptes */
1014         for (i = 0; i < nr_pages ; i++) {
1015             if (vec[i] & 1)
1016                 (void)*(volatile int *)(ptr + i * page_size);
1017         }
1018 got_sigbus:
1019 
1020         /* turn off harvesting reference bits */
1021         if (madvise(ptr, len, MADV_SEQUENTIAL))
1022             fatal("madvice failed: %s", name);
1023 
1024         if (pagemap_read(buf, (unsigned long)ptr / page_size,
1025                     nr_pages) != nr_pages)
1026             fatal("cannot read pagemap");
1027 
1028         munmap(ptr, len);
1029 
1030         for (i = 0; i < nr_pages; i++) {
1031             pfn = pagemap_pfn(buf[i]);
1032             if (!pfn)
1033                 continue;
1034             if (!kpageflags_read(&flags, pfn, 1))
1035                 continue;
1036             if (!kpagecgroup_read(&cgroup, pfn, 1))
1037                 fatal("kpagecgroup_read failed");
1038             if (!kpagecount_read(&mapcnt, pfn, 1))
1039                 fatal("kpagecount_read failed");
1040             if (first && opt_list) {
1041                 first = 0;
1042                 flush_page_range();
1043             }
1044             add_page(off / page_size + i, pfn,
1045                  flags, cgroup, mapcnt, buf[i]);
1046         }
1047     }
1048 }
1049 
1050 static void walk_file(const char *name, const struct stat *st)
1051 {
1052     int i;
1053     int fd;
1054 
1055     fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
1056 
1057     if (!nr_addr_ranges)
1058         add_addr_range(0, st->st_size / page_size);
1059 
1060     for (i = 0; i < nr_addr_ranges; i++)
1061         walk_file_range(name, fd, opt_offset[i] * page_size,
1062                 (opt_offset[i] + opt_size[i]) * page_size);
1063 
1064     close(fd);
1065 }
1066 
/*
 * nftw() callback: walk regular files and report unreadable directories.
 *
 * @name: path of the visited entry
 * @st:   stat information of the entry
 * @type: nftw() type flag; only FTW_F and FTW_DNR are acted upon
 * @f:    unused
 *
 * Always returns 0.
 */
int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
{
    (void)f;

    if (type == FTW_F) {
        if (S_ISREG(st->st_mode))
            walk_file(name, st);
    } else if (type == FTW_DNR) {
        fprintf(stderr, "cannot read dir: %s\n", name);
    }

    return 0;
}
1081 
1082 struct stat st;
1083 
1084 static void walk_page_cache(void)
1085 {
1086     kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
1087     pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
1088     sigaction(SIGBUS, &sigbus_action, NULL);
1089 
1090     if (stat(opt_file, &st))
1091         fatal("stat failed: %s\n", opt_file);
1092 
1093     if (S_ISREG(st.st_mode)) {
1094         walk_file(opt_file, &st);
1095     } else if (S_ISDIR(st.st_mode)) {
1096         /* do not follow symlinks and mountpoints */
1097         if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
1098             fatal("nftw failed: %s\n", opt_file);
1099     } else
1100         fatal("unhandled file type: %s\n", opt_file);
1101 
1102     close(kpageflags_fd);
1103     close(pagemap_fd);
1104     signal(SIGBUS, SIG_DFL);
1105 }
1106 
1107 static void parse_file(const char *name)
1108 {
1109     opt_file = name;
1110 }
1111 
1112 static void parse_cgroup(const char *path)
1113 {
1114     if (path[0] == '@') {
1115         opt_cgroup = parse_number(path + 1);
1116         return;
1117     }
1118 
1119     struct stat st;
1120 
1121     if (stat(path, &st))
1122         fatal("stat failed: %s: %m\n", path);
1123 
1124     if (!S_ISDIR(st.st_mode))
1125         fatal("cgroup supposed to be a directory: %s\n", path);
1126 
1127     opt_cgroup = st.st_ino;
1128 }
1129 
/*
 * Handle the address-range option and register the range through
 * add_addr_range().
 *
 * Accepted forms of @optarg (the first ',' is used as separator; if there
 * is none, the first '+'):
 *   N        offset N, size 1
 *   N,M      offset N, size M - N (fatal if M < N)
 *   N+M      offset N, size M
 *   N, / N+  offset N, size ULONG_MAX
 *   ,M / +M  offset 0, size M
 */
static void parse_addr_range(const char *optarg)
{
    unsigned long offset;
    unsigned long size;
    char *sep;

    sep = strchr(optarg, ',');
    if (sep == NULL)
        sep = strchr(optarg, '+');

    if (sep == NULL) {
        /* no separator: a single unit at the given offset */
        offset = parse_number(optarg);
        size   = 1;
    } else if (sep == optarg) {
        /* leading separator: range starts at zero */
        offset = 0;
        size   = parse_number(sep + 1);
    } else {
        offset = parse_number(optarg);
        if (sep[1] == '\0') {
            /* nothing after the separator: open-ended range */
            size = ULONG_MAX;
        } else {
            size = parse_number(sep + 1);
            if (*sep == ',') {
                /* "start,end" form: convert end to a length */
                if (size < offset)
                    fatal("invalid range: %lu,%lu\n",
                            offset, size);
                size -= offset;
            }
        }
    }

    add_addr_range(offset, size);
}
1163 
1164 static void add_bits_filter(uint64_t mask, uint64_t bits)
1165 {
1166     if (nr_bit_filters >= MAX_BIT_FILTERS)
1167         fatal("too much bit filters\n");
1168 
1169     opt_mask[nr_bit_filters] = mask;
1170     opt_bits[nr_bit_filters] = bits;
1171     nr_bit_filters++;
1172 }
1173 
1174 static uint64_t parse_flag_name(const char *str, int len)
1175 {
1176     size_t i;
1177 
1178     if (!*str || !len)
1179         return 0;
1180 
1181     if (len <= 8 && !strncmp(str, "compound", len))
1182         return BITS_COMPOUND;
1183 
1184     for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
1185         if (!page_flag_names[i])
1186             continue;
1187         if (!strncmp(str, page_flag_names[i] + 2, len))
1188             return 1ULL << i;
1189     }
1190 
1191     return parse_number(str);
1192 }
1193 
/*
 * Parse a comma-separated list of flag names and OR their bits together.
 *
 * @str: the list; parsing stops at the first '=' or at the terminating NUL
 * @all: controls names prefixed with '~': when zero they are skipped,
 *       when non-zero the '~' is dropped and the name is included
 *
 * Returns the combined mask of all names that were included.
 */
static uint64_t parse_flag_names(const char *str, int all)
{
    uint64_t   flags = 0;
    const char *p;

    for (p = str; ; p++) {
        if (*p != ',' && *p != '=' && *p != '\0')
            continue;

        /* [str, p) is now one complete name */
        if (*str != '~') {
            flags |= parse_flag_name(str, p - str);
        } else if (all) {
            str++;          /* step over the '~' */
            if (*str)
                flags |= parse_flag_name(str, p - str);
        }

        if (*p != ',')
            break;
        str = p + 1;
    }

    return flags;
}
1212 
1213 static void parse_bits_mask(const char *optarg)
1214 {
1215     uint64_t mask;
1216     uint64_t bits;
1217     const char *p;
1218 
1219     p = strchr(optarg, '=');
1220     if (p == optarg) {
1221         mask = KPF_ALL_BITS;
1222         bits = parse_flag_names(p + 1, 0);
1223     } else if (p) {
1224         mask = parse_flag_names(optarg, 0);
1225         bits = parse_flag_names(p + 1, 0);
1226     } else if (strchr(optarg, '~')) {
1227         mask = parse_flag_names(optarg, 1);
1228         bits = parse_flag_names(optarg, 0);
1229     } else {
1230         mask = parse_flag_names(optarg, 0);
1231         bits = KPF_ALL_BITS;
1232     }
1233 
1234     add_bits_filter(mask, bits);
1235 }
1236 
1237 static void parse_kpageflags(const char *name)
1238 {
1239     opt_kpageflags = name;
1240 }
1241 
/*
 * Handle the describe option: parse the flag list in @optarg and print
 * one line with the resulting mask in hex followed by its short and long
 * textual forms.
 */
static void describe_flags(const char *optarg)
{
    uint64_t flags;

    flags = parse_flag_names(optarg, 0);

    printf("0x%016llx\t%s\t%s\n",
           (unsigned long long)flags,
           page_flag_name(flags),
           page_flag_longname(flags));
}
1251 
/*
 * Long options accepted by main(). Each entry maps to the short option
 * character in its last field; the has_arg field must agree with the
 * short option string passed to getopt_long() ("x:" means an argument is
 * required).
 */
static const struct option opts[] = {
    { "raw"       , no_argument,       NULL, 'r' },
    { "pid"       , required_argument, NULL, 'p' },
    { "file"      , required_argument, NULL, 'f' },
    { "addr"      , required_argument, NULL, 'a' },
    { "bits"      , required_argument, NULL, 'b' },
    { "cgroup"    , required_argument, NULL, 'c' },
    { "describe"  , required_argument, NULL, 'd' },
    { "mark-idle" , no_argument,       NULL, 'i' },
    { "list"      , no_argument,       NULL, 'l' },
    { "list-each" , no_argument,       NULL, 'L' },
    { "list-cgroup", no_argument,      NULL, 'C' },
    { "list-mapcnt", no_argument,      NULL, 'M' },
    { "no-summary", no_argument,       NULL, 'N' },
    { "hwpoison"  , no_argument,       NULL, 'X' },
    { "unpoison"  , no_argument,       NULL, 'x' },
    /* takes a file name, like the short form "F:" */
    { "kpageflags", required_argument, NULL, 'F' },
    { "help"      , no_argument,       NULL, 'h' },
    { NULL        , 0, NULL, 0 }
};
1272 
1273 int main(int argc, char *argv[])
1274 {
1275     int c;
1276 
1277     page_size = getpagesize();
1278 
1279     while ((c = getopt_long(argc, argv,
1280                 "rp:f:a:b:d:c:CilLMNXxF:h",
1281                 opts, NULL)) != -1) {
1282         switch (c) {
1283         case 'r':
1284             opt_raw = 1;
1285             break;
1286         case 'p':
1287             parse_pid(optarg);
1288             break;
1289         case 'f':
1290             parse_file(optarg);
1291             break;
1292         case 'a':
1293             parse_addr_range(optarg);
1294             break;
1295         case 'b':
1296             parse_bits_mask(optarg);
1297             break;
1298         case 'c':
1299             parse_cgroup(optarg);
1300             break;
1301         case 'C':
1302             opt_list_cgroup = 1;
1303             break;
1304         case 'd':
1305             describe_flags(optarg);
1306             exit(0);
1307         case 'i':
1308             opt_mark_idle = 1;
1309             break;
1310         case 'l':
1311             opt_list = 1;
1312             break;
1313         case 'L':
1314             opt_list = 2;
1315             break;
1316         case 'M':
1317             opt_list_mapcnt = 1;
1318             break;
1319         case 'N':
1320             opt_no_summary = 1;
1321             break;
1322         case 'X':
1323             opt_hwpoison = 1;
1324             prepare_hwpoison_fd();
1325             break;
1326         case 'x':
1327             opt_unpoison = 1;
1328             prepare_hwpoison_fd();
1329             break;
1330         case 'F':
1331             parse_kpageflags(optarg);
1332             break;
1333         case 'h':
1334             usage();
1335             exit(0);
1336         default:
1337             usage();
1338             exit(1);
1339         }
1340     }
1341 
1342     if (!opt_kpageflags)
1343         opt_kpageflags = PROC_KPAGEFLAGS;
1344 
1345     if (opt_cgroup || opt_list_cgroup)
1346         kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
1347 
1348     if (opt_list && opt_list_mapcnt)
1349         kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
1350 
1351     if (opt_mark_idle)
1352         page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
1353 
1354     if (opt_list && opt_pid)
1355         printf("voffset\t");
1356     if (opt_list && opt_file)
1357         printf("foffset\t");
1358     if (opt_list && opt_list_cgroup)
1359         printf("cgroup\t");
1360     if (opt_list && opt_list_mapcnt)
1361         printf("map-cnt\t");
1362 
1363     if (opt_list == 1)
1364         printf("offset\tlen\tflags\n");
1365     if (opt_list == 2)
1366         printf("offset\tflags\n");
1367 
1368     if (opt_file)
1369         walk_page_cache();
1370     else
1371         walk_addr_ranges();
1372 
1373     if (opt_list == 1)
1374         flush_page_range();
1375 
1376     if (opt_no_summary)
1377         return 0;
1378 
1379     if (opt_list)
1380         printf("\n\n");
1381 
1382     if (opt_file) {
1383         show_file(opt_file, &st);
1384         printf("\n");
1385     }
1386 
1387     show_summary();
1388 
1389     if (opt_list_mapcnt)
1390         close(kpagecount_fd);
1391 
1392     if (page_idle_fd >= 0)
1393         close(page_idle_fd);
1394 
1395     return 0;
1396 }