0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
0004  *
0005  * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
0006  *          Suresh B Siddha <suresh.b.siddha@intel.com>
0007  *
0008  * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
0009  *
0010  * Basic principles:
0011  *
0012  * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
0013  * the kernel to set one of a handful of 'caching type' attributes for physical
0014  * memory ranges: uncached, write-combining, write-through, write-protected,
0015  * and the most commonly used and default attribute: write-back caching.
0016  *
0017  * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
0018  * a hardware interface to enumerate a limited number of physical memory ranges
0019  * and set their caching attributes explicitly, programmed into the CPU via MSRs.
0020  * Even modern CPUs have MTRRs enabled - but these are typically not touched
0021  * by the kernel or by user-space (such as the X server); we rely on PAT for any
0022  * additional cache attribute logic.
0023  *
0024  * PAT doesn't work via explicit memory ranges, but uses page table entries to add
0025  * cache attribute information to the mapped memory range: there are 3 bits used,
0026  * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
0027  * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
0028  *
0029  * ( There's a metric ton of finer details, such as compatibility with CPU quirks
0030  *   that only support 4 types of PAT entries, and interaction with MTRRs, see
0031  *   below for details. )
0032  */
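/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * the three PTE bits named above form a 3-bit index, PAT:PCD:PWT, that
 * selects one of the eight byte-sized slots of MSR_IA32_CR_PAT. A minimal
 * helper showing the arithmetic, assuming 4K PTEs and the _PAGE_* bit
 * definitions from <asm/pgtable_types.h> (for 2M/1G entries the PAT bit
 * lives in _PAGE_PAT_LARGE instead):
 */
static inline unsigned int example_pte_flags_to_pat_slot(pteval_t flags)
{
	unsigned int slot = 0;

	if (flags & _PAGE_PWT)
		slot |= 1;	/* bit 0 of the index */
	if (flags & _PAGE_PCD)
		slot |= 2;	/* bit 1 of the index */
	if (flags & _PAGE_PAT)
		slot |= 4;	/* bit 2 of the index */

	return slot;		/* 0..7, one slot per byte of the PAT MSR */
}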
0033 
0034 #include <linux/seq_file.h>
0035 #include <linux/memblock.h>
0036 #include <linux/debugfs.h>
0037 #include <linux/ioport.h>
0038 #include <linux/kernel.h>
0039 #include <linux/pfn_t.h>
0040 #include <linux/slab.h>
0041 #include <linux/mm.h>
0042 #include <linux/fs.h>
0043 #include <linux/rbtree.h>
0044 
0045 #include <asm/cacheflush.h>
0046 #include <asm/processor.h>
0047 #include <asm/tlbflush.h>
0048 #include <asm/x86_init.h>
0049 #include <asm/fcntl.h>
0050 #include <asm/e820/api.h>
0051 #include <asm/mtrr.h>
0052 #include <asm/page.h>
0053 #include <asm/msr.h>
0054 #include <asm/memtype.h>
0055 #include <asm/io.h>
0056 
0057 #include "memtype.h"
0058 #include "../mm_internal.h"
0059 
0060 #undef pr_fmt
0061 #define pr_fmt(fmt) "" fmt
0062 
0063 static bool __read_mostly pat_bp_initialized;
0064 static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
0065 static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT);
0066 static bool __read_mostly pat_bp_enabled;
0067 static bool __read_mostly pat_cm_initialized;
0068 
0069 /*
0070  * PAT support is enabled by default, but can be disabled for
0071  * various user-requested or hardware-forced reasons:
0072  */
0073 void pat_disable(const char *msg_reason)
0074 {
0075     if (pat_disabled)
0076         return;
0077 
0078     if (pat_bp_initialized) {
0079         WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
0080         return;
0081     }
0082 
0083     pat_disabled = true;
0084     pr_info("x86/PAT: %s\n", msg_reason);
0085 }
0086 
0087 static int __init nopat(char *str)
0088 {
0089     pat_disable("PAT support disabled via boot option.");
0090     pat_force_disabled = true;
0091     return 0;
0092 }
0093 early_param("nopat", nopat);
0094 
0095 bool pat_enabled(void)
0096 {
0097     return pat_bp_enabled;
0098 }
0099 EXPORT_SYMBOL_GPL(pat_enabled);
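/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * code that depends on write-combining typically consults pat_enabled()
 * and makes the uncached fallback explicit. The helper name and the
 * base/size parameters are hypothetical.
 */
static __maybe_unused void __iomem *example_map_frame_buffer(resource_size_t base,
							      resource_size_t size)
{
	if (pat_enabled())
		return ioremap_wc(base, size);	/* WC via the PAT slot set up below */

	return ioremap(base, size);		/* UC- fallback when PAT is off */
}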
0100 
0101 int pat_debug_enable;
0102 
0103 static int __init pat_debug_setup(char *str)
0104 {
0105     pat_debug_enable = 1;
0106     return 1;
0107 }
0108 __setup("debugpat", pat_debug_setup);
0109 
0110 #ifdef CONFIG_X86_PAT
0111 /*
0112  * X86 PAT uses page flags arch_1 and uncached together to keep track of
0113  * memory type of pages that have backing page struct.
0114  *
0115  * X86 PAT supports 4 different memory types:
0116  *  - _PAGE_CACHE_MODE_WB
0117  *  - _PAGE_CACHE_MODE_WC
0118  *  - _PAGE_CACHE_MODE_UC_MINUS
0119  *  - _PAGE_CACHE_MODE_WT
0120  *
0121  * _PAGE_CACHE_MODE_WB is the default type.
0122  */
0123 
0124 #define _PGMT_WB        0
0125 #define _PGMT_WC        (1UL << PG_arch_1)
0126 #define _PGMT_UC_MINUS      (1UL << PG_uncached)
0127 #define _PGMT_WT        (1UL << PG_uncached | 1UL << PG_arch_1)
0128 #define _PGMT_MASK      (1UL << PG_uncached | 1UL << PG_arch_1)
0129 #define _PGMT_CLEAR_MASK    (~_PGMT_MASK)
0130 
0131 static inline enum page_cache_mode get_page_memtype(struct page *pg)
0132 {
0133     unsigned long pg_flags = pg->flags & _PGMT_MASK;
0134 
0135     if (pg_flags == _PGMT_WB)
0136         return _PAGE_CACHE_MODE_WB;
0137     else if (pg_flags == _PGMT_WC)
0138         return _PAGE_CACHE_MODE_WC;
0139     else if (pg_flags == _PGMT_UC_MINUS)
0140         return _PAGE_CACHE_MODE_UC_MINUS;
0141     else
0142         return _PAGE_CACHE_MODE_WT;
0143 }
0144 
0145 static inline void set_page_memtype(struct page *pg,
0146                     enum page_cache_mode memtype)
0147 {
0148     unsigned long memtype_flags;
0149     unsigned long old_flags;
0150     unsigned long new_flags;
0151 
0152     switch (memtype) {
0153     case _PAGE_CACHE_MODE_WC:
0154         memtype_flags = _PGMT_WC;
0155         break;
0156     case _PAGE_CACHE_MODE_UC_MINUS:
0157         memtype_flags = _PGMT_UC_MINUS;
0158         break;
0159     case _PAGE_CACHE_MODE_WT:
0160         memtype_flags = _PGMT_WT;
0161         break;
0162     case _PAGE_CACHE_MODE_WB:
0163     default:
0164         memtype_flags = _PGMT_WB;
0165         break;
0166     }
0167 
0168     do {
0169         old_flags = pg->flags;
0170         new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
0171     } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
0172 }
0173 #else
0174 static inline enum page_cache_mode get_page_memtype(struct page *pg)
0175 {
0176     return -1;
0177 }
0178 static inline void set_page_memtype(struct page *pg,
0179                     enum page_cache_mode memtype)
0180 {
0181 }
0182 #endif
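/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * a round trip through the helpers above, assuming CONFIG_X86_PAT. Only
 * two page flags are consumed, so exactly the four types listed above can
 * be tracked per page. The page pointer is assumed to be owned by the
 * caller.
 */
static __maybe_unused void example_page_memtype_roundtrip(struct page *pg)
{
	/* Mark the page write-combining: sets PG_arch_1, clears PG_uncached. */
	set_page_memtype(pg, _PAGE_CACHE_MODE_WC);

	/* Reading it back decodes the same two flag bits. */
	WARN_ON(get_page_memtype(pg) != _PAGE_CACHE_MODE_WC);

	/* Restore the default: WB is encoded as "both flags clear". */
	set_page_memtype(pg, _PAGE_CACHE_MODE_WB);
}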
0183 
0184 enum {
0185     PAT_UC = 0,     /* uncached */
0186     PAT_WC = 1,     /* Write combining */
0187     PAT_WT = 4,     /* Write Through */
0188     PAT_WP = 5,     /* Write Protected */
0189     PAT_WB = 6,     /* Write Back (default) */
0190     PAT_UC_MINUS = 7,   /* UC, but can be overridden by MTRR */
0191 };
0192 
0193 #define CM(c) (_PAGE_CACHE_MODE_ ## c)
0194 
0195 static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
0196 {
0197     enum page_cache_mode cache;
0198     char *cache_mode;
0199 
0200     switch (pat_val) {
0201     case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
0202     case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
0203     case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
0204     case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
0205     case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
0206     case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
0207     default:           cache = CM(WB);       cache_mode = "WB  "; break;
0208     }
0209 
0210     memcpy(msg, cache_mode, 4);
0211 
0212     return cache;
0213 }
0214 
0215 #undef CM
0216 
0217 /*
0218  * Update the cache mode to pgprot translation tables according to PAT
0219  * configuration.
0220  * Using lower indices is preferred, so we start with highest index.
0221  */
0222 static void __init_cache_modes(u64 pat)
0223 {
0224     enum page_cache_mode cache;
0225     char pat_msg[33];
0226     int i;
0227 
0228     WARN_ON_ONCE(pat_cm_initialized);
0229 
0230     pat_msg[32] = 0;
0231     for (i = 7; i >= 0; i--) {
0232         cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
0233                        pat_msg + 4 * i);
0234         update_cache_mode_entry(i, cache);
0235     }
0236     pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
0237 
0238     pat_cm_initialized = true;
0239 }
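/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * decoding a raw MSR_IA32_CR_PAT value the same way __init_cache_modes()
 * does - one byte per slot, with the low 3 bits of each byte holding the
 * PAT_* encoding. The value used here is the architectural power-on/BIOS
 * default.
 */
static __maybe_unused void example_decode_pat_msr(void)
{
	u64 pat = 0x0007040600070406ULL;	/* WB, WT, UC-, UC, repeated */
	int i;

	for (i = 0; i < 8; i++)
		pr_info("PAT slot %d holds encoding %llu\n",
			i, (pat >> (i * 8)) & 7);
	/* Prints 6 (WB), 4 (WT), 7 (UC-), 0 (UC) for slots 0-3, then again for 4-7. */
}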
0240 
0241 #define PAT(x, y)   ((u64)PAT_ ## y << ((x)*8))
0242 
0243 static void pat_bp_init(u64 pat)
0244 {
0245     u64 tmp_pat;
0246 
0247     if (!boot_cpu_has(X86_FEATURE_PAT)) {
0248         pat_disable("PAT not supported by the CPU.");
0249         return;
0250     }
0251 
0252     rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
0253     if (!tmp_pat) {
0254         pat_disable("PAT support disabled by the firmware.");
0255         return;
0256     }
0257 
0258     wrmsrl(MSR_IA32_CR_PAT, pat);
0259     pat_bp_enabled = true;
0260 
0261     __init_cache_modes(pat);
0262 }
0263 
0264 static void pat_ap_init(u64 pat)
0265 {
0266     if (!boot_cpu_has(X86_FEATURE_PAT)) {
0267         /*
0268          * If this happens we are on a secondary CPU, but switched to
0269          * PAT on the boot CPU. We have no way to undo PAT.
0270          */
0271         panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
0272     }
0273 
0274     wrmsrl(MSR_IA32_CR_PAT, pat);
0275 }
0276 
0277 void __init init_cache_modes(void)
0278 {
0279     u64 pat = 0;
0280 
0281     if (pat_cm_initialized)
0282         return;
0283 
0284     if (boot_cpu_has(X86_FEATURE_PAT)) {
0285         /*
0286          * CPU supports PAT. Set PAT table to be consistent with
0287          * PAT MSR. This case supports "nopat" boot option, and
0288          * virtual machine environments which support PAT without
0289          * MTRRs. In particular, Xen has a unique setup for the PAT MSR.
0290          *
0291          * If the PAT MSR reads back as 0, it is considered invalid and
0292          * we emulate it as 'no PAT'.
0293          */
0294         rdmsrl(MSR_IA32_CR_PAT, pat);
0295     }
0296 
0297     if (!pat) {
0298         /*
0299          * No PAT. Emulate the PAT table that corresponds to the two
0300          * cache bits, PWT (Write Through) and PCD (Cache Disable).
0301          * This setup is also the same as the BIOS default setup.
0302          *
0303          * PTE encoding:
0304          *
0305          *       PCD
0306          *       |PWT  PAT
0307          *       ||    slot
0308          *       00    0    WB : _PAGE_CACHE_MODE_WB
0309          *       01    1    WT : _PAGE_CACHE_MODE_WT
0310          *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
0311          *       11    3    UC : _PAGE_CACHE_MODE_UC
0312          *
0313          * NOTE: When WC or WP is used, it is redirected to UC- per
0314          * the default setup in __cachemode2pte_tbl[].
0315          */
0316         pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
0317               PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
0318     } else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
0319         /*
0320          * Clearly PAT is enabled underneath. Allow pat_enabled() to
0321          * reflect this.
0322          */
0323         pat_bp_enabled = true;
0324     }
0325 
0326     __init_cache_modes(pat);
0327 }
0328 
0329 /**
0330  * pat_init - Initialize the PAT MSR and PAT table on the current CPU
0331  *
0332  * This function initializes PAT MSR and PAT table with an OS-defined value
0333  * to enable additional cache attributes, WC, WT and WP.
0334  *
0335  * This function must be called on all CPUs using the specific sequence of
0336  * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
0337  * procedure for PAT.
0338  */
0339 void pat_init(void)
0340 {
0341     u64 pat;
0342     struct cpuinfo_x86 *c = &boot_cpu_data;
0343 
0344 #ifndef CONFIG_X86_PAT
0345     pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
0346 #endif
0347 
0348     if (pat_disabled)
0349         return;
0350 
0351     if ((c->x86_vendor == X86_VENDOR_INTEL) &&
0352         (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
0353          ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
0354         /*
0355          * PAT support with the lower four entries. Intel Pentium 2,
0356          * 3, M, and 4 are affected by PAT errata, which makes the
0357          * upper four entries unusable. To be on the safe side, we don't
0358          * use those.
0359          *
0360          *  PTE encoding:
0361          *      PAT
0362          *      |PCD
0363          *      ||PWT  PAT
0364          *      |||    slot
0365          *      000    0    WB : _PAGE_CACHE_MODE_WB
0366          *      001    1    WC : _PAGE_CACHE_MODE_WC
0367          *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
0368          *      011    3    UC : _PAGE_CACHE_MODE_UC
0369          * PAT bit unused
0370          *
0371          * NOTE: When WT or WP is used, it is redirected to UC- per
0372          * the default setup in __cachemode2pte_tbl[].
0373          */
0374         pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
0375               PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
0376     } else {
0377         /*
0378          * Full PAT support.  We put WT in slot 7 to improve
0379          * robustness in the presence of errata that might cause
0380          * the high PAT bit to be ignored.  This way, a buggy slot 7
0381          * access will hit slot 3, and slot 3 is UC, so at worst
0382          * we lose performance without causing a correctness issue.
0383          * Pentium 4 erratum N46 is an example for such an erratum,
0384          * although we try not to use PAT at all on affected CPUs.
0385          *
0386          *  PTE encoding:
0387          *      PAT
0388          *      |PCD
0389          *      ||PWT  PAT
0390          *      |||    slot
0391          *      000    0    WB : _PAGE_CACHE_MODE_WB
0392          *      001    1    WC : _PAGE_CACHE_MODE_WC
0393          *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
0394          *      011    3    UC : _PAGE_CACHE_MODE_UC
0395          *      100    4    WB : Reserved
0396          *      101    5    WP : _PAGE_CACHE_MODE_WP
0397          *      110    6    UC-: Reserved
0398          *      111    7    WT : _PAGE_CACHE_MODE_WT
0399          *
0400          * The reserved slots are unused, but mapped to their
0401          * corresponding types in the presence of PAT errata.
0402          */
0403         pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
0404               PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
0405     }
0406 
0407     if (!pat_bp_initialized) {
0408         pat_bp_init(pat);
0409         pat_bp_initialized = true;
0410     } else {
0411         pat_ap_init(pat);
0412     }
0413 }
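/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * written out as 64-bit constants, the no-PAT emulation table built in
 * init_cache_modes() is 0x0007040600070406 (which equals the architectural
 * power-on default of MSR_IA32_CR_PAT, making the emulation transparent),
 * and the full-PAT layout chosen above is 0x0407050600070106. The checks
 * below merely restate the arithmetic of the PAT() macro.
 */
static __maybe_unused void example_pat_table_values(void)
{
	BUILD_BUG_ON((PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC)) !=
		     0x0007040600070406ULL);

	BUILD_BUG_ON((PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT)) !=
		     0x0407050600070106ULL);
}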
0414 
0415 #undef PAT
0416 
0417 static DEFINE_SPINLOCK(memtype_lock);   /* protects memtype accesses */
0418 
0419 /*
0420  * Computes the intersection of the PAT memory type and the MTRR memory type
0421  * and returns the resulting memory type as PAT understands it.
0422  * (The type encodings used by PAT and MTRR do not share the same values.)
0423  * The intersection is based on the "Effective Memory Type" tables in the
0424  * IA-32 SDM, Vol. 3a.
0425  */
0426 static unsigned long pat_x_mtrr_type(u64 start, u64 end,
0427                      enum page_cache_mode req_type)
0428 {
0429     /*
0430      * Look for MTRR hint to get the effective type in case where PAT
0431      * request is for WB.
0432      */
0433     if (req_type == _PAGE_CACHE_MODE_WB) {
0434         u8 mtrr_type, uniform;
0435 
0436         mtrr_type = mtrr_type_lookup(start, end, &uniform);
0437         if (mtrr_type != MTRR_TYPE_WRBACK)
0438             return _PAGE_CACHE_MODE_UC_MINUS;
0439 
0440         return _PAGE_CACHE_MODE_WB;
0441     }
0442 
0443     return req_type;
0444 }
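/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * the effect of the intersection above on a WB request. The physical range
 * is hypothetical; assume the MTRRs map it as uncached, which is a common
 * setup for legacy MMIO windows.
 */
static __maybe_unused void example_effective_type(void)
{
	enum page_cache_mode eff;

	eff = pat_x_mtrr_type(0xfd000000ULL, 0xfe000000ULL,
			      _PAGE_CACHE_MODE_WB);

	/* Any MTRR type other than WRBACK downgrades the WB request to UC-. */
	if (eff == _PAGE_CACHE_MODE_UC_MINUS)
		pr_info("WB request downgraded to UC- by the MTRR setting\n");
}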
0445 
0446 struct pagerange_state {
0447     unsigned long       cur_pfn;
0448     int         ram;
0449     int         not_ram;
0450 };
0451 
0452 static int
0453 pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
0454 {
0455     struct pagerange_state *state = arg;
0456 
0457     state->not_ram  |= initial_pfn > state->cur_pfn;
0458     state->ram  |= total_nr_pages > 0;
0459     state->cur_pfn   = initial_pfn + total_nr_pages;
0460 
0461     return state->ram && state->not_ram;
0462 }
0463 
0464 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
0465 {
0466     int ret = 0;
0467     unsigned long start_pfn = start >> PAGE_SHIFT;
0468     unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
0469     struct pagerange_state state = {start_pfn, 0, 0};
0470 
0471     /*
0472      * For legacy reasons, the physical address range in the legacy ISA
0473      * region is tracked as non-RAM. This allows users of /dev/mem to
0474      * map portions of the legacy ISA region, even when some of those
0475      * portions are listed (or not even listed) with different e820
0476      * types (RAM/reserved/...).
0477      */
0478     if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
0479         start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
0480 
0481     if (start_pfn < end_pfn) {
0482         ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
0483                 &state, pagerange_is_ram_callback);
0484     }
0485 
0486     return (ret > 0) ? -1 : (state.ram ? 1 : 0);
0487 }
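/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * how the callers below interpret the tri-state result of
 * pat_pagerange_is_ram(). The start/end values are hypothetical.
 */
static __maybe_unused void example_is_ram_tristate(u64 start, u64 end)
{
	int is_range_ram = pat_pagerange_is_ram(start, end);

	if (is_range_ram == 1)
		pr_info("entirely RAM: memtype tracked via page flags\n");
	else if (is_range_ram == 0)
		pr_info("not RAM: memtype tracked in the rbtree\n");
	else	/* -1: mixes RAM and non-RAM */
		pr_info("mixed range: callers reject this with -EINVAL\n");
}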
0488 
0489 /*
0490  * For RAM pages, we use page flags to mark the pages with appropriate type.
0491  * The page flags are limited to four types, WB (default), WC, WT and UC-.
0492  * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
0493  * a new memory type is only allowed for a page mapped with the default WB
0494  * type.
0495  *
0496  * Here we do two passes:
0497  * - Find the memtype of all the pages in the range, look for any conflicts.
0498  * - In case of no conflicts, set the new memtype for pages in the range.
0499  */
0500 static int reserve_ram_pages_type(u64 start, u64 end,
0501                   enum page_cache_mode req_type,
0502                   enum page_cache_mode *new_type)
0503 {
0504     struct page *page;
0505     u64 pfn;
0506 
0507     if (req_type == _PAGE_CACHE_MODE_WP) {
0508         if (new_type)
0509             *new_type = _PAGE_CACHE_MODE_UC_MINUS;
0510         return -EINVAL;
0511     }
0512 
0513     if (req_type == _PAGE_CACHE_MODE_UC) {
0514         /* We do not support strong UC */
0515         WARN_ON_ONCE(1);
0516         req_type = _PAGE_CACHE_MODE_UC_MINUS;
0517     }
0518 
0519     for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
0520         enum page_cache_mode type;
0521 
0522         page = pfn_to_page(pfn);
0523         type = get_page_memtype(page);
0524         if (type != _PAGE_CACHE_MODE_WB) {
0525             pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
0526                 start, end - 1, type, req_type);
0527             if (new_type)
0528                 *new_type = type;
0529 
0530             return -EBUSY;
0531         }
0532     }
0533 
0534     if (new_type)
0535         *new_type = req_type;
0536 
0537     for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
0538         page = pfn_to_page(pfn);
0539         set_page_memtype(page, req_type);
0540     }
0541     return 0;
0542 }
0543 
0544 static int free_ram_pages_type(u64 start, u64 end)
0545 {
0546     struct page *page;
0547     u64 pfn;
0548 
0549     for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
0550         page = pfn_to_page(pfn);
0551         set_page_memtype(page, _PAGE_CACHE_MODE_WB);
0552     }
0553     return 0;
0554 }
0555 
0556 static u64 sanitize_phys(u64 address)
0557 {
0558     /*
0559      * When changing the memtype for pages containing poison allow
0560      * for a "decoy" virtual address (bit 63 clear) passed to
0561      * set_memory_X(). __pa() on a "decoy" address results in a
0562      * physical address with bit 63 set.
0563      *
0564      * Decoy addresses are not present for 32-bit builds, see
0565      * set_mce_nospec().
0566      */
0567     if (IS_ENABLED(CONFIG_X86_64))
0568         return address & __PHYSICAL_MASK;
0569     return address;
0570 }
0571 
0572 /*
0573  * req_type typically has one of the following values:
0574  * - _PAGE_CACHE_MODE_WB
0575  * - _PAGE_CACHE_MODE_WC
0576  * - _PAGE_CACHE_MODE_UC_MINUS
0577  * - _PAGE_CACHE_MODE_UC
0578  * - _PAGE_CACHE_MODE_WT
0579  *
0580  * If new_type is NULL, the function returns an error if it cannot reserve the
0581  * region with req_type. If new_type is non-NULL, the function returns the
0582  * available type in new_type when there is no error. In case of any error
0583  * it returns a negative value.
0584  */
0585 int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
0586             enum page_cache_mode *new_type)
0587 {
0588     struct memtype *entry_new;
0589     enum page_cache_mode actual_type;
0590     int is_range_ram;
0591     int err = 0;
0592 
0593     start = sanitize_phys(start);
0594 
0595     /*
0596      * The end address passed into this function is exclusive, but
0597      * sanitize_phys() expects an inclusive address.
0598      */
0599     end = sanitize_phys(end - 1) + 1;
0600     if (start >= end) {
0601         WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
0602                 start, end - 1, cattr_name(req_type));
0603         return -EINVAL;
0604     }
0605 
0606     if (!pat_enabled()) {
0607         /* This is identical to page table setting without PAT */
0608         if (new_type)
0609             *new_type = req_type;
0610         return 0;
0611     }
0612 
0613     /* Low ISA region is always mapped WB in page table. No need to track */
0614     if (x86_platform.is_untracked_pat_range(start, end)) {
0615         if (new_type)
0616             *new_type = _PAGE_CACHE_MODE_WB;
0617         return 0;
0618     }
0619 
0620     /*
0621      * Call mtrr_lookup to get the type hint. This is an
0622      * optimization for /dev/mem mmap'ers into WB memory (BIOS
0623      * tools and ACPI tools). Use WB request for WB memory and use
0624      * UC_MINUS otherwise.
0625      */
0626     actual_type = pat_x_mtrr_type(start, end, req_type);
0627 
0628     if (new_type)
0629         *new_type = actual_type;
0630 
0631     is_range_ram = pat_pagerange_is_ram(start, end);
0632     if (is_range_ram == 1) {
0633 
0634         err = reserve_ram_pages_type(start, end, req_type, new_type);
0635 
0636         return err;
0637     } else if (is_range_ram < 0) {
0638         return -EINVAL;
0639     }
0640 
0641     entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
0642     if (!entry_new)
0643         return -ENOMEM;
0644 
0645     entry_new->start = start;
0646     entry_new->end   = end;
0647     entry_new->type  = actual_type;
0648 
0649     spin_lock(&memtype_lock);
0650 
0651     err = memtype_check_insert(entry_new, new_type);
0652     if (err) {
0653         pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
0654             start, end - 1,
0655             cattr_name(entry_new->type), cattr_name(req_type));
0656         kfree(entry_new);
0657         spin_unlock(&memtype_lock);
0658 
0659         return err;
0660     }
0661 
0662     spin_unlock(&memtype_lock);
0663 
0664     dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
0665         start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
0666         new_type ? cattr_name(*new_type) : "-");
0667 
0668     return err;
0669 }
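/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * the reserve/check/free pattern that ioremap-style callers follow for a
 * non-RAM region. The MMIO base and size are hypothetical.
 */
static __maybe_unused int example_reserve_mmio_wc(u64 base, unsigned long size)
{
	enum page_cache_mode new_type;
	int err;

	err = memtype_reserve(base, base + size, _PAGE_CACHE_MODE_WC, &new_type);
	if (err)
		return err;

	if (new_type != _PAGE_CACHE_MODE_WC) {
		/* An earlier, conflicting reservation won; back out. */
		memtype_free(base, base + size);
		return -EBUSY;
	}

	/* ... map and use the region with new_type, then ... */
	memtype_free(base, base + size);
	return 0;
}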
0670 
0671 int memtype_free(u64 start, u64 end)
0672 {
0673     int is_range_ram;
0674     struct memtype *entry_old;
0675 
0676     if (!pat_enabled())
0677         return 0;
0678 
0679     start = sanitize_phys(start);
0680     end = sanitize_phys(end);
0681 
0682     /* Low ISA region is always mapped WB. No need to track */
0683     if (x86_platform.is_untracked_pat_range(start, end))
0684         return 0;
0685 
0686     is_range_ram = pat_pagerange_is_ram(start, end);
0687     if (is_range_ram == 1)
0688         return free_ram_pages_type(start, end);
0689     if (is_range_ram < 0)
0690         return -EINVAL;
0691 
0692     spin_lock(&memtype_lock);
0693     entry_old = memtype_erase(start, end);
0694     spin_unlock(&memtype_lock);
0695 
0696     if (IS_ERR(entry_old)) {
0697         pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
0698             current->comm, current->pid, start, end - 1);
0699         return -EINVAL;
0700     }
0701 
0702     kfree(entry_old);
0703 
0704     dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);
0705 
0706     return 0;
0707 }
0708 
0709 
0710 /**
0711  * lookup_memtype - Looks up the memory type for a physical address
0712  * @paddr: physical address of which memory type needs to be looked up
0713  *
0714  * Only to be called when PAT is enabled
0715  *
0716  * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
0717  * or _PAGE_CACHE_MODE_WT.
0718  */
0719 static enum page_cache_mode lookup_memtype(u64 paddr)
0720 {
0721     enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
0722     struct memtype *entry;
0723 
0724     if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
0725         return rettype;
0726 
0727     if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
0728         struct page *page;
0729 
0730         page = pfn_to_page(paddr >> PAGE_SHIFT);
0731         return get_page_memtype(page);
0732     }
0733 
0734     spin_lock(&memtype_lock);
0735 
0736     entry = memtype_lookup(paddr);
0737     if (entry != NULL)
0738         rettype = entry->type;
0739     else
0740         rettype = _PAGE_CACHE_MODE_UC_MINUS;
0741 
0742     spin_unlock(&memtype_lock);
0743 
0744     return rettype;
0745 }
0746 
0747 /**
0748  * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
0749  * of @pfn cannot be overridden by UC MTRR memory type.
0750  *
0751  * Only to be called when PAT is enabled.
0752  *
0753  * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
0754  * Returns false in other cases.
0755  */
0756 bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
0757 {
0758     enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));
0759 
0760     return cm == _PAGE_CACHE_MODE_UC ||
0761            cm == _PAGE_CACHE_MODE_UC_MINUS ||
0762            cm == _PAGE_CACHE_MODE_WC;
0763 }
0764 EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
0765 
0766 /**
0767  * memtype_reserve_io - Request a memory type mapping for a region of memory
0768  * @start: start (physical address) of the region
0769  * @end: end (physical address) of the region
0770  * @type: A pointer to the requested memtype. On success, it is updated with the
0771  * requested type or another compatible type that was available for the region.
0772  *
0773  * On success, returns 0
0774  * On failure, returns non-zero
0775  */
0776 int memtype_reserve_io(resource_size_t start, resource_size_t end,
0777             enum page_cache_mode *type)
0778 {
0779     resource_size_t size = end - start;
0780     enum page_cache_mode req_type = *type;
0781     enum page_cache_mode new_type;
0782     int ret;
0783 
0784     WARN_ON_ONCE(iomem_map_sanity_check(start, size));
0785 
0786     ret = memtype_reserve(start, end, req_type, &new_type);
0787     if (ret)
0788         goto out_err;
0789 
0790     if (!is_new_memtype_allowed(start, size, req_type, new_type))
0791         goto out_free;
0792 
0793     if (memtype_kernel_map_sync(start, size, new_type) < 0)
0794         goto out_free;
0795 
0796     *type = new_type;
0797     return 0;
0798 
0799 out_free:
0800     memtype_free(start, end);
0801     ret = -EBUSY;
0802 out_err:
0803     return ret;
0804 }
0805 
0806 /**
0807  * memtype_free_io - Release a memory type mapping for a region of memory
0808  * @start: start (physical address) of the region
0809  * @end: end (physical address) of the region
0810  */
0811 void memtype_free_io(resource_size_t start, resource_size_t end)
0812 {
0813     memtype_free(start, end);
0814 }
0815 
0816 #ifdef CONFIG_X86_PAT
0817 int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
0818 {
0819     enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
0820 
0821     return memtype_reserve_io(start, start + size, &type);
0822 }
0823 EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
0824 
0825 void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
0826 {
0827     memtype_free_io(start, start + size);
0828 }
0829 EXPORT_SYMBOL(arch_io_free_memtype_wc);
0830 #endif
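/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * how a driver would pair the two helpers above around a write-combined
 * BAR mapping. Assumes <linux/pci.h>; the pci_dev and BAR index are
 * hypothetical.
 */
static __maybe_unused void __iomem *example_map_bar_wc(struct pci_dev *pdev,
							int bar)
{
	resource_size_t base = pci_resource_start(pdev, bar);
	resource_size_t len  = pci_resource_len(pdev, bar);
	void __iomem *regs;

	if (arch_io_reserve_memtype_wc(base, len))
		return NULL;

	regs = ioremap_wc(base, len);
	if (!regs)
		arch_io_free_memtype_wc(base, len);

	/* Tear down later with iounmap() followed by arch_io_free_memtype_wc(). */
	return regs;
}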
0831 
0832 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
0833                 unsigned long size, pgprot_t vma_prot)
0834 {
0835     if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
0836         vma_prot = pgprot_decrypted(vma_prot);
0837 
0838     return vma_prot;
0839 }
0840 
0841 #ifdef CONFIG_STRICT_DEVMEM
0842 /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
0843 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
0844 {
0845     return 1;
0846 }
0847 #else
0848 /* This check is needed to avoid cache aliasing when PAT is enabled */
0849 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
0850 {
0851     u64 from = ((u64)pfn) << PAGE_SHIFT;
0852     u64 to = from + size;
0853     u64 cursor = from;
0854 
0855     if (!pat_enabled())
0856         return 1;
0857 
0858     while (cursor < to) {
0859         if (!devmem_is_allowed(pfn))
0860             return 0;
0861         cursor += PAGE_SIZE;
0862         pfn++;
0863     }
0864     return 1;
0865 }
0866 #endif /* CONFIG_STRICT_DEVMEM */
0867 
0868 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
0869                 unsigned long size, pgprot_t *vma_prot)
0870 {
0871     enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
0872 
0873     if (!range_is_allowed(pfn, size))
0874         return 0;
0875 
0876     if (file->f_flags & O_DSYNC)
0877         pcm = _PAGE_CACHE_MODE_UC_MINUS;
0878 
0879     *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
0880                  cachemode2protval(pcm));
0881     return 1;
0882 }
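/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * the user-space side of the O_DSYNC check above. A (root) process that
 * wants an uncached mapping of /dev/mem opens it with O_DSYNC, which makes
 * the code above select UC- instead of WB for the mapping. Build as an
 * ordinary user program; the legacy VGA window at 0xA0000 is used as the
 * example target.
 */
#if 0	/* user-space example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	int fd = open("/dev/mem", O_RDWR | O_DSYNC);	/* O_DSYNC => UC- */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		       fd, 0xA0000);

	if (fd < 0 || p == MAP_FAILED) {
		perror("uncached /dev/mem mapping");
		return 1;
	}

	/* ... access the device memory uncached through p ... */
	munmap(p, 4096);
	return 0;
}
#endif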
0883 
0884 /*
0885  * Change the memory type for the physical address range in kernel identity
0886  * mapping space if that range is part of the identity map.
0887  */
0888 int memtype_kernel_map_sync(u64 base, unsigned long size,
0889                 enum page_cache_mode pcm)
0890 {
0891     unsigned long id_sz;
0892 
0893     if (base > __pa(high_memory-1))
0894         return 0;
0895 
0896     /*
0897      * Some areas in the middle of the kernel identity range
0898      * are not mapped, for example the PCI space.
0899      */
0900     if (!page_is_ram(base >> PAGE_SHIFT))
0901         return 0;
0902 
0903     id_sz = (__pa(high_memory-1) <= base + size) ?
0904                 __pa(high_memory) - base : size;
0905 
0906     if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
0907         pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
0908             current->comm, current->pid,
0909             cattr_name(pcm),
0910             base, (unsigned long long)(base + size-1));
0911         return -EINVAL;
0912     }
0913     return 0;
0914 }
0915 
0916 /*
0917  * Internal interface to reserve a range of physical memory with prot.
0918  * Reserves non-RAM regions only. After a successful memtype_reserve(),
0919  * this function also keeps the identity mapping (if any) in sync with the new prot.
0920  */
0921 static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
0922                 int strict_prot)
0923 {
0924     int is_ram = 0;
0925     int ret;
0926     enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
0927     enum page_cache_mode pcm = want_pcm;
0928 
0929     is_ram = pat_pagerange_is_ram(paddr, paddr + size);
0930 
0931     /*
0932      * reserve_pfn_range() for RAM pages. We do not refcount to keep
0933      * track of the number of mappings of RAM pages. We can assert that
0934      * the type requested matches the type of the first page in the range.
0935      */
0936     if (is_ram) {
0937         if (!pat_enabled())
0938             return 0;
0939 
0940         pcm = lookup_memtype(paddr);
0941         if (want_pcm != pcm) {
0942             pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
0943                 current->comm, current->pid,
0944                 cattr_name(want_pcm),
0945                 (unsigned long long)paddr,
0946                 (unsigned long long)(paddr + size - 1),
0947                 cattr_name(pcm));
0948             *vma_prot = __pgprot((pgprot_val(*vma_prot) &
0949                          (~_PAGE_CACHE_MASK)) |
0950                          cachemode2protval(pcm));
0951         }
0952         return 0;
0953     }
0954 
0955     ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
0956     if (ret)
0957         return ret;
0958 
0959     if (pcm != want_pcm) {
0960         if (strict_prot ||
0961             !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
0962             memtype_free(paddr, paddr + size);
0963             pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
0964                    current->comm, current->pid,
0965                    cattr_name(want_pcm),
0966                    (unsigned long long)paddr,
0967                    (unsigned long long)(paddr + size - 1),
0968                    cattr_name(pcm));
0969             return -EINVAL;
0970         }
0971         /*
0972          * We allow returning a different type than the one requested in
0973          * the non-strict case.
0974          */
0975         *vma_prot = __pgprot((pgprot_val(*vma_prot) &
0976                       (~_PAGE_CACHE_MASK)) |
0977                      cachemode2protval(pcm));
0978     }
0979 
0980     if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
0981         memtype_free(paddr, paddr + size);
0982         return -EINVAL;
0983     }
0984     return 0;
0985 }
0986 
0987 /*
0988  * Internal interface to free a range of physical memory.
0989  * Frees non RAM regions only.
0990  */
0991 static void free_pfn_range(u64 paddr, unsigned long size)
0992 {
0993     int is_ram;
0994 
0995     is_ram = pat_pagerange_is_ram(paddr, paddr + size);
0996     if (is_ram == 0)
0997         memtype_free(paddr, paddr + size);
0998 }
0999 
1000 /*
1001  * track_pfn_copy is called when the vma covering the pfnmap gets
1002  * copied through copy_page_range().
1003  *
1004  * If the vma has a linear pfn mapping for the entire range, we get the prot
1005  * from the pte and reserve the entire vma range with a single reserve_pfn_range() call.
1006  */
1007 int track_pfn_copy(struct vm_area_struct *vma)
1008 {
1009     resource_size_t paddr;
1010     unsigned long prot;
1011     unsigned long vma_size = vma->vm_end - vma->vm_start;
1012     pgprot_t pgprot;
1013 
1014     if (vma->vm_flags & VM_PAT) {
1015         /*
1016          * reserve the whole chunk covered by vma. We need the
1017          * starting address and protection from pte.
1018          */
1019         if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
1020             WARN_ON_ONCE(1);
1021             return -EINVAL;
1022         }
1023         pgprot = __pgprot(prot);
1024         return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
1025     }
1026 
1027     return 0;
1028 }
1029 
1030 /*
1031  * prot is passed in as a parameter for the new mapping. If the vma has
1032  * a linear pfn mapping for the entire range, or no vma is provided,
1033  * reserve the entire pfn + size range with a single reserve_pfn_range()
1034  * call.
1035  */
1036 int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
1037             unsigned long pfn, unsigned long addr, unsigned long size)
1038 {
1039     resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
1040     enum page_cache_mode pcm;
1041 
1042     /* reserve the whole chunk starting from paddr */
1043     if (!vma || (addr == vma->vm_start
1044                 && size == (vma->vm_end - vma->vm_start))) {
1045         int ret;
1046 
1047         ret = reserve_pfn_range(paddr, size, prot, 0);
1048         if (ret == 0 && vma)
1049             vma->vm_flags |= VM_PAT;
1050         return ret;
1051     }
1052 
1053     if (!pat_enabled())
1054         return 0;
1055 
1056     /*
1057      * For anything smaller than the vma size we set prot based on the
1058      * lookup.
1059      */
1060     pcm = lookup_memtype(paddr);
1061 
1062     /* Check memtype for the remaining pages */
1063     while (size > PAGE_SIZE) {
1064         size -= PAGE_SIZE;
1065         paddr += PAGE_SIZE;
1066         if (pcm != lookup_memtype(paddr))
1067             return -EINVAL;
1068     }
1069 
1070     *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
1071              cachemode2protval(pcm));
1072 
1073     return 0;
1074 }
1075 
1076 void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
1077 {
1078     enum page_cache_mode pcm;
1079 
1080     if (!pat_enabled())
1081         return;
1082 
1083     /* Set prot based on lookup */
1084     pcm = lookup_memtype(pfn_t_to_phys(pfn));
1085     *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
1086              cachemode2protval(pcm));
1087 }
1088 
1089 /*
1090  * untrack_pfn is called while unmapping a pfnmap for a region.
1091  * untrack can be called for a specific region indicated by pfn and size or
1092  * can be for the entire vma (in which case pfn, size are zero).
1093  */
1094 void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
1095          unsigned long size)
1096 {
1097     resource_size_t paddr;
1098     unsigned long prot;
1099 
1100     if (vma && !(vma->vm_flags & VM_PAT))
1101         return;
1102 
1103     /* free the chunk starting from pfn or the whole chunk */
1104     paddr = (resource_size_t)pfn << PAGE_SHIFT;
1105     if (!paddr && !size) {
1106         if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
1107             WARN_ON_ONCE(1);
1108             return;
1109         }
1110 
1111         size = vma->vm_end - vma->vm_start;
1112     }
1113     free_pfn_range(paddr, size);
1114     if (vma)
1115         vma->vm_flags &= ~VM_PAT;
1116 }
1117 
1118 /*
1119  * untrack_pfn_moved is called while mremapping a pfnmap for a new region,
1120  * with the old vma after its pfnmap page table has been removed.  The new
1121  * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
1122  */
1123 void untrack_pfn_moved(struct vm_area_struct *vma)
1124 {
1125     vma->vm_flags &= ~VM_PAT;
1126 }
1127 
1128 pgprot_t pgprot_writecombine(pgprot_t prot)
1129 {
1130     return __pgprot(pgprot_val(prot) |
1131                 cachemode2protval(_PAGE_CACHE_MODE_WC));
1132 }
1133 EXPORT_SYMBOL_GPL(pgprot_writecombine);
1134 
1135 pgprot_t pgprot_writethrough(pgprot_t prot)
1136 {
1137     return __pgprot(pgprot_val(prot) |
1138                 cachemode2protval(_PAGE_CACHE_MODE_WT));
1139 }
1140 EXPORT_SYMBOL_GPL(pgprot_writethrough);
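/*
 * Illustrative sketch (editor addition, not part of the upstream file):
 * how a driver's mmap handler typically reaches the track_pfn_*() paths
 * above - remap_pfn_range() invokes track_pfn_remap() for the vma, and the
 * WC request is expressed through pgprot_writecombine(). The pfn below is
 * hypothetical and would normally be derived from a device BAR.
 */
static __maybe_unused int example_drv_mmap(struct file *filp,
					   struct vm_area_struct *vma)
{
	unsigned long pfn = 0xfd000;	/* hypothetical MMIO pfn */

	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/* Reserves the memtype (setting VM_PAT) and installs the PTEs. */
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}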
1141 
1142 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
1143 
1144 /*
1145  * We are allocating a temporary printout-entry to be passed
1146  * between seq_start()/next() and seq_show():
1147  */
1148 static struct memtype *memtype_get_idx(loff_t pos)
1149 {
1150     struct memtype *entry_print;
1151     int ret;
1152 
1153     entry_print  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
1154     if (!entry_print)
1155         return NULL;
1156 
1157     spin_lock(&memtype_lock);
1158     ret = memtype_copy_nth_element(entry_print, pos);
1159     spin_unlock(&memtype_lock);
1160 
1161     /* Free it on error: */
1162     if (ret) {
1163         kfree(entry_print);
1164         return NULL;
1165     }
1166 
1167     return entry_print;
1168 }
1169 
1170 static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
1171 {
1172     if (*pos == 0) {
1173         ++*pos;
1174         seq_puts(seq, "PAT memtype list:\n");
1175     }
1176 
1177     return memtype_get_idx(*pos);
1178 }
1179 
1180 static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1181 {
1182     kfree(v);
1183     ++*pos;
1184     return memtype_get_idx(*pos);
1185 }
1186 
1187 static void memtype_seq_stop(struct seq_file *seq, void *v)
1188 {
1189     kfree(v);
1190 }
1191 
1192 static int memtype_seq_show(struct seq_file *seq, void *v)
1193 {
1194     struct memtype *entry_print = (struct memtype *)v;
1195 
1196     seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
1197             entry_print->start,
1198             entry_print->end,
1199             cattr_name(entry_print->type));
1200 
1201     return 0;
1202 }
1203 
1204 static const struct seq_operations memtype_seq_ops = {
1205     .start = memtype_seq_start,
1206     .next  = memtype_seq_next,
1207     .stop  = memtype_seq_stop,
1208     .show  = memtype_seq_show,
1209 };
1210 
1211 static int memtype_seq_open(struct inode *inode, struct file *file)
1212 {
1213     return seq_open(file, &memtype_seq_ops);
1214 }
1215 
1216 static const struct file_operations memtype_fops = {
1217     .open    = memtype_seq_open,
1218     .read    = seq_read,
1219     .llseek  = seq_lseek,
1220     .release = seq_release,
1221 };
1222 
1223 static int __init pat_memtype_list_init(void)
1224 {
1225     if (pat_enabled()) {
1226         debugfs_create_file("pat_memtype_list", S_IRUSR,
1227                     arch_debugfs_dir, NULL, &memtype_fops);
1228     }
1229     return 0;
1230 }
1231 late_initcall(pat_memtype_list_init);
1232 
1233 #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */