0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * xsave/xrstor support.
0004  *
0005  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
0006  */
0007 #include <linux/bitops.h>
0008 #include <linux/compat.h>
0009 #include <linux/cpu.h>
0010 #include <linux/mman.h>
0011 #include <linux/nospec.h>
0012 #include <linux/pkeys.h>
0013 #include <linux/seq_file.h>
0014 #include <linux/proc_fs.h>
0015 #include <linux/vmalloc.h>
0016 
0017 #include <asm/fpu/api.h>
0018 #include <asm/fpu/regset.h>
0019 #include <asm/fpu/signal.h>
0020 #include <asm/fpu/xcr.h>
0021 
0022 #include <asm/tlbflush.h>
0023 #include <asm/prctl.h>
0024 #include <asm/elf.h>
0025 
0026 #include "context.h"
0027 #include "internal.h"
0028 #include "legacy.h"
0029 #include "xstate.h"
0030 
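/*
 * Iterate over all extended (non-legacy) xfeature bits set in @mask,
 * starting at FIRST_EXTENDED_XFEATURE.  The initialization of @bit is
 * deliberately a separate statement so that the caller's loop body
 * attaches to the for_each_set_bit_from() loop below.
 */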
0031 #define for_each_extended_xfeature(bit, mask)               \
0032     (bit) = FIRST_EXTENDED_XFEATURE;                \
0033     for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
0034 
0035 /*
0036  * Although we spell it out in here, the Processor Trace
0037  * xfeature is completely unused.  We use other mechanisms
0038  * to save/restore PT state in Linux.
0039  */
0040 static const char *xfeature_names[] =
0041 {
0042     "x87 floating point registers"  ,
0043     "SSE registers"         ,
0044     "AVX registers"         ,
0045     "MPX bounds registers"      ,
0046     "MPX CSR"           ,
0047     "AVX-512 opmask"        ,
0048     "AVX-512 Hi256"         ,
0049     "AVX-512 ZMM_Hi256"     ,
0050     "Processor Trace (unused)"  ,
0051     "Protection Keys User registers",
0052     "PASID state",
0053     "unknown xstate feature"    ,
0054     "unknown xstate feature"    ,
0055     "unknown xstate feature"    ,
0056     "unknown xstate feature"    ,
0057     "unknown xstate feature"    ,
0058     "unknown xstate feature"    ,
0059     "AMX Tile config"       ,
0060     "AMX Tile data"         ,
0061     "unknown xstate feature"    ,
0062 };
0063 
0064 static unsigned short xsave_cpuid_features[] __initdata = {
0065     [XFEATURE_FP]               = X86_FEATURE_FPU,
0066     [XFEATURE_SSE]              = X86_FEATURE_XMM,
0067     [XFEATURE_YMM]              = X86_FEATURE_AVX,
0068     [XFEATURE_BNDREGS]          = X86_FEATURE_MPX,
0069     [XFEATURE_BNDCSR]           = X86_FEATURE_MPX,
0070     [XFEATURE_OPMASK]           = X86_FEATURE_AVX512F,
0071     [XFEATURE_ZMM_Hi256]            = X86_FEATURE_AVX512F,
0072     [XFEATURE_Hi16_ZMM]         = X86_FEATURE_AVX512F,
0073     [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]  = X86_FEATURE_INTEL_PT,
0074     [XFEATURE_PKRU]             = X86_FEATURE_PKU,
0075     [XFEATURE_PASID]            = X86_FEATURE_ENQCMD,
0076     [XFEATURE_XTILE_CFG]            = X86_FEATURE_AMX_TILE,
0077     [XFEATURE_XTILE_DATA]           = X86_FEATURE_AMX_TILE,
0078 };
0079 
0080 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
0081     { [ 0 ... XFEATURE_MAX - 1] = -1};
0082 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
0083     { [ 0 ... XFEATURE_MAX - 1] = -1};
0084 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
0085 
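/*
 * These flags mirror the CPUID(0xD, i) ECX enumeration for each extended
 * component i: bit 0 marks a supervisor state, bit 1 requests 64-byte
 * alignment within a compacted buffer.  setup_xstate_cache() stores the
 * ECX value verbatim in xstate_flags[].
 */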
0086 #define XSTATE_FLAG_SUPERVISOR  BIT(0)
0087 #define XSTATE_FLAG_ALIGNED64   BIT(1)
0088 
0089 /*
0090  * Return whether the system supports a given xfeature.
0091  *
0092  * Also return the name of the (most advanced) feature that the caller requested:
0093  */
0094 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
0095 {
0096     u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
0097 
0098     if (unlikely(feature_name)) {
0099         long xfeature_idx, max_idx;
0100         u64 xfeatures_print;
0101         /*
0102          * We use fls64() here to be able to print the most advanced
0103          * feature that was requested but is missing. So if a driver
0104          * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
0105          * missing AVX feature - this is the most informative message
0106          * to users:
0107          */
0108         if (xfeatures_missing)
0109             xfeatures_print = xfeatures_missing;
0110         else
0111             xfeatures_print = xfeatures_needed;
0112 
0113         xfeature_idx = fls64(xfeatures_print)-1;
0114         max_idx = ARRAY_SIZE(xfeature_names)-1;
0115         xfeature_idx = min(xfeature_idx, max_idx);
0116 
0117         *feature_name = xfeature_names[xfeature_idx];
0118     }
0119 
0120     if (xfeatures_missing)
0121         return 0;
0122 
0123     return 1;
0124 }
0125 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
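
/*
 * Illustrative (hypothetical) usage: an in-kernel user that needs AVX
 * could probe for it and report the most advanced missing feature by
 * name, e.g.:
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name))
 *		pr_warn("CPU lacks required xfeature: '%s'\n", name);
 */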
0126 
0127 static bool xfeature_is_aligned64(int xfeature_nr)
0128 {
0129     return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
0130 }
0131 
0132 static bool xfeature_is_supervisor(int xfeature_nr)
0133 {
0134     return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
0135 }
0136 
0137 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
0138 {
0139     unsigned int offs, i;
0140 
0141     /*
0142      * Non-compacted format and legacy features use the cached fixed
0143      * offsets.
0144      */
0145     if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
0146         xfeature <= XFEATURE_SSE)
0147         return xstate_offsets[xfeature];
0148 
0149     /*
0150      * Compacted format offsets depend on the actual content of the
0151      * compacted xsave area which is determined by the xcomp_bv header
0152      * field.
0153      */
0154     offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
0155     for_each_extended_xfeature(i, xcomp_bv) {
0156         if (xfeature_is_aligned64(i))
0157             offs = ALIGN(offs, 64);
0158         if (i == xfeature)
0159             break;
0160         offs += xstate_sizes[i];
0161     }
0162     return offs;
0163 }
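
/*
 * Worked example (illustrative, with the sizes typically enumerated by
 * hardware): for xcomp_bv = YMM | ZMM_Hi256 the walk starts at
 * FXSAVE_SIZE + XSAVE_HDR_SIZE = 512 + 64 = 576, so YMM lands at offset
 * 576 and ZMM_Hi256 follows at 576 + xstate_sizes[XFEATURE_YMM], rounded
 * up to the next 64-byte boundary only if the CPU enumerates that
 * component as 64-byte aligned (XSTATE_FLAG_ALIGNED64).
 */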
0164 
0165 /*
0166  * Enable the extended processor state save/restore feature.
0167  * Called once per CPU onlining.
0168  */
0169 void fpu__init_cpu_xstate(void)
0170 {
0171     if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
0172         return;
0173 
0174     cr4_set_bits(X86_CR4_OSXSAVE);
0175 
0176     /*
0177      * Must happen after CR4 setup and before xsetbv() to allow KVM
0178      * lazy passthrough.  Write independent of the dynamic state static
0179      * key as that does not work on the boot CPU. This also ensures
0180      * that any stale state is wiped out from XFD.
0181      */
0182     if (cpu_feature_enabled(X86_FEATURE_XFD))
0183         wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
0184 
0185     /*
0186      * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
0187      * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
0188      * states can be set here.
0189      */
0190     xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
0191 
0192     /*
0193      * MSR_IA32_XSS sets supervisor states managed by XSAVES.
0194      */
0195     if (boot_cpu_has(X86_FEATURE_XSAVES)) {
0196         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
0197                      xfeatures_mask_independent());
0198     }
0199 }
0200 
0201 static bool xfeature_enabled(enum xfeature xfeature)
0202 {
0203     return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
0204 }
0205 
0206 /*
0207  * Record the offsets and sizes of various xstates contained
0208  * in the XSAVE state memory layout.
0209  */
0210 static void __init setup_xstate_cache(void)
0211 {
0212     u32 eax, ebx, ecx, edx, i;
0213     /* start at the beginning of the "extended state" */
0214     unsigned int last_good_offset = offsetof(struct xregs_state,
0215                          extended_state_area);
0216     /*
0217      * The FP xstates and SSE xstates are legacy states. They are always
0218      * in the fixed offsets in the xsave area in either compacted form
0219      * or standard form.
0220      */
0221     xstate_offsets[XFEATURE_FP] = 0;
0222     xstate_sizes[XFEATURE_FP]   = offsetof(struct fxregs_state,
0223                            xmm_space);
0224 
0225     xstate_offsets[XFEATURE_SSE]    = xstate_sizes[XFEATURE_FP];
0226     xstate_sizes[XFEATURE_SSE]  = sizeof_field(struct fxregs_state,
0227                                xmm_space);
0228 
0229     for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
0230         cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
0231 
0232         xstate_sizes[i] = eax;
0233         xstate_flags[i] = ecx;
0234 
0235         /*
0236          * If an xfeature is supervisor state, the offset in EBX is
0237          * invalid, so leave it as -1.
0238          */
0239         if (xfeature_is_supervisor(i))
0240             continue;
0241 
0242         xstate_offsets[i] = ebx;
0243 
0244         /*
0245          * In our xstate size checks, we assume that the highest-numbered
0246          * xstate feature has the highest offset in the buffer.  Ensure
0247          * it does.
0248          */
0249         WARN_ONCE(last_good_offset > xstate_offsets[i],
0250               "x86/fpu: misordered xstate at %d\n", last_good_offset);
0251 
0252         last_good_offset = xstate_offsets[i];
0253     }
0254 }
0255 
0256 static void __init print_xstate_feature(u64 xstate_mask)
0257 {
0258     const char *feature_name;
0259 
0260     if (cpu_has_xfeatures(xstate_mask, &feature_name))
0261         pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
0262 }
0263 
0264 /*
0265  * Print out all the supported xstate features:
0266  */
0267 static void __init print_xstate_features(void)
0268 {
0269     print_xstate_feature(XFEATURE_MASK_FP);
0270     print_xstate_feature(XFEATURE_MASK_SSE);
0271     print_xstate_feature(XFEATURE_MASK_YMM);
0272     print_xstate_feature(XFEATURE_MASK_BNDREGS);
0273     print_xstate_feature(XFEATURE_MASK_BNDCSR);
0274     print_xstate_feature(XFEATURE_MASK_OPMASK);
0275     print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
0276     print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
0277     print_xstate_feature(XFEATURE_MASK_PKRU);
0278     print_xstate_feature(XFEATURE_MASK_PASID);
0279     print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
0280     print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
0281 }
0282 
0283 /*
0284  * This check is important because it is easy to get XSTATE_*
0285  * confused with XSTATE_BIT_*.
0286  */
0287 #define CHECK_XFEATURE(nr) do {     \
0288     WARN_ON(nr < FIRST_EXTENDED_XFEATURE);  \
0289     WARN_ON(nr >= XFEATURE_MAX);    \
0290 } while (0)
0291 
0292 /*
0293  * Print out xstate component offsets and sizes
0294  */
0295 static void __init print_xstate_offset_size(void)
0296 {
0297     int i;
0298 
0299     for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
0300         pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
0301             i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
0302             i, xstate_sizes[i]);
0303     }
0304 }
0305 
0306 /*
0307  * This function is called only during boot time when x86 caps are not set
0308  * up and alternative can not be used yet.
0309  */
0310 static __init void os_xrstor_booting(struct xregs_state *xstate)
0311 {
0312     u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
0313     u32 lmask = mask;
0314     u32 hmask = mask >> 32;
0315     int err;
0316 
0317     if (cpu_feature_enabled(X86_FEATURE_XSAVES))
0318         XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
0319     else
0320         XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
0321 
0322     /*
0323      * We should never fault when copying from a kernel buffer, and the FPU
0324      * state we set at boot time should be valid.
0325      */
0326     WARN_ON_FPU(err);
0327 }
0328 
0329 /*
0330  * All supported features have either init state all zeros or are
0331  * handled in setup_init_fpu_buf() individually. This is an explicit
0332  * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
0333  * newly added supported features at build time and make people
0334  * actually look at the init state for the new feature.
0335  */
0336 #define XFEATURES_INIT_FPSTATE_HANDLED      \
0337     (XFEATURE_MASK_FP |         \
0338      XFEATURE_MASK_SSE |            \
0339      XFEATURE_MASK_YMM |            \
0340      XFEATURE_MASK_OPMASK |         \
0341      XFEATURE_MASK_ZMM_Hi256 |      \
0342      XFEATURE_MASK_Hi16_ZMM  |      \
0343      XFEATURE_MASK_PKRU |           \
0344      XFEATURE_MASK_BNDREGS |        \
0345      XFEATURE_MASK_BNDCSR |         \
0346      XFEATURE_MASK_PASID |          \
0347      XFEATURE_MASK_XTILE)
0348 
0349 /*
0350  * setup the xstate image representing the init state
0351  */
0352 static void __init setup_init_fpu_buf(void)
0353 {
0354     BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
0355               XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
0356              XFEATURES_INIT_FPSTATE_HANDLED);
0357 
0358     if (!boot_cpu_has(X86_FEATURE_XSAVE))
0359         return;
0360 
0361     print_xstate_features();
0362 
0363     xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
0364 
0365     /*
0366      * Init all the features state with header.xfeatures being 0x0
0367      */
0368     os_xrstor_booting(&init_fpstate.regs.xsave);
0369 
0370     /*
0371      * All components are now in init state. Read the state back so
0372      * that init_fpstate contains all non-zero init state. This only
0373      * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
0374      * those use the init optimization which skips writing data for
0375      * components in init state.
0376      *
0377      * XSAVE could be used, but that would require reshuffling the
0378      * data when XSAVEC/S is available because XSAVEC/S uses xstate
0379      * compaction. But doing so is a pointless exercise because most
0380      * components have an all zeros init state except for the legacy
0381      * ones (FP and SSE). Those can be saved with FXSAVE into the
0382      * legacy area. Adding new features requires ensuring that their
0383      * init state is all zeroes, or adding the necessary handling here
0384      * if it is not.
0385      */
0386     fxsave(&init_fpstate.regs.fxsave);
0387 }
0388 
0389 int xfeature_size(int xfeature_nr)
0390 {
0391     u32 eax, ebx, ecx, edx;
0392 
0393     CHECK_XFEATURE(xfeature_nr);
0394     cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
0395     return eax;
0396 }
0397 
0398 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
0399 static int validate_user_xstate_header(const struct xstate_header *hdr,
0400                        struct fpstate *fpstate)
0401 {
0402     /* No unknown or supervisor features may be set */
0403     if (hdr->xfeatures & ~fpstate->user_xfeatures)
0404         return -EINVAL;
0405 
0406     /* Userspace must use the uncompacted format */
0407     if (hdr->xcomp_bv)
0408         return -EINVAL;
0409 
0410     /*
0411      * If 'reserved' is shrunken to add a new field, make sure to validate
0412      * that new field here!
0413      */
0414     BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
0415 
0416     /* No reserved bits may be set */
0417     if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
0418         return -EINVAL;
0419 
0420     return 0;
0421 }
0422 
0423 static void __init __xstate_dump_leaves(void)
0424 {
0425     int i;
0426     u32 eax, ebx, ecx, edx;
0427     static int should_dump = 1;
0428 
0429     if (!should_dump)
0430         return;
0431     should_dump = 0;
0432     /*
0433      * Dump out a few leaves past the ones that we support
0434      * just in case there are some goodies up there
0435      */
0436     for (i = 0; i < XFEATURE_MAX + 10; i++) {
0437         cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
0438         pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
0439             XSTATE_CPUID, i, eax, ebx, ecx, edx);
0440     }
0441 }
0442 
0443 #define XSTATE_WARN_ON(x) do {                          \
0444     if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {    \
0445         __xstate_dump_leaves();                     \
0446     }                                   \
0447 } while (0)
0448 
0449 #define XCHECK_SZ(sz, nr, nr_macro, __struct) do {          \
0450     if ((nr == nr_macro) &&                     \
0451         WARN_ONCE(sz != sizeof(__struct),               \
0452         "%s: struct is %zu bytes, cpu state %d bytes\n",    \
0453         __stringify(nr_macro), sizeof(__struct), sz)) {     \
0454         __xstate_dump_leaves();                 \
0455     }                               \
0456 } while (0)
0457 
0458 /**
0459  * check_xtile_data_against_struct - Check tile data state size.
0460  *
0461  * Calculate the state size by multiplying the single tile size, which is
0462  * recorded in a C struct, by the number of tiles that the CPU reports.
0463  * Compare the provided size with the calculation.
0464  *
0465  * @size:   The tile data state size
0466  *
0467  * Returns: 0 on success, -EINVAL on mismatch.
0468  */
0469 static int __init check_xtile_data_against_struct(int size)
0470 {
0471     u32 max_palid, palid, state_size;
0472     u32 eax, ebx, ecx, edx;
0473     u16 max_tile;
0474 
0475     /*
0476      * Check the maximum palette id:
0477      *   eax: the highest numbered palette subleaf.
0478      */
0479     cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
0480 
0481     /*
0482      * Cross-check each tile size and find the maximum number of
0483      * supported tiles.
0484      */
0485     for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
0486         u16 tile_size, max;
0487 
0488         /*
0489          * Check the tile size info:
0490          *   eax[31:16]:  bytes per tile
0491          *   ebx[31:16]:  the max names (or max number of tiles)
0492          */
0493         cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
0494         tile_size = eax >> 16;
0495         max = ebx >> 16;
0496 
0497         if (tile_size != sizeof(struct xtile_data)) {
0498             pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
0499                    __stringify(XFEATURE_XTILE_DATA),
0500                    sizeof(struct xtile_data), tile_size);
0501             __xstate_dump_leaves();
0502             return -EINVAL;
0503         }
0504 
0505         if (max > max_tile)
0506             max_tile = max;
0507     }
0508 
0509     state_size = sizeof(struct xtile_data) * max_tile;
0510     if (size != state_size) {
0511         pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
0512                __stringify(XFEATURE_XTILE_DATA), state_size, size);
0513         __xstate_dump_leaves();
0514         return -EINVAL;
0515     }
0516     return 0;
0517 }
0518 
0519 /*
0520  * We have a C struct for each 'xstate'.  We need to ensure
0521  * that our software representation matches what the CPU
0522  * tells us about the state's size.
0523  */
0524 static bool __init check_xstate_against_struct(int nr)
0525 {
0526     /*
0527      * Ask the CPU for the size of the state.
0528      */
0529     int sz = xfeature_size(nr);
0530     /*
0531      * Match each CPU state with the corresponding software
0532      * structure.
0533      */
0534     XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
0535     XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
0536     XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
0537     XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
0538     XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
0539     XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
0540     XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);
0541     XCHECK_SZ(sz, nr, XFEATURE_PASID,     struct ia32_pasid_state);
0542     XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
0543 
0544     /* The tile data size varies between implementations. */
0545     if (nr == XFEATURE_XTILE_DATA)
0546         check_xtile_data_against_struct(sz);
0547 
0548     /*
0549      * Make *SURE* to add any feature numbers below if
0550      * there are "holes" in the xsave state component
0551      * numbers.
0552      */
0553     if ((nr < XFEATURE_YMM) ||
0554         (nr >= XFEATURE_MAX) ||
0555         (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
0556         ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
0557         WARN_ONCE(1, "no structure for xstate: %d\n", nr);
0558         XSTATE_WARN_ON(1);
0559         return false;
0560     }
0561     return true;
0562 }
0563 
0564 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
0565 {
0566     unsigned int topmost = fls64(xfeatures) - 1;
0567     unsigned int offset = xstate_offsets[topmost];
0568 
0569     if (topmost <= XFEATURE_SSE)
0570         return sizeof(struct xregs_state);
0571 
0572     if (compacted)
0573         offset = xfeature_get_offset(xfeatures, topmost);
0574     return offset + xstate_sizes[topmost];
0575 }
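
/*
 * Example (illustrative): in non-compacted format the size is simply the
 * offset of the highest enabled extended feature plus its size, e.g.
 * xstate_offsets[XFEATURE_YMM] + xstate_sizes[XFEATURE_YMM] for
 * FP | SSE | YMM.  In compacted format the offset has to be recomputed
 * because disabled components do not occupy space in the buffer.
 */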
0576 
0577 /*
0578  * This essentially double-checks what the cpu told us about
0579  * how large the XSAVE buffer needs to be.  We are recalculating
0580  * it to be safe.
0581  *
0582  * Independent XSAVE features allocate their own buffers and are not
0583  * covered by these checks. Only the size of the buffer for task->fpu
0584  * is checked here.
0585  */
0586 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
0587 {
0588     bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
0589     bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
0590     unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
0591     int i;
0592 
0593     for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
0594         if (!check_xstate_against_struct(i))
0595             return false;
0596         /*
0597          * Supervisor state components can be managed only by
0598          * XSAVES.
0599          */
0600         if (!xsaves && xfeature_is_supervisor(i)) {
0601             XSTATE_WARN_ON(1);
0602             return false;
0603         }
0604     }
0605     size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
0606     XSTATE_WARN_ON(size != kernel_size);
0607     return size == kernel_size;
0608 }
0609 
0610 /*
0611  * Get total size of enabled xstates in XCR0 | IA32_XSS.
0612  *
0613  * Note the SDM's wording here.  "sub-function 0" only enumerates
0614  * the size of the *user* states.  If we use it to size a buffer
0615  * that we use 'XSAVES' on, we could potentially overflow the
0616  * buffer because 'XSAVES' saves system states too.
0617  *
0618  * This also takes compaction into account. So this works for
0619  * XSAVEC as well.
0620  */
0621 static unsigned int __init get_compacted_size(void)
0622 {
0623     unsigned int eax, ebx, ecx, edx;
0624     /*
0625      * - CPUID function 0DH, sub-function 1:
0626      *    EBX enumerates the size (in bytes) required by
0627      *    the XSAVES instruction for an XSAVE area
0628      *    containing all the state components
0629      *    corresponding to bits currently set in
0630      *    XCR0 | IA32_XSS.
0631      *
0632      * When XSAVES is not available but XSAVEC is (virt), then there
0633      * are no supervisor states, but XSAVEC still uses compacted
0634      * format.
0635      */
0636     cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
0637     return ebx;
0638 }
0639 
0640 /*
0641  * Get the total size of the enabled xstates without the independent supervisor
0642  * features.
0643  */
0644 static unsigned int __init get_xsave_compacted_size(void)
0645 {
0646     u64 mask = xfeatures_mask_independent();
0647     unsigned int size;
0648 
0649     if (!mask)
0650         return get_compacted_size();
0651 
0652     /* Disable independent features. */
0653     wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
0654 
0655     /*
0656      * Ask the hardware what size is required of the buffer.
0657      * This is the size required for the task->fpu buffer.
0658      */
0659     size = get_compacted_size();
0660 
0661     /* Re-enable independent features so XSAVES will work on them again. */
0662     wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
0663 
0664     return size;
0665 }
0666 
0667 static unsigned int __init get_xsave_size_user(void)
0668 {
0669     unsigned int eax, ebx, ecx, edx;
0670     /*
0671      * - CPUID function 0DH, sub-function 0:
0672      *    EBX enumerates the size (in bytes) required by
0673      *    the XSAVE instruction for an XSAVE area
0674      *    containing all the *user* state components
0675      *    corresponding to bits currently set in XCR0.
0676      */
0677     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
0678     return ebx;
0679 }
0680 
0681 /*
0682  * Will the runtime-enumerated 'xstate_size' fit in the init
0683  * task's statically-allocated buffer?
0684  */
0685 static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
0686 {
0687     if (test_xstate_size <= sizeof(init_fpstate.regs))
0688         return true;
0689 
0690     pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
0691             sizeof(init_fpstate.regs), test_xstate_size);
0692     return false;
0693 }
0694 
0695 static int __init init_xstate_size(void)
0696 {
0697     /* Recompute the context size for enabled features: */
0698     unsigned int user_size, kernel_size, kernel_default_size;
0699     bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
0700 
0701     /* Uncompacted user space size */
0702     user_size = get_xsave_size_user();
0703 
0704     /*
0705      * XSAVES kernel size includes supervisor states and uses compacted
0706      * format. XSAVEC uses compacted format, but does not save
0707      * supervisor states.
0708      *
0709      * XSAVE[OPT] do not support supervisor states so kernel and user
0710      * size is identical.
0711      */
0712     if (compacted)
0713         kernel_size = get_xsave_compacted_size();
0714     else
0715         kernel_size = user_size;
0716 
0717     kernel_default_size =
0718         xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
0719 
0720     /* Ensure we have the space to store all default enabled features. */
0721     if (!is_supported_xstate_size(kernel_default_size))
0722         return -EINVAL;
0723 
0724     if (!paranoid_xstate_size_valid(kernel_size))
0725         return -EINVAL;
0726 
0727     fpu_kernel_cfg.max_size = kernel_size;
0728     fpu_user_cfg.max_size = user_size;
0729 
0730     fpu_kernel_cfg.default_size = kernel_default_size;
0731     fpu_user_cfg.default_size =
0732         xstate_calculate_size(fpu_user_cfg.default_features, false);
0733 
0734     return 0;
0735 }
0736 
0737 /*
0738  * We enabled the XSAVE hardware, but something went wrong and
0739  * we can not use it.  Disable it.
0740  */
0741 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
0742 {
0743     fpu_kernel_cfg.max_features = 0;
0744     cr4_clear_bits(X86_CR4_OSXSAVE);
0745     setup_clear_cpu_cap(X86_FEATURE_XSAVE);
0746 
0747     /* Restore the legacy size.*/
0748     fpu_kernel_cfg.max_size = legacy_size;
0749     fpu_kernel_cfg.default_size = legacy_size;
0750     fpu_user_cfg.max_size = legacy_size;
0751     fpu_user_cfg.default_size = legacy_size;
0752 
0753     /*
0754      * Prevent enabling the static branch which enables writes to the
0755      * XFD MSR.
0756      */
0757     init_fpstate.xfd = 0;
0758 
0759     fpstate_reset(&current->thread.fpu);
0760 }
0761 
0762 /*
0763  * Enable and initialize the xsave feature.
0764  * Called once per system bootup.
0765  */
0766 void __init fpu__init_system_xstate(unsigned int legacy_size)
0767 {
0768     unsigned int eax, ebx, ecx, edx;
0769     u64 xfeatures;
0770     int err;
0771     int i;
0772 
0773     if (!boot_cpu_has(X86_FEATURE_FPU)) {
0774         pr_info("x86/fpu: No FPU detected\n");
0775         return;
0776     }
0777 
0778     if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
0779         pr_info("x86/fpu: x87 FPU will use %s\n",
0780             boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
0781         return;
0782     }
0783 
0784     if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
0785         WARN_ON_FPU(1);
0786         return;
0787     }
0788 
0789     /*
0790      * Find user xstates supported by the processor.
0791      */
0792     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
0793     fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
0794 
0795     /*
0796      * Find supervisor xstates supported by the processor.
0797      */
0798     cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
0799     fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
0800 
0801     if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
0802         /*
0803          * This indicates that something really unexpected happened
0804          * with the enumeration.  Disable XSAVE and try to continue
0805          * booting without it.  This is too early to BUG().
0806          */
0807         pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
0808                fpu_kernel_cfg.max_features);
0809         goto out_disable;
0810     }
0811 
0812     /*
0813      * Clear XSAVE features that are disabled in the normal CPUID.
0814      */
0815     for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
0816         unsigned short cid = xsave_cpuid_features[i];
0817 
0818         /* Careful: X86_FEATURE_FPU is 0! */
0819         if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
0820             fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
0821     }
0822 
0823     if (!cpu_feature_enabled(X86_FEATURE_XFD))
0824         fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
0825 
0826     if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
0827         fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
0828     else
0829         fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
0830                     XFEATURE_MASK_SUPERVISOR_SUPPORTED;
0831 
0832     fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
0833     fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
0834 
0835     /* Clean out dynamic features from default */
0836     fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
0837     fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
0838 
0839     fpu_user_cfg.default_features = fpu_user_cfg.max_features;
0840     fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
0841 
0842     /* Store it for paranoia check at the end */
0843     xfeatures = fpu_kernel_cfg.max_features;
0844 
0845     /*
0846      * Initialize the default XFD state in init_fpstate and enable the
0847      * dynamic sizing mechanism if dynamic states are available.  The
0848      * static key cannot be enabled here because this runs before
0849      * jump_label_init(). This is delayed to an initcall.
0850      */
0851     init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
0852 
0853     /* Set up compaction feature bit */
0854     if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
0855         cpu_feature_enabled(X86_FEATURE_XSAVES))
0856         setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
0857 
0858     /* Enable xstate instructions to be able to continue with initialization: */
0859     fpu__init_cpu_xstate();
0860 
0861     /* Cache size, offset and flags for initialization */
0862     setup_xstate_cache();
0863 
0864     err = init_xstate_size();
0865     if (err)
0866         goto out_disable;
0867 
0868     /* Reset the state for the current task */
0869     fpstate_reset(&current->thread.fpu);
0870 
0871     /*
0872      * Update info used for ptrace frames; use standard-format size and no
0873      * supervisor xstates:
0874      */
0875     update_regset_xstate_info(fpu_user_cfg.max_size,
0876                   fpu_user_cfg.max_features);
0877 
0878     setup_init_fpu_buf();
0879 
0880     /*
0881      * Paranoia check whether something in the setup modified the
0882      * xfeatures mask.
0883      */
0884     if (xfeatures != fpu_kernel_cfg.max_features) {
0885         pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
0886                xfeatures, fpu_kernel_cfg.max_features);
0887         goto out_disable;
0888     }
0889 
0890     print_xstate_offset_size();
0891     pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
0892         fpu_kernel_cfg.max_features,
0893         fpu_kernel_cfg.max_size,
0894         boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
0895     return;
0896 
0897 out_disable:
0898     /* something went wrong, try to boot without any XSAVE support */
0899     fpu__init_disable_system_xstate(legacy_size);
0900 }
0901 
0902 /*
0903  * Restore minimal FPU state after suspend:
0904  */
0905 void fpu__resume_cpu(void)
0906 {
0907     /*
0908      * Restore XCR0 on xsave capable CPUs:
0909      */
0910     if (cpu_feature_enabled(X86_FEATURE_XSAVE))
0911         xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
0912 
0913     /*
0914      * Restore IA32_XSS. The same CPUID bit enumerates support
0915      * of XSAVES and MSR_IA32_XSS.
0916      */
0917     if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
0918         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
0919                      xfeatures_mask_independent());
0920     }
0921 
0922     if (fpu_state_size_dynamic())
0923         wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
0924 }
0925 
0926 /*
0927  * Given an xstate feature nr, calculate where in the xsave
0928  * buffer the state is.  Callers should ensure that the buffer
0929  * is valid.
0930  */
0931 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
0932 {
0933     u64 xcomp_bv = xsave->header.xcomp_bv;
0934 
0935     if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
0936         return NULL;
0937 
0938     if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
0939         if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
0940             return NULL;
0941     }
0942 
0943     return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
0944 }
0945 
0946 /*
0947  * Given the xsave area and a state inside, this function returns the
0948  * address of the state.
0949  *
0950  * This is the API that is called to get xstate address in either
0951  * standard format or compacted format of xsave area.
0952  *
0953  * Note that if there is no data for the field in the xsave buffer
0954  * this will return NULL.
0955  *
0956  * Inputs:
0957  *  xstate: the thread's storage area for all FPU data
0958  *  xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
0959  *  XFEATURE_SSE, etc...)
0960  * Output:
0961  *  address of the state in the xsave area, or NULL if the
0962  *  field is not present in the xsave buffer.
0963  */
0964 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
0965 {
0966     /*
0967      * Do we even *have* xsave state?
0968      */
0969     if (!boot_cpu_has(X86_FEATURE_XSAVE))
0970         return NULL;
0971 
0972     /*
0973      * We should not ever be requesting features that we
0974      * have not enabled.
0975      */
0976     if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
0977         return NULL;
0978 
0979     /*
0980      * This assumes the last 'xsave*' instruction to
0981      * have requested that 'xfeature_nr' be saved.
0982      * If it did not, we might be seeing an old value
0983      * of the field in the buffer.
0984      *
0985      * This can happen because the last 'xsave' did not
0986      * request that this feature be saved (unlikely)
0987      * or because the "init optimization" caused it
0988      * to not be saved.
0989      */
0990     if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
0991         return NULL;
0992 
0993     return __raw_xsave_addr(xsave, xfeature_nr);
0994 }
0995 
0996 #ifdef CONFIG_ARCH_HAS_PKEYS
0997 
0998 /*
0999  * This will go out and modify PKRU register to set the access
1000  * rights for @pkey to @init_val.
1001  */
1002 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1003                   unsigned long init_val)
1004 {
1005     u32 old_pkru, new_pkru_bits = 0;
1006     int pkey_shift;
1007 
1008     /*
1009      * This check implies XSAVE support.  OSPKE only gets
1010      * set if we enable XSAVE and we enable PKU in XCR0.
1011      */
1012     if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1013         return -EINVAL;
1014 
1015     /*
1016      * This code should only be called with valid 'pkey'
1017      * values originating from in-kernel users.  Complain
1018      * if a bad value is observed.
1019      */
1020     if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1021         return -EINVAL;
1022 
1023     /* Set the bits we need in PKRU:  */
1024     if (init_val & PKEY_DISABLE_ACCESS)
1025         new_pkru_bits |= PKRU_AD_BIT;
1026     if (init_val & PKEY_DISABLE_WRITE)
1027         new_pkru_bits |= PKRU_WD_BIT;
1028 
1029     /* Shift the bits in to the correct place in PKRU for pkey: */
1030     pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1031     new_pkru_bits <<= pkey_shift;
1032 
1033     /* Get old PKRU and mask off any old bits in place: */
1034     old_pkru = read_pkru();
1035     old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1036 
1037     /* Write old part along with new part: */
1038     write_pkru(old_pkru | new_pkru_bits);
1039 
1040     return 0;
1041 }
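
/*
 * Worked example (illustrative): for pkey = 2 and
 * init_val = PKEY_DISABLE_WRITE, pkey_shift is 2 * PKRU_BITS_PER_PKEY = 4,
 * so new_pkru_bits contains only PKRU_WD_BIT << 4 (PKRU bit 5); bits 4
 * and 5 of the old PKRU value are cleared first, leaving the
 * access-disable bit for that key unset.
 */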
1042 #endif /* CONFIG_ARCH_HAS_PKEYS */
1043 
1044 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1045              void *init_xstate, unsigned int size)
1046 {
1047     membuf_write(to, from_xstate ? xstate : init_xstate, size);
1048 }
1049 
1050 /**
1051  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1052  * @to:     membuf descriptor
1053  * @fpstate:    The fpstate buffer from which to copy
1054  * @pkru_val:   The PKRU value to store in the PKRU component
1055  * @copy_mode:  The requested copy mode
1056  *
1057  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1058  * format, i.e. from the kernel internal hardware dependent storage format
1059  * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1060  *
1061  * It supports partial copy but @to.pos always starts from zero.
1062  */
1063 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1064                    u32 pkru_val, enum xstate_copy_mode copy_mode)
1065 {
1066     const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1067     struct xregs_state *xinit = &init_fpstate.regs.xsave;
1068     struct xregs_state *xsave = &fpstate->regs.xsave;
1069     struct xstate_header header;
1070     unsigned int zerofrom;
1071     u64 mask;
1072     int i;
1073 
1074     memset(&header, 0, sizeof(header));
1075     header.xfeatures = xsave->header.xfeatures;
1076 
1077     /* Mask out the feature bits depending on copy mode */
1078     switch (copy_mode) {
1079     case XSTATE_COPY_FP:
1080         header.xfeatures &= XFEATURE_MASK_FP;
1081         break;
1082 
1083     case XSTATE_COPY_FX:
1084         header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1085         break;
1086 
1087     case XSTATE_COPY_XSAVE:
1088         header.xfeatures &= fpstate->user_xfeatures;
1089         break;
1090     }
1091 
1092     /* Copy FP state up to MXCSR */
1093     copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1094              &xinit->i387, off_mxcsr);
1095 
1096     /* Copy MXCSR when SSE or YMM are set in the feature mask */
1097     copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1098              &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1099              MXCSR_AND_FLAGS_SIZE);
1100 
1101     /* Copy the remaining FP state */
1102     copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1103              &to, &xsave->i387.st_space, &xinit->i387.st_space,
1104              sizeof(xsave->i387.st_space));
1105 
1106     /* Copy the SSE state - shared with YMM, but independently managed */
1107     copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1108              &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1109              sizeof(xsave->i387.xmm_space));
1110 
1111     if (copy_mode != XSTATE_COPY_XSAVE)
1112         goto out;
1113 
1114     /* Zero the padding area */
1115     membuf_zero(&to, sizeof(xsave->i387.padding));
1116 
1117     /* Copy xsave->i387.sw_reserved */
1118     membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1119 
1120     /* Copy the user space relevant state of @xsave->header */
1121     membuf_write(&to, &header, sizeof(header));
1122 
1123     zerofrom = offsetof(struct xregs_state, extended_state_area);
1124 
1125     /*
1126      * The ptrace buffer is in non-compacted XSAVE format.  In
1127      * non-compacted format disabled features still occupy state space,
1128      * but there is no state to copy from in the compacted
1129      * init_fpstate. The gap tracking will zero these states.
1130      */
1131     mask = fpstate->user_xfeatures;
1132 
1133     for_each_extended_xfeature(i, mask) {
1134         /*
1135          * If there was a feature or alignment gap, zero the space
1136          * in the destination buffer.
1137          */
1138         if (zerofrom < xstate_offsets[i])
1139             membuf_zero(&to, xstate_offsets[i] - zerofrom);
1140 
1141         if (i == XFEATURE_PKRU) {
1142             struct pkru_state pkru = {0};
1143             /*
1144              * PKRU is not necessarily up to date in the
1145              * XSAVE buffer. Use the provided value.
1146              */
1147             pkru.pkru = pkru_val;
1148             membuf_write(&to, &pkru, sizeof(pkru));
1149         } else {
1150             copy_feature(header.xfeatures & BIT_ULL(i), &to,
1151                      __raw_xsave_addr(xsave, i),
1152                      __raw_xsave_addr(xinit, i),
1153                      xstate_sizes[i]);
1154         }
1155         /*
1156          * Keep track of the last copied state in the non-compacted
1157          * target buffer for gap zeroing.
1158          */
1159         zerofrom = xstate_offsets[i] + xstate_sizes[i];
1160     }
1161 
1162 out:
1163     if (to.left)
1164         membuf_zero(&to, to.left);
1165 }
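
/*
 * Illustrative example of the gap zeroing above: if
 * fpstate->user_xfeatures contains PKRU but not the AVX-512 components,
 * the loop copies up through YMM, then zero-fills the fixed-offset space
 * the disabled AVX-512 states would occupy in the non-compacted UABI
 * layout before emitting the PKRU component at
 * xstate_offsets[XFEATURE_PKRU].
 */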
1166 
1167 /**
1168  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1169  * @to:     membuf descriptor
1170  * @tsk:    The task from which to copy the saved xstate
1171  * @copy_mode:  The requested copy mode
1172  *
1173  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1174  * format, i.e. from the kernel internal hardware dependent storage format
1175  * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1176  *
1177  * It supports partial copy but @to.pos always starts from zero.
1178  */
1179 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1180                  enum xstate_copy_mode copy_mode)
1181 {
1182     __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1183                   tsk->thread.pkru, copy_mode);
1184 }
1185 
1186 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1187                 const void *kbuf, const void __user *ubuf)
1188 {
1189     if (kbuf) {
1190         memcpy(dst, kbuf + offset, size);
1191     } else {
1192         if (copy_from_user(dst, ubuf + offset, size))
1193             return -EFAULT;
1194     }
1195     return 0;
1196 }
1197 
1198 
1199 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1200                    const void __user *ubuf)
1201 {
1202     struct xregs_state *xsave = &fpstate->regs.xsave;
1203     unsigned int offset, size;
1204     struct xstate_header hdr;
1205     u64 mask;
1206     int i;
1207 
1208     offset = offsetof(struct xregs_state, header);
1209     if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1210         return -EFAULT;
1211 
1212     if (validate_user_xstate_header(&hdr, fpstate))
1213         return -EINVAL;
1214 
1215     /* Validate MXCSR when any of the related features is in use */
1216     mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1217     if (hdr.xfeatures & mask) {
1218         u32 mxcsr[2];
1219 
1220         offset = offsetof(struct fxregs_state, mxcsr);
1221         if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1222             return -EFAULT;
1223 
1224         /* Reserved bits in MXCSR must be zero. */
1225         if (mxcsr[0] & ~mxcsr_feature_mask)
1226             return -EINVAL;
1227 
1228         /* SSE and YMM require MXCSR even when FP is not in use. */
1229         if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1230             xsave->i387.mxcsr = mxcsr[0];
1231             xsave->i387.mxcsr_mask = mxcsr[1];
1232         }
1233     }
1234 
1235     for (i = 0; i < XFEATURE_MAX; i++) {
1236         mask = BIT_ULL(i);
1237 
1238         if (hdr.xfeatures & mask) {
1239             void *dst = __raw_xsave_addr(xsave, i);
1240 
1241             offset = xstate_offsets[i];
1242             size = xstate_sizes[i];
1243 
1244             if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1245                 return -EFAULT;
1246         }
1247     }
1248 
1249     /*
1250      * The state that came in from userspace was user-state only.
1251      * Mask all the user states out of 'xfeatures':
1252      */
1253     xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1254 
1255     /*
1256      * Add back in the features that came in from userspace:
1257      */
1258     xsave->header.xfeatures |= hdr.xfeatures;
1259 
1260     return 0;
1261 }
1262 
1263 /*
1264  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1265  * format and copy to the target thread. Used by ptrace and KVM.
1266  */
1267 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
1268 {
1269     return copy_uabi_to_xstate(fpstate, kbuf, NULL);
1270 }
1271 
1272 /*
1273  * Convert from a sigreturn standard-format user-space buffer to kernel
1274  * XSAVE[S] format and copy to the target thread. This is called from the
1275  * sigreturn() and rt_sigreturn() system calls.
1276  */
1277 int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
1278                       const void __user *ubuf)
1279 {
1280     return copy_uabi_to_xstate(fpstate, NULL, ubuf);
1281 }
1282 
1283 static bool validate_independent_components(u64 mask)
1284 {
1285     u64 xchk;
1286 
1287     if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1288         return false;
1289 
1290     xchk = ~xfeatures_mask_independent();
1291 
1292     if (WARN_ON_ONCE(!mask || mask & xchk))
1293         return false;
1294 
1295     return true;
1296 }
1297 
1298 /**
1299  * xsaves - Save selected components to a kernel xstate buffer
1300  * @xstate: Pointer to the buffer
1301  * @mask:   Feature mask to select the components to save
1302  *
1303  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1304  * XSAVES does not write the full xstate header. Before first use the
1305  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1306  * can #GP.
1307  *
1308  * The feature mask must be a subset of the independent features.
1309  */
1310 void xsaves(struct xregs_state *xstate, u64 mask)
1311 {
1312     int err;
1313 
1314     if (!validate_independent_components(mask))
1315         return;
1316 
1317     XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1318     WARN_ON_ONCE(err);
1319 }
1320 
1321 /**
1322  * xrstors - Restore selected components from a kernel xstate buffer
1323  * @xstate: Pointer to the buffer
1324  * @mask:   Feature mask to select the components to restore
1325  *
1326  * The @xstate buffer must be 64 byte aligned and correctly initialized
1327  * otherwise XRSTORS from that buffer can #GP.
1328  *
1329  * Proper usage is to restore the state which was saved with
1330  * xsaves() into @xstate.
1331  *
1332  * The feature mask must be a subset of the independent features.
1333  */
1334 void xrstors(struct xregs_state *xstate, u64 mask)
1335 {
1336     int err;
1337 
1338     if (!validate_independent_components(mask))
1339         return;
1340 
1341     XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1342     WARN_ON_ONCE(err);
1343 }
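
/*
 * Note (context, not enforced here): at the time of writing the only
 * independent component is the architectural LBR state
 * (XFEATURE_MASK_LBR), which perf saves and restores with
 * xsaves()/xrstors() into its own buffers rather than task->fpu.
 */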
1344 
1345 #if IS_ENABLED(CONFIG_KVM)
1346 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1347 {
1348     void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1349 
1350     if (addr)
1351         memset(addr, 0, xstate_sizes[xfeature]);
1352 }
1353 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1354 #endif
1355 
1356 #ifdef CONFIG_X86_64
1357 
1358 #ifdef CONFIG_X86_DEBUG_FPU
1359 /*
1360  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1361  * can safely operate on the @fpstate buffer.
1362  */
1363 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1364 {
1365     u64 xfd = __this_cpu_read(xfd_state);
1366 
1367     if (fpstate->xfd == xfd)
1368         return true;
1369 
1370      /*
1371       * The XFD MSR does not match fpstate->xfd. That's invalid when
1372       * the passed in fpstate is current's fpstate.
1373       */
1374     if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1375         return false;
1376 
1377     /*
1378      * XRSTOR(S) from init_fpstate are always correct as it will just
1379      * bring all components into init state and not read from the
1380      * buffer. XSAVE(S) raises #PF after init.
1381      */
1382     if (fpstate == &init_fpstate)
1383         return rstor;
1384 
1385     /*
1386      * XSAVE(S): clone(), fpu_swap_kvm_fpu()
1387      * XRSTORS(S): fpu_swap_kvm_fpu()
1388      */
1389 
1390     /*
1391      * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1392      * the buffer area for XFD-disabled state components.
1393      */
1394     mask &= ~xfd;
1395 
1396     /*
1397      * Remove features which are valid in fpstate. They
1398      * have space allocated in fpstate.
1399      */
1400     mask &= ~fpstate->xfeatures;
1401 
1402     /*
1403      * Any remaining state components in 'mask' might be written
1404      * by XSAVE/XRSTOR. Fail validation if found.
1405      */
1406     return !mask;
1407 }
1408 
1409 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1410 {
1411     WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1412 }
1413 #endif /* CONFIG_X86_DEBUG_FPU */
1414 
1415 static int __init xfd_update_static_branch(void)
1416 {
1417     /*
1418      * If init_fpstate.xfd has bits set then dynamic features are
1419      * available and the dynamic sizing must be enabled.
1420      */
1421     if (init_fpstate.xfd)
1422         static_branch_enable(&__fpu_state_size_dynamic);
1423     return 0;
1424 }
1425 arch_initcall(xfd_update_static_branch)
1426 
1427 void fpstate_free(struct fpu *fpu)
1428 {
1429     if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1430         vfree(fpu->fpstate);
1431 }
1432 
1433 /**
1434  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1435  *
1436  * @xfeatures:  A bitmap of xstate features which extend the enabled features
1437  *      of that task
1438  * @ksize:  The required size for the kernel buffer
1439  * @usize:  The required size for user space buffers
1440  * @guest_fpu:  Pointer to a guest FPU container. NULL for host allocations
1441  *
1442  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1443  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1444  * with large states are likely to live longer.
1445  *
1446  * Returns: 0 on success, -ENOMEM on allocation error.
1447  */
1448 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1449                unsigned int usize, struct fpu_guest *guest_fpu)
1450 {
1451     struct fpu *fpu = &current->thread.fpu;
1452     struct fpstate *curfps, *newfps = NULL;
1453     unsigned int fpsize;
1454     bool in_use;
1455 
1456     fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1457 
1458     newfps = vzalloc(fpsize);
1459     if (!newfps)
1460         return -ENOMEM;
1461     newfps->size = ksize;
1462     newfps->user_size = usize;
1463     newfps->is_valloc = true;
1464 
1465     /*
1466      * When a guest FPU is supplied, use @guest_fpu->fpstate
1467      * as the reference, independent of whether it is in use or not.
1468      */
1469     curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1470 
1471     /* Determine whether @curfps is the active fpstate */
1472     in_use = fpu->fpstate == curfps;
1473 
1474     if (guest_fpu) {
1475         newfps->is_guest = true;
1476         newfps->is_confidential = curfps->is_confidential;
1477         newfps->in_use = curfps->in_use;
1478         guest_fpu->xfeatures |= xfeatures;
1479         guest_fpu->uabi_size = usize;
1480     }
1481 
1482     fpregs_lock();
1483     /*
1484      * If @curfps is in use, ensure that the current state is in the
1485      * registers before swapping fpstate as that might invalidate it
1486      * due to layout changes.
1487      */
1488     if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1489         fpregs_restore_userregs();
1490 
1491     newfps->xfeatures = curfps->xfeatures | xfeatures;
1492 
1493     if (!guest_fpu)
1494         newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1495 
1496     newfps->xfd = curfps->xfd & ~xfeatures;
1497 
1498     /* Do the final updates within the locked region */
1499     xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1500 
1501     if (guest_fpu) {
1502         guest_fpu->fpstate = newfps;
1503         /* If curfps is active, update the FPU fpstate pointer */
1504         if (in_use)
1505             fpu->fpstate = newfps;
1506     } else {
1507         fpu->fpstate = newfps;
1508     }
1509 
1510     if (in_use)
1511         xfd_update_state(fpu->fpstate);
1512     fpregs_unlock();
1513 
1514     /* Only free valloc'ed state */
1515     if (curfps && curfps->is_valloc)
1516         vfree(curfps);
1517 
1518     return 0;
1519 }
1520 
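/*
 * Check whether a signal frame that grows to hold @usize bytes of user
 * xstate still fits on every already registered sigaltstack in the
 * thread group; if any registered stack would be too small, the
 * permission request is refused with -ENOSPC.
 */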
1521 static int validate_sigaltstack(unsigned int usize)
1522 {
1523     struct task_struct *thread, *leader = current->group_leader;
1524     unsigned long framesize = get_sigframe_size();
1525 
1526     lockdep_assert_held(&current->sighand->siglock);
1527 
1528     /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1529     framesize -= fpu_user_cfg.max_size;
1530     framesize += usize;
1531     for_each_thread(leader, thread) {
1532         if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1533             return -ENOSPC;
1534     }
1535     return 0;
1536 }
1537 
1538 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1539 {
1540     /*
1541      * This deliberately does not exclude !XSAVES as we still might
1542      * decide to optionally context switch XCR0 or talk the silicon
1543      * vendors into extending XFD for the pre AMX states, especially
1544      * AVX512.
1545      */
1546     bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1547     struct fpu *fpu = &current->group_leader->thread.fpu;
1548     struct fpu_state_perm *perm;
1549     unsigned int ksize, usize;
1550     u64 mask;
1551     int ret = 0;
1552 
1553     /* Check whether fully enabled */
1554     if ((permitted & requested) == requested)
1555         return 0;
1556 
1557     /* Calculate the resulting kernel state size */
1558     mask = permitted | requested;
1559     /* Take supervisor states into account on the host */
1560     if (!guest)
1561         mask |= xfeatures_mask_supervisor();
1562     ksize = xstate_calculate_size(mask, compacted);
1563 
1564     /* Calculate the resulting user state size */
1565     mask &= XFEATURE_MASK_USER_SUPPORTED;
1566     usize = xstate_calculate_size(mask, false);
1567 
1568     if (!guest) {
1569         ret = validate_sigaltstack(usize);
1570         if (ret)
1571             return ret;
1572     }
1573 
1574     perm = guest ? &fpu->guest_perm : &fpu->perm;
1575     /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1576     WRITE_ONCE(perm->__state_perm, mask);
1577     /* Protected by sighand lock */
1578     perm->__state_size = ksize;
1579     perm->__user_state_size = usize;
1580     return ret;
1581 }
1582 
1583 /*
1584  * Permissions array to map facilities with more than one component
1585  */
1586 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1587     [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1588 };
1589 
1590 static int xstate_request_perm(unsigned long idx, bool guest)
1591 {
1592     u64 permitted, requested;
1593     int ret;
1594 
1595     if (idx >= XFEATURE_MAX)
1596         return -EINVAL;
1597 
1598     /*
1599      * Look up the facility mask which can require more than
1600      * one xstate component.
1601      */
1602     idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1603     requested = xstate_prctl_req[idx];
1604     if (!requested)
1605         return -EOPNOTSUPP;
1606 
1607     if ((fpu_user_cfg.max_features & requested) != requested)
1608         return -EOPNOTSUPP;
1609 
1610     /* Lockless quick check */
1611     permitted = xstate_get_group_perm(guest);
1612     if ((permitted & requested) == requested)
1613         return 0;
1614 
1615     /* Protect against concurrent modifications */
1616     spin_lock_irq(&current->sighand->siglock);
1617     permitted = xstate_get_group_perm(guest);
1618 
1619     /* First vCPU allocation locks the permissions. */
1620     if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1621         ret = -EBUSY;
1622     else
1623         ret = __xstate_request_perm(permitted, requested, guest);
1624     spin_unlock_irq(&current->sighand->siglock);
1625     return ret;
1626 }
1627 
1628 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1629 {
1630     u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1631     struct fpu_state_perm *perm;
1632     unsigned int ksize, usize;
1633     struct fpu *fpu;
1634 
1635     if (!xfd_event) {
1636         if (!guest_fpu)
1637             pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1638         return 0;
1639     }
1640 
1641     /* Protect against concurrent modifications */
1642     spin_lock_irq(&current->sighand->siglock);
1643 
1644     /* If not permitted let it die */
1645     if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1646         spin_unlock_irq(&current->sighand->siglock);
1647         return -EPERM;
1648     }
1649 
1650     fpu = &current->group_leader->thread.fpu;
1651     perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1652     ksize = perm->__state_size;
1653     usize = perm->__user_state_size;
1654 
1655     /*
1656      * The feature is permitted. State size is sufficient.  Dropping
1657      * the lock is safe here: even if more features are added from
1658      * another task, the retrieved buffer sizes are valid for the
1659      * currently requested feature(s).
1660      */
1661     spin_unlock_irq(&current->sighand->siglock);
1662 
1663     /*
1664      * Try to allocate a new fpstate. If that fails there is no way
1665      * out.
1666      */
1667     if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1668         return -EFAULT;
1669     return 0;
1670 }
1671 
1672 int xfd_enable_feature(u64 xfd_err)
1673 {
1674     return __xfd_enable_feature(xfd_err, NULL);
1675 }
1676 
1677 #else /* CONFIG_X86_64 */
1678 static inline int xstate_request_perm(unsigned long idx, bool guest)
1679 {
1680     return -EPERM;
1681 }
1682 #endif  /* !CONFIG_X86_64 */
1683 
1684 u64 xstate_get_guest_group_perm(void)
1685 {
1686     return xstate_get_group_perm(true);
1687 }
1688 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1689 
1690 /**
1691  * fpu_xstate_prctl - xstate permission operations
1693  * @option: A subfunction of arch_prctl()
1694  * @arg2:   option argument
1695  * Return:  0 if successful; otherwise, an error code
1696  *
1697  * Option arguments:
1698  *
1699  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1700  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1701  * ARCH_REQ_XCOMP_PERM: Facility number requested
1702  *
1703  * For facilities which require more than one XSTATE component, the request
1704  * must be the highest state component number related to that facility,
1705  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1706  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1707  */
1708 long fpu_xstate_prctl(int option, unsigned long arg2)
1709 {
1710     u64 __user *uptr = (u64 __user *)arg2;
1711     u64 permitted, supported;
1712     unsigned long idx = arg2;
1713     bool guest = false;
1714 
1715     switch (option) {
1716     case ARCH_GET_XCOMP_SUPP:
1717         supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1718         return put_user(supported, uptr);
1719 
1720     case ARCH_GET_XCOMP_PERM:
1721         /*
1722          * Lockless snapshot as it can also change right after
1723          * dropping the lock.
1724          */
1725         permitted = xstate_get_host_group_perm();
1726         permitted &= XFEATURE_MASK_USER_SUPPORTED;
1727         return put_user(permitted, uptr);
1728 
1729     case ARCH_GET_XCOMP_GUEST_PERM:
1730         permitted = xstate_get_guest_group_perm();
1731         permitted &= XFEATURE_MASK_USER_SUPPORTED;
1732         return put_user(permitted, uptr);
1733 
1734     case ARCH_REQ_XCOMP_GUEST_PERM:
1735         guest = true;
1736         fallthrough;
1737 
1738     case ARCH_REQ_XCOMP_PERM:
1739         if (!IS_ENABLED(CONFIG_X86_64))
1740             return -EOPNOTSUPP;
1741 
1742         return xstate_request_perm(idx, guest);
1743 
1744     default:
1745         return -EINVAL;
1746     }
1747 }
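
/*
 * Illustrative user space usage (see Documentation/x86/xstate.rst): a
 * task that wants to use AMX tile data is expected to request permission
 * first, roughly:
 *
 *	syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA);
 *
 * and can query the granted mask with ARCH_GET_XCOMP_PERM.
 */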
1748 
1749 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1750 /*
1751  * Report the amount of time elapsed in milliseconds since the last AVX512
1752  * use in the task.
1753  */
1754 static void avx512_status(struct seq_file *m, struct task_struct *task)
1755 {
1756     unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1757     long delta;
1758 
1759     if (!timestamp) {
1760         /*
1761          * Report -1 if no AVX512 usage
1762          */
1763         delta = -1;
1764     } else {
1765         delta = (long)(jiffies - timestamp);
1766         /*
1767          * Cap to LONG_MAX if time difference > LONG_MAX
1768          */
1769         if (delta < 0)
1770             delta = LONG_MAX;
1771         delta = jiffies_to_msecs(delta);
1772     }
1773 
1774     seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1775     seq_putc(m, '\n');
1776 }
1777 
1778 /*
1779  * Report architecture specific information
1780  */
1781 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1782             struct pid *pid, struct task_struct *task)
1783 {
1784     /*
1785      * Report AVX512 state if the processor and the build options support it.
1786      */
1787     if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1788         avx512_status(m, task);
1789 
1790     return 0;
1791 }
1792 #endif /* CONFIG_PROC_PID_ARCH_STATUS */