/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>
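
/*
 * One "line" is 512 bytes: BLOCK16() expands to sixteen BLOCK()
 * invocations, each handling one 32-byte YMM register's worth of
 * data and cycling through %ymm0-%ymm3. BLOCK() itself is
 * (re)defined inside each xor_avx_N() routine below.
 */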
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
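
/*
 * XOR 'bytes' bytes of p1 into p0, one 512-byte line per loop
 * iteration. 'bytes' must be a multiple of 512 and the buffers
 * 32-byte aligned, since vmovdqa performs aligned 256-bit
 * loads/stores. kernel_fpu_begin()/kernel_fpu_end() bracket the
 * YMM usage so the interrupted context's vector state is saved
 * and restored.
 */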
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}
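
/* Like xor_avx_2(), but XORs two sources (p1, p2) into p0. */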
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}
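
/* Like xor_avx_2(), but XORs three sources (p1..p3) into p0. */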
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}
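
/* Like xor_avx_2(), but XORs four sources (p1..p4) into p0. */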
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}
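
/*
 * Template registered with the generic XOR layer; xor_blocks()
 * dispatches through the do_2..do_5 callbacks of the selected
 * template. A direct call (hypothetical 'dst'/'src' buffers, for
 * illustration only) would look like:
 *
 *	xor_block_avx.do_2(PAGE_SIZE, dst, src);
 *
 * with the byte count a multiple of 512 and both buffers 32-byte
 * aligned.
 */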
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};
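
/*
 * Benchmark this template during XOR algorithm selection. OSXSAVE
 * is checked alongside AVX: it means the OS has enabled XSAVE, so
 * the YMM state used here is actually preserved across context
 * switches.
 */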
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)
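
/*
 * Prefer the AVX template over the benchmarked FASTEST when the CPU
 * and OS support it (used by the x86 XOR_SELECT_TEMPLATE() hook).
 */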
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif