/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>
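
/*
 * One "line" is 512 bytes: BLOCK16() expands to sixteen BLOCK()
 * invocations, each handling one 32-byte YMM register's worth of
 * data and cycling through %ymm0-%ymm3. BLOCK() itself is
 * (re)defined inside each xor_avx_N() routine below.
 */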
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
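
/*
 * XOR 'bytes' bytes of p1 into p0, one 512-byte line per loop
 * iteration. 'bytes' must be a multiple of 512 and the buffers
 * 32-byte aligned, since vmovdqa performs aligned 256-bit
 * loads/stores. kernel_fpu_begin()/kernel_fpu_end() bracket the
 * YMM usage so the interrupted context's vector state is saved
 * and restored.
 */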
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}
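
/* Like xor_avx_2(), but XORs two sources (p1, p2) into p0. */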
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}
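
/* Like xor_avx_2(), but XORs three sources (p1..p3) into p0. */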
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}
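
/* Like xor_avx_2(), but XORs four sources (p1..p4) into p0. */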
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}
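
/*
 * Template registered with the generic XOR layer; xor_blocks()
 * dispatches through the do_2..do_5 callbacks of the selected
 * template. A direct call (hypothetical 'dst'/'src' buffers, for
 * illustration only) would look like:
 *
 *	xor_block_avx.do_2(PAGE_SIZE, dst, src);
 *
 * with the byte count a multiple of 512 and both buffers 32-byte
 * aligned.
 */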
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};
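
/*
 * Benchmark this template during XOR algorithm selection. OSXSAVE
 * is checked alongside AVX: it means the OS has enabled XSAVE, so
 * the YMM state used here is actually preserved across context
 * switches.
 */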
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)
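
/*
 * Prefer the AVX template over the benchmarked FASTEST when the CPU
 * and OS support it (used by the x86 XOR_SELECT_TEMPLATE() hook).
 */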
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif