0001
0002 #ifndef _ASM_X86_XOR_H
0003 #define _ASM_X86_XOR_H
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #include <asm/fpu/api.h>
0029
/*
 * Inline-asm constraint used for the per-iteration pointer increment
 * (the [inc] operand of the loops in this file).
 *
 *  - 32-bit: "i"  - the increment must be an immediate.
 *  - 64-bit: "re" - a register, or a constant that fits a sign-extended
 *                   32-bit immediate.
 *
 * NOTE(review): the reason for the 32-bit/64-bit split is not visible
 * here; presumably "i" avoids spending a register on 32-bit - confirm.
 */
#ifdef CONFIG_X86_32
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
0036
/*
 * String-building helpers for the SSE inline-asm loops in this file.
 * Each macro expands to (a piece of) an assembler string literal, so they
 * are meant to be juxtaposed inside an asm() template.
 *
 *   OFFS(x)     byte offset of the x-th 16-byte slot:      "16*(x)"
 *   PF_OFFS(x)  the same slot, 256 bytes further ahead:    "256+16*(x)"
 *
 *   LD(x, y)    movaps  slot x of %[p1]  -> %xmm<y>
 *   ST(x, y)    movaps  %xmm<y>          -> slot x of %[p1]
 *   XOn(x, y)   xorps   slot x of %[p<n+1>] into %xmm<y>   (n = 1..4)
 *   PFn(x)      prefetchnta of PF_OFFS(x) from %[p<n+1>]   (n = 0..4,
 *               PF0 refers to %[p1])
 *   NOP(x)      expands to nothing; placeholder where no prefetch is wanted
 *
 * The slot argument is stringified as written, so "i + 1" ends up
 * verbatim in the assembler expression and is evaluated by the assembler.
 */
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)

/*
 * BLK64(pf, op, i): one prefetch (pf) followed by op applied to the four
 * consecutive 16-byte slots i .. i+3 using %xmm0..%xmm3, i.e. 64 bytes of
 * data per expansion.
 */
#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)
0058
0059 static void
0060 xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
0061 const unsigned long * __restrict p2)
0062 {
0063 unsigned long lines = bytes >> 8;
0064
0065 kernel_fpu_begin();
0066
0067 asm volatile(
0068 #undef BLOCK
0069 #define BLOCK(i) \
0070 LD(i, 0) \
0071 LD(i + 1, 1) \
0072 PF1(i) \
0073 PF1(i + 2) \
0074 LD(i + 2, 2) \
0075 LD(i + 3, 3) \
0076 PF0(i + 4) \
0077 PF0(i + 6) \
0078 XO1(i, 0) \
0079 XO1(i + 1, 1) \
0080 XO1(i + 2, 2) \
0081 XO1(i + 3, 3) \
0082 ST(i, 0) \
0083 ST(i + 1, 1) \
0084 ST(i + 2, 2) \
0085 ST(i + 3, 3) \
0086
0087
0088 PF0(0)
0089 PF0(2)
0090
0091 " .align 32 ;\n"
0092 " 1: ;\n"
0093
0094 BLOCK(0)
0095 BLOCK(4)
0096 BLOCK(8)
0097 BLOCK(12)
0098
0099 " add %[inc], %[p1] ;\n"
0100 " add %[inc], %[p2] ;\n"
0101 " dec %[cnt] ;\n"
0102 " jnz 1b ;\n"
0103 : [cnt] "+r" (lines),
0104 [p1] "+r" (p1), [p2] "+r" (p2)
0105 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0106 : "memory");
0107
0108 kernel_fpu_end();
0109 }
0110
0111 static void
0112 xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
0113 const unsigned long * __restrict p2)
0114 {
0115 unsigned long lines = bytes >> 8;
0116
0117 kernel_fpu_begin();
0118
0119 asm volatile(
0120 #undef BLOCK
0121 #define BLOCK(i) \
0122 BLK64(PF0, LD, i) \
0123 BLK64(PF1, XO1, i) \
0124 BLK64(NOP, ST, i) \
0125
0126 " .align 32 ;\n"
0127 " 1: ;\n"
0128
0129 BLOCK(0)
0130 BLOCK(4)
0131 BLOCK(8)
0132 BLOCK(12)
0133
0134 " add %[inc], %[p1] ;\n"
0135 " add %[inc], %[p2] ;\n"
0136 " dec %[cnt] ;\n"
0137 " jnz 1b ;\n"
0138 : [cnt] "+r" (lines),
0139 [p1] "+r" (p1), [p2] "+r" (p2)
0140 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0141 : "memory");
0142
0143 kernel_fpu_end();
0144 }
0145
0146 static void
0147 xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
0148 const unsigned long * __restrict p2,
0149 const unsigned long * __restrict p3)
0150 {
0151 unsigned long lines = bytes >> 8;
0152
0153 kernel_fpu_begin();
0154
0155 asm volatile(
0156 #undef BLOCK
0157 #define BLOCK(i) \
0158 PF1(i) \
0159 PF1(i + 2) \
0160 LD(i, 0) \
0161 LD(i + 1, 1) \
0162 LD(i + 2, 2) \
0163 LD(i + 3, 3) \
0164 PF2(i) \
0165 PF2(i + 2) \
0166 PF0(i + 4) \
0167 PF0(i + 6) \
0168 XO1(i, 0) \
0169 XO1(i + 1, 1) \
0170 XO1(i + 2, 2) \
0171 XO1(i + 3, 3) \
0172 XO2(i, 0) \
0173 XO2(i + 1, 1) \
0174 XO2(i + 2, 2) \
0175 XO2(i + 3, 3) \
0176 ST(i, 0) \
0177 ST(i + 1, 1) \
0178 ST(i + 2, 2) \
0179 ST(i + 3, 3) \
0180
0181
0182 PF0(0)
0183 PF0(2)
0184
0185 " .align 32 ;\n"
0186 " 1: ;\n"
0187
0188 BLOCK(0)
0189 BLOCK(4)
0190 BLOCK(8)
0191 BLOCK(12)
0192
0193 " add %[inc], %[p1] ;\n"
0194 " add %[inc], %[p2] ;\n"
0195 " add %[inc], %[p3] ;\n"
0196 " dec %[cnt] ;\n"
0197 " jnz 1b ;\n"
0198 : [cnt] "+r" (lines),
0199 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
0200 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0201 : "memory");
0202
0203 kernel_fpu_end();
0204 }
0205
0206 static void
0207 xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
0208 const unsigned long * __restrict p2,
0209 const unsigned long * __restrict p3)
0210 {
0211 unsigned long lines = bytes >> 8;
0212
0213 kernel_fpu_begin();
0214
0215 asm volatile(
0216 #undef BLOCK
0217 #define BLOCK(i) \
0218 BLK64(PF0, LD, i) \
0219 BLK64(PF1, XO1, i) \
0220 BLK64(PF2, XO2, i) \
0221 BLK64(NOP, ST, i) \
0222
0223 " .align 32 ;\n"
0224 " 1: ;\n"
0225
0226 BLOCK(0)
0227 BLOCK(4)
0228 BLOCK(8)
0229 BLOCK(12)
0230
0231 " add %[inc], %[p1] ;\n"
0232 " add %[inc], %[p2] ;\n"
0233 " add %[inc], %[p3] ;\n"
0234 " dec %[cnt] ;\n"
0235 " jnz 1b ;\n"
0236 : [cnt] "+r" (lines),
0237 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
0238 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0239 : "memory");
0240
0241 kernel_fpu_end();
0242 }
0243
0244 static void
0245 xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
0246 const unsigned long * __restrict p2,
0247 const unsigned long * __restrict p3,
0248 const unsigned long * __restrict p4)
0249 {
0250 unsigned long lines = bytes >> 8;
0251
0252 kernel_fpu_begin();
0253
0254 asm volatile(
0255 #undef BLOCK
0256 #define BLOCK(i) \
0257 PF1(i) \
0258 PF1(i + 2) \
0259 LD(i, 0) \
0260 LD(i + 1, 1) \
0261 LD(i + 2, 2) \
0262 LD(i + 3, 3) \
0263 PF2(i) \
0264 PF2(i + 2) \
0265 XO1(i, 0) \
0266 XO1(i + 1, 1) \
0267 XO1(i + 2, 2) \
0268 XO1(i + 3, 3) \
0269 PF3(i) \
0270 PF3(i + 2) \
0271 PF0(i + 4) \
0272 PF0(i + 6) \
0273 XO2(i, 0) \
0274 XO2(i + 1, 1) \
0275 XO2(i + 2, 2) \
0276 XO2(i + 3, 3) \
0277 XO3(i, 0) \
0278 XO3(i + 1, 1) \
0279 XO3(i + 2, 2) \
0280 XO3(i + 3, 3) \
0281 ST(i, 0) \
0282 ST(i + 1, 1) \
0283 ST(i + 2, 2) \
0284 ST(i + 3, 3) \
0285
0286
0287 PF0(0)
0288 PF0(2)
0289
0290 " .align 32 ;\n"
0291 " 1: ;\n"
0292
0293 BLOCK(0)
0294 BLOCK(4)
0295 BLOCK(8)
0296 BLOCK(12)
0297
0298 " add %[inc], %[p1] ;\n"
0299 " add %[inc], %[p2] ;\n"
0300 " add %[inc], %[p3] ;\n"
0301 " add %[inc], %[p4] ;\n"
0302 " dec %[cnt] ;\n"
0303 " jnz 1b ;\n"
0304 : [cnt] "+r" (lines), [p1] "+r" (p1),
0305 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
0306 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0307 : "memory");
0308
0309 kernel_fpu_end();
0310 }
0311
0312 static void
0313 xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
0314 const unsigned long * __restrict p2,
0315 const unsigned long * __restrict p3,
0316 const unsigned long * __restrict p4)
0317 {
0318 unsigned long lines = bytes >> 8;
0319
0320 kernel_fpu_begin();
0321
0322 asm volatile(
0323 #undef BLOCK
0324 #define BLOCK(i) \
0325 BLK64(PF0, LD, i) \
0326 BLK64(PF1, XO1, i) \
0327 BLK64(PF2, XO2, i) \
0328 BLK64(PF3, XO3, i) \
0329 BLK64(NOP, ST, i) \
0330
0331 " .align 32 ;\n"
0332 " 1: ;\n"
0333
0334 BLOCK(0)
0335 BLOCK(4)
0336 BLOCK(8)
0337 BLOCK(12)
0338
0339 " add %[inc], %[p1] ;\n"
0340 " add %[inc], %[p2] ;\n"
0341 " add %[inc], %[p3] ;\n"
0342 " add %[inc], %[p4] ;\n"
0343 " dec %[cnt] ;\n"
0344 " jnz 1b ;\n"
0345 : [cnt] "+r" (lines), [p1] "+r" (p1),
0346 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
0347 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0348 : "memory");
0349
0350 kernel_fpu_end();
0351 }
0352
0353 static void
0354 xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
0355 const unsigned long * __restrict p2,
0356 const unsigned long * __restrict p3,
0357 const unsigned long * __restrict p4,
0358 const unsigned long * __restrict p5)
0359 {
0360 unsigned long lines = bytes >> 8;
0361
0362 kernel_fpu_begin();
0363
0364 asm volatile(
0365 #undef BLOCK
0366 #define BLOCK(i) \
0367 PF1(i) \
0368 PF1(i + 2) \
0369 LD(i, 0) \
0370 LD(i + 1, 1) \
0371 LD(i + 2, 2) \
0372 LD(i + 3, 3) \
0373 PF2(i) \
0374 PF2(i + 2) \
0375 XO1(i, 0) \
0376 XO1(i + 1, 1) \
0377 XO1(i + 2, 2) \
0378 XO1(i + 3, 3) \
0379 PF3(i) \
0380 PF3(i + 2) \
0381 XO2(i, 0) \
0382 XO2(i + 1, 1) \
0383 XO2(i + 2, 2) \
0384 XO2(i + 3, 3) \
0385 PF4(i) \
0386 PF4(i + 2) \
0387 PF0(i + 4) \
0388 PF0(i + 6) \
0389 XO3(i, 0) \
0390 XO3(i + 1, 1) \
0391 XO3(i + 2, 2) \
0392 XO3(i + 3, 3) \
0393 XO4(i, 0) \
0394 XO4(i + 1, 1) \
0395 XO4(i + 2, 2) \
0396 XO4(i + 3, 3) \
0397 ST(i, 0) \
0398 ST(i + 1, 1) \
0399 ST(i + 2, 2) \
0400 ST(i + 3, 3) \
0401
0402
0403 PF0(0)
0404 PF0(2)
0405
0406 " .align 32 ;\n"
0407 " 1: ;\n"
0408
0409 BLOCK(0)
0410 BLOCK(4)
0411 BLOCK(8)
0412 BLOCK(12)
0413
0414 " add %[inc], %[p1] ;\n"
0415 " add %[inc], %[p2] ;\n"
0416 " add %[inc], %[p3] ;\n"
0417 " add %[inc], %[p4] ;\n"
0418 " add %[inc], %[p5] ;\n"
0419 " dec %[cnt] ;\n"
0420 " jnz 1b ;\n"
0421 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
0422 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
0423 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0424 : "memory");
0425
0426 kernel_fpu_end();
0427 }
0428
0429 static void
0430 xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
0431 const unsigned long * __restrict p2,
0432 const unsigned long * __restrict p3,
0433 const unsigned long * __restrict p4,
0434 const unsigned long * __restrict p5)
0435 {
0436 unsigned long lines = bytes >> 8;
0437
0438 kernel_fpu_begin();
0439
0440 asm volatile(
0441 #undef BLOCK
0442 #define BLOCK(i) \
0443 BLK64(PF0, LD, i) \
0444 BLK64(PF1, XO1, i) \
0445 BLK64(PF2, XO2, i) \
0446 BLK64(PF3, XO3, i) \
0447 BLK64(PF4, XO4, i) \
0448 BLK64(NOP, ST, i) \
0449
0450 " .align 32 ;\n"
0451 " 1: ;\n"
0452
0453 BLOCK(0)
0454 BLOCK(4)
0455 BLOCK(8)
0456 BLOCK(12)
0457
0458 " add %[inc], %[p1] ;\n"
0459 " add %[inc], %[p2] ;\n"
0460 " add %[inc], %[p3] ;\n"
0461 " add %[inc], %[p4] ;\n"
0462 " add %[inc], %[p5] ;\n"
0463 " dec %[cnt] ;\n"
0464 " jnz 1b ;\n"
0465 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
0466 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
0467 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0468 : "memory");
0469
0470 kernel_fpu_end();
0471 }
0472
0473 static struct xor_block_template xor_block_sse_pf64 = {
0474 .name = "prefetch64-sse",
0475 .do_2 = xor_sse_2_pf64,
0476 .do_3 = xor_sse_3_pf64,
0477 .do_4 = xor_sse_4_pf64,
0478 .do_5 = xor_sse_5_pf64,
0479 };
0480
/*
 * Drop the SSE asm-building helpers now that the routines using them are
 * defined, so the names are free for reuse.  OFFS, PF_OFFS and PF0..PF4
 * are deliberately left as they were: they are not undefined here.
 * NOTE(review): presumably the headers included below redefine or reuse
 * some of these names - confirm before extending this list.
 */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT
0492
0493 #ifdef CONFIG_X86_32
0494 # include <asm/xor_32.h>
0495 #else
0496 # include <asm/xor_64.h>
0497 #endif
0498
/*
 * Template selection hook: forwards the candidate FASTEST to AVX_SELECT().
 * NOTE(review): AVX_SELECT is not defined in this part of the file;
 * presumably it comes from the xor_32.h/xor_64.h include above - confirm.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
0501
0502 #endif