0001
0002 #ifndef _ASM_X86_XOR_32_H
0003 #define _ASM_X86_XOR_32_H
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
/*
 * Asm-text building blocks for the unrolled pII_mmx loops below.
 * x and y are pasted into the assembly string with '#', so both must
 * expand to plain integer literals/expressions:
 *   LD(x, y)  - load  quadword x (byte offset 8*x) of dest %1 into %%mmY
 *   ST(x, y)  - store %%mmY back to quadword x of dest %1
 *   XOn(x, y) - XOR   quadword x of source n (operands %2..%5) into %%mmY
 */
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
0020
0021 #include <asm/fpu/api.h>
0022
/*
 * p1[i] ^= p2[i] over 'bytes' bytes using MMX, 128 bytes per loop
 * iteration with four quadwords (%%mm0-%%mm3) in flight per BLOCK.
 * Any tail smaller than 128 bytes is not processed (bytes is
 * presumably always a multiple of 128 here - the other routines in
 * this file make the same assumption).
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	ST(i, 0) \
	XO1(i+1, 1) \
	ST(i+1, 1) \
	XO1(i + 2, 2) \
	ST(i + 2, 2) \
	XO1(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	/* Four BLOCKs of 4 quadwords = 128 bytes per pass. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance both pointers one line and loop until lines == 0. */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
0066
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes using MMX, 128 bytes per
 * loop iteration.  Same structure as xor_pII_mmx_2 but each BLOCK
 * folds in a second source (XO2) before storing.
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	ST(i, 0) \
	XO2(i + 1, 1) \
	ST(i + 1, 1) \
	XO2(i + 2, 2) \
	ST(i + 2, 2) \
	XO2(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance all three pointers one 128-byte line. */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}
0116
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes using MMX,
 * 128 bytes per loop iteration.  Each BLOCK folds in three sources
 * (XO1..XO3) before storing back to p1.
 */
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	ST(i, 0) \
	XO3(i + 1, 1) \
	ST(i + 1, 1) \
	XO3(i + 2, 2) \
	ST(i + 2, 2) \
	XO3(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance all four pointers one 128-byte line. */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
0172
0173
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over 'bytes' bytes using MMX,
 * 128 bytes per loop iteration.  Unlike the 2/3/4-source variants, p4
 * and p5 are passed to the main asm as *input* operands ("r") even
 * though the loop advances them with "addl $128, %4/%5"; the two empty
 * asm barriers around the loop compensate for that (see the comments
 * at each barrier).
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4,
	      const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 7;	/* 128 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	/*
	 * Force p4 and p5 into registers of their own before the loop.
	 * The asm below modifies the registers holding %4 and %5 even
	 * though they are declared input-only, so the compiler must not
	 * share those registers with any other live value.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	XO4(i, 0) \
	ST(i, 0) \
	XO4(i + 1, 1) \
	ST(i + 1, 1) \
	XO4(i + 2, 2) \
	ST(i + 2, 2) \
	XO4(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance all five pointers one 128-byte line (including the
	   nominally read-only %4 and %5 - see the barriers). */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * The loop above advanced p4 and p5 behind the compiler's back;
	 * clobber them so nothing after this point can rely on the stale
	 * values the compiler still believes they hold.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
0248
0249 #undef LD
0250 #undef XO1
0251 #undef XO2
0252 #undef XO3
0253 #undef XO4
0254 #undef ST
0255 #undef BLOCK
0256
/*
 * p1[i] ^= p2[i] over 'bytes' bytes using MMX, 64 bytes per loop
 * iteration.  The loads, pxors and stores are hand-interleaved -
 * presumably scheduled for the Pentium (P5) pipeline, per the routine
 * name - so do not reorder the instructions.
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	/* Advance both pointers one 64-byte line and loop. */
	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
0304
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes using MMX, 64 bytes per
 * loop iteration.  Hand-interleaved instruction schedule - do not
 * reorder.
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	/* Advance all three pointers one 64-byte line and loop. */
	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	kernel_fpu_end();
}
0362
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes using MMX,
 * 64 bytes per loop iteration.  Hand-interleaved instruction
 * schedule - do not reorder.
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	/* Advance all four pointers one 64-byte line and loop. */
	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
0430
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over 'bytes' bytes using MMX,
 * 64 bytes per loop iteration.  Like xor_pII_mmx_5, p4 and p5 are
 * passed to the main asm as *input* operands ("r") although the loop
 * advances them ("addl $64, %4/%5"); the two empty asm barriers
 * compensate for that (see the comments at each barrier).
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4,
	     const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 6;	/* 64 bytes per "line" */

	/* MMX uses the FPU register file; save/restore it around the asm. */
	kernel_fpu_begin();

	/*
	 * Force p4 and p5 into registers of their own before the loop.
	 * The asm below modifies the registers holding %4 and %5 even
	 * though they are declared input-only, so the compiler must not
	 * share those registers with any other live value.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	/* Advance all five pointers one 64-byte line (including the
	   nominally read-only %4 and %5 - see the barriers). */
	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * The loop above advanced p4 and p5 behind the compiler's back;
	 * clobber them so nothing after this point can rely on the stale
	 * values the compiler still believes they hold.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
0521
/* Dispatch template for the pII-scheduled MMX routines above
   (128 bytes per inner-loop iteration). */
static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};
0529
/* Dispatch template for the p5-scheduled MMX routines above
   (64 bytes per inner-loop iteration). */
static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};
0537
/* Dispatch template wrapping the shared SSE routines (xor_sse_*,
   declared outside this file). */
static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
0545
0546
0547 #include <asm/xor_avx.h>
0548
0549
0550 #include <asm-generic/xor.h>
0551
0552
0553
0554
/*
 * Benchmark-and-select hook used by the generic xor layer: measure the
 * AVX templates first, then the SSE templates when the CPU has XMM,
 * otherwise the MMX templates above, and finally fall back to the
 * portable integer-register implementations from <asm-generic/xor.h>.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
	AVX_XOR_SPEED; \
	if (boot_cpu_has(X86_FEATURE_XMM)) { \
		xor_speed(&xor_block_pIII_sse); \
		xor_speed(&xor_block_sse_pf64); \
	} else if (boot_cpu_has(X86_FEATURE_MMX)) { \
		xor_speed(&xor_block_pII_mmx); \
		xor_speed(&xor_block_p5_mmx); \
	} else { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
	} \
} while (0)
0572
0573 #endif