/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

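/*
 * For reference: every xor_*_N routine in this file computes the same
 * thing as a plain C loop, XORing N - 1 source buffers into the
 * destination.  A minimal sketch for the two-buffer case (illustrative
 * only, not part of the build):
 *
 *	static void xor_ref_2(unsigned long bytes,
 *			      unsigned long *p1, const unsigned long *p2)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < bytes / sizeof(unsigned long); i++)
 *			p1[i] ^= p2[i];
 *	}
 *
 * The MMX versions below do exactly this, 64 bits at a time, with
 * unrolled and hand-scheduled loads, XORs and stores.
 */
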
#define LD(x, y)    "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)    "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)   "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)   "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)   "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)   "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
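
/*
 * Note that the "#x" stringizing above pastes the block index straight
 * into the instruction text, so e.g. LD(2, 2) expands to
 *
 *	"       movq   8*(2)(%1), %%mm2   ;\n"
 *
 * i.e. load the quadword at byte offset 16 of the destination buffer
 * (asm operand %1) into %mm2.  ST stores back to the destination, and
 * XO1..XO4 XOR in the corresponding quadword of source buffers %2..%5.
 */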

#include <asm/fpu/api.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2)
{
    unsigned long lines = bytes >> 7;   /* 128 bytes per loop iteration */

    kernel_fpu_begin();

    asm volatile(
#undef BLOCK
#define BLOCK(i)                \
    LD(i, 0)                \
        LD(i + 1, 1)            \
            LD(i + 2, 2)        \
                LD(i + 3, 3)    \
    XO1(i, 0)               \
    ST(i, 0)                \
        XO1(i + 1, 1)           \
        ST(i + 1, 1)            \
            XO1(i + 2, 2)       \
            ST(i + 2, 2)        \
                XO1(i + 3, 3)   \
                ST(i + 3, 3)

    " .align 32         ;\n"
    " 1:                            ;\n"

    BLOCK(0)
    BLOCK(4)
    BLOCK(8)
    BLOCK(12)

    "       addl $128, %1         ;\n"
    "       addl $128, %2         ;\n"
    "       decl %0               ;\n"
    "       jnz 1b                ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2)
    :
    : "memory");

    kernel_fpu_end();
}
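
/*
 * Each iteration above runs 4 BLOCKs x 4 quadwords x 8 bytes = 128
 * bytes, matching the "addl $128" pointer advances and the bytes >> 7
 * line count.  The _3, _4 and _5 variants below follow the same shape,
 * adding one extra XOn stage per additional source buffer.
 */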

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3)
{
    unsigned long lines = bytes >> 7;

    kernel_fpu_begin();

    asm volatile(
#undef BLOCK
#define BLOCK(i)                \
    LD(i, 0)                \
        LD(i + 1, 1)            \
            LD(i + 2, 2)        \
                LD(i + 3, 3)    \
    XO1(i, 0)               \
        XO1(i + 1, 1)           \
            XO1(i + 2, 2)       \
                XO1(i + 3, 3)   \
    XO2(i, 0)               \
    ST(i, 0)                \
        XO2(i + 1, 1)           \
        ST(i + 1, 1)            \
            XO2(i + 2, 2)       \
            ST(i + 2, 2)        \
                XO2(i + 3, 3)   \
                ST(i + 3, 3)

    " .align 32         ;\n"
    " 1:                            ;\n"

    BLOCK(0)
    BLOCK(4)
    BLOCK(8)
    BLOCK(12)

    "       addl $128, %1         ;\n"
    "       addl $128, %2         ;\n"
    "       addl $128, %3         ;\n"
    "       decl %0               ;\n"
    "       jnz 1b                ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3)
    :
    : "memory");

    kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3,
          const unsigned long * __restrict p4)
{
    unsigned long lines = bytes >> 7;

    kernel_fpu_begin();

    asm volatile(
#undef BLOCK
#define BLOCK(i)                \
    LD(i, 0)                \
        LD(i + 1, 1)            \
            LD(i + 2, 2)        \
                LD(i + 3, 3)    \
    XO1(i, 0)               \
        XO1(i + 1, 1)           \
            XO1(i + 2, 2)       \
                XO1(i + 3, 3)   \
    XO2(i, 0)               \
        XO2(i + 1, 1)           \
            XO2(i + 2, 2)       \
                XO2(i + 3, 3)   \
    XO3(i, 0)               \
    ST(i, 0)                \
        XO3(i + 1, 1)           \
        ST(i + 1, 1)            \
            XO3(i + 2, 2)       \
            ST(i + 2, 2)        \
                XO3(i + 3, 3)   \
                ST(i + 3, 3)

    " .align 32         ;\n"
    " 1:                            ;\n"

    BLOCK(0)
    BLOCK(4)
    BLOCK(8)
    BLOCK(12)

    "       addl $128, %1         ;\n"
    "       addl $128, %2         ;\n"
    "       addl $128, %3         ;\n"
    "       addl $128, %4         ;\n"
    "       decl %0               ;\n"
    "       jnz 1b                ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
    :
    : "memory");

    kernel_fpu_end();
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3,
          const unsigned long * __restrict p4,
          const unsigned long * __restrict p5)
{
    unsigned long lines = bytes >> 7;

    kernel_fpu_begin();

    /* Make sure GCC forgets anything it knows about p4 or p5,
       such that it won't pass to the asm volatile below a
       register that is shared with any other variable.  That's
       because we modify p4 and p5 there, but we can't mark them
       as read/write, otherwise we'd overflow the 10-asm-operands
       limit of GCC < 3.1.  */
    asm("" : "+r" (p4), "+r" (p5));

    asm volatile(
#undef BLOCK
#define BLOCK(i)                \
    LD(i, 0)                \
        LD(i + 1, 1)            \
            LD(i + 2, 2)        \
                LD(i + 3, 3)    \
    XO1(i, 0)               \
        XO1(i + 1, 1)           \
            XO1(i + 2, 2)       \
                XO1(i + 3, 3)   \
    XO2(i, 0)               \
        XO2(i + 1, 1)           \
            XO2(i + 2, 2)       \
                XO2(i + 3, 3)   \
    XO3(i, 0)               \
        XO3(i + 1, 1)           \
            XO3(i + 2, 2)       \
                XO3(i + 3, 3)   \
    XO4(i, 0)               \
    ST(i, 0)                \
        XO4(i + 1, 1)           \
        ST(i + 1, 1)            \
            XO4(i + 2, 2)       \
            ST(i + 2, 2)        \
                XO4(i + 3, 3)   \
                ST(i + 3, 3)

    " .align 32         ;\n"
    " 1:                            ;\n"

    BLOCK(0)
    BLOCK(4)
    BLOCK(8)
    BLOCK(12)

    "       addl $128, %1         ;\n"
    "       addl $128, %2         ;\n"
    "       addl $128, %3         ;\n"
    "       addl $128, %4         ;\n"
    "       addl $128, %5         ;\n"
    "       decl %0               ;\n"
    "       jnz 1b                ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3)
    : "r" (p4), "r" (p5)
    : "memory");

    /* p4 and p5 were modified, and now the variables are dead.
       Clobber them just to be sure nobody does something stupid
       like assuming they have some legal value.  */
    asm("" : "=r" (p4), "=r" (p5));

    kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

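/*
 * The p5_mmx_* variants below are scheduled for the original Pentium
 * (P5) pipeline: instead of the blocked load/XOR/store pattern above,
 * loads, pxors and stores are hand-interleaved so adjacent instructions
 * can pair.  Each iteration covers 64 bytes (eight MMX registers),
 * hence bytes >> 6 rather than bytes >> 7.
 */
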
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
         const unsigned long * __restrict p2)
{
    unsigned long lines = bytes >> 6;

    kernel_fpu_begin();

    asm volatile(
    " .align 32              ;\n"
    " 1:                         ;\n"
    "       movq   (%1), %%mm0   ;\n"
    "       movq  8(%1), %%mm1   ;\n"
    "       pxor   (%2), %%mm0   ;\n"
    "       movq 16(%1), %%mm2   ;\n"
    "       movq %%mm0,   (%1)   ;\n"
    "       pxor  8(%2), %%mm1   ;\n"
    "       movq 24(%1), %%mm3   ;\n"
    "       movq %%mm1,  8(%1)   ;\n"
    "       pxor 16(%2), %%mm2   ;\n"
    "       movq 32(%1), %%mm4   ;\n"
    "       movq %%mm2, 16(%1)   ;\n"
    "       pxor 24(%2), %%mm3   ;\n"
    "       movq 40(%1), %%mm5   ;\n"
    "       movq %%mm3, 24(%1)   ;\n"
    "       pxor 32(%2), %%mm4   ;\n"
    "       movq 48(%1), %%mm6   ;\n"
    "       movq %%mm4, 32(%1)   ;\n"
    "       pxor 40(%2), %%mm5   ;\n"
    "       movq 56(%1), %%mm7   ;\n"
    "       movq %%mm5, 40(%1)   ;\n"
    "       pxor 48(%2), %%mm6   ;\n"
    "       pxor 56(%2), %%mm7   ;\n"
    "       movq %%mm6, 48(%1)   ;\n"
    "       movq %%mm7, 56(%1)   ;\n"

    "       addl $64, %1         ;\n"
    "       addl $64, %2         ;\n"
    "       decl %0              ;\n"
    "       jnz 1b               ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2)
    :
    : "memory");

    kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
         const unsigned long * __restrict p2,
         const unsigned long * __restrict p3)
{
    unsigned long lines = bytes >> 6;

    kernel_fpu_begin();

    asm volatile(
    " .align 32,0x90             ;\n"
    " 1:                         ;\n"
    "       movq   (%1), %%mm0   ;\n"
    "       movq  8(%1), %%mm1   ;\n"
    "       pxor   (%2), %%mm0   ;\n"
    "       movq 16(%1), %%mm2   ;\n"
    "       pxor  8(%2), %%mm1   ;\n"
    "       pxor   (%3), %%mm0   ;\n"
    "       pxor 16(%2), %%mm2   ;\n"
    "       movq %%mm0,   (%1)   ;\n"
    "       pxor  8(%3), %%mm1   ;\n"
    "       pxor 16(%3), %%mm2   ;\n"
    "       movq 24(%1), %%mm3   ;\n"
    "       movq %%mm1,  8(%1)   ;\n"
    "       movq 32(%1), %%mm4   ;\n"
    "       movq 40(%1), %%mm5   ;\n"
    "       pxor 24(%2), %%mm3   ;\n"
    "       movq %%mm2, 16(%1)   ;\n"
    "       pxor 32(%2), %%mm4   ;\n"
    "       pxor 24(%3), %%mm3   ;\n"
    "       pxor 40(%2), %%mm5   ;\n"
    "       movq %%mm3, 24(%1)   ;\n"
    "       pxor 32(%3), %%mm4   ;\n"
    "       pxor 40(%3), %%mm5   ;\n"
    "       movq 48(%1), %%mm6   ;\n"
    "       movq %%mm4, 32(%1)   ;\n"
    "       movq 56(%1), %%mm7   ;\n"
    "       pxor 48(%2), %%mm6   ;\n"
    "       movq %%mm5, 40(%1)   ;\n"
    "       pxor 56(%2), %%mm7   ;\n"
    "       pxor 48(%3), %%mm6   ;\n"
    "       pxor 56(%3), %%mm7   ;\n"
    "       movq %%mm6, 48(%1)   ;\n"
    "       movq %%mm7, 56(%1)   ;\n"

    "       addl $64, %1         ;\n"
    "       addl $64, %2         ;\n"
    "       addl $64, %3         ;\n"
    "       decl %0              ;\n"
    "       jnz 1b               ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3)
    :
    : "memory");

    kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
         const unsigned long * __restrict p2,
         const unsigned long * __restrict p3,
         const unsigned long * __restrict p4)
{
    unsigned long lines = bytes >> 6;

    kernel_fpu_begin();

    asm volatile(
    " .align 32,0x90             ;\n"
    " 1:                         ;\n"
    "       movq   (%1), %%mm0   ;\n"
    "       movq  8(%1), %%mm1   ;\n"
    "       pxor   (%2), %%mm0   ;\n"
    "       movq 16(%1), %%mm2   ;\n"
    "       pxor  8(%2), %%mm1   ;\n"
    "       pxor   (%3), %%mm0   ;\n"
    "       pxor 16(%2), %%mm2   ;\n"
    "       pxor  8(%3), %%mm1   ;\n"
    "       pxor   (%4), %%mm0   ;\n"
    "       movq 24(%1), %%mm3   ;\n"
    "       pxor 16(%3), %%mm2   ;\n"
    "       pxor  8(%4), %%mm1   ;\n"
    "       movq %%mm0,   (%1)   ;\n"
    "       movq 32(%1), %%mm4   ;\n"
    "       pxor 24(%2), %%mm3   ;\n"
    "       pxor 16(%4), %%mm2   ;\n"
    "       movq %%mm1,  8(%1)   ;\n"
    "       movq 40(%1), %%mm5   ;\n"
    "       pxor 32(%2), %%mm4   ;\n"
    "       pxor 24(%3), %%mm3   ;\n"
    "       movq %%mm2, 16(%1)   ;\n"
    "       pxor 40(%2), %%mm5   ;\n"
    "       pxor 32(%3), %%mm4   ;\n"
    "       pxor 24(%4), %%mm3   ;\n"
    "       movq %%mm3, 24(%1)   ;\n"
    "       movq 56(%1), %%mm7   ;\n"
    "       movq 48(%1), %%mm6   ;\n"
    "       pxor 40(%3), %%mm5   ;\n"
    "       pxor 32(%4), %%mm4   ;\n"
    "       pxor 48(%2), %%mm6   ;\n"
    "       movq %%mm4, 32(%1)   ;\n"
    "       pxor 56(%2), %%mm7   ;\n"
    "       pxor 40(%4), %%mm5   ;\n"
    "       pxor 48(%3), %%mm6   ;\n"
    "       pxor 56(%3), %%mm7   ;\n"
    "       movq %%mm5, 40(%1)   ;\n"
    "       pxor 48(%4), %%mm6   ;\n"
    "       pxor 56(%4), %%mm7   ;\n"
    "       movq %%mm6, 48(%1)   ;\n"
    "       movq %%mm7, 56(%1)   ;\n"

    "       addl $64, %1         ;\n"
    "       addl $64, %2         ;\n"
    "       addl $64, %3         ;\n"
    "       addl $64, %4         ;\n"
    "       decl %0              ;\n"
    "       jnz 1b               ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
    :
    : "memory");

    kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
         const unsigned long * __restrict p2,
         const unsigned long * __restrict p3,
         const unsigned long * __restrict p4,
         const unsigned long * __restrict p5)
{
    unsigned long lines = bytes >> 6;

    kernel_fpu_begin();

    /* Make sure GCC forgets anything it knows about p4 or p5,
       such that it won't pass to the asm volatile below a
       register that is shared with any other variable.  That's
       because we modify p4 and p5 there, but we can't mark them
       as read/write, otherwise we'd overflow the 10-asm-operands
       limit of GCC < 3.1.  */
    asm("" : "+r" (p4), "+r" (p5));

    asm volatile(
    " .align 32,0x90             ;\n"
    " 1:                         ;\n"
    "       movq   (%1), %%mm0   ;\n"
    "       movq  8(%1), %%mm1   ;\n"
    "       pxor   (%2), %%mm0   ;\n"
    "       pxor  8(%2), %%mm1   ;\n"
    "       movq 16(%1), %%mm2   ;\n"
    "       pxor   (%3), %%mm0   ;\n"
    "       pxor  8(%3), %%mm1   ;\n"
    "       pxor 16(%2), %%mm2   ;\n"
    "       pxor   (%4), %%mm0   ;\n"
    "       pxor  8(%4), %%mm1   ;\n"
    "       pxor 16(%3), %%mm2   ;\n"
    "       movq 24(%1), %%mm3   ;\n"
    "       pxor   (%5), %%mm0   ;\n"
    "       pxor  8(%5), %%mm1   ;\n"
    "       movq %%mm0,   (%1)   ;\n"
    "       pxor 16(%4), %%mm2   ;\n"
    "       pxor 24(%2), %%mm3   ;\n"
    "       movq %%mm1,  8(%1)   ;\n"
    "       pxor 16(%5), %%mm2   ;\n"
    "       pxor 24(%3), %%mm3   ;\n"
    "       movq 32(%1), %%mm4   ;\n"
    "       movq %%mm2, 16(%1)   ;\n"
    "       pxor 24(%4), %%mm3   ;\n"
    "       pxor 32(%2), %%mm4   ;\n"
    "       movq 40(%1), %%mm5   ;\n"
    "       pxor 24(%5), %%mm3   ;\n"
    "       pxor 32(%3), %%mm4   ;\n"
    "       pxor 40(%2), %%mm5   ;\n"
    "       movq %%mm3, 24(%1)   ;\n"
    "       pxor 32(%4), %%mm4   ;\n"
    "       pxor 40(%3), %%mm5   ;\n"
    "       movq 48(%1), %%mm6   ;\n"
    "       movq 56(%1), %%mm7   ;\n"
    "       pxor 32(%5), %%mm4   ;\n"
    "       pxor 40(%4), %%mm5   ;\n"
    "       pxor 48(%2), %%mm6   ;\n"
    "       pxor 56(%2), %%mm7   ;\n"
    "       movq %%mm4, 32(%1)   ;\n"
    "       pxor 48(%3), %%mm6   ;\n"
    "       pxor 56(%3), %%mm7   ;\n"
    "       pxor 40(%5), %%mm5   ;\n"
    "       pxor 48(%4), %%mm6   ;\n"
    "       pxor 56(%4), %%mm7   ;\n"
    "       movq %%mm5, 40(%1)   ;\n"
    "       pxor 48(%5), %%mm6   ;\n"
    "       pxor 56(%5), %%mm7   ;\n"
    "       movq %%mm6, 48(%1)   ;\n"
    "       movq %%mm7, 56(%1)   ;\n"

    "       addl $64, %1         ;\n"
    "       addl $64, %2         ;\n"
    "       addl $64, %3         ;\n"
    "       addl $64, %4         ;\n"
    "       addl $64, %5         ;\n"
    "       decl %0              ;\n"
    "       jnz 1b               ;\n"
    : "+r" (lines),
      "+r" (p1), "+r" (p2), "+r" (p3)
    : "r" (p4), "r" (p5)
    : "memory");

    /* p4 and p5 were modified, and now the variables are dead.
       Clobber them just to be sure nobody does something stupid
       like assuming they have some legal value.  */
    asm("" : "=r" (p4), "=r" (p5));

    kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
    .name = "pII_mmx",
    .do_2 = xor_pII_mmx_2,
    .do_3 = xor_pII_mmx_3,
    .do_4 = xor_pII_mmx_4,
    .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
    .name = "p5_mmx",
    .do_2 = xor_p5_mmx_2,
    .do_3 = xor_p5_mmx_3,
    .do_4 = xor_p5_mmx_4,
    .do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
    .name = "pIII_sse",
    .do_2 = xor_sse_2,
    .do_3 = xor_sse_3,
    .do_4 = xor_sse_4,
    .do_5 = xor_sse_5,
};
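
/*
 * Each template above is just a table of function pointers keyed by the
 * number of buffers.  The generic xor code selects one template and then
 * dispatches on the source count, roughly (a sketch of xor_blocks() in
 * crypto/xor.c, not the literal code):
 *
 *	if (src_count == 1)
 *		active_template->do_2(bytes, dest, srcs[0]);
 *	else if (src_count == 2)
 *		active_template->do_3(bytes, dest, srcs[0], srcs[1]);
 *	...
 */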

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES               \
do {                            \
    AVX_XOR_SPEED;                  \
    if (boot_cpu_has(X86_FEATURE_XMM)) {        \
        xor_speed(&xor_block_pIII_sse);     \
        xor_speed(&xor_block_sse_pf64);     \
    } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
        xor_speed(&xor_block_pII_mmx);      \
        xor_speed(&xor_block_p5_mmx);       \
    } else {                    \
        xor_speed(&xor_block_8regs);        \
        xor_speed(&xor_block_8regs_p);      \
        xor_speed(&xor_block_32regs);       \
        xor_speed(&xor_block_32regs_p);     \
    }                       \
} while (0)
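
/*
 * XOR_TRY_TEMPLATES itself is expanded by the generic calibration code
 * (calibrate_xor_blocks() in crypto/xor.c): each xor_speed() call
 * benchmarks one candidate and records its throughput, and the fastest
 * template becomes the active one.  The ordering above therefore only
 * decides which implementations get measured, not which one wins.
 */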

#endif /* _ASM_X86_XOR_32_H */