0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 #ifndef _ASM_X86_XOR_H
0003 #define _ASM_X86_XOR_H
0004 
0005 /*
0006  * Optimized RAID-5 checksumming functions for SSE.
0007  */
0008 
0009 /*
0010  * Cache avoiding checksumming functions utilizing KNI instructions
0011  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
0012  */
0013 
0014 /*
0015  * Based on
0016  * High-speed RAID5 checksumming functions utilizing SSE instructions.
0017  * Copyright (C) 1998 Ingo Molnar.
0018  */
0019 
0020 /*
0021  * x86-64 changes / gcc fixes from Andi Kleen.
0022  * Copyright 2002 Andi Kleen, SuSE Labs.
0023  *
0024  * This hasn't been optimized for the hammer yet, but there are likely
0025  * no advantages to be gotten from x86-64 here anyways.
0026  */
0027 
0028 #include <asm/fpu/api.h>
0029 
0030 #ifdef CONFIG_X86_32
0031 /* reduce register pressure */
0032 # define XOR_CONSTANT_CONSTRAINT "i"
0033 #else
0034 # define XOR_CONSTANT_CONSTRAINT "re"
0035 #endif
0036 
0037 #define OFFS(x)     "16*("#x")"
0038 #define PF_OFFS(x)  "256+16*("#x")"
0039 #define PF0(x)      "   prefetchnta "PF_OFFS(x)"(%[p1])     ;\n"
0040 #define LD(x, y)    "   movaps "OFFS(x)"(%[p1]), %%xmm"#y"  ;\n"
0041 #define ST(x, y)    "   movaps %%xmm"#y", "OFFS(x)"(%[p1])  ;\n"
0042 #define PF1(x)      "   prefetchnta "PF_OFFS(x)"(%[p2])     ;\n"
0043 #define PF2(x)      "   prefetchnta "PF_OFFS(x)"(%[p3])     ;\n"
0044 #define PF3(x)      "   prefetchnta "PF_OFFS(x)"(%[p4])     ;\n"
0045 #define PF4(x)      "   prefetchnta "PF_OFFS(x)"(%[p5])     ;\n"
0046 #define XO1(x, y)   "   xorps "OFFS(x)"(%[p2]), %%xmm"#y"   ;\n"
0047 #define XO2(x, y)   "   xorps "OFFS(x)"(%[p3]), %%xmm"#y"   ;\n"
0048 #define XO3(x, y)   "   xorps "OFFS(x)"(%[p4]), %%xmm"#y"   ;\n"
0049 #define XO4(x, y)   "   xorps "OFFS(x)"(%[p5]), %%xmm"#y"   ;\n"
0050 #define NOP(x)
0051 
0052 #define BLK64(pf, op, i)                \
0053         pf(i)                   \
0054         op(i, 0)                \
0055             op(i + 1, 1)            \
0056                 op(i + 2, 2)        \
0057                     op(i + 3, 3)
0058 
0059 static void
0060 xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
0061       const unsigned long * __restrict p2)
0062 {
0063     unsigned long lines = bytes >> 8;
0064 
0065     kernel_fpu_begin();
0066 
0067     asm volatile(
0068 #undef BLOCK
0069 #define BLOCK(i)                    \
0070         LD(i, 0)                \
0071             LD(i + 1, 1)            \
0072         PF1(i)                  \
0073                 PF1(i + 2)      \
0074                 LD(i + 2, 2)        \
0075                     LD(i + 3, 3)    \
0076         PF0(i + 4)              \
0077                 PF0(i + 6)      \
0078         XO1(i, 0)               \
0079             XO1(i + 1, 1)           \
0080                 XO1(i + 2, 2)       \
0081                     XO1(i + 3, 3)   \
0082         ST(i, 0)                \
0083             ST(i + 1, 1)            \
0084                 ST(i + 2, 2)        \
0085                     ST(i + 3, 3)    \
0086 
0087 
0088         PF0(0)
0089                 PF0(2)
0090 
0091     " .align 32         ;\n"
0092     " 1:                            ;\n"
0093 
0094         BLOCK(0)
0095         BLOCK(4)
0096         BLOCK(8)
0097         BLOCK(12)
0098 
0099     "       add %[inc], %[p1]       ;\n"
0100     "       add %[inc], %[p2]       ;\n"
0101     "       dec %[cnt]              ;\n"
0102     "       jnz 1b                  ;\n"
0103     : [cnt] "+r" (lines),
0104       [p1] "+r" (p1), [p2] "+r" (p2)
0105     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0106     : "memory");
0107 
0108     kernel_fpu_end();
0109 }
0110 
0111 static void
0112 xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
0113            const unsigned long * __restrict p2)
0114 {
0115     unsigned long lines = bytes >> 8;
0116 
0117     kernel_fpu_begin();
0118 
0119     asm volatile(
0120 #undef BLOCK
0121 #define BLOCK(i)            \
0122         BLK64(PF0, LD, i)   \
0123         BLK64(PF1, XO1, i)  \
0124         BLK64(NOP, ST, i)   \
0125 
0126     " .align 32         ;\n"
0127     " 1:                            ;\n"
0128 
0129         BLOCK(0)
0130         BLOCK(4)
0131         BLOCK(8)
0132         BLOCK(12)
0133 
0134     "       add %[inc], %[p1]       ;\n"
0135     "       add %[inc], %[p2]       ;\n"
0136     "       dec %[cnt]              ;\n"
0137     "       jnz 1b                  ;\n"
0138     : [cnt] "+r" (lines),
0139       [p1] "+r" (p1), [p2] "+r" (p2)
0140     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0141     : "memory");
0142 
0143     kernel_fpu_end();
0144 }
0145 
0146 static void
0147 xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
0148       const unsigned long * __restrict p2,
0149       const unsigned long * __restrict p3)
0150 {
0151     unsigned long lines = bytes >> 8;
0152 
0153     kernel_fpu_begin();
0154 
0155     asm volatile(
0156 #undef BLOCK
0157 #define BLOCK(i) \
0158         PF1(i)                  \
0159                 PF1(i + 2)      \
0160         LD(i, 0)                \
0161             LD(i + 1, 1)            \
0162                 LD(i + 2, 2)        \
0163                     LD(i + 3, 3)    \
0164         PF2(i)                  \
0165                 PF2(i + 2)      \
0166         PF0(i + 4)              \
0167                 PF0(i + 6)      \
0168         XO1(i, 0)               \
0169             XO1(i + 1, 1)           \
0170                 XO1(i + 2, 2)       \
0171                     XO1(i + 3, 3)   \
0172         XO2(i, 0)               \
0173             XO2(i + 1, 1)           \
0174                 XO2(i + 2, 2)       \
0175                     XO2(i + 3, 3)   \
0176         ST(i, 0)                \
0177             ST(i + 1, 1)            \
0178                 ST(i + 2, 2)        \
0179                     ST(i + 3, 3)    \
0180 
0181 
0182         PF0(0)
0183                 PF0(2)
0184 
0185     " .align 32         ;\n"
0186     " 1:                            ;\n"
0187 
0188         BLOCK(0)
0189         BLOCK(4)
0190         BLOCK(8)
0191         BLOCK(12)
0192 
0193     "       add %[inc], %[p1]       ;\n"
0194     "       add %[inc], %[p2]       ;\n"
0195     "       add %[inc], %[p3]       ;\n"
0196     "       dec %[cnt]              ;\n"
0197     "       jnz 1b                  ;\n"
0198     : [cnt] "+r" (lines),
0199       [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
0200     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0201     : "memory");
0202 
0203     kernel_fpu_end();
0204 }
0205 
0206 static void
0207 xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
0208            const unsigned long * __restrict p2,
0209            const unsigned long * __restrict p3)
0210 {
0211     unsigned long lines = bytes >> 8;
0212 
0213     kernel_fpu_begin();
0214 
0215     asm volatile(
0216 #undef BLOCK
0217 #define BLOCK(i)            \
0218         BLK64(PF0, LD, i)   \
0219         BLK64(PF1, XO1, i)  \
0220         BLK64(PF2, XO2, i)  \
0221         BLK64(NOP, ST, i)   \
0222 
0223     " .align 32         ;\n"
0224     " 1:                            ;\n"
0225 
0226         BLOCK(0)
0227         BLOCK(4)
0228         BLOCK(8)
0229         BLOCK(12)
0230 
0231     "       add %[inc], %[p1]       ;\n"
0232     "       add %[inc], %[p2]       ;\n"
0233     "       add %[inc], %[p3]       ;\n"
0234     "       dec %[cnt]              ;\n"
0235     "       jnz 1b                  ;\n"
0236     : [cnt] "+r" (lines),
0237       [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
0238     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0239     : "memory");
0240 
0241     kernel_fpu_end();
0242 }
0243 
0244 static void
0245 xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
0246       const unsigned long * __restrict p2,
0247       const unsigned long * __restrict p3,
0248       const unsigned long * __restrict p4)
0249 {
0250     unsigned long lines = bytes >> 8;
0251 
0252     kernel_fpu_begin();
0253 
0254     asm volatile(
0255 #undef BLOCK
0256 #define BLOCK(i) \
0257         PF1(i)                  \
0258                 PF1(i + 2)      \
0259         LD(i, 0)                \
0260             LD(i + 1, 1)            \
0261                 LD(i + 2, 2)        \
0262                     LD(i + 3, 3)    \
0263         PF2(i)                  \
0264                 PF2(i + 2)      \
0265         XO1(i, 0)               \
0266             XO1(i + 1, 1)           \
0267                 XO1(i + 2, 2)       \
0268                     XO1(i + 3, 3)   \
0269         PF3(i)                  \
0270                 PF3(i + 2)      \
0271         PF0(i + 4)              \
0272                 PF0(i + 6)      \
0273         XO2(i, 0)               \
0274             XO2(i + 1, 1)           \
0275                 XO2(i + 2, 2)       \
0276                     XO2(i + 3, 3)   \
0277         XO3(i, 0)               \
0278             XO3(i + 1, 1)           \
0279                 XO3(i + 2, 2)       \
0280                     XO3(i + 3, 3)   \
0281         ST(i, 0)                \
0282             ST(i + 1, 1)            \
0283                 ST(i + 2, 2)        \
0284                     ST(i + 3, 3)    \
0285 
0286 
0287         PF0(0)
0288                 PF0(2)
0289 
0290     " .align 32         ;\n"
0291     " 1:                            ;\n"
0292 
0293         BLOCK(0)
0294         BLOCK(4)
0295         BLOCK(8)
0296         BLOCK(12)
0297 
0298     "       add %[inc], %[p1]       ;\n"
0299     "       add %[inc], %[p2]       ;\n"
0300     "       add %[inc], %[p3]       ;\n"
0301     "       add %[inc], %[p4]       ;\n"
0302     "       dec %[cnt]              ;\n"
0303     "       jnz 1b                  ;\n"
0304     : [cnt] "+r" (lines), [p1] "+r" (p1),
0305       [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
0306     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0307     : "memory");
0308 
0309     kernel_fpu_end();
0310 }
0311 
0312 static void
0313 xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
0314            const unsigned long * __restrict p2,
0315            const unsigned long * __restrict p3,
0316            const unsigned long * __restrict p4)
0317 {
0318     unsigned long lines = bytes >> 8;
0319 
0320     kernel_fpu_begin();
0321 
0322     asm volatile(
0323 #undef BLOCK
0324 #define BLOCK(i)            \
0325         BLK64(PF0, LD, i)   \
0326         BLK64(PF1, XO1, i)  \
0327         BLK64(PF2, XO2, i)  \
0328         BLK64(PF3, XO3, i)  \
0329         BLK64(NOP, ST, i)   \
0330 
0331     " .align 32         ;\n"
0332     " 1:                            ;\n"
0333 
0334         BLOCK(0)
0335         BLOCK(4)
0336         BLOCK(8)
0337         BLOCK(12)
0338 
0339     "       add %[inc], %[p1]       ;\n"
0340     "       add %[inc], %[p2]       ;\n"
0341     "       add %[inc], %[p3]       ;\n"
0342     "       add %[inc], %[p4]       ;\n"
0343     "       dec %[cnt]              ;\n"
0344     "       jnz 1b                  ;\n"
0345     : [cnt] "+r" (lines), [p1] "+r" (p1),
0346       [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
0347     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0348     : "memory");
0349 
0350     kernel_fpu_end();
0351 }
0352 
0353 static void
0354 xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
0355       const unsigned long * __restrict p2,
0356       const unsigned long * __restrict p3,
0357       const unsigned long * __restrict p4,
0358       const unsigned long * __restrict p5)
0359 {
0360     unsigned long lines = bytes >> 8;
0361 
0362     kernel_fpu_begin();
0363 
0364     asm volatile(
0365 #undef BLOCK
0366 #define BLOCK(i) \
0367         PF1(i)                  \
0368                 PF1(i + 2)      \
0369         LD(i, 0)                \
0370             LD(i + 1, 1)            \
0371                 LD(i + 2, 2)        \
0372                     LD(i + 3, 3)    \
0373         PF2(i)                  \
0374                 PF2(i + 2)      \
0375         XO1(i, 0)               \
0376             XO1(i + 1, 1)           \
0377                 XO1(i + 2, 2)       \
0378                     XO1(i + 3, 3)   \
0379         PF3(i)                  \
0380                 PF3(i + 2)      \
0381         XO2(i, 0)               \
0382             XO2(i + 1, 1)           \
0383                 XO2(i + 2, 2)       \
0384                     XO2(i + 3, 3)   \
0385         PF4(i)                  \
0386                 PF4(i + 2)      \
0387         PF0(i + 4)              \
0388                 PF0(i + 6)      \
0389         XO3(i, 0)               \
0390             XO3(i + 1, 1)           \
0391                 XO3(i + 2, 2)       \
0392                     XO3(i + 3, 3)   \
0393         XO4(i, 0)               \
0394             XO4(i + 1, 1)           \
0395                 XO4(i + 2, 2)       \
0396                     XO4(i + 3, 3)   \
0397         ST(i, 0)                \
0398             ST(i + 1, 1)            \
0399                 ST(i + 2, 2)        \
0400                     ST(i + 3, 3)    \
0401 
0402 
0403         PF0(0)
0404                 PF0(2)
0405 
0406     " .align 32         ;\n"
0407     " 1:                            ;\n"
0408 
0409         BLOCK(0)
0410         BLOCK(4)
0411         BLOCK(8)
0412         BLOCK(12)
0413 
0414     "       add %[inc], %[p1]       ;\n"
0415     "       add %[inc], %[p2]       ;\n"
0416     "       add %[inc], %[p3]       ;\n"
0417     "       add %[inc], %[p4]       ;\n"
0418     "       add %[inc], %[p5]       ;\n"
0419     "       dec %[cnt]              ;\n"
0420     "       jnz 1b                  ;\n"
0421     : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
0422       [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
0423     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0424     : "memory");
0425 
0426     kernel_fpu_end();
0427 }
0428 
0429 static void
0430 xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
0431            const unsigned long * __restrict p2,
0432            const unsigned long * __restrict p3,
0433            const unsigned long * __restrict p4,
0434            const unsigned long * __restrict p5)
0435 {
0436     unsigned long lines = bytes >> 8;
0437 
0438     kernel_fpu_begin();
0439 
0440     asm volatile(
0441 #undef BLOCK
0442 #define BLOCK(i)            \
0443         BLK64(PF0, LD, i)   \
0444         BLK64(PF1, XO1, i)  \
0445         BLK64(PF2, XO2, i)  \
0446         BLK64(PF3, XO3, i)  \
0447         BLK64(PF4, XO4, i)  \
0448         BLK64(NOP, ST, i)   \
0449 
0450     " .align 32         ;\n"
0451     " 1:                            ;\n"
0452 
0453         BLOCK(0)
0454         BLOCK(4)
0455         BLOCK(8)
0456         BLOCK(12)
0457 
0458     "       add %[inc], %[p1]       ;\n"
0459     "       add %[inc], %[p2]       ;\n"
0460     "       add %[inc], %[p3]       ;\n"
0461     "       add %[inc], %[p4]       ;\n"
0462     "       add %[inc], %[p5]       ;\n"
0463     "       dec %[cnt]              ;\n"
0464     "       jnz 1b                  ;\n"
0465     : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
0466       [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
0467     : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
0468     : "memory");
0469 
0470     kernel_fpu_end();
0471 }
0472 
0473 static struct xor_block_template xor_block_sse_pf64 = {
0474     .name = "prefetch64-sse",
0475     .do_2 = xor_sse_2_pf64,
0476     .do_3 = xor_sse_3_pf64,
0477     .do_4 = xor_sse_4_pf64,
0478     .do_5 = xor_sse_5_pf64,
0479 };
0480 
0481 #undef LD
0482 #undef XO1
0483 #undef XO2
0484 #undef XO3
0485 #undef XO4
0486 #undef ST
0487 #undef NOP
0488 #undef BLK64
0489 #undef BLOCK
0490 
0491 #undef XOR_CONSTANT_CONSTRAINT
0492 
0493 #ifdef CONFIG_X86_32
0494 # include <asm/xor_32.h>
0495 #else
0496 # include <asm/xor_64.h>
0497 #endif
0498 
0499 #define XOR_SELECT_TEMPLATE(FASTEST) \
0500     AVX_SELECT(FASTEST)
0501 
0502 #endif /* _ASM_X86_XOR_H */