/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
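
/*
 * Structure of the routine below: data is consumed 64 bytes at a time into
 * four 128-bit accumulators that are folded forward with PCLMULQDQ
 * (carry-less multiplication), the four accumulators are then folded into a
 * single 128-bit value, and that value is reduced to the final 32-bit CRC
 * with a bit-reflected Barrett reduction.  The folding constants below follow
 * the scheme of Intel's "Fast CRC Computation for Generic Polynomials Using
 * PCLMULQDQ Instruction" white paper.
 */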

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
    .octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
    .octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
    .octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
    .octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
    .octa 0x00000001F701164100000001DB710641
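/*
 * Notation note: the trailing ' in the comments above marks a bit-reflected
 * value, and the "<< 1" pre-shift compensates for the carry-less product of
 * two reflected operands coming out shifted down by one bit, so the constants
 * can be used directly with PCLMULQDQ in the little-endian (reflected) domain.
 */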

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif
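/*
 * On x86_64 these are the first three SysV ABI argument registers
 * (%rdi, %rsi, %edx).  The 32-bit mapping assumes the kernel's regparm(3)
 * convention, which passes the first three arguments in %eax, %edx and %ecx.
 */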



.text
/**
 *      Calculate crc32
 *      BUF - buffer (16-byte aligned)
 *      LEN - size of buffer (a multiple of 16 bytes), LEN should be greater than 63
 *      CRC - initial crc32
 *      return %eax crc32
 *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                       size_t len, uint crc32)
 */
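/*
 * Illustrative only: a minimal sketch of how a C caller might drive this
 * routine.  crc32_le() (the generic CRC32 implementation) and
 * kernel_fpu_begin()/kernel_fpu_end() are existing kernel interfaces; the
 * wrapper name and structure below are assumptions for illustration, not the
 * actual glue code that ships with this file (which also checks whether SIMD
 * may be used in the current context).
 *
 *  static u32 crc32_pclmul(u32 crc, const u8 *p, size_t len)
 *  {
 *      size_t chunk = len & ~(size_t)15;       // whole 16-byte blocks
 *
 *      // PCLMUL path needs a 16-byte aligned pointer and more than 63 bytes
 *      if (chunk >= 64 && !((unsigned long)p & 15)) {
 *          kernel_fpu_begin();                 // we will clobber XMM state
 *          crc = crc32_pclmul_le_16(p, chunk, crc);
 *          kernel_fpu_end();
 *          p += chunk;
 *          len -= chunk;
 *      }
 *      return crc32_le(crc, p, len);           // leftover tail (or all of it)
 *  }
 */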

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16-byte aligned */
    movdqa  (BUF), %xmm1
    movdqa  0x10(BUF), %xmm2
    movdqa  0x20(BUF), %xmm3
    movdqa  0x30(BUF), %xmm4
    movd    CRC, CONSTANT
    pxor    CONSTANT, %xmm1
    sub     $0x40, LEN
    add     $0x40, BUF
    cmp     $0x40, LEN
    jb      less_64
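
    /*
     * At this point xmm1-xmm4 hold the first 64 bytes of the buffer, with
     * the initial CRC XORed into the low 32 bits of xmm1 (movd zero-extends
     * CRC into CONSTANT, so the pxor only affects bits 31:0).  BUF and LEN
     * have been advanced past this first block; if at least another 64 bytes
     * remain we fall through into the main folding loop.
     */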

#ifdef __x86_64__
    movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
    movdqa .Lconstant_R2R1, CONSTANT
#endif

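    /*
     * Each loop_64 iteration folds every accumulator forward by 512 bits
     * (one 64-byte block): with CONSTANT = R2:R1, "pclmulqdq $0x00" computes
     * acc.low * R1 and "pclmulqdq $0x11" computes acc.high * R2; the two
     * products are XORed together and then XORed with the next 64 bytes of
     * input data.
     */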
loop_64:/* 64-byte (full cache line) folding */
    prefetchnta    0x40(BUF)
    movdqa  %xmm1, %xmm5
    movdqa  %xmm2, %xmm6
    movdqa  %xmm3, %xmm7
#ifdef __x86_64__
    movdqa  %xmm4, %xmm8
#endif
    pclmulqdq $0x00, CONSTANT, %xmm1
    pclmulqdq $0x00, CONSTANT, %xmm2
    pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
    pclmulqdq $0x00, CONSTANT, %xmm4
#endif
    pclmulqdq $0x11, CONSTANT, %xmm5
    pclmulqdq $0x11, CONSTANT, %xmm6
    pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
    pclmulqdq $0x11, CONSTANT, %xmm8
#endif
    pxor    %xmm5, %xmm1
    pxor    %xmm6, %xmm2
    pxor    %xmm7, %xmm3
#ifdef __x86_64__
    pxor    %xmm8, %xmm4
#else
    /* xmm8 is not available on 32-bit x86, so reuse xmm5 for xmm4's fold */
    movdqa  %xmm4, %xmm5
    pclmulqdq $0x00, CONSTANT, %xmm4
    pclmulqdq $0x11, CONSTANT, %xmm5
    pxor    %xmm5, %xmm4
#endif

    pxor    (BUF), %xmm1
    pxor    0x10(BUF), %xmm2
    pxor    0x20(BUF), %xmm3
    pxor    0x30(BUF), %xmm4

    sub     $0x40, LEN
    add     $0x40, BUF
    cmp     $0x40, LEN
    jge     loop_64
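
    /*
     * Combine the four accumulators into a single 128-bit value.  With
     * CONSTANT = R4:R3, each step folds xmm1 forward by 128 bits
     * (xmm1.low * R3 XOR xmm1.high * R4) and XORs in the next accumulator;
     * loop_16 then applies the same fold to any remaining 16-byte blocks.
     */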
less_64:/* Fold the 64-byte cache line into 128 bits */
#ifdef __x86_64__
    movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
    movdqa  .Lconstant_R4R3, CONSTANT
#endif
    prefetchnta     (BUF)

    movdqa  %xmm1, %xmm5
    pclmulqdq $0x00, CONSTANT, %xmm1
    pclmulqdq $0x11, CONSTANT, %xmm5
    pxor    %xmm5, %xmm1
    pxor    %xmm2, %xmm1

    movdqa  %xmm1, %xmm5
    pclmulqdq $0x00, CONSTANT, %xmm1
    pclmulqdq $0x11, CONSTANT, %xmm5
    pxor    %xmm5, %xmm1
    pxor    %xmm3, %xmm1

    movdqa  %xmm1, %xmm5
    pclmulqdq $0x00, CONSTANT, %xmm1
    pclmulqdq $0x11, CONSTANT, %xmm5
    pxor    %xmm5, %xmm1
    pxor    %xmm4, %xmm1

    cmp     $0x10, LEN
    jb      fold_64
loop_16:/* Fold the rest of the buffer into 128 bits */
    movdqa  %xmm1, %xmm5
    pclmulqdq $0x00, CONSTANT, %xmm1
    pclmulqdq $0x11, CONSTANT, %xmm5
    pxor    %xmm5, %xmm1
    pxor    (BUF), %xmm1
    sub     $0x10, LEN
    add     $0x10, BUF
    cmp     $0x10, LEN
    jge     loop_16

fold_64:
    /*
     * Perform the last 64-bit fold; this also appends 32 zero bits to the
     * input stream.
     */
    pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
    psrldq  $0x08, %xmm1
    pxor    CONSTANT, %xmm1

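    /*
     * The "final 32-bit fold" below multiplies the low 32 bits of xmm1 by
     * R5 = x^64 mod P(x) (reflected) and XORs the product with the original
     * value shifted right by 32 bits (saved in xmm2), leaving a 64-bit
     * remainder ready for the Barrett reduction.
     */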
    /* final 32-bit fold */
    movdqa  %xmm1, %xmm2
#ifdef __x86_64__
    movdqa  .Lconstant_R5(%rip), CONSTANT
    movdqa  .Lconstant_mask32(%rip), %xmm3
#else
    movdqa  .Lconstant_R5, CONSTANT
    movdqa  .Lconstant_mask32, %xmm3
#endif
    psrldq  $0x04, %xmm2
    pand    %xmm3, %xmm1
    pclmulqdq $0x00, CONSTANT, %xmm1
    pxor    %xmm2, %xmm1

    /* Finish up with the bit-reversed Barrett reduction 64 ==> 32 bits */
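    /*
     * With CONSTANT = RU:P', the reduction below computes
     *   T1  = (R & 0xffffffff) * RU    ("pclmulqdq $0x10": xmm1.low * CONSTANT.high)
     *   T2  = (T1 & 0xffffffff) * P'   ("pclmulqdq $0x00": xmm1.low * CONSTANT.low)
     *   CRC = bits 63:32 of (R ^ T2)   (extracted with "pextrd $0x01")
     * where R is the 64-bit remainder saved in xmm2.
     */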
#ifdef __x86_64__
    movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
    movdqa  .Lconstant_RUpoly, CONSTANT
#endif
    movdqa  %xmm1, %xmm2
    pand    %xmm3, %xmm1
    pclmulqdq $0x10, CONSTANT, %xmm1
    pand    %xmm3, %xmm1
    pclmulqdq $0x00, CONSTANT, %xmm1
    pxor    %xmm2, %xmm1
    pextrd  $0x01, %xmm1, %eax

    RET
SYM_FUNC_END(crc32_pclmul_le_16)