s390/crypto/crc32le-vx.S

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * Hardware-accelerated CRC-32 variants for Linux on z Systems
0004  *
0005  * Use the z/Architecture Vector Extension Facility to accelerate the
0006  * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
0007  * and Castagnoli.
0008  *
0009  * This CRC-32 implementation algorithm is bitreflected and processes
0010  * the least-significant bit first (Little-Endian).
0011  *
0012  * Copyright IBM Corp. 2015
0013  * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
0014  */
0015
0016 #include <linux/linkage.h>
0017 #include <asm/nospec-insn.h>
0018 #include <asm/vx-insn.h>
0019
0020 /* Vector registers V9..V14; filled from the constant pool by a single VLM */
0021 #define CONST_PERM_LE2BE    %v9     /* VPERM byte-reversal mask; later reused for shift counts */
0022 #define CONST_R2R1      %v10        /* R2 (leftmost), R1 (rightmost): 64-byte fold */
0023 #define CONST_R4R3      %v11        /* R4 (leftmost), R3 (rightmost): 16-byte fold */
0024 #define CONST_R5        %v12        /* R5: final 32-bit fold */
0025 #define CONST_RU_POLY       %v13    /* u': Barrett reduction constant */
0026 #define CONST_CRC_POLY      %v14    /* P'(x) << 1: last Barrett reduction step */
0027
0028 .data
0029 .align 8
0030
0031 /*
0032  * The CRC-32 constant block contains reduction constants to fold and
0033  * process particular chunks of the input data stream in parallel.
0034  *
0035  * For the CRC-32 variants, the constants are precomputed according to
0036  * these definitions (x^n denotes x raised to the power n):
0037  *
0038  *  R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
0039  *  R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
0040  *  R3 = [(x^(128+32) mod P'(x) << 32)]'   << 1
0041  *  R4 = [(x^(128-32) mod P'(x) << 32)]'   << 1
0042  *  R5 = [(x^64 mod P'(x) << 32)]'         << 1
0043  *  R6 = [(x^32 mod P'(x) << 32)]'         << 1  (not stored in the pools below)
0044  *
0045  *  The bitreflected Barrett reduction constant, u', is defined as
0046  *  the bit reversal of floor(x^64 / P(x)).
0047  *
0048  *  where P(x) is the polynomial in the normal domain and P'(x) is the
0049  *  polynomial in the reversed (bitreflected) domain.
0050  *
0051  * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
0052  *
0053  *  P(x)  = 0x04C11DB7
0054  *  P'(x) = 0xEDB88320
0055  *
0056  * CRC-32C (Castagnoli) polynomials:
0057  *
0058  *  P(x)  = 0x1EDC6F41
0059  *  P'(x) = 0x82F63B78
0060  */
0061 /* Each pool below is 6 x 16 bytes, loaded in this order into V9..V14 by VLM. */
0062 .Lconstants_CRC_32_LE:              # CRC-32 (IEEE 802.3) pool, used by crc32_le_vgfm_16
0063     .octa       0x0F0E0D0C0B0A09080706050403020100  # BE->LE mask (VPERM byte reversal)
0064     .quad       0x1c6e41596, 0x154442bd4        # R2, R1 (64-byte fold)
0065     .quad       0x0ccaa009e, 0x1751997d0        # R4, R3 (16-byte fold)
0066     .octa       0x163cd6124             # R5 (final 32-bit fold)
0067     .octa       0x1F7011641             # u' (Barrett reduction)
0068     .octa       0x1DB710641             # P'(x) << 1
0069
0070 .Lconstants_CRC_32C_LE:             # CRC-32C (Castagnoli) pool, used by crc32c_le_vgfm_16
0071     .octa       0x0F0E0D0C0B0A09080706050403020100  # BE->LE mask (VPERM byte reversal)
0072     .quad       0x09e4addf8, 0x740eef02         # R2, R1 (64-byte fold)
0073     .quad       0x14cd00bd6, 0xf20c0dfe         # R4, R3 (16-byte fold)
0074     .octa       0x0dd45aab8             # R5 (final 32-bit fold)
0075     .octa       0x0dea713f1             # u' (Barrett reduction)
0076     .octa       0x105ec76f0             # P'(x) << 1
0077
0078 .previous
0079
0080     GEN_BR_THUNK %r14       /* NOTE(review): presumably emits the thunk that BR_EX %r14 uses; macro comes from <asm/nospec-insn.h> - confirm */
0081
0082 .text
0083
0084 /*
0085  * The CRC-32 functions use these calling conventions:
0086  *
0087  * Parameters:
0088  *
0089  *  %r2:    Initial CRC value, typically ~0; and final CRC (return) value.
0090  *  %r3:    Input buffer pointer, performance might be improved if the
0091  *      buffer is on a doubleword boundary.
0092  *  %r4:    Length of the buffer, must be 64 bytes or greater; a tail shorter than 16 bytes is not processed.
0093  *
0094  * Register usage:
0095  *
0096  *  %r5:    CRC-32 constant pool base pointer.
0097  *  V0: Initial CRC value and intermediate constants and results.
0098  *  V1..V4: Data for CRC computation.
0099  *  V5..V8: Next data chunks that are fetched from the input buffer.
0100  *  V9: Constant for BE->LE conversion and shift operations
0101  *
0102  *  V10..V14: CRC-32 constants.
0103  */
0104
0105 ENTRY(crc32_le_vgfm_16)             /* CRC-32 (IEEE 802.3) entry point */
0106     larl    %r5,.Lconstants_CRC_32_LE   /* %r5 = address of the IEEE 802.3 constant pool */
0107     j   crc32_le_vgfm_generic       /* jump without linking: %r2..%r4 and %r14 pass through unchanged */
0108 ENDPROC(crc32_le_vgfm_16)
0109
0110 ENTRY(crc32c_le_vgfm_16)            /* CRC-32C (Castagnoli) entry point */
0111     larl    %r5,.Lconstants_CRC_32C_LE  /* %r5 = address of the Castagnoli constant pool */
0112     j   crc32_le_vgfm_generic       /* jump without linking: %r2..%r4 and %r14 pass through unchanged */
0113 ENDPROC(crc32c_le_vgfm_16)
0114
0115 ENTRY(crc32_le_vgfm_generic)        /* in: %r2 CRC, %r3 buffer, %r4 length, %r5 constant pool; out: %r2 CRC */
0116     /* Load the six 16-byte pool constants at %r5 into V9..V14 */
0117     VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
0118
0119     /*
0120      * Load the initial CRC value.
0121      *
0122      * The CRC value is loaded into the rightmost word of the
0123      * vector register and is later XORed with the LSB portion
0124      * of the loaded input data.
0125      */
0126     VZERO   %v0         /* Clear V0 */
0127     VLVGF   %v0,%r2,3       /* Load CRC into rightmost word */
0128
0129     /* Load the first 64-byte data chunk, byte-reverse it and XOR with CRC */
0130     VLM %v1,%v4,0,%r3       /* 64-bytes into V1..V4 */
0131     VPERM   %v1,%v1,%v1,CONST_PERM_LE2BE    /* reverse the byte order of each vector */
0132     VPERM   %v2,%v2,%v2,CONST_PERM_LE2BE
0133     VPERM   %v3,%v3,%v3,CONST_PERM_LE2BE
0134     VPERM   %v4,%v4,%v4,CONST_PERM_LE2BE
0135
0136     VX  %v1,%v0,%v1     /* V1 ^= CRC */
0137     aghi    %r3,64          /* BUF = BUF + 64 */
0138     aghi    %r4,-64         /* LEN = LEN - 64 */
0139
0140     cghi    %r4,64          /* another full 64-byte chunk left? */
0141     jl  .Lless_than_64bytes     /* no: skip the 64-byte folding loop */
0142
0143 .Lfold_64bytes_loop:
0144     /* Load the next 64-byte data chunk into V5 to V8 */
0145     VLM %v5,%v8,0,%r3
0146     VPERM   %v5,%v5,%v5,CONST_PERM_LE2BE
0147     VPERM   %v6,%v6,%v6,CONST_PERM_LE2BE
0148     VPERM   %v7,%v7,%v7,CONST_PERM_LE2BE
0149     VPERM   %v8,%v8,%v8,CONST_PERM_LE2BE
0150
0151     /*
0152      * Perform a GF(2) multiplication of the doublewords in V1 with
0153      * the R1 and R2 reduction constants in CONST_R2R1.  The intermediate
0154      * result is then folded (accumulated) with the next data chunk in V5
0155      * and stored in V1. Repeat this step for the register contents
0156      * in V2, V3, and V4 respectively (with V6, V7, and V8).
0157      */
0158     VGFMAG  %v1,CONST_R2R1,%v1,%v5
0159     VGFMAG  %v2,CONST_R2R1,%v2,%v6
0160     VGFMAG  %v3,CONST_R2R1,%v3,%v7
0161     VGFMAG  %v4,CONST_R2R1,%v4,%v8
0162
0163     aghi    %r3,64          /* BUF = BUF + 64 */
0164     aghi    %r4,-64         /* LEN = LEN - 64 */
0165
0166     cghi    %r4,64          /* loop while at least 64 bytes remain */
0167     jnl .Lfold_64bytes_loop
0168
0169 .Lless_than_64bytes:
0170     /*
0171      * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
0172      * and R4 and accumulate the next 128-bit chunk until a single 128-bit
0173      * value remains.
0174      */
0175     VGFMAG  %v1,CONST_R4R3,%v1,%v2
0176     VGFMAG  %v1,CONST_R4R3,%v1,%v3
0177     VGFMAG  %v1,CONST_R4R3,%v1,%v4
0178
0179     cghi    %r4,16          /* at least one full 16-byte chunk left? */
0180     jl  .Lfinal_fold        /* no: go straight to the final fold */
0181
0182 .Lfold_16bytes_loop:
0183     /* Fold the remaining full 16-byte chunks into V1, one per iteration */
0184     VL  %v2,0,,%r3      /* Load next data chunk */
0185     VPERM   %v2,%v2,%v2,CONST_PERM_LE2BE
0186     VGFMAG  %v1,CONST_R4R3,%v1,%v2  /* Fold next data chunk */
0187
0188     aghi    %r3,16          /* BUF = BUF + 16 */
0189     aghi    %r4,-16         /* LEN = LEN - 16 */
0190
0191     cghi    %r4,16          /* loop while at least 16 bytes remain */
0192     jnl .Lfold_16bytes_loop
0193
0194 .Lfinal_fold:
0195     /*
0196      * Set up a vector register for byte shifts.  The shift value must
0197      * be loaded in bits 1-4 in byte element 7 of a vector register.
0198      * Shift by 8 bytes: 0x40
0199      * Shift by 4 bytes: 0x20
0200      */
0201     VLEIB   %v9,0x40,7      /* V9 is no longer needed as VPERM mask from here on */
0202
0203     /*
0204      * Prepare V0 for the next GF(2) multiplication: shift CONST_R4R3 right
0205      * by 8 bytes into V0 to move R4 into the rightmost doubleword and set
0206      * the leftmost doubleword to 0x1.
0207      */
0208     VSRLB   %v0,CONST_R4R3,%v9
0209     VLEIG   %v0,1,0
0210
0211     /*
0212      * Compute GF(2) product of V1 and V0.  The rightmost doubleword
0213      * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
0214      * multiplied by 0x1 and is then XORed with rightmost product.
0215      * Implicitly, the intermediate leftmost product becomes padded.
0216      */
0217     VGFMG   %v1,%v0,%v1
0218
0219     /*
0220      * Now do the final 32-bit fold by multiplying the rightmost word
0221      * in V1 with R5 and XOR the result with the remaining bits in V1.
0222      *
0223      * To achieve this by a single VGFMAG, right shift V1 by a word
0224      * and store the result in V2 which is then accumulated.  Use the
0225      * vector unpack instruction to load the rightmost half of the
0226      * doubleword into the rightmost doubleword element of V1; the other
0227      * half is loaded in the leftmost doubleword.
0228      * The vector register with CONST_R5 contains the R5 constant in the
0229      * rightmost doubleword and the leftmost doubleword is zero to ignore
0230      * the leftmost product of V1.
0231      */
0232     VLEIB   %v9,0x20,7        /* Shift by words */
0233     VSRLB   %v2,%v1,%v9       /* Store remaining bits in V2 */
0234     VUPLLF  %v1,%v1           /* Split rightmost doubleword */
0235     VGFMAG  %v1,CONST_R5,%v1,%v2      /* V1 = (V1 * R5) XOR V2 */
0236
0237     /*
0238      * Apply a Barrett reduction to compute the final 32-bit CRC value.
0239      *
0240      * The input values to the Barrett reduction are the degree-63 polynomial
0241      * in V1 (R(x)), degree-32 generator polynomial, and the reduction
0242      * constant u.  The Barrett reduction result is the CRC value of R(x) mod
0243      * P(x).
0244      *
0245      * The Barrett reduction algorithm is defined as:
0246      *
0247      *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
0248      *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
0249      *    3. C(x)  = R(x) XOR T2(x) mod x^32
0250      *
0251      *  Note: The leftmost doubleword of vector register containing
0252      *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
0253      *  is zero and does not contribute to the final result.
0254      */
0255
0256     /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
0257     VUPLLF  %v2,%v1
0258     VGFMG   %v2,CONST_RU_POLY,%v2
0259
0260     /*
0261      * Compute the GF(2) product of the CRC polynomial with T1(x) in
0262      * V2 and XOR the intermediate result, T2(x), with the value in V1.
0263      * The final result is stored in word element 2 of V2.
0264      */
0265     VUPLLF  %v2,%v2
0266     VGFMAG  %v2,CONST_CRC_POLY,%v2,%v1
0267
0268 .Ldone:
0269     VLGVF   %r2,%v2,2       /* return value: word element 2 of V2 into %r2 */
0270     BR_EX   %r14            /* branch back to the caller via %r14 */
0271 ENDPROC(crc32_le_vgfm_generic)
0272
0273 .previous