//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)         code
#endif

        .text
        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

        init_crc        .req    r0
        buf             .req    r1
        len             .req    r2

        fold_consts_ptr .req    ip

        q0l             .req    d0
        q0h             .req    d1
        q1l             .req    d2
        q1h             .req    d3
        q2l             .req    d4
        q2h             .req    d5
        q3l             .req    d6
        q3h             .req    d7
        q4l             .req    d8
        q4h             .req    d9
        q5l             .req    d10
        q5h             .req    d11
        q6l             .req    d12
        q6h             .req    d13
        q7l             .req    d14
        q7h             .req    d15
        q8l             .req    d16
        q8h             .req    d17
        q9l             .req    d18
        q9h             .req    d19
        q10l            .req    d20
        q10h            .req    d21
        q11l            .req    d22
        q11h            .req    d23
        q12l            .req    d24
        q12h            .req    d25

        FOLD_CONSTS     .req    q10
        FOLD_CONST_L    .req    q10l
        FOLD_CONST_H    .req    q10h

        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
        // into reg1, reg2.
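        // Folding works because CRC arithmetic is linear over GF(2): for a
        // register holding the 128-bit value hi*x^64 + lo, the two carryless
        // multiplies below compute
        //     hi * (x^(8*128+64) mod G(x))  xor  lo * (x^(8*128) mod G(x)),
        // which is congruent mod G(x) to (hi*x^64 + lo) * x^(8*128), i.e. to
        // the old contents advanced by 128 bytes. XORing in the newly loaded
        // data then absorbs those bytes into the running remainder.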
        .macro          fold_32_bytes, reg1, reg2
        vld1.64         {q11-q12}, [buf]!

        vmull.p64       q8, \reg1\()h, FOLD_CONST_H
        vmull.p64       \reg1, \reg1\()l, FOLD_CONST_L
        vmull.p64       q9, \reg2\()h, FOLD_CONST_H
        vmull.p64       \reg2, \reg2\()l, FOLD_CONST_L

CPU_LE( vrev64.8        q11, q11        )
CPU_LE( vrev64.8        q12, q12        )
        vswp            q11l, q11h
        vswp            q12l, q12h

        veor.8          \reg1, \reg1, q8
        veor.8          \reg2, \reg2, q9
        veor.8          \reg1, \reg1, q11
        veor.8          \reg2, \reg2, q12
        .endm

        // Fold src_reg into dst_reg, optionally loading the next fold constants
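        // Here \src_reg holds data that precedes \dst_reg's data by 64, 32 or
        // 16 bytes, depending on which constants are currently in FOLD_CONSTS
        // (x^(k*128) mod G(x) and x^(k*128+64) mod G(x) for k = 4, 2 or 1).
        // Multiplying \src_reg's halves by those constants is congruent to
        // multiplying it by x^(k*128), which realigns it with \dst_reg so the
        // two can simply be XORed together.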
        .macro          fold_16_bytes, src_reg, dst_reg, load_next_consts
        vmull.p64       q8, \src_reg\()l, FOLD_CONST_L
        vmull.p64       \src_reg, \src_reg\()h, FOLD_CONST_H
        .ifnb           \load_next_consts
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        .endif
        veor.8          \dst_reg, \dst_reg, q8
        veor.8          \dst_reg, \dst_reg, \src_reg
        .endm

        .macro          __adrl, out, sym
        movw            \out, #:lower16:\sym
        movt            \out, #:upper16:\sym
        .endm

//
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
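// For reference, the value computed here matches a plain bit-at-a-time
// MSB-first CRC with polynomial 0x8bb7 and the caller's init_crc as the
// starting state. A minimal illustrative sketch (the name crc_t10dif_ref is
// made up for this comment and is not built from this file):
//
//      u16 crc_t10dif_ref(u16 crc, const u8 *buf, size_t len)
//      {
//              while (len--) {
//                      crc ^= (u16)*buf++ << 8;        /* next 8 message bits */
//                      for (int i = 0; i < 8; i++)     /* one bit at a time */
//                              crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//                                                   : crc << 1;
//              }
//              return crc;
//      }
//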
ENTRY(crc_t10dif_pmull)

        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
        cmp             len, #256
        blt             .Lless_than_256_bytes

        __adrl          fold_consts_ptr, .Lfold_across_128_bytes_consts

        // Load the first 128 data bytes. Byte swapping is necessary to make
        // the bit order match the polynomial coefficient order.
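        // (On little-endian, vrev64.8 reverses the bytes within each 64-bit
        // half and the following vswp swaps the two halves, i.e. together they
        // byte-reverse the whole 16-byte vector so that the earliest data byte
        // becomes the most significant byte. Under CONFIG_CPU_ENDIAN_BE8 the
        // vrev64.8 is compiled away by CPU_LE() and only the vswp is needed.)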
        vld1.64         {q0-q1}, [buf]!
        vld1.64         {q2-q3}, [buf]!
        vld1.64         {q4-q5}, [buf]!
        vld1.64         {q6-q7}, [buf]!
CPU_LE( vrev64.8        q0, q0  )
CPU_LE( vrev64.8        q1, q1  )
CPU_LE( vrev64.8        q2, q2  )
CPU_LE( vrev64.8        q3, q3  )
CPU_LE( vrev64.8        q4, q4  )
CPU_LE( vrev64.8        q5, q5  )
CPU_LE( vrev64.8        q6, q6  )
CPU_LE( vrev64.8        q7, q7  )
        vswp            q0l, q0h
        vswp            q1l, q1h
        vswp            q2l, q2h
        vswp            q3l, q3h
        vswp            q4l, q4h
        vswp            q5l, q5h
        vswp            q6l, q6h
        vswp            q7l, q7h

        // XOR the first 16 data *bits* with the initial CRC value.
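        // This works because the CRC is linear: the caller's running CRC of
        // all preceding data, placed in the 16 most significant bits of the
        // first 128-bit block (lane 3 of q8h), is carried forward by the same
        // folds that process the message bytes themselves.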
        vmov.i8         q8h, #0
        vmov.u16        q8h[3], init_crc
        veor            q0h, q0h, q8h

        // Load the constants for folding across 128 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Subtract 128 for the 128 data bytes just consumed. Subtract another
        // 128 to simplify the termination condition of the following loop.
        sub             len, len, #256

        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
.Lfold_128_bytes_loop:
        fold_32_bytes   q0, q1
        fold_32_bytes   q2, q3
        fold_32_bytes   q4, q5
        fold_32_bytes   q6, q7
        subs            len, len, #128
        bge             .Lfold_128_bytes_loop

        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

        // Fold across 64 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        fold_16_bytes   q0, q4
        fold_16_bytes   q1, q5
        fold_16_bytes   q2, q6
        fold_16_bytes   q3, q7, 1
        // Fold across 32 bytes.
        fold_16_bytes   q4, q6
        fold_16_bytes   q5, q7, 1
        // Fold across 16 bytes.
        fold_16_bytes   q6, q7

        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
        // Then subtract 16 to simplify the termination condition of the
        // following loop.
        adds            len, len, #(128-16)

        // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
        // into them, storing the result back into q7.
        blt             .Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
        vmull.p64       q8, q7l, FOLD_CONST_L
        vmull.p64       q7, q7h, FOLD_CONST_H
        veor.8          q7, q7, q8
        vld1.64         {q0}, [buf]!
CPU_LE( vrev64.8        q0, q0  )
        vswp            q0l, q0h
        veor.8          q7, q7, q0
        subs            len, len, #16
        bge             .Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        adds            len, len, #16
        beq             .Lreduce_final_16_bytes

.Lhandle_partial_segment:
        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
        // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
        // do this without needing a fold constant for each possible 'len',
        // redivide the bytes into a first chunk of 'len' bytes and a second
        // chunk of 16 bytes, then fold the first chunk into the second.
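        // For example, with len = 3 the 19-byte tail is split into a first
        // chunk holding its first 3 bytes (the high-order bytes of q7) and a
        // second chunk holding its final 16 bytes (the low 13 bytes of q7
        // followed by the 3 bytes still in 'buf'). The first chunk then sits
        // exactly 16 bytes (x^128) ahead of the second chunk, so the ordinary
        // fold-across-16-bytes constants apply.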

        // q0 = last 16 original data bytes
        add             buf, buf, len
        sub             buf, buf, #16
        vld1.64         {q0}, [buf]
CPU_LE( vrev64.8        q0, q0  )
        vswp            q0l, q0h

        // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
        __adrl          r3, .Lbyteshift_table + 16
        sub             r3, r3, len
        vld1.8          {q2}, [r3]
        vtbl.8          q1l, {q7l-q7h}, q2l
        vtbl.8          q1h, {q7l-q7h}, q2h

        // q3 = first chunk: q7 right-shifted by '16-len' bytes.
        vmov.i8         q3, #0x80
        veor.8          q2, q2, q3
        vtbl.8          q3l, {q7l-q7h}, q2l
        vtbl.8          q3h, {q7l-q7h}, q2h

        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
        vshr.s8         q2, q2, #7

        // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
        // then '16-len' bytes from q1 (high-order bytes).
        vbsl.8          q2, q1, q0

        // Fold the first chunk into the second chunk, storing the result in q7.
        vmull.p64       q0, q3l, FOLD_CONST_L
        vmull.p64       q7, q3h, FOLD_CONST_H
        veor.8          q7, q7, q0
        veor.8          q7, q7, q2

.Lreduce_final_16_bytes:
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.

        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Fold the high 64 bits into the low 64 bits, while also multiplying by
        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
        // whose low 48 bits are 0.
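        // That is, q0 := hi(q7) * (x^48 * (x^80 mod G(x)))  xor  lo(q7) * x^64;
        // since x^48 * (x^80 mod G(x)) is congruent to x^128 mod G(x), this is
        // congruent to (hi(q7)*x^64 + lo(q7)) * x^64 = x^64 * M(x).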
        vmull.p64       q0, q7h, FOLD_CONST_H   // high bits * x^48 * (x^80 mod G(x))
        veor.8          q0h, q0h, q7l           // + low bits * x^64

        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
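        // (x^48 * (x^48 mod G(x)) is congruent to x^96 mod G(x), which is the
        // weight of the 32 bits being moved, so congruence is preserved.)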
        vmov.i8         q1, #0
        vmov            s4, s3                  // extract high 32 bits
        vmov            s3, s5                  // zero high 32 bits
        vmull.p64       q1, q1l, FOLD_CONST_L   // high 32 bits * x^48 * (x^48 mod G(x))
        veor.8          q0, q0, q1              // + low bits

        // Load G(x) and floor(x^48 / G(x)).
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]

        // Use Barrett reduction to compute the final CRC value.
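        // Barrett reduction replaces the final polynomial division by two
        // multiplications: the quotient of the remaining value by G(x) is
        // approximated using the precomputed floor(x^48 / G(x)), and XORing
        // quotient * G(x) back in cancels everything above the low 16 bits,
        // which are the CRC.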
        vmull.p64       q1, q0h, FOLD_CONST_H   // high 32 bits * floor(x^48 / G(x))
        vshr.u64        q1l, q1l, #32           // /= x^32
        vmull.p64       q1, q1l, FOLD_CONST_L   // *= G(x)
        vshr.u64        q0l, q0l, #48
        veor.8          q0l, q0l, q1l           // + low 16 nonzero bits
        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.

        vmov.u16        r0, q0l[0]
        bx              lr

.Lless_than_256_bytes:
        // Checksumming a buffer of length 16...255 bytes

        __adrl          fold_consts_ptr, .Lfold_across_16_bytes_consts

        // Load the first 16 data bytes.
        vld1.64         {q7}, [buf]!
CPU_LE( vrev64.8        q7, q7  )
        vswp            q7l, q7h

        // XOR the first 16 data *bits* with the initial CRC value.
        vmov.i8         q0h, #0
        vmov.u16        q0h[3], init_crc
        veor.8          q7h, q7h, q0h

        // Load the fold-across-16-bytes constants.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        cmp             len, #16
        beq             .Lreduce_final_16_bytes         // len == 16
        subs            len, len, #32
        addlt           len, len, #16
        blt             .Lhandle_partial_segment        // 17 <= len <= 31
        b               .Lfold_16_bytes_loop            // 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)

        .section        ".rodata", "a"
        .align          4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
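//
// The 16-bit x^N mod G(x) values annotated below can be regenerated with a
// simple GF(2) model such as the following sketch (xpow_mod_g is a made-up
// helper for this comment, not kernel code; e.g. xpow_mod_g(1*128) should
// reproduce the x^(1*128) mod G(x) entry):
//
//      u32 xpow_mod_g(unsigned int n)          /* x^n mod G(x), G(x) = 0x18bb7 */
//      {
//              u32 r = 1;                      /* start from x^0 */
//              while (n--) {
//                      r <<= 1;                /* multiply by x */
//                      if (r & 0x10000)
//                              r ^= 0x18bb7;   /* reduce modulo G(x) */
//              }
//              return r;
//      }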
.Lfold_across_128_bytes_consts:
        .quad           0x0000000000006123      // x^(8*128)    mod G(x)
        .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
        .quad           0x0000000000001069      // x^(4*128)    mod G(x)
        .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
        .quad           0x000000000000857d      // x^(2*128)    mod G(x)
        .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
        .quad           0x000000000000a010      // x^(1*128)    mod G(x)
        .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
        .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
        .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
        .quad           0x0000000000018bb7      // G(x)
        .quad           0x00000001f65a57f8      // floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
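//
// For example, with len = 3 the vector at &.Lbyteshift_table[13] is
// {0x8d, 0x8e, 0x8f, 0x0, 0x1, ..., 0xc}: the 0x8x entries are out of range
// for vtbl and so produce zero bytes, giving q7 shifted left by 3 bytes,
// while XORing the vector with 0x80 gives {0xd, 0xe, 0xf, 0x80, ...}, the
// indices that shift q7 right by 13 bytes.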
.Lbyteshift_table:
        .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0