//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)         code
#endif

        .text
        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

        init_crc        .req    r0
        buf             .req    r1
        len             .req    r2

        fold_consts_ptr .req    ip

        q0l             .req    d0
        q0h             .req    d1
        q1l             .req    d2
        q1h             .req    d3
        q2l             .req    d4
        q2h             .req    d5
        q3l             .req    d6
        q3h             .req    d7
        q4l             .req    d8
        q4h             .req    d9
        q5l             .req    d10
        q5h             .req    d11
        q6l             .req    d12
        q6h             .req    d13
        q7l             .req    d14
        q7h             .req    d15
        q8l             .req    d16
        q8h             .req    d17
        q9l             .req    d18
        q9h             .req    d19
        q10l            .req    d20
        q10h            .req    d21
        q11l            .req    d22
        q11h            .req    d23
        q12l            .req    d24
        q12h            .req    d25

        FOLD_CONSTS     .req    q10
        FOLD_CONST_L    .req    q10l
        FOLD_CONST_H    .req    q10h

        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
        // into reg1, reg2.
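        // Folding works because CRC arithmetic is linear over GF(2): for a
        // register holding the 128-bit value hi*x^64 + lo, the two carryless
        // multiplies below compute
        //     hi * (x^(8*128+64) mod G(x))  xor  lo * (x^(8*128) mod G(x)),
        // which is congruent mod G(x) to (hi*x^64 + lo) * x^(8*128), i.e. to
        // the old contents advanced by 128 bytes. XORing in the newly loaded
        // data then absorbs those bytes into the running remainder.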
        .macro          fold_32_bytes, reg1, reg2
        vld1.64         {q11-q12}, [buf]!

        vmull.p64       q8, \reg1\()h, FOLD_CONST_H
        vmull.p64       \reg1, \reg1\()l, FOLD_CONST_L
        vmull.p64       q9, \reg2\()h, FOLD_CONST_H
        vmull.p64       \reg2, \reg2\()l, FOLD_CONST_L

CPU_LE( vrev64.8        q11, q11        )
CPU_LE( vrev64.8        q12, q12        )
        vswp            q11l, q11h
        vswp            q12l, q12h

        veor.8          \reg1, \reg1, q8
        veor.8          \reg2, \reg2, q9
        veor.8          \reg1, \reg1, q11
        veor.8          \reg2, \reg2, q12
        .endm

        // Fold src_reg into dst_reg, optionally loading the next fold constants
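        // Here \src_reg holds data that precedes \dst_reg's data by 64, 32 or
        // 16 bytes, depending on which constants are currently in FOLD_CONSTS
        // (x^(k*128) mod G(x) and x^(k*128+64) mod G(x) for k = 4, 2 or 1).
        // Multiplying \src_reg's halves by those constants is congruent to
        // multiplying it by x^(k*128), which realigns it with \dst_reg so the
        // two can simply be XORed together.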
        .macro          fold_16_bytes, src_reg, dst_reg, load_next_consts
        vmull.p64       q8, \src_reg\()l, FOLD_CONST_L
        vmull.p64       \src_reg, \src_reg\()h, FOLD_CONST_H
        .ifnb           \load_next_consts
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        .endif
        veor.8          \dst_reg, \dst_reg, q8
        veor.8          \dst_reg, \dst_reg, \src_reg
        .endm

        .macro          __adrl, out, sym
        movw            \out, #:lower16:\sym
        movt            \out, #:upper16:\sym
        .endm

//
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
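// For reference, the value computed here matches a plain bit-at-a-time
// MSB-first CRC with polynomial 0x8bb7 and the caller's init_crc as the
// starting state. A minimal illustrative sketch (the name crc_t10dif_ref is
// made up for this comment and is not built from this file):
//
//      u16 crc_t10dif_ref(u16 crc, const u8 *buf, size_t len)
//      {
//              while (len--) {
//                      crc ^= (u16)*buf++ << 8;        /* next 8 message bits */
//                      for (int i = 0; i < 8; i++)     /* one bit at a time */
//                              crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//                                                   : crc << 1;
//              }
//              return crc;
//      }
//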
ENTRY(crc_t10dif_pmull)

        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
        cmp             len, #256
        blt             .Lless_than_256_bytes

        __adrl          fold_consts_ptr, .Lfold_across_128_bytes_consts

        // Load the first 128 data bytes. Byte swapping is necessary to make
        // the bit order match the polynomial coefficient order.
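        // (On little-endian, vrev64.8 reverses the bytes within each 64-bit
        // half and the following vswp swaps the two halves, i.e. together they
        // byte-reverse the whole 16-byte vector so that the earliest data byte
        // becomes the most significant byte. Under CONFIG_CPU_ENDIAN_BE8 the
        // vrev64.8 is compiled away by CPU_LE() and only the vswp is needed.)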
        vld1.64         {q0-q1}, [buf]!
        vld1.64         {q2-q3}, [buf]!
        vld1.64         {q4-q5}, [buf]!
        vld1.64         {q6-q7}, [buf]!
CPU_LE( vrev64.8        q0, q0  )
CPU_LE( vrev64.8        q1, q1  )
CPU_LE( vrev64.8        q2, q2  )
CPU_LE( vrev64.8        q3, q3  )
CPU_LE( vrev64.8        q4, q4  )
CPU_LE( vrev64.8        q5, q5  )
CPU_LE( vrev64.8        q6, q6  )
CPU_LE( vrev64.8        q7, q7  )
        vswp            q0l, q0h
        vswp            q1l, q1h
        vswp            q2l, q2h
        vswp            q3l, q3h
        vswp            q4l, q4h
        vswp            q5l, q5h
        vswp            q6l, q6h
        vswp            q7l, q7h

        // XOR the first 16 data *bits* with the initial CRC value.
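        // This works because the CRC is linear: the caller's running CRC of
        // all preceding data, placed in the 16 most significant bits of the
        // first 128-bit block (lane 3 of q8h), is carried forward by the same
        // folds that process the message bytes themselves.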
        vmov.i8         q8h, #0
        vmov.u16        q8h[3], init_crc
        veor            q0h, q0h, q8h

        // Load the constants for folding across 128 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Subtract 128 for the 128 data bytes just consumed. Subtract another
        // 128 to simplify the termination condition of the following loop.
        sub             len, len, #256

        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
.Lfold_128_bytes_loop:
        fold_32_bytes   q0, q1
        fold_32_bytes   q2, q3
        fold_32_bytes   q4, q5
        fold_32_bytes   q6, q7
        subs            len, len, #128
        bge             .Lfold_128_bytes_loop

        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

        // Fold across 64 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        fold_16_bytes   q0, q4
        fold_16_bytes   q1, q5
        fold_16_bytes   q2, q6
        fold_16_bytes   q3, q7, 1
        // Fold across 32 bytes.
        fold_16_bytes   q4, q6
        fold_16_bytes   q5, q7, 1
        // Fold across 16 bytes.
        fold_16_bytes   q6, q7

        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
        // Then subtract 16 to simplify the termination condition of the
        // following loop.
        adds            len, len, #(128-16)

        // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
        // into them, storing the result back into q7.
        blt             .Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
        vmull.p64       q8, q7l, FOLD_CONST_L
        vmull.p64       q7, q7h, FOLD_CONST_H
        veor.8          q7, q7, q8
        vld1.64         {q0}, [buf]!
CPU_LE( vrev64.8        q0, q0  )
        vswp            q0l, q0h
        veor.8          q7, q7, q0
        subs            len, len, #16
        bge             .Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        adds            len, len, #16
        beq             .Lreduce_final_16_bytes

.Lhandle_partial_segment:
        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
        // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
        // do this without needing a fold constant for each possible 'len',
        // redivide the bytes into a first chunk of 'len' bytes and a second
        // chunk of 16 bytes, then fold the first chunk into the second.
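        // For example, with len = 3 the 19-byte tail is split into a first
        // chunk holding its first 3 bytes (the high-order bytes of q7) and a
        // second chunk holding its final 16 bytes (the low 13 bytes of q7
        // followed by the 3 bytes still in 'buf'). The first chunk then sits
        // exactly 16 bytes (x^128) ahead of the second chunk, so the ordinary
        // fold-across-16-bytes constants apply.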

        // q0 = last 16 original data bytes
        add             buf, buf, len
        sub             buf, buf, #16
        vld1.64         {q0}, [buf]
CPU_LE( vrev64.8        q0, q0  )
        vswp            q0l, q0h

        // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
        __adrl          r3, .Lbyteshift_table + 16
        sub             r3, r3, len
        vld1.8          {q2}, [r3]
        vtbl.8          q1l, {q7l-q7h}, q2l
        vtbl.8          q1h, {q7l-q7h}, q2h

        // q3 = first chunk: q7 right-shifted by '16-len' bytes.
        vmov.i8         q3, #0x80
        veor.8          q2, q2, q3
        vtbl.8          q3l, {q7l-q7h}, q2l
        vtbl.8          q3h, {q7l-q7h}, q2h

        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
        vshr.s8         q2, q2, #7

        // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
        // then '16-len' bytes from q1 (high-order bytes).
        vbsl.8          q2, q1, q0

        // Fold the first chunk into the second chunk, storing the result in q7.
        vmull.p64       q0, q3l, FOLD_CONST_L
        vmull.p64       q7, q3h, FOLD_CONST_H
        veor.8          q7, q7, q0
        veor.8          q7, q7, q2

.Lreduce_final_16_bytes:
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.

        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        // Fold the high 64 bits into the low 64 bits, while also multiplying by
        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
        // whose low 48 bits are 0.
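        // That is, q0 := hi(q7) * (x^48 * (x^80 mod G(x)))  xor  lo(q7) * x^64;
        // since x^48 * (x^80 mod G(x)) is congruent to x^128 mod G(x), this is
        // congruent to (hi(q7)*x^64 + lo(q7)) * x^64 = x^64 * M(x).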
        vmull.p64       q0, q7h, FOLD_CONST_H   // high bits * x^48 * (x^80 mod G(x))
        veor.8          q0h, q0h, q7l           // + low bits * x^64

        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
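        // (x^48 * (x^48 mod G(x)) is congruent to x^96 mod G(x), which is the
        // weight of the 32 bits being moved, so congruence is preserved.)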
        vmov.i8         q1, #0
        vmov            s4, s3                  // extract high 32 bits
        vmov            s3, s5                  // zero high 32 bits
        vmull.p64       q1, q1l, FOLD_CONST_L   // high 32 bits * x^48 * (x^48 mod G(x))
        veor.8          q0, q0, q1              // + low bits

        // Load G(x) and floor(x^48 / G(x)).
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]

        // Use Barrett reduction to compute the final CRC value.
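        // Barrett reduction replaces the final polynomial division by two
        // multiplications: the quotient of the remaining value by G(x) is
        // approximated using the precomputed floor(x^48 / G(x)), and XORing
        // quotient * G(x) back in cancels everything above the low 16 bits,
        // which are the CRC.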
        vmull.p64       q1, q0h, FOLD_CONST_H   // high 32 bits * floor(x^48 / G(x))
        vshr.u64        q1l, q1l, #32           // /= x^32
        vmull.p64       q1, q1l, FOLD_CONST_L   // *= G(x)
        vshr.u64        q0l, q0l, #48
        veor.8          q0l, q0l, q1l           // + low 16 nonzero bits
        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.

        vmov.u16        r0, q0l[0]
        bx              lr

.Lless_than_256_bytes:
        // Checksumming a buffer of length 16...255 bytes

        __adrl          fold_consts_ptr, .Lfold_across_16_bytes_consts

        // Load the first 16 data bytes.
        vld1.64         {q7}, [buf]!
CPU_LE( vrev64.8        q7, q7  )
        vswp            q7l, q7h

        // XOR the first 16 data *bits* with the initial CRC value.
        vmov.i8         q0h, #0
        vmov.u16        q0h[3], init_crc
        veor.8          q7h, q7h, q0h

        // Load the fold-across-16-bytes constants.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!

        cmp             len, #16
        beq             .Lreduce_final_16_bytes         // len == 16
        subs            len, len, #32
        addlt           len, len, #16
        blt             .Lhandle_partial_segment        // 17 <= len <= 31
        b               .Lfold_16_bytes_loop            // 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)

        .section        ".rodata", "a"
        .align          4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
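//
// The 16-bit x^N mod G(x) values annotated below can be regenerated with a
// simple GF(2) model such as the following sketch (xpow_mod_g is a made-up
// helper for this comment, not kernel code; e.g. xpow_mod_g(1*128) should
// reproduce the x^(1*128) mod G(x) entry):
//
//      u32 xpow_mod_g(unsigned int n)          /* x^n mod G(x), G(x) = 0x18bb7 */
//      {
//              u32 r = 1;                      /* start from x^0 */
//              while (n--) {
//                      r <<= 1;                /* multiply by x */
//                      if (r & 0x10000)
//                              r ^= 0x18bb7;   /* reduce modulo G(x) */
//              }
//              return r;
//      }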
.Lfold_across_128_bytes_consts:
        .quad           0x0000000000006123      // x^(8*128)    mod G(x)
        .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
        .quad           0x0000000000001069      // x^(4*128)    mod G(x)
        .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
        .quad           0x000000000000857d      // x^(2*128)    mod G(x)
        .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
        .quad           0x000000000000a010      // x^(1*128)    mod G(x)
        .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
        .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
        .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
        .quad           0x0000000000018bb7      // G(x)
        .quad           0x00000001f65a57f8      // floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
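//
// For example, with len = 3 the vector at &.Lbyteshift_table[13] is
// {0x8d, 0x8e, 0x8f, 0x0, 0x1, ..., 0xc}: the 0x8x entries are out of range
// for vtbl and so produce zero bytes, giving q7 shifted left by 3 bytes,
// while XORing the vector with 0x80 gives {0xd, 0xe, 0xf, 0x80, ...}, the
// indices that shift q7 right by 13 bytes.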
.Lbyteshift_table:
        .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0