/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please  visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Uses the hardware-provided PCLMULQDQ instruction to accelerate the
 * CRC32 calculation; this ARM port uses the equivalent VMULL.P64
 * carry-less multiply from the Crypto Extensions instead.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is an instruction introduced with Intel SSE4.2; the
 * reference can be found at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

    .text
    .align      6
    .arch       armv8-a
    .arch_extension crc
    .fpu        crypto-neon-fp-armv8

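/*
 * In the comment notation below, ' appears to denote a bit-reflected
 * value and the << 1 compensates for the reflected bit order that
 * vmull.p64 operates on; see Intel's white paper "Fast CRC Computation
 * for Generic Polynomials Using PCLMULQDQ Instruction" for the
 * derivation of fold constants of this kind.
 */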
.Lcrc32_constants:
    /*
     * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
     * #define CONSTANT_R1  0x154442bd4LL
     *
     * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
     * #define CONSTANT_R2  0x1c6e41596LL
     */
    .quad       0x0000000154442bd4
    .quad       0x00000001c6e41596

    /*
     * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
     * #define CONSTANT_R3  0x1751997d0LL
     *
     * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
     * #define CONSTANT_R4  0x0ccaa009eLL
     */
    .quad       0x00000001751997d0
    .quad       0x00000000ccaa009e

    /*
     * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
     * #define CONSTANT_R5  0x163cd6124LL
     */
    .quad       0x0000000163cd6124
    .quad       0x00000000FFFFFFFF

    /*
     * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
     *
     * Barrett reduction constant (u64') = u' = (x**64 / P(x))'
     *                                                      = 0x1F7011641LL
     * #define CONSTANT_RU  0x1F7011641LL
     */
    .quad       0x00000001DB710641
    .quad       0x00000001F7011641

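/*
 * The analogous eight quadwords for CRC32C (Castagnoli): fold constants,
 * mask, polynomial and Barrett constant, derived from
 * P(x) = 0x1EDC6F41 (0x82F63B78 in bit-reflected form).
 */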
.Lcrc32c_constants:
    .quad       0x00000000740eef02
    .quad       0x000000009e4addf8
    .quad       0x00000000f20c0dfe
    .quad       0x000000014cd00bd6
    .quad       0x00000000dd45aab8
    .quad       0x00000000FFFFFFFF
    .quad       0x0000000105ec76f0
    .quad       0x00000000dea713f1

    dCONSTANTl  .req    d0
    dCONSTANTh  .req    d1
    qCONSTANT   .req    q0

    BUF     .req    r0
    LEN     .req    r1
    CRC     .req    r2

    qzr     .req    q9

    /**
     * Calculate crc32
     * BUF - buffer
     * LEN - size of the buffer (multiple of 16 bytes), LEN should be > 63
     * CRC - initial crc32
     * return crc32 in r0
     * uint crc32_pmull_le(unsigned char const *buffer,
     *                     size_t len, uint crc32)
     */
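    /*
     * A hypothetical C caller would round the length down to a multiple
     * of 16, only take this path when the rounded length is at least 64,
     * and handle the remaining tail bytes separately, e.g.:
     *
     *     crc = crc32_pmull_le(buf, len & ~15, crc);
     *     crc = crc32_tail(buf + (len & ~15), len & 15, crc);
     *
     * where crc32_tail stands in for any scalar or table-based fallback.
     */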
ENTRY(crc32_pmull_le)
    adr     r3, .Lcrc32_constants
    b       0f

ENTRY(crc32c_pmull_le)
    adr     r3, .Lcrc32c_constants

0:  bic     LEN, LEN, #15
    vld1.8      {q1-q2}, [BUF, :128]!
    vld1.8      {q3-q4}, [BUF, :128]!
    vmov.i8     qzr, #0
    vmov.i8     qCONSTANT, #0
    vmov.32     dCONSTANTl[0], CRC
    veor.8      d2, d2, dCONSTANTl
    sub     LEN, LEN, #0x40
    cmp     LEN, #0x40
    blt     less_64

    vld1.64     {qCONSTANT}, [r3]

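    /*
     * Rough sketch of one fold step, per 16-byte lane, with (x) denoting
     * the 64x64 -> 128 bit carry-less multiplication (vmull.p64):
     *
     *     acc = (acc.lo64 (x) R1) xor (acc.hi64 (x) R2) xor next_block
     *
     * This keeps each accumulator congruent, modulo P(x), to the data
     * folded into it so far, while the loop advances 64 bytes per
     * iteration.
     */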
loop_64:        /* fold 64 bytes (one full cache line) per iteration */
    sub     LEN, LEN, #0x40

    vmull.p64   q5, d3, dCONSTANTh
    vmull.p64   q6, d5, dCONSTANTh
    vmull.p64   q7, d7, dCONSTANTh
    vmull.p64   q8, d9, dCONSTANTh

    vmull.p64   q1, d2, dCONSTANTl
    vmull.p64   q2, d4, dCONSTANTl
    vmull.p64   q3, d6, dCONSTANTl
    vmull.p64   q4, d8, dCONSTANTl

    veor.8      q1, q1, q5
    vld1.8      {q5}, [BUF, :128]!
    veor.8      q2, q2, q6
    vld1.8      {q6}, [BUF, :128]!
    veor.8      q3, q3, q7
    vld1.8      {q7}, [BUF, :128]!
    veor.8      q4, q4, q8
    vld1.8      {q8}, [BUF, :128]!

    veor.8      q1, q1, q5
    veor.8      q2, q2, q6
    veor.8      q3, q3, q7
    veor.8      q4, q4, q8

    cmp     LEN, #0x40
    bge     loop_64

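    /*
     * The 4-way loop leaves four independent partial remainders in
     * q1-q4; the three identical fold blocks below merge q2, q3 and q4
     * into q1 one at a time, using the 128-bit fold constants R3/R4
     * loaded from [r3, #16].
     */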
less_64:        /* fold the 64-byte cache line into a single 128-bit value */
    vldr        dCONSTANTl, [r3, #16]
    vldr        dCONSTANTh, [r3, #24]

    vmull.p64   q5, d3, dCONSTANTh
    vmull.p64   q1, d2, dCONSTANTl
    veor.8      q1, q1, q5
    veor.8      q1, q1, q2

    vmull.p64   q5, d3, dCONSTANTh
    vmull.p64   q1, d2, dCONSTANTl
    veor.8      q1, q1, q5
    veor.8      q1, q1, q3

    vmull.p64   q5, d3, dCONSTANTh
    vmull.p64   q1, d2, dCONSTANTl
    veor.8      q1, q1, q5
    veor.8      q1, q1, q4

    teq     LEN, #0
    beq     fold_64

loop_16:        /* fold any remaining 16-byte blocks into the accumulator */
    subs        LEN, LEN, #0x10

    vld1.8      {q2}, [BUF, :128]!
    vmull.p64   q5, d3, dCONSTANTh
    vmull.p64   q1, d2, dCONSTANTl
    veor.8      q1, q1, q5
    veor.8      q1, q1, q2

    bne     loop_16

fold_64:
    /* perform the last 64 bit fold, which also appends 32 zero bits
     * to the input stream */
    vmull.p64   q2, d2, dCONSTANTh
    vext.8      q1, q1, qzr, #8
    veor.8      q1, q1, q2

    /* final 32-bit fold */
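    /*
     * Roughly: mask off the low 32 bits of the accumulator, multiply
     * them by R5 (derived from x^64 mod P(x), loaded from [r3, #32]),
     * and xor the product into the accumulator shifted right by 32
     * bits, reducing 96 significant bits to 64.
     */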
    vldr        dCONSTANTl, [r3, #32]
    vldr        d6, [r3, #40]
    vmov.i8     d7, #0

    vext.8      q2, q1, qzr, #4
    vand.8      d2, d2, d6
    vmull.p64   q1, d2, dCONSTANTl
    veor.8      q1, q1, q2

    /* finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
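    /*
     * In essence the classic two-multiply Barrett reduction, in
     * reflected bit order: t = (acc & 0xffffffff) (x) RU, then
     * acc ^= (t & 0xffffffff) (x) P(x). The final 32-bit remainder
     * ends up in the upper half of d2, i.e. lane s5.
     */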
    vldr        dCONSTANTl, [r3, #48]
    vldr        dCONSTANTh, [r3, #56]

    vand.8      q2, q1, q3
    vext.8      q2, qzr, q2, #8
    vmull.p64   q2, d5, dCONSTANTh
    vand.8      q2, q2, q3
    vmull.p64   q2, d4, dCONSTANTl
    veor.8      q1, q1, q2
    vmov        r0, s5

    bx      lr
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
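    /*
     * Scalar CRC32 using the ARMv8 CRC instructions, with r0 = initial
     * crc, r1 = buffer, r2 = length. The pointer is first aligned to
     * 4 bytes if needed, the main loop then consumes 8 bytes per
     * iteration, and a 4/2/1-byte tail mops up the rest. The \c macro
     * argument selects the CRC32C flavour of the instructions when set
     * to 'c'.
     */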
    .macro      __crc32, c
    subs        ip, r2, #8
    bmi     .Ltail\c

    tst     r1, #3
    bne     .Lunaligned\c

    teq     ip, #0
.Laligned8\c:
    ldrd        r2, r3, [r1], #8
ARM_BE8(rev     r2, r2      )
ARM_BE8(rev     r3, r3      )
    crc32\c\()w r0, r0, r2
    crc32\c\()w r0, r0, r3
    bxeq        lr
    subs        ip, ip, #8
    bpl     .Laligned8\c

.Ltail\c:
    tst     ip, #4
    beq     2f
    ldr     r3, [r1], #4
ARM_BE8(rev     r3, r3      )
    crc32\c\()w r0, r0, r3

2:  tst     ip, #2
    beq     1f
    ldrh        r3, [r1], #2
ARM_BE8(rev16       r3, r3      )
    crc32\c\()h r0, r0, r3

1:  tst     ip, #1
    bxeq        lr
    ldrb        r3, [r1]
    crc32\c\()b r0, r0, r3
    bx      lr

.Lunaligned\c:
    tst     r1, #1
    beq     2f
    ldrb        r3, [r1], #1
    subs        r2, r2, #1
    crc32\c\()b r0, r0, r3

    tst     r1, #2
    beq     0f
2:  ldrh        r3, [r1], #2
    subs        r2, r2, #2
ARM_BE8(rev16       r3, r3      )
    crc32\c\()h r0, r0, r3

0:  subs        ip, r2, #8
    bpl     .Laligned8\c
    b       .Ltail\c
    .endm

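    /*
     * Judging by the register usage in __crc32 above, the C-level
     * prototypes are along the lines of:
     *
     *     u32 crc32_armv8_le(u32 crc, unsigned char const *buf, size_t len);
     *     u32 crc32c_armv8_le(u32 crc, unsigned char const *buf, size_t len);
     */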
    .align      5
ENTRY(crc32_armv8_le)
    __crc32
ENDPROC(crc32_armv8_le)

    .align      5
ENTRY(crc32c_armv8_le)
    __crc32     c
ENDPROC(crc32c_armv8_le)