/*
 * Table-based AES cipher for arm64, accelerated with NEON permute
 * instructions: S-box lookups are performed with tbl/tbx against
 * registers preloaded with the full 256-byte (inverse) S-box.
 *
 * The block mode routines come from aes-modes.S (included below) and
 * are instantiated here with a neon_ symbol prefix.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)	SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)	SYM_FUNC_END(neon_ ## func)

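	/* register aliases referenced by the mode routines in aes-modes.S */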
	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

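	/* special case for the neon-bs driver calling into this one for CTS */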
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm

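	/* multiply by polynomial 'x' in GF(2^8) */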
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7			/* replicate MSB: 0xff where in >= 0x80 */
	shl		\out, \in, #1			/* multiply by x */
	and		\temp, \temp, \const		/* 0x1b where the MSB was set */
	eor		\out, \out, \temp		/* reduce mod x^8 + x^4 + x^3 + x + 1 */
	.endm

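	/* multiply by polynomial 'x^2' in GF(2^8) */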
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6			/* top two bits select the reduction */
	shl		\out, \in, #2			/* multiply by x^2 */
	pmul		\temp, \temp, \const		/* carryless multiply by 0x1b */
	eor		\out, \out, \temp		/* fold the reduction back in */
	.endm

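	/* preload the entire Sbox */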
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm

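	/* do preload for encryption */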
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

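	/* do preload for decryption */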
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

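	/* apply SubBytes transformation using the preloaded Sbox */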
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b	/* v15 == #0x40: step to next Sbox quarter */
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

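	/* apply MixColumns transformation */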
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by x^2 */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h			/* rotate each 32-bit word by 16 */
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b	/* rotate each 32-bit word by 8 */
	eor		\in\().16b, \in\().16b, v8.16b
	.endm

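	/*
	 * Encrypt/decrypt a single AES block: AddRoundKey, ShiftRows and
	 * SubBytes every round, MixColumns on all but the final round.
	 */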
	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns	\in, \enc
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the ones above,
	 * but applied to AES states of 4 blocks at a time.
	 */

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

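	/* 2-way interleaved version of mul_by_x */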
	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

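	/* 2-way interleaved version of mul_by_x2 */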
	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

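	/* 2-way interleaved version of mix_columns */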
	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by x^2 */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm

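	/* 4-way interleaved version of do_block, sharing one round key load */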
	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

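	/*
	 * Permute vectors for tbl: the (Inv)ShiftRows byte order, and a
	 * rotate-right-by-8 within each 32-bit word (used by MixColumns).
	 */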
	.section	".rodata", "a"
	.align		4
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201