0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
0004  *
0005  * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
0006  */
0007 
0008 /* included by aes-ce.S and aes-neon.S */
0009 
0010     .text
0011     .align      4
0012 
0013 #ifndef MAX_STRIDE
0014 #define MAX_STRIDE  4
0015 #endif
0016 
0017 #if MAX_STRIDE == 4
0018 #define ST4(x...) x
0019 #define ST5(x...)
0020 #else
0021 #define ST4(x...)
0022 #define ST5(x...) x
0023 #endif
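
         /*
          * MAX_STRIDE is the interleave factor: the number of blocks pushed
          * through the cipher per iteration of the Nx loops below.  ST4()
          * emits its argument only when building with a 4-way stride and
          * ST5() only with a 5-way stride, so a single copy of the code
          * serves both configurations of the including file.
          */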
0024 
0025 SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
0026     encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
0027     ret
0028 SYM_FUNC_END(aes_encrypt_block4x)
0029 
0030 SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
0031     decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
0032     ret
0033 SYM_FUNC_END(aes_decrypt_block4x)
0034 
0035 #if MAX_STRIDE == 5
0036 SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
0037     encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
0038     ret
0039 SYM_FUNC_END(aes_encrypt_block5x)
0040 
0041 SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
0042     decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
0043     ret
0044 SYM_FUNC_END(aes_decrypt_block5x)
0045 #endif
0046 
0047     /*
0048      * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0049      *         int blocks)
0050      * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0051      *         int blocks)
0052      */
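         /*
          * In ECB mode every 16-byte block is transformed independently, so
          * a rough C-level model of the encrypt routine (illustrative only,
          * with aes_encrypt standing in for a single-block cipher call) is:
          *
          *     for (i = 0; i < blocks; i++)
          *         aes_encrypt(rk, rounds, out + 16 * i, in + 16 * i);
          *
          * The code below does the same work MAX_STRIDE blocks at a time
          * while enough input remains, then falls back to one block at a
          * time.
          */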
0053 
0054 AES_FUNC_START(aes_ecb_encrypt)
0055     stp     x29, x30, [sp, #-16]!
0056     mov     x29, sp
0057 
0058     enc_prepare w3, x2, x5
0059 
0060 .LecbencloopNx:
0061     subs        w4, w4, #MAX_STRIDE
0062     bmi     .Lecbenc1x
0063     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 pt blocks */
0064 ST4(    bl      aes_encrypt_block4x     )
0065 ST5(    ld1     {v4.16b}, [x1], #16     )
0066 ST5(    bl      aes_encrypt_block5x     )
0067     st1     {v0.16b-v3.16b}, [x0], #64
0068 ST5(    st1     {v4.16b}, [x0], #16     )
0069     b       .LecbencloopNx
0070 .Lecbenc1x:
0071     adds        w4, w4, #MAX_STRIDE
0072     beq     .Lecbencout
0073 .Lecbencloop:
0074     ld1     {v0.16b}, [x1], #16     /* get next pt block */
0075     encrypt_block   v0, w3, x2, x5, w6
0076     st1     {v0.16b}, [x0], #16
0077     subs        w4, w4, #1
0078     bne     .Lecbencloop
0079 .Lecbencout:
0080     ldp     x29, x30, [sp], #16
0081     ret
0082 AES_FUNC_END(aes_ecb_encrypt)
0083 
0084 
0085 AES_FUNC_START(aes_ecb_decrypt)
0086     stp     x29, x30, [sp, #-16]!
0087     mov     x29, sp
0088 
0089     dec_prepare w3, x2, x5
0090 
0091 .LecbdecloopNx:
0092     subs        w4, w4, #MAX_STRIDE
0093     bmi     .Lecbdec1x
0094     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 ct blocks */
0095 ST4(    bl      aes_decrypt_block4x     )
0096 ST5(    ld1     {v4.16b}, [x1], #16     )
0097 ST5(    bl      aes_decrypt_block5x     )
0098     st1     {v0.16b-v3.16b}, [x0], #64
0099 ST5(    st1     {v4.16b}, [x0], #16     )
0100     b       .LecbdecloopNx
0101 .Lecbdec1x:
0102     adds        w4, w4, #MAX_STRIDE
0103     beq     .Lecbdecout
0104 .Lecbdecloop:
0105     ld1     {v0.16b}, [x1], #16     /* get next ct block */
0106     decrypt_block   v0, w3, x2, x5, w6
0107     st1     {v0.16b}, [x0], #16
0108     subs        w4, w4, #1
0109     bne     .Lecbdecloop
0110 .Lecbdecout:
0111     ldp     x29, x30, [sp], #16
0112     ret
0113 AES_FUNC_END(aes_ecb_decrypt)
0114 
0115 
0116     /*
0117      * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0118      *         int blocks, u8 iv[])
0119      * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0120      *         int blocks, u8 iv[])
0121      * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
0122      *           int rounds, int blocks, u8 iv[],
0123      *           u32 const rk2[]);
0124      * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
0125      *           int rounds, int blocks, u8 iv[],
0126      *           u32 const rk2[]);
0127      */
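         /*
          * CBC chains blocks through the IV: C[i] = E(K, P[i] ^ C[i-1]) with
          * C[-1] = iv, and conversely P[i] = D(K, C[i]) ^ C[i-1] on
          * decryption.  The ESSIV variants differ only in how C[-1] is
          * derived: the caller-supplied IV is first encrypted with the
          * second key schedule rk2 (always AES-256 here, hence the
          * hard-coded 14 rounds) before being used as the chaining value.
          */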
0128 
0129 AES_FUNC_START(aes_essiv_cbc_encrypt)
0130     ld1     {v4.16b}, [x5]          /* get iv */
0131 
0132     mov     w8, #14             /* AES-256: 14 rounds */
0133     enc_prepare w8, x6, x7
0134     encrypt_block   v4, w8, x6, x7, w9
0135     enc_switch_key  w3, x2, x6
0136     b       .Lcbcencloop4x
0137 
0138 AES_FUNC_START(aes_cbc_encrypt)
0139     ld1     {v4.16b}, [x5]          /* get iv */
0140     enc_prepare w3, x2, x6
0141 
0142 .Lcbcencloop4x:
0143     subs        w4, w4, #4
0144     bmi     .Lcbcenc1x
0145     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 pt blocks */
0146     eor     v0.16b, v0.16b, v4.16b      /* ..and xor with iv */
0147     encrypt_block   v0, w3, x2, x6, w7
0148     eor     v1.16b, v1.16b, v0.16b
0149     encrypt_block   v1, w3, x2, x6, w7
0150     eor     v2.16b, v2.16b, v1.16b
0151     encrypt_block   v2, w3, x2, x6, w7
0152     eor     v3.16b, v3.16b, v2.16b
0153     encrypt_block   v3, w3, x2, x6, w7
0154     st1     {v0.16b-v3.16b}, [x0], #64
0155     mov     v4.16b, v3.16b
0156     b       .Lcbcencloop4x
0157 .Lcbcenc1x:
0158     adds        w4, w4, #4
0159     beq     .Lcbcencout
0160 .Lcbcencloop:
0161     ld1     {v0.16b}, [x1], #16     /* get next pt block */
0162     eor     v4.16b, v4.16b, v0.16b      /* ..and xor with iv */
0163     encrypt_block   v4, w3, x2, x6, w7
0164     st1     {v4.16b}, [x0], #16
0165     subs        w4, w4, #1
0166     bne     .Lcbcencloop
0167 .Lcbcencout:
0168     st1     {v4.16b}, [x5]          /* return iv */
0169     ret
0170 AES_FUNC_END(aes_cbc_encrypt)
0171 AES_FUNC_END(aes_essiv_cbc_encrypt)
0172 
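         /*
          * CBC encryption is inherently serial (each block needs the
          * previous ciphertext block), which is why the encrypt path above
          * runs one block per cipher invocation.  Decryption only needs the
          * ciphertext as input, so the code below can push MAX_STRIDE
          * blocks through the interleaved block4x/block5x helpers at once
          * and XOR in the previous ciphertext blocks afterwards.
          */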
0173 AES_FUNC_START(aes_essiv_cbc_decrypt)
0174     stp     x29, x30, [sp, #-16]!
0175     mov     x29, sp
0176 
0177     ld1     {cbciv.16b}, [x5]       /* get iv */
0178 
0179     mov     w8, #14             /* AES-256: 14 rounds */
0180     enc_prepare w8, x6, x7
0181     encrypt_block   cbciv, w8, x6, x7, w9
0182     b       .Lessivcbcdecstart
0183 
0184 AES_FUNC_START(aes_cbc_decrypt)
0185     stp     x29, x30, [sp, #-16]!
0186     mov     x29, sp
0187 
0188     ld1     {cbciv.16b}, [x5]       /* get iv */
0189 .Lessivcbcdecstart:
0190     dec_prepare w3, x2, x6
0191 
0192 .LcbcdecloopNx:
0193     subs        w4, w4, #MAX_STRIDE
0194     bmi     .Lcbcdec1x
0195     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 ct blocks */
0196 #if MAX_STRIDE == 5
0197     ld1     {v4.16b}, [x1], #16     /* get 1 ct block */
0198     mov     v5.16b, v0.16b
0199     mov     v6.16b, v1.16b
0200     mov     v7.16b, v2.16b
0201     bl      aes_decrypt_block5x
0202     sub     x1, x1, #32
0203     eor     v0.16b, v0.16b, cbciv.16b
0204     eor     v1.16b, v1.16b, v5.16b
0205     ld1     {v5.16b}, [x1], #16     /* reload 1 ct block */
0206     ld1     {cbciv.16b}, [x1], #16      /* reload 1 ct block */
0207     eor     v2.16b, v2.16b, v6.16b
0208     eor     v3.16b, v3.16b, v7.16b
0209     eor     v4.16b, v4.16b, v5.16b
0210 #else
0211     mov     v4.16b, v0.16b
0212     mov     v5.16b, v1.16b
0213     mov     v6.16b, v2.16b
0214     bl      aes_decrypt_block4x
0215     sub     x1, x1, #16
0216     eor     v0.16b, v0.16b, cbciv.16b
0217     eor     v1.16b, v1.16b, v4.16b
0218     ld1     {cbciv.16b}, [x1], #16      /* reload 1 ct block */
0219     eor     v2.16b, v2.16b, v5.16b
0220     eor     v3.16b, v3.16b, v6.16b
0221 #endif
0222     st1     {v0.16b-v3.16b}, [x0], #64
0223 ST5(    st1     {v4.16b}, [x0], #16     )
0224     b       .LcbcdecloopNx
0225 .Lcbcdec1x:
0226     adds        w4, w4, #MAX_STRIDE
0227     beq     .Lcbcdecout
0228 .Lcbcdecloop:
0229     ld1     {v1.16b}, [x1], #16     /* get next ct block */
0230     mov     v0.16b, v1.16b          /* ...and copy to v0 */
0231     decrypt_block   v0, w3, x2, x6, w7
0232     eor     v0.16b, v0.16b, cbciv.16b   /* xor with iv => pt */
0233     mov     cbciv.16b, v1.16b       /* ct is next iv */
0234     st1     {v0.16b}, [x0], #16
0235     subs        w4, w4, #1
0236     bne     .Lcbcdecloop
0237 .Lcbcdecout:
0238     st1     {cbciv.16b}, [x5]       /* return iv */
0239     ldp     x29, x30, [sp], #16
0240     ret
0241 AES_FUNC_END(aes_cbc_decrypt)
0242 AES_FUNC_END(aes_essiv_cbc_decrypt)
0243 
0244 
0245     /*
0246      * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
0247      *             int rounds, int bytes, u8 const iv[])
0248      * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
0249      *             int rounds, int bytes, u8 const iv[])
0250      */
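         /*
          * CBC with ciphertext stealing: these helpers process the final,
          * possibly partial, pair of blocks without padding.  The
          * next-to-last ciphertext block is truncated to the length of the
          * final partial block and the two are written in swapped order, so
          * the ciphertext ends up exactly as long as the plaintext.  The
          * permute table below is what allows the partial block to be
          * shifted and masked with tbl/tbx instead of branching on its
          * length.
          */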
0251 
0252 AES_FUNC_START(aes_cbc_cts_encrypt)
0253     adr_l       x8, .Lcts_permute_table
0254     sub     x4, x4, #16
0255     add     x9, x8, #32
0256     add     x8, x8, x4
0257     sub     x9, x9, x4
0258     ld1     {v3.16b}, [x8]
0259     ld1     {v4.16b}, [x9]
0260 
0261     ld1     {v0.16b}, [x1], x4      /* overlapping loads */
0262     ld1     {v1.16b}, [x1]
0263 
0264     ld1     {v5.16b}, [x5]          /* get iv */
0265     enc_prepare w3, x2, x6
0266 
0267     eor     v0.16b, v0.16b, v5.16b      /* xor with iv */
0268     tbl     v1.16b, {v1.16b}, v4.16b
0269     encrypt_block   v0, w3, x2, x6, w7
0270 
0271     eor     v1.16b, v1.16b, v0.16b
0272     tbl     v0.16b, {v0.16b}, v3.16b
0273     encrypt_block   v1, w3, x2, x6, w7
0274 
0275     add     x4, x0, x4
0276     st1     {v0.16b}, [x4]          /* overlapping stores */
0277     st1     {v1.16b}, [x0]
0278     ret
0279 AES_FUNC_END(aes_cbc_cts_encrypt)
0280 
0281 AES_FUNC_START(aes_cbc_cts_decrypt)
0282     adr_l       x8, .Lcts_permute_table
0283     sub     x4, x4, #16
0284     add     x9, x8, #32
0285     add     x8, x8, x4
0286     sub     x9, x9, x4
0287     ld1     {v3.16b}, [x8]
0288     ld1     {v4.16b}, [x9]
0289 
0290     ld1     {v0.16b}, [x1], x4      /* overlapping loads */
0291     ld1     {v1.16b}, [x1]
0292 
0293     ld1     {v5.16b}, [x5]          /* get iv */
0294     dec_prepare w3, x2, x6
0295 
0296     decrypt_block   v0, w3, x2, x6, w7
0297     tbl     v2.16b, {v0.16b}, v3.16b
0298     eor     v2.16b, v2.16b, v1.16b
0299 
0300     tbx     v0.16b, {v1.16b}, v4.16b
0301     decrypt_block   v0, w3, x2, x6, w7
0302     eor     v0.16b, v0.16b, v5.16b      /* xor with iv */
0303 
0304     add     x4, x0, x4
0305     st1     {v2.16b}, [x4]          /* overlapping stores */
0306     st1     {v0.16b}, [x0]
0307     ret
0308 AES_FUNC_END(aes_cbc_cts_decrypt)
0309 
0310     .section    ".rodata", "a"
0311     .align      6
0312 .Lcts_permute_table:
0313     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0314     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0315     .byte        0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
0316     .byte        0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
0317     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0318     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0319     .previous
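
         /*
          * The table works because tbl returns zero, and tbx leaves the
          * destination byte unchanged, for any index byte outside the
          * source range; 0xff always qualifies.  Indexing into the table at
          * an offset derived from the tail length therefore produces a
          * vector that shifts the valid bytes into position and masks off
          * the rest, which is how the CTS and CTR tail code avoids
          * length-dependent branches.
          */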
0320 
0321     /*
0322      * This macro generates the code for CTR and XCTR mode.
0323      */
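         /*
          * In both modes the keystream is the encryption of a per-block
          * counter: CTR treats the whole IV as a 128-bit big-endian counter
          * that is incremented for every block, while XCTR (as used by
          * HCTR2) XORs a little-endian block counter into the first 64 bits
          * of the IV, i.e. roughly E(K, IV ^ le128(i)) for the i-th block,
          * counting from 1.
          */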
0324 .macro ctr_encrypt xctr
0325     // Arguments
0326     OUT     .req x0
0327     IN      .req x1
0328     KEY     .req x2
0329     ROUNDS_W    .req w3
0330     BYTES_W     .req w4
0331     IV      .req x5
0332     BYTE_CTR_W  .req w6     // XCTR only
0333     // Intermediate values
0334     CTR_W       .req w11    // XCTR only
0335     CTR     .req x11    // XCTR only
0336     IV_PART     .req x12
0337     BLOCKS      .req x13
0338     BLOCKS_W    .req w13
0339 
0340     stp     x29, x30, [sp, #-16]!
0341     mov     x29, sp
0342 
0343     enc_prepare ROUNDS_W, KEY, IV_PART
0344     ld1     {vctr.16b}, [IV]
0345 
0346     /*
0347      * Keep 64 bits of the IV in a register.  For CTR mode this lets us
0348      * easily increment the IV.  For XCTR mode this lets us efficiently XOR
0349      * the 64-bit counter with the IV.
0350      */
0351     .if \xctr
0352         umov        IV_PART, vctr.d[0]
0353         lsr     CTR_W, BYTE_CTR_W, #4
0354     .else
0355         umov        IV_PART, vctr.d[1]
0356         rev     IV_PART, IV_PART
0357     .endif
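         /*
          * Note the asymmetry: the CTR counter is stored big-endian inside
          * the block, so it is byte-reversed (rev) into a native integer
          * for the arithmetic below and reversed again whenever it is
          * inserted back into a counter block, whereas the XCTR counter is
          * little-endian and needs no swapping.
          */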
0358 
0359 .LctrloopNx\xctr:
0360     add     BLOCKS_W, BYTES_W, #15
0361     sub     BYTES_W, BYTES_W, #MAX_STRIDE << 4
0362     lsr     BLOCKS_W, BLOCKS_W, #4
0363     mov     w8, #MAX_STRIDE
0364     cmp     BLOCKS_W, w8
0365     csel        BLOCKS_W, BLOCKS_W, w8, lt
0366 
0367     /*
0368      * Set up the counter values in v0-v{MAX_STRIDE-1}.
0369      *
0370      * If we are encrypting less than MAX_STRIDE blocks, the tail block
0371      * handling code expects the last keystream block to be in
0372      * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
0373      * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
0374      */
0375     .if \xctr
0376         add     CTR, CTR, BLOCKS
0377     .else
0378         adds        IV_PART, IV_PART, BLOCKS
0379     .endif
0380     mov     v0.16b, vctr.16b
0381     mov     v1.16b, vctr.16b
0382     mov     v2.16b, vctr.16b
0383     mov     v3.16b, vctr.16b
0384 ST5(    mov     v4.16b, vctr.16b        )
0385     .if \xctr
0386         sub     x6, CTR, #MAX_STRIDE - 1
0387         sub     x7, CTR, #MAX_STRIDE - 2
0388         sub     x8, CTR, #MAX_STRIDE - 3
0389         sub     x9, CTR, #MAX_STRIDE - 4
0390 ST5(        sub     x10, CTR, #MAX_STRIDE - 5   )
0391         eor     x6, x6, IV_PART
0392         eor     x7, x7, IV_PART
0393         eor     x8, x8, IV_PART
0394         eor     x9, x9, IV_PART
0395 ST5(        eor     x10, x10, IV_PART       )
0396         mov     v0.d[0], x6
0397         mov     v1.d[0], x7
0398         mov     v2.d[0], x8
0399         mov     v3.d[0], x9
0400 ST5(        mov     v4.d[0], x10            )
0401     .else
0402         bcs     0f
0403         .subsection 1
0404         /*
0405          * This subsection handles carries.
0406          *
0407          * Conditional branching here is allowed with respect to time
0408          * invariance since the branches are dependent on the IV instead
0409          * of the plaintext or key.  This code is rarely executed in
0410          * practice anyway.
0411          */
0412 
0413         /* Apply carry to outgoing counter. */
0414 0:      umov        x8, vctr.d[0]
0415         rev     x8, x8
0416         add     x8, x8, #1
0417         rev     x8, x8
0418         ins     vctr.d[0], x8
0419 
0420         /*
0421          * Apply carry to counter blocks if needed.
0422          *
0423          * Since the carry flag was set, we know 0 <= IV_PART <
0424          * MAX_STRIDE.  Using the value of IV_PART we can determine how
0425          * many counter blocks need to be updated.
0426          */
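             /*
              * The computed branch below acts as a jump table: each entry
              * is a BTI landing pad plus one element move, i.e. eight
              * bytes, so branching to (1f - 8 * IV_PART) executes exactly
              * the last IV_PART entries and copies the freshly incremented
              * upper counter word into just those counter blocks whose low
              * 64 bits wrapped around.
              */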
0427         cbz     IV_PART, 2f
0428         adr     x16, 1f
0429         sub     x16, x16, IV_PART, lsl #3
0430         br      x16
0431         bti     c
0432         mov     v0.d[0], vctr.d[0]
0433         bti     c
0434         mov     v1.d[0], vctr.d[0]
0435         bti     c
0436         mov     v2.d[0], vctr.d[0]
0437         bti     c
0438         mov     v3.d[0], vctr.d[0]
0439 ST5(        bti     c               )
0440 ST5(        mov     v4.d[0], vctr.d[0]      )
0441 1:      b       2f
0442         .previous
0443 
0444 2:      rev     x7, IV_PART
0445         ins     vctr.d[1], x7
0446         sub     x7, IV_PART, #MAX_STRIDE - 1
0447         sub     x8, IV_PART, #MAX_STRIDE - 2
0448         sub     x9, IV_PART, #MAX_STRIDE - 3
0449         rev     x7, x7
0450         rev     x8, x8
0451         mov     v1.d[1], x7
0452         rev     x9, x9
0453 ST5(        sub     x10, IV_PART, #MAX_STRIDE - 4   )
0454         mov     v2.d[1], x8
0455 ST5(        rev     x10, x10            )
0456         mov     v3.d[1], x9
0457 ST5(        mov     v4.d[1], x10            )
0458     .endif
0459 
0460     /*
0461      * If there are at least MAX_STRIDE blocks left, XOR the data with
0462      * keystream and store.  Otherwise jump to tail handling.
0463      */
0464     tbnz        BYTES_W, #31, .Lctrtail\xctr
0465     ld1     {v5.16b-v7.16b}, [IN], #48
0466 ST4(    bl      aes_encrypt_block4x     )
0467 ST5(    bl      aes_encrypt_block5x     )
0468     eor     v0.16b, v5.16b, v0.16b
0469 ST4(    ld1     {v5.16b}, [IN], #16     )
0470     eor     v1.16b, v6.16b, v1.16b
0471 ST5(    ld1     {v5.16b-v6.16b}, [IN], #32  )
0472     eor     v2.16b, v7.16b, v2.16b
0473     eor     v3.16b, v5.16b, v3.16b
0474 ST5(    eor     v4.16b, v6.16b, v4.16b      )
0475     st1     {v0.16b-v3.16b}, [OUT], #64
0476 ST5(    st1     {v4.16b}, [OUT], #16        )
0477     cbz     BYTES_W, .Lctrout\xctr
0478     b       .LctrloopNx\xctr
0479 
0480 .Lctrout\xctr:
0481     .if !\xctr
0482         st1     {vctr.16b}, [IV] /* return next CTR value */
0483     .endif
0484     ldp     x29, x30, [sp], #16
0485     ret
0486 
0487 .Lctrtail\xctr:
0488     /*
0489      * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
0490      *
0491      * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
0492      * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
0493      * v4 should have the next two counter blocks.
0494      *
0495      * This allows us to store the ciphertext by writing to overlapping
0496      * regions of memory.  Any invalid ciphertext blocks get overwritten by
0497      * correctly computed blocks.  This approach greatly simplifies the
0498      * logic for storing the ciphertext.
0499      */
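         /*
          * Concretely, when the tail ends in one full block followed by
          * t (1..15) extra bytes, the partial block is stored first at
          * OUT + t, which puts its t valid bytes (the last t lanes of the
          * vector) in the last t output positions, and the preceding full
          * block is stored last at OUT, overwriting the 16 - t bytes of
          * the earlier store that carried no meaningful data.
          */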
0500     mov     x16, #16
0501     ands        w7, BYTES_W, #0xf
0502     csel        x13, x7, x16, ne
0503 
0504 ST5(    cmp     BYTES_W, #64 - (MAX_STRIDE << 4))
0505 ST5(    csel        x14, x16, xzr, gt       )
0506     cmp     BYTES_W, #48 - (MAX_STRIDE << 4)
0507     csel        x15, x16, xzr, gt
0508     cmp     BYTES_W, #32 - (MAX_STRIDE << 4)
0509     csel        x16, x16, xzr, gt
0510     cmp     BYTES_W, #16 - (MAX_STRIDE << 4)
0511 
0512     adr_l       x9, .Lcts_permute_table
0513     add     x9, x9, x13
0514     ble     .Lctrtail1x\xctr
0515 
0516 ST5(    ld1     {v5.16b}, [IN], x14     )
0517     ld1     {v6.16b}, [IN], x15
0518     ld1     {v7.16b}, [IN], x16
0519 
0520 ST4(    bl      aes_encrypt_block4x     )
0521 ST5(    bl      aes_encrypt_block5x     )
0522 
0523     ld1     {v8.16b}, [IN], x13
0524     ld1     {v9.16b}, [IN]
0525     ld1     {v10.16b}, [x9]
0526 
0527 ST4(    eor     v6.16b, v6.16b, v0.16b      )
0528 ST4(    eor     v7.16b, v7.16b, v1.16b      )
0529 ST4(    tbl     v3.16b, {v3.16b}, v10.16b   )
0530 ST4(    eor     v8.16b, v8.16b, v2.16b      )
0531 ST4(    eor     v9.16b, v9.16b, v3.16b      )
0532 
0533 ST5(    eor     v5.16b, v5.16b, v0.16b      )
0534 ST5(    eor     v6.16b, v6.16b, v1.16b      )
0535 ST5(    tbl     v4.16b, {v4.16b}, v10.16b   )
0536 ST5(    eor     v7.16b, v7.16b, v2.16b      )
0537 ST5(    eor     v8.16b, v8.16b, v3.16b      )
0538 ST5(    eor     v9.16b, v9.16b, v4.16b      )
0539 
0540 ST5(    st1     {v5.16b}, [OUT], x14        )
0541     st1     {v6.16b}, [OUT], x15
0542     st1     {v7.16b}, [OUT], x16
0543     add     x13, x13, OUT
0544     st1     {v9.16b}, [x13]     // overlapping stores
0545     st1     {v8.16b}, [OUT]
0546     b       .Lctrout\xctr
0547 
0548 .Lctrtail1x\xctr:
0549     /*
0550      * Handle <= 16 bytes of plaintext
0551      *
0552      * This code always reads and writes 16 bytes.  To avoid out of bounds
0553      * accesses, XCTR and CTR modes must use a temporary buffer when
0554      * encrypting/decrypting less than 16 bytes.
0555      *
0556      * This code is unusual in that it loads the input and stores the output
0557      * relative to the end of the buffers rather than relative to the start.
0558      * This causes unusual behaviour when encrypting/decrypting less than 16
0559      * bytes; the end of the data is expected to be at the end of the
0560      * temporary buffer rather than the start of the data being at the start
0561      * of the temporary buffer.
0562      */
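         /*
          * The final store is made safe by a byte-wise select rather than
          * a partial store: the arithmetic shift turns v11 into a mask of
          * 0x00/0xff lanes, and bif keeps the freshly encrypted bytes where
          * the mask is set while putting back the bytes previously loaded
          * from OUT everywhere else, so the full 16-byte write leaves data
          * outside the message with its original value.
          */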
0563     sub     x8, x7, #16
0564     csel        x7, x7, x8, eq
0565     add     IN, IN, x7
0566     add     OUT, OUT, x7
0567     ld1     {v5.16b}, [IN]
0568     ld1     {v6.16b}, [OUT]
0569 ST5(    mov     v3.16b, v4.16b          )
0570     encrypt_block   v3, ROUNDS_W, KEY, x8, w7
0571     ld1     {v10.16b-v11.16b}, [x9]
0572     tbl     v3.16b, {v3.16b}, v10.16b
0573     sshr        v11.16b, v11.16b, #7
0574     eor     v5.16b, v5.16b, v3.16b
0575     bif     v5.16b, v6.16b, v11.16b
0576     st1     {v5.16b}, [OUT]
0577     b       .Lctrout\xctr
0578 
0579     // Arguments
0580     .unreq OUT
0581     .unreq IN
0582     .unreq KEY
0583     .unreq ROUNDS_W
0584     .unreq BYTES_W
0585     .unreq IV
0586     .unreq BYTE_CTR_W   // XCTR only
0587     // Intermediate values
0588     .unreq CTR_W        // XCTR only
0589     .unreq CTR      // XCTR only
0590     .unreq IV_PART
0591     .unreq BLOCKS
0592     .unreq BLOCKS_W
0593 .endm
0594 
0595     /*
0596      * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0597      *         int bytes, u8 ctr[])
0598      *
0599      * The input and output buffers must always be at least 16 bytes even if
0600      * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
0601      * accesses will occur.  The data to be encrypted/decrypted is expected
0602      * to be at the end of this 16-byte temporary buffer rather than the
0603      * start.
0604      */
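         /*
          * A caller encrypting such a short tail is expected to stage it at
          * the end of a 16-byte buffer and pass a pointer to the data
          * itself, along these lines (illustrative C sketch only):
          *
          *     u8 buf[16];
          *     u8 *p = memcpy(buf + sizeof(buf) - nbytes, src, nbytes);
          *
          *     aes_ctr_encrypt(p, p, rk, rounds, nbytes, ctr);
          *     memcpy(dst, p, nbytes);
          *
          * so that the full 16-byte loads and stores issued by the tail
          * handling stay inside buf.
          */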
0605 
0606 AES_FUNC_START(aes_ctr_encrypt)
0607     ctr_encrypt 0
0608 AES_FUNC_END(aes_ctr_encrypt)
0609 
0610     /*
0611      * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0612      *         int bytes, u8 const iv[], int byte_ctr)
0613      *
0614      * The input and output buffers must always be at least 16 bytes even if
0615      * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
0616      * accesses will occur.  The data to be encrypted/decrypted is expected
0617      * to be at the end of this 16-byte temporary buffer rather than the
0618      * start.
0619      */
0620 
0621 AES_FUNC_START(aes_xctr_encrypt)
0622     ctr_encrypt 1
0623 AES_FUNC_END(aes_xctr_encrypt)
0624 
0625 
0626     /*
0627      * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
0628      *         int bytes, u8 const rk2[], u8 iv[], int first)
0629      * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
0630      *         int bytes, u8 const rk2[], u8 iv[], int first)
0631      */
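         /*
          * XTS encrypts each block under a per-block tweak T[i]:
          *
          *     T[0]   = E(K2, iv)              (computed when 'first' is set)
          *     T[i+1] = T[i] * x               (multiplication in GF(2^128))
          *     C[i]   = E(K1, P[i] ^ T[i]) ^ T[i]
          *
          * and decryption is the same with D(K1, .) in place of E(K1, .).
          * Message lengths that are not a multiple of 16 bytes are handled
          * with ciphertext stealing, reusing .Lcts_permute_table above.
          */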
0632 
0633     .macro      next_tweak, out, in, tmp
0634     sshr        \tmp\().2d,  \in\().2d,   #63
0635     and     \tmp\().16b, \tmp\().16b, xtsmask.16b
0636     add     \out\().2d,  \in\().2d,   \in\().2d
0637     ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
0638     eor     \out\().16b, \out\().16b, \tmp\().16b
0639     .endm
0640 
0641     .macro      xts_load_mask, tmp
0642     movi        xtsmask.2s, #0x1
0643     movi        \tmp\().2s, #0x87
0644     uzp1        xtsmask.4s, xtsmask.4s, \tmp\().4s
0645     .endm
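
         /*
          * next_tweak computes T * x in GF(2^128) modulo
          * x^128 + x^7 + x^2 + x + 1: the add doubles both 64-bit halves,
          * the arithmetic shift turns each half's top bit into an all-ones
          * mask, and after AND-ing with xtsmask (0x1 in the low half, 0x87
          * in the high half, as set up by xts_load_mask) and swapping the
          * halves with ext, the eor carries the low half's overflow into
          * bit 64 and folds the high half's overflow back in as 0x87.
          */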
0646 
0647 AES_FUNC_START(aes_xts_encrypt)
0648     stp     x29, x30, [sp, #-16]!
0649     mov     x29, sp
0650 
0651     ld1     {v4.16b}, [x6]
0652     xts_load_mask   v8
0653     cbz     w7, .Lxtsencnotfirst
0654 
0655     enc_prepare w3, x5, x8
0656     xts_cts_skip_tw w7, .LxtsencNx
0657     encrypt_block   v4, w3, x5, x8, w7      /* first tweak */
0658     enc_switch_key  w3, x2, x8
0659     b       .LxtsencNx
0660 
0661 .Lxtsencnotfirst:
0662     enc_prepare w3, x2, x8
0663 .LxtsencloopNx:
0664     next_tweak  v4, v4, v8
0665 .LxtsencNx:
0666     subs        w4, w4, #64
0667     bmi     .Lxtsenc1x
0668     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 pt blocks */
0669     next_tweak  v5, v4, v8
0670     eor     v0.16b, v0.16b, v4.16b
0671     next_tweak  v6, v5, v8
0672     eor     v1.16b, v1.16b, v5.16b
0673     eor     v2.16b, v2.16b, v6.16b
0674     next_tweak  v7, v6, v8
0675     eor     v3.16b, v3.16b, v7.16b
0676     bl      aes_encrypt_block4x
0677     eor     v3.16b, v3.16b, v7.16b
0678     eor     v0.16b, v0.16b, v4.16b
0679     eor     v1.16b, v1.16b, v5.16b
0680     eor     v2.16b, v2.16b, v6.16b
0681     st1     {v0.16b-v3.16b}, [x0], #64
0682     mov     v4.16b, v7.16b
0683     cbz     w4, .Lxtsencret
0684     xts_reload_mask v8
0685     b       .LxtsencloopNx
0686 .Lxtsenc1x:
0687     adds        w4, w4, #64
0688     beq     .Lxtsencout
0689     subs        w4, w4, #16
0690     bmi     .LxtsencctsNx
0691 .Lxtsencloop:
0692     ld1     {v0.16b}, [x1], #16
0693 .Lxtsencctsout:
0694     eor     v0.16b, v0.16b, v4.16b
0695     encrypt_block   v0, w3, x2, x8, w7
0696     eor     v0.16b, v0.16b, v4.16b
0697     cbz     w4, .Lxtsencout
0698     subs        w4, w4, #16
0699     next_tweak  v4, v4, v8
0700     bmi     .Lxtsenccts
0701     st1     {v0.16b}, [x0], #16
0702     b       .Lxtsencloop
0703 .Lxtsencout:
0704     st1     {v0.16b}, [x0]
0705 .Lxtsencret:
0706     st1     {v4.16b}, [x6]
0707     ldp     x29, x30, [sp], #16
0708     ret
0709 
0710 .LxtsencctsNx:
0711     mov     v0.16b, v3.16b
0712     sub     x0, x0, #16
0713 .Lxtsenccts:
0714     adr_l       x8, .Lcts_permute_table
0715 
0716     add     x1, x1, w4, sxtw    /* rewind input pointer */
0717     add     w4, w4, #16     /* # bytes in final block */
0718     add     x9, x8, #32
0719     add     x8, x8, x4
0720     sub     x9, x9, x4
0721     add     x4, x0, x4      /* output address of final block */
0722 
0723     ld1     {v1.16b}, [x1]      /* load final block */
0724     ld1     {v2.16b}, [x8]
0725     ld1     {v3.16b}, [x9]
0726 
0727     tbl     v2.16b, {v0.16b}, v2.16b
0728     tbx     v0.16b, {v1.16b}, v3.16b
0729     st1     {v2.16b}, [x4]          /* overlapping stores */
0730     mov     w4, wzr
0731     b       .Lxtsencctsout
0732 AES_FUNC_END(aes_xts_encrypt)
0733 
0734 AES_FUNC_START(aes_xts_decrypt)
0735     stp     x29, x30, [sp, #-16]!
0736     mov     x29, sp
0737 
0738     /* subtract 16 bytes if we are doing CTS */
0739     sub     w8, w4, #0x10
0740     tst     w4, #0xf
0741     csel        w4, w4, w8, eq
0742 
0743     ld1     {v4.16b}, [x6]
0744     xts_load_mask   v8
0745     xts_cts_skip_tw w7, .Lxtsdecskiptw
0746     cbz     w7, .Lxtsdecnotfirst
0747 
0748     enc_prepare w3, x5, x8
0749     encrypt_block   v4, w3, x5, x8, w7      /* first tweak */
0750 .Lxtsdecskiptw:
0751     dec_prepare w3, x2, x8
0752     b       .LxtsdecNx
0753 
0754 .Lxtsdecnotfirst:
0755     dec_prepare w3, x2, x8
0756 .LxtsdecloopNx:
0757     next_tweak  v4, v4, v8
0758 .LxtsdecNx:
0759     subs        w4, w4, #64
0760     bmi     .Lxtsdec1x
0761     ld1     {v0.16b-v3.16b}, [x1], #64  /* get 4 ct blocks */
0762     next_tweak  v5, v4, v8
0763     eor     v0.16b, v0.16b, v4.16b
0764     next_tweak  v6, v5, v8
0765     eor     v1.16b, v1.16b, v5.16b
0766     eor     v2.16b, v2.16b, v6.16b
0767     next_tweak  v7, v6, v8
0768     eor     v3.16b, v3.16b, v7.16b
0769     bl      aes_decrypt_block4x
0770     eor     v3.16b, v3.16b, v7.16b
0771     eor     v0.16b, v0.16b, v4.16b
0772     eor     v1.16b, v1.16b, v5.16b
0773     eor     v2.16b, v2.16b, v6.16b
0774     st1     {v0.16b-v3.16b}, [x0], #64
0775     mov     v4.16b, v7.16b
0776     cbz     w4, .Lxtsdecout
0777     xts_reload_mask v8
0778     b       .LxtsdecloopNx
0779 .Lxtsdec1x:
0780     adds        w4, w4, #64
0781     beq     .Lxtsdecout
0782     subs        w4, w4, #16
0783 .Lxtsdecloop:
0784     ld1     {v0.16b}, [x1], #16
0785     bmi     .Lxtsdeccts
0786 .Lxtsdecctsout:
0787     eor     v0.16b, v0.16b, v4.16b
0788     decrypt_block   v0, w3, x2, x8, w7
0789     eor     v0.16b, v0.16b, v4.16b
0790     st1     {v0.16b}, [x0], #16
0791     cbz     w4, .Lxtsdecout
0792     subs        w4, w4, #16
0793     next_tweak  v4, v4, v8
0794     b       .Lxtsdecloop
0795 .Lxtsdecout:
0796     st1     {v4.16b}, [x6]
0797     ldp     x29, x30, [sp], #16
0798     ret
0799 
0800 .Lxtsdeccts:
0801     adr_l       x8, .Lcts_permute_table
0802 
0803     add     x1, x1, w4, sxtw    /* rewind input pointer */
0804     add     w4, w4, #16     /* # bytes in final block */
0805     add     x9, x8, #32
0806     add     x8, x8, x4
0807     sub     x9, x9, x4
0808     add     x4, x0, x4      /* output address of final block */
0809 
0810     next_tweak  v5, v4, v8
0811 
0812     ld1     {v1.16b}, [x1]      /* load final block */
0813     ld1     {v2.16b}, [x8]
0814     ld1     {v3.16b}, [x9]
0815 
0816     eor     v0.16b, v0.16b, v5.16b
0817     decrypt_block   v0, w3, x2, x8, w7
0818     eor     v0.16b, v0.16b, v5.16b
0819 
0820     tbl     v2.16b, {v0.16b}, v2.16b
0821     tbx     v0.16b, {v1.16b}, v3.16b
0822 
0823     st1     {v2.16b}, [x4]          /* overlapping stores */
0824     mov     w4, wzr
0825     b       .Lxtsdecctsout
0826 AES_FUNC_END(aes_xts_decrypt)
0827 
0828     /*
0829      * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
0830      *        int blocks, u8 dg[], int enc_before, int enc_after)
0831      */
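         /*
          * This is the CBC-MAC core: for every input block,
          * dg = E(K, dg ^ block).  'enc_before' requests one extra
          * encryption of dg before the first block is absorbed and
          * 'enc_after' requests the final encryption once the last block
          * has been XORed in.  The return value is the number of blocks
          * still to be processed, which is non-zero only when the routine
          * yields early via cond_yield.
          */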
0832 AES_FUNC_START(aes_mac_update)
0833     ld1     {v0.16b}, [x4]          /* get dg */
0834     enc_prepare w2, x1, x7
0835     cbz     w5, .Lmacloop4x
0836 
0837     encrypt_block   v0, w2, x1, x7, w8
0838 
0839 .Lmacloop4x:
0840     subs        w3, w3, #4
0841     bmi     .Lmac1x
0842     ld1     {v1.16b-v4.16b}, [x0], #64  /* get next pt block */
0843     eor     v0.16b, v0.16b, v1.16b      /* ..and xor with dg */
0844     encrypt_block   v0, w2, x1, x7, w8
0845     eor     v0.16b, v0.16b, v2.16b
0846     encrypt_block   v0, w2, x1, x7, w8
0847     eor     v0.16b, v0.16b, v3.16b
0848     encrypt_block   v0, w2, x1, x7, w8
0849     eor     v0.16b, v0.16b, v4.16b
0850     cmp     w3, wzr
0851     csinv       x5, x6, xzr, eq
0852     cbz     w5, .Lmacout
0853     encrypt_block   v0, w2, x1, x7, w8
0854     st1     {v0.16b}, [x4]          /* return dg */
0855     cond_yield  .Lmacout, x7, x8
0856     b       .Lmacloop4x
0857 .Lmac1x:
0858     add     w3, w3, #4
0859 .Lmacloop:
0860     cbz     w3, .Lmacout
0861     ld1     {v1.16b}, [x0], #16     /* get next pt block */
0862     eor     v0.16b, v0.16b, v1.16b      /* ..and xor with dg */
0863 
0864     subs        w3, w3, #1
0865     csinv       x5, x6, xzr, eq
0866     cbz     w5, .Lmacout
0867 
0868 .Lmacenc:
0869     encrypt_block   v0, w2, x1, x7, w8
0870     b       .Lmacloop
0871 
0872 .Lmacout:
0873     st1     {v0.16b}, [x4]          /* return dg */
0874     mov     w0, w3
0875     ret
0876 AES_FUNC_END(aes_mac_update)