arm/crypto/aes-neonbs-core.S

0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * Bit sliced AES using NEON instructions
0004  *
0005  * Copyright (C) 2017 Linaro Ltd.
0006  * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
0007  */
0008
0009 /*
0010  * The algorithm implemented here is described in detail by the paper
0011  * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
0012  * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
0013  *
0014  * This implementation is based primarily on the OpenSSL implementation
0015  * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
0016  */
0017
0018 #include <linux/linkage.h>
0019 #include <asm/assembler.h>
0020
0021     .text
0022     .fpu        neon
0023
         // ip is used as the round counter by the 8-way encrypt/decrypt cores
0024     rounds      .req    ip
         // r4 is the pointer that walks the bit-sliced key schedule
0025     bskey       .req    r4
0026
         /*
          * Aliases for the low (l) and high (h) 64-bit halves of q0-q15. They
          * let macros derive a D register name from a Q register argument by
          * appending l or h (e.g. \out\()l in __tbl).
          */
0027     q0l     .req    d0
0028     q0h     .req    d1
0029     q1l     .req    d2
0030     q1h     .req    d3
0031     q2l     .req    d4
0032     q2h     .req    d5
0033     q3l     .req    d6
0034     q3h     .req    d7
0035     q4l     .req    d8
0036     q4h     .req    d9
0037     q5l     .req    d10
0038     q5h     .req    d11
0039     q6l     .req    d12
0040     q6h     .req    d13
0041     q7l     .req    d14
0042     q7h     .req    d15
0043     q8l     .req    d16
0044     q8h     .req    d17
0045     q9l     .req    d18
0046     q9h     .req    d19
0047     q10l        .req    d20
0048     q10h        .req    d21
0049     q11l        .req    d22
0050     q11h        .req    d23
0051     q12l        .req    d24
0052     q12h        .req    d25
0053     q13l        .req    d26
0054     q13h        .req    d27
0055     q14l        .req    d28
0056     q14h        .req    d29
0057     q15l        .req    d30
0058     q15h        .req    d31
0059
/*
 * __tbl - 16-byte table lookup: \out = bytes of \tbl selected by \in
 *
 *   out  Q register receiving the result
 *   tbl  Q register used as the 16-byte lookup table
 *   in   Q register holding the byte indices
 *   tmp  scratch Q register; only required when \out and \tbl are the same
 *        register (assembly stops with .error if it is missing in that case)
 *
 * The lookup is performed as two 64-bit VTBL operations. When \out aliases
 * \tbl, writing the low half would corrupt the table before the high half
 * is looked up, so the table is copied to \tmp first and the second lookup
 * reads from that copy.
 */
0060     .macro      __tbl, out, tbl, in, tmp
0061     .ifc        \out, \tbl
0062     .ifb        \tmp
0063     .error      __tbl needs temp register if out == tbl
0064     .endif
0065     vmov        \tmp, \out
0066     .endif
0067     vtbl.8      \out\()l, {\tbl}, \in\()l
0068     .ifc        \out, \tbl
0069     vtbl.8      \out\()h, {\tmp}, \in\()h
0070     .else
0071     vtbl.8      \out\()h, {\tbl}, \in\()h
0072     .endif
0073     .endm
0074
/*
 * __ldr - load the 16-byte constant at \sym into Q register \out using two
 * 64-bit VLDR loads (low half from \sym, high half from \sym + 8).
 */
0075     .macro      __ldr, out, sym
0076     vldr        \out\()l, \sym
0077     vldr        \out\()h, \sym + 8
0078     .endm
0079
/*
 * in_bs_ch - first stage of the sbox macro: an in-place, XOR-only mixing of
 * the eight registers \b0-\b7. No temporaries are used.
 * NOTE(review): presumably the input basis change of the bit-sliced S-box
 * from the Kaesper/Schwabe paper cited in the file header - confirm.
 */
0080     .macro      in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
0081     veor        \b2, \b2, \b1
0082     veor        \b5, \b5, \b6
0083     veor        \b3, \b3, \b0
0084     veor        \b6, \b6, \b2
0085     veor        \b5, \b5, \b0
0086     veor        \b6, \b6, \b3
0087     veor        \b3, \b3, \b7
0088     veor        \b7, \b7, \b5
0089     veor        \b3, \b3, \b4
0090     veor        \b4, \b4, \b5
0091     veor        \b2, \b2, \b7
0092     veor        \b3, \b3, \b1
0093     veor        \b1, \b1, \b5
0094     .endm
0095
/*
 * out_bs_ch - last stage of the sbox macro: an in-place, XOR-only mixing of
 * the eight registers \b0-\b7. No temporaries are used.
 * NOTE(review): presumably the output basis change of the bit-sliced S-box
 * from the Kaesper/Schwabe paper cited in the file header - confirm.
 */
0096     .macro      out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
0097     veor        \b0, \b0, \b6
0098     veor        \b1, \b1, \b4
0099     veor        \b4, \b4, \b6
0100     veor        \b2, \b2, \b0
0101     veor        \b6, \b6, \b1
0102     veor        \b1, \b1, \b5
0103     veor        \b5, \b5, \b3
0104     veor        \b3, \b3, \b7
0105     veor        \b7, \b7, \b5
0106     veor        \b2, \b2, \b5
0107     veor        \b4, \b4, \b7
0108     .endm
0109
/*
 * inv_in_bs_ch - first stage of the inv_sbox macro: an in-place, XOR-only
 * mixing of eight registers. No temporaries are used.
 *
 * Note that the formal parameters are deliberately declared in the order
 * b6, b1, b2, b4, b7, b0, b3, b5: arguments are bound by position, so the
 * first argument is referred to as \b6 in the body, the second as \b1, etc.
 */
0110     .macro      inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
0111     veor        \b1, \b1, \b7
0112     veor        \b4, \b4, \b7
0113     veor        \b7, \b7, \b5
0114     veor        \b1, \b1, \b3
0115     veor        \b2, \b2, \b5
0116     veor        \b3, \b3, \b7
0117     veor        \b6, \b6, \b1
0118     veor        \b2, \b2, \b0
0119     veor        \b5, \b5, \b3
0120     veor        \b4, \b4, \b6
0121     veor        \b0, \b0, \b6
0122     veor        \b1, \b1, \b4
0123     .endm
0124
/*
 * inv_out_bs_ch - last stage of the inv_sbox macro: an in-place, XOR-only
 * mixing of eight registers. No temporaries are used.
 *
 * As with inv_in_bs_ch, the formal parameters are declared in a permuted
 * order (b6, b5, b0, b3, b7, b1, b4, b2) and are bound by position.
 */
0125     .macro      inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
0126     veor        \b1, \b1, \b5
0127     veor        \b2, \b2, \b7
0128     veor        \b3, \b3, \b1
0129     veor        \b4, \b4, \b5
0130     veor        \b7, \b7, \b5
0131     veor        \b3, \b3, \b4
0132     veor        \b5, \b5, \b0
0133     veor        \b3, \b3, \b7
0134     veor        \b6, \b6, \b2
0135     veor        \b2, \b2, \b1
0136     veor        \b6, \b6, \b3
0137     veor        \b3, \b3, \b0
0138     veor        \b5, \b5, \b6
0139     .endm
0140
/*
 * mul_gf4 - combine the pair (\x0, \x1) with the pair (\y0, \y1) using
 * AND/XOR only.
 *
 *   x0, x1  updated in place with the result
 *   y0, y1  read only
 *   t0, t1  clobbered scratch registers
 *
 * NOTE(review): going by the name, this is a bit-sliced multiplication in
 * GF(2^2) as used by the S-box inversion - confirm against the paper.
 */
0141     .macro      mul_gf4, x0, x1, y0, y1, t0, t1
0142     veor        \t0, \y0, \y1
0143     vand        \t0, \t0, \x0
0144     veor        \x0, \x0, \x1
0145     vand        \t1, \x1, \y0
0146     vand        \x0, \x0, \y1
0147     veor        \x1, \t1, \t0
0148     veor        \x0, \x0, \t1
0149     .endm
0150
/*
 * mul_gf4_n_gf4 - two independent AND/XOR pair products, interleaved
 * instruction by instruction.
 *
 *   x0, x1 combined with y0, y1 (scratch t0)
 *   x2, x3 combined with y2, y3 (scratch t1)
 *
 * x0-x3 are updated in place, y0-y3 are read only, t0/t1 are clobbered.
 * NOTE(review): the two halves do not end with the same XOR pattern as
 * mul_gf4, so this is not simply mul_gf4 applied twice; the exact field
 * operations should be confirmed against the paper.
 */
0151     .macro      mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
0152     veor        \t0, \y0, \y1
0153     veor        \t1, \y2, \y3
0154     vand        \t0, \t0, \x0
0155     vand        \t1, \t1, \x2
0156     veor        \x0, \x0, \x1
0157     veor        \x2, \x2, \x3
0158     vand        \x1, \x1, \y0
0159     vand        \x3, \x3, \y2
0160     vand        \x0, \x0, \y1
0161     vand        \x2, \x2, \y3
0162     veor        \x1, \x1, \x0
0163     veor        \x2, \x2, \x3
0164     veor        \x0, \x0, \t0
0165     veor        \x3, \x3, \t1
0166     .endm
0167
/*
 * mul_gf16_2 - process two 4-register groups, (\x0-\x3) and (\x4-\x7),
 * against the same 4-register operand (\y0-\y3), built from mul_gf4 and
 * mul_gf4_n_gf4.
 *
 *   x0-x7  updated in place
 *   y0-y3  operand; y0/y1 are temporarily XORed with y2/y3 for the middle
 *          part and XORed again afterwards, which restores their original
 *          values; y2/y3 are never written
 *   t0-t3  clobbered scratch registers
 *
 * NOTE(review): by its name this performs two GF(2^4) multiplications by a
 * common factor - confirm against the paper.
 */
0168     .macro      mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
0169                     y0, y1, y2, y3, t0, t1, t2, t3
0170     veor        \t0, \x0, \x2
0171     veor        \t1, \x1, \x3
0172     mul_gf4     \x0, \x1, \y0, \y1, \t2, \t3
0173     veor        \y0, \y0, \y2
0174     veor        \y1, \y1, \y3
0175     mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
0176     veor        \x0, \x0, \t0
0177     veor        \x2, \x2, \t0
0178     veor        \x1, \x1, \t1
0179     veor        \x3, \x3, \t1
0180     veor        \t0, \x4, \x6
0181     veor        \t1, \x5, \x7
0182     mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
0183     veor        \y0, \y0, \y2
0184     veor        \y1, \y1, \y3
0185     mul_gf4     \x4, \x5, \y0, \y1, \t2, \t3
0186     veor        \x4, \x4, \t0
0187     veor        \x6, \x6, \t0
0188     veor        \x5, \x5, \t1
0189     veor        \x7, \x7, \t1
0190     .endm
0191
/*
 * inv_gf256 - non-linear core shared by sbox and inv_sbox.
 *
 *   x0-x7         eight state registers, updated in place (the final
 *                 update is done by mul_gf16_2)
 *   t0-t3, s0-s3  clobbered scratch registers
 *
 * The first part derives four intermediate values from x0-x7 using
 * AND/OR/XOR and bit selects (VBSL); these end up in \s3, \s2, \s1 and
 * \t1 and are passed as the common operand to mul_gf16_2, with the
 * remaining scratch registers handed over as its temporaries.
 *
 * NOTE(review): by its name this is the bit-sliced inversion in GF(2^8)
 * described in the paper cited in the file header - confirm.
 */
0192     .macro      inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
0193                    t0, t1, t2, t3, s0, s1, s2, s3
0194     veor        \t3, \x4, \x6
0195     veor        \t0, \x5, \x7
0196     veor        \t1, \x1, \x3
0197     veor        \s1, \x7, \x6
0198     veor        \s0, \x0, \x2
0199     veor        \s3, \t3, \t0
0200     vorr        \t2, \t0, \t1
0201     vand        \s2, \t3, \s0
0202     vorr        \t3, \t3, \s0
0203     veor        \s0, \s0, \t1
0204     vand        \t0, \t0, \t1
0205     veor        \t1, \x3, \x2
0206     vand        \s3, \s3, \s0
0207     vand        \s1, \s1, \t1
0208     veor        \t1, \x4, \x5
0209     veor        \s0, \x1, \x0
0210     veor        \t3, \t3, \s1
0211     veor        \t2, \t2, \s1
0212     vand        \s1, \t1, \s0
0213     vorr        \t1, \t1, \s0
0214     veor        \t3, \t3, \s3
0215     veor        \t0, \t0, \s1
0216     veor        \t2, \t2, \s2
0217     veor        \t1, \t1, \s3
0218     veor        \t0, \t0, \s2
0219     vand        \s0, \x7, \x3
0220     veor        \t1, \t1, \s2
0221     vand        \s1, \x6, \x2
0222     vand        \s2, \x5, \x1
0223     vorr        \s3, \x4, \x0
0224     veor        \t3, \t3, \s0
0225     veor        \t1, \t1, \s2
0226     veor        \s0, \t0, \s3
0227     veor        \t2, \t2, \s1
0228     vand        \s2, \t3, \t1
0229     veor        \s1, \t2, \s2
0230     veor        \s3, \s0, \s2
0231     vbsl        \s1, \t1, \s0
0232     vmvn        \t0, \s0
0233     vbsl        \s0, \s1, \s3
0234     vbsl        \t0, \s1, \s3
0235     vbsl        \s3, \t3, \t2
0236     veor        \t3, \t3, \t2
0237     vand        \s2, \s0, \s3
0238     veor        \t1, \t1, \t0
0239     veor        \s2, \s2, \t3
0240     mul_gf16_2  \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
0241             \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
0242     .endm
0243
/*
 * sbox - forward S-box stage on eight bit-sliced state registers.
 *
 *   b0-b7         state registers, transformed in place
 *   t0-t3, s0-s3  scratch registers, clobbered by inv_gf256
 *
 * Runs in_bs_ch, inv_gf256 and out_bs_ch in sequence. Each stage is handed
 * the registers in a different order, so the result does not come back in
 * b0..b7 order: aesbs_encrypt8 continues with the registers ordered as
 * q0, q1, q4, q6, q3, q7, q2, q5 after invoking this macro on q0-q7.
 */
0244     .macro      sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
0245                   t0, t1, t2, t3, s0, s1, s2, s3
0246     in_bs_ch    \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
0247     inv_gf256   \b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
0248             \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
0249     out_bs_ch   \b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
0250     .endm
0251
/*
 * inv_sbox - inverse S-box stage on eight bit-sliced state registers.
 *
 *   b0-b7         state registers, transformed in place
 *   t0-t3, s0-s3  scratch registers, clobbered by inv_gf256
 *
 * Runs inv_in_bs_ch, inv_gf256 and inv_out_bs_ch in sequence. As with sbox,
 * the stages use permuted register orders: aesbs_decrypt8 continues with
 * the registers ordered as q0, q1, q6, q4, q2, q7, q3, q5 after invoking
 * this macro on q0-q7.
 */
0252     .macro      inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
0253                   t0, t1, t2, t3, s0, s1, s2, s3
0254     inv_in_bs_ch    \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
0255     inv_gf256   \b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
0256             \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
0257     inv_out_bs_ch   \b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
0258     .endm
0259
/*
 * shift_rows - XOR the next bit-sliced round key into the state and apply
 * a byte permutation to every state register.
 *
 *   x0-x7  state registers, updated in place
 *   t0-t3  scratch registers (clobbered)
 *   mask   Q register with the byte permutation indices for __tbl
 *
 * Loads 8 x 16 = 128 bytes from [bskey] (32-byte aligned accesses with
 * post-increment, so bskey advances by 128). For each register the key is
 * XORed with the state into a temporary, and the state register is then
 * rebuilt from that temporary through the \mask permutation. Loads, XORs
 * and lookups are interleaved; t0-t3 are reused for the second group of
 * four registers.
 */
0260     .macro      shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
0261                     t0, t1, t2, t3, mask
0262     vld1.8      {\t0-\t1}, [bskey, :256]!
0263     veor        \t0, \t0, \x0
0264     vld1.8      {\t2-\t3}, [bskey, :256]!
0265     veor        \t1, \t1, \x1
0266     __tbl       \x0, \t0, \mask
0267     veor        \t2, \t2, \x2
0268     __tbl       \x1, \t1, \mask
0269     vld1.8      {\t0-\t1}, [bskey, :256]!
0270     veor        \t3, \t3, \x3
0271     __tbl       \x2, \t2, \mask
0272     __tbl       \x3, \t3, \mask
0273     vld1.8      {\t2-\t3}, [bskey, :256]!
0274     veor        \t0, \t0, \x4
0275     veor        \t1, \t1, \x5
0276     __tbl       \x4, \t0, \mask
0277     veor        \t2, \t2, \x6
0278     __tbl       \x5, \t1, \mask
0279     veor        \t3, \t3, \x7
0280     __tbl       \x6, \t2, \mask
0281     __tbl       \x7, \t3, \mask
0282     .endm
0283
/*
 * inv_shift_rows - apply the byte permutation \mask to each of the eight
 * state registers in place.
 *
 *   x0-x7  state registers, updated in place
 *   t0-t3  scratch registers; needed because each __tbl call has
 *          out == tbl and therefore copies the table to a temporary
 *   mask   Q register with the byte permutation indices
 *
 * Unlike shift_rows, no round key is loaded here; bskey is untouched.
 */
0284     .macro      inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
0285                     t0, t1, t2, t3, mask
0286     __tbl       \x0, \x0, \mask, \t0
0287     __tbl       \x1, \x1, \mask, \t1
0288     __tbl       \x2, \x2, \mask, \t2
0289     __tbl       \x3, \x3, \mask, \t3
0290     __tbl       \x4, \x4, \mask, \t0
0291     __tbl       \x5, \x5, \mask, \t1
0292     __tbl       \x6, \x6, \mask, \t2
0293     __tbl       \x7, \x7, \mask, \t3
0294     .endm
0295
/*
 * mix_cols - column mixing step on the eight bit-sliced state registers.
 *
 *   x0-x7  state registers, updated in place
 *   t0-t7  scratch registers (clobbered)
 *   inv    optional flag; leave blank for the encryption layout, pass any
 *          non-blank value (inv_mix_cols passes 1) for the layout needed
 *          by the decryption path
 *
 * The step is built from byte rotations of each register (VEXT of a
 * register with itself by 12 and by 8 bytes) combined with XORs. The two
 * .ifb branches at the end perform corresponding XORs but assign the
 * results to different registers among \x2, \x3, \x4 and \x6, which is
 * how the caller-visible register order differs between the two modes.
 */
0296     .macro      mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
0297                   t0, t1, t2, t3, t4, t5, t6, t7, inv
0298     vext.8      \t0, \x0, \x0, #12
0299     vext.8      \t1, \x1, \x1, #12
0300     veor        \x0, \x0, \t0
0301     vext.8      \t2, \x2, \x2, #12
0302     veor        \x1, \x1, \t1
0303     vext.8      \t3, \x3, \x3, #12
0304     veor        \x2, \x2, \t2
0305     vext.8      \t4, \x4, \x4, #12
0306     veor        \x3, \x3, \t3
0307     vext.8      \t5, \x5, \x5, #12
0308     veor        \x4, \x4, \t4
0309     vext.8      \t6, \x6, \x6, #12
0310     veor        \x5, \x5, \t5
0311     vext.8      \t7, \x7, \x7, #12
0312     veor        \x6, \x6, \t6
0313     veor        \t1, \t1, \x0
0314     veor.8      \x7, \x7, \t7
0315     vext.8      \x0, \x0, \x0, #8
0316     veor        \t2, \t2, \x1
0317     veor        \t0, \t0, \x7
0318     veor        \t1, \t1, \x7
0319     vext.8      \x1, \x1, \x1, #8
0320     veor        \t5, \t5, \x4
0321     veor        \x0, \x0, \t0
0322     veor        \t6, \t6, \x5
0323     veor        \x1, \x1, \t1
0324     vext.8      \t0, \x4, \x4, #8
0325     veor        \t4, \t4, \x3
0326     vext.8      \t1, \x5, \x5, #8
0327     veor        \t7, \t7, \x6
0328     vext.8      \x4, \x3, \x3, #8
0329     veor        \t3, \t3, \x2
0330     vext.8      \x5, \x7, \x7, #8
0331     veor        \t4, \t4, \x7
0332     vext.8      \x3, \x6, \x6, #8
0333     veor        \t3, \t3, \x7
0334     vext.8      \x6, \x2, \x2, #8
0335     veor        \x7, \t1, \t5
         // layout used by the encryption path (no \inv argument given)
0336     .ifb        \inv
0337     veor        \x2, \t0, \t4
0338     veor        \x4, \x4, \t3
0339     veor        \x5, \x5, \t7
0340     veor        \x3, \x3, \t6
0341     veor        \x6, \x6, \t2
         // layout used when invoked from inv_mix_cols
0342     .else
0343     veor        \t3, \t3, \x4
0344     veor        \x5, \x5, \t7
0345     veor        \x2, \x3, \t6
0346     veor        \x3, \t0, \t4
0347     veor        \x4, \x6, \t2
0348     vmov        \x6, \t3
0349     .endif
0350     .endm
0351
/*
 * inv_mix_cols - round key addition followed by the inverse column mixing
 * step, for the decryption path.
 *
 *   x0-x7  state registers, updated in place
 *   t0-t7  scratch registers (clobbered)
 *
 * 1. Loads 128 bytes of bit-sliced round key from [bskey] and XORs them
 *    into x0-x7. The first three loads post-increment bskey (+96 in
 *    total), the fourth does not, and bskey is then decremented by 224,
 *    so the net effect is bskey -= 128: the key schedule is walked
 *    backwards, one round per invocation.
 * 2. Applies a pre-processing step made of 8-byte rotations (VEXT #8) and
 *    XORs.
 * 3. Finishes with the regular mix_cols macro in its 'inv' mode.
 */
0352     .macro      inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
0353                       t0, t1, t2, t3, t4, t5, t6, t7
0354     vld1.8      {\t0-\t1}, [bskey, :256]!
0355     veor        \x0, \x0, \t0
0356     vld1.8      {\t2-\t3}, [bskey, :256]!
0357     veor        \x1, \x1, \t1
0358     vld1.8      {\t4-\t5}, [bskey, :256]!
0359     veor        \x2, \x2, \t2
0360     vld1.8      {\t6-\t7}, [bskey, :256]
0361     sub     bskey, bskey, #224
0362     veor        \x3, \x3, \t3
0363     veor        \x4, \x4, \t4
0364     veor        \x5, \x5, \t5
0365     veor        \x6, \x6, \t6
0366     veor        \x7, \x7, \t7
0367     vext.8      \t0, \x0, \x0, #8
0368     vext.8      \t6, \x6, \x6, #8
0369     vext.8      \t7, \x7, \x7, #8
0370     veor        \t0, \t0, \x0
0371     vext.8      \t1, \x1, \x1, #8
0372     veor        \t6, \t6, \x6
0373     vext.8      \t2, \x2, \x2, #8
0374     veor        \t7, \t7, \x7
0375     vext.8      \t3, \x3, \x3, #8
0376     veor        \t1, \t1, \x1
0377     vext.8      \t4, \x4, \x4, #8
0378     veor        \t2, \t2, \x2
0379     vext.8      \t5, \x5, \x5, #8
0380     veor        \t3, \t3, \x3
0381     veor        \t4, \t4, \x4
0382     veor        \t5, \t5, \x5
0383     veor        \x0, \x0, \t6
0384     veor        \x1, \x1, \t6
0385     veor        \x2, \x2, \t0
0386     veor        \x4, \x4, \t2
0387     veor        \x3, \x3, \t1
0388     veor        \x1, \x1, \t7
0389     veor        \x2, \x2, \t7
0390     veor        \x4, \x4, \t6
0391     veor        \x5, \x5, \t3
0392     veor        \x3, \x3, \t6
0393     veor        \x6, \x6, \t4
0394     veor        \x4, \x4, \t7
0395     veor        \x5, \x5, \t7
0396     veor        \x7, \x7, \t5
0397     mix_cols    \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
0398             \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
0399     .endm
0400
/*
 * swapmove_2x - exchange selected bits between two register pairs.
 *
 * For each pair (a, b) = (\a0, \b0) and (\a1, \b1):
 *
 *     t  = ((b >> n) ^ a) & mask
 *     a ^= t
 *     b ^= t << n
 *
 * i.e. the bits of a selected by \mask are swapped with the bits of b
 * that sit \n positions higher. Shifts operate on 64-bit lanes.
 *
 *   n      shift distance (immediate)
 *   mask   Q register holding the bit mask (read only)
 *   t0,t1  scratch registers (clobbered)
 */
0401     .macro      swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
0402     vshr.u64    \t0, \b0, #\n
0403     vshr.u64    \t1, \b1, #\n
0404     veor        \t0, \t0, \a0
0405     veor        \t1, \t1, \a1
0406     vand        \t0, \t0, \mask
0407     vand        \t1, \t1, \mask
0408     veor        \a0, \a0, \t0
0409     vshl.s64    \t0, \t0, #\n
0410     veor        \a1, \a1, \t1
0411     vshl.s64    \t1, \t1, #\n
0412     veor        \b0, \b0, \t0
0413     veor        \b1, \b1, \t1
0414     .endm
0415
/*
 * bitslice - transpose bits across eight registers using three rounds of
 * swapmove_2x (distance 1 with mask 0x55, distance 2 with mask 0x33,
 * distance 4 with mask 0x0f).
 *
 *   x7..x0  the eight registers, transformed in place. Note that the
 *           formal parameters are declared in descending order, so the
 *           first argument is \x7 and the last is \x0.
 *   t0-t3   scratch registers (clobbered): t0/t1 hold the masks, t2/t3
 *           are the swapmove temporaries
 *
 * \t0 is reloaded with 0x0f as soon as the 0x55 stage is finished.
 * The same macro is used on entry to and on exit from the 8-way cores.
 */
0416     .macro      bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
0417     vmov.i8     \t0, #0x55
0418     vmov.i8     \t1, #0x33
0419     swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
0420     swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
0421     vmov.i8     \t0, #0x0f
0422     swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
0423     swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
0424     swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
0425     swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
0426     .endm
0427
0428     .align      4
         // byte permutation applied to each inner round key before slicing
0429 M0: .quad       0x02060a0e03070b0f, 0x0004080c0105090d
0430
0431     /*
0432      * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         *
         * Convert an expanded AES key schedule into the bit-sliced layout
         * consumed by aesbs_encrypt8 and aesbs_decrypt8.
         *
         *   r0  out     destination buffer (stores use 16- and 32-byte
         *               alignment hints)
         *   r1  rk      source round keys, 16 bytes each; rounds + 1 keys
         *               are read
         *   r2  rounds  number of AES rounds
         *
         * Output layout:
         *   - 16 bytes:  round 0 key, copied unchanged
         *   - (rounds - 1) x 128 bytes: one entry per inner round key. The
         *     key is permuted with M0 and expanded into eight 16-byte
         *     masks, one per bit position (VTST against 0x01 .. 0x80); the
         *     masks for bits 0, 1, 5 and 6 (the bits set in 0x63) are
         *     stored inverted.
         *   - 16 bytes:  final round key XORed with 0x63 in every byte
         *
         * Clobbers q0-q15, r0-r2 and the condition flags.
0433      */
0434 ENTRY(aesbs_convert_key)
0435     vld1.32     {q7}, [r1]!     // load round 0 key
0436     vld1.32     {q15}, [r1]!        // load round 1 key
0437
0438     vmov.i8     q8,  #0x01      // bit masks
0439     vmov.i8     q9,  #0x02
0440     vmov.i8     q10, #0x04
0441     vmov.i8     q11, #0x08
0442     vmov.i8     q12, #0x10
0443     vmov.i8     q13, #0x20
0444     __ldr       q14, M0
0445
0446     sub     r2, r2, #1      // the loop handles rounds - 1 keys
0447     vst1.8      {q7}, [r0, :128]!   // save round 0 key
0448
0449 .Lkey_loop:
0450     __tbl       q7, q15, q14        // q7 = current key permuted by M0
         // q6 and q15 are free at this point, so they hold the two
         // remaining bit masks that do not fit in q8-q14
0451     vmov.i8     q6, #0x40
0452     vmov.i8     q15, #0x80
0453
0454     vtst.8      q0, q7, q8
0455     vtst.8      q1, q7, q9
0456     vtst.8      q2, q7, q10
0457     vtst.8      q3, q7, q11
0458     vtst.8      q4, q7, q12
0459     vtst.8      q5, q7, q13
0460     vtst.8      q6, q7, q6
0461     vtst.8      q7, q7, q15
0462     vld1.32     {q15}, [r1]!        // load next round key
0463     vmvn        q0, q0          // invert planes 0, 1, 5 and 6
0464     vmvn        q1, q1
0465     vmvn        q5, q5
0466     vmvn        q6, q6
0467
0468     subs        r2, r2, #1
0469     vst1.8      {q0-q1}, [r0, :256]!
0470     vst1.8      {q2-q3}, [r0, :256]!
0471     vst1.8      {q4-q5}, [r0, :256]!
0472     vst1.8      {q6-q7}, [r0, :256]!
0473     bne     .Lkey_loop
0474
         // q15 now holds the final round key, loaded by the last iteration
0475     vmov.i8     q7, #0x63       // compose .L63
0476     veor        q15, q15, q7
0477     vst1.8      {q15}, [r0, :128]
0478     bx      lr
0479 ENDPROC(aesbs_convert_key)
0480
0481     .align      4
         // permutation applied to the input blocks before bit slicing
0482 M0SR:   .quad       0x0a0e02060f03070b, 0x0004080c05090d01
0483
         /*
          * aesbs_encrypt8 - encrypt eight blocks in parallel (file-internal
          * helper; it has no ENTRY() annotation and does not follow the C
          * calling convention).
          *
          * In:   q0-q7   eight 16-byte input blocks
          *       bskey   key schedule produced by aesbs_convert_key
          *       rounds  number of AES rounds
          * Out:  q0, q1, q4, q6, q3, q7, q2, q5
          *               the eight output blocks, in that order (i.e. the
          *               output for the block passed in q2 is in q4, etc.)
          * Clobbers q8-q15, bskey, rounds and the condition flags.
          *
          * Flow: XOR with the round 0 key, permute with M0SR and bit-slice;
          * then loop over sbox / mix_cols / shift_rows, where shift_rows
          * also adds the round key. The shift_rows that follows the last
          * mix_cols uses the SRM0 permutation instead of SR. After the
          * final sbox (no mix_cols) the state is transposed back with
          * bitslice and XORed with the final key entry.
          */
0484 aesbs_encrypt8:
0485     vld1.8      {q9}, [bskey, :128]!    // round 0 key
0486     __ldr       q8, M0SR
0487
0488     veor        q10, q0, q9     // xor with round0 key
0489     veor        q11, q1, q9
0490     __tbl       q0, q10, q8
0491     veor        q12, q2, q9
0492     __tbl       q1, q11, q8
0493     veor        q13, q3, q9
0494     __tbl       q2, q12, q8
0495     veor        q14, q4, q9
0496     __tbl       q3, q13, q8
0497     veor        q15, q5, q9
0498     __tbl       q4, q14, q8
0499     veor        q10, q6, q9
0500     __tbl       q5, q15, q8
0501     veor        q11, q7, q9
0502     __tbl       q6, q10, q8
0503     __tbl       q7, q11, q8
0504
0505     bitslice    q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
0506
0507     sub     rounds, rounds, #1
0508     b       .Lenc_sbox      // first round: key already added above
0509
0510     .align      5
         // SR is used by shift_rows in all rounds but the last one, SRM0 in
         // the last one
0511 SR: .quad       0x0504070600030201, 0x0f0e0d0c0a09080b
0512 SRM0:   .quad       0x0304090e00050a0f, 0x01060b0c0207080d
0513
0514 .Lenc_last:
0515     __ldr       q12, SRM0
0516 .Lenc_loop:
0517     shift_rows  q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
0518 .Lenc_sbox:
0519     sbox        q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
0520                                 q13, q14, q15
0521     subs        rounds, rounds, #1
0522     bcc     .Lenc_done      // counter went below zero: last sbox done
0523
0524     mix_cols    q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
0525                                 q13, q14, q15
0526
         // mix_cols only uses NEON instructions, so the flags tested here
         // are still those of the subs above
0527     beq     .Lenc_last
0528     __ldr       q12, SR
0529     b       .Lenc_loop
0530
0531 .Lenc_done:
0532     vld1.8      {q12}, [bskey, :128]    // last round key
0533
0534     bitslice    q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
0535
0536     veor        q0, q0, q12
0537     veor        q1, q1, q12
0538     veor        q4, q4, q12
0539     veor        q6, q6, q12
0540     veor        q3, q3, q12
0541     veor        q7, q7, q12
0542     veor        q2, q2, q12
0543     veor        q5, q5, q12
0544     bx      lr
0545 ENDPROC(aesbs_encrypt8)
0546
0547     .align      4
         // permutation applied to the input blocks before bit slicing
0548 M0ISR:  .quad       0x0a0e0206070b0f03, 0x0004080c0d010509
0549
         /*
          * aesbs_decrypt8 - decrypt eight blocks in parallel (file-internal
          * helper; it has no ENTRY() annotation and does not follow the C
          * calling convention).
          *
          * In:   q0-q7   eight 16-byte input blocks
          *       bskey   start of the key schedule produced by
          *               aesbs_convert_key
          *       rounds  number of AES rounds
          * Out:  q0, q1, q6, q4, q2, q7, q3, q5
          *               the eight output blocks, in that order
          * Clobbers q8-q15, bskey, rounds and the condition flags.
          *
          * The key schedule is consumed back to front: bskey is first moved
          * to the final 16-byte entry (offset 128 * rounds - 112), then
          * stepped back to the last 128-byte bit-sliced entry, and every
          * inv_mix_cols moves it back by another 128 bytes.
          */
0550 aesbs_decrypt8:
0551     add     bskey, bskey, rounds, lsl #7
0552     sub     bskey, bskey, #112
0553     vld1.8      {q9}, [bskey, :128] // round 0 key
0554     sub     bskey, bskey, #128
0555     __ldr       q8, M0ISR
0556
0557     veor        q10, q0, q9     // xor with round0 key
0558     veor        q11, q1, q9
0559     __tbl       q0, q10, q8
0560     veor        q12, q2, q9
0561     __tbl       q1, q11, q8
0562     veor        q13, q3, q9
0563     __tbl       q2, q12, q8
0564     veor        q14, q4, q9
0565     __tbl       q3, q13, q8
0566     veor        q15, q5, q9
0567     __tbl       q4, q14, q8
0568     veor        q10, q6, q9
0569     __tbl       q5, q15, q8
0570     veor        q11, q7, q9
0571     __tbl       q6, q10, q8
0572     __tbl       q7, q11, q8
0573
0574     bitslice    q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
0575
0576     sub     rounds, rounds, #1
0577     b       .Ldec_sbox
0578
0579     .align      5
         // ISR is used by inv_shift_rows in all rounds but the last one,
         // ISRM0 in the last one
0580 ISR:    .quad       0x0504070602010003, 0x0f0e0d0c080b0a09
0581 ISRM0:  .quad       0x01040b0e0205080f, 0x0306090c00070a0d
0582
0583 .Ldec_last:
0584     __ldr       q12, ISRM0
0585 .Ldec_loop:
0586     inv_shift_rows  q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
0587 .Ldec_sbox:
0588     inv_sbox    q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
0589                                 q13, q14, q15
0590     subs        rounds, rounds, #1
0591     bcc     .Ldec_done      // counter went below zero: last inv_sbox done
0592
0593     inv_mix_cols    q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
0594                                 q13, q14, q15
0595
         // inv_mix_cols does not modify the flags (its only integer
         // instruction is a plain sub), so this still tests the subs above
0596     beq     .Ldec_last
0597     __ldr       q12, ISR
0598     b       .Ldec_loop
0599
0600 .Ldec_done:
         // after the last inv_mix_cols bskey is 112 bytes below the start of
         // the schedule; step forward to the first (16-byte) entry
0601     add     bskey, bskey, #112
0602     vld1.8      {q12}, [bskey, :128]    // last round key
0603
0604     bitslice    q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
0605
0606     veor        q0, q0, q12
0607     veor        q1, q1, q12
0608     veor        q6, q6, q12
0609     veor        q4, q4, q12
0610     veor        q2, q2, q12
0611     veor        q7, q7, q12
0612     veor        q3, q3, q12
0613     veor        q5, q5, q12
0614     bx      lr
0615 ENDPROC(aesbs_decrypt8)
0616
0617     /*
0618      * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0619      *           int blocks)
0620      * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0621      *           int blocks)
         *
         * __ecb_crypt is the common body of both functions.
         *
         *   do8     8-way core to call (aesbs_encrypt8 or aesbs_decrypt8)
         *   o0-o7   Q registers in which \do8 returns blocks 0-7
         *
         * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
         * r5 = remaining blocks (fifth argument, read from the stack).
         *
         * Blocks are processed in batches of up to eight. When fewer than
         * eight blocks remain, a computed goto skips the leading loads and
         * stores: the target is the end label minus 4 * (blocks & 7) bytes,
         * which relies on every VLD1/VST1 being a 4-byte instruction. A
         * partial batch therefore occupies the LAST (blocks & 7) input
         * registers and the matching last output registers; the remaining
         * registers are still run through \do8 but their results are
         * discarded.
0622      */
0623     .macro      __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
0624     push        {r4-r6, lr}
0625     ldr     r5, [sp, #16]       // number of blocks
0626
0627 99: adr     ip, 0f
0628     and     lr, r5, #7
0629     cmp     r5, #8
0630     sub     ip, ip, lr, lsl #2
0631     movlt       pc, ip          // computed goto if blocks < 8
0632
0633     vld1.8      {q0}, [r1]!
0634     vld1.8      {q1}, [r1]!
0635     vld1.8      {q2}, [r1]!
0636     vld1.8      {q3}, [r1]!
0637     vld1.8      {q4}, [r1]!
0638     vld1.8      {q5}, [r1]!
0639     vld1.8      {q6}, [r1]!
0640     vld1.8      {q7}, [r1]!
0641
         // bskey and rounds are modified by \do8, so reload them per batch
0642 0:  mov     bskey, r2
0643     mov     rounds, r3
0644     bl      \do8
0645
0646     adr     ip, 1f
0647     and     lr, r5, #7
0648     cmp     r5, #8
0649     sub     ip, ip, lr, lsl #2
0650     movlt       pc, ip          // computed goto if blocks < 8
0651
0652     vst1.8      {\o0}, [r0]!
0653     vst1.8      {\o1}, [r0]!
0654     vst1.8      {\o2}, [r0]!
0655     vst1.8      {\o3}, [r0]!
0656     vst1.8      {\o4}, [r0]!
0657     vst1.8      {\o5}, [r0]!
0658     vst1.8      {\o6}, [r0]!
0659     vst1.8      {\o7}, [r0]!
0660
0661 1:  subs        r5, r5, #8
0662     bgt     99b
0663
0664     pop     {r4-r6, pc}
0665     .endm
0666
/*
 * aesbs_ecb_encrypt - ECB encryption entry point: __ecb_crypt instantiated
 * with aesbs_encrypt8 and that routine's output register order.
 */
0667     .align      4
0668 ENTRY(aesbs_ecb_encrypt)
0669     __ecb_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
0670 ENDPROC(aesbs_ecb_encrypt)
0671
/*
 * aesbs_ecb_decrypt - ECB decryption entry point: __ecb_crypt instantiated
 * with aesbs_decrypt8 and that routine's output register order.
 */
0672     .align      4
0673 ENTRY(aesbs_ecb_decrypt)
0674     __ecb_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
0675 ENDPROC(aesbs_ecb_decrypt)
0676
0677     /*
0678      * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
0679      *           int rounds, int blocks, u8 iv[])
         *
         * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
         * r5 = remaining blocks, r6 = iv (both read from the stack).
         *
         * Blocks are decrypted in batches of up to eight. Each batch reads
         * the input twice: once through a copy of the input pointer (lr)
         * to feed aesbs_decrypt8, and once through r1 itself to fetch the
         * ciphertext blocks that serve as chaining values. q8 holds the IV
         * for the first block of the batch. The last ciphertext block of
         * the batch is loaded into q8 and written back to iv[] as the IV
         * for the next batch / next call.
         *
         * For a partial batch (blocks < 8) three computed gotos skip the
         * leading part of each sequence, so the blocks occupy the last
         * (blocks & 7) register slots. The offsets assume 4-byte
         * instructions: 4 bytes per load, 8 bytes per veor/vst1 pair.
0680      */
0681     .align      4
0682 ENTRY(aesbs_cbc_decrypt)
0683     mov     ip, sp
0684     push        {r4-r6, lr}
0685     ldm     ip, {r5-r6}     // load args 4-5
0686
0687 99: adr     ip, 0f
0688     and     lr, r5, #7
0689     cmp     r5, #8
0690     sub     ip, ip, lr, lsl #2
0691     mov     lr, r1          // read ahead via lr, keep r1 for 2nd pass
0692     movlt       pc, ip          // computed goto if blocks < 8
0693
0694     vld1.8      {q0}, [lr]!
0695     vld1.8      {q1}, [lr]!
0696     vld1.8      {q2}, [lr]!
0697     vld1.8      {q3}, [lr]!
0698     vld1.8      {q4}, [lr]!
0699     vld1.8      {q5}, [lr]!
0700     vld1.8      {q6}, [lr]!
0701     vld1.8      {q7}, [lr]
0702
0703 0:  mov     bskey, r2
0704     mov     rounds, r3
0705     bl      aesbs_decrypt8
0706
         // preload the IV into q8-q15: with a partial batch, the first block
         // processed sits in a later slot and must still be XORed with the IV
0707     vld1.8      {q8}, [r6]
0708     vmov        q9, q8
0709     vmov        q10, q8
0710     vmov        q11, q8
0711     vmov        q12, q8
0712     vmov        q13, q8
0713     vmov        q14, q8
0714     vmov        q15, q8
0715
0716     adr     ip, 1f
0717     and     lr, r5, #7
0718     cmp     r5, #8
0719     sub     ip, ip, lr, lsl #2
0720     movlt       pc, ip          // computed goto if blocks < 8
0721
0722     vld1.8      {q9}, [r1]!
0723     vld1.8      {q10}, [r1]!
0724     vld1.8      {q11}, [r1]!
0725     vld1.8      {q12}, [r1]!
0726     vld1.8      {q13}, [r1]!
0727     vld1.8      {q14}, [r1]!
0728     vld1.8      {q15}, [r1]!
         // padding slot: only 7 loads are needed here, but the computed
         // goto above counts 8 slots of 4 bytes
0729     W(nop)
0730
         // lr and the flags are still those set before the previous goto
0731 1:  adr     ip, 2f
0732     sub     ip, ip, lr, lsl #3
0733     movlt       pc, ip          // computed goto if blocks < 8
0734
0735     veor        q0, q0, q8
0736     vst1.8      {q0}, [r0]!
0737     veor        q1, q1, q9
0738     vst1.8      {q1}, [r0]!
0739     veor        q6, q6, q10
0740     vst1.8      {q6}, [r0]!
0741     veor        q4, q4, q11
0742     vst1.8      {q4}, [r0]!
0743     veor        q2, q2, q12
0744     vst1.8      {q2}, [r0]!
0745     veor        q7, q7, q13
0746     vst1.8      {q7}, [r0]!
0747     veor        q3, q3, q14
0748     vst1.8      {q3}, [r0]!
0749     veor        q5, q5, q15
0750     vld1.8      {q8}, [r1]!     // load next round's iv
0751 2:  vst1.8      {q5}, [r0]!
0752
0753     subs        r5, r5, #8
0754     vst1.8      {q8}, [r6]      // store next round's iv
0755     bgt     99b
0756
0757     pop     {r4-r6, pc}
0758 ENDPROC(aesbs_cbc_decrypt)
0759
/*
 * next_ctr - materialise the counter held in r7-r10 into Q register \q and
 * advance the counter by one.
 *
 * r7:r8:r9:r10 hold a 128-bit counter as four 32-bit words, r7 being the
 * most significant and r10 the least significant (the carry is propagated
 * from r10 up to r7). The words are copied into \q first and then byte
 * swapped per 32-bit word with VREV32, so \q receives the counter value
 * from before the increment. The integer increment is interleaved with the
 * NEON moves.
 *
 * Clobbers the condition flags. aesbs_ctr_encrypt jumps into a run of
 * these macros using an offset of 28 bytes per expansion, so the expansion
 * must stay at seven 4-byte instructions.
 */
0760     .macro      next_ctr, q
0761     vmov        \q\()h, r9, r10
0762     adds        r10, r10, #1
0763     adcs        r9, r9, #0
0764     vmov        \q\()l, r7, r8
0765     adcs        r8, r8, #0
0766     adc     r7, r7, #0
0767     vrev32.8    \q, \q
0768     .endm
0769
0770     /*
0771      * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
0772      *           int rounds, int bytes, u8 ctr[])
         *
         * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
         * r5 = remaining bytes, r6 = ctr (both read from the stack),
         * r7-r10 = the counter as four 32-bit words (see next_ctr),
         * r4 = 16 - (bytes % 16), used to back up the pointers for a
         * partial final block.
         *
         * Up to eight counter blocks are encrypted per iteration and XORed
         * with the input. q0 always holds the current counter block, while
         * r7-r10 run one step ahead of it. When fewer than eight blocks
         * are needed, computed gotos skip the leading next_ctr expansions,
         * loads and stores; the skipped keystream registers keep copies of
         * q0, so the data ends up in the last register slots.
         *
         * A final partial block is handled by loading/storing the last 16
         * bytes of the buffers (overlapping the previous block) and fixing
         * up the data with the permute table at label 3; if the whole
         * request is shorter than 16 bytes, label 4 merges the result with
         * the bytes already present in the output buffer instead.
         * NOTE(review): in that short case the 16-byte accesses start
         * before in[]/out[]; the caller presumably guarantees that this is
         * safe - confirm against the glue code.
0773      */
0774 ENTRY(aesbs_ctr_encrypt)
0775     mov     ip, sp
0776     push        {r4-r10, lr}
0777
0778     ldm     ip, {r5, r6}        // load args 4-5
0779     vld1.8      {q0}, [r6]      // load counter
0780     vrev32.8    q1, q0
0781     vmov        r9, r10, d3
0782     vmov        r7, r8, d2
0783
         // r7-r10 = counter + 1; q0 keeps the current counter block
0784     adds        r10, r10, #1
0785     adcs        r9, r9, #0
0786     adcs        r8, r8, #0
0787     adc     r7, r7, #0
0788
         // lr = 16 * (blocks needed - 1), capped at 112; the jump target is
         // label 0 minus 28 bytes (2 * 16 - 16 / 4) per additional block
0789 99: vmov        q1, q0
0790     sub     lr, r5, #1
0791     vmov        q2, q0
0792     adr     ip, 0f
0793     vmov        q3, q0
0794     and     lr, lr, #112
0795     vmov        q4, q0
0796     cmp     r5, #112
0797     vmov        q5, q0
0798     sub     ip, ip, lr, lsl #1
0799     vmov        q6, q0
0800     add     ip, ip, lr, lsr #2
0801     vmov        q7, q0
0802     movle       pc, ip          // computed goto if bytes <= 112
0803
0804     next_ctr    q1
0805     next_ctr    q2
0806     next_ctr    q3
0807     next_ctr    q4
0808     next_ctr    q5
0809     next_ctr    q6
0810     next_ctr    q7
0811
0812 0:  mov     bskey, r2
0813     mov     rounds, r3
0814     bl      aesbs_encrypt8
0815
         // Flag usage from here on:
         //   C clear -> fewer than 128 bytes left (partial batch)
         //   Z clear -> last batch AND it ends in a partial block
0816     adr     ip, 1f
0817     sub     lr, r5, #1
0818     cmp     r5, #128
0819     bic     lr, lr, #15
0820     ands        r4, r5, #15     // preserves C flag
0821     teqcs       r5, r5          // set Z flag if not last iteration
0822     sub     ip, ip, lr, lsr #2
0823     rsb     r4, r4, #16
0824     movcc       pc, ip          // computed goto if bytes < 128
0825
0826     vld1.8      {q8}, [r1]!
0827     vld1.8      {q9}, [r1]!
0828     vld1.8      {q10}, [r1]!
0829     vld1.8      {q11}, [r1]!
0830     vld1.8      {q12}, [r1]!
0831     vld1.8      {q13}, [r1]!
0832     vld1.8      {q14}, [r1]!
         // partial final block: step back so the load ends at the end of in[]
0833 1:  subne       r1, r1, r4
0834     vld1.8      {q15}, [r1]!
0835
         // turn ip into the matching entry point of the store sequence
0836     add     ip, ip, #2f - 1b
0837
0838     veor        q0, q0, q8
0839     veor        q1, q1, q9
0840     veor        q4, q4, q10
0841     veor        q6, q6, q11
0842     veor        q3, q3, q12
0843     veor        q7, q7, q13
0844     veor        q2, q2, q14
0845     bne     3f          // partial final block needs fixing up
0846     veor        q5, q5, q15
0847
0848     movcc       pc, ip          // computed goto if bytes < 128
0849
0850     vst1.8      {q0}, [r0]!
0851     vst1.8      {q1}, [r0]!
0852     vst1.8      {q4}, [r0]!
0853     vst1.8      {q6}, [r0]!
0854     vst1.8      {q3}, [r0]!
0855     vst1.8      {q7}, [r0]!
0856     vst1.8      {q2}, [r0]!
0857 2:  subne       r0, r0, r4
0858     vst1.8      {q5}, [r0]!
0859
0860     next_ctr    q0
0861
0862     subs        r5, r5, #128
0863     bgt     99b
0864
0865     vst1.8      {q0}, [r6]      // write back the updated counter
0866     pop     {r4-r10, pc}
0867
         // Partial final block: fetch two 16-byte index vectors from the
         // permute table at an offset derived from r4, use the first to
         // permute the keystream in q5 before XORing it with the input
         // in q15.
0868 3:  adr     lr, .Lpermute_table + 16
0869     cmp     r5, #16         // Z flag remains cleared
0870     sub     lr, lr, r4
0871     vld1.8      {q8-q9}, [lr]
0872     vtbl.8      d16, {q5}, d16
0873     vtbl.8      d17, {q5}, d17
0874     veor        q5, q8, q15
0875     bcc     4f          // have to reload prev if R5 < 16
         // fill the overlapping bytes from the previous output block (q2)
0876     vtbx.8      d10, {q2}, d18
0877     vtbx.8      d11, {q2}, d19
0878     mov     pc, ip          // branch back to VST sequence
0879
         // Fewer than 16 bytes in total: there is no previous block in a
         // register, so merge with the bytes currently in the output buffer.
0880 4:  sub     r0, r0, r4
0881     vshr.s8     q9, q9, #7      // create mask for VBIF
0882     vld1.8      {q8}, [r0]      // reload
0883     vbif        q5, q8, q9
0884     vst1.8      {q5}, [r0]
0885     pop     {r4-r10, pc}
0886 ENDPROC(aesbs_ctr_encrypt)
0887
/*
 * Index table for the partial-block handling in aesbs_ctr_encrypt: the
 * identity permutation 0x00-0x0f surrounded by 16 bytes of 0xff on either
 * side. Loading 32 bytes at a variable offset into this table yields
 * shifted index vectors, with 0xff in the positions that fall outside the
 * identity part.
 */
0888     .align      6
0889 .Lpermute_table:
0890     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0891     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0892     .byte       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
0893     .byte       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
0894     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0895     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0896
/*
 * next_tweak - derive the next XTS tweak: \out = \in doubled, with a
 * conditional correction.
 *
 *   out    Q register receiving the new tweak
 *   in     Q register holding the current tweak (not modified)
 *   const  Q register with the tweak mask vector composed in
 *          __xts_prepare8
 *   tmp    scratch Q register (clobbered)
 *
 * Each 64-bit half of \in is doubled (added to itself). \tmp is built from
 * the sign of each half (arithmetic shift by 63 gives all-ones or zero),
 * masked with \const, and its halves are swapped with VEXT #8 so that the
 * top bit of one half feeds the correction of the other half.
 *
 * The expansion is five instructions; __xts_prepare8 relies on this for
 * its computed goto.
 */
0897     .macro      next_tweak, out, in, const, tmp
0898     vshr.s64    \tmp, \in, #63
0899     vand        \tmp, \tmp, \const
0900     vadd.u64    \out, \in, \in
0901     vext.8      \tmp, \tmp, \tmp, #8
0902     veor        \out, \out, \tmp
0903     .endm
0904
0905     /*
0906      * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0907      *           int blocks, u8 iv[], int reorder_last_tweak)
0908      * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0909      *           int blocks, u8 iv[], int reorder_last_tweak)
         *
         * __xts_prepare8 is the shared input stage of both functions. It is
         * called from __xts_crypt with:
         *
         *   r1  input pointer (advanced by 16 per block loaded)
         *   r6  number of blocks remaining
         *   r7  pointer to the iv/tweak, updated with the next tweak
         *   r8  1 - reorder flag (<= 0 requests swapping the final tweaks)
         *   sp  16-byte aligned buffer with room for eight tweaks
         *
         * For every block it loads the input, XORs it with the current
         * tweak, stores that tweak in the stack buffer (for the output
         * stage) and computes the next tweak. q12 and q14 alternate as
         * current/next tweak, which is why both are initialised with the
         * iv. Each per-block group is 8 instructions; for blocks < 8 a
         * computed goto (32 bytes per group) enters at the last
         * (blocks & 7) groups. Clobbers r4, ip, q12-q15 and flags.
0910      */
0911     .align      6
0912 __xts_prepare8:
0913     vld1.8      {q14}, [r7]     // load iv
0914     vmov.i32    d30, #0x87      // compose tweak mask vector
0915     vmovl.u32   q15, d30
0916     vshr.u64    d30, d31, #7
0917     vmov        q12, q14
0918
0919     adr     ip, 0f
0920     and     r4, r6, #7
0921     cmp     r6, #8
0922     sub     ip, ip, r4, lsl #5
0923     mov     r4, sp          // r4 = write pointer into the tweak buffer
0924     movlt       pc, ip          // computed goto if blocks < 8
0925
0926     vld1.8      {q0}, [r1]!
0927     next_tweak  q12, q14, q15, q13
0928     veor        q0, q0, q14
0929     vst1.8      {q14}, [r4, :128]!
0930
0931     vld1.8      {q1}, [r1]!
0932     next_tweak  q14, q12, q15, q13
0933     veor        q1, q1, q12
0934     vst1.8      {q12}, [r4, :128]!
0935
0936     vld1.8      {q2}, [r1]!
0937     next_tweak  q12, q14, q15, q13
0938     veor        q2, q2, q14
0939     vst1.8      {q14}, [r4, :128]!
0940
0941     vld1.8      {q3}, [r1]!
0942     next_tweak  q14, q12, q15, q13
0943     veor        q3, q3, q12
0944     vst1.8      {q12}, [r4, :128]!
0945
0946     vld1.8      {q4}, [r1]!
0947     next_tweak  q12, q14, q15, q13
0948     veor        q4, q4, q14
0949     vst1.8      {q14}, [r4, :128]!
0950
0951     vld1.8      {q5}, [r1]!
0952     next_tweak  q14, q12, q15, q13
0953     veor        q5, q5, q12
0954     vst1.8      {q12}, [r4, :128]!
0955
0956     vld1.8      {q6}, [r1]!
0957     next_tweak  q12, q14, q15, q13
0958     veor        q6, q6, q14
0959     vst1.8      {q14}, [r4, :128]!
0960
0961     vld1.8      {q7}, [r1]!
0962     next_tweak  q14, q12, q15, q13
         // The flags are still those of 'cmp r6, #8' (only NEON instructions
         // ran in between). If this is the final batch (blocks <= 8), test
         // r8 and swap the last two tweaks when reordering was requested.
0963 THUMB(  itt     le      )
0964     W(cmple)    r8, #0
0965     ble     1f
0966 0:  veor        q7, q7, q12
0967     vst1.8      {q12}, [r4, :128]
0968
0969     vst1.8      {q14}, [r7]     // store next iv
0970     bx      lr
0971
0972 1:  vswp        q12, q14
0973     b       0b
0974 ENDPROC(__xts_prepare8)
0975
/*
 * __xts_crypt - common body of aesbs_xts_encrypt and aesbs_xts_decrypt.
 *
 *   do8     8-way core to call (aesbs_encrypt8 or aesbs_decrypt8)
 *   o0-o7   Q registers in which \do8 returns blocks 0-7
 *
 * On entry r0 = out, r1 = in, r2 = rk, r3 = rounds, and ip holds the
 * 'reorder last tweak' flag set up by the entry point. After the register
 * save, r6 = blocks and r7 = iv are read from the stack, r8 = 1 - ip, and
 * r5 preserves the original stack pointer while sp is lowered to a 16-byte
 * aligned buffer of at least 128 bytes that holds the eight tweaks.
 *
 * Per batch: __xts_prepare8 loads and pre-whitens the input and saves the
 * tweaks; \do8 runs the cipher; the tweaks are reloaded into q8-q15 and
 * XORed into the results, which are then stored. Partial batches use
 * computed gotos (4 bytes per tweak load, 8 bytes per veor/vst1 pair), so
 * they occupy the last (blocks & 7) register slots.
 */
0976     .macro      __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
0977     push        {r4-r8, lr}
0978     mov     r5, sp          // preserve sp
0979     ldrd        r6, r7, [sp, #24]   // get blocks and iv args
0980     rsb     r8, ip, #1
0981     sub     ip, sp, #128        // make room for 8x tweak
0982     bic     ip, ip, #0xf        // align sp to 16 bytes
0983     mov     sp, ip
0984
0985 99: bl      __xts_prepare8
0986
0987     mov     bskey, r2
0988     mov     rounds, r3
0989     bl      \do8
0990
0991     adr     ip, 0f
0992     and     lr, r6, #7
0993     cmp     r6, #8
0994     sub     ip, ip, lr, lsl #2
0995     mov     r4, sp          // r4 = read pointer into the tweak buffer
0996     movlt       pc, ip          // computed goto if blocks < 8
0997
0998     vld1.8      {q8}, [r4, :128]!
0999     vld1.8      {q9}, [r4, :128]!
1000     vld1.8      {q10}, [r4, :128]!
1001     vld1.8      {q11}, [r4, :128]!
1002     vld1.8      {q12}, [r4, :128]!
1003     vld1.8      {q13}, [r4, :128]!
1004     vld1.8      {q14}, [r4, :128]!
1005     vld1.8      {q15}, [r4, :128]
1006
         // lr and the flags are still those set before the previous goto
1007 0:  adr     ip, 1f
1008     sub     ip, ip, lr, lsl #3
1009     movlt       pc, ip          // computed goto if blocks < 8
1010
1011     veor        \o0, \o0, q8
1012     vst1.8      {\o0}, [r0]!
1013     veor        \o1, \o1, q9
1014     vst1.8      {\o1}, [r0]!
1015     veor        \o2, \o2, q10
1016     vst1.8      {\o2}, [r0]!
1017     veor        \o3, \o3, q11
1018     vst1.8      {\o3}, [r0]!
1019     veor        \o4, \o4, q12
1020     vst1.8      {\o4}, [r0]!
1021     veor        \o5, \o5, q13
1022     vst1.8      {\o5}, [r0]!
1023     veor        \o6, \o6, q14
1024     vst1.8      {\o6}, [r0]!
1025     veor        \o7, \o7, q15
1026     vst1.8      {\o7}, [r0]!
1027
1028 1:  subs        r6, r6, #8
1029     bgt     99b
1030
1031     mov     sp, r5          // release the tweak buffer
1032     pop     {r4-r8, pc}
1033     .endm
1034
/*
 * aesbs_xts_encrypt - XTS encryption entry point. The reorder flag is
 * forced to 0 (the seventh argument is not read), then __xts_crypt is
 * instantiated with aesbs_encrypt8 and its output register order.
 */
1035 ENTRY(aesbs_xts_encrypt)
1036     mov     ip, #0          // never reorder final tweak
1037     __xts_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
1038 ENDPROC(aesbs_xts_encrypt)
1039
/*
 * aesbs_xts_decrypt - XTS decryption entry point. Loads the seventh
 * argument (reorder_last_tweak, third stack slot at [sp, #8]) into ip,
 * then instantiates __xts_crypt with aesbs_decrypt8 and its output
 * register order.
 */
1040 ENTRY(aesbs_xts_decrypt)
1041     ldr     ip, [sp, #8]        // reorder final tweak?
1042     __xts_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
1043 ENDPROC(aesbs_xts_decrypt)