0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * Bit sliced AES using NEON instructions
0004  *
0005  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
0006  */
0007 
0008 /*
0009  * The algorithm implemented here is described in detail by the paper
0011  * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Käsper and
0011  * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
0012  *
0013  * This implementation is based primarily on the OpenSSL implementation
0014  * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
0015  */
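
/*
 * In the bit sliced representation used throughout this file, eight AES
 * blocks are processed in parallel: eight 128-bit NEON registers together
 * hold the eight states, with register i collecting bit i of every state
 * byte.  A rough sketch of the packing, in illustrative C (the helper name
 * is hypothetical):
 *
 *	for (i = 0; i < 8; i++)
 *		slice[i] = collect_bit(blocks, i);	// bit i of each byte
 *
 * In this form SubBytes and MixColumns turn into fixed sequences of
 * bitwise NEON operations, so no data dependent table lookups are needed
 * and the code runs in constant time.
 */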
0016 
0017 #include <linux/linkage.h>
0018 #include <asm/assembler.h>
0019 
0020     .text
0021 
0022     rounds      .req    x11
0023     bskey       .req    x12
0024 
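    /*
     * The S-box is evaluated on bit sliced data along the lines of the
     * paper referenced above: in_bs_ch and out_bs_ch apply the input and
     * output basis changes (the linear parts of the S-box), and inv_gf256
     * in between computes the inverse in GF(2^8) via a tower field
     * decomposition built from the GF(2^4)/GF(2^2) multiplications below.
     * The inv_* variants implement the corresponding steps of the inverse
     * S-box.  The additive S-box constant is not applied here; judging by
     * aesbs_convert_key() below, it is absorbed into the converted key
     * schedule (the inverted slices and the final 0x63 xor).
     */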
0025     .macro      in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
0026     eor     \b2, \b2, \b1
0027     eor     \b5, \b5, \b6
0028     eor     \b3, \b3, \b0
0029     eor     \b6, \b6, \b2
0030     eor     \b5, \b5, \b0
0031     eor     \b6, \b6, \b3
0032     eor     \b3, \b3, \b7
0033     eor     \b7, \b7, \b5
0034     eor     \b3, \b3, \b4
0035     eor     \b4, \b4, \b5
0036     eor     \b2, \b2, \b7
0037     eor     \b3, \b3, \b1
0038     eor     \b1, \b1, \b5
0039     .endm
0040 
0041     .macro      out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
0042     eor     \b0, \b0, \b6
0043     eor     \b1, \b1, \b4
0044     eor     \b4, \b4, \b6
0045     eor     \b2, \b2, \b0
0046     eor     \b6, \b6, \b1
0047     eor     \b1, \b1, \b5
0048     eor     \b5, \b5, \b3
0049     eor     \b3, \b3, \b7
0050     eor     \b7, \b7, \b5
0051     eor     \b2, \b2, \b5
0052     eor     \b4, \b4, \b7
0053     .endm
0054 
0055     .macro      inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
0056     eor     \b1, \b1, \b7
0057     eor     \b4, \b4, \b7
0058     eor     \b7, \b7, \b5
0059     eor     \b1, \b1, \b3
0060     eor     \b2, \b2, \b5
0061     eor     \b3, \b3, \b7
0062     eor     \b6, \b6, \b1
0063     eor     \b2, \b2, \b0
0064     eor     \b5, \b5, \b3
0065     eor     \b4, \b4, \b6
0066     eor     \b0, \b0, \b6
0067     eor     \b1, \b1, \b4
0068     .endm
0069 
0070     .macro      inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
0071     eor     \b1, \b1, \b5
0072     eor     \b2, \b2, \b7
0073     eor     \b3, \b3, \b1
0074     eor     \b4, \b4, \b5
0075     eor     \b7, \b7, \b5
0076     eor     \b3, \b3, \b4
0077     eor         \b5, \b5, \b0
0078     eor     \b3, \b3, \b7
0079     eor     \b6, \b6, \b2
0080     eor     \b2, \b2, \b1
0081     eor     \b6, \b6, \b3
0082     eor     \b3, \b3, \b0
0083     eor     \b5, \b5, \b6
0084     .endm
0085 
0086     .macro      mul_gf4, x0, x1, y0, y1, t0, t1
0087     eor         \t0, \y0, \y1
0088     and     \t0, \t0, \x0
0089     eor     \x0, \x0, \x1
0090     and     \t1, \x1, \y0
0091     and     \x0, \x0, \y1
0092     eor     \x1, \t1, \t0
0093     eor     \x0, \x0, \t1
0094     .endm
0095 
0096     .macro      mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
0097     eor     \t0, \y0, \y1
0098     eor         \t1, \y2, \y3
0099     and     \t0, \t0, \x0
0100     and     \t1, \t1, \x2
0101     eor     \x0, \x0, \x1
0102     eor     \x2, \x2, \x3
0103     and     \x1, \x1, \y0
0104     and     \x3, \x3, \y2
0105     and     \x0, \x0, \y1
0106     and     \x2, \x2, \y3
0107     eor     \x1, \x1, \x0
0108     eor     \x2, \x2, \x3
0109     eor     \x0, \x0, \t0
0110     eor     \x3, \x3, \t1
0111     .endm
0112 
0113     .macro      mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
0114                     y0, y1, y2, y3, t0, t1, t2, t3
0115     eor     \t0, \x0, \x2
0116     eor     \t1, \x1, \x3
0117     mul_gf4     \x0, \x1, \y0, \y1, \t2, \t3
0118     eor     \y0, \y0, \y2
0119     eor     \y1, \y1, \y3
0120     mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
0121     eor     \x0, \x0, \t0
0122     eor     \x2, \x2, \t0
0123     eor     \x1, \x1, \t1
0124     eor     \x3, \x3, \t1
0125     eor     \t0, \x4, \x6
0126     eor     \t1, \x5, \x7
0127     mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
0128     eor     \y0, \y0, \y2
0129     eor     \y1, \y1, \y3
0130     mul_gf4     \x4, \x5, \y0, \y1, \t2, \t3
0131     eor     \x4, \x4, \t0
0132     eor     \x6, \x6, \t0
0133     eor     \x5, \x5, \t1
0134     eor     \x7, \x7, \t1
0135     .endm
0136 
0137     .macro      inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
0138                    t0, t1, t2, t3, s0, s1, s2, s3
0139     eor     \t3, \x4, \x6
0140     eor     \t0, \x5, \x7
0141     eor     \t1, \x1, \x3
0142     eor     \s1, \x7, \x6
0143     eor     \s0, \x0, \x2
0144     eor     \s3, \t3, \t0
0145     orr     \t2, \t0, \t1
0146     and     \s2, \t3, \s0
0147     orr     \t3, \t3, \s0
0148     eor     \s0, \s0, \t1
0149     and     \t0, \t0, \t1
0150     eor     \t1, \x3, \x2
0151     and     \s3, \s3, \s0
0152     and     \s1, \s1, \t1
0153     eor     \t1, \x4, \x5
0154     eor     \s0, \x1, \x0
0155     eor     \t3, \t3, \s1
0156     eor     \t2, \t2, \s1
0157     and     \s1, \t1, \s0
0158     orr     \t1, \t1, \s0
0159     eor     \t3, \t3, \s3
0160     eor     \t0, \t0, \s1
0161     eor     \t2, \t2, \s2
0162     eor     \t1, \t1, \s3
0163     eor     \t0, \t0, \s2
0164     and     \s0, \x7, \x3
0165     eor     \t1, \t1, \s2
0166     and     \s1, \x6, \x2
0167     and     \s2, \x5, \x1
0168     orr     \s3, \x4, \x0
0169     eor     \t3, \t3, \s0
0170     eor     \t1, \t1, \s2
0171     eor     \s0, \t0, \s3
0172     eor     \t2, \t2, \s1
0173     and     \s2, \t3, \t1
0174     eor     \s1, \t2, \s2
0175     eor     \s3, \s0, \s2
0176     bsl     \s1, \t1, \s0
0177     not     \t0, \s0
0178     bsl     \s0, \s1, \s3
0179     bsl     \t0, \s1, \s3
0180     bsl     \s3, \t3, \t2
0181     eor     \t3, \t3, \t2
0182     and     \s2, \s0, \s3
0183     eor     \t1, \t1, \t0
0184     eor     \s2, \s2, \t3
0185     mul_gf16_2  \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
0186             \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
0187     .endm
0188 
0189     .macro      sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
0190                   t0, t1, t2, t3, s0, s1, s2, s3
0191     in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
0192             \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
0193     inv_gf256   \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
0194             \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
0195             \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
0196             \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
0197     out_bs_ch   \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
0198             \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
0199     .endm
0200 
0201     .macro      inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
0202                   t0, t1, t2, t3, s0, s1, s2, s3
0203     inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
0204             \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
0205     inv_gf256   \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
0206             \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
0207             \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
0208             \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
0209     inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
0210             \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
0211     .endm
0212 
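    /*
     * Load the next eight bit sliced round key slices (128 bytes) into
     * v16-v23: enc_next_rk walks the converted key schedule forwards in
     * 128 byte steps, dec_next_rk walks it backwards from the end.
     */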
0213     .macro      enc_next_rk
0214     ldp     q16, q17, [bskey], #128
0215     ldp     q18, q19, [bskey, #-96]
0216     ldp     q20, q21, [bskey, #-64]
0217     ldp     q22, q23, [bskey, #-32]
0218     .endm
0219 
0220     .macro      dec_next_rk
0221     ldp     q16, q17, [bskey, #-128]!
0222     ldp     q18, q19, [bskey, #32]
0223     ldp     q20, q21, [bskey, #64]
0224     ldp     q22, q23, [bskey, #96]
0225     .endm
0226 
0227     .macro      add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
0228     eor     \x0\().16b, \x0\().16b, v16.16b
0229     eor     \x1\().16b, \x1\().16b, v17.16b
0230     eor     \x2\().16b, \x2\().16b, v18.16b
0231     eor     \x3\().16b, \x3\().16b, v19.16b
0232     eor     \x4\().16b, \x4\().16b, v20.16b
0233     eor     \x5\().16b, \x5\().16b, v21.16b
0234     eor     \x6\().16b, \x6\().16b, v22.16b
0235     eor     \x7\().16b, \x7\().16b, v23.16b
0236     .endm
0237 
0238     .macro      shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
0239     tbl     \x0\().16b, {\x0\().16b}, \mask\().16b
0240     tbl     \x1\().16b, {\x1\().16b}, \mask\().16b
0241     tbl     \x2\().16b, {\x2\().16b}, \mask\().16b
0242     tbl     \x3\().16b, {\x3\().16b}, \mask\().16b
0243     tbl     \x4\().16b, {\x4\().16b}, \mask\().16b
0244     tbl     \x5\().16b, {\x5\().16b}, \mask\().16b
0245     tbl     \x6\().16b, {\x6\().16b}, \mask\().16b
0246     tbl     \x7\().16b, {\x7\().16b}, \mask\().16b
0247     .endm
0248 
0249     .macro      mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
0250                   t0, t1, t2, t3, t4, t5, t6, t7, inv
0251     ext     \t0\().16b, \x0\().16b, \x0\().16b, #12
0252     ext     \t1\().16b, \x1\().16b, \x1\().16b, #12
0253     eor     \x0\().16b, \x0\().16b, \t0\().16b
0254     ext     \t2\().16b, \x2\().16b, \x2\().16b, #12
0255     eor     \x1\().16b, \x1\().16b, \t1\().16b
0256     ext     \t3\().16b, \x3\().16b, \x3\().16b, #12
0257     eor     \x2\().16b, \x2\().16b, \t2\().16b
0258     ext     \t4\().16b, \x4\().16b, \x4\().16b, #12
0259     eor     \x3\().16b, \x3\().16b, \t3\().16b
0260     ext     \t5\().16b, \x5\().16b, \x5\().16b, #12
0261     eor     \x4\().16b, \x4\().16b, \t4\().16b
0262     ext     \t6\().16b, \x6\().16b, \x6\().16b, #12
0263     eor     \x5\().16b, \x5\().16b, \t5\().16b
0264     ext     \t7\().16b, \x7\().16b, \x7\().16b, #12
0265     eor     \x6\().16b, \x6\().16b, \t6\().16b
0266     eor     \t1\().16b, \t1\().16b, \x0\().16b
0267     eor     \x7\().16b, \x7\().16b, \t7\().16b
0268     ext     \x0\().16b, \x0\().16b, \x0\().16b, #8
0269     eor     \t2\().16b, \t2\().16b, \x1\().16b
0270     eor     \t0\().16b, \t0\().16b, \x7\().16b
0271     eor     \t1\().16b, \t1\().16b, \x7\().16b
0272     ext     \x1\().16b, \x1\().16b, \x1\().16b, #8
0273     eor     \t5\().16b, \t5\().16b, \x4\().16b
0274     eor     \x0\().16b, \x0\().16b, \t0\().16b
0275     eor     \t6\().16b, \t6\().16b, \x5\().16b
0276     eor     \x1\().16b, \x1\().16b, \t1\().16b
0277     ext     \t0\().16b, \x4\().16b, \x4\().16b, #8
0278     eor     \t4\().16b, \t4\().16b, \x3\().16b
0279     ext     \t1\().16b, \x5\().16b, \x5\().16b, #8
0280     eor     \t7\().16b, \t7\().16b, \x6\().16b
0281     ext     \x4\().16b, \x3\().16b, \x3\().16b, #8
0282     eor     \t3\().16b, \t3\().16b, \x2\().16b
0283     ext     \x5\().16b, \x7\().16b, \x7\().16b, #8
0284     eor     \t4\().16b, \t4\().16b, \x7\().16b
0285     ext     \x3\().16b, \x6\().16b, \x6\().16b, #8
0286     eor     \t3\().16b, \t3\().16b, \x7\().16b
0287     ext     \x6\().16b, \x2\().16b, \x2\().16b, #8
0288     eor     \x7\().16b, \t1\().16b, \t5\().16b
0289     .ifb        \inv
0290     eor     \x2\().16b, \t0\().16b, \t4\().16b
0291     eor     \x4\().16b, \x4\().16b, \t3\().16b
0292     eor     \x5\().16b, \x5\().16b, \t7\().16b
0293     eor     \x3\().16b, \x3\().16b, \t6\().16b
0294     eor     \x6\().16b, \x6\().16b, \t2\().16b
0295     .else
0296     eor     \t3\().16b, \t3\().16b, \x4\().16b
0297     eor     \x5\().16b, \x5\().16b, \t7\().16b
0298     eor     \x2\().16b, \x3\().16b, \t6\().16b
0299     eor     \x3\().16b, \t0\().16b, \t4\().16b
0300     eor     \x4\().16b, \x6\().16b, \t2\().16b
0301     mov     \x6\().16b, \t3\().16b
0302     .endif
0303     .endm
0304 
0305     .macro      inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
0306                       t0, t1, t2, t3, t4, t5, t6, t7
0307     ext     \t0\().16b, \x0\().16b, \x0\().16b, #8
0308     ext     \t6\().16b, \x6\().16b, \x6\().16b, #8
0309     ext     \t7\().16b, \x7\().16b, \x7\().16b, #8
0310     eor     \t0\().16b, \t0\().16b, \x0\().16b
0311     ext     \t1\().16b, \x1\().16b, \x1\().16b, #8
0312     eor     \t6\().16b, \t6\().16b, \x6\().16b
0313     ext     \t2\().16b, \x2\().16b, \x2\().16b, #8
0314     eor     \t7\().16b, \t7\().16b, \x7\().16b
0315     ext     \t3\().16b, \x3\().16b, \x3\().16b, #8
0316     eor     \t1\().16b, \t1\().16b, \x1\().16b
0317     ext     \t4\().16b, \x4\().16b, \x4\().16b, #8
0318     eor     \t2\().16b, \t2\().16b, \x2\().16b
0319     ext     \t5\().16b, \x5\().16b, \x5\().16b, #8
0320     eor     \t3\().16b, \t3\().16b, \x3\().16b
0321     eor     \t4\().16b, \t4\().16b, \x4\().16b
0322     eor     \t5\().16b, \t5\().16b, \x5\().16b
0323     eor     \x0\().16b, \x0\().16b, \t6\().16b
0324     eor     \x1\().16b, \x1\().16b, \t6\().16b
0325     eor     \x2\().16b, \x2\().16b, \t0\().16b
0326     eor     \x4\().16b, \x4\().16b, \t2\().16b
0327     eor     \x3\().16b, \x3\().16b, \t1\().16b
0328     eor     \x1\().16b, \x1\().16b, \t7\().16b
0329     eor     \x2\().16b, \x2\().16b, \t7\().16b
0330     eor     \x4\().16b, \x4\().16b, \t6\().16b
0331     eor     \x5\().16b, \x5\().16b, \t3\().16b
0332     eor     \x3\().16b, \x3\().16b, \t6\().16b
0333     eor     \x6\().16b, \x6\().16b, \t4\().16b
0334     eor     \x4\().16b, \x4\().16b, \t7\().16b
0335     eor     \x5\().16b, \x5\().16b, \t7\().16b
0336     eor     \x7\().16b, \x7\().16b, \t5\().16b
0337     mix_cols    \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
0338             \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
0339     .endm
0340 
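    /*
     * Swap the bits of \a0/\a1 selected by \mask with the bits of \b0/\b1
     * sitting \n positions higher, for two register pairs at once.  Per
     * 64-bit lane this is the classic swap-move step; roughly, in C terms:
     *
     *	t  = ((b >> n) ^ a) & mask;
     *	a ^= t;
     *	b ^= t << n;
     *
     * Three passes with n = 1, 2, 4 (see the bitslice macro below)
     * transpose an 8x8 bit matrix, which converts between the byte
     * oriented and the bit sliced representation of the state.
     */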
0341     .macro      swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
0342     ushr        \t0\().2d, \b0\().2d, #\n
0343     ushr        \t1\().2d, \b1\().2d, #\n
0344     eor     \t0\().16b, \t0\().16b, \a0\().16b
0345     eor     \t1\().16b, \t1\().16b, \a1\().16b
0346     and     \t0\().16b, \t0\().16b, \mask\().16b
0347     and     \t1\().16b, \t1\().16b, \mask\().16b
0348     eor     \a0\().16b, \a0\().16b, \t0\().16b
0349     shl     \t0\().2d, \t0\().2d, #\n
0350     eor     \a1\().16b, \a1\().16b, \t1\().16b
0351     shl     \t1\().2d, \t1\().2d, #\n
0352     eor     \b0\().16b, \b0\().16b, \t0\().16b
0353     eor     \b1\().16b, \b1\().16b, \t1\().16b
0354     .endm
0355 
0356     .macro      bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
0357     movi        \t0\().16b, #0x55
0358     movi        \t1\().16b, #0x33
0359     swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
0360     swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
0361     movi        \t0\().16b, #0x0f
0362     swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
0363     swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
0364     swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
0365     swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
0366     .endm
0367 
0368 
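/*
 * Byte permutation vectors for the tbl instructions used below.  Going by
 * how they are used, M0 is the byte interleave applied when converting
 * keys and data into the bit sliced domain, SR and ISR encode ShiftRows
 * and InvShiftRows, and the combined variants (M0SR, SRM0, M0ISR, ISRM0)
 * fold the interleave and the row shifts of the first or last round into
 * a single tbl per register.
 */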
0369     .align      6
0370 M0: .octa       0x0004080c0105090d02060a0e03070b0f
0371 
0372 M0SR:   .octa       0x0004080c05090d010a0e02060f03070b
0373 SR: .octa       0x0f0e0d0c0a09080b0504070600030201
0374 SRM0:   .octa       0x01060b0c0207080d0304090e00050a0f
0375 
0376 M0ISR:  .octa       0x0004080c0d0105090a0e0206070b0f03
0377 ISR:    .octa       0x0f0e0d0c080b0a090504070602010003
0378 ISRM0:  .octa       0x0306090c00070a0d01040b0e0205080f
0379 
0380     /*
0381      * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
0382      */
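    /*
     * Layout of the converted key, as produced by the stores below
     * (summary only):
     *
     *	offset 0			round 0 key, 16 bytes, copied as-is
     *	offset 16			(rounds - 1) bit sliced round keys,
     *					8 slices x 16 bytes = 128 bytes each
     *	offset 16 + (rounds - 1) * 128	final round key, 16 bytes, stored
     *					with every byte xored with 0x63
     */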
0383 SYM_FUNC_START(aesbs_convert_key)
0384     ld1     {v7.4s}, [x1], #16      // load round 0 key
0385     ld1     {v17.4s}, [x1], #16     // load round 1 key
0386 
0387     movi        v8.16b,  #0x01          // bit masks
0388     movi        v9.16b,  #0x02
0389     movi        v10.16b, #0x04
0390     movi        v11.16b, #0x08
0391     movi        v12.16b, #0x10
0392     movi        v13.16b, #0x20
0393     movi        v14.16b, #0x40
0394     movi        v15.16b, #0x80
0395     ldr     q16, M0
0396 
0397     sub     x2, x2, #1
0398     str     q7, [x0], #16       // save round 0 key
0399 
0400 .Lkey_loop:
0401     tbl     v7.16b, {v17.16b}, v16.16b
0402     ld1     {v17.4s}, [x1], #16     // load next round key
0403 
0404     cmtst       v0.16b, v7.16b, v8.16b
0405     cmtst       v1.16b, v7.16b, v9.16b
0406     cmtst       v2.16b, v7.16b, v10.16b
0407     cmtst       v3.16b, v7.16b, v11.16b
0408     cmtst       v4.16b, v7.16b, v12.16b
0409     cmtst       v5.16b, v7.16b, v13.16b
0410     cmtst       v6.16b, v7.16b, v14.16b
0411     cmtst       v7.16b, v7.16b, v15.16b
0412     not     v0.16b, v0.16b
0413     not     v1.16b, v1.16b
0414     not     v5.16b, v5.16b
0415     not     v6.16b, v6.16b
0416 
0417     subs        x2, x2, #1
0418     stp     q0, q1, [x0], #128
0419     stp     q2, q3, [x0, #-96]
0420     stp     q4, q5, [x0, #-64]
0421     stp     q6, q7, [x0, #-32]
0422     b.ne        .Lkey_loop
0423 
0424     movi        v7.16b, #0x63           // compose .L63
0425     eor     v17.16b, v17.16b, v7.16b
0426     str     q17, [x0]
0427     ret
0428 SYM_FUNC_END(aesbs_convert_key)
0429 
0430     .align      4
0431 SYM_FUNC_START_LOCAL(aesbs_encrypt8)
0432     ldr     q9, [bskey], #16        // round 0 key
0433     ldr     q8, M0SR
0434     ldr     q24, SR
0435 
0436     eor     v10.16b, v0.16b, v9.16b     // xor with round0 key
0437     eor     v11.16b, v1.16b, v9.16b
0438     tbl     v0.16b, {v10.16b}, v8.16b
0439     eor     v12.16b, v2.16b, v9.16b
0440     tbl     v1.16b, {v11.16b}, v8.16b
0441     eor     v13.16b, v3.16b, v9.16b
0442     tbl     v2.16b, {v12.16b}, v8.16b
0443     eor     v14.16b, v4.16b, v9.16b
0444     tbl     v3.16b, {v13.16b}, v8.16b
0445     eor     v15.16b, v5.16b, v9.16b
0446     tbl     v4.16b, {v14.16b}, v8.16b
0447     eor     v10.16b, v6.16b, v9.16b
0448     tbl     v5.16b, {v15.16b}, v8.16b
0449     eor     v11.16b, v7.16b, v9.16b
0450     tbl     v6.16b, {v10.16b}, v8.16b
0451     tbl     v7.16b, {v11.16b}, v8.16b
0452 
0453     bitslice    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
0454 
0455     sub     rounds, rounds, #1
0456     b       .Lenc_sbox
0457 
0458 .Lenc_loop:
0459     shift_rows  v0, v1, v2, v3, v4, v5, v6, v7, v24
0460 .Lenc_sbox:
0461     sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
0462                                 v13, v14, v15
0463     subs        rounds, rounds, #1
0464     b.cc        .Lenc_done
0465 
0466     enc_next_rk
0467 
0468     mix_cols    v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
0469                                 v13, v14, v15
0470 
0471     add_round_key   v0, v1, v2, v3, v4, v5, v6, v7
0472 
0473     b.ne        .Lenc_loop
0474     ldr     q24, SRM0
0475     b       .Lenc_loop
0476 
0477 .Lenc_done:
0478     ldr     q12, [bskey]            // last round key
0479 
0480     bitslice    v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
0481 
0482     eor     v0.16b, v0.16b, v12.16b
0483     eor     v1.16b, v1.16b, v12.16b
0484     eor     v4.16b, v4.16b, v12.16b
0485     eor     v6.16b, v6.16b, v12.16b
0486     eor     v3.16b, v3.16b, v12.16b
0487     eor     v7.16b, v7.16b, v12.16b
0488     eor     v2.16b, v2.16b, v12.16b
0489     eor     v5.16b, v5.16b, v12.16b
0490     ret
0491 SYM_FUNC_END(aesbs_encrypt8)
0492 
0493     .align      4
0494 SYM_FUNC_START_LOCAL(aesbs_decrypt8)
0495     lsl     x9, rounds, #7
0496     add     bskey, bskey, x9
0497 
0498     ldr     q9, [bskey, #-112]!     // round 0 key
0499     ldr     q8, M0ISR
0500     ldr     q24, ISR
0501 
0502     eor     v10.16b, v0.16b, v9.16b     // xor with round0 key
0503     eor     v11.16b, v1.16b, v9.16b
0504     tbl     v0.16b, {v10.16b}, v8.16b
0505     eor     v12.16b, v2.16b, v9.16b
0506     tbl     v1.16b, {v11.16b}, v8.16b
0507     eor     v13.16b, v3.16b, v9.16b
0508     tbl     v2.16b, {v12.16b}, v8.16b
0509     eor     v14.16b, v4.16b, v9.16b
0510     tbl     v3.16b, {v13.16b}, v8.16b
0511     eor     v15.16b, v5.16b, v9.16b
0512     tbl     v4.16b, {v14.16b}, v8.16b
0513     eor     v10.16b, v6.16b, v9.16b
0514     tbl     v5.16b, {v15.16b}, v8.16b
0515     eor     v11.16b, v7.16b, v9.16b
0516     tbl     v6.16b, {v10.16b}, v8.16b
0517     tbl     v7.16b, {v11.16b}, v8.16b
0518 
0519     bitslice    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
0520 
0521     sub     rounds, rounds, #1
0522     b       .Ldec_sbox
0523 
0524 .Ldec_loop:
0525     shift_rows  v0, v1, v2, v3, v4, v5, v6, v7, v24
0526 .Ldec_sbox:
0527     inv_sbox    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
0528                                 v13, v14, v15
0529     subs        rounds, rounds, #1
0530     b.cc        .Ldec_done
0531 
0532     dec_next_rk
0533 
0534     add_round_key   v0, v1, v6, v4, v2, v7, v3, v5
0535 
0536     inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
0537                                 v13, v14, v15
0538 
0539     b.ne        .Ldec_loop
0540     ldr     q24, ISRM0
0541     b       .Ldec_loop
0542 .Ldec_done:
0543     ldr     q12, [bskey, #-16]      // last round key
0544 
0545     bitslice    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
0546 
0547     eor     v0.16b, v0.16b, v12.16b
0548     eor     v1.16b, v1.16b, v12.16b
0549     eor     v6.16b, v6.16b, v12.16b
0550     eor     v4.16b, v4.16b, v12.16b
0551     eor     v2.16b, v2.16b, v12.16b
0552     eor     v7.16b, v7.16b, v12.16b
0553     eor     v3.16b, v3.16b, v12.16b
0554     eor     v5.16b, v5.16b, v12.16b
0555     ret
0556 SYM_FUNC_END(aesbs_decrypt8)
0557 
0558     /*
0559      * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0560      *           int blocks)
0561      * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0562      *           int blocks)
0563      */
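    /*
     * A minimal usage sketch, assuming the round key buffer was produced
     * by aesbs_convert_key() above (buffer and variable names here are
     * hypothetical):
     *
     *	u8 bskey[16 + 13 * 128 + 16];	// large enough for rounds == 14
     *
     *	aesbs_convert_key(bskey, expanded_key, rounds);
     *	aesbs_ecb_encrypt(dst, src, bskey, rounds, blocks);
     *
     * __ecb_crypt loops internally, consuming up to eight 16 byte blocks
     * per pass until 'blocks' blocks have been processed.
     */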
0564     .macro      __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
0565     frame_push  5
0566 
0567     mov     x19, x0
0568     mov     x20, x1
0569     mov     x21, x2
0570     mov     x22, x3
0571     mov     x23, x4
0572 
0573 99: mov     x5, #1
0574     lsl     x5, x5, x23
0575     subs        w23, w23, #8
0576     csel        x23, x23, xzr, pl
0577     csel        x5, x5, xzr, mi
0578 
0579     ld1     {v0.16b}, [x20], #16
0580     tbnz        x5, #1, 0f
0581     ld1     {v1.16b}, [x20], #16
0582     tbnz        x5, #2, 0f
0583     ld1     {v2.16b}, [x20], #16
0584     tbnz        x5, #3, 0f
0585     ld1     {v3.16b}, [x20], #16
0586     tbnz        x5, #4, 0f
0587     ld1     {v4.16b}, [x20], #16
0588     tbnz        x5, #5, 0f
0589     ld1     {v5.16b}, [x20], #16
0590     tbnz        x5, #6, 0f
0591     ld1     {v6.16b}, [x20], #16
0592     tbnz        x5, #7, 0f
0593     ld1     {v7.16b}, [x20], #16
0594 
0595 0:  mov     bskey, x21
0596     mov     rounds, x22
0597     bl      \do8
0598 
0599     st1     {\o0\().16b}, [x19], #16
0600     tbnz        x5, #1, 1f
0601     st1     {\o1\().16b}, [x19], #16
0602     tbnz        x5, #2, 1f
0603     st1     {\o2\().16b}, [x19], #16
0604     tbnz        x5, #3, 1f
0605     st1     {\o3\().16b}, [x19], #16
0606     tbnz        x5, #4, 1f
0607     st1     {\o4\().16b}, [x19], #16
0608     tbnz        x5, #5, 1f
0609     st1     {\o5\().16b}, [x19], #16
0610     tbnz        x5, #6, 1f
0611     st1     {\o6\().16b}, [x19], #16
0612     tbnz        x5, #7, 1f
0613     st1     {\o7\().16b}, [x19], #16
0614 
0615     cbz     x23, 1f
0616     b       99b
0617 
0618 1:  frame_pop
0619     ret
0620     .endm
0621 
0622     .align      4
0623 SYM_FUNC_START(aesbs_ecb_encrypt)
0624     __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
0625 SYM_FUNC_END(aesbs_ecb_encrypt)
0626 
0627     .align      4
0628 SYM_FUNC_START(aesbs_ecb_decrypt)
0629     __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
0630 SYM_FUNC_END(aesbs_ecb_decrypt)
0631 
0632     /*
0633      * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0634      *           int blocks, u8 iv[])
0635      */
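    /*
     * Works like the ECB decrypt path above, except that the ciphertext
     * blocks are saved (in v25-v31, the last one re-read from memory)
     * before the call to aesbs_decrypt8, so that every decrypted block
     * can be xored with the preceding ciphertext block, or with the IV
     * for the first block of the call:
     *
     *	P[i] = Decrypt(K, C[i]) ^ C[i-1],	with C[-1] taken from iv[]
     *
     * On return, iv[] holds the last ciphertext block consumed, so that
     * chaining continues correctly across calls.
     */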
0636     .align      4
0637 SYM_FUNC_START(aesbs_cbc_decrypt)
0638     frame_push  6
0639 
0640     mov     x19, x0
0641     mov     x20, x1
0642     mov     x21, x2
0643     mov     x22, x3
0644     mov     x23, x4
0645     mov     x24, x5
0646 
0647 99: mov     x6, #1
0648     lsl     x6, x6, x23
0649     subs        w23, w23, #8
0650     csel        x23, x23, xzr, pl
0651     csel        x6, x6, xzr, mi
0652 
0653     ld1     {v0.16b}, [x20], #16
0654     mov     v25.16b, v0.16b
0655     tbnz        x6, #1, 0f
0656     ld1     {v1.16b}, [x20], #16
0657     mov     v26.16b, v1.16b
0658     tbnz        x6, #2, 0f
0659     ld1     {v2.16b}, [x20], #16
0660     mov     v27.16b, v2.16b
0661     tbnz        x6, #3, 0f
0662     ld1     {v3.16b}, [x20], #16
0663     mov     v28.16b, v3.16b
0664     tbnz        x6, #4, 0f
0665     ld1     {v4.16b}, [x20], #16
0666     mov     v29.16b, v4.16b
0667     tbnz        x6, #5, 0f
0668     ld1     {v5.16b}, [x20], #16
0669     mov     v30.16b, v5.16b
0670     tbnz        x6, #6, 0f
0671     ld1     {v6.16b}, [x20], #16
0672     mov     v31.16b, v6.16b
0673     tbnz        x6, #7, 0f
0674     ld1     {v7.16b}, [x20]
0675 
0676 0:  mov     bskey, x21
0677     mov     rounds, x22
0678     bl      aesbs_decrypt8
0679 
0680     ld1     {v24.16b}, [x24]        // load IV
0681 
0682     eor     v1.16b, v1.16b, v25.16b
0683     eor     v6.16b, v6.16b, v26.16b
0684     eor     v4.16b, v4.16b, v27.16b
0685     eor     v2.16b, v2.16b, v28.16b
0686     eor     v7.16b, v7.16b, v29.16b
0687     eor     v0.16b, v0.16b, v24.16b
0688     eor     v3.16b, v3.16b, v30.16b
0689     eor     v5.16b, v5.16b, v31.16b
0690 
0691     st1     {v0.16b}, [x19], #16
0692     mov     v24.16b, v25.16b
0693     tbnz        x6, #1, 1f
0694     st1     {v1.16b}, [x19], #16
0695     mov     v24.16b, v26.16b
0696     tbnz        x6, #2, 1f
0697     st1     {v6.16b}, [x19], #16
0698     mov     v24.16b, v27.16b
0699     tbnz        x6, #3, 1f
0700     st1     {v4.16b}, [x19], #16
0701     mov     v24.16b, v28.16b
0702     tbnz        x6, #4, 1f
0703     st1     {v2.16b}, [x19], #16
0704     mov     v24.16b, v29.16b
0705     tbnz        x6, #5, 1f
0706     st1     {v7.16b}, [x19], #16
0707     mov     v24.16b, v30.16b
0708     tbnz        x6, #6, 1f
0709     st1     {v3.16b}, [x19], #16
0710     mov     v24.16b, v31.16b
0711     tbnz        x6, #7, 1f
0712     ld1     {v24.16b}, [x20], #16
0713     st1     {v5.16b}, [x19], #16
0714 1:  st1     {v24.16b}, [x24]        // store IV
0715 
0716     cbz     x23, 2f
0717     b       99b
0718 
0719 2:  frame_pop
0720     ret
0721 SYM_FUNC_END(aesbs_cbc_decrypt)
0722 
0723     .macro      next_tweak, out, in, const, tmp
0724     sshr        \tmp\().2d,  \in\().2d,   #63
0725     and     \tmp\().16b, \tmp\().16b, \const\().16b
0726     add     \out\().2d,  \in\().2d,   \in\().2d
0727     ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
0728     eor     \out\().16b, \out\().16b, \tmp\().16b
0729     .endm
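
    /*
     * Multiply the 128-bit tweak in \in by x in GF(2^128): shift it left
     * by one bit and, if the top bit fell off, xor in the low order terms
     * of the reduction polynomial x^128 + x^7 + x^2 + x + 1 (i.e. 0x87).
     * \const is expected to hold { 0x1, 0x87 } in its two 64-bit lanes,
     * as set up by __xts_crypt8 below; the ext/eor pair routes the carry
     * out of the low lane into the high lane.  Roughly, in C terms:
     *
     *	carry = in >> 127;
     *	out   = (in << 1) ^ (carry ? 0x87 : 0);
     */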
0730 
0731     /*
0732      * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0733      *           int blocks, u8 iv[])
0734      * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
0735      *           int blocks, u8 iv[])
0736      */
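    /*
     * iv[] is expected to hold the first tweak, i.e. the IV presumably
     * already encrypted with the tweak key by the C glue; the tweaks for
     * the remaining blocks are derived from it using next_tweak above.
     * Note that __xts_crypt below always loads and stores full batches of
     * eight blocks, so 'blocks' is evidently meant to be a multiple of
     * eight here.
     */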
0737 SYM_FUNC_START_LOCAL(__xts_crypt8)
0738     movi        v18.2s, #0x1
0739     movi        v19.2s, #0x87
0740     uzp1        v18.4s, v18.4s, v19.4s
0741 
0742     ld1     {v0.16b-v3.16b}, [x1], #64
0743     ld1     {v4.16b-v7.16b}, [x1], #64
0744 
0745     next_tweak  v26, v25, v18, v19
0746     next_tweak  v27, v26, v18, v19
0747     next_tweak  v28, v27, v18, v19
0748     next_tweak  v29, v28, v18, v19
0749     next_tweak  v30, v29, v18, v19
0750     next_tweak  v31, v30, v18, v19
0751     next_tweak  v16, v31, v18, v19
0752     next_tweak  v17, v16, v18, v19
0753 
0754     eor     v0.16b, v0.16b, v25.16b
0755     eor     v1.16b, v1.16b, v26.16b
0756     eor     v2.16b, v2.16b, v27.16b
0757     eor     v3.16b, v3.16b, v28.16b
0758     eor     v4.16b, v4.16b, v29.16b
0759     eor     v5.16b, v5.16b, v30.16b
0760     eor     v6.16b, v6.16b, v31.16b
0761     eor     v7.16b, v7.16b, v16.16b
0762 
0763     stp     q16, q17, [sp, #16]
0764 
0765     mov     bskey, x2
0766     mov     rounds, x3
0767     br      x16
0768 SYM_FUNC_END(__xts_crypt8)
0769 
0770     .macro      __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
0771     stp     x29, x30, [sp, #-48]!
0772     mov     x29, sp
0773 
0774     ld1     {v25.16b}, [x5]
0775 
0776 0:  adr     x16, \do8
0777     bl      __xts_crypt8
0778 
0779     eor     v16.16b, \o0\().16b, v25.16b
0780     eor     v17.16b, \o1\().16b, v26.16b
0781     eor     v18.16b, \o2\().16b, v27.16b
0782     eor     v19.16b, \o3\().16b, v28.16b
0783 
0784     ldp     q24, q25, [sp, #16]
0785 
0786     eor     v20.16b, \o4\().16b, v29.16b
0787     eor     v21.16b, \o5\().16b, v30.16b
0788     eor     v22.16b, \o6\().16b, v31.16b
0789     eor     v23.16b, \o7\().16b, v24.16b
0790 
0791     st1     {v16.16b-v19.16b}, [x0], #64
0792     st1     {v20.16b-v23.16b}, [x0], #64
0793 
0794     subs        x4, x4, #8
0795     b.gt        0b
0796 
0797     st1     {v25.16b}, [x5]
0798     ldp     x29, x30, [sp], #48
0799     ret
0800     .endm
0801 
0802 SYM_FUNC_START(aesbs_xts_encrypt)
0803     __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
0804 SYM_FUNC_END(aesbs_xts_encrypt)
0805 
0806 SYM_FUNC_START(aesbs_xts_decrypt)
0807     __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
0808 SYM_FUNC_END(aesbs_xts_decrypt)
0809 
0810     .macro      next_ctr, v
0811     mov     \v\().d[1], x8
0812     adds        x8, x8, #1
0813     mov     \v\().d[0], x7
0814     adc     x7, x7, xzr
0815     rev64       \v\().16b, \v\().16b
0816     .endm
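
    /*
     * Emit the current counter value, kept as a 128-bit integer in x7
     * (high 64 bits) and x8 (low 64 bits), as a big endian counter block
     * in \v, then advance the integer by one with carry from x8 into x7.
     * The rev64 restores big endian byte order within each 64-bit half
     * after the values were byte swapped on load.
     */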
0817 
0818     /*
0819      * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
0820      *           int rounds, int blocks, u8 iv[])
0821      */
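    /*
     * Counter mode: every block of output is the corresponding block of
     * input xored with the encryption of a distinct counter value,
     *
     *	out[i] = in[i] ^ Encrypt(K, ctr + i)
     *
     * where ctr is the 128-bit big endian counter initially held in iv[].
     * The loop below processes eight blocks per pass and stores the next
     * counter value back to iv[] before returning.
     */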
0822 SYM_FUNC_START(aesbs_ctr_encrypt)
0823     stp     x29, x30, [sp, #-16]!
0824     mov     x29, sp
0825 
0826     ldp     x7, x8, [x5]
0827     ld1     {v0.16b}, [x5]
0828 CPU_LE( rev     x7, x7      )
0829 CPU_LE( rev     x8, x8      )
0830     adds        x8, x8, #1
0831     adc     x7, x7, xzr
0832 
0833 0:  next_ctr    v1
0834     next_ctr    v2
0835     next_ctr    v3
0836     next_ctr    v4
0837     next_ctr    v5
0838     next_ctr    v6
0839     next_ctr    v7
0840 
0841     mov     bskey, x2
0842     mov     rounds, x3
0843     bl      aesbs_encrypt8
0844 
0845     ld1     { v8.16b-v11.16b}, [x1], #64
0846     ld1     {v12.16b-v15.16b}, [x1], #64
0847 
0848     eor     v8.16b, v0.16b, v8.16b
0849     eor     v9.16b, v1.16b, v9.16b
0850     eor     v10.16b, v4.16b, v10.16b
0851     eor     v11.16b, v6.16b, v11.16b
0852     eor     v12.16b, v3.16b, v12.16b
0853     eor     v13.16b, v7.16b, v13.16b
0854     eor     v14.16b, v2.16b, v14.16b
0855     eor     v15.16b, v5.16b, v15.16b
0856 
0857     st1     { v8.16b-v11.16b}, [x0], #64
0858     st1     {v12.16b-v15.16b}, [x0], #64
0859 
0860     next_ctr    v0
0861     subs        x4, x4, #8
0862     b.gt        0b
0863 
0864     st1     {v0.16b}, [x5]
0865     ldp     x29, x30, [sp], #16
0866     ret
0867 SYM_FUNC_END(aesbs_ctr_encrypt)