Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
0004  *
0005  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
0006  */
0007 
0008 #include <linux/linkage.h>
0009 #include <asm/assembler.h>
0010 
0011     .text
0012     .arch       armv8-a
0013     .fpu        crypto-neon-fp-armv8
0014     .align      3
0015 
0016     .macro      enc_round, state, key
         /* One full (non-final) AES encryption round: "state" is updated in place. */
0017     aese.8      \state, \key        @ AddRoundKey, SubBytes, ShiftRows
0018     aesmc.8     \state, \state      @ MixColumns
0019     .endm
0020 
0021     .macro      dec_round, state, key
         /* One full (non-final) AES decryption round: "state" is updated in place. */
0022     aesd.8      \state, \key        @ AddRoundKey, InvShiftRows, InvSubBytes
0023     aesimc.8    \state, \state      @ InvMixColumns
0024     .endm
0025 
0026     .macro      enc_dround, key1, key2
         /* Two consecutive full encryption rounds on the single block in q0. */
0027     enc_round   q0, \key1
0028     enc_round   q0, \key2
0029     .endm
0030 
0031     .macro      dec_dround, key1, key2
         /* Two consecutive full decryption rounds on the single block in q0. */
0032     dec_round   q0, \key1
0033     dec_round   q0, \key2
0034     .endm
0035 
0036     .macro      enc_fround, key1, key2, key3
         /*
          * Last two encryption rounds on q0: a full round with key1, then the
          * final round with key2 (no MixColumns) and the closing xor with key3.
          */
0037     enc_round   q0, \key1
0038     aese.8      q0, \key2           @ final round: no aesmc follows
0039     veor        q0, q0, \key3       @ add the last round key
0040     .endm
0041 
0042     .macro      dec_fround, key1, key2, key3
         /*
          * Last two decryption rounds on q0: a full round with key1, then the
          * final round with key2 (no InvMixColumns) and the closing xor with key3.
          */
0043     dec_round   q0, \key1
0044     aesd.8      q0, \key2           @ final round: no aesimc follows
0045     veor        q0, q0, \key3       @ add the last round key
0046     .endm
0047 
0048     .macro      enc_dround_4x, key1, key2
         /*
          * Two full encryption rounds on each of the four blocks in q0-q3.
          * All four blocks receive the key1 round before any of them receives
          * the key2 round, so the four dependency chains stay interleaved.
          */
         .irp        blk, q0, q1, q2, q3
         enc_round   \blk, \key1
         .endr
         .irp        blk, q0, q1, q2, q3
         enc_round   \blk, \key2
         .endr
0057     .endm
0058 
0059     .macro      dec_dround_4x, key1, key2
         /*
          * Two full decryption rounds on each of the four blocks in q0-q3.
          * All four blocks receive the key1 round before any of them receives
          * the key2 round, so the four dependency chains stay interleaved.
          */
         .irp        blk, q0, q1, q2, q3
         dec_round   \blk, \key1
         .endr
         .irp        blk, q0, q1, q2, q3
         dec_round   \blk, \key2
         .endr
0067     .endm
0069 
0069     .macro      enc_fround_4x, key1, key2, key3
         /*
          * Last two encryption rounds on each of q0-q3, one stage at a time
          * across all four blocks: a full round with key1, the final round
          * with key2 (no MixColumns), then the closing xor with key3.
          */
         .irp        blk, q0, q1, q2, q3
         enc_round   \blk, \key1
         .endr
         .irp        blk, q0, q1, q2, q3
         aese.8      \blk, \key2
         .endr
         .irp        blk, q0, q1, q2, q3
         veor        \blk, \blk, \key3
         .endr
0083     .endm
0084 
0085     .macro      dec_fround_4x, key1, key2, key3
         /*
          * Last two decryption rounds on each of q0-q3, one stage at a time
          * across all four blocks: a full round with key1, the final round
          * with key2 (no InvMixColumns), then the closing xor with key3.
          */
         .irp        blk, q0, q1, q2, q3
         dec_round   \blk, \key1
         .endr
         .irp        blk, q0, q1, q2, q3
         aesd.8      \blk, \key2
         .endr
         .irp        blk, q0, q1, q2, q3
         veor        \blk, \blk, \key3
         .endr
0098     .endm
0099 
0100     .macro      do_block, dround, fround
         /*
          * Body shared by the core transform routines: runs the complete AES
          * round sequence and returns to the caller with "bx lr".
          *   dround : macro that performs two consecutive full rounds
          *   fround : macro that performs the last two rounds plus the final
          *            round key xor
          * On entry ip points at the 3rd round key, q8/q9 hold the first two
          * round keys, q14 holds the last one and r3 the number of rounds.
          * The remaining round keys are streamed alternately through q10-q11
          * and q12-q13, each pair being loaded before the rounds that consume
          * the previously loaded pair. The flags produced by the initial cmp
          * are consumed by the blo/beq further down; none of the round macros
          * in this file modify them in between.
          * Clobbers: q10-q13, ip, condition flags (plus whatever the round
          * macros write).
          */
0101     cmp     r3, #12         @ which key size?
0102     vld1.32     {q10-q11}, [ip]!
0103     \dround     q8, q9
0104     vld1.32     {q12-q13}, [ip]!
0105     \dround     q10, q11
0106     vld1.32     {q10-q11}, [ip]!
0107     \dround     q12, q13
0108     vld1.32     {q12-q13}, [ip]!
0109     \dround     q10, q11
0110     blo     0f          @ AES-128: 10 rounds
0111     vld1.32     {q10-q11}, [ip]!
0112     \dround     q12, q13
0113     beq     1f          @ AES-192: 12 rounds
0114     vld1.32     {q12-q13}, [ip]
0115     \dround     q10, q11
0116 0:  \fround     q12, q13, q14
0117     bx      lr
0118 
0119 1:  \fround     q10, q11, q14
0120     bx      lr
0121     .endm
0122 
0123     /*
0124      * Internal, non-AAPCS compliant functions that implement the core AES
0125      * transforms. These clobber q0 (q0 - q3 in the _4x versions), q10 - q13,
          * ip and the condition flags; all other registers are preserved.
0126      * Arguments:
0127      *   q0        : first in/output block
0128      *   q1        : second in/output block (_4x version only)
0129      *   q2        : third in/output block (_4x version only)
0130      *   q3        : fourth in/output block (_4x version only)
0131      *   q8        : first round key
0132      *   q9        : second round key
0133      *   q14       : final round key
0134      *   r2        : address of round key array
0135      *   r3        : number of rounds
0136      */
0137     .align      6
         /*
          * aes_encrypt - encrypt the single block held in q0, in place.
          * Expects q8, q9 and q14 to be set up as prepare_key leaves them, r2
          * to point at the round key array and r3 to hold the round count.
          * .Laes_encrypt_tweak is a second entry point for callers that have
          * already pointed ip at the 3rd round key themselves; it is used by
          * ce_aes_xts_init, which encrypts with a different key schedule.
          */
0138 aes_encrypt:
0139     add     ip, r2, #32     @ 3rd round key
0140 .Laes_encrypt_tweak:
0141     do_block    enc_dround, enc_fround
0142 ENDPROC(aes_encrypt)
0143 
0144     .align      6
         /*
          * aes_decrypt - decrypt the single block held in q0, in place.
          * Expects q8, q9 and q14 to be set up as prepare_key leaves them, r2
          * to point at the round key array and r3 to hold the round count.
          */
0145 aes_decrypt:
0146     add     ip, r2, #32     @ 3rd round key
0147     do_block    dec_dround, dec_fround
0148 ENDPROC(aes_decrypt)
0149 
0150     .align      6
         /*
          * aes_encrypt_4x - encrypt the four blocks held in q0-q3, in place,
          * with their rounds interleaved. Same register contract as
          * aes_encrypt otherwise.
          */
0151 aes_encrypt_4x:
0152     add     ip, r2, #32     @ 3rd round key
0153     do_block    enc_dround_4x, enc_fround_4x
0154 ENDPROC(aes_encrypt_4x)
0155 
0156     .align      6
         /*
          * aes_decrypt_4x - decrypt the four blocks held in q0-q3, in place,
          * with their rounds interleaved. Same register contract as
          * aes_decrypt otherwise.
          */
0157 aes_decrypt_4x:
0158     add     ip, r2, #32     @ 3rd round key
0159     do_block    dec_dround_4x, dec_fround_4x
0160 ENDPROC(aes_decrypt_4x)
0161 
0162     .macro      prepare_key, rk, rounds
         /*
          * Load the round keys that the core transforms expect to find in
          * registers: the first two into q8/q9 and the last one into q14.
          *   rk     : register holding the address of the round key array
          *   rounds : register holding the number of rounds
          * Clobbers ip.
          */
0163     add     ip, \rk, \rounds, lsl #4    @ ip = rk + 16 * rounds
0164     vld1.32     {q8-q9}, [\rk]      @ load first 2 round keys
0165     vld1.32     {q14}, [ip]     @ load last round key
0166     .endm
0167 
0168     /*
0169      * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
0170      *         int blocks)
0171      * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
0172      *         int blocks)
          *
          * ECB mode over whole 16-byte blocks. r0 = out, r1 = in, r2 = rk,
          * r3 = rounds; blocks is the 5th argument and is loaded from the
          * stack into r4. Blocks are processed four at a time while at least
          * four remain, then one at a time. A block count of zero is handled.
0173      */
0174 ENTRY(ce_aes_ecb_encrypt)
0175     push        {r4, lr}
0176     ldr     r4, [sp, #8]        @ blocks (first stack argument)
0177     prepare_key r2, r3
0178 .Lecbencloop4x:
0179     subs        r4, r4, #4
0180     bmi     .Lecbenc1x          @ fewer than 4 blocks left
0181     vld1.8      {q0-q1}, [r1]!
0182     vld1.8      {q2-q3}, [r1]!
0183     bl      aes_encrypt_4x
0184     vst1.8      {q0-q1}, [r0]!
0185     vst1.8      {q2-q3}, [r0]!
0186     b       .Lecbencloop4x
0187 .Lecbenc1x:
0188     adds        r4, r4, #4      @ undo the bias: r4 = blocks left (0-3)
0189     beq     .Lecbencout
0190 .Lecbencloop:
0191     vld1.8      {q0}, [r1]!
0192     bl      aes_encrypt
0193     vst1.8      {q0}, [r0]!
0194     subs        r4, r4, #1
0195     bne     .Lecbencloop
0196 .Lecbencout:
0197     pop     {r4, pc}
0198 ENDPROC(ce_aes_ecb_encrypt)
0199 
0200 ENTRY(ce_aes_ecb_decrypt)
         /*
          * ECB decryption of whole 16-byte blocks. r0 = out, r1 = in, r2 = rk,
          * r3 = rounds; blocks is loaded from the stack into r4. Blocks are
          * processed four at a time while at least four remain, then one at a
          * time. A block count of zero is handled.
          */
0201     push        {r4, lr}
0202     ldr     r4, [sp, #8]        @ blocks (first stack argument)
0203     prepare_key r2, r3
0204 .Lecbdecloop4x:
0205     subs        r4, r4, #4
0206     bmi     .Lecbdec1x          @ fewer than 4 blocks left
0207     vld1.8      {q0-q1}, [r1]!
0208     vld1.8      {q2-q3}, [r1]!
0209     bl      aes_decrypt_4x
0210     vst1.8      {q0-q1}, [r0]!
0211     vst1.8      {q2-q3}, [r0]!
0212     b       .Lecbdecloop4x
0213 .Lecbdec1x:
0214     adds        r4, r4, #4      @ undo the bias: r4 = blocks left (0-3)
0215     beq     .Lecbdecout
0216 .Lecbdecloop:
0217     vld1.8      {q0}, [r1]!
0218     bl      aes_decrypt
0219     vst1.8      {q0}, [r0]!
0220     subs        r4, r4, #1
0221     bne     .Lecbdecloop
0222 .Lecbdecout:
0223     pop     {r4, pc}
0224 ENDPROC(ce_aes_ecb_decrypt)
0225 
0226     /*
0227      * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
0228      *         int blocks, u8 iv[])
0229      * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
0230      *         int blocks, u8 iv[])
          *
          * CBC mode over whole 16-byte blocks. r0 = out, r1 = in, r2 = rk,
          * r3 = rounds; blocks and iv are the 5th and 6th arguments and are
          * loaded from the stack into r4 and r5. On return iv[] holds the
          * chaining value for a follow-up call.
          *
          * Encryption is inherently serial, so there is no 4x path: q0 carries
          * the previous ciphertext block (initially the iv) from one iteration
          * to the next.
0231      */
0232 ENTRY(ce_aes_cbc_encrypt)
0233     push        {r4-r6, lr}
0234     ldrd        r4, r5, [sp, #16]
         cmp     r4, #0          @ the loop below is bottom-tested, so a
         ble     .Lcbcencret     @ block count <= 0 must bail out here
0235     vld1.8      {q0}, [r5]
0236     prepare_key r2, r3
0237 .Lcbcencloop:
0238     vld1.8      {q1}, [r1]!     @ get next pt block
0239     veor        q0, q0, q1      @ ..and xor with iv
0240     bl      aes_encrypt
0241     vst1.8      {q0}, [r0]!
0242     subs        r4, r4, #1
0243     bne     .Lcbcencloop
0244     vst1.8      {q0}, [r5]      @ last ct block is the next iv
     .Lcbcencret:
0245     pop     {r4-r6, pc}
0246 ENDPROC(ce_aes_cbc_encrypt)
0247 
0248 ENTRY(ce_aes_cbc_decrypt)
         /*
          * CBC decryption of whole 16-byte blocks. r0 = out, r1 = in, r2 = rk,
          * r3 = rounds, r4 = blocks and r5 = iv (both loaded from the stack).
          * q15 always holds the ciphertext block (initially the iv) that must
          * be xored into the next decrypted block. Four blocks at a time are
          * handled while at least four remain; the leftovers go through a
          * single-block loop that folds the previous ciphertext into the last
          * round key so that no separate xor is needed after the decryption.
          */
0249     push        {r4-r6, lr}
0250     ldrd        r4, r5, [sp, #16]
0251     vld1.8      {q15}, [r5]     @ keep iv in q15
0252     prepare_key r2, r3
0253 .Lcbcdecloop4x:
0254     subs        r4, r4, #4
0255     bmi     .Lcbcdec1x
0256     vld1.8      {q0-q1}, [r1]!
0257     vld1.8      {q2-q3}, [r1]!
0258     vmov        q4, q0          @ keep copies of the ct blocks: they are
0259     vmov        q5, q1          @ the chaining values for the blocks that
0260     vmov        q6, q2          @ follow them
0261     vmov        q7, q3
0262     bl      aes_decrypt_4x
0263     veor        q0, q0, q15
0264     veor        q1, q1, q4
0265     veor        q2, q2, q5
0266     veor        q3, q3, q6
0267     vmov        q15, q7         @ last ct block chains into the next batch
0268     vst1.8      {q0-q1}, [r0]!
0269     vst1.8      {q2-q3}, [r0]!
0270     b       .Lcbcdecloop4x
0271 .Lcbcdec1x:
0272     adds        r4, r4, #4
0273     beq     .Lcbcdecout
0274     vmov        q6, q14         @ preserve last round key
0275 .Lcbcdecloop:
0276     vld1.8      {q0}, [r1]!     @ get next ct block
0277     veor        q14, q15, q6        @ combine prev ct with last key
0278     vmov        q15, q0
0279     bl      aes_decrypt
0280     vst1.8      {q0}, [r0]!
0281     subs        r4, r4, #1
0282     bne     .Lcbcdecloop
0283 .Lcbcdecout:
0284     vst1.8      {q15}, [r5]     @ return last ct block as the next iv
0285     pop     {r4-r6, pc}
0286 ENDPROC(ce_aes_cbc_decrypt)
0287 
0288 
0289     /*
0290      * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
0291      *            int rounds, int bytes, u8 const iv[])
0292      * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
0293      *            int rounds, int bytes, u8 const iv[])
          *
          * CBC with ciphertext stealing for the final two blocks of a message.
          * The code reads one full block at in[0] and a second, overlapping
          * 16-byte block that ends at in[bytes], and writes out[] the same
          * way, so it presumably expects 16 < bytes <= 32 - verify against
          * the caller. r0 = out, r1 = in, r2 = rk, r3 = rounds, r4 = bytes and
          * r5 = iv (the last two are loaded from the stack). iv[] is only read.
0294      */
0295 
0296 ENTRY(ce_aes_cbc_cts_encrypt)
0297     push        {r4-r6, lr}
0298     ldrd        r4, r5, [sp, #16]
0299 
0300     movw        ip, :lower16:.Lcts_permute_table
0301     movt        ip, :upper16:.Lcts_permute_table
0302     sub     r4, r4, #16         @ r4 = size of the final partial block
0303     add     lr, ip, #32
0304     add     ip, ip, r4
0305     sub     lr, lr, r4
0306     vld1.8      {q5}, [ip]      @ q5: moves the first r4 bytes to the end
0307     vld1.8      {q6}, [lr]      @ q6: moves the last r4 bytes to the front
0308 
0309     add     ip, r1, r4
0310     vld1.8      {q0}, [r1]          @ overlapping loads
0311     vld1.8      {q3}, [ip]
0312 
0313     vld1.8      {q1}, [r5]          @ get iv
0314     prepare_key r2, r3
0315 
0316     veor        q0, q0, q1          @ xor with iv
0317     bl      aes_encrypt
0318 
0319     vtbl.8      d4, {d0-d1}, d10    @ q2 = head of the first ct block,
0320     vtbl.8      d5, {d0-d1}, d11    @      shifted to the end of the vector
0321     vtbl.8      d2, {d6-d7}, d12    @ q1 = final partial pt block,
0322     vtbl.8      d3, {d6-d7}, d13    @      zero padded at the end
0323 
0324     veor        q0, q0, q1
0325     bl      aes_encrypt
0326 
0327     add     r4, r0, r4
0328     vst1.8      {q2}, [r4]          @ overlapping stores
0329     vst1.8      {q0}, [r0]
0330 
0331     pop     {r4-r6, pc}
0332 ENDPROC(ce_aes_cbc_cts_encrypt)
0333 
0334 ENTRY(ce_aes_cbc_cts_decrypt)
         /*
          * CBC ciphertext stealing, decryption side: handles the final two
          * blocks of a message. r0 = out, r1 = in, r2 = rk, r3 = rounds,
          * r4 = bytes and r5 = iv (the last two are loaded from the stack).
          * Input and output are accessed as one block at offset 0 plus one
          * overlapping block that ends at offset "bytes". iv[] is only read.
          */
0335     push        {r4-r6, lr}
0336     ldrd        r4, r5, [sp, #16]
0337 
0338     movw        ip, :lower16:.Lcts_permute_table
0339     movt        ip, :upper16:.Lcts_permute_table
0340     sub     r4, r4, #16         @ r4 = size of the final partial block
0341     add     lr, ip, #32
0342     add     ip, ip, r4
0343     sub     lr, lr, r4
0344     vld1.8      {q5}, [ip]
0345     vld1.8      {q6}, [lr]
0346 
0347     add     ip, r1, r4
0348     vld1.8      {q0}, [r1]          @ overlapping loads
0349     vld1.8      {q1}, [ip]
0350 
0351     vld1.8      {q3}, [r5]          @ get iv
0352     prepare_key r2, r3
0353 
0354     bl      aes_decrypt
0355 
0356     vtbl.8      d4, {d0-d1}, d10
0357     vtbl.8      d5, {d0-d1}, d11
0358     vtbx.8      d0, {d2-d3}, d12    @ vtbx: lanes whose index is out of
0359     vtbx.8      d1, {d2-d3}, d13    @ range keep their previous q0 bytes
0360 
0361     veor        q1, q1, q2
0362     bl      aes_decrypt
0363     veor        q0, q0, q3          @ xor with iv
0364 
0365     add     r4, r0, r4
0366     vst1.8      {q1}, [r4]          @ overlapping stores
0367     vst1.8      {q0}, [r0]
0368 
0369     pop     {r4-r6, pc}
0370 ENDPROC(ce_aes_cbc_cts_decrypt)
0371 
0372 
0373     /*
0374      * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
0375      *         int blocks, u8 ctr[])
          *
          * CTR mode with a 128-bit big-endian counter. r0 = out, r1 = in,
          * r2 = rk, r3 = rounds, r4 = blocks and r5 = ctr (the last two are
          * loaded from the stack). q7 holds the current counter block and r6
          * a byte-swapped (CPU order) copy of its least significant word. The
          * 4x path only ever updates that low word, so it is taken only when
          * adding the block count to it cannot carry; otherwise every block
          * goes through the single-block loop, which propagates carries into
          * the upper words. On return ctr[] holds the next counter value.
0376      */
0377 ENTRY(ce_aes_ctr_encrypt)
0378     push        {r4-r6, lr}
0379     ldrd        r4, r5, [sp, #16]
0380     vld1.8      {q7}, [r5]      @ load ctr
0381     prepare_key r2, r3
0382     vmov        r6, s31         @ keep swabbed ctr in r6
0383     rev     r6, r6
0384     cmn     r6, r4          @ 32 bit overflow?
0385     bcs     .Lctrloop
0386 .Lctrloop4x:
0387     subs        r4, r4, #4
0388     bmi     .Lctr1x
0389 
0390     /*
0391      * NOTE: the sequence below has been carefully tweaked to avoid
0392      * a silicon erratum that exists in Cortex-A57 (#1742098) and
0393      * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
0394      * may produce an incorrect result if they take their input from a
0395      * register of which a single 32-bit lane has been updated the last
0396      * time it was modified. To work around this, the lanes of registers
0397      * q0-q3 below are not manipulated individually, and the different
0398      * counter values are prepared by successive manipulations of q7.
0399      */
0400     add     ip, r6, #1
0401     vmov        q0, q7
0402     rev     ip, ip
0403     add     lr, r6, #2
0404     vmov        s31, ip         @ set lane 3 of q1 via q7
0405     add     ip, r6, #3
0406     rev     lr, lr
0407     vmov        q1, q7
0408     vmov        s31, lr         @ set lane 3 of q2 via q7
0409     rev     ip, ip
0410     vmov        q2, q7
0411     vmov        s31, ip         @ set lane 3 of q3 via q7
0412     add     r6, r6, #4
0413     vmov        q3, q7
0414 
0415     vld1.8      {q4-q5}, [r1]!
0416     vld1.8      {q6}, [r1]!
0417     vld1.8      {q15}, [r1]!
0418     bl      aes_encrypt_4x
0419     veor        q0, q0, q4
0420     veor        q1, q1, q5
0421     veor        q2, q2, q6
0422     veor        q3, q3, q15
0423     rev     ip, r6
0424     vst1.8      {q0-q1}, [r0]!
0425     vst1.8      {q2-q3}, [r0]!
0426     vmov        s31, ip         @ q7 = counter for the next batch
0427     b       .Lctrloop4x
0428 .Lctr1x:
0429     adds        r4, r4, #4
0430     beq     .Lctrout
0431 .Lctrloop:
0432     vmov        q0, q7
0433     bl      aes_encrypt
0434 
0435     adds        r6, r6, #1      @ increment BE ctr
0436     rev     ip, r6
0437     vmov        s31, ip
0438     bcs     .Lctrcarry      @ carry out of adds above (rev/vmov keep flags)
0439 
0440 .Lctrcarrydone:
0441     subs        r4, r4, #1
0442     bmi     .Lctrtailblock      @ blocks < 0 means tail block
0443     vld1.8      {q3}, [r1]!
0444     veor        q3, q0, q3
0445     vst1.8      {q3}, [r0]!
0446     bne     .Lctrloop       @ flags still from the subs above
0447 
0448 .Lctrout:
0449     vst1.8      {q7}, [r5]      @ return next CTR value
0450     pop     {r4-r6, pc}
0451 
0452 .Lctrtailblock:
0453     vst1.8      {q0}, [r0, :64]     @ return the key stream
0454     b       .Lctrout
0455 
0456 .Lctrcarry:
         @ The low counter word wrapped: add one to each more significant
         @ word in turn and stop at the first one that does not wrap too.
0457     .irp        sreg, s30, s29, s28
0458     vmov        ip, \sreg       @ load next word of ctr
0459     rev     ip, ip          @ ... to handle the carry
0460     adds        ip, ip, #1
0461     rev     ip, ip
0462     vmov        \sreg, ip
0463     bcc     .Lctrcarrydone
0464     .endr
0465     b       .Lctrcarrydone
0466 ENDPROC(ce_aes_ctr_encrypt)
0467 
0468     /*
0469      * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
0470      *         int bytes, u8 iv[], u32 const rk2[], int first)
0471      * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
0472      *         int bytes, u8 iv[], u32 const rk2[], int first)
0473      */
0474 
0475     .macro      next_tweak, out, in, const, tmp
         /*
          * out = in multiplied by x in GF(2^128), i.e. the next XTS tweak.
          * "const" must hold the mask built by ce_aes_xts_init (1 in the low
          * 64-bit lane, 0x87 in the high one). "tmp" is clobbered. "out" and
          * "in" may be the same register.
          */
0476     vshr.s64    \tmp, \in, #63          @ each lane: all ones if its msb is set
0477     vand        \tmp, \tmp, \const
0478     vadd.u64    \out, \in, \in          @ shift both 64-bit lanes left by one
0479     vext.8      \tmp, \tmp, \tmp, #8    @ swap the two lanes of tmp
0480     veor        \out, \out, \tmp        @ low lane ^= 0x87, high lane ^= carry
0481     .endm
0482 
0483 ce_aes_xts_init:
         /*
          * Shared prologue of ce_aes_xts_encrypt and ce_aes_xts_decrypt. It
          * must be called right after their "push {r4-r6, lr}", because the
          * stack arguments are addressed relative to sp as it is at that
          * point. Builds the tweak mask in q15, loads bytes into r4, iv into
          * r5 and the iv contents into q0. If "first" is 1, the iv in q0 is
          * encrypted with the second key and r6 is left holding the (non-zero)
          * address of that key; otherwise r6 holds the value of "first".
          * Clobbers q8, q9, q14 and whatever aes_encrypt clobbers when the
          * encryption is performed.
          */
0484     vmov.i32    d30, #0x87      @ compose tweak mask vector
0485     vmovl.u32   q15, d30
0486     vshr.u64    d30, d31, #7
0487 
0488     ldrd        r4, r5, [sp, #16]   @ load args
0489     ldr     r6, [sp, #28]
0490     vld1.8      {q0}, [r5]      @ load iv
0491     teq     r6, #1          @ start of a block?
0492     bxne        lr
0493 
0494     @ Encrypt the IV in q0 with the second AES key. This should only
0495     @ be done at the start of a block.
0496     ldr     r6, [sp, #24]       @ load AES key 2
0497     prepare_key r6, r3
0498     add     ip, r6, #32     @ 3rd round key of key 2
0499     b       .Laes_encrypt_tweak @ tail call
0500 ENDPROC(ce_aes_xts_init)
0501 
0502 ENTRY(ce_aes_xts_encrypt)
         /*
          * XTS encryption. r0 = out, r1 = in, r2 = rk1, r3 = rounds; bytes,
          * iv, rk2 and first are stack arguments that ce_aes_xts_init loads
          * (r4 = bytes, r5 = iv). q4 holds the tweak of the block being
          * processed and q15 the mask used by next_tweak. Whole blocks are
          * handled four at a time, then one at a time; a trailing partial
          * block is handled with ciphertext stealing. On return iv[] holds
          * the last tweak that was used, which a follow-up call with
          * first == 0 advances before using it.
          *
          * A byte count of zero stores nothing to out[].
          */
0503     push        {r4-r6, lr}
0504 
0505     bl      ce_aes_xts_init     @ run shared prologue
0506     prepare_key r2, r3
0507     vmov        q4, q0
0508 
0509     teq     r6, #0          @ start of a block?
0510     bne     .Lxtsenc4x
0511 
0512 .Lxtsencloop4x:
0513     next_tweak  q4, q4, q15, q10
0514 .Lxtsenc4x:
0515     subs        r4, r4, #64
0516     bmi     .Lxtsenc1x
0517     vld1.8      {q0-q1}, [r1]!      @ get 4 pt blocks
0518     vld1.8      {q2-q3}, [r1]!
0519     next_tweak  q5, q4, q15, q10
0520     veor        q0, q0, q4
0521     next_tweak  q6, q5, q15, q10
0522     veor        q1, q1, q5
0523     next_tweak  q7, q6, q15, q10
0524     veor        q2, q2, q6
0525     veor        q3, q3, q7
0526     bl      aes_encrypt_4x
0527     veor        q0, q0, q4
0528     veor        q1, q1, q5
0529     veor        q2, q2, q6
0530     veor        q3, q3, q7
0531     vst1.8      {q0-q1}, [r0]!      @ write 4 ct blocks
0532     vst1.8      {q2-q3}, [r0]!
0533     vmov        q4, q7
0534     teq     r4, #0
0535     beq     .Lxtsencret
0536     b       .Lxtsencloop4x
0537 .Lxtsenc1x:
0538     adds        r4, r4, #64
         @ Zero here means that the byte count was zero on entry: no block
         @ has been produced, so skip the store to out[] at .Lxtsencout.
0539     beq     .Lxtsencret
0540     subs        r4, r4, #16
0541     bmi     .LxtsencctsNx       @ < 16 bytes left after a 4x batch
0542 .Lxtsencloop:
0543     vld1.8      {q0}, [r1]!
0544 .Lxtsencctsout:
0545     veor        q0, q0, q4
0546     bl      aes_encrypt
0547     veor        q0, q0, q4
0548     teq     r4, #0
0549     beq     .Lxtsencout
0550     subs        r4, r4, #16
0551     next_tweak  q4, q4, q15, q6
0552     bmi     .Lxtsenccts     @ flags from subs: partial block follows
0553     vst1.8      {q0}, [r0]!
0554     b       .Lxtsencloop
0555 .Lxtsencout:
0556     vst1.8      {q0}, [r0]
0557 .Lxtsencret:
0558     vst1.8      {q4}, [r5]
0559     pop     {r4-r6, pc}
0560 
0561 .LxtsencctsNx:
         @ The preceding full block came out of the 4x path: it is still in
         @ q3 and has already been stored, so step the output pointer back.
0562     vmov        q0, q3
0563     sub     r0, r0, #16
0564 .Lxtsenccts:
         @ Ciphertext stealing: q0 = last full ct block, q4 = tweak of the
         @ final block, r4 = (size of the partial block) - 16.
0565     movw        ip, :lower16:.Lcts_permute_table
0566     movt        ip, :upper16:.Lcts_permute_table
0567 
0568     add     r1, r1, r4      @ rewind input pointer
0569     add     r4, r4, #16     @ # bytes in final block
0570     add     lr, ip, #32
0571     add     ip, ip, r4
0572     sub     lr, lr, r4
0573     add     r4, r0, r4      @ output address of final block
0574 
0575     vld1.8      {q1}, [r1]      @ load final partial block
0576     vld1.8      {q2}, [ip]
0577     vld1.8      {q3}, [lr]
0578 
0579     vtbl.8      d4, {d0-d1}, d4     @ q2 = head of q0, moved to the end
0580     vtbl.8      d5, {d0-d1}, d5
0581     vtbx.8      d0, {d2-d3}, d6     @ head of q0 = partial pt block, the
0582     vtbx.8      d1, {d2-d3}, d7     @ remaining ct bytes of q0 are kept
0583 
0584     vst1.8      {q2}, [r4]      @ overlapping stores
0585     mov     r4, #0
0586     b       .Lxtsencctsout
0587 ENDPROC(ce_aes_xts_encrypt)
0588 
0589 
0590 ENTRY(ce_aes_xts_decrypt)
         /*
          * XTS decryption. r0 = out, r1 = in, r2 = rk1, r3 = rounds; bytes,
          * iv, rk2 and first are stack arguments that ce_aes_xts_init loads
          * (r4 = bytes, r5 = iv). q4 holds the tweak of the block being
          * processed and q15 the mask used by next_tweak. When the byte count
          * is not a multiple of 16, the last full block is held back and
          * handled together with the partial block in .Lxtsdeccts, where the
          * two tweaks are applied in swapped order. On return iv[] holds the
          * contents of q4.
          */
0591     push        {r4-r6, lr}
0592 
0593     bl      ce_aes_xts_init     @ run shared prologue
0594     prepare_key r2, r3
0595     vmov        q4, q0
0596 
0597     /* subtract 16 bytes if we are doing CTS */
0598     tst     r4, #0xf
0599     subne       r4, r4, #0x10
0600 
0601     teq     r6, #0          @ start of a block?
0602     bne     .Lxtsdec4x
0603 
0604 .Lxtsdecloop4x:
0605     next_tweak  q4, q4, q15, q10
0606 .Lxtsdec4x:
0607     subs        r4, r4, #64
0608     bmi     .Lxtsdec1x
0609     vld1.8      {q0-q1}, [r1]!      @ get 4 ct blocks
0610     vld1.8      {q2-q3}, [r1]!
0611     next_tweak  q5, q4, q15, q10
0612     veor        q0, q0, q4
0613     next_tweak  q6, q5, q15, q10
0614     veor        q1, q1, q5
0615     next_tweak  q7, q6, q15, q10
0616     veor        q2, q2, q6
0617     veor        q3, q3, q7
0618     bl      aes_decrypt_4x
0619     veor        q0, q0, q4
0620     veor        q1, q1, q5
0621     veor        q2, q2, q6
0622     veor        q3, q3, q7
0623     vst1.8      {q0-q1}, [r0]!      @ write 4 pt blocks
0624     vst1.8      {q2-q3}, [r0]!
0625     vmov        q4, q7
0626     teq     r4, #0
0627     beq     .Lxtsdecout
0628     b       .Lxtsdecloop4x
0629 .Lxtsdec1x:
0630     adds        r4, r4, #64
0631     beq     .Lxtsdecout
0632     subs        r4, r4, #16
0633 .Lxtsdecloop:
0634     vld1.8      {q0}, [r1]!
0635     bmi     .Lxtsdeccts     @ flags from the preceding subs
0636 .Lxtsdecctsout:
0637     veor        q0, q0, q4
0638     bl      aes_decrypt
0639     veor        q0, q0, q4
0640     vst1.8      {q0}, [r0]!
0641     teq     r4, #0
0642     beq     .Lxtsdecout
0643     subs        r4, r4, #16
0644     next_tweak  q4, q4, q15, q6
0645     b       .Lxtsdecloop
0646 .Lxtsdecout:
0647     vst1.8      {q4}, [r5]
0648     pop     {r4-r6, pc}
0649 
0650 .Lxtsdeccts:
         @ Ciphertext stealing: q0 = last full ct block, q4 = its
         @ predecessor tweak + 1, r4 = (size of the partial block) - 16.
0651     movw        ip, :lower16:.Lcts_permute_table
0652     movt        ip, :upper16:.Lcts_permute_table
0653 
0654     add     r1, r1, r4      @ rewind input pointer
0655     add     r4, r4, #16     @ # bytes in final block
0656     add     lr, ip, #32
0657     add     ip, ip, r4
0658     sub     lr, lr, r4
0659     add     r4, r0, r4      @ output address of final block
0660 
0661     next_tweak  q5, q4, q15, q6     @ q5 = tweak after q4, used first
0662 
0663     vld1.8      {q1}, [r1]      @ load final partial block
0664     vld1.8      {q2}, [ip]
0665     vld1.8      {q3}, [lr]
0666 
0667     veor        q0, q0, q5
0668     bl      aes_decrypt
0669     veor        q0, q0, q5
0670 
0671     vtbl.8      d4, {d0-d1}, d4
0672     vtbl.8      d5, {d0-d1}, d5
0673     vtbx.8      d0, {d2-d3}, d6
0674     vtbx.8      d1, {d2-d3}, d7
0675 
0676     vst1.8      {q2}, [r4]      @ overlapping stores
0677     mov     r4, #0
0678     b       .Lxtsdecctsout      @ decrypt the merged block with q4
0679 ENDPROC(ce_aes_xts_decrypt)
0680 
0681     /*
0682      * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
0683      *                             AES sbox substitution on each byte in
0684      *                             'input'
          *
          * The input word is replicated into all four columns and the state is
          * zero, so the xor inside aese yields the replicated input and the
          * ShiftRows step only exchanges identical bytes. Clobbers q0 and q1.
0685      */
0686 ENTRY(ce_aes_sub)
0687     vdup.32     q1, r0          @ input in every 32-bit lane
0688     veor        q0, q0, q0      @ q0 = 0
0689     aese.8      q0, q1
0690     vmov        r0, s0          @ return lane 0
0691     bx      lr
0692 ENDPROC(ce_aes_sub)
0693 
0694     /*
0695      * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
0696      *                                        operation on round key *src
          *
          * Reads 16 bytes from src (r1) and writes the 16-byte result to dst
          * (r0). Clobbers q0.
0697      */
0698 ENTRY(ce_aes_invert)
0699     vld1.32     {q0}, [r1]
0700     aesimc.8    q0, q0
0701     vst1.32     {q0}, [r0]
0702     bx      lr
0703 ENDPROC(ce_aes_invert)
0704 
0705     .section    ".rodata", "a"
0706     .align      6
         /*
          * Index vectors for the vtbl/vtbx based ciphertext stealing code:
          * 16 bytes of 0xff, the identity permutation 0..15, and 16 more bytes
          * of 0xff. A 16-byte window read at offset n (0 <= n <= 32) shifts a
          * block by (16 - n) byte positions; 0xff is an out-of-range index, so
          * the affected lanes become zero with vtbl and are left untouched
          * with vtbx.
          */
0707 .Lcts_permute_table:
0708     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0709     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0710     .byte        0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
0711     .byte        0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
0712     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
0713     .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff