/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
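
/*
 * Illustrative sketch (commentary only, not assembled code): the deferred
 * rotation in C-like pseudocode, assuming ror(x, n) is a 32-bit rotate right.
 * A textbook quarterround step
 *
 *	a += b;  d ^= a;  d = rol(d, 16);
 *
 * is instead computed as
 *
 *	a += ror(b, brot);	// fix up 'b' for free via the shifted operand
 *	d  = a ^ ror(d, drot);	// fix up 'd' likewise; the trailing rol(d, 16)
 *				// is not applied, so drot becomes 32 - 16 = 16
 *
 * i.e. each pending rotation is folded into the next 'add' or 'eor' that
 * reads the value, and only the bookkeeping amounts brot/drot change.
 */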

    // ChaCha state registers
    X0  .req    r0
    X1  .req    r1
    X2  .req    r2
    X3  .req    r3
    X4  .req    r4
    X5  .req    r5
    X6  .req    r6
    X7  .req    r7
    X8_X10  .req    r8  // shared by x8 and x10
    X9_X11  .req    r9  // shared by x9 and x11
    X12 .req    r10
    X13 .req    r11
    X14 .req    r12
    X15 .req    r14

.macro _le32_bswap_4x   a, b, c, d,  tmp
#ifdef __ARMEB__
    rev_l       \a,  \tmp
    rev_l       \b,  \tmp
    rev_l       \c,  \tmp
    rev_l       \d,  \tmp
#endif
.endm

.macro __ldrd       a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
    ldrd        \a, \b, [\src, #\offset]
#else
    ldr     \a, [\src, #\offset]
    ldr     \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd       a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
    strd        \a, \b, [\dst, #\offset]
#else
    str     \a, [\dst, #\offset]
    str     \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround   a1, b1, c1, d1,  a2, b2, c2, d2

    // a += b; d ^= a; d = rol(d, 16);
    add     \a1, \a1, \b1, ror #brot
    add     \a2, \a2, \b2, ror #brot
    eor     \d1, \a1, \d1, ror #drot
    eor     \d2, \a2, \d2, ror #drot
    // drot == 32 - 16 == 16

    // c += d; b ^= c; b = rol(b, 12);
    add     \c1, \c1, \d1, ror #16
    add     \c2, \c2, \d2, ror #16
    eor     \b1, \c1, \b1, ror #brot
    eor     \b2, \c2, \b2, ror #brot
    // brot == 32 - 12 == 20

    // a += b; d ^= a; d = rol(d, 8);
    add     \a1, \a1, \b1, ror #20
    add     \a2, \a2, \b2, ror #20
    eor     \d1, \a1, \d1, ror #16
    eor     \d2, \a2, \d2, ror #16
    // drot == 32 - 8 == 24

    // c += d; b ^= c; b = rol(b, 7);
    add     \c1, \c1, \d1, ror #24
    add     \c2, \c2, \d2, ror #24
    eor     \b1, \c1, \b1, ror #20
    eor     \b2, \c2, \b2, ror #20
    // brot == 32 - 7 == 25
.endm

.macro _doubleround

    // column round

    // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
    _halfround  X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

    // save (x8, x9); restore (x10, x11)
    __strd      X8_X10, X9_X11, sp, 0
    __ldrd      X8_X10, X9_X11, sp, 8

    // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
    _halfround  X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

    .set brot, 25
    .set drot, 24

    // diagonal round

    // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
    _halfround  X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

    // save (x10, x11); restore (x8, x9)
    __strd      X8_X10, X9_X11, sp, 8
    __ldrd      X8_X10, X9_X11, sp, 0

    // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
    _halfround  X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute  nrounds
    .set brot, 0
    .set drot, 0
    .rept \nrounds / 2
     _doubleround
    .endr
.endm

.macro _chacha      nrounds

.Lnext_block\@:
    // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
    // Registers contain x0-x9,x12-x15.

    // Do the core ChaCha permutation to update x0-x15.
    _chacha_permute \nrounds

    add     sp, #8
    // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
    // Registers contain x0-x9,x12-x15.
    // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

    // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
    push        {X8_X10, X9_X11, X12, X13, X14, X15}

    // Load (OUT, IN, LEN).
    ldr     r14, [sp, #96]
    ldr     r12, [sp, #100]
    ldr     r11, [sp, #104]

    orr     r10, r14, r12

    // Use slow path if fewer than 64 bytes remain.
    cmp     r11, #64
    blt     .Lxor_slowpath\@

    // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
    // ARMv6+, since ldmia and stmia (used below) still require alignment.
    tst     r10, #3
    bne     .Lxor_slowpath\@

    // Fast path: XOR 64 bytes of aligned data.

    // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
    // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
    // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

    // x0-x3
    __ldrd      r8, r9, sp, 32
    __ldrd      r10, r11, sp, 40
    add     X0, X0, r8
    add     X1, X1, r9
    add     X2, X2, r10
    add     X3, X3, r11
    _le32_bswap_4x  X0, X1, X2, X3,  r8
    ldmia       r12!, {r8-r11}
    eor     X0, X0, r8
    eor     X1, X1, r9
    eor     X2, X2, r10
    eor     X3, X3, r11
    stmia       r14!, {X0-X3}

    // x4-x7
    __ldrd      r8, r9, sp, 48
    __ldrd      r10, r11, sp, 56
    add     X4, r8, X4, ror #brot
    add     X5, r9, X5, ror #brot
    ldmia       r12!, {X0-X3}
    add     X6, r10, X6, ror #brot
    add     X7, r11, X7, ror #brot
    _le32_bswap_4x  X4, X5, X6, X7,  r8
    eor     X4, X4, X0
    eor     X5, X5, X1
    eor     X6, X6, X2
    eor     X7, X7, X3
    stmia       r14!, {X4-X7}

    // x8-x15
    pop     {r0-r7}         // (x8-x9,x12-x15,x10-x11)
    __ldrd      r8, r9, sp, 32
    __ldrd      r10, r11, sp, 40
    add     r0, r0, r8      // x8
    add     r1, r1, r9      // x9
    add     r6, r6, r10     // x10
    add     r7, r7, r11     // x11
    _le32_bswap_4x  r0, r1, r6, r7,  r8
    ldmia       r12!, {r8-r11}
    eor     r0, r0, r8      // x8
    eor     r1, r1, r9      // x9
    eor     r6, r6, r10     // x10
    eor     r7, r7, r11     // x11
    stmia       r14!, {r0,r1,r6,r7}
    ldmia       r12!, {r0,r1,r6,r7}
    __ldrd      r8, r9, sp, 48
    __ldrd      r10, r11, sp, 56
    add     r2, r8, r2, ror #drot   // x12
    add     r3, r9, r3, ror #drot   // x13
    add     r4, r10, r4, ror #drot  // x14
    add     r5, r11, r5, ror #drot  // x15
    _le32_bswap_4x  r2, r3, r4, r5,  r9
      ldr       r9, [sp, #72]       // load LEN
    eor     r2, r2, r0      // x12
    eor     r3, r3, r1      // x13
    eor     r4, r4, r6      // x14
    eor     r5, r5, r7      // x15
      subs      r9, #64         // decrement and check LEN
    stmia       r14!, {r2-r5}

    beq     .Ldone\@

.Lprepare_for_next_block\@:

    // Stack: x0-x15 OUT IN LEN

    // Increment block counter (x12)
    add     r8, #1

    // Store updated (OUT, IN, LEN)
    str     r14, [sp, #64]
    str     r12, [sp, #68]
    str     r9, [sp, #72]

      mov       r14, sp

    // Store updated block counter (x12)
    str     r8, [sp, #48]

      sub       sp, #16

    // Reload state and do next block
    ldmia       r14!, {r0-r11}      // load x0-x11
    __strd      r10, r11, sp, 8     // store x10-x11 before state
    ldmia       r14, {r10-r12,r14}  // load x12-x15
    b       .Lnext_block\@

.Lxor_slowpath\@:
    // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
    // We handle it by storing the 64 bytes of keystream to the stack, then
    // XOR-ing the needed portion with the data.
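    //
    // A rough C sketch of what this path does (commentary only; 'ks' names
    // the on-stack keystream buffer allocated just below, and 'n' stands for
    // min(LEN, 64)):
    //
    //	u8 ks[64];
    //	/* write the 16 final state words into ks, byte-swapped on BE */
    //	for (i = 0; i < n; i++)
    //		OUT[i] = IN[i] ^ ks[i];
    //
    // The loops further down do this a word at a time whenever word accesses
    // are safe (ARMv6+, or both buffers aligned), and a byte at a time for
    // misaligned buffers on older CPUs and for the final partial word.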

    // Allocate keystream buffer
    sub     sp, #64
    mov     r14, sp

    // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
    // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
    // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

    // Save keystream for x0-x3
    __ldrd      r8, r9, sp, 96
    __ldrd      r10, r11, sp, 104
    add     X0, X0, r8
    add     X1, X1, r9
    add     X2, X2, r10
    add     X3, X3, r11
    _le32_bswap_4x  X0, X1, X2, X3,  r8
    stmia       r14!, {X0-X3}

    // Save keystream for x4-x7
    __ldrd      r8, r9, sp, 112
    __ldrd      r10, r11, sp, 120
    add     X4, r8, X4, ror #brot
    add     X5, r9, X5, ror #brot
    add     X6, r10, X6, ror #brot
    add     X7, r11, X7, ror #brot
    _le32_bswap_4x  X4, X5, X6, X7,  r8
      add       r8, sp, #64
    stmia       r14!, {X4-X7}

    // Save keystream for x8-x15
    ldm     r8, {r0-r7}     // (x8-x9,x12-x15,x10-x11)
    __ldrd      r8, r9, sp, 128
    __ldrd      r10, r11, sp, 136
    add     r0, r0, r8      // x8
    add     r1, r1, r9      // x9
    add     r6, r6, r10     // x10
    add     r7, r7, r11     // x11
    _le32_bswap_4x  r0, r1, r6, r7,  r8
    stmia       r14!, {r0,r1,r6,r7}
    __ldrd      r8, r9, sp, 144
    __ldrd      r10, r11, sp, 152
    add     r2, r8, r2, ror #drot   // x12
    add     r3, r9, r3, ror #drot   // x13
    add     r4, r10, r4, ror #drot  // x14
    add     r5, r11, r5, ror #drot  // x15
    _le32_bswap_4x  r2, r3, r4, r5,  r9
    stmia       r14, {r2-r5}

    // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
    // Registers: r8 is block counter, r12 is IN.

    ldr     r9, [sp, #168]      // LEN
    ldr     r14, [sp, #160]     // OUT
    cmp     r9, #64
      mov       r0, sp
    movle       r1, r9
    movgt       r1, #64
    // r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
    orr     r2, r12, r14
    tst     r2, #3          // IN or OUT misaligned?
    bne     .Lxor_next_byte\@
.endif

    // XOR a word at a time
.rept 16
    subs        r1, #4
    blt     .Lxor_words_done\@
    ldr     r2, [r12], #4
    ldr     r3, [r0], #4
    eor     r2, r2, r3
    str     r2, [r14], #4
.endr
    b       .Lxor_slowpath_done\@
.Lxor_words_done\@:
    ands        r1, r1, #3
    beq     .Lxor_slowpath_done\@

    // XOR a byte at a time
.Lxor_next_byte\@:
    ldrb        r2, [r12], #1
    ldrb        r3, [r0], #1
    eor     r2, r2, r3
    strb        r2, [r14], #1
    subs        r1, #1
    bne     .Lxor_next_byte\@

.Lxor_slowpath_done\@:
    subs        r9, #64
    add     sp, #96
    bgt     .Lprepare_for_next_block\@

.Ldone\@:
.endm   // _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *                   const u32 *state, int nrounds);
 */
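/*
 * Caller sketch (an illustrative assumption, not part of this file): 'state'
 * points to the usual 16-word ChaCha state, with the block counter in
 * state[12], and nrounds selects ChaCha12 when it equals 12 and ChaCha20
 * otherwise (see the 'cmp ip, #12' below):
 *
 *	u32 state[16];
 *	init_chacha_state(state, key, iv);	// hypothetical helper
 *	chacha_doarm(dst, src, bytes, state, 20);
 *
 * Note that the updated block counter is written only to the on-stack copy
 * of the state ('str r8, [sp, #48]' in _chacha); the caller's array is not
 * written back.
 */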
ENTRY(chacha_doarm)
    cmp     r2, #0          // len == 0?
    reteq       lr

    ldr     ip, [sp]
    cmp     ip, #12

    push        {r0-r2,r4-r11,lr}

    // Push state x0-x15 onto stack.
    // Also store an extra copy of x10-x11 just before the state.

    add     X12, r3, #48
    ldm     X12, {X12,X13,X14,X15}
    push        {X12,X13,X14,X15}
    sub     sp, sp, #64

    __ldrd      X8_X10, X9_X11, r3, 40
    __strd      X8_X10, X9_X11, sp, 8
    __strd      X8_X10, X9_X11, sp, 56
    ldm     r3, {X0-X9_X11}
    __strd      X0, X1, sp, 16
    __strd      X2, X3, sp, 24
    __strd      X4, X5, sp, 32
    __strd      X6, X7, sp, 40
    __strd      X8_X10, X9_X11, sp, 48

    beq     1f
    _chacha     20

0:  add     sp, #76
    pop     {r4-r11, pc}

1:  _chacha     12
    b       0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
    push        {r1,r4-r11,lr}

    cmp     r2, #12         // ChaCha12 ?

    mov     r14, r0
    ldmia       r14!, {r0-r11}      // load x0-x11
    push        {r10-r11}       // store x10-x11 to stack
    ldm     r14, {r10-r12,r14}  // load x12-x15
    sub     sp, #8

    beq     1f
    _chacha_permute 20

    // Skip over (unused0-unused1, x10-x11)
0:  add     sp, #16

    // Fix up rotations of x12-x15
    ror     X12, X12, #drot
    ror     X13, X13, #drot
      pop       {r4}            // load 'out'
    ror     X14, X14, #drot
    ror     X15, X15, #drot

    // Store (x0-x3,x12-x15) to 'out'
    stm     r4, {X0,X1,X2,X3,X12,X13,X14,X15}

    pop     {r4-r11,pc}

1:  _chacha_permute 12
    b       0b
ENDPROC(hchacha_block_arm)