#include <linux/linkage.h>
#include <asm/assembler.h>

// ChaCha state registers
X0	.req	r0
X1	.req	r1
X2	.req	r2
X3	.req	r3
X4	.req	r4
X5	.req	r5
X6	.req	r6
X7	.req	r7
X8_X10	.req	r8	// shared by x8 and x10
X9_X11	.req	r9	// shared by x9 and x11
X12	.req	r10
X13	.req	r11
X14	.req	r12
X15	.req	r14
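
// Only 14 general-purpose registers can hold state words (sp and pc are not
// usable), so x8/x10 and x9/x11 each share a register.  The pair that is not
// currently in a register lives in the spill slots at the bottom of the stack
// frame, and _doubleround swaps the two pairs in and out as needed.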

// Convert the four 32-bit words to little-endian byte order on big-endian
// builds; a no-op on little-endian.
.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm
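
// _halfround performs two ChaCha quarterrounds in parallel, one on
// (a1, b1, c1, d1) and one on (a2, b2, c2, d2).  For reference, one
// quarterround in C-like pseudocode is:
//
//	a += b;  d ^= a;  d = rol32(d, 16);
//	c += d;  b ^= c;  b = rol32(b, 12);
//	a += b;  d ^= a;  d = rol32(d, 8);
//	c += d;  b ^= c;  b = rol32(b, 7);
//
// The rotations of 'b' and 'd' are deferred: the registers keep the
// un-rotated results, and the assembler symbols 'brot' and 'drot' track the
// right rotation ('ror') that still has to be applied to recover the true
// values (brot/drot = 32 minus the pending left rotation).  That pending
// rotation is folded into the shifter operand of the next instruction that
// consumes the value, which saves explicit rotate instructions.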
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm
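
// On return from _chacha_permute, the values in x4-x7 still need 'ror #brot'
// and the values in x12-x15 still need 'ror #drot' to become their true
// values (brot == 25, drot == 24 after the final double round).  The callers
// below fold these fixups into later additions (the _chacha fast and slow
// paths) or apply explicit rotates where needed (hchacha_block_arm).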

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}
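
	// After the 'pop' below, the keystream words sit in r0,r1 = x8,x9;
	// r2-r5 = x12-x15; r6,r7 = x10,x11 (the order they were saved on the
	// stack).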
	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is the number of bytes to XOR, in the range [1, 64]
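
	// ARMv6+ kernels run with unaligned ldr/str access enabled, so the
	// word loop below is safe even for misaligned buffers; on older CPUs
	// it is not, so they fall back to the byte-at-a-time loop when IN or
	// OUT is misaligned.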
.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha
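
// Calling convention (as used by the code below):
//	r0:   OUT  - destination buffer
//	r1:   IN   - source buffer
//	r2:   LEN  - number of bytes to process
//	r3:   pointer to the 16-word ChaCha state
//	[sp]: number of rounds (12 selects ChaCha12, anything else runs 20)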
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]		// nrounds (fifth argument)
	cmp		ip, #12			// ChaCha12 ?

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48
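
	// The stack now holds: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// (followed by the callee-saved registers and lr pushed above), and
	// x0-x9,x12-x15 are live in registers, which is exactly the layout
	// _chacha expects.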
	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)
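
// Calling convention (as used by the code below):
//	r0: pointer to the 16-word input state
//	r1: pointer to the 8-word output (x0-x3, x12-x15 after the permutation)
//	r2: number of rounds (12 selects ChaCha12, anything else runs 20)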
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8
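
	// Stack: unused0-unused1 x10-x11 'out' r4-r11 lr
	// sp+0 and sp+8 are the spill slots that _doubleround uses to swap
	// (x8, x9) with (x10, x11).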
	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)