0001
0002
0003
0004
0005
0006
/* ChaCha20/HChaCha stream-cipher core for 32-bit MIPS.
 * Constants and register aliases used throughout this file. */

/* Mask picking out the whole 32-bit words (times 4) of a byte count:
 * BYTES & 0x3c.  Used to index the partial-block jump tables below. */
0007 #define MASK_U32 0x3c
/* ChaCha produces its keystream in 64-byte blocks. */
0008 #define CHACHA20_BLOCK_SIZE 64
/* Stack frame: just enough to spill the callee-saved $s0-$s7. */
0009 #define STACK_SIZE 32
0010
/* The sixteen 32-bit ChaCha state words, held entirely in registers
 * while the rounds run. */
0011 #define X0 $t0
0012 #define X1 $t1
0013 #define X2 $t2
0014 #define X3 $t3
0015 #define X4 $t4
0016 #define X5 $t5
0017 #define X6 $t6
0018 #define X7 $t7
0019 #define X8 $t8
0020 #define X9 $t9
0021 #define X10 $v1
0022 #define X11 $s6
0023 #define X12 $s5
0024 #define X13 $s4
0025 #define X14 $s3
0026 #define X15 $s2
0027
/* Scratch temporaries for state/input words during the store phase. */
0028 #define T0 $s1
0029 #define T1 $s0
/* Token-pasting helpers so macros can address a register by word index n. */
0030 #define T(n) T ## n
0031 #define X(n) X ## n
0032
0033
/* o32 argument registers of chacha_crypt_arch(state, out, in, bytes, ...);
 * the fifth argument (the round count) lives on the caller's stack. */
0034 #define STATE $a0
0035 #define OUT $a1
0036 #define IN $a2
0037 #define BYTES $a3
0038
0039
0040
0041
0042
0043
/* Live copy of the block counter, state word 12 (byte offset 48).
 * Incremented per block and written back to memory before returning. */
0044 #define NONCE_0 $v0
0045
0046
0047
0048
0049
/* Keystream word for a trailing partial word, built in the jump-table
 * branch delay slots.  Aliases X15: on the partial-block path X15's own
 * store sequence is never reached, so the register is free to reuse. */
0050 #define SAVED_X X15
/* Original state word that feeds SAVED_X on the partial-block path. */
0051 #define SAVED_CA $s7
0052
/* (IN | OUT) & 3 — nonzero selects the lwl/lwr / swl/swr code path.
 * NOTE(review): aliases SAVED_CA ($s7).  This looks safe because
 * IS_UNALIGNED is only tested before the partial-block code (which
 * overwrites $s7) runs, and that code never loops back — confirm. */
0053 #define IS_UNALIGNED $s7
0054
/* Endianness helpers: ChaCha keystream words must be combined with the
 * data as little-endian byte streams.
 *  - MSB/LSB: byte offsets handed to the lwl/lwr (and swl/swr) pairs.
 *  - CPU_TO_LE32: byte-swap a register on big-endian hosts (wsbh swaps
 *    bytes within halfwords, rotr 16 then swaps the halfwords); a no-op
 *    on little-endian hosts.
 *  - ROTx / ROTR: rotations used by the byte-at-a-time tail so that the
 *    next keystream byte ends up in the low 8 bits of SAVED_X. */
0055 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
0056 #define MSB 0
0057 #define LSB 3
0058 #define ROTx rotl
0059 #define ROTR(n) rotr n, 24
0060 #define CPU_TO_LE32(n) \
0061 wsbh n; \
0062 rotr n, 16;
0063 #else
0064 #define MSB 3
0065 #define LSB 0
0066 #define ROTx rotr
0067 #define CPU_TO_LE32(n)
0068 #define ROTR(n)
0069 #endif
0070
/* Expand x(n) once for each of the 16 state words, in ascending order.
 * Used to emit the jump tables. */
0071 #define FOR_EACH_WORD(x) \
0072 x( 0); \
0073 x( 1); \
0074 x( 2); \
0075 x( 3); \
0076 x( 4); \
0077 x( 5); \
0078 x( 6); \
0079 x( 7); \
0080 x( 8); \
0081 x( 9); \
0082 x(10); \
0083 x(11); \
0084 x(12); \
0085 x(13); \
0086 x(14); \
0087 x(15);
0088
/* Same, in descending order.  Used to emit the store sequences: the label
 * for "n words remaining" is placed in front of the store for word n-1,
 * so jumping there stores words n-1 down to 0 and then falls through to
 * the common tail. */
0089 #define FOR_EACH_WORD_REV(x) \
0090 x(15); \
0091 x(14); \
0092 x(13); \
0093 x(12); \
0094 x(11); \
0095 x(10); \
0096 x( 9); \
0097 x( 8); \
0098 x( 7); \
0099 x( 6); \
0100 x( 5); \
0101 x( 4); \
0102 x( 3); \
0103 x( 2); \
0104 x( 1); \
0105 x( 0);
0106
/* PLUS_ONE(x) expands to x + 1.  The C preprocessor cannot do arithmetic
 * while pasting tokens, so the mapping is spelled out per value. */
0107 #define PLUS_ONE_0 1
0108 #define PLUS_ONE_1 2
0109 #define PLUS_ONE_2 3
0110 #define PLUS_ONE_3 4
0111 #define PLUS_ONE_4 5
0112 #define PLUS_ONE_5 6
0113 #define PLUS_ONE_6 7
0114 #define PLUS_ONE_7 8
0115 #define PLUS_ONE_8 9
0116 #define PLUS_ONE_9 10
0117 #define PLUS_ONE_10 11
0118 #define PLUS_ONE_11 12
0119 #define PLUS_ONE_12 13
0120 #define PLUS_ONE_13 14
0121 #define PLUS_ONE_14 15
0122 #define PLUS_ONE_15 16
0123 #define PLUS_ONE(x) PLUS_ONE_ ## x
/* Two-level paste so that macro arguments (e.g. PLUS_ONE(x)) are fully
 * expanded before concatenation. */
0124 #define _CONCAT3(a,b,c) a ## b ## c
0125 #define CONCAT3(a,b,c) _CONCAT3(a,b,c)
0126
/* Emit the output code for state word x on the unaligned path, labelled
 * .Lchacha_mips_xor_unaligned_<x+1>_b (entry point for "x+1 words left"):
 *   - fetch the original state word into T0 (feed-forward input); word 12
 *     is the block counter and comes from the live NONCE_0 instead,
 *   - fetch input word x with an lwl/lwr pair (IN may be unaligned),
 *   - X[x] += original word (ChaCha feed-forward), convert to LE,
 *   - XOR with the input and store with an swl/swr pair. */
0127 #define STORE_UNALIGNED(x) \
0128 CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
0129 .if (x != 12); \
0130 lw T0, (x*4)(STATE); /* original state word (feed-forward) */ \
0131 .endif; \
0132 lwl T1, (x*4)+MSB ## (IN); \
0133 lwr T1, (x*4)+LSB ## (IN); \
0134 .if (x == 12); \
0135 addu X ## x, NONCE_0; /* counter word comes from the register copy */ \
0136 .else; \
0137 addu X ## x, T0; \
0138 .endif; \
0139 CPU_TO_LE32(X ## x); \
0140 xor X ## x, T1; \
0141 swl X ## x, (x*4)+MSB ## (OUT); \
0142 swr X ## x, (x*4)+LSB ## (OUT);
0143
/* Same as STORE_UNALIGNED but with plain lw/sw for word-aligned buffers;
 * labels are .Lchacha_mips_xor_aligned_<x+1>_b. */
0144 #define STORE_ALIGNED(x) \
0145 CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
0146 .if (x != 12); \
0147 lw T0, (x*4)(STATE); \
0148 .endif; \
0149 lw T1, (x*4) ## (IN); \
0150 .if (x == 12); \
0151 addu X ## x, NONCE_0; \
0152 .else; \
0153 addu X ## x, T0; \
0154 .endif; \
0155 CPU_TO_LE32(X ## x); \
0156 xor X ## x, T1; \
0157 sw X ## x, (x*4) ## (OUT);
0158
0159
0160
0161
0162
0163
/* One entry of the aligned partial-block jump table.  Each entry is
 * exactly two instructions (8 bytes) — the dispatch code relies on this
 * when it computes "table base + 2 * (BYTES & 0x3c)".  Entry x branches
 * to the store sequence for words x-1..0, and its branch delay slot
 * builds SAVED_X = X[x] + original state word x: the keystream word for
 * the trailing partial word (word 12 uses the live counter NONCE_0;
 * all other words use SAVED_CA, loaded by the dispatcher).
 * .set noreorder keeps the addu pinned in the delay slot. */
0164 #define JMPTBL_ALIGNED(x) \
0165 .Lchacha_mips_jmptbl_aligned_ ## x: ; \
0166 .set noreorder; \
0167 b .Lchacha_mips_xor_aligned_ ## x ## _b; \
0168 .if (x == 12); \
0169 addu SAVED_X, X ## x, NONCE_0; \
0170 .else; \
0171 addu SAVED_X, X ## x, SAVED_CA; \
0172 .endif; \
0173 .set reorder
0174
/* Unaligned flavour of the above; identical two-instruction layout. */
0175 #define JMPTBL_UNALIGNED(x) \
0176 .Lchacha_mips_jmptbl_unaligned_ ## x: ; \
0177 .set noreorder; \
0178 b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
0179 .if (x == 12); \
0180 addu SAVED_X, X ## x, NONCE_0; \
0181 .else; \
0182 addu SAVED_X, X ## x, SAVED_CA; \
0183 .endif; \
0184 .set reorder
0185
/* One add-xor-rotate step of four ChaCha quarter-rounds run in parallel:
 *   X[A] += X[K]; X[V] ^= X[A]; X[V] = rotl(X[V], S)
 * and likewise for the (B,L,W), (C,M,Y), (D,N,Z) lanes.  A complete
 * (column or diagonal) round is two AXR pairs with S = 16,12 then 8,7 —
 * see the round loops below. */
0186 #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
0187 addu X(A), X(K); \
0188 addu X(B), X(L); \
0189 addu X(C), X(M); \
0190 addu X(D), X(N); \
0191 xor X(V), X(A); \
0192 xor X(W), X(B); \
0193 xor X(Y), X(C); \
0194 xor X(Z), X(D); \
0195 rotl X(V), S; \
0196 rotl X(W), S; \
0197 rotl X(Y), S; \
0198 rotl X(Z), S;
0199
/* chacha_crypt_arch(state, out, in, bytes, nrounds)
 * XOR 'in' with the ChaCha keystream into 'out', updating the block
 * counter (state word 12) in memory when done.
 * o32 ABI: STATE=$a0, OUT=$a1, IN=$a2, BYTES=$a3; the round count is the
 * fifth argument, read from the caller's stack.
 * NOTE(review): prototype inferred from register usage — confirm against
 * the C declaration. */
0200 .text
0201 .set reorder
/* $at is used explicitly (it holds the round counter and scratch values),
 * so forbid the assembler from using it implicitly. */
0202 .set noat
0203 .globl chacha_crypt_arch
0204 .ent chacha_crypt_arch
0205 chacha_crypt_arch:
0206 .frame $sp, STACK_SIZE, $ra
0207
0208
/* Fifth argument (round count) from the caller's out-arg area,
 * fetched before $sp moves. */
0209 lw $at, 16($sp)
0210
0211 addiu $sp, -STACK_SIZE
0212
0213
/* Nothing to do for a zero-length request. */
0214 beqz BYTES, .Lchacha_mips_end
0215
/* Keep the block counter (state word 12, byte offset 48) live. */
0216 lw NONCE_0, 48(STATE)
0217
0218
/* Spill the callee-saved registers this function uses. */
0219 sw $s0, 0($sp)
0220 sw $s1, 4($sp)
0221 sw $s2, 8($sp)
0222 sw $s3, 12($sp)
0223 sw $s4, 16($sp)
0224 sw $s5, 20($sp)
0225 sw $s6, 24($sp)
0226 sw $s7, 28($sp)
0227
0228
0229
0230
/* Take the lwl/lwr path if either buffer is not word-aligned. */
0231 or IS_UNALIGNED, IN, OUT
0232 andi IS_UNALIGNED, 0x3
0233
0234 b .Lchacha_rounds_start
0235
0236 .align 4
/* Per-block loop entry: advance the buffers and the block counter. */
0237 .Loop_chacha_rounds:
0238 addiu IN, CHACHA20_BLOCK_SIZE
0239 addiu OUT, CHACHA20_BLOCK_SIZE
0240 addiu NONCE_0, 1
0241
/* Load the 16 state words into X0-X15; word 12 comes from the live
 * counter rather than memory. */
0242 .Lchacha_rounds_start:
0243 lw X0, 0(STATE)
0244 lw X1, 4(STATE)
0245 lw X2, 8(STATE)
0246 lw X3, 12(STATE)
0247
0248 lw X4, 16(STATE)
0249 lw X5, 20(STATE)
0250 lw X6, 24(STATE)
0251 lw X7, 28(STATE)
0252 lw X8, 32(STATE)
0253 lw X9, 36(STATE)
0254 lw X10, 40(STATE)
0255 lw X11, 44(STATE)
0256
0257 move X12, NONCE_0
0258 lw X13, 52(STATE)
0259 lw X14, 56(STATE)
0260 lw X15, 60(STATE)
0261
/* Round loop: each pass is one column round (first four AXRs) plus one
 * diagonal round (last four), so $at drops by 2 per pass. */
0262 .Loop_chacha_xor_rounds:
0263 addiu $at, -2
0264 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
0265 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
0266 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
0267 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
0268 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
0269 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
0270 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
0271 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
0272 bnez $at, .Loop_chacha_xor_rounds
0273
0274 addiu BYTES, -(CHACHA20_BLOCK_SIZE)
0275
0276
0277 bnez IS_UNALIGNED, .Loop_chacha_unaligned
0278
0279
/* Reload the round count for the next block ($at was counted to zero);
 * $sp moved by STACK_SIZE, hence the biased offset. */
0280 lw $at, (STACK_SIZE+16)($sp)
0281
0282
/* Fewer than 64 bytes were left: take the partial-block path. */
0283 bltz BYTES, .Lchacha_mips_no_full_block_aligned
0284
/* Full block: feed-forward, XOR and store all 16 words (word 15 first;
 * see FOR_EACH_WORD_REV for why the order is descending). */
0285 FOR_EACH_WORD_REV(STORE_ALIGNED)
0286
0287
0288 bgtz BYTES, .Loop_chacha_rounds
0289
0290
/* Exactly 0 bytes remain (or a partial-word tail): count the block just
 * produced before writing the counter back. */
0291 addiu NONCE_0, 1
0292
0293
/* Negative BYTES means 1-3 trailing bytes still need XORing. */
0294 bltz BYTES, .Lchacha_mips_xor_bytes
0295
0296 .Lchacha_mips_xor_done:
/* Restore callee-saved registers. */
0298 lw $s0, 0($sp)
0299 lw $s1, 4($sp)
0300 lw $s2, 8($sp)
0301 lw $s3, 12($sp)
0302 lw $s4, 16($sp)
0303 lw $s5, 20($sp)
0304 lw $s6, 24($sp)
0305 lw $s7, 28($sp)
0306
0307
/* Write the updated block counter back to state word 12. */
0308 sw NONCE_0, 48(STATE)
0309
0310 .Lchacha_mips_end:
0311 addiu $sp, STACK_SIZE
0312 jr $ra
0313
/* Partial aligned block.  BYTES currently holds (remaining - 64). */
0314 .Lchacha_mips_no_full_block_aligned:
0315
/* Restore the true number of remaining bytes (1..63). */
0316 addiu BYTES, CHACHA20_BLOCK_SIZE
0317
0318
/* $at = 4 * (number of whole 32-bit words remaining). */
0319 andi $at, BYTES, MASK_U32
0320
0321
0322 lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
0323
0324
/* Insert $at << 1 into bits 1..6 of the %hi part; together with the %lo
 * add below this yields table base + 8 * words — each jump-table entry
 * is exactly 8 bytes (two instructions). */
0325 ins T0, $at, 1, 6
0326
0327
/* Address of the state word just past the last whole word: the
 * feed-forward word for the trailing partial word. */
0328 addu T1, STATE, $at
0329
0330
0331 addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
0332
0333
0334 lw SAVED_CA, 0(T1)
0335
0336
/* BYTES = -(number of trailing bytes, 0..3); the tail code below tests
 * it byte-by-byte via BYTES + k. */
0337 subu BYTES, $at, BYTES
0338
0339 jr T0
0340
/* Aligned jump table, entries 0..15 (see JMPTBL_ALIGNED). */
0342 FOR_EACH_WORD(JMPTBL_ALIGNED)
0343
0344
/* Unaligned path: same structure as the aligned one, but the stores use
 * lwl/lwr / swl/swr and the unaligned jump table. */
0345 .Loop_chacha_unaligned:
0346
0347 lw $at, (STACK_SIZE+16)($sp)
0348
0349
0350 bltz BYTES, .Lchacha_mips_no_full_block_unaligned
0351
0352 FOR_EACH_WORD_REV(STORE_UNALIGNED)
0353
0354
0355 bgtz BYTES, .Loop_chacha_rounds
0356
0357
0358 sw NONCE_0, 48(STATE)
0359
0360 .set noreorder
/* The addiu below sits in the bgez delay slot, so the counter increment
 * happens on both outcomes; the _0_b labels make it the landing point of
 * jump-table entry 0 (no whole words, 1-3 tail bytes) as well. */
0362 bgez BYTES, .Lchacha_mips_xor_done
0363 .Lchacha_mips_xor_unaligned_0_b:
0364 .Lchacha_mips_xor_aligned_0_b:
0365
0366 addiu NONCE_0, 1
0367 .set reorder
0368
/* XOR the 1-3 trailing bytes with SAVED_X.  $at still holds 4 * words,
 * so IN/OUT first skip the word-sized part already handled.  ROTx brings
 * the next keystream byte into the low 8 bits each step; CPU_TO_LE32 /
 * ROTR position the first byte (endianness-dependent). */
0369 .Lchacha_mips_xor_bytes:
0370 addu IN, $at
0371 addu OUT, $at
0372
0373 lbu T1, 0(IN)
/* $at = BYTES + 1: zero iff exactly one byte remained. */
0374 addiu $at, BYTES, 1
0375 CPU_TO_LE32(SAVED_X)
0376 ROTR(SAVED_X)
0377 xor T1, SAVED_X
0378 sb T1, 0(OUT)
0379 beqz $at, .Lchacha_mips_xor_done
0380
0381 lbu T1, 1(IN)
/* $at = BYTES + 2: zero iff exactly two bytes remained. */
0382 addiu $at, BYTES, 2
0383 ROTx SAVED_X, 8
0384 xor T1, SAVED_X
0385 sb T1, 1(OUT)
0386 beqz $at, .Lchacha_mips_xor_done
0387
/* Third (last possible) trailing byte. */
0388 lbu T1, 2(IN)
0389 ROTx SAVED_X, 8
0390 xor T1, SAVED_X
0391 sb T1, 2(OUT)
0392 b .Lchacha_mips_xor_done
0393
/* Mirror of .Lchacha_mips_no_full_block_aligned for the unaligned
 * jump table; see the comments there. */
0394 .Lchacha_mips_no_full_block_unaligned:
0395
0396 addiu BYTES, CHACHA20_BLOCK_SIZE
0397
0398
0399 andi $at, BYTES, MASK_U32
0400
0401
0402 lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
0403
0404
0405 ins T0, $at, 1, 6
0406
0407
0408 addu T1, STATE, $at
0409
0410
0411 addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
0412
0413
0414 lw SAVED_CA, 0(T1)
0415
0416
0417 subu BYTES, $at, BYTES
0418
0419 jr T0
0420
/* Unaligned jump table, entries 0..15 (see JMPTBL_UNALIGNED). */
0422 FOR_EACH_WORD(JMPTBL_UNALIGNED)
0423 .end chacha_crypt_arch
0424 .set at
0425
0426
0427
0428
0429
0430
0431
/* hchacha_block_arch keeps the whole state in registers but, unlike
 * chacha_crypt_arch, has no BYTES argument, no live counter and no store
 * loop — so remap X12-X15 onto registers that are free here: $a3, $at,
 * $v0, and STATE itself (X15 is loaded last, after every other read of
 * STATE; see below). */
0432 #undef X12
0433 #undef X13
0434 #undef X14
0435 #undef X15
0436
0437 #define X12 $a3
0438 #define X13 $at
0439 #define X14 $v0
0440 #define X15 STATE
0441
/* hchacha_block_arch(state, out, nrounds)
 * HChaCha core: run the ChaCha rounds over the 16-word state with no
 * final feed-forward addition, then emit words 0-3 and 12-15 of the
 * result (the HChaCha output) to OUT.
 * o32 ABI: STATE=$a0, OUT=$a1, $a2 = round count.
 * NOTE(review): prototype inferred from register usage — confirm. */

/* X13 aliases $at, so the assembler must not use $at behind our back. */
0442 .set noat
0443 .globl hchacha_block_arch
0444 .ent hchacha_block_arch
0445 hchacha_block_arch:
0446 .frame $sp, STACK_SIZE, $ra
0447
0448 addiu $sp, -STACK_SIZE
0449
0450
/* $s6 (X11) is the only callee-saved register used here — spill it. */
0451 sw X11, 0($sp)
0452
/* Load the full state.  X15 aliases STATE, so it must be (and is) the
 * last load through that register. */
0453 lw X0, 0(STATE)
0454 lw X1, 4(STATE)
0455 lw X2, 8(STATE)
0456 lw X3, 12(STATE)
0457 lw X4, 16(STATE)
0458 lw X5, 20(STATE)
0459 lw X6, 24(STATE)
0460 lw X7, 28(STATE)
0461 lw X8, 32(STATE)
0462 lw X9, 36(STATE)
0463 lw X10, 40(STATE)
0464 lw X11, 44(STATE)
0465 lw X12, 48(STATE)
0466 lw X13, 52(STATE)
0467 lw X14, 56(STATE)
0468 lw X15, 60(STATE)
0469
/* Round loop: one column round plus one diagonal round per pass, so the
 * round count in $a2 drops by 2 each iteration (same pattern as
 * chacha_crypt_arch). */
0470 .Loop_hchacha_xor_rounds:
0471 addiu $a2, -2
0472 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
0473 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
0474 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
0475 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
0476 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
0477 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
0478 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
0479 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
0480 bnez $a2, .Loop_hchacha_xor_rounds
0481
0482
/* Restore $s6 before returning. */
0483 lw X11, 0($sp)
0484
/* Output: words 0-3 and 12-15 of the permuted state, no feed-forward. */
0485 sw X0, 0(OUT)
0486 sw X1, 4(OUT)
0487 sw X2, 8(OUT)
0488 sw X3, 12(OUT)
0489 sw X12, 16(OUT)
0490 sw X13, 20(OUT)
0491 sw X14, 24(OUT)
0492 sw X15, 28(OUT)
0493
0494 addiu $sp, STACK_SIZE
0495 jr $ra
0496 .end hchacha_block_arch
0497 .set at