/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32        0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE      32
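
/* MASK_U32 (0x3c) extracts the full-word part of the remaining byte count
 * (a multiple of 4, at most 60); STACK_SIZE covers the eight callee-saved
 * $s registers (8 * 4 bytes).
 */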

#define X0  $t0
#define X1  $t1
#define X2  $t2
#define X3  $t3
#define X4  $t4
#define X5  $t5
#define X6  $t6
#define X7  $t7
#define X8  $t8
#define X9  $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0  $s1
#define T1  $s0
#define T(n)    T ## n
#define X(n)    X ## n

/* Input arguments */
#define STATE       $a0
#define OUT     $a1
#define IN      $a2
#define BYTES       $a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0     $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X     X15
#define SAVED_CA    $s7

#define IS_UNALIGNED    $s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
    wsbh    n; \
    rotr    n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
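
/* The ChaCha stream is little endian. On big-endian CPUs CPU_TO_LE32 swaps
 * each keystream word (wsbh swaps bytes within halfwords, rotr 16 swaps the
 * halfwords), and ROTR/ROTx adjust the byte order used by the tail loop.
 * On little-endian CPUs both are no-ops or plain rotates.
 */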

#define FOR_EACH_WORD(x) \
    x( 0); \
    x( 1); \
    x( 2); \
    x( 3); \
    x( 4); \
    x( 5); \
    x( 6); \
    x( 7); \
    x( 8); \
    x( 9); \
    x(10); \
    x(11); \
    x(12); \
    x(13); \
    x(14); \
    x(15);

#define FOR_EACH_WORD_REV(x) \
    x(15); \
    x(14); \
    x(13); \
    x(12); \
    x(11); \
    x(10); \
    x( 9); \
    x( 8); \
    x( 7); \
    x( 6); \
    x( 5); \
    x( 4); \
    x( 3); \
    x( 2); \
    x( 1); \
    x( 0);

#define PLUS_ONE_0   1
#define PLUS_ONE_1   2
#define PLUS_ONE_2   3
#define PLUS_ONE_3   4
#define PLUS_ONE_4   5
#define PLUS_ONE_5   6
#define PLUS_ONE_6   7
#define PLUS_ONE_7   8
#define PLUS_ONE_8   9
#define PLUS_ONE_9  10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c)  _CONCAT3(a,b,c)

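/* Label arithmetic: the store code for word x is preceded by the label for
 * x + 1, so the jump table entry for "n words left" falls through the
 * stores for words n-1 .. 0. PLUS_ONE() and CONCAT3() build those label
 * names, since the preprocessor cannot evaluate x+1 inside a token paste.
 *
 * STORE_*: add the original state word to Xx (NONCE_0 for word 12), convert
 * to little endian, XOR with the input word and store the result to OUT.
 * The unaligned variant uses lwl/lwr and swl/swr pairs.
 */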
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
    .if (x != 12); \
        lw  T0, (x*4)(STATE); \
    .endif; \
    lwl T1, (x*4)+MSB ## (IN); \
    lwr T1, (x*4)+LSB ## (IN); \
    .if (x == 12); \
        addu    X ## x, NONCE_0; \
    .else; \
        addu    X ## x, T0; \
    .endif; \
    CPU_TO_LE32(X ## x); \
    xor X ## x, T1; \
    swl X ## x, (x*4)+MSB ## (OUT); \
    swr X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
    .if (x != 12); \
        lw  T0, (x*4)(STATE); \
    .endif; \
    lw  T1, (x*4) ## (IN); \
    .if (x == 12); \
        addu    X ## x, NONCE_0; \
    .else; \
        addu    X ## x, T0; \
    .endif; \
    CPU_TO_LE32(X ## x); \
    xor X ## x, T1; \
    sw  X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
    .set    noreorder; \
    b   .Lchacha_mips_xor_aligned_ ## x ## _b; \
    .if (x == 12); \
        addu    SAVED_X, X ## x, NONCE_0; \
    .else; \
        addu    SAVED_X, X ## x, SAVED_CA; \
    .endif; \
    .set    reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
    .set    noreorder; \
    b   .Lchacha_mips_xor_unaligned_ ## x ## _b; \
    .if (x == 12); \
        addu    SAVED_X, X ## x, NONCE_0; \
    .else; \
        addu    SAVED_X, X ## x, SAVED_CA; \
    .endif; \
    .set    reorder

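/* Each entry is exactly 8 bytes: a branch plus its delay-slot addu
 * (noreorder keeps the pair together), so the dispatch code can index the
 * table with word_count * 8.
 */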
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
    addu    X(A), X(K); \
    addu    X(B), X(L); \
    addu    X(C), X(M); \
    addu    X(D), X(N); \
    xor X(V), X(A); \
    xor X(W), X(B); \
    xor X(Y), X(C); \
    xor X(Z), X(D); \
    rotl    X(V), S;    \
    rotl    X(W), S;    \
    rotl    X(Y), S;    \
    rotl    X(Z), S;
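
/* AXR is one add/xor/rotate step, interleaved across four quarter rounds:
 * X(A..D) += X(K..N); X(V..Z) ^= X(A..D); X(V..Z) <<<= S.
 * Four AXR invocations (S = 16, 12, 8, 7) complete four full quarter rounds.
 */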

.text
.set    reorder
.set    noat
.globl  chacha_crypt_arch
.ent    chacha_crypt_arch
chacha_crypt_arch:
    .frame  $sp, STACK_SIZE, $ra

    /* Load number of rounds */
    lw  $at, 16($sp)

    addiu   $sp, -STACK_SIZE

    /* Return if bytes = 0. */
    beqz    BYTES, .Lchacha_mips_end

    lw  NONCE_0, 48(STATE)

    /* Save s0-s7 */
    sw  $s0,  0($sp)
    sw  $s1,  4($sp)
    sw  $s2,  8($sp)
    sw  $s3, 12($sp)
    sw  $s4, 16($sp)
    sw  $s5, 20($sp)
    sw  $s6, 24($sp)
    sw  $s7, 28($sp)

    /* Test whether IN or OUT is unaligned.
     * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
     */
    or  IS_UNALIGNED, IN, OUT
    andi    IS_UNALIGNED, 0x3

    b   .Lchacha_rounds_start

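/* Per-block loop: each pass advances IN and OUT by one block and increments
 * NONCE_0, the in-register copy of state word 12, before reloading the state.
 */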
.align 4
.Loop_chacha_rounds:
    addiu   IN,  CHACHA20_BLOCK_SIZE
    addiu   OUT, CHACHA20_BLOCK_SIZE
    addiu   NONCE_0, 1

.Lchacha_rounds_start:
    lw  X0,  0(STATE)
    lw  X1,  4(STATE)
    lw  X2,  8(STATE)
    lw  X3,  12(STATE)

    lw  X4,  16(STATE)
    lw  X5,  20(STATE)
    lw  X6,  24(STATE)
    lw  X7,  28(STATE)
    lw  X8,  32(STATE)
    lw  X9,  36(STATE)
    lw  X10, 40(STATE)
    lw  X11, 44(STATE)

    move    X12, NONCE_0
    lw  X13, 52(STATE)
    lw  X14, 56(STATE)
    lw  X15, 60(STATE)
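
/* One pass is a double round: four column quarter rounds, then four
 * diagonal quarter rounds, so the round counter in $at drops by 2.
 */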
.Loop_chacha_xor_rounds:
    addiu   $at, -2
    AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
    AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
    AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
    AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
    AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
    AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
    AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
    AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
    bnez    $at, .Loop_chacha_xor_rounds

    addiu   BYTES, -(CHACHA20_BLOCK_SIZE)

    /* Jump if data src/dst is unaligned. */
    bnez    IS_UNALIGNED, .Loop_chacha_unaligned

    /* Set the number of rounds here to fill the delay slot. */
    lw  $at, (STACK_SIZE+16)($sp)

    /* BYTES < 0 means there is no full block. */
    bltz    BYTES, .Lchacha_mips_no_full_block_aligned

    FOR_EACH_WORD_REV(STORE_ALIGNED)

    /* BYTES > 0? Loop again. */
    bgtz    BYTES, .Loop_chacha_rounds

    /* Placed here to fill the delay slot. */
    addiu   NONCE_0, 1

    /* BYTES < 0? Handle the last bytes. */
    bltz    BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
    /* Restore used registers */
    lw  $s0,  0($sp)
    lw  $s1,  4($sp)
    lw  $s2,  8($sp)
    lw  $s3, 12($sp)
    lw  $s4, 16($sp)
    lw  $s5, 20($sp)
    lw  $s6, 24($sp)
    lw  $s7, 28($sp)

    /* Write NONCE_0 back to its location in the state */
    sw  NONCE_0, 48(STATE)

.Lchacha_mips_end:
    addiu   $sp, STACK_SIZE
    jr  $ra

.Lchacha_mips_no_full_block_aligned:
    /* Restore the offset on BYTES */
    addiu   BYTES, CHACHA20_BLOCK_SIZE

    /* Get the number of full words, as a byte count */
    andi    $at, BYTES, MASK_U32

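/* Dispatch: jump to jmptbl_aligned_0 + n*8, where n is the number of full
 * words left. Entry n saves the keystream word for the partial tail in
 * SAVED_X and branches into the reverse store sequence at word n-1.
 */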
    /* Load upper half of jump table addr */
    lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

    /* Calculate lower half jump table offset: each entry is 8 bytes, so
     * the full-word byte count is doubled by inserting it at bit 1.
     */
    ins T0, $at, 1, 6

    /* Add offset to STATE */
    addu    T1, STATE, $at

    /* Add lower half jump table addr */
    addiu   T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

    /* Read the original state word for the partial tail */
    lw  SAVED_CA, 0(T1)

    /* Store the remaining byte count as a negative value */
    subu    BYTES, $at, BYTES

    jr  T0

    /* Jump table */
    FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
    /* Set the number of rounds here to fill the delay slot. */
    lw  $at, (STACK_SIZE+16)($sp)

    /* BYTES < 0 means there is no full block. */
    bltz    BYTES, .Lchacha_mips_no_full_block_unaligned

    FOR_EACH_WORD_REV(STORE_UNALIGNED)

    /* BYTES > 0? Loop again. */
    bgtz    BYTES, .Loop_chacha_rounds

    /* Write NONCE_0 back to its location in the state */
    sw  NONCE_0, 48(STATE)

    .set noreorder
    /* Fall through to byte handling */
    bgez    BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
    /* Placed here to fill the delay slot. */
    addiu   NONCE_0, 1
    .set reorder

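/* SAVED_X holds the keystream word for the partial tail and BYTES holds the
 * negated tail length (1 to 3). IN/OUT are first advanced past the stored
 * full words; each step then XORs one byte and rotates SAVED_X by 8 bits.
 * $at = BYTES + k reaches zero after the k-th (last) tail byte.
 */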
.Lchacha_mips_xor_bytes:
    addu    IN, $at
    addu    OUT, $at
    /* First byte */
    lbu T1, 0(IN)
    addiu   $at, BYTES, 1
    CPU_TO_LE32(SAVED_X)
    ROTR(SAVED_X)
    xor T1, SAVED_X
    sb  T1, 0(OUT)
    beqz    $at, .Lchacha_mips_xor_done
    /* Second byte */
    lbu T1, 1(IN)
    addiu   $at, BYTES, 2
    ROTx    SAVED_X, 8
    xor T1, SAVED_X
    sb  T1, 1(OUT)
    beqz    $at, .Lchacha_mips_xor_done
    /* Third byte */
    lbu T1, 2(IN)
    ROTx    SAVED_X, 8
    xor T1, SAVED_X
    sb  T1, 2(OUT)
    b   .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
    /* Restore the offset on BYTES */
    addiu   BYTES, CHACHA20_BLOCK_SIZE

    /* Get the number of full words, as a byte count */
    andi    $at, BYTES, MASK_U32

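/* Same dispatch as the aligned path, via the unaligned jump table. */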
    /* Load upper half of jump table addr */
    lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

    /* Calculate lower half jump table offset: each entry is 8 bytes, so
     * the full-word byte count is doubled by inserting it at bit 1.
     */
    ins T0, $at, 1, 6

    /* Add offset to STATE */
    addu    T1, STATE, $at

    /* Add lower half jump table addr */
    addiu   T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

    /* Read the original state word for the partial tail */
    lw  SAVED_CA, 0(T1)

    /* Store the remaining byte count as a negative value */
    subu    BYTES, $at, BYTES

    jr  T0

    /* Jump table */
    FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE    $a0
 * OUT      $a1
 * NROUND   $a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
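
/* hchacha outputs only words 0-3 and 12-15, so X12-X15 can live in
 * caller-clobbered registers; STATE ($a0) itself becomes X15 once the
 * last state word has been loaded.
 */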

.set noat
.globl  hchacha_block_arch
.ent    hchacha_block_arch
hchacha_block_arch:
    .frame  $sp, STACK_SIZE, $ra

    addiu   $sp, -STACK_SIZE

    /* Save X11 ($s6) */
    sw  X11, 0($sp)

    lw  X0,  0(STATE)
    lw  X1,  4(STATE)
    lw  X2,  8(STATE)
    lw  X3,  12(STATE)
    lw  X4,  16(STATE)
    lw  X5,  20(STATE)
    lw  X6,  24(STATE)
    lw  X7,  28(STATE)
    lw  X8,  32(STATE)
    lw  X9,  36(STATE)
    lw  X10, 40(STATE)
    lw  X11, 44(STATE)
    lw  X12, 48(STATE)
    lw  X13, 52(STATE)
    lw  X14, 56(STATE)
    lw  X15, 60(STATE)

.Loop_hchacha_xor_rounds:
    addiu   $a2, -2
    AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
    AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
    AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
    AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
    AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
    AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
    AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
    AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
    bnez    $a2, .Loop_hchacha_xor_rounds

    /* Restore used register */
    lw  X11, 0($sp)

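/* Output words 0-3 and 12-15; unlike the full ChaCha block function there
 * is no feed-forward addition of the input state here.
 */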
    sw  X0,  0(OUT)
    sw  X1,  4(OUT)
    sw  X2,  8(OUT)
    sw  X3,  12(OUT)
    sw  X12, 16(OUT)
    sw  X13, 20(OUT)
    sw  X14, 24(OUT)
    sw  X15, 28(OUT)

    addiu   $sp, STACK_SIZE
    jr  $ra
.end hchacha_block_arch
.set at