arm/crypto/blake2s-core.S

0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * BLAKE2s digest algorithm, ARM scalar implementation
0004  *
0005  * Copyright 2020 Google LLC
0006  *
0007  * Author: Eric Biggers <ebiggers@google.com>
0008  */
0009
0010 #include <linux/linkage.h>
0011 #include <asm/assembler.h>
0012
0013     // Registers used to hold message words temporarily.  There aren't
0014     // enough ARM registers to hold the whole message block, so we have to
0015     // load the words on-demand.
0016     M_0     .req    r12     // message word for the first column/diagonal of a pair
0017     M_1     .req    r14     // message word for the second one; r14 is lr, which blake2s_compress pushes on entry
0018
0019 // The BLAKE2s initialization vector, IV[0..7].  blake2s_compress finds it
     // with a PC-relative adr and uses it to build v[8..15].
0020 .Lblake2s_IV:
0021     .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A     // IV[0..3]
0022     .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19     // IV[4..7]
0023
0024 .macro __ldrd       a, b, src, offset
     // Load two adjacent 32-bit words:
     //     a = [src + offset], b = [src + offset + 4]
     // offset must be an assembly-time constant.
0025 #if __LINUX_ARM_ARCH__ >= 6
     // Architecture version 6 and later: a single doubleword load.
     // NOTE(review): in ARM state ldrd wants an even-numbered first register
     // followed by the next register; every use in this file passes such a
     // pair (r0/r1, r8/r9 or r10/r11).
0026     ldrd        \a, \b, [\src, #\offset]
0027 #else
     // Older architecture versions: two single-word loads.
0028     ldr     \a, [\src, #\offset]
0029     ldr     \b, [\src, #\offset + 4]
0030 #endif
0031 .endm
0032
0033 .macro __strd       a, b, dst, offset
     // Store two adjacent 32-bit words:
     //     [dst + offset] = a, [dst + offset + 4] = b
     // offset must be an assembly-time constant.
0034 #if __LINUX_ARM_ARCH__ >= 6
     // Architecture version 6 and later: a single doubleword store.
     // NOTE(review): in ARM state strd wants an even-numbered first register
     // followed by the next register; every use in this file passes such a
     // pair (r8/r9 or r10/r11).
0035     strd        \a, \b, [\dst, #\offset]
0036 #else
     // Older architecture versions: two single-word stores.
0037     str     \a, [\dst, #\offset]
0038     str     \b, [\dst, #\offset + 4]
0039 #endif
0040 .endm
0041
0042 .macro _le32_bswap  a, tmp
     // Convert the 32-bit value in register a between little-endian and CPU
     // byte order.  Expands to nothing unless __ARMEB__ is defined, so on
     // builds without it neither a nor tmp is touched.
0043 #ifdef __ARMEB__
     // rev_l is not defined in this file (presumably it comes from
     // <asm/assembler.h>).  tmp is handed to it as a scratch register, so
     // treat tmp as clobbered.
0044     rev_l       \a, \tmp
0045 #endif
0046 .endm
0047
0048 .macro _le32_bswap_8x   a, b, c, d, e, f, g, h,  tmp
     // Apply _le32_bswap to each of the eight registers a..h in turn, sharing
     // the single scratch register tmp.  Like _le32_bswap, this expands to
     // nothing unless __ARMEB__ is defined.
0049     _le32_bswap \a, \tmp
0050     _le32_bswap \b, \tmp
0051     _le32_bswap \c, \tmp
0052     _le32_bswap \d, \tmp
0053     _le32_bswap \e, \tmp
0054     _le32_bswap \f, \tmp
0055     _le32_bswap \g, \tmp
0056     _le32_bswap \h, \tmp
0057 .endm
0058
0059 // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
0060 // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
0061 // columns/diagonals.  s0-s1 are the word offsets to the message words the first
0062 // column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
0063 // M_0 and M_1 are free to use, and the message block can be found at sp + 32.
0064 //
0065 // Note that to save instructions, the rotations don't happen when the
0066 // pseudocode says they should, but rather they are delayed until the values are
0067 // used.  See the comment above _blake2s_round().
0068 .macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
     // Two independent mixing computations are interleaved instruction by
     // instruction: one on (a0, b0, c0, d0) with message words s0 and s1, the
     // other on (a1, b1, c1, d1) with message words s2 and s3.
     //
     // brot and drot are assembly-time symbols (set with .set by the caller)
     // giving the right-rotation still owed to the incoming b and d values.
     // On exit every b value is left owing a rotation of 7 and every d value
     // a rotation of 8; a and c hold their final values.
     //
     // Only ldr, add and eor without the s suffix are used, so the condition
     // flags are not modified.  M_0 and M_1 are overwritten.
0069
0070     ldr     M_0, [sp, #32 + 4 * \s0]     // first message word for (a0, b0, c0, d0)
0071     ldr     M_1, [sp, #32 + 4 * \s2]     // first message word for (a1, b1, c1, d1)
0072
0073     // a += b + m[blake2s_sigma[r][2*i + 0]];   (ror #brot settles what b still owes)
0074     add     \a0, \a0, \b0, ror #brot
0075     add     \a1, \a1, \b1, ror #brot
0076     add     \a0, \a0, M_0
0077     add     \a1, \a1, M_1
0078
0079     // d = ror32(d ^ a, 16);   (ror #drot settles what d still owes; the ror 16 is deferred)
0080     eor     \d0, \a0, \d0, ror #drot
0081     eor     \d1, \a1, \d1, ror #drot
0082
0083     // c += d;   (applies the deferred ror 16 on the fly; d itself stays un-rotated)
0084     add     \c0, \c0, \d0, ror #16
0085     add     \c1, \c1, \d1, ror #16
0086
0087     // b = ror32(b ^ c, 12);   (b still owes brot here; the ror 12 is deferred)
0088     eor     \b0, \c0, \b0, ror #brot
0089     eor     \b1, \c1, \b1, ror #brot
0090
0091     ldr     M_0, [sp, #32 + 4 * \s1]     // second message word for (a0, b0, c0, d0)
0092     ldr     M_1, [sp, #32 + 4 * \s3]     // second message word for (a1, b1, c1, d1)
0093
0094     // a += b + m[blake2s_sigma[r][2*i + 1]];   (applies the deferred ror 12)
0095     add     \a0, \a0, \b0, ror #12
0096     add     \a1, \a1, \b1, ror #12
0097     add     \a0, \a0, M_0
0098     add     \a1, \a1, M_1
0099
0100     // d = ror32(d ^ a, 8);   (ror #16 settles the earlier rotation; the ror 8 is deferred)
0101     eor     \d0, \a0, \d0, ror#16
0102     eor     \d1, \a1, \d1, ror#16
0103
0104     // c += d;   (applies the deferred ror 8 on the fly)
0105     add     \c0, \c0, \d0, ror#8
0106     add     \c1, \c1, \d1, ror#8
0107
0108     // b = ror32(b ^ c, 7);   (ror #12 settles the earlier rotation; the ror 7 stays owed)
0109     eor     \b0, \c0, \b0, ror#12
0110     eor     \b1, \c1, \b1, ror#12
0111 .endm
0112
0113 // Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
0114 // are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
0115 // spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
0116 // r14 are free to use.  The macro arguments s0-s15 give the order in which the
0117 // message words are used in this round.
0118 //
0119 // All rotates are performed using the implicit rotate operand accepted by the
0120 // 'add' and 'eor' instructions.  This is faster than using explicit rotate
0121 // instructions.  To make this work, we allow the values in the second and last
0122 // rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
0123 // wrong rotation amount.  The rotation amount is then fixed up just in time
0124 // when the values are used.  'brot' is the number of bits the values in row 'b'
0125 // need to be rotated right to arrive at the correct values, and 'drot'
0126 // similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
0127 // that they end up as (7, 8) after every round.
0128 .macro  _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
0129             s8, s9, s10, s11, s12, s13, s14, s15
     // Stack slots used below, relative to sp:
     //     sp + 0  .. sp + 7     scratch for v[8] and v[9]
     //     sp + 8  .. sp + 31    v[10] .. v[15], four bytes each
     //     sp + 32 .. sp + 95    the 16 message words
     // On entry and on exit v[0..9] are in r0-r9.  r10 and r11 are used as
     // temporaries for the row d values, and the quarter-round macro
     // overwrites M_0 and M_1 (r12 and r14).
0130
0131     // Mix first two columns:
0132     // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
0133     __ldrd      r10, r11, sp, 16    // load v[12] and v[13]
0134     _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
0135                 \s0, \s1, \s2, \s3
0136     __strd      r8, r9, sp, 0       // spill v[8] and v[9]; r8-r9 are reused for v[10..11]
0137     __strd      r10, r11, sp, 16    // store v[12] and v[13]
0138
0139     // Mix second two columns:
0140     // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
0141     __ldrd      r8, r9, sp, 8       // load v[10] and v[11]
0142     __ldrd      r10, r11, sp, 24    // load v[14] and v[15]
0143     _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
0144                 \s4, \s5, \s6, \s7
0145     str     r10, [sp, #24]      // store v[14]
0146     // v[10], v[11], and v[15] are used below, so no need to store them yet.
0147
     // After the column step every row b value owes a rotation of 7 and every
     // row d value a rotation of 8.  In the first round this moves the
     // symbols away from the (0, 0) set by the caller; in later rounds they
     // already hold (7, 8) and these lines change nothing.
0148     .set brot, 7
0149     .set drot, 8
0150
0151     // Mix first two diagonals:
0152     // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
0153     ldr     r10, [sp, #16]      // load v[12]
0154     _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
0155                 \s8, \s9, \s10, \s11
0156     __strd      r8, r9, sp, 8       // store v[10] and v[11]
0157     str     r11, [sp, #28]      // store v[15]
0158     str     r10, [sp, #16]      // store v[12]
0159
0160     // Mix second two diagonals:
0161     // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
0162     __ldrd      r8, r9, sp, 0       // load v[8] and v[9]
     // NOTE(review): sp + 20 is a multiple of 4 but not of 8 when sp is
     // 8-byte aligned, so the ldrd/strd form of these two accesses relies on
     // doubleword transfers accepting word alignment - confirm for the
     // targeted cores.
0163     __ldrd      r10, r11, sp, 20    // load v[13] and v[14]
0164     _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
0165                 \s12, \s13, \s14, \s15
0166     __strd      r10, r11, sp, 20    // store v[13] and v[14]; v[8..9] stay in r8-r9
0167 .endm
0168
0169 //
0170 // void blake2s_compress(struct blake2s_state *state,
0171 //           const u8 *block, size_t nblocks, u32 inc);
0172 //
0173 // Only the first three fields of struct blake2s_state are used:
0174 //  u32 h[8];   (inout)
0175 //  u32 t[2];   (inout)
0176 //  u32 f[2];   (in)
0177 //
0178     .align      5       // 2^5 = 32-byte alignment (ARM gas treats the operand as a power of two)
0179 ENTRY(blake2s_compress)
     // In:  r0 = state, r1 = block, r2 = nblocks, r3 = inc.
     // Compresses nblocks consecutive 64-byte blocks into state->h, adding inc
     // to the 64-bit counter t for the first block and 64 for each later one.
     //
     // NOTE(review): nblocks is only tested after a block has been processed
     // (subs/bne at the bottom of the loop), so nblocks == 0 is not handled;
     // callers presumably guarantee nblocks >= 1 - confirm.
     //
     // The push below leaves state at [sp, #0], block at [sp, #4] and nblocks
     // at [sp, #8]; those slots are the live copies across the loop.
0180     push        {r0-r2,r4-r11,lr}   // 12 registers; keep the count even so sp stays 8-byte aligned (if it was on entry)
0181
0182 .Lnext_block:
0183     // r0 is 'state'
0184     // r1 is 'block'
0185     // r3 is 'inc'
     // (r2 is not needed inside the loop body and is reused as scratch.)
0186
0187     // Load and increment the counter t[0..1].
0188     __ldrd      r10, r11, r0, 32    // t[0..1] sits after the 32 bytes of h[0..7]
0189     adds        r10, r10, r3        // t[0] += inc, setting the carry flag
0190     adc     r11, r11, #0            // t[1] += carry
0191     __strd      r10, r11, r0, 32    // r10-r11 keep the new t[0..1] for v[12..13] below
0192
0193     // _blake2s_round is very short on registers, so copy the message block
0194     // to the stack to save a register during the rounds.  This also has the
0195     // advantage that misalignment only needs to be dealt with in one place.
0196     sub     sp, sp, #64             // 64-byte buffer for the message words
0197     mov     r12, sp                 // r12 = write cursor into that buffer
0198     tst     r1, #3
0199     bne     .Lcopy_block_misaligned // block is not 4-byte aligned
     // Aligned case: two 32-byte bursts through r2-r9 (this overwrites r2 and
     // r3; inc has already been consumed and nblocks lives on the stack).
     // r14 is the byte-swap scratch register.
0200     ldmia       r1!, {r2-r9}
0201     _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
0202     stmia       r12!, {r2-r9}
0203     ldmia       r1!, {r2-r9}
0204     _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
0205     stmia       r12, {r2-r9}
0206 .Lcopy_block_done:
     // r1 now points just past the block that was copied.
0207     str     r1, [sp, #68]       // Update message pointer: the pushed 'block' slot, now 64 bytes higher
0208
0209     // Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
0210     // for spilling v[8..9].  Leave v[8..9] in r8-r9.
0211     mov     r14, r0         // r14 = state
0212     adr     r12, .Lblake2s_IV
0213     ldmia       r12!, {r8-r9}       // load IV[0..1]
0214     __ldrd      r0, r1, r14, 40     // load f[0..1]
0215     ldm     r12, {r2-r7}        // load IV[2..7]
0216     eor     r4, r4, r10     // v[12] = IV[4] ^ t[0]
0217     eor     r5, r5, r11     // v[13] = IV[5] ^ t[1]
0218     eor     r6, r6, r0      // v[14] = IV[6] ^ f[0]
0219     eor     r7, r7, r1      // v[15] = IV[7] ^ f[1]
0220     push        {r2-r7}         // push v[10..15]
0221     sub     sp, sp, #8      // leave space for v[8..9]
     // Stack from here until the rounds finish, relative to sp:
     //     sp + 0    scratch for v[8..9]
     //     sp + 8    v[10..15]
     //     sp + 32   message words
     //     sp + 96   saved state, block, nblocks, then r4-r11 and lr
0222
0223     // Load h[0..7] == v[0..7].
0224     ldm     r14, {r0-r7}
0225
0226     // Execute the rounds.  Each round is provided the order in which it
0227     // needs to use the message words.
0228     .set brot, 0
0229     .set drot, 0
0230     _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
0231     _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
0232     _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
0233     _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
0234     _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
0235     _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
0236     _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
0237     _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
0238     _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
0239     _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
0240
0241     // Fold the final state matrix into the hash chaining value:
0242     //
0243     //  for (i = 0; i < 8; i++)
0244     //      h[i] ^= v[i] ^ v[i + 8];
0245     //
     // v[4..7] (r4-r7) and v[12..15] still owe their brot / drot rotations,
     // which are applied in the eor operands below.
0246     ldr     r14, [sp, #96]      // r14 = &h[0] (the saved 'state' pointer)
0247     add     sp, sp, #8      // v[8..9] are already loaded.
0248     pop     {r10-r11}       // load v[10..11]
0249     eor     r0, r0, r8      // v[0] ^ v[8]
0250     eor     r1, r1, r9      // v[1] ^ v[9]
0251     eor     r2, r2, r10     // v[2] ^ v[10]
0252     eor     r3, r3, r11     // v[3] ^ v[11]
0253     ldm     r14, {r8-r11}       // load h[0..3]
0254     eor     r0, r0, r8
0255     eor     r1, r1, r9
0256     eor     r2, r2, r10
0257     eor     r3, r3, r11
0258     stmia       r14!, {r0-r3}       // store new h[0..3]
0259     ldm     r14, {r0-r3}        // load old h[4..7]
0260     pop     {r8-r11}        // load v[12..15]
0261     eor     r0, r0, r4, ror #brot   // h[4..7] ^= v[4..7]
0262     eor     r1, r1, r5, ror #brot
0263     eor     r2, r2, r6, ror #brot
0264     eor     r3, r3, r7, ror #brot
0265     eor     r0, r0, r8, ror #drot   // h[4..7] ^= v[12..15]
0266     eor     r1, r1, r9, ror #drot
0267     eor     r2, r2, r10, ror #drot
0268     eor     r3, r3, r11, ror #drot
0269       add       sp, sp, #64     // skip copy of message block
0270     stm     r14, {r0-r3}        // store new h[4..7]
0271
0272     // Advance to the next block, if there is one.  Note that if there are
0273     // multiple blocks, then 'inc' (the counter increment amount) must be
0274     // 64.  So we can simply set it to 64 without re-loading it.
0275     ldm     sp, {r0, r1, r2}    // load (state, block, nblocks)
0276     mov     r3, #64         // set 'inc'
0277     subs        r2, r2, #1      // nblocks--
0278     str     r2, [sp, #8]        // write the count back; str leaves the flags from subs intact
0279     bne     .Lnext_block        // nblocks != 0?
0280
0281     pop     {r0-r2,r4-r11,pc}   // restore the saved registers and return by loading pc from the saved lr
0282
0283     // The next message block (pointed to by r1) isn't 4-byte aligned, so it
0284     // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
0285     // by r12) using an alternative method.  r2-r9 are free to use.
0286 .Lcopy_block_misaligned:
0287     mov     r2, #64             // r2 = bytes left to copy
0288 1:
0289 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
0290     ldr     r3, [r1], #4        // unaligned word load; r1 += 4
0291     _le32_bswap r3, r4
0292 #else
     // Build the word from individual bytes, lowest address in the low byte.
     // The value is already in little-endian word form, so no swap is needed.
0293     ldrb        r3, [r1, #0]
0294     ldrb        r4, [r1, #1]
0295     ldrb        r5, [r1, #2]
0296     ldrb        r6, [r1, #3]
0297     add     r1, r1, #4
0298     orr     r3, r3, r4, lsl #8
0299     orr     r3, r3, r5, lsl #16
0300     orr     r3, r3, r6, lsl #24
0301 #endif
0302     subs        r2, r2, #4          // four fewer bytes to go; sets Z when done
0303     str     r3, [r12], #4       // append the word to the stack buffer; flags unchanged
0304     bne     1b
0305     b       .Lcopy_block_done   // r1 has advanced by 64, as in the aligned path
0306 ENDPROC(blake2s_compress)