#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
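	// In terms of quadword registers, M_0-M_15 alias q8-q15
	// (q8 = {d16-d17}, q9 = {d18-d19}, ..., q15 = {d30-d31}).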
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
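	//
	// vtbl.8 sets each output byte to the table (source) byte selected by
	// the corresponding index byte.  For a little-endian 64-bit lane,
	// ror64(x, 24) moves input byte (i + 3) % 8 to output byte i, and
	// ror64(x, 16) moves input byte (i + 2) % 8 to output byte i, which is
	// exactly what the index tables below encode.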
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The
// stack pointer points to a 32-byte aligned buffer containing a copy of q8
// and q9 (M_0-M_3), so that they can be reloaded if they are used as
// temporary registers.  The macro arguments s0-s15 give the order in which
// the message words are used in this round.  'final' is 1 if this is the
// final round.
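//
// For reference, each of the column-mixing and diagonal-mixing halves below
// applies the BLAKE2b G function (RFC 7693) to four (a, b, c, d) quadruples
// in parallel, where x and y are the two message words for that quadruple:
//
//	G(a, b, c, d, x, y):
//		a += b + x;  d = ror64(d ^ a, 32);  c += d;  b = ror64(b ^ c, 24);
//		a += b + y;  d = ror64(d ^ a, 16);  c += d;  b = ror64(b ^ c, 63);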
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
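	//
	// (ror64(x, 63) equals rol64(x, 1): vshr.u64 #63 extracts the top bit
	// of each 64-bit lane, then vsli.u64 #1 shifts the lane left by one
	// and inserts it on top of that bit.)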
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
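	//
	// In the 'd' register view the state words are simply in order:
	// d0 = v[0], d1 = v[1], ..., d15 = v[15].  So, for example, the
	// 'vadd.u64 d0, d0, d5' below computes v[0] += v[5], the first step of
	// the first diagonal.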

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
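//
// 'inc' is added to the counter t for each block compressed.  As can be seen
// in the loop below, only t[0] is updated on the fast path; a carry out of
// its low 32 bits is handled by the .Lslow_inc_ctr path.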
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32		// Reserve 32 bytes below sp
	bic		ip, ip, #31		// Round down to a 32-byte boundary
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
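	//
	// The buffer layout is simply M_0-M_3 in order: M_0 and M_1 (q8) at
	// sp[0..15], M_2 and M_3 (q9) at sp[16..31].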
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
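	//
	// BLAKE2b has 12 rounds but only 10 distinct message permutations, so
	// the last two rounds reuse the permutations of the first two.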
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
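	//
	// t[0] is in (r7, r8); load t[1] into (r9, r10), propagate the carry
	// from the 'adds' above through the remaining three 32-bit words, and
	// write the updated counter back to t[0..1].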
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]	// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)