/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)     code
#endif
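/* ARM_LE() emits its argument only on little-endian builds: it wraps the
 * vrev32.8 byte swaps that convert the big-endian SHA-1 message words to
 * native word order, which big-endian kernels do not need. */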

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)
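
/*
 * WK_offs(i) maps round index i to a byte offset into the 64-byte W+K ring
 * buffer kept on the 16-byte aligned stack: 16 slots of W[i] + K, reused
 * modulo 16 as the rounds advance.
 *
 * The _R_F1.._R_F4 macros below implement the standard SHA-1 round
 * functions:
 *   F1(b,c,d) = (b & c) | (~b & d)              (rounds  0-19)
 *   F2(b,c,d) = b ^ c ^ d                       (rounds 20-39)
 *   F3(b,c,d) = (b & c) | (b & d) | (c & d)     (rounds 40-59)
 *   F4(b,c,d) = F2(b,c,d)                       (rounds 60-79)
 * F1 and F3 are evaluated as sums of bitwise-disjoint terms, (b & c) plus
 * (~b & d), respectively (b & c) plus (d & (b ^ c)), so the final OR can
 * be folded into the additions onto e.
 */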

#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
          W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    ldr RT3, [sp, WK_offs(i)]; \
        pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    bic RT0, d, b; \
    add e, e, a, ror #(32 - 5); \
    and RT1, c, b; \
        pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    add RT0, RT0, RT3; \
    add e, e, RT1; \
    ror b, #(32 - 30); \
        pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
          W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    ldr RT3, [sp, WK_offs(i)]; \
        pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    eor RT0, d, b; \
    add e, e, a, ror #(32 - 5); \
    eor RT0, RT0, c; \
        pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    add e, e, RT3; \
    ror b, #(32 - 30); \
        pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
          W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    ldr RT3, [sp, WK_offs(i)]; \
        pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    eor RT0, b, c; \
    and RT1, b, c; \
    add e, e, a, ror #(32 - 5); \
        pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    and RT0, RT0, d; \
    add RT1, RT1, RT3; \
    add e, e, RT0; \
    ror b, #(32 - 30); \
        pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
    add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
          W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
          W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
    _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)
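
/*
 * Each _R() round threads up to three single-step message-expansion macros
 * (pre1..pre3) through the scalar round computation, hiding NEON latency
 * behind the integer work.  R() is the plain variant used for the last 16
 * rounds of the final block, when no further expansion is needed; dummy()
 * discards its arguments and fills the unused slots.
 */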


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/
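
/*
 * Rounds 0-15 use the message words directly: load one 64-byte block,
 * byte-swap on little-endian CPUs, add K1 to all 16 words and store W+K
 * into the stack ring buffer.  W_PRECALC_00_15() does all of this in one
 * go for the first block; the numbered WPRECALC_00_15_n single-step macros
 * repeat the same work piecemeal so that it can be interleaved with rounds
 * 64-79 of the preceding block.
 */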

#define W_PRECALC_00_15() \
    add       RWK, sp, #(WK_offs(0));           \
    \
    vld1.32   {W0, W7}, [RDATA]!;               \
 ARM_LE(vrev32.8  W0, W0;   )   /* big => little */ \
    vld1.32   {W6, W5}, [RDATA]!;               \
    vadd.u32  tmp0, W0, curK;               \
 ARM_LE(vrev32.8  W7, W7;   )   /* big => little */ \
 ARM_LE(vrev32.8  W6, W6;   )   /* big => little */ \
    vadd.u32  tmp1, W7, curK;               \
 ARM_LE(vrev32.8  W5, W5;   )   /* big => little */ \
    vadd.u32  tmp2, W6, curK;               \
    vst1.32   {tmp0, tmp1}, [RWK]!;             \
    vadd.u32  tmp3, W5, curK;               \
    vst1.32   {tmp2, tmp3}, [RWK];              \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vld1.32   {W0, W7}, [RDATA]!;               \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    add       RWK, sp, #(WK_offs(0));           \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;   )   /* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vld1.32   {W6, W5}, [RDATA]!;               \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32  tmp0, W0, curK;               \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;   )   /* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;   )   /* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32  tmp1, W7, curK;               \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;   )   /* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32  tmp2, W6, curK;               \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vst1.32   {tmp0, tmp1}, [RWK]!;             \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32  tmp3, W5, curK;               \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vst1.32   {tmp2, tmp3}, [RWK];              \


/********* Precalc macros for rounds 16-31 ************************************/
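
/*
 * Rounds 16-31 expand the message with
 *     W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 * four words at a time.  The top lane of each vector would need W[i],
 * which is being computed in the same vector, so that lane is first
 * computed without its W[i-3] term (a zero is shifted in via vext) and the
 * missing contribution, rol(W[i], 1), is XORed in afterwards as the
 * pre-rotation lane 0 value rotated left by 2; this late fix-up is legal
 * because rotation distributes over XOR.
 */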

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor      tmp0, tmp0;           \
    vext.8    W, W_m16, W_m12, #8;      \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    add       RWK, sp, #(WK_offs(i));   \
    vext.8    tmp0, W_m04, tmp0, #4;    \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor      tmp0, tmp0, W_m16;        \
    veor.32   W, W, W_m08;          \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor      tmp1, tmp1;           \
    veor      W, W, tmp0;           \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vshl.u32  tmp0, W, #1;          \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vext.8    tmp1, tmp1, W, #(16-12);  \
    vshr.u32  W, W, #31;            \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vorr      tmp0, tmp0, W;        \
    vshr.u32  W, tmp1, #30;         \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vshl.u32  tmp1, tmp1, #2;       \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor      tmp0, tmp0, W;        \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor      W, tmp0, tmp1;        \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32  tmp0, W, curK;        \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/
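
/*
 * From round 32 on the recurrence can be applied to itself, giving
 *     W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 * in which every input is at least four words old, so a full 4-lane vector
 * is computed with no intra-vector dependency; the vext of W_m08 and W_m04
 * assembles the W[i-6] terms.
 */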

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
    vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
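
/*
 * A minimal C-side calling sketch (assuming, as the state_h* defines above
 * suggest, that ctx points at the five 32-bit SHA-1 chaining words h0..h4,
 * e.g. the digest array of the kernel's struct sha1_state):
 *
 *     u32 state[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
 *                      0x10325476, 0xC3D2E1F0 };   // the SHA-1 IV
 *     sha1_transform_neon(state, data, nblks);     // data: nblks * 64 bytes
 */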
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *    r0: ctx, CTX
   *    r1: data (64*nblks bytes)
   *    r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/
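  /*
   * q4-q7 are callee-saved under the AAPCS, which is why a vpush/vpop pair
   * is sketched here; it can stay disabled because in-kernel callers wrap
   * this function in kernel_neon_begin()/kernel_neon_end(), which already
   * preserve the NEON register file.
   */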

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;
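  /* sp now points at a 16-byte aligned 16*4-byte scratch area holding the
   * W+K ring buffer addressed through WK_offs(). */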

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
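/* curK names the NEON register holding the K constant for the rounds whose
 * W values are currently being precalculated; it is redefined to qK2..qK4
 * (and back to qK1 for the next block) as the schedule advances. */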
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
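  /* The thirteen WPRECALC_00_15_n steps are spread over these sixteen
   * rounds (padded with dummy slots), so the next block's message words
   * are loaded and K1-added while the current block finishes. */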
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
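  /* h0-h3 arrive via ldm in the four scratch registers; h4 is loaded
   * separately once RT0 has been consumed. */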
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)