#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

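/*
 * SHA1_VECTOR_ASM emits one SHA-1 transform function named \name.
 * The rough C-side prototype (the exact types live in the C glue code,
 * so this is an assumption):
 *
 *	void \name(u32 *digest, const u8 *data, int blocks);
 *
 * It saves the callee-saved registers it clobbers, reserves a 16-byte
 * aligned 64-byte stack workspace for the WK() ring of precomputed
 * w[i]+K values, turns the block count into an end pointer, loads the
 * round-constant table and byte-swap shuffle mask, runs the main body,
 * then wipes the workspace (8 quadwords via rep stosq) before returning.
 */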
.macro SHA1_VECTOR_ASM  name
SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

SYM_FUNC_END(\name)
.endm

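/*
 * SHA1_PIPELINED_MAIN_BODY processes all 64-byte blocks between
 * BUFFER_PTR and BUFFER_END:
 *
 *  - load the five 32-bit state words a..e from HASH_PTR
 *  - precompute the first W_PRECALC_AHEAD (16) w[i]+K values
 *  - per block: 80 rounds in four groups of 20 (F1, F2, F3, F4 = F2),
 *    each RR doing two rounds while scheduling w[] 16 rounds ahead
 *  - fold a..e back into the digest with UPDATE_HASH
 *
 * Once the last block has been consumed, BUFFER_PTR is redirected to
 * K_XMM_AR so the trailing lookahead precalculation reads harmless,
 * valid memory instead of running past the input buffer.
 */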
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	.set i, 0
	.rept W_PRECALC_AHEAD
		W_PRECALC i
		.set i, (i+1)
	.endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm

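/*
 * Register renaming: the working variables A..E and the temporaries T1/T2
 * are assembler symbols (.set) bound to physical registers rather than
 * fixed register names.  SWAP_REG_NAMES exchanges two such bindings, so a
 * "mov x, T1" followed by a rename moves a value without requiring a
 * second copy later.  RESTORE_RENAMED_REGS moves the final values back
 * into the canonical REG_A..REG_E registers so the bindings set up by
 * INIT_REGALLOC are valid again for the next block.
 */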
.macro INIT_REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set T1, REG_T1
	.set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
	.set _T, \a
	.set \a, \b
	.set \b, _T
.endm

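/*
 * SHA-1 round functions, each leaving its result in T1 (the source
 * register is consumed via SWAP_REG_NAMES rather than preserved):
 *
 *	F1(b,c,d) = (b & c) | (~b & d)		("Ch",     rounds  0-19)
 *		  = d ^ (b & (c ^ d))		 as computed here
 *	F2(b,c,d) = b ^ c ^ d			("Parity", rounds 20-39)
 *	F3(b,c,d) = (b & c) | ((b | c) & d)	("Maj",    rounds 40-59)
 *	F4(b,c,d) = F2(b,c,d)			(          rounds 60-79)
 */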
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

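/*
 * RR does two SHA-1 rounds back to back, interleaved with the W
 * precalculation for W_PRECALC_AHEAD (16) rounds ahead.  One round is,
 * in C-like pseudocode:
 *
 *	e += WK(i);			// w[i] + K, precomputed on the stack
 *	e += F(b, c, d);
 *	e += rol32(a, 5);
 *	b  = rol32(b, 30);
 *	(a, b, c, d, e) = (e, a, b, c, d);	// done by renaming at the call site
 *
 * The "ror $7" below undoes part of the earlier "rol $5" so that \a ends
 * up rotated left by 30 in total, which is what the second round needs.
 */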
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F	\b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F	\a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a			# (a <<r 5) >>r 7  =>  a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

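/*
 * W_PRECALC dispatches one step of the message-schedule precalculation
 * for round r.  K_XMM selects the 16-byte block of K_XMM_AR holding the
 * round constant for r (0-19, 20-39, 40-59, 60-79).  Rounds 0-15 (and the
 * wrap-around r >= 80, which prepares the next block's first 16 words)
 * use the plain load/byte-swap path; rounds 16-31 and 32-79 use the two
 * vectorized recurrences below.  Each 4-word group is spread over four
 * scalar rounds so the vector and scalar streams interleave.
 */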
.macro W_PRECALC  r
	.set i, \r

	.if (i < 20)
		.set K_XMM, 0
	.elseif (i < 40)
		.set K_XMM, 16
	.elseif (i < 60)
		.set K_XMM, 32
	.elseif (i < 80)
		.set K_XMM, 48
	.endif

	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
		.if (i == 0)
			W_PRECALC_RESET
		.endif
		W_PRECALC_00_15
	.elseif (i < 32)
		W_PRECALC_16_31
	.elseif (i < 80)		// rounds 32-79
		W_PRECALC_32_79
	.endif
.endm

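/*
 * The last 32 w[] words (eight groups of four) live in %xmm1-%xmm8 under
 * rotating aliases: W holds the group being computed, W_minus_04 the
 * previous one, and so on back to W_minus_32.  W_PRECALC_RESET sets up
 * the initial bindings; W_PRECALC_ROTATE renames them after each group,
 * so the window slides without copying any data.
 */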
.macro W_PRECALC_RESET
	.set W,          W0
	.set W_minus_04, W4
	.set W_minus_08, W8
	.set W_minus_12, W12
	.set W_minus_16, W16
	.set W_minus_20, W20
	.set W_minus_24, W24
	.set W_minus_28, W28
	.set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
	.set W_minus_04, W
	.set W, W_minus_32
.endm

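/*
 * W_PRECALC_SSSE3 instantiates the SSSE3 bodies of the three schedule
 * stages.  W_PRECALC_00_15 loads 16 message bytes, byte-swaps each dword
 * with pshufb, adds the round constant and stores w[i]+K into the WK()
 * ring, one quarter of the work per scalar round.
 */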
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

.macro W_PRECALC_00_15_SSSE3
	.if ((i & 3) == 0)
		movdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		pshufb	XMM_SHUFB_BSWAP, W_TMP1
		movdqa	W_TMP1, W
	.elseif ((i & 3) == 2)
		paddd	(K_BASE), W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

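/*
 * For rounds 16-31 the standard recurrence is used:
 *
 *	w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1)
 *
 * Computing four w[] at once is awkward because lane 3 (w[i+3]) depends
 * on lane 0 (w[i]) of the same group through the w[i-3] term.  The code
 * therefore computes all four lanes with that term taken as 0, and then
 * patches lane 3 with the missing contribution, which works out to
 * rol32(lane 0's xor-sum, 2) shifted into the top lane (the pslldq $12 /
 * psrld $30 / pslld $2 sequence).
 */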
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one 4-wide
	# vector iteration / 4 scalar rounds
	.if ((i & 3) == 0)
		movdqa	W_minus_12, W
		palignr	$8, W_minus_16, W	# w[i-14]
		movdqa	W_minus_04, W_TMP1
		psrldq	$4, W_TMP1		# w[i-3]
		pxor	W_minus_08, W
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W_TMP1
		pxor	W_TMP1, W
		movdqa	W, W_TMP2
		movdqa	W, W_TMP1
		pslldq	$12, W_TMP2
	.elseif ((i & 3) == 2)
		psrld	$31, W
		pslld	$1, W_TMP1
		por	W, W_TMP1
		movdqa	W_TMP2, W
		psrld	$30, W_TMP2
		pslld	$2, W
	.elseif ((i & 3) == 3)
		pxor	W, W_TMP1
		pxor	W_TMP2, W_TMP1
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

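/*
 * From round 32 on, an equivalent recurrence with no intra-group
 * dependency is used (all source indices are at least 4 back):
 *
 *	w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2)
 *
 * so four words can be computed in one go: W starts out as W_minus_32,
 * w[i-6..i-3] is assembled with palignr from W_minus_08/W_minus_04, and
 * the psrld $30 / pslld $2 pair implements the rotate by 2.
 */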
.macro W_PRECALC_32_79_SSSE3
	.if ((i & 3) == 0)
		movdqa	W_minus_04, W_TMP1
		pxor	W_minus_28, W		# W is W_minus_32 before xor
		palignr	$8, W_minus_08, W_TMP1
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W
		pxor	W_TMP1, W
		movdqa	W, W_TMP1
	.elseif ((i & 3) == 2)
		psrld	$30, W
		pslld	$2, W_TMP1
		por	W, W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm // W_PRECALC_SSSE3

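/*
 * K1-K4 are the standard SHA-1 round constants.  K_XMM_AR broadcasts each
 * of them across a 16-byte lane so paddd can add the right constant to
 * four w[] words at once; BSWAP_SHUFB_CTL is the pshufb control mask that
 * byte-swaps each 32-bit word of a big-endian message block.
 */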
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

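/*
 * SSSE3 variant of the transform.  Assumed C-side prototype (the exact
 * parameter types are defined by the C glue code, not here):
 *
 *	void sha1_transform_ssse3(u32 *digest, const u8 *data, int blocks);
 */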
SHA1_VECTOR_ASM	sha1_transform_ssse3

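/*
 * W_PRECALC_AVX redefines the three schedule-stage macros (after purging
 * the SSSE3 versions) using VEX-encoded three-operand instructions, which
 * removes most of the register-copy movdqa instructions the SSSE3 path
 * needs.  The dataflow is otherwise the same as above.
 */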
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
	.if ((i & 3) == 0)
		vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
	.elseif ((i & 3) == 2)
		vpaddd	(K_BASE), W, W_TMP1
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_16_31_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
		vpxor	W_minus_08, W, W
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 1)
		vpxor	W_TMP1, W, W
		vpslldq	$12, W, W_TMP2
		vpslld	$1, W, W_TMP1
	.elseif ((i & 3) == 2)
		vpsrld	$31, W, W
		vpor	W, W_TMP1, W_TMP1
		vpslld	$2, W_TMP2, W
		vpsrld	$30, W_TMP2, W_TMP2
	.elseif ((i & 3) == 3)
		vpxor	W, W_TMP1, W_TMP1
		vpxor	W_TMP2, W_TMP1, W
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_32_79_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_08, W_minus_04, W_TMP1
		vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
	.elseif ((i & 3) == 1)
		vpxor	W_minus_16, W_TMP1, W_TMP1
		vpxor	W_TMP1, W, W
	.elseif ((i & 3) == 2)
		vpslld	$2, W, W_TMP1
		vpsrld	$30, W, W
		vpor	W, W_TMP1, W
	.elseif ((i & 3) == 3)
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm

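/*
 * AVX variant of the transform; same assumed C-side prototype as the
 * SSSE3 version.  The C glue code is expected to call it only on CPUs
 * that advertise AVX support.
 */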
SHA1_VECTOR_ASM	sha1_transform_avx