/*
 *  Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state' from 'input',
 * processing an even number of consecutive 64-byte 'blocks'.
 *
 * extern "C" void sha1_transform_avx2(
 *	struct sha1_state *state, const u8 *input, int blocks);
 */
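
/*
 * Caller-side sketch (illustrative only, not part of the original source):
 * one way the routine above could be driven from C, assuming the prototype
 * in the comment above and that the caller owns the vector registers, e.g.
 * via kernel_fpu_begin()/kernel_fpu_end() as the x86 SHA-1 glue code does.
 * The helper name sha1_avx2_do_blocks() is hypothetical.
 *
 *    #include <crypto/sha1.h>     // struct sha1_state (header name may vary by kernel version)
 *    #include <asm/fpu/api.h>     // kernel_fpu_begin()/kernel_fpu_end()
 *
 *    asmlinkage void sha1_transform_avx2(struct sha1_state *state,
 *                                        const u8 *input, int blocks);
 *
 *    static void sha1_avx2_do_blocks(struct sha1_state *state,
 *                                    const u8 *data, int blocks)
 *    {
 *        // 'blocks' counts whole 64-byte blocks; the comment above
 *        // expects an even number of them.
 *        kernel_fpu_begin();
 *        sha1_transform_avx2(state, data, blocks);
 *        kernel_fpu_end();
 *    }
 */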

#include <linux/linkage.h>

#define CTX %rdi    /* arg1 */
#define BUF %rsi    /* arg2 */
#define CNT %rdx    /* arg3 */

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %eax
#define REG_E   %edx
#define REG_TB  %ebx
#define REG_TA  %r12d
#define REG_RA  %rcx
#define REG_RB  %rsi
#define REG_RC  %rdi
#define REG_RD  %rax
#define REG_RE  %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1  %r11d
#define xmm_mov vmovups
#define avx2_zeroupper  vzeroupper
#define RND_F1  1
#define RND_F2  2
#define RND_F3  3

.macro REGALLOC
    .set A, REG_A
    .set B, REG_B
    .set C, REG_C
    .set D, REG_D
    .set E, REG_E
    .set TB, REG_TB
    .set TA, REG_TA

    .set RA, REG_RA
    .set RB, REG_RB
    .set RC, REG_RC
    .set RD, REG_RD
    .set RE, REG_RE

    .set RTA, REG_RTA
    .set RTB, REG_RTB

    .set T1, REG_T1
.endm

#define HASH_PTR    %r9
#define BLOCKS_CTR  %r8
#define BUFFER_PTR  %r10
#define BUFFER_PTR2 %r13

#define PRECALC_BUF %r14
#define WK_BUF      %r15

#define W_TMP       %xmm0
#define WY_TMP      %ymm0
#define WY_TMP2     %ymm9

# AVX2 variables
#define WY0     %ymm3
#define WY4     %ymm5
#define WY08        %ymm7
#define WY12        %ymm8
#define WY16        %ymm12
#define WY20        %ymm13
#define WY24        %ymm14
#define WY28        %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE      (80*2*2 +16)

#define WK(t)   ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)
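
/*
 * Illustrative note (my reading of the layout, not from the original
 * comments): WK_BUF holds the precomputed W[t]+K values for two blocks,
 * interleaved per group of four rounds - 32 bytes per group, with the
 * first block's four dwords in the low 128-bit lane and the second
 * block's in the high lane.  A standalone C mirror of the WK(t) offset:
 *
 *    #include <assert.h>
 *    #include <stddef.h>
 *
 *    // t = 0..79 addresses the first block, t = 80..159 the second
 *    static size_t wk_offset(int t)
 *    {
 *        return ((t % 80) / 4) * 32    // 32-byte slot per 4 rounds
 *               + (t % 4) * 4          // dword within the slot
 *               + (t / 80) * 16;       // low/high lane per block
 *    }
 *
 *    int main(void)
 *    {
 *        assert(wk_offset(0) == 0);    // block 1, round 0
 *        assert(wk_offset(5) == 36);   // block 1, round 5
 *        assert(wk_offset(80) == 16);  // block 2, round 0
 *        return 0;
 *    }
 */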


.macro UPDATE_HASH  hash, val
    add \hash, \val
    mov \val, \hash
.endm

.macro PRECALC_RESET_WY
    .set WY_00, WY0
    .set WY_04, WY4
    .set WY_08, WY08
    .set WY_12, WY12
    .set WY_16, WY16
    .set WY_20, WY20
    .set WY_24, WY24
    .set WY_28, WY28
    .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
    /* Rotate macros */
    .set WY_32, WY_28
    .set WY_28, WY_24
    .set WY_24, WY_20
    .set WY_20, WY_16
    .set WY_16, WY_12
    .set WY_12, WY_08
    .set WY_08, WY_04
    .set WY_04, WY_00
    .set WY_00, WY_32

    /* Define register aliases */
    .set WY, WY_00
    .set WY_minus_04, WY_04
    .set WY_minus_08, WY_08
    .set WY_minus_12, WY_12
    .set WY_minus_16, WY_16
    .set WY_minus_20, WY_20
    .set WY_minus_24, WY_24
    .set WY_minus_28, WY_28
    .set WY_minus_32, WY
.endm

.macro PRECALC_00_15
    .if (i == 0) # Initialize and rotate registers
        PRECALC_RESET_WY
        PRECALC_ROTATE_WY
    .endif

    /* message scheduling pre-compute for rounds 0-15 */
    .if   ((i & 7) == 0)
        /*
         * blended AVX2 and ALU instruction scheduling
         * 1 vector iteration per 8 rounds
         */
        vmovdqu (i * 2)(BUFFER_PTR), W_TMP
    .elseif ((i & 7) == 1)
        vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
             WY_TMP, WY_TMP
    .elseif ((i & 7) == 2)
        vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
    .elseif ((i & 7) == 4)
        vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
    .elseif ((i & 7) == 7)
        vmovdqu  WY_TMP, PRECALC_WK(i&~7)

        PRECALC_ROTATE_WY
    .endif
.endm

.macro PRECALC_16_31
    /*
     * message scheduling pre-compute for rounds 16-31
     * calculating last 32 w[i] values in 8 XMM registers
     * pre-calculate K+w[i] values and store to mem
     * for later load by ALU add instruction
     *
     * "brute force" vectorization for rounds 16-31 only
     * due to w[i]->w[i-3] dependency
     */
    .if   ((i & 7) == 0)
        /*
         * blended AVX2 and ALU instruction scheduling
         * 1 vector iteration per 8 rounds
         */
        /* w[i-14] */
        vpalignr    $8, WY_minus_16, WY_minus_12, WY
        vpsrldq $4, WY_minus_04, WY_TMP               /* w[i-3] */
    .elseif ((i & 7) == 1)
        vpxor   WY_minus_08, WY, WY
        vpxor   WY_minus_16, WY_TMP, WY_TMP
    .elseif ((i & 7) == 2)
        vpxor   WY_TMP, WY, WY
        vpslldq $12, WY, WY_TMP2
    .elseif ((i & 7) == 3)
        vpslld  $1, WY, WY_TMP
        vpsrld  $31, WY, WY
    .elseif ((i & 7) == 4)
        vpor    WY, WY_TMP, WY_TMP
        vpslld  $2, WY_TMP2, WY
    .elseif ((i & 7) == 5)
        vpsrld  $30, WY_TMP2, WY_TMP2
        vpxor   WY, WY_TMP, WY_TMP
    .elseif ((i & 7) == 7)
        vpxor   WY_TMP2, WY_TMP, WY
        vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        vmovdqu WY_TMP, PRECALC_WK(i&~7)

        PRECALC_ROTATE_WY
    .endif
.endm

.macro PRECALC_32_79
    /*
     * The SHA-1 specification defines:
     * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
     * Instead we compute the equivalent (valid for i >= 32):
     * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
     * which vectorizes more efficiently, since the w[i] => w[i-3]
     * dependency is broken (see the illustrative check after this macro).
     */

    .if   ((i & 7) == 0)
    /*
     * blended AVX2 and ALU instruction scheduling
     * 1 vector iteration per 8 rounds
     */
        vpalignr    $8, WY_minus_08, WY_minus_04, WY_TMP
    .elseif ((i & 7) == 1)
        /* W is W_minus_32 before xor */
        vpxor   WY_minus_28, WY, WY
    .elseif ((i & 7) == 2)
        vpxor   WY_minus_16, WY_TMP, WY_TMP
    .elseif ((i & 7) == 3)
        vpxor   WY_TMP, WY, WY
    .elseif ((i & 7) == 4)
        vpslld  $2, WY, WY_TMP
    .elseif ((i & 7) == 5)
        vpsrld  $30, WY, WY
        vpor    WY, WY_TMP, WY
    .elseif ((i & 7) == 7)
        vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        vmovdqu WY_TMP, PRECALC_WK(i&~7)

        PRECALC_ROTATE_WY
    .endif
.endm
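
/*
 * Standalone C check (illustrative only) of the equivalence used above:
 * both recurrences produce the same message schedule once i >= 32.  The
 * first 16 words are arbitrary test values.
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    static uint32_t rol(uint32_t x, int n)
 *    {
 *        return (x << n) | (x >> (32 - n));
 *    }
 *
 *    int main(void)
 *    {
 *        uint32_t w[80], v[80];
 *        int i;
 *
 *        for (i = 0; i < 16; i++)          // arbitrary test block
 *            w[i] = v[i] = 0x9e3779b9u * (i + 1);
 *
 *        for (i = 16; i < 80; i++)         // standard recurrence
 *            w[i] = rol(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 *        for (i = 16; i < 32; i++)         // standard form up to i = 31
 *            v[i] = rol(v[i-3] ^ v[i-8] ^ v[i-14] ^ v[i-16], 1);
 *        for (i = 32; i < 80; i++)         // rewritten form afterwards
 *            v[i] = rol(v[i-6] ^ v[i-16] ^ v[i-28] ^ v[i-32], 2);
 *
 *        for (i = 0; i < 80; i++)
 *            if (w[i] != v[i]) {
 *                printf("mismatch at %d\n", i);
 *                return 1;
 *            }
 *        printf("schedules match\n");
 *        return 0;
 *    }
 */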

.macro PRECALC r, s
    .set i, \r

    .if (i < 40)
        .set K_XMM, 32*0
    .elseif (i < 80)
        .set K_XMM, 32*1
    .elseif (i < 120)
        .set K_XMM, 32*2
    .else
        .set K_XMM, 32*3
    .endif

    .if (i<32)
        PRECALC_00_15   \s
    .elseif (i<64)
        PRECALC_16_31   \s
    .elseif (i < 160)
        PRECALC_32_79   \s
    .endif
.endm

.macro ROTATE_STATE
    .set T_REG, E
    .set E, D
    .set D, C
    .set C, B
    .set B, TB
    .set TB, A
    .set A, T_REG

    .set T_REG, RE
    .set RE, RD
    .set RD, RC
    .set RC, RB
    .set RB, RTB
    .set RTB, RA
    .set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
    .if (\f == RND_F1)
        ROUND_F1    \r
    .elseif (\f == RND_F2)
        ROUND_F2    \r
    .elseif (\f == RND_F3)
        ROUND_F3    \r
    .endif
.endm

.macro RR r
    .set round_id, (\r % 80)

    .if (round_id == 0)        /* Precalculate F for first round */
        .set ROUND_FUNC, RND_F1
        mov B, TB

        rorx    $(32-30), B, B    /* b>>>2 */
        andn    D, TB, T1
        and C, TB
        xor T1, TB
    .endif

    RND_FUN ROUND_FUNC, \r
    ROTATE_STATE

    .if   (round_id == 18)
        .set ROUND_FUNC, RND_F2
    .elseif (round_id == 38)
        .set ROUND_FUNC, RND_F3
    .elseif (round_id == 58)
        .set ROUND_FUNC, RND_F2
    .endif

    .set round_id, ( (\r+1) % 80)

    RND_FUN ROUND_FUNC, (\r+1)
    ROTATE_STATE
.endm

.macro ROUND_F1 r
    add WK(\r), E

    andn    C, A, T1            /* ~b&d */
    lea (RE,RTB), E     /* Add F from the previous round */

    rorx    $(32-5), A, TA      /* T2 = A >>> 5 */
    rorx    $(32-30),A, TB      /* b>>>2 for next round */

    PRECALC (\r)            /* msg scheduling for next 2 blocks */

    /*
     * Calculate F for the next round
     * (b & c) ^ andn[b, d]
     */
    and B, A            /* b&c */
    xor T1, A           /* F1 = (b&c) ^ (~b&d) */

    lea (RE,RTA), E     /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
    add WK(\r), E
    lea (RE,RTB), E     /* Add F from the previous round */

    /* Calculate F for the next round */
    rorx    $(32-5), A, TA      /* T2 = A >>> 5 */
    .if ((round_id) < 79)
        rorx    $(32-30), A, TB /* b>>>2 for next round */
    .endif
    PRECALC (\r)            /* msg scheduling for next 2 blocks */

    .if ((round_id) < 79)
        xor B, A
    .endif

    add TA, E           /* E += A >>> 5 */

    .if ((round_id) < 79)
        xor C, A
    .endif
.endm

.macro ROUND_F3 r
    add WK(\r), E
    PRECALC (\r)            /* msg scheduling for next 2 blocks */

    lea (RE,RTB), E     /* Add F from the previous round */

    mov B, T1
    or  A, T1

    rorx    $(32-5), A, TA      /* T2 = A >>> 5 */
    rorx    $(32-30), A, TB     /* b>>>2 for next round */

    /* Calculate F for the next round
     * (b and c) or (d and (b or c))
     */
    and C, T1
    and B, A
    or  T1, A

    add TA, E           /* E += A >>> 5 */

.endm
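
/*
 * For reference (standard FIPS 180-4 definitions, not taken from this
 * file): the three round functions that ROUND_F1/F2/F3 compute, written
 * in plain C without the instruction-scheduling tricks used above.
 *
 *    #include <stdint.h>
 *
 *    static uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
 *    {
 *        return (b & c) ^ (~b & d);       // Ch, rounds 0..19
 *    }
 *
 *    static uint32_t f2(uint32_t b, uint32_t c, uint32_t d)
 *    {
 *        return b ^ c ^ d;                // Parity, rounds 20..39, 60..79
 *    }
 *
 *    static uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
 *    {
 *        return (b & c) | (d & (b | c));  // Maj, rounds 40..59
 *    }
 */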

/*
 * Add \d to \a only if \b >= \c (uses RTA as a temp):
 *   \a += (\b >= \c) ? \d : 0
 */
.macro ADD_IF_GE a, b, c, d
    mov     \a, RTA
    add     $\d, RTA
    cmp     $\c, \b
    cmovge  RTA, \a
.endm
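
/*
 * In C terms (illustrative only), ADD_IF_GE amounts to the following;
 * cmovge keeps it branch-free, and the comparison is signed.
 *
 *    #include <stdint.h>
 *
 *    // a += (b >= c) ? d : 0
 *    static int64_t add_if_ge(int64_t a, int64_t b, int64_t c, int64_t d)
 *    {
 *        int64_t tmp = a + d;           // mov \a, RTA; add $\d, RTA
 *        return (b >= c) ? tmp : a;     // cmp $\c, \b; cmovge RTA, \a
 *    }
 */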

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

    REGALLOC

    mov (HASH_PTR), A
    mov 4(HASH_PTR), B
    mov 8(HASH_PTR), C
    mov 12(HASH_PTR), D
    mov 16(HASH_PTR), E

    mov %rsp, PRECALC_BUF
    lea (2*4*80+32)(%rsp), WK_BUF

    # Precalc WK for first 2 blocks
    ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
    .set i, 0
    .rept    160
        PRECALC i
        .set i, i + 1
    .endr

    /* Go to next block if needed */
    ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
    ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
    xchg    WK_BUF, PRECALC_BUF

    .align 32
_loop:
    /*
     * The code loops through more than one block; a BLOCKS_CTR value of
     * zero signals that the last block has already been processed.
     */
    test BLOCKS_CTR, BLOCKS_CTR
    jnz _begin
    .align 32
    jmp _end
    .align 32
_begin:

    /*
     * Do first block
     * rounds: 0,2,4,6,8
     */
    .set j, 0
    .rept 5
        RR  j
        .set j, j+2
    .endr

    jmp _loop0
_loop0:

    /*
     * rounds:
     * 10,12,14,16,18
     * 20,22,24,26,28
     * 30,32,34,36,38
     * 40,42,44,46,48
     * 50,52,54,56,58
     */
    .rept 25
        RR  j
        .set j, j+2
    .endr

    /* Update Counter */
    sub $1, BLOCKS_CTR
    /* Move to the next block only if needed */
    ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
    /*
     * rounds
     * 60,62,64,66,68
     * 70,72,74,76,78
     */
    .rept 10
        RR  j
        .set j, j+2
    .endr

    UPDATE_HASH (HASH_PTR), A
    UPDATE_HASH 4(HASH_PTR), TB
    UPDATE_HASH 8(HASH_PTR), C
    UPDATE_HASH 12(HASH_PTR), D
    UPDATE_HASH 16(HASH_PTR), E

    test    BLOCKS_CTR, BLOCKS_CTR
    jz  _loop

    mov TB, B

    /* Process second block */
    /*
     * rounds
     *  0+80, 2+80, 4+80, 6+80, 8+80
     * 10+80,12+80,14+80,16+80,18+80
     */

    .set j, 0
    .rept 10
        RR  j+80
        .set j, j+2
    .endr

    jmp _loop1
_loop1:
    /*
     * rounds
     * 20+80,22+80,24+80,26+80,28+80
     * 30+80,32+80,34+80,36+80,38+80
     */
    .rept 10
        RR  j+80
        .set j, j+2
    .endr

    jmp _loop2
_loop2:

    /*
     * rounds
     * 40+80,42+80,44+80,46+80,48+80
     * 50+80,52+80,54+80,56+80,58+80
     */
    .rept 10
        RR  j+80
        .set j, j+2
    .endr

    /* update counter */
    sub     $1, BLOCKS_CTR
    /* Move to the next block only if needed */
    ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

    jmp _loop3
_loop3:

    /*
     * rounds
     * 60+80,62+80,64+80,66+80,68+80
     * 70+80,72+80,74+80,76+80,78+80
     */
    .rept 10
        RR  j+80
        .set j, j+2
    .endr

    UPDATE_HASH (HASH_PTR), A
    UPDATE_HASH 4(HASH_PTR), TB
    UPDATE_HASH 8(HASH_PTR), C
    UPDATE_HASH 12(HASH_PTR), D
    UPDATE_HASH 16(HASH_PTR), E

    /* Reset state for AVX2 reg permutation */
    mov A, TA
    mov TB, A
    mov C, TB
    mov E, C
    mov D, B
    mov TA, D

    REGALLOC

    xchg    WK_BUF, PRECALC_BUF

    jmp _loop

    .align 32
    _end:

.endm
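
/*
 * Rough C-level sketch (my reading of the macro above, simplified and not
 * authoritative) of the two-block software pipeline: W+K values for two
 * blocks are consumed from one buffer while the schedule for the next two
 * blocks is precomputed into the other, and the buffers are exchanged on
 * every trip around _loop.  precalc_wk() and do_80_rounds() are
 * hypothetical helpers standing in for the PRECALC and RR macros.
 *
 *    #include <stdint.h>
 *
 *    void precalc_wk(uint32_t *wk, const uint8_t *blk_a, const uint8_t *blk_b);
 *    void do_80_rounds(uint32_t state[5], const uint32_t *wk, int lane);
 *
 *    static void sha1_two_block_pipeline(uint32_t state[5],
 *                                        const uint8_t *data, int blocks)
 *    {
 *        uint32_t wk[2][2 * 80 * 2];
 *        int cur = 0, n = 0;
 *
 *        precalc_wk(wk[cur], data, data + 64);    // blocks 0 and 1
 *        while (n < blocks) {
 *            // In the asm the next precalc is interleaved with the
 *            // rounds below rather than done up front.
 *            if (n + 2 < blocks)
 *                precalc_wk(wk[cur ^ 1], data + (n + 2) * 64,
 *                           data + (n + 3) * 64);
 *            do_80_rounds(state, wk[cur], 0);     // block n
 *            if (n + 1 < blocks)
 *                do_80_rounds(state, wk[cur], 1); // block n + 1
 *            cur ^= 1;                            // xchg WK_BUF, PRECALC_BUF
 *            n += 2;
 *        }
 *    }
 */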
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
    SYM_FUNC_START(\name)

    push    %rbx
    push    %r12
    push    %r13
    push    %r14
    push    %r15

    RESERVE_STACK  = (W_SIZE*4 + 8+24)

    /* Align stack */
    push    %rbp
    mov %rsp, %rbp
    and $~(0x20-1), %rsp
    sub $RESERVE_STACK, %rsp

    avx2_zeroupper

    /* Setup initial values */
    mov CTX, HASH_PTR
    mov BUF, BUFFER_PTR

    mov BUF, BUFFER_PTR2
    mov CNT, BLOCKS_CTR

    xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

    SHA1_PIPELINED_MAIN_BODY

    avx2_zeroupper

    mov %rbp, %rsp
    pop %rbp

    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbx

    RET

    SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
    .long K1, K1, K1, K1
    .long K1, K1, K1, K1
    .long K2, K2, K2, K2
    .long K2, K2, K2, K2
    .long K3, K3, K3, K3
    .long K3, K3, K3, K3
    .long K4, K4, K4, K4
    .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
    .long 0x00010203
    .long 0x04050607
    .long 0x08090a0b
    .long 0x0c0d0e0f
    .long 0x00010203
    .long 0x04050607
    .long 0x08090a0b
    .long 0x0c0d0e0f
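
/*
 * Illustrative note (my reading): BSWAP_SHUFB_CTL is the vpshufb control
 * mask that reverses the bytes within each 32-bit word, converting the
 * big-endian SHA-1 message words to the CPU's little-endian order.  Per
 * dword the effect is simply:
 *
 *    #include <stdint.h>
 *
 *    static uint32_t bswap32(uint32_t x)
 *    {
 *        return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
 *               ((x << 8) & 0x00ff0000u) | (x << 24);
 *    }
 */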
.text

SHA1_VECTOR_ASM     sha1_transform_avx2