########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#	James Guilford <james.guilford@intel.com>
#	Kirk Yap <kirk.s.yap@intel.com>
#	Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# - Redistributions of source code must retain the above
#   copyright notice, this list of conditions and the following
#   disclaimer.
#
# - Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials
#   provided with the distribution.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
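#
# For reference, each of the 64 rounds computed below updates the working
# variables a..h as specified in FIPS 180-4 ("ror" is a 32-bit rotate right):
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)
#	T1  = h + S1 + CH + K[t] + W[t]
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a & b) ^ (a & c) ^ (b & c)
#	T2  = S0 + MAJ
#	h = g; g = f; f = e; e = d + T1
#	d = c; c = b; b = a; a = T1 + T2
#
# The scalar code below uses the equivalent boolean forms
# CH = ((f ^ g) & e) ^ g and MAJ = ((a | c) & b) | (a & c).
########################################################################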

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm
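# Net effect: both \p1 (memory) and \p2 (register) end up holding \p1 + \p2.
# Used at the end of each block to fold the working variables back into the
# digest state in memory.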


.macro MY_ROR p1 p2
	shld $(32-(\p1)), \p2, \p2
.endm
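# shld $k, r, r shifts r left by k while filling the vacated low bits from the
# same register, i.e. it rotates r left by k; with k = 32 - p1 this is a
# rotate right by p1 bits.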

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
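
# Frame layout relative to the 16-byte-aligned %rsp:
#	_INP_END (8 bytes) : pointer just past the last input block
#	_INP     (8 bytes) : saved input pointer for the current block
#	_XFER    (16 bytes): K[t] + W[t] values for the current four rounds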

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
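
# Note: rotate_Xs and ROTATE_ARGS only reassign assembler symbols; they emit
# no instructions. Each expansion of the round macros below therefore works on
# a shifted register assignment, so one round body serves for successive
# rounds without any register-to-register moves.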

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
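	##
	## Message schedule reference (FIPS 180-4), indices relative to the
	## word being produced:
	##	s0   = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3)
	##	s1   = (W[-2] ror 17) ^ (W[-2] ror 19) ^ (W[-2] >> 10)
	##	W[0] = W[-16] + s0 + W[-7] + s1
	## On entry X0 holds W[-16..-13] and X3 holds W[-4..-1]; s1 is computed
	## two words at a time (the {BBAA}/{DDCC} shuffles), so each invocation
	## of this macro produces the next four schedule words.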

	mov e, y0			# y0 = e
	MY_ROR (25-11), y0		# y0 = e >> (25-11)
	mov a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR (22-13), y1		# y1 = a >> (22-13)
	xor e, y0			# y0 = e ^ (e >> (25-11))
	mov f, y2			# y2 = f
	MY_ROR (11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor a, y1			# y1 = a ^ (a >> (22-13))
	xor g, y2			# y2 = f^g
	vpaddd X0, XTMP0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2			# y2 = (f^g)&e
	MY_ROR (13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR 2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2			# y2 = S1 + CH
	add _XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov a, y0			# y0 = a
	add y2, h			# h = h + S1 + CH + k + w
	mov a, y2			# y2 = a
	vpsrld $7, XTMP1, XTMP2
	or c, y0			# y0 = a|c
	add h, d			# d = d + h + S1 + CH + k + w
	and c, y2			# y2 = a&c
	vpslld $(32-7), XTMP1, XTMP3
	and b, y0			# y0 = (a|c)&b
	add y1, h			# h = h + S1 + CH + k + w + S0
	vpor XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0			# y0 = e
	mov a, y1			# y1 = a
	MY_ROR (25-11), y0		# y0 = e >> (25-11)
	xor e, y0			# y0 = e ^ (e >> (25-11))
	mov f, y2			# y2 = f
	MY_ROR (22-13), y1		# y1 = a >> (22-13)
	vpsrld $18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2			# y2 = f^g
	vpsrld $3, XTMP1, XTMP4		# XTMP4 = W[-15] >> 3
	MY_ROR (13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2			# y2 = (f^g)&e
	MY_ROR 6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld $(32-18), XTMP1, XTMP1
	xor a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor XTMP1, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
	add y0, y2			# y2 = S1 + CH
	add (1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR 2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov a, y0			# y0 = a
	add y2, h			# h = h + S1 + CH + k + w
	mov a, y2			# y2 = a
	vpxor XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or c, y0			# y0 = a|c
	add h, d			# d = d + h + S1 + CH + k + w
	and c, y2			# y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and b, y0			# y0 = (a|c)&b
	add y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0			# y0 = e
	mov a, y1			# y1 = a
	MY_ROR (25-11), y0		# y0 = e >> (25-11)
	xor e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR (22-13), y1		# y1 = a >> (22-13)
	mov f, y2			# y2 = f
	xor a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld $10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor g, y2			# y2 = f^g
	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2			# y2 = (f^g)&e
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR (13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR 6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
	add y0, y2			# y2 = S1 + CH
	MY_ROR 2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add (2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov a, y0			# y0 = a
	add y2, h			# h = h + S1 + CH + k + w
	mov a, y2			# y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or c, y0			# y0 = a|c
	add h, d			# d = d + h + S1 + CH + k + w
	and c, y2			# y2 = a&c
	vpaddd XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and b, y0			# y0 = (a|c)&b
	add y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0			# y0 = e
	MY_ROR (25-11), y0		# y0 = e >> (25-11)
	mov a, y1			# y1 = a
	MY_ROR (22-13), y1		# y1 = a >> (22-13)
	xor e, y0			# y0 = e ^ (e >> (25-11))
	mov f, y2			# y2 = f
	MY_ROR (11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld $10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor a, y1			# y1 = a ^ (a >> (22-13))
	xor g, y2			# y2 = f^g
	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2			# y2 = (f^g)&e
	MY_ROR (13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor XTMP3, XTMP2, XTMP2
	MY_ROR 2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2			# y2 = S1 + CH
	add (3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov a, y0			# y0 = a
	add y2, h			# h = h + S1 + CH + k + w
	mov a, y2			# y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or c, y0			# y0 = a|c
	add h, d			# d = d + h + S1 + CH + k + w
	and c, y2			# y2 = a&c
	vpaddd XTMP0, XTMP5, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and b, y0			# y0 = (a|c)&b
	add y1, h			# h = h + S1 + CH + k + w + S0
	or y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov e, y0			# y0 = e
	MY_ROR (25-11), y0		# y0 = e >> (25-11)
	mov a, y1			# y1 = a
	xor e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR (22-13), y1		# y1 = a >> (22-13)
	mov f, y2			# y2 = f
	xor a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2			# y2 = f^g
	xor e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR (13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and e, y2			# y2 = (f^g)&e
	xor a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2			# y2 = CH = ((f^g)&e)^g
	add y0, y2			# y2 = S1 + CH
	MY_ROR 2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add offset(%rsp), y2		# y2 = k + w + S1 + CH
	mov a, y0			# y0 = a
	add y2, h			# h = h + S1 + CH + k + w
	mov a, y2			# y2 = a
	or c, y0			# y0 = a|c
	add h, d			# d = d + h + S1 + CH + k + w
	and c, y2			# y2 = a&c
	and b, y0			# y0 = (a|c)&b
	add y1, h			# h = h + S1 + CH + k + w + S0
	or y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
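##
## The arguments arrive in %rdi, %rsi and %rdx per the System V AMD64 calling
## convention (CTX, INP and NUM_BLKS above). The routine hashes `blocks`
## consecutive 64-byte blocks from `data` and updates the eight 32-bit state
## words in place; the block count is converted to a byte count (blocks << 6)
## to form the end-of-input pointer kept at _INP_END on the stack.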
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushq %rbp
	movq %rsp, %rbp

	subq $STACK_SIZE, %rsp		# allocate stack space
	and $~15, %rsp			# align stack pointer

	shl $6, NUM_BLKS		# convert to bytes
	jz done_hash
	add INP, NUM_BLKS		# pointer to end of data
	mov NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov 4*0(CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	mov $3, SRND
.align 16
loop1:
	vpaddd (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub $1, SRND
	jne loop1

	mov $2, SRND
loop2:
	vpaddd (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd 1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub $1, SRND
	jne loop2

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	mov _INP(%rsp), INP
	add $64, INP
	cmp _INP_END(%rsp), INP
	jne loop0

done_hash:

	mov %rbp, %rsp
	popq %rbp
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
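# byte-swap mask: reverses the bytes of each dword (big-endian message words -> little-endian)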
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF