########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#	James Guilford <james.guilford@intel.com>
#	Kirk Yap <kirk.s.yap@intel.com>
#	Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
#  - Redistributions of source code must retain the above
#    copyright notice, this list of conditions and the following
#    disclaimer.
#
#  - Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials
#    provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define MOVDQ movdqu
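
# Note: MOVDQ (movdqu) is used only for loads from the caller-supplied
# input buffer, which may not be 16-byte aligned.  Accesses to the local
# stack area use movdqa, since %rsp is explicitly aligned to 16 bytes in
# the prologue below.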

################################ Define Macros

# addm [mem], reg
# Add reg to mem: mem is added into reg, then the sum is stored back to
# mem, so both end up holding mem + reg
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
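
# Example (used at the end of the transform to fold the working
# variables back into the digest):
#	addm	(4*0)(CTX), a		# state[0] += a; a now holds the sum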

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ	\p2, \p1
	pshufb	\p3, \p1
.endm
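
# The message words in the input block are big-endian; pshufb with
# BYTE_FLIP_MASK (PSHUFFLE_BYTE_FLIP_MASK below) reverses the bytes of
# each dword so the schedule is computed on native little-endian values.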

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d
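
# a..h are the eight SHA-256 working variables held in general-purpose
# registers; y0, y1 and y2 are scratch registers used to form S0, S1,
# CH and MAJ in each round.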


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
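
# Resulting stack frame (offsets from the 16-byte-aligned %rsp):
#	_INP_END  (0):	8 bytes, pointer to the end of the input data
#	_INP      (8):	8 bytes, saved input pointer for the current block
#	_XFER    (16):	16 bytes, K[t]+W[t] values for the current rounds
# (_XMM_SAVE is empty in this version.)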

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

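# Both macros above only reassign assembler symbols; no data is moved at
# run time.  After eight ROTATE_ARGS (or four rotate_Xs) the names map
# back to their original registers.
#
# For reference, the message schedule computed four words at a time by
# FOUR_ROUNDS_AND_SCHED below is the standard SHA-256 schedule (a sketch
# for documentation only, not assembled):
#
#	s0   = (W[t-15] ror 7)  ^ (W[t-15] ror 18) ^ (W[t-15] >> 3)
#	s1   = (W[t-2]  ror 17) ^ (W[t-2]  ror 19) ^ (W[t-2]  >> 10)
#	W[t] = W[t-16] + s0 + W[t-7] + s1
#
# s0 is computed for four words at once; s1 is computed two words at a
# time ({xBxA}, then {xDxC}) because the newest two words depend on
# words produced within the same group of four.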
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa	X1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1		# XTMP1 = W[-15] << 25
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2		# XTMP2 = W[-15] >> 7
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(22-13), y1		# y1 = a >> (22-13)
	pslld	$(32-18), XTMP3		# XTMP3 = W[-15] << 14
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	psrld	$18, XTMP2		# XTMP2 = W[-15] >> 18
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0		# y0 = e >> (25-11)
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP2
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0 = W[-2] {DDCC}
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

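# For reference, each round computed by FOUR_ROUNDS_AND_SCHED and
# DO_ROUND follows the standard SHA-256 compression step (a sketch for
# documentation only, not assembled; CH and MAJ are written in the
# equivalent forms used by the code):
#
#	S1    = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH    = ((f ^ g) & e) ^ g		# == (e & f) ^ (~e & g)
#	S0    = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ   = ((a | c) & b) | (a & c)		# == (a&b) ^ (a&c) ^ (b&c)
#	tmp   = h + S1 + CH + K[t] + W[t]
#	h,g,f = g,f,e ; e = d + tmp
#	d,c,b = c,b,a ; a = tmp + S0 + MAJ
#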
## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
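## Example call from C (an illustrative sketch; the actual callers live
## in the arch glue code):
##	sha256_transform_ssse3(state, data, blocks);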
.text
SYM_FUNC_START(sha256_transform_ssse3)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$~15, %rsp

	shl	$6, NUM_BLKS		# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS
	mov	NUM_BLKS, _INP_END(%rsp)	# pointer to end of data

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa	_SHUF_00BA(%rip), SHUF_00BA
	movdqa	_SHUF_DC00(%rip), SHUF_DC00

loop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## do the first 48 rounds, scheduling 48 message dwords:
	## 3 iterations of 16 rounds each
	mov	$3, SRND
.align 16
loop1:
	movdqa	(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	1*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	2*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	3*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	loop1

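	## do the last 16 rounds using the schedule already held in
	## X0..X3; each loop2 iteration runs 8 rounds (no new scheduling)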
	mov	$2, SRND
loop2:
	paddd	(TBL), X0
	movdqa	X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd	1*16(TBL), X1
	movdqa	X1, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa	X2, X0
	movdqa	X3, X1

	sub	$1, SRND
	jne	loop2

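	## fold the working variables back into the digest in memory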
	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF