0001 ########################################################################
0002 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
0003 #
0004 # Copyright (C) 2013 Intel Corporation.
0005 #
0006 # Authors:
0007 # James Guilford <james.guilford@intel.com>
0008 # Kirk Yap <kirk.s.yap@intel.com>
0009 # Tim Chen <tim.c.chen@linux.intel.com>
0010 #
0011 # This software is available to you under a choice of one of two
0012 # licenses. You may choose to be licensed under the terms of the GNU
0013 # General Public License (GPL) Version 2, available from the file
0014 # COPYING in the main directory of this source tree, or the
0015 # OpenIB.org BSD license below:
0016 #
0017 # Redistribution and use in source and binary forms, with or
0018 # without modification, are permitted provided that the following
0019 # conditions are met:
0020 #
0021 # - Redistributions of source code must retain the above
0022 # copyright notice, this list of conditions and the following
0023 # disclaimer.
0024 #
0025 # - Redistributions in binary form must reproduce the above
0026 # copyright notice, this list of conditions and the following
0027 # disclaimer in the documentation and/or other materials
0028 # provided with the distribution.
0029 #
0030 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0031 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0032 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0033 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0034 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0035 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0036 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0037 # SOFTWARE.
0038 #
0039 ########################################################################
0040 #
0041 # This code is described in an Intel White-Paper:
0042 # "Fast SHA-256 Implementations on Intel Architecture Processors"
0043 #
0044 # To find it, surf to http://www.intel.com/p/en_US/embedded
0045 # and search for that title.
0046 #
0047 ########################################################################
0048 # This code schedules 2 blocks at a time, with 4 lanes per block
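#
# Two consecutive 64-byte blocks are loaded and transposed so that each
# YMM register holds four message words of the first block in its low
# 128-bit lane and the corresponding words of the second block in its
# high lane. The message schedule is computed for both blocks at once;
# the W[t]+K[t] values are spilled to the stack, the 64 rounds are run
# for the first block from the low-lane values, and then replayed for
# the second block from the saved high-lane values (the "+16" offsets
# in loop3 below).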
0049 ########################################################################
0050
0051 #include <linux/linkage.h>
0052
0053 ## assume buffers not aligned
0054 #define VMOVDQ vmovdqu
0055
0056 ################################ Define Macros
0057
0058 # addm [mem], reg
0059 # Add reg to mem using reg-mem add and store
0060 .macro addm p1 p2
0061 add \p1, \p2
0062 mov \p2, \p1
0063 .endm
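# For example, "addm (4*0)(CTX), a" leaves both the register a and the
# first state word holding their sum (a += state[0]; state[0] = a), as
# used below to fold the working variables back into the digest.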
0064
0065 ################################
0066
0067 X0 = %ymm4
0068 X1 = %ymm5
0069 X2 = %ymm6
0070 X3 = %ymm7
0071
0072 # XMM versions of above
0073 XWORD0 = %xmm4
0074 XWORD1 = %xmm5
0075 XWORD2 = %xmm6
0076 XWORD3 = %xmm7
0077
0078 XTMP0 = %ymm0
0079 XTMP1 = %ymm1
0080 XTMP2 = %ymm2
0081 XTMP3 = %ymm3
0082 XTMP4 = %ymm8
0083 XFER = %ymm9
0084 XTMP5 = %ymm11
0085
0086 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
0087 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
0088 BYTE_FLIP_MASK = %ymm13
0089
0090 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
0091
0092 NUM_BLKS = %rdx # 3rd arg
0093 INP = %rsi # 2nd arg
0094 CTX = %rdi # 1st arg
0095 c = %ecx
0096 d = %r8d
0097 e = %edx # clobbers NUM_BLKS
0098 y3 = %esi # clobbers INP
0099
0100 SRND = CTX # SRND is same register as CTX
0101
0102 a = %eax
0103 b = %ebx
0104 f = %r9d
0105 g = %r10d
0106 h = %r11d
0107 old_h = %r11d
0108
0109 T1 = %r12d
0110 y0 = %r13d
0111 y1 = %r14d
0112 y2 = %r15d
0113
0114
0115 _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
0116 _XMM_SAVE_SIZE = 0
0117 _INP_END_SIZE = 8
0118 _INP_SIZE = 8
0119 _CTX_SIZE = 8
0120
0121 _XFER = 0
0122 _XMM_SAVE = _XFER + _XFER_SIZE
0123 _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
0124 _INP = _INP_END + _INP_END_SIZE
0125 _CTX = _INP + _INP_SIZE
0126 STACK_SIZE = _CTX + _CTX_SIZE
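# Resulting frame layout (offsets from the 32-byte-aligned %rsp):
#   _XFER     0..511    W[t]+K[t] for both interleaved blocks,
#                       32 bytes per group of four rounds
#   _INP_END  512..519  pointer to the last 64-byte input block
#   _INP      520..527  current input pointer
#   _CTX      528..535  saved digest/state pointer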
0127
0128 # rotate_Xs
0129 # Rotate values of symbols X0...X3
0130 .macro rotate_Xs
0131 X_ = X0
0132 X0 = X1
0133 X1 = X2
0134 X2 = X3
0135 X3 = X_
0136 .endm
0137
0138 # ROTATE_ARGS
0139 # Rotate values of symbols a...h
0140 .macro ROTATE_ARGS
0141 old_h = h
0142 TMP_ = h
0143 h = g
0144 g = f
0145 f = e
0146 e = d
0147 d = c
0148 c = b
0149 b = a
0150 a = TMP_
0151 .endm
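# For reference, the FIPS 180-4 functions computed by the round macros
# below (upper-case sigmas written S0/S1, message-schedule sigmas s0/s1):
#
#   S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#   S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#   CH(e,f,g)  = ((f ^ g) & e) ^ g
#   MAJ(a,b,c) = ((a | c) & b) | (a & c)
#
#   t1 = h + S1(e) + CH(e,f,g) + K[t] + W[t]
#   t2 = S0(a) + MAJ(a,b,c)
#   new e = d + t1, new a = t1 + t2, the other variables shift down one
#
# Message schedule, computed four words at a time by the vector code in
# FOUR_ROUNDS_AND_SCHED:
#
#   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#   W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])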
0152
0153 .macro FOUR_ROUNDS_AND_SCHED disp
0154 ################################### RND N + 0 ############################
0155
0156 mov a, y3 # y3 = a # MAJA
0157 rorx $25, e, y0 # y0 = e >> 25 # S1A
0158 rorx $11, e, y1 # y1 = e >> 11 # S1B
0159
0160 addl \disp(%rsp, SRND), h # h = k + w + h # --
0161 or c, y3 # y3 = a|c # MAJA
0162 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
0163 mov f, y2 # y2 = f # CH
0164 rorx $13, a, T1 # T1 = a >> 13 # S0B
0165
0166 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0167 xor g, y2 # y2 = f^g # CH
0168 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
0169 rorx $6, e, y1 # y1 = (e >> 6) # S1
0170
0171 and e, y2 # y2 = (f^g)&e # CH
0172 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0173 rorx $22, a, y1 # y1 = a >> 22 # S0A
0174 add h, d # d = k + w + h + d # --
0175
0176 and b, y3 # y3 = (a|c)&b # MAJA
0177 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
0178 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0179 rorx $2, a, T1 # T1 = (a >> 2) # S0
0180
0181 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0182 vpsrld $7, XTMP1, XTMP2
0183 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0184 mov a, T1 # T1 = a # MAJB
0185 and c, T1 # T1 = a&c # MAJB
0186
0187 add y0, y2 # y2 = S1 + CH # --
0188 vpslld $(32-7), XTMP1, XTMP3
0189 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0190 add y1, h # h = k + w + h + S0 # --
0191
0192 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0193 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
0194
0195 vpsrld $18, XTMP1, XTMP2
0196 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0197 add y3, h # h = t1 + S0 + MAJ # --
0198
0199
0200 ROTATE_ARGS
0201
0202 ################################### RND N + 1 ############################
0203
0204 mov a, y3 # y3 = a # MAJA
0205 rorx $25, e, y0 # y0 = e >> 25 # S1A
0206 rorx $11, e, y1 # y1 = e >> 11 # S1B
0207 offset = \disp + 1*4
0208 addl offset(%rsp, SRND), h # h = k + w + h # --
0209 or c, y3 # y3 = a|c # MAJA
0210
0211
0212 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
0213 mov f, y2 # y2 = f # CH
0214 rorx $13, a, T1 # T1 = a >> 13 # S0B
0215 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0216 xor g, y2 # y2 = f^g # CH
0217
0218
0219 rorx $6, e, y1 # y1 = (e >> 6) # S1
0220 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0221 rorx $22, a, y1 # y1 = a >> 22 # S0A
0222 and e, y2 # y2 = (f^g)&e # CH
0223 add h, d # d = k + w + h + d # --
0224
0225 vpslld $(32-18), XTMP1, XTMP1
0226 and b, y3 # y3 = (a|c)&b # MAJA
0227 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0228
0229 vpxor XTMP1, XTMP3, XTMP3
0230 rorx $2, a, T1 # T1 = (a >> 2) # S0
0231 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0232
0233 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
0234 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0235 mov a, T1 # T1 = a # MAJB
0236 and c, T1 # T1 = a&c # MAJB
0237 add y0, y2 # y2 = S1 + CH # --
0238
0239 vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
0240 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
0241 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0242 add y1, h # h = k + w + h + S0 # --
0243
0244 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
0245 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0246 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0247 add y3, h # h = t1 + S0 + MAJ # --
0248
0249 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
0250
0251
0252 ROTATE_ARGS
0253
0254 ################################### RND N + 2 ############################
0255
0256 mov a, y3 # y3 = a # MAJA
0257 rorx $25, e, y0 # y0 = e >> 25 # S1A
0258 offset = \disp + 2*4
0259 addl offset(%rsp, SRND), h # h = k + w + h # --
0260
0261 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
0262 rorx $11, e, y1 # y1 = e >> 11 # S1B
0263 or c, y3 # y3 = a|c # MAJA
0264 mov f, y2 # y2 = f # CH
0265 xor g, y2 # y2 = f^g # CH
0266
0267 rorx $13, a, T1 # T1 = a >> 13 # S0B
0268 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0269 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
0270 and e, y2 # y2 = (f^g)&e # CH
0271
0272 rorx $6, e, y1 # y1 = (e >> 6) # S1
0273 vpxor XTMP3, XTMP2, XTMP2
0274 add h, d # d = k + w + h + d # --
0275 and b, y3 # y3 = (a|c)&b # MAJA
0276
0277 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0278 rorx $22, a, y1 # y1 = a >> 22 # S0A
0279 vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
0280 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0281
0282 vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
0283 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0284 rorx $2, a, T1 # T1 = (a >> 2) # S0
0285 vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
0286
0287 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0288 mov a, T1 # T1 = a # MAJB
0289 and c, T1 # T1 = a&c # MAJB
0290 add y0, y2 # y2 = S1 + CH # --
0291 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
0292
0293 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0294 add y1,h # h = k + w + h + S0 # --
0295 add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
0296 add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0297
0298 add y3,h # h = t1 + S0 + MAJ # --
0299
0300
0301 ROTATE_ARGS
0302
0303 ################################### RND N + 3 ############################
0304
0305 mov a, y3 # y3 = a # MAJA
0306 rorx $25, e, y0 # y0 = e >> 25 # S1A
0307 rorx $11, e, y1 # y1 = e >> 11 # S1B
0308 offset = \disp + 3*4
0309 addl offset(%rsp, SRND), h # h = k + w + h # --
0310 or c, y3 # y3 = a|c # MAJA
0311
0312
0313 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
0314 mov f, y2 # y2 = f # CH
0315 rorx $13, a, T1 # T1 = a >> 13 # S0B
0316 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0317 xor g, y2 # y2 = f^g # CH
0318
0319
0320 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
0321 rorx $6, e, y1 # y1 = (e >> 6) # S1
0322 and e, y2 # y2 = (f^g)&e # CH
0323 add h, d # d = k + w + h + d # --
0324 and b, y3 # y3 = (a|c)&b # MAJA
0325
0326 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
0327 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0328 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0329
0330 vpxor XTMP3, XTMP2, XTMP2
0331 rorx $22, a, y1 # y1 = a >> 22 # S0A
0332 add y0, y2 # y2 = S1 + CH # --
0333
0334 vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
0335 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0336 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0337
0338 rorx $2, a, T1 # T1 = (a >> 2) # S0
0339 vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
0340
0341 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
0342 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0343 mov a, T1 # T1 = a # MAJB
0344 and c, T1 # T1 = a&c # MAJB
0345 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0346
0347 add y1, h # h = k + w + h + S0 # --
0348 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0349 add y3, h # h = t1 + S0 + MAJ # --
0350
0351 ROTATE_ARGS
0352 rotate_Xs
0353 .endm
0354
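# DO_4ROUNDS runs four rounds without message scheduling (used for the
# last 16 rounds of the first block and for the second block's replay in
# loop3). Each round's final two additions into h are deferred to the
# start of the next round (the old_h updates) so that they overlap with
# independent work.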
0355 .macro DO_4ROUNDS disp
0356 ################################### RND N + 0 ###########################
0357
0358 mov f, y2 # y2 = f # CH
0359 rorx $25, e, y0 # y0 = e >> 25 # S1A
0360 rorx $11, e, y1 # y1 = e >> 11 # S1B
0361 xor g, y2 # y2 = f^g # CH
0362
0363 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0364 rorx $6, e, y1 # y1 = (e >> 6) # S1
0365 and e, y2 # y2 = (f^g)&e # CH
0366
0367 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0368 rorx $13, a, T1 # T1 = a >> 13 # S0B
0369 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0370 rorx $22, a, y1 # y1 = a >> 22 # S0A
0371 mov a, y3 # y3 = a # MAJA
0372
0373 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0374 rorx $2, a, T1 # T1 = (a >> 2) # S0
0375 addl \disp(%rsp, SRND), h # h = k + w + h # --
0376 or c, y3 # y3 = a|c # MAJA
0377
0378 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0379 mov a, T1 # T1 = a # MAJB
0380 and b, y3 # y3 = (a|c)&b # MAJA
0381 and c, T1 # T1 = a&c # MAJB
0382 add y0, y2 # y2 = S1 + CH # --
0383
0384
0385 add h, d # d = k + w + h + d # --
0386 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0387 add y1, h # h = k + w + h + S0 # --
0388 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0389
0390 ROTATE_ARGS
0391
0392 ################################### RND N + 1 ###########################
0393
0394 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0395 mov f, y2 # y2 = f # CH
0396 rorx $25, e, y0 # y0 = e >> 25 # S1A
0397 rorx $11, e, y1 # y1 = e >> 11 # S1B
0398 xor g, y2 # y2 = f^g # CH
0399
0400 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0401 rorx $6, e, y1 # y1 = (e >> 6) # S1
0402 and e, y2 # y2 = (f^g)&e # CH
0403 add y3, old_h # h = t1 + S0 + MAJ # --
0404
0405 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0406 rorx $13, a, T1 # T1 = a >> 13 # S0B
0407 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0408 rorx $22, a, y1 # y1 = a >> 22 # S0A
0409 mov a, y3 # y3 = a # MAJA
0410
0411 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0412 rorx $2, a, T1 # T1 = (a >> 2) # S0
0413 offset = 4*1 + \disp
0414 addl offset(%rsp, SRND), h # h = k + w + h # --
0415 or c, y3 # y3 = a|c # MAJA
0416
0417 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0418 mov a, T1 # T1 = a # MAJB
0419 and b, y3 # y3 = (a|c)&b # MAJA
0420 and c, T1 # T1 = a&c # MAJB
0421 add y0, y2 # y2 = S1 + CH # --
0422
0423
0424 add h, d # d = k + w + h + d # --
0425 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0426 add y1, h # h = k + w + h + S0 # --
0427
0428 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0429
0430 ROTATE_ARGS
0431
0432 ################################### RND N + 2 ##############################
0433
0434 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0435 mov f, y2 # y2 = f # CH
0436 rorx $25, e, y0 # y0 = e >> 25 # S1A
0437 rorx $11, e, y1 # y1 = e >> 11 # S1B
0438 xor g, y2 # y2 = f^g # CH
0439
0440 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0441 rorx $6, e, y1 # y1 = (e >> 6) # S1
0442 and e, y2 # y2 = (f^g)&e # CH
0443 add y3, old_h # h = t1 + S0 + MAJ # --
0444
0445 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0446 rorx $13, a, T1 # T1 = a >> 13 # S0B
0447 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0448 rorx $22, a, y1 # y1 = a >> 22 # S0A
0449 mov a, y3 # y3 = a # MAJA
0450
0451 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0452 rorx $2, a, T1 # T1 = (a >> 2) # S0
0453 offset = 4*2 + \disp
0454 addl offset(%rsp, SRND), h # h = k + w + h # --
0455 or c, y3 # y3 = a|c # MAJA
0456
0457 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0458 mov a, T1 # T1 = a # MAJB
0459 and b, y3 # y3 = (a|c)&b # MAJA
0460 and c, T1 # T1 = a&c # MAJB
0461 add y0, y2 # y2 = S1 + CH # --
0462
0463
0464 add h, d # d = k + w + h + d # --
0465 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0466 add y1, h # h = k + w + h + S0 # --
0467
0468 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0469
0470 ROTATE_ARGS
0471
0472 ################################### RND N + 3 ###########################
0473
0474 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0475 mov f, y2 # y2 = f # CH
0476 rorx $25, e, y0 # y0 = e >> 25 # S1A
0477 rorx $11, e, y1 # y1 = e >> 11 # S1B
0478 xor g, y2 # y2 = f^g # CH
0479
0480 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
0481 rorx $6, e, y1 # y1 = (e >> 6) # S1
0482 and e, y2 # y2 = (f^g)&e # CH
0483 add y3, old_h # h = t1 + S0 + MAJ # --
0484
0485 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
0486 rorx $13, a, T1 # T1 = a >> 13 # S0B
0487 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0488 rorx $22, a, y1 # y1 = a >> 22 # S0A
0489 mov a, y3 # y3 = a # MAJA
0490
0491 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
0492 rorx $2, a, T1 # T1 = (a >> 2) # S0
0493 offset = 4*3 + \disp
0494 addl offset(%rsp, SRND), h # h = k + w + h # --
0495 or c, y3 # y3 = a|c # MAJA
0496
0497 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
0498 mov a, T1 # T1 = a # MAJB
0499 and b, y3 # y3 = (a|c)&b # MAJA
0500 and c, T1 # T1 = a&c # MAJB
0501 add y0, y2 # y2 = S1 + CH # --
0502
0503
0504 add h, d # d = k + w + h + d # --
0505 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0506 add y1, h # h = k + w + h + S0 # --
0507
0508 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0509
0510
0511 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0512
0513 add y3, h # h = t1 + S0 + MAJ # --
0514
0515 ROTATE_ARGS
0516
0517 .endm
0518
0519 ########################################################################
0520 ## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
0521 ## arg 1 : pointer to state
0522 ## arg 2 : pointer to input data
0523 ## arg 3 : Num blocks
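##
## Processing outline: NUM_BLKS is converted to a byte count and _INP_END
## is set to the start of the last block. loop0 loads two blocks and
## transposes them, loop1/loop2 run the 64 rounds for the first block
## while scheduling both, and loop3 replays the rounds for the second
## block from the saved W+K values. A single or trailing odd block is
## handled via only_one_block/do_last_block.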
0524 ########################################################################
0525 .text
0526 SYM_FUNC_START(sha256_transform_rorx)
0527 .align 32
0528 pushq %rbx
0529 pushq %r12
0530 pushq %r13
0531 pushq %r14
0532 pushq %r15
0533
0534 push %rbp
0535 mov %rsp, %rbp
0536
0537 subq $STACK_SIZE, %rsp
0538 and $-32, %rsp # align rsp to 32 byte boundary
0539
0540 shl $6, NUM_BLKS # convert to bytes
0541 jz done_hash
0542 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
0543 mov NUM_BLKS, _INP_END(%rsp)
0544
0545 cmp NUM_BLKS, INP
0546 je only_one_block
0547
0548 ## load initial digest
0549 mov (CTX), a
0550 mov 4*1(CTX), b
0551 mov 4*2(CTX), c
0552 mov 4*3(CTX), d
0553 mov 4*4(CTX), e
0554 mov 4*5(CTX), f
0555 mov 4*6(CTX), g
0556 mov 4*7(CTX), h
0557
0558 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
0559 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
0560 vmovdqa _SHUF_DC00(%rip), SHUF_DC00
0561
0562 mov CTX, _CTX(%rsp)
0563
0564 loop0:
0565 ## Load first 16 dwords from two blocks
0566 VMOVDQ 0*32(INP),XTMP0
0567 VMOVDQ 1*32(INP),XTMP1
0568 VMOVDQ 2*32(INP),XTMP2
0569 VMOVDQ 3*32(INP),XTMP3
0570
0571 ## byte swap data
0572 vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
0573 vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
0574 vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
0575 vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
0576
0577 ## transpose data into high/low halves
0578 vperm2i128 $0x20, XTMP2, XTMP0, X0
0579 vperm2i128 $0x31, XTMP2, XTMP0, X1
0580 vperm2i128 $0x20, XTMP3, XTMP1, X2
0581 vperm2i128 $0x31, XTMP3, XTMP1, X3
0582
0583 last_block_enter:
0584 add $64, INP
0585 mov INP, _INP(%rsp)
0586
0587 ## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
0588 xor SRND, SRND
0589
0590 .align 16
0591 loop1:
0592 vpaddd K256+0*32(SRND), X0, XFER
0593 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
0594 FOUR_ROUNDS_AND_SCHED _XFER + 0*32
0595
0596 vpaddd K256+1*32(SRND), X0, XFER
0597 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
0598 FOUR_ROUNDS_AND_SCHED _XFER + 1*32
0599
0600 vpaddd K256+2*32(SRND), X0, XFER
0601 vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
0602 FOUR_ROUNDS_AND_SCHED _XFER + 2*32
0603
0604 vpaddd K256+3*32(SRND), X0, XFER
0605 vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
0606 FOUR_ROUNDS_AND_SCHED _XFER + 3*32
0607
0608 add $4*32, SRND
0609 cmp $3*4*32, SRND
0610 jb loop1
0611
0612 loop2:
0613 ## Do last 16 rounds with no scheduling
0614 vpaddd K256+0*32(SRND), X0, XFER
0615 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
0616 DO_4ROUNDS _XFER + 0*32
0617
0618 vpaddd K256+1*32(SRND), X1, XFER
0619 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
0620 DO_4ROUNDS _XFER + 1*32
0621 add $2*32, SRND
0622
0623 vmovdqa X2, X0
0624 vmovdqa X3, X1
0625
0626 cmp $4*4*32, SRND
0627 jb loop2
0628
0629 mov _CTX(%rsp), CTX
0630 mov _INP(%rsp), INP
0631
0632 addm (4*0)(CTX),a
0633 addm (4*1)(CTX),b
0634 addm (4*2)(CTX),c
0635 addm (4*3)(CTX),d
0636 addm (4*4)(CTX),e
0637 addm (4*5)(CTX),f
0638 addm (4*6)(CTX),g
0639 addm (4*7)(CTX),h
0640
0641 cmp _INP_END(%rsp), INP
0642 ja done_hash
0643
0644 #### Do second block using previously scheduled results
0645 xor SRND, SRND
0646 .align 16
0647 loop3:
0648 DO_4ROUNDS _XFER + 0*32 + 16
0649 DO_4ROUNDS _XFER + 1*32 + 16
0650 add $2*32, SRND
0651 cmp $4*4*32, SRND
0652 jb loop3
0653
0654 mov _CTX(%rsp), CTX
0655 mov _INP(%rsp), INP
0656 add $64, INP
0657
0658 addm (4*0)(CTX),a
0659 addm (4*1)(CTX),b
0660 addm (4*2)(CTX),c
0661 addm (4*3)(CTX),d
0662 addm (4*4)(CTX),e
0663 addm (4*5)(CTX),f
0664 addm (4*6)(CTX),g
0665 addm (4*7)(CTX),h
0666
0667 cmp _INP_END(%rsp), INP
0668 jb loop0
0669 ja done_hash
0670
0671 do_last_block:
0672 VMOVDQ 0*16(INP),XWORD0
0673 VMOVDQ 1*16(INP),XWORD1
0674 VMOVDQ 2*16(INP),XWORD2
0675 VMOVDQ 3*16(INP),XWORD3
0676
0677 vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
0678 vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
0679 vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
0680 vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
0681
0682 jmp last_block_enter
0683
0684 only_one_block:
0685
0686 ## load initial digest
0687 mov (4*0)(CTX),a
0688 mov (4*1)(CTX),b
0689 mov (4*2)(CTX),c
0690 mov (4*3)(CTX),d
0691 mov (4*4)(CTX),e
0692 mov (4*5)(CTX),f
0693 mov (4*6)(CTX),g
0694 mov (4*7)(CTX),h
0695
0696 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
0697 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
0698 vmovdqa _SHUF_DC00(%rip), SHUF_DC00
0699
0700 mov CTX, _CTX(%rsp)
0701 jmp do_last_block
0702
0703 done_hash:
0704
0705 mov %rbp, %rsp
0706 pop %rbp
0707
0708 popq %r15
0709 popq %r14
0710 popq %r13
0711 popq %r12
0712 popq %rbx
0713 RET
0714 SYM_FUNC_END(sha256_transform_rorx)
0715
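# Each group of four round constants below is stored twice (once per
# 128-bit lane) so that a single vpaddd adds K[t] to the message words
# of both interleaved blocks at once.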
0716 .section .rodata.cst512.K256, "aM", @progbits, 512
0717 .align 64
0718 K256:
0719 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
0720 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
0721 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
0722 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
0723 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
0724 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
0725 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
0726 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
0727 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
0728 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
0729 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
0730 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
0731 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
0732 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
0733 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
0734 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
0735 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
0736 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
0737 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
0738 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
0739 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
0740 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
0741 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
0742 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
0743 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
0744 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
0745 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
0746 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
0747 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
0748 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
0749 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
0750 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
0751
0752 .section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
0753 .align 32
0754 PSHUFFLE_BYTE_FLIP_MASK:
0755 .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
0756
0757 # shuffle xBxA -> 00BA
0758 .section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
0759 .align 32
0760 _SHUF_00BA:
0761 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
0762
0763 # shuffle xDxC -> DC00
0764 .section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
0765 .align 32
0766 _SHUF_DC00:
0767 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF