0001 ########################################################################
0002 # Implement fast SHA-512 with AVX2 instructions. (x86_64)
0003 #
0004 # Copyright (C) 2013 Intel Corporation.
0005 #
0006 # Authors:
0007 # James Guilford <james.guilford@intel.com>
0008 # Kirk Yap <kirk.s.yap@intel.com>
0009 # David Cote <david.m.cote@intel.com>
0010 # Tim Chen <tim.c.chen@linux.intel.com>
0011 #
0012 # This software is available to you under a choice of one of two
0013 # licenses. You may choose to be licensed under the terms of the GNU
0014 # General Public License (GPL) Version 2, available from the file
0015 # COPYING in the main directory of this source tree, or the
0016 # OpenIB.org BSD license below:
0017 #
0018 # Redistribution and use in source and binary forms, with or
0019 # without modification, are permitted provided that the following
0020 # conditions are met:
0021 #
0022 # - Redistributions of source code must retain the above
0023 # copyright notice, this list of conditions and the following
0024 # disclaimer.
0025 #
0026 # - Redistributions in binary form must reproduce the above
0027 # copyright notice, this list of conditions and the following
0028 # disclaimer in the documentation and/or other materials
0029 # provided with the distribution.
0030 #
0031 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0032 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0033 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0034 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0035 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0036 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0037 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0038 # SOFTWARE.
0039 #
0040 ########################################################################
0041 #
0042 # This code is described in an Intel White-Paper:
0043 # "Fast SHA-512 Implementations on Intel Architecture Processors"
0044 #
0045 # To find it, surf to http://www.intel.com/p/en_US/embedded
0046 # and search for that title.
0047 #
0048 ########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
0050 ########################################################################
0051
0052 #include <linux/linkage.h>
0053
0054 .text
0055
0056 # Virtual Registers
0057 Y_0 = %ymm4
0058 Y_1 = %ymm5
0059 Y_2 = %ymm6
0060 Y_3 = %ymm7
0061
0062 YTMP0 = %ymm0
0063 YTMP1 = %ymm1
0064 YTMP2 = %ymm2
0065 YTMP3 = %ymm3
0066 YTMP4 = %ymm8
0067 XFER = YTMP0
0068
0069 BYTE_FLIP_MASK = %ymm9
0070
0071 # 1st arg is %rdi, which is saved to the stack and accessed later via %r12
0072 CTX1 = %rdi
0073 CTX2 = %r12
0074 # 2nd arg
0075 INP = %rsi
0076 # 3rd arg
0077 NUM_BLKS = %rdx
0078
0079 c = %rcx
0080 d = %r8
0081 e = %rdx
0082 y3 = %rsi
0083
0084 TBL = %rdi # clobbers CTX1
0085
0086 a = %rax
0087 b = %rbx
0088
0089 f = %r9
0090 g = %r10
0091 h = %r11
0092 old_h = %r11
0093
0094 T1 = %r12 # clobbers CTX2
0095 y0 = %r13
0096 y1 = %r14
0097 y2 = %r15
0098
0099 # Local variables (stack frame)
0100 XFER_SIZE = 4*8
0101 SRND_SIZE = 1*8
0102 INP_SIZE = 1*8
0103 INPEND_SIZE = 1*8
0104 CTX_SIZE = 1*8
0105
0106 frame_XFER = 0
0107 frame_SRND = frame_XFER + XFER_SIZE
0108 frame_INP = frame_SRND + SRND_SIZE
0109 frame_INPEND = frame_INP + INP_SIZE
0110 frame_CTX = frame_INPEND + INPEND_SIZE
0111 frame_size = frame_CTX + CTX_SIZE
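
# With the sizes above, the frame laid out at %rsp (after the 32-byte
# alignment in the prologue) works out to:
#	  0..31		frame_XFER	current 4 qwords of K[t] + W[t]
#	 32..39		frame_SRND	inner-loop counter
#	 40..47		frame_INP	current input pointer
#	 48..55		frame_INPEND	end-of-input pointer
#	 56..63		frame_CTX	saved state pointer (%rdi)
# for a total frame_size of 64 bytes.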
0112
0113 ## assume buffers not aligned
0114 #define VMOVDQ vmovdqu
0115
0116 # addm [mem], reg
0117 # Add reg to mem using reg-mem add and store
0118 .macro addm p1 p2
0119 add \p1, \p2
0120 mov \p2, \p1
0121 .endm
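
# For example, "addm 8*0(CTX2), a" is equivalent to the C statements
# "a += digest[0]; digest[0] = a;" (digest[] standing in for the state
# words): the digest word is updated and the register keeps the new value.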
0122
0123
0124 # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
0126 .macro COPY_YMM_AND_BSWAP p1 p2 p3
0127 VMOVDQ \p2, \p1
0128 vpshufb \p3, \p1, \p1
0129 .endm
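
# In C terms this is roughly "w[j] = be64_to_cpu(msg[j])" for four message
# qwords at a time (msg[]/w[] are illustrative names): an unaligned 32-byte
# load followed by a big-endian to host-order conversion of each qword.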
0130 # rotate_Ys
# Rotate the values of symbols Y_0...Y_3
0132 .macro rotate_Ys
0133 Y_ = Y_0
0134 Y_0 = Y_1
0135 Y_1 = Y_2
0136 Y_2 = Y_3
0137 Y_3 = Y_
0138 .endm
0139
0140 # RotateState
0141 .macro RotateState
0142 # Rotate symbols a..h right
0143 old_h = h
0144 TMP_ = h
0145 h = g
0146 g = f
0147 f = e
0148 e = d
0149 d = c
0150 c = b
0151 b = a
0152 a = TMP_
0153 .endm
0154
0155 # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
0156 # YDST = {YSRC1, YSRC2} >> RVAL*8
0157 .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
0158 vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YSRC1, YSRC2} >> RVAL*8
0160 .endm
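
# Example: with \RVAL = 8, YSRC1 = {q7,q6,q5,q4} and YSRC2 = {q3,q2,q1,q0}
# (qwords listed high to low), YDST ends up as {q4,q3,q2,q1}: the low four
# qwords of the 8-qword concatenation {YSRC1,YSRC2} shifted right by 8 bytes.
# This is how FOUR_ROUNDS_AND_SCHED extracts W[t-7] and W[t-15] below.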
0161
0162 .macro FOUR_ROUNDS_AND_SCHED
0163 ################################### RND N + 0 #########################################
0164
0165 # Extract w[t-7]
0166 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
0167 # Calculate w[t-16] + w[t-7]
0168 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
0169 # Extract w[t-15]
0170 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
0171
0172 # Calculate sigma0
0173
0174 # Calculate w[t-15] ror 1
0175 vpsrlq $1, YTMP1, YTMP2
0176 vpsllq $(64-1), YTMP1, YTMP3
0177 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
0178 # Calculate w[t-15] shr 7
0179 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
0180
0181 mov a, y3 # y3 = a # MAJA
0182 rorx $41, e, y0 # y0 = e >> 41 # S1A
0183 rorx $18, e, y1 # y1 = e >> 18 # S1B
add frame_XFER(%rsp), h # h = k + w + h # --
0185 or c, y3 # y3 = a|c # MAJA
0186 mov f, y2 # y2 = f # CH
0187 rorx $34, a, T1 # T1 = a >> 34 # S0B
0188
0189 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0190 xor g, y2 # y2 = f^g # CH
0191 rorx $14, e, y1 # y1 = (e >> 14) # S1
0192
0193 and e, y2 # y2 = (f^g)&e # CH
0194 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0195 rorx $39, a, y1 # y1 = a >> 39 # S0A
0196 add h, d # d = k + w + h + d # --
0197
0198 and b, y3 # y3 = (a|c)&b # MAJA
0199 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0200 rorx $28, a, T1 # T1 = (a >> 28) # S0
0201
0202 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0203 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0204 mov a, T1 # T1 = a # MAJB
0205 and c, T1 # T1 = a&c # MAJB
0206
0207 add y0, y2 # y2 = S1 + CH # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0209 add y1, h # h = k + w + h + S0 # --
0210
0211 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0212
0213 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0214 add y3, h # h = t1 + S0 + MAJ # --
0215
0216 RotateState
0217
0218 ################################### RND N + 1 #########################################
0219
0220 # Calculate w[t-15] ror 8
0221 vpsrlq $8, YTMP1, YTMP2
0222 vpsllq $(64-8), YTMP1, YTMP1
0223 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
0224 # XOR the three components
0225 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
0226 vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
0227
0228
0229 # Add three components, w[t-16], w[t-7] and sigma0
0230 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
0231 # Move to appropriate lanes for calculating w[16] and w[17]
0232 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
0233 # Move to appropriate lanes for calculating w[18] and w[19]
0234 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
0235
0236 # Calculate w[16] and w[17] in both 128 bit lanes
0237
0238 # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
0239 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
0240 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
0241
0242
0243 mov a, y3 # y3 = a # MAJA
0244 rorx $41, e, y0 # y0 = e >> 41 # S1A
0245 rorx $18, e, y1 # y1 = e >> 18 # S1B
0246 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
0247 or c, y3 # y3 = a|c # MAJA
0248
0249
0250 mov f, y2 # y2 = f # CH
0251 rorx $34, a, T1 # T1 = a >> 34 # S0B
0252 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0253 xor g, y2 # y2 = f^g # CH
0254
0255
0256 rorx $14, e, y1 # y1 = (e >> 14) # S1
0257 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0258 rorx $39, a, y1 # y1 = a >> 39 # S0A
0259 and e, y2 # y2 = (f^g)&e # CH
0260 add h, d # d = k + w + h + d # --
0261
0262 and b, y3 # y3 = (a|c)&b # MAJA
0263 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0264
0265 rorx $28, a, T1 # T1 = (a >> 28) # S0
0266 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0267
0268 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0269 mov a, T1 # T1 = a # MAJB
0270 and c, T1 # T1 = a&c # MAJB
0271 add y0, y2 # y2 = S1 + CH # --
0272
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0274 add y1, h # h = k + w + h + S0 # --
0275
0276 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0277 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0278 add y3, h # h = t1 + S0 + MAJ # --
0279
0280 RotateState
0281
0282
0283 ################################### RND N + 2 #########################################
0284
0285 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
0286 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
0287 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
0288 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
0289 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
0290 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
0291 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
0292 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
0293 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
0294
# Add sigma1 to the other components to get w[16] and w[17]
0296 vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
0297
0298 # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
0299 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
0300
0301 mov a, y3 # y3 = a # MAJA
0302 rorx $41, e, y0 # y0 = e >> 41 # S1A
0303 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
0304
0305 rorx $18, e, y1 # y1 = e >> 18 # S1B
0306 or c, y3 # y3 = a|c # MAJA
0307 mov f, y2 # y2 = f # CH
0308 xor g, y2 # y2 = f^g # CH
0309
0310 rorx $34, a, T1 # T1 = a >> 34 # S0B
0311 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0312 and e, y2 # y2 = (f^g)&e # CH
0313
0314 rorx $14, e, y1 # y1 = (e >> 14) # S1
0315 add h, d # d = k + w + h + d # --
0316 and b, y3 # y3 = (a|c)&b # MAJA
0317
0318 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0319 rorx $39, a, y1 # y1 = a >> 39 # S0A
0320 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0321
0322 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0323 rorx $28, a, T1 # T1 = (a >> 28) # S0
0324
0325 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0326 mov a, T1 # T1 = a # MAJB
0327 and c, T1 # T1 = a&c # MAJB
0328 add y0, y2 # y2 = S1 + CH # --
0329
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0331 add y1, h # h = k + w + h + S0 # --
0332 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0333 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0334
0335 add y3, h # h = t1 + S0 + MAJ # --
0336
0337 RotateState
0338
0339 ################################### RND N + 3 #########################################
0340
0341 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
0342 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
0343 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
0344 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
0345 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
0346 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
0347 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
0348 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
0349 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
0350
0351 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
0352 # to newly calculated sigma1 to get w[18] and w[19]
0353 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
0354
# Form w[19], w[18], w[17], w[16]
0356 vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
0357
0358 mov a, y3 # y3 = a # MAJA
0359 rorx $41, e, y0 # y0 = e >> 41 # S1A
0360 rorx $18, e, y1 # y1 = e >> 18 # S1B
0361 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
0362 or c, y3 # y3 = a|c # MAJA
0363
0364
0365 mov f, y2 # y2 = f # CH
0366 rorx $34, a, T1 # T1 = a >> 34 # S0B
0367 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0368 xor g, y2 # y2 = f^g # CH
0369
0370
0371 rorx $14, e, y1 # y1 = (e >> 14) # S1
0372 and e, y2 # y2 = (f^g)&e # CH
0373 add h, d # d = k + w + h + d # --
0374 and b, y3 # y3 = (a|c)&b # MAJA
0375
0376 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0377 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0378
0379 rorx $39, a, y1 # y1 = a >> 39 # S0A
0380 add y0, y2 # y2 = S1 + CH # --
0381
0382 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0383 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0384
0385 rorx $28, a, T1 # T1 = (a >> 28) # S0
0386
0387 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0388 mov a, T1 # T1 = a # MAJB
0389 and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0391
0392 add y1, h # h = k + w + h + S0 # --
0393 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0394 add y3, h # h = t1 + S0 + MAJ # --
0395
0396 RotateState
0397
0398 rotate_Ys
0399 .endm
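
########################################################################
# For reference, each W value produced by the macro above follows the
# standard SHA-512 message schedule (FIPS 180-4).  A plain C sketch, with
# an illustrative helper name and w[] as a 16-entry circular buffer of
# the most recent schedule qwords:
#
#	static inline u64 ror64(u64 x, unsigned int n)
#	{
#		return (x >> n) | (x << (64 - n));
#	}
#
#	static u64 sha512_schedule(const u64 w[16], unsigned int t)
#	{
#		u64 w15 = w[(t - 15) & 15], w2 = w[(t - 2) & 15];
#		u64 s0 = ror64(w15, 1) ^ ror64(w15, 8) ^ (w15 >> 7);
#		u64 s1 = ror64(w2, 19) ^ ror64(w2, 61) ^ (w2 >> 6);
#
#		return w[(t - 16) & 15] + s0 + w[(t - 7) & 15] + s1;
#	}
########################################################################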
0400
0401 .macro DO_4ROUNDS
0402
0403 ################################### RND N + 0 #########################################
0404
0405 mov f, y2 # y2 = f # CH
0406 rorx $41, e, y0 # y0 = e >> 41 # S1A
0407 rorx $18, e, y1 # y1 = e >> 18 # S1B
0408 xor g, y2 # y2 = f^g # CH
0409
0410 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0411 rorx $14, e, y1 # y1 = (e >> 14) # S1
0412 and e, y2 # y2 = (f^g)&e # CH
0413
0414 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0415 rorx $34, a, T1 # T1 = a >> 34 # S0B
0416 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0417 rorx $39, a, y1 # y1 = a >> 39 # S0A
0418 mov a, y3 # y3 = a # MAJA
0419
0420 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0421 rorx $28, a, T1 # T1 = (a >> 28) # S0
0422 add frame_XFER(%rsp), h # h = k + w + h # --
0423 or c, y3 # y3 = a|c # MAJA
0424
0425 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0426 mov a, T1 # T1 = a # MAJB
0427 and b, y3 # y3 = (a|c)&b # MAJA
0428 and c, T1 # T1 = a&c # MAJB
0429 add y0, y2 # y2 = S1 + CH # --
0430
0431 add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0433 add y1, h # h = k + w + h + S0 # --
0434
0435 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0436
0437 RotateState
0438
0439 ################################### RND N + 1 #########################################
0440
0441 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0442 mov f, y2 # y2 = f # CH
0443 rorx $41, e, y0 # y0 = e >> 41 # S1A
0444 rorx $18, e, y1 # y1 = e >> 18 # S1B
0445 xor g, y2 # y2 = f^g # CH
0446
0447 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0448 rorx $14, e, y1 # y1 = (e >> 14) # S1
0449 and e, y2 # y2 = (f^g)&e # CH
0450 add y3, old_h # h = t1 + S0 + MAJ # --
0451
0452 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0453 rorx $34, a, T1 # T1 = a >> 34 # S0B
0454 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0455 rorx $39, a, y1 # y1 = a >> 39 # S0A
0456 mov a, y3 # y3 = a # MAJA
0457
0458 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0459 rorx $28, a, T1 # T1 = (a >> 28) # S0
0460 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
0461 or c, y3 # y3 = a|c # MAJA
0462
0463 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0464 mov a, T1 # T1 = a # MAJB
0465 and b, y3 # y3 = (a|c)&b # MAJA
0466 and c, T1 # T1 = a&c # MAJB
0467 add y0, y2 # y2 = S1 + CH # --
0468
0469 add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0471 add y1, h # h = k + w + h + S0 # --
0472
0473 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0474
0475 RotateState
0476
0477 ################################### RND N + 2 #########################################
0478
0479 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0480 mov f, y2 # y2 = f # CH
0481 rorx $41, e, y0 # y0 = e >> 41 # S1A
0482 rorx $18, e, y1 # y1 = e >> 18 # S1B
0483 xor g, y2 # y2 = f^g # CH
0484
0485 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0486 rorx $14, e, y1 # y1 = (e >> 14) # S1
0487 and e, y2 # y2 = (f^g)&e # CH
0488 add y3, old_h # h = t1 + S0 + MAJ # --
0489
0490 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0491 rorx $34, a, T1 # T1 = a >> 34 # S0B
0492 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0493 rorx $39, a, y1 # y1 = a >> 39 # S0A
0494 mov a, y3 # y3 = a # MAJA
0495
0496 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0497 rorx $28, a, T1 # T1 = (a >> 28) # S0
0498 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
0499 or c, y3 # y3 = a|c # MAJA
0500
0501 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0502 mov a, T1 # T1 = a # MAJB
0503 and b, y3 # y3 = (a|c)&b # MAJA
0504 and c, T1 # T1 = a&c # MAJB
0505 add y0, y2 # y2 = S1 + CH # --
0506
0507 add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0509 add y1, h # h = k + w + h + S0 # --
0510
0511 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0512
0513 RotateState
0514
0515 ################################### RND N + 3 #########################################
0516
0517 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0518 mov f, y2 # y2 = f # CH
0519 rorx $41, e, y0 # y0 = e >> 41 # S1A
0520 rorx $18, e, y1 # y1 = e >> 18 # S1B
0521 xor g, y2 # y2 = f^g # CH
0522
0523 xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
0524 rorx $14, e, y1 # y1 = (e >> 14) # S1
0525 and e, y2 # y2 = (f^g)&e # CH
0526 add y3, old_h # h = t1 + S0 + MAJ # --
0527
0528 xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
0529 rorx $34, a, T1 # T1 = a >> 34 # S0B
0530 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
0531 rorx $39, a, y1 # y1 = a >> 39 # S0A
0532 mov a, y3 # y3 = a # MAJA
0533
0534 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
0535 rorx $28, a, T1 # T1 = (a >> 28) # S0
0536 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
0537 or c, y3 # y3 = a|c # MAJA
0538
0539 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
0540 mov a, T1 # T1 = a # MAJB
0541 and b, y3 # y3 = (a|c)&b # MAJA
0542 and c, T1 # T1 = a&c # MAJB
0543 add y0, y2 # y2 = S1 + CH # --
0544
0545
0546 add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
0548 add y1, h # h = k + w + h + S0 # --
0549
0550 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
0551
0552 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
0553
0554 add y3, h # h = t1 + S0 + MAJ # --
0555
0556 RotateState
0557
0558 .endm
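
########################################################################
# For reference, every round interleaved in the two macros above updates
# the working variables as in this C sketch (same Ch/Maj formulations as
# the comments; s[0..7] = a..h, kw = the precomputed K[t] + W[t] qword
# read from frame_XFER, ror64() as in the schedule sketch above):
#
#	static void sha512_round(u64 s[8], u64 kw)
#	{
#		u64 S1  = ror64(s[4], 14) ^ ror64(s[4], 18) ^ ror64(s[4], 41);
#		u64 ch  = ((s[5] ^ s[6]) & s[4]) ^ s[6];
#		u64 t1  = s[7] + S1 + ch + kw;
#		u64 S0  = ror64(s[0], 28) ^ ror64(s[0], 34) ^ ror64(s[0], 39);
#		u64 maj = ((s[0] | s[2]) & s[1]) | (s[0] & s[2]);
#
#		s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
#		s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + S0 + maj;
#	}
#
# The assembly avoids shifting s[] by renaming registers instead
# (RotateState) and folds K[t] + W[t] into h up front.
########################################################################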
0559
0560 ########################################################################
# void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks)
0562 # Purpose: Updates the SHA512 digest stored at "state" with the message
0563 # stored in "data".
0564 # The size of the message pointed to by "data" must be an integer multiple
0565 # of SHA512 message blocks.
0566 # "blocks" is the message length in SHA512 blocks
0567 ########################################################################
0568 SYM_FUNC_START(sha512_transform_rorx)
0569 # Save GPRs
0570 push %rbx
0571 push %r12
0572 push %r13
0573 push %r14
0574 push %r15
0575
0576 # Allocate Stack Space
0577 push %rbp
0578 mov %rsp, %rbp
0579 sub $frame_size, %rsp
0580 and $~(0x20 - 1), %rsp
0581
0582 shl $7, NUM_BLKS # convert to bytes
0583 jz done_hash
0584 add INP, NUM_BLKS # pointer to end of data
0585 mov NUM_BLKS, frame_INPEND(%rsp)
0586
0587 ## load initial digest
0588 mov 8*0(CTX1), a
0589 mov 8*1(CTX1), b
0590 mov 8*2(CTX1), c
0591 mov 8*3(CTX1), d
0592 mov 8*4(CTX1), e
0593 mov 8*5(CTX1), f
0594 mov 8*6(CTX1), g
0595 mov 8*7(CTX1), h
0596
0597 # save %rdi (CTX) before it gets clobbered
0598 mov %rdi, frame_CTX(%rsp)
0599
0600 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
0601
0602 loop0:
0603 lea K512(%rip), TBL
0604
## byte swap first 16 qwords
0606 COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
0607 COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
0608 COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
0609 COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
0610
0611 mov INP, frame_INP(%rsp)
0612
## schedule 64 input qwords, by doing 4 iterations of 16 rounds each
0614 movq $4, frame_SRND(%rsp)
0615
0616 .align 16
0617 loop1:
0618 vpaddq (TBL), Y_0, XFER
0619 vmovdqa XFER, frame_XFER(%rsp)
0620 FOUR_ROUNDS_AND_SCHED
0621
0622 vpaddq 1*32(TBL), Y_0, XFER
0623 vmovdqa XFER, frame_XFER(%rsp)
0624 FOUR_ROUNDS_AND_SCHED
0625
0626 vpaddq 2*32(TBL), Y_0, XFER
0627 vmovdqa XFER, frame_XFER(%rsp)
0628 FOUR_ROUNDS_AND_SCHED
0629
0630 vpaddq 3*32(TBL), Y_0, XFER
0631 vmovdqa XFER, frame_XFER(%rsp)
0632 add $(4*32), TBL
0633 FOUR_ROUNDS_AND_SCHED
0634
0635 subq $1, frame_SRND(%rsp)
0636 jne loop1
0637
0638 movq $2, frame_SRND(%rsp)
0639 loop2:
0640 vpaddq (TBL), Y_0, XFER
0641 vmovdqa XFER, frame_XFER(%rsp)
0642 DO_4ROUNDS
0643 vpaddq 1*32(TBL), Y_1, XFER
0644 vmovdqa XFER, frame_XFER(%rsp)
0645 add $(2*32), TBL
0646 DO_4ROUNDS
0647
0648 vmovdqa Y_2, Y_0
0649 vmovdqa Y_3, Y_1
0650
0651 subq $1, frame_SRND(%rsp)
0652 jne loop2
0653
0654 mov frame_CTX(%rsp), CTX2
0655 addm 8*0(CTX2), a
0656 addm 8*1(CTX2), b
0657 addm 8*2(CTX2), c
0658 addm 8*3(CTX2), d
0659 addm 8*4(CTX2), e
0660 addm 8*5(CTX2), f
0661 addm 8*6(CTX2), g
0662 addm 8*7(CTX2), h
0663
0664 mov frame_INP(%rsp), INP
0665 add $128, INP
0666 cmp frame_INPEND(%rsp), INP
0667 jne loop0
0668
0669 done_hash:
0670
0671 # Restore Stack Pointer
0672 mov %rbp, %rsp
0673 pop %rbp
0674
0675 # Restore GPRs
0676 pop %r15
0677 pop %r14
0678 pop %r13
0679 pop %r12
0680 pop %rbx
0681
0682 RET
0683 SYM_FUNC_END(sha512_transform_rorx)
0684
0685 ########################################################################
0686 ### Binary Data
0687
0688
# Mergeable 640-byte rodata section. This allows the linker to merge the
# table with an identical 640-byte fragment of another rodata section
# (if such a section exists).
0692 .section .rodata.cst640.K512, "aM", @progbits, 640
0693 .align 64
0694 # K[t] used in SHA512 hashing
0695 K512:
0696 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
0697 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
0698 .quad 0x3956c25bf348b538,0x59f111f1b605d019
0699 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
0700 .quad 0xd807aa98a3030242,0x12835b0145706fbe
0701 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
0702 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
0703 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
0704 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
0705 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
0706 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
0707 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
0708 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
0709 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
0710 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
0711 .quad 0x06ca6351e003826f,0x142929670a0e6e70
0712 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
0713 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
0714 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
0715 .quad 0x81c2c92e47edaee6,0x92722c851482353b
0716 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
0717 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
0718 .quad 0xd192e819d6ef5218,0xd69906245565a910
0719 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
0720 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
0721 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
0722 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
0723 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
0724 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
0725 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
0726 .quad 0x90befffa23631e28,0xa4506cebde82bde9
0727 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
0728 .quad 0xca273eceea26619c,0xd186b8c721c0c207
0729 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
0730 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
0731 .quad 0x113f9804bef90dae,0x1b710b35131c471b
0732 .quad 0x28db77f523047d84,0x32caab7b40c72493
0733 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
0734 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
0735 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
0736
0737 .section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
0738 .align 32
# Mask for byte-swapping the qwords in a YMM register using vpshufb.
0740 PSHUFFLE_BYTE_FLIP_MASK:
0741 .octa 0x08090a0b0c0d0e0f0001020304050607
0742 .octa 0x18191a1b1c1d1e1f1011121314151617
0743
0744 .section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
0745 .align 32
0746 MASK_YMM_LO:
0747 .octa 0x00000000000000000000000000000000
0748 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF