########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
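#
# For reference, the shorthand used in the per-instruction comments below
# corresponds to the standard FIPS 180-4 SHA-256 functions:
#   CH(e,f,g)  = (e AND f) XOR ((NOT e) AND g),       computed here as ((f^g)&e)^g
#   MAJ(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c), computed as ((a|c)&b)|(a&c)
#   S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#   S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]   for t >= 16
########################################################################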

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
    add \p1, \p2
    mov \p2, \p1
.endm
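# Used when folding the block result back into the state, e.g.
# "addm (4*0)(CTX), a" adds working variable a into the first state word
# at CTX and stores the sum back to memory.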

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP = %rsi  # 2nd arg
CTX = %rdi  # 1st arg
c   = %ecx
d   = %r8d
e       = %edx  # clobbers NUM_BLKS
y3  = %esi  # clobbers INP

SRND    = CTX   # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE  = 2*64*4    # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE   = 8
_CTX_SIZE   = 8

_XFER       = 0
_XMM_SAVE   = _XFER     + _XFER_SIZE
_INP_END    = _XMM_SAVE + _XMM_SAVE_SIZE
_INP        = _INP_END  + _INP_END_SIZE
_CTX        = _INP      + _INP_SIZE
STACK_SIZE  = _CTX      + _CTX_SIZE
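
# Resulting frame layout relative to the 32-byte-aligned %rsp:
#   _XFER    at offset   0 (512 bytes): W[t]+K[t] values for two blocks
#   _INP_END at offset 512: pointer to the last input block
#   _INP     at offset 520: current input pointer
#   _CTX     at offset 528: saved state pointer (CTX doubles as SRND)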

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
    X_ = X0
    X0 = X1
    X1 = X2
    X2 = X3
    X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
    old_h = h
    TMP_ = h
    h = g
    g = f
    f = e
    e = d
    d = c
    c = b
    b = a
    a = TMP_
.endm
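
# Both macros "rotate" at assembly time by reassigning symbols, so no
# register-to-register moves are emitted for the rotation of a..h or
# X0..X3 between rounds.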
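# FOUR_ROUNDS_AND_SCHED performs four rounds of the compression function
# and, interleaved with them, computes the next four message-schedule
# words.  Each X register holds four schedule dwords of block 1 in its
# low 128-bit lane and the matching dwords of block 2 in its high lane,
# so one pass schedules both blocks.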
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

    mov a, y3       # y3 = a                                # MAJA
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B

    addl    \disp(%rsp, SRND), h        # h = k + w + h         # --
    or  c, y3       # y3 = a|c                              # MAJA
    vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
    mov f, y2       # y2 = f                                # CH
    rorx    $13, a, T1  # T1 = a >> 13              # S0B

    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    xor g, y2       # y2 = f^g                              # CH
    vpaddd  X0, XTMP0, XTMP0    # XTMP0 = W[-7] + W[-16]
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1

    and e, y2       # y2 = (f^g)&e                          # CH
    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    add h, d        # d = k + w + h + d                     # --

    and b, y3       # y3 = (a|c)&b                          # MAJA
    vpalignr $4, X0, X1, XTMP1  # XTMP1 = W[-15]
    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0

    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH
    vpsrld  $7, XTMP1, XTMP2
    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and c, T1       # T1 = a&c                              # MAJB

    add y0, y2      # y2 = S1 + CH                          # --
    vpslld  $(32-7), XTMP1, XTMP3
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --

    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --
    vpor    XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7

    vpsrld  $18, XTMP1, XTMP2
    add y2, h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    add y3, h       # h = t1 + S0 + MAJ                     # --


    ROTATE_ARGS

################################### RND N + 1 ############################

    mov a, y3       # y3 = a                                # MAJA
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    offset = \disp + 1*4
    addl    offset(%rsp, SRND), h   # h = k + w + h         # --
    or  c, y3       # y3 = a|c                              # MAJA


    vpsrld  $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
    mov f, y2       # y2 = f                                # CH
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    xor g, y2       # y2 = f^g                              # CH


    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    and e, y2       # y2 = (f^g)&e                          # CH
    add h, d        # d = k + w + h + d                     # --

    vpslld  $(32-18), XTMP1, XTMP1
    and b, y3       # y3 = (a|c)&b                          # MAJA
    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0

    vpxor   XTMP1, XTMP3, XTMP3
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH

    vpxor   XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --

    vpxor   XTMP4, XTMP3, XTMP1 # XTMP1 = s0
    vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --

    vpaddd  XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --
    add y2, h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    add y3, h       # h = t1 + S0 + MAJ                     # --

    vpsrld  $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


    ROTATE_ARGS

################################### RND N + 2 ############################

    mov a, y3       # y3 = a                                # MAJA
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    offset = \disp + 2*4
    addl    offset(%rsp, SRND), h   # h = k + w + h         # --

    vpsrlq  $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    or  c, y3       # y3 = a|c                              # MAJA
    mov f, y2       # y2 = f                                # CH
    xor g, y2       # y2 = f^g                              # CH

    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    vpsrlq  $17, XTMP2, XTMP2   # XTMP2 = W[-2] ror 17 {xBxA}
    and e, y2       # y2 = (f^g)&e                          # CH

    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    vpxor   XTMP3, XTMP2, XTMP2
    add h, d        # d = k + w + h + d                     # --
    and b, y3       # y3 = (a|c)&b                          # MAJA

    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    vpxor   XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH

    vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    vpaddd  XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}

    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --
    vpshufd $0b01010000, XTMP0, XTMP2   # XTMP2 = W[-2] {DDCC}

    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1,h        # h = k + w + h + S0                    # --
    add y2,d        # d = k + w + h + d + S1 + CH = d + t1  # --
    add y2,h        # h = k + w + h + S0 + S1 + CH = t1 + S0# --

    add y3,h        # h = t1 + S0 + MAJ                     # --


    ROTATE_ARGS

################################### RND N + 3 ############################

    mov a, y3       # y3 = a                                # MAJA
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    offset = \disp + 3*4
    addl    offset(%rsp, SRND), h   # h = k + w + h         # --
    or  c, y3       # y3 = a|c                              # MAJA


    vpsrld  $10, XTMP2, XTMP5   # XTMP5 = W[-2] >> 10 {DDCC}
    mov f, y2       # y2 = f                                # CH
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    xor g, y2       # y2 = f^g                              # CH


    vpsrlq  $19, XTMP2, XTMP3   # XTMP3 = W[-2] ror 19 {xDxC}
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    and e, y2       # y2 = (f^g)&e                          # CH
    add h, d        # d = k + w + h + d                     # --
    and b, y3       # y3 = (a|c)&b                          # MAJA

    vpsrlq  $17, XTMP2, XTMP2   # XTMP2 = W[-2] ror 17 {xDxC}
    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH

    vpxor   XTMP3, XTMP2, XTMP2
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    add y0, y2      # y2 = S1 + CH                          # --

    vpxor   XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --

    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

    vpaddd  XTMP0, XTMP5, X0    # X0 = {W[3], W[2], W[1], W[0]}
    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and c, T1       # T1 = a&c                              # MAJB
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

    add y1, h       # h = k + w + h + S0                    # --
    add y2, h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    add y3, h       # h = t1 + S0 + MAJ                     # --

    ROTATE_ARGS
    rotate_Xs
.endm

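# DO_4ROUNDS performs four rounds without message scheduling; it is used
# for the final 16 rounds of a block and for the whole second block,
# whose W+K values were already stored in the _XFER area.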
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

    mov f, y2       # y2 = f                                # CH
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    xor g, y2       # y2 = f^g                              # CH

    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    and e, y2       # y2 = (f^g)&e                          # CH

    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    mov a, y3       # y3 = a                                # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    addl    \disp(%rsp, SRND), h        # h = k + w + h # --
    or  c, y3       # y3 = a|c                              # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and b, y3       # y3 = (a|c)&b                          # MAJA
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --


    add h, d        # d = k + w + h + d                     # --
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --
    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --

    ROTATE_ARGS

################################### RND N + 1 ###########################

    add y2, old_h   # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    mov f, y2       # y2 = f                                # CH
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    xor g, y2       # y2 = f^g                              # CH

    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    and e, y2       # y2 = (f^g)&e                          # CH
    add y3, old_h   # h = t1 + S0 + MAJ                     # --

    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    mov a, y3       # y3 = a                                # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    offset = 4*1 + \disp
    addl    offset(%rsp, SRND), h       # h = k + w + h # --
    or  c, y3       # y3 = a|c                              # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and b, y3       # y3 = (a|c)&b                          # MAJA
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --


    add h, d        # d = k + w + h + d                     # --
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --

    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --

    ROTATE_ARGS

################################### RND N + 2 ##############################

    add y2, old_h   # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    mov f, y2       # y2 = f                                # CH
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    xor g, y2       # y2 = f^g                              # CH

    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    and e, y2       # y2 = (f^g)&e                          # CH
    add y3, old_h   # h = t1 + S0 + MAJ                     # --

    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    mov a, y3       # y3 = a                                # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    offset = 4*2 + \disp
    addl    offset(%rsp, SRND), h       # h = k + w + h # --
    or  c, y3       # y3 = a|c                              # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and b, y3       # y3 = (a|c)&b                          # MAJA
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --


    add h, d        # d = k + w + h + d                     # --
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --

    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --

    ROTATE_ARGS

################################### RND N + 3 ###########################

    add y2, old_h   # h = k + w + h + S0 + S1 + CH = t1 + S0# --
    mov f, y2       # y2 = f                                # CH
    rorx    $25, e, y0  # y0 = e >> 25              # S1A
    rorx    $11, e, y1  # y1 = e >> 11              # S1B
    xor g, y2       # y2 = f^g                              # CH

    xor y1, y0      # y0 = (e>>25) ^ (e>>11)        # S1
    rorx    $6, e, y1   # y1 = (e >> 6)             # S1
    and e, y2       # y2 = (f^g)&e                          # CH
    add y3, old_h   # h = t1 + S0 + MAJ                     # --

    xor y1, y0      # y0 = (e>>25) ^ (e>>11) ^ (e>>6)   # S1
    rorx    $13, a, T1  # T1 = a >> 13              # S0B
    xor g, y2       # y2 = CH = ((f^g)&e)^g                 # CH
    rorx    $22, a, y1  # y1 = a >> 22              # S0A
    mov a, y3       # y3 = a                                # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13)        # S0
    rorx    $2, a, T1   # T1 = (a >> 2)             # S0
    offset = 4*3 + \disp
    addl    offset(%rsp, SRND), h       # h = k + w + h # --
    or  c, y3       # y3 = a|c                              # MAJA

    xor T1, y1      # y1 = (a>>22) ^ (a>>13) ^ (a>>2)   # S0
    mov a, T1       # T1 = a                                # MAJB
    and b, y3       # y3 = (a|c)&b                          # MAJA
    and c, T1       # T1 = a&c                              # MAJB
    add y0, y2      # y2 = S1 + CH                          # --


    add h, d        # d = k + w + h + d                     # --
    or  T1, y3      # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
    add y1, h       # h = k + w + h + S0                    # --

    add y2, d       # d = k + w + h + d + S1 + CH = d + t1  # --


    add y2, h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --

    add y3, h       # h = t1 + S0 + MAJ                     # --

    ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks
########################################################################
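# Flow: loop0 loads and byte-swaps two 64-byte blocks; loop1 runs the
# first 48 rounds of block 1 while saving W+K for both blocks in _XFER;
# loop2 finishes the last 16 rounds of block 1; loop3 then runs all 64
# rounds of block 2 from the saved values at _XFER+16 (the high-lane
# halves).  A single remaining block is handled via do_last_block /
# only_one_block.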
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
    pushq   %rbx
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15

    push    %rbp
    mov %rsp, %rbp

    subq    $STACK_SIZE, %rsp
    and $-32, %rsp  # align rsp to 32 byte boundary

    shl $6, NUM_BLKS    # convert to bytes
    jz  done_hash
    lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
    mov NUM_BLKS, _INP_END(%rsp)

    cmp NUM_BLKS, INP
    je  only_one_block

    ## load initial digest
    mov (CTX), a
    mov 4*1(CTX), b
    mov 4*2(CTX), c
    mov 4*3(CTX), d
    mov 4*4(CTX), e
    mov 4*5(CTX), f
    mov 4*6(CTX), g
    mov 4*7(CTX), h

    vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
    vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
    vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

    mov CTX, _CTX(%rsp)

loop0:
    ## Load first 16 dwords from two blocks
    VMOVDQ  0*32(INP),XTMP0
    VMOVDQ  1*32(INP),XTMP1
    VMOVDQ  2*32(INP),XTMP2
    VMOVDQ  3*32(INP),XTMP3

    ## byte swap data
    vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
    vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
    vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
    vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

    ## transpose data into high/low halves
    vperm2i128  $0x20, XTMP2, XTMP0, X0
    vperm2i128  $0x31, XTMP2, XTMP0, X1
    vperm2i128  $0x20, XTMP3, XTMP1, X2
    vperm2i128  $0x31, XTMP3, XTMP1, X3

last_block_enter:
    add $64, INP
    mov INP, _INP(%rsp)

    ## schedule 48 input dwords, by doing 3 iterations of 16 each
    xor SRND, SRND

.align 16
loop1:
    vpaddd  K256+0*32(SRND), X0, XFER
    vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
    FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

    vpaddd  K256+1*32(SRND), X0, XFER
    vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
    FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

    vpaddd  K256+2*32(SRND), X0, XFER
    vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
    FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

    vpaddd  K256+3*32(SRND), X0, XFER
    vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
    FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

    add $4*32, SRND
    cmp $3*4*32, SRND
    jb  loop1

loop2:
    ## Do last 16 rounds with no scheduling
    vpaddd  K256+0*32(SRND), X0, XFER
    vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
    DO_4ROUNDS  _XFER + 0*32

    vpaddd  K256+1*32(SRND), X1, XFER
    vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
    DO_4ROUNDS  _XFER + 1*32
    add $2*32, SRND

    vmovdqa X2, X0
    vmovdqa X3, X1

    cmp $4*4*32, SRND
    jb  loop2

    mov _CTX(%rsp), CTX
    mov _INP(%rsp), INP

    addm    (4*0)(CTX),a
    addm    (4*1)(CTX),b
    addm    (4*2)(CTX),c
    addm    (4*3)(CTX),d
    addm    (4*4)(CTX),e
    addm    (4*5)(CTX),f
    addm    (4*6)(CTX),g
    addm    (4*7)(CTX),h

    cmp _INP_END(%rsp), INP
    ja  done_hash

    #### Do second block using previously scheduled results
    xor SRND, SRND
.align 16
loop3:
    DO_4ROUNDS   _XFER + 0*32 + 16
    DO_4ROUNDS   _XFER + 1*32 + 16
    add $2*32, SRND
    cmp $4*4*32, SRND
    jb  loop3

    mov _CTX(%rsp), CTX
    mov _INP(%rsp), INP
    add $64, INP

    addm    (4*0)(CTX),a
    addm    (4*1)(CTX),b
    addm    (4*2)(CTX),c
    addm    (4*3)(CTX),d
    addm    (4*4)(CTX),e
    addm    (4*5)(CTX),f
    addm    (4*6)(CTX),g
    addm    (4*7)(CTX),h

    cmp _INP_END(%rsp), INP
    jb  loop0
    ja  done_hash

do_last_block:
    VMOVDQ  0*16(INP),XWORD0
    VMOVDQ  1*16(INP),XWORD1
    VMOVDQ  2*16(INP),XWORD2
    VMOVDQ  3*16(INP),XWORD3

    vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
    vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
    vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
    vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

    jmp last_block_enter

only_one_block:

    ## load initial digest
    mov (4*0)(CTX),a
    mov (4*1)(CTX),b
    mov (4*2)(CTX),c
    mov (4*3)(CTX),d
    mov (4*4)(CTX),e
    mov (4*5)(CTX),f
    mov (4*6)(CTX),g
    mov (4*7)(CTX),h

    vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
    vmovdqa _SHUF_00BA(%rip), SHUF_00BA
    vmovdqa _SHUF_DC00(%rip), SHUF_DC00

    mov CTX, _CTX(%rsp)
    jmp do_last_block

done_hash:

    mov %rbp, %rsp
    pop %rbp

    popq    %r15
    popq    %r14
    popq    %r13
    popq    %r12
    popq    %rbx
    RET
SYM_FUNC_END(sha256_transform_rorx)

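# Each group of four round constants below is stored twice so that a
# single 256-bit vpaddd adds the same K values to block 1 (low lane) and
# block 2 (high lane) of an X register.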
.section    .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section    .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
    .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section    .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
    .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section    .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
    .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF