#include <linux/linkage.h>

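/* The BLAKE2s initialization vector, split into two 128-bit halves. */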
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
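/* pshufb mask that rotates each 32-bit lane right by 16 bits. */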
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
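/* pshufb mask that rotates each 32-bit lane right by 8 bits. */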
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
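/*
 * BLAKE2s message schedule, one 16-byte row per round.  The word indices
 * are pre-permuted relative to the reference sigma table so that the SSSE3
 * round loop below can gather the words for the four parallel G functions
 * in order.
 */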
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
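/*
 * The message schedule as 32-bit dword indices for vpermi2d in the AVX-512
 * path.  Because the permuted words are written back to %ymm6/%ymm7 each
 * round, every row here is expressed relative to the previous round's word
 * order rather than to the original message.
 */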
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif

.text
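/*
 * blake2s_compress_ssse3: compress one or more 64-byte message blocks into
 * a BLAKE2s state using SSSE3.
 *
 * Arguments (SysV ABI), as used below:
 *	%rdi: state (h[0..7], followed by the t/f words at offset 0x20)
 *	%rsi: message blocks
 *	%rdx: number of 64-byte blocks
 *	%rcx: amount to add to the t counter for each block
 */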
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		/* no blocks to process */
	movdqu		(%rdi),%xmm0		/* h[0..3] */
	movdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	/* t[0..1], f[0..1] */
	movq		%rcx,%xmm15		/* counter increment */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of the ten SIGMA rows */
	jmp		.Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10		/* save h for the feed-forward */
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		/* t += increment */
	movdqa		IV(%rip),%xmm2		/* v[8..11] */
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	/* v[12..15] = (t, f) ^ IV[4..7] */
	leaq		SIGMA(%rip),%rcx
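/*
 * One iteration of this loop is one BLAKE2s round: the current SIGMA row
 * selects sixteen message words, gathered four at a time and fed to the G
 * function applied to all four columns in parallel.  Rotations by 16 and 8
 * use the pshufb masks above; rotations by 12 and 7 use shift/or pairs.
 * Between the two half-rounds the rows are rotated so that the second half
 * operates on the diagonals.
 */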
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0	/* diagonalize: rotate rows a, d, c */
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0	/* undo the diagonalization */
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx		/* next SIGMA row */
	cmpq		%r8,%rcx
	jnz		.Lroundloop		/* until all ten rounds are done */
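	/* Feed forward: h = old h ^ v[0..7] ^ v[8..15], then the next block. */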
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
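/*
 * blake2s_compress_avx512: same interface and argument registers as
 * blake2s_compress_ssse3 above, using the AVX-512VL rotate (vprord) and
 * permute (vpermi2d) instructions.
 */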
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0		/* h[0..3] */
	vmovdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	vmovdqu		0x20(%rdi),%xmm4	/* t[0..1], f[0..1] */
	vmovq		%rcx,%xmm5		/* counter increment */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10		/* save h for the feed-forward */
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	/* t += increment */
	vmovdqa		%xmm14,%xmm2		/* v[8..11] */
	vpxor		%xmm15,%xmm4,%xmm3	/* v[12..15] = (t, f) ^ IV[4..7] */
	vmovdqu		(%rsi),%ymm6		/* message words m[0..7] */
	vmovdqu		0x20(%rsi),%ymm7	/* message words m[8..15] */
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* ten rounds */
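/*
 * One iteration is one round: the round's SIGMA2 row is loaded as two
 * 256-bit halves and vpermi2d gathers the message words out of %ymm6/%ymm7
 * in the order consumed by the four parallel G functions.  vprord performs
 * the 16/12/8/7-bit rotations directly, and vpshufd diagonalizes between
 * the two half-rounds.
 */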
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0	/* diagonalize: rotate rows a, d, c */
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0	/* undo the diagonalization */
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl			/* until all ten rounds are done */
	jne		.Lblake2s_compress_avx512_roundloop
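	/* Feed forward: h = old h ^ v[0..7] ^ v[8..15], then the next block. */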
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */