/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
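
/* SSSE3 and AVX-512 implementations of the BLAKE2s compression function */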

#include <linux/linkage.h>

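/*
 * The BLAKE2s IV (the same eight words as SHA-256's IV), stored as two
 * little-endian 128-bit constants: h[0..3] in the first .octa, h[4..7]
 * in the second.
 */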
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
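/* pshufb mask rotating each 32-bit lane right by 16 bits */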
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
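/* pshufb mask rotating each 32-bit lane right by 8 bits */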
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
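/*
 * The ten BLAKE2s message permutations as byte indices, pre-shuffled
 * relative to the sigma table in RFC 7693 so that the round loop below
 * can gather the message words for both G passes with one fixed load
 * pattern.
 */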
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
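/*
 * The permutations again, as 32-bit indices consumed two rows per round
 * by vpermi2d in the AVX-512 path. The rows differ from SIGMA: because
 * the message words in %ymm6/%ymm7 are overwritten with the permuted
 * words each round, each row appears to be composed with the
 * permutations of the rounds before it.
 */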
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
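/*
 * void blake2s_compress_ssse3(struct blake2s_state *state,
 *			       const u8 *block, size_t nblocks, u32 inc);
 *
 * From the register usage below: %rdi points at the state (h[0..7] at
 * offsets 0x00-0x1f, the t/f counters at 0x20), %rsi at the 64-byte
 * message blocks, %rdx holds the block count, and %rcx the amount to
 * add to the 64-bit t counter per block.
 */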
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
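	/*
	 * One BLAKE2s round per iteration: gather four message words per
	 * G pass via the SIGMA byte indices at (%rcx), run the column
	 * step, diagonalize, run the diagonal step, un-diagonalize.
	 */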
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
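	/* rotate right by 12: SSSE3 has no vector rotate, and pshufb
	 * only covers byte-granular amounts, so shift-and-or instead */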
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
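	/* rotate the rows so the diagonal step can reuse the column code */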
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
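	/* un-diagonalize and advance to the next round's SIGMA row */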
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
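	/* finalize: h ^= v[0..7] ^ v[8..15], using the copy of h saved
	 * in %xmm10/%xmm11 at the top of the block loop */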
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
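/*
 * void blake2s_compress_avx512(struct blake2s_state *state,
 *				const u8 *block, size_t nblocks, u32 inc);
 *
 * Same ABI as the SSSE3 version, but vprord does each rotate in one
 * instruction and vpermi2d does the message scheduling. Note there is
 * no early return here: the caller must pass nblocks >= 1.
 */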
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
.Lblake2s_compress_avx512_roundloop:
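	/*
	 * Pull the next two SIGMA2 rows and permute the message words
	 * held in %ymm6/%ymm7 into %ymm8/%ymm9, which also become the
	 * source words for the next round. Each ymm then feeds one G
	 * step: the low xmm lane first, then the extracted high lane.
	 */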
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
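	/* finalize as in the SSSE3 version: h ^= v[0..7] ^ v[8..15] */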
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */