Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
0004  *
0005  * Copyright 2018 Google LLC
0006  *
0007  * Author: Eric Biggers <ebiggers@google.com>
0008  */
0009 
0010 #include <linux/linkage.h>
0011 
// Accumulators, one per NH pass.  Each register holds two 64-bit partial
// sums (lanes A and B) that nh_sse2 adds together at .Ldone.
0012 #define     PASS0_SUMS  %xmm0
0013 #define     PASS1_SUMS  %xmm1
0014 #define     PASS2_SUMS  %xmm2
0015 #define     PASS3_SUMS  %xmm3
// Sliding window of 16-byte key strides.  The role each register plays is
// rotated from one _nh_stride invocation to the next.
0016 #define     K0      %xmm4
0017 #define     K1      %xmm5
0018 #define     K2      %xmm6
0019 #define     K3      %xmm7
// Scratch registers.  T1-T7 are used inside _nh_stride; T0 and T1 are used
// for the final reduction in nh_sse2.
0020 #define     T0      %xmm8
0021 #define     T1      %xmm9
0022 #define     T2      %xmm10
0023 #define     T3      %xmm11
0024 #define     T4      %xmm12
0025 #define     T5      %xmm13
0026 #define     T6      %xmm14
0027 #define     T7      %xmm15
// Function arguments of nh_sse2, in the order (key, message, message_len,
// hash) given by its prototype comment.
0028 #define     KEY     %rdi
0029 #define     MESSAGE     %rsi
0030 #define     MESSAGE_LEN %rdx
0031 #define     HASH        %rcx
0032 
/*
 * _nh_stride - process one 16-byte message stride for all four NH passes.
 *
 *   \k0, \k1, \k2  registers already holding the key strides used by
 *                  passes 0, 1 and 2 for this message stride
 *   \k3            register that receives the 16 bytes at \offset(KEY) and
 *                  is then used as the key stride for pass 3
 *   \offset        byte displacement applied to both MESSAGE and KEY
 *
 * For pass i the four 32-bit message words are added (mod 2^32) to the four
 * 32-bit words of \ki, giving s0..s3.  The 64-bit products s0*s2 and s1*s3
 * are then added (mod 2^64) to the low and high 64-bit lanes of PASSi_SUMS.
 *
 * On exit \k0 has been overwritten with the pass-0 products, \k1 and \k2
 * are unchanged and \k3 holds the newly loaded key stride.  T1-T7 are
 * clobbered.  MESSAGE and KEY themselves are not advanced.
 */
0033 .macro _nh_stride   k0, k1, k2, k3, offset
0034 
0035     // Load next message stride
0036     movdqu      \offset(MESSAGE), T1
0037 
0038     // Load next key stride
0039     movdqu      \offset(KEY), \k3
0040 
0041     // Add message words to key words
    // (T1 is the source for pass 0 and the destination for pass 1, so the
    // pass-0 add must be issued before T1 is modified.)
0042     movdqa      T1, T2
0043     movdqa      T1, T3
0044     paddd       T1, \k0    // reuse k0 to avoid a move
0045     paddd       \k1, T1
0046     paddd       \k2, T2
0047     paddd       \k3, T3
0048 
0049     // Multiply 32x32 => 64 and accumulate
    // pmuludq multiplies the low 32 bits of each 64-bit lane.  Shuffle
    // 0x10 places s0 and s1 in those positions; shuffle 0x32 places s2 and
    // s3 there.  The products are therefore s0*s2 and s1*s3.
0050     pshufd      $0x10, \k0, T4
0051     pshufd      $0x32, \k0, \k0
0052     pshufd      $0x10, T1, T5
0053     pshufd      $0x32, T1, T1
0054     pshufd      $0x10, T2, T6
0055     pshufd      $0x32, T2, T2
0056     pshufd      $0x10, T3, T7
0057     pshufd      $0x32, T3, T3
0058     pmuludq     T4, \k0
0059     pmuludq     T5, T1
0060     pmuludq     T6, T2
0061     pmuludq     T7, T3
0062     paddq       \k0, PASS0_SUMS
0063     paddq       T1, PASS1_SUMS
0064     paddq       T2, PASS2_SUMS
0065     paddq       T3, PASS3_SUMS
0066 .endm
0067 
0068 /*
0069  * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
0070  *      u8 hash[NH_HASH_BYTES])
0071  *
0072  * It's guaranteed that message_len % 16 == 0.
0073  */
/*
 * Arguments are read from KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx)
 * and HASH (%rcx), as named by the #defines at the top of the file.
 *
 * Outline:
 *   1. Preload three 16-byte key strides into K0-K2 and step KEY past them,
 *      so that each _nh_stride only has to load one new key stride.
 *   2. Main loop: four message strides (0x40 bytes) per iteration.  The key
 *      register arguments are rotated on each _nh_stride so that the
 *      register destroyed by one stride is the one reloaded by the next;
 *      after four strides K0-K2 are valid again, as on loop entry.
 *   3. Tail: up to three further strides for the remaining 0x10, 0x20 or
 *      0x30 bytes.
 *   4. Add the two 64-bit lanes of each PASSn_SUMS and store the four
 *      64-bit results (32 bytes) to HASH using unaligned stores.
 *
 * Modifies KEY, MESSAGE, MESSAGE_LEN, the flags and %xmm0-%xmm15.
 * NOTE(review): nothing here saves or restores the vector registers;
 * presumably the caller is responsible for that - confirm.
 */
0074 SYM_FUNC_START(nh_sse2)
0075 
0076     movdqu      0x00(KEY), K0
0077     movdqu      0x10(KEY), K1
0078     movdqu      0x20(KEY), K2
0079     add     $0x30, KEY
0080     pxor        PASS0_SUMS, PASS0_SUMS
0081     pxor        PASS1_SUMS, PASS1_SUMS
0082     pxor        PASS2_SUMS, PASS2_SUMS
0083     pxor        PASS3_SUMS, PASS3_SUMS
0084 
    // MESSAGE_LEN is kept biased by -0x40 from here on: it is non-negative
    // exactly when at least one more full 0x40-byte group remains.
0085     sub     $0x40, MESSAGE_LEN
0086     jl      .Lloop4_done
0087 .Lloop4:
0088     _nh_stride  K0, K1, K2, K3, 0x00
0089     _nh_stride  K1, K2, K3, K0, 0x10
0090     _nh_stride  K2, K3, K0, K1, 0x20
0091     _nh_stride  K3, K0, K1, K2, 0x30
0092     add     $0x40, KEY
0093     add     $0x40, MESSAGE
0094     sub     $0x40, MESSAGE_LEN
0095     jge     .Lloop4
0096 
0097 .Lloop4_done:
    // The -0x40 bias does not affect the low six bits, so this yields the
    // number of message bytes still to process (0x00, 0x10, 0x20 or 0x30
    // given that message_len is a multiple of 16).
0098     and     $0x3f, MESSAGE_LEN
0099     jz      .Ldone
0100     _nh_stride  K0, K1, K2, K3, 0x00
0101 
0102     sub     $0x10, MESSAGE_LEN
0103     jz      .Ldone
0104     _nh_stride  K1, K2, K3, K0, 0x10
0105 
0106     sub     $0x10, MESSAGE_LEN
0107     jz      .Ldone
    // At most three tail strides can remain, so no further check is needed
    // after this one.
0108     _nh_stride  K2, K3, K0, K1, 0x20
0109 
0110 .Ldone:
0111     // Sum the accumulators for each pass, then store the sums to 'hash'
    // The unpack instructions gather the low (A) lanes and the high (B)
    // lanes of two passes into separate registers, so that one paddq
    // produces the totals for two passes at once.
0112     movdqa      PASS0_SUMS, T0
0113     movdqa      PASS2_SUMS, T1
0114     punpcklqdq  PASS1_SUMS, T0      // => (PASS0_SUM_A PASS1_SUM_A)
0115     punpcklqdq  PASS3_SUMS, T1      // => (PASS2_SUM_A PASS3_SUM_A)
0116     punpckhqdq  PASS1_SUMS, PASS0_SUMS  // => (PASS0_SUM_B PASS1_SUM_B)
0117     punpckhqdq  PASS3_SUMS, PASS2_SUMS  // => (PASS2_SUM_B PASS3_SUM_B)
0118     paddq       PASS0_SUMS, T0
0119     paddq       PASS2_SUMS, T1
0120     movdqu      T0, 0x00(HASH)
0121     movdqu      T1, 0x10(HASH)
0122     RET
0123 SYM_FUNC_END(nh_sse2)