0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #include <linux/linkage.h>
0011
/*
 * Register aliases used throughout this file.
 */

/* Arguments: the first four System V AMD64 integer argument registers. */
#define KEY		%rdi	/* pointer to the key material */
#define MESSAGE		%rsi	/* pointer to the message bytes */
#define MESSAGE_LEN	%rdx	/* remaining message length in bytes */
#define HASH		%rcx	/* pointer to the 32-byte output buffer */

/* One 128-bit accumulator per pass; each holds two 64-bit partial sums. */
#define PASS0_SUMS	%xmm0
#define PASS1_SUMS	%xmm1
#define PASS2_SUMS	%xmm2
#define PASS3_SUMS	%xmm3

/* Window of four consecutive 16-byte key strides. */
#define K0		%xmm4
#define K1		%xmm5
#define K2		%xmm6
#define K3		%xmm7

/* Scratch registers. */
#define T0		%xmm8
#define T1		%xmm9
#define T2		%xmm10
#define T3		%xmm11
#define T4		%xmm12
#define T5		%xmm13
#define T6		%xmm14
#define T7		%xmm15
0032
/*
 * _nh_mul_acc v, tmp, sums
 *
 * Treat v as four 32-bit words (w0, w1, w2, w3), form the two 64-bit
 * products w0 * w2 and w1 * w3, and add them to the two 64-bit lanes of
 * sums.  Clobbers v and tmp.
 */
.macro _nh_mul_acc	v, tmp, sums
	pshufd		$0x10, \v, \tmp		// tmp dwords 0 and 2 = w0, w1
	pshufd		$0x32, \v, \v		// v   dwords 0 and 2 = w2, w3
	pmuludq		\tmp, \v		// v = (w0 * w2, w1 * w3)
	paddq		\v, \sums
.endm

/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process the 16-byte message stride at MESSAGE + offset for all four
 * passes.  On entry k0, k1 and k2 hold the three key strides preceding
 * KEY + offset; the stride at KEY + offset is loaded into k3 here.  Pass i
 * adds the message words to key stride ki (32-bit wrapping adds) and feeds
 * the result to _nh_mul_acc with PASSi_SUMS.
 *
 * Clobbers k0 and T1-T7.  k1, k2 and k3 keep their key strides, so the
 * next invocation can rotate the register arguments by one position.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Fetch the newest key stride and the message stride.
	movdqu		\offset(KEY), \k3
	movdqu		\offset(MESSAGE), T1

	// Passes 2 and 3 need private copies of the message words, because
	// T1 itself becomes the pass 1 working register below.
	movdqa		T1, T3
	movdqa		T1, T2

	// Pass 0: accumulate directly into k0, which is dead after this
	// stride, so no extra move is needed.
	paddd		T1, \k0
	_nh_mul_acc	\k0, T4, PASS0_SUMS

	// Pass 1
	paddd		\k1, T1
	_nh_mul_acc	T1, T5, PASS1_SUMS

	// Pass 2
	paddd		\k2, T2
	_nh_mul_acc	T2, T6, PASS2_SUMS

	// Pass 3
	paddd		\k3, T3
	_nh_mul_acc	T3, T7, PASS3_SUMS
.endm
0067
0068
0069
0070
0071
0072
0073
/*
 * nh_sse2
 *
 * In:   KEY         (%rdi) = key pointer
 *       MESSAGE     (%rsi) = message pointer
 *       MESSAGE_LEN (%rdx) = message length in bytes
 *       HASH        (%rcx) = output pointer; 32 bytes are written
 * Out:  four 64-bit sums, one per pass, stored at HASH in pass order.
 * Clobbers: %rdi, %rsi, %rdx, %xmm0-%xmm15, flags.
 *
 * The message is consumed in 16-byte strides: 64-byte groups in the main
 * loop, then up to three leftover strides.  The leftover checks only test
 * for exactly zero, so the length is presumably required to be a multiple
 * of 16 - confirm against the callers.
 */
SYM_FUNC_START(nh_sse2)

	// Start every pass from a zero sum.
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Prime the key window with the first three strides and step KEY
	// past them, so that offset 0 in _nh_stride names the fourth stride.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY

	// Bias the length by one 64-byte group; a negative result means
	// there is no full group to process.
	sub		$0x40, MESSAGE_LEN
	jl		.Lleftover

.Lfull_group:
	// Four strides per iteration.  The key registers rotate by one
	// position per stride, so after four strides K0-K2 are primed for
	// the next iteration again.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lfull_group

.Lleftover:
	// MESSAGE_LEN is negative here; its low six bits still hold the
	// number of bytes left over after the last full group.
	and		$0x3f, MESSAGE_LEN
	jz		.Lfinish
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Lfinish
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Lfinish
	_nh_stride	K2, K3, K0, K1, 0x20

.Lfinish:
	// Each accumulator holds two partial sums (A in the low quadword,
	// B in the high quadword).  Pair the accumulators up, add A + B for
	// each pass, and store two finished sums at a time.

	// Passes 0 and 1 -> HASH[0..15]
	movdqa		PASS0_SUMS, T0
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	paddq		PASS0_SUMS, T0
	movdqu		T0, 0x00(HASH)

	// Passes 2 and 3 -> HASH[16..31]
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS2_SUMS, T1
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)