0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #include <linux/linkage.h>
0011
0012 KEY .req x0
0013 MESSAGE .req x1
0014 MESSAGE_LEN .req x2
0015 HASH .req x3
0016
0017 PASS0_SUMS .req v0
0018 PASS1_SUMS .req v1
0019 PASS2_SUMS .req v2
0020 PASS3_SUMS .req v3
0021 K0 .req v4
0022 K1 .req v5
0023 K2 .req v6
0024 K3 .req v7
0025 T0 .req v8
0026 T1 .req v9
0027 T2 .req v10
0028 T3 .req v11
0029 T4 .req v12
0030 T5 .req v13
0031 T6 .req v14
0032 T7 .req v15
0033
/*
 * _nh_stride - fold one 16-byte message stride into all four pass sums.
 *
 * key_a, key_b, key_c:	vector registers that are only read here; they
 *			supply the key words for passes 0, 1 and 2.
 * key_in:		vector register overwritten with the next 16 bytes
 *			read from KEY; it supplies the key words for pass 3.
 *
 * Reads 16 bytes from MESSAGE and 16 bytes from KEY, post-incrementing both
 * pointers by 16.  Overwrites T0-T7 and accumulates into
 * PASS0_SUMS-PASS3_SUMS.
 */
.macro _nh_stride	key_a, key_b, key_c, key_in

	// Fetch 16 message bytes; T3 holds them until the last add below
	ld1		{T3.16b}, [MESSAGE], #16

	// Fetch the newest 16 bytes of key material
	ld1		{\key_in\().4s}, [KEY], #16

	// Per pass: 32-bit lane-wise message + key.  T3 is both source and
	// destination of the final add, so that add has to stay last.
	add		T0.4s, T3.4s, \key_a\().4s
	add		T1.4s, T3.4s, \key_b\().4s
	add		T2.4s, T3.4s, \key_c\().4s
	add		T3.4s, T3.4s, \key_in\().4s

	// Copy the upper 64 bits (words 2,3) of each sum vector into the low
	// half of a second register so they line up with words 0,1.
	ins		T4.d[0], T0.d[1]
	ins		T5.d[0], T1.d[1]
	ins		T6.d[0], T2.d[1]
	ins		T7.d[0], T3.d[1]

	// Unsigned 32x32 => 64 multiply-accumulate per pass:
	//   sums[0] += word0 * word2,  sums[1] += word1 * word3
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
0058
0059
0060
0061
0062
0063
0064
/*
 * nh_neon - compute four 64-bit NH pass sums over a message.
 *
 * In:	x0 (KEY)		key pointer
 *	x1 (MESSAGE)		message pointer
 *	x2 (MESSAGE_LEN)	message length in bytes
 *	x3 (HASH)		output pointer; 32 bytes are written
 *
 * The tail handling below only recognises remainders of exactly 0, 16, 32
 * or 48 bytes, so the length has to be a multiple of 16; other lengths are
 * not handled.  The key must provide 48 bytes up front plus 16 more for
 * every 16-byte message stride.
 *
 * Modifies x0-x2, v0-v15 and the condition flags.
 */
SYM_FUNC_START(nh_neon)

	// Prime the key window with three strides and zero the accumulators
	ld1		{K0.4s,K1.4s}, [KEY], #32
	  movi		PASS0_SUMS.2d, #0
	  movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	  movi		PASS2_SUMS.2d, #0
	  movi		PASS3_SUMS.2d, #0

	// Bias the length by -64 so the sign tells whether a full 64-byte
	// group is still available.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.lt		.Lnh_tail
.Lnh_loop64:
	// Four strides per iteration; rotating the register roles brings
	// the key window back to its starting layout after each group.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.ge		.Lnh_loop64

.Lnh_tail:
	// The low six bits are unaffected by the -64 bias, so this recovers
	// the number of leftover bytes (0..63).
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	b.eq		.Lnh_out
	_nh_stride	K0, K1, K2, K3

	// Exactly 16 bytes were left?
	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Lnh_out
	_nh_stride	K1, K2, K3, K0

	// Exactly 32 bytes were left?  Otherwise do the third and last stride.
	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Lnh_out
	_nh_stride	K2, K3, K0, K1

.Lnh_out:
	// Add the two 64-bit lanes of each pass accumulator together, then
	// write the four resulting sums (pass 0 first) to 'hash'.
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)