0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #include <linux/linkage.h>
0011
// Assemble into the code section. ".fpu neon" tells the assembler to accept
// the NEON (Advanced SIMD) instructions used throughout this file.
0012 .text
0013 .fpu neon
0014
// Core-register aliases for the four inputs of nh_neon.
//   KEY         - read pointer into the key; post-incremented by each key load
//   MESSAGE     - read pointer into the message; post-incremented by 16 bytes
//                 per stride
//   MESSAGE_LEN - message bytes still to be processed
//   HASH        - address that receives the final 32 bytes of output
// NOTE(review): r0-r3 are presumably the first four C arguments of nh_neon;
// confirm against the C prototype in the glue code.
0015 KEY .req r0
0016 MESSAGE .req r1
0017 MESSAGE_LEN .req r2
0018 HASH .req r3
0019
// One q register per pass, each holding two 64-bit partial sums.
// PASSn_SUM_A / PASSn_SUM_B name the low / high d-register half of
// PASSn_SUMS so that the two partial sums can be added together at the end.
0020 PASS0_SUMS .req q0
0021 PASS0_SUM_A .req d0
0022 PASS0_SUM_B .req d1
0023 PASS1_SUMS .req q1
0024 PASS1_SUM_A .req d2
0025 PASS1_SUM_B .req d3
0026 PASS2_SUMS .req q2
0027 PASS2_SUM_A .req d4
0028 PASS2_SUM_B .req d5
0029 PASS3_SUMS .req q3
0030 PASS3_SUM_A .req d6
0031 PASS3_SUM_B .req d7
// Four q registers that each hold one 16-byte key stride. The _nh_stride
// invocations in nh_neon rotate which register plays which role, so three
// stay loaded while the fourth is refilled on every stride.
0032 K0 .req q4
0033 K1 .req q5
0034 K2 .req q6
0035 K3 .req q7
// Scratch registers. Tn_L / Tn_H are the low / high d-register halves of Tn.
// T3 additionally receives the raw message stride inside _nh_stride, and
// T0-T1 carry the final result to memory at the end of nh_neon.
0036 T0 .req q8
0037 T0_L .req d16
0038 T0_H .req d17
0039 T1 .req q9
0040 T1_L .req d18
0041 T1_H .req d19
0042 T2 .req q10
0043 T2_L .req d20
0044 T2_H .req d21
0045 T3 .req q11
0046 T3_L .req d22
0047 T3_H .req d23
0048
// _nh_stride k0, k1, k2, k3
//
// Process one 16-byte message stride for all four passes.
//
//   k0, k1, k2 - q registers already holding the key strides used by
//                passes 0, 1 and 2
//   k3         - q register that is overwritten here with the next 16 bytes
//                of key, then used by pass 3
//
// For pass p, the four 32-bit message words are added lane-wise (wrapping
// modulo 2^32) to the four words of that pass's key register, giving
// s0..s3. The pass accumulator then gains s0*s2 in its low 64-bit lane and
// s1*s3 in its high 64-bit lane.
//
// Side effects: MESSAGE and KEY each advance by 16 bytes; T0-T3 and the k3
// register are overwritten; PASS0_SUMS..PASS3_SUMS are updated.
0049 .macro _nh_stride k0, k1, k2, k3
0050
0051 // Load next message stride
// T3 holds the raw message words until the last vadd below replaces them.
0052 vld1.8 {T3}, [MESSAGE]!
0053
0054 // Load next key stride
// Only the k3 register is refilled; k0-k2 were loaded by earlier strides
// (or by the prologue of nh_neon).
0055 vld1.32 {\k3}, [KEY]!
0056
0057 // Add message words to key words
// T3 is both source and destination of the last add, so it must come after
// the three adds that still need the unmodified message words.
0058 vadd.u32 T0, T3, \k0
0059 vadd.u32 T1, T3, \k1
0060 vadd.u32 T2, T3, \k2
0061 vadd.u32 T3, T3, \k3
0062
0063 // Multiply 32x32 => 64 and accumulate
// Low half (words 0,1) times high half (words 2,3): lane 0 gets word0*word2,
// lane 1 gets word1*word3, each added to the running 64-bit sum.
0064 vmlal.u32 PASS0_SUMS, T0_L, T0_H
0065 vmlal.u32 PASS1_SUMS, T1_L, T1_H
0066 vmlal.u32 PASS2_SUMS, T2_L, T2_H
0067 vmlal.u32 PASS3_SUMS, T3_L, T3_H
0068 .endm
0069
0070
0071
0072
0073
0074
0075
// nh_neon
//
// Compute four 64-bit NH pass sums over a message and store them, pass 0
// first, as 32 bytes at HASH.
//
// In:
//   KEY (r0)         - key pointer. 48 bytes are read up front and 16 more
//                      per message stride, so 48 + MESSAGE_LEN bytes must be
//                      readable (48 bytes are read even when MESSAGE_LEN is 0).
//   MESSAGE (r1)     - message pointer
//   MESSAGE_LEN (r2) - message length in bytes. The tail code below tests
//                      for an exact zero after each 16-byte step, so the
//                      length has to be a multiple of 16.
//                      NOTE(review): presumably guaranteed by the caller;
//                      confirm.
//   HASH (r3)        - output pointer, 32 bytes written
//
// Overwrites: r0-r2, the condition flags, q0-q11. r3 is left unchanged.
// NOTE(review): q4-q7 (K0-K3) are callee-saved under the ARM procedure call
// standard and are not preserved here; presumably the caller saves the whole
// NEON state around this call - confirm.
// NOTE(review): no return value is set up, so the C prototype is presumably
// void nh_neon(key, message, message_len, hash) - confirm.
0076 ENTRY(nh_neon)
0077
// Preload the first three key strides and zero the four accumulators. The
// fourth key register is filled by the first _nh_stride.
0078 vld1.32 {K0,K1}, [KEY]!
0079 vmov.u64 PASS0_SUMS, #0
0080 vmov.u64 PASS1_SUMS, #0
0081 vld1.32 {K2}, [KEY]!
0082 vmov.u64 PASS2_SUMS, #0
0083 vmov.u64 PASS3_SUMS, #0
0084
// MESSAGE_LEN is kept biased by -64 while in the main loop: a negative
// result means fewer than 64 bytes (four strides) remain.
0085 subs MESSAGE_LEN, MESSAGE_LEN, #64
0086 blt .Lloop4_done
// Main loop: four strides (64 bytes) per iteration. The key register roles
// rotate by one on each stride; after four strides K0-K2 again hold the three
// most recent key strides, ready for the next iteration.
0087 .Lloop4:
0088 _nh_stride K0, K1, K2, K3
0089 _nh_stride K1, K2, K3, K0
0090 _nh_stride K2, K3, K0, K1
0091 _nh_stride K3, K0, K1, K2
0092 subs MESSAGE_LEN, MESSAGE_LEN, #64
0093 bge .Lloop4
0094
// Tail: the low six bits of the (negative) biased counter equal the number
// of bytes left over, 0..63. Handle up to three further strides, continuing
// the same key rotation as the main loop.
0095 .Lloop4_done:
0096 ands MESSAGE_LEN, MESSAGE_LEN, #63
0097 beq .Ldone
0098 _nh_stride K0, K1, K2, K3
0099
0100 subs MESSAGE_LEN, MESSAGE_LEN, #16
0101 beq .Ldone
0102 _nh_stride K1, K2, K3, K0
0103
0104 subs MESSAGE_LEN, MESSAGE_LEN, #16
0105 beq .Ldone
0106 _nh_stride K2, K3, K0, K1
0107
0108 .Ldone:
0109 // Sum the accumulators for each pass, then store the sums to 'hash'
// Each pass has two 64-bit partial sums (lanes A and B); fold them and pack
// the four totals into T0-T1 so that one 32-byte store writes them in pass
// order.
0110 vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B
0111 vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B
0112 vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B
0113 vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B
0114 vst1.8 {T0-T1}, [HASH]
0115 bx lr
0116 ENDPROC(nh_neon)