Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
0004  *
0005  * Copyright 2018 Google LLC
0006  *
0007  * Author: Eric Biggers <ebiggers@google.com>
0008  */
0009 
0010 #include <linux/linkage.h>
0011 
0012     KEY     .req    x0   // arg 0: const u32 *key; post-incremented as key strides are loaded
0013     MESSAGE     .req    x1   // arg 1: const u8 *message; post-incremented as strides are loaded
0014     MESSAGE_LEN .req    x2   // arg 2: size_t message_len; counted down by nh_neon
0015     HASH        .req    x3   // arg 3: u8 hash[NH_HASH_BYTES]; written once at the end
0016 
0017     PASS0_SUMS  .req    v0   // PASS0-3_SUMS: one accumulator per pass, two 64-bit lanes each
0018     PASS1_SUMS  .req    v1
0019     PASS2_SUMS  .req    v2
0020     PASS3_SUMS  .req    v3
0021     K0      .req    v4   // K0-K3: rotating window of four 16-byte key strides
0022     K1      .req    v5   // K0/K1 must stay consecutively numbered (used as an ld1 register list)
0023     K2      .req    v6
0024     K3      .req    v7
0025     T0      .req    v16  // T0-T7: scratch, placed in v16-v23 (caller-saved under AAPCS64);
0026     T1      .req    v17  // v8-v15 are avoided because their low 64 bits are callee-saved
0027     T2      .req    v18  // and this code never saves or restores vector registers.
0028     T3      .req    v19  // T3 also receives each freshly loaded message stride
0029     T4      .req    v20  // T0/T1 must stay consecutively numbered (used as an st1 register list)
0030     T5      .req    v21
0031     T6      .req    v22
0032     T7      .req    v23
0033 
0034 .macro _nh_stride   k0, k1, k2, k3
0035     // One 16-byte message stride for all four passes. k0-k2 must already hold key strides; k3 is loaded here.
0036     // Load next message stride (16 bytes) and advance MESSAGE
0037     ld1     {T3.16b}, [MESSAGE], #16
0038 
0039     // Load next key stride (four 32-bit words) into k3 and advance KEY
0040     ld1     {\k3\().4s}, [KEY], #16
0041 
0042     // Add message words to key words (lane-wise 32-bit adds, one result per pass)
0043     add     T0.4s, T3.4s, \k0\().4s
0044     add     T1.4s, T3.4s, \k1\().4s
0045     add     T2.4s, T3.4s, \k2\().4s
0046     add     T3.4s, T3.4s, \k3\().4s   // last use of the message, so T3 is overwritten in place
0047 
0048     // Multiply 32x32 => 64 and accumulate: each pass adds word0*word2 and word1*word3 of its sum vector
0049     mov     T4.d[0], T0.d[1]   // copy words 2,3 (upper 64 bits) into the low half of a scratch reg
0050     mov     T5.d[0], T1.d[1]
0051     mov     T6.d[0], T2.d[1]
0052     mov     T7.d[0], T3.d[1]
0053     umlal       PASS0_SUMS.2d, T0.2s, T4.2s   // lane 0 += w0*w2, lane 1 += w1*w3 (unsigned, 64-bit lanes)
0054     umlal       PASS1_SUMS.2d, T1.2s, T5.2s
0055     umlal       PASS2_SUMS.2d, T2.2s, T6.2s
0056     umlal       PASS3_SUMS.2d, T3.2s, T7.2s
0057 .endm
0058 
0059 /*
0060  * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
0061  *      u8 hash[NH_HASH_BYTES])
0062  * Stores four 64-bit sums (32 bytes) to 'hash'; reads message_len + 48 bytes of 'key'; no stack use.
0063  * It's guaranteed that message_len % 16 == 0.
0064  */
0065 SYM_FUNC_START(nh_neon)
0066     // Preload the first three key strides; the accumulator zeroing (extra indent) is interleaved with the loads.
0067     ld1     {K0.4s,K1.4s}, [KEY], #32
0068       movi      PASS0_SUMS.2d, #0
0069       movi      PASS1_SUMS.2d, #0
0070     ld1     {K2.4s}, [KEY], #16
0071       movi      PASS2_SUMS.2d, #0
0072       movi      PASS3_SUMS.2d, #0
0073 
0074     subs        MESSAGE_LEN, MESSAGE_LEN, #64   // pre-decrement; negative means fewer than 64 bytes remain
0075     blt     .Lloop4_done   // signed test: skip the unrolled loop entirely
0076 .Lloop4:   // 64 message bytes per iteration; after four strides the key window is back in K0..K3 order
0077     _nh_stride  K0, K1, K2, K3
0078     _nh_stride  K1, K2, K3, K0
0079     _nh_stride  K2, K3, K0, K1
0080     _nh_stride  K3, K0, K1, K2
0081     subs        MESSAGE_LEN, MESSAGE_LEN, #64
0082     bge     .Lloop4   // loop while at least 64 more bytes were available
0083 
0084 .Lloop4_done:
0085     ands        MESSAGE_LEN, MESSAGE_LEN, #63   // MESSAGE_LEN is negative here; its low 6 bits are the bytes left (0/16/32/48)
0086     beq     .Ldone   // no tail strides
0087     _nh_stride  K0, K1, K2, K3
0088 
0089     subs        MESSAGE_LEN, MESSAGE_LEN, #16
0090     beq     .Ldone   // tail was exactly 16 bytes
0091     _nh_stride  K1, K2, K3, K0
0092 
0093     subs        MESSAGE_LEN, MESSAGE_LEN, #16
0094     beq     .Ldone   // tail was exactly 32 bytes
0095     _nh_stride  K2, K3, K0, K1
0096 
0097 .Ldone:
0098     // Sum the two 64-bit lanes of each pass accumulator, then store the four sums (pass 0..3) to 'hash'
0099     addp        T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d   // T0 = {pass0 total, pass1 total}
0100     addp        T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d   // T1 = {pass2 total, pass3 total}
0101     st1     {T0.16b,T1.16b}, [HASH]   // 32-byte store; HASH is not advanced
0102     ret
0103 SYM_FUNC_END(nh_neon)