// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

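/*
 * Each routine below XORs its source buffers into p1, handling four
 * 128-bit NEON vectors (64 bytes) per loop iteration.  The do/while
 * loops assume 'bytes' is a non-zero multiple of 64.  NEON registers may
 * only be used inside a kernel_neon_begin()/kernel_neon_end() section,
 * which is the caller's responsibility.
 */
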
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
    const unsigned long * __restrict p2)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 */
        v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
        v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
        v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
        v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

        /* store */
        vst1q_u64(dp1 +  0, v0);
        vst1q_u64(dp1 +  2, v1);
        vst1q_u64(dp1 +  4, v2);
        vst1q_u64(dp1 +  6, v3);

        dp1 += 8;
        dp2 += 8;
    } while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 */
        v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
        v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
        v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
        v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

        /* p1 ^= p3 */
        v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

        /* store */
        vst1q_u64(dp1 +  0, v0);
        vst1q_u64(dp1 +  2, v1);
        vst1q_u64(dp1 +  4, v2);
        vst1q_u64(dp1 +  6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
    } while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3,
    const unsigned long * __restrict p4)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;
    uint64_t *dp4 = (uint64_t *)p4;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 */
        v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
        v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
        v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
        v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

        /* p1 ^= p3 */
        v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

        /* p1 ^= p4 */
        v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

        /* store */
        vst1q_u64(dp1 +  0, v0);
        vst1q_u64(dp1 +  2, v1);
        vst1q_u64(dp1 +  4, v2);
        vst1q_u64(dp1 +  6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
        dp4 += 8;
    } while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3,
    const unsigned long * __restrict p4,
    const unsigned long * __restrict p5)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;
    uint64_t *dp4 = (uint64_t *)p4;
    uint64_t *dp5 = (uint64_t *)p5;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 */
        v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
        v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
        v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
        v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

        /* p1 ^= p3 */
        v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

        /* p1 ^= p4 */
        v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

        /* p1 ^= p5 */
        v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
        v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
        v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
        v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

        /* store */
        vst1q_u64(dp1 +  0, v0);
        vst1q_u64(dp1 +  2, v1);
        vst1q_u64(dp1 +  4, v2);
        vst1q_u64(dp1 +  6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
        dp4 += 8;
        dp5 += 8;
    } while (--lines > 0);
}

struct xor_block_template xor_block_inner_neon __ro_after_init = {
    .name   = "__inner_neon__",
    .do_2   = xor_arm64_neon_2,
    .do_3   = xor_arm64_neon_3,
    .do_4   = xor_arm64_neon_4,
    .do_5   = xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

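/*
 * The ARMv8.2 SHA3 extension provides EOR3, a single instruction that
 * XORs three 128-bit vectors (p ^ q ^ r).  The helper below wraps it in
 * inline assembly; the ".arch_extension sha3" directive lets the
 * assembler accept the instruction even when the base -march level does
 * not include it.
 */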
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
    uint64x2_t res;

    asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
        "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
        : "=w"(res) : "w"(p), "w"(q), "w"(r));
    return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
    unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 ^ p3 */
        v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
              vld1q_u64(dp3 + 0));
        v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
              vld1q_u64(dp3 + 2));
        v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
              vld1q_u64(dp3 + 4));
        v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
              vld1q_u64(dp3 + 6));

        /* store */
        vst1q_u64(dp1 + 0, v0);
        vst1q_u64(dp1 + 2, v1);
        vst1q_u64(dp1 + 4, v2);
        vst1q_u64(dp1 + 6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
    } while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
    unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3,
    const unsigned long * __restrict p4)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;
    uint64_t *dp4 = (uint64_t *)p4;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 ^ p3 */
        v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
              vld1q_u64(dp3 + 0));
        v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
              vld1q_u64(dp3 + 2));
        v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
              vld1q_u64(dp3 + 4));
        v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
              vld1q_u64(dp3 + 6));

        /* p1 ^= p4 */
        v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
        v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
        v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
        v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

        /* store */
        vst1q_u64(dp1 + 0, v0);
        vst1q_u64(dp1 + 2, v1);
        vst1q_u64(dp1 + 4, v2);
        vst1q_u64(dp1 + 6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
        dp4 += 8;
    } while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
    unsigned long * __restrict p1,
    const unsigned long * __restrict p2,
    const unsigned long * __restrict p3,
    const unsigned long * __restrict p4,
    const unsigned long * __restrict p5)
{
    uint64_t *dp1 = (uint64_t *)p1;
    uint64_t *dp2 = (uint64_t *)p2;
    uint64_t *dp3 = (uint64_t *)p3;
    uint64_t *dp4 = (uint64_t *)p4;
    uint64_t *dp5 = (uint64_t *)p5;

    register uint64x2_t v0, v1, v2, v3;
    long lines = bytes / (sizeof(uint64x2_t) * 4);

    do {
        /* p1 ^= p2 ^ p3 */
        v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
              vld1q_u64(dp3 + 0));
        v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
              vld1q_u64(dp3 + 2));
        v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
              vld1q_u64(dp3 + 4));
        v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
              vld1q_u64(dp3 + 6));

        /* p1 ^= p4 ^ p5 */
        v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
        v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
        v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
        v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

        /* store */
        vst1q_u64(dp1 + 0, v0);
        vst1q_u64(dp1 + 2, v1);
        vst1q_u64(dp1 + 4, v2);
        vst1q_u64(dp1 + 6, v3);

        dp1 += 8;
        dp2 += 8;
        dp3 += 8;
        dp4 += 8;
        dp5 += 8;
    } while (--lines > 0);
}

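/*
 * At module init time, if the toolchain can assemble SHA3 instructions
 * (CONFIG_AS_HAS_SHA3) and the running CPU advertises the SHA3 feature,
 * the 3-, 4- and 5-source methods are switched to the EOR3 variants
 * above; otherwise the plain veorq_u64() versions set up in the template
 * remain in use.
 */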
static int __init xor_neon_init(void)
{
    if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
        xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
        xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
        xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
    }
    return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
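
/*
 * Usage sketch (hypothetical caller, compiled out below and not part of
 * this file): other kernel code can invoke the exported template
 * directly, provided the buffer length is a multiple of 64 bytes and
 * NEON use is bracketed by kernel_neon_begin()/kernel_neon_end().  The
 * helper name and the choice of PAGE_SIZE are illustrative assumptions.
 */
#if 0
#include <asm/neon.h>

static void example_xor_one_page(unsigned long *dst, const unsigned long *src)
{
    kernel_neon_begin();
    /* dst ^= src over one page, using the 2-source method */
    xor_block_inner_neon.do_2(PAGE_SIZE, dst, src);
    kernel_neon_end();
}
#endif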