// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

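/*
 * Each routine below XORs the source buffers into p1 in place, working on
 * 64 bytes (four 128-bit NEON registers) per loop iteration. There is no
 * tail handling, so 'bytes' is assumed to be a non-zero multiple of 64.
 */
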
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4,
	const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

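/*
 * Block template handed to the generic XOR code (linux/raid/xor.h). The
 * NEON routines above are the defaults; xor_neon_init() below may swap the
 * 3/4/5-source hooks for the EOR3 variants when the CPU supports them.
 */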
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

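/*
 * EOR3 is part of the ARMv8.2 SHA3 extension and XORs three 128-bit
 * vectors in a single instruction, roughly halving the number of XOR
 * operations needed when combining three or more sources. It is emitted
 * via inline asm, guarded by an assembler capability check and a runtime
 * CPU feature check in xor_neon_init().
 */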
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));

	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

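/*
 * At init time, upgrade the 3/4/5-source routines to the EOR3 versions if
 * the assembler can emit SHA3 instructions (CONFIG_AS_HAS_SHA3) and the
 * CPU advertises the SHA3 feature. The 2-source case keeps plain EOR,
 * since EOR3 only pays off with at least three operands per XOR.
 */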
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");