#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The input state matrix is expected in v0-v3, and w3 holds the number of
 * rounds.
 *
 * Clobbers: w3, x10, v4, v12
 */
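//
// For reference, the ChaCha quarter-round implemented by the loop below,
// written as a C sketch (an illustration only, not part of this file's
// build; rol32() is assumed to behave like the kernel helper of that name):
//
//	#define CHACHA_QR(a, b, c, d) do {		\
//		a += b; d = rol32(d ^ a, 16);		\
//		c += d; b = rol32(b ^ c, 12);		\
//		a += b; d = rol32(d ^ a,  8);		\
//		c += d; b = rol32(b ^ c,  7);		\
//	} while (0)
//
// Each NEON register v0-v3 holds one row of the 4x4 word matrix, so a single
// pass runs the quarter-round on all four columns in parallel; the ext
// shuffles then rotate rows 1-3 so that a second pass covers the diagonals.
//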
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]
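
	// NEON has no 32-bit rotate instruction, so three idioms are used
	// below: rotate by 16 is a rev32 on 16-bit lanes, rotate by 8 is a
	// byte shuffle (tbl) with the ROT8 mask loaded above, and rotates
	// by 12 and 7 are built from shl plus sri (shift right and insert).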

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
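	//
	// Assumed C-level prototype, inferred from the register comments
	// above:
	//
	//	void chacha_block_xor_neon(const u32 *state, u8 *dst,
	//				   const u8 *src, int nrounds);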

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
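	// The state is loaded twice: v0-v3 is the working copy permuted
	// below, while v8-v11 keeps the original state for the feed-forward
	// addition after the rounds.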

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)
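
//
// In C terms, the function above computes roughly the following (a sketch
// only, reusing the names s, o, i from the register comments, and assuming
// a chacha_permute() that behaves like the asm routine of the same name;
// the unaligned little-endian accessors stand in for the byte order implied
// by the .16b loads and stores):
//
//	u32 x[16];
//
//	memcpy(x, s, sizeof(x));
//	chacha_permute(x, nrounds);
//	for (int n = 0; n < 16; n++)
//		put_unaligned_le32(get_unaligned_le32(i + 4 * n) ^
//				   (x[n] + s[n]), o + 4 * n);
//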

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
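	//
	// Assumed C-level prototype, inferred from the register comments
	// above:
	//
	//	void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);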

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

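	// HChaCha keeps only words 0-3 and 12-15 of the permuted state and,
	// unlike a full ChaCha block, performs no feed-forward addition of
	// the input state, so only v0 and v3 are stored below.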
	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

a0		.req	w12
a1		.req	w13
a2		.req	w14
a3		.req	w15
a4		.req	w16
a5		.req	w17
a6		.req	w19
a7		.req	w20
a8		.req	w21
a9		.req	w22
a10		.req	w23
a11		.req	w24
a12		.req	w25
a13		.req	w26
a14		.req	w27
a15		.req	w28
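
// The a0-a15 aliases name the scalar registers holding the 16 state words of
// the fifth block, which is processed on the integer pipeline in parallel
// with the four NEON blocks (note that w18, the platform register, and the
// frame registers are not used).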

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
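	// x10 now points into the .Lpermute table at offset (byte count % 64).
	// When the count is not a multiple of 64, the 64 bytes at this
	// address are used further down as tbl index vectors that shift the
	// keystream of the final partial block into place for the
	// overlapping stores.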

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. For the final XOR step we
	// transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]
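	// v30 = CTRINC = { 1, 2, 3, 4 }: per-lane counter increments for the
	// four NEON blocks (the scalar fifth block keeps the unincremented
	// counter).  v31 = ROT8: a tbl byte-shuffle mask that rotates each
	// 32-bit word left by 8.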

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
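	// The scalar copies a0-a15 were taken before this add, so the fifth
	// block keeps the base counter value while the NEON lanes get
	// counter + 1 .. counter + 4; the scalar block therefore produces
	// the first 64 bytes of output.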

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
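	// v30 still holds CTRINC at this point.  The per-lane counter
	// offsets are added again because the addition of the initial state
	// below uses v28, which only broadcasts the base counter word.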

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	mov		w6, v16.s[0]
	mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	mov		w8, v18.s[0]
	mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	add		a0, a0, w6
	add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	add		a2, a2, w8
	add		a3, a3, w9
CPU_BE(	  rev		a0, a0	  )
CPU_BE(	  rev		a1, a1	  )
CPU_BE(	  rev		a2, a2	  )
CPU_BE(	  rev		a3, a3	  )

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	mov		w6, v20.s[0]
	mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	mov		w8, v22.s[0]
	mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	add		a4, a4, w6
	add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	add		a6, a6, w8
	add		a7, a7, w9
CPU_BE(	  rev		a4, a4	  )
CPU_BE(	  rev		a5, a5	  )
CPU_BE(	  rev		a6, a6	  )
CPU_BE(	  rev		a7, a7	  )

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	mov		w6, v24.s[0]
	mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	mov		w8, v26.s[0]
	mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	add		a8, a8, w6
	add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	add		a10, a10, w8
	add		a11, a11, w9
CPU_BE(	  rev		a8, a8	  )
CPU_BE(	  rev		a9, a9	  )
CPU_BE(	  rev		a10, a10  )
CPU_BE(	  rev		a11, a11  )

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	mov		w6, v28.s[0]
	mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	mov		w8, v30.s[0]
	mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	add		a12, a12, w6
	add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	add		a14, a14, w8
	add		a15, a15, w9
CPU_BE(	  rev		a12, a12  )
CPU_BE(	  rev		a13, a13  )
CPU_BE(	  rev		a14, a14  )
CPU_BE(	  rev		a15, a15  )

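	// The four NEON blocks are still "transposed": word n of every block
	// lives in lanes 0-3 of register n.  The zip1/zip2 sequences below
	// interleave 32-bit and then 64-bit words to turn them back into
	// four contiguous 64-byte keystream blocks that can be XORed with
	// the input using full-width vectors, as described in the comment at
	// the top of the function.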
	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	eor		a3, a3, w9
	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	eor		a7, a7, w9
	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	eor		a11, a11, w9
	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	eor		a15, a15, w9

	add		x3, x2, x4
	sub		x3, x3, #128		// x3 = start of the last 64 bytes of input

	subs		x5, x4, #128
	csel		x2, x2, x3, ge
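	// This csel, and the ones that follow, redirect the input pointer to
	// the last 64 bytes of the message once the next 64-byte load would
	// run past the end, so all full-width loads stay within the buffer;
	// the .Lt128/.Lt192/.Lt256/.Lt320 paths below then fix up the tail.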

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]

	subs		x6, x4, #192
	ld1		{v16.16b-v19.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]

	subs		x7, x4, #256
	ld1		{v20.16b-v23.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]

	subs		x8, x4, #320
	ld1		{v24.16b-v27.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]

	tbnz		x5, #63, .Lt128
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b

	tbnz		x6, #63, .Lt192

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b

	st1		{v16.16b-v19.16b}, [x1], #64
	tbnz		x7, #63, .Lt256

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b

	st1		{v20.16b-v23.16b}, [x1], #64
	tbnz		x8, #63, .Lt320

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b

	st1		{v24.16b-v27.16b}, [x1], #64
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

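	// Tail handling: when the byte count is not a multiple of 64, the
	// keystream of the final partial block is passed through tbl using
	// the mask loaded from .Lpermute + (count % 64).  This shifts the
	// keystream so that its live bytes line up with the end of the
	// (redirected) final 64 bytes of input, and a full 64-byte store
	// that overlaps the previously written block then produces exactly
	// the remaining output bytes.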
	// fewer than 192 bytes of in/output
.Lt192:	cbz		x5, 1f			// exactly 128 bytes?
	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor		v20.16b, v20.16b, v28.16b
	eor		v21.16b, v21.16b, v29.16b
	eor		v22.16b, v22.16b, v30.16b
	eor		v23.16b, v23.16b, v31.16b
	st1		{v20.16b-v23.16b}, [x5]	// overlapping stores
1:	st1		{v16.16b-v19.16b}, [x1]
	b		.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	sub		x1, x1, #64
	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1		{v16.16b-v19.16b}, [x1]	// reload first output block
	b		0b

	// fewer than 256 bytes of in/output
.Lt256:	cbz		x6, 2f			// exactly 192 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x6, x6, x1
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x6]	// overlapping stores
2:	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz		x7, 3f			// exactly 256 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x7, x7, x1
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x7]	// overlapping stores
3:	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

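// CTRINC holds the per-lane counter increments for the four NEON blocks.
// ROT8 is a tbl index vector ({ 3, 0, 1, 2 }, { 7, 4, 5, 6 }, ...) that
// rotates each 32-bit word left by 8 bits when used as a byte shuffle.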
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f