#include <linux/linkage.h>

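# ROT8 and ROT16 are vpshufb byte-permutation masks that rotate each
# 32-bit lane left by 8 and 16 bits, respectively.  CTRINC holds the
# per-block counter increments 0..7 used by the eight-block path, while
# CTR2BL and CTR4BL hold the increments {0,1} and {2,3} used by the
# two- and four-block paths.
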
.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 2 data blocks output, o
# %rdx: up to 2 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds

# This function encrypts two ChaCha blocks by loading the state
# matrix twice across four AVX registers. It performs matrix operations
# on four words in each matrix in parallel, but requires shuffling to
# rearrange the words after each round.
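#
# For reference, each group of vector instructions in the loop below
# implements the ChaCha quarter-round
#
#   a += b;  d ^= a;  d = rotl32(d, 16);
#   c += d;  b ^= c;  b = rotl32(b, 12);
#   a += b;  d ^= a;  d = rotl32(d,  8);
#   c += d;  b ^= c;  b = rotl32(b,  7);
#
# applied to all four columns (and then all four diagonals) of each
# matrix at once, with a..d corresponding to %ymm0..%ymm3.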

vzeroupper

# x0..3[0-1] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3

vpaddd CTR2BL(%rip),%ymm3,%ymm3

vmovdqa %ymm0,%ymm8
vmovdqa %ymm1,%ymm9
vmovdqa %ymm2,%ymm10
vmovdqa %ymm3,%ymm11

vmovdqa ROT8(%rip),%ymm4
vmovdqa ROT16(%rip),%ymm5

mov %rcx,%rax

.Ldoubleround:
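# Each iteration of this loop performs one ChaCha double round: a column
# round followed by a diagonal round, with the vpshufd shuffles rotating
# rows 1-3 into diagonal position and back.  %r8d holds the number of
# remaining rounds and is decremented by two per iteration.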

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm5,%ymm3,%ymm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm6
vpslld $12,%ymm6,%ymm6
vpsrld $20,%ymm1,%ymm1
vpor %ymm6,%ymm1,%ymm1

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm4,%ymm3,%ymm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm7
vpslld $7,%ymm7,%ymm7
vpsrld $25,%ymm1,%ymm1
vpor %ymm7,%ymm1,%ymm1

# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm5,%ymm3,%ymm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm6
vpslld $12,%ymm6,%ymm6
vpsrld $20,%ymm1,%ymm1
vpor %ymm6,%ymm1,%ymm1

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm4,%ymm3,%ymm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm7
vpslld $7,%ymm7,%ymm7
vpsrld $25,%ymm1,%ymm1
vpor %ymm7,%ymm1,%ymm1

# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3

sub $2,%r8d
jnz .Ldoubleround

# o0 = i0 ^ (x0 + s0)
vpaddd %ymm8,%ymm0,%ymm7
cmp $0x10,%rax
jl .Lxorpart2
vpxor 0x00(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x00(%rsi)
vextracti128 $1,%ymm7,%xmm0
# o1 = i1 ^ (x1 + s1)
vpaddd %ymm9,%ymm1,%ymm7
cmp $0x20,%rax
jl .Lxorpart2
vpxor 0x10(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x10(%rsi)
vextracti128 $1,%ymm7,%xmm1
# o2 = i2 ^ (x2 + s2)
vpaddd %ymm10,%ymm2,%ymm7
cmp $0x30,%rax
jl .Lxorpart2
vpxor 0x20(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x20(%rsi)
vextracti128 $1,%ymm7,%xmm2
# o3 = i3 ^ (x3 + s3)
vpaddd %ymm11,%ymm3,%ymm7
cmp $0x40,%rax
jl .Lxorpart2
vpxor 0x30(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x30(%rsi)
vextracti128 $1,%ymm7,%xmm3

# xor and write second block
vmovdqa %xmm0,%xmm7
cmp $0x50,%rax
jl .Lxorpart2
vpxor 0x40(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x40(%rsi)

vmovdqa %xmm1,%xmm7
cmp $0x60,%rax
jl .Lxorpart2
vpxor 0x50(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x50(%rsi)

vmovdqa %xmm2,%xmm7
cmp $0x70,%rax
jl .Lxorpart2
vpxor 0x60(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x60(%rsi)

vmovdqa %xmm3,%xmm7
cmp $0x80,%rax
jl .Lxorpart2
vpxor 0x70(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
vzeroupper
RET

.Lxorpart2:
# xor remaining bytes from partial register into output
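# %r9 holds the number of trailing bytes (< 16) and %rax is rounded down
# to the offset of the partial 16-byte chunk.  The tail is copied into an
# aligned scratch slot on the stack, XORed there with the keystream held
# in %xmm7, and copied back to the output; %r10 preserves the original
# stack pointer, which is restored before returning.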
mov %rax,%r9
and $0x0f,%r9
jz .Ldone2
and $~0x0f,%rax

mov %rsi,%r11

lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp

lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb

vpxor 0x00(%rsp),%xmm7,%xmm7
vmovdqa %xmm7,0x00(%rsp)

mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb

lea -8(%r10),%rsp
jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds

# This function encrypts four ChaCha blocks by loading the state
# matrix four times across eight AVX registers. It performs matrix
# operations on four words in two matrices in parallel, interleaved
# with the same operations on the four words of the other two
# matrices. The required word shuffling has a rather high latency,
# but interleaving the arithmetic of the two matrix pairs hides it
# without much slowdown.
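#
# Register usage: %ymm0..3 hold the working state for blocks 1-2 and
# %ymm4..7 for blocks 3-4; %ymm11..14 save the initial state rows of
# blocks 1-2 and %ymm15 the initial fourth row of blocks 3-4 for the
# final feed-forward addition, while %ymm8/%ymm9 hold the ROT8/ROT16
# masks and %ymm10 serves as scratch.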

vzeroupper

# x0..3[0-3] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3

vmovdqa %ymm0,%ymm4
vmovdqa %ymm1,%ymm5
vmovdqa %ymm2,%ymm6
vmovdqa %ymm3,%ymm7

vpaddd CTR2BL(%rip),%ymm3,%ymm3
vpaddd CTR4BL(%rip),%ymm7,%ymm7

vmovdqa %ymm0,%ymm11
vmovdqa %ymm1,%ymm12
vmovdqa %ymm2,%ymm13
vmovdqa %ymm3,%ymm14
vmovdqa %ymm7,%ymm15

vmovdqa ROT8(%rip),%ymm8
vmovdqa ROT16(%rip),%ymm9

mov %rcx,%rax

.Ldoubleround4:

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm9,%ymm3,%ymm3

vpaddd %ymm5,%ymm4,%ymm4
vpxor %ymm4,%ymm7,%ymm7
vpshufb %ymm9,%ymm7,%ymm7

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm10
vpslld $12,%ymm10,%ymm10
vpsrld $20,%ymm1,%ymm1
vpor %ymm10,%ymm1,%ymm1

vpaddd %ymm7,%ymm6,%ymm6
vpxor %ymm6,%ymm5,%ymm5
vmovdqa %ymm5,%ymm10
vpslld $12,%ymm10,%ymm10
vpsrld $20,%ymm5,%ymm5
vpor %ymm10,%ymm5,%ymm5

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm8,%ymm3,%ymm3

vpaddd %ymm5,%ymm4,%ymm4
vpxor %ymm4,%ymm7,%ymm7
vpshufb %ymm8,%ymm7,%ymm7

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm10
vpslld $7,%ymm10,%ymm10
vpsrld $25,%ymm1,%ymm1
vpor %ymm10,%ymm1,%ymm1

vpaddd %ymm7,%ymm6,%ymm6
vpxor %ymm6,%ymm5,%ymm5
vmovdqa %ymm5,%ymm10
vpslld $7,%ymm10,%ymm10
vpsrld $25,%ymm5,%ymm5
vpor %ymm10,%ymm5,%ymm5

# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
vpshufd $0x39,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
vpshufd $0x93,%ymm7,%ymm7

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm9,%ymm3,%ymm3

vpaddd %ymm5,%ymm4,%ymm4
vpxor %ymm4,%ymm7,%ymm7
vpshufb %ymm9,%ymm7,%ymm7

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm10
vpslld $12,%ymm10,%ymm10
vpsrld $20,%ymm1,%ymm1
vpor %ymm10,%ymm1,%ymm1

vpaddd %ymm7,%ymm6,%ymm6
vpxor %ymm6,%ymm5,%ymm5
vmovdqa %ymm5,%ymm10
vpslld $12,%ymm10,%ymm10
vpsrld $20,%ymm5,%ymm5
vpor %ymm10,%ymm5,%ymm5

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxor %ymm0,%ymm3,%ymm3
vpshufb %ymm8,%ymm3,%ymm3

vpaddd %ymm5,%ymm4,%ymm4
vpxor %ymm4,%ymm7,%ymm7
vpshufb %ymm8,%ymm7,%ymm7

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxor %ymm2,%ymm1,%ymm1
vmovdqa %ymm1,%ymm10
vpslld $7,%ymm10,%ymm10
vpsrld $25,%ymm1,%ymm1
vpor %ymm10,%ymm1,%ymm1

vpaddd %ymm7,%ymm6,%ymm6
vpxor %ymm6,%ymm5,%ymm5
vmovdqa %ymm5,%ymm10
vpslld $7,%ymm10,%ymm10
vpsrld $25,%ymm5,%ymm5
vpor %ymm10,%ymm5,%ymm5

# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
vpshufd $0x93,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
vpshufd $0x39,%ymm7,%ymm7

sub $2,%r8d
jnz .Ldoubleround4

# o0 = i0 ^ (x0 + s0), first block
vpaddd %ymm11,%ymm0,%ymm10
cmp $0x10,%rax
jl .Lxorpart4
vpxor 0x00(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x00(%rsi)
vextracti128 $1,%ymm10,%xmm0
# o1 = i1 ^ (x1 + s1), first block
vpaddd %ymm12,%ymm1,%ymm10
cmp $0x20,%rax
jl .Lxorpart4
vpxor 0x10(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x10(%rsi)
vextracti128 $1,%ymm10,%xmm1
# o2 = i2 ^ (x2 + s2), first block
vpaddd %ymm13,%ymm2,%ymm10
cmp $0x30,%rax
jl .Lxorpart4
vpxor 0x20(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x20(%rsi)
vextracti128 $1,%ymm10,%xmm2
# o3 = i3 ^ (x3 + s3), first block
vpaddd %ymm14,%ymm3,%ymm10
cmp $0x40,%rax
jl .Lxorpart4
vpxor 0x30(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x30(%rsi)
vextracti128 $1,%ymm10,%xmm3

# xor and write second block
vmovdqa %xmm0,%xmm10
cmp $0x50,%rax
jl .Lxorpart4
vpxor 0x40(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x40(%rsi)

vmovdqa %xmm1,%xmm10
cmp $0x60,%rax
jl .Lxorpart4
vpxor 0x50(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x50(%rsi)

vmovdqa %xmm2,%xmm10
cmp $0x70,%rax
jl .Lxorpart4
vpxor 0x60(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x60(%rsi)

vmovdqa %xmm3,%xmm10
cmp $0x80,%rax
jl .Lxorpart4
vpxor 0x70(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x70(%rsi)

# o0 = i0 ^ (x0 + s0), third block
vpaddd %ymm11,%ymm4,%ymm10
cmp $0x90,%rax
jl .Lxorpart4
vpxor 0x80(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x80(%rsi)
vextracti128 $1,%ymm10,%xmm4
# o1 = i1 ^ (x1 + s1), third block
vpaddd %ymm12,%ymm5,%ymm10
cmp $0xa0,%rax
jl .Lxorpart4
vpxor 0x90(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x90(%rsi)
vextracti128 $1,%ymm10,%xmm5
# o2 = i2 ^ (x2 + s2), third block
vpaddd %ymm13,%ymm6,%ymm10
cmp $0xb0,%rax
jl .Lxorpart4
vpxor 0xa0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xa0(%rsi)
vextracti128 $1,%ymm10,%xmm6
# o3 = i3 ^ (x3 + s3), third block
vpaddd %ymm15,%ymm7,%ymm10
cmp $0xc0,%rax
jl .Lxorpart4
vpxor 0xb0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xb0(%rsi)
vextracti128 $1,%ymm10,%xmm7

# xor and write fourth block
vmovdqa %xmm4,%xmm10
cmp $0xd0,%rax
jl .Lxorpart4
vpxor 0xc0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xc0(%rsi)

vmovdqa %xmm5,%xmm10
cmp $0xe0,%rax
jl .Lxorpart4
vpxor 0xd0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xd0(%rsi)

vmovdqa %xmm6,%xmm10
cmp $0xf0,%rax
jl .Lxorpart4
vpxor 0xe0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xe0(%rsi)

vmovdqa %xmm7,%xmm10
cmp $0x100,%rax
jl .Lxorpart4
vpxor 0xf0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
vzeroupper
RET

.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax

mov %rsi,%r11

lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp

lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb

vpxor 0x00(%rsp),%xmm10,%xmm10
vmovdqa %xmm10,0x00(%rsp)

mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb

lea -8(%r10),%rsp
jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
# %rdx: up to 8 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds

# This function encrypts eight consecutive ChaCha blocks by loading
# the state matrix into AVX registers eight times. As we need some
# scratch registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For the final XOR
# step we transpose the matrix by interleaving 32-, 64- and then
# 128-bit words, which allows us to do the XOR in AVX registers.
# 8/16-bit word rotation is done with the slightly better performing
# byte shuffling, while 7/12-bit word rotation uses the traditional
# shift+OR.
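#
# Data layout: each of x0..x15 holds the same state word of all eight
# blocks, one block per 32-bit lane; x0..x3 live in the 0x80-byte stack
# frame and x4..x15 in %ymm4..15.  Once x0..x3 have been spilled,
# %ymm1/%ymm2/%ymm3 are reused for CTRINC/ROT8/ROT16 and %ymm0 serves
# as scratch during the rounds.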

vzeroupper
# 4 * 32 byte stack, 32-byte aligned
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
mov %rcx,%rax

# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x0..3 on stack
vmovdqa %ymm0,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm3,0x60(%rsp)

vmovdqa CTRINC(%rip),%ymm1
vmovdqa ROT8(%rip),%ymm2
vmovdqa ROT16(%rip),%ymm3

# x12 += counter values 0-7
vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
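# As above, each iteration is one double round, but here the column
# rounds operate on (x0,x4,x8,x12)..(x3,x7,x11,x15) and the diagonal
# rounds on (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13) and
# (x3,x4,x9,x14), so no shuffling of the state words is needed between
# the two halves.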
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7

# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7

# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4

# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4

sub $2,%r8d
jnz .Ldoubleround8

# x0..15[0-7] += s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpaddd 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpbroadcastd 0x04(%rdi),%ymm0
vpaddd 0x20(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpbroadcastd 0x08(%rdi),%ymm0
vpaddd 0x40(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpbroadcastd 0x0c(%rdi),%ymm0
vpaddd 0x60(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpbroadcastd 0x10(%rdi),%ymm0
vpaddd %ymm0,%ymm4,%ymm4
vpbroadcastd 0x14(%rdi),%ymm0
vpaddd %ymm0,%ymm5,%ymm5
vpbroadcastd 0x18(%rdi),%ymm0
vpaddd %ymm0,%ymm6,%ymm6
vpbroadcastd 0x1c(%rdi),%ymm0
vpaddd %ymm0,%ymm7,%ymm7
vpbroadcastd 0x20(%rdi),%ymm0
vpaddd %ymm0,%ymm8,%ymm8
vpbroadcastd 0x24(%rdi),%ymm0
vpaddd %ymm0,%ymm9,%ymm9
vpbroadcastd 0x28(%rdi),%ymm0
vpaddd %ymm0,%ymm10,%ymm10
vpbroadcastd 0x2c(%rdi),%ymm0
vpaddd %ymm0,%ymm11,%ymm11
vpbroadcastd 0x30(%rdi),%ymm0
vpaddd %ymm0,%ymm12,%ymm12
vpbroadcastd 0x34(%rdi),%ymm0
vpaddd %ymm0,%ymm13,%ymm13
vpbroadcastd 0x38(%rdi),%ymm0
vpaddd %ymm0,%ymm14,%ymm14
vpbroadcastd 0x3c(%rdi),%ymm0
vpaddd %ymm0,%ymm15,%ymm15

# x12 += counter values 0-7
vpaddd %ymm1,%ymm12,%ymm12

# interleave 32-bit words in state n, n+1
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x20(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm1,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpckldq %ymm5,%ymm0,%ymm4
vpunpckhdq %ymm5,%ymm0,%ymm5
vmovdqa %ymm6,%ymm0
vpunpckldq %ymm7,%ymm0,%ymm6
vpunpckhdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpckldq %ymm9,%ymm0,%ymm8
vpunpckhdq %ymm9,%ymm0,%ymm9
vmovdqa %ymm10,%ymm0
vpunpckldq %ymm11,%ymm0,%ymm10
vpunpckhdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpckldq %ymm13,%ymm0,%ymm12
vpunpckhdq %ymm13,%ymm0,%ymm13
vmovdqa %ymm14,%ymm0
vpunpckldq %ymm15,%ymm0,%ymm14
vpunpckhdq %ymm15,%ymm0,%ymm15

# interleave 64-bit words in state n, n+2
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x40(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x00(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpcklqdq %ymm6,%ymm0,%ymm4
vpunpckhqdq %ymm6,%ymm0,%ymm6
vmovdqa %ymm5,%ymm0
vpunpcklqdq %ymm7,%ymm0,%ymm5
vpunpckhqdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpcklqdq %ymm10,%ymm0,%ymm8
vpunpckhqdq %ymm10,%ymm0,%ymm10
vmovdqa %ymm9,%ymm0
vpunpcklqdq %ymm11,%ymm0,%ymm9
vpunpckhqdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpcklqdq %ymm14,%ymm0,%ymm12
vpunpckhqdq %ymm14,%ymm0,%ymm14
vmovdqa %ymm13,%ymm0
vpunpcklqdq %ymm15,%ymm0,%ymm13
vpunpckhqdq %ymm15,%ymm0,%ymm15

# interleave 128-bit words in state n, n+4
# xor/write first four blocks
vmovdqa 0x00(%rsp),%ymm1
vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
cmp $0x0020,%rax
jl .Lxorpart8
vpxor 0x0000(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0000(%rsi)
vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
cmp $0x0040,%rax
jl .Lxorpart8
vpxor 0x0020(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

vmovdqa 0x40(%rsp),%ymm1
vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
cmp $0x0060,%rax
jl .Lxorpart8
vpxor 0x0040(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0040(%rsi)
vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
cmp $0x0080,%rax
jl .Lxorpart8
vpxor 0x0060(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

vmovdqa 0x20(%rsp),%ymm1
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
cmp $0x00a0,%rax
jl .Lxorpart8
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
cmp $0x00c0,%rax
jl .Lxorpart8
vpxor 0x00a0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00a0(%rsi)
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

vmovdqa 0x60(%rsp),%ymm1
vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
cmp $0x00e0,%rax
jl .Lxorpart8
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
cmp $0x0100,%rax
jl .Lxorpart8
vpxor 0x00e0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00e0(%rsi)
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

# xor remaining blocks, write to output
vmovdqa %ymm4,%ymm0
cmp $0x0120,%rax
jl .Lxorpart8
vpxor 0x0100(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0100(%rsi)

vmovdqa %ymm12,%ymm0
cmp $0x0140,%rax
jl .Lxorpart8
vpxor 0x0120(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0120(%rsi)

vmovdqa %ymm6,%ymm0
cmp $0x0160,%rax
jl .Lxorpart8
vpxor 0x0140(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0140(%rsi)

vmovdqa %ymm14,%ymm0
cmp $0x0180,%rax
jl .Lxorpart8
vpxor 0x0160(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0160(%rsi)

vmovdqa %ymm5,%ymm0
cmp $0x01a0,%rax
jl .Lxorpart8
vpxor 0x0180(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0180(%rsi)

vmovdqa %ymm13,%ymm0
cmp $0x01c0,%rax
jl .Lxorpart8
vpxor 0x01a0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x01a0(%rsi)

vmovdqa %ymm7,%ymm0
cmp $0x01e0,%rax
jl .Lxorpart8
vpxor 0x01c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x01c0(%rsi)

vmovdqa %ymm15,%ymm0
cmp $0x0200,%rax
jl .Lxorpart8
vpxor 0x01e0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
vzeroupper
lea -8(%r10),%rsp
RET

.Lxorpart8:
# xor remaining bytes from partial register into output
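# Same scheme as the partial-block handling above, but with 32-byte
# granularity: %r9 holds the tail length (< 32), %rax the offset of the
# partial 32-byte chunk, and the keystream for that chunk is already in
# %ymm0.  The 32-byte aligned stack frame set up at function entry
# serves as the scratch buffer.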
mov %rax,%r9
and $0x1f,%r9
jz .Ldone8
and $~0x1f,%rax

mov %rsi,%r11

lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb

vpxor 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)

mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb

jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)