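/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 */
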
#include <linux/linkage.h>
#include <asm/frame.h>

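# pshufb masks implementing a 32-bit rotate-left by 8 and by 16, plus the
# per-lane counter increments (0,1,2,3) used by the four-block routine.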
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

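# chacha_permute - permute one ChaCha block
#
# The state matrix is expected in %xmm0..%xmm3 and is permuted in place.
# The (even) round count is passed in %r8d.
#
# Clobbers: %r8d, %xmm4-%xmm7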
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa ROT8(%rip),%xmm4
	movdqa ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm3,%xmm3

	sub $2,%r8d
	jnz .Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
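	# The length need not be a multiple of 16: full 16-byte chunks of the
	# block are XORed in the straight-line code below, and any trailing
	# partial chunk is handled at .Lxorpart.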
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3
	movdqa %xmm0,%xmm8
	movdqa %xmm1,%xmm9
	movdqa %xmm2,%xmm10
	movdqa %xmm3,%xmm11

	mov %rcx,%rax
	call chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd %xmm8,%xmm0
	cmp $0x10,%rax
	jl .Lxorpart
	movdqu 0x00(%rdx),%xmm4
	pxor %xmm4,%xmm0
	movdqu %xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd %xmm9,%xmm1
	movdqa %xmm1,%xmm0
	cmp $0x20,%rax
	jl .Lxorpart
	movdqu 0x10(%rdx),%xmm0
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd %xmm10,%xmm2
	movdqa %xmm2,%xmm0
	cmp $0x30,%rax
	jl .Lxorpart
	movdqu 0x20(%rdx),%xmm0
	pxor %xmm2,%xmm0
	movdqu %xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd %xmm11,%xmm3
	movdqa %xmm3,%xmm0
	cmp $0x40,%rax
	jl .Lxorpart
	movdqu 0x30(%rdx),%xmm0
	pxor %xmm3,%xmm0
	movdqu %xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
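	# %r9 = number of trailing bytes, %rax = offset of the partial 16-byte
	# chunk; the tail is copied through an aligned stack buffer so it can
	# be XORed with the keystream chunk left in %xmm0.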
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
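	# HChaCha runs the permutation only: words 0..3 and 12..15 of the
	# permuted state are written out, with no feed-forward addition.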
	FRAME_BEGIN

	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3

	mov %edx,%r8d
	call chacha_permute

	movdqu %xmm0,0x00(%rsi)
	movdqu %xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix into SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32-bit and then 64-bit
	# words, which allows us to do the XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better performing SSSE3 byte
	# shuffling; 7/12-bit word rotation uses the traditional shift+OR.

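	# %r10 preserves the pre-call stack pointer (restored at .Ldone4), the
	# length moves from %rcx to %rax, and rows x0..x3 of the four-way state
	# are spilled to the 64-byte-aligned scratch area at (%rsp), since the
	# sixteen state vectors plus temporaries exceed the 16 XMM registers.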
	lea 8(%rsp),%r10
	sub $0x80,%rsp
	and $~63,%rsp
	mov %rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq 0x00(%rdi),%xmm1
	pshufd $0x00,%xmm1,%xmm0
	pshufd $0x55,%xmm1,%xmm1
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	movq 0x10(%rdi),%xmm5
	pshufd $0x00,%xmm5,%xmm4
	pshufd $0x55,%xmm5,%xmm5
	movq 0x18(%rdi),%xmm7
	pshufd $0x00,%xmm7,%xmm6
	pshufd $0x55,%xmm7,%xmm7
	movq 0x20(%rdi),%xmm9
	pshufd $0x00,%xmm9,%xmm8
	pshufd $0x55,%xmm9,%xmm9
	movq 0x28(%rdi),%xmm11
	pshufd $0x00,%xmm11,%xmm10
	pshufd $0x55,%xmm11,%xmm11
	movq 0x30(%rdi),%xmm13
	pshufd $0x00,%xmm13,%xmm12
	pshufd $0x55,%xmm13,%xmm13
	movq 0x38(%rdi),%xmm15
	pshufd $0x00,%xmm15,%xmm14
	pshufd $0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa %xmm0,0x00(%rsp)
	movdqa %xmm1,0x10(%rsp)
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm3,0x30(%rsp)

	movdqa CTRINC(%rip),%xmm1
	movdqa ROT8(%rip),%xmm2
	movdqa ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

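	# The steps below are the ChaCha quarter-round
	#
	#	a += b;  d ^= a;  d = rol32(d, 16);
	#	c += d;  b ^= c;  b = rol32(b, 12);
	#	a += b;  d ^= a;  d = rol32(d, 8);
	#	c += d;  b ^= c;  b = rol32(b, 7);
	#
	# applied to the four columns and then the four diagonals of the state,
	# with four independent blocks processed in parallel, one per 32-bit
	# lane.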
.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4

	sub $2,%r8d
	jnz .Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq 0x00(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x00(%rsp),%xmm2
	movdqa %xmm2,0x00(%rsp)
	paddd 0x10(%rsp),%xmm3
	movdqa %xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x20(%rsp),%xmm2
	movdqa %xmm2,0x20(%rsp)
	paddd 0x30(%rsp),%xmm3
	movdqa %xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq 0x10(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq 0x18(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm6
	paddd %xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq 0x20(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm8
	paddd %xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq 0x28(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm10
	paddd %xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq 0x30(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm12
	paddd %xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq 0x38(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm14
	paddd %xmm3,%xmm15

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x10(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x10(%rsp)
	movdqa 0x20(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpckldq %xmm5,%xmm4
	punpckhdq %xmm5,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	punpckldq %xmm7,%xmm6
	punpckhdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm9,%xmm0
	movdqa %xmm0,%xmm9
	movdqa %xmm10,%xmm0
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpckldq %xmm13,%xmm12
	punpckhdq %xmm13,%xmm0
	movdqa %xmm0,%xmm13
	movdqa %xmm14,%xmm0
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x20(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x20(%rsp)
	movdqa 0x10(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x10(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpcklqdq %xmm6,%xmm4
	punpckhqdq %xmm6,%xmm0
	movdqa %xmm0,%xmm6
	movdqa %xmm5,%xmm0
	punpcklqdq %xmm7,%xmm5
	punpckhqdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpcklqdq %xmm10,%xmm8
	punpckhqdq %xmm10,%xmm0
	movdqa %xmm0,%xmm10
	movdqa %xmm9,%xmm0
	punpcklqdq %xmm11,%xmm9
	punpckhqdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpcklqdq %xmm14,%xmm12
	punpckhqdq %xmm14,%xmm0
	movdqa %xmm0,%xmm14
	movdqa %xmm13,%xmm0
	punpcklqdq %xmm15,%xmm13
	punpckhqdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

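	# After the two interleaving passes, each XMM register or stack slot
	# holds one 16-byte row of one output block:
	#   block 0: 0x00(%rsp), %xmm4,  %xmm8,  %xmm12
	#   block 1: 0x20(%rsp), %xmm6,  %xmm10, %xmm14
	#   block 2: 0x10(%rsp), %xmm5,  %xmm9,  %xmm13
	#   block 3: 0x30(%rsp), %xmm7,  %xmm11, %xmm15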
	# xor with corresponding input, write to output
	movdqa 0x00(%rsp),%xmm0
	cmp $0x10,%rax
	jl .Lxorpart4
	movdqu 0x00(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x00(%rsi)

	movdqu %xmm4,%xmm0
	cmp $0x20,%rax
	jl .Lxorpart4
	movdqu 0x10(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x10(%rsi)

	movdqu %xmm8,%xmm0
	cmp $0x30,%rax
	jl .Lxorpart4
	movdqu 0x20(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x20(%rsi)

	movdqu %xmm12,%xmm0
	cmp $0x40,%rax
	jl .Lxorpart4
	movdqu 0x30(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x30(%rsi)

	movdqa 0x20(%rsp),%xmm0
	cmp $0x50,%rax
	jl .Lxorpart4
	movdqu 0x40(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x40(%rsi)

	movdqu %xmm6,%xmm0
	cmp $0x60,%rax
	jl .Lxorpart4
	movdqu 0x50(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x50(%rsi)

	movdqu %xmm10,%xmm0
	cmp $0x70,%rax
	jl .Lxorpart4
	movdqu 0x60(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x60(%rsi)

	movdqu %xmm14,%xmm0
	cmp $0x80,%rax
	jl .Lxorpart4
	movdqu 0x70(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x70(%rsi)

	movdqa 0x10(%rsp),%xmm0
	cmp $0x90,%rax
	jl .Lxorpart4
	movdqu 0x80(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x80(%rsi)

	movdqu %xmm5,%xmm0
	cmp $0xa0,%rax
	jl .Lxorpart4
	movdqu 0x90(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x90(%rsi)

	movdqu %xmm9,%xmm0
	cmp $0xb0,%rax
	jl .Lxorpart4
	movdqu 0xa0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xa0(%rsi)

	movdqu %xmm13,%xmm0
	cmp $0xc0,%rax
	jl .Lxorpart4
	movdqu 0xb0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xb0(%rsi)

	movdqa 0x30(%rsp),%xmm0
	cmp $0xd0,%rax
	jl .Lxorpart4
	movdqu 0xc0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xc0(%rsi)

	movdqu %xmm7,%xmm0
	cmp $0xe0,%rax
	jl .Lxorpart4
	movdqu 0xd0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xd0(%rsi)

	movdqu %xmm11,%xmm0
	cmp $0xf0,%rax
	jl .Lxorpart4
	movdqu 0xe0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xe0(%rsi)

	movdqu %xmm15,%xmm0
	cmp $0x100,%rax
	jl .Lxorpart4
	movdqu 0xf0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xf0(%rsi)

.Ldone4:
	lea -8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
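	# As above: %r9 = trailing byte count, %rax = offset of the partial
	# 16-byte chunk; the tail is bounced through the aligned scratch area
	# at (%rsp) and XORed with the keystream chunk left in %xmm0.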
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)