0069 $flavour = shift;
0070 $output = shift;
0071 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
0072
0073 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
0074 $kernel=0; $kernel=1 if (!$flavour && !$output);
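# With no flavour/output arguments we are generating code for the Linux
# kernel build ($kernel); otherwise output is piped through the perlasm
# translator x86_64-xlate.pl located below.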
0075
0076 if (!$kernel) {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
0078 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
0079 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
0080 die "can't locate x86_64-xlate.pl";
0081
0082 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
0083 *STDOUT=*OUT;
0084
0085 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
0086 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
0087 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
0088 }
0089
0090 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
0091 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
0092 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
0093 $avx += 1 if ($1==2.11 && $2>=8);
0094 }
0095
0096 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
0097 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
0098 $avx = ($1>=10) + ($1>=11);
0099 }
0100
0101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
0102 $avx = ($2>=3.0) + ($2>3.0);
0103 }
0104 } else {
0105 $avx = 4;
0106 }
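# $avx records the capability level of the detected assembler (AVX, AVX2,
# AVX-512 and beyond); the kernel build assumes a sufficiently recent
# assembler and pins it to the maximum.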
0107
0108 sub declare_function() {
0109 my ($name, $align, $nargs) = @_;
0110 if($kernel) {
0111 $code .= ".align $align\n";
0112 $code .= "SYM_FUNC_START($name)\n";
0113 $code .= ".L$name:\n";
0114 } else {
0115 $code .= ".globl $name\n";
0116 $code .= ".type $name,\@function,$nargs\n";
0117 $code .= ".align $align\n";
0118 $code .= "$name:\n";
0119 }
0120 }
0121
0122 sub end_function() {
0123 my ($name) = @_;
0124 if($kernel) {
0125 $code .= "SYM_FUNC_END($name)\n";
0126 } else {
0127 $code .= ".size $name,.-$name\n";
0128 }
0129 }
0130
0131 $code.=<<___ if $kernel;
#include <linux/linkage.h>
0133 ___
0134
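# Constants shared by the vector code paths: the 24- and 26-bit limb masks,
# the 2^24 "padbit" value, vpermd index tables for the AVX2/AVX-512 lane
# layouts, and the base 2^44 masks and shift counts used by the VPMADD52
# (IFMA) routines.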
0135 if ($avx) {
0136 $code.=<<___ if $kernel;
0137 .section .rodata
0138 ___
0139 $code.=<<___;
0140 .align 64
0141 .Lconst:
0142 .Lmask24:
0143 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
0144 .L129:
0145 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
0146 .Lmask26:
0147 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
0148 .Lpermd_avx2:
0149 .long 2,2,2,3,2,0,2,1
0150 .Lpermd_avx512:
0151 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
0152
0153 .L2_44_inp_permd:
0154 .long 0,1,1,2,2,3,7,7
0155 .L2_44_inp_shift:
0156 .quad 0,12,24,64
0157 .L2_44_mask:
0158 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
0159 .L2_44_shift_rgt:
0160 .quad 44,44,42,64
0161 .L2_44_shift_lft:
0162 .quad 8,8,10,64
0163
0164 .align 64
0165 .Lx_mask44:
0166 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
0167 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
0168 .Lx_mask42:
0169 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
0170 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
0171 ___
0172 }
0173 $code.=<<___ if (!$kernel);
0174 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
0175 .align 16
0176 ___
0177
0178 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
0179 my ($mac,$nonce)=($inp,$len);
0180 my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
0181 my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
0182
0183 sub poly1305_iteration {
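# One scalar iteration: multiply the accumulator in $h0:$h1:$h2 by r
# ($r0,$r1, with $s1 = $r1 + ($r1>>2) precomputed for the 2^130-5 folding)
# and partially reduce the result back into $h0:$h1:$h2. The caller is
# expected to have $r1 loaded in %rax on entry.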
0184
0185
0186 $code.=<<___;
0187 mulq $h0
0188 mov %rax,$d2
0189 mov $r0,%rax
0190 mov %rdx,$d3
0191
0192 mulq $h0
0193 mov %rax,$h0
0194 mov $r0,%rax
0195 mov %rdx,$d1
0196
0197 mulq $h1
0198 add %rax,$d2
0199 mov $s1,%rax
0200 adc %rdx,$d3
0201
0202 mulq $h1
0203 mov $h2,$h1
0204 add %rax,$h0
0205 adc %rdx,$d1
0206
0207 imulq $s1,$h1
0208 add $h1,$d2
0209 mov $d1,$h1
0210 adc \$0,$d3
0211
0212 imulq $r0,$h2
0213 add $d2,$h1
0214 mov \$-4,%rax
0215 adc $h2,$d3
0216
0217 and $d3,%rax
0218 mov $d3,$h2
0219 shr \$2,$d3
0220 and \$3,$h2
0221 add $d3,%rax
0222 add %rax,$h0
0223 adc \$0,$h1
0224 adc \$0,$h2
0225 ___
0226 }
0227
0228
0229
0230
0231
0232
0233
0234 $code.=<<___;
0235 .text
0236 ___
0237 $code.=<<___ if (!$kernel);
0238 .extern OPENSSL_ia32cap_P
0239
0240 .globl poly1305_init_x86_64
0241 .hidden poly1305_init_x86_64
0242 .globl poly1305_blocks_x86_64
0243 .hidden poly1305_blocks_x86_64
0244 .globl poly1305_emit_x86_64
0245 .hidden poly1305_emit_x86_64
0246 ___
0247 &declare_function("poly1305_init_x86_64", 32, 3);
0248 $code.=<<___;
0249 xor %eax,%eax
0250 mov %rax,0($ctx)
0251 mov %rax,8($ctx)
0252 mov %rax,16($ctx)
0253
0254 test $inp,$inp
0255 je .Lno_key
0256 ___
0257 $code.=<<___ if (!$kernel);
0258 lea poly1305_blocks_x86_64(%rip),%r10
0259 lea poly1305_emit_x86_64(%rip),%r11
0260 ___
0261 $code.=<<___ if (!$kernel && $avx);
0262 mov OPENSSL_ia32cap_P+4(%rip),%r9
0263 lea poly1305_blocks_avx(%rip),%rax
0264 lea poly1305_emit_avx(%rip),%rcx
0265 bt \$`60-32`,%r9
0266 cmovc %rax,%r10
0267 cmovc %rcx,%r11
0268 ___
0269 $code.=<<___ if (!$kernel && $avx>1);
0270 lea poly1305_blocks_avx2(%rip),%rax
0271 bt \$`5+32`,%r9
0272 cmovc %rax,%r10
0273 ___
0274 $code.=<<___ if (!$kernel && $avx>3);
0275 mov \$`(1<<31|1<<21|1<<16)`,%rax
0276 shr \$32,%r9
0277 and %rax,%r9
0278 cmp %rax,%r9
0279 je .Linit_base2_44
0280 ___
0281 $code.=<<___;
0282 mov \$0x0ffffffc0fffffff,%rax
0283 mov \$0x0ffffffc0ffffffc,%rcx
0284 and 0($inp),%rax
0285 and 8($inp),%rcx
0286 mov %rax,24($ctx)
0287 mov %rcx,32($ctx)
0288 ___
0289 $code.=<<___ if (!$kernel && $flavour !~ /elf32/);
0290 mov %r10,0(%rdx)
0291 mov %r11,8(%rdx)
0292 ___
0293 $code.=<<___ if (!$kernel && $flavour =~ /elf32/);
0294 mov %r10d,0(%rdx)
0295 mov %r11d,4(%rdx)
0296 ___
0297 $code.=<<___;
0298 mov \$1,%eax
0299 .Lno_key:
0300 RET
0301 ___
0302 &end_function("poly1305_init_x86_64");
0303
0304 &declare_function("poly1305_blocks_x86_64", 32, 4);
0305 $code.=<<___;
0306 .cfi_startproc
0307 .Lblocks:
0308 shr \$4,$len
0309 jz .Lno_data
0310
0311 push %rbx
0312 .cfi_push %rbx
0313 push %r12
0314 .cfi_push %r12
0315 push %r13
0316 .cfi_push %r13
0317 push %r14
0318 .cfi_push %r14
0319 push %r15
0320 .cfi_push %r15
0321 push $ctx
0322 .cfi_push $ctx
0323 .Lblocks_body:
0324
0325 mov $len,%r15
0326
0327 mov 24($ctx),$r0
0328 mov 32($ctx),$s1
0329
0330 mov 0($ctx),$h0
0331 mov 8($ctx),$h1
0332 mov 16($ctx),$h2
0333
0334 mov $s1,$r1
0335 shr \$2,$s1
0336 mov $r1,%rax
0337 add $r1,$s1
0338 jmp .Loop
0339
0340 .align 32
0341 .Loop:
0342 add 0($inp),$h0
0343 adc 8($inp),$h1
0344 lea 16($inp),$inp
0345 adc $padbit,$h2
0346 ___
0347
0348 &poly1305_iteration();
0349
0350 $code.=<<___;
0351 mov $r1,%rax
0352 dec %r15
0353 jnz .Loop
0354
0355 mov 0(%rsp),$ctx
0356 .cfi_restore $ctx
0357
0358 mov $h0,0($ctx)
0359 mov $h1,8($ctx)
0360 mov $h2,16($ctx)
0361
0362 mov 8(%rsp),%r15
0363 .cfi_restore %r15
0364 mov 16(%rsp),%r14
0365 .cfi_restore %r14
0366 mov 24(%rsp),%r13
0367 .cfi_restore %r13
0368 mov 32(%rsp),%r12
0369 .cfi_restore %r12
0370 mov 40(%rsp),%rbx
0371 .cfi_restore %rbx
0372 lea 48(%rsp),%rsp
0373 .cfi_adjust_cfa_offset -48
0374 .Lno_data:
0375 .Lblocks_epilogue:
0376 RET
0377 .cfi_endproc
0378 ___
0379 &end_function("poly1305_blocks_x86_64");
0380
0381 &declare_function("poly1305_emit_x86_64", 32, 3);
0382 $code.=<<___;
0383 .Lemit:
0384 mov 0($ctx),%r8
0385 mov 8($ctx),%r9
0386 mov 16($ctx),%r10
0387
0388 mov %r8,%rax
0389 add \$5,%r8
0390 mov %r9,%rcx
0391 adc \$0,%r9
0392 adc \$0,%r10
0393 shr \$2,%r10
0394 cmovnz %r8,%rax
0395 cmovnz %r9,%rcx
0396
0397 add 0($nonce),%rax
0398 adc 8($nonce),%rcx
0399 mov %rax,0($mac)
0400 mov %rcx,8($mac)
0401
0402 RET
0403 ___
0404 &end_function("poly1305_emit_x86_64");
0405 if ($avx) {
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415
0416
0417
0418
0419
0420 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
0421 map("%xmm$_",(0..15));
0422
0423 $code.=<<___;
0424 .type __poly1305_block,\@abi-omnipotent
0425 .align 32
0426 __poly1305_block:
0427 push $ctx
0428 ___
0429 &poly1305_iteration();
0430 $code.=<<___;
0431 pop $ctx
0432 RET
0433 .size __poly1305_block,.-__poly1305_block
0434
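# __poly1305_init_avx computes r^2, r^3 and r^4 with the scalar block
# routine and stores all four powers, split into 26-bit limbs together
# with their pre-multiplied-by-5 copies, into the context table used by
# the SIMD paths.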
0435 .type __poly1305_init_avx,\@abi-omnipotent
0436 .align 32
0437 __poly1305_init_avx:
0438 push %rbp
0439 mov %rsp,%rbp
0440 mov $r0,$h0
0441 mov $r1,$h1
0442 xor $h2,$h2
0443
0444 lea 48+64($ctx),$ctx
0445
0446 mov $r1,%rax
0447 call __poly1305_block
0448
0449 mov \$0x3ffffff,%eax
0450 mov \$0x3ffffff,%edx
0451 mov $h0,$d1
and $h0#d,%eax
mov $r0,$d2
and $r0#d,%edx
0455 mov %eax,`16*0+0-64`($ctx)
0456 shr \$26,$d1
0457 mov %edx,`16*0+4-64`($ctx)
0458 shr \$26,$d2
0459
0460 mov \$0x3ffffff,%eax
0461 mov \$0x3ffffff,%edx
and $d1#d,%eax
and $d2#d,%edx
0464 mov %eax,`16*1+0-64`($ctx)
0465 lea (%rax,%rax,4),%eax
0466 mov %edx,`16*1+4-64`($ctx)
0467 lea (%rdx,%rdx,4),%edx
0468 mov %eax,`16*2+0-64`($ctx)
0469 shr \$26,$d1
0470 mov %edx,`16*2+4-64`($ctx)
0471 shr \$26,$d2
0472
0473 mov $h1,%rax
0474 mov $r1,%rdx
0475 shl \$12,%rax
0476 shl \$12,%rdx
0477 or $d1,%rax
0478 or $d2,%rdx
0479 and \$0x3ffffff,%eax
0480 and \$0x3ffffff,%edx
0481 mov %eax,`16*3+0-64`($ctx)
0482 lea (%rax,%rax,4),%eax
0483 mov %edx,`16*3+4-64`($ctx)
0484 lea (%rdx,%rdx,4),%edx
0485 mov %eax,`16*4+0-64`($ctx)
0486 mov $h1,$d1
0487 mov %edx,`16*4+4-64`($ctx)
0488 mov $r1,$d2
0489
0490 mov \$0x3ffffff,%eax
0491 mov \$0x3ffffff,%edx
0492 shr \$14,$d1
0493 shr \$14,$d2
and $d1#d,%eax
and $d2#d,%edx
0496 mov %eax,`16*5+0-64`($ctx)
0497 lea (%rax,%rax,4),%eax
0498 mov %edx,`16*5+4-64`($ctx)
0499 lea (%rdx,%rdx,4),%edx
0500 mov %eax,`16*6+0-64`($ctx)
0501 shr \$26,$d1
0502 mov %edx,`16*6+4-64`($ctx)
0503 shr \$26,$d2
0504
0505 mov $h2,%rax
0506 shl \$24,%rax
0507 or %rax,$d1
mov $d1#d,`16*7+0-64`($ctx)
lea ($d1,$d1,4),$d1
mov $d2#d,`16*7+4-64`($ctx)
lea ($d2,$d2,4),$d2
mov $d1#d,`16*8+0-64`($ctx)
mov $d2#d,`16*8+4-64`($ctx)
0514
0515 mov $r1,%rax
0516 call __poly1305_block
0517
0518 mov \$0x3ffffff,%eax
0519 mov $h0,$d1
and $h0#d,%eax
0521 shr \$26,$d1
0522 mov %eax,`16*0+12-64`($ctx)
0523
0524 mov \$0x3ffffff,%edx
and $d1#d,%edx
0526 mov %edx,`16*1+12-64`($ctx)
0527 lea (%rdx,%rdx,4),%edx
0528 shr \$26,$d1
0529 mov %edx,`16*2+12-64`($ctx)
0530
0531 mov $h1,%rax
0532 shl \$12,%rax
0533 or $d1,%rax
0534 and \$0x3ffffff,%eax
0535 mov %eax,`16*3+12-64`($ctx)
0536 lea (%rax,%rax,4),%eax
0537 mov $h1,$d1
0538 mov %eax,`16*4+12-64`($ctx)
0539
0540 mov \$0x3ffffff,%edx
0541 shr \$14,$d1
and $d1#d,%edx
0543 mov %edx,`16*5+12-64`($ctx)
0544 lea (%rdx,%rdx,4),%edx
0545 shr \$26,$d1
0546 mov %edx,`16*6+12-64`($ctx)
0547
0548 mov $h2,%rax
0549 shl \$24,%rax
0550 or %rax,$d1
mov $d1#d,`16*7+12-64`($ctx)
lea ($d1,$d1,4),$d1
mov $d1#d,`16*8+12-64`($ctx)
0554
0555 mov $r1,%rax
0556 call __poly1305_block
0557
0558 mov \$0x3ffffff,%eax
0559 mov $h0,$d1
and $h0#d,%eax
0561 shr \$26,$d1
0562 mov %eax,`16*0+8-64`($ctx)
0563
0564 mov \$0x3ffffff,%edx
and $d1#d,%edx
0566 mov %edx,`16*1+8-64`($ctx)
0567 lea (%rdx,%rdx,4),%edx
0568 shr \$26,$d1
0569 mov %edx,`16*2+8-64`($ctx)
0570
0571 mov $h1,%rax
0572 shl \$12,%rax
0573 or $d1,%rax
0574 and \$0x3ffffff,%eax
0575 mov %eax,`16*3+8-64`($ctx)
0576 lea (%rax,%rax,4),%eax
0577 mov $h1,$d1
0578 mov %eax,`16*4+8-64`($ctx)
0579
0580 mov \$0x3ffffff,%edx
0581 shr \$14,$d1
and $d1#d,%edx
0583 mov %edx,`16*5+8-64`($ctx)
0584 lea (%rdx,%rdx,4),%edx
0585 shr \$26,$d1
0586 mov %edx,`16*6+8-64`($ctx)
0587
0588 mov $h2,%rax
0589 shl \$24,%rax
0590 or %rax,$d1
mov $d1#d,`16*7+8-64`($ctx)
lea ($d1,$d1,4),$d1
mov $d1#d,`16*8+8-64`($ctx)
0594
0595 lea -48-64($ctx),$ctx
0596 pop %rbp
0597 RET
0598 .size __poly1305_init_avx,.-__poly1305_init_avx
0599 ___
0600
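# poly1305_blocks_avx: base 2^26 path using 128-bit vectors and the table
# of powers of r prepared by __poly1305_init_avx (invoked lazily on the
# first vectorized call). Odd leading blocks, and hash values still kept
# in base 2^64, are handled by the scalar code first.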
0601 &declare_function("poly1305_blocks_avx", 32, 4);
0602 $code.=<<___;
0603 .cfi_startproc
0604 mov 20($ctx),%r8d
0605 cmp \$128,$len
0606 jae .Lblocks_avx
0607 test %r8d,%r8d
0608 jz .Lblocks
0609
0610 .Lblocks_avx:
0611 and \$-16,$len
0612 jz .Lno_data_avx
0613
0614 vzeroupper
0615
0616 test %r8d,%r8d
0617 jz .Lbase2_64_avx
0618
0619 test \$31,$len
0620 jz .Leven_avx
0621
0622 push %rbp
0623 .cfi_push %rbp
0624 mov %rsp,%rbp
0625 push %rbx
0626 .cfi_push %rbx
0627 push %r12
0628 .cfi_push %r12
0629 push %r13
0630 .cfi_push %r13
0631 push %r14
0632 .cfi_push %r14
0633 push %r15
0634 .cfi_push %r15
0635 .Lblocks_avx_body:
0636
0637 mov $len,%r15
0638
0639 mov 0($ctx),$d1
0640 mov 8($ctx),$d2
mov 16($ctx),$h2#d
0642
0643 mov 24($ctx),$r0
0644 mov 32($ctx),$s1
0645
0646
mov $d1#d,$h0#d
and \$`-1*(1<<31)`,$d1
mov $d2,$r1
mov $d2#d,$h1#d
and \$`-1*(1<<31)`,$d2
0652
0653 shr \$6,$d1
0654 shl \$52,$r1
0655 add $d1,$h0
0656 shr \$12,$h1
0657 shr \$18,$d2
0658 add $r1,$h0
0659 adc $d2,$h1
0660
0661 mov $h2,$d1
0662 shl \$40,$d1
0663 shr \$24,$h2
0664 add $d1,$h1
0665 adc \$0,$h2
0666
0667 mov \$-4,$d2
0668 mov $h2,$d1
0669 and $h2,$d2
0670 shr \$2,$d1
0671 and \$3,$h2
0672 add $d2,$d1
0673 add $d1,$h0
0674 adc \$0,$h1
0675 adc \$0,$h2
0676
0677 mov $s1,$r1
0678 mov $s1,%rax
0679 shr \$2,$s1
0680 add $r1,$s1
0681
0682 add 0($inp),$h0
0683 adc 8($inp),$h1
0684 lea 16($inp),$inp
0685 adc $padbit,$h2
0686
0687 call __poly1305_block
0688
0689 test $padbit,$padbit
0690 jz .Lstore_base2_64_avx
0691
0692
0693 mov $h0,%rax
0694 mov $h0,%rdx
0695 shr \$52,$h0
0696 mov $h1,$r0
0697 mov $h1,$r1
0698 shr \$26,%rdx
0699 and \$0x3ffffff,%rax
0700 shl \$12,$r0
0701 and \$0x3ffffff,%rdx
0702 shr \$14,$h1
0703 or $r0,$h0
0704 shl \$24,$h2
0705 and \$0x3ffffff,$h0
0706 shr \$40,$r1
0707 and \$0x3ffffff,$h1
0708 or $r1,$h2
0709
0710 sub \$16,%r15
0711 jz .Lstore_base2_26_avx
0712
vmovd %rax#d,$H0
vmovd %rdx#d,$H1
vmovd $h0#d,$H2
vmovd $h1#d,$H3
vmovd $h2#d,$H4
0718 jmp .Lproceed_avx
0719
0720 .align 32
0721 .Lstore_base2_64_avx:
0722 mov $h0,0($ctx)
0723 mov $h1,8($ctx)
0724 mov $h2,16($ctx)
0725 jmp .Ldone_avx
0726
0727 .align 16
0728 .Lstore_base2_26_avx:
mov %rax#d,0($ctx)
mov %rdx#d,4($ctx)
mov $h0#d,8($ctx)
mov $h1#d,12($ctx)
mov $h2#d,16($ctx)
0734 .align 16
0735 .Ldone_avx:
0736 pop %r15
0737 .cfi_restore %r15
0738 pop %r14
0739 .cfi_restore %r14
0740 pop %r13
0741 .cfi_restore %r13
0742 pop %r12
0743 .cfi_restore %r12
0744 pop %rbx
0745 .cfi_restore %rbx
0746 pop %rbp
0747 .cfi_restore %rbp
0748 .Lno_data_avx:
0749 .Lblocks_avx_epilogue:
0750 RET
0751 .cfi_endproc
0752
0753 .align 32
0754 .Lbase2_64_avx:
0755 .cfi_startproc
0756 push %rbp
0757 .cfi_push %rbp
0758 mov %rsp,%rbp
0759 push %rbx
0760 .cfi_push %rbx
0761 push %r12
0762 .cfi_push %r12
0763 push %r13
0764 .cfi_push %r13
0765 push %r14
0766 .cfi_push %r14
0767 push %r15
0768 .cfi_push %r15
0769 .Lbase2_64_avx_body:
0770
0771 mov $len,%r15
0772
0773 mov 24($ctx),$r0
0774 mov 32($ctx),$s1
0775
0776 mov 0($ctx),$h0
0777 mov 8($ctx),$h1
0778 mov 16($ctx),$h2
0779
0780 mov $s1,$r1
0781 mov $s1,%rax
0782 shr \$2,$s1
0783 add $r1,$s1
0784
0785 test \$31,$len
0786 jz .Linit_avx
0787
0788 add 0($inp),$h0
0789 adc 8($inp),$h1
0790 lea 16($inp),$inp
0791 adc $padbit,$h2
0792 sub \$16,%r15
0793
0794 call __poly1305_block
0795
0796 .Linit_avx:
0797
0798 mov $h0,%rax
0799 mov $h0,%rdx
0800 shr \$52,$h0
0801 mov $h1,$d1
0802 mov $h1,$d2
0803 shr \$26,%rdx
0804 and \$0x3ffffff,%rax
0805 shl \$12,$d1
0806 and \$0x3ffffff,%rdx
0807 shr \$14,$h1
0808 or $d1,$h0
0809 shl \$24,$h2
0810 and \$0x3ffffff,$h0
0811 shr \$40,$d2
0812 and \$0x3ffffff,$h1
0813 or $d2,$h2
0814
vmovd %rax#d,$H0
vmovd %rdx#d,$H1
vmovd $h0#d,$H2
vmovd $h1#d,$H3
vmovd $h2#d,$H4
0820 movl \$1,20($ctx)
0821
0822 call __poly1305_init_avx
0823
0824 .Lproceed_avx:
0825 mov %r15,$len
0826 pop %r15
0827 .cfi_restore %r15
0828 pop %r14
0829 .cfi_restore %r14
0830 pop %r13
0831 .cfi_restore %r13
0832 pop %r12
0833 .cfi_restore %r12
0834 pop %rbx
0835 .cfi_restore %rbx
0836 pop %rbp
0837 .cfi_restore %rbp
0838 .Lbase2_64_avx_epilogue:
0839 jmp .Ldo_avx
0840 .cfi_endproc
0841
0842 .align 32
0843 .Leven_avx:
0844 .cfi_startproc
0845 vmovd 4*0($ctx),$H0
0846 vmovd 4*1($ctx),$H1
0847 vmovd 4*2($ctx),$H2
0848 vmovd 4*3($ctx),$H3
0849 vmovd 4*4($ctx),$H4
0850
0851 .Ldo_avx:
0852 ___
0853 $code.=<<___ if (!$win64);
0854 lea 8(%rsp),%r10
0855 .cfi_def_cfa_register %r10
0856 and \$-32,%rsp
0857 sub \$-8,%rsp
0858 lea -0x58(%rsp),%r11
0859 sub \$0x178,%rsp
0860 ___
0861 $code.=<<___ if ($win64);
0862 lea -0xf8(%rsp),%r11
0863 sub \$0x218,%rsp
0864 vmovdqa %xmm6,0x50(%r11)
0865 vmovdqa %xmm7,0x60(%r11)
0866 vmovdqa %xmm8,0x70(%r11)
0867 vmovdqa %xmm9,0x80(%r11)
0868 vmovdqa %xmm10,0x90(%r11)
0869 vmovdqa %xmm11,0xa0(%r11)
0870 vmovdqa %xmm12,0xb0(%r11)
0871 vmovdqa %xmm13,0xc0(%r11)
0872 vmovdqa %xmm14,0xd0(%r11)
0873 vmovdqa %xmm15,0xe0(%r11)
0874 .Ldo_avx_body:
0875 ___
0876 $code.=<<___;
0877 sub \$64,$len
0878 lea -32($inp),%rax
0879 cmovc %rax,$inp
0880
0881 vmovdqu `16*3`($ctx),$D4
0882 lea `16*3+64`($ctx),$ctx
0883 lea .Lconst(%rip),%rcx
0884
0885
0886
0887 vmovdqu 16*2($inp),$T0
0888 vmovdqu 16*3($inp),$T1
0889 vmovdqa 64(%rcx),$MASK
0890
0891 vpsrldq \$6,$T0,$T2
0892 vpsrldq \$6,$T1,$T3
0893 vpunpckhqdq $T1,$T0,$T4
0894 vpunpcklqdq $T1,$T0,$T0
0895 vpunpcklqdq $T3,$T2,$T3
0896
0897 vpsrlq \$40,$T4,$T4
0898 vpsrlq \$26,$T0,$T1
0899 vpand $MASK,$T0,$T0
0900 vpsrlq \$4,$T3,$T2
0901 vpand $MASK,$T1,$T1
0902 vpsrlq \$30,$T3,$T3
0903 vpand $MASK,$T2,$T2
0904 vpand $MASK,$T3,$T3
0905 vpor 32(%rcx),$T4,$T4
0906
0907 jbe .Lskip_loop_avx
0908
0909
0910 vmovdqu `16*1-64`($ctx),$D1
0911 vmovdqu `16*2-64`($ctx),$D2
0912 vpshufd \$0xEE,$D4,$D3
0913 vpshufd \$0x44,$D4,$D0
0914 vmovdqa $D3,-0x90(%r11)
0915 vmovdqa $D0,0x00(%rsp)
0916 vpshufd \$0xEE,$D1,$D4
0917 vmovdqu `16*3-64`($ctx),$D0
0918 vpshufd \$0x44,$D1,$D1
0919 vmovdqa $D4,-0x80(%r11)
0920 vmovdqa $D1,0x10(%rsp)
0921 vpshufd \$0xEE,$D2,$D3
0922 vmovdqu `16*4-64`($ctx),$D1
0923 vpshufd \$0x44,$D2,$D2
0924 vmovdqa $D3,-0x70(%r11)
0925 vmovdqa $D2,0x20(%rsp)
0926 vpshufd \$0xEE,$D0,$D4
0927 vmovdqu `16*5-64`($ctx),$D2
0928 vpshufd \$0x44,$D0,$D0
0929 vmovdqa $D4,-0x60(%r11)
0930 vmovdqa $D0,0x30(%rsp)
0931 vpshufd \$0xEE,$D1,$D3
0932 vmovdqu `16*6-64`($ctx),$D0
0933 vpshufd \$0x44,$D1,$D1
0934 vmovdqa $D3,-0x50(%r11)
0935 vmovdqa $D1,0x40(%rsp)
0936 vpshufd \$0xEE,$D2,$D4
0937 vmovdqu `16*7-64`($ctx),$D1
0938 vpshufd \$0x44,$D2,$D2
0939 vmovdqa $D4,-0x40(%r11)
0940 vmovdqa $D2,0x50(%rsp)
0941 vpshufd \$0xEE,$D0,$D3
0942 vmovdqu `16*8-64`($ctx),$D2
0943 vpshufd \$0x44,$D0,$D0
0944 vmovdqa $D3,-0x30(%r11)
0945 vmovdqa $D0,0x60(%rsp)
0946 vpshufd \$0xEE,$D1,$D4
0947 vpshufd \$0x44,$D1,$D1
0948 vmovdqa $D4,-0x20(%r11)
0949 vmovdqa $D1,0x70(%rsp)
0950 vpshufd \$0xEE,$D2,$D3
0951 vmovdqa 0x00(%rsp),$D4
0952 vpshufd \$0x44,$D2,$D2
0953 vmovdqa $D3,-0x10(%r11)
0954 vmovdqa $D2,0x80(%rsp)
0955
0956 jmp .Loop_avx
0957
0958 .align 32
0959 .Loop_avx:
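# Each pass multiplies the accumulated lanes by the stashed powers of r
# while the next blocks are loaded and split into 26-bit limbs; partial
# products are collected in 64-bit lanes and a lazy carry chain at the
# bottom of the loop keeps the limbs bounded.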
0960
0961
0962
0963
0964
0965
0966
0967
0968
0969
0970
0971
0972
0973
0974
0975
0976
0977
0978
0979
0980 vpmuludq $T0,$D4,$D0
0981 vpmuludq $T1,$D4,$D1
0982 vmovdqa $H2,0x20(%r11)
0983 vpmuludq $T2,$D4,$D2
0984 vmovdqa 0x10(%rsp),$H2
0985 vpmuludq $T3,$D4,$D3
0986 vpmuludq $T4,$D4,$D4
0987
0988 vmovdqa $H0,0x00(%r11)
0989 vpmuludq 0x20(%rsp),$T4,$H0
0990 vmovdqa $H1,0x10(%r11)
0991 vpmuludq $T3,$H2,$H1
0992 vpaddq $H0,$D0,$D0
0993 vpaddq $H1,$D4,$D4
0994 vmovdqa $H3,0x30(%r11)
0995 vpmuludq $T2,$H2,$H0
0996 vpmuludq $T1,$H2,$H1
0997 vpaddq $H0,$D3,$D3
0998 vmovdqa 0x30(%rsp),$H3
0999 vpaddq $H1,$D2,$D2
1000 vmovdqa $H4,0x40(%r11)
1001 vpmuludq $T0,$H2,$H2
1002 vpmuludq $T2,$H3,$H0
1003 vpaddq $H2,$D1,$D1
1004
1005 vmovdqa 0x40(%rsp),$H4
1006 vpaddq $H0,$D4,$D4
1007 vpmuludq $T1,$H3,$H1
1008 vpmuludq $T0,$H3,$H3
1009 vpaddq $H1,$D3,$D3
1010 vmovdqa 0x50(%rsp),$H2
1011 vpaddq $H3,$D2,$D2
1012 vpmuludq $T4,$H4,$H0
1013 vpmuludq $T3,$H4,$H4
1014 vpaddq $H0,$D1,$D1
1015 vmovdqa 0x60(%rsp),$H3
1016 vpaddq $H4,$D0,$D0
1017
1018 vmovdqa 0x80(%rsp),$H4
1019 vpmuludq $T1,$H2,$H1
1020 vpmuludq $T0,$H2,$H2
1021 vpaddq $H1,$D4,$D4
1022 vpaddq $H2,$D3,$D3
1023 vpmuludq $T4,$H3,$H0
1024 vpmuludq $T3,$H3,$H1
1025 vpaddq $H0,$D2,$D2
1026 vmovdqu 16*0($inp),$H0
1027 vpaddq $H1,$D1,$D1
1028 vpmuludq $T2,$H3,$H3
1029 vpmuludq $T2,$H4,$T2
1030 vpaddq $H3,$D0,$D0
1031
1032 vmovdqu 16*1($inp),$H1
1033 vpaddq $T2,$D1,$D1
1034 vpmuludq $T3,$H4,$T3
1035 vpmuludq $T4,$H4,$T4
1036 vpsrldq \$6,$H0,$H2
1037 vpaddq $T3,$D2,$D2
1038 vpaddq $T4,$D3,$D3
1039 vpsrldq \$6,$H1,$H3
1040 vpmuludq 0x70(%rsp),$T0,$T4
1041 vpmuludq $T1,$H4,$T0
1042 vpunpckhqdq $H1,$H0,$H4
1043 vpaddq $T4,$D4,$D4
1044 vmovdqa -0x90(%r11),$T4
1045 vpaddq $T0,$D0,$D0
1046
1047 vpunpcklqdq $H1,$H0,$H0
1048 vpunpcklqdq $H3,$H2,$H3
1049
1050
1051 vpsrldq \$`40/8`,$H4,$H4
1052 vpsrlq \$26,$H0,$H1
1053 vpand $MASK,$H0,$H0
1054 vpsrlq \$4,$H3,$H2
1055 vpand $MASK,$H1,$H1
1056 vpand 0(%rcx),$H4,$H4
1057 vpsrlq \$30,$H3,$H3
1058 vpand $MASK,$H2,$H2
1059 vpand $MASK,$H3,$H3
1060 vpor 32(%rcx),$H4,$H4
1061
1062 vpaddq 0x00(%r11),$H0,$H0
1063 vpaddq 0x10(%r11),$H1,$H1
1064 vpaddq 0x20(%r11),$H2,$H2
1065 vpaddq 0x30(%r11),$H3,$H3
1066 vpaddq 0x40(%r11),$H4,$H4
1067
1068 lea 16*2($inp),%rax
1069 lea 16*4($inp),$inp
1070 sub \$64,$len
1071 cmovc %rax,$inp
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082 vpmuludq $H0,$T4,$T0
1083 vpmuludq $H1,$T4,$T1
1084 vpaddq $T0,$D0,$D0
1085 vpaddq $T1,$D1,$D1
1086 vmovdqa -0x80(%r11),$T2
1087 vpmuludq $H2,$T4,$T0
1088 vpmuludq $H3,$T4,$T1
1089 vpaddq $T0,$D2,$D2
1090 vpaddq $T1,$D3,$D3
1091 vpmuludq $H4,$T4,$T4
1092 vpmuludq -0x70(%r11),$H4,$T0
1093 vpaddq $T4,$D4,$D4
1094
1095 vpaddq $T0,$D0,$D0
1096 vpmuludq $H2,$T2,$T1
1097 vpmuludq $H3,$T2,$T0
1098 vpaddq $T1,$D3,$D3
1099 vmovdqa -0x60(%r11),$T3
1100 vpaddq $T0,$D4,$D4
1101 vpmuludq $H1,$T2,$T1
1102 vpmuludq $H0,$T2,$T2
1103 vpaddq $T1,$D2,$D2
1104 vpaddq $T2,$D1,$D1
1105
1106 vmovdqa -0x50(%r11),$T4
1107 vpmuludq $H2,$T3,$T0
1108 vpmuludq $H1,$T3,$T1
1109 vpaddq $T0,$D4,$D4
1110 vpaddq $T1,$D3,$D3
1111 vmovdqa -0x40(%r11),$T2
1112 vpmuludq $H0,$T3,$T3
1113 vpmuludq $H4,$T4,$T0
1114 vpaddq $T3,$D2,$D2
1115 vpaddq $T0,$D1,$D1
1116 vmovdqa -0x30(%r11),$T3
1117 vpmuludq $H3,$T4,$T4
1118 vpmuludq $H1,$T2,$T1
1119 vpaddq $T4,$D0,$D0
1120
1121 vmovdqa -0x10(%r11),$T4
1122 vpaddq $T1,$D4,$D4
1123 vpmuludq $H0,$T2,$T2
1124 vpmuludq $H4,$T3,$T0
1125 vpaddq $T2,$D3,$D3
1126 vpaddq $T0,$D2,$D2
1127 vmovdqu 16*2($inp),$T0
1128 vpmuludq $H3,$T3,$T2
1129 vpmuludq $H2,$T3,$T3
1130 vpaddq $T2,$D1,$D1
1131 vmovdqu 16*3($inp),$T1
1132 vpaddq $T3,$D0,$D0
1133
1134 vpmuludq $H2,$T4,$H2
1135 vpmuludq $H3,$T4,$H3
1136 vpsrldq \$6,$T0,$T2
1137 vpaddq $H2,$D1,$D1
1138 vpmuludq $H4,$T4,$H4
1139 vpsrldq \$6,$T1,$T3
1140 vpaddq $H3,$D2,$H2
1141 vpaddq $H4,$D3,$H3
1142 vpmuludq -0x20(%r11),$H0,$H4
1143 vpmuludq $H1,$T4,$H0
1144 vpunpckhqdq $T1,$T0,$T4
1145 vpaddq $H4,$D4,$H4
1146 vpaddq $H0,$D0,$H0
1147
1148 vpunpcklqdq $T1,$T0,$T0
1149 vpunpcklqdq $T3,$T2,$T3
1150
1151
1152 vpsrldq \$`40/8`,$T4,$T4
1153 vpsrlq \$26,$T0,$T1
1154 vmovdqa 0x00(%rsp),$D4
1155 vpand $MASK,$T0,$T0
1156 vpsrlq \$4,$T3,$T2
1157 vpand $MASK,$T1,$T1
1158 vpand 0(%rcx),$T4,$T4
1159 vpsrlq \$30,$T3,$T3
1160 vpand $MASK,$T2,$T2
1161 vpand $MASK,$T3,$T3
1162 vpor 32(%rcx),$T4,$T4
1163
1164
1165
1166
1167
1168 vpsrlq \$26,$H3,$D3
1169 vpand $MASK,$H3,$H3
1170 vpaddq $D3,$H4,$H4
1171
1172 vpsrlq \$26,$H0,$D0
1173 vpand $MASK,$H0,$H0
1174 vpaddq $D0,$D1,$H1
1175
1176 vpsrlq \$26,$H4,$D0
1177 vpand $MASK,$H4,$H4
1178
1179 vpsrlq \$26,$H1,$D1
1180 vpand $MASK,$H1,$H1
1181 vpaddq $D1,$H2,$H2
1182
1183 vpaddq $D0,$H0,$H0
1184 vpsllq \$2,$D0,$D0
1185 vpaddq $D0,$H0,$H0
1186
1187 vpsrlq \$26,$H2,$D2
1188 vpand $MASK,$H2,$H2
1189 vpaddq $D2,$H3,$H3
1190
1191 vpsrlq \$26,$H0,$D0
1192 vpand $MASK,$H0,$H0
1193 vpaddq $D0,$H1,$H1
1194
1195 vpsrlq \$26,$H3,$D3
1196 vpand $MASK,$H3,$H3
1197 vpaddq $D3,$H4,$H4
1198
1199 ja .Loop_avx
1200
1201 .Lskip_loop_avx:
1202
1203
1204
1205 vpshufd \$0x10,$D4,$D4
1206 add \$32,$len
1207 jnz .Long_tail_avx
1208
1209 vpaddq $H2,$T2,$T2
1210 vpaddq $H0,$T0,$T0
1211 vpaddq $H1,$T1,$T1
1212 vpaddq $H3,$T3,$T3
1213 vpaddq $H4,$T4,$T4
1214
1215 .Long_tail_avx:
1216 vmovdqa $H2,0x20(%r11)
1217 vmovdqa $H0,0x00(%r11)
1218 vmovdqa $H1,0x10(%r11)
1219 vmovdqa $H3,0x30(%r11)
1220 vmovdqa $H4,0x40(%r11)
1221
1222
1223
1224
1225
1226
1227
1228 vpmuludq $T2,$D4,$D2
1229 vpmuludq $T0,$D4,$D0
1230 vpshufd \$0x10,`16*1-64`($ctx),$H2
1231 vpmuludq $T1,$D4,$D1
1232 vpmuludq $T3,$D4,$D3
1233 vpmuludq $T4,$D4,$D4
1234
1235 vpmuludq $T3,$H2,$H0
1236 vpaddq $H0,$D4,$D4
1237 vpshufd \$0x10,`16*2-64`($ctx),$H3
1238 vpmuludq $T2,$H2,$H1
1239 vpaddq $H1,$D3,$D3
1240 vpshufd \$0x10,`16*3-64`($ctx),$H4
1241 vpmuludq $T1,$H2,$H0
1242 vpaddq $H0,$D2,$D2
1243 vpmuludq $T0,$H2,$H2
1244 vpaddq $H2,$D1,$D1
1245 vpmuludq $T4,$H3,$H3
1246 vpaddq $H3,$D0,$D0
1247
1248 vpshufd \$0x10,`16*4-64`($ctx),$H2
1249 vpmuludq $T2,$H4,$H1
1250 vpaddq $H1,$D4,$D4
1251 vpmuludq $T1,$H4,$H0
1252 vpaddq $H0,$D3,$D3
1253 vpshufd \$0x10,`16*5-64`($ctx),$H3
1254 vpmuludq $T0,$H4,$H4
1255 vpaddq $H4,$D2,$D2
1256 vpmuludq $T4,$H2,$H1
1257 vpaddq $H1,$D1,$D1
1258 vpshufd \$0x10,`16*6-64`($ctx),$H4
1259 vpmuludq $T3,$H2,$H2
1260 vpaddq $H2,$D0,$D0
1261
1262 vpmuludq $T1,$H3,$H0
1263 vpaddq $H0,$D4,$D4
1264 vpmuludq $T0,$H3,$H3
1265 vpaddq $H3,$D3,$D3
1266 vpshufd \$0x10,`16*7-64`($ctx),$H2
1267 vpmuludq $T4,$H4,$H1
1268 vpaddq $H1,$D2,$D2
1269 vpshufd \$0x10,`16*8-64`($ctx),$H3
1270 vpmuludq $T3,$H4,$H0
1271 vpaddq $H0,$D1,$D1
1272 vpmuludq $T2,$H4,$H4
1273 vpaddq $H4,$D0,$D0
1274
1275 vpmuludq $T0,$H2,$H2
1276 vpaddq $H2,$D4,$D4
1277 vpmuludq $T4,$H3,$H1
1278 vpaddq $H1,$D3,$D3
1279 vpmuludq $T3,$H3,$H0
1280 vpaddq $H0,$D2,$D2
1281 vpmuludq $T2,$H3,$H1
1282 vpaddq $H1,$D1,$D1
1283 vpmuludq $T1,$H3,$H3
1284 vpaddq $H3,$D0,$D0
1285
1286 jz .Lshort_tail_avx
1287
1288 vmovdqu 16*0($inp),$H0
1289 vmovdqu 16*1($inp),$H1
1290
1291 vpsrldq \$6,$H0,$H2
1292 vpsrldq \$6,$H1,$H3
1293 vpunpckhqdq $H1,$H0,$H4
1294 vpunpcklqdq $H1,$H0,$H0
1295 vpunpcklqdq $H3,$H2,$H3
1296
1297 vpsrlq \$40,$H4,$H4
1298 vpsrlq \$26,$H0,$H1
1299 vpand $MASK,$H0,$H0
1300 vpsrlq \$4,$H3,$H2
1301 vpand $MASK,$H1,$H1
1302 vpsrlq \$30,$H3,$H3
1303 vpand $MASK,$H2,$H2
1304 vpand $MASK,$H3,$H3
1305 vpor 32(%rcx),$H4,$H4
1306
1307 vpshufd \$0x32,`16*0-64`($ctx),$T4
1308 vpaddq 0x00(%r11),$H0,$H0
1309 vpaddq 0x10(%r11),$H1,$H1
1310 vpaddq 0x20(%r11),$H2,$H2
1311 vpaddq 0x30(%r11),$H3,$H3
1312 vpaddq 0x40(%r11),$H4,$H4
1313
1314
1315
1316
1317 vpmuludq $H0,$T4,$T0
1318 vpaddq $T0,$D0,$D0
1319 vpmuludq $H1,$T4,$T1
1320 vpaddq $T1,$D1,$D1
1321 vpmuludq $H2,$T4,$T0
1322 vpaddq $T0,$D2,$D2
1323 vpshufd \$0x32,`16*1-64`($ctx),$T2
1324 vpmuludq $H3,$T4,$T1
1325 vpaddq $T1,$D3,$D3
1326 vpmuludq $H4,$T4,$T4
1327 vpaddq $T4,$D4,$D4
1328
1329 vpmuludq $H3,$T2,$T0
1330 vpaddq $T0,$D4,$D4
1331 vpshufd \$0x32,`16*2-64`($ctx),$T3
1332 vpmuludq $H2,$T2,$T1
1333 vpaddq $T1,$D3,$D3
1334 vpshufd \$0x32,`16*3-64`($ctx),$T4
1335 vpmuludq $H1,$T2,$T0
1336 vpaddq $T0,$D2,$D2
1337 vpmuludq $H0,$T2,$T2
1338 vpaddq $T2,$D1,$D1
1339 vpmuludq $H4,$T3,$T3
1340 vpaddq $T3,$D0,$D0
1341
1342 vpshufd \$0x32,`16*4-64`($ctx),$T2
1343 vpmuludq $H2,$T4,$T1
1344 vpaddq $T1,$D4,$D4
1345 vpmuludq $H1,$T4,$T0
1346 vpaddq $T0,$D3,$D3
1347 vpshufd \$0x32,`16*5-64`($ctx),$T3
1348 vpmuludq $H0,$T4,$T4
1349 vpaddq $T4,$D2,$D2
1350 vpmuludq $H4,$T2,$T1
1351 vpaddq $T1,$D1,$D1
1352 vpshufd \$0x32,`16*6-64`($ctx),$T4
1353 vpmuludq $H3,$T2,$T2
1354 vpaddq $T2,$D0,$D0
1355
1356 vpmuludq $H1,$T3,$T0
1357 vpaddq $T0,$D4,$D4
1358 vpmuludq $H0,$T3,$T3
1359 vpaddq $T3,$D3,$D3
1360 vpshufd \$0x32,`16*7-64`($ctx),$T2
1361 vpmuludq $H4,$T4,$T1
1362 vpaddq $T1,$D2,$D2
1363 vpshufd \$0x32,`16*8-64`($ctx),$T3
1364 vpmuludq $H3,$T4,$T0
1365 vpaddq $T0,$D1,$D1
1366 vpmuludq $H2,$T4,$T4
1367 vpaddq $T4,$D0,$D0
1368
1369 vpmuludq $H0,$T2,$T2
1370 vpaddq $T2,$D4,$D4
1371 vpmuludq $H4,$T3,$T1
1372 vpaddq $T1,$D3,$D3
1373 vpmuludq $H3,$T3,$T0
1374 vpaddq $T0,$D2,$D2
1375 vpmuludq $H2,$T3,$T1
1376 vpaddq $T1,$D1,$D1
1377 vpmuludq $H1,$T3,$T3
1378 vpaddq $T3,$D0,$D0
1379
1380 .Lshort_tail_avx:
1381
1382
1383
1384 vpsrldq \$8,$D4,$T4
1385 vpsrldq \$8,$D3,$T3
1386 vpsrldq \$8,$D1,$T1
1387 vpsrldq \$8,$D0,$T0
1388 vpsrldq \$8,$D2,$T2
1389 vpaddq $T3,$D3,$D3
1390 vpaddq $T4,$D4,$D4
1391 vpaddq $T0,$D0,$D0
1392 vpaddq $T1,$D1,$D1
1393 vpaddq $T2,$D2,$D2
1394
1395
1396
1397
1398 vpsrlq \$26,$D3,$H3
1399 vpand $MASK,$D3,$D3
1400 vpaddq $H3,$D4,$D4
1401
1402 vpsrlq \$26,$D0,$H0
1403 vpand $MASK,$D0,$D0
1404 vpaddq $H0,$D1,$D1
1405
1406 vpsrlq \$26,$D4,$H4
1407 vpand $MASK,$D4,$D4
1408
1409 vpsrlq \$26,$D1,$H1
1410 vpand $MASK,$D1,$D1
1411 vpaddq $H1,$D2,$D2
1412
1413 vpaddq $H4,$D0,$D0
1414 vpsllq \$2,$H4,$H4
1415 vpaddq $H4,$D0,$D0
1416
1417 vpsrlq \$26,$D2,$H2
1418 vpand $MASK,$D2,$D2
1419 vpaddq $H2,$D3,$D3
1420
1421 vpsrlq \$26,$D0,$H0
1422 vpand $MASK,$D0,$D0
1423 vpaddq $H0,$D1,$D1
1424
1425 vpsrlq \$26,$D3,$H3
1426 vpand $MASK,$D3,$D3
1427 vpaddq $H3,$D4,$D4
1428
1429 vmovd $D0,`4*0-48-64`($ctx)
1430 vmovd $D1,`4*1-48-64`($ctx)
1431 vmovd $D2,`4*2-48-64`($ctx)
1432 vmovd $D3,`4*3-48-64`($ctx)
1433 vmovd $D4,`4*4-48-64`($ctx)
1434 ___
1435 $code.=<<___ if ($win64);
1436 vmovdqa 0x50(%r11),%xmm6
1437 vmovdqa 0x60(%r11),%xmm7
1438 vmovdqa 0x70(%r11),%xmm8
1439 vmovdqa 0x80(%r11),%xmm9
1440 vmovdqa 0x90(%r11),%xmm10
1441 vmovdqa 0xa0(%r11),%xmm11
1442 vmovdqa 0xb0(%r11),%xmm12
1443 vmovdqa 0xc0(%r11),%xmm13
1444 vmovdqa 0xd0(%r11),%xmm14
1445 vmovdqa 0xe0(%r11),%xmm15
1446 lea 0xf8(%r11),%rsp
1447 .Ldo_avx_epilogue:
1448 ___
1449 $code.=<<___ if (!$win64);
1450 lea -8(%r10),%rsp
1451 .cfi_def_cfa_register %rsp
1452 ___
1453 $code.=<<___;
1454 vzeroupper
1455 RET
1456 .cfi_endproc
1457 ___
1458 &end_function("poly1305_blocks_avx");
1459
1460 &declare_function("poly1305_emit_avx", 32, 3);
1461 $code.=<<___;
1462 cmpl \$0,20($ctx)
1463 je .Lemit
1464
1465 mov 0($ctx),%eax
1466 mov 4($ctx),%ecx
1467 mov 8($ctx),%r8d
1468 mov 12($ctx),%r11d
1469 mov 16($ctx),%r10d
1470
1471 shl \$26,%rcx
1472 mov %r8,%r9
1473 shl \$52,%r8
1474 add %rcx,%rax
1475 shr \$12,%r9
1476 add %rax,%r8
1477 adc \$0,%r9
1478
1479 shl \$14,%r11
1480 mov %r10,%rax
1481 shr \$24,%r10
1482 add %r11,%r9
1483 shl \$40,%rax
1484 add %rax,%r9
1485 adc \$0,%r10
1486
1487 mov %r10,%rax
1488 mov %r10,%rcx
1489 and \$3,%r10
1490 shr \$2,%rax
1491 and \$-4,%rcx
1492 add %rcx,%rax
1493 add %rax,%r8
1494 adc \$0,%r9
1495 adc \$0,%r10
1496
1497 mov %r8,%rax
1498 add \$5,%r8
1499 mov %r9,%rcx
1500 adc \$0,%r9
1501 adc \$0,%r10
1502 shr \$2,%r10
1503 cmovnz %r8,%rax
1504 cmovnz %r9,%rcx
1505
1506 add 0($nonce),%rax
1507 adc 8($nonce),%rcx
1508 mov %rax,0($mac)
1509 mov %rcx,8($mac)
1510
1511 RET
1512 ___
1513 &end_function("poly1305_emit_avx");
1514
1515 if ($avx>1) {
1516
1517 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1518 map("%ymm$_",(0..15));
1519 my $S4=$MASK;
1520
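# poly1305_blocks_avxN() emits the code shared by poly1305_blocks_avx2 and,
# when $avx512 is set, poly1305_blocks_avx512; $suffix keeps the local
# labels of the two instantiations distinct.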
1521 sub poly1305_blocks_avxN {
1522 my ($avx512) = @_;
1523 my $suffix = $avx512 ? "_avx512" : "";
1524 $code.=<<___;
1525 .cfi_startproc
1526 mov 20($ctx),%r8d
1527 cmp \$128,$len
1528 jae .Lblocks_avx2$suffix
1529 test %r8d,%r8d
1530 jz .Lblocks
1531
1532 .Lblocks_avx2$suffix:
1533 and \$-16,$len
1534 jz .Lno_data_avx2$suffix
1535
1536 vzeroupper
1537
1538 test %r8d,%r8d
1539 jz .Lbase2_64_avx2$suffix
1540
1541 test \$63,$len
1542 jz .Leven_avx2$suffix
1543
1544 push %rbp
1545 .cfi_push %rbp
1546 mov %rsp,%rbp
1547 push %rbx
1548 .cfi_push %rbx
1549 push %r12
1550 .cfi_push %r12
1551 push %r13
1552 .cfi_push %r13
1553 push %r14
1554 .cfi_push %r14
1555 push %r15
1556 .cfi_push %r15
1557 .Lblocks_avx2_body$suffix:
1558
1559 mov $len,%r15
1560
1561 mov 0($ctx),$d1
1562 mov 8($ctx),$d2
mov 16($ctx),$h2#d
1564
1565 mov 24($ctx),$r0
1566 mov 32($ctx),$s1
1567
1568
mov $d1#d,$h0#d
and \$`-1*(1<<31)`,$d1
mov $d2,$r1
mov $d2#d,$h1#d
1573 and \$`-1*(1<<31)`,$d2
1574
1575 shr \$6,$d1
1576 shl \$52,$r1
1577 add $d1,$h0
1578 shr \$12,$h1
1579 shr \$18,$d2
1580 add $r1,$h0
1581 adc $d2,$h1
1582
1583 mov $h2,$d1
1584 shl \$40,$d1
1585 shr \$24,$h2
1586 add $d1,$h1
1587 adc \$0,$h2
1588
1589 mov \$-4,$d2
1590 mov $h2,$d1
1591 and $h2,$d2
1592 shr \$2,$d1
1593 and \$3,$h2
1594 add $d2,$d1
1595 add $d1,$h0
1596 adc \$0,$h1
1597 adc \$0,$h2
1598
1599 mov $s1,$r1
1600 mov $s1,%rax
1601 shr \$2,$s1
1602 add $r1,$s1
1603
1604 .Lbase2_26_pre_avx2$suffix:
1605 add 0($inp),$h0
1606 adc 8($inp),$h1
1607 lea 16($inp),$inp
1608 adc $padbit,$h2
1609 sub \$16,%r15
1610
1611 call __poly1305_block
1612 mov $r1,%rax
1613
1614 test \$63,%r15
1615 jnz .Lbase2_26_pre_avx2$suffix
1616
1617 test $padbit,$padbit
1618 jz .Lstore_base2_64_avx2$suffix
1619
1620
1621 mov $h0,%rax
1622 mov $h0,%rdx
1623 shr \$52,$h0
1624 mov $h1,$r0
1625 mov $h1,$r1
1626 shr \$26,%rdx
1627 and \$0x3ffffff,%rax
1628 shl \$12,$r0
1629 and \$0x3ffffff,%rdx
1630 shr \$14,$h1
1631 or $r0,$h0
1632 shl \$24,$h2
1633 and \$0x3ffffff,$h0
1634 shr \$40,$r1
1635 and \$0x3ffffff,$h1
1636 or $r1,$h2
1637
1638 test %r15,%r15
1639 jz .Lstore_base2_26_avx2$suffix
1640
vmovd %rax#d,%x#$H0
vmovd %rdx#d,%x#$H1
vmovd $h0#d,%x#$H2
vmovd $h1#d,%x#$H3
vmovd $h2#d,%x#$H4
1646 jmp .Lproceed_avx2$suffix
1647
1648 .align 32
1649 .Lstore_base2_64_avx2$suffix:
1650 mov $h0,0($ctx)
1651 mov $h1,8($ctx)
1652 mov $h2,16($ctx)
1653 jmp .Ldone_avx2$suffix
1654
1655 .align 16
1656 .Lstore_base2_26_avx2$suffix:
mov %rax#d,0($ctx)
mov %rdx#d,4($ctx)
mov $h0#d,8($ctx)
mov $h1#d,12($ctx)
mov $h2#d,16($ctx)
1662 .align 16
1663 .Ldone_avx2$suffix:
1664 pop %r15
1665 .cfi_restore %r15
1666 pop %r14
1667 .cfi_restore %r14
1668 pop %r13
1669 .cfi_restore %r13
1670 pop %r12
1671 .cfi_restore %r12
1672 pop %rbx
1673 .cfi_restore %rbx
1674 pop %rbp
1675 .cfi_restore %rbp
1676 .Lno_data_avx2$suffix:
1677 .Lblocks_avx2_epilogue$suffix:
1678 RET
1679 .cfi_endproc
1680
1681 .align 32
1682 .Lbase2_64_avx2$suffix:
1683 .cfi_startproc
1684 push %rbp
1685 .cfi_push %rbp
1686 mov %rsp,%rbp
1687 push %rbx
1688 .cfi_push %rbx
1689 push %r12
1690 .cfi_push %r12
1691 push %r13
1692 .cfi_push %r13
1693 push %r14
1694 .cfi_push %r14
1695 push %r15
1696 .cfi_push %r15
1697 .Lbase2_64_avx2_body$suffix:
1698
1699 mov $len,%r15
1700
1701 mov 24($ctx),$r0
1702 mov 32($ctx),$s1
1703
1704 mov 0($ctx),$h0
1705 mov 8($ctx),$h1
1706 mov 16($ctx),$h2
1707
1708 mov $s1,$r1
1709 mov $s1,%rax
1710 shr \$2,$s1
1711 add $r1,$s1
1712
1713 test \$63,$len
1714 jz .Linit_avx2$suffix
1715
1716 .Lbase2_64_pre_avx2$suffix:
1717 add 0($inp),$h0
1718 adc 8($inp),$h1
1719 lea 16($inp),$inp
1720 adc $padbit,$h2
1721 sub \$16,%r15
1722
1723 call __poly1305_block
1724 mov $r1,%rax
1725
1726 test \$63,%r15
1727 jnz .Lbase2_64_pre_avx2$suffix
1728
1729 .Linit_avx2$suffix:
1730
1731 mov $h0,%rax
1732 mov $h0,%rdx
1733 shr \$52,$h0
1734 mov $h1,$d1
1735 mov $h1,$d2
1736 shr \$26,%rdx
1737 and \$0x3ffffff,%rax
1738 shl \$12,$d1
1739 and \$0x3ffffff,%rdx
1740 shr \$14,$h1
1741 or $d1,$h0
1742 shl \$24,$h2
1743 and \$0x3ffffff,$h0
1744 shr \$40,$d2
1745 and \$0x3ffffff,$h1
1746 or $d2,$h2
1747
vmovd %rax#d,%x#$H0
vmovd %rdx#d,%x#$H1
vmovd $h0#d,%x#$H2
vmovd $h1#d,%x#$H3
vmovd $h2#d,%x#$H4
1753 movl \$1,20($ctx)
1754
1755 call __poly1305_init_avx
1756
1757 .Lproceed_avx2$suffix:
1758 mov %r15,$len
1759 ___
1760 $code.=<<___ if (!$kernel);
1761 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1762 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1763 ___
1764 $code.=<<___;
1765 pop %r15
1766 .cfi_restore %r15
1767 pop %r14
1768 .cfi_restore %r14
1769 pop %r13
1770 .cfi_restore %r13
1771 pop %r12
1772 .cfi_restore %r12
1773 pop %rbx
1774 .cfi_restore %rbx
1775 pop %rbp
1776 .cfi_restore %rbp
1777 .Lbase2_64_avx2_epilogue$suffix:
1778 jmp .Ldo_avx2$suffix
1779 .cfi_endproc
1780
1781 .align 32
1782 .Leven_avx2$suffix:
1783 .cfi_startproc
1784 ___
1785 $code.=<<___ if (!$kernel);
1786 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1787 ___
1788 $code.=<<___;
vmovd 4*0($ctx),%x#$H0
vmovd 4*1($ctx),%x#$H1
vmovd 4*2($ctx),%x#$H2
vmovd 4*3($ctx),%x#$H3
vmovd 4*4($ctx),%x#$H4
1794
1795 .Ldo_avx2$suffix:
1796 ___
1797 $code.=<<___ if (!$kernel && $avx>2);
1798 cmp \$512,$len
1799 jb .Lskip_avx512
1800 and %r11d,%r9d
1801 test \$`1<<16`,%r9d
1802 jnz .Lblocks_avx512
1803 .Lskip_avx512$suffix:
1804 ___
1805 $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1806 cmp \$512,$len
1807 jae .Lblocks_avx512
1808 ___
1809 $code.=<<___ if (!$win64);
1810 lea 8(%rsp),%r10
1811 .cfi_def_cfa_register %r10
1812 sub \$0x128,%rsp
1813 ___
1814 $code.=<<___ if ($win64);
1815 lea 8(%rsp),%r10
1816 sub \$0x1c8,%rsp
1817 vmovdqa %xmm6,-0xb0(%r10)
1818 vmovdqa %xmm7,-0xa0(%r10)
1819 vmovdqa %xmm8,-0x90(%r10)
1820 vmovdqa %xmm9,-0x80(%r10)
1821 vmovdqa %xmm10,-0x70(%r10)
1822 vmovdqa %xmm11,-0x60(%r10)
1823 vmovdqa %xmm12,-0x50(%r10)
1824 vmovdqa %xmm13,-0x40(%r10)
1825 vmovdqa %xmm14,-0x30(%r10)
1826 vmovdqa %xmm15,-0x20(%r10)
1827 .Ldo_avx2_body$suffix:
1828 ___
1829 $code.=<<___;
1830 lea .Lconst(%rip),%rcx
1831 lea 48+64($ctx),$ctx
1832 vmovdqa 96(%rcx),$T0
1833
1834
vmovdqu `16*0-64`($ctx),%x#$T2
and \$-512,%rsp
vmovdqu `16*1-64`($ctx),%x#$T3
vmovdqu `16*2-64`($ctx),%x#$T4
vmovdqu `16*3-64`($ctx),%x#$D0
vmovdqu `16*4-64`($ctx),%x#$D1
vmovdqu `16*5-64`($ctx),%x#$D2
lea 0x90(%rsp),%rax
vmovdqu `16*6-64`($ctx),%x#$D3
vpermd $T2,$T0,$T2
vmovdqu `16*7-64`($ctx),%x#$D4
vpermd $T3,$T0,$T3
vmovdqu `16*8-64`($ctx),%x#$MASK
1848 vpermd $T4,$T0,$T4
1849 vmovdqa $T2,0x00(%rsp)
1850 vpermd $D0,$T0,$D0
1851 vmovdqa $T3,0x20-0x90(%rax)
1852 vpermd $D1,$T0,$D1
1853 vmovdqa $T4,0x40-0x90(%rax)
1854 vpermd $D2,$T0,$D2
1855 vmovdqa $D0,0x60-0x90(%rax)
1856 vpermd $D3,$T0,$D3
1857 vmovdqa $D1,0x80-0x90(%rax)
1858 vpermd $D4,$T0,$D4
1859 vmovdqa $D2,0xa0-0x90(%rax)
1860 vpermd $MASK,$T0,$MASK
1861 vmovdqa $D3,0xc0-0x90(%rax)
1862 vmovdqa $D4,0xe0-0x90(%rax)
1863 vmovdqa $MASK,0x100-0x90(%rax)
1864 vmovdqa 64(%rcx),$MASK
1865
1866
1867
vmovdqu 16*0($inp),%x#$T0
vmovdqu 16*1($inp),%x#$T1
1870 vinserti128 \$1,16*2($inp),$T0,$T0
1871 vinserti128 \$1,16*3($inp),$T1,$T1
1872 lea 16*4($inp),$inp
1873
1874 vpsrldq \$6,$T0,$T2
1875 vpsrldq \$6,$T1,$T3
1876 vpunpckhqdq $T1,$T0,$T4
1877 vpunpcklqdq $T3,$T2,$T2
1878 vpunpcklqdq $T1,$T0,$T0
1879
1880 vpsrlq \$30,$T2,$T3
1881 vpsrlq \$4,$T2,$T2
1882 vpsrlq \$26,$T0,$T1
1883 vpsrlq \$40,$T4,$T4
1884 vpand $MASK,$T2,$T2
1885 vpand $MASK,$T0,$T0
1886 vpand $MASK,$T1,$T1
1887 vpand $MASK,$T3,$T3
1888 vpor 32(%rcx),$T4,$T4
1889
1890 vpaddq $H2,$T2,$H2
1891 sub \$64,$len
1892 jz .Ltail_avx2$suffix
1893 jmp .Loop_avx2$suffix
1894
1895 .align 32
1896 .Loop_avx2$suffix:
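# Main 4-block-per-iteration loop: the running lanes are multiplied by the
# powers of r expanded on the stack while the next 64 bytes of input are
# loaded and split into 26-bit limbs; carries are resolved lazily at the
# bottom of the iteration.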
1897
1898
1899
1900
1901
1902
1903
1904
1905 vpaddq $H0,$T0,$H0
1906 vmovdqa `32*0`(%rsp),$T0
1907 vpaddq $H1,$T1,$H1
1908 vmovdqa `32*1`(%rsp),$T1
1909 vpaddq $H3,$T3,$H3
1910 vmovdqa `32*3`(%rsp),$T2
1911 vpaddq $H4,$T4,$H4
1912 vmovdqa `32*6-0x90`(%rax),$T3
1913 vmovdqa `32*8-0x90`(%rax),$S4
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930 vpmuludq $H2,$T0,$D2
1931 vpmuludq $H2,$T1,$D3
1932 vpmuludq $H2,$T2,$D4
1933 vpmuludq $H2,$T3,$D0
1934 vpmuludq $H2,$S4,$D1
1935
1936 vpmuludq $H0,$T1,$T4
1937 vpmuludq $H1,$T1,$H2
1938 vpaddq $T4,$D1,$D1
1939 vpaddq $H2,$D2,$D2
1940 vpmuludq $H3,$T1,$T4
1941 vpmuludq `32*2`(%rsp),$H4,$H2
1942 vpaddq $T4,$D4,$D4
1943 vpaddq $H2,$D0,$D0
1944 vmovdqa `32*4-0x90`(%rax),$T1
1945
1946 vpmuludq $H0,$T0,$T4
1947 vpmuludq $H1,$T0,$H2
1948 vpaddq $T4,$D0,$D0
1949 vpaddq $H2,$D1,$D1
1950 vpmuludq $H3,$T0,$T4
1951 vpmuludq $H4,$T0,$H2
vmovdqu 16*0($inp),%x#$T0
1953 vpaddq $T4,$D3,$D3
1954 vpaddq $H2,$D4,$D4
1955 vinserti128 \$1,16*2($inp),$T0,$T0
1956
1957 vpmuludq $H3,$T1,$T4
1958 vpmuludq $H4,$T1,$H2
vmovdqu 16*1($inp),%x#$T1
1960 vpaddq $T4,$D0,$D0
1961 vpaddq $H2,$D1,$D1
1962 vmovdqa `32*5-0x90`(%rax),$H2
1963 vpmuludq $H1,$T2,$T4
1964 vpmuludq $H0,$T2,$T2
1965 vpaddq $T4,$D3,$D3
1966 vpaddq $T2,$D2,$D2
1967 vinserti128 \$1,16*3($inp),$T1,$T1
1968 lea 16*4($inp),$inp
1969
1970 vpmuludq $H1,$H2,$T4
1971 vpmuludq $H0,$H2,$H2
1972 vpsrldq \$6,$T0,$T2
1973 vpaddq $T4,$D4,$D4
1974 vpaddq $H2,$D3,$D3
1975 vpmuludq $H3,$T3,$T4
1976 vpmuludq $H4,$T3,$H2
1977 vpsrldq \$6,$T1,$T3
1978 vpaddq $T4,$D1,$D1
1979 vpaddq $H2,$D2,$D2
1980 vpunpckhqdq $T1,$T0,$T4
1981
1982 vpmuludq $H3,$S4,$H3
1983 vpmuludq $H4,$S4,$H4
1984 vpunpcklqdq $T1,$T0,$T0
1985 vpaddq $H3,$D2,$H2
1986 vpaddq $H4,$D3,$H3
1987 vpunpcklqdq $T3,$T2,$T3
1988 vpmuludq `32*7-0x90`(%rax),$H0,$H4
1989 vpmuludq $H1,$S4,$H0
1990 vmovdqa 64(%rcx),$MASK
1991 vpaddq $H4,$D4,$H4
1992 vpaddq $H0,$D0,$H0
1993
1994
1995
1996
1997 vpsrlq \$26,$H3,$D3
1998 vpand $MASK,$H3,$H3
1999 vpaddq $D3,$H4,$H4
2000
2001 vpsrlq \$26,$H0,$D0
2002 vpand $MASK,$H0,$H0
2003 vpaddq $D0,$D1,$H1
2004
2005 vpsrlq \$26,$H4,$D4
2006 vpand $MASK,$H4,$H4
2007
2008 vpsrlq \$4,$T3,$T2
2009
2010 vpsrlq \$26,$H1,$D1
2011 vpand $MASK,$H1,$H1
2012 vpaddq $D1,$H2,$H2
2013
2014 vpaddq $D4,$H0,$H0
2015 vpsllq \$2,$D4,$D4
2016 vpaddq $D4,$H0,$H0
2017
2018 vpand $MASK,$T2,$T2
2019 vpsrlq \$26,$T0,$T1
2020
2021 vpsrlq \$26,$H2,$D2
2022 vpand $MASK,$H2,$H2
2023 vpaddq $D2,$H3,$H3
2024
2025 vpaddq $T2,$H2,$H2
2026 vpsrlq \$30,$T3,$T3
2027
2028 vpsrlq \$26,$H0,$D0
2029 vpand $MASK,$H0,$H0
2030 vpaddq $D0,$H1,$H1
2031
2032 vpsrlq \$40,$T4,$T4
2033
2034 vpsrlq \$26,$H3,$D3
2035 vpand $MASK,$H3,$H3
2036 vpaddq $D3,$H4,$H4
2037
2038 vpand $MASK,$T0,$T0
2039 vpand $MASK,$T1,$T1
2040 vpand $MASK,$T3,$T3
2041 vpor 32(%rcx),$T4,$T4
2042
2043 sub \$64,$len
2044 jnz .Loop_avx2$suffix
2045
2046 .byte 0x66,0x90
2047 .Ltail_avx2$suffix:
2048
2049
2050
2051
2052
2053
2054
2055 vpaddq $H0,$T0,$H0
2056 vmovdqu `32*0+4`(%rsp),$T0
2057 vpaddq $H1,$T1,$H1
2058 vmovdqu `32*1+4`(%rsp),$T1
2059 vpaddq $H3,$T3,$H3
2060 vmovdqu `32*3+4`(%rsp),$T2
2061 vpaddq $H4,$T4,$H4
2062 vmovdqu `32*6+4-0x90`(%rax),$T3
2063 vmovdqu `32*8+4-0x90`(%rax),$S4
2064
2065 vpmuludq $H2,$T0,$D2
2066 vpmuludq $H2,$T1,$D3
2067 vpmuludq $H2,$T2,$D4
2068 vpmuludq $H2,$T3,$D0
2069 vpmuludq $H2,$S4,$D1
2070
2071 vpmuludq $H0,$T1,$T4
2072 vpmuludq $H1,$T1,$H2
2073 vpaddq $T4,$D1,$D1
2074 vpaddq $H2,$D2,$D2
2075 vpmuludq $H3,$T1,$T4
2076 vpmuludq `32*2+4`(%rsp),$H4,$H2
2077 vpaddq $T4,$D4,$D4
2078 vpaddq $H2,$D0,$D0
2079
2080 vpmuludq $H0,$T0,$T4
2081 vpmuludq $H1,$T0,$H2
2082 vpaddq $T4,$D0,$D0
2083 vmovdqu `32*4+4-0x90`(%rax),$T1
2084 vpaddq $H2,$D1,$D1
2085 vpmuludq $H3,$T0,$T4
2086 vpmuludq $H4,$T0,$H2
2087 vpaddq $T4,$D3,$D3
2088 vpaddq $H2,$D4,$D4
2089
2090 vpmuludq $H3,$T1,$T4
2091 vpmuludq $H4,$T1,$H2
2092 vpaddq $T4,$D0,$D0
2093 vpaddq $H2,$D1,$D1
2094 vmovdqu `32*5+4-0x90`(%rax),$H2
2095 vpmuludq $H1,$T2,$T4
2096 vpmuludq $H0,$T2,$T2
2097 vpaddq $T4,$D3,$D3
2098 vpaddq $T2,$D2,$D2
2099
2100 vpmuludq $H1,$H2,$T4
2101 vpmuludq $H0,$H2,$H2
2102 vpaddq $T4,$D4,$D4
2103 vpaddq $H2,$D3,$D3
2104 vpmuludq $H3,$T3,$T4
2105 vpmuludq $H4,$T3,$H2
2106 vpaddq $T4,$D1,$D1
2107 vpaddq $H2,$D2,$D2
2108
2109 vpmuludq $H3,$S4,$H3
2110 vpmuludq $H4,$S4,$H4
2111 vpaddq $H3,$D2,$H2
2112 vpaddq $H4,$D3,$H3
2113 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4
2114 vpmuludq $H1,$S4,$H0
2115 vmovdqa 64(%rcx),$MASK
2116 vpaddq $H4,$D4,$H4
2117 vpaddq $H0,$D0,$H0
2118
2119
2120
2121
2122 vpsrldq \$8,$D1,$T1
2123 vpsrldq \$8,$H2,$T2
2124 vpsrldq \$8,$H3,$T3
2125 vpsrldq \$8,$H4,$T4
2126 vpsrldq \$8,$H0,$T0
2127 vpaddq $T1,$D1,$D1
2128 vpaddq $T2,$H2,$H2
2129 vpaddq $T3,$H3,$H3
2130 vpaddq $T4,$H4,$H4
2131 vpaddq $T0,$H0,$H0
2132
2133 vpermq \$0x2,$H3,$T3
2134 vpermq \$0x2,$H4,$T4
2135 vpermq \$0x2,$H0,$T0
2136 vpermq \$0x2,$D1,$T1
2137 vpermq \$0x2,$H2,$T2
2138 vpaddq $T3,$H3,$H3
2139 vpaddq $T4,$H4,$H4
2140 vpaddq $T0,$H0,$H0
2141 vpaddq $T1,$D1,$D1
2142 vpaddq $T2,$H2,$H2
2143
2144
2145
2146
2147 vpsrlq \$26,$H3,$D3
2148 vpand $MASK,$H3,$H3
2149 vpaddq $D3,$H4,$H4
2150
2151 vpsrlq \$26,$H0,$D0
2152 vpand $MASK,$H0,$H0
2153 vpaddq $D0,$D1,$H1
2154
2155 vpsrlq \$26,$H4,$D4
2156 vpand $MASK,$H4,$H4
2157
2158 vpsrlq \$26,$H1,$D1
2159 vpand $MASK,$H1,$H1
2160 vpaddq $D1,$H2,$H2
2161
2162 vpaddq $D4,$H0,$H0
2163 vpsllq \$2,$D4,$D4
2164 vpaddq $D4,$H0,$H0
2165
2166 vpsrlq \$26,$H2,$D2
2167 vpand $MASK,$H2,$H2
2168 vpaddq $D2,$H3,$H3
2169
2170 vpsrlq \$26,$H0,$D0
2171 vpand $MASK,$H0,$H0
2172 vpaddq $D0,$H1,$H1
2173
2174 vpsrlq \$26,$H3,$D3
2175 vpand $MASK,$H3,$H3
2176 vpaddq $D3,$H4,$H4
2177
vmovd %x#$H0,`4*0-48-64`($ctx)
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
vmovd %x#$H3,`4*3-48-64`($ctx)
vmovd %x#$H4,`4*4-48-64`($ctx)
2183 ___
2184 $code.=<<___ if ($win64);
2185 vmovdqa -0xb0(%r10),%xmm6
2186 vmovdqa -0xa0(%r10),%xmm7
2187 vmovdqa -0x90(%r10),%xmm8
2188 vmovdqa -0x80(%r10),%xmm9
2189 vmovdqa -0x70(%r10),%xmm10
2190 vmovdqa -0x60(%r10),%xmm11
2191 vmovdqa -0x50(%r10),%xmm12
2192 vmovdqa -0x40(%r10),%xmm13
2193 vmovdqa -0x30(%r10),%xmm14
2194 vmovdqa -0x20(%r10),%xmm15
2195 lea -8(%r10),%rsp
2196 .Ldo_avx2_epilogue$suffix:
2197 ___
2198 $code.=<<___ if (!$win64);
2199 lea -8(%r10),%rsp
2200 .cfi_def_cfa_register %rsp
2201 ___
2202 $code.=<<___;
2203 vzeroupper
2204 RET
2205 .cfi_endproc
2206 ___
2207 if($avx > 2 && $avx512) {
2208 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2209 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2210 my $PADBIT="%zmm30";
2211
2212 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));
2213 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2214 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2215 map(s/%y/%z/,($MASK));
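# For the AVX-512 body the ymm names above are widened to zmm; the powers
# of r ($R0-$R4) and their *5 multiples ($S1-$S4) stay resident in
# %zmm16-%zmm24, and %zmm30 holds the replicated padbit.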
2216
2217 $code.=<<___;
2218 .cfi_startproc
2219 .Lblocks_avx512:
2220 mov \$15,%eax
2221 kmovw %eax,%k2
2222 ___
2223 $code.=<<___ if (!$win64);
2224 lea 8(%rsp),%r10
2225 .cfi_def_cfa_register %r10
2226 sub \$0x128,%rsp
2227 ___
2228 $code.=<<___ if ($win64);
2229 lea 8(%rsp),%r10
2230 sub \$0x1c8,%rsp
2231 vmovdqa %xmm6,-0xb0(%r10)
2232 vmovdqa %xmm7,-0xa0(%r10)
2233 vmovdqa %xmm8,-0x90(%r10)
2234 vmovdqa %xmm9,-0x80(%r10)
2235 vmovdqa %xmm10,-0x70(%r10)
2236 vmovdqa %xmm11,-0x60(%r10)
2237 vmovdqa %xmm12,-0x50(%r10)
2238 vmovdqa %xmm13,-0x40(%r10)
2239 vmovdqa %xmm14,-0x30(%r10)
2240 vmovdqa %xmm15,-0x20(%r10)
2241 .Ldo_avx512_body:
2242 ___
2243 $code.=<<___;
2244 lea .Lconst(%rip),%rcx
2245 lea 48+64($ctx),$ctx
vmovdqa 96(%rcx),%y#$T2
2247
2248
vmovdqu `16*0-64`($ctx),%x#$D0
and \$-512,%rsp
vmovdqu `16*1-64`($ctx),%x#$D1
mov \$0x20,%rax
vmovdqu `16*2-64`($ctx),%x#$T0
vmovdqu `16*3-64`($ctx),%x#$D2
vmovdqu `16*4-64`($ctx),%x#$T1
vmovdqu `16*5-64`($ctx),%x#$D3
vmovdqu `16*6-64`($ctx),%x#$T3
vmovdqu `16*7-64`($ctx),%x#$D4
vmovdqu `16*8-64`($ctx),%x#$T4
2260 vpermd $D0,$T2,$R0
2261 vpbroadcastq 64(%rcx),$MASK
2262 vpermd $D1,$T2,$R1
2263 vpermd $T0,$T2,$S1
2264 vpermd $D2,$T2,$R2
2265 vmovdqa64 $R0,0x00(%rsp){%k2}
2266 vpsrlq \$32,$R0,$T0
2267 vpermd $T1,$T2,$S2
2268 vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2269 vpsrlq \$32,$R1,$T1
2270 vpermd $D3,$T2,$R3
2271 vmovdqa64 $S1,0x40(%rsp){%k2}
2272 vpermd $T3,$T2,$S3
2273 vpermd $D4,$T2,$R4
2274 vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2275 vpermd $T4,$T2,$S4
2276 vmovdqa64 $S2,0x80(%rsp){%k2}
2277 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2278 vmovdqa64 $S3,0xc0(%rsp){%k2}
2279 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2280 vmovdqa64 $S4,0x100(%rsp){%k2}
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291 vpmuludq $T0,$R0,$D0
2292 vpmuludq $T0,$R1,$D1
2293 vpmuludq $T0,$R2,$D2
2294 vpmuludq $T0,$R3,$D3
2295 vpmuludq $T0,$R4,$D4
2296 vpsrlq \$32,$R2,$T2
2297
2298 vpmuludq $T1,$S4,$M0
2299 vpmuludq $T1,$R0,$M1
2300 vpmuludq $T1,$R1,$M2
2301 vpmuludq $T1,$R2,$M3
2302 vpmuludq $T1,$R3,$M4
2303 vpsrlq \$32,$R3,$T3
2304 vpaddq $M0,$D0,$D0
2305 vpaddq $M1,$D1,$D1
2306 vpaddq $M2,$D2,$D2
2307 vpaddq $M3,$D3,$D3
2308 vpaddq $M4,$D4,$D4
2309
2310 vpmuludq $T2,$S3,$M0
2311 vpmuludq $T2,$S4,$M1
2312 vpmuludq $T2,$R1,$M3
2313 vpmuludq $T2,$R2,$M4
2314 vpmuludq $T2,$R0,$M2
2315 vpsrlq \$32,$R4,$T4
2316 vpaddq $M0,$D0,$D0
2317 vpaddq $M1,$D1,$D1
2318 vpaddq $M3,$D3,$D3
2319 vpaddq $M4,$D4,$D4
2320 vpaddq $M2,$D2,$D2
2321
2322 vpmuludq $T3,$S2,$M0
2323 vpmuludq $T3,$R0,$M3
2324 vpmuludq $T3,$R1,$M4
2325 vpmuludq $T3,$S3,$M1
2326 vpmuludq $T3,$S4,$M2
2327 vpaddq $M0,$D0,$D0
2328 vpaddq $M3,$D3,$D3
2329 vpaddq $M4,$D4,$D4
2330 vpaddq $M1,$D1,$D1
2331 vpaddq $M2,$D2,$D2
2332
2333 vpmuludq $T4,$S4,$M3
2334 vpmuludq $T4,$R0,$M4
2335 vpmuludq $T4,$S1,$M0
2336 vpmuludq $T4,$S2,$M1
2337 vpmuludq $T4,$S3,$M2
2338 vpaddq $M3,$D3,$D3
2339 vpaddq $M4,$D4,$D4
2340 vpaddq $M0,$D0,$D0
2341 vpaddq $M1,$D1,$D1
2342 vpaddq $M2,$D2,$D2
2343
2344
2345
vmovdqu64 16*0($inp),%z#$T3
vmovdqu64 16*4($inp),%z#$T4
2348 lea 16*8($inp),$inp
2349
2350
2351
2352
2353 vpsrlq \$26,$D3,$M3
2354 vpandq $MASK,$D3,$D3
2355 vpaddq $M3,$D4,$D4
2356
2357 vpsrlq \$26,$D0,$M0
2358 vpandq $MASK,$D0,$D0
2359 vpaddq $M0,$D1,$D1
2360
2361 vpsrlq \$26,$D4,$M4
2362 vpandq $MASK,$D4,$D4
2363
2364 vpsrlq \$26,$D1,$M1
2365 vpandq $MASK,$D1,$D1
2366 vpaddq $M1,$D2,$D2
2367
2368 vpaddq $M4,$D0,$D0
2369 vpsllq \$2,$M4,$M4
2370 vpaddq $M4,$D0,$D0
2371
2372 vpsrlq \$26,$D2,$M2
2373 vpandq $MASK,$D2,$D2
2374 vpaddq $M2,$D3,$D3
2375
2376 vpsrlq \$26,$D0,$M0
2377 vpandq $MASK,$D0,$D0
2378 vpaddq $M0,$D1,$D1
2379
2380 vpsrlq \$26,$D3,$M3
2381 vpandq $MASK,$D3,$D3
2382 vpaddq $M3,$D4,$D4
2383
2384
2385
2386
2387
2388 vpunpcklqdq $T4,$T3,$T0
2389 vpunpckhqdq $T4,$T3,$T4
2390
2391
2392
2393
2394
2395
2396 vmovdqa32 128(%rcx),$M0
2397 mov \$0x7777,%eax
2398 kmovw %eax,%k1
2399
2400 vpermd $R0,$M0,$R0
2401 vpermd $R1,$M0,$R1
2402 vpermd $R2,$M0,$R2
2403 vpermd $R3,$M0,$R3
2404 vpermd $R4,$M0,$R4
2405
2406 vpermd $D0,$M0,${R0}{%k1}
2407 vpermd $D1,$M0,${R1}{%k1}
2408 vpermd $D2,$M0,${R2}{%k1}
2409 vpermd $D3,$M0,${R3}{%k1}
2410 vpermd $D4,$M0,${R4}{%k1}
2411
2412 vpslld \$2,$R1,$S1
2413 vpslld \$2,$R2,$S2
2414 vpslld \$2,$R3,$S3
2415 vpslld \$2,$R4,$S4
2416 vpaddd $R1,$S1,$S1
2417 vpaddd $R2,$S2,$S2
2418 vpaddd $R3,$S3,$S3
2419 vpaddd $R4,$S4,$S4
2420
2421 vpbroadcastq 32(%rcx),$PADBIT
2422
2423 vpsrlq \$52,$T0,$T2
2424 vpsllq \$12,$T4,$T3
2425 vporq $T3,$T2,$T2
2426 vpsrlq \$26,$T0,$T1
2427 vpsrlq \$14,$T4,$T3
2428 vpsrlq \$40,$T4,$T4
2429 vpandq $MASK,$T2,$T2
2430 vpandq $MASK,$T0,$T0
2431
2432
2433
2434
2435 vpaddq $H2,$T2,$H2
2436 sub \$192,$len
2437 jbe .Ltail_avx512
2438 jmp .Loop_avx512
2439
2440 .align 32
2441 .Loop_avx512:
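# 8-block-per-iteration loop: the lane multiplications by the precomputed
# powers of r are interleaved with loading and splitting the next 128
# bytes of input and with the lazy carry chain.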
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470 vpmuludq $H2,$R1,$D3
2471 vpaddq $H0,$T0,$H0
2472 vpmuludq $H2,$R2,$D4
2473 vpandq $MASK,$T1,$T1
2474 vpmuludq $H2,$S3,$D0
2475 vpandq $MASK,$T3,$T3
2476 vpmuludq $H2,$S4,$D1
2477 vporq $PADBIT,$T4,$T4
2478 vpmuludq $H2,$R0,$D2
2479 vpaddq $H1,$T1,$H1
2480 vpaddq $H3,$T3,$H3
2481 vpaddq $H4,$T4,$H4
2482
2483 vmovdqu64 16*0($inp),$T3
2484 vmovdqu64 16*4($inp),$T4
2485 lea 16*8($inp),$inp
2486 vpmuludq $H0,$R3,$M3
2487 vpmuludq $H0,$R4,$M4
2488 vpmuludq $H0,$R0,$M0
2489 vpmuludq $H0,$R1,$M1
2490 vpaddq $M3,$D3,$D3
2491 vpaddq $M4,$D4,$D4
2492 vpaddq $M0,$D0,$D0
2493 vpaddq $M1,$D1,$D1
2494
2495 vpmuludq $H1,$R2,$M3
2496 vpmuludq $H1,$R3,$M4
2497 vpmuludq $H1,$S4,$M0
2498 vpmuludq $H0,$R2,$M2
2499 vpaddq $M3,$D3,$D3
2500 vpaddq $M4,$D4,$D4
2501 vpaddq $M0,$D0,$D0
2502 vpaddq $M2,$D2,$D2
2503
2504 vpunpcklqdq $T4,$T3,$T0
2505 vpunpckhqdq $T4,$T3,$T4
2506
2507 vpmuludq $H3,$R0,$M3
2508 vpmuludq $H3,$R1,$M4
2509 vpmuludq $H1,$R0,$M1
2510 vpmuludq $H1,$R1,$M2
2511 vpaddq $M3,$D3,$D3
2512 vpaddq $M4,$D4,$D4
2513 vpaddq $M1,$D1,$D1
2514 vpaddq $M2,$D2,$D2
2515
2516 vpmuludq $H4,$S4,$M3
2517 vpmuludq $H4,$R0,$M4
2518 vpmuludq $H3,$S2,$M0
2519 vpmuludq $H3,$S3,$M1
2520 vpaddq $M3,$D3,$D3
2521 vpmuludq $H3,$S4,$M2
2522 vpaddq $M4,$D4,$D4
2523 vpaddq $M0,$D0,$D0
2524 vpaddq $M1,$D1,$D1
2525 vpaddq $M2,$D2,$D2
2526
2527 vpmuludq $H4,$S1,$M0
2528 vpmuludq $H4,$S2,$M1
2529 vpmuludq $H4,$S3,$M2
2530 vpaddq $M0,$D0,$H0
2531 vpaddq $M1,$D1,$H1
2532 vpaddq $M2,$D2,$H2
2533
2534
2535
2536
2537 vpsrlq \$52,$T0,$T2
2538 vpsllq \$12,$T4,$T3
2539
2540 vpsrlq \$26,$D3,$H3
2541 vpandq $MASK,$D3,$D3
2542 vpaddq $H3,$D4,$H4
2543
2544 vporq $T3,$T2,$T2
2545
2546 vpsrlq \$26,$H0,$D0
2547 vpandq $MASK,$H0,$H0
2548 vpaddq $D0,$H1,$H1
2549
2550 vpandq $MASK,$T2,$T2
2551
2552 vpsrlq \$26,$H4,$D4
2553 vpandq $MASK,$H4,$H4
2554
2555 vpsrlq \$26,$H1,$D1
2556 vpandq $MASK,$H1,$H1
2557 vpaddq $D1,$H2,$H2
2558
2559 vpaddq $D4,$H0,$H0
2560 vpsllq \$2,$D4,$D4
2561 vpaddq $D4,$H0,$H0
2562
2563 vpaddq $T2,$H2,$H2
2564 vpsrlq \$26,$T0,$T1
2565
2566 vpsrlq \$26,$H2,$D2
2567 vpandq $MASK,$H2,$H2
2568 vpaddq $D2,$D3,$H3
2569
2570 vpsrlq \$14,$T4,$T3
2571
2572 vpsrlq \$26,$H0,$D0
2573 vpandq $MASK,$H0,$H0
2574 vpaddq $D0,$H1,$H1
2575
2576 vpsrlq \$40,$T4,$T4
2577
2578 vpsrlq \$26,$H3,$D3
2579 vpandq $MASK,$H3,$H3
2580 vpaddq $D3,$H4,$H4
2581
2582 vpandq $MASK,$T0,$T0
2583
2584
2585
2586
2587 sub \$128,$len
2588 ja .Loop_avx512
2589
2590 .Ltail_avx512:
2591
2592
2593
2594
2595
2596 vpsrlq \$32,$R0,$R0
2597 vpsrlq \$32,$R1,$R1
2598 vpsrlq \$32,$R2,$R2
2599 vpsrlq \$32,$S3,$S3
2600 vpsrlq \$32,$S4,$S4
2601 vpsrlq \$32,$R3,$R3
2602 vpsrlq \$32,$R4,$R4
2603 vpsrlq \$32,$S1,$S1
2604 vpsrlq \$32,$S2,$S2
2605
2606
2607
2608 lea ($inp,$len),$inp
2609
2610
2611 vpaddq $H0,$T0,$H0
2612
2613 vpmuludq $H2,$R1,$D3
2614 vpmuludq $H2,$R2,$D4
2615 vpmuludq $H2,$S3,$D0
2616 vpandq $MASK,$T1,$T1
2617 vpmuludq $H2,$S4,$D1
2618 vpandq $MASK,$T3,$T3
2619 vpmuludq $H2,$R0,$D2
2620 vporq $PADBIT,$T4,$T4
2621 vpaddq $H1,$T1,$H1
2622 vpaddq $H3,$T3,$H3
2623 vpaddq $H4,$T4,$H4
2624
vmovdqu 16*0($inp),%x#$T0
2626 vpmuludq $H0,$R3,$M3
2627 vpmuludq $H0,$R4,$M4
2628 vpmuludq $H0,$R0,$M0
2629 vpmuludq $H0,$R1,$M1
2630 vpaddq $M3,$D3,$D3
2631 vpaddq $M4,$D4,$D4
2632 vpaddq $M0,$D0,$D0
2633 vpaddq $M1,$D1,$D1
2634
vmovdqu 16*1($inp),%x#$T1
2636 vpmuludq $H1,$R2,$M3
2637 vpmuludq $H1,$R3,$M4
2638 vpmuludq $H1,$S4,$M0
2639 vpmuludq $H0,$R2,$M2
2640 vpaddq $M3,$D3,$D3
2641 vpaddq $M4,$D4,$D4
2642 vpaddq $M0,$D0,$D0
2643 vpaddq $M2,$D2,$D2
2644
vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2646 vpmuludq $H3,$R0,$M3
2647 vpmuludq $H3,$R1,$M4
2648 vpmuludq $H1,$R0,$M1
2649 vpmuludq $H1,$R1,$M2
2650 vpaddq $M3,$D3,$D3
2651 vpaddq $M4,$D4,$D4
2652 vpaddq $M1,$D1,$D1
2653 vpaddq $M2,$D2,$D2
2654
vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2656 vpmuludq $H4,$S4,$M3
2657 vpmuludq $H4,$R0,$M4
2658 vpmuludq $H3,$S2,$M0
2659 vpmuludq $H3,$S3,$M1
2660 vpmuludq $H3,$S4,$M2
2661 vpaddq $M3,$D3,$H3
2662 vpaddq $M4,$D4,$D4
2663 vpaddq $M0,$D0,$D0
2664 vpaddq $M1,$D1,$D1
2665 vpaddq $M2,$D2,$D2
2666
2667 vpmuludq $H4,$S1,$M0
2668 vpmuludq $H4,$S2,$M1
2669 vpmuludq $H4,$S3,$M2
2670 vpaddq $M0,$D0,$H0
2671 vpaddq $M1,$D1,$H1
2672 vpaddq $M2,$D2,$H2
2673
2674
2675
2676
2677 mov \$1,%eax
2678 vpermq \$0xb1,$H3,$D3
2679 vpermq \$0xb1,$D4,$H4
2680 vpermq \$0xb1,$H0,$D0
2681 vpermq \$0xb1,$H1,$D1
2682 vpermq \$0xb1,$H2,$D2
2683 vpaddq $D3,$H3,$H3
2684 vpaddq $D4,$H4,$H4
2685 vpaddq $D0,$H0,$H0
2686 vpaddq $D1,$H1,$H1
2687 vpaddq $D2,$H2,$H2
2688
2689 kmovw %eax,%k3
2690 vpermq \$0x2,$H3,$D3
2691 vpermq \$0x2,$H4,$D4
2692 vpermq \$0x2,$H0,$D0
2693 vpermq \$0x2,$H1,$D1
2694 vpermq \$0x2,$H2,$D2
2695 vpaddq $D3,$H3,$H3
2696 vpaddq $D4,$H4,$H4
2697 vpaddq $D0,$H0,$H0
2698 vpaddq $D1,$H1,$H1
2699 vpaddq $D2,$H2,$H2
2700
vextracti64x4 \$0x1,$H3,%y#$D3
vextracti64x4 \$0x1,$H4,%y#$D4
vextracti64x4 \$0x1,$H0,%y#$D0
vextracti64x4 \$0x1,$H1,%y#$D1
vextracti64x4 \$0x1,$H2,%y#$D2
2706 vpaddq $D3,$H3,${H3}{%k3}{z}
2707 vpaddq $D4,$H4,${H4}{%k3}{z}
2708 vpaddq $D0,$H0,${H0}{%k3}{z}
2709 vpaddq $D1,$H1,${H1}{%k3}{z}
2710 vpaddq $D2,$H2,${H2}{%k3}{z}
2711 ___
2712 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2713 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2714 $code.=<<___;
2715
2716
2717
2718 vpsrlq \$26,$H3,$D3
2719 vpand $MASK,$H3,$H3
2720 vpsrldq \$6,$T0,$T2
2721 vpsrldq \$6,$T1,$T3
2722 vpunpckhqdq $T1,$T0,$T4
2723 vpaddq $D3,$H4,$H4
2724
2725 vpsrlq \$26,$H0,$D0
2726 vpand $MASK,$H0,$H0
2727 vpunpcklqdq $T3,$T2,$T2
2728 vpunpcklqdq $T1,$T0,$T0
2729 vpaddq $D0,$H1,$H1
2730
2731 vpsrlq \$26,$H4,$D4
2732 vpand $MASK,$H4,$H4
2733
2734 vpsrlq \$26,$H1,$D1
2735 vpand $MASK,$H1,$H1
2736 vpsrlq \$30,$T2,$T3
2737 vpsrlq \$4,$T2,$T2
2738 vpaddq $D1,$H2,$H2
2739
2740 vpaddq $D4,$H0,$H0
2741 vpsllq \$2,$D4,$D4
2742 vpsrlq \$26,$T0,$T1
2743 vpsrlq \$40,$T4,$T4
2744 vpaddq $D4,$H0,$H0
2745
2746 vpsrlq \$26,$H2,$D2
2747 vpand $MASK,$H2,$H2
2748 vpand $MASK,$T2,$T2
2749 vpand $MASK,$T0,$T0
2750 vpaddq $D2,$H3,$H3
2751
2752 vpsrlq \$26,$H0,$D0
2753 vpand $MASK,$H0,$H0
2754 vpaddq $H2,$T2,$H2
2755 vpand $MASK,$T1,$T1
2756 vpaddq $D0,$H1,$H1
2757
2758 vpsrlq \$26,$H3,$D3
2759 vpand $MASK,$H3,$H3
2760 vpand $MASK,$T3,$T3
2761 vpor 32(%rcx),$T4,$T4
2762 vpaddq $D3,$H4,$H4
2763
2764 lea 0x90(%rsp),%rax
2765 add \$64,$len
2766 jnz .Ltail_avx2$suffix
2767
2768 vpsubq $T2,$H2,$H2
vmovd %x#$H0,`4*0-48-64`($ctx)
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
vmovd %x#$H3,`4*3-48-64`($ctx)
vmovd %x#$H4,`4*4-48-64`($ctx)
2774 vzeroall
2775 ___
2776 $code.=<<___ if ($win64);
2777 movdqa -0xb0(%r10),%xmm6
2778 movdqa -0xa0(%r10),%xmm7
2779 movdqa -0x90(%r10),%xmm8
2780 movdqa -0x80(%r10),%xmm9
2781 movdqa -0x70(%r10),%xmm10
2782 movdqa -0x60(%r10),%xmm11
2783 movdqa -0x50(%r10),%xmm12
2784 movdqa -0x40(%r10),%xmm13
2785 movdqa -0x30(%r10),%xmm14
2786 movdqa -0x20(%r10),%xmm15
2787 lea -8(%r10),%rsp
2788 .Ldo_avx512_epilogue:
2789 ___
2790 $code.=<<___ if (!$win64);
2791 lea -8(%r10),%rsp
2792 .cfi_def_cfa_register %rsp
2793 ___
2794 $code.=<<___;
2795 RET
2796 .cfi_endproc
2797 ___
2798
2799 }
2800
2801 }
2802
2803 &declare_function("poly1305_blocks_avx2", 32, 4);
2804 poly1305_blocks_avxN(0);
2805 &end_function("poly1305_blocks_avx2");
2806
2807
2808 if ($avx>2) {
2809
2810
2811
2812
2813
2814
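# AVX-512 entry point: the body is generated by the same
# poly1305_blocks_avxN() routine with the 512-bit flag set; kernel builds
# additionally guard it with CONFIG_AS_AVX512.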
2815 if($kernel) {
2816 $code .= "#ifdef CONFIG_AS_AVX512\n";
2817 }
2818
2819 &declare_function("poly1305_blocks_avx512", 32, 4);
2820 poly1305_blocks_avxN(1);
2821 &end_function("poly1305_blocks_avx512");
2822
2823 if ($kernel) {
2824 $code .= "#endif\n";
2825 }
2826
2827 if (!$kernel && $avx>3) {
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
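# The routines below implement Poly1305 in base 2^44 (44/44/42-bit limbs)
# on top of the AVX-512 IFMA instructions vpmadd52luq/vpmadd52huq; they are
# emitted only for non-kernel builds with $avx>3.  poly1305_init_base2_44
# clamps the key, stores the limbs of r at 40/48/56($ctx) and precomputes
# 20*r1 and 20*r2 at 24/32($ctx) for reduction modulo 2^130-5; 64($ctx) is
# set to -1 to mark that the key powers have not been computed yet.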
2852 $code.=<<___;
2853 .type poly1305_init_base2_44,\@function,3
2854 .align 32
2855 poly1305_init_base2_44:
2856 xor %eax,%eax
2857 mov %rax,0($ctx)
2858 mov %rax,8($ctx)
2859 mov %rax,16($ctx)
2860
2861 .Linit_base2_44:
2862 lea poly1305_blocks_vpmadd52(%rip),%r10
2863 lea poly1305_emit_base2_44(%rip),%r11
2864
2865 mov \$0x0ffffffc0fffffff,%rax
2866 mov \$0x0ffffffc0ffffffc,%rcx
2867 and 0($inp),%rax
2868 mov \$0x00000fffffffffff,%r8
2869 and 8($inp),%rcx
2870 mov \$0x00000fffffffffff,%r9
2871 and %rax,%r8
2872 shrd \$44,%rcx,%rax
2873 mov %r8,40($ctx)
2874 and %r9,%rax
2875 shr \$24,%rcx
2876 mov %rax,48($ctx)
2877 lea (%rax,%rax,4),%rax
2878 mov %rcx,56($ctx)
2879 shl \$2,%rax
2880 lea (%rcx,%rcx,4),%rcx
2881 shl \$2,%rcx
2882 mov %rax,24($ctx)
2883 mov %rcx,32($ctx)
2884 movq \$-1,64($ctx)
2885 ___
2886 $code.=<<___ if ($flavour !~ /elf32/);
2887 mov %r10,0(%rdx)
2888 mov %r11,8(%rdx)
2889 ___
2890 $code.=<<___ if ($flavour =~ /elf32/);
2891 mov %r10d,0(%rdx)
2892 mov %r11d,4(%rdx)
2893 ___
2894 $code.=<<___;
2895 mov \$1,%eax
2896 RET
2897 .size poly1305_init_base2_44,.-poly1305_init_base2_44
2898 ___
2899 {
2900 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2901 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2902 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2903
2904 $code.=<<___;
2905 .type poly1305_blocks_vpmadd52,\@function,4
2906 .align 32
2907 poly1305_blocks_vpmadd52:
2908 shr \$4,$len
2909 jz .Lno_data_vpmadd52
2910
2911 shl \$40,$padbit
2912 mov 64($ctx),%r8
2913
2914
2915
2916
2917
2918
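	# 64($ctx) stays negative until the key powers have been computed.  The
	# single-lane loop below handles up to three blocks when the powers are
	# not available yet (and the total is short), otherwise just enough
	# blocks to leave an even count for the 2x/4x code.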
2919 mov \$3,%rax
2920 mov \$1,%r10
2921 cmp \$4,$len
2922 cmovae %r10,%rax
2923 test %r8,%r8
2924 cmovns %r10,%rax
2925
2926 and $len,%rax
2927 jz .Lblocks_vpmadd52_4x
2928
2929 sub %rax,$len
2930 mov \$7,%r10d
2931 mov \$1,%r11d
2932 kmovw %r10d,%k7
2933 lea .L2_44_inp_permd(%rip),%r10
2934 kmovw %r11d,%k1
2935
2936 vmovq $padbit,%x#$PAD
2937 vmovdqa64 0(%r10),$inp_permd
2938 vmovdqa64 32(%r10),$inp_shift
2939 vpermq \$0xcf,$PAD,$PAD
2940 vmovdqa64 64(%r10),$reduc_mask
2941
2942 vmovdqu64 0($ctx),${Dlo}{%k7}{z}
2943 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z}
2944 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2945 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2946
2947 vmovdqa64 96(%r10),$reduc_rght
2948 vmovdqa64 128(%r10),$reduc_left
2949
2950 jmp .Loop_vpmadd52
2951
2952 .align 32
2953 .Loop_vpmadd52:
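	# One 16-byte block per iteration: permute and shift the input into
	# 44/44/42-bit limbs, add it to the accumulator, multiply by r with
	# vpmadd52 and perform one round of carry propagation, all within %ymm.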
2954 vmovdqu32 0($inp),%x#$T0 # load one 16-byte block
2955 lea 16($inp),$inp
2956
2957 vpermd $T0,$inp_permd,$T0
2958 vpsrlvq $inp_shift,$T0,$T0
2959 vpandq $reduc_mask,$T0,$T0
2960 vporq $PAD,$T0,$T0
2961
2962 vpaddq $T0,$Dlo,$Dlo
2963
2964 vpermq \$0,$Dlo,${H0}{%k7}{z}
2965 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2966 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2967
2968 vpxord $Dlo,$Dlo,$Dlo
2969 vpxord $Dhi,$Dhi,$Dhi
2970
2971 vpmadd52luq $r2r1r0,$H0,$Dlo
2972 vpmadd52huq $r2r1r0,$H0,$Dhi
2973
2974 vpmadd52luq $r1r0s2,$H1,$Dlo
2975 vpmadd52huq $r1r0s2,$H1,$Dhi
2976
2977 vpmadd52luq $r0s2s1,$H2,$Dlo
2978 vpmadd52huq $r0s2s1,$H2,$Dhi
2979
2980 vpsrlvq $reduc_rght,$Dlo,$T0
2981 vpsllvq $reduc_left,$Dhi,$Dhi
2982 vpandq $reduc_mask,$Dlo,$Dlo
2983
2984 vpaddq $T0,$Dhi,$Dhi
2985
2986 vpermq \$0b10010011,$Dhi,$Dhi
2987
2988 vpaddq $Dhi,$Dlo,$Dlo
2989
2990 vpsrlvq $reduc_rght,$Dlo,$T0
2991 vpandq $reduc_mask,$Dlo,$Dlo
2992
2993 vpermq \$0b10010011,$T0,$T0
2994
2995 vpaddq $T0,$Dlo,$Dlo
2996
2997 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2998
2999 vpaddq $T0,$Dlo,$Dlo
3000 vpsllq \$2,$T0,$T0
3001
3002 vpaddq $T0,$Dlo,$Dlo
3003
3004 dec %rax
3005 jnz .Loop_vpmadd52
3006
3007 vmovdqu64 $Dlo,0($ctx){%k7}
3008
3009 test $len,$len
3010 jnz .Lblocks_vpmadd52_4x
3011
3012 .Lno_data_vpmadd52:
3013 RET
3014 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3015 ___
3016 }
3017 {
3018
3019
3020
3021
3022
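# 2x/4x stride in base 2^44: $R0..$R2/$S1/$S2 hold per-lane key powers,
# while $D?lo/$D?hi accumulate the low and high 52-bit halves of the
# vpmadd52 products before they are folded back into 44/44/42-bit limbs.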
3023 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3024 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3025 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3026
3027 $code.=<<___;
3028 .type poly1305_blocks_vpmadd52_4x,\@function,4
3029 .align 32
3030 poly1305_blocks_vpmadd52_4x:
3031 shr \$4,$len
3032 jz .Lno_data_vpmadd52_4x
3033
3034 shl \$40,$padbit
3035 mov 64($ctx),%r8
3036
3037 .Lblocks_vpmadd52_4x:
3038 vpbroadcastq $padbit,$PAD
3039
3040 vmovdqa64 .Lx_mask44(%rip),$mask44
3041 mov \$5,%eax
3042 vmovdqa64 .Lx_mask42(%rip),$mask42
3043 kmovw %eax,%k1
3044
3045 test %r8,%r8
3046 js .Linit_vpmadd52
3047
3048 vmovq 0($ctx),%x#$H0
3049 vmovq 8($ctx),%x#$H1
3050 vmovq 16($ctx),%x#$H2
3051
3052 test \$3,$len
3053 jnz .Lblocks_vpmadd52_2x_do
3054
3055 .Lblocks_vpmadd52_4x_do:
3056 vpbroadcastq 64($ctx),$R0
3057 vpbroadcastq 96($ctx),$R1
3058 vpbroadcastq 128($ctx),$R2
3059 vpbroadcastq 160($ctx),$S1
3060
3061 .Lblocks_vpmadd52_4x_key_loaded:
3062 vpsllq \$2,$R2,$S2
3063 vpaddq $R2,$S2,$S2
3064 vpsllq \$2,$S2,$S2
3065
3066 test \$7,$len
3067 jz .Lblocks_vpmadd52_8x
3068
3069 vmovdqu64 16*0($inp),$T2
3070 vmovdqu64 16*2($inp),$T3
3071 lea 16*4($inp),$inp
3072
3073 vpunpcklqdq $T3,$T2,$T1
3074 vpunpckhqdq $T3,$T2,$T3
3075
3076
3077
3078 vpsrlq \$24,$T3,$T2
3079 vporq $PAD,$T2,$T2
3080 vpaddq $T2,$H2,$H2
3081 vpandq $mask44,$T1,$T0
3082 vpsrlq \$44,$T1,$T1
3083 vpsllq \$20,$T3,$T3
3084 vporq $T3,$T1,$T1
3085 vpandq $mask44,$T1,$T1
3086
3087 sub \$4,$len
3088 jz .Ltail_vpmadd52_4x
3089 jmp .Loop_vpmadd52_4x
3090 ud2
3091
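	# First use of this key: square r twice to obtain r^2..r^4, arrange the
	# powers in vector lanes and cache them from 64($ctx) onwards so that
	# subsequent calls can skip this step.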
3092 .align 32
3093 .Linit_vpmadd52:
3094 vmovq 24($ctx),%x#$S1 # load the key limbs and 20*r1/20*r2
3095 vmovq 56($ctx),%x#$H2
3096 vmovq 32($ctx),%x#$S2
3097 vmovq 40($ctx),%x#$R0
3098 vmovq 48($ctx),%x#$R1
3099
3100 vmovdqa $R0,$H0
3101 vmovdqa $R1,$H1
3102 vmovdqa $H2,$R2
3103
3104 mov \$2,%eax
3105
3106 .Lmul_init_vpmadd52:
3107 vpxorq $D0lo,$D0lo,$D0lo
3108 vpmadd52luq $H2,$S1,$D0lo
3109 vpxorq $D0hi,$D0hi,$D0hi
3110 vpmadd52huq $H2,$S1,$D0hi
3111 vpxorq $D1lo,$D1lo,$D1lo
3112 vpmadd52luq $H2,$S2,$D1lo
3113 vpxorq $D1hi,$D1hi,$D1hi
3114 vpmadd52huq $H2,$S2,$D1hi
3115 vpxorq $D2lo,$D2lo,$D2lo
3116 vpmadd52luq $H2,$R0,$D2lo
3117 vpxorq $D2hi,$D2hi,$D2hi
3118 vpmadd52huq $H2,$R0,$D2hi
3119
3120 vpmadd52luq $H0,$R0,$D0lo
3121 vpmadd52huq $H0,$R0,$D0hi
3122 vpmadd52luq $H0,$R1,$D1lo
3123 vpmadd52huq $H0,$R1,$D1hi
3124 vpmadd52luq $H0,$R2,$D2lo
3125 vpmadd52huq $H0,$R2,$D2hi
3126
3127 vpmadd52luq $H1,$S2,$D0lo
3128 vpmadd52huq $H1,$S2,$D0hi
3129 vpmadd52luq $H1,$R0,$D1lo
3130 vpmadd52huq $H1,$R0,$D1hi
3131 vpmadd52luq $H1,$R1,$D2lo
3132 vpmadd52huq $H1,$R1,$D2hi
3133
3134
3135
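	# Partial reduction: bring every limb back under 2^44 (2^42 for the top
	# one); the carry out of the top limb is folded back in multiplied by 5.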
3136 vpsrlq \$44,$D0lo,$tmp
3137 vpsllq \$8,$D0hi,$D0hi
3138 vpandq $mask44,$D0lo,$H0
3139 vpaddq $tmp,$D0hi,$D0hi
3140
3141 vpaddq $D0hi,$D1lo,$D1lo
3142
3143 vpsrlq \$44,$D1lo,$tmp
3144 vpsllq \$8,$D1hi,$D1hi
3145 vpandq $mask44,$D1lo,$H1
3146 vpaddq $tmp,$D1hi,$D1hi
3147
3148 vpaddq $D1hi,$D2lo,$D2lo
3149
3150 vpsrlq \$42,$D2lo,$tmp
3151 vpsllq \$10,$D2hi,$D2hi
3152 vpandq $mask42,$D2lo,$H2
3153 vpaddq $tmp,$D2hi,$D2hi
3154
3155 vpaddq $D2hi,$H0,$H0
3156 vpsllq \$2,$D2hi,$D2hi
3157
3158 vpaddq $D2hi,$H0,$H0
3159
3160 vpsrlq \$44,$H0,$tmp
3161 vpandq $mask44,$H0,$H0
3162
3163 vpaddq $tmp,$H1,$H1
3164
3165 dec %eax
3166 jz .Ldone_init_vpmadd52
3167
3168 vpunpcklqdq $R1,$H1,$R1
3169 vpbroadcastq %x#$H1,$H1
3170 vpunpcklqdq $R2,$H2,$R2
3171 vpbroadcastq %x#$H2,$H2
3172 vpunpcklqdq $R0,$H0,$R0
3173 vpbroadcastq %x#$H0,$H0
3174
3175 vpsllq \$2,$R1,$S1
3176 vpsllq \$2,$R2,$S2
3177 vpaddq $R1,$S1,$S1
3178 vpaddq $R2,$S2,$S2
3179 vpsllq \$2,$S1,$S1
3180 vpsllq \$2,$S2,$S2
3181
3182 jmp .Lmul_init_vpmadd52
3183 ud2
3184
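	# Combine H=[r^4,r^3] and R=[r^2,r] into a single 4-lane key vector,
	# reordered below to match the lane order of the interleaved input blocks.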
3185 .align 32
3186 .Ldone_init_vpmadd52:
3187 vinserti128 \$1,%x#$R1,$H1,$R1 # powers r^4,r^3,r^2,r
3188 vinserti128 \$1,%x#$R2,$H2,$R2
3189 vinserti128 \$1,%x#$R0,$H0,$R0
3190
3191 vpermq \$0b11011000,$R1,$R1
3192 vpermq \$0b11011000,$R2,$R2
3193 vpermq \$0b11011000,$R0,$R0
3194
3195 vpsllq \$2,$R1,$S1
3196 vpaddq $R1,$S1,$S1
3197 vpsllq \$2,$S1,$S1
3198
3199 vmovq 0($ctx),%x#$H0
3200 vmovq 8($ctx),%x#$H1
3201 vmovq 16($ctx),%x#$H2
3202
3203 test \$3,$len
3204 jnz .Ldone_init_vpmadd52_2x
3205
3206 vmovdqu64 $R0,64($ctx)
3207 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3208 vmovdqu64 $R1,96($ctx)
3209 vpbroadcastq %x#$R1,$R1
3210 vmovdqu64 $R2,128($ctx)
3211 vpbroadcastq %x#$R2,$R2
3212 vmovdqu64 $S1,160($ctx)
3213 vpbroadcastq %x#$S1,$S1
3214
3215 jmp .Lblocks_vpmadd52_4x_key_loaded
3216 ud2
3217
3218 .align 32
3219 .Ldone_init_vpmadd52_2x:
3220 vmovdqu64 $R0,64($ctx)
3221 vpsrldq \$8,$R0,$R0
3222 vmovdqu64 $R1,96($ctx)
3223 vpsrldq \$8,$R1,$R1
3224 vmovdqu64 $R2,128($ctx)
3225 vpsrldq \$8,$R2,$R2
3226 vmovdqu64 $S1,160($ctx)
3227 vpsrldq \$8,$S1,$S1
3228 jmp .Lblocks_vpmadd52_2x_key_loaded
3229 ud2
3230
3231 .align 32
3232 .Lblocks_vpmadd52_2x_do:
3233 vmovdqu64 128+8($ctx),${R2}{%k1}{z}
3234 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3235 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3236 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3237
3238 .Lblocks_vpmadd52_2x_key_loaded:
3239 vmovdqu64 16*0($inp),$T2
3240 vpxorq $T3,$T3,$T3
3241 lea 16*2($inp),$inp
3242
3243 vpunpcklqdq $T3,$T2,$T1
3244 vpunpckhqdq $T3,$T2,$T3
3245
3246
3247
3248 vpsrlq \$24,$T3,$T2
3249 vporq $PAD,$T2,$T2
3250 vpaddq $T2,$H2,$H2
3251 vpandq $mask44,$T1,$T0
3252 vpsrlq \$44,$T1,$T1
3253 vpsllq \$20,$T3,$T3
3254 vporq $T3,$T1,$T1
3255 vpandq $mask44,$T1,$T1
3256
3257 jmp .Ltail_vpmadd52_2x
3258 ud2
3259
3260 .align 32
3261 .Loop_vpmadd52_4x:
3262
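	# Main 4x loop: add the previous input limbs, fire off the vpmadd52
	# multiply-accumulate chain and overlap loading/converting the next four
	# blocks with the partial reduction.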
3263 vpaddq $T0,$H0,$H0
3264 vpaddq $T1,$H1,$H1
3265
3266 vpxorq $D0lo,$D0lo,$D0lo
3267 vpmadd52luq $H2,$S1,$D0lo
3268 vpxorq $D0hi,$D0hi,$D0hi
3269 vpmadd52huq $H2,$S1,$D0hi
3270 vpxorq $D1lo,$D1lo,$D1lo
3271 vpmadd52luq $H2,$S2,$D1lo
3272 vpxorq $D1hi,$D1hi,$D1hi
3273 vpmadd52huq $H2,$S2,$D1hi
3274 vpxorq $D2lo,$D2lo,$D2lo
3275 vpmadd52luq $H2,$R0,$D2lo
3276 vpxorq $D2hi,$D2hi,$D2hi
3277 vpmadd52huq $H2,$R0,$D2hi
3278
3279 vmovdqu64 16*0($inp),$T2
3280 vmovdqu64 16*2($inp),$T3
3281 lea 16*4($inp),$inp
3282 vpmadd52luq $H0,$R0,$D0lo
3283 vpmadd52huq $H0,$R0,$D0hi
3284 vpmadd52luq $H0,$R1,$D1lo
3285 vpmadd52huq $H0,$R1,$D1hi
3286 vpmadd52luq $H0,$R2,$D2lo
3287 vpmadd52huq $H0,$R2,$D2hi
3288
3289 vpunpcklqdq $T3,$T2,$T1
3290 vpunpckhqdq $T3,$T2,$T3
3291 vpmadd52luq $H1,$S2,$D0lo
3292 vpmadd52huq $H1,$S2,$D0hi
3293 vpmadd52luq $H1,$R0,$D1lo
3294 vpmadd52huq $H1,$R0,$D1hi
3295 vpmadd52luq $H1,$R1,$D2lo
3296 vpmadd52huq $H1,$R1,$D2hi
3297
3298
3299
3300 vpsrlq \$44,$D0lo,$tmp
3301 vpsllq \$8,$D0hi,$D0hi
3302 vpandq $mask44,$D0lo,$H0
3303 vpaddq $tmp,$D0hi,$D0hi
3304
3305 vpsrlq \$24,$T3,$T2
3306 vporq $PAD,$T2,$T2
3307 vpaddq $D0hi,$D1lo,$D1lo
3308
3309 vpsrlq \$44,$D1lo,$tmp
3310 vpsllq \$8,$D1hi,$D1hi
3311 vpandq $mask44,$D1lo,$H1
3312 vpaddq $tmp,$D1hi,$D1hi
3313
3314 vpandq $mask44,$T1,$T0
3315 vpsrlq \$44,$T1,$T1
3316 vpsllq \$20,$T3,$T3
3317 vpaddq $D1hi,$D2lo,$D2lo
3318
3319 vpsrlq \$42,$D2lo,$tmp
3320 vpsllq \$10,$D2hi,$D2hi
3321 vpandq $mask42,$D2lo,$H2
3322 vpaddq $tmp,$D2hi,$D2hi
3323
3324 vpaddq $T2,$H2,$H2
3325 vpaddq $D2hi,$H0,$H0
3326 vpsllq \$2,$D2hi,$D2hi
3327
3328 vpaddq $D2hi,$H0,$H0
3329 vporq $T3,$T1,$T1
3330 vpandq $mask44,$T1,$T1
3331
3332 vpsrlq \$44,$H0,$tmp
3333 vpandq $mask44,$H0,$H0
3334
3335 vpaddq $tmp,$H1,$H1
3336
3337 sub \$4,$len
3338 jnz .Loop_vpmadd52_4x
3339
3340 .Ltail_vpmadd52_4x:
3341 vmovdqu64 128($ctx),$R2
3342 vmovdqu64 160($ctx),$S1
3343 vmovdqu64 64($ctx),$R0
3344 vmovdqu64 96($ctx),$R1
3345
3346 .Ltail_vpmadd52_2x:
3347 vpsllq \$2,$R2,$S2
3348 vpaddq $R2,$S2,$S2
3349 vpsllq \$2,$S2,$S2
3350
3351
3352 vpaddq $T0,$H0,$H0
3353 vpaddq $T1,$H1,$H1
3354
3355 vpxorq $D0lo,$D0lo,$D0lo
3356 vpmadd52luq $H2,$S1,$D0lo
3357 vpxorq $D0hi,$D0hi,$D0hi
3358 vpmadd52huq $H2,$S1,$D0hi
3359 vpxorq $D1lo,$D1lo,$D1lo
3360 vpmadd52luq $H2,$S2,$D1lo
3361 vpxorq $D1hi,$D1hi,$D1hi
3362 vpmadd52huq $H2,$S2,$D1hi
3363 vpxorq $D2lo,$D2lo,$D2lo
3364 vpmadd52luq $H2,$R0,$D2lo
3365 vpxorq $D2hi,$D2hi,$D2hi
3366 vpmadd52huq $H2,$R0,$D2hi
3367
3368 vpmadd52luq $H0,$R0,$D0lo
3369 vpmadd52huq $H0,$R0,$D0hi
3370 vpmadd52luq $H0,$R1,$D1lo
3371 vpmadd52huq $H0,$R1,$D1hi
3372 vpmadd52luq $H0,$R2,$D2lo
3373 vpmadd52huq $H0,$R2,$D2hi
3374
3375 vpmadd52luq $H1,$S2,$D0lo
3376 vpmadd52huq $H1,$S2,$D0hi
3377 vpmadd52luq $H1,$R0,$D1lo
3378 vpmadd52huq $H1,$R0,$D1hi
3379 vpmadd52luq $H1,$R1,$D2lo
3380 vpmadd52huq $H1,$R1,$D2hi
3381
3382
3383
3384
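	# Horizontal accumulation: fold the per-lane sums down to a single lane
	# (the %k1 write-mask keeps only qword 0) ahead of the final reduction.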
3385 mov \$1,%eax
3386 kmovw %eax,%k1
3387 vpsrldq \$8,$D0lo,$T0
3388 vpsrldq \$8,$D0hi,$H0
3389 vpsrldq \$8,$D1lo,$T1
3390 vpsrldq \$8,$D1hi,$H1
3391 vpaddq $T0,$D0lo,$D0lo
3392 vpaddq $H0,$D0hi,$D0hi
3393 vpsrldq \$8,$D2lo,$T2
3394 vpsrldq \$8,$D2hi,$H2
3395 vpaddq $T1,$D1lo,$D1lo
3396 vpaddq $H1,$D1hi,$D1hi
3397 vpermq \$0x2,$D0lo,$T0
3398 vpermq \$0x2,$D0hi,$H0
3399 vpaddq $T2,$D2lo,$D2lo
3400 vpaddq $H2,$D2hi,$D2hi
3401
3402 vpermq \$0x2,$D1lo,$T1
3403 vpermq \$0x2,$D1hi,$H1
3404 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3405 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3406 vpermq \$0x2,$D2lo,$T2
3407 vpermq \$0x2,$D2hi,$H2
3408 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3409 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3410 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3411 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3412
3413
3414
3415 vpsrlq \$44,$D0lo,$tmp
3416 vpsllq \$8,$D0hi,$D0hi
3417 vpandq $mask44,$D0lo,$H0
3418 vpaddq $tmp,$D0hi,$D0hi
3419
3420 vpaddq $D0hi,$D1lo,$D1lo
3421
3422 vpsrlq \$44,$D1lo,$tmp
3423 vpsllq \$8,$D1hi,$D1hi
3424 vpandq $mask44,$D1lo,$H1
3425 vpaddq $tmp,$D1hi,$D1hi
3426
3427 vpaddq $D1hi,$D2lo,$D2lo
3428
3429 vpsrlq \$42,$D2lo,$tmp
3430 vpsllq \$10,$D2hi,$D2hi
3431 vpandq $mask42,$D2lo,$H2
3432 vpaddq $tmp,$D2hi,$D2hi
3433
3434 vpaddq $D2hi,$H0,$H0
3435 vpsllq \$2,$D2hi,$D2hi
3436
3437 vpaddq $D2hi,$H0,$H0
3438
3439 vpsrlq \$44,$H0,$tmp
3440 vpandq $mask44,$H0,$H0
3441
3442 vpaddq $tmp,$H1,$H1
3443
3444
3445 sub \$2,$len
3446 ja .Lblocks_vpmadd52_4x_do
3447
3448 vmovq %x#$H0,0($ctx)
3449 vmovq %x#$H1,8($ctx)
3450 vmovq %x#$H2,16($ctx)
3451 vzeroall
3452
3453 .Lno_data_vpmadd52_4x:
3454 RET
3455 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3456 ___
3457 }
3458 {
3459
3460
3461
3462
3463
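# 8x stride for AVX-512: the same scheme widened to %zmm registers using
# powers r^1..r^8.  $RR?/$SS? keep the per-lane powers for the tail while
# $R?/$S? are broadcast with the 8th power for the main loop.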
3464 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3465 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3466 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3467 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3468
3469 $code.=<<___;
3470 .type poly1305_blocks_vpmadd52_8x,\@function,4
3471 .align 32
3472 poly1305_blocks_vpmadd52_8x:
3473 shr \$4,$len
3474 jz .Lno_data_vpmadd52_8x
3475
3476 shl \$40,$padbit
3477 mov 64($ctx),%r8
3478
3479 vmovdqa64 .Lx_mask44(%rip),$mask44
3480 vmovdqa64 .Lx_mask42(%rip),$mask42
3481
3482 test %r8,%r8
3483 js .Linit_vpmadd52
3484
3485 vmovq 0($ctx),%x#$H0
3486 vmovq 8($ctx),%x#$H1
3487 vmovq 16($ctx),%x#$H2
3488
3489 .Lblocks_vpmadd52_8x:
3490
3491
3492
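	# Take the r^1..r^4 powers prepared by the 4x path and multiply them by
	# r^4 to obtain r^5..r^8, then merge all eight powers into %zmm lanes.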
3493 vmovdqu64 128($ctx),$R2
3494 vmovdqu64 160($ctx),$S1
3495 vmovdqu64 64($ctx),$R0
3496 vmovdqu64 96($ctx),$R1
3497
3498 vpsllq \$2,$R2,$S2
3499 vpaddq $R2,$S2,$S2
3500 vpsllq \$2,$S2,$S2
3501
3502 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3503 vpbroadcastq %x#$R0,$RR0
3504 vpbroadcastq %x#$R1,$RR1
3505
3506 vpxorq $D0lo,$D0lo,$D0lo
3507 vpmadd52luq $RR2,$S1,$D0lo
3508 vpxorq $D0hi,$D0hi,$D0hi
3509 vpmadd52huq $RR2,$S1,$D0hi
3510 vpxorq $D1lo,$D1lo,$D1lo
3511 vpmadd52luq $RR2,$S2,$D1lo
3512 vpxorq $D1hi,$D1hi,$D1hi
3513 vpmadd52huq $RR2,$S2,$D1hi
3514 vpxorq $D2lo,$D2lo,$D2lo
3515 vpmadd52luq $RR2,$R0,$D2lo
3516 vpxorq $D2hi,$D2hi,$D2hi
3517 vpmadd52huq $RR2,$R0,$D2hi
3518
3519 vpmadd52luq $RR0,$R0,$D0lo
3520 vpmadd52huq $RR0,$R0,$D0hi
3521 vpmadd52luq $RR0,$R1,$D1lo
3522 vpmadd52huq $RR0,$R1,$D1hi
3523 vpmadd52luq $RR0,$R2,$D2lo
3524 vpmadd52huq $RR0,$R2,$D2hi
3525
3526 vpmadd52luq $RR1,$S2,$D0lo
3527 vpmadd52huq $RR1,$S2,$D0hi
3528 vpmadd52luq $RR1,$R0,$D1lo
3529 vpmadd52huq $RR1,$R0,$D1hi
3530 vpmadd52luq $RR1,$R1,$D2lo
3531 vpmadd52huq $RR1,$R1,$D2hi
3532
3533
3534
3535 vpsrlq \$44,$D0lo,$tmp
3536 vpsllq \$8,$D0hi,$D0hi
3537 vpandq $mask44,$D0lo,$RR0
3538 vpaddq $tmp,$D0hi,$D0hi
3539
3540 vpaddq $D0hi,$D1lo,$D1lo
3541
3542 vpsrlq \$44,$D1lo,$tmp
3543 vpsllq \$8,$D1hi,$D1hi
3544 vpandq $mask44,$D1lo,$RR1
3545 vpaddq $tmp,$D1hi,$D1hi
3546
3547 vpaddq $D1hi,$D2lo,$D2lo
3548
3549 vpsrlq \$42,$D2lo,$tmp
3550 vpsllq \$10,$D2hi,$D2hi
3551 vpandq $mask42,$D2lo,$RR2
3552 vpaddq $tmp,$D2hi,$D2hi
3553
3554 vpaddq $D2hi,$RR0,$RR0
3555 vpsllq \$2,$D2hi,$D2hi
3556
3557 vpaddq $D2hi,$RR0,$RR0
3558
3559 vpsrlq \$44,$RR0,$tmp
3560 vpandq $mask44,$RR0,$RR0
3561
3562 vpaddq $tmp,$RR1,$RR1
3563
3564
3565
3566
3567
3568 vpunpcklqdq $R2,$RR2,$T2
3569 vpunpckhqdq $R2,$RR2,$R2
3570 vpunpcklqdq $R0,$RR0,$T0
3571 vpunpckhqdq $R0,$RR0,$R0
3572 vpunpcklqdq $R1,$RR1,$T1
3573 vpunpckhqdq $R1,$RR1,$R1
3574 ___
3575
3576 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3577 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3578 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3579 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3580
3581 $code.=<<___;
3582 vshufi64x2 \$0x44,$R2,$T2,$RR2
3583 vshufi64x2 \$0x44,$R0,$T0,$RR0
3584 vshufi64x2 \$0x44,$R1,$T1,$RR1
3585
3586 vmovdqu64 16*0($inp),$T2
3587 vmovdqu64 16*4($inp),$T3
3588 lea 16*8($inp),$inp
3589
3590 vpsllq \$2,$RR2,$SS2
3591 vpsllq \$2,$RR1,$SS1
3592 vpaddq $RR2,$SS2,$SS2
3593 vpaddq $RR1,$SS1,$SS1
3594 vpsllq \$2,$SS2,$SS2
3595 vpsllq \$2,$SS1,$SS1
3596
3597 vpbroadcastq $padbit,$PAD
3598 vpbroadcastq %x#$mask44,$mask44 # widen the masks to %zmm
3599 vpbroadcastq %x#$mask42,$mask42
3600
3601 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3602 vpbroadcastq %x#$SS2,$S2
3603 vpbroadcastq %x#$RR0,$R0
3604 vpbroadcastq %x#$RR1,$R1
3605 vpbroadcastq %x#$RR2,$R2
3606
3607 vpunpcklqdq $T3,$T2,$T1
3608 vpunpckhqdq $T3,$T2,$T3
3609
3610
3611
3612 vpsrlq \$24,$T3,$T2
3613 vporq $PAD,$T2,$T2
3614 vpaddq $T2,$H2,$H2
3615 vpandq $mask44,$T1,$T0
3616 vpsrlq \$44,$T1,$T1
3617 vpsllq \$20,$T3,$T3
3618 vporq $T3,$T1,$T1
3619 vpandq $mask44,$T1,$T1
3620
3621 sub \$8,$len
3622 jz .Ltail_vpmadd52_8x
3623 jmp .Loop_vpmadd52_8x
3624
3625 .align 32
3626 .Loop_vpmadd52_8x:
3627
3628 vpaddq $T0,$H0,$H0
3629 vpaddq $T1,$H1,$H1
3630
3631 vpxorq $D0lo,$D0lo,$D0lo
3632 vpmadd52luq $H2,$S1,$D0lo
3633 vpxorq $D0hi,$D0hi,$D0hi
3634 vpmadd52huq $H2,$S1,$D0hi
3635 vpxorq $D1lo,$D1lo,$D1lo
3636 vpmadd52luq $H2,$S2,$D1lo
3637 vpxorq $D1hi,$D1hi,$D1hi
3638 vpmadd52huq $H2,$S2,$D1hi
3639 vpxorq $D2lo,$D2lo,$D2lo
3640 vpmadd52luq $H2,$R0,$D2lo
3641 vpxorq $D2hi,$D2hi,$D2hi
3642 vpmadd52huq $H2,$R0,$D2hi
3643
3644 vmovdqu64 16*0($inp),$T2
3645 vmovdqu64 16*4($inp),$T3
3646 lea 16*8($inp),$inp
3647 vpmadd52luq $H0,$R0,$D0lo
3648 vpmadd52huq $H0,$R0,$D0hi
3649 vpmadd52luq $H0,$R1,$D1lo
3650 vpmadd52huq $H0,$R1,$D1hi
3651 vpmadd52luq $H0,$R2,$D2lo
3652 vpmadd52huq $H0,$R2,$D2hi
3653
3654 vpunpcklqdq $T3,$T2,$T1
3655 vpunpckhqdq $T3,$T2,$T3
3656 vpmadd52luq $H1,$S2,$D0lo
3657 vpmadd52huq $H1,$S2,$D0hi
3658 vpmadd52luq $H1,$R0,$D1lo
3659 vpmadd52huq $H1,$R0,$D1hi
3660 vpmadd52luq $H1,$R1,$D2lo
3661 vpmadd52huq $H1,$R1,$D2hi
3662
3663
3664
3665 vpsrlq \$44,$D0lo,$tmp
3666 vpsllq \$8,$D0hi,$D0hi
3667 vpandq $mask44,$D0lo,$H0
3668 vpaddq $tmp,$D0hi,$D0hi
3669
3670 vpsrlq \$24,$T3,$T2
3671 vporq $PAD,$T2,$T2
3672 vpaddq $D0hi,$D1lo,$D1lo
3673
3674 vpsrlq \$44,$D1lo,$tmp
3675 vpsllq \$8,$D1hi,$D1hi
3676 vpandq $mask44,$D1lo,$H1
3677 vpaddq $tmp,$D1hi,$D1hi
3678
3679 vpandq $mask44,$T1,$T0
3680 vpsrlq \$44,$T1,$T1
3681 vpsllq \$20,$T3,$T3
3682 vpaddq $D1hi,$D2lo,$D2lo
3683
3684 vpsrlq \$42,$D2lo,$tmp
3685 vpsllq \$10,$D2hi,$D2hi
3686 vpandq $mask42,$D2lo,$H2
3687 vpaddq $tmp,$D2hi,$D2hi
3688
3689 vpaddq $T2,$H2,$H2
3690 vpaddq $D2hi,$H0,$H0
3691 vpsllq \$2,$D2hi,$D2hi
3692
3693 vpaddq $D2hi,$H0,$H0
3694 vporq $T3,$T1,$T1
3695 vpandq $mask44,$T1,$T1
3696
3697 vpsrlq \$44,$H0,$tmp
3698 vpandq $mask44,$H0,$H0
3699
3700 vpaddq $tmp,$H1,$H1
3701
3702 sub \$8,$len
3703 jnz .Loop_vpmadd52_8x
3704
3705 .Ltail_vpmadd52_8x:
3706
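	# Last eight blocks: multiply by the per-lane powers ($RR?/$SS?) so the
	# lanes can then be folded into a single 130-bit value.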
3707 vpaddq $T0,$H0,$H0
3708 vpaddq $T1,$H1,$H1
3709
3710 vpxorq $D0lo,$D0lo,$D0lo
3711 vpmadd52luq $H2,$SS1,$D0lo
3712 vpxorq $D0hi,$D0hi,$D0hi
3713 vpmadd52huq $H2,$SS1,$D0hi
3714 vpxorq $D1lo,$D1lo,$D1lo
3715 vpmadd52luq $H2,$SS2,$D1lo
3716 vpxorq $D1hi,$D1hi,$D1hi
3717 vpmadd52huq $H2,$SS2,$D1hi
3718 vpxorq $D2lo,$D2lo,$D2lo
3719 vpmadd52luq $H2,$RR0,$D2lo
3720 vpxorq $D2hi,$D2hi,$D2hi
3721 vpmadd52huq $H2,$RR0,$D2hi
3722
3723 vpmadd52luq $H0,$RR0,$D0lo
3724 vpmadd52huq $H0,$RR0,$D0hi
3725 vpmadd52luq $H0,$RR1,$D1lo
3726 vpmadd52huq $H0,$RR1,$D1hi
3727 vpmadd52luq $H0,$RR2,$D2lo
3728 vpmadd52huq $H0,$RR2,$D2hi
3729
3730 vpmadd52luq $H1,$SS2,$D0lo
3731 vpmadd52huq $H1,$SS2,$D0hi
3732 vpmadd52luq $H1,$RR0,$D1lo
3733 vpmadd52huq $H1,$RR0,$D1hi
3734 vpmadd52luq $H1,$RR1,$D2lo
3735 vpmadd52huq $H1,$RR1,$D2hi
3736
3737
3738
3739
3740 mov \$1,%eax
3741 kmovw %eax,%k1
3742 vpsrldq \$8,$D0lo,$T0
3743 vpsrldq \$8,$D0hi,$H0
3744 vpsrldq \$8,$D1lo,$T1
3745 vpsrldq \$8,$D1hi,$H1
3746 vpaddq $T0,$D0lo,$D0lo
3747 vpaddq $H0,$D0hi,$D0hi
3748 vpsrldq \$8,$D2lo,$T2
3749 vpsrldq \$8,$D2hi,$H2
3750 vpaddq $T1,$D1lo,$D1lo
3751 vpaddq $H1,$D1hi,$D1hi
3752 vpermq \$0x2,$D0lo,$T0
3753 vpermq \$0x2,$D0hi,$H0
3754 vpaddq $T2,$D2lo,$D2lo
3755 vpaddq $H2,$D2hi,$D2hi
3756
3757 vpermq \$0x2,$D1lo,$T1
3758 vpermq \$0x2,$D1hi,$H1
3759 vpaddq $T0,$D0lo,$D0lo
3760 vpaddq $H0,$D0hi,$D0hi
3761 vpermq \$0x2,$D2lo,$T2
3762 vpermq \$0x2,$D2hi,$H2
3763 vpaddq $T1,$D1lo,$D1lo
3764 vpaddq $H1,$D1hi,$D1hi
3765 vextracti64x4 \$1,$D0lo,%y#$T0
3766 vextracti64x4 \$1,$D0hi,%y#$H0
3767 vpaddq $T2,$D2lo,$D2lo
3768 vpaddq $H2,$D2hi,$D2hi
3769
3770 vextracti64x4 \$1,$D1lo,%y#$T1
3771 vextracti64x4 \$1,$D1hi,%y#$H1
3772 vextracti64x4 \$1,$D2lo,%y#$T2
3773 vextracti64x4 \$1,$D2hi,%y#$H2
3774 ___
3775
3776 map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3777 map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3778 map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3779
3780 $code.=<<___;
3781 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3782 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3783 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3784 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3785 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3786 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3787
3788
3789
3790 vpsrlq \$44,$D0lo,$tmp
3791 vpsllq \$8,$D0hi,$D0hi
3792 vpandq $mask44,$D0lo,$H0
3793 vpaddq $tmp,$D0hi,$D0hi
3794
3795 vpaddq $D0hi,$D1lo,$D1lo
3796
3797 vpsrlq \$44,$D1lo,$tmp
3798 vpsllq \$8,$D1hi,$D1hi
3799 vpandq $mask44,$D1lo,$H1
3800 vpaddq $tmp,$D1hi,$D1hi
3801
3802 vpaddq $D1hi,$D2lo,$D2lo
3803
3804 vpsrlq \$42,$D2lo,$tmp
3805 vpsllq \$10,$D2hi,$D2hi
3806 vpandq $mask42,$D2lo,$H2
3807 vpaddq $tmp,$D2hi,$D2hi
3808
3809 vpaddq $D2hi,$H0,$H0
3810 vpsllq \$2,$D2hi,$D2hi
3811
3812 vpaddq $D2hi,$H0,$H0
3813
3814 vpsrlq \$44,$H0,$tmp
3815 vpandq $mask44,$H0,$H0
3816
3817 vpaddq $tmp,$H1,$H1
3818
3819
3820
3821 vmovq %x#$H0,0($ctx)
3822 vmovq %x#$H1,8($ctx)
3823 vmovq %x#$H2,16($ctx)
3824 vzeroall
3825
3826 .Lno_data_vpmadd52_8x:
3827 RET
3828 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3829 ___
3830 }
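# poly1305_emit_base2_44 converts the 44/44/42-bit accumulator back to
# base 2^64, performs the final reduction modulo 2^130-5 and adds the
# nonce to produce the 16-byte tag.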
3831 $code.=<<___;
3832 .type poly1305_emit_base2_44,\@function,3
3833 .align 32
3834 poly1305_emit_base2_44:
3835 mov 0($ctx),%r8
3836 mov 8($ctx),%r9
3837 mov 16($ctx),%r10
3838
3839 mov %r9,%rax
3840 shr \$20,%r9
3841 shl \$44,%rax
3842 mov %r10,%rcx
3843 shr \$40,%r10
3844 shl \$24,%rcx
3845
3846 add %rax,%r8
3847 adc %rcx,%r9
3848 adc \$0,%r10
3849
3850 mov %r8,%rax
3851 add \$5,%r8
3852 mov %r9,%rcx
3853 adc \$0,%r9
3854 adc \$0,%r10
3855 shr \$2,%r10
3856 cmovnz %r8,%rax
3857 cmovnz %r9,%rcx
3858
3859 add 0($nonce),%rax
3860 adc 8($nonce),%rcx
3861 mov %rax,0($mac)
3862 mov %rcx,8($mac)
3863
3864 RET
3865 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3866 ___
3867 } } }
3868 }
3869
3870 if (!$kernel)
3871 {
3872 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :
3873 ("%rdi","%rsi","%rdx","%rcx");
3874 $code.=<<___;
3875 .globl xor128_encrypt_n_pad
3876 .type xor128_encrypt_n_pad,\@abi-omnipotent
3877 .align 16
3878 xor128_encrypt_n_pad:
3879 sub $otp,$inp
3880 sub $otp,$out
3881 mov $len,%r10
3882 shr \$4,$len
3883 jz .Ltail_enc
3884 nop
3885 .Loop_enc_xmm:
3886 movdqu ($inp,$otp),%xmm0
3887 pxor ($otp),%xmm0
3888 movdqu %xmm0,($out,$otp)
3889 movdqa %xmm0,($otp)
3890 lea 16($otp),$otp
3891 dec $len
3892 jnz .Loop_enc_xmm
3893
3894 and \$15,%r10
3895 jz .Ldone_enc
3896
3897 .Ltail_enc:
3898 mov \$16,$len
3899 sub %r10,$len
3900 xor %eax,%eax
3901 .Loop_enc_byte:
3902 mov ($inp,$otp),%al
3903 xor ($otp),%al
3904 mov %al,($out,$otp)
3905 mov %al,($otp)
3906 lea 1($otp),$otp
3907 dec %r10
3908 jnz .Loop_enc_byte
3909
3910 xor %eax,%eax
3911 .Loop_enc_pad:
3912 mov %al,($otp)
3913 lea 1($otp),$otp
3914 dec $len
3915 jnz .Loop_enc_pad
3916
3917 .Ldone_enc:
3918 mov $otp,%rax
3919 RET
3920 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3921
3922 .globl xor128_decrypt_n_pad
3923 .type xor128_decrypt_n_pad,\@abi-omnipotent
3924 .align 16
3925 xor128_decrypt_n_pad:
3926 sub $otp,$inp
3927 sub $otp,$out
3928 mov $len,%r10
3929 shr \$4,$len
3930 jz .Ltail_dec
3931 nop
3932 .Loop_dec_xmm:
3933 movdqu ($inp,$otp),%xmm0
3934 movdqa ($otp),%xmm1
3935 pxor %xmm0,%xmm1
3936 movdqu %xmm1,($out,$otp)
3937 movdqa %xmm0,($otp)
3938 lea 16($otp),$otp
3939 dec $len
3940 jnz .Loop_dec_xmm
3941
3942 pxor %xmm1,%xmm1
3943 and \$15,%r10
3944 jz .Ldone_dec
3945
3946 .Ltail_dec:
3947 mov \$16,$len
3948 sub %r10,$len
3949 xor %eax,%eax
3950 xor %r11d,%r11d
3951 .Loop_dec_byte:
3952 mov ($inp,$otp),%r11b
3953 mov ($otp),%al
3954 xor %r11b,%al
3955 mov %al,($out,$otp)
3956 mov %r11b,($otp)
3957 lea 1($otp),$otp
3958 dec %r10
3959 jnz .Loop_dec_byte
3960
3961 xor %eax,%eax
3962 .Loop_dec_pad:
3963 mov %al,($otp)
3964 lea 1($otp),$otp
3965 dec $len
3966 jnz .Loop_dec_pad
3967
3968 .Ldone_dec:
3969 mov $otp,%rax
3970 RET
3971 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3972 ___
3973 }
3974
3975
3976
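# Win64 SEH support: se_handler restores the general-purpose registers
# saved by the scalar prologues, avx_handler additionally copies
# %xmm6-%xmm15 back into the CONTEXT record; the .pdata/.xdata tables
# below associate the handlers with their code ranges.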
3977 if ($win64) {
3978 $rec="%rcx";
3979 $frame="%rdx";
3980 $context="%r8";
3981 $disp="%r9";
3982
3983 $code.=<<___;
3984 .extern __imp_RtlVirtualUnwind
3985 .type se_handler,\@abi-omnipotent
3986 .align 16
3987 se_handler:
3988 push %rsi
3989 push %rdi
3990 push %rbx
3991 push %rbp
3992 push %r12
3993 push %r13
3994 push %r14
3995 push %r15
3996 pushfq
3997 sub \$64,%rsp
3998
3999 mov 120($context),%rax
4000 mov 248($context),%rbx
4001
4002 mov 8($disp),%rsi
4003 mov 56($disp),%r11
4004
4005 mov 0(%r11),%r10d
4006 lea (%rsi,%r10),%r10
4007 cmp %r10,%rbx
4008 jb .Lcommon_seh_tail
4009
4010 mov 152($context),%rax
4011
4012 mov 4(%r11),%r10d
4013 lea (%rsi,%r10),%r10
4014 cmp %r10,%rbx
4015 jae .Lcommon_seh_tail
4016
4017 lea 48(%rax),%rax
4018
4019 mov -8(%rax),%rbx
4020 mov -16(%rax),%rbp
4021 mov -24(%rax),%r12
4022 mov -32(%rax),%r13
4023 mov -40(%rax),%r14
4024 mov -48(%rax),%r15
4025 mov %rbx,144($context)
4026 mov %rbp,160($context)
4027 mov %r12,216($context)
4028 mov %r13,224($context)
4029 mov %r14,232($context)
4030 mov %r15,240($context)
4031
4032 jmp .Lcommon_seh_tail
4033 .size se_handler,.-se_handler
4034
4035 .type avx_handler,\@abi-omnipotent
4036 .align 16
4037 avx_handler:
4038 push %rsi
4039 push %rdi
4040 push %rbx
4041 push %rbp
4042 push %r12
4043 push %r13
4044 push %r14
4045 push %r15
4046 pushfq
4047 sub \$64,%rsp
4048
4049 mov 120($context),%rax
4050 mov 248($context),%rbx
4051
4052 mov 8($disp),%rsi
4053 mov 56($disp),%r11
4054
4055 mov 0(%r11),%r10d
4056 lea (%rsi,%r10),%r10
4057 cmp %r10,%rbx
4058 jb .Lcommon_seh_tail
4059
4060 mov 152($context),%rax
4061
4062 mov 4(%r11),%r10d
4063 lea (%rsi,%r10),%r10
4064 cmp %r10,%rbx
4065 jae .Lcommon_seh_tail
4066
4067 mov 208($context),%rax
4068
4069 lea 0x50(%rax),%rsi
4070 lea 0xf8(%rax),%rax
4071 lea 512($context),%rdi
4072 mov \$20,%ecx
4073 .long 0xa548f3fc
4074
4075 .Lcommon_seh_tail:
4076 mov 8(%rax),%rdi
4077 mov 16(%rax),%rsi
4078 mov %rax,152($context)
4079 mov %rsi,168($context)
4080 mov %rdi,176($context)
4081
4082 mov 40($disp),%rdi
4083 mov $context,%rsi
4084 mov \$154,%ecx
4085 .long 0xa548f3fc
4086
4087 mov $disp,%rsi
4088 xor %ecx,%ecx
4089 mov 8(%rsi),%rdx
4090 mov 0(%rsi),%r8
4091 mov 16(%rsi),%r9
4092 mov 40(%rsi),%r10
4093 lea 56(%rsi),%r11
4094 lea 24(%rsi),%r12
4095 mov %r10,32(%rsp)
4096 mov %r11,40(%rsp)
4097 mov %r12,48(%rsp)
4098 mov %rcx,56(%rsp)
4099 call *__imp_RtlVirtualUnwind(%rip)
4100
4101 mov \$1,%eax
4102 add \$64,%rsp
4103 popfq
4104 pop %r15
4105 pop %r14
4106 pop %r13
4107 pop %r12
4108 pop %rbp
4109 pop %rbx
4110 pop %rdi
4111 pop %rsi
4112 RET
4113 .size avx_handler,.-avx_handler
4114
4115 .section .pdata
4116 .align 4
4117 .rva .LSEH_begin_poly1305_init_x86_64
4118 .rva .LSEH_end_poly1305_init_x86_64
4119 .rva .LSEH_info_poly1305_init_x86_64
4120
4121 .rva .LSEH_begin_poly1305_blocks_x86_64
4122 .rva .LSEH_end_poly1305_blocks_x86_64
4123 .rva .LSEH_info_poly1305_blocks_x86_64
4124
4125 .rva .LSEH_begin_poly1305_emit_x86_64
4126 .rva .LSEH_end_poly1305_emit_x86_64
4127 .rva .LSEH_info_poly1305_emit_x86_64
4128 ___
4129 $code.=<<___ if ($avx);
4130 .rva .LSEH_begin_poly1305_blocks_avx
4131 .rva .Lbase2_64_avx
4132 .rva .LSEH_info_poly1305_blocks_avx_1
4133
4134 .rva .Lbase2_64_avx
4135 .rva .Leven_avx
4136 .rva .LSEH_info_poly1305_blocks_avx_2
4137
4138 .rva .Leven_avx
4139 .rva .LSEH_end_poly1305_blocks_avx
4140 .rva .LSEH_info_poly1305_blocks_avx_3
4141
4142 .rva .LSEH_begin_poly1305_emit_avx
4143 .rva .LSEH_end_poly1305_emit_avx
4144 .rva .LSEH_info_poly1305_emit_avx
4145 ___
4146 $code.=<<___ if ($avx>1);
4147 .rva .LSEH_begin_poly1305_blocks_avx2
4148 .rva .Lbase2_64_avx2
4149 .rva .LSEH_info_poly1305_blocks_avx2_1
4150
4151 .rva .Lbase2_64_avx2
4152 .rva .Leven_avx2
4153 .rva .LSEH_info_poly1305_blocks_avx2_2
4154
4155 .rva .Leven_avx2
4156 .rva .LSEH_end_poly1305_blocks_avx2
4157 .rva .LSEH_info_poly1305_blocks_avx2_3
4158 ___
4159 $code.=<<___ if ($avx>2);
4160 .rva .LSEH_begin_poly1305_blocks_avx512
4161 .rva .LSEH_end_poly1305_blocks_avx512
4162 .rva .LSEH_info_poly1305_blocks_avx512
4163 ___
4164 $code.=<<___;
4165 .section .xdata
4166 .align 8
4167 .LSEH_info_poly1305_init_x86_64:
4168 .byte 9,0,0,0
4169 .rva se_handler
4170 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
4171
4172 .LSEH_info_poly1305_blocks_x86_64:
4173 .byte 9,0,0,0
4174 .rva se_handler
4175 .rva .Lblocks_body,.Lblocks_epilogue
4176
4177 .LSEH_info_poly1305_emit_x86_64:
4178 .byte 9,0,0,0
4179 .rva se_handler
4180 .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
4181 ___
4182 $code.=<<___ if ($avx);
4183 .LSEH_info_poly1305_blocks_avx_1:
4184 .byte 9,0,0,0
4185 .rva se_handler
4186 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue
4187
4188 .LSEH_info_poly1305_blocks_avx_2:
4189 .byte 9,0,0,0
4190 .rva se_handler
4191 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue
4192
4193 .LSEH_info_poly1305_blocks_avx_3:
4194 .byte 9,0,0,0
4195 .rva avx_handler
4196 .rva .Ldo_avx_body,.Ldo_avx_epilogue
4197
4198 .LSEH_info_poly1305_emit_avx:
4199 .byte 9,0,0,0
4200 .rva se_handler
4201 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4202 ___
4203 $code.=<<___ if ($avx>1);
4204 .LSEH_info_poly1305_blocks_avx2_1:
4205 .byte 9,0,0,0
4206 .rva se_handler
4207 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue
4208
4209 .LSEH_info_poly1305_blocks_avx2_2:
4210 .byte 9,0,0,0
4211 .rva se_handler
4212 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue
4213
4214 .LSEH_info_poly1305_blocks_avx2_3:
4215 .byte 9,0,0,0
4216 .rva avx_handler
4217 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue
4218 ___
4219 $code.=<<___ if ($avx>2);
4220 .LSEH_info_poly1305_blocks_avx512:
4221 .byte 9,0,0,0
4222 .rva avx_handler
4223 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue
4224 ___
4225 }
4226
4227 open SELF,$0;
4228 while(<SELF>) {
4229 next if (/^#!/); # skip the shebang line
4230 last if (!s/^#/\/\// and !/^$/);
4231 print;
4232 }
4233 close SELF;
4234
4235 foreach (split('\n',$code)) {
4236 s/\`([^\`]*)\`/eval($1)/ge;
4237 s/%r([a-z]+)#d/%e$1/g;
4238 s/%r([0-9]+)#d/%r$1d/g;
4239 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4240
4241 if ($kernel) {
4242 s/(^\.type.*),[0-9]+$/\1/;
4243 s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
4244 next if /^\.cfi.*/;
4245 }
4246
4247 print $_,"\n";
4248 }
4249 close STDOUT;