#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# The scalar code path processes one 16-byte block at a time with
# 64x64-bit multiply-accumulate in general-purpose registers; the
# NEON path switches the accumulator to base 2^26 and processes four
# blocks per iteration against precomputed key powers r^1..r^4.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
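
# The flavour argument selects the assembler dialect (e.g. linux64 or
# ios64) and routes the generated code through arm-xlate.pl, which
# adapts it to the target toolchain; "void" skips translation and
# writes the raw perlasm output. A typical invocation (flavour names
# are whatever the local arm-xlate.pl understands) would be:
#
#	perl poly1305-armv8.pl linux64 poly1305-core.S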

my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
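
# Scalar layout: $h0-$h2 carry the 130-bit accumulator in base 2^64,
# $r0-$r1 the clamped key, and $s1 holds r1 + (r1>>2). Since clamping
# clears the low two bits of r1, s1 equals 5*r1/4 exactly, and
# multiplying by it reduces the h1*r1 cross term on the fly, because
# 2^128 == 5/4 (mod 2^130-5). Context layout assumed by the code
# below: hash at [$ctx], is_base2_26 flag at [$ctx,#24], key at
# [$ctx,#32], base 2^26 key powers from [$ctx,#48].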

$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

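	// The three and instructions above implement the Poly1305 key
	// clamping from the spec (RFC 7539):
	// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff. Every 32-bit word
	// of r keeps its top 4 bits clear, and the three upper words
	// also clear their low 2 bits, which is what makes the
	// s1 = 5*r1/4 trick in poly1305_blocks exact.
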
#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq
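
	// Conversion note: with 26-bit limbs l0..l4 the value is
	// l0 + l1*2^26 + l2*2^52 + l3*2^78 + l4*2^104, so the 64-bit
	// words are
	//	d0 = l0 | l1<<26 | l2<<52
	//	d1 = l2>>12 | l3<<14 | l4<<40
	//	d2 = l4>>24 (plus carries from d0 and d1)
	// which is the add/lsl/lsr chain above.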

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

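	// d2:d1:d0 now holds the accumulated product h*r. Since
	// 2^130 == 5 (mod 2^130-5), the bits at 2^130 and above,
	// t = d2>>2, fold back in as 5*t = 4*t + t, computed below as
	// (d2 & -4) + (d2 >> 2); only d2 & 3 stays in h2.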
	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

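	// d2:d1:d0 = h + 5. If h >= 2^130 - 5 the sum reaches 2^130,
	// making d2 & -4 non-zero; in that case d1:d0 = h + 5 - 2^130,
	// i.e. h - p reduced into 128 bits, is the canonical result,
	// otherwise h already was.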
	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;
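
# NEON layout: values are five 26-bit limbs, with two block streams
# interleaved in vector lanes ("NEON crypto"-style 2-way parallelism).
# $IN01_*/$IN23_* hold blocks 0:1 and 2:3 of each 64-byte chunk,
# $ACC* are per-lane 64-bit accumulators, $H* carry the hash between
# iterations, and $MASK is the limb mask 2^26-1. $R*/$S* hold lanes
# of the r^1..r^4 power table, $S* being the 5*r multiples.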

$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
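
	// poly1305_mult is the same multiply-and-reduce as the body of
	// .Loop in poly1305_blocks (h *= r0:r1 mod 2^130-5); the NEON
	// path calls it to advance the running key power when building
	// the r^1..r^4 table and to absorb stray scalar blocks.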

.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat
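
	// poly1305_splat scatters one base 2^26 key power into the
	// table at 16-byte stride, one 32-bit lane per call: callers
	// start at [$ctx,#48+12] and step back #4 per power, so lane 3
	// of each slot ends up r^1 and lane 0 r^4, alongside the
	// precomputed 5*r_i values used for on-the-fly reduction.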

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[$ctx,#48]		// first table element
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$inp,#32]	// inp[2:3]
	subs	$len,$len,#64
	ldp	x9,x13,[$inp,#48]
	add	$in2,$inp,#96
	adr	$zeros,.Lzeros

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
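	//
	// The 5*r_i multiples implement reduction on the fly: limb i
	// of h times limb j of r carries weight 2^(26*(i+j)), and for
	// i+j >= 5 the factor 2^130 == 5 (mod 2^130-5) folds it back
	// as 5*r_j at weight 2^(26*(i+j-5)).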

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2
	ldr	x30,[sp,#8]

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	mov	x4,#1
	st1	{$ACC4}[0],[$ctx]
	str	x4,[$ctx,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___
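
# Post-process the generated code: the substitutions below fix up
# arrangement specifiers for the mixed 64/32-bit NEON instructions
# (shrn, dup, umull/umlal and their "2" forms, lane-indexed st1/st4),
# and rewrite the w#xN aliasing trick used above into plain wN
# register names.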

foreach (split("\n",$code)) {
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}
close STDOUT;