#! /usr/bin/env perl
#
# Poly1305 hash for ARMv4/NEON, CRYPTOGAMS by @dot-asm.
#
0023 $flavour = shift;
0024 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
0025 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
0026
0027 if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
0029 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
0030 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
0031 die "can't locate arm-xlate.pl";
0032
0033 open STDOUT,"| \"$^X\" $xlate $flavour $output";
0034 } else {
0035 open STDOUT,">$output";
}
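# The two arguments handled above follow the usual CRYPTOGAMS convention:
# a "flavour" (e.g. linux32 or ios32) that tells arm-xlate.pl how to
# post-process the code, and an output file name. A hypothetical invocation
# (file name is illustrative only) would look like:
#
#   perl poly1305-armv4.pl linux32 poly1305-core.S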
0037
0038 ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
0039
0040 $code.=<<___;
0041
0042
0043
0044
0045
0046
0047
0048
0049 .globl poly1305_blocks_neon
0050
0051
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
0058
0059 .text
0060
0061 .globl poly1305_emit
0062 .globl poly1305_blocks
0063 .globl poly1305_init
0064 .type poly1305_init,%function
0065 .align 5
0066 poly1305_init:
0067 .Lpoly1305_init:
0068 stmdb sp!,{r4-r11}
0069
0070 eor r3,r3,r3
cmp $inp,#0
str r3,[$ctx,#0] @ zero hash value
str r3,[$ctx,#4]
str r3,[$ctx,#8]
str r3,[$ctx,#12]
str r3,[$ctx,#16]
str r3,[$ctx,#36] @ clear is_base2_26
add $ctx,$ctx,#20
0079
0080
0081 it eq
0082
moveq r0,#0
0084 beq .Lno_key
0085
0086
mov r3,#-1
str r3,[$ctx,#28] @ impossible key power value
0089
0090 adr r11,.Lpoly1305_init
0091 ldr r12,.LOPENSSL_armcap
0092
0093
ldrb r4,[$inp,#0]
mov r10,#0x0fffffff
ldrb r5,[$inp,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[$inp,#2]
ldrb r7,[$inp,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[$inp,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[$inp,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[$inp,#6]
and r4,r4,r10
0107
0108
0109
0110 ldr r12,[r11,r12] @ OPENSSL_armcap_P
0111
0112
0113 ldr r12,[r12]
0114
0115
ldrb r8,[$inp,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[$inp,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[$inp,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[$inp,#10]
and r5,r5,r3
0124
0125
tst r12,#ARMV7_NEON @ check for NEON
# ifdef __thumb2__
adr r9,.Lpoly1305_blocks_neon
adr r11,.Lpoly1305_blocks
it ne
movne r11,r9
adr r12,.Lpoly1305_emit
orr r11,r11,#1 @ thumb-ify addresses
orr r12,r12,#1
# else
add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
ite eq
addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
0141
ldrb r9,[$inp,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[$inp,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[$inp,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[$inp,#14]
and r6,r6,r3
0150
ldrb r10,[$inp,#15]
orr r7,r7,r8,lsl#8
str r4,[$ctx,#0]
orr r7,r7,r9,lsl#16
str r5,[$ctx,#4]
orr r7,r7,r10,lsl#24
str r6,[$ctx,#8]
and r7,r7,r3
str r7,[$ctx,#12]
0160
0161 stmia r2,{r11,r12} @ fill functions table
mov r0,#1
0163
mov r0,#0
0165
0166 .Lno_key:
0167 ldmia sp!,{r4-r11}
0168
0169 ret @ bx lr
0170
tst lr,#1
0172 moveq pc,lr @ be binary compatible with V4, yet
0173 bx lr @ interoperable with Thumb ISA:-)
0174
0175 .size poly1305_init,.-poly1305_init
0176 ___
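# The ldrb/and sequence above clamps the 16-byte "r" portion of the key
# exactly as Poly1305 requires: the top four bits of every word and the
# low two bits of words 1..3 are cleared. A minimal pure-Perl sketch of
# the same clamping (illustrative only, not used by this script):
sub poly1305_clamp_ref {
	my @k = unpack("V4", shift);		# first 16 key bytes, little-endian words
	$k[0] &= 0x0fffffff;			# word 0: clear top 4 bits
	$k[$_] &= 0x0ffffffc for (1..3);	# words 1-3: also clear low 2 bits
	return @k;
}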
0177 {
0178 my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
0179 my ($s1,$s2,$s3)=($r1,$r2,$r3);
0180
0181 $code.=<<___;
0182 .type poly1305_blocks,%function
0183 .align 5
0184 poly1305_blocks:
0185 .Lpoly1305_blocks:
0186 stmdb sp!,{r3-r11,lr}
0187
ands $len,$len,#-16
0189 beq .Lno_data
0190
0191 add $len,$len,$inp @ end pointer
0192 sub sp,sp,
0193
0194
0195 ldmia $ctx,{$h0-$r3} @ load context
0196 add $ctx,$ctx,
0197 str $len,[sp,
0198 str $ctx,[sp,
0199
0200 ldr lr,[$ctx,
0201 ldmia $ctx!,{$h0-$h4} @ load hash value
0202 str $len,[sp,
0203 str $ctx,[sp,
0204
adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
mov $r1,$h1,lsr#6
adcs $r1,$r1,$h2,lsl#20
mov $r2,$h2,lsr#12
adcs $r2,$r2,$h3,lsl#14
mov $r3,$h3,lsr#18
adcs $r3,$r3,$h4,lsl#8
mov $len,#0
teq lr,#0
str $len,[$ctx,#16] @ clear is_base2_26
adc $len,$len,$h4,lsr#24
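@ The adds/adcs chain above folds the five 26-bit limbs back into
@ four 32-bit words plus a carry word, i.e. conceptually:
@   w0 =  w0? no --  h0            | h1<<26
@   w1 = h1>>6   | h2<<20  (+ carry)
@   w2 = h2>>12  | h3<<14  (+ carry)
@   w3 = h3>>18  | h4<<8   (+ carry)
@   w4 = h4>>24            (+ carry)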
0216
0217 itttt ne
0218 movne $h0,$r0 @ choose between radixes
0219 movne $h1,$r1
0220 movne $h2,$r2
0221 movne $h3,$r3
0222 ldmia $ctx,{$r0-$r3} @ load key
0223 it ne
0224 movne $h4,$len
0225
0226
0227 mov lr,$inp
0228 cmp $padbit,
0229 str $r1,[sp,
0230 str $r2,[sp,
0231 str $r3,[sp,
0232 b .Loop
0233
0234 .align 4
0235 .Loop:
0236
0237 ldrb r0,[lr],
0238
0239 it hi
0240
0241 addhi $h4,$h4,
0242 ldrb r1,[lr,
0243 ldrb r2,[lr,
0244 ldrb r3,[lr,
0245 orr r1,r0,r1,lsl
0246 ldrb r0,[lr,
0247 orr r2,r1,r2,lsl
0248 ldrb r1,[lr,
0249 orr r3,r2,r3,lsl
0250 ldrb r2,[lr,
0251 adds $h0,$h0,r3 @ accumulate input
0252
0253 ldrb r3,[lr,
0254 orr r1,r0,r1,lsl
0255 ldrb r0,[lr,
0256 orr r2,r1,r2,lsl
0257 ldrb r1,[lr,
0258 orr r3,r2,r3,lsl
0259 ldrb r2,[lr,
0260 adcs $h1,$h1,r3
0261
0262 ldrb r3,[lr,
0263 orr r1,r0,r1,lsl
0264 ldrb r0,[lr,
0265 orr r2,r1,r2,lsl
0266 ldrb r1,[lr,
0267 orr r3,r2,r3,lsl
0268 ldrb r2,[lr,
0269 adcs $h2,$h2,r3
0270
0271 ldrb r3,[lr,
0272 orr r1,r0,r1,lsl
0273 str lr,[sp,
0274 orr r2,r1,r2,lsl
add $s1,$r1,$r1,lsr#2
0276 orr r3,r2,r3,lsl
0277
0278 ldr r0,[lr],
0279 it hi
0280 addhi $h4,$h4,
0281 ldr r1,[lr,
0282 ldr r2,[lr,
0283 ldr r3,[lr,
0284
0285 rev r0,r0
0286 rev r1,r1
0287 rev r2,r2
0288 rev r3,r3
0289
0290 adds $h0,$h0,r0 @ accumulate input
0291 str lr,[sp,
0292 adcs $h1,$h1,r1
add $s1,$r1,$r1,lsr#2
adcs $h2,$h2,r2

add $s2,$r2,$r2,lsr#2
adcs $h3,$h3,r3
add $s3,$r3,$r3,lsr#2
0299
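@ The three "s" values computed above are the corresponding key words
@ scaled by 5/4 (the key is clamped so their low two bits are clear,
@ hence the >>2 is exact). Products that wrap past 2^130 pick up a
@ factor of 5/4, since 2^128 = 5/4 (mod 2^130-5), so multiplying by
@ the pre-scaled values absorbs the reduction for free.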
0300 umull r2,r3,$h1,$r0
adc $h4,$h4,#0
0302 umull r0,r1,$h0,$r0
0303 umlal r2,r3,$h4,$s1
0304 umlal r0,r1,$h3,$s1
0305 ldr $r1,[sp,
0306 umlal r2,r3,$h2,$s3
0307 umlal r0,r1,$h1,$s3
0308 umlal r2,r3,$h3,$s2
0309 umlal r0,r1,$h2,$s2
0310 umlal r2,r3,$h0,$r1
0311 str r0,[sp,
0312 mul r0,$s2,$h4
0313 ldr $r2,[sp,
0314 adds r2,r2,r1 @ d1+=d0>>32
0315 eor r1,r1,r1
0316 adc lr,r3,
0317 str r2,[sp,
0318
0319 mul r2,$s3,$h4
0320 eor r3,r3,r3
0321 umlal r0,r1,$h3,$s3
0322 ldr $r3,[sp,
0323 umlal r2,r3,$h3,$r0
0324 umlal r0,r1,$h2,$r0
0325 umlal r2,r3,$h2,$r1
0326 umlal r0,r1,$h1,$r1
0327 umlal r2,r3,$h1,$r2
0328 umlal r0,r1,$h0,$r2
0329 umlal r2,r3,$h0,$r3
0330 ldr $h0,[sp,
0331 mul $h4,$r0,$h4
0332 ldr $h1,[sp,
0333
0334 adds $h2,lr,r0 @ d2+=d1>>32
0335 ldr lr,[sp,
0336 adc r1,r1,
0337 adds $h3,r2,r1 @ d3+=d2>>32
0338 ldr r0,[sp,
0339 adc r3,r3,
0340 add $h4,$h4,r3 @ h4+=d3>>32
0341
and r1,$h4,#-4 @ bits above 2^130
and $h4,$h4,#3
add r1,r1,r1,lsr#2 @ (h4&~3) + (h4>>2) = 5*(h4>>2)
adds $h0,$h0,r1 @ fold back, 2^130 = 5 mod 2^130-5
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adcs $h3,$h3,#0
adc $h4,$h4,#0
0350
0351 cmp r0,lr @ done yet?
0352 bhi .Loop
0353
0354 ldr $ctx,[sp,
0355 add sp,sp,
0356 stmdb $ctx,{$h0-$h4} @ store the result
0357
0358 .Lno_data:
0359
0360 ldmia sp!,{r3-r11,pc}
0361
0362 ldmia sp!,{r3-r11,lr}
tst lr,#1
0364 moveq pc,lr @ be binary compatible with V4, yet
0365 bx lr @ interoperable with Thumb ISA:-)
0366
0367 .size poly1305_blocks,.-poly1305_blocks
0368 ___
0369 }
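# For reference, poly1305_blocks above computes h = (h + m) * r mod 2^130-5
# for every 16-byte block m, with the pad bit appended at bit 128. A bignum
# sketch of a single block update (illustrative only, not used by this
# script; $h and $r are Math::BigInt values, $block is a 16-byte string):
use Math::BigInt;
sub poly1305_block_ref {
	my ($h, $r, $block, $padbit) = @_;
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);		# 2^130-5
	my $m = Math::BigInt->new("0x".unpack("H*", scalar reverse $block));
	$m->badd(Math::BigInt->new(2)->bpow(128)) if $padbit;		# append pad bit
	return $h->copy->badd($m)->bmul($r)->bmod($p);
}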
0370 {
0371 my ($ctx,$mac,$nonce)=map("r$_",(0..2));
0372 my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
0373 my $g4=$ctx;
0374
0375 $code.=<<___;
0376 .type poly1305_emit,%function
0377 .align 5
0378 poly1305_emit:
0379 .Lpoly1305_emit:
0380 stmdb sp!,{r4-r11}
0381
0382 ldmia $ctx,{$h0-$h4}
0383
0384
0385 ldr ip,[$ctx,
0386
adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
mov $g1,$h1,lsr#6
adcs $g1,$g1,$h2,lsl#20
mov $g2,$h2,lsr#12
adcs $g2,$g2,$h3,lsl#14
mov $g3,$h3,lsr#18
adcs $g3,$g3,$h4,lsl#8
mov $g4,#0
adc $g4,$g4,$h4,lsr#24
0396
0397 tst ip,ip
0398 itttt ne
0399 movne $h0,$g0
0400 movne $h1,$g1
0401 movne $h2,$g2
0402 movne $h3,$g3
0403 it ne
0404 movne $h4,$g4
0405
0406
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adcs $g2,$h2,#0
adcs $g3,$h3,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
0413
0414
0415 it ne
0416
0417 movne $h0,$g0
0418 ldr $g0,[$nonce,
0419
0420 it ne
0421
0422 movne $h1,$g1
0423 ldr $g1,[$nonce,
0424
0425 it ne
0426
0427 movne $h2,$g2
0428 ldr $g2,[$nonce,
0429
0430 it ne
0431
0432 movne $h3,$g3
0433 ldr $g3,[$nonce,
0434
0435 adds $h0,$h0,$g0
0436 adcs $h1,$h1,$g1
0437 adcs $h2,$h2,$g2
0438 adc $h3,$h3,$g3
0439
0440
0441
0442 rev $h0,$h0
0443 rev $h1,$h1
0444 rev $h2,$h2
0445 rev $h3,$h3
0446
0447 str $h0,[$mac,
0448 str $h1,[$mac,
0449 str $h2,[$mac,
0450 str $h3,[$mac,
0451
0452 strb $h0,[$mac,
0453 mov $h0,$h0,lsr
0454 strb $h1,[$mac,
0455 mov $h1,$h1,lsr
0456 strb $h2,[$mac,
0457 mov $h2,$h2,lsr
0458 strb $h3,[$mac,
0459 mov $h3,$h3,lsr
0460
0461 strb $h0,[$mac,
0462 mov $h0,$h0,lsr
0463 strb $h1,[$mac,
0464 mov $h1,$h1,lsr
0465 strb $h2,[$mac,
0466 mov $h2,$h2,lsr
0467 strb $h3,[$mac,
0468 mov $h3,$h3,lsr
0469
0470 strb $h0,[$mac,
0471 mov $h0,$h0,lsr
0472 strb $h1,[$mac,
0473 mov $h1,$h1,lsr
0474 strb $h2,[$mac,
0475 mov $h2,$h2,lsr
0476 strb $h3,[$mac,
0477 mov $h3,$h3,lsr
0478
0479 strb $h0,[$mac,
0480 strb $h1,[$mac,
0481 strb $h2,[$mac,
0482 strb $h3,[$mac,
0483
0484 ldmia sp!,{r4-r11}
0485
0486 ret @ bx lr
0487
tst lr,#1
0489 moveq pc,lr @ be binary compatible with V4, yet
0490 bx lr @ interoperable with Thumb ISA:-)
0491
0492 .size poly1305_emit,.-poly1305_emit
0493 ___
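# Likewise, poly1305_emit above freezes the accumulator and produces the
# tag: reduce h modulo 2^130-5, add the 128-bit little-endian nonce and
# keep the low 128 bits. A bignum sketch (illustrative only; $h is a
# Math::BigInt, $nonce a 16-byte string, the result is the 16-byte tag):
use Math::BigInt;
sub poly1305_tag_ref {
	my ($h, $nonce) = @_;
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
	my $n = Math::BigInt->new("0x".unpack("H*", scalar reverse $nonce));
	my $t = $h->copy->bmod($p)->badd($n)
		    ->bmod(Math::BigInt->new(2)->bpow(128));
	(my $hex = $t->as_hex) =~ s/^0x//;
	$hex = ("0" x (32 - length($hex))) . $hex;	# pad to 16 bytes
	return scalar reverse pack("H32", $hex);	# little-endian tag
}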
0494 {
0495 my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
0496 my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
0497 my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
0498
0499 my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
0500
0501 $code.=<<___;
#if __ARM_MAX_ARCH__>=7
0503 .fpu neon
0504
0505 .type poly1305_init_neon,%function
0506 .align 5
0507 poly1305_init_neon:
0508 .Lpoly1305_init_neon:
0509 ldr r3,[$ctx,
0510 cmp r3,
0511 bne .Lno_init_neon
0512
ldr r4,[$ctx,#20] @ load key base 2^32
ldr r5,[$ctx,#24]
ldr r6,[$ctx,#28]
ldr r7,[$ctx,#32]
0517
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
and r3,r3,#0x03ffffff
and r4,r4,#0x03ffffff
and r5,r5,#0x03ffffff
0529
vdup.32 $R0,r2 @ r^1 in both lanes
add r2,r3,r3,lsl#2 @ *5
vdup.32 $R1,r3
add r3,r4,r4,lsl#2
vdup.32 $S1,r2
vdup.32 $R2,r4
add r4,r5,r5,lsl#2
vdup.32 $S2,r3
vdup.32 $R3,r5
add r5,r6,r6,lsl#2
vdup.32 $S3,r4
vdup.32 $R4,r6
vdup.32 $S4,r5
0543
mov $zeros,#2 @ counter
0545
0546 .Lsquare_neon:
0547 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0548 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
0549 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
0550 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
0551 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
0552 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
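@ The *5 factors come from the reduction 2^130 = 5 (mod 2^130-5):
@ a product of limbs i and j with i+j >= 5 wraps around and lands
@ in limb i+j-5 multiplied by 5, which is why 5*r1..5*r4 are kept
@ pre-computed alongside r1..r4.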
0553
0554 vmull.u32 $D0,$R0,${R0}[1]
0555 vmull.u32 $D1,$R1,${R0}[1]
0556 vmull.u32 $D2,$R2,${R0}[1]
0557 vmull.u32 $D3,$R3,${R0}[1]
0558 vmull.u32 $D4,$R4,${R0}[1]
0559
0560 vmlal.u32 $D0,$R4,${S1}[1]
0561 vmlal.u32 $D1,$R0,${R1}[1]
0562 vmlal.u32 $D2,$R1,${R1}[1]
0563 vmlal.u32 $D3,$R2,${R1}[1]
0564 vmlal.u32 $D4,$R3,${R1}[1]
0565
0566 vmlal.u32 $D0,$R3,${S2}[1]
0567 vmlal.u32 $D1,$R4,${S2}[1]
0568 vmlal.u32 $D3,$R1,${R2}[1]
0569 vmlal.u32 $D2,$R0,${R2}[1]
0570 vmlal.u32 $D4,$R2,${R2}[1]
0571
0572 vmlal.u32 $D0,$R2,${S3}[1]
0573 vmlal.u32 $D3,$R0,${R3}[1]
0574 vmlal.u32 $D1,$R3,${S3}[1]
0575 vmlal.u32 $D2,$R4,${S3}[1]
0576 vmlal.u32 $D4,$R1,${R3}[1]
0577
0578 vmlal.u32 $D3,$R4,${S4}[1]
0579 vmlal.u32 $D0,$R1,${S4}[1]
0580 vmlal.u32 $D1,$R2,${S4}[1]
0581 vmlal.u32 $D2,$R3,${S4}[1]
0582 vmlal.u32 $D4,$R0,${R4}[1]
0583
0584 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0585 @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
0586 @ and P. Schwabe
0587 @
0588 @ H0>>+H1>>+H2>>+H3>>+H4
0589 @ H3>>+H4>>*5+H0>>+H1
0590 @
0591 @ Trivia.
0592 @
@ Result of multiplication of n-bit number by m-bit number is
@ n+m bits wide. Sum of two n-bit numbers is n+1 bits wide,
@ sum of three - n+2, and so is sum of four. Sum of 2^n and an
@ n-bit number is n+1 bits wide.
0600 @
0601 @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
0602 @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
0603 @ can be 27. However! In cases when their width exceeds 26 bits
0604 @ they are limited by 2^26+2^6. This in turn means that *sum*
0605 @ of the products with these values can still be viewed as sum
0606 @ of 52-bit numbers as long as the amount of addends is not a
0607 @ power of 2. For example,
0608 @
0609 @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
0610 @
0611 @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
0612 @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
@ 8 * (2^52) or 2^55. However, the value is then multiplied
@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
0615 @ which is less than 32 * (2^52) or 2^57. And when processing
0616 @ data we are looking at triple as many addends...
0617 @
0618 @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
0619 @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
0620 @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
0621 @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
0622 @ instruction accepts 2x32-bit input and writes 2x64-bit result.
@ This means that the result of reduction has to be compressed upon
0624 @ loop wrap-around. This can be done in the process of reduction
0625 @ to minimize amount of instructions [as well as amount of
0626 @ 128-bit instructions, which benefits low-end processors], but
0627 @ one has to watch for H2 (which is narrower than H0) and 5*H4
0628 @ not being wider than 58 bits, so that result of right shift
0629 @ by 26 bits fits in 32 bits. This is also useful on x86,
@ because it allows using paddd in place of paddq, which
0631 @ benefits Atom, where paddq is ridiculously slow.
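@ The carry chain below runs h3->h4 and h0->h1 first, then h4->h0
@ (the wrapping carry is added once plus once shifted left by 2,
@ i.e. multiplied by 5) and h1->h2, then h2->h3, and finally
@ h0->h1 and h3->h4 once more, leaving all limbs essentially 26
@ bits wide again.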
0632
0633 vshr.u64 $T0,$D3,#26
0634 vmovn.i64 $D3#lo,$D3
0635 vshr.u64 $T1,$D0,#26
0636 vmovn.i64 $D0#lo,$D0
0637 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
0638 vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
0639 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
0640 vbic.i32 $D0#lo,#0xfc000000
0641
0642 vshrn.u64 $T0#lo,$D4,#26
0643 vmovn.i64 $D4#lo,$D4
0644 vshr.u64 $T1,$D1,#26
0645 vmovn.i64 $D1#lo,$D1
0646 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
0647 vbic.i32 $D4#lo,#0xfc000000
0648 vbic.i32 $D1#lo,#0xfc000000
0649
0650 vadd.i32 $D0#lo,$D0#lo,$T0#lo
0651 vshl.u32 $T0#lo,$T0#lo,#2
0652 vshrn.u64 $T1#lo,$D2,#26
0653 vmovn.i64 $D2#lo,$D2
0654 vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
0655 vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
0656 vbic.i32 $D2#lo,#0xfc000000
0657
0658 vshr.u32 $T0#lo,$D0#lo,#26
0659 vbic.i32 $D0#lo,#0xfc000000
0660 vshr.u32 $T1#lo,$D3#lo,#26
0661 vbic.i32 $D3#lo,#0xfc000000
0662 vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
0663 vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
0664
0665 subs $zeros,$zeros,#1
0666 beq .Lsquare_break_neon
0667
0668 add $tbl0,$ctx,#(48+0*9*4)
0669 add $tbl1,$ctx,#(48+1*9*4)
0670
0671 vtrn.32 $R0,$D0#lo @ r^2:r^1
0672 vtrn.32 $R2,$D2#lo
0673 vtrn.32 $R3,$D3#lo
0674 vtrn.32 $R1,$D1#lo
0675 vtrn.32 $R4,$D4#lo
0676
0677 vshl.u32 $S2,$R2,#2 @ *5
0678 vshl.u32 $S3,$R3,#2
0679 vshl.u32 $S1,$R1,#2
0680 vshl.u32 $S4,$R4,#2
0681 vadd.i32 $S2,$S2,$R2
0682 vadd.i32 $S1,$S1,$R1
0683 vadd.i32 $S3,$S3,$R3
0684 vadd.i32 $S4,$S4,$R4
0685
0686 vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
0687 vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
0688 vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
0689 vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
0690 vst1.32 {${S4}[0]},[$tbl0,:32]
0691 vst1.32 {${S4}[1]},[$tbl1,:32]
0692
0693 b .Lsquare_neon
0694
0695 .align 4
0696 .Lsquare_break_neon:
0697 add $tbl0,$ctx,#(48+2*4*9)
0698 add $tbl1,$ctx,#(48+3*4*9)
0699
0700 vmov $R0,$D0#lo @ r^4:r^3
0701 vshl.u32 $S1,$D1#lo,#2 @ *5
0702 vmov $R1,$D1#lo
0703 vshl.u32 $S2,$D2#lo,#2
0704 vmov $R2,$D2#lo
0705 vshl.u32 $S3,$D3#lo,#2
0706 vmov $R3,$D3#lo
0707 vshl.u32 $S4,$D4#lo,#2
0708 vmov $R4,$D4#lo
0709 vadd.i32 $S1,$S1,$D1#lo
0710 vadd.i32 $S2,$S2,$D2#lo
0711 vadd.i32 $S3,$S3,$D3#lo
0712 vadd.i32 $S4,$S4,$D4#lo
0713
0714 vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
0715 vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
0716 vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
0717 vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
0718 vst1.32 {${S4}[0]},[$tbl0]
0719 vst1.32 {${S4}[1]},[$tbl1]
0720
0721 .Lno_init_neon:
0722 ret @ bx lr
0723 .size poly1305_init_neon,.-poly1305_init_neon
0724
0725 .type poly1305_blocks_neon,%function
0726 .align 5
0727 poly1305_blocks_neon:
0728 .Lpoly1305_blocks_neon:
0729 ldr ip,[$ctx,#36] @ is_base2_26
0730
0731 cmp $len,#64
0732 blo .Lpoly1305_blocks
0733
0734 stmdb sp!,{r4-r7}
0735 vstmdb sp!,{d8-d15} @ ABI specification says so
0736
0737 tst ip,ip @ is_base2_26?
0738 bne .Lbase2_26_neon
0739
0740 stmdb sp!,{r1-r3,lr}
0741 bl .Lpoly1305_init_neon
0742
0743 ldr r4,[$ctx,#0] @ load hash value base 2^32
0744 ldr r5,[$ctx,#4]
0745 ldr r6,[$ctx,#8]
0746 ldr r7,[$ctx,#12]
0747 ldr ip,[$ctx,#16]
0748
0749 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
0750 mov r3,r4,lsr#26
0751 veor $D0#lo,$D0#lo,$D0#lo
0752 mov r4,r5,lsr#20
0753 orr r3,r3,r5,lsl#6
0754 veor $D1#lo,$D1#lo,$D1#lo
0755 mov r5,r6,lsr#14
0756 orr r4,r4,r6,lsl#12
0757 veor $D2#lo,$D2#lo,$D2#lo
0758 mov r6,r7,lsr#8
0759 orr r5,r5,r7,lsl#18
0760 veor $D3#lo,$D3#lo,$D3#lo
0761 and r3,r3,#0x03ffffff
0762 orr r6,r6,ip,lsl#24
0763 veor $D4#lo,$D4#lo,$D4#lo
0764 and r4,r4,#0x03ffffff
0765 mov r1,#1
0766 and r5,r5,#0x03ffffff
0767 str r1,[$ctx,#36] @ set is_base2_26
0768
0769 vmov.32 $D0#lo[0],r2
0770 vmov.32 $D1#lo[0],r3
0771 vmov.32 $D2#lo[0],r4
0772 vmov.32 $D3#lo[0],r5
0773 vmov.32 $D4#lo[0],r6
0774 adr $zeros,.Lzeros
0775
0776 ldmia sp!,{r1-r3,lr}
0777 b .Lhash_loaded
0778
0779 .align 4
0780 .Lbase2_26_neon:
0781 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0782 @ load hash value
0783
0784 veor $D0#lo,$D0#lo,$D0#lo
0785 veor $D1#lo,$D1#lo,$D1#lo
0786 veor $D2#lo,$D2#lo,$D2#lo
0787 veor $D3#lo,$D3#lo,$D3#lo
0788 veor $D4#lo,$D4#lo,$D4#lo
0789 vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
0790 adr $zeros,.Lzeros
0791 vld1.32 {$D4#lo[0]},[$ctx]
0792 sub $ctx,$ctx,#16 @ rewind
0793
0794 .Lhash_loaded:
0795 add $in2,$inp,#32
0796 mov $padbit,$padbit,lsl#24
0797 tst $len,#31
0798 beq .Leven
0799
0800 vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
0801 vmov.32 $H4#lo[0],$padbit
0802 sub $len,$len,#16
0803 add $in2,$inp,#32
0804
0805 # ifdef __ARMEB__
0806 vrev32.8 $H0,$H0
0807 vrev32.8 $H3,$H3
0808 vrev32.8 $H1,$H1
0809 vrev32.8 $H2,$H2
0810 # endif
0811 vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
0812 vshl.u32 $H3#lo,$H3#lo,#18
0813
0814 vsri.u32 $H3#lo,$H2#lo,#14
0815 vshl.u32 $H2#lo,$H2#lo,#12
0816 vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
0817
0818 vbic.i32 $H3#lo,#0xfc000000
0819 vsri.u32 $H2#lo,$H1#lo,#20
0820 vshl.u32 $H1#lo,$H1#lo,#6
0821
0822 vbic.i32 $H2#lo,#0xfc000000
0823 vsri.u32 $H1#lo,$H0#lo,#26
0824 vadd.i32 $H3#hi,$H3#lo,$D3#lo
0825
0826 vbic.i32 $H0#lo,#0xfc000000
0827 vbic.i32 $H1#lo,#0xfc000000
0828 vadd.i32 $H2#hi,$H2#lo,$D2#lo
0829
0830 vadd.i32 $H0#hi,$H0#lo,$D0#lo
0831 vadd.i32 $H1#hi,$H1#lo,$D1#lo
0832
0833 mov $tbl1,$zeros
0834 add $tbl0,$ctx,#48
0835
0836 cmp $len,$len
0837 b .Long_tail
0838
0839 .align 4
0840 .Leven:
0841 subs $len,$len,#64
0842 it lo
0843 movlo $in2,$zeros
0844
0845 vmov.i32 $H4,#1<<24 @ padbit, yes, always
0846 vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
0847 add $inp,$inp,#64
0848 vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
0849 add $in2,$in2,#64
0850 itt hi
0851 addhi $tbl1,$ctx,#(48+1*9*4)
0852 addhi $tbl0,$ctx,#(48+3*9*4)
0853
0854 # ifdef __ARMEB__
0855 vrev32.8 $H0,$H0
0856 vrev32.8 $H3,$H3
0857 vrev32.8 $H1,$H1
0858 vrev32.8 $H2,$H2
0859 # endif
0860 vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
0861 vshl.u32 $H3,$H3,#18
0862
0863 vsri.u32 $H3,$H2,#14
0864 vshl.u32 $H2,$H2,#12
0865
0866 vbic.i32 $H3,#0xfc000000
0867 vsri.u32 $H2,$H1,#20
0868 vshl.u32 $H1,$H1,#6
0869
0870 vbic.i32 $H2,#0xfc000000
0871 vsri.u32 $H1,$H0,#26
0872
0873 vbic.i32 $H0,#0xfc000000
0874 vbic.i32 $H1,#0xfc000000
0875
0876 bls .Lskip_loop
0877
0878 vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
0879 vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
0880 vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
0881 vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
0882 b .Loop_neon
0883
0884 .align 5
0885 .Loop_neon:
0886 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0887 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
0888 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
0889 @ \___________________/
0890 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
0891 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
0892 @ \___________________/ \____________________/
0893 @
0894 @ Note that we start with inp[2:3]*r^2. This is because it
0895 @ doesn't depend on reduction in previous iteration.
0896 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0897 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
0898 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
0899 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
0900 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
0901 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
0902
0903 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0904 @ inp[2:3]*r^2
0905
0906 vadd.i32 $H2
0907 vmull.u32 $D2,$H2
0908 vadd.i32 $H0
0909 vmull.u32 $D0,$H0
0910 vadd.i32 $H3
0911 vmull.u32 $D3,$H3
0912 vmlal.u32 $D2,$H1
0913 vadd.i32 $H1
0914 vmull.u32 $D1,$H1
0915
0916 vadd.i32 $H4
0917 vmull.u32 $D4,$H4
0918 subs $len,$len,
0919 vmlal.u32 $D0,$H4
0920 it lo
0921 movlo $in2,$zeros
0922 vmlal.u32 $D3,$H2
0923 vld1.32 ${S4}[1],[$tbl1,:32]
0924 vmlal.u32 $D1,$H0
0925 vmlal.u32 $D4,$H3
0926
0927 vmlal.u32 $D0,$H3
0928 vmlal.u32 $D3,$H1
0929 vmlal.u32 $D4,$H2
0930 vmlal.u32 $D1,$H4
0931 vmlal.u32 $D2,$H0
0932
0933 vmlal.u32 $D3,$H0
0934 vmlal.u32 $D0,$H2
0935 vmlal.u32 $D4,$H1
0936 vmlal.u32 $D1,$H3
0937 vmlal.u32 $D2,$H4
0938
0939 vmlal.u32 $D3,$H4
0940 vmlal.u32 $D0,$H1
0941 vmlal.u32 $D4,$H0
0942 vmlal.u32 $D1,$H2
0943 vmlal.u32 $D2,$H3
0944
0945 vld4.32 {$H0
0946 add $in2,$in2,
0947
0948 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0949 @ (hash+inp[0:1])*r^4 and accumulate
0950
0951 vmlal.u32 $D3,$H3
0952 vmlal.u32 $D0,$H0
0953 vmlal.u32 $D4,$H4
0954 vmlal.u32 $D1,$H1
0955 vmlal.u32 $D2,$H2
0956 vld1.32 ${S4}[0],[$tbl0,:32]
0957
0958 vmlal.u32 $D3,$H2
0959 vmlal.u32 $D0,$H4
0960 vmlal.u32 $D4,$H3
0961 vmlal.u32 $D1,$H0
0962 vmlal.u32 $D2,$H1
0963
0964 vmlal.u32 $D3,$H1
0965 vmlal.u32 $D0,$H3
0966 vmlal.u32 $D4,$H2
0967 vmlal.u32 $D1,$H4
0968 vmlal.u32 $D2,$H0
0969
0970 vmlal.u32 $D3,$H0
0971 vmlal.u32 $D0,$H2
0972 vmlal.u32 $D4,$H1
0973 vmlal.u32 $D1,$H3
0974 vmlal.u32 $D3,$H4
0975
0976 vmlal.u32 $D2,$H4
0977 vmlal.u32 $D0,$H1
0978 vmlal.u32 $D4,$H0
0979 vmov.i32 $H4,
0980 vmlal.u32 $D1,$H2
0981 vmlal.u32 $D2,$H3
0982
0983 vld4.32 {$H0
0984 add $inp,$inp,
0985
0986 vrev32.8 $H0,$H0
0987 vrev32.8 $H1,$H1
0988 vrev32.8 $H2,$H2
0989 vrev32.8 $H3,$H3
0990
0991
0992 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
0993 @ lazy reduction interleaved with base 2^32 -> base 2^26 of
0994 @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
0995
0996 vshr.u64 $T0,$D3,
0997 vmovn.i64 $D3
0998 vshr.u64 $T1,$D0,
0999 vmovn.i64 $D0
1000 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1001 vbic.i32 $D3
1002 vsri.u32 $H4,$H3,
1003 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1004 vshl.u32 $H3,$H3,
1005 vbic.i32 $D0
1006
1007 vshrn.u64 $T0
1008 vmovn.i64 $D4
1009 vshr.u64 $T1,$D1,
1010 vmovn.i64 $D1
1011 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1012 vsri.u32 $H3,$H2,
1013 vbic.i32 $D4
1014 vshl.u32 $H2,$H2,
1015 vbic.i32 $D1
1016
1017 vadd.i32 $D0
1018 vshl.u32 $T0
1019 vbic.i32 $H3,
1020 vshrn.u64 $T1
1021 vmovn.i64 $D2
1022 vaddl.u32 $D0,$D0
1023 vsri.u32 $H2,$H1,
1024 vadd.i32 $D3
1025 vshl.u32 $H1,$H1,
1026 vbic.i32 $D2
1027 vbic.i32 $H2,
1028
1029 vshrn.u64 $T0
1030 vmovn.i64 $D0
1031 vsri.u32 $H1,$H0,
1032 vbic.i32 $H0,
1033 vshr.u32 $T1
1034 vbic.i32 $D3
1035 vbic.i32 $D0
1036 vadd.i32 $D1
1037 vadd.i32 $D4
1038 vbic.i32 $H1,
1039
1040 bhi .Loop_neon
1041
1042 .Lskip_loop:
1043 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1044 @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1045
1046 add $tbl1,$ctx,
1047 add $tbl0,$ctx,
1048 adds $len,$len,
1049 it ne
1050 movne $len,
1051 bne .Long_tail
1052
1053 vadd.i32 $H2
1054 vadd.i32 $H0
1055 vadd.i32 $H3
1056 vadd.i32 $H1
1057 vadd.i32 $H4
1058
1059 .Long_tail:
1060 vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
1061 vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
1062
1063 vadd.i32 $H2
1064 vmull.u32 $D2,$H2
1065 vadd.i32 $H0
1066 vmull.u32 $D0,$H0
1067 vadd.i32 $H3
1068 vmull.u32 $D3,$H3
1069 vadd.i32 $H1
1070 vmull.u32 $D1,$H1
1071 vadd.i32 $H4
1072 vmull.u32 $D4,$H4
1073
1074 vmlal.u32 $D0,$H4
1075 vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1076 vmlal.u32 $D3,$H2
1077 vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1078 vmlal.u32 $D1,$H0
1079 vmlal.u32 $D4,$H3
1080 vmlal.u32 $D2,$H1
1081
1082 vmlal.u32 $D3,$H1
1083 vld1.32 ${S4}[1],[$tbl1,:32]
1084 vmlal.u32 $D0,$H3
1085 vld1.32 ${S4}[0],[$tbl0,:32]
1086 vmlal.u32 $D4,$H2
1087 vmlal.u32 $D1,$H4
1088 vmlal.u32 $D2,$H0
1089
1090 vmlal.u32 $D3,$H0
1091 it ne
1092 addne $tbl1,$ctx,
1093 vmlal.u32 $D0,$H2
1094 it ne
1095 addne $tbl0,$ctx,
1096 vmlal.u32 $D4,$H1
1097 vmlal.u32 $D1,$H3
1098 vmlal.u32 $D2,$H4
1099
1100 vmlal.u32 $D3,$H4
1101 vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
1102 vmlal.u32 $D0,$H1
1103 vshr.u64 $MASK,$MASK,
1104 vmlal.u32 $D4,$H0
1105 vmlal.u32 $D1,$H2
1106 vmlal.u32 $D2,$H3
1107
1108 beq .Lshort_tail
1109
1110 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1111 @ (hash+inp[0:1])*r^4:r^3 and accumulate
1112
1113 vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
1114 vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
1115
1116 vmlal.u32 $D2,$H2
1117 vmlal.u32 $D0,$H0
1118 vmlal.u32 $D3,$H3
1119 vmlal.u32 $D1,$H1
1120 vmlal.u32 $D4,$H4
1121
1122 vmlal.u32 $D0,$H4
1123 vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1124 vmlal.u32 $D3,$H2
1125 vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1126 vmlal.u32 $D1,$H0
1127 vmlal.u32 $D4,$H3
1128 vmlal.u32 $D2,$H1
1129
1130 vmlal.u32 $D3,$H1
1131 vld1.32 ${S4}[1],[$tbl1,:32]
1132 vmlal.u32 $D0,$H3
1133 vld1.32 ${S4}[0],[$tbl0,:32]
1134 vmlal.u32 $D4,$H2
1135 vmlal.u32 $D1,$H4
1136 vmlal.u32 $D2,$H0
1137
1138 vmlal.u32 $D3,$H0
1139 vmlal.u32 $D0,$H2
1140 vmlal.u32 $D4,$H1
1141 vmlal.u32 $D1,$H3
1142 vmlal.u32 $D2,$H4
1143
1144 vmlal.u32 $D3,$H4
1145 vorn $MASK,$MASK,$MASK @ all-ones
1146 vmlal.u32 $D0,$H1
1147 vshr.u64 $MASK,$MASK,
1148 vmlal.u32 $D4,$H0
1149 vmlal.u32 $D1,$H2
1150 vmlal.u32 $D2,$H3
1151
1152 .Lshort_tail:
1153 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1154 @ horizontal addition
1155
1156 vadd.i64 $D3
1157 vadd.i64 $D0
1158 vadd.i64 $D4
1159 vadd.i64 $D1
1160 vadd.i64 $D2
1161
1162 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1163 @ lazy reduction, but without narrowing
1164
1165 vshr.u64 $T0,$D3,
1166 vand.i64 $D3,$D3,$MASK
1167 vshr.u64 $T1,$D0,
1168 vand.i64 $D0,$D0,$MASK
1169 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1170 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1171
1172 vshr.u64 $T0,$D4,
1173 vand.i64 $D4,$D4,$MASK
1174 vshr.u64 $T1,$D1,
1175 vand.i64 $D1,$D1,$MASK
1176 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1177
1178 vadd.i64 $D0,$D0,$T0
1179 vshl.u64 $T0,$T0,
1180 vshr.u64 $T1,$D2,
1181 vand.i64 $D2,$D2,$MASK
1182 vadd.i64 $D0,$D0,$T0 @ h4 -> h0
1183 vadd.i64 $D3,$D3,$T1 @ h2 -> h3
1184
1185 vshr.u64 $T0,$D0,
1186 vand.i64 $D0,$D0,$MASK
1187 vshr.u64 $T1,$D3,
1188 vand.i64 $D3,$D3,$MASK
1189 vadd.i64 $D1,$D1,$T0 @ h0 -> h1
1190 vadd.i64 $D4,$D4,$T1 @ h3 -> h4
1191
1192 cmp $len,
1193 bne .Leven
1194
1195 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1196 @ store hash value
1197
1198 vst4.32 {$D0
1199 vst1.32 {$D4
1200
1201 vldmia sp!,{d8-d15} @ epilogue
1202 ldmia sp!,{r4-r7}
1203 ret @ bx lr
1204 .size poly1305_blocks_neon,.-poly1305_blocks_neon
1205
1206 .align 5
1207 .Lzeros:
1208 .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

#ifndef __KERNEL__
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lpoly1305_init
# endif
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
#endif
1220 ___
1221 } }
1222 $code.=<<___;
1223 .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
1224 .align 2
1225 ___
1226
1227 foreach (split("\n",$code)) {
1228 s/\`([^\`]*)\`/eval $1/geo;
1229
1230 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
1231 s/\bret\b/bx lr/go or
1232 s/\bbx\s+lr\b/.word\t0xe12fff1e/go;
1233
1234 print $_,"\n";
1235 }
1236 close STDOUT;