#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#              SHA256-hw   SHA256(*)     SHA512
# Apple A7     1.97        10.5 (+33%)   6.73 (-1%(**))
# Cortex-A53   2.38        15.5 (+115%)  10.0 (+150%(***))
# Cortex-A57   2.31        11.6 (+86%)   7.51 (+260%(***))
# Denver       2.01        10.5 (+26%)   6.70 (+8%)
# X-Gene                   20.0 (+100%)  12.8 (+300%(***))
# Mongoose     2.36        13.0 (+50%)   8.36 (+33%)
#
# (*)   Software SHA256 results are of lesser relevance, presented
#       mostly for informational purposes.
# (**)  The result is a trade-off: it's possible to improve it by
#       10% (or by 1 cycle per round), but at the cost of a 20% loss
#       on Cortex-A53 (or 4 cycles per round).
# (***) The super-impressive coefficients over gcc-generated code are
#       an indication of some compiler "pathology"; most notably, code
#       generated with -mgeneral-regs-only is significantly faster,
#       and then the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it made no sense to implement a NEON
# version of SHA256 for 64-bit processors, because the performance
# improvement on the most widespread Cortex-A5x processors was observed
# to be marginal: about the same on Cortex-A53 and ~10% on A57. But it
# was then observed that the 32-bit NEON SHA256 performs significantly
# better than the 64-bit scalar version on *some* of the more recent
# processors. As a result, a 64-bit NEON version of SHA256 was added to
# provide the best all-round performance; for example, it executes ~30%
# faster on X-Gene and Mongoose. [For reference, a NEON version of
# SHA512 is bound to deliver much less improvement, likely *negative*
# on Cortex-A5x, which is why NEON support is limited to SHA256.]

$output=pop;
$flavour=pop;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open STDOUT,">$output";
}
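
# Illustrative invocation (the flavour and file names here are
# assumptions, not mandated by this script):
#
#     perl sha512-armv8.pl linux64 sha256-core.S
#
# The last argument is the output file, the one before it the perlasm
# "flavour"; anything other than "void" pipes the result through
# arm-xlate.pl. Note that the *output file name* selects the digest:
# a name matching /512/ builds SHA512, anything else SHA256.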

if ($output =~ /512/) {
    $BITS=512;
    $SZ=8;
    @Sigma0=(28,34,39);
    @Sigma1=(14,18,41);
    @sigma0=(1,  8, 7);
    @sigma1=(19,61, 6);
    $rounds=80;
    $reg_t="x";
} else {
    $BITS=256;
    $SZ=4;
    @Sigma0=( 2,13,22);
    @Sigma1=( 6,11,25);
    @sigma0=( 7,18, 3);
    @sigma1=(17,19,10);
    $rounds=64;
    $reg_t="w";
}
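
# For reference, these rotate/shift amounts implement the FIPS 180-4
# functions (ROTRn = rotate right by n, SHRn = logical shift right by
# n); e.g. for SHA256:
#
#   Sigma0(x) = ROTR2(x)  ^ ROTR13(x) ^ ROTR22(x)
#   Sigma1(x) = ROTR6(x)  ^ ROTR11(x) ^ ROTR25(x)
#   sigma0(x) = ROTR7(x)  ^ ROTR18(x) ^ SHR3(x)
#   sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
#
# The third element of @sigma0/@sigma1 is a shift rather than a rotate,
# which is why the message-schedule code below pairs it with lsr/ushr
# instead of ror/sli.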

$func="sha${BITS}_block_data_order";

($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));

sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
   $T0=@X[$i+3] if ($i<11);

$code.=<<___    if ($i<16);
#ifndef __AARCH64EB__
    rev @X[$i],@X[$i]           // $i
#endif
___
$code.=<<___    if ($i<13 && ($i&1));
    ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___    if ($i==13);
    ldp @X[14],@X[15],[$inp]
___
$code.=<<___    if ($i>=14);
    ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___    if ($i>0 && $i<16);
    add $a,$a,$t1           // h+=Sigma0(a)
___
$code.=<<___    if ($i>=11);
    str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-and-logical operations such as
# 'eor x,y,z,ror#n', they were found to negatively affect performance
# on Apple A7. The reason seems to be that a merged instruction
# requires even 'y' to be available earlier, so such an instruction is
# not necessarily the best choice on the critical path... On the other
# hand, Cortex-A5x handles merged instructions much better than
# disjoint rotate and logical... See the (**) footnote above.
$code.=<<___    if ($i<15);
    ror $t0,$e,#$Sigma1[0]
    add $h,$h,$t2           // h+=K[i]
    eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
    and $t1,$f,$e
    bic $t2,$g,$e
    add $h,$h,@X[$i&15]         // h+=X[i]
    orr $t1,$t1,$t2         // Ch(e,f,g)
    eor $t2,$a,$b           // a^b, b^c in next round
    eor $t0,$t0,$T0,ror#$Sigma1[1]  // Sigma1(e)
    ror $T0,$a,#$Sigma0[0]
    add $h,$h,$t1           // h+=Ch(e,f,g)
    eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
    add $h,$h,$t0           // h+=Sigma1(e)
    and $t3,$t3,$t2         // (b^c)&=(a^b)
    add $d,$d,$h            // d+=h
    eor $t3,$t3,$b          // Maj(a,b,c)
    eor $t1,$T0,$t1,ror#$Sigma0[1]  // Sigma0(a)
    add $h,$h,$t3           // h+=Maj(a,b,c)
    ldr $t3,[$Ktbl],#$SZ        // *K++, $t2 in next round
    //add   $h,$h,$t1           // h+=Sigma0(a)
___
$code.=<<___    if ($i>=15);
    ror $t0,$e,#$Sigma1[0]
    add $h,$h,$t2           // h+=K[i]
    ror $T1,@X[($j+1)&15],#$sigma0[0]
    and $t1,$f,$e
    ror $T2,@X[($j+14)&15],#$sigma1[0]
    bic $t2,$g,$e
    ror $T0,$a,#$Sigma0[0]
    add $h,$h,@X[$i&15]         // h+=X[i]
    eor $t0,$t0,$e,ror#$Sigma1[1]
    eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
    orr $t1,$t1,$t2         // Ch(e,f,g)
    eor $t2,$a,$b           // a^b, b^c in next round
    eor $t0,$t0,$e,ror#$Sigma1[2]   // Sigma1(e)
    eor $T0,$T0,$a,ror#$Sigma0[1]
    add $h,$h,$t1           // h+=Ch(e,f,g)
    and $t3,$t3,$t2         // (b^c)&=(a^b)
    eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
    eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]    // sigma0(X[i+1])
    add $h,$h,$t0           // h+=Sigma1(e)
    eor $t3,$t3,$b          // Maj(a,b,c)
    eor $t1,$T0,$a,ror#$Sigma0[2]   // Sigma0(a)
    eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]   // sigma1(X[i+14])
    add @X[$j],@X[$j],@X[($j+9)&15]
    add $d,$d,$h            // d+=h
    add $h,$h,$t3           // h+=Maj(a,b,c)
    ldr $t3,[$Ktbl],#$SZ        // *K++, $t2 in next round
    add @X[$j],@X[$j],$T1
    add $h,$h,$t1           // h+=Sigma0(a)
    add @X[$j],@X[$j],$T2
___
    ($t2,$t3)=($t3,$t2);
}
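
# For orientation, each BODY_00_xx invocation emits one round of the
# textbook SHA-2 compression function:
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#   d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with a few scheduling tricks visible above: Ch(e,f,g) is computed as
# (f&e)|(g&~e) via and/bic/orr, Maj(a,b,c) as ((b^c)&(a^b))^b with a^b
# saved in $t2 to serve as the next round's b^c, and h+=Sigma0(a) is
# deferred into the following round (hence the commented-out add).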

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#endif

.text

.extern OPENSSL_armcap_P
.globl  $func
.type   $func,%function
.align  6
$func:
___
$code.=<<___    if ($SZ==4);
#ifndef __KERNEL__
# ifdef __ILP32__
    ldrsw   x16,.LOPENSSL_armcap_P
# else
    ldr x16,.LOPENSSL_armcap_P
# endif
    adr x17,.LOPENSSL_armcap_P
    add x16,x16,x17
    ldr w16,[x16]
    tst w16,#ARMV8_SHA256
    b.ne    .Lv8_entry
    tst w16,#ARMV7_NEON
    b.ne    .Lneon_entry
#endif
___
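
# Outside the kernel, dispatch happens at run time on OPENSSL_armcap_P:
# the probe above branches to .Lv8_entry when the SHA256 crypto
# extensions are advertised, to .Lneon_entry for plain NEON, and falls
# through to the scalar code otherwise. Kernel builds compile the probe
# out and instead export sha256_block_neon directly (see below),
# leaving the choice to the caller.
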
$code.=<<___;
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0

    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub sp,sp,#4*$SZ

    ldp $A,$B,[$ctx]                // load context
    ldp $C,$D,[$ctx,#2*$SZ]
    ldp $E,$F,[$ctx,#4*$SZ]
    add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
    ldp $G,$H,[$ctx,#6*$SZ]
    adr $Ktbl,.LK$BITS
    stp $ctx,$num,[x29,#96]

.Loop:
    ldp @X[0],@X[1],[$inp],#2*$SZ
    ldr $t2,[$Ktbl],#$SZ            // *K++
    eor $t3,$B,$C               // magic seed
    str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++)   { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++)   { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
    cbnz    $t2,.Loop_16_xx

    ldp $ctx,$num,[x29,#96]
    ldr $inp,[x29,#112]
    sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)`      // rewind

    ldp @X[0],@X[1],[$ctx]
    ldp @X[2],@X[3],[$ctx,#2*$SZ]
    add $inp,$inp,#14*$SZ           // advance input pointer
    ldp @X[4],@X[5],[$ctx,#4*$SZ]
    add $A,$A,@X[0]
    ldp @X[6],@X[7],[$ctx,#6*$SZ]
    add $B,$B,@X[1]
    add $C,$C,@X[2]
    add $D,$D,@X[3]
    stp $A,$B,[$ctx]
    add $E,$E,@X[4]
    add $F,$F,@X[5]
    stp $C,$D,[$ctx,#2*$SZ]
    add $G,$G,@X[6]
    add $H,$H,@X[7]
    cmp $inp,$num
    stp $E,$F,[$ctx,#4*$SZ]
    stp $G,$H,[$ctx,#6*$SZ]
    b.ne    .Loop

    ldp x19,x20,[x29,#16]
    add sp,sp,#4*$SZ
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#128
    ret
.size   $func,.-$func

.align  6
.type   .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
    .quad   0x428a2f98d728ae22,0x7137449123ef65cd
    .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    .quad   0x3956c25bf348b538,0x59f111f1b605d019
    .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
    .quad   0xd807aa98a3030242,0x12835b0145706fbe
    .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
    .quad   0x9bdc06a725c71235,0xc19bf174cf692694
    .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
    .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
    .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    .quad   0x983e5152ee66dfab,0xa831c66d2db43210
    .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
    .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
    .quad   0x06ca6351e003826f,0x142929670a0e6e70
    .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
    .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
    .quad   0x81c2c92e47edaee6,0x92722c851482353b
    .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
    .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
    .quad   0xd192e819d6ef5218,0xd69906245565a910
    .quad   0xf40e35855771202a,0x106aa07032bbd1b8
    .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
    .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
    .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
    .quad   0x90befffa23631e28,0xa4506cebde82bde9
    .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
    .quad   0xca273eceea26619c,0xd186b8c721c0c207
    .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
    .quad   0x113f9804bef90dae,0x1b710b35131c471b
    .quad   0x28db77f523047d84,0x32caab7b40c72493
    .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
    .quad   0   // terminator
___
$code.=<<___ if ($SZ==4);
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    .long   0   // terminator
___
$code.=<<___;
.size   .LK$BITS,.-.LK$BITS
#ifndef __KERNEL__
.align  3
.LOPENSSL_armcap_P:
# ifdef __ILP32__
    .long   OPENSSL_armcap_P-.
# else
    .quad   OPENSSL_armcap_P-.
# endif
#endif
.asciz  "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

if ($SZ==4) {
my $Ktbl="x3";

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

$code.=<<___;
#ifndef __KERNEL__
.type   sha256_block_armv8,%function
.align  6
sha256_block_armv8:
.Lv8_entry:
    stp     x29,x30,[sp,#-16]!
    add     x29,sp,#0

    ld1.32      {$ABCD,$EFGH},[$ctx]
    adr     $Ktbl,.LK256

.Loop_hw:
    ld1     {@MSG[0]-@MSG[3]},[$inp],#64
    sub     $num,$num,#1
    ld1.32      {$W0},[$Ktbl],#16
    rev32       @MSG[0],@MSG[0]
    rev32       @MSG[1],@MSG[1]
    rev32       @MSG[2],@MSG[2]
    rev32       @MSG[3],@MSG[3]
    orr     $ABCD_SAVE,$ABCD,$ABCD      // offload
    orr     $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
    ld1.32      {$W1},[$Ktbl],#16
    add.i32     $W0,$W0,@MSG[0]
    sha256su0   @MSG[0],@MSG[1]
    orr     $abcd,$ABCD,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0
    sha256su1   @MSG[0],@MSG[2],@MSG[3]
___
    ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
    ld1.32      {$W1},[$Ktbl],#16
    add.i32     $W0,$W0,@MSG[0]
    orr     $abcd,$ABCD,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0

    ld1.32      {$W0},[$Ktbl],#16
    add.i32     $W1,$W1,@MSG[1]
    orr     $abcd,$ABCD,$ABCD
    sha256h     $ABCD,$EFGH,$W1
    sha256h2    $EFGH,$abcd,$W1

    ld1.32      {$W1},[$Ktbl]
    add.i32     $W0,$W0,@MSG[2]
    sub     $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
    orr     $abcd,$ABCD,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0

    add.i32     $W1,$W1,@MSG[3]
    orr     $abcd,$ABCD,$ABCD
    sha256h     $ABCD,$EFGH,$W1
    sha256h2    $EFGH,$abcd,$W1

    add.i32     $ABCD,$ABCD,$ABCD_SAVE
    add.i32     $EFGH,$EFGH,$EFGH_SAVE

    cbnz        $num,.Loop_hw

    st1.32      {$ABCD,$EFGH},[$ctx]

    ldr     x29,[sp],#16
    ret
.size   sha256_block_armv8,.-sha256_block_armv8
#endif
___
}
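
# A sanity check on the hardware schedule above: each sha256h/sha256h2
# pair consumes one 4-word round-key vector, i.e. four rounds, so the
# 12 loop iterations plus the 4 unrolled pairs cover all 16*4 = 64
# rounds of SHA256.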

if ($SZ==4) {   ######################################### NEON stuff #
# You'll surely note a lot of similarities with the sha256-armv4
# module, and of course that's no coincidence: sha256-armv4 served as
# the initial template, but it was adapted for the ARMv8 instruction
# set and extensively re-tuned for all-round performance.

my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
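
# E.g. &ushr_32($T2,$T0,$sigma0[0]) appends "ushr.32 q6,q4,#7" to
# $code: the trailing _32 becomes ".32" and a numeric final argument
# gains a '#'. The post-processing loop at the bottom of the file later
# rewrites this into native AArch64 syntax, "ushr v6.4s,v4.4s,#7".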

sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
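# E.g. Dscalar("q19") yields "d19" and Dhi("q3") yields "v3.d[1]",
# as used by the 64-bit moves in Xupdate/Xpreload below.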

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

    &ext_8      ($T0,@X[0],@X[1],4);    # X[1..4]
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &ext_8      ($T3,@X[2],@X[3],4);    # X[9..12]
     eval(shift(@insns));
     eval(shift(@insns));
    &mov        (&Dscalar($T7),&Dhi(@X[3]));    # X[14..15]
     eval(shift(@insns));
     eval(shift(@insns));
    &ushr_32    ($T2,$T0,$sigma0[0]);
     eval(shift(@insns));
    &ushr_32    ($T1,$T0,$sigma0[2]);
     eval(shift(@insns));
    &add_32     (@X[0],@X[0],$T3);  # X[0..3] += X[9..12]
     eval(shift(@insns));
    &sli_32     ($T2,$T0,32-$sigma0[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &ushr_32    ($T3,$T0,$sigma0[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &eor_8      ($T1,$T1,$T2);
     eval(shift(@insns));
     eval(shift(@insns));
    &sli_32     ($T3,$T0,32-$sigma0[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &ushr_32  ($T4,$T7,$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &eor_8      ($T1,$T1,$T3);      # sigma0(X[1..4])
     eval(shift(@insns));
     eval(shift(@insns));
      &sli_32   ($T4,$T7,32-$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
      &ushr_32  ($T5,$T7,$sigma1[2]);
     eval(shift(@insns));
     eval(shift(@insns));
      &ushr_32  ($T3,$T7,$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &add_32     (@X[0],@X[0],$T1);  # X[0..3] += sigma0(X[1..4])
     eval(shift(@insns));
     eval(shift(@insns));
      &sli_u32  ($T3,$T7,32-$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &eor_8    ($T5,$T5,$T4);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
      &eor_8    ($T5,$T5,$T3);      # sigma1(X[14..15])
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &add_32     (@X[0],@X[0],$T5);  # X[0..1] += sigma1(X[14..15])
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
      &ushr_32  ($T6,@X[0],$sigma1[0]);
     eval(shift(@insns));
      &ushr_32  ($T7,@X[0],$sigma1[2]);
     eval(shift(@insns));
     eval(shift(@insns));
      &sli_32   ($T6,@X[0],32-$sigma1[0]);
     eval(shift(@insns));
      &ushr_32  ($T5,@X[0],$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &eor_8    ($T7,$T7,$T6);
     eval(shift(@insns));
     eval(shift(@insns));
      &sli_32   ($T5,@X[0],32-$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &ld1_32     ("{$T0}","[$Ktbl], #16");
     eval(shift(@insns));
      &eor_8    ($T7,$T7,$T5);      # sigma1(X[16..17])
     eval(shift(@insns));
     eval(shift(@insns));
    &eor_8      ($T5,$T5,$T5);
     eval(shift(@insns));
     eval(shift(@insns));
    &mov        (&Dhi($T5), &Dlo($T7));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &add_32     (@X[0],@X[0],$T5);  # X[2..3] += sigma1(X[16..17])
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &add_32     ($T0,$T0,@X[0]);
     while($#insns>=1) { eval(shift(@insns)); }
    &st1_32     ("{$T0}","[$Xfer], #16");
     eval(shift(@insns));

    push(@X,shift(@X));     # "rotate" X[]
}
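
# NEON has no vector-rotate instruction, so each ROTR above is
# synthesized as a ushr/sli pair: shift right by n, then shift left by
# 32-n and insert, which together leave the rotated word in the
# destination. The plain SHR terms of sigma0/sigma1 need only the ushr
# half, which is why the $sigma0[2]/$sigma1[2] shifts have no matching
# sli.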

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

     eval(shift(@insns));
     eval(shift(@insns));
    &ld1_8      ("{@X[0]}","[$inp],#16");
     eval(shift(@insns));
     eval(shift(@insns));
    &ld1_32     ("{$T0}","[$Ktbl],#16");
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &rev32      (@X[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &add_32     ($T0,$T0,@X[0]);
     foreach (@insns) { eval; } # remaining instructions
    &st1_32     ("{$T0}","[$Xfer], #16");

    push(@X,shift(@X));     # "rotate" X[]
}

sub body_00_15 () {
    (
    '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
    '&add   ($h,$h,$t1)',           # h+=X[i]+K[i]
    '&add   ($a,$a,$t4);'.          # h+=Sigma0(a) from the past
    '&and   ($t1,$f,$e)',
    '&bic   ($t4,$g,$e)',
    '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
    '&add   ($a,$a,$t2)',           # h+=Maj(a,b,c) from the past
    '&orr   ($t1,$t1,$t4)',         # Ch(e,f,g)
    '&eor   ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
    '&eor   ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
    '&add   ($h,$h,$t1)',           # h+=Ch(e,f,g)
    '&ror   ($t0,$t0,"#$Sigma1[0]")',
    '&eor   ($t2,$a,$b)',           # a^b, b^c in next round
    '&eor   ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
    '&add   ($h,$h,$t0)',           # h+=Sigma1(e)
    '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
    '&ldr   ($t1,"[$Ktbl]")             if ($j==15);'.
    '&and   ($t3,$t3,$t2)',         # (b^c)&=(a^b)
    '&ror   ($t4,$t4,"#$Sigma0[0]")',
    '&add   ($d,$d,$h)',            # d+=h
    '&eor   ($t3,$t3,$b)',          # Maj(a,b,c)
    '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
    )
}
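
# Note the software pipelining in body_00_15: h+=Sigma0(a) and
# h+=Maj(a,b,c) are not folded into the round that computes them but
# "from the past", at the top of the following round (via $t4 and
# $t2), which shortens the dependency chain within each round.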

$code.=<<___;
#ifdef  __KERNEL__
.globl  sha256_block_neon
#endif
.type   sha256_block_neon,%function
.align  4
sha256_block_neon:
.Lneon_entry:
    stp x29, x30, [sp, #-16]!
    mov x29, sp
    sub sp,sp,#16*4

    adr $Ktbl,.LK256
    add $num,$inp,$num,lsl#6    // len to point at the end of inp

    ld1.8   {@X[0]},[$inp], #16
    ld1.8   {@X[1]},[$inp], #16
    ld1.8   {@X[2]},[$inp], #16
    ld1.8   {@X[3]},[$inp], #16
    ld1.32  {$T0},[$Ktbl], #16
    ld1.32  {$T1},[$Ktbl], #16
    ld1.32  {$T2},[$Ktbl], #16
    ld1.32  {$T3},[$Ktbl], #16
    rev32   @X[0],@X[0]     // yes, even on
    rev32   @X[1],@X[1]     // big-endian
    rev32   @X[2],@X[2]
    rev32   @X[3],@X[3]
    mov $Xfer,sp
    add.32  $T0,$T0,@X[0]
    add.32  $T1,$T1,@X[1]
    add.32  $T2,$T2,@X[2]
    st1.32  {$T0-$T1},[$Xfer], #32
    add.32  $T3,$T3,@X[3]
    st1.32  {$T2-$T3},[$Xfer]
    sub $Xfer,$Xfer,#32

    ldp $A,$B,[$ctx]
    ldp $C,$D,[$ctx,#8]
    ldp $E,$F,[$ctx,#16]
    ldp $G,$H,[$ctx,#24]
    ldr $t1,[sp,#0]
    mov $t2,wzr
    eor $t3,$B,$C
    mov $t4,wzr
    b   .L_00_48

.align  4
.L_00_48:
___
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
$code.=<<___;
    cmp $t1,#0              // check for K256 terminator
    ldr $t1,[sp,#0]
    sub $Xfer,$Xfer,#64
    bne .L_00_48

    sub $Ktbl,$Ktbl,#256        // rewind $Ktbl
    cmp $inp,$num
    mov $Xfer, #64
    csel    $Xfer, $Xfer, xzr, eq
    sub $inp,$inp,$Xfer         // avoid SEGV
    mov $Xfer,sp
___
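
# On the final iteration the csel above rewinds $inp by 64 bytes, so
# the four Xpreload calls below re-read the last block instead of
# touching memory past the end of the input; the preloaded schedule is
# simply never consumed once b.ne falls through.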
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
$code.=<<___;
    add $A,$A,$t4           // h+=Sigma0(a) from the past
    ldp $t0,$t1,[$ctx,#0]
    add $A,$A,$t2           // h+=Maj(a,b,c) from the past
    ldp $t2,$t3,[$ctx,#8]
    add $A,$A,$t0           // accumulate
    add $B,$B,$t1
    ldp $t0,$t1,[$ctx,#16]
    add $C,$C,$t2
    add $D,$D,$t3
    ldp $t2,$t3,[$ctx,#24]
    add $E,$E,$t0
    add $F,$F,$t1
     ldr    $t1,[sp,#0]
    stp $A,$B,[$ctx,#0]
    add $G,$G,$t2
     mov    $t2,wzr
    stp $C,$D,[$ctx,#8]
    add $H,$H,$t3
    stp $E,$F,[$ctx,#16]
     eor    $t3,$B,$C
    stp $G,$H,[$ctx,#24]
     mov    $t4,wzr
     mov    $Xfer,sp
    b.ne    .L_00_48

    ldr x29,[x29]
    add sp,sp,#16*4+16
    ret
.size   sha256_block_neon,.-sha256_block_neon
___
}

$code.=<<___;
#ifndef __KERNEL__
.comm   OPENSSL_armcap_P,4,4
#endif
___

{   my  %opcode = (
    "sha256h"   => 0x5e004000,  "sha256h2"  => 0x5e005000,
    "sha256su0" => 0x5e282800,  "sha256su1" => 0x5e006000   );

    sub unsha256 {
    my ($mnemonic,$arg)=@_;

    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    &&
    sprintf ".inst\t0x%08x\t//%s %s",
            $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
            $mnemonic,$arg;
    }
}
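
# E.g. "sha256h v0.16b,v1.16b,v16.4s" is emitted as ".inst 0x5e104020"
# (0x5e004000 | rd=0 | (rn=1)<<5 | (rm=16)<<16). Encoding the
# instructions as raw .inst words keeps the module buildable with
# assemblers that predate the SHA2 extensions.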

# Copy this script's leading comment block (the license headers above)
# into the output as '//' comments, stopping at the first line that is
# neither a comment nor blank.
open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/\/\// and !/^$/);
        print;
}
close SELF;

foreach(split("\n",$code)) {

    s/\`([^\`]*)\`/eval($1)/ge;     # expand backticked expressions

    s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;  # SHA256 ops -> .inst

    s/\bq([0-9]+)\b/v$1.16b/g;      # old->new registers

    s/\.[ui]?8(\s)/$1/;         # strip byte-size tags
    s/\.\w?32\b//       and s/\.16b/\.4s/g; # 32-bit tags select .4s lanes
    m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;   # single-lane ld1/st1 use .s

    print $_,"\n";
}
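
# E.g. "add.i32 v16.4s,v16.4s,v4.16b" from the hardware path leaves
# this loop as "add v16.4s,v16.4s,v4.4s": the .i32 size tag is dropped
# and, because it was present, the remaining .16b operands are narrowed
# to .4s.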

close STDOUT;