#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block, or ~35 cycles per byte
# [on a single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";  $t0="r0";
$inp="r1";  $t4="r1";
$len="r2";  $t1="r2";
$T1="r3";   $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
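
# Editorial note: the temporaries deliberately alias the argument registers
# ($t0 shares r0 with $ctx, $t4 r1 with $inp, $t1 r2 with $len, $t3 r3 with
# $T1). This is safe because ctx/inp/len are saved before the aliased
# temporaries are clobbered (stack push in the integer path, frame slots
# 64/68/72 in the NEON path), and $inp is re-spilled at round 15
# ("make room for $t4").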

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
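
# The rotate/shift amounts above encode the four FIPS 180-4 SHA-256 functions:
# the capital Sigmas use three rotates, the lower-case sigmas replace the last
# rotate with a logical shift right. A plain-Perl reference sketch follows
# (editorial aid only -- nothing in the generator below calls these subs):
sub ror32      { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub Sigma0_ref { my $x=shift; ror32($x,$Sigma0[0])^ror32($x,$Sigma0[1])^ror32($x,$Sigma0[2]) }
sub Sigma1_ref { my $x=shift; ror32($x,$Sigma1[0])^ror32($x,$Sigma1[1])^ror32($x,$Sigma1[2]) }
sub sigma0_ref { my $x=shift; ror32($x,$sigma0[0])^ror32($x,$sigma0[1])^($x>>$sigma0[2]) }
sub sigma1_ref { my $x=shift; ror32($x,$sigma1[0])^ror32($x,$sigma1[1])^($x>>$sigma1[2]) }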

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
    @ ldr   $t1,[$inp],#4           @ $i
# if $i==15
    str $inp,[sp,#17*4]         @ make room for $t4
# endif
    eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
    add $a,$a,$t2           @ h+=Maj(a,b,c) from the past
    eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
# ifndef __ARMEB__
    rev $t1,$t1
# endif
#else
    @ ldrb  $t1,[$inp,#3]           @ $i
    add $a,$a,$t2           @ h+=Maj(a,b,c) from the past
    ldrb    $t2,[$inp,#2]
    ldrb    $t0,[$inp,#1]
    orr $t1,$t1,$t2,lsl#8
    ldrb    $t2,[$inp],#4
    orr $t1,$t1,$t0,lsl#16
# if $i==15
    str $inp,[sp,#17*4]         @ make room for $t4
# endif
    eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
    orr $t1,$t1,$t2,lsl#24
    eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
    ldr $t2,[$Ktbl],#4          @ *K256++
    add $h,$h,$t1           @ h+=X[i]
    str $t1,[sp,#`$i%16`*4]
    eor $t1,$f,$g
    add $h,$h,$t0,ror#$Sigma1[0]    @ h+=Sigma1(e)
    and $t1,$t1,$e
    add $h,$h,$t2           @ h+=K256[i]
    eor $t1,$t1,$g          @ Ch(e,f,g)
    eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
    add $h,$h,$t1           @ h+=Ch(e,f,g)
#if $i==31
    and $t2,$t2,#0xff
    cmp $t2,#0xf2           @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
    ldr $t1,[$inp],#4           @ prefetch
# else
    ldrb    $t1,[$inp,#3]
# endif
    eor $t2,$a,$b           @ a^b, b^c in next round
#else
    ldr $t1,[sp,#`($i+2)%16`*4]     @ from future BODY_16_xx
    eor $t2,$a,$b           @ a^b, b^c in next round
    ldr $t4,[sp,#`($i+15)%16`*4]    @ from future BODY_16_xx
#endif
    eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
    and $t3,$t3,$t2         @ (b^c)&=(a^b)
    add $d,$d,$h            @ d+=h
    eor $t3,$t3,$b          @ Maj(a,b,c)
    add $h,$h,$t0,ror#$Sigma0[0]    @ h+=Sigma0(a)
    @ add   $h,$h,$t3           @ h+=Maj(a,b,c)
___
    ($t2,$t3)=($t3,$t2);
}
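
# What one emitted round computes, as a plain-Perl reference (editorial aid
# only, never called; uses the *_ref sketches defined after the Sigma tables).
# The assembly above is an equivalent reformulation: it evaluates Ch(e,f,g)
# as ((f^g)&e)^g, Maj(a,b,c) as ((a^b)&(b^c))^b, and folds the "+Maj" of the
# new 'a' into the start of the following round ("from the past").
sub sha256_round_ref {
    my ($ki,$wi,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    my $ch  = ($e&$f)^((~$e)&$g);
    my $maj = ($a&$b)^($a&$c)^($b&$c);
    my $T1  = ($h+Sigma1_ref($e)+$ch+$ki+$wi) & 0xffffffff;
    my $T2  = (Sigma0_ref($a)+$maj)           & 0xffffffff;
    return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}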

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
    @ ldr   $t1,[sp,#`($i+1)%16`*4]     @ $i
    @ ldr   $t4,[sp,#`($i+14)%16`*4]
    mov $t0,$t1,ror#$sigma0[0]
    add $a,$a,$t2           @ h+=Maj(a,b,c) from the past
    mov $t2,$t4,ror#$sigma1[0]
    eor $t0,$t0,$t1,ror#$sigma0[1]
    eor $t2,$t2,$t4,ror#$sigma1[1]
    eor $t0,$t0,$t1,lsr#$sigma0[2]  @ sigma0(X[i+1])
    ldr $t1,[sp,#`($i+0)%16`*4]
    eor $t2,$t2,$t4,lsr#$sigma1[2]  @ sigma1(X[i+14])
    ldr $t4,[sp,#`($i+9)%16`*4]

    add $t2,$t2,$t0
    eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
    add $t1,$t1,$t2
    eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
    add $t1,$t1,$t4         @ X[i]
___
    &BODY_00_15(@_);
}
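
# The rotate/shift sequence above implements the FIPS 180-4 message-schedule
# recurrence, kept in a rolling 16-word window on the stack: slot (i+0)%16
# holds W[i-16], (i+1)%16 holds W[i-15], (i+9)%16 holds W[i-7] and (i+14)%16
# holds W[i-2]. Plain-Perl reference (editorial aid only, never called):
sub sha256_schedule_ref {
    my @W = @_;                     # the 16 words of one input block
    for (my $i=16; $i<64; $i++) {
        $W[$i] = (sigma1_ref($W[$i-2]) + $W[$i-7] +
                  sigma0_ref($W[$i-15]) + $W[$i-16]) & 0xffffffff;
    }
    return @W;
}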

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code   32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type   K256,%object
.align  5
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0               @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha256_block_data_order
#endif
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
    sub r3,pc,#8        @ sha256_block_data_order
#else
    adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
    ldr r12,.LOPENSSL_armcap
    ldr r12,[r3,r12]        @ OPENSSL_armcap_P
    tst r12,#ARMV8_SHA256
    bne .LARMv8
    tst r12,#ARMV7_NEON
    bne .LNEON
#endif
    add $len,$inp,$len,lsl#6    @ len to point at the end of inp
    stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
    ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
    sub $Ktbl,r3,#256+32    @ K256
    sub sp,sp,#16*4     @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
    ldr $t1,[$inp],#4
# else
    ldrb    $t1,[$inp,#3]
# endif
    eor $t3,$B,$C       @ magic
    eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)   { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
    ite eq          @ Thumb2 thing, sanity check in ARM
#endif
    ldreq   $t3,[sp,#16*4]      @ pull ctx
    bne .Lrounds_16_xx

    add $A,$A,$t2       @ h+=Maj(a,b,c) from the past
    ldr $t0,[$t3,#0]
    ldr $t1,[$t3,#4]
    ldr $t2,[$t3,#8]
    add $A,$A,$t0
    ldr $t0,[$t3,#12]
    add $B,$B,$t1
    ldr $t1,[$t3,#16]
    add $C,$C,$t2
    ldr $t2,[$t3,#20]
    add $D,$D,$t0
    ldr $t0,[$t3,#24]
    add $E,$E,$t1
    ldr $t1,[$t3,#28]
    add $F,$F,$t2
    ldr $inp,[sp,#17*4]     @ pull inp
    ldr $t2,[sp,#18*4]      @ pull inp+len
    add $G,$G,$t0
    add $H,$H,$t1
    stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
    cmp $inp,$t2
    sub $Ktbl,$Ktbl,#256    @ rewind Ktbl
    bne .Loop

    add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
    ldmia   sp!,{r4-r11,pc}
#else
    ldmia   sp!,{r4-r11,lr}
    tst lr,#1
    moveq   pc,lr           @ be binary compatible with V4, yet
    bx  lr          @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
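
# How the helpers above are used (editorial note): a call to any undefined
# sub, e.g. &vext_8($T0,@X[0],@X[1],4), lands in AUTOLOAD, which strips the
# package prefix, turns the first '_' into '.', prefixes a bare numeric last
# argument with '#', and appends the finished line to $code -- here
# "vext.8 q8,q0,q1,#4" on the first Xupdate call. Dlo()/Dhi() name the two
# 64-bit halves of a NEON quad register per the architectural aliasing,
# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3".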

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

    &vext_8     ($T0,@X[0],@X[1],4);    # X[1..4]
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vext_8     ($T1,@X[2],@X[3],4);    # X[9..12]
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vshr_u32   ($T2,$T0,$sigma0[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   (@X[0],@X[0],$T1);  # X[0..3] += X[9..12]
     eval(shift(@insns));
     eval(shift(@insns));
    &vshr_u32   ($T1,$T0,$sigma0[2]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vsli_32    ($T2,$T0,32-$sigma0[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vshr_u32   ($T3,$T0,$sigma0[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &veor       ($T1,$T1,$T2);
     eval(shift(@insns));
     eval(shift(@insns));
    &vsli_32    ($T3,$T0,32-$sigma0[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &veor       ($T1,$T1,$T3);      # sigma0(X[1..4])
     eval(shift(@insns));
     eval(shift(@insns));
      &vsli_32  ($T4,&Dhi(@X[3]),32-$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   (@X[0],@X[0],$T1);  # X[0..3] += sigma0(X[1..4])
     eval(shift(@insns));
     eval(shift(@insns));
      &veor     ($T5,$T5,$T4);
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &vsli_32  ($T4,&Dhi(@X[3]),32-$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &veor     ($T5,$T5,$T4);      # sigma1(X[14..15])
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
      &vsli_32  ($T4,&Dlo(@X[0]),32-$sigma1[0]);
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
     eval(shift(@insns));
     eval(shift(@insns));
      &veor     ($T5,$T5,$T4);
     eval(shift(@insns));
     eval(shift(@insns));
      &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vld1_32    ("{$T0}","[$Ktbl,:128]!");
     eval(shift(@insns));
     eval(shift(@insns));
      &vsli_32  ($T4,&Dlo(@X[0]),32-$sigma1[1]);
     eval(shift(@insns));
     eval(shift(@insns));
      &veor     ($T5,$T5,$T4);      # sigma1(X[16..17])
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   ($T0,$T0,@X[0]);
     while($#insns>=2) { eval(shift(@insns)); }
    &vst1_32    ("{$T0}","[$Xfer,:128]!");
     eval(shift(@insns));
     eval(shift(@insns));

    push(@X,shift(@X));     # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vld1_32    ("{$T0}","[$Ktbl,:128]!");
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vrev32_8   (@X[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32   ($T0,$T0,@X[0]);
     foreach (@insns) { eval; } # remaining instructions
    &vst1_32    ("{$T0}","[$Xfer,:128]!");

    push(@X,shift(@X));     # "rotate" X[]
}

sub body_00_15 () {
    (
    '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
    '&add   ($h,$h,$t1)',           # h+=X[i]+K[i]
    '&eor   ($t1,$f,$g)',
    '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
    '&add   ($a,$a,$t2)',           # h+=Maj(a,b,c) from the past
    '&and   ($t1,$t1,$e)',
    '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
    '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
    '&eor   ($t1,$t1,$g)',          # Ch(e,f,g)
    '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
    '&eor   ($t2,$a,$b)',           # a^b, b^c in next round
    '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
    '&add   ($h,$h,$t1)',           # h+=Ch(e,f,g)
    '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
    '&ldr   ($t1,"[$Ktbl]")             if ($j==15);'.
    '&ldr   ($t1,"[sp,#64]")            if ($j==31)',
    '&and   ($t3,$t3,$t2)',         # (b^c)&=(a^b)
    '&add   ($d,$d,$h)',            # d+=h
    '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
    '&eor   ($t3,$t3,$b)',          # Maj(a,b,c)
    '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
    )
}
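
# Editorial note: body_00_15() returns one scalar round as a list of string
# fragments instead of emitting it directly. Xupdate()/Xpreload() eval a few
# of those fragments between consecutive NEON instructions, so the integer
# pipeline retires rounds while the NEON unit prepares the next four schedule
# words -- the interleaving behind the Cortex-A8 speedup cited in the header.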

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha256_block_data_order_neon
.type   sha256_block_data_order_neon,%function
.align  4
sha256_block_data_order_neon:
.LNEON:
    stmdb   sp!,{r4-r12,lr}

    sub $H,sp,#16*4+16
    adr $Ktbl,.Lsha256_block_data_order
    sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
    bic $H,$H,#15       @ align for 128-bit stores
    mov $t2,sp
    mov sp,$H           @ alloca
    add $len,$inp,$len,lsl#6    @ len to point at the end of inp

    vld1.8      {@X[0]},[$inp]!
    vld1.8      {@X[1]},[$inp]!
    vld1.8      {@X[2]},[$inp]!
    vld1.8      {@X[3]},[$inp]!
    vld1.32     {$T0},[$Ktbl,:128]!
    vld1.32     {$T1},[$Ktbl,:128]!
    vld1.32     {$T2},[$Ktbl,:128]!
    vld1.32     {$T3},[$Ktbl,:128]!
    vrev32.8    @X[0],@X[0]     @ yes, even on
    str     $ctx,[sp,#64]
    vrev32.8    @X[1],@X[1]     @ big-endian
    str     $inp,[sp,#68]
    mov     $Xfer,sp
    vrev32.8    @X[2],@X[2]
    str     $len,[sp,#72]
    vrev32.8    @X[3],@X[3]
    str     $t2,[sp,#76]        @ save original sp
    vadd.i32    $T0,$T0,@X[0]
    vadd.i32    $T1,$T1,@X[1]
    vst1.32     {$T0},[$Xfer,:128]!
    vadd.i32    $T2,$T2,@X[2]
    vst1.32     {$T1},[$Xfer,:128]!
    vadd.i32    $T3,$T3,@X[3]
    vst1.32     {$T2},[$Xfer,:128]!
    vst1.32     {$T3},[$Xfer,:128]!

    ldmia       $ctx,{$A-$H}
    sub     $Xfer,$Xfer,#64
    ldr     $t1,[sp,#0]
    eor     $t2,$t2,$t2
    eor     $t3,$B,$C
    b       .L_00_48

.align  4
.L_00_48:
___
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
$code.=<<___;
    teq $t1,#0              @ check for K256 terminator
    ldr $t1,[sp,#0]
    sub $Xfer,$Xfer,#64
    bne .L_00_48

    ldr     $inp,[sp,#68]
    ldr     $t0,[sp,#72]
    sub     $Ktbl,$Ktbl,#256    @ rewind $Ktbl
    teq     $inp,$t0
    it      eq
    subeq       $inp,$inp,#64       @ avoid SEGV
    vld1.8      {@X[0]},[$inp]!     @ load next input block
    vld1.8      {@X[1]},[$inp]!
    vld1.8      {@X[2]},[$inp]!
    vld1.8      {@X[3]},[$inp]!
    it      ne
    strne       $inp,[sp,#68]
    mov     $Xfer,sp
___
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
$code.=<<___;
    ldr $t0,[$t1,#0]
    add $A,$A,$t2           @ h+=Maj(a,b,c) from the past
    ldr $t2,[$t1,#4]
    ldr $t3,[$t1,#8]
    ldr $t4,[$t1,#12]
    add $A,$A,$t0           @ accumulate
    ldr $t0,[$t1,#16]
    add $B,$B,$t2
    ldr $t2,[$t1,#20]
    add $C,$C,$t3
    ldr $t3,[$t1,#24]
    add $D,$D,$t4
    ldr $t4,[$t1,#28]
    add $E,$E,$t0
    str $A,[$t1],#4
    add $F,$F,$t2
    str $B,[$t1],#4
    add $G,$G,$t3
    str $C,[$t1],#4
    add $H,$H,$t4
    str $D,[$t1],#4
    stmia   $t1,{$E-$H}

    ittte   ne
    movne   $Xfer,sp
    ldrne   $t1,[sp,#0]
    eorne   $t2,$t2,$t2
    ldreq   sp,[sp,#76]         @ restore original sp
    itt ne
    eorne   $t3,$B,$C
    bne .L_00_48

    ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d) .byte   c,d|0xc,a,b
# else
#  define INST(a,b,c,d) .byte   a,b,c,d
# endif

.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
    vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
    adr $Ktbl,.LARMv8
    sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
    adrl    $Ktbl,K256
# endif
    add $len,$inp,$len,lsl#6    @ len to point at the end of inp

.Loop_v8:
    vld1.8      {@MSG[0]-@MSG[1]},[$inp]!
    vld1.8      {@MSG[2]-@MSG[3]},[$inp]!
    vld1.32     {$W0},[$Ktbl]!
    vrev32.8    @MSG[0],@MSG[0]
    vrev32.8    @MSG[1],@MSG[1]
    vrev32.8    @MSG[2],@MSG[2]
    vrev32.8    @MSG[3],@MSG[3]
    vmov        $ABCD_SAVE,$ABCD    @ offload
    vmov        $EFGH_SAVE,$EFGH
    teq     $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
    vld1.32     {$W1},[$Ktbl]!
    vadd.i32    $W0,$W0,@MSG[0]
    sha256su0   @MSG[0],@MSG[1]
    vmov        $abcd,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0
    sha256su1   @MSG[0],@MSG[2],@MSG[3]
___
    ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
    vld1.32     {$W1},[$Ktbl]!
    vadd.i32    $W0,$W0,@MSG[0]
    vmov        $abcd,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0

    vld1.32     {$W0},[$Ktbl]!
    vadd.i32    $W1,$W1,@MSG[1]
    vmov        $abcd,$ABCD
    sha256h     $ABCD,$EFGH,$W1
    sha256h2    $EFGH,$abcd,$W1

    vld1.32     {$W1},[$Ktbl]
    vadd.i32    $W0,$W0,@MSG[2]
    sub     $Ktbl,$Ktbl,#256-16 @ rewind
    vmov        $abcd,$ABCD
    sha256h     $ABCD,$EFGH,$W0
    sha256h2    $EFGH,$abcd,$W0

    vadd.i32    $W1,$W1,@MSG[3]
    vmov        $abcd,$ABCD
    sha256h     $ABCD,$EFGH,$W1
    sha256h2    $EFGH,$abcd,$W1

    vadd.i32    $ABCD,$ABCD,$ABCD_SAVE
    vadd.i32    $EFGH,$EFGH,$EFGH_SAVE
    it      ne
    bne     .Loop_v8

    vst1.32     {$ABCD,$EFGH},[$ctx]

    ret     @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/@/ and !/^$/);
    print;
}
close SELF;

{   my  %opcode = (
    "sha256h"   => 0xf3000c40,  "sha256h2"  => 0xf3100c40,
    "sha256su0" => 0xf3ba03c0,  "sha256su1" => 0xf3200c40   );

    sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
        my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                     |(($2&7)<<17)|(($2&8)<<4)
                     |(($3&7)<<1) |(($3&8)<<2);
        # The bytes are emitted in this order because ARMv7 instructions
        # are always encoded little-endian. The correct solution would be
        # the .inst directive, but older assemblers don't implement it:-(
        sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
            $word&0xff,($word>>8)&0xff,
            ($word>>16)&0xff,($word>>24)&0xff,
            $mnemonic,$arg;
    }
    }
}
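
# Worked example (editorial note): for "sha256h q0,q1,q12" the regex captures
# (0,1,12), giving $word = 0xf3000c40|(1<<17)|(4<<1)|(8<<2) = 0xf3020c68,
# emitted as its little-endian bytes INST(0x68,0x0c,0x02,0xf3) so that
# assemblers without SHA-2 support still accept the file; the __thumb2__
# variant of the INST() macro reorders the bytes into the equivalent
# Thumb-2 encoding.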

foreach (split($/,$code)) {

    s/\`([^\`]*)\`/eval $1/geo;

    s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

    s/\bret\b/bx    lr/go       or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

    print $_,"\n";
}

close STDOUT; # enforce flush