#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the *lower*
# address, which was reflected in the two parameters below as 0 and 4.
# Now the caller is expected to maintain native byte order for whole
# 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
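#
# Note: $hi/$lo expand to the assembler symbols HI and LO, which the
# generated preamble #defines as 4/0 on little-endian (__ARMEL__) and
# 0/4 on big-endian builds, i.e. the byte offsets of the high and low
# 32-bit word inside each native-endian 64-bit state element. For
# example, with $Eoff=8*4 below, [$ctx,#$Eoff+$lo] addresses the low
# word of e at byte 32 on little-endian and at byte 36 on big-endian.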

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";  # parameter block (u64 h[8], native byte order)
$inp="r1";  # input data
$len="r2";  # number of 128-byte blocks to process

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############    r13 is stack pointer
$Ktbl="r14";
############    r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
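
# The offsets above serve double duty: relative to $ctx they address the
# 64-bit state words, and relative to sp they address the working copies
# a..h kept in the 9*8-byte stack frame (a and e are also cached in
# registers). $Xoff is where the current message-schedule word is stored;
# BODY_00_15 drops sp by 8 every round, so recent X[] words stay
# addressable at fixed sp-relative offsets, and the whole 80*8-byte area
# is released with "add sp,sp,#640" after the last round.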

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
    @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
    @ LO        lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
    @ HI        hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
    mov $t0,$Elo,lsr#14
    str $Tlo,[sp,#$Xoff+0]
    mov $t1,$Ehi,lsr#14
    str $Thi,[sp,#$Xoff+4]
    eor $t0,$t0,$Ehi,lsl#18
    ldr $t2,[sp,#$Hoff+0]   @ h.lo
    eor $t1,$t1,$Elo,lsl#18
    ldr $t3,[sp,#$Hoff+4]   @ h.hi
    eor $t0,$t0,$Elo,lsr#18
    eor $t1,$t1,$Ehi,lsr#18
    eor $t0,$t0,$Ehi,lsl#14
    eor $t1,$t1,$Elo,lsl#14
    eor $t0,$t0,$Ehi,lsr#9
    eor $t1,$t1,$Elo,lsr#9
    eor $t0,$t0,$Elo,lsl#23
    eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
    adds    $Tlo,$Tlo,$t0
    ldr $t0,[sp,#$Foff+0]   @ f.lo
    adc $Thi,$Thi,$t1       @ T += Sigma1(e)
    ldr $t1,[sp,#$Foff+4]   @ f.hi
    adds    $Tlo,$Tlo,$t2
    ldr $t2,[sp,#$Goff+0]   @ g.lo
    adc $Thi,$Thi,$t3       @ T += h
    ldr $t3,[sp,#$Goff+4]   @ g.hi

    eor $t0,$t0,$t2
    str $Elo,[sp,#$Eoff+0]
    eor $t1,$t1,$t3
    str $Ehi,[sp,#$Eoff+4]
    and $t0,$t0,$Elo
    str $Alo,[sp,#$Aoff+0]
    and $t1,$t1,$Ehi
    str $Ahi,[sp,#$Aoff+4]
    eor $t0,$t0,$t2
    ldr $t2,[$Ktbl,#$lo]    @ K[i].lo
    eor $t1,$t1,$t3     @ Ch(e,f,g)
    ldr $t3,[$Ktbl,#$hi]    @ K[i].hi

    adds    $Tlo,$Tlo,$t0
    ldr $Elo,[sp,#$Doff+0]  @ d.lo
    adc $Thi,$Thi,$t1       @ T += Ch(e,f,g)
    ldr $Ehi,[sp,#$Doff+4]  @ d.hi
    adds    $Tlo,$Tlo,$t2
    and $t0,$t2,#0xff
    adc $Thi,$Thi,$t3       @ T += K[i]
    adds    $Elo,$Elo,$Tlo
    ldr $t2,[sp,#$Boff+0]   @ b.lo
    adc $Ehi,$Ehi,$Thi      @ d += T
    teq $t0,#$magic

    ldr $t3,[sp,#$Coff+0]   @ c.lo
#if __ARM_ARCH__>=7
    it  eq          @ Thumb2 thing, sanity check in ARM
#endif
    orreq   $Ktbl,$Ktbl,#1
    @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
    @ LO        lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
    @ HI        hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
    mov $t0,$Alo,lsr#28
    mov $t1,$Ahi,lsr#28
    eor $t0,$t0,$Ahi,lsl#4
    eor $t1,$t1,$Alo,lsl#4
    eor $t0,$t0,$Ahi,lsr#2
    eor $t1,$t1,$Alo,lsr#2
    eor $t0,$t0,$Alo,lsl#30
    eor $t1,$t1,$Ahi,lsl#30
    eor $t0,$t0,$Ahi,lsr#7
    eor $t1,$t1,$Alo,lsr#7
    eor $t0,$t0,$Alo,lsl#25
    eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
    adds    $Tlo,$Tlo,$t0
    and $t0,$Alo,$t2
    adc $Thi,$Thi,$t1       @ T += Sigma0(a)

    ldr $t1,[sp,#$Boff+4]   @ b.hi
    orr $Alo,$Alo,$t2
    ldr $t2,[sp,#$Coff+4]   @ c.hi
    and $Alo,$Alo,$t3
    and $t3,$Ahi,$t1
    orr $Ahi,$Ahi,$t1
    orr $Alo,$Alo,$t0       @ Maj(a,b,c).lo
    and $Ahi,$Ahi,$t2
    adds    $Alo,$Alo,$Tlo
    orr $Ahi,$Ahi,$t3       @ Maj(a,b,c).hi
    sub sp,sp,#8
    adc $Ahi,$Ahi,$Thi      @ h += T
    tst $Ktbl,#1
    add $Ktbl,$Ktbl,#8
___
}
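
# The BODY_00_15 sub above evaluates each 64-bit rotation of Sigma1(e)
# and Sigma0(a) as a pair of 32-bit shifts on the (hi,lo) halves, XORed
# together exactly as spelled out in the LO/HI comments. The helper
# below is a minimal reference sketch of that decomposition for Sigma1;
# it is illustrative only, is never called by the generator, and assumes
# a 64-bit perl (hence the explicit 32-bit masking at the end).
sub ref_Sigma1_halves {
    my ($ehi,$elo) = @_;    # e as 32-bit halves
    my $m = 0xffffffff;
    # low word:  lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
    my $rlo = ($elo>>14 ^ $ehi<<18 ^ $elo>>18 ^ $ehi<<14 ^ $ehi>>9 ^ $elo<<23) & $m;
    # high word: hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
    my $rhi = ($ehi>>14 ^ $elo<<18 ^ $ehi>>18 ^ $elo<<14 ^ $elo>>9 ^ $ehi<<23) & $m;
    return ($rhi,$rlo);     # (hi,lo) halves of ROTR64(e,14)^ROTR64(e,18)^ROTR64(e,41)
}
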
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
# define VFP_ABI_POP    vldmia  sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)    .word   lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)    .word   hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code   32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type   K512,%object
.align  5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size   K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha512_block_data_order
.skip   32-4
#else
.skip   32
#endif

.global sha512_block_data_order
.type   sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
    sub r3,pc,#8        @ sha512_block_data_order
#else
    adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
    ldr r12,.LOPENSSL_armcap
    ldr r12,[r3,r12]        @ OPENSSL_armcap_P
    tst r12,#1
    bne .LNEON
#endif
    add $len,$inp,$len,lsl#7    @ len to point at the end of inp
    stmdb   sp!,{r4-r12,lr}
    sub $Ktbl,r3,#672       @ K512
    sub sp,sp,#9*8

    ldr $Elo,[$ctx,#$Eoff+$lo]
    ldr $Ehi,[$ctx,#$Eoff+$hi]
    ldr $t0, [$ctx,#$Goff+$lo]
    ldr $t1, [$ctx,#$Goff+$hi]
    ldr $t2, [$ctx,#$Hoff+$lo]
    ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
    str $t0, [sp,#$Goff+0]
    str $t1, [sp,#$Goff+4]
    str $t2, [sp,#$Hoff+0]
    str $t3, [sp,#$Hoff+4]
    ldr $Alo,[$ctx,#$Aoff+$lo]
    ldr $Ahi,[$ctx,#$Aoff+$hi]
    ldr $Tlo,[$ctx,#$Boff+$lo]
    ldr $Thi,[$ctx,#$Boff+$hi]
    ldr $t0, [$ctx,#$Coff+$lo]
    ldr $t1, [$ctx,#$Coff+$hi]
    ldr $t2, [$ctx,#$Doff+$lo]
    ldr $t3, [$ctx,#$Doff+$hi]
    str $Tlo,[sp,#$Boff+0]
    str $Thi,[sp,#$Boff+4]
    str $t0, [sp,#$Coff+0]
    str $t1, [sp,#$Coff+4]
    str $t2, [sp,#$Doff+0]
    str $t3, [sp,#$Doff+4]
    ldr $Tlo,[$ctx,#$Foff+$lo]
    ldr $Thi,[$ctx,#$Foff+$hi]
    str $Tlo,[sp,#$Foff+0]
    str $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
    ldrb    $Tlo,[$inp,#7]
    ldrb    $t0, [$inp,#6]
    ldrb    $t1, [$inp,#5]
    ldrb    $t2, [$inp,#4]
    ldrb    $Thi,[$inp,#3]
    ldrb    $t3, [$inp,#2]
    orr $Tlo,$Tlo,$t0,lsl#8
    ldrb    $t0, [$inp,#1]
    orr $Tlo,$Tlo,$t1,lsl#16
    ldrb    $t1, [$inp],#8
    orr $Tlo,$Tlo,$t2,lsl#24
    orr $Thi,$Thi,$t3,lsl#8
    orr $Thi,$Thi,$t0,lsl#16
    orr $Thi,$Thi,$t1,lsl#24
#else
    ldr $Tlo,[$inp,#4]
    ldr $Thi,[$inp],#8
#ifdef __ARMEL__
    rev $Tlo,$Tlo
    rev $Thi,$Thi
#endif
#endif
___
    &BODY_00_15(0x94);
$code.=<<___;
    tst $Ktbl,#1
    beq .L00_15
    ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
    ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
    bic $Ktbl,$Ktbl,#1
.L16_79:
    @ sigma0(x) (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
    @ LO        lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
    @ HI        hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
    mov $Tlo,$t0,lsr#1
    ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
    mov $Thi,$t1,lsr#1
    ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
    eor $Tlo,$Tlo,$t1,lsl#31
    eor $Thi,$Thi,$t0,lsl#31
    eor $Tlo,$Tlo,$t0,lsr#8
    eor $Thi,$Thi,$t1,lsr#8
    eor $Tlo,$Tlo,$t1,lsl#24
    eor $Thi,$Thi,$t0,lsl#24
    eor $Tlo,$Tlo,$t0,lsr#7
    eor $Thi,$Thi,$t1,lsr#7
    eor $Tlo,$Tlo,$t1,lsl#25

    @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
    @ LO        lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
    @ HI        hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
    mov $t0,$t2,lsr#19
    mov $t1,$t3,lsr#19
    eor $t0,$t0,$t3,lsl#13
    eor $t1,$t1,$t2,lsl#13
    eor $t0,$t0,$t3,lsr#29
    eor $t1,$t1,$t2,lsr#29
    eor $t0,$t0,$t2,lsl#3
    eor $t1,$t1,$t3,lsl#3
    eor $t0,$t0,$t2,lsr#6
    eor $t1,$t1,$t3,lsr#6
    ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
    eor $t0,$t0,$t3,lsl#26

    ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
    adds    $Tlo,$Tlo,$t0
    ldr $t0,[sp,#`$Xoff+8*16`+0]
    adc $Thi,$Thi,$t1

    ldr $t1,[sp,#`$Xoff+8*16`+4]
    adds    $Tlo,$Tlo,$t2
    adc $Thi,$Thi,$t3
    adds    $Tlo,$Tlo,$t0
    adc $Thi,$Thi,$t1
___
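# Note: sigma0/sigma1 above reuse the same halves trick as Sigma1/Sigma0
# in BODY_00_15, with one difference: their last term is a logical shift
# rather than a rotation, so the high word takes no bits from the low
# word (plain hi>>7 resp. hi>>6, with no lo<<n counterpart).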
    &BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
    ittt    eq          @ Thumb2 thing, sanity check in ARM
#endif
    ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
    ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
    beq .L16_79
    bic $Ktbl,$Ktbl,#1

    ldr $Tlo,[sp,#$Boff+0]
    ldr $Thi,[sp,#$Boff+4]
    ldr $t0, [$ctx,#$Aoff+$lo]
    ldr $t1, [$ctx,#$Aoff+$hi]
    ldr $t2, [$ctx,#$Boff+$lo]
    ldr $t3, [$ctx,#$Boff+$hi]
    adds    $t0,$Alo,$t0
    str $t0, [$ctx,#$Aoff+$lo]
    adc $t1,$Ahi,$t1
    str $t1, [$ctx,#$Aoff+$hi]
    adds    $t2,$Tlo,$t2
    str $t2, [$ctx,#$Boff+$lo]
    adc $t3,$Thi,$t3
    str $t3, [$ctx,#$Boff+$hi]

    ldr $Alo,[sp,#$Coff+0]
    ldr $Ahi,[sp,#$Coff+4]
    ldr $Tlo,[sp,#$Doff+0]
    ldr $Thi,[sp,#$Doff+4]
    ldr $t0, [$ctx,#$Coff+$lo]
    ldr $t1, [$ctx,#$Coff+$hi]
    ldr $t2, [$ctx,#$Doff+$lo]
    ldr $t3, [$ctx,#$Doff+$hi]
    adds    $t0,$Alo,$t0
    str $t0, [$ctx,#$Coff+$lo]
    adc $t1,$Ahi,$t1
    str $t1, [$ctx,#$Coff+$hi]
    adds    $t2,$Tlo,$t2
    str $t2, [$ctx,#$Doff+$lo]
    adc $t3,$Thi,$t3
    str $t3, [$ctx,#$Doff+$hi]

    ldr $Tlo,[sp,#$Foff+0]
    ldr $Thi,[sp,#$Foff+4]
    ldr $t0, [$ctx,#$Eoff+$lo]
    ldr $t1, [$ctx,#$Eoff+$hi]
    ldr $t2, [$ctx,#$Foff+$lo]
    ldr $t3, [$ctx,#$Foff+$hi]
    adds    $Elo,$Elo,$t0
    str $Elo,[$ctx,#$Eoff+$lo]
    adc $Ehi,$Ehi,$t1
    str $Ehi,[$ctx,#$Eoff+$hi]
    adds    $t2,$Tlo,$t2
    str $t2, [$ctx,#$Foff+$lo]
    adc $t3,$Thi,$t3
    str $t3, [$ctx,#$Foff+$hi]

    ldr $Alo,[sp,#$Goff+0]
    ldr $Ahi,[sp,#$Goff+4]
    ldr $Tlo,[sp,#$Hoff+0]
    ldr $Thi,[sp,#$Hoff+4]
    ldr $t0, [$ctx,#$Goff+$lo]
    ldr $t1, [$ctx,#$Goff+$hi]
    ldr $t2, [$ctx,#$Hoff+$lo]
    ldr $t3, [$ctx,#$Hoff+$hi]
    adds    $t0,$Alo,$t0
    str $t0, [$ctx,#$Goff+$lo]
    adc $t1,$Ahi,$t1
    str $t1, [$ctx,#$Goff+$hi]
    adds    $t2,$Tlo,$t2
    str $t2, [$ctx,#$Hoff+$lo]
    adc $t3,$Thi,$t3
    str $t3, [$ctx,#$Hoff+$hi]

    add sp,sp,#640
    sub $Ktbl,$Ktbl,#640

    teq $inp,$len
    bne .Loop

    add sp,sp,#8*9      @ destroy frame
#if __ARM_ARCH__>=5
    ldmia   sp!,{r4-r12,pc}
#else
    ldmia   sp!,{r4-r12,lr}
    tst lr,#1
    moveq   pc,lr           @ be binary compatible with V4, yet
    bx  lr          @ interoperable with Thumb ISA:-)
#endif
.size   sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps

$code.=<<___ if ($i<16 || $i&1);
    vshr.u64    $t0,$e,#@Sigma1[0]  @ $i
#if $i<16
    vld1.64     {@X[$i%16]},[$inp]! @ handles unaligned
#endif
    vshr.u64    $t1,$e,#@Sigma1[1]
#if $i>0
     vadd.i64   $a,$Maj         @ h+=Maj from the past
#endif
    vshr.u64    $t2,$e,#@Sigma1[2]
___
$code.=<<___;
    vld1.64     {$K},[$Ktbl,:64]!   @ K[i++]
    vsli.64     $t0,$e,#`64-@Sigma1[0]`
    vsli.64     $t1,$e,#`64-@Sigma1[1]`
    vmov        $Ch,$e
    vsli.64     $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
    vrev64.8    @X[$i],@X[$i]
#endif
    veor        $t1,$t0
    vbsl        $Ch,$f,$g       @ Ch(e,f,g)
    vshr.u64    $t0,$a,#@Sigma0[0]
    veor        $t2,$t1         @ Sigma1(e)
    vadd.i64    $T1,$Ch,$h
    vshr.u64    $t1,$a,#@Sigma0[1]
    vsli.64     $t0,$a,#`64-@Sigma0[0]`
    vadd.i64    $T1,$t2
    vshr.u64    $t2,$a,#@Sigma0[2]
    vadd.i64    $K,@X[$i%16]
    vsli.64     $t1,$a,#`64-@Sigma0[1]`
    veor        $Maj,$a,$b
    vsli.64     $t2,$a,#`64-@Sigma0[2]`
    veor        $h,$t0,$t1
    vadd.i64    $T1,$K
    vbsl        $Maj,$c,$b      @ Maj(a,b,c)
    veor        $h,$t2          @ Sigma0(a)
    vadd.i64    $d,$T1
    vadd.i64    $Maj,$T1
    @ vadd.i64  $h,$Maj
___
}
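
# A note on the NEON flavour above: each 64-bit rotation is synthesized
# with a vshr.u64/vsli.64 pair (logical shift right, then shift-left-and-
# insert into the cleared bits), Ch(e,f,g) and Maj(a,b,c) are single vbsl
# bit-selects seeded with e and a^b respectively, and the final
# h+=Maj(a,b,c) of a round is deferred into the following round ("h+=Maj
# from the past") to shorten the dependency chain.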

sub NEON_16_79() {
my $i=shift;

if ($i&1)   { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));            # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26));     # temps from NEON_00_15
my $e=@_[4];                    # $e from NEON_00_15
$i /= 2;
$code.=<<___;
    vshr.u64    $t0,@X[($i+7)%8],#@sigma1[0]
    vshr.u64    $t1,@X[($i+7)%8],#@sigma1[1]
     vadd.i64   @_[0],d30           @ h+=Maj from the past
    vshr.u64    $s1,@X[($i+7)%8],#@sigma1[2]
    vsli.64     $t0,@X[($i+7)%8],#`64-@sigma1[0]`
    vext.8      $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
    vsli.64     $t1,@X[($i+7)%8],#`64-@sigma1[1]`
    veor        $s1,$t0
    vshr.u64    $t0,$s0,#@sigma0[0]
    veor        $s1,$t1             @ sigma1(X[i+14])
    vshr.u64    $t1,$s0,#@sigma0[1]
    vadd.i64    @X[$i%8],$s1
    vshr.u64    $s1,$s0,#@sigma0[2]
    vsli.64     $t0,$s0,#`64-@sigma0[0]`
    vsli.64     $t1,$s0,#`64-@sigma0[1]`
    vext.8      $s0,@X[($i+4)%8],@X[($i+5)%8],#8    @ X[i+9]
    veor        $s1,$t0
    vshr.u64    $d0,$e,#@Sigma1[0]      @ from NEON_00_15
    vadd.i64    @X[$i%8],$s0
    vshr.u64    $d1,$e,#@Sigma1[1]      @ from NEON_00_15
    veor        $s1,$t1             @ sigma0(X[i+1])
    vshr.u64    $d2,$e,#@Sigma1[2]      @ from NEON_00_15
    vadd.i64    @X[$i%8],$s1
___
    &NEON_00_15(2*$i,@_);
}
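
# NEON_16_79 handles the message schedule two lanes at a time: on
# even-numbered rounds it updates two X[] words at once on 128-bit q
# registers (hence "$i /= 2" and viewing @X as q0-q7), and interleaves
# the three Sigma1(e) shifts of the upcoming round into d24-d26 (the
# lines tagged "@ from NEON_00_15"); odd-numbered rounds fall straight
# through to NEON_00_15.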

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha512_block_data_order_neon
.type   sha512_block_data_order_neon,%function
.align  4
sha512_block_data_order_neon:
.LNEON:
    dmb             @ errata #451034 on early Cortex A8
    add $len,$inp,$len,lsl#7    @ len to point at the end of inp
    VFP_ABI_PUSH
    adr $Ktbl,.Lsha512_block_data_order
    sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
    vldmia  $ctx,{$A-$H}        @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
    mov     $cnt,#4
.L16_79_neon:
    subs        $cnt,#1
___
for(;$i<32;$i++)    { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
    bne     .L16_79_neon

     vadd.i64   $A,d30      @ h+=Maj from the past
    vldmia      $ctx,{d24-d31}  @ load context to temp
    vadd.i64    q8,q12      @ vectorized accumulate
    vadd.i64    q9,q13
    vadd.i64    q10,q14
    vadd.i64    q11,q15
    vstmia      $ctx,{$A-$H}    @ save context
    teq     $inp,$len
    sub     $Ktbl,#640  @ rewind K512
    bne     .Loop_neon

    VFP_ABI_POP
    ret             @ bx lr
.size   sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

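# Post-process the generated text: evaluate the arithmetic inside
# backticks (constant offsets and shift amounts), then apply the
# instruction substitutions below. The order matters: "ret" (used only
# in the ARMv7-only NEON path) is turned into "bx lr" after the armv4
# substitution, so it is not itself rewritten as a raw opcode.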
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx   lr/gm;

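# Re-read this script and copy its leading comment block (license and
# history, with '#' rewritten to the assembler comment character '@')
# into the output, so the header survives in the generated .S file; the
# copy stops at the first line that is neither a comment nor blank.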
open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/@/ and !/^$/);
    print;
}
close SELF;

print $code;
close STDOUT; # enforce flush