Back to home page

OSCL-LXR

 
 

    


0001 #!/usr/bin/env perl
0002 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
0003 #
0004 # ====================================================================
0005 # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
0006 # project.
0007 # ====================================================================
0008 
0009 # Poly1305 hash for MIPS.
0010 #
0011 # May 2016
0012 #
0013 # Numbers are cycles per processed byte with poly1305_blocks alone.
0014 #
0015 #       IALU/gcc
0016 # R1x000    ~5.5/+130%  (big-endian)
0017 # Octeon II 2.50/+70%   (little-endian)
0018 #
0019 # March 2019
0020 #
0021 # Add 32-bit code path.
0022 #
0023 # October 2019
0024 #
0025 # Modulo-scheduling the reduction removes the dependency chain at the
0026 # end of the inner loop and improves performance. Also optimize the MIPS32R2
0027 # code path for the MIPS 1004K core, per René von Dorst's suggestions.
0028 #
0029 #       IALU/gcc
0030 # R1x000    ~9.8/?      (big-endian)
0031 # Octeon II 3.65/+140%  (little-endian)
0032 # MT7621/1004K  4.75/?      (little-endian)
0033 #
0034 ######################################################################
0035 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
0036 # widely used. Then there is a new contender: NUBI. It appears that if
0037 # one picks the latter, it's possible to arrange code in an ABI-neutral
0038 # manner. Therefore let's stick to the NUBI register layout:
0039 #
0040 ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));  # each variable holds the string "$N": here $0,$1,$2,$24,$25
0041 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));  # $a0-$a7 are $4-$11
0042 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));  # $s0-$s11 are $12-$23
0043 ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));  # $gp is $3; $tp,$sp,$fp,$ra are $28-$31
0044 #
0045 # The return value is placed in $a0. The following coding rules
0046 # facilitate interoperability:
0047 #
0048 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
0049 #   excluded from the rule, because it's specified volatile];
0050 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
0051 #   old code];
0052 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
0053 #
0054 # For reference here is register layout for N32/64 MIPS ABIs:
0055 #
0056 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
0057 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
0058 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
0059 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
0060 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
0061 #
0062 # <appro@openssl.org>
0063 #
0064 ######################################################################
0065 
0066 $flavour = shift || "64"; # first command-line argument, defaulting to "64"; supported flavours are o32,n32,64,nubi32,nubi64
0067 
0068 $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;  # register loaded with the return value: $a0 for NUBI flavours, $t0 ($2) otherwise
0069 
0070 if ($flavour =~ /64|n32/i) {{{
0071 ######################################################################
0072 # 64-bit code path
0073 #
0074 
0075 my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);  # symbolic names for $a0-$a3 as used by the generated 64-bit routines
0076 my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);  # two 64-bit input words plus five scratch registers
0077 
0078 $code.=<<___;
0079 #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
0080      defined(_MIPS_ARCH_MIPS64R6)) \\
0081      && !defined(_MIPS_ARCH_MIPS64R2)
0082 # define _MIPS_ARCH_MIPS64R2
0083 #endif
0084 
0085 #if defined(_MIPS_ARCH_MIPS64R6)
0086 # define dmultu(rs,rt)
0087 # define mflo(rd,rs,rt) dmulu   rd,rs,rt
0088 # define mfhi(rd,rs,rt) dmuhu   rd,rs,rt
0089 #else
0090 # define dmultu(rs,rt)      dmultu  rs,rt
0091 # define mflo(rd,rs,rt) mflo    rd
0092 # define mfhi(rd,rs,rt) mfhi    rd
0093 #endif
0094 
0095 #ifdef  __KERNEL__
0096 # define poly1305_init   poly1305_init_mips
0097 # define poly1305_blocks poly1305_blocks_mips
0098 # define poly1305_emit   poly1305_emit_mips
0099 #endif
0100 
0101 #if defined(__MIPSEB__) && !defined(MIPSEB)
0102 # define MIPSEB
0103 #endif
0104 
0105 #ifdef MIPSEB
0106 # define MSB 0
0107 # define LSB 7
0108 #else
0109 # define MSB 7
0110 # define LSB 0
0111 #endif
0112 
0113 .text
0114 .set    noat
0115 .set    noreorder
0116 
0117 .align  5
0118 .globl  poly1305_init
0119 .ent    poly1305_init
0120 poly1305_init:
0121     .frame  $sp,0,$ra
0122     .set    reorder
0123 
0124     sd  $zero,0($ctx)
0125     sd  $zero,8($ctx)
0126     sd  $zero,16($ctx)
0127 
0128     beqz    $inp,.Lno_key
0129 
0130 #if defined(_MIPS_ARCH_MIPS64R6)
0131     andi    $tmp0,$inp,7        # $inp % 8
0132     dsubu   $inp,$inp,$tmp0     # align $inp
0133     sll $tmp0,$tmp0,3       # byte to bit offset
0134     ld  $in0,0($inp)
0135     ld  $in1,8($inp)
0136     beqz    $tmp0,.Laligned_key
0137     ld  $tmp2,16($inp)
0138 
0139     subu    $tmp1,$zero,$tmp0
0140 # ifdef MIPSEB
0141     dsllv   $in0,$in0,$tmp0
0142     dsrlv   $tmp3,$in1,$tmp1
0143     dsllv   $in1,$in1,$tmp0
0144     dsrlv   $tmp2,$tmp2,$tmp1
0145 # else
0146     dsrlv   $in0,$in0,$tmp0
0147     dsllv   $tmp3,$in1,$tmp1
0148     dsrlv   $in1,$in1,$tmp0
0149     dsllv   $tmp2,$tmp2,$tmp1
0150 # endif
0151     or  $in0,$in0,$tmp3
0152     or  $in1,$in1,$tmp2
0153 .Laligned_key:
0154 #else
0155     ldl $in0,0+MSB($inp)
0156     ldl $in1,8+MSB($inp)
0157     ldr $in0,0+LSB($inp)
0158     ldr $in1,8+LSB($inp)
0159 #endif
0160 #ifdef  MIPSEB
0161 # if defined(_MIPS_ARCH_MIPS64R2)
0162     dsbh    $in0,$in0       # byte swap
0163      dsbh   $in1,$in1
0164     dshd    $in0,$in0
0165      dshd   $in1,$in1
0166 # else
0167     ori $tmp0,$zero,0xFF
0168     dsll    $tmp2,$tmp0,32
0169     or  $tmp0,$tmp2     # 0x000000FF000000FF
0170 
0171     and $tmp1,$in0,$tmp0    # byte swap
0172      and    $tmp3,$in1,$tmp0
0173     dsrl    $tmp2,$in0,24
0174      dsrl   $tmp4,$in1,24
0175     dsll    $tmp1,24
0176      dsll   $tmp3,24
0177     and $tmp2,$tmp0
0178      and    $tmp4,$tmp0
0179     dsll    $tmp0,8         # 0x0000FF000000FF00
0180     or  $tmp1,$tmp2
0181      or $tmp3,$tmp4
0182     and $tmp2,$in0,$tmp0
0183      and    $tmp4,$in1,$tmp0
0184     dsrl    $in0,8
0185      dsrl   $in1,8
0186     dsll    $tmp2,8
0187      dsll   $tmp4,8
0188     and $in0,$tmp0
0189      and    $in1,$tmp0
0190     or  $tmp1,$tmp2
0191      or $tmp3,$tmp4
0192     or  $in0,$tmp1
0193      or $in1,$tmp3
0194     dsrl    $tmp1,$in0,32
0195      dsrl   $tmp3,$in1,32
0196     dsll    $in0,32
0197      dsll   $in1,32
0198     or  $in0,$tmp1
0199      or $in1,$tmp3
0200 # endif
0201 #endif
0202     li  $tmp0,1
0203     dsll    $tmp0,32        # 0x0000000100000000
0204     daddiu  $tmp0,-63       # 0x00000000ffffffc1
0205     dsll    $tmp0,28        # 0x0ffffffc10000000
0206     daddiu  $tmp0,-1        # 0x0ffffffc0fffffff
0207 
0208     and $in0,$tmp0
0209     daddiu  $tmp0,-3        # 0x0ffffffc0ffffffc
0210     and $in1,$tmp0
0211 
0212     sd  $in0,24($ctx)
0213     dsrl    $tmp0,$in1,2
0214     sd  $in1,32($ctx)
0215     daddu   $tmp0,$in1      # s1 = r1 + (r1 >> 2)
0216     sd  $tmp0,40($ctx)
0217 
0218 .Lno_key:
0219     li  $v0,0           # return 0
0220     jr  $ra
0221 .end    poly1305_init
0222 ___
0223 {
0224 my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";  # .mask bits 12-17 ($s0-$s5) for NUBI, bits 16-17 ($s4,$s5) otherwise
0225 
0226 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =  # hash words, key words (r0, r1, s1 = r1 + (r1 >> 2)) and per-block accumulators
0227    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
0228 my ($shr,$shl) = ($s6,$s7);     # bit-shift counts for realigning unaligned input; used on R6 only
0229 
0230 $code.=<<___;  # poly1305_blocks stub (returns at once when $len>>4 is zero) followed by the poly1305_blocks_internal prologue
0231 .align  5
0232 .globl  poly1305_blocks
0233 .ent    poly1305_blocks
0234 poly1305_blocks:
0235     .set    noreorder
0236     dsrl    $len,4          # number of complete blocks
0237     bnez    $len,poly1305_blocks_internal
0238     nop
0239     jr  $ra
0240     nop
0241 .end    poly1305_blocks
0242 
0243 .align  5
0244 .ent    poly1305_blocks_internal
0245 poly1305_blocks_internal:
0246     .set    noreorder
0247 #if defined(_MIPS_ARCH_MIPS64R6)
0248     .frame  $sp,8*8,$ra
0249     .mask   $SAVED_REGS_MASK|0x000c0000,-8
0250     dsubu   $sp,8*8
0251     sd  $s7,56($sp)
0252     sd  $s6,48($sp)
0253 #else
0254     .frame  $sp,6*8,$ra
0255     .mask   $SAVED_REGS_MASK,-8
0256     dsubu   $sp,6*8
0257 #endif
0258     sd  $s5,40($sp)
0259     sd  $s4,32($sp)
0260 ___
0261 $code.=<<___ if ($flavour =~ /nubi/i);  # emitted for NUBI flavours only (saves $s0-$s3, i.e. $12-$15); the non-NUBI prologue omits these stores
0262     sd  $s3,24($sp)
0263     sd  $s2,16($sp)
0264     sd  $s1,8($sp)
0265     sd  $s0,0($sp)
0266 ___
0267 $code.=<<___;
0268     .set    reorder
0269 
0270 #if defined(_MIPS_ARCH_MIPS64R6)
0271     andi    $shr,$inp,7
0272     dsubu   $inp,$inp,$shr      # align $inp
0273     sll $shr,$shr,3     # byte to bit offset
0274     subu    $shl,$zero,$shr
0275 #endif
0276 
0277     ld  $h0,0($ctx)     # load hash value
0278     ld  $h1,8($ctx)
0279     ld  $h2,16($ctx)
0280 
0281     ld  $r0,24($ctx)        # load key
0282     ld  $r1,32($ctx)
0283     ld  $rs1,40($ctx)
0284 
0285     dsll    $len,4
0286     daddu   $len,$inp       # end of buffer
0287     b   .Loop
0288 
0289 .align  4
0290 .Loop:
0291 #if defined(_MIPS_ARCH_MIPS64R6)
0292     ld  $in0,0($inp)        # load input
0293     ld  $in1,8($inp)
0294     beqz    $shr,.Laligned_inp
0295 
0296     ld  $tmp2,16($inp)
0297 # ifdef MIPSEB
0298     dsllv   $in0,$in0,$shr
0299     dsrlv   $tmp3,$in1,$shl
0300     dsllv   $in1,$in1,$shr
0301     dsrlv   $tmp2,$tmp2,$shl
0302 # else
0303     dsrlv   $in0,$in0,$shr
0304     dsllv   $tmp3,$in1,$shl
0305     dsrlv   $in1,$in1,$shr
0306     dsllv   $tmp2,$tmp2,$shl
0307 # endif
0308     or  $in0,$in0,$tmp3
0309     or  $in1,$in1,$tmp2
0310 .Laligned_inp:
0311 #else
0312     ldl $in0,0+MSB($inp)    # load input
0313     ldl $in1,8+MSB($inp)
0314     ldr $in0,0+LSB($inp)
0315     ldr $in1,8+LSB($inp)
0316 #endif
0317     daddiu  $inp,16
0318 #ifdef  MIPSEB
0319 # if defined(_MIPS_ARCH_MIPS64R2)
0320     dsbh    $in0,$in0       # byte swap
0321      dsbh   $in1,$in1
0322     dshd    $in0,$in0
0323      dshd   $in1,$in1
0324 # else
0325     ori $tmp0,$zero,0xFF
0326     dsll    $tmp2,$tmp0,32
0327     or  $tmp0,$tmp2     # 0x000000FF000000FF
0328 
0329     and $tmp1,$in0,$tmp0    # byte swap
0330      and    $tmp3,$in1,$tmp0
0331     dsrl    $tmp2,$in0,24
0332      dsrl   $tmp4,$in1,24
0333     dsll    $tmp1,24
0334      dsll   $tmp3,24
0335     and $tmp2,$tmp0
0336      and    $tmp4,$tmp0
0337     dsll    $tmp0,8         # 0x0000FF000000FF00
0338     or  $tmp1,$tmp2
0339      or $tmp3,$tmp4
0340     and $tmp2,$in0,$tmp0
0341      and    $tmp4,$in1,$tmp0
0342     dsrl    $in0,8
0343      dsrl   $in1,8
0344     dsll    $tmp2,8
0345      dsll   $tmp4,8
0346     and $in0,$tmp0
0347      and    $in1,$tmp0
0348     or  $tmp1,$tmp2
0349      or $tmp3,$tmp4
0350     or  $in0,$tmp1
0351      or $in1,$tmp3
0352     dsrl    $tmp1,$in0,32
0353      dsrl   $tmp3,$in1,32
0354     dsll    $in0,32
0355      dsll   $in1,32
0356     or  $in0,$tmp1
0357      or $in1,$tmp3
0358 # endif
0359 #endif
0360     dsrl    $tmp1,$h2,2     # modulo-scheduled reduction
0361     andi    $h2,$h2,3
0362     dsll    $tmp0,$tmp1,2
0363 
0364     daddu   $d0,$h0,$in0        # accumulate input
0365      daddu  $tmp1,$tmp0
0366     sltu    $tmp0,$d0,$h0
0367     daddu   $d0,$d0,$tmp1       # ... and residue
0368     sltu    $tmp1,$d0,$tmp1
0369     daddu   $d1,$h1,$in1
0370     daddu   $tmp0,$tmp1
0371     sltu    $tmp1,$d1,$h1
0372     daddu   $d1,$tmp0
0373 
0374     dmultu  ($r0,$d0)       # h0*r0
0375      daddu  $d2,$h2,$padbit
0376      sltu   $tmp0,$d1,$tmp0
0377     mflo    ($h0,$r0,$d0)
0378     mfhi    ($h1,$r0,$d0)
0379 
0380     dmultu  ($rs1,$d1)      # h1*5*r1
0381      daddu  $d2,$tmp1
0382      daddu  $d2,$tmp0
0383     mflo    ($tmp0,$rs1,$d1)
0384     mfhi    ($tmp1,$rs1,$d1)
0385 
0386     dmultu  ($r1,$d0)       # h0*r1
0387     mflo    ($tmp2,$r1,$d0)
0388     mfhi    ($h2,$r1,$d0)
0389      daddu  $h0,$tmp0
0390      daddu  $h1,$tmp1
0391      sltu   $tmp0,$h0,$tmp0
0392 
0393     dmultu  ($r0,$d1)       # h1*r0
0394      daddu  $h1,$tmp0
0395      daddu  $h1,$tmp2
0396     mflo    ($tmp0,$r0,$d1)
0397     mfhi    ($tmp1,$r0,$d1)
0398 
0399     dmultu  ($rs1,$d2)      # h2*5*r1
0400      sltu   $tmp2,$h1,$tmp2
0401      daddu  $h2,$tmp2
0402     mflo    ($tmp2,$rs1,$d2)
0403 
0404     dmultu  ($r0,$d2)       # h2*r0
0405      daddu  $h1,$tmp0
0406      daddu  $h2,$tmp1
0407     mflo    ($tmp3,$r0,$d2)
0408      sltu   $tmp0,$h1,$tmp0
0409      daddu  $h2,$tmp0
0410 
0411     daddu   $h1,$tmp2
0412     sltu    $tmp2,$h1,$tmp2
0413     daddu   $h2,$tmp2
0414     daddu   $h2,$tmp3
0415 
0416     bne $inp,$len,.Loop
0417 
0418     sd  $h0,0($ctx)     # store hash value
0419     sd  $h1,8($ctx)
0420     sd  $h2,16($ctx)
0421 
0422     .set    noreorder
0423 #if defined(_MIPS_ARCH_MIPS64R6)
0424     ld  $s7,56($sp)
0425     ld  $s6,48($sp)
0426 #endif
0427     ld  $s5,40($sp)     # epilogue
0428     ld  $s4,32($sp)
0429 ___
0430 $code.=<<___ if ($flavour =~ /nubi/i);  # emitted for NUBI flavours only (restores $s0-$s3, i.e. $12-$15); the non-NUBI epilogue omits these loads
0431     ld  $s3,24($sp)
0432     ld  $s2,16($sp)
0433     ld  $s1,8($sp)
0434     ld  $s0,0($sp)
0435 ___
0436 $code.=<<___;  # return; the preceding heredoc left .set noreorder active, so the stack-pointer restore after jr occupies its delay slot
0437     jr  $ra
0438 #if defined(_MIPS_ARCH_MIPS64R6)
0439     daddu   $sp,8*8
0440 #else
0441     daddu   $sp,6*8
0442 #endif
0443 .end    poly1305_blocks_internal
0444 ___
0445 }
0446 {
0447 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);  # poly1305_emit reads the hash from $ctx and the nonce from $nonce, and writes 16 bytes to $mac
0448 
0449 $code.=<<___;
0450 .align  5
0451 .globl  poly1305_emit
0452 .ent    poly1305_emit
0453 poly1305_emit:
0454     .frame  $sp,0,$ra
0455     .set    reorder
0456 
0457     ld  $tmp2,16($ctx)
0458     ld  $tmp0,0($ctx)
0459     ld  $tmp1,8($ctx)
0460 
0461     li  $in0,-4         # final reduction
0462     dsrl    $in1,$tmp2,2
0463     and $in0,$tmp2
0464     andi    $tmp2,$tmp2,3
0465     daddu   $in0,$in1
0466 
0467     daddu   $tmp0,$tmp0,$in0
0468     sltu    $in1,$tmp0,$in0
0469      daddiu $in0,$tmp0,5        # compare to modulus
0470     daddu   $tmp1,$tmp1,$in1
0471      sltiu  $tmp3,$in0,5
0472     sltu    $tmp4,$tmp1,$in1
0473      daddu  $in1,$tmp1,$tmp3
0474     daddu   $tmp2,$tmp2,$tmp4
0475      sltu   $tmp3,$in1,$tmp3
0476      daddu  $tmp2,$tmp2,$tmp3
0477 
0478     dsrl    $tmp2,2         # see if it carried/borrowed
0479     dsubu   $tmp2,$zero,$tmp2
0480 
0481     xor $in0,$tmp0
0482     xor $in1,$tmp1
0483     and $in0,$tmp2
0484     and $in1,$tmp2
0485     xor $in0,$tmp0
0486     xor $in1,$tmp1
0487 
0488     lwu $tmp0,0($nonce)     # load nonce
0489     lwu $tmp1,4($nonce)
0490     lwu $tmp2,8($nonce)
0491     lwu $tmp3,12($nonce)
0492     dsll    $tmp1,32
0493     dsll    $tmp3,32
0494     or  $tmp0,$tmp1
0495     or  $tmp2,$tmp3
0496 
0497     daddu   $in0,$tmp0      # accumulate nonce
0498     daddu   $in1,$tmp2
0499     sltu    $tmp0,$in0,$tmp0
0500     daddu   $in1,$tmp0
0501 
0502     dsrl    $tmp0,$in0,8        # write mac value
0503     dsrl    $tmp1,$in0,16
0504     dsrl    $tmp2,$in0,24
0505     sb  $in0,0($mac)
0506     dsrl    $tmp3,$in0,32
0507     sb  $tmp0,1($mac)
0508     dsrl    $tmp0,$in0,40
0509     sb  $tmp1,2($mac)
0510     dsrl    $tmp1,$in0,48
0511     sb  $tmp2,3($mac)
0512     dsrl    $tmp2,$in0,56
0513     sb  $tmp3,4($mac)
0514     dsrl    $tmp3,$in1,8
0515     sb  $tmp0,5($mac)
0516     dsrl    $tmp0,$in1,16
0517     sb  $tmp1,6($mac)
0518     dsrl    $tmp1,$in1,24
0519     sb  $tmp2,7($mac)
0520 
0521     sb  $in1,8($mac)
0522     dsrl    $tmp2,$in1,32
0523     sb  $tmp3,9($mac)
0524     dsrl    $tmp3,$in1,40
0525     sb  $tmp0,10($mac)
0526     dsrl    $tmp0,$in1,48
0527     sb  $tmp1,11($mac)
0528     dsrl    $tmp1,$in1,56
0529     sb  $tmp2,12($mac)
0530     sb  $tmp3,13($mac)
0531     sb  $tmp0,14($mac)
0532     sb  $tmp1,15($mac)
0533 
0534     jr  $ra
0535 .end    poly1305_emit
0536 .rdata
0537 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
0538 .align  2
0539 ___
0540 }
0541 }}} else {{{
0542 ######################################################################
0543 # 32-bit code path
0544 #
0545 
0546 my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);  # symbolic names for $a0-$a3 as used by the generated 32-bit routines
0547 my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =  # four 32-bit input words plus four scratch registers
0548    ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
0549 
0550 $code.=<<___;
0551 #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
0552      defined(_MIPS_ARCH_MIPS32R6)) \\
0553      && !defined(_MIPS_ARCH_MIPS32R2)
0554 # define _MIPS_ARCH_MIPS32R2
0555 #endif
0556 
0557 #if defined(_MIPS_ARCH_MIPS32R6)
0558 # define multu(rs,rt)
0559 # define mflo(rd,rs,rt) mulu    rd,rs,rt
0560 # define mfhi(rd,rs,rt) muhu    rd,rs,rt
0561 #else
0562 # define multu(rs,rt)   multu   rs,rt
0563 # define mflo(rd,rs,rt) mflo    rd
0564 # define mfhi(rd,rs,rt) mfhi    rd
0565 #endif
0566 
0567 #ifdef  __KERNEL__
0568 # define poly1305_init   poly1305_init_mips
0569 # define poly1305_blocks poly1305_blocks_mips
0570 # define poly1305_emit   poly1305_emit_mips
0571 #endif
0572 
0573 #if defined(__MIPSEB__) && !defined(MIPSEB)
0574 # define MIPSEB
0575 #endif
0576 
0577 #ifdef MIPSEB
0578 # define MSB 0
0579 # define LSB 3
0580 #else
0581 # define MSB 3
0582 # define LSB 0
0583 #endif
0584 
0585 .text
0586 .set    noat
0587 .set    noreorder
0588 
0589 .align  5
0590 .globl  poly1305_init
0591 .ent    poly1305_init
0592 poly1305_init:
0593     .frame  $sp,0,$ra
0594     .set    reorder
0595 
0596     sw  $zero,0($ctx)
0597     sw  $zero,4($ctx)
0598     sw  $zero,8($ctx)
0599     sw  $zero,12($ctx)
0600     sw  $zero,16($ctx)
0601 
0602     beqz    $inp,.Lno_key
0603 
0604 #if defined(_MIPS_ARCH_MIPS32R6)
0605     andi    $tmp0,$inp,3        # $inp % 4
0606     subu    $inp,$inp,$tmp0     # align $inp
0607     sll $tmp0,$tmp0,3       # byte to bit offset
0608     lw  $in0,0($inp)
0609     lw  $in1,4($inp)
0610     lw  $in2,8($inp)
0611     lw  $in3,12($inp)
0612     beqz    $tmp0,.Laligned_key
0613 
0614     lw  $tmp2,16($inp)
0615     subu    $tmp1,$zero,$tmp0
0616 # ifdef MIPSEB
0617     sllv    $in0,$in0,$tmp0
0618     srlv    $tmp3,$in1,$tmp1
0619     sllv    $in1,$in1,$tmp0
0620     or  $in0,$in0,$tmp3
0621     srlv    $tmp3,$in2,$tmp1
0622     sllv    $in2,$in2,$tmp0
0623     or  $in1,$in1,$tmp3
0624     srlv    $tmp3,$in3,$tmp1
0625     sllv    $in3,$in3,$tmp0
0626     or  $in2,$in2,$tmp3
0627     srlv    $tmp2,$tmp2,$tmp1
0628     or  $in3,$in3,$tmp2
0629 # else
0630     srlv    $in0,$in0,$tmp0
0631     sllv    $tmp3,$in1,$tmp1
0632     srlv    $in1,$in1,$tmp0
0633     or  $in0,$in0,$tmp3
0634     sllv    $tmp3,$in2,$tmp1
0635     srlv    $in2,$in2,$tmp0
0636     or  $in1,$in1,$tmp3
0637     sllv    $tmp3,$in3,$tmp1
0638     srlv    $in3,$in3,$tmp0
0639     or  $in2,$in2,$tmp3
0640     sllv    $tmp2,$tmp2,$tmp1
0641     or  $in3,$in3,$tmp2
0642 # endif
0643 .Laligned_key:
0644 #else
0645     lwl $in0,0+MSB($inp)
0646     lwl $in1,4+MSB($inp)
0647     lwl $in2,8+MSB($inp)
0648     lwl $in3,12+MSB($inp)
0649     lwr $in0,0+LSB($inp)
0650     lwr $in1,4+LSB($inp)
0651     lwr $in2,8+LSB($inp)
0652     lwr $in3,12+LSB($inp)
0653 #endif
0654 #ifdef  MIPSEB
0655 # if defined(_MIPS_ARCH_MIPS32R2)
0656     wsbh    $in0,$in0       # byte swap
0657     wsbh    $in1,$in1
0658     wsbh    $in2,$in2
0659     wsbh    $in3,$in3
0660     rotr    $in0,$in0,16
0661     rotr    $in1,$in1,16
0662     rotr    $in2,$in2,16
0663     rotr    $in3,$in3,16
0664 # else
0665     srl $tmp0,$in0,24       # byte swap
0666     srl $tmp1,$in0,8
0667     andi    $tmp2,$in0,0xFF00
0668     sll $in0,$in0,24
0669     andi    $tmp1,0xFF00
0670     sll $tmp2,$tmp2,8
0671     or  $in0,$tmp0
0672      srl    $tmp0,$in1,24
0673     or  $tmp1,$tmp2
0674      srl    $tmp2,$in1,8
0675     or  $in0,$tmp1
0676      andi   $tmp1,$in1,0xFF00
0677      sll    $in1,$in1,24
0678      andi   $tmp2,0xFF00
0679      sll    $tmp1,$tmp1,8
0680      or $in1,$tmp0
0681     srl $tmp0,$in2,24
0682      or $tmp2,$tmp1
0683     srl $tmp1,$in2,8
0684      or $in1,$tmp2
0685     andi    $tmp2,$in2,0xFF00
0686     sll $in2,$in2,24
0687     andi    $tmp1,0xFF00
0688     sll $tmp2,$tmp2,8
0689     or  $in2,$tmp0
0690      srl    $tmp0,$in3,24
0691     or  $tmp1,$tmp2
0692      srl    $tmp2,$in3,8
0693     or  $in2,$tmp1
0694      andi   $tmp1,$in3,0xFF00
0695      sll    $in3,$in3,24
0696      andi   $tmp2,0xFF00
0697      sll    $tmp1,$tmp1,8
0698      or $in3,$tmp0
0699      or $tmp2,$tmp1
0700      or $in3,$tmp2
0701 # endif
0702 #endif
0703     lui $tmp0,0x0fff
0704     ori $tmp0,0xffff        # 0x0fffffff
0705     and $in0,$in0,$tmp0
0706     subu    $tmp0,3         # 0x0ffffffc
0707     and $in1,$in1,$tmp0
0708     and $in2,$in2,$tmp0
0709     and $in3,$in3,$tmp0
0710 
0711     sw  $in0,20($ctx)
0712     sw  $in1,24($ctx)
0713     sw  $in2,28($ctx)
0714     sw  $in3,32($ctx)
0715 
0716     srl $tmp1,$in1,2
0717     srl $tmp2,$in2,2
0718     srl $tmp3,$in3,2
0719     addu    $in1,$in1,$tmp1     # s1 = r1 + (r1 >> 2)
0720     addu    $in2,$in2,$tmp2
0721     addu    $in3,$in3,$tmp3
0722     sw  $in1,36($ctx)
0723     sw  $in2,40($ctx)
0724     sw  $in3,44($ctx)
0725 .Lno_key:
0726     li  $v0,0
0727     jr  $ra
0728 .end    poly1305_init
0729 ___
0730 {
0731 my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";  # .mask bits 12-23 ($s0-$s11) for NUBI, bits 16-23 ($s4-$s11) otherwise
0732 
0733 my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =  # hash words, key words, and the s1-s3 values stored by poly1305_init
0734    ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
0735 my ($d0,$d1,$d2,$d3) =  # per-block input/accumulator words; same registers as $in0-$in3
0736    ($a4,$a5,$a6,$a7);
0737 my $shr = $t2;      # input realignment shift count; used on R6 only
0738 my $one = $t2;      # constant 1 for "maddu hi,1"; used on R2 only, so it can share $t2 with $shr
0739 
0740 $code.=<<___;  # prologue: allocate a 12-word frame and save $s4-$s11; .frame must state the same 4*12 bytes that subu allocates
0741 .globl  poly1305_blocks
0742 .align  5
0743 .ent    poly1305_blocks
0744 poly1305_blocks:
0745     .frame  $sp,12*4,$ra
0746     .mask   $SAVED_REGS_MASK,-4
0747     .set    noreorder
0748     subu    $sp, $sp,4*12
0749     sw  $s11,4*11($sp)
0750     sw  $s10,4*10($sp)
0751     sw  $s9, 4*9($sp)
0752     sw  $s8, 4*8($sp)
0753     sw  $s7, 4*7($sp)
0754     sw  $s6, 4*6($sp)
0755     sw  $s5, 4*5($sp)
0756     sw  $s4, 4*4($sp)
0757 ___
0758 $code.=<<___ if ($flavour =~ /nubi/i);  # emitted for NUBI flavours only (saves $s0-$s3, i.e. $12-$15); the non-NUBI prologue omits these stores
0759     sw  $s3, 4*3($sp)
0760     sw  $s2, 4*2($sp)
0761     sw  $s1, 4*1($sp)
0762     sw  $s0, 4*0($sp)
0763 ___
0764 $code.=<<___;
0765     .set    reorder
0766 
0767     srl $len,4          # number of complete blocks
0768     li  $one,1
0769     beqz    $len,.Labort
0770 
0771 #if defined(_MIPS_ARCH_MIPS32R6)
0772     andi    $shr,$inp,3
0773     subu    $inp,$inp,$shr      # align $inp
0774     sll $shr,$shr,3     # byte to bit offset
0775 #endif
0776 
0777     lw  $h0,0($ctx)     # load hash value
0778     lw  $h1,4($ctx)
0779     lw  $h2,8($ctx)
0780     lw  $h3,12($ctx)
0781     lw  $h4,16($ctx)
0782 
0783     lw  $r0,20($ctx)        # load key
0784     lw  $r1,24($ctx)
0785     lw  $r2,28($ctx)
0786     lw  $r3,32($ctx)
0787     lw  $rs1,36($ctx)
0788     lw  $rs2,40($ctx)
0789     lw  $rs3,44($ctx)
0790 
0791     sll $len,4
0792     addu    $len,$len,$inp      # end of buffer
0793     b   .Loop
0794 
0795 .align  4
0796 .Loop:
0797 #if defined(_MIPS_ARCH_MIPS32R6)
0798     lw  $d0,0($inp)     # load input
0799     lw  $d1,4($inp)
0800     lw  $d2,8($inp)
0801     lw  $d3,12($inp)
0802     beqz    $shr,.Laligned_inp
0803 
0804     lw  $t0,16($inp)
0805     subu    $t1,$zero,$shr
0806 # ifdef MIPSEB
0807     sllv    $d0,$d0,$shr
0808     srlv    $at,$d1,$t1
0809     sllv    $d1,$d1,$shr
0810     or  $d0,$d0,$at
0811     srlv    $at,$d2,$t1
0812     sllv    $d2,$d2,$shr
0813     or  $d1,$d1,$at
0814     srlv    $at,$d3,$t1
0815     sllv    $d3,$d3,$shr
0816     or  $d2,$d2,$at
0817     srlv    $t0,$t0,$t1
0818     or  $d3,$d3,$t0
0819 # else
0820     srlv    $d0,$d0,$shr
0821     sllv    $at,$d1,$t1
0822     srlv    $d1,$d1,$shr
0823     or  $d0,$d0,$at
0824     sllv    $at,$d2,$t1
0825     srlv    $d2,$d2,$shr
0826     or  $d1,$d1,$at
0827     sllv    $at,$d3,$t1
0828     srlv    $d3,$d3,$shr
0829     or  $d2,$d2,$at
0830     sllv    $t0,$t0,$t1
0831     or  $d3,$d3,$t0
0832 # endif
0833 .Laligned_inp:
0834 #else
0835     lwl $d0,0+MSB($inp)     # load input
0836     lwl $d1,4+MSB($inp)
0837     lwl $d2,8+MSB($inp)
0838     lwl $d3,12+MSB($inp)
0839     lwr $d0,0+LSB($inp)
0840     lwr $d1,4+LSB($inp)
0841     lwr $d2,8+LSB($inp)
0842     lwr $d3,12+LSB($inp)
0843 #endif
0844 #ifdef  MIPSEB
0845 # if defined(_MIPS_ARCH_MIPS32R2)
0846     wsbh    $d0,$d0         # byte swap
0847     wsbh    $d1,$d1
0848     wsbh    $d2,$d2
0849     wsbh    $d3,$d3
0850     rotr    $d0,$d0,16
0851     rotr    $d1,$d1,16
0852     rotr    $d2,$d2,16
0853     rotr    $d3,$d3,16
0854 # else
0855     srl $at,$d0,24      # byte swap
0856     srl $t0,$d0,8
0857     andi    $t1,$d0,0xFF00
0858     sll $d0,$d0,24
0859     andi    $t0,0xFF00
0860     sll $t1,$t1,8
0861     or  $d0,$at
0862      srl    $at,$d1,24
0863     or  $t0,$t1
0864      srl    $t1,$d1,8
0865     or  $d0,$t0
0866      andi   $t0,$d1,0xFF00
0867      sll    $d1,$d1,24
0868      andi   $t1,0xFF00
0869      sll    $t0,$t0,8
0870      or $d1,$at
0871     srl $at,$d2,24
0872      or $t1,$t0
0873     srl $t0,$d2,8
0874      or $d1,$t1
0875     andi    $t1,$d2,0xFF00
0876     sll $d2,$d2,24
0877     andi    $t0,0xFF00
0878     sll $t1,$t1,8
0879     or  $d2,$at
0880      srl    $at,$d3,24
0881     or  $t0,$t1
0882      srl    $t1,$d3,8
0883     or  $d2,$t0
0884      andi   $t0,$d3,0xFF00
0885      sll    $d3,$d3,24
0886      andi   $t1,0xFF00
0887      sll    $t0,$t0,8
0888      or $d3,$at
0889      or $t1,$t0
0890      or $d3,$t1
0891 # endif
0892 #endif
0893     srl $t0,$h4,2       # modulo-scheduled reduction
0894     andi    $h4,$h4,3
0895     sll $at,$t0,2
0896 
0897     addu    $d0,$d0,$h0     # accumulate input
0898      addu   $t0,$t0,$at
0899     sltu    $h0,$d0,$h0
0900     addu    $d0,$d0,$t0     # ... and residue
0901     sltu    $at,$d0,$t0
0902 
0903     addu    $d1,$d1,$h1
0904      addu   $h0,$h0,$at     # carry
0905     sltu    $h1,$d1,$h1
0906     addu    $d1,$d1,$h0
0907     sltu    $h0,$d1,$h0
0908 
0909     addu    $d2,$d2,$h2
0910      addu   $h1,$h1,$h0     # carry
0911     sltu    $h2,$d2,$h2
0912     addu    $d2,$d2,$h1
0913     sltu    $h1,$d2,$h1
0914 
0915     addu    $d3,$d3,$h3
0916      addu   $h2,$h2,$h1     # carry
0917     sltu    $h3,$d3,$h3
0918     addu    $d3,$d3,$h2
0919 
0920 #if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
0921     multu   $r0,$d0         # d0*r0
0922      sltu   $h2,$d3,$h2
0923     maddu   $rs3,$d1        # d1*s3
0924      addu   $h3,$h3,$h2     # carry
0925     maddu   $rs2,$d2        # d2*s2
0926      addu   $h4,$h4,$padbit
0927     maddu   $rs1,$d3        # d3*s1
0928      addu   $h4,$h4,$h3
0929     mfhi    $at
0930     mflo    $h0
0931 
0932     multu   $r1,$d0         # d0*r1
0933     maddu   $r0,$d1         # d1*r0
0934     maddu   $rs3,$d2        # d2*s3
0935     maddu   $rs2,$d3        # d3*s2
0936     maddu   $rs1,$h4        # h4*s1
0937     maddu   $at,$one        # hi*1
0938     mfhi    $at
0939     mflo    $h1
0940 
0941     multu   $r2,$d0         # d0*r2
0942     maddu   $r1,$d1         # d1*r1
0943     maddu   $r0,$d2         # d2*r0
0944     maddu   $rs3,$d3        # d3*s3
0945     maddu   $rs2,$h4        # h4*s2
0946     maddu   $at,$one        # hi*1
0947     mfhi    $at
0948     mflo    $h2
0949 
0950     mul $t0,$r0,$h4     # h4*r0
0951 
0952     multu   $r3,$d0         # d0*r3
0953     maddu   $r2,$d1         # d1*r2
0954     maddu   $r1,$d2         # d2*r1
0955     maddu   $r0,$d3         # d3*r0
0956     maddu   $rs3,$h4        # h4*s3
0957     maddu   $at,$one        # hi*1
0958     mfhi    $at
0959     mflo    $h3
0960 
0961      addiu  $inp,$inp,16
0962 
0963     addu    $h4,$t0,$at
0964 #else
0965     multu   ($r0,$d0)       # d0*r0
0966     mflo    ($h0,$r0,$d0)
0967     mfhi    ($h1,$r0,$d0)
0968 
0969      sltu   $h2,$d3,$h2
0970      addu   $h3,$h3,$h2     # carry
0971 
0972     multu   ($rs3,$d1)      # d1*s3
0973     mflo    ($at,$rs3,$d1)
0974     mfhi    ($t0,$rs3,$d1)
0975 
0976      addu   $h4,$h4,$padbit
0977      addiu  $inp,$inp,16
0978      addu   $h4,$h4,$h3
0979 
0980     multu   ($rs2,$d2)      # d2*s2
0981     mflo    ($a3,$rs2,$d2)
0982     mfhi    ($t1,$rs2,$d2)
0983      addu   $h0,$h0,$at
0984      addu   $h1,$h1,$t0
0985     multu   ($rs1,$d3)      # d3*s1
0986      sltu   $at,$h0,$at
0987      addu   $h1,$h1,$at
0988 
0989     mflo    ($at,$rs1,$d3)
0990     mfhi    ($t0,$rs1,$d3)
0991      addu   $h0,$h0,$a3
0992      addu   $h1,$h1,$t1
0993     multu   ($r1,$d0)       # d0*r1
0994      sltu   $a3,$h0,$a3
0995      addu   $h1,$h1,$a3
0996 
0997 
0998     mflo    ($a3,$r1,$d0)
0999     mfhi    ($h2,$r1,$d0)
1000      addu   $h0,$h0,$at
1001      addu   $h1,$h1,$t0
1002     multu   ($r0,$d1)       # d1*r0
1003      sltu   $at,$h0,$at
1004      addu   $h1,$h1,$at
1005 
1006     mflo    ($at,$r0,$d1)
1007     mfhi    ($t0,$r0,$d1)
1008      addu   $h1,$h1,$a3
1009      sltu   $a3,$h1,$a3
1010     multu   ($rs3,$d2)      # d2*s3
1011      addu   $h2,$h2,$a3
1012 
1013     mflo    ($a3,$rs3,$d2)
1014     mfhi    ($t1,$rs3,$d2)
1015      addu   $h1,$h1,$at
1016      addu   $h2,$h2,$t0
1017     multu   ($rs2,$d3)      # d3*s2
1018      sltu   $at,$h1,$at
1019      addu   $h2,$h2,$at
1020 
1021     mflo    ($at,$rs2,$d3)
1022     mfhi    ($t0,$rs2,$d3)
1023      addu   $h1,$h1,$a3
1024      addu   $h2,$h2,$t1
1025     multu   ($rs1,$h4)      # h4*s1
1026      sltu   $a3,$h1,$a3
1027      addu   $h2,$h2,$a3
1028 
1029     mflo    ($a3,$rs1,$h4)
1030      addu   $h1,$h1,$at
1031      addu   $h2,$h2,$t0
1032     multu   ($r2,$d0)       # d0*r2
1033      sltu   $at,$h1,$at
1034      addu   $h2,$h2,$at
1035 
1036 
1037     mflo    ($at,$r2,$d0)
1038     mfhi    ($h3,$r2,$d0)
1039      addu   $h1,$h1,$a3
1040      sltu   $a3,$h1,$a3
1041     multu   ($r1,$d1)       # d1*r1
1042      addu   $h2,$h2,$a3
1043 
1044     mflo    ($a3,$r1,$d1)
1045     mfhi    ($t1,$r1,$d1)
1046      addu   $h2,$h2,$at
1047      sltu   $at,$h2,$at
1048     multu   ($r0,$d2)       # d2*r0
1049      addu   $h3,$h3,$at
1050 
1051     mflo    ($at,$r0,$d2)
1052     mfhi    ($t0,$r0,$d2)
1053      addu   $h2,$h2,$a3
1054      addu   $h3,$h3,$t1
1055     multu   ($rs3,$d3)      # d3*s3
1056      sltu   $a3,$h2,$a3
1057      addu   $h3,$h3,$a3
1058 
1059     mflo    ($a3,$rs3,$d3)
1060     mfhi    ($t1,$rs3,$d3)
1061      addu   $h2,$h2,$at
1062      addu   $h3,$h3,$t0
1063     multu   ($rs2,$h4)      # h4*s2
1064      sltu   $at,$h2,$at
1065      addu   $h3,$h3,$at
1066 
1067     mflo    ($at,$rs2,$h4)
1068      addu   $h2,$h2,$a3
1069      addu   $h3,$h3,$t1
1070     multu   ($r3,$d0)       # d0*r3
1071      sltu   $a3,$h2,$a3
1072      addu   $h3,$h3,$a3
1073 
1074 
1075     mflo    ($a3,$r3,$d0)
1076     mfhi    ($t1,$r3,$d0)
1077      addu   $h2,$h2,$at
1078      sltu   $at,$h2,$at
1079     multu   ($r2,$d1)       # d1*r2
1080      addu   $h3,$h3,$at
1081 
1082     mflo    ($at,$r2,$d1)
1083     mfhi    ($t0,$r2,$d1)
1084      addu   $h3,$h3,$a3
1085      sltu   $a3,$h3,$a3
1086     multu   ($r0,$d3)       # d3*r0
1087      addu   $t1,$t1,$a3
1088 
1089     mflo    ($a3,$r0,$d3)
1090     mfhi    ($d3,$r0,$d3)
1091      addu   $h3,$h3,$at
1092      addu   $t1,$t1,$t0
1093     multu   ($r1,$d2)       # d2*r1
1094      sltu   $at,$h3,$at
1095      addu   $t1,$t1,$at
1096 
1097     mflo    ($at,$r1,$d2)
1098     mfhi    ($t0,$r1,$d2)
1099      addu   $h3,$h3,$a3
1100      addu   $t1,$t1,$d3
1101     multu   ($rs3,$h4)      # h4*s3
1102      sltu   $a3,$h3,$a3
1103      addu   $t1,$t1,$a3
1104 
1105     mflo    ($a3,$rs3,$h4)
1106      addu   $h3,$h3,$at
1107      addu   $t1,$t1,$t0
1108     multu   ($r0,$h4)       # h4*r0
1109      sltu   $at,$h3,$at
1110      addu   $t1,$t1,$at
1111 
1112 
1113     mflo    ($h4,$r0,$h4)
1114      addu   $h3,$h3,$a3
1115      sltu   $a3,$h3,$a3
1116      addu   $t1,$t1,$a3
1117     addu    $h4,$h4,$t1
1118 
1119     li  $padbit,1       # if we loop, padbit is 1
1120 #endif
1121     bne $inp,$len,.Loop
1122 
1123     sw  $h0,0($ctx)     # store hash value
1124     sw  $h1,4($ctx)
1125     sw  $h2,8($ctx)
1126     sw  $h3,12($ctx)
1127     sw  $h4,16($ctx)
1128 
1129     .set    noreorder
1130 .Labort:
1131     lw  $s11,4*11($sp)
1132     lw  $s10,4*10($sp)
1133     lw  $s9, 4*9($sp)
1134     lw  $s8, 4*8($sp)
1135     lw  $s7, 4*7($sp)
1136     lw  $s6, 4*6($sp)
1137     lw  $s5, 4*5($sp)
1138     lw  $s4, 4*4($sp)
1139 ___
1140 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi epilogue: only NUBI flavours restore $s0-$s3 ($12-$15)
1141     lw  $s3, 4*3($sp)
1142     lw  $s2, 4*2($sp)
1143     lw  $s1, 4*1($sp)
1144     lw  $s0, 4*0($sp)
1145 ___
1146 $code.=<<___;  # return; the preceding heredoc left .set noreorder active, so the stack-pointer restore after jr occupies its delay slot
1147     jr  $ra
1148     addu    $sp,$sp,4*12
1149 .end    poly1305_blocks
1150 ___
1151 }
1152 {
1153 my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);  # three arguments; $a3 carries no argument here and serves as a fifth temporary
1154 
1155 $code.=<<___;
1156 .align  5
1157 .globl  poly1305_emit
1158 .ent    poly1305_emit
1159 poly1305_emit:
1160     .frame  $sp,0,$ra
1161     .set    reorder
1162 
1163     lw  $tmp4,16($ctx)
1164     lw  $tmp0,0($ctx)
1165     lw  $tmp1,4($ctx)
1166     lw  $tmp2,8($ctx)
1167     lw  $tmp3,12($ctx)
1168 
1169     li  $in0,-4         # final reduction
1170     srl $ctx,$tmp4,2
1171     and $in0,$in0,$tmp4
1172     andi    $tmp4,$tmp4,3
1173     addu    $ctx,$ctx,$in0
1174 
1175     addu    $tmp0,$tmp0,$ctx
1176     sltu    $ctx,$tmp0,$ctx
1177      addiu  $in0,$tmp0,5        # compare to modulus
1178     addu    $tmp1,$tmp1,$ctx
1179      sltiu  $in1,$in0,5
1180     sltu    $ctx,$tmp1,$ctx
1181      addu   $in1,$in1,$tmp1
1182     addu    $tmp2,$tmp2,$ctx
1183      sltu   $in2,$in1,$tmp1
1184     sltu    $ctx,$tmp2,$ctx
1185      addu   $in2,$in2,$tmp2
1186     addu    $tmp3,$tmp3,$ctx
1187      sltu   $in3,$in2,$tmp2
1188     sltu    $ctx,$tmp3,$ctx
1189      addu   $in3,$in3,$tmp3
1190     addu    $tmp4,$tmp4,$ctx
1191      sltu   $ctx,$in3,$tmp3
1192      addu   $ctx,$tmp4
1193 
1194     srl $ctx,2          # see if it carried/borrowed
1195     subu    $ctx,$zero,$ctx
1196 
1197     xor $in0,$tmp0
1198     xor $in1,$tmp1
1199     xor $in2,$tmp2
1200     xor $in3,$tmp3
1201     and $in0,$ctx
1202     and $in1,$ctx
1203     and $in2,$ctx
1204     and $in3,$ctx
1205     xor $in0,$tmp0
1206     xor $in1,$tmp1
1207     xor $in2,$tmp2
1208     xor $in3,$tmp3
1209 
1210     lw  $tmp0,0($nonce)     # load nonce
1211     lw  $tmp1,4($nonce)
1212     lw  $tmp2,8($nonce)
1213     lw  $tmp3,12($nonce)
1214 
1215     addu    $in0,$tmp0      # accumulate nonce
1216     sltu    $ctx,$in0,$tmp0
1217 
1218     addu    $in1,$tmp1
1219     sltu    $tmp1,$in1,$tmp1
1220     addu    $in1,$ctx
1221     sltu    $ctx,$in1,$ctx
1222     addu    $ctx,$tmp1
1223 
1224     addu    $in2,$tmp2
1225     sltu    $tmp2,$in2,$tmp2
1226     addu    $in2,$ctx
1227     sltu    $ctx,$in2,$ctx
1228     addu    $ctx,$tmp2
1229 
1230     addu    $in3,$tmp3
1231     addu    $in3,$ctx
1232 
1233     srl $tmp0,$in0,8        # write mac value
1234     srl $tmp1,$in0,16
1235     srl $tmp2,$in0,24
1236     sb  $in0, 0($mac)
1237     sb  $tmp0,1($mac)
1238     srl $tmp0,$in1,8
1239     sb  $tmp1,2($mac)
1240     srl $tmp1,$in1,16
1241     sb  $tmp2,3($mac)
1242     srl $tmp2,$in1,24
1243     sb  $in1, 4($mac)
1244     sb  $tmp0,5($mac)
1245     srl $tmp0,$in2,8
1246     sb  $tmp1,6($mac)
1247     srl $tmp1,$in2,16
1248     sb  $tmp2,7($mac)
1249     srl $tmp2,$in2,24
1250     sb  $in2, 8($mac)
1251     sb  $tmp0,9($mac)
1252     srl $tmp0,$in3,8
1253     sb  $tmp1,10($mac)
1254     srl $tmp1,$in3,16
1255     sb  $tmp2,11($mac)
1256     srl $tmp2,$in3,24
1257     sb  $in3, 12($mac)
1258     sb  $tmp0,13($mac)
1259     sb  $tmp1,14($mac)
1260     sb  $tmp2,15($mac)
1261 
1262     jr  $ra
1263 .end    poly1305_emit
1264 .rdata
1265 .asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1266 .align  2
1267 ___
1268 }
1269 }}}
1270 
1271 $output=pop and (open(STDOUT,">",$output) or die "can't open $output: $!");  # last argument, if any, names the output file; 3-arg open so the name is never parsed as a mode, and failure aborts the build
1272 print $code;  # emit the accumulated assembly (to the file, or to the inherited STDOUT when no file was given)
1273 close STDOUT or die "error closing STDOUT: $!";  # buffered write errors (e.g. disk full) only surface at close