Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
0004  *
0005  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
0006  *
0007  * Based on crypto/serpent.c by
0008  *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
0009  *                2003 Herbert Valerio Riedel <hvr@gnu.org>
0010  */
0011 
0012 #include <linux/linkage.h>
0013 
0014 .file "serpent-sse2-i586-asm_32.S"
0015 .text
0016 
0017 #define arg_ctx 4 /* arg_*: byte offsets of the stack arguments from %esp on entry */
0018 #define arg_dst 8
0019 #define arg_src 12
0020 #define arg_xor 16 /* in this file only __serpent_enc_blk_4way reads this one */
0021 
0022 /**********************************************************************
0023   4-way SSE2 serpent
0024  **********************************************************************/
0025 #define CTX %edx /* key context pointer; both entry points load it from arg_ctx(%esp) */
0026 
0027 #define RA %xmm0 /* RA..RE: four state registers plus one spare; their roles rotate per round */
0028 #define RB %xmm1
0029 #define RC %xmm2
0030 #define RD %xmm3
0031 #define RE %xmm4
0032 
0033 #define RT0 %xmm5 /* RT0/RT1: scratch for key words and for the block load/store macros */
0034 #define RT1 %xmm6
0035 
0036 #define RNOT %xmm7 /* all-ones (set with pcmpeqd at entry); pxor with it is a bitwise NOT */
0037 
0038 #define get_key(i, j, t) /* t = 32-bit word j of key i, copied into all four dword lanes */ \
0039     movd (4*(i)+(j))*4(CTX), t; /* load the dword at byte offset (4*i + j)*4 from CTX */ \
0040     pshufd $0, t, t; /* replicate lane 0 across the whole register */
0041 
0042 #define K(x0, x1, x2, x3, x4, i) /* XOR the four words of key i into x0..x3 */ \
0043     get_key(i, 0, x4); /* x4, RT0 and RT1 are overwritten as key temporaries */ \
0044     get_key(i, 1, RT0); \
0045     get_key(i, 2, RT1); \
0046     pxor x4,        x0; \
0047     pxor RT0,       x1; \
0048     pxor RT1,       x2; \
0049     get_key(i, 3, x4); /* x4 is free again, so reuse it for word 3 */ \
0050     pxor x4,        x3;
0051 
0052 #define LK(x0, x1, x2, x3, x4, i) /* linear mixing of x0..x3, then XOR with key i; clobbers x4 and RT0 */ \
0053     movdqa x0,      x4; /* SSE2 has no packed rotate: build each one from two shifts and an or */ \
0054     pslld $13,      x0; \
0055     psrld $(32 - 13),   x4; \
0056     por x4,         x0; /* x0 = rol32(x0, 13) */ \
0057     pxor x0,        x1; \
0058     movdqa x2,      x4; \
0059     pslld $3,       x2; \
0060     psrld $(32 - 3),    x4; \
0061     por x4,         x2; /* x2 = rol32(x2, 3) */ \
0062     pxor x2,        x1; /* x1 has now absorbed the rotated x0 and x2 */ \
0063     movdqa x1,      x4; \
0064     pslld $1,       x1; \
0065     psrld $(32 - 1),    x4; \
0066     por x4,         x1; /* x1 = rol32(x1, 1) */ \
0067     movdqa x0,      x4; \
0068     pslld $3,       x4; /* x4 = x0 << 3 (plain shift, not a rotate) */ \
0069     pxor x2,        x3; \
0070     pxor x4,        x3; /* x3 ^= x2 ^ (x0 << 3) */ \
0071     movdqa x3,      x4; \
0072     pslld $7,       x3; \
0073     psrld $(32 - 7),    x4; \
0074     por x4,         x3; /* x3 = rol32(x3, 7) */ \
0075     movdqa x1,      x4; \
0076     pslld $7,       x4; /* x4 = x1 << 7 (plain shift) */ \
0077     pxor x1,        x0; \
0078     pxor x3,        x0; /* x0 ^= x1 ^ x3 */ \
0079     pxor x3,        x2; \
0080     pxor x4,        x2; /* x2 ^= x3 ^ (x1 << 7) */ \
0081     movdqa x0,      x4; /* copy for the rol 5 below; the key XORs are interleaved with it */ \
0082     get_key(i, 1, RT0); \
0083     pxor RT0,       x1; /* x1 ^= key word 1 */ \
0084     get_key(i, 3, RT0); \
0085     pxor RT0,       x3; /* x3 ^= key word 3 */ \
0086     pslld $5,       x0; \
0087     psrld $(32 - 5),    x4; \
0088     por x4,         x0; /* x0 = rol32(x0, 5) */ \
0089     movdqa x2,      x4; \
0090     pslld $22,      x2; \
0091     psrld $(32 - 22),   x4; \
0092     por x4,         x2; /* x2 = rol32(x2, 22) */ \
0093     get_key(i, 0, RT0); \
0094     pxor RT0,       x0; /* x0 ^= key word 0 */ \
0095     get_key(i, 2, RT0); \
0096     pxor RT0,       x2; /* x2 ^= key word 2 */
0097 
0098 #define KL(x0, x1, x2, x3, x4, i) /* XOR with key i, then undo LK's mixing step by step; clobbers x4, RT0, RT1 */ \
0099     K(x0, x1, x2, x3, x4, i); \
0100     movdqa x0,      x4; /* rotate right = shift right, shift left by (32 - n), or */ \
0101     psrld $5,       x0; \
0102     pslld $(32 - 5),    x4; \
0103     por x4,         x0; /* x0 = ror32(x0, 5) */ \
0104     movdqa x2,      x4; \
0105     psrld $22,      x2; \
0106     pslld $(32 - 22),   x4; \
0107     por x4,         x2; /* x2 = ror32(x2, 22) */ \
0108     pxor x3,        x2; \
0109     pxor x3,        x0; \
0110     movdqa x1,      x4; \
0111     pslld $7,       x4; /* x4 = x1 << 7 (plain shift) */ \
0112     pxor x1,        x0; /* x0 ^= x3 ^ x1 */ \
0113     pxor x4,        x2; /* x2 ^= x3 ^ (x1 << 7) */ \
0114     movdqa x1,      x4; \
0115     psrld $1,       x1; \
0116     pslld $(32 - 1),    x4; \
0117     por x4,         x1; /* x1 = ror32(x1, 1) */ \
0118     movdqa x3,      x4; \
0119     psrld $7,       x3; \
0120     pslld $(32 - 7),    x4; \
0121     por x4,         x3; /* x3 = ror32(x3, 7) */ \
0122     pxor x0,        x1; /* uses x0 before its final rotate, mirroring LK */ \
0123     movdqa x0,      x4; \
0124     pslld $3,       x4; /* x4 = x0 << 3 (plain shift) */ \
0125     pxor x4,        x3; \
0126     movdqa x0,      x4; \
0127     psrld $13,      x0; \
0128     pslld $(32 - 13),   x4; \
0129     por x4,         x0; /* x0 = ror32(x0, 13) */ \
0130     pxor x2,        x1; /* x1 ^= x2 */ \
0131     pxor x2,        x3; /* x3 ^= x2 */ \
0132     movdqa x2,      x4; \
0133     psrld $3,       x2; \
0134     pslld $(32 - 3),    x4; \
0135     por x4,         x2; /* x2 = ror32(x2, 3) */
0136 
0137 #define S0(x0, x1, x2, x3, x4) /* S0 substitution written as and/or/xor logic on x0..x3 */ \
0138     movdqa x3,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0139     por x0,         x3; \
0140     pxor x4,        x0; \
0141     pxor x2,        x4; \
0142     pxor RNOT,      x4; /* bitwise NOT (RNOT is all-ones) */ \
0143     pxor x1,        x3; \
0144     pand x0,        x1; \
0145     pxor x4,        x1; \
0146     pxor x0,        x2; \
0147     pxor x3,        x0; \
0148     por x0,         x4; \
0149     pxor x2,        x0; \
0150     pand x1,        x2; \
0151     pxor x2,        x3; \
0152     pxor RNOT,      x1; \
0153     pxor x4,        x2; \
0154     pxor x2,        x1; /* callers take the result as (x2, x1, x3, x0); x4 is left free */
0155 
0156 #define S1(x0, x1, x2, x3, x4) /* S1 substitution written as and/or/xor logic on x0..x3 */ \
0157     movdqa x1,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0158     pxor x0,        x1; \
0159     pxor x3,        x0; \
0160     pxor RNOT,      x3; /* bitwise NOT (RNOT is all-ones) */ \
0161     pand x1,        x4; \
0162     por x1,         x0; \
0163     pxor x2,        x3; \
0164     pxor x3,        x0; \
0165     pxor x3,        x1; \
0166     pxor x4,        x3; \
0167     por x4,         x1; \
0168     pxor x2,        x4; \
0169     pand x0,        x2; \
0170     pxor x1,        x2; \
0171     por x0,         x1; \
0172     pxor RNOT,      x0; \
0173     pxor x2,        x0; \
0174     pxor x1,        x4; /* callers take the result as (x4, x2, x3, x0); x1 is left free */
0175 
0176 #define S2(x0, x1, x2, x3, x4) /* S2 substitution written as and/or/xor logic on x0..x3 */ \
0177     pxor RNOT,      x3; /* bitwise NOT (RNOT is all-ones) */ \
0178     pxor x0,        x1; \
0179     movdqa x0,      x4; /* x4 is scratch: first touched here, by a write */ \
0180     pand x2,        x0; \
0181     pxor x3,        x0; \
0182     por x4,         x3; \
0183     pxor x1,        x2; \
0184     pxor x1,        x3; \
0185     pand x0,        x1; \
0186     pxor x2,        x0; \
0187     pand x3,        x2; \
0188     por x1,         x3; \
0189     pxor RNOT,      x0; \
0190     pxor x0,        x3; \
0191     pxor x0,        x4; \
0192     pxor x2,        x0; \
0193     por x2,         x1; /* callers take the result as (x4, x1, x0, x3); x2 is left free */
0194 
0195 #define S3(x0, x1, x2, x3, x4) /* S3 substitution written as and/or/xor logic on x0..x3 */ \
0196     movdqa x1,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0197     pxor x3,        x1; \
0198     por x0,         x3; \
0199     pand x0,        x4; \
0200     pxor x2,        x0; \
0201     pxor x1,        x2; \
0202     pand x3,        x1; \
0203     pxor x3,        x2; \
0204     por x4,         x0; \
0205     pxor x3,        x4; \
0206     pxor x0,        x1; \
0207     pand x3,        x0; \
0208     pand x4,        x3; \
0209     pxor x2,        x3; \
0210     por x1,         x4; \
0211     pand x1,        x2; \
0212     pxor x3,        x4; \
0213     pxor x3,        x0; \
0214     pxor x2,        x3; /* callers take the result as (x3, x4, x1, x0); x2 is left free */
0215 
0216 #define S4(x0, x1, x2, x3, x4) /* S4 substitution written as and/or/xor logic on x0..x3 */ \
0217     movdqa x3,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0218     pand x0,        x3; \
0219     pxor x4,        x0; \
0220     pxor x2,        x3; \
0221     por x4,         x2; \
0222     pxor x1,        x0; \
0223     pxor x3,        x4; \
0224     por x0,         x2; \
0225     pxor x1,        x2; \
0226     pand x0,        x1; \
0227     pxor x4,        x1; \
0228     pand x2,        x4; \
0229     pxor x3,        x2; \
0230     pxor x0,        x4; \
0231     por x1,         x3; \
0232     pxor RNOT,      x1; /* bitwise NOT (RNOT is all-ones) */ \
0233     pxor x0,        x3; /* callers take the result as (x1, x2, x3, x4); x0 is left free */
0234 
0235 #define S5(x0, x1, x2, x3, x4) /* S5 substitution written as and/or/xor logic on x0..x3 */ \
0236     movdqa x1,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0237     por x0,         x1; \
0238     pxor x1,        x2; \
0239     pxor RNOT,      x3; /* bitwise NOT (RNOT is all-ones) */ \
0240     pxor x0,        x4; \
0241     pxor x2,        x0; \
0242     pand x4,        x1; \
0243     por x3,         x4; \
0244     pxor x0,        x4; \
0245     pand x3,        x0; \
0246     pxor x3,        x1; \
0247     pxor x2,        x3; \
0248     pxor x1,        x0; \
0249     pand x4,        x2; \
0250     pxor x2,        x1; \
0251     pand x0,        x2; \
0252     pxor x2,        x3; /* callers take the result as (x4, x0, x1, x3); x2 is left free */
0253 
0254 #define S6(x0, x1, x2, x3, x4) /* S6 substitution written as and/or/xor logic on x0..x3 */ \
0255     movdqa x1,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0256     pxor x0,        x3; \
0257     pxor x2,        x1; \
0258     pxor x0,        x2; \
0259     pand x3,        x0; \
0260     por x3,         x1; \
0261     pxor RNOT,      x4; /* bitwise NOT (RNOT is all-ones) */ \
0262     pxor x1,        x0; \
0263     pxor x2,        x1; \
0264     pxor x4,        x3; \
0265     pxor x0,        x4; \
0266     pand x0,        x2; \
0267     pxor x1,        x4; \
0268     pxor x3,        x2; \
0269     pand x1,        x3; \
0270     pxor x0,        x3; \
0271     pxor x2,        x1; /* callers take the result as (x2, x4, x1, x3); x0 is left free */
0272 
0273 #define S7(x0, x1, x2, x3, x4) /* S7 substitution written as and/or/xor logic on x0..x3 */ \
0274     pxor RNOT,      x1; /* bitwise NOT (RNOT is all-ones) */ \
0275     movdqa x1,      x4; /* x4 is scratch: first touched here, by a write */ \
0276     pxor RNOT,      x0; \
0277     pand x2,        x1; \
0278     pxor x3,        x1; \
0279     por x4,         x3; \
0280     pxor x2,        x4; \
0281     pxor x3,        x2; \
0282     pxor x0,        x3; \
0283     por x1,         x0; \
0284     pand x0,        x2; \
0285     pxor x4,        x0; \
0286     pxor x3,        x4; \
0287     pand x0,        x3; \
0288     pxor x1,        x4; \
0289     pxor x4,        x2; \
0290     pxor x1,        x3; \
0291     por x0,         x4; \
0292     pxor x1,        x4; /* callers take the result as (x4, x2, x3, x0); x1 is left free */
0293 
0294 #define SI0(x0, x1, x2, x3, x4) /* SI0 substitution (decrypt-side counterpart of S0) as and/or/xor logic */ \
0295     movdqa x3,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0296     pxor x0,        x1; \
0297     por x1,         x3; \
0298     pxor x1,        x4; \
0299     pxor RNOT,      x0; /* bitwise NOT (RNOT is all-ones) */ \
0300     pxor x3,        x2; \
0301     pxor x0,        x3; \
0302     pand x1,        x0; \
0303     pxor x2,        x0; \
0304     pand x3,        x2; \
0305     pxor x4,        x3; \
0306     pxor x3,        x2; \
0307     pxor x3,        x1; \
0308     pand x0,        x3; \
0309     pxor x0,        x1; \
0310     pxor x2,        x0; \
0311     pxor x3,        x4; /* callers take the result as (x2, x4, x1, x0); x3 is left free */
0312 
0313 #define SI1(x0, x1, x2, x3, x4) /* SI1 substitution (decrypt-side counterpart of S1) as and/or/xor logic */ \
0314     pxor x3,        x1; \
0315     movdqa x0,      x4; /* x4 is scratch: first touched here, by a write */ \
0316     pxor x2,        x0; \
0317     pxor RNOT,      x2; /* bitwise NOT (RNOT is all-ones) */ \
0318     por x1,         x4; \
0319     pxor x3,        x4; \
0320     pand x1,        x3; \
0321     pxor x2,        x1; \
0322     pand x4,        x2; \
0323     pxor x1,        x4; \
0324     por x3,         x1; \
0325     pxor x0,        x3; \
0326     pxor x0,        x2; \
0327     por x4,         x0; \
0328     pxor x4,        x2; \
0329     pxor x0,        x1; \
0330     pxor x1,        x4; /* callers take the result as (x4, x1, x2, x3); x0 is left free */
0331 
0332 #define SI2(x0, x1, x2, x3, x4) /* SI2 substitution (decrypt-side counterpart of S2) as and/or/xor logic */ \
0333     pxor x1,        x2; \
0334     movdqa x3,      x4; /* x4 is scratch: first touched here, by a write */ \
0335     pxor RNOT,      x3; /* bitwise NOT (RNOT is all-ones) */ \
0336     por x2,         x3; \
0337     pxor x4,        x2; \
0338     pxor x0,        x4; \
0339     pxor x1,        x3; \
0340     por x2,         x1; \
0341     pxor x0,        x2; \
0342     pxor x4,        x1; \
0343     por x3,         x4; \
0344     pxor x3,        x2; \
0345     pxor x2,        x4; \
0346     pand x1,        x2; \
0347     pxor x3,        x2; \
0348     pxor x4,        x3; \
0349     pxor x0,        x4; /* callers take the result as (x1, x4, x3, x2); x0 is left free */
0350 
0351 #define SI3(x0, x1, x2, x3, x4) /* SI3 substitution (decrypt-side counterpart of S3) as and/or/xor logic */ \
0352     pxor x1,        x2; \
0353     movdqa x1,      x4; /* x4 is scratch: first touched here, by a write */ \
0354     pand x2,        x1; \
0355     pxor x0,        x1; \
0356     por x4,         x0; \
0357     pxor x3,        x4; \
0358     pxor x3,        x0; \
0359     por x1,         x3; \
0360     pxor x2,        x1; \
0361     pxor x3,        x1; \
0362     pxor x2,        x0; \
0363     pxor x3,        x2; \
0364     pand x1,        x3; \
0365     pxor x0,        x1; \
0366     pand x2,        x0; \
0367     pxor x3,        x4; \
0368     pxor x0,        x3; \
0369     pxor x1,        x0; /* callers take the result as (x2, x0, x4, x3); x1 is left free */
0370 
0371 #define SI4(x0, x1, x2, x3, x4) /* SI4 substitution (decrypt-side counterpart of S4) as and/or/xor logic */ \
0372     pxor x3,        x2; \
0373     movdqa x0,      x4; /* x4 is scratch: first touched here, by a write */ \
0374     pand x1,        x0; \
0375     pxor x2,        x0; \
0376     por x3,         x2; \
0377     pxor RNOT,      x4; /* bitwise NOT (RNOT is all-ones) */ \
0378     pxor x0,        x1; \
0379     pxor x2,        x0; \
0380     pand x4,        x2; \
0381     pxor x0,        x2; \
0382     por x4,         x0; \
0383     pxor x3,        x0; \
0384     pand x2,        x3; \
0385     pxor x3,        x4; \
0386     pxor x1,        x3; \
0387     pand x0,        x1; \
0388     pxor x1,        x4; \
0389     pxor x3,        x0; /* callers take the result as (x0, x2, x4, x3); x1 is left free */
0390 
0391 #define SI5(x0, x1, x2, x3, x4) /* SI5 substitution (decrypt-side counterpart of S5) as and/or/xor logic */ \
0392     movdqa x1,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0393     por x2,         x1; \
0394     pxor x4,        x2; \
0395     pxor x3,        x1; \
0396     pand x4,        x3; \
0397     pxor x3,        x2; \
0398     por x0,         x3; \
0399     pxor RNOT,      x0; /* bitwise NOT (RNOT is all-ones) */ \
0400     pxor x2,        x3; \
0401     por x0,         x2; \
0402     pxor x1,        x4; \
0403     pxor x4,        x2; \
0404     pand x0,        x4; \
0405     pxor x1,        x0; \
0406     pxor x3,        x1; \
0407     pand x2,        x0; \
0408     pxor x3,        x2; \
0409     pxor x2,        x0; \
0410     pxor x4,        x2; \
0411     pxor x3,        x4; /* callers take the result as (x1, x4, x0, x2); x3 is left free */
0412 
0413 #define SI6(x0, x1, x2, x3, x4) /* SI6 substitution (decrypt-side counterpart of S6) as and/or/xor logic */ \
0414     pxor x2,        x0; \
0415     movdqa x0,      x4; /* x4 is scratch: first touched here, by a write */ \
0416     pand x3,        x0; \
0417     pxor x3,        x2; \
0418     pxor x2,        x0; \
0419     pxor x1,        x3; \
0420     por x4,         x2; \
0421     pxor x3,        x2; \
0422     pand x0,        x3; \
0423     pxor RNOT,      x0; /* bitwise NOT (RNOT is all-ones) */ \
0424     pxor x1,        x3; \
0425     pand x2,        x1; \
0426     pxor x0,        x4; \
0427     pxor x4,        x3; \
0428     pxor x2,        x4; \
0429     pxor x1,        x0; \
0430     pxor x0,        x2; /* callers take the result as (x2, x4, x3, x0); x1 is left free */
0431 
0432 #define SI7(x0, x1, x2, x3, x4) /* SI7 substitution (decrypt-side counterpart of S7) as and/or/xor logic */ \
0433     movdqa x3,      x4; /* x4 is scratch: it is overwritten before it is read */ \
0434     pand x0,        x3; \
0435     pxor x2,        x0; \
0436     por x4,         x2; \
0437     pxor x1,        x4; \
0438     pxor RNOT,      x0; /* bitwise NOT (RNOT is all-ones) */ \
0439     por x3,         x1; \
0440     pxor x0,        x4; \
0441     pand x2,        x0; \
0442     pxor x1,        x0; \
0443     pand x2,        x1; \
0444     pxor x2,        x3; \
0445     pxor x3,        x4; \
0446     pand x3,        x2; \
0447     por x0,         x3; \
0448     pxor x4,        x1; \
0449     pxor x4,        x3; \
0450     pand x0,        x4; \
0451     pxor x2,        x4; /* callers take the result as (x1, x3, x0, x4); x2 is left free */
0452 
0453 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* transpose x0..x3 as a 4x4 matrix of dwords; clobbers t1, t2 (t0 is unused) */ \
0454     movdqa x0,      t2; /* on entry: x0 = a0..a3, x1 = b0..b3, x2 = c0..c3, x3 = d0..d3 */ \
0455     punpckldq x1,       x0; /* x0 = a0 b0 a1 b1 */ \
0456     punpckhdq x1,       t2; /* t2 = a2 b2 a3 b3 */ \
0457     movdqa x2,      t1; \
0458     punpckhdq x3,       x2; /* x2 = c2 d2 c3 d3 */ \
0459     punpckldq x3,       t1; /* t1 = c0 d0 c1 d1 */ \
0460     movdqa x0,      x1; \
0461     punpcklqdq t1,      x0; /* x0 = a0 b0 c0 d0 */ \
0462     punpckhqdq t1,      x1; /* x1 = a1 b1 c1 d1 */ \
0463     movdqa t2,      x3; \
0464     punpcklqdq x2,      t2; /* t2 = a2 b2 c2 d2 */ \
0465     punpckhqdq x2,      x3; /* x3 = a3 b3 c3 d3 */ \
0466     movdqa t2,      x2; /* x2 = a2 b2 c2 d2 */
0467 
0468 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) /* load 4 x 16 bytes from (in), then regroup by word index */ \
0469     movdqu (0*4*4)(in), x0; /* unaligned loads: no alignment is required of (in) */ \
0470     movdqu (1*4*4)(in), x1; \
0471     movdqu (2*4*4)(in), x2; \
0472     movdqu (3*4*4)(in), x3; \
0473     \
0474     transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* afterwards xN holds dword N of each of the four 16-byte inputs */
0475 
0476 #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) /* regroup x0..x3 back into four 16-byte blocks and store them at (out) */ \
0477     transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* the macro body already ends in ';' */ \
0478     \
0479     movdqu x0, (0*4*4)(out); /* unaligned stores: no alignment is required of (out) */ \
0480     movdqu x1, (1*4*4)(out); \
0481     movdqu x2, (2*4*4)(out); \
0482     movdqu x3, (3*4*4)(out);
0483 
0484 #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) /* like write_blocks, but XOR the result into what (out) already holds */ \
0485     transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* the macro body already ends in ';' */ \
0486     \
0487     movdqu (0*4*4)(out),    t0; /* t0 = current contents of the destination block */ \
0488     pxor t0,        x0; \
0489     movdqu x0,      (0*4*4)(out); /* store old ^ new back in place */ \
0490     movdqu (1*4*4)(out),    t0; \
0491     pxor t0,        x1; \
0492     movdqu x1,      (1*4*4)(out); \
0493     movdqu (2*4*4)(out),    t0; \
0494     pxor t0,        x2; \
0495     movdqu x2,      (2*4*4)(out); \
0496     movdqu (3*4*4)(out),    t0; \
0497     pxor t0,        x3; \
0498     movdqu x3,      (3*4*4)(out);
0499 
0500 SYM_FUNC_START(__serpent_enc_blk_4way)
0501     /* Encrypt four 16-byte blocks in one pass; input (stack arguments):
0502      *  arg_ctx(%esp): ctx, CTX
0503      *  arg_dst(%esp): dst
0504      *  arg_src(%esp): src
0505      *  arg_xor(%esp): bool, if true: xor output into dst (only the low byte is tested)
0506      *  clobbers %eax, %edx, %xmm0-%xmm7 and EFLAGS */
0507 
0508     pcmpeqd RNOT, RNOT; /* RNOT = all-ones, the NOT mask used by the S macros */
0509 
0510     movl arg_ctx(%esp), CTX;
0511 
0512     movl arg_src(%esp), %eax;
0513     read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* RA..RD = word 0..3 of each block; RT1 and RE are scratch */
0514 
0515                      K(RA, RB, RC, RD, RE, 0); /* initial XOR with key 0 */
0516     S0(RA, RB, RC, RD, RE);     LK(RC, RB, RD, RA, RE, 1); /* 31 rounds of S + LK with keys 1..31; each LK takes the registers in the order its S left them */
0517     S1(RC, RB, RD, RA, RE);     LK(RE, RD, RA, RC, RB, 2);
0518     S2(RE, RD, RA, RC, RB);     LK(RB, RD, RE, RC, RA, 3);
0519     S3(RB, RD, RE, RC, RA);     LK(RC, RA, RD, RB, RE, 4);
0520     S4(RC, RA, RD, RB, RE);     LK(RA, RD, RB, RE, RC, 5);
0521     S5(RA, RD, RB, RE, RC);     LK(RC, RA, RD, RE, RB, 6);
0522     S6(RC, RA, RD, RE, RB);     LK(RD, RB, RA, RE, RC, 7);
0523     S7(RD, RB, RA, RE, RC);     LK(RC, RA, RE, RD, RB, 8);
0524     S0(RC, RA, RE, RD, RB);     LK(RE, RA, RD, RC, RB, 9);
0525     S1(RE, RA, RD, RC, RB);     LK(RB, RD, RC, RE, RA, 10);
0526     S2(RB, RD, RC, RE, RA);     LK(RA, RD, RB, RE, RC, 11);
0527     S3(RA, RD, RB, RE, RC);     LK(RE, RC, RD, RA, RB, 12);
0528     S4(RE, RC, RD, RA, RB);     LK(RC, RD, RA, RB, RE, 13);
0529     S5(RC, RD, RA, RB, RE);     LK(RE, RC, RD, RB, RA, 14);
0530     S6(RE, RC, RD, RB, RA);     LK(RD, RA, RC, RB, RE, 15);
0531     S7(RD, RA, RC, RB, RE);     LK(RE, RC, RB, RD, RA, 16);
0532     S0(RE, RC, RB, RD, RA);     LK(RB, RC, RD, RE, RA, 17);
0533     S1(RB, RC, RD, RE, RA);     LK(RA, RD, RE, RB, RC, 18);
0534     S2(RA, RD, RE, RB, RC);     LK(RC, RD, RA, RB, RE, 19);
0535     S3(RC, RD, RA, RB, RE);     LK(RB, RE, RD, RC, RA, 20);
0536     S4(RB, RE, RD, RC, RA);     LK(RE, RD, RC, RA, RB, 21);
0537     S5(RE, RD, RC, RA, RB);     LK(RB, RE, RD, RA, RC, 22);
0538     S6(RB, RE, RD, RA, RC);     LK(RD, RC, RE, RA, RB, 23);
0539     S7(RD, RC, RE, RA, RB);     LK(RB, RE, RA, RD, RC, 24);
0540     S0(RB, RE, RA, RD, RC);     LK(RA, RE, RD, RB, RC, 25);
0541     S1(RA, RE, RD, RB, RC);     LK(RC, RD, RB, RA, RE, 26);
0542     S2(RC, RD, RB, RA, RE);     LK(RE, RD, RC, RA, RB, 27);
0543     S3(RE, RD, RC, RA, RB);     LK(RA, RB, RD, RE, RC, 28);
0544     S4(RA, RB, RD, RE, RC);     LK(RB, RD, RE, RC, RA, 29);
0545     S5(RB, RD, RE, RC, RA);     LK(RA, RB, RD, RC, RE, 30);
0546     S6(RA, RB, RD, RC, RE);     LK(RD, RE, RB, RC, RA, 31);
0547     S7(RD, RE, RB, RC, RA);      K(RA, RB, RC, RD, RE, 32); /* last round: plain key XOR (key 32), no linear mixing */
0548 
0549     movl arg_dst(%esp), %eax;
0550 
0551     cmpb $0, arg_xor(%esp); /* byte compare: only the low byte of the flag is examined */
0552     jnz .L__enc_xor4;
0553 
0554     write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* flag clear: overwrite dst with the result */
0555 
0556     RET;
0557 
0558 .L__enc_xor4:
0559     xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* flag set: dst ^= result */
0560 
0561     RET;
0562 SYM_FUNC_END(__serpent_enc_blk_4way)
0563 
0564 SYM_FUNC_START(serpent_dec_blk_4way)
0565     /* Decrypt four 16-byte blocks in one pass; input (stack arguments):
0566      *  arg_ctx(%esp): ctx, CTX
0567      *  arg_dst(%esp): dst
0568      *  arg_src(%esp): src
0569      *  clobbers %eax, %edx and %xmm0-%xmm7 */
0570 
0571     pcmpeqd RNOT, RNOT; /* RNOT = all-ones, the NOT mask used by the SI macros */
0572 
0573     movl arg_ctx(%esp), CTX;
0574 
0575     movl arg_src(%esp), %eax;
0576     read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* RA..RD = word 0..3 of each block; RT1 and RE are scratch */
0577 
0578                      K(RA, RB, RC, RD, RE, 32); /* start by XORing key 32, the last one the encrypt path applies */
0579     SI7(RA, RB, RC, RD, RE);    KL(RB, RD, RA, RE, RC, 31); /* 31 rounds of SI + KL with keys 31..1; each KL takes the registers in the order its SI left them */
0580     SI6(RB, RD, RA, RE, RC);    KL(RA, RC, RE, RB, RD, 30);
0581     SI5(RA, RC, RE, RB, RD);    KL(RC, RD, RA, RE, RB, 29);
0582     SI4(RC, RD, RA, RE, RB);    KL(RC, RA, RB, RE, RD, 28);
0583     SI3(RC, RA, RB, RE, RD);    KL(RB, RC, RD, RE, RA, 27);
0584     SI2(RB, RC, RD, RE, RA);    KL(RC, RA, RE, RD, RB, 26);
0585     SI1(RC, RA, RE, RD, RB);    KL(RB, RA, RE, RD, RC, 25);
0586     SI0(RB, RA, RE, RD, RC);    KL(RE, RC, RA, RB, RD, 24);
0587     SI7(RE, RC, RA, RB, RD);    KL(RC, RB, RE, RD, RA, 23);
0588     SI6(RC, RB, RE, RD, RA);    KL(RE, RA, RD, RC, RB, 22);
0589     SI5(RE, RA, RD, RC, RB);    KL(RA, RB, RE, RD, RC, 21);
0590     SI4(RA, RB, RE, RD, RC);    KL(RA, RE, RC, RD, RB, 20);
0591     SI3(RA, RE, RC, RD, RB);    KL(RC, RA, RB, RD, RE, 19);
0592     SI2(RC, RA, RB, RD, RE);    KL(RA, RE, RD, RB, RC, 18);
0593     SI1(RA, RE, RD, RB, RC);    KL(RC, RE, RD, RB, RA, 17);
0594     SI0(RC, RE, RD, RB, RA);    KL(RD, RA, RE, RC, RB, 16);
0595     SI7(RD, RA, RE, RC, RB);    KL(RA, RC, RD, RB, RE, 15);
0596     SI6(RA, RC, RD, RB, RE);    KL(RD, RE, RB, RA, RC, 14);
0597     SI5(RD, RE, RB, RA, RC);    KL(RE, RC, RD, RB, RA, 13);
0598     SI4(RE, RC, RD, RB, RA);    KL(RE, RD, RA, RB, RC, 12);
0599     SI3(RE, RD, RA, RB, RC);    KL(RA, RE, RC, RB, RD, 11);
0600     SI2(RA, RE, RC, RB, RD);    KL(RE, RD, RB, RC, RA, 10);
0601     SI1(RE, RD, RB, RC, RA);    KL(RA, RD, RB, RC, RE, 9);
0602     SI0(RA, RD, RB, RC, RE);    KL(RB, RE, RD, RA, RC, 8);
0603     SI7(RB, RE, RD, RA, RC);    KL(RE, RA, RB, RC, RD, 7);
0604     SI6(RE, RA, RB, RC, RD);    KL(RB, RD, RC, RE, RA, 6);
0605     SI5(RB, RD, RC, RE, RA);    KL(RD, RA, RB, RC, RE, 5);
0606     SI4(RD, RA, RB, RC, RE);    KL(RD, RB, RE, RC, RA, 4);
0607     SI3(RD, RB, RE, RC, RA);    KL(RE, RD, RA, RC, RB, 3);
0608     SI2(RE, RD, RA, RC, RB);    KL(RD, RB, RC, RA, RE, 2);
0609     SI1(RD, RB, RC, RA, RE);    KL(RE, RB, RC, RA, RD, 1);
0610     SI0(RE, RB, RC, RA, RD);     K(RC, RD, RB, RE, RA, 0); /* final step: plain XOR with key 0 */
0611 
0612     movl arg_dst(%esp), %eax;
0613     write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); /* result words ended up in RC, RD, RB, RE; RA is the spare */
0614 
0615     RET;
0616 SYM_FUNC_END(serpent_dec_blk_4way)