Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
0004  *
0005  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
0006  *
0007  * Based on crypto/serpent.c by
0008  *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
0009  *                2003 Herbert Valerio Riedel <hvr@gnu.org>
0010  */
0011 
0012 #include <linux/linkage.h>
0013 
0014 .file "serpent-sse2-x86_64-asm_64.S"
0015 .text
0016 
/*
 * CTX: pointer passed in %rdi by both entry points in this file.  get_key()
 * reads the 32-bit round-key words relative to this pointer.
 */
0017 #define CTX %rdi
0018 
0019 /**********************************************************************
0020   8-way SSE2 serpent
0021  **********************************************************************/
/*
 * Register allocation.
 *
 * RA1..RE1 hold the first group of four blocks, RA2..RE2 the second group.
 * The macros below build these names by pasting a "1" or "2" suffix onto
 * RA..RE, so the two groups are always processed with the same operand
 * pattern.  Five registers per group: four state words plus one temporary.
 */
0022 #define RA1 %xmm0
0023 #define RB1 %xmm1
0024 #define RC1 %xmm2
0025 #define RD1 %xmm3
0026 #define RE1 %xmm4
0027 
0028 #define RA2 %xmm5
0029 #define RB2 %xmm6
0030 #define RC2 %xmm7
0031 #define RD2 %xmm8
0032 #define RE2 %xmm9
0033 
/* All-ones constant (set with pcmpeqd at function entry); pxor with it is a bitwise NOT. */
0034 #define RNOT %xmm10
0035 
/* Round-key words 0..3 of the current round, each broadcast by get_key(). */
0036 #define RK0 %xmm11
0037 #define RK1 %xmm12
0038 #define RK2 %xmm13
0039 #define RK3 %xmm14
0040 
/*
 * S0, first half.  Bitwise (logic-instruction) evaluation of S-box 0 on the
 * four state registers x0..x3.  x4 is a temporary: it is written by movdqa
 * before it is ever read.  "pxor RNOT, r" complements r.
 * All five registers are left in an intermediate state consumed by S0_2.
 */
0041 #define S0_1(x0, x1, x2, x3, x4) \
0042     movdqa x3,      x4; \
0043     por x0,         x3; \
0044     pxor x4,        x0; \
0045     pxor x2,        x4; \
0046     pxor RNOT,      x4; \
0047     pxor x1,        x3; \
0048     pand x0,        x1; \
0049     pxor x4,        x1; \
0050     pxor x0,        x2;
/*
 * S0, second half; must directly follow S0_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x2, x1, x3, x0, x4) for the next step.
 */
0051 #define S0_2(x0, x1, x2, x3, x4) \
0052     pxor x3,        x0; \
0053     por x0,         x4; \
0054     pxor x2,        x0; \
0055     pand x1,        x2; \
0056     pxor x2,        x3; \
0057     pxor RNOT,      x1; \
0058     pxor x4,        x2; \
0059     pxor x2,        x1;
0060 
/*
 * S1, first half.  Bitwise evaluation of S-box 1 on x0..x3; x4 is a
 * temporary initialised here by movdqa.  "pxor RNOT, r" complements r.
 * Leaves an intermediate state for S1_2.
 */
0061 #define S1_1(x0, x1, x2, x3, x4) \
0062     movdqa x1,      x4; \
0063     pxor x0,        x1; \
0064     pxor x3,        x0; \
0065     pxor RNOT,      x3; \
0066     pand x1,        x4; \
0067     por x1,         x0; \
0068     pxor x2,        x3; \
0069     pxor x3,        x0; \
0070     pxor x3,        x1;
/*
 * S1, second half; must directly follow S1_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x4, x2, x3, x0, x1) for the next step.
 */
0071 #define S1_2(x0, x1, x2, x3, x4) \
0072     pxor x4,        x3; \
0073     por x4,         x1; \
0074     pxor x2,        x4; \
0075     pand x0,        x2; \
0076     pxor x1,        x2; \
0077     por x0,         x1; \
0078     pxor RNOT,      x0; \
0079     pxor x2,        x0; \
0080     pxor x1,        x4;
0081 
/*
 * S2, first half.  Bitwise evaluation of S-box 2 on x0..x3; x4 is a
 * temporary initialised here by movdqa (third instruction) before any read.
 * Leaves an intermediate state for S2_2.
 */
0082 #define S2_1(x0, x1, x2, x3, x4) \
0083     pxor RNOT,      x3; \
0084     pxor x0,        x1; \
0085     movdqa x0,      x4; \
0086     pand x2,        x0; \
0087     pxor x3,        x0; \
0088     por x4,         x3; \
0089     pxor x1,        x2; \
0090     pxor x1,        x3; \
0091     pand x0,        x1;
/*
 * S2, second half; must directly follow S2_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x4, x1, x0, x3, x2) for the next step.
 */
0092 #define S2_2(x0, x1, x2, x3, x4) \
0093     pxor x2,        x0; \
0094     pand x3,        x2; \
0095     por x1,         x3; \
0096     pxor RNOT,      x0; \
0097     pxor x0,        x3; \
0098     pxor x0,        x4; \
0099     pxor x2,        x0; \
0100     por x2,         x1;
0101 
/*
 * S3, first half.  Bitwise evaluation of S-box 3 on x0..x3; x4 is a
 * temporary initialised here by movdqa.  This S-box does not use RNOT.
 * Leaves an intermediate state for S3_2.
 */
0102 #define S3_1(x0, x1, x2, x3, x4) \
0103     movdqa x1,      x4; \
0104     pxor x3,        x1; \
0105     por x0,         x3; \
0106     pand x0,        x4; \
0107     pxor x2,        x0; \
0108     pxor x1,        x2; \
0109     pand x3,        x1; \
0110     pxor x3,        x2; \
0111     por x4,         x0; \
0112     pxor x3,        x4;
/*
 * S3, second half; must directly follow S3_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x3, x4, x1, x0, x2) for the next step.
 */
0113 #define S3_2(x0, x1, x2, x3, x4) \
0114     pxor x0,        x1; \
0115     pand x3,        x0; \
0116     pand x4,        x3; \
0117     pxor x2,        x3; \
0118     por x1,         x4; \
0119     pand x1,        x2; \
0120     pxor x3,        x4; \
0121     pxor x3,        x0; \
0122     pxor x2,        x3;
0123 
/*
 * S4, first half.  Bitwise evaluation of S-box 4 on x0..x3; x4 is a
 * temporary initialised here by movdqa.
 * Leaves an intermediate state for S4_2.
 */
0124 #define S4_1(x0, x1, x2, x3, x4) \
0125     movdqa x3,      x4; \
0126     pand x0,        x3; \
0127     pxor x4,        x0; \
0128     pxor x2,        x3; \
0129     por x4,         x2; \
0130     pxor x1,        x0; \
0131     pxor x3,        x4; \
0132     por x0,         x2; \
0133     pxor x1,        x2;
/*
 * S4, second half; must directly follow S4_1 on the same registers.
 * "pxor RNOT, r" complements r.  The callers in this file continue with
 * the operand order (x1, x2, x3, x4, x0) for the next step.
 */
0134 #define S4_2(x0, x1, x2, x3, x4) \
0135     pand x0,        x1; \
0136     pxor x4,        x1; \
0137     pand x2,        x4; \
0138     pxor x3,        x2; \
0139     pxor x0,        x4; \
0140     por x1,         x3; \
0141     pxor RNOT,      x1; \
0142     pxor x0,        x3;
0143 
/*
 * S5, first half.  Bitwise evaluation of S-box 5 on x0..x3; x4 is a
 * temporary initialised here by movdqa.  "pxor RNOT, r" complements r.
 * Leaves an intermediate state for S5_2.
 */
0144 #define S5_1(x0, x1, x2, x3, x4) \
0145     movdqa x1,      x4; \
0146     por x0,         x1; \
0147     pxor x1,        x2; \
0148     pxor RNOT,      x3; \
0149     pxor x0,        x4; \
0150     pxor x2,        x0; \
0151     pand x4,        x1; \
0152     por x3,         x4; \
0153     pxor x0,        x4;
/*
 * S5, second half; must directly follow S5_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x4, x0, x1, x3, x2) for the next step.
 */
0154 #define S5_2(x0, x1, x2, x3, x4) \
0155     pand x3,        x0; \
0156     pxor x3,        x1; \
0157     pxor x2,        x3; \
0158     pxor x1,        x0; \
0159     pand x4,        x2; \
0160     pxor x2,        x1; \
0161     pand x0,        x2; \
0162     pxor x2,        x3;
0163 
/*
 * S6, first half.  Bitwise evaluation of S-box 6 on x0..x3; x4 is a
 * temporary initialised here by movdqa.  "pxor RNOT, r" complements r.
 * Leaves an intermediate state for S6_2.
 */
0164 #define S6_1(x0, x1, x2, x3, x4) \
0165     movdqa x1,      x4; \
0166     pxor x0,        x3; \
0167     pxor x2,        x1; \
0168     pxor x0,        x2; \
0169     pand x3,        x0; \
0170     por x3,         x1; \
0171     pxor RNOT,      x4; \
0172     pxor x1,        x0; \
0173     pxor x2,        x1;
/*
 * S6, second half; must directly follow S6_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x2, x4, x1, x3, x0) for the next step.
 */
0174 #define S6_2(x0, x1, x2, x3, x4) \
0175     pxor x4,        x3; \
0176     pxor x0,        x4; \
0177     pand x0,        x2; \
0178     pxor x1,        x4; \
0179     pxor x3,        x2; \
0180     pand x1,        x3; \
0181     pxor x0,        x3; \
0182     pxor x2,        x1;
0183 
/*
 * S7, first half.  Bitwise evaluation of S-box 7 on x0..x3; x4 is a
 * temporary initialised here by movdqa (second instruction, from the
 * already complemented x1).  "pxor RNOT, r" complements r.
 * Leaves an intermediate state for S7_2.
 */
0184 #define S7_1(x0, x1, x2, x3, x4) \
0185     pxor RNOT,      x1; \
0186     movdqa x1,      x4; \
0187     pxor RNOT,      x0; \
0188     pand x2,        x1; \
0189     pxor x3,        x1; \
0190     por x4,         x3; \
0191     pxor x2,        x4; \
0192     pxor x3,        x2; \
0193     pxor x0,        x3; \
0194     por x1,         x0;
/*
 * S7, second half; must directly follow S7_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x4, x2, x3, x0, x1) for the next step.
 */
0195 #define S7_2(x0, x1, x2, x3, x4) \
0196     pand x0,        x2; \
0197     pxor x4,        x0; \
0198     pxor x3,        x4; \
0199     pand x0,        x3; \
0200     pxor x1,        x4; \
0201     pxor x4,        x2; \
0202     pxor x1,        x3; \
0203     por x0,         x4; \
0204     pxor x1,        x4;
0205 
/*
 * SI0 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI0_2.
 */
0206 #define SI0_1(x0, x1, x2, x3, x4) \
0207     movdqa x3,      x4; \
0208     pxor x0,        x1; \
0209     por x1,         x3; \
0210     pxor x1,        x4; \
0211     pxor RNOT,      x0; \
0212     pxor x3,        x2; \
0213     pxor x0,        x3; \
0214     pand x1,        x0; \
0215     pxor x2,        x0;
/*
 * SI0, second half; must follow SI0_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x2, x4, x1, x0, x3) for the next step.
 */
0216 #define SI0_2(x0, x1, x2, x3, x4) \
0217     pand x3,        x2; \
0218     pxor x4,        x3; \
0219     pxor x3,        x2; \
0220     pxor x3,        x1; \
0221     pand x0,        x3; \
0222     pxor x0,        x1; \
0223     pxor x2,        x0; \
0224     pxor x3,        x4;
0225 
/*
 * SI1 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa before any read.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI1_2.
 */
0226 #define SI1_1(x0, x1, x2, x3, x4) \
0227     pxor x3,        x1; \
0228     movdqa x0,      x4; \
0229     pxor x2,        x0; \
0230     pxor RNOT,      x2; \
0231     por x1,         x4; \
0232     pxor x3,        x4; \
0233     pand x1,        x3; \
0234     pxor x2,        x1; \
0235     pand x4,        x2;
/*
 * SI1, second half; must follow SI1_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x4, x1, x2, x3, x0) for the next step.
 */
0236 #define SI1_2(x0, x1, x2, x3, x4) \
0237     pxor x1,        x4; \
0238     por x3,         x1; \
0239     pxor x0,        x3; \
0240     pxor x0,        x2; \
0241     por x4,         x0; \
0242     pxor x4,        x2; \
0243     pxor x0,        x1; \
0244     pxor x1,        x4;
0245 
/*
 * SI2 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa before any read.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI2_2.
 */
0246 #define SI2_1(x0, x1, x2, x3, x4) \
0247     pxor x1,        x2; \
0248     movdqa x3,      x4; \
0249     pxor RNOT,      x3; \
0250     por x2,         x3; \
0251     pxor x4,        x2; \
0252     pxor x0,        x4; \
0253     pxor x1,        x3; \
0254     por x2,         x1; \
0255     pxor x0,        x2;
/*
 * SI2, second half; must follow SI2_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x1, x4, x3, x2, x0) for the next step.
 */
0256 #define SI2_2(x0, x1, x2, x3, x4) \
0257     pxor x4,        x1; \
0258     por x3,         x4; \
0259     pxor x3,        x2; \
0260     pxor x2,        x4; \
0261     pand x1,        x2; \
0262     pxor x3,        x2; \
0263     pxor x4,        x3; \
0264     pxor x0,        x4;
0265 
/*
 * SI3 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa before any read.
 * This S-box does not use RNOT.  Leaves an intermediate state for SI3_2.
 */
0266 #define SI3_1(x0, x1, x2, x3, x4) \
0267     pxor x1,        x2; \
0268     movdqa x1,      x4; \
0269     pand x2,        x1; \
0270     pxor x0,        x1; \
0271     por x4,         x0; \
0272     pxor x3,        x4; \
0273     pxor x3,        x0; \
0274     por x1,         x3; \
0275     pxor x2,        x1;
/*
 * SI3, second half; must follow SI3_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x2, x0, x4, x3, x1) for the next step.
 */
0276 #define SI3_2(x0, x1, x2, x3, x4) \
0277     pxor x3,        x1; \
0278     pxor x2,        x0; \
0279     pxor x3,        x2; \
0280     pand x1,        x3; \
0281     pxor x0,        x1; \
0282     pand x2,        x0; \
0283     pxor x3,        x4; \
0284     pxor x0,        x3; \
0285     pxor x1,        x0;
0286 
/*
 * SI4 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa before any read.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI4_2.
 */
0287 #define SI4_1(x0, x1, x2, x3, x4) \
0288     pxor x3,        x2; \
0289     movdqa x0,      x4; \
0290     pand x1,        x0; \
0291     pxor x2,        x0; \
0292     por x3,         x2; \
0293     pxor RNOT,      x4; \
0294     pxor x0,        x1; \
0295     pxor x2,        x0; \
0296     pand x4,        x2;
/*
 * SI4, second half; must follow SI4_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x0, x2, x4, x3, x1) for the next step.
 */
0297 #define SI4_2(x0, x1, x2, x3, x4) \
0298     pxor x0,        x2; \
0299     por x4,         x0; \
0300     pxor x3,        x0; \
0301     pand x2,        x3; \
0302     pxor x3,        x4; \
0303     pxor x1,        x3; \
0304     pand x0,        x1; \
0305     pxor x1,        x4; \
0306     pxor x3,        x0;
0307 
/*
 * SI5 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI5_2.
 */
0308 #define SI5_1(x0, x1, x2, x3, x4) \
0309     movdqa x1,      x4; \
0310     por x2,         x1; \
0311     pxor x4,        x2; \
0312     pxor x3,        x1; \
0313     pand x4,        x3; \
0314     pxor x3,        x2; \
0315     por x0,         x3; \
0316     pxor RNOT,      x0; \
0317     pxor x2,        x3; \
0318     por x0,         x2;
/*
 * SI5, second half; must follow SI5_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x1, x4, x0, x2, x3) for the next step.
 */
0319 #define SI5_2(x0, x1, x2, x3, x4) \
0320     pxor x1,        x4; \
0321     pxor x4,        x2; \
0322     pand x0,        x4; \
0323     pxor x1,        x0; \
0324     pxor x3,        x1; \
0325     pand x2,        x0; \
0326     pxor x3,        x2; \
0327     pxor x2,        x0; \
0328     pxor x4,        x2; \
0329     pxor x3,        x4;
0330 
/*
 * SI6 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa before any read.
 * Leaves an intermediate state for SI6_2.
 */
0331 #define SI6_1(x0, x1, x2, x3, x4) \
0332     pxor x2,        x0; \
0333     movdqa x0,      x4; \
0334     pand x3,        x0; \
0335     pxor x3,        x2; \
0336     pxor x2,        x0; \
0337     pxor x1,        x3; \
0338     por x4,         x2; \
0339     pxor x3,        x2; \
0340     pand x0,        x3;
/*
 * SI6, second half; must follow SI6_1 on the same registers.
 * "pxor RNOT, r" complements r.  The callers in this file continue with
 * the operand order (x2, x4, x3, x0, x1) for the next step.
 */
0341 #define SI6_2(x0, x1, x2, x3, x4) \
0342     pxor RNOT,      x0; \
0343     pxor x1,        x3; \
0344     pand x2,        x1; \
0345     pxor x0,        x4; \
0346     pxor x4,        x3; \
0347     pxor x2,        x4; \
0348     pxor x1,        x0; \
0349     pxor x0,        x2;
0350 
/*
 * SI7 (used by the decryption path), first half.  Bitwise evaluation on
 * x0..x3; x4 is a temporary initialised here by movdqa.
 * "pxor RNOT, r" complements r.  Leaves an intermediate state for SI7_2.
 */
0351 #define SI7_1(x0, x1, x2, x3, x4) \
0352     movdqa x3,      x4; \
0353     pand x0,        x3; \
0354     pxor x2,        x0; \
0355     por x4,         x2; \
0356     pxor x1,        x4; \
0357     pxor RNOT,      x0; \
0358     por x3,         x1; \
0359     pxor x0,        x4; \
0360     pand x2,        x0; \
0361     pxor x1,        x0;
/*
 * SI7, second half; must follow SI7_1 on the same registers.
 * The callers in this file continue with the operand order
 * (x1, x3, x0, x4, x2) for the next step.
 */
0362 #define SI7_2(x0, x1, x2, x3, x4) \
0363     pand x2,        x1; \
0364     pxor x2,        x3; \
0365     pxor x3,        x4; \
0366     pand x3,        x2; \
0367     por x0,         x3; \
0368     pxor x4,        x1; \
0369     pxor x4,        x3; \
0370     pand x0,        x4; \
0371     pxor x2,        x4;
0372 
/*
 * get_key(round, word, dst)
 *
 * Fetch one 32-bit round-key word from the context and replicate it into all
 * four dword lanes of dst.  The word is read from byte offset
 * (4 * round + word) * 4 relative to CTX, i.e. four words per round.
 */
0373 #define get_key(round, word, dst) \
0374     movd (4*(round)+(word))*4(CTX), dst; \
0375     pshufd $0, dst, dst;
0376 
/*
 * K2(a, b, c, d, tmp, round)
 *
 * Key mixing for both register groups: load the four words of round key
 * `round` into RK0..RK3 and XOR them into a/b/c/d of group 1 and then of
 * group 2.  `tmp` is accepted so that the operand list matches the other
 * round macros; it is not referenced here.
 */
0377 #define K2(a, b, c, d, tmp, round) \
0378     get_key(round, 0, RK0); \
0379     get_key(round, 1, RK1); \
0380     get_key(round, 2, RK2); \
0381     get_key(round, 3, RK3); \
0382     pxor RK0,       a ## 1; \
0383     pxor RK1,       b ## 1; \
0384     pxor RK2,       c ## 1; \
0385     pxor RK3,       d ## 1; \
0386         pxor RK0,       a ## 2; \
0387         pxor RK1,       b ## 2; \
0388         pxor RK2,       c ## 2; \
0389         pxor RK3,       d ## 2;
0390 
/*
 * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by key mixing
 * with round key i, applied to both register groups (suffix 1 and suffix 2).
 *
 * Per 32-bit lane the sequence computes, in this order:
 *   x0 = rol(x0, 13);  x1 ^= x0;
 *   x2 = rol(x2, 3);   x1 ^= x2;
 *   x1 = rol(x1, 1);   x3 ^= x2 ^ (x0 << 3);
 *   x3 = rol(x3, 7);   x0 ^= x1 ^ x3;  x2 ^= x3 ^ (x1 << 7);
 *   x1 ^= RK1;  x3 ^= RK3;
 *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *   x0 ^= RK0;  x2 ^= RK2;
 *
 * SSE2 has no rotate, so each rol is a copy into x4 plus pslld/psrld/por.
 * x4 is therefore clobbered.  The work for group 1 (less indented) and
 * group 2 (more indented) is interleaved, and the four get_key() loads for
 * round i are spread between the arithmetic; RK0..RK3 are overwritten.
 */
0391 #define LK2(x0, x1, x2, x3, x4, i) \
0392     movdqa x0 ## 1,     x4 ## 1; \
0393     pslld $13,      x0 ## 1; \
0394     psrld $(32 - 13),   x4 ## 1; \
0395     por x4 ## 1,        x0 ## 1; \
0396     pxor x0 ## 1,       x1 ## 1; \
0397     movdqa x2 ## 1,     x4 ## 1; \
0398     pslld $3,       x2 ## 1; \
0399     psrld $(32 - 3),    x4 ## 1; \
0400     por x4 ## 1,        x2 ## 1; \
0401     pxor x2 ## 1,       x1 ## 1; \
0402         movdqa x0 ## 2,     x4 ## 2; \
0403         pslld $13,      x0 ## 2; \
0404         psrld $(32 - 13),   x4 ## 2; \
0405         por x4 ## 2,        x0 ## 2; \
0406         pxor x0 ## 2,       x1 ## 2; \
0407         movdqa x2 ## 2,     x4 ## 2; \
0408         pslld $3,       x2 ## 2; \
0409         psrld $(32 - 3),    x4 ## 2; \
0410         por x4 ## 2,        x2 ## 2; \
0411         pxor x2 ## 2,       x1 ## 2; \
0412     movdqa x1 ## 1,     x4 ## 1; \
0413     pslld $1,       x1 ## 1; \
0414     psrld $(32 - 1),    x4 ## 1; \
0415     por x4 ## 1,        x1 ## 1; \
0416     movdqa x0 ## 1,     x4 ## 1; \
0417     pslld $3,       x4 ## 1; \
0418     pxor x2 ## 1,       x3 ## 1; \
0419     pxor x4 ## 1,       x3 ## 1; \
0420     movdqa x3 ## 1,     x4 ## 1; \
0421     get_key(i, 1, RK1); \
0422         movdqa x1 ## 2,     x4 ## 2; \
0423         pslld $1,       x1 ## 2; \
0424         psrld $(32 - 1),    x4 ## 2; \
0425         por x4 ## 2,        x1 ## 2; \
0426         movdqa x0 ## 2,     x4 ## 2; \
0427         pslld $3,       x4 ## 2; \
0428         pxor x2 ## 2,       x3 ## 2; \
0429         pxor x4 ## 2,       x3 ## 2; \
0430         movdqa x3 ## 2,     x4 ## 2; \
0431         get_key(i, 3, RK3); \
0432     pslld $7,       x3 ## 1; \
0433     psrld $(32 - 7),    x4 ## 1; \
0434     por x4 ## 1,        x3 ## 1; \
0435     movdqa x1 ## 1,     x4 ## 1; \
0436     pslld $7,       x4 ## 1; \
0437     pxor x1 ## 1,       x0 ## 1; \
0438     pxor x3 ## 1,       x0 ## 1; \
0439     pxor x3 ## 1,       x2 ## 1; \
0440     pxor x4 ## 1,       x2 ## 1; \
0441     get_key(i, 0, RK0); \
0442         pslld $7,       x3 ## 2; \
0443         psrld $(32 - 7),    x4 ## 2; \
0444         por x4 ## 2,        x3 ## 2; \
0445         movdqa x1 ## 2,     x4 ## 2; \
0446         pslld $7,       x4 ## 2; \
0447         pxor x1 ## 2,       x0 ## 2; \
0448         pxor x3 ## 2,       x0 ## 2; \
0449         pxor x3 ## 2,       x2 ## 2; \
0450         pxor x4 ## 2,       x2 ## 2; \
0451         get_key(i, 2, RK2); \
0452     pxor RK1,       x1 ## 1; \
0453     pxor RK3,       x3 ## 1; \
0454     movdqa x0 ## 1,     x4 ## 1; \
0455     pslld $5,       x0 ## 1; \
0456     psrld $(32 - 5),    x4 ## 1; \
0457     por x4 ## 1,        x0 ## 1; \
0458     movdqa x2 ## 1,     x4 ## 1; \
0459     pslld $22,      x2 ## 1; \
0460     psrld $(32 - 22),   x4 ## 1; \
0461     por x4 ## 1,        x2 ## 1; \
0462     pxor RK0,       x0 ## 1; \
0463     pxor RK2,       x2 ## 1; \
0464         pxor RK1,       x1 ## 2; \
0465         pxor RK3,       x3 ## 2; \
0466         movdqa x0 ## 2,     x4 ## 2; \
0467         pslld $5,       x0 ## 2; \
0468         psrld $(32 - 5),    x4 ## 2; \
0469         por x4 ## 2,        x0 ## 2; \
0470         movdqa x2 ## 2,     x4 ## 2; \
0471         pslld $22,      x2 ## 2; \
0472         psrld $(32 - 22),   x4 ## 2; \
0473         por x4 ## 2,        x2 ## 2; \
0474         pxor RK0,       x0 ## 2; \
0475         pxor RK2,       x2 ## 2;
0476 
/*
 * KL2(x0, x1, x2, x3, x4, i): key mixing followed by the inverse linear
 * transformation, applied to both register groups (suffix 1 and suffix 2).
 * This is the decryption-side counterpart of LK2.
 *
 * NOTE: the parameter i is not referenced in the body and no key is loaded
 * here.  RK0..RK3 must already hold the broadcast round-key words; in this
 * file the preceding SP() invocation loads them.
 *
 * Per 32-bit lane the sequence computes, in this order:
 *   x0 ^= RK0;  x2 ^= RK2;  x0 = ror(x0, 5);
 *   x3 ^= RK3;  x1 ^= RK1;  x2 = ror(x2, 22);
 *   x2 ^= x3 ^ (x1 << 7);   x0 ^= x3 ^ x1;
 *   x1 = ror(x1, 1);        x3 = ror(x3, 7);
 *   x1 ^= x0;               x3 ^= (x0 << 3);
 *   x0 = ror(x0, 13);
 *   x1 ^= x2;  x3 ^= x2;    x2 = ror(x2, 3);
 *
 * Each ror is a copy into x4 plus psrld/pslld/por, so x4 is clobbered.
 * Group 1 (less indented) and group 2 (more indented) are interleaved.
 */
0477 #define KL2(x0, x1, x2, x3, x4, i) \
0478     pxor RK0,       x0 ## 1; \
0479     pxor RK2,       x2 ## 1; \
0480     movdqa x0 ## 1,     x4 ## 1; \
0481     psrld $5,       x0 ## 1; \
0482     pslld $(32 - 5),    x4 ## 1; \
0483     por x4 ## 1,        x0 ## 1; \
0484     pxor RK3,       x3 ## 1; \
0485     pxor RK1,       x1 ## 1; \
0486     movdqa x2 ## 1,     x4 ## 1; \
0487     psrld $22,      x2 ## 1; \
0488     pslld $(32 - 22),   x4 ## 1; \
0489     por x4 ## 1,        x2 ## 1; \
0490     pxor x3 ## 1,       x2 ## 1; \
0491         pxor RK0,       x0 ## 2; \
0492         pxor RK2,       x2 ## 2; \
0493         movdqa x0 ## 2,     x4 ## 2; \
0494         psrld $5,       x0 ## 2; \
0495         pslld $(32 - 5),    x4 ## 2; \
0496         por x4 ## 2,        x0 ## 2; \
0497         pxor RK3,       x3 ## 2; \
0498         pxor RK1,       x1 ## 2; \
0499         movdqa x2 ## 2,     x4 ## 2; \
0500         psrld $22,      x2 ## 2; \
0501         pslld $(32 - 22),   x4 ## 2; \
0502         por x4 ## 2,        x2 ## 2; \
0503         pxor x3 ## 2,       x2 ## 2; \
0504     pxor x3 ## 1,       x0 ## 1; \
0505     movdqa x1 ## 1,     x4 ## 1; \
0506     pslld $7,       x4 ## 1; \
0507     pxor x1 ## 1,       x0 ## 1; \
0508     pxor x4 ## 1,       x2 ## 1; \
0509     movdqa x1 ## 1,     x4 ## 1; \
0510     psrld $1,       x1 ## 1; \
0511     pslld $(32 - 1),    x4 ## 1; \
0512     por x4 ## 1,        x1 ## 1; \
0513         pxor x3 ## 2,       x0 ## 2; \
0514         movdqa x1 ## 2,     x4 ## 2; \
0515         pslld $7,       x4 ## 2; \
0516         pxor x1 ## 2,       x0 ## 2; \
0517         pxor x4 ## 2,       x2 ## 2; \
0518         movdqa x1 ## 2,     x4 ## 2; \
0519         psrld $1,       x1 ## 2; \
0520         pslld $(32 - 1),    x4 ## 2; \
0521         por x4 ## 2,        x1 ## 2; \
0522     movdqa x3 ## 1,     x4 ## 1; \
0523     psrld $7,       x3 ## 1; \
0524     pslld $(32 - 7),    x4 ## 1; \
0525     por x4 ## 1,        x3 ## 1; \
0526     pxor x0 ## 1,       x1 ## 1; \
0527     movdqa x0 ## 1,     x4 ## 1; \
0528     pslld $3,       x4 ## 1; \
0529     pxor x4 ## 1,       x3 ## 1; \
0530     movdqa x0 ## 1,     x4 ## 1; \
0531         movdqa x3 ## 2,     x4 ## 2; \
0532         psrld $7,       x3 ## 2; \
0533         pslld $(32 - 7),    x4 ## 2; \
0534         por x4 ## 2,        x3 ## 2; \
0535         pxor x0 ## 2,       x1 ## 2; \
0536         movdqa x0 ## 2,     x4 ## 2; \
0537         pslld $3,       x4 ## 2; \
0538         pxor x4 ## 2,       x3 ## 2; \
0539         movdqa x0 ## 2,     x4 ## 2; \
0540     psrld $13,      x0 ## 1; \
0541     pslld $(32 - 13),   x4 ## 1; \
0542     por x4 ## 1,        x0 ## 1; \
0543     pxor x2 ## 1,       x1 ## 1; \
0544     pxor x2 ## 1,       x3 ## 1; \
0545     movdqa x2 ## 1,     x4 ## 1; \
0546     psrld $3,       x2 ## 1; \
0547     pslld $(32 - 3),    x4 ## 1; \
0548     por x4 ## 1,        x2 ## 1; \
0549         psrld $13,      x0 ## 2; \
0550         pslld $(32 - 13),   x4 ## 2; \
0551         por x4 ## 2,        x0 ## 2; \
0552         pxor x2 ## 2,       x1 ## 2; \
0553         pxor x2 ## 2,       x3 ## 2; \
0554         movdqa x2 ## 2,     x4 ## 2; \
0555         psrld $3,       x2 ## 2; \
0556         pslld $(32 - 3),    x4 ## 2; \
0557         por x4 ## 2,        x2 ## 2;
0558 
/*
 * S(box, a, b, c, d, e)
 *
 * Run both halves (box_1 then box_2) of the named S-box macro on register
 * group 1, then both halves on register group 2.  The register names are
 * formed by pasting the suffix 1 or 2 onto each of a..e.
 */
0559 #define S(box, a, b, c, d, e) \
0560     box ## _1(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1); \
0561     box ## _2(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1); \
0562     box ## _1(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2); \
0563     box ## _2(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2);
0564 
/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): S-box step with round-key prefetch.
 *
 * Applies SBOX_1 and SBOX_2 to register group 1 and group 2 (halves
 * interleaved: _1 on group 1, _1 on group 2, _2 on group 1, _2 on group 2)
 * and, between those halves, loads the four words of round key i into
 * RK0..RK3.  The keys are not applied here; the decryption path follows each
 * SP() with KL2(), which consumes RK0..RK3.
 *
 * The macro body ends on the last SBOX ## _2 line.  There is deliberately no
 * trailing line continuation, so the definition cannot absorb whatever line
 * follows it.
 */
0565 #define SP(SBOX, x0, x1, x2, x3, x4, i) \
0566     get_key(i, 0, RK0); \
0567     SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
0568     get_key(i, 2, RK2); \
0569     SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
0570     get_key(i, 3, RK3); \
0571     SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
0572     get_key(i, 1, RK1); \
0573     SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
0574 
/*
 * transpose_4x4(r0, r1, r2, r3, unused, tb, ta)
 *
 * In-place transpose of the 4x4 matrix of 32-bit elements whose rows are
 * r0..r3: afterwards rN holds element N of each of the four original rows.
 * Done with dword unpacks (pairs of rows) followed by qword unpacks.
 * ta and tb are clobbered; the fifth argument is accepted but never used.
 */
0575 #define transpose_4x4(r0, r1, r2, r3, unused, tb, ta) \
0576     movdqa r0,      ta; \
0577     punpckldq r1,       r0; \
0578     punpckhdq r1,       ta; \
0579     movdqa r2,      tb; \
0580     punpckhdq r3,       r2; \
0581     punpckldq r3,       tb; \
0582     movdqa r0,      r1; \
0583     punpcklqdq tb,      r0; \
0584     punpckhqdq tb,      r1; \
0585     movdqa ta,      r3; \
0586     punpcklqdq r2,      ta; \
0587     punpckhqdq r2,      r3; \
0588     movdqa ta,      r2;
0589 
/*
 * read_blocks(src, r0, r1, r2, r3, s0, s1, s2)
 *
 * Load four consecutive 16-byte vectors from src with unaligned loads, then
 * transpose them so that each of r0..r3 holds one 32-bit word position taken
 * from all four vectors.  s0..s2 are handed to transpose_4x4 as scratch.
 */
0590 #define read_blocks(src, r0, r1, r2, r3, s0, s1, s2) \
0591     movdqu (0*16)(src), r0; \
0592     movdqu (1*16)(src), r1; \
0593     movdqu (2*16)(src), r2; \
0594     movdqu (3*16)(src), r3; \
0595     \
0596     transpose_4x4(r0, r1, r2, r3, s0, s1, s2)
0597 
/*
 * write_blocks(dst, r0, r1, r2, r3, s0, s1, s2)
 *
 * Transpose r0..r3 back into per-block layout and store the four resulting
 * 16-byte vectors to dst, dst+16, dst+32 and dst+48 with unaligned stores.
 * s0..s2 are handed to transpose_4x4 as scratch.
 */
0598 #define write_blocks(dst, r0, r1, r2, r3, s0, s1, s2) \
0599     transpose_4x4(r0, r1, r2, r3, s0, s1, s2) \
0600     \
0601     movdqu r0,      (0*16)(dst); \
0602     movdqu r1,      (1*16)(dst); \
0603     movdqu r2,      (2*16)(dst); \
0604     movdqu r3,      (3*16)(dst);
0605 
/*
 * xor_blocks(dst, r0, r1, r2, r3, s0, s1, s2)
 *
 * Like write_blocks, but instead of overwriting the destination each result
 * vector is XORed with the 16 bytes already stored at dst+16*n and the sum is
 * written back.  All memory accesses are unaligned (movdqu).  After the
 * transpose, s0 is reused as the load temporary.
 */
0606 #define xor_blocks(dst, r0, r1, r2, r3, s0, s1, s2) \
0607     transpose_4x4(r0, r1, r2, r3, s0, s1, s2) \
0608     \
0609     movdqu (0*16)(dst),    s0; \
0610     pxor s0,        r0; \
0611     movdqu r0,      (0*16)(dst); \
0612     movdqu (1*16)(dst),    s0; \
0613     pxor s0,        r1; \
0614     movdqu r1,      (1*16)(dst); \
0615     movdqu (2*16)(dst),    s0; \
0616     pxor s0,        r2; \
0617     movdqu r2,      (2*16)(dst); \
0618     movdqu (3*16)(dst),    s0; \
0619     pxor s0,        r3; \
0620     movdqu r3,      (3*16)(dst);
0621 
/*
 * __serpent_enc_blk_8way: encrypt eight 16-byte blocks in parallel.
 *
 * Reads 128 bytes from src (two groups of four blocks, at src and src+64),
 * runs the key-mixing / S-box / linear-transform rounds on both groups, and
 * then either stores the result to dst or XORs it into the data already at
 * dst, depending on the fourth argument.
 *
 * Overwrites %rax and %xmm0-%xmm14; testb modifies the flags.
 */
0622 SYM_FUNC_START(__serpent_enc_blk_8way)
0623     /* input:
0624      *  %rdi: ctx, CTX
0625      *  %rsi: dst
0626      *  %rdx: src
0627      *  %rcx: bool, if true: xor output
0628      */
0629 
    /* RNOT = all ones; the S-box macros use it to complement registers */
0630     pcmpeqd RNOT, RNOT;
0631 
    /* %rax = address of the second group of four blocks (src + 64) */
0632     leaq (4*4*4)(%rdx), %rax;
0633     read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
0634     read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
0635 
    /*
     * Initial key mixing with round key 0, then 32 S-box applications
     * (S0..S7 repeated four times).  Each S-box leaves its result in a
     * permuted set of registers, so the operand order of every following
     * LK2 (and of the next S) is rotated accordingly instead of moving data.
     * The last S-box is followed by plain key mixing (K2, key 32) rather
     * than LK2.
     */
0636                          K2(RA, RB, RC, RD, RE, 0);
0637     S(S0, RA, RB, RC, RD, RE);      LK2(RC, RB, RD, RA, RE, 1);
0638     S(S1, RC, RB, RD, RA, RE);      LK2(RE, RD, RA, RC, RB, 2);
0639     S(S2, RE, RD, RA, RC, RB);      LK2(RB, RD, RE, RC, RA, 3);
0640     S(S3, RB, RD, RE, RC, RA);      LK2(RC, RA, RD, RB, RE, 4);
0641     S(S4, RC, RA, RD, RB, RE);      LK2(RA, RD, RB, RE, RC, 5);
0642     S(S5, RA, RD, RB, RE, RC);      LK2(RC, RA, RD, RE, RB, 6);
0643     S(S6, RC, RA, RD, RE, RB);      LK2(RD, RB, RA, RE, RC, 7);
0644     S(S7, RD, RB, RA, RE, RC);      LK2(RC, RA, RE, RD, RB, 8);
0645     S(S0, RC, RA, RE, RD, RB);      LK2(RE, RA, RD, RC, RB, 9);
0646     S(S1, RE, RA, RD, RC, RB);      LK2(RB, RD, RC, RE, RA, 10);
0647     S(S2, RB, RD, RC, RE, RA);      LK2(RA, RD, RB, RE, RC, 11);
0648     S(S3, RA, RD, RB, RE, RC);      LK2(RE, RC, RD, RA, RB, 12);
0649     S(S4, RE, RC, RD, RA, RB);      LK2(RC, RD, RA, RB, RE, 13);
0650     S(S5, RC, RD, RA, RB, RE);      LK2(RE, RC, RD, RB, RA, 14);
0651     S(S6, RE, RC, RD, RB, RA);      LK2(RD, RA, RC, RB, RE, 15);
0652     S(S7, RD, RA, RC, RB, RE);      LK2(RE, RC, RB, RD, RA, 16);
0653     S(S0, RE, RC, RB, RD, RA);      LK2(RB, RC, RD, RE, RA, 17);
0654     S(S1, RB, RC, RD, RE, RA);      LK2(RA, RD, RE, RB, RC, 18);
0655     S(S2, RA, RD, RE, RB, RC);      LK2(RC, RD, RA, RB, RE, 19);
0656     S(S3, RC, RD, RA, RB, RE);      LK2(RB, RE, RD, RC, RA, 20);
0657     S(S4, RB, RE, RD, RC, RA);      LK2(RE, RD, RC, RA, RB, 21);
0658     S(S5, RE, RD, RC, RA, RB);      LK2(RB, RE, RD, RA, RC, 22);
0659     S(S6, RB, RE, RD, RA, RC);      LK2(RD, RC, RE, RA, RB, 23);
0660     S(S7, RD, RC, RE, RA, RB);      LK2(RB, RE, RA, RD, RC, 24);
0661     S(S0, RB, RE, RA, RD, RC);      LK2(RA, RE, RD, RB, RC, 25);
0662     S(S1, RA, RE, RD, RB, RC);      LK2(RC, RD, RB, RA, RE, 26);
0663     S(S2, RC, RD, RB, RA, RE);      LK2(RE, RD, RC, RA, RB, 27);
0664     S(S3, RE, RD, RC, RA, RB);      LK2(RA, RB, RD, RE, RC, 28);
0665     S(S4, RA, RB, RD, RE, RC);      LK2(RB, RD, RE, RC, RA, 29);
0666     S(S5, RB, RD, RE, RC, RA);      LK2(RA, RB, RD, RC, RE, 30);
0667     S(S6, RA, RB, RD, RC, RE);      LK2(RD, RE, RB, RC, RA, 31);
0668     S(S7, RD, RE, RB, RC, RA);       K2(RA, RB, RC, RD, RE, 32);
0669 
    /* %rax = destination of the second group of four blocks (dst + 64) */
0670     leaq (4*4*4)(%rsi), %rax;
0671 
    /* only the low byte of %rcx is examined for the xor flag */
0672     testb %cl, %cl;
0673     jnz .L__enc_xor8;
0674 
    /* plain store of the result */
0675     write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
0676     write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
0677 
0678     RET;
0679 
0680 .L__enc_xor8:
    /* XOR the result into the data already present at dst */
0681     xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
0682     xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
0683 
0684     RET;
0685 SYM_FUNC_END(__serpent_enc_blk_8way)
0686 
/*
 * serpent_dec_blk_8way: decrypt eight 16-byte blocks in parallel.
 *
 * Reads 128 bytes from src (two groups of four blocks, at src and src+64),
 * runs the rounds in reverse (round keys 32 down to 0, inverse S-boxes
 * SI7..SI0 repeated four times) and stores 128 bytes to dst.
 *
 * Overwrites %rax and %xmm0-%xmm14.
 */
0687 SYM_FUNC_START(serpent_dec_blk_8way)
0688     /* input:
0689      *  %rdi: ctx, CTX
0690      *  %rsi: dst
0691      *  %rdx: src
0692      */
0693 
    /* RNOT = all ones; the S-box macros use it to complement registers */
0694     pcmpeqd RNOT, RNOT;
0695 
    /* %rax = address of the second group of four blocks (src + 64) */
0696     leaq (4*4*4)(%rdx), %rax;
0697     read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
0698     read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
0699 
    /*
     * Key mixing with round key 32 first.  Each following line is an inverse
     * S-box via SP(), which also loads round key i into RK0..RK3, followed
     * by KL2(), which applies those keys and the inverse linear transform.
     * As in the encrypt path, register operands are permuted from line to
     * line instead of moving data.  The final SI0 uses S() (no key
     * prefetch) because the closing K2() loads round key 0 itself.
     */
0700                          K2(RA, RB, RC, RD, RE, 32);
0701     SP(SI7, RA, RB, RC, RD, RE, 31);    KL2(RB, RD, RA, RE, RC, 31);
0702     SP(SI6, RB, RD, RA, RE, RC, 30);    KL2(RA, RC, RE, RB, RD, 30);
0703     SP(SI5, RA, RC, RE, RB, RD, 29);    KL2(RC, RD, RA, RE, RB, 29);
0704     SP(SI4, RC, RD, RA, RE, RB, 28);    KL2(RC, RA, RB, RE, RD, 28);
0705     SP(SI3, RC, RA, RB, RE, RD, 27);    KL2(RB, RC, RD, RE, RA, 27);
0706     SP(SI2, RB, RC, RD, RE, RA, 26);    KL2(RC, RA, RE, RD, RB, 26);
0707     SP(SI1, RC, RA, RE, RD, RB, 25);    KL2(RB, RA, RE, RD, RC, 25);
0708     SP(SI0, RB, RA, RE, RD, RC, 24);    KL2(RE, RC, RA, RB, RD, 24);
0709     SP(SI7, RE, RC, RA, RB, RD, 23);    KL2(RC, RB, RE, RD, RA, 23);
0710     SP(SI6, RC, RB, RE, RD, RA, 22);    KL2(RE, RA, RD, RC, RB, 22);
0711     SP(SI5, RE, RA, RD, RC, RB, 21);    KL2(RA, RB, RE, RD, RC, 21);
0712     SP(SI4, RA, RB, RE, RD, RC, 20);    KL2(RA, RE, RC, RD, RB, 20);
0713     SP(SI3, RA, RE, RC, RD, RB, 19);    KL2(RC, RA, RB, RD, RE, 19);
0714     SP(SI2, RC, RA, RB, RD, RE, 18);    KL2(RA, RE, RD, RB, RC, 18);
0715     SP(SI1, RA, RE, RD, RB, RC, 17);    KL2(RC, RE, RD, RB, RA, 17);
0716     SP(SI0, RC, RE, RD, RB, RA, 16);    KL2(RD, RA, RE, RC, RB, 16);
0717     SP(SI7, RD, RA, RE, RC, RB, 15);    KL2(RA, RC, RD, RB, RE, 15);
0718     SP(SI6, RA, RC, RD, RB, RE, 14);    KL2(RD, RE, RB, RA, RC, 14);
0719     SP(SI5, RD, RE, RB, RA, RC, 13);    KL2(RE, RC, RD, RB, RA, 13);
0720     SP(SI4, RE, RC, RD, RB, RA, 12);    KL2(RE, RD, RA, RB, RC, 12);
0721     SP(SI3, RE, RD, RA, RB, RC, 11);    KL2(RA, RE, RC, RB, RD, 11);
0722     SP(SI2, RA, RE, RC, RB, RD, 10);    KL2(RE, RD, RB, RC, RA, 10);
0723     SP(SI1, RE, RD, RB, RC, RA, 9);     KL2(RA, RD, RB, RC, RE, 9);
0724     SP(SI0, RA, RD, RB, RC, RE, 8);     KL2(RB, RE, RD, RA, RC, 8);
0725     SP(SI7, RB, RE, RD, RA, RC, 7);     KL2(RE, RA, RB, RC, RD, 7);
0726     SP(SI6, RE, RA, RB, RC, RD, 6);     KL2(RB, RD, RC, RE, RA, 6);
0727     SP(SI5, RB, RD, RC, RE, RA, 5);     KL2(RD, RA, RB, RC, RE, 5);
0728     SP(SI4, RD, RA, RB, RC, RE, 4);     KL2(RD, RB, RE, RC, RA, 4);
0729     SP(SI3, RD, RB, RE, RC, RA, 3);     KL2(RE, RD, RA, RC, RB, 3);
0730     SP(SI2, RE, RD, RA, RC, RB, 2);     KL2(RD, RB, RC, RA, RE, 2);
0731     SP(SI1, RD, RB, RC, RA, RE, 1);     KL2(RE, RB, RC, RA, RD, 1);
0732     S(SI0, RE, RB, RC, RA, RD);      K2(RC, RD, RB, RE, RA, 0);
0733 
    /*
     * The plaintext words end up in RC, RD, RB, RE (in that order), so those
     * registers are passed to write_blocks.  %rax = dst + 64 for group 2.
     */
0734     leaq (4*4*4)(%rsi), %rax;
0735     write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
0736     write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
0737 
0738     RET;
0739 SYM_FUNC_END(serpent_dec_blk_8way)