/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.section    .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
    .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

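/*
 * .Lbswap128_mask above is a vpshufb shuffle mask that reverses the
 * byte order of a 128-bit lane.  None of the routines in this file
 * reference it; it is presumably retained for byte-order fixups (such
 * as big-endian counters) in companion code.
 */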
.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

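/*
 * Register roles: RA1-RD1 and RA2-RD2 hold two independent groups of
 * four 128-bit blocks (in word-sliced form after read_blocks, one
 * 32-bit word of each block per register), with RE1/RE2 as each
 * group's temporary.  tp is extra scratch, RNOT holds all-ones so NOT
 * can be done with vpxor, and RK0-RK3 hold the four broadcast words of
 * a round key.
 *
 * The S-box macros below implement each Serpent S-box as a branchless
 * boolean circuit over whole registers, so all 32 bit positions of a
 * word -- and hence all four blocks of a group -- pass through the
 * S-box in parallel.  Each circuit is split into _1/_2 halves so key
 * loads can be scheduled between them (see SP further down).  The
 * circuits leave their outputs in permuted registers, which is why the
 * register arguments rotate from round to round in the encryption and
 * decryption bodies below.
 */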

#define S0_1(x0, x1, x2, x3, x4)      \
    vpor        x0,   x3, tp; \
    vpxor       x3,   x0, x0; \
    vpxor       x2,   x3, x4; \
    vpxor       RNOT, x4, x4; \
    vpxor       x1,   tp, x3; \
    vpand       x0,   x1, x1; \
    vpxor       x4,   x1, x1; \
    vpxor       x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
    vpxor       x3,   x0, x0; \
    vpor        x0,   x4, x4; \
    vpxor       x2,   x0, x0; \
    vpand       x1,   x2, x2; \
    vpxor       x2,   x3, x3; \
    vpxor       RNOT, x1, x1; \
    vpxor       x4,   x2, x2; \
    vpxor       x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
    vpxor       x0,   x1, tp; \
    vpxor       x3,   x0, x0; \
    vpxor       RNOT, x3, x3; \
    vpand       tp,   x1, x4; \
    vpor        tp,   x0, x0; \
    vpxor       x2,   x3, x3; \
    vpxor       x3,   x0, x0; \
    vpxor       x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
    vpxor       x4,   x3, x3; \
    vpor        x4,   x1, x1; \
    vpxor       x2,   x4, x4; \
    vpand       x0,   x2, x2; \
    vpxor       x1,   x2, x2; \
    vpor        x0,   x1, x1; \
    vpxor       RNOT, x0, x0; \
    vpxor       x2,   x0, x0; \
    vpxor       x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
    vpxor       RNOT, x3, x3; \
    vpxor       x0,   x1, x1; \
    vpand       x2,   x0, tp; \
    vpxor       x3,   tp, tp; \
    vpor        x0,   x3, x3; \
    vpxor       x1,   x2, x2; \
    vpxor       x1,   x3, x3; \
    vpand       tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
    vpxor       x2,   tp, tp; \
    vpand       x3,   x2, x2; \
    vpor        x1,   x3, x3; \
    vpxor       RNOT, tp, tp; \
    vpxor       tp,   x3, x3; \
    vpxor       tp,   x0, x4; \
    vpxor       x2,   tp, x0; \
    vpor        x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
    vpxor       x3,   x1, tp; \
    vpor        x0,   x3, x3; \
    vpand       x0,   x1, x4; \
    vpxor       x2,   x0, x0; \
    vpxor       tp,   x2, x2; \
    vpand       x3,   tp, x1; \
    vpxor       x3,   x2, x2; \
    vpor        x4,   x0, x0; \
    vpxor       x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
    vpxor       x0,   x1, x1; \
    vpand       x3,   x0, x0; \
    vpand       x4,   x3, x3; \
    vpxor       x2,   x3, x3; \
    vpor        x1,   x4, x4; \
    vpand       x1,   x2, x2; \
    vpxor       x3,   x4, x4; \
    vpxor       x3,   x0, x0; \
    vpxor       x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
    vpand       x0,   x3, tp; \
    vpxor       x3,   x0, x0; \
    vpxor       x2,   tp, tp; \
    vpor        x3,   x2, x2; \
    vpxor       x1,   x0, x0; \
    vpxor       tp,   x3, x4; \
    vpor        x0,   x2, x2; \
    vpxor       x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
    vpand       x0,   x1, x1; \
    vpxor       x4,   x1, x1; \
    vpand       x2,   x4, x4; \
    vpxor       tp,   x2, x2; \
    vpxor       x0,   x4, x4; \
    vpor        x1,   tp, x3; \
    vpxor       RNOT, x1, x1; \
    vpxor       x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
    vpor        x0,   x1, tp; \
    vpxor       tp,   x2, x2; \
    vpxor       RNOT, x3, x3; \
    vpxor       x0,   x1, x4; \
    vpxor       x2,   x0, x0; \
    vpand       x4,   tp, x1; \
    vpor        x3,   x4, x4; \
    vpxor       x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
    vpand       x3,   x0, x0; \
    vpxor       x3,   x1, x1; \
    vpxor       x2,   x3, x3; \
    vpxor       x1,   x0, x0; \
    vpand       x4,   x2, x2; \
    vpxor       x2,   x1, x1; \
    vpand       x0,   x2, x2; \
    vpxor       x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
    vpxor       x0,   x3, x3; \
    vpxor       x2,   x1, tp; \
    vpxor       x0,   x2, x2; \
    vpand       x3,   x0, x0; \
    vpor        x3,   tp, tp; \
    vpxor       RNOT, x1, x4; \
    vpxor       tp,   x0, x0; \
    vpxor       x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
    vpxor       x4,   x3, x3; \
    vpxor       x0,   x4, x4; \
    vpand       x0,   x2, x2; \
    vpxor       x1,   x4, x4; \
    vpxor       x3,   x2, x2; \
    vpand       x1,   x3, x3; \
    vpxor       x0,   x3, x3; \
    vpxor       x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
    vpxor       RNOT, x1, tp; \
    vpxor       RNOT, x0, x0; \
    vpand       x2,   tp, x1; \
    vpxor       x3,   x1, x1; \
    vpor        tp,   x3, x3; \
    vpxor       x2,   tp, x4; \
    vpxor       x3,   x2, x2; \
    vpxor       x0,   x3, x3; \
    vpor        x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
    vpand       x0,   x2, x2; \
    vpxor       x4,   x0, x0; \
    vpxor       x3,   x4, x4; \
    vpand       x0,   x3, x3; \
    vpxor       x1,   x4, x4; \
    vpxor       x4,   x2, x2; \
    vpxor       x1,   x3, x3; \
    vpor        x0,   x4, x4; \
    vpxor       x1,   x4, x4;

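/*
 * Inverse S-boxes SI0-SI7: the same boolean-circuit construction as
 * S0-S7 above, computing the inverse substitutions for the decryption
 * path.
 */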
#define SI0_1(x0, x1, x2, x3, x4)     \
    vpxor       x0,   x1, x1; \
    vpor        x1,   x3, tp; \
    vpxor       x1,   x3, x4; \
    vpxor       RNOT, x0, x0; \
    vpxor       tp,   x2, x2; \
    vpxor       x0,   tp, x3; \
    vpand       x1,   x0, x0; \
    vpxor       x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
    vpand       x3,   x2, x2; \
    vpxor       x4,   x3, x3; \
    vpxor       x3,   x2, x2; \
    vpxor       x3,   x1, x1; \
    vpand       x0,   x3, x3; \
    vpxor       x0,   x1, x1; \
    vpxor       x2,   x0, x0; \
    vpxor       x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
    vpxor       x3,   x1, x1; \
    vpxor       x2,   x0, tp; \
    vpxor       RNOT, x2, x2; \
    vpor        x1,   x0, x4; \
    vpxor       x3,   x4, x4; \
    vpand       x1,   x3, x3; \
    vpxor       x2,   x1, x1; \
    vpand       x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
    vpxor       x1,   x4, x4; \
    vpor        x3,   x1, x1; \
    vpxor       tp,   x3, x3; \
    vpxor       tp,   x2, x2; \
    vpor        x4,   tp, x0; \
    vpxor       x4,   x2, x2; \
    vpxor       x0,   x1, x1; \
    vpxor       x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
    vpxor       x1,   x2, x2; \
    vpxor       RNOT, x3, tp; \
    vpor        x2,   tp, tp; \
    vpxor       x3,   x2, x2; \
    vpxor       x0,   x3, x4; \
    vpxor       x1,   tp, x3; \
    vpor        x2,   x1, x1; \
    vpxor       x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
    vpxor       x4,   x1, x1; \
    vpor        x3,   x4, x4; \
    vpxor       x3,   x2, x2; \
    vpxor       x2,   x4, x4; \
    vpand       x1,   x2, x2; \
    vpxor       x3,   x2, x2; \
    vpxor       x4,   x3, x3; \
    vpxor       x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
    vpxor       x1,   x2, x2; \
    vpand       x2,   x1, tp; \
    vpxor       x0,   tp, tp; \
    vpor        x1,   x0, x0; \
    vpxor       x3,   x1, x4; \
    vpxor       x3,   x0, x0; \
    vpor        tp,   x3, x3; \
    vpxor       x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
    vpxor       x3,   x1, x1; \
    vpxor       x2,   x0, x0; \
    vpxor       x3,   x2, x2; \
    vpand       x1,   x3, x3; \
    vpxor       x0,   x1, x1; \
    vpand       x2,   x0, x0; \
    vpxor       x3,   x4, x4; \
    vpxor       x0,   x3, x3; \
    vpxor       x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
    vpxor       x3,   x2, x2; \
    vpand       x1,   x0, tp; \
    vpxor       x2,   tp, tp; \
    vpor        x3,   x2, x2; \
    vpxor       RNOT, x0, x4; \
    vpxor       tp,   x1, x1; \
    vpxor       x2,   tp, x0; \
    vpand       x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
    vpxor       x0,   x2, x2; \
    vpor        x4,   x0, x0; \
    vpxor       x3,   x0, x0; \
    vpand       x2,   x3, x3; \
    vpxor       x3,   x4, x4; \
    vpxor       x1,   x3, x3; \
    vpand       x0,   x1, x1; \
    vpxor       x1,   x4, x4; \
    vpxor       x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
    vpor        x2,   x1, tp; \
    vpxor       x1,   x2, x2; \
    vpxor       x3,   tp, tp; \
    vpand       x1,   x3, x3; \
    vpxor       x3,   x2, x2; \
    vpor        x0,   x3, x3; \
    vpxor       RNOT, x0, x0; \
    vpxor       x2,   x3, x3; \
    vpor        x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
    vpxor       tp,   x1, x4; \
    vpxor       x4,   x2, x2; \
    vpand       x0,   x4, x4; \
    vpxor       tp,   x0, x0; \
    vpxor       x3,   tp, x1; \
    vpand       x2,   x0, x0; \
    vpxor       x3,   x2, x2; \
    vpxor       x2,   x0, x0; \
    vpxor       x4,   x2, x2; \
    vpxor       x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
    vpxor       x2,   x0, x0; \
    vpand       x3,   x0, tp; \
    vpxor       x3,   x2, x2; \
    vpxor       x2,   tp, tp; \
    vpxor       x1,   x3, x3; \
    vpor        x0,   x2, x2; \
    vpxor       x3,   x2, x2; \
    vpand       tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
    vpxor       RNOT, tp, tp; \
    vpxor       x1,   x3, x3; \
    vpand       x2,   x1, x1; \
    vpxor       tp,   x0, x4; \
    vpxor       x4,   x3, x3; \
    vpxor       x2,   x4, x4; \
    vpxor       x1,   tp, x0; \
    vpxor       x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
    vpand       x0,   x3, tp; \
    vpxor       x2,   x0, x0; \
    vpor        x3,   x2, x2; \
    vpxor       x1,   x3, x4; \
    vpxor       RNOT, x0, x0; \
    vpor        tp,   x1, x1; \
    vpxor       x0,   x4, x4; \
    vpand       x2,   x0, x0; \
    vpxor       x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
    vpand       x2,   x1, x1; \
    vpxor       x2,   tp, x3; \
    vpxor       x3,   x4, x4; \
    vpand       x3,   x2, x2; \
    vpor        x0,   x3, x3; \
    vpxor       x4,   x1, x1; \
    vpxor       x4,   x3, x3; \
    vpand       x0,   x4, x4; \
    vpxor       x2,   x4, x4;

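/*
 * Broadcast word j of round key i into all four dwords of t.  The
 * expanded key at CTX is laid out as an array of 16-byte round keys
 * (four 32-bit words each), so the byte offset is (4*i + j) * 4.
 */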
#define get_key(i, j, t) \
    vbroadcastss (4*(i)+(j))*4(CTX), t;

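/*
 * K2: XOR round key i into both four-block groups (Serpent's key
 * mixing).  The "## 1"/"## 2" token pasting selects the group, e.g.
 * x0 = RA expands to RA1 and RA2.
 */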
#define K2(x0, x1, x2, x3, x4, i) \
    get_key(i, 0, RK0); \
    get_key(i, 1, RK1); \
    get_key(i, 2, RK2); \
    get_key(i, 3, RK3); \
    vpxor RK0,  x0 ## 1, x0 ## 1; \
    vpxor RK1,  x1 ## 1, x1 ## 1; \
    vpxor RK2,  x2 ## 1, x2 ## 1; \
    vpxor RK3,  x3 ## 1, x3 ## 1; \
        vpxor RK0,  x0 ## 2, x0 ## 2; \
        vpxor RK1,  x1 ## 2, x1 ## 2; \
        vpxor RK2,  x2 ## 2, x2 ## 2; \
        vpxor RK3,  x3 ## 2, x3 ## 2;

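/*
 * LK2: the Serpent linear transformation followed by the key mixing
 * for round i, computed for both groups with their instruction streams
 * interleaved to hide latency (rotates are built from shift-shift-or,
 * as AVX has no vector rotate).  Per group, as C-style pseudocode
 * (rol32 = rotate left; rk[i][j] = word j of round key i, cf. get_key):
 *
 *    x0 = rol32(x0, 13);
 *    x2 = rol32(x2, 3);
 *    x1 = rol32(x1 ^ x0 ^ x2, 1);
 *    x3 = rol32(x3 ^ x2 ^ (x0 << 3), 7);
 *    x0 = rol32(x0 ^ x1 ^ x3, 5);
 *    x2 = rol32(x2 ^ x3 ^ (x1 << 7), 22);
 *    x0 ^= rk[i][0]; x1 ^= rk[i][1];
 *    x2 ^= rk[i][2]; x3 ^= rk[i][3];
 */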
#define LK2(x0, x1, x2, x3, x4, i) \
    vpslld $13,     x0 ## 1, x4 ## 1;          \
    vpsrld $(32 - 13),  x0 ## 1, x0 ## 1;          \
    vpor            x4 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           x0 ## 1, x1 ## 1, x1 ## 1; \
    vpslld $3,      x2 ## 1, x4 ## 1;          \
    vpsrld $(32 - 3),   x2 ## 1, x2 ## 1;          \
    vpor            x4 ## 1, x2 ## 1, x2 ## 1; \
    vpxor           x2 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $13,     x0 ## 2, x4 ## 2;          \
        vpsrld $(32 - 13),  x0 ## 2, x0 ## 2;          \
        vpor            x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           x0 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3,      x2 ## 2, x4 ## 2;          \
        vpsrld $(32 - 3),   x2 ## 2, x2 ## 2;          \
        vpor            x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor           x2 ## 2, x1 ## 2, x1 ## 2; \
    vpslld $1,      x1 ## 1, x4 ## 1;          \
    vpsrld $(32 - 1),   x1 ## 1, x1 ## 1;          \
    vpor            x4 ## 1, x1 ## 1, x1 ## 1; \
    vpslld $3,      x0 ## 1, x4 ## 1;          \
    vpxor           x2 ## 1, x3 ## 1, x3 ## 1; \
    vpxor           x4 ## 1, x3 ## 1, x3 ## 1; \
    get_key(i, 1, RK1); \
        vpslld $1,      x1 ## 2, x4 ## 2;          \
        vpsrld $(32 - 1),   x1 ## 2, x1 ## 2;          \
        vpor            x4 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3,      x0 ## 2, x4 ## 2;          \
        vpxor           x2 ## 2, x3 ## 2, x3 ## 2; \
        vpxor           x4 ## 2, x3 ## 2, x3 ## 2; \
        get_key(i, 3, RK3); \
    vpslld $7,      x3 ## 1, x4 ## 1;          \
    vpsrld $(32 - 7),   x3 ## 1, x3 ## 1;          \
    vpor            x4 ## 1, x3 ## 1, x3 ## 1; \
    vpslld $7,      x1 ## 1, x4 ## 1;          \
    vpxor           x1 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           x3 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           x3 ## 1, x2 ## 1, x2 ## 1; \
    vpxor           x4 ## 1, x2 ## 1, x2 ## 1; \
    get_key(i, 0, RK0); \
        vpslld $7,      x3 ## 2, x4 ## 2;          \
        vpsrld $(32 - 7),   x3 ## 2, x3 ## 2;          \
        vpor            x4 ## 2, x3 ## 2, x3 ## 2; \
        vpslld $7,      x1 ## 2, x4 ## 2;          \
        vpxor           x1 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           x3 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor           x4 ## 2, x2 ## 2, x2 ## 2; \
        get_key(i, 2, RK2); \
    vpxor           RK1, x1 ## 1, x1 ## 1;     \
    vpxor           RK3, x3 ## 1, x3 ## 1;     \
    vpslld $5,      x0 ## 1, x4 ## 1;          \
    vpsrld $(32 - 5),   x0 ## 1, x0 ## 1;          \
    vpor            x4 ## 1, x0 ## 1, x0 ## 1; \
    vpslld $22,     x2 ## 1, x4 ## 1;          \
    vpsrld $(32 - 22),  x2 ## 1, x2 ## 1;          \
    vpor            x4 ## 1, x2 ## 1, x2 ## 1; \
    vpxor           RK0, x0 ## 1, x0 ## 1;     \
    vpxor           RK2, x2 ## 1, x2 ## 1;     \
        vpxor           RK1, x1 ## 2, x1 ## 2;     \
        vpxor           RK3, x3 ## 2, x3 ## 2;     \
        vpslld $5,      x0 ## 2, x4 ## 2;          \
        vpsrld $(32 - 5),   x0 ## 2, x0 ## 2;          \
        vpor            x4 ## 2, x0 ## 2, x0 ## 2; \
        vpslld $22,     x2 ## 2, x4 ## 2;          \
        vpsrld $(32 - 22),  x2 ## 2, x2 ## 2;          \
        vpor            x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor           RK0, x0 ## 2, x0 ## 2;     \
        vpxor           RK2, x2 ## 2, x2 ## 2;

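/*
 * KL2: the inverse of LK2 for the decryption path -- XOR in round key
 * i (broadcast by the preceding SP), then undo the linear
 * transformation by performing the steps above in reverse order with
 * the rotate directions flipped.
 */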
#define KL2(x0, x1, x2, x3, x4, i) \
    vpxor           RK0, x0 ## 1, x0 ## 1;     \
    vpxor           RK2, x2 ## 1, x2 ## 1;     \
    vpsrld $5,      x0 ## 1, x4 ## 1;          \
    vpslld $(32 - 5),   x0 ## 1, x0 ## 1;          \
    vpor            x4 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           RK3, x3 ## 1, x3 ## 1;     \
    vpxor           RK1, x1 ## 1, x1 ## 1;     \
    vpsrld $22,     x2 ## 1, x4 ## 1;          \
    vpslld $(32 - 22),  x2 ## 1, x2 ## 1;          \
    vpor            x4 ## 1, x2 ## 1, x2 ## 1; \
    vpxor           x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor           RK0, x0 ## 2, x0 ## 2;     \
        vpxor           RK2, x2 ## 2, x2 ## 2;     \
        vpsrld $5,      x0 ## 2, x4 ## 2;          \
        vpslld $(32 - 5),   x0 ## 2, x0 ## 2;          \
        vpor            x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           RK3, x3 ## 2, x3 ## 2;     \
        vpxor           RK1, x1 ## 2, x1 ## 2;     \
        vpsrld $22,     x2 ## 2, x4 ## 2;          \
        vpslld $(32 - 22),  x2 ## 2, x2 ## 2;          \
        vpor            x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor           x3 ## 2, x2 ## 2, x2 ## 2; \
    vpxor           x3 ## 1, x0 ## 1, x0 ## 1; \
    vpslld $7,      x1 ## 1, x4 ## 1;          \
    vpxor           x1 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           x4 ## 1, x2 ## 1, x2 ## 1; \
    vpsrld $1,      x1 ## 1, x4 ## 1;          \
    vpslld $(32 - 1),   x1 ## 1, x1 ## 1;          \
    vpor            x4 ## 1, x1 ## 1, x1 ## 1; \
        vpxor           x3 ## 2, x0 ## 2, x0 ## 2; \
        vpslld $7,      x1 ## 2, x4 ## 2;          \
        vpxor           x1 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           x4 ## 2, x2 ## 2, x2 ## 2; \
        vpsrld $1,      x1 ## 2, x4 ## 2;          \
        vpslld $(32 - 1),   x1 ## 2, x1 ## 2;          \
        vpor            x4 ## 2, x1 ## 2, x1 ## 2; \
    vpsrld $7,      x3 ## 1, x4 ## 1;          \
    vpslld $(32 - 7),   x3 ## 1, x3 ## 1;          \
    vpor            x4 ## 1, x3 ## 1, x3 ## 1; \
    vpxor           x0 ## 1, x1 ## 1, x1 ## 1; \
    vpslld $3,      x0 ## 1, x4 ## 1;          \
    vpxor           x4 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $7,      x3 ## 2, x4 ## 2;          \
        vpslld $(32 - 7),   x3 ## 2, x3 ## 2;          \
        vpor            x4 ## 2, x3 ## 2, x3 ## 2; \
        vpxor           x0 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3,      x0 ## 2, x4 ## 2;          \
        vpxor           x4 ## 2, x3 ## 2, x3 ## 2; \
    vpsrld $13,     x0 ## 1, x4 ## 1;          \
    vpslld $(32 - 13),  x0 ## 1, x0 ## 1;          \
    vpor            x4 ## 1, x0 ## 1, x0 ## 1; \
    vpxor           x2 ## 1, x1 ## 1, x1 ## 1; \
    vpxor           x2 ## 1, x3 ## 1, x3 ## 1; \
    vpsrld $3,      x2 ## 1, x4 ## 1;          \
    vpslld $(32 - 3),   x2 ## 1, x2 ## 1;          \
    vpor            x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $13,     x0 ## 2, x4 ## 2;          \
        vpslld $(32 - 13),  x0 ## 2, x0 ## 2;          \
        vpor            x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor           x2 ## 2, x1 ## 2, x1 ## 2; \
        vpxor           x2 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $3,      x2 ## 2, x4 ## 2;          \
        vpslld $(32 - 3),   x2 ## 2, x2 ## 2;          \
        vpor            x4 ## 2, x2 ## 2, x2 ## 2;

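/*
 * S runs a full S-box circuit over both groups.  SP additionally
 * broadcasts the four words of round key i between the circuit halves,
 * overlapping the key loads with ALU work; the loaded key is consumed
 * by the KL2 that follows on the decryption path.
 */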
#define S(SBOX, x0, x1, x2, x3, x4) \
    SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
    SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
    get_key(i, 0, RK0); \
    SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    get_key(i, 2, RK2); \
    SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    get_key(i, 3, RK3); \
    SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
    get_key(i, 1, RK1); \
    SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

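/*
 * transpose_4x4 treats x0-x3 as a 4x4 matrix of 32-bit words and
 * transposes it with unpack operations, converting four loaded blocks
 * into the word-sliced layout the S-box circuits expect.  A transpose
 * is its own inverse, so read_blocks and write_blocks are the same
 * operation.
 */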
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
    vpunpckldq      x1, x0, t0; \
    vpunpckhdq      x1, x0, t2; \
    vpunpckldq      x3, x2, t1; \
    vpunpckhdq      x3, x2, x3; \
    \
    vpunpcklqdq     t1, t0, x0; \
    vpunpckhqdq     t1, t0, x1; \
    vpunpcklqdq     x3, t2, x2; \
    vpunpckhqdq     x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
    transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
    transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
    /* input:
     *  %rdi: ctx, CTX
     *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
     * output:
     *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
     */
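    /*
     * RNOT is set to all-ones for the NOT operations, the eight blocks
     * are transposed into word-sliced form, and 32 rounds run: the key
     * mixing for round 0 is done by K2, each S(Sn, ...)/LK2 pair then
     * applies S-box n and the linear transformation fused with the
     * next round's key mixing, and the final round replaces the
     * transformation with a plain key mixing against round key 32.
     */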

    vpcmpeqd RNOT, RNOT, RNOT;

    read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                         K2(RA, RB, RC, RD, RE, 0);
    S(S0, RA, RB, RC, RD, RE);      LK2(RC, RB, RD, RA, RE, 1);
    S(S1, RC, RB, RD, RA, RE);      LK2(RE, RD, RA, RC, RB, 2);
    S(S2, RE, RD, RA, RC, RB);      LK2(RB, RD, RE, RC, RA, 3);
    S(S3, RB, RD, RE, RC, RA);      LK2(RC, RA, RD, RB, RE, 4);
    S(S4, RC, RA, RD, RB, RE);      LK2(RA, RD, RB, RE, RC, 5);
    S(S5, RA, RD, RB, RE, RC);      LK2(RC, RA, RD, RE, RB, 6);
    S(S6, RC, RA, RD, RE, RB);      LK2(RD, RB, RA, RE, RC, 7);
    S(S7, RD, RB, RA, RE, RC);      LK2(RC, RA, RE, RD, RB, 8);
    S(S0, RC, RA, RE, RD, RB);      LK2(RE, RA, RD, RC, RB, 9);
    S(S1, RE, RA, RD, RC, RB);      LK2(RB, RD, RC, RE, RA, 10);
    S(S2, RB, RD, RC, RE, RA);      LK2(RA, RD, RB, RE, RC, 11);
    S(S3, RA, RD, RB, RE, RC);      LK2(RE, RC, RD, RA, RB, 12);
    S(S4, RE, RC, RD, RA, RB);      LK2(RC, RD, RA, RB, RE, 13);
    S(S5, RC, RD, RA, RB, RE);      LK2(RE, RC, RD, RB, RA, 14);
    S(S6, RE, RC, RD, RB, RA);      LK2(RD, RA, RC, RB, RE, 15);
    S(S7, RD, RA, RC, RB, RE);      LK2(RE, RC, RB, RD, RA, 16);
    S(S0, RE, RC, RB, RD, RA);      LK2(RB, RC, RD, RE, RA, 17);
    S(S1, RB, RC, RD, RE, RA);      LK2(RA, RD, RE, RB, RC, 18);
    S(S2, RA, RD, RE, RB, RC);      LK2(RC, RD, RA, RB, RE, 19);
    S(S3, RC, RD, RA, RB, RE);      LK2(RB, RE, RD, RC, RA, 20);
    S(S4, RB, RE, RD, RC, RA);      LK2(RE, RD, RC, RA, RB, 21);
    S(S5, RE, RD, RC, RA, RB);      LK2(RB, RE, RD, RA, RC, 22);
    S(S6, RB, RE, RD, RA, RC);      LK2(RD, RC, RE, RA, RB, 23);
    S(S7, RD, RC, RE, RA, RB);      LK2(RB, RE, RA, RD, RC, 24);
    S(S0, RB, RE, RA, RD, RC);      LK2(RA, RE, RD, RB, RC, 25);
    S(S1, RA, RE, RD, RB, RC);      LK2(RC, RD, RB, RA, RE, 26);
    S(S2, RC, RD, RB, RA, RE);      LK2(RE, RD, RC, RA, RB, 27);
    S(S3, RE, RD, RC, RA, RB);      LK2(RA, RB, RD, RE, RC, 28);
    S(S4, RA, RB, RD, RE, RC);      LK2(RB, RD, RE, RC, RA, 29);
    S(S5, RB, RD, RE, RC, RA);      LK2(RA, RB, RD, RC, RE, 30);
    S(S6, RA, RB, RD, RC, RE);      LK2(RD, RE, RB, RC, RA, 31);
    S(S7, RD, RE, RB, RC, RA);       K2(RA, RB, RC, RD, RE, 32);

    write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

    RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
    /* input:
     *  %rdi: ctx, CTX
     *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
     * output:
     *  RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
     */
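    /*
     * The reverse of the schedule above: round keys are consumed from
     * 32 down to 0.  Each SP(SIn, ..., i) applies inverse S-box n
     * while broadcasting round key i, which the following KL2 mixes in
     * before undoing the linear transformation; the last round ends
     * with a plain key mixing against round key 0.
     */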

    vpcmpeqd RNOT, RNOT, RNOT;

    read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                         K2(RA, RB, RC, RD, RE, 32);
    SP(SI7, RA, RB, RC, RD, RE, 31);    KL2(RB, RD, RA, RE, RC, 31);
    SP(SI6, RB, RD, RA, RE, RC, 30);    KL2(RA, RC, RE, RB, RD, 30);
    SP(SI5, RA, RC, RE, RB, RD, 29);    KL2(RC, RD, RA, RE, RB, 29);
    SP(SI4, RC, RD, RA, RE, RB, 28);    KL2(RC, RA, RB, RE, RD, 28);
    SP(SI3, RC, RA, RB, RE, RD, 27);    KL2(RB, RC, RD, RE, RA, 27);
    SP(SI2, RB, RC, RD, RE, RA, 26);    KL2(RC, RA, RE, RD, RB, 26);
    SP(SI1, RC, RA, RE, RD, RB, 25);    KL2(RB, RA, RE, RD, RC, 25);
    SP(SI0, RB, RA, RE, RD, RC, 24);    KL2(RE, RC, RA, RB, RD, 24);
    SP(SI7, RE, RC, RA, RB, RD, 23);    KL2(RC, RB, RE, RD, RA, 23);
    SP(SI6, RC, RB, RE, RD, RA, 22);    KL2(RE, RA, RD, RC, RB, 22);
    SP(SI5, RE, RA, RD, RC, RB, 21);    KL2(RA, RB, RE, RD, RC, 21);
    SP(SI4, RA, RB, RE, RD, RC, 20);    KL2(RA, RE, RC, RD, RB, 20);
    SP(SI3, RA, RE, RC, RD, RB, 19);    KL2(RC, RA, RB, RD, RE, 19);
    SP(SI2, RC, RA, RB, RD, RE, 18);    KL2(RA, RE, RD, RB, RC, 18);
    SP(SI1, RA, RE, RD, RB, RC, 17);    KL2(RC, RE, RD, RB, RA, 17);
    SP(SI0, RC, RE, RD, RB, RA, 16);    KL2(RD, RA, RE, RC, RB, 16);
    SP(SI7, RD, RA, RE, RC, RB, 15);    KL2(RA, RC, RD, RB, RE, 15);
    SP(SI6, RA, RC, RD, RB, RE, 14);    KL2(RD, RE, RB, RA, RC, 14);
    SP(SI5, RD, RE, RB, RA, RC, 13);    KL2(RE, RC, RD, RB, RA, 13);
    SP(SI4, RE, RC, RD, RB, RA, 12);    KL2(RE, RD, RA, RB, RC, 12);
    SP(SI3, RE, RD, RA, RB, RC, 11);    KL2(RA, RE, RC, RB, RD, 11);
    SP(SI2, RA, RE, RC, RB, RD, 10);    KL2(RE, RD, RB, RC, RA, 10);
    SP(SI1, RE, RD, RB, RC, RA, 9);     KL2(RA, RD, RB, RC, RE, 9);
    SP(SI0, RA, RD, RB, RC, RE, 8);     KL2(RB, RE, RD, RA, RC, 8);
    SP(SI7, RB, RE, RD, RA, RC, 7);     KL2(RE, RA, RB, RC, RD, 7);
    SP(SI6, RE, RA, RB, RC, RD, 6);     KL2(RB, RD, RC, RE, RA, 6);
    SP(SI5, RB, RD, RC, RE, RA, 5);     KL2(RD, RA, RB, RC, RE, 5);
    SP(SI4, RD, RA, RB, RC, RE, 4);     KL2(RD, RB, RE, RC, RA, 4);
    SP(SI3, RD, RB, RE, RC, RA, 3);     KL2(RE, RD, RA, RC, RB, 3);
    SP(SI2, RE, RD, RA, RC, RB, 2);     KL2(RD, RB, RC, RA, RE, 2);
    SP(SI1, RD, RB, RC, RA, RE, 1);     KL2(RE, RB, RC, RA, RD, 1);
    S(SI0, RE, RB, RC, RA, RD);      K2(RC, RD, RB, RE, RA, 0);

    write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
    write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

    RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)

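/*
 * Exported entry points.  Arguments follow the x86-64 SysV convention:
 * %rdi = round-key context, %rsi = destination, %rdx = source, each
 * buffer covering eight 16-byte blocks.  The load_8way, store_8way and
 * store_cbc_8way helpers come from glue_helper-asm-avx.S, included at
 * the top of this file.
 */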
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    call __serpent_enc_blk8_avx;

    store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    FRAME_END
    RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)

SYM_FUNC_START(serpent_ecb_dec_8way_avx)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    call __serpent_dec_blk8_avx;

    store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

    FRAME_END
    RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)

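/*
 * CBC decryption is parallelizable because each plaintext block is
 * D(C[i]) ^ C[i-1]: all eight blocks are deciphered at once, and
 * store_cbc_8way then XORs each result after the first with the
 * preceding ciphertext block still available in the source buffer
 * (chaining the first block against the IV is left to the caller).
 */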
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    call __serpent_dec_blk8_avx;

    store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

    FRAME_END
    RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)