/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

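/*
 * 16-byte constant in descending byte-index order, i.e. a shuffle mask that
 * reverses the byte order of a 128-bit value (the pattern expected by
 * vpshufb-style byte permutes).
 */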
.section    .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
    .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* structure of crypto context */
#define s0  0
#define s1  1024
#define s2  2048
#define s3  3072
#define w   4096
#define k   4128
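/*
 * The byte offsets above follow the layout of struct twofish_ctx: four
 * key-dependent S-box tables of 256 u32 entries each (1024 bytes apiece),
 * followed by the whitening keys (w) and the round subkeys (k).
 */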

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX0 %xmm8
#define RY0 %xmm9

#define RX1 %xmm10
#define RY1 %xmm11

#define RK1 %xmm12
#define RK2 %xmm13

#define RT %xmm14
#define RR %xmm15

#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


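/*
 * lookup_32bit: pass one 32-bit word through the four key-dependent S-box
 * tables (t0..t3), one byte per table, XOR-combining the four table entries
 * into dst.  Bytes are taken via the bl/bh sub-registers of src, with shrq
 * exposing the upper bytes; interleave_op lets the caller slot an extra
 * instruction (shr_next shifts src down to the next 32-bit word) in between
 * the table loads.
 */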
#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
    movzbl      src ## bl,        RID1d;     \
    movzbl      src ## bh,        RID2d;     \
    shrq $16,   src;                         \
    movl        t0(CTX, RID1, 4), dst ## d;  \
    movl        t1(CTX, RID2, 4), RID2d;     \
    movzbl      src ## bl,        RID1d;     \
    xorl        RID2d,            dst ## d;  \
    movzbl      src ## bh,        RID2d;     \
    interleave_op(il_reg);               \
    xorl        t2(CTX, RID1, 4), dst ## d;  \
    xorl        t3(CTX, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
    shrq $16,   reg;

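/*
 * G: apply the table substitution to both 64-bit halves held in gi1/gi2
 * (two 32-bit words each), then pack the four 32-bit results into the
 * RGS2/RGS3 register pair, ready to be moved back into an xmm register.
 */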
#define G(gi1, gi2, x, t0, t1, t2, t3) \
    lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
    lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
    \
    lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
    shlq $32,   RGS2;                                        \
    orq     RGS1, RGS2;                                  \
    lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
    shlq $32,   RGS1;                                        \
    orq     RGS1, RGS3;

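/*
 * round_head_2: compute g(a) and, via the rotated table order s1,s2,s3,s0,
 * g(ROL8(b)) for both 4-block groups.  The xmm halves are moved into
 * general-purpose registers with vmovq/vpextrq for the table lookups and
 * the results are reassembled with vmovq/vpinsrq, interleaving the two
 * groups to hide latency.
 */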
#define round_head_2(a, b, x1, y1, x2, y2) \
    vmovq       b ## 1, RGI3;           \
    vpextrq $1, b ## 1, RGI4;           \
    \
    G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
    vmovq       a ## 2, RGI1;           \
    vpextrq $1, a ## 2, RGI2;           \
    vmovq       RGS2, x1;               \
    vpinsrq $1, RGS3, x1, x1;           \
    \
    G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
    vmovq       b ## 2, RGI3;           \
    vpextrq $1, b ## 2, RGI4;           \
    vmovq       RGS2, y1;               \
    vpinsrq $1, RGS3, y1, y1;           \
    \
    G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
    vmovq       RGS2, x2;               \
    vpinsrq $1, RGS3, x2, x2;           \
    \
    G(RGI3, RGI4, y2, s1, s2, s3, s0);      \
    vmovq       RGS2, y2;               \
    vpinsrq $1, RGS3, y2, y2;

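/*
 * encround_tail/decround_tail: finish one Twofish round for four blocks:
 * pseudo-Hadamard transform of x and y, addition of the broadcast subkeys
 * RK1/RK2, XOR into the c/d target halves, and the fixed rotate-by-1 of
 * the target that is rotated after the XOR.  prerotate performs, one round
 * ahead, the rotate-by-1 of the target that must be rotated before its XOR.
 */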
#define encround_tail(a, b, c, d, x, y, prerotate) \
    vpaddd          x, y,   x; \
    vpaddd          x, RK1, RT;\
    prerotate(b);              \
    vpxor           RT, c,  c; \
    vpaddd          y, x,   y; \
    vpaddd          y, RK2, y; \
    vpsrld $1,      c, RT;     \
    vpslld $(32 - 1),   c, c;      \
    vpor            c, RT,  c; \
    vpxor           d, y,   d; \

#define decround_tail(a, b, c, d, x, y, prerotate) \
    vpaddd          x, y,   x; \
    vpaddd          x, RK1, RT;\
    prerotate(a);              \
    vpxor           RT, c,  c; \
    vpaddd          y, x,   y; \
    vpaddd          y, RK2, y; \
    vpxor           d, y,   d; \
    vpsrld $1,      d, y;      \
    vpslld $(32 - 1),   d, d;      \
    vpor            d, y,   d; \

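/*
 * rotate_1l rotates every 32-bit lane of x left by one bit; preload_rgi
 * moves the low and high 64-bit halves of a register into RGI1/RGI2 so
 * the next round's table lookups can start early.
 */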
#define rotate_1l(x) \
    vpslld $1,      x, RR;     \
    vpsrld $(32 - 1),   x, x;      \
    vpor            x, RR,  x;

#define preload_rgi(c) \
    vmovq           c, RGI1; \
    vpextrq $1,     c, RGI2;

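/*
 * encrypt_round/decrypt_round: broadcast the two 32-bit subkeys of round n
 * to all lanes with vbroadcastss, run the shared g-function head for both
 * 4-block groups, then apply the round tail to each group; preload starts
 * the GPR loads for the following round.
 */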
#define encrypt_round(n, a, b, c, d, preload, prerotate) \
    vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
    vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
    round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
    encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
    preload(c ## 1);                                         \
    encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

#define decrypt_round(n, a, b, c, d, preload, prerotate) \
    vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
    vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
    round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
    decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
    preload(c ## 1);                                         \
    decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

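/*
 * A cycle is two Feistel rounds with the register roles swapped; the
 * _last variants drop the preload/prerotate work that would only feed a
 * round that never runs.
 */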
#define encrypt_cycle(n) \
    encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
    encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

#define encrypt_cycle_last(n) \
    encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
    encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);

#define decrypt_cycle(n) \
    decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
    decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);

#define decrypt_cycle_last(n) \
    decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
    decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);

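/*
 * transpose_4x4: 4x4 transpose of 32-bit words across four xmm registers
 * using dword/qword unpack interleaves, converting between one 16-byte
 * block per register and one 32-bit word position (from four blocks) per
 * register.
 */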
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
    vpunpckldq      x1, x0, t0; \
    vpunpckhdq      x1, x0, t2; \
    vpunpckldq      x3, x2, t1; \
    vpunpckhdq      x3, x2, x3; \
    \
    vpunpcklqdq     t1, t0, x0; \
    vpunpckhqdq     t1, t0, x1; \
    vpunpcklqdq     x3, t2, x2; \
    vpunpckhqdq     x3, t2, x3;

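/*
 * inpack_blocks XORs each block with the input whitening keys (wkey holds
 * w[0..3]) and transposes the four blocks into word-sliced form;
 * outunpack_blocks transposes back and applies the output whitening.
 */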
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
    vpxor       x0, wkey, x0; \
    vpxor       x1, wkey, x1; \
    vpxor       x2, wkey, x2; \
    vpxor       x3, wkey, x3; \
    \
    transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
    transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
    \
    vpxor       x0, wkey, x0; \
    vpxor       x1, wkey, x1; \
    vpxor       x2, wkey, x2; \
    vpxor       x3, wkey, x3;

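/*
 * The core routines below process eight blocks as two interleaved groups
 * of four.  %r13 and %rbx are saved because the byte-addressable lookup
 * registers (RID1, RGI4) live in callee-saved registers; the swapped
 * register order of the result (RC, RD, RA, RB) reflects Twofish's final
 * half-swap being folded into the register naming.
 */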
.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
    /* input:
     *  %rdi: ctx, CTX
     *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
     * output:
     *  RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
     */

    vmovdqu w(CTX), RK1;

    pushq %r13;
    pushq %rbx;
    pushq %rcx;

    inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
    preload_rgi(RA1);
    rotate_1l(RD1);
    inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
    rotate_1l(RD2);

    encrypt_cycle(0);
    encrypt_cycle(1);
    encrypt_cycle(2);
    encrypt_cycle(3);
    encrypt_cycle(4);
    encrypt_cycle(5);
    encrypt_cycle(6);
    encrypt_cycle_last(7);

    vmovdqu (w+4*4)(CTX), RK1;

    popq %rcx;
    popq %rbx;
    popq %r13;

    outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
    outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

    RET;
SYM_FUNC_END(__twofish_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
    /* input:
     *  %rdi: ctx, CTX
     *  RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
     * output:
     *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
     */

    vmovdqu (w+4*4)(CTX), RK1;

    pushq %r13;
    pushq %rbx;

    inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
    preload_rgi(RC1);
    rotate_1l(RA1);
    inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
    rotate_1l(RA2);

    decrypt_cycle(7);
    decrypt_cycle(6);
    decrypt_cycle(5);
    decrypt_cycle(4);
    decrypt_cycle(3);
    decrypt_cycle(2);
    decrypt_cycle(1);
    decrypt_cycle_last(0);

    vmovdqu (w)(CTX), RK1;

    popq %rbx;
    popq %r13;

    outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
    outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

    RET;
SYM_FUNC_END(__twofish_dec_blk8)

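/*
 * Exported ECB/CBC entry points (System V calling convention: %rdi = ctx,
 * %rsi = dst, %rdx = src).  dst is stashed in %r11 because the core
 * routines use %rsi as a lookup index register (RID2); the load/store
 * helpers come from glue_helper-asm-avx.S.
 */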
SYM_FUNC_START(twofish_ecb_enc_8way)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    movq %rsi, %r11;

    load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    call __twofish_enc_blk8;

    store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

    FRAME_END
    RET;
SYM_FUNC_END(twofish_ecb_enc_8way)

SYM_FUNC_START(twofish_ecb_dec_8way)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    movq %rsi, %r11;

    load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

    call __twofish_dec_blk8;

    store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    FRAME_END
    RET;
SYM_FUNC_END(twofish_ecb_dec_8way)

SYM_FUNC_START(twofish_cbc_dec_8way)
    /* input:
     *  %rdi: ctx, CTX
     *  %rsi: dst
     *  %rdx: src
     */
    FRAME_BEGIN

    pushq %r12;

    movq %rsi, %r11;
    movq %rdx, %r12;

    load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

    call __twofish_dec_blk8;

    store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

    popq %r12;

    FRAME_END
    RET;
SYM_FUNC_END(twofish_cbc_dec_8way)