Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Blowfish Cipher Algorithm (x86_64)
0004  *
0005  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
0006  */
0007 
0008 #include <linux/linkage.h>
0009 
.file "blowfish-x86_64-asm.S"
.text

/*
 * Byte offsets into the crypto context.
 *
 * The context begins with the round-key array p (16 + 2 32-bit words),
 * immediately followed by four lookup tables s0..s3 of 256 32-bit words
 * each.
 */
#define BF_P_WORDS      (16 + 2)
#define BF_S_WORDS      256
#define BF_S_OFFSET(i)  ((BF_P_WORDS + ((i) * BF_S_WORDS)) * 4)

#define p   0
#define s0  BF_S_OFFSET(0)
#define s1  BF_S_OFFSET(1)
#define s2  BF_S_OFFSET(2)
#define s3  BF_S_OFFSET(3)
0019 
/*
 * Register macros.
 *
 * CTX is the base register for all p/s0..s3 accesses, RIO the pointer used
 * for block loads and stores, and RKEY holds a preloaded round-key pair in
 * the 4-way code. RX0..RX3 carry block state; RT0..RT3 are temporaries.
 *
 * Aliasing to keep in mind: RIO and RT1 are both %rsi, so RIO does not
 * survive F()/F4(); RT0 is %rdi, the register the ctx argument arrives in.
 */
#define CTX     %r12
#define RIO     %rsi
#define RKEY    %r10

/* Block registers: 64-bit, 32-bit (d), bits 7..0 (bl) and bits 15..8 (bh). */
#define RX0     %rax
#define RX0d    %eax
#define RX0bl   %al
#define RX0bh   %ah

#define RX1     %rbx
#define RX1d    %ebx
#define RX1bl   %bl
#define RX1bh   %bh

#define RX2     %rcx
#define RX2d    %ecx
#define RX2bl   %cl
#define RX2bh   %ch

#define RX3     %rdx
#define RX3d    %edx
#define RX3bl   %dl
#define RX3bh   %dh

/* Temporaries: 64-bit and 32-bit (d) views. */
#define RT0     %rdi
#define RT0d    %edi

#define RT1     %rsi
#define RT1d    %esi

#define RT2     %r8
#define RT2d    %r8d

#define RT3     %r9
#define RT3d    %r9d
0055 
0056 /***********************************************************************
0057  * 1-way blowfish
0058  ***********************************************************************/
/*
 * F(): one Feistel step on the block held in RX0.
 *
 * With x = low 32 bits of RX0, compute (32-bit arithmetic)
 *
 *   f = ((s0[x >> 24] + s1[(x >> 16) & 0xff]) ^ s2[(x >> 8) & 0xff])
 *       + s3[x & 0xff]
 *
 * then swap the 32-bit halves of RX0 and xor f into the new low half.
 * RT0d is only written by 32-bit operations, so the upper half of RT0 is
 * zero and the final xorq leaves the high half of RX0 untouched.
 *
 * Clobbers RT0, RT1 (which is also RIO) and RT2.
 *
 * The four phase macros below end in ';' and are invoked back to back, so
 * F() expands to one uninterrupted instruction sequence.
 */

/* RT0 = bits 31..24, RT1 = bits 23..16 of x; RX0 is rotated back afterwards. */
#define F_index_hi() \
    rorq    $16,                RX0; \
    movzbl  RX0bh,              RT0d; \
    movzbl  RX0bl,              RT1d; \
    rolq    $16,                RX0;

/* RT0d = s0[RT0] + s1[RT1] */
#define F_sum_s0_s1() \
    movl    s0(CTX,RT0,4),      RT0d; \
    addl    s1(CTX,RT1,4),      RT0d;

/* RT1 = bits 15..8, RT2 = bits 7..0 of x; then swap the halves of RX0. */
#define F_index_lo_swap() \
    movzbl  RX0bh,              RT1d; \
    movzbl  RX0bl,              RT2d; \
    rolq    $32,                RX0;

/* RT0d = (RT0d ^ s2[RT1]) + s3[RT2]; xor it into the low half of RX0. */
#define F_mix_s2_s3() \
    xorl    s2(CTX,RT1,4),      RT0d; \
    addl    s3(CTX,RT2,4),      RT0d; \
    xorq    RT0,                RX0;

#define F() \
    F_index_hi() \
    F_sum_s0_s1() \
    F_index_lo_swap() \
    F_mix_s2_s3()
0072 
/*
 * Xor the 8 bytes at p + 4*k into RX0: the 32-bit word p[k] lands on the
 * low half and p[k+1] on the high half.
 */
#define add_roundkey_enc(k) \
    xorq    p+4*(k)(CTX),       RX0;

/* One double round of encryption: key pair k, then two F() steps. */
#define round_enc(k) \
    add_roundkey_enc(k); \
    \
    F(); F();
0081 
/*
 * Xor a round-key pair into RX0 for decryption.
 *
 * Loads the 8 bytes at p + 4*(n-1) and swaps the halves, so that p[n] is
 * applied to the low half of RX0 and p[n-1] to the high half.
 *
 * The argument is parenthesized before the subtraction, matching
 * preload_roundkey_dec(), so an expression argument cannot regroup with
 * the "- 1".
 *
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
    movq p+4*((n)-1)(CTX),  RT0; \
    rorq $32,       RT0; \
    xorq RT0,       RX0;
0086 
/*
 * One double round of decryption: apply key pair n (see add_roundkey_dec),
 * then two F() steps.
 *
 * The last body line deliberately carries no trailing backslash, so the
 * macro ends here regardless of what follows it in the file.
 */
#define round_dec(n) \
    add_roundkey_dec(n); \
    \
    F(); \
    F();
0092 
/*
 * Load the 8-byte block at (RIO) into RX0. After the rotate and byte swap,
 * the first four input bytes, read as a big-endian word, sit in the low
 * half of RX0 and the last four in the high half.
 */
#define read_block() \
    movq    (RIO),              RX0; \
    rorq    $32,                RX0; \
    bswapq  RX0;

/*
 * Store RX0 to (RIO): the high half is written first as a big-endian word,
 * followed by the low half. There is no rotate here, so the halves leave in
 * the opposite order from how read_block() arranged them. RX0 is left
 * byte-swapped.
 */
#define write_block() \
    bswapq  RX0; \
    movq    RX0,                (RIO);

/* Same byte layout as write_block(), but xor into the 8 bytes at (RIO). */
#define xor_block() \
    bswapq  RX0; \
    xorq    RX0,                (RIO);
0105 
SYM_FUNC_START(__blowfish_enc_blk)
    /*
     * Encrypt a single 8-byte block.
     *
     * input:
     *  %rdi: ctx
     *  %rsi: dst
     *  %rdx: src
     *  %rcx: bool, if true: xor output
     *
     * No stack is used: the caller's %r12 is parked in %r11 while %r12
     * serves as CTX, and dst is parked in %r10 because %rsi (RIO/RT1) is
     * overwritten by F().
     */
    movq    %rsi, %r10;             /* keep dst */
    movq    %rdx, RIO;              /* read from src */
    movq    %r12, %r11;             /* keep caller's %r12 */
    movq    %rdi, CTX;

    read_block();

    round_enc(0);
    round_enc(2);
    round_enc(4);
    round_enc(6);
    round_enc(8);
    round_enc(10);
    round_enc(12);
    round_enc(14);
    add_roundkey_enc(16);

    movq    %r10, RIO;              /* output goes to dst */
    movq    %r11, %r12;             /* CTX no longer needed */

    testb   %cl, %cl;
    jnz     .L__enc_xor;

    write_block();
    RET;

.L__enc_xor:
    xor_block();
    RET;
SYM_FUNC_END(__blowfish_enc_blk)
0143 
SYM_FUNC_START(blowfish_dec_blk)
    /*
     * Decrypt a single 8-byte block.
     *
     * input:
     *  %rdi: ctx
     *  %rsi: dst
     *  %rdx: src
     *
     * As in the encrypt path, %r11 holds the caller's %r12 and %r10 holds
     * dst while %r12 (CTX) and %rsi (RIO/RT1) are in use.
     */
    movq    %rsi, %r10;             /* keep dst */
    movq    %rdx, RIO;              /* read from src */
    movq    %r12, %r11;             /* keep caller's %r12 */
    movq    %rdi, CTX;

    read_block();

    /* Round keys are consumed from the top of p downwards. */
    round_dec(17);
    round_dec(15);
    round_dec(13);
    round_dec(11);
    round_dec(9);
    round_dec(7);
    round_dec(5);
    round_dec(3);
    add_roundkey_dec(1);

    movq    %r11, %r12;             /* CTX no longer needed */

    movq    %r10, RIO;              /* output goes to dst */
    write_block();

    RET;
SYM_FUNC_END(blowfish_dec_blk)
0175 
0176 /**********************************************************************
0177   4-way blowfish, four blocks parallel
0178  **********************************************************************/
0179 
/*
 * F4(blk): F() for 4-way, applied to one block register (RX0..RX3).
 *
 * Slower when used alone/1-way, but faster when used parallel/4-way
 * (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * All four index bytes of the low 32 bits are extracted first:
 *   RT1 = bits 15..8,  RT3 = bits 7..0,
 *   RT0 = bits 31..24, RT2 = bits 23..16,
 * and the two 16-bit rotates together swap the 32-bit halves of blk. Then
 *   RT0d = ((s0[RT0] + s1[RT2]) ^ s2[RT1]) + s3[RT3]
 * is xored into the new low half.
 *
 * blk must be passed as the macro name (e.g. RX0), since the byte views
 * are formed by token pasting. Clobbers RT0..RT3 (RT1 is also RIO).
 */
#define F4(blk) \
    movzbl  blk ## bh,          RT1d; \
    movzbl  blk ## bl,          RT3d; \
    rorq    $16,                blk; \
    movzbl  blk ## bh,          RT0d; \
    movzbl  blk ## bl,          RT2d; \
    rorq    $16,                blk; \
    movl    s0(CTX,RT0,4),      RT0d; \
    addl    s1(CTX,RT2,4),      RT0d; \
    xorl    s2(CTX,RT1,4),      RT0d; \
    addl    s3(CTX,RT3,4),      RT0d; \
    xorq    RT0,                blk;
0195 
/* Xor the round-key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
    xorq    RKEY,               RX0; \
    xorq    RKEY,               RX1; \
    xorq    RKEY,               RX2; \
    xorq    RKEY,               RX3;
0201 
/* RKEY = the 8 bytes at p + 4*k: p[k] in the low half, p[k+1] in the high. */
#define preload_roundkey_enc(k) \
    movq    p+4*(k)(CTX),       RKEY;

/*
 * Apply the key pair already held in RKEY to all four blocks, then fetch
 * the pair for the following double round (index k + 2). The pair for
 * index k itself must have been preloaded beforehand.
 */
#define add_roundkey_enc4(k) \
    add_preloaded_roundkey4(); \
    preload_roundkey_enc(k + 2);

/* One double round of 4-way encryption: key pair, then two F4 passes. */
#define round_enc4(k) \
    add_roundkey_enc4(k); \
    \
    F4(RX0); F4(RX1); F4(RX2); F4(RX3); \
    \
    F4(RX0); F4(RX1); F4(RX2); F4(RX3);
0221 
/*
 * RKEY = round-key pair for decryption: the 8 bytes at p + 4*(k-1) with the
 * halves swapped, i.e. p[k] in the low half and p[k-1] in the high half.
 */
#define preload_roundkey_dec(k) \
    movq    p+4*((k)-1)(CTX),   RKEY; \
    rorq    $32,                RKEY;

/*
 * Apply the key pair already held in RKEY to all four blocks, then fetch
 * the pair for the following double round (index k - 2). The pair for
 * index k itself must have been preloaded beforehand.
 */
#define add_roundkey_dec4(k) \
    add_preloaded_roundkey4(); \
    preload_roundkey_dec(k - 2);

/* One double round of 4-way decryption: key pair, then two F4 passes. */
#define round_dec4(k) \
    add_roundkey_dec4(k); \
    \
    F4(RX0); F4(RX1); F4(RX2); F4(RX3); \
    \
    F4(RX0); F4(RX1); F4(RX2); F4(RX3);
0242 
/*
 * Per-register building blocks for the 4-way load/store macros. Each one
 * ends in ';' so they can be chained without separators.
 */

/* reg = 8 bytes at mem, rotated by 32 bits and byte-swapped. */
#define load_block_reg(mem, reg) \
    movq    mem,                reg; \
    rorq    $32,                reg; \
    bswapq  reg;

/* Byte-swap reg and store it to mem. */
#define store_block_reg(reg, mem) \
    bswapq  reg; \
    movq    reg,                mem;

/* Byte-swap reg and xor it into mem. */
#define xor_block_reg(reg, mem) \
    bswapq  reg; \
    xorq    reg,                mem;

/* Load four consecutive 8-byte blocks from (RIO) into RX0..RX3. */
#define read_block4() \
    load_block_reg((RIO),   RX0) \
    load_block_reg(8(RIO),  RX1) \
    load_block_reg(16(RIO), RX2) \
    load_block_reg(24(RIO), RX3)

/* Store RX0..RX3 as four consecutive 8-byte blocks at (RIO). */
#define write_block4() \
    store_block_reg(RX0, (RIO)) \
    store_block_reg(RX1, 8(RIO)) \
    store_block_reg(RX2, 16(RIO)) \
    store_block_reg(RX3, 24(RIO))

/* Xor RX0..RX3 into the four consecutive 8-byte blocks at (RIO). */
#define xor_block4() \
    xor_block_reg(RX0, (RIO)) \
    xor_block_reg(RX1, 8(RIO)) \
    xor_block_reg(RX2, 16(RIO)) \
    xor_block_reg(RX3, 24(RIO))
0285 
SYM_FUNC_START(__blowfish_enc_blk_4way)
    /*
     * Encrypt four consecutive 8-byte blocks.
     *
     * input:
     *  %rdi: ctx
     *  %rsi: dst
     *  %rdx: src
     *  %rcx: bool, if true: xor output
     *
     * %r12 (CTX) and %rbx (RX1) are saved on the stack. The xor flag is
     * pushed as well, because %rcx doubles as RX2; it is popped into %r12
     * once CTX is no longer needed. dst is kept in %r11.
     */
    pushq   %r12;
    pushq   %rbx;
    pushq   %rcx;                   /* xor flag */

    movq    %rsi, %r11;             /* keep dst */
    movq    %rdx, RIO;              /* read from src */
    movq    %rdi, CTX;

    preload_roundkey_enc(0);

    read_block4();

    round_enc4(0);
    round_enc4(2);
    round_enc4(4);
    round_enc4(6);
    round_enc4(8);
    round_enc4(10);
    round_enc4(12);
    round_enc4(14);
    add_preloaded_roundkey4();

    movq    %r11, RIO;              /* output goes to dst */
    popq    %r12;                   /* xor flag */

    testb   %r12b, %r12b;
    jnz     .L__enc_xor4;

    write_block4();

    popq    %rbx;
    popq    %r12;
    RET;

.L__enc_xor4:
    xor_block4();

    popq    %rbx;
    popq    %r12;
    RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
0334 
SYM_FUNC_START(blowfish_dec_blk_4way)
    /*
     * Decrypt four consecutive 8-byte blocks.
     *
     * input:
     *  %rdi: ctx
     *  %rsi: dst
     *  %rdx: src
     *
     * %r12 (CTX) and %rbx (RX1) are saved on the stack; dst is kept in
     * %r11 while %rsi serves as RIO/RT1.
     */
    pushq   %r12;
    pushq   %rbx;

    movq    %rsi, %r11;             /* keep dst */
    movq    %rdx, RIO;              /* read from src */
    movq    %rdi, CTX;

    preload_roundkey_dec(17);
    read_block4();

    /* Round keys are consumed from the top of p downwards. */
    round_dec4(17);
    round_dec4(15);
    round_dec4(13);
    round_dec4(11);
    round_dec4(9);
    round_dec4(7);
    round_dec4(5);
    round_dec4(3);
    add_preloaded_roundkey4();

    movq    %r11, RIO;              /* output goes to dst */
    write_block4();

    popq    %rbx;
    popq    %r12;

    RET;
SYM_FUNC_END(blowfish_dec_blk_4way)