0001
0002
0003
0004
0005
0006
0007
0008 #include <linux/linkage.h>
0009
0010 .file "blowfish-x86_64-asm.S"
0011 .text
0012
0013 /* Byte offsets of the cipher context fields; every access goes through CTX. */
0014 #define p 0 /* round-key array: 4-byte entries, always fetched as 8-byte pairs */
0015 #define s0 ((16 + 2) * 4) /* lookup table 0, placed right after 16 + 2 four-byte p entries */
0016 #define s1 ((16 + 2 + (1 * 256)) * 4) /* lookup table 1, 256 four-byte entries past s0 */
0017 #define s2 ((16 + 2 + (2 * 256)) * 4) /* lookup table 2, 256 four-byte entries past s1 */
0018 #define s3 ((16 + 2 + (3 * 256)) * 4) /* lookup table 3, 256 four-byte entries past s2 */
0019
0020 /* Register aliases. NOTE: RIO and RT1 are both %rsi, so the I/O pointer is lost as soon as F()/F4() runs. */
0021 #define CTX %r12 /* context base for all p/s0..s3 accesses; saved and restored by every entry point */
0022 #define RIO %rsi /* pointer used by the read/write/xor block macros */
0023 /* 64-bit block registers; F()/F4() depend on their bl/bh byte aliases defined further down */
0024 #define RX0 %rax
0025 #define RX1 %rbx
0026 #define RX2 %rcx
0027 #define RX3 %rdx
0028 /* low 32 bits of the block registers */
0029 #define RX0d %eax
0030 #define RX1d %ebx
0031 #define RX2d %ecx
0032 #define RX3d %edx
0033 /* bits 0-7 of the block registers */
0034 #define RX0bl %al
0035 #define RX1bl %bl
0036 #define RX2bl %cl
0037 #define RX3bl %dl
0038 /* bits 8-15 of the block registers */
0039 #define RX0bh %ah
0040 #define RX1bh %bh
0041 #define RX2bh %ch
0042 #define RX3bh %dh
0043 /* scratch registers: table indices and the accumulated round-function value (RT0) */
0044 #define RT0 %rdi
0045 #define RT1 %rsi
0046 #define RT2 %r8
0047 #define RT3 %r9
0048 /* 32-bit views of the scratch registers; a 32-bit write zero-extends into the full register */
0049 #define RT0d %edi
0050 #define RT1d %esi
0051 #define RT2d %r8d
0052 #define RT3d %r9d
0053 /* used by the 4-way macros only: holds the 8-byte key pair loaded ahead of its use */
0054 #define RKEY %r10
0055
0056 /* F(): one round step on the single block in RX0. Let x be the low 32 bits of RX0 on entry. */
0057 /* Computes ((s0[x>>24] + s1[(x>>16)&0xff]) ^ s2[(x>>8)&0xff]) + s3[x&0xff] with 32-bit wraparound, */
0058 /* swaps the two halves of RX0 and XORs the value into the new low half. Clobbers RT0, RT1 (= RIO), RT2, flags. */
0059 #define F() \
0060 rorq $16, RX0; /* bring bits 16-31 of x down into the bl/bh positions */ \
0061 movzbl RX0bh, RT0d; /* RT0 = x bits 24-31 */ \
0062 movzbl RX0bl, RT1d; /* RT1 = x bits 16-23 */ \
0063 rolq $16, RX0; /* undo the rotate */ \
0064 movl s0(CTX,RT0,4), RT0d; \
0065 addl s1(CTX,RT1,4), RT0d; \
0066 movzbl RX0bh, RT1d; /* RT1 = x bits 8-15 */ \
0067 movzbl RX0bl, RT2d; /* RT2 = x bits 0-7 */ \
0068 rolq $32, RX0; /* swap the 32-bit halves of the block */ \
0069 xorl s2(CTX,RT1,4), RT0d; \
0070 addl s3(CTX,RT2,4), RT0d; \
0071 xorq RT0, RX0; /* RT0 is zero-extended, so only the low half of RX0 changes */
0072 /* add_roundkey_enc(n): XOR the 8 bytes at p + 4*n (entries n and n+1) into RX0; entry n hits the low half. */
0073 #define add_roundkey_enc(n) \
0074 xorq p+4*(n)(CTX), RX0;
0075 /* round_enc(n): key addition with entries n, n+1 followed by two F() steps. Clobbers whatever F() clobbers. */
0076 #define round_enc(n) \
0077 add_roundkey_enc(n); \
0078 \
0079 F(); \
0080 F();
0081 /* add_roundkey_dec(n): XOR p entries n-1 and n into RX0, with entry n applied to the low half. Clobbers RT0. */
0082 #define add_roundkey_dec(n) \
0083 movq p+4*((n)-1)(CTX), RT0; /* (n) is parenthesised so expression arguments expand correctly */ \
0084 rorq $32, RT0; /* swap the halves: entry n moves into the low 32 bits */ \
0085 xorq RT0, RX0;
0086 /* round_dec(n): key addition with entries n, n-1 followed by two F() steps. Clobbers as F() does, plus RT0. */
0087 #define round_dec(n) \
0088 add_roundkey_dec(n); \
0089 \
0090 F(); \
0091 F();
0092
0093 #define read_block() /* RX0 = the 8 bytes at (RIO) with each 4-byte word byte-swapped in place */ \
0094 movq (RIO), RX0; \
0095 rorq $32, RX0; /* swap the two words first ... */ \
0096 bswapq RX0; /* ... then reverse all 8 bytes: the words end up where they were, byte-reversed */
0097
0098 #define write_block() /* store RX0 to (RIO) most-significant byte first; RX0 is left byte-reversed */ \
0099 bswapq RX0; \
0100 movq RX0, (RIO);
0101
0102 #define xor_block() /* XOR RX0, most-significant byte first, into the 8 bytes already at (RIO) */ \
0103 bswapq RX0; \
0104 xorq RX0, (RIO);
0105
0106 SYM_FUNC_START(__blowfish_enc_blk)
0107 /* Encrypt one 8-byte block. */
0108 /* In: %rdi = context base, %rsi = dst, %rdx = src. */
0109 /* %cl = flag: non-zero XORs the result into dst, zero overwrites dst. */
0110 /* %r12 (CTX) is preserved by parking it in %r11; the stack is not touched. */
0111 /* Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags. */
0112 /* %rcx is never written here, which is why the flag can still be tested after the rounds. */
0113 movq %r12, %r11; /* save the caller's %r12 before it becomes CTX */
0114
0115 movq %rdi, CTX;
0116 movq %rsi, %r10; /* keep dst: F() reuses %rsi as RT1 */
0117 movq %rdx, RIO; /* RIO = src for read_block() */
0118
0119 read_block();
0120
0121 round_enc(0);
0122 round_enc(2);
0123 round_enc(4);
0124 round_enc(6);
0125 round_enc(8);
0126 round_enc(10);
0127 round_enc(12);
0128 round_enc(14);
0129 add_roundkey_enc(16); /* final key addition: entries 16 and 17 */
0130
0131 movq %r11, %r12; /* restore the caller's %r12; CTX is not needed any more */
0132
0133 movq %r10, RIO; /* RIO = dst */
0134 test %cl, %cl;
0135 jnz .L__enc_xor;
0136
0137 write_block();
0138 RET;
0139 .L__enc_xor:
0140 xor_block();
0141 RET;
0142 SYM_FUNC_END(__blowfish_enc_blk)
0143
0144 SYM_FUNC_START(blowfish_dec_blk)
0145 /* Decrypt one 8-byte block; dst is always overwritten. */
0146 /* In: %rdi = context base, %rsi = dst, %rdx = src. */
0147 /* %r12 (CTX) is preserved by parking it in %r11; the stack is not touched. */
0148 /* Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags. */
0149 /* Key pairs are applied in descending order: 17/16 first, 1/0 last. */
0150 movq %r12, %r11; /* save the caller's %r12 before it becomes CTX */
0151
0152 movq %rdi, CTX;
0153 movq %rsi, %r10; /* keep dst: F() reuses %rsi as RT1 */
0154 movq %rdx, RIO; /* RIO = src for read_block() */
0155
0156 read_block();
0157
0158 round_dec(17);
0159 round_dec(15);
0160 round_dec(13);
0161 round_dec(11);
0162 round_dec(9);
0163 round_dec(7);
0164 round_dec(5);
0165 round_dec(3);
0166 add_roundkey_dec(1); /* final key addition: entries 1 and 0 */
0167
0168 movq %r10, RIO; /* RIO = dst */
0169 write_block();
0170
0171 movq %r11, %r12; /* restore the caller's %r12 */
0172
0173 RET;
0174 SYM_FUNC_END(blowfish_dec_blk)
0175
0176
0177
0178
0179 /* F4(x): the same round step as F(), applied to block register alias x (RX0..RX3). */
0180 /* x must be the alias name itself because it is token-pasted with bh/bl. */
0181 /* All four index bytes are extracted first; the two 16-bit rotates together swap the halves of x. */
0182 /* Clobbers RT0, RT1 (= RIO), RT2, RT3 and the flags. */
0183 #define F4(x) \
0184 movzbl x ## bh, RT1d; /* RT1 = bits 8-15 of the low half */ \
0185 movzbl x ## bl, RT3d; /* RT3 = bits 0-7 */ \
0186 rorq $16, x; \
0187 movzbl x ## bh, RT0d; /* RT0 = original bits 24-31 */ \
0188 movzbl x ## bl, RT2d; /* RT2 = original bits 16-23 */ \
0189 rorq $16, x; /* 32 bits rotated in total: halves are now swapped */ \
0190 movl s0(CTX,RT0,4), RT0d; \
0191 addl s1(CTX,RT2,4), RT0d; \
0192 xorl s2(CTX,RT1,4), RT0d; \
0193 addl s3(CTX,RT3,4), RT0d; \
0194 xorq RT0, x; /* RT0 is zero-extended, so only the new low half changes */
0195 /* 4-way key handling: each key pair is loaded into RKEY one step before it is applied. */
0196 #define add_preloaded_roundkey4() /* XOR the current RKEY into all four blocks */ \
0197 xorq RKEY, RX0; \
0198 xorq RKEY, RX1; \
0199 xorq RKEY, RX2; \
0200 xorq RKEY, RX3;
0201
0202 #define preload_roundkey_enc(n) /* RKEY = the 8 bytes at p + 4*n (entries n and n+1) */ \
0203 movq p+4*(n)(CTX), RKEY;
0204
0205 #define add_roundkey_enc4(n) /* apply RKEY (pair n must already be loaded), then fetch pair n+2 */ \
0206 add_preloaded_roundkey4(); \
0207 preload_roundkey_enc(n + 2);
0208
0209 #define round_enc4(n) /* key addition, then two F4 steps per block, interleaved across the four blocks */ \
0210 add_roundkey_enc4(n); \
0211 \
0212 F4(RX0); \
0213 F4(RX1); \
0214 F4(RX2); \
0215 F4(RX3); \
0216 \
0217 F4(RX0); \
0218 F4(RX1); \
0219 F4(RX2); \
0220 F4(RX3);
0221 /* Decryption counterparts of the 4-way key macros; key pairs are walked downwards. */
0222 #define preload_roundkey_dec(n) /* RKEY = entries n-1 and n, swapped so entry n is in the low half */ \
0223 movq p+4*((n)-1)(CTX), RKEY; \
0224 rorq $32, RKEY;
0225
0226 #define add_roundkey_dec4(n) /* apply RKEY (pair n must already be loaded), then fetch pair n-2 */ \
0227 add_preloaded_roundkey4(); \
0228 preload_roundkey_dec(n - 2);
0229
0230 #define round_dec4(n) /* key addition, then two F4 steps per block, interleaved across the four blocks */ \
0231 add_roundkey_dec4(n); \
0232 \
0233 F4(RX0); \
0234 F4(RX1); \
0235 F4(RX2); \
0236 F4(RX3); \
0237 \
0238 F4(RX0); \
0239 F4(RX1); \
0240 F4(RX2); \
0241 F4(RX3);
0242 /* 4-way block I/O: the same transforms as the 1-way macros, on four blocks at RIO + 0, 8, 16, 24. */
0243 #define read_block4() /* RX0..RX3 = four 8-byte blocks, each 4-byte word byte-swapped in place */ \
0244 movq (RIO), RX0; \
0245 rorq $32, RX0; \
0246 bswapq RX0; \
0247 \
0248 movq 8(RIO), RX1; \
0249 rorq $32, RX1; \
0250 bswapq RX1; \
0251 \
0252 movq 16(RIO), RX2; \
0253 rorq $32, RX2; \
0254 bswapq RX2; \
0255 \
0256 movq 24(RIO), RX3; \
0257 rorq $32, RX3; \
0258 bswapq RX3;
0259
0260 #define write_block4() /* store RX0..RX3 to RIO + 0..24, each most-significant byte first */ \
0261 bswapq RX0; \
0262 movq RX0, (RIO); \
0263 \
0264 bswapq RX1; \
0265 movq RX1, 8(RIO); \
0266 \
0267 bswapq RX2; \
0268 movq RX2, 16(RIO); \
0269 \
0270 bswapq RX3; \
0271 movq RX3, 24(RIO);
0272
0273 #define xor_block4() /* XOR RX0..RX3, most-significant byte first, into the 32 bytes already at RIO */ \
0274 bswapq RX0; \
0275 xorq RX0, (RIO); \
0276 \
0277 bswapq RX1; \
0278 xorq RX1, 8(RIO); \
0279 \
0280 bswapq RX2; \
0281 xorq RX2, 16(RIO); \
0282 \
0283 bswapq RX3; \
0284 xorq RX3, 24(RIO);
0285
0286 SYM_FUNC_START(__blowfish_enc_blk_4way)
0287 /* Encrypt four consecutive 8-byte blocks (32 bytes at src, 32 bytes at dst). */
0288 /* In: %rdi = context base, %rsi = dst, %rdx = src. */
0289 /* %cl = flag: non-zero XORs the result into dst, zero overwrites dst. */
0290 /* %r12 and %rbx are saved on the stack and restored on both exit paths. */
0291 /* Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the flags. */
0292
0293 pushq %r12;
0294 pushq %rbx;
0295 pushq %rcx; /* the flag must survive: %rcx is about to become block register RX2 */
0296
0297 movq %rdi, CTX
0298 movq %rsi, %r11; /* keep dst: %rsi doubles as RT1 and %r10 is taken by RKEY */
0299 movq %rdx, RIO; /* RIO = src for read_block4() */
0300
0301 preload_roundkey_enc(0); /* round_enc4(0) expects pair 0/1 already in RKEY */
0302
0303 read_block4();
0304
0305 round_enc4(0);
0306 round_enc4(2);
0307 round_enc4(4);
0308 round_enc4(6);
0309 round_enc4(8);
0310 round_enc4(10);
0311 round_enc4(12);
0312 round_enc4(14);
0313 add_preloaded_roundkey4(); /* final key addition: pair 16/17, loaded by round_enc4(14) */
0314
0315 popq %r12; /* pops the saved %rcx flag; CTX is not needed any more */
0316 movq %r11, RIO; /* RIO = dst */
0317
0318 test %r12b, %r12b;
0319 jnz .L__enc_xor4;
0320
0321 write_block4();
0322
0323 popq %rbx;
0324 popq %r12; /* restore the caller's %r12 */
0325 RET;
0326
0327 .L__enc_xor4:
0328 xor_block4();
0329
0330 popq %rbx;
0331 popq %r12; /* restore the caller's %r12 */
0332 RET;
0333 SYM_FUNC_END(__blowfish_enc_blk_4way)
0334
0335 SYM_FUNC_START(blowfish_dec_blk_4way)
0336 /* Decrypt four consecutive 8-byte blocks (32 bytes at src, 32 bytes at dst); dst is always overwritten. */
0337 /* In: %rdi = context base, %rsi = dst, %rdx = src. */
0338 /* %r12 and %rbx are saved on the stack and restored before returning. */
0339 /* Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the flags. */
0340
0341 pushq %r12;
0342 pushq %rbx;
0343
0344 movq %rdi, CTX;
0345 movq %rsi, %r11
0346 movq %rdx, RIO; /* RIO = src for read_block4(); dst is kept in %r11 */
0347
0348 preload_roundkey_dec(17); /* round_dec4(17) expects pair 17/16 already in RKEY */
0349 read_block4();
0350
0351 round_dec4(17);
0352 round_dec4(15);
0353 round_dec4(13);
0354 round_dec4(11);
0355 round_dec4(9);
0356 round_dec4(7);
0357 round_dec4(5);
0358 round_dec4(3);
0359 add_preloaded_roundkey4(); /* final key addition: pair 1/0, loaded by round_dec4(3) */
0360
0361 movq %r11, RIO; /* RIO = dst */
0362 write_block4();
0363
0364 popq %rbx;
0365 popq %r12;
0366
0367 RET;
0368 SYM_FUNC_END(blowfish_dec_blk_4way)