0001
0002
0003
0004
0005
0006
0007
0008 #include <linux/linkage.h>
0009
0010 .file "twofish-x86_64-asm-3way.S"
0011 .text
0012
0013
/*
 * Byte offsets into the context structure addressed through CTX.
 *   s0..s3  bases of four lookup tables; do16bit_ror indexes them with a
 *           single byte scaled by 4, which matches the 1024-byte spacing
 *   w       read as 64-bit words at w+4*m by inpack3/outunpack3
 *   k       read as 32-bit words at k+4*i by enc_round_end/dec_round_end
 */
0014 #define s0 0
0015 #define s1 1024
0016 #define s2 2048
0017 #define s3 3072
0018 #define w 4096
0019 #define k 4128
0020
0021
0022
0023
/*
 * Register allocation for the 3-way code. A trailing 0..2 selects one of
 * the three blocks that are processed side by side; the macros below build
 * these names by token pasting (ab ## 0, x ## d, ab ## bl, ...).
 */

/* Base register of every s0..s3, w and k access. */
0024 #define CTX %rdi
/* Block data pointer used by inpack3/outunpack3. Same register as RT0. */
0025 #define RIO %rdx
0026
/* 64-bit working registers, handed to the round macros as `ab`. */
0027 #define RAB0 %rax
0028 #define RAB1 %rbx
0029 #define RAB2 %rcx
0030
/* 32-bit views of RAB0..RAB2. */
0031 #define RAB0d %eax
0032 #define RAB1d %ebx
0033 #define RAB2d %ecx
0034
/* Bits 8-15 of RAB0..RAB2: second table index in do16bit_ror. */
0035 #define RAB0bh %ah
0036 #define RAB1bh %bh
0037 #define RAB2bh %ch
0038
/* Bits 0-7 of RAB0..RAB2: first table index in do16bit_ror. */
0039 #define RAB0bl %al
0040 #define RAB1bl %bl
0041 #define RAB2bl %cl
0042
/*
 * Stack slots handed to the round macros as `cd`. They address the three
 * quadwords that push_cd() leaves on top of the stack, so they are only
 * meaningful between push_cd() and pop_cd().
 */
0043 #define CD0 0x0(%rsp)
0044 #define CD1 0x8(%rsp)
0045 #define CD2 0x10(%rsp)
0046
0047 # used only before/after all rounds
0048 #define RCD0 %r8
0049 #define RCD1 %r9
0050 #define RCD2 %r10
0051
0052 # used only during rounds
0053 #define RX0 %r8
0054 #define RX1 %r9
0055 #define RX2 %r10
0056
/* 32-bit views of RX0..RX2. */
0057 #define RX0d %r8d
0058 #define RX1d %r9d
0059 #define RX2d %r10d
0060
/*
 * Second set of per-block result registers used during the rounds.
 * %r12 and %r13 are pushed by both entry points before they are used.
 */
0061 #define RY0 %r11
0062 #define RY1 %r12
0063 #define RY2 %r13
0064
/* 32-bit views of RY0..RY2. */
0065 #define RY0d %r11d
0066 #define RY1d %r12d
0067 #define RY2d %r13d
0068
/*
 * Scratch registers for the round macros. RT0 overlaps RIO and RT1 is
 * %rsi, which is why both entry points park %rsi on the stack before the
 * rounds and reload it into RIO afterwards.
 */
0069 #define RT0 %rdx
0070 #define RT1 %rsi
0071
/* 32-bit views of RT0/RT1. */
0072 #define RT0d %edx
0073 #define RT1d %esi
0074
/* Low byte of RT1; tested in __twofish_enc_blk_3way to pick the output op. */
0075 #define RT1bl %sil
0076
/*
 * do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst)
 *
 * Use the two lowest bytes of `ab` as table indices, then rotate `ab`:
 *   tmp2 = bits 0-7 of ab, tmp1 = bits 8-15 of ab (both zero-extended)
 *   ab   = ab rotated right by `rot` bits
 *   dst  = dst <op1> 32-bit word at CTX + T0 + 4*tmp2
 *   dst  = dst <op2> 32-bit word at CTX + T1 + 4*tmp1
 *
 * op1/op2 are mnemonics without size suffix (`mov` starts a fresh value,
 * `xor` accumulates). tmp1, tmp2, ab and dst are passed as 64-bit alias
 * names; the d/bl/bh sub-register names are pasted on here, so `ab` needs
 * bl/bh aliases (only RAB0..RAB2 have them in this file). tmp2 may be the
 * same register as dst when op1 is mov: the index is read by the very load
 * that overwrites it.
 */
0077 #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
0078 movzbl ab ## bl, tmp2 ## d; \
0079 movzbl ab ## bh, tmp1 ## d; \
0080 rorq $(rot), ab; \
0081 op1##l T0(CTX, tmp2, 4), dst ## d; \
0082 op2##l T1(CTX, tmp1, 4), dst ## d;
0083
/*
 * swap_ab_with_cd(ab, cd, tmp) - exchange the 64-bit values of `ab` and
 * `cd`, clobbering `tmp`. In g1g2_3 `ab` is a RABn register, `cd` is the
 * matching CDn stack slot and `tmp` is RT0.
 */
0084 #define swap_ab_with_cd(ab, cd, tmp) \
0085 movq cd, tmp; \
0086 movq ab, cd; \
0087 movq tmp, ab;
0088
0089
0090
0091
0092
/*
 * g1g2_3(ab, cd, Tx0..Tx3, Ty0..Ty3, x, y)
 *
 * For each of the three blocks n = 0..2, with a0..a3 the bytes of the low
 * dword of ab##n and b0..b3 the bytes of its high dword (lowest first):
 *   x##n = Tx0[a0] ^ Tx1[a1] ^ Tx2[a2] ^ Tx3[a3]
 *   y##n = Ty1[b0] ^ Ty2[b1] ^ Ty3[b2] ^ Ty0[b3]
 * and afterwards ab##n and cd##n are exchanged.
 *
 * The work is split into two passes so that the `mov` lookups, which need
 * no scratch register besides RT0, come first. The rotate amounts
 * (32 + 48 + 32 + 16 = 128) bring every ab##n back to its original value
 * before the swap. Clobbers RT0 and RT1.
 */
0093 #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
0094 /* pass 1: bytes a0,a1 start x; bytes b0,b1 start y */ \
0095 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
0096 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
0097 \
0098 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
0099 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
0100 \
0101 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
0102 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
0103 \
0104 /* pass 2: bytes a2,a3 finish x; bytes b2,b3 finish y; then swap */ \
0105 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
0106 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
0107 swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
0108 \
0109 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
0110 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
0111 swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
0112 \
0113 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
0114 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
0115 swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
0116
/*
 * enc_round_end(ab, x, y, n) - fold the two lookup results into `ab`.
 *
 * `ab` is a 64-bit register (lo = bits 0-31, hi = bits 32-63); x and y are
 * the 64-bit names of the result registers. With K[i] the 32-bit word at
 * CTX + k + 4*i:
 *   x' = x + y   + K[2n]
 *   y' = x + 2*y + K[2n+1]
 *   ab = ((rol32(hi, 1) ^ y') << 32) | ror32(lo ^ x', 1)
 * x and y are clobbered. The final orq relies on the upper half of x being
 * zero, which the preceding 32-bit writes to x guarantee.
 */
0117 #define enc_round_end(ab, x, y, n) \
0118 addl y ## d, x ## d; /* x = x + y */ \
0119 addl x ## d, y ## d; /* y = x + 2*y */ \
0120 addl k+4*(2*(n))(CTX), x ## d; \
0121 xorl ab ## d, x ## d; /* x ^= lo */ \
0122 addl k+4*(2*(n)+1)(CTX), y ## d; \
0123 shrq $32, ab; /* bring hi down into the low dword */ \
0124 roll $1, ab ## d; \
0125 xorl y ## d, ab ## d; \
0126 shlq $32, ab; /* move the new hi back up, low dword = 0 */ \
0127 rorl $1, x ## d; \
0128 orq x, ab;
0129
/*
 * dec_round_end(ba, x, y, n) - decryption counterpart of enc_round_end.
 *
 * `ba` is a 64-bit register (lo = bits 0-31, hi = bits 32-63). With K[i]
 * the 32-bit word at CTX + k + 4*i:
 *   x' = x + y   + K[2n]
 *   y' = x + 2*y + K[2n+1]
 *   ba = ((rol32(hi, 1) ^ x') << 32) | ror32(lo ^ y', 1)
 * Compared with enc_round_end the roles of x' and y' are exchanged. x and
 * y are clobbered; the final orq relies on the upper half of y being zero
 * after the 32-bit writes.
 */
0130 #define dec_round_end(ba, x, y, n) \
0131 addl y ## d, x ## d; /* x = x + y */ \
0132 addl x ## d, y ## d; /* y = x + 2*y */ \
0133 addl k+4*(2*(n))(CTX), x ## d; \
0134 addl k+4*(2*(n)+1)(CTX), y ## d; \
0135 xorl ba ## d, y ## d; /* y ^= lo */ \
0136 shrq $32, ba; /* bring hi down into the low dword */ \
0137 roll $1, ba ## d; \
0138 xorl x ## d, ba ## d; \
0139 shlq $32, ba; /* move the new hi back up, low dword = 0 */ \
0140 rorl $1, y ## d; \
0141 orq y, ba;
0142
/*
 * encrypt_round3(ab, cd, n) - round n of encryption for all three blocks.
 *
 * g1g2_3 computes the lookups from ab##0..2 into RX0..2 / RY0..2 (both
 * chains use the tables in the order s0, s1, s2, s3) and then swaps each
 * ab##i with cd##i. enc_round_end therefore updates the value that was in
 * cd##i before the round, now held in the ab##i register.
 */
0143 #define encrypt_round3(ab, cd, n) \
0144 g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
0145 \
0146 enc_round_end(ab ## 0, RX0, RY0, n); \
0147 enc_round_end(ab ## 1, RX1, RY1, n); \
0148 enc_round_end(ab ## 2, RX2, RY2, n);
0149
/*
 * decrypt_round3(ba, dc, n) - round n of decryption for all three blocks.
 *
 * Same structure as encrypt_round3, but g1g2_3 is called with the x/y
 * result registers exchanged (low-dword lookups land in RY, high-dword
 * lookups in RX) and with both table lists rotated (s1,s2,s3,s0 and
 * s3,s0,s1,s2). As in encryption, g1g2_3 swaps ba##i with dc##i, so
 * dec_round_end updates the value that was in dc##i before the round.
 */
0150 #define decrypt_round3(ba, dc, n) \
0151 g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
0152 \
0153 dec_round_end(ba ## 0, RX0, RY0, n); \
0154 dec_round_end(ba ## 1, RX1, RY1, n); \
0155 dec_round_end(ba ## 2, RX2, RY2, n);
0156
/*
 * encrypt_cycle3(ab, cd, n) - cycle n = rounds 2n and 2n+1, in that order,
 * for all three blocks. Each round swaps ab and cd, so after one cycle the
 * two halves are back in their original places.
 *
 * `n` is parenthesized so that an expression argument expands correctly.
 */
0157 #define encrypt_cycle3(ab, cd, n) \
0158 encrypt_round3(ab, cd, (n)*2); \
0159 encrypt_round3(ab, cd, ((n)*2)+1);
0160
/*
 * decrypt_cycle3(ba, dc, n) - cycle n of decryption: rounds 2n+1 and 2n,
 * in that (descending) order, for all three blocks. Each round swaps ba
 * and dc, so after one cycle the two halves are back in their original
 * places.
 *
 * `n` is parenthesized so that an expression argument expands correctly.
 */
0161 #define decrypt_cycle3(ba, dc, n) \
0162 decrypt_round3(ba, dc, ((n)*2)+1); \
0163 decrypt_round3(ba, dc, ((n)*2));
0164
/*
 * push_cd() - spill RCD0..RCD2 to the stack. RCD0 is pushed last so that
 * it ends up at 0x0(%rsp), RCD1 at 0x8 and RCD2 at 0x10, i.e. exactly the
 * CD0..CD2 slots. This frees %r8-%r10 for use as RX0..RX2.
 */
0165 #define push_cd() \
0166 pushq RCD2; \
0167 pushq RCD1; \
0168 pushq RCD0;
0169
/*
 * pop_cd() - inverse of push_cd(): reload RCD0..RCD2 from the CD0..CD2
 * stack slots and release them.
 */
0170 #define pop_cd() \
0171 popq RCD0; \
0172 popq RCD1; \
0173 popq RCD2;
0174
/*
 * inpack3(in, n, xy, m) - load one 64-bit half of three consecutive
 * 16-byte blocks and XOR each with the same 64-bit value from the context.
 *
 *   in  pointer register to the first block
 *   n   32-bit word index of the half inside a block (byte offset 4*n);
 *       blocks are 4 words (16 bytes) apart
 *   xy  register name prefix; xy##0..xy##2 receive the result
 *   m   32-bit word index into the context area at offset w
 *
 * `m` is parenthesized like `n` so that an expression argument expands
 * correctly.
 */
0175 #define inpack3(in, n, xy, m) \
0176 movq 4*(n)(in), xy ## 0; \
0177 xorq w+4*(m)(CTX), xy ## 0; \
0178 \
0179 movq 4*(4+(n))(in), xy ## 1; \
0180 xorq w+4*(m)(CTX), xy ## 1; \
0181 \
0182 movq 4*(8+(n))(in), xy ## 2; \
0183 xorq w+4*(m)(CTX), xy ## 2;
0184
/*
 * outunpack3(op, out, n, xy, m) - XOR xy##0..xy##2 with the 64-bit context
 * value at w+4*m and write each to its block at `out`.
 *
 *   op   mnemonic without size suffix applied to the destination memory:
 *        `mov` stores the result, `xor` XORs it into what is already there
 *   out  pointer register to the first of three 16-byte blocks
 *   n    32-bit word index of the half inside a block (byte offset 4*n)
 *   xy   register name prefix; the registers are modified
 *   m    32-bit word index into the context area at offset w
 *
 * `m` is parenthesized like `n` so that an expression argument expands
 * correctly.
 */
0185 #define outunpack3(op, out, n, xy, m) \
0186 xorq w+4*(m)(CTX), xy ## 0; \
0187 op ## q xy ## 0, 4*(n)(out); \
0188 \
0189 xorq w+4*(m)(CTX), xy ## 1; \
0190 op ## q xy ## 1, 4*(4+(n))(out); \
0191 \
0192 xorq w+4*(m)(CTX), xy ## 2; \
0193 op ## q xy ## 2, 4*(8+(n))(out);
0194
/*
 * inpack_enc3() - load three blocks from RIO for encryption:
 *   RABn = bytes 0-7  of block n, XORed with context words w[0..1]
 *   RCDn = bytes 8-15 of block n, XORed with context words w[2..3]
 */
0195 #define inpack_enc3() \
0196 inpack3(RIO, 0, RAB, 0); \
0197 inpack3(RIO, 2, RCD, 2);
0198
/*
 * outunpack_enc3(op) - write three encrypted blocks through RIO using
 * `op` (mov or xor, see outunpack3). The halves leave in swapped
 * positions relative to inpack_enc3:
 *   bytes 8-15 of block n <- RABn XOR context words w[6..7]
 *   bytes 0-7  of block n <- RCDn XOR context words w[4..5]
 */
0199 #define outunpack_enc3(op) \
0200 outunpack3(op, RIO, 2, RAB, 6); \
0201 outunpack3(op, RIO, 0, RCD, 4);
0202
/*
 * inpack_dec3() - load three blocks from RIO for decryption:
 *   RABn = bytes 0-7  of block n, XORed with context words w[4..5]
 *   RCDn = bytes 8-15 of block n, XORed with context words w[6..7]
 * Each register is then rotated by 32 bits, i.e. its two dwords are
 * exchanged, before the decryption rounds run.
 */
0203 #define inpack_dec3() \
0204 inpack3(RIO, 0, RAB, 4); \
0205 rorq $32, RAB0; \
0206 rorq $32, RAB1; \
0207 rorq $32, RAB2; \
0208 inpack3(RIO, 2, RCD, 6); \
0209 rorq $32, RCD0; \
0210 rorq $32, RCD1; \
0211 rorq $32, RCD2;
0212
/*
 * outunpack_dec3() - store three decrypted blocks through RIO. Each
 * register first has its two dwords exchanged (undoing the rotate done in
 * inpack_dec3), then:
 *   bytes 0-7  of block n <- RCDn XOR context words w[0..1]
 *   bytes 8-15 of block n <- RABn XOR context words w[2..3]
 * The output is always stored with mov; there is no xor variant.
 */
0213 #define outunpack_dec3() \
0214 rorq $32, RCD0; \
0215 rorq $32, RCD1; \
0216 rorq $32, RCD2; \
0217 outunpack3(mov, RIO, 0, RCD, 0); \
0218 rorq $32, RAB0; \
0219 rorq $32, RAB1; \
0220 rorq $32, RAB2; \
0221 outunpack3(mov, RIO, 2, RAB, 2);
0222
/*
 * __twofish_enc_blk_3way - encrypt three consecutive 16-byte blocks.
 *
 * input:
 *   %rdi: context pointer (CTX)
 *   %rsi: output pointer (48 bytes are written)
 *   %rdx: input pointer (RIO, 48 bytes are read)
 *   %rcx: flag, only the low byte is tested; non-zero: XOR the result
 *         into the output buffer instead of storing it
 *
 * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi and
 * %r8-%r11 are overwritten. Eight quadwords of stack are used in total.
 */
0223 SYM_FUNC_START(__twofish_enc_blk_3way)
0224
0225
0226
0227
0228
0229
0230 pushq %r13;
0231 pushq %r12;
0232 pushq %rbx;
0233
0234 pushq %rcx; /* xor flag; %rcx becomes RAB2 */
0235 pushq %rsi; /* output pointer; %rsi becomes RT1 */
0236
0237 inpack_enc3();
0238
/* RCD moves to the CD0..CD2 stack slots; %r8-%r10 are RX0..RX2 from here. */
0239 push_cd();
/* 8 cycles = rounds 0..15 */
0240 encrypt_cycle3(RAB, CD, 0);
0241 encrypt_cycle3(RAB, CD, 1);
0242 encrypt_cycle3(RAB, CD, 2);
0243 encrypt_cycle3(RAB, CD, 3);
0244 encrypt_cycle3(RAB, CD, 4);
0245 encrypt_cycle3(RAB, CD, 5);
0246 encrypt_cycle3(RAB, CD, 6);
0247 encrypt_cycle3(RAB, CD, 7);
0248 pop_cd();
0249
0250 popq RIO; /* output pointer saved from %rsi */
0251 popq RT1; /* xor flag saved from %rcx */
0252
0253 testb RT1bl, RT1bl;
0254 jnz .L__enc_xor3;
0255
/* flag == 0: plain store */
0256 outunpack_enc3(mov);
0257
0258 popq %rbx;
0259 popq %r12;
0260 popq %r13;
0261 RET;
0262
/* flag != 0: XOR the result into the existing output contents */
0263 .L__enc_xor3:
0264 outunpack_enc3(xor);
0265
0266 popq %rbx;
0267 popq %r12;
0268 popq %r13;
0269 RET;
0270 SYM_FUNC_END(__twofish_enc_blk_3way)
0271
/*
 * twofish_dec_blk_3way - decrypt three consecutive 16-byte blocks.
 *
 * input:
 *   %rdi: context pointer (CTX)
 *   %rsi: output pointer (48 bytes are written)
 *   %rdx: input pointer (RIO, 48 bytes are read)
 *
 * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi and
 * %r8-%r11 are overwritten. Seven quadwords of stack are used in total.
 */
0272 SYM_FUNC_START(twofish_dec_blk_3way)
0273
0274
0275
0276
0277
0278 pushq %r13;
0279 pushq %r12;
0280 pushq %rbx;
0281
0282 pushq %rsi; /* output pointer; %rsi becomes RT1 */
0283
0284 inpack_dec3();
0285
/* RCD moves to the CD0..CD2 stack slots; %r8-%r10 are RX0..RX2 from here. */
0286 push_cd();
/* 8 cycles in descending order = rounds 15..0 */
0287 decrypt_cycle3(RAB, CD, 7);
0288 decrypt_cycle3(RAB, CD, 6);
0289 decrypt_cycle3(RAB, CD, 5);
0290 decrypt_cycle3(RAB, CD, 4);
0291 decrypt_cycle3(RAB, CD, 3);
0292 decrypt_cycle3(RAB, CD, 2);
0293 decrypt_cycle3(RAB, CD, 1);
0294 decrypt_cycle3(RAB, CD, 0);
0295 pop_cd();
0296
0297 popq RIO; /* output pointer saved from %rsi */
0298
0299 outunpack_dec3();
0300
0301 popq %rbx;
0302 popq %r12;
0303 popq %r13;
0304 RET;
0305 SYM_FUNC_END(twofish_dec_blk_3way)