/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128
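
/*
 * These offsets are assumed to mirror the C-side key structure that the glue
 * code passes in CTX; a sketch of the presumed layout (field names follow the
 * usual kernel definition and are an assumption, not taken from this file):
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	   s0..s3: one 1024-byte key-dependent table each
 *		u32 w[8];	   whitening subkeys, offset 4096
 *		u32 k[32];	   round subkeys, offset 4128
 *	};
 */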

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX0 %xmm8
#define RY0 %xmm9

#define RX1 %xmm10
#define RY1 %xmm11

#define RK1 %xmm12
#define RK2 %xmm13

#define RT %xmm14
#define RR %xmm15

#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl src ## bl, RID1d; \
	movzbl src ## bh, RID2d; \
	shrq $16, src; \
	movl t0(CTX, RID1, 4), dst ## d; \
	movl t1(CTX, RID2, 4), RID2d; \
	movzbl src ## bl, RID1d; \
	xorl RID2d, dst ## d; \
	movzbl src ## bh, RID2d; \
	interleave_op(il_reg); \
	xorl t2(CTX, RID1, 4), dst ## d; \
	xorl t3(CTX, RID2, 4), dst ## d;
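
/*
 * lookup_32bit() consumes the low 32 bits of 'src' a byte at a time and
 * accumulates, roughly (C-style sketch, not code from this file):
 *
 *	dst = t0[byte0(src)] ^ t1[byte1(src)] ^ t2[byte2(src)] ^ t3[byte3(src)];
 *
 * interleave_op is either shr_next, which shifts 'src' right by a further 16
 * bits so that the next lookup sees the upper 32-bit word, or dummy (a no-op).
 */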

#define dummy(d)

#define shr_next(reg) \
	shrq $16, reg;

#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
	shlq $32, RGS2; \
	orq RGS1, RGS2; \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
	shlq $32, RGS1; \
	orq RGS1, RGS3;
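
/*
 * G() runs the table-lookup pipeline over four 32-bit words: gi1 and gi2 each
 * carry two words of one 128-bit vector (extracted with vmovq/vpextrq).  The
 * four 32-bit results are repacked as two 64-bit halves, RGS2 holding the
 * words that came from gi1 and RGS3 those from gi2, ready to be inserted back
 * into an xmm register by the caller.
 */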

#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq b ## 1, RGI3; \
	vpextrq $1, b ## 1, RGI4; \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3); \
	vmovq a ## 2, RGI1; \
	vpextrq $1, a ## 2, RGI2; \
	vmovq RGS2, x1; \
	vpinsrq $1, RGS3, x1, x1; \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0); \
	vmovq b ## 2, RGI3; \
	vpextrq $1, b ## 2, RGI4; \
	vmovq RGS2, y1; \
	vpinsrq $1, RGS3, y1, y1; \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3); \
	vmovq RGS2, x2; \
	vpinsrq $1, RGS3, x2, x2; \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0); \
	vmovq RGS2, y2; \
	vpinsrq $1, RGS3, y2, y2;
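
/*
 * round_head_2() computes x = g(a) and y = g(ROL8(b)) for both 4-block groups;
 * the rotated table order (s1, s2, s3, s0) on the y lookups stands in for the
 * byte rotation of the second g-function input.  The scalar lookups for one
 * vector are interleaved with extracting the next vector's words into GPRs,
 * so the integer and SIMD pipelines overlap.
 */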

#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT;\
	prerotate(b); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpsrld $1, c, RT; \
	vpslld $(32 - 1), c, c; \
	vpor c, RT, c; \
	vpxor d, y, d;

#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT;\
	prerotate(a); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpxor d, y, d; \
	vpsrld $1, d, y; \
	vpslld $(32 - 1), d, d; \
	vpor d, y, d;

#define rotate_1l(x) \
	vpslld $1, x, RR; \
	vpsrld $(32 - 1), x, x; \
	vpor x, RR, x;

#define preload_rgi(c) \
	vmovq c, RGI1; \
	vpextrq $1, c, RGI2;
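
/*
 * rotate_1l() and preload_rgi() exist to hide latency: the ROL-by-1 that a
 * block word needs before the next round that consumes it is issued one round
 * early (prerotate), and the extraction of the next round's 'a' words into
 * GPRs is started while the current round's tail is still in flight (preload).
 * The *_last cycle variants below pass dummy for both in their final round so
 * no extra work is done after the last round.
 */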

#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
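
/*
 * Each round broadcasts its two 32-bit subkeys from the k[] array, i.e.
 * roughly RK1 = ctx->k[2*n] and RK2 = ctx->k[2*n + 1] in C terms, so every
 * vector lane sees the same round key.
 */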

#define encrypt_cycle(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

#define encrypt_cycle_last(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);

#define decrypt_cycle(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);

#define decrypt_cycle_last(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;
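
/*
 * transpose_4x4() is a standard 4x4 transpose of 32-bit elements across four
 * xmm registers: on input each register holds one whole block (a, b, c, d),
 * on output each register holds the same word position from four different
 * blocks, which is the layout the vectorized rounds operate on.
 */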

#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3;
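
/*
 * inpack_blocks()/outunpack_blocks() apply the input/output whitening (the
 * 128-bit wkey vector holds w[0..3] or w[4..7]) while converting between the
 * per-block layout in memory and the transposed per-word layout used by the
 * round macros.
 */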

.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 */

	vmovdqu w(CTX), RK1;

	pushq %r13;
	pushq %rbx;
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	preload_rgi(RA1);
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	vmovdqu (w+4*4)(CTX), RK1;

	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	preload_rgi(RC1);
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_dec_blk8)
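
/*
 * %r13 (RID1) and %rbx (RGI4) are callee-saved in the x86-64 ABI, hence the
 * push/pop pairs around the round loops in both helpers above; the encryption
 * helper additionally saves and restores %rcx (RGI2), presumably so a caller
 * can keep a live value there across the call.
 */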

SYM_FUNC_START(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_enc_8way)

SYM_FUNC_START(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_dec_8way)

SYM_FUNC_START(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_cbc_dec_8way)
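
/*
 * A sketch of the C prototypes the glue code is assumed to use for the three
 * exported entry points (names match the symbols above; the ctx type and the
 * exact signatures are an assumption, not taken from this file):
 *
 *	asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
 *	asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
 */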