/*
 * CAST5 cipher, 16-way parallel implementation (AVX, x86_64 assembly).
 *
 * Sixteen 64-bit blocks are processed per call: the block halves live in
 * the XMM registers RL1/RR1 .. RL4/RR4 and all rounds are computed on them
 * in parallel, falling back to general-purpose registers only for the
 * s-box lookups.
 */
#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

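/*
 * Byte offsets into the key-schedule context passed in %rdi.  Assumed to
 * match the generic CAST5 context layout: 16 32-bit masking keys Km at
 * offset 0, the 16 rotation-key bytes Kr right after them, then the
 * reduced-rounds flag rr.
 */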
#define km 0
#define kr (16*4)
#define rr ((16*4)+16)

#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

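/*
 * Register allocation: RL1/RR1 .. RL4/RR4 carry the sixteen block halves,
 * RKM and RKR hold the current masking and rotation keys, and the RGI and
 * RFS general-purpose registers are scratch space for the s-box lookups.
 */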
#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM %xmm9
#define RKR %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32 %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1 %rdi
#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi

#define RGI1 %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2 %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3 %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4 %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1 %r8
#define RFS1d %r8d
#define RFS2 %r9
#define RFS2d %r9d
#define RFS3 %r10
#define RFS3d %r10d

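/*
 * lookup_32bit: run the four bytes of one 32-bit word (held in the low
 * half of "src") through the s-boxes s1..s4 and combine the results into
 * "dst" with op1/op2/op3 (the xor/add/sub pattern of the round type).
 * interleave_op lets the caller shift the next word into position while
 * the table loads are in flight.
 */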
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

#define dummy(d)

#define shr_next(reg) \
	shrq $16, reg;

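/*
 * F_head and F_tail together form the CAST5 round function F, applied to
 * four 32-bit words at once.  F_head does the SIMD part: combine the data
 * with the masking key (op0 is add, xor or sub depending on the round
 * type) and rotate by the rotation key, then spill the result into two
 * general-purpose registers.  F_tail performs the s-box lookups, which
 * cannot be vectorized, and merges the result back into an XMM register.
 */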
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);

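/*
 * round: broadcast masking key Km[n] into RKM, extract the next rotation
 * key from the low byte of RKR into RKRF (RKRR holds 32 minus that count
 * for the right-shift half of the rotate), advance RKR by one byte, and
 * apply the f-type subround to all four register pairs.
 */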
#define round(l, r, n, f) \
	vbroadcastss (km+(4*n))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR; \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);

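/*
 * Preload all sixteen rotation keys into RKR.  The keys are xored with 16,
 * which (modulo 32) pre-rotates each round result by an extra 16 bits so
 * that lookup_32bit pulls the s-box index bytes out of the word in the
 * order it expects.  The decryption variant also byte-reverses RKR, since
 * decryption consumes the rounds last to first.
 */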
#define enc_preload_rkr() \
	vbroadcastss .L16_mask, RKR; \
	\
	vpxor kr(CTX), RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss .L16_mask, RKR; \
	\
	vpxor kr(CTX), RKR, RKR; \
	vpshufb .Lbswap128_mask, RKR, RKR;

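/*
 * inpack_blocks byte-swaps the big-endian input words and transposes each
 * register pair so that x0 ends up with the four left halves and x1 with
 * the four right halves; outunpack_blocks is the inverse.
 */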
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t1; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1;

#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1;

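/*
 * Constants: .Lbswap_mask byte-swaps each 32-bit word, .Lbswap128_mask
 * reverses a whole 16-byte vector, and .Lbswap_iv_mask byte-reverses the
 * low 64-bit IV and replicates it into both lanes.  .L16_mask, .L32_mask
 * and .Lfirst_mask feed the rotation-key arithmetic in round().
 */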
.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section .rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
.section .rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
.section .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 16
SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output (register pairs come back swapped):
	 *	RR1: encrypted blocks 1 and 2
	 *	RL1: encrypted blocks 3 and 4
	 *	RR2: encrypted blocks 5 and 6
	 *	RL2: encrypted blocks 7 and 8
	 *	RR3: encrypted blocks 9 and 10
	 *	RL3: encrypted blocks 11 and 12
	 *	RR4: encrypted blocks 13 and 14
	 *	RL4: encrypted blocks 15 and 16
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	enc_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

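	/*
	 * The sixteen rounds cycle through the three CAST5 round function
	 * types (1, 2, 3).  Swapping the RL/RR arguments on every call
	 * implements the Feistel half-swap without extra register moves.
	 */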
	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

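	/* the last four rounds are skipped when the reduced-rounds flag is set */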
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

.L__skip_enc:
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;
SYM_FUNC_END(__cast5_enc_blk16)

.align 16
SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output (register pairs come back swapped):
	 *	RR1: decrypted blocks 1 and 2
	 *	RL1: decrypted blocks 3 and 4
	 *	RR2: decrypted blocks 5 and 6
	 *	RL2: decrypted blocks 7 and 8
	 *	RR3: decrypted blocks 9 and 10
	 *	RL3: decrypted blocks 11 and 12
	 *	RR4: decrypted blocks 13 and 14
	 *	RL4: decrypted blocks 15 and 16
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	dec_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

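	/*
	 * Decryption applies the rounds in reverse order; the last four
	 * rounds (16..13) are skipped when the reduced-rounds flag is set.
	 */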
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

.L__dec_tail:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);

	vmovdqa .Lbswap_mask, RKM;
	popq %rbx;
	popq %r15;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;

.L__skip_dec:
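	/* discard the rotation keys of the four skipped rounds */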
	vpsrldq $4, RKR, RKR;
	jmp .L__dec_tail;
SYM_FUNC_END(__cast5_dec_blk16)

SYM_FUNC_START(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_enc_16way)

SYM_FUNC_START(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_dec_16way)

SYM_FUNC_START(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

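	/*
	 * CBC: xor every block except the first with the preceding
	 * ciphertext block, which is still available via the src pointer.
	 * The xor of the very first block with the IV is left to the caller.
	 */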
	vmovq (%r12), RX;
	vpshufd $0x4f, RX, RX;
	vpxor RX, RR1, RR1;
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_cbc_dec_16way)

SYM_FUNC_START(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big-endian, 64-bit counter)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* RTMP = { -1, 0 } (low qword, high qword) */

	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* RKR = { -2, -2 } */
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

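	/*
	 * Load the 64-bit big-endian counter, byte-swap it and replicate it
	 * into both qword lanes.
	 */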
	vmovq (%rcx), RX;
	vpshufb R1ST, RX, RX;

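	/*
	 * Construct the sixteen counter values: add 1 to the low lane, then
	 * repeatedly add 2 to both lanes, byte-swapping each pair back to
	 * big endian as it is placed into RL1/RR1 .. RL4/RR4.
	 */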
	vpsubq RTMP, RX, RX;
	vpshufb RKM, RX, RL1;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR1;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL2;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR2;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL3;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR3;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL4;
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR4;

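	/* write the counter, advanced by 16, back for the caller */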
	vpsubq RTMP, RX, RX;
	vpshufb R1ST, RX, RX;
	vmovq RX, (%rcx);

	call __cast5_enc_blk16;

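	/* dst = keystream ^ src */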
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;
	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ctr_16way)