/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM %xmm9
#define RKR %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32 %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1 %rdi
#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi

#define RGI1 %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2 %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3 %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4 %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1 %r8
#define RFS1d %r8d
#define RFS2 %r9
#define RFS2d %r9d
#define RFS3 %r10
#define RFS3d %r10d

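/*
 * lookup_32bit: look up the four bytes of one rotated 32-bit word in
 * the shared s-boxes.  s1 seeds the result; op1/op2/op3 fold in the
 * s2/s3/s4 entries.  The upper two bytes are reached by shifting src
 * right by 16; interleave_op (shr_next below) shifts the consumed
 * word out so the register already holds the next 32-bit word.
 */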
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

#define dummy(d)

#define shr_next(reg) \
	shrq $16, reg;

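/*
 * F_head: combine the masking key RKM with the source words using op0
 * (add/xor/sub), rotate left by the per-round count (vpslld by RKRF
 * or'd with vpsrld by RKRR = 32 - RKRF), then move the two 64-bit
 * halves into GPRs for the s-box lookups.
 */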
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

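/*
 * F_tail: run lookup_32bit over all four 32-bit words held in
 * gi1/gi2 and reassemble the four results into the xmm register x.
 */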
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

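/*
 * F_2: evaluate one CAST-256 round function on both four-block
 * groups in parallel; the result is xor'd into a1/a2.
 */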
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

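/*
 * The three CAST round function types differ only in the key-mixing
 * op and in how the four s-box values are combined (RFC 2612:
 * f1 = ((S1 ^ S2) - S3) + S4, f2 = ((S1 - S2) + S3) ^ S4,
 * f3 = ((S1 + S2) ^ S3) - S4).
 */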
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

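/* qop: apply round function type f to both block groups: out ^= Ff(in). */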
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

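/*
 * get_round_keys: broadcast the 32-bit masking key Km[nn] and derive
 * the rotation counts for this round: R1ST masks the low 5 bits of
 * RKR's first byte (the left-shift count), R32 minus that value gives
 * the right-shift count, and RKR is advanced by one byte.
 */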
#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

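/*
 * Q is the forward quad-round of CAST-256 (C ^= f1(D); B ^= f2(C);
 * A ^= f3(B); D ^= f1(A)); QBAR applies the same four steps in
 * reverse order.
 */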
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb mask, RKR, RKR;

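/*
 * preload_rkr: load the 16 rotation keys feeding rounds 4n..4n+3.
 * Xoring each 5-bit count with 16 adds 16 mod 32 to the rotation, so
 * the rotated word comes out halfword-swapped; that places the bytes
 * exactly where lookup_32bit's bl/bh accesses expect them.  do_mask
 * reorders the key bytes for the Q/QBAR mix of the four quad-rounds.
 */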
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask, RKR; \
	\
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

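/*
 * 4x4 32-bit matrix transpose: converts four blocks held one per
 * register into word-sliced form (one word position per register)
 * and back.
 */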
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

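/* inpack_blocks: byte-swap each 32-bit word (CAST-256 operates on
 * big-endian words) and transpose four blocks into word-sliced form. */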
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

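/* outunpack_blocks: the inverse of inpack_blocks. */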
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

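/*
 * .Lbswap_mask byte-swaps each 32-bit word; the .Lrkr_* masks reorder
 * the 16 preloaded rotation keys to match the order (and per-round
 * byte direction) in which the Q and QBAR macros consume them.
 */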
.section .rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section .rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section .rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

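	/* CAST-256 encryption: six forward quad-rounds followed by six
	 * inverse quad-rounds */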
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

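	/* decryption walks the quad-rounds of encryption in reverse,
	 * with Q and QBAR exchanged */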
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)