/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 *
 * Blocks are processed four or eight at a time; the SM4 S-box is computed
 * through AES-NI (AESENCLAST) with pre/post affine transforms.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define rRIP (%rip)

#define RX0 %xmm0
#define RX1 %xmm1
#define MASK_4BIT %xmm2
#define RTMP0 %xmm3
#define RTMP1 %xmm4
#define RTMP2 %xmm5
#define RTMP3 %xmm6
#define RTMP4 %xmm7

#define RA0 %xmm8
#define RA1 %xmm9
#define RA2 %xmm10
#define RA3 %xmm11

#define RB0 %xmm12
#define RB1 %xmm13
#define RB2 %xmm14
#define RB3 %xmm15

#define RNOT %xmm0
#define RBSWAP %xmm1

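/*
 * Transpose a 4x4 matrix of 32-bit words spread across four 128-bit
 * registers (x0..x3 hold one row each); t1 and t2 are clobbered.
 */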
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

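/*
 * pre-SubByte transform: a per-byte affine transform done as two 4-bit
 * vpshufb look-ups (low and high nibble), mapping bytes from the SM4
 * field into the AES field so that AESENCLAST can supply the S-box
 * inversion step.
 */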
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

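/*
 * post-SubByte transform: the matching affine transform in the other
 * direction, mapping bytes from the AES field back into the SM4 field
 * after AESENCLAST.
 */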
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0; \
	vpsrld $4, x, x; \
	vpand x, mask4bit, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

.section .rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The look-up tables below drive transform_pre/transform_post above:
 * affine transforms that expose the SM4 S-box through AES SubBytes
 * (AESENCLAST), plus the shuffle masks needed around it.
 */

/* pre-SubByte affine transform, from SM4 field to AES field */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field back to SM4 field */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* inverse ShiftRows, to isolate SubBytes from AESENCLAST */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* inverse ShiftRows + rotate left by 8 bits on 32-bit words */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* inverse ShiftRows + rotate left by 16 bits on 32-bit words */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* inverse ShiftRows + rotate left by 24 bits on 32-bit words */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* 128-bit byte-swap mask, for CTR-mode IV handling */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* per-32-bit-word byte-swap mask */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef

.text
.align 16

.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN

	/* load 1..4 blocks; unused slots are filled with copies of block 0 */
	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

	/* byte-swap each 32-bit input word */
	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	/* preload the look-up tables; RB0..RB3 are free here */
	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

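/*
 * One SM4 round, applied to four blocks in parallel (after the transpose,
 * each register holds the same 32-bit state word of all four blocks):
 *   x   = s1 ^ s2 ^ s3 ^ rk[round]	(round key broadcast from %rdi)
 *   x   = S-box(x)			(AESENCLAST + affine transforms)
 *   s0 ^= x ^ rol32(x,2) ^ rol32(x,10) ^ rol32(x,18) ^ rol32(x,24)
 */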
#define ROUND(round, s0, s1, s2, s3) \
	vbroadcastss (4*(round))(%rdi), RX0; \
	vpxor s1, RX0, RX0; \
	vpxor s2, RX0, RX0; \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
	\
	/* sbox, non-linear part */ \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
	vaesenclast MASK_4BIT, RX0, RX0; \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
	\
	/* linear part */ \
	vpshufb RB3, RX0, RTMP0; \
	vpxor RTMP0, s0, s0; /* s0 ^ x */ \
	vpshufb RTMP2, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
	vpshufb RTMP3, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1; \
	vpsrld $30, RTMP0, RTMP0; \
	vpxor RTMP0, s0, s0; \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */

	leaq (32*4)(%rdi), %rax;
	.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						input blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						output blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

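/*
 * Same round as in sm4_aesni_avx_crypt4, but applied to two groups of four
 * blocks (s0..s3 and r0..r3) at once. The look-up tables are reloaded into
 * temporaries every round here, since RB0..RB3 now carry block state.
 */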
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
	vbroadcastss (4*(round))(%rdi), RX0; \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
	vmovdqa RX0, RX1; \
	vpxor s1, RX0, RX0; \
	vpxor s2, RX0, RX0; \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
	vpxor r1, RX1, RX1; \
	vpxor r2, RX1, RX1; \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
	\
	/* sbox, non-linear part */ \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	vmovdqa .Linv_shift_row rRIP, RTMP4; \
	vaesenclast MASK_4BIT, RX0, RX0; \
	vaesenclast MASK_4BIT, RX1, RX1; \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	\
	/* linear part */ \
	vpshufb RTMP4, RX0, RTMP0; \
	vpxor RTMP0, s0, s0; /* s0 ^ x */ \
	vpshufb RTMP4, RX1, RTMP2; \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
	vpxor RTMP2, r0, r0; /* r0 ^ x */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1; \
	vpsrld $30, RTMP0, RTMP0; \
	vpxor RTMP0, s0, s0; \
	vpxor RTMP1, s0, s0; \
	vpshufb RTMP4, RX1, RTMP3; \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3; \
	vpsrld $30, RTMP2, RTMP2; \
	vpxor RTMP2, r0, r0; \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
	.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;	/* fewer than five blocks: tail-call the four-block path */

	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	je .Lblk8_load_input_done;	/* flags still set from the cmpq $5 above: exactly five blocks */
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

.align 8
SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128 bit)
	 */
	FRAME_BEGIN

	/* load the IV and byte-swap it to little endian */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* big endian => little endian */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low qword: -1, high qword: 0 */

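/*
 * Increment a 128-bit little-endian counter held in an XMM register.
 * vpcmpeqq produces an all-ones low qword exactly when the low qword is
 * about to wrap (== -1); shifting that mask into the high qword and
 * subtracting it propagates the carry.
 */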
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

	/* construct counter blocks +1 .. +8, byte-swapped back to big endian */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV (+8) */
	vmovdqu RTMP1, (%rcx);

	call __sm4_crypt_blk8;

	/* XOR the encrypted counter blocks (the keystream) with the source */
	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

.align 8
SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

	/*
	 * CBC chaining: block 0 is XORed with the IV, block n with
	 * ciphertext n-1; the last ciphertext block becomes the new IV.
	 */
	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx);

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)

.align 8
SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	/* load the IV and the first seven ciphertext blocks as cipher input */
	vmovdqu (%rcx), RA0;
	vmovdqu 0 * 16(%rdx), RA1;
	vmovdqu 1 * 16(%rdx), RA2;
	vmovdqu 2 * 16(%rdx), RA3;
	vmovdqu 3 * 16(%rdx), RB0;
	vmovdqu 4 * 16(%rdx), RB1;
	vmovdqu 5 * 16(%rdx), RB2;
	vmovdqu 6 * 16(%rdx), RB3;

	/* update IV: the last ciphertext block becomes the next IV */
	vmovdqu 7 * 16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);

	call __sm4_crypt_blk8;

	/* XOR each encrypted block with the corresponding ciphertext block */
	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)