/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX2 optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Based on the AES-NI SM4 technique by Markku-Juhani O. Saarinen:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define rRIP         (%rip)

/* vector registers */
#define RX0          %ymm0
#define RX1          %ymm1
#define MASK_4BIT    %ymm2
#define RTMP0        %ymm3
#define RTMP1        %ymm4
#define RTMP2        %ymm5
#define RTMP3        %ymm6
#define RTMP4        %ymm7

#define RA0          %ymm8
#define RA1          %ymm9
#define RA2          %ymm10
#define RA3          %ymm11

#define RB0          %ymm12
#define RB1          %ymm13
#define RB2          %ymm14
#define RB3          %ymm15

#define RNOT         %ymm0
#define RBSWAP       %ymm1

#define RX0x         %xmm0
#define RX1x         %xmm1
#define MASK_4BITx   %xmm2

#define RNOTx        %xmm0
#define RBSWAPx      %xmm1

#define RTMP0x       %xmm3
#define RTMP1x       %xmm4
#define RTMP2x       %xmm5
#define RTMP3x       %xmm6
#define RTMP4x       %xmm7


/* helper macros */

/* 4x4 32-bit integer matrix transpose, within each 128-bit lane */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
						\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
						\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
						\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;
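
/*
 * Illustration only (per 128-bit lane): if the inputs hold the dwords
 * x0 = {a0,a1,a2,a3}, x1 = {b0,b1,b2,b3}, x2 = {c0,c1,c2,c3},
 * x3 = {d0,d1,d2,d3}, then after transpose_4x4 the outputs are
 * x0 = {a0,b0,c0,d0}, x1 = {a1,b1,c1,d1}, x2 = {a2,b2,c2,d2},
 * x3 = {a3,b3,c3,d3}, i.e. a plain 4x4 transpose.
 */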

/* pre-SubByte transform */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

/*
 * post-SubByte transform. Note: x has already been XOR'ed with mask4bit
 * by the vaesenclast instruction (which XORs in its round-key operand).
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandn mask4bit, x, tmp0;			\
	vpsrld $4, x, x;				\
	vpand x, mask4bit, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
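
/*
 * Sketch of how the two transforms are used below (no extra
 * functionality, just orientation): the SM4 S-box is evaluated through
 * AES SubBytes by mapping the input into the AES field (transform_pre
 * with the .Lpre_tf_* tables), running vaesenclast, and mapping the
 * result back (transform_post with the .Lpost_tf_* tables); the
 * .Linv_shift_row* tables undo the ShiftRows step that vaesenclast
 * performs in addition to SubBytes.
 */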


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from the work
 * of Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing the SM4 S-Box from AES SubBytes.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef

.text
.align 16

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)		\
	vpbroadcastd (4*(round))(%rdi), RX0;			\
	vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4;		\
	vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1;		\
	vmovdqa RX0, RX1;					\
	vpxor s1, RX0, RX0;					\
	vpxor s2, RX0, RX0;					\
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */		\
	vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2;		\
	vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3;		\
	vpxor r1, RX1, RX1;					\
	vpxor r2, RX1, RX1;					\
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */		\
								\
	/* sbox, non-linear part */				\
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);	\
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);	\
	vextracti128 $1, RX0, RTMP4x;				\
	vextracti128 $1, RX1, RTMP0x;				\
	vaesenclast MASK_4BITx, RX0x, RX0x;			\
	vaesenclast MASK_4BITx, RTMP4x, RTMP4x;			\
	vaesenclast MASK_4BITx, RX1x, RX1x;			\
	vaesenclast MASK_4BITx, RTMP0x, RTMP0x;			\
	vinserti128 $1, RTMP4x, RX0, RX0;			\
	vbroadcasti128 .Linv_shift_row rRIP, RTMP4;		\
	vinserti128 $1, RTMP0x, RX1, RX1;			\
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);	\
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);	\
								\
	/* linear part */					\
	vpshufb RTMP4, RX0, RTMP0;				\
	vpxor RTMP0, s0, s0; /* s0 ^ x */			\
	vpshufb RTMP4, RX1, RTMP2;				\
	vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4;	\
	vpxor RTMP2, r0, r0; /* r0 ^ x */			\
	vpshufb RTMP4, RX0, RTMP1;				\
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */		\
	vpshufb RTMP4, RX1, RTMP3;				\
	vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4;	\
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */		\
	vpshufb RTMP4, RX0, RTMP1;				\
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX1, RTMP3;				\
	vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4;	\
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX0, RTMP1;				\
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */		\
	vpslld $2, RTMP0, RTMP1;				\
	vpsrld $30, RTMP0, RTMP0;				\
	vpxor RTMP0, s0, s0;					\
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;					\
	vpshufb RTMP4, RX1, RTMP3;				\
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */		\
	vpslld $2, RTMP2, RTMP3;				\
	vpsrld $30, RTMP2, RTMP2;				\
	vpxor RTMP2, r0, r0;					\
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP3, r0, r0;
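
/*
 * For orientation only: what ROUND computes for a single block, written
 * as scalar pseudocode (sbox() and rol32() are notational helpers, not
 * symbols in this file):
 *
 *	x   = s1 ^ s2 ^ s3 ^ rk;
 *	x   = sbox(x);			// byte-wise SM4 S-box
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * The same is done on the r0..r3 set in parallel, so 16 blocks are
 * processed per invocation.
 */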

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk16)

#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;		\
	vpslldq $8, tmp, tmp;		\
	vpsubq tmp, x, x;
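
/*
 * Rough C equivalent of inc_le128, for reference only (names are
 * illustrative): the register holds a 128-bit little-endian counter as
 * two 64-bit halves, and minus_one supplies the all-ones constant used
 * by vpcmpeqq/vpsubq.
 *
 *	carry = (lo == ~0ULL);	// low half about to wrap?
 *	lo   += 1;
 *	hi   += carry;
 */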


/*
 * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	movq 8(%rcx), %rax;
	bswapq %rax;

	vzeroupper;

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT;   /* in each lane: low qword = -1, high = 0 */
	vpaddq RNOT, RNOT, RTMP2; /* in each lane: low qword = -2, high = 0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), RTMP4x;
	vpshufb RTMP3x, RTMP4x, RTMP4x;
	vmovdqa RTMP4x, RTMP0x;
	inc_le128(RTMP4x, RNOTx, RTMP1x);
	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
	vpshufb RTMP3, RTMP0, RA0; /* +1 : +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 16), %rax;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq RTMP2, RTMP0, RTMP0; /* +3 : +2 */
	vpshufb RTMP3, RTMP0, RA1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +5 : +4 */
	vpshufb RTMP3, RTMP0, RA2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +7 : +6 */
	vpshufb RTMP3, RTMP0, RA3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +9 : +8 */
	vpshufb RTMP3, RTMP0, RB0;
	vpsubq RTMP2, RTMP0, RTMP0; /* +11 : +10 */
	vpshufb RTMP3, RTMP0, RB1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +13 : +12 */
	vpshufb RTMP3, RTMP0, RB2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +15 : +14 */
	vpshufb RTMP3, RTMP0, RB3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
	vpshufb RTMP3x, RTMP0x, RTMP0x;

	jmp .Lctr_carry_done;

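/*
 * Explanatory note: the fast path above only adds to the low 64 bits
 * of each counter (vpsubq of -2 per lane), which is safe because the
 * check above guarantees no 64-bit wrap within the next 16 counter
 * values; the path below performs full 128-bit increments instead.
 */
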
.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA1; /* +3 : +2 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA2; /* +5 : +4 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA3; /* +7 : +6 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB0; /* +9 : +8 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB1; /* +11 : +10 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB2; /* +13 : +12 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB3; /* +15 : +14 */
	inc_le128(RTMP0, RNOT, RTMP1); /* +16 */
	vextracti128 $1, RTMP0, RTMP0x;
	vpshufb RTMP3x, RTMP0x, RTMP0x;

.align 4
.Lctr_carry_done:
	/* store new IV */
	vmovdqu RTMP0x, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)


/*
 * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;

	vmovdqu (0 * 32)(%rdx), RA0;
	vmovdqu (1 * 32)(%rdx), RA1;
	vmovdqu (2 * 32)(%rdx), RA2;
	vmovdqu (3 * 32)(%rdx), RA3;
	vmovdqu (4 * 32)(%rdx), RB0;
	vmovdqu (5 * 32)(%rdx), RB1;
	vmovdqu (6 * 32)(%rdx), RB2;
	vmovdqu (7 * 32)(%rdx), RB3;

	call __sm4_crypt_blk16;

	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RNOT;
	vpxor RNOT, RA0, RA0;
	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx; /* last ciphertext block becomes new IV */
	vmovdqu RNOTx, (%rcx);

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)


/*
 * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;

	/* Load input */
	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RA0;
	vmovdqu (0 * 32 + 16)(%rdx), RA1;
	vmovdqu (1 * 32 + 16)(%rdx), RA2;
	vmovdqu (2 * 32 + 16)(%rdx), RA3;
	vmovdqu (3 * 32 + 16)(%rdx), RB0;
	vmovdqu (4 * 32 + 16)(%rdx), RB1;
	vmovdqu (5 * 32 + 16)(%rdx), RB2;
	vmovdqu (6 * 32 + 16)(%rdx), RB3;

	/* Update IV */
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
	vmovdqu RNOTx, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)