/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
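/*
 * filter_8bit: apply an affine 8-bit transform to every byte of x as
 * two 4-bit vpshufb lookups, XORed together:
 *   x = lo_t[x & 0x0f] ^ hi_t[x >> 4]
 * mask4bit must hold 0x0f in every byte; tmp0 is clobbered.
 */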
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
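
/*
 * Aliases naming the low 128-bit (xmm) half of each ymm register; the
 * round macro pastes these in via '##_x' where the 128-bit AES-NI
 * instruction (vaesenclast) has to operate on ymm halves.
 */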

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer to CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; /* zero round key for aesenclast */ \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

/*
 * Size optimization: the round function is kept out of line; with
 * roundsm32 inlined the binary would be over 5 times larger and only
 * marginally faster.
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab)

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

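/*
 * enc_rounds32/dec_rounds32 below run six Feistel rounds as three
 * two-round steps; encryption walks the subkey table forwards
 * (dir = 1), decryption backwards (dir = -1), and the last step skips
 * the AB write-back (dummy_store).
 */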
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
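/*
 * Byte-wise trick: vpcmpgtb against zero gives 0xff wherever a byte's
 * MSB is set, vpabsb turns that into 0x01 (the carry into the next
 * byte slice), and vpaddb doubles each byte (a shift left by one).
 */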
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * IN:
 *   r: byte-sliced AB state in memory
 *   l: byte-sliced CD state in memory
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);
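
/* 4x4 32-bit word matrix transpose built from punpckl/punpckh unpacks */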
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
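
/*
 * 16x16 byte matrix transpose (byte-slicing): rearranges sixteen
 * 256-bit vectors so that byte lane i of every block ends up in
 * vector i; st0/st1 are memory operands used as spill slots.
 */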
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;
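
/* store all 16 ymm registers (32 blocks, 512 bytes) to rio */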
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);

.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      rol_1(
 *          camellia_h(
 *              isom_map_aes_to_camellia(
 *                  swap_bitendianness(
 *                      aes_inverse_affine_transform(in)
 *                  )
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      ror_1(
 *          camellia_h(
 *              isom_map_aes_to_camellia(
 *                  swap_bitendianness(
 *                      aes_inverse_affine_transform(in)
 *                  )
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)

SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */
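
	/*
	 * %r8 indexes the end of the key table: 24 subkey slots for
	 * 128-bit keys, 32 for 192/256-bit keys.
	 */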

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_32way)

SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN
	subq $(16 * 32), %rsp;

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

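	/*
	 * CBC chaining: XOR each plaintext block with the previous
	 * ciphertext block. %ymm7 holds decrypted blocks 0 and 1; spill
	 * it, build [0 | ciphertext block 0] with vinserti128 and XOR,
	 * so block 1 gets chained here while block 0 is left for the
	 * caller's IV XOR. The remaining blocks take the previous
	 * ciphertext straight from src at a 16-byte offset.
	 */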
	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	addq $(16 * 32), %rsp;
	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_32way)