0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #include <linux/linkage.h>
0019 #include <asm/frame.h>
0020
/* Number of bytes occupied by key_table at the start of the context. */
#define CAMELLIA_TABLE_BYTE_LEN	272

/* Byte offsets of the context fields that this file accesses through CTX. */
#define key_table		0
#define key_length		CAMELLIA_TABLE_BYTE_LEN

/* Context pointer register; every exported function receives it in %rdi. */
#define CTX			%rdi
0029
0030
0031
0032
/*
 * filter_8bit - per-byte 8-bit substitution built from two 4-bit vpshufb
 * lookups.
 *
 * IN:
 *   x:        vector of input bytes
 *   lo_t:     16-entry table indexed by the low nibble of each byte
 *   hi_t:     16-entry table indexed by the high nibble of each byte
 *   mask4bit: must hold 0x0f in every byte
 *   tmp0:     scratch register (clobbered)
 * OUT:
 *   x:        lo_t[x & 0x0f] ^ hi_t[x >> 4], computed for every byte
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	/* tmp0 = low nibbles; x = high nibbles shifted down into bits 0..3 */ \
	vpand		x, mask4bit, tmp0; \
	vpandn		x, mask4bit, x; \
	vpsrld		$4, x, x; \
	\
	/* look both nibbles up and merge the two partial results */ \
	vpshufb		x, hi_t, x; \
	vpshufb		tmp0, lo_t, tmp0; \
	vpxor		x, tmp0, x;
0041
0042
0043
0044
0045
0046
0047
0048
0049
/*
 * roundsm16 - one round function (S-function, P-function, key addition)
 * applied to 16 byte-sliced blocks, with the result XORed onto the other
 * half of the state.
 *
 * IN:
 *   x0..x7: byte-sliced input half of the state
 *   t0..t7: scratch registers (clobbered)
 *   mem_cd: register pointing at the other byte-sliced half in memory
 *           (8 * 16 bytes, only read here)
 *   key:    memory operand holding the 64-bit round key
 * OUT:
 *   x0..x7: round output XORed with the data at mem_cd. x4..x7 pair with
 *           slots 0..3 of mem_cd and x0..x3 with slots 4..7.
 *
 * All constants are addressed RIP-relative so the code stays position
 * independent. Every line of the body, comments included, must end in a
 * backslash; a bare blank line would terminate the macro.
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function, built on the SubBytes step of vaesenclast \
	 */ \
	vmovdqa .Linv_shift_row(%rip), t4; \
	vbroadcastss .L0f0f0f0f(%rip), t7; \
	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
	\
	/* cancel in advance the ShiftRows that vaesenclast performs */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* pre-filter with the s1 tables: x0, x7, x1, x4, x2, x5 */ \
	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* pre-filter with the s4 tables: x3, x6; t4 becomes the zero key */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES SubBytes + ShiftRows with an all-zero round key */ \
	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* post-filter with the s1 tables: x0, x7, x3, x6 */ \
	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* post-filter with the s3 tables: x2, x5 */ \
	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	/* t6 = all-zero shuffle mask (broadcasts byte 0), t0 = round key */ \
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* post-filter with the s2 tables: x1, x4 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	/* \
	 * Broadcast key bytes: t0..t4 = bytes 0..4, t7 = byte 7. \
	 * t5 keeps the key shifted down by five bytes for bytes 5 and 6. \
	 */ \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* \
	 * Add the key bytes and the other state half (x becomes the new \
	 * value of that half) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	/* t3 and t2 are free now: finish broadcasting key bytes 5 and 6 */ \
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;
0185
0186
0187
0188
0189
/*
 * Out-of-line copy of roundsm16 so the large macro body is not expanded at
 * every call site.
 *
 *	%xmm0..%xmm7:	state half, in natural order (in/out)
 *	%xmm8..%xmm15:	scratch
 *	%rcx:		pointer to the other state half in memory
 *	%r9:		pointer to the 64-bit round key
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11,
		  %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
0197
/*
 * Out-of-line copy of roundsm16 for the second round of a pair. The register
 * halves are passed swapped relative to the _cd variant.
 *
 *	%xmm4..%xmm7, %xmm0..%xmm3:	state half (in/out)
 *	%xmm12..%xmm15, %xmm8..%xmm11:	scratch
 *	%rax:				pointer to the other state half in memory
 *	%r9:				pointer to the 64-bit round key
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15,
		  %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
0205
0206
0207
0208
0209
0210
0211
/*
 * two_roundsm16 - run two consecutive rounds.
 *
 * IN/OUT:
 *   x0..x7:   byte-sliced AB state, preloaded in registers
 *   mem_ab:   byte-sliced AB state in memory
 *   mem_cd:   byte-sliced CD state in memory
 *   i:        key_table index (8-byte units) of the first round key
 *   dir:      +1 or -1; the second round uses key index i + dir
 *   store_ab: macro invoked last to optionally write x0..x7 back to mem_ab
 *
 * The two helper functions called here hard-code %xmm0..%xmm15, %rcx (CD
 * pointer), %rax (AB pointer) and %r9 (key pointer), so the register
 * arguments must name exactly those registers. %r9 is clobbered.
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	/* first round: x = F(x, key[i]) ^ CD */ \
	leaq	(key_table + (i) * 8)(CTX), %r9; \
	call	roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	/* write the new CD back; register halves are stored swapped */ \
	vmovdqu	x4, 0 * 16(mem_cd); \
	vmovdqu	x5, 1 * 16(mem_cd); \
	vmovdqu	x6, 2 * 16(mem_cd); \
	vmovdqu	x7, 3 * 16(mem_cd); \
	vmovdqu	x0, 4 * 16(mem_cd); \
	vmovdqu	x1, 5 * 16(mem_cd); \
	vmovdqu	x2, 6 * 16(mem_cd); \
	vmovdqu	x3, 7 * 16(mem_cd); \
	\
	/* second round: x = F(x, key[i + dir]) ^ AB */ \
	leaq	(key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call	roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
0230
/*
 * store_ab callback for two_roundsm16 that expands to nothing: the AB state
 * stays in registers only.
 */
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
0232
/*
 * store_ab callback for two_roundsm16 that writes the new AB state
 * (x0..x7) to the eight 16-byte slots at mem_ab.
 */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu	x0, 0 * 16(mem_ab); \
	vmovdqu	x1, 1 * 16(mem_ab); \
	vmovdqu	x2, 2 * 16(mem_ab); \
	vmovdqu	x3, 3 * 16(mem_ab); \
	vmovdqu	x4, 4 * 16(mem_ab); \
	vmovdqu	x5, 5 * 16(mem_ab); \
	vmovdqu	x6, 6 * 16(mem_ab); \
	vmovdqu	x7, 7 * 16(mem_ab);
0243
/*
 * enc_rounds16 - six rounds in encryption direction.
 *
 * Uses key_table entries i + 2 .. i + 7 in ascending order. AB is written
 * back to mem_ab after the first two round pairs and left in registers only
 * after the last pair.
 */
#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	/* keys i + 2, i + 3 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	/* keys i + 4, i + 5 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	/* keys i + 6, i + 7 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
0252
/*
 * dec_rounds16 - six rounds in decryption direction.
 *
 * Uses key_table entries i + 7 .. i + 2 in descending order. AB is written
 * back to mem_ab after the first two round pairs and left in registers only
 * after the last pair.
 */
#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	/* keys i + 7, i + 6 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	/* keys i + 5, i + 4 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	/* keys i + 3, i + 2 */ \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
0261
0262
0263
0264
0265
0266
0267
/*
 * rol32_1_16 - rotate byte-sliced 32-bit integers left by one bit.
 *
 * IN:
 *   v0..v3: byte-sliced 32-bit integers, v0 = least significant byte,
 *           v3 = most significant byte
 *   zero:   all-zero vector (not modified)
 *   t0..t2: scratch registers (clobbered)
 * OUT:
 *   v0..v3: (IN <<< 1)
 *
 * For each byte lane the top bit is extracted as a 0/1 carry
 * (0 > v gives 0xff for bytes with the top bit set, vpabsb turns that into
 * 0x01), the byte is doubled, and the carry is ORed into the next more
 * significant byte. The carry out of v3 wraps around into v0.
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	/* t0 = carry out of v0 */ \
	vpcmpgtb	v0, zero, t0; \
	vpabsb		t0, t0; \
	vpaddb		v0, v0, v0; \
	\
	/* t1 = carry out of v1 */ \
	vpcmpgtb	v1, zero, t1; \
	vpabsb		t1, t1; \
	vpaddb		v1, v1, v1; \
	\
	/* t2 = carry out of v2 */ \
	vpcmpgtb	v2, zero, t2; \
	vpabsb		t2, t2; \
	vpaddb		v2, v2, v2; \
	\
	/* consume t0 and t1 so that t0 can be reused for v3 */ \
	vpor		t0, v1, v1; \
	vpor		t1, v2, v2; \
	\
	/* t0 = carry out of v3 */ \
	vpcmpgtb	v3, zero, t0; \
	vpabsb		t0, t0; \
	vpaddb		v3, v3, v3; \
	\
	vpor		t2, v3, v3; \
	vpor		t0, v0, v0;
0290
0291
0292
0293
0294
0295
0296
0297
/*
 * fls16 - key-dependent mixing layer applied between groups of rounds, on
 * 16 byte-sliced blocks.
 *
 * IN:
 *   l:       register pointing at the memory copy of the left half
 *   l0..l7:  left half in registers; l0..l3 form byte-sliced word "ll"
 *            (l0 = most significant byte), l4..l7 form word "lr"
 *   r:       register pointing at the right half in memory; slots 0..3
 *            form word "rl", slots 4..7 form word "rr"
 *   t0..t3, tt0..tt3: scratch registers (clobbered; tt0 is kept at zero)
 *   kll, klr, krl, krr: memory operands of the four 32-bit subkeys
 * OUT:
 *   l0..l7 updated in registers and stored to l; the right half is updated
 *   in place at r.
 *
 * Each subkey is loaded with vmovd and its four bytes are broadcast one by
 * one using vpshufb with the all-zero mask in tt0. Every line of the body,
 * comments included, must end in a backslash; a bare blank line would
 * terminate the macro.
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);
0418
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements held in the four
 * vector registers x0..x3 (one row per register).
 *
 *   x0..x3: rows in, columns out
 *   t1, t2: scratch registers (clobbered); all six registers must differ
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	/* interleave dwords of rows 2/3: low pairs -> t1, high pairs -> x2 */ \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x2; \
	\
	/* interleave dwords of rows 0/1: high pairs -> t2, low pairs -> x0 */ \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x1, x0, x0; \
	\
	/* combine qwords; x1 must be formed before x0 is overwritten */ \
	vpunpckhqdq	t1, x0, x1; \
	vpunpcklqdq	t1, x0, x0; \
	\
	/* likewise x3 before x2 */ \
	vpunpckhqdq	x2, t2, x3; \
	vpunpcklqdq	x2, t2, x2;
0431
/*
 * byteslice_16x16b - transpose a 16x16 byte matrix held in 16 vector
 * registers, turning 16 blocks into byte-sliced form (and back).
 *
 *   a0..d3:   the 16 registers (in/out)
 *   st0, st1: two 16-byte memory scratch slots
 *
 * All 16 registers carry live data, so two of them at a time are spilled to
 * st0/st1 to serve as temporaries for transpose_4x4. The sequence is:
 * 4x4 dword transposes, a byte shuffle inside every register
 * (.Lshufb_16x16b), then 4x4 dword transposes again.
 *
 * The shuffle mask is addressed RIP-relative so the code stays position
 * independent.
 */
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	/* first pass, a and b groups; d2/d3 spilled and used as temps */ \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	/* first pass, c and d groups; a0/a1 spilled and used as temps */ \
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	/* byte shuffle; a0 holds the mask, its own data is still in st0 */ \
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	/* park d3, fetch the saved a0 through d3 and shuffle it last */ \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	/* second pass, rows 0 and 1; d2/d3 (already spilled) are temps */ \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	/* second pass, rows 2 and 3; b0/b1 spilled and used as temps */ \
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1;

0480
0481
/*
 * inpack16_pre - load 16 blocks and XOR the 64-bit key word onto each.
 *
 *   x0..x7, y0..y7: destination registers
 *   rio:            register pointing at the 16 input blocks (16 * 16 bytes)
 *   key:            memory operand of the 64-bit key word
 *
 * The key is byte-swapped per 32-bit word with .Lpack_bswap, which also
 * clears the upper eight bytes of the vector. Blocks are loaded in reverse
 * register order: block 0 -> y7, ..., block 7 -> y0, block 8 -> x7, ...,
 * block 15 -> x0. x0 carries the key and is therefore filled last.
 *
 * .Lpack_bswap is addressed RIP-relative so the code stays position
 * independent.
 */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
0503
0504
/*
 * inpack16_post - byte-slice the 16 loaded blocks and save both state halves.
 *
 *   x0..x7: become the AB half, stored to the 8 slots at mem_ab
 *   y0..y7: become the CD half, stored to the 8 slots at mem_cd
 *
 * The first 16 bytes at mem_ab and at mem_cd double as the scratch slots of
 * byteslice_16x16b before they are overwritten with the state.
 */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	/* AB half */ \
	vmovdqu	x0, 0 * 16(mem_ab); \
	vmovdqu	x1, 1 * 16(mem_ab); \
	vmovdqu	x2, 2 * 16(mem_ab); \
	vmovdqu	x3, 3 * 16(mem_ab); \
	vmovdqu	x4, 4 * 16(mem_ab); \
	vmovdqu	x5, 5 * 16(mem_ab); \
	vmovdqu	x6, 6 * 16(mem_ab); \
	vmovdqu	x7, 7 * 16(mem_ab); \
	/* CD half */ \
	vmovdqu	y0, 0 * 16(mem_cd); \
	vmovdqu	y1, 1 * 16(mem_cd); \
	vmovdqu	y2, 2 * 16(mem_cd); \
	vmovdqu	y3, 3 * 16(mem_cd); \
	vmovdqu	y4, 4 * 16(mem_cd); \
	vmovdqu	y5, 5 * 16(mem_cd); \
	vmovdqu	y6, 6 * 16(mem_cd); \
	vmovdqu	y7, 7 * 16(mem_cd);
0526
0527
/*
 * outunpack16 - undo the byte-slicing and XOR the final 64-bit key word onto
 * all 16 blocks.
 *
 *   x0..x7, y0..y7:         state registers (in/out)
 *   key:                    memory operand of the 64-bit key word
 *   stack_tmp0, stack_tmp1: two 16-byte memory scratch slots
 *
 * x0 is needed to hold the key, so its data is parked in stack_tmp0 and
 * combined with the key in the very last instruction.
 *
 * .Lpack_bswap is addressed RIP-relative so the code stays position
 * independent.
 */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;
0554
/*
 * write_output - store 16 blocks to the buffer at rio.
 *
 * x0..x7 go to blocks 0..7 and y0..y7 to blocks 8..15, in argument order;
 * callers pick the block order by the order in which they pass registers.
 */
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu	x0, 0 * 16(rio); \
	vmovdqu	x1, 1 * 16(rio); \
	vmovdqu	x2, 2 * 16(rio); \
	vmovdqu	x3, 3 * 16(rio); \
	vmovdqu	x4, 4 * 16(rio); \
	vmovdqu	x5, 5 * 16(rio); \
	vmovdqu	x6, 6 * 16(rio); \
	vmovdqu	x7, 7 * 16(rio); \
	vmovdqu	y0, 8 * 16(rio); \
	vmovdqu	y1, 9 * 16(rio); \
	vmovdqu	y2, 10 * 16(rio); \
	vmovdqu	y3, 11 * 16(rio); \
	vmovdqu	y4, 12 * 16(rio); \
	vmovdqu	y5, 13 * 16(rio); \
	vmovdqu	y6, 14 * 16(rio); \
	vmovdqu	y7, 15 * 16(rio);
0573
0574
0575
/* 16-byte constants; vmovdqa loads in roundsm16 rely on the alignment. */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* Four shuffle indices idx, idx + 4, idx + 8, idx + 12. */
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

/*
 * vpshufb mask used by byteslice_16x16b: gathers bytes 0,4,8,12 / 1,5,9,13 /
 * 2,6,10,14 / 3,7,11,15, i.e. transposes the 4x4 byte matrix inside a vector.
 */
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
0584
/*
 * vpshufb mask applied to the 64-bit key word in inpack16_pre/outunpack16:
 * reverses the byte order inside each of the two low 32-bit words and, via
 * the 0x80 entries, zeroes the upper eight bytes of the vector.
 * Written byte by byte (little-endian image of the words 0x00010203,
 * 0x04050607, 0x80808080, 0x80808080).
 */
.Lpack_bswap:
	.byte 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
0590
0591
0592
0593
0594
0595
0596
0597
0598
0599
0600
0601
0602
0603
0604
/*
 * Pre-filter nibble tables (low / high) for filter_8bit. roundsm16 applies
 * them to x0, x7, x1, x4, x2 and x5 before vaesenclast.
 * NOTE(review): presumably the input transform mapping Camellia s-box input
 * onto the AES SubBytes domain - confirm against the algorithm reference.
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
0611
0612
0613
0614
0615
0616
0617
0618
0619
0620
0621
0622
0623
0624
0625
/*
 * Pre-filter nibble tables (low / high) for filter_8bit. roundsm16 applies
 * them to x3 and x6 before vaesenclast.
 * NOTE(review): presumably the s-box 4 variant of the input transform -
 * confirm against the algorithm reference.
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
0632
0633
0634
0635
0636
0637
0638
0639
0640
0641
0642
0643
0644
0645
0646
0647
0648
/*
 * Post-filter nibble tables (low / high) for filter_8bit. roundsm16 applies
 * them to x0, x7, x3 and x6 after vaesenclast.
 * NOTE(review): presumably the output transform from AES SubBytes back to
 * the Camellia s-box - confirm against the algorithm reference.
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
0655
0656
0657
0658
0659
0660
0661
0662
0663
0664
0665
0666
0667
0668
0669
0670
0671
/*
 * Post-filter nibble tables (low / high) for filter_8bit. roundsm16 applies
 * them to x1 and x4 after vaesenclast.
 * NOTE(review): presumably the s-box 2 variant of the output transform -
 * confirm against the algorithm reference.
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
0678
0679
0680
0681
0682
0683
0684
0685
0686
0687
0688
0689
0690
0691
0692
0693
0694
/*
 * Post-filter nibble tables (low / high) for filter_8bit. roundsm16 applies
 * them to x2 and x5 after vaesenclast.
 * NOTE(review): presumably the s-box 3 variant of the output transform -
 * confirm against the algorithm reference.
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
0701
0702
/*
 * vpshufb mask performing the inverse of the AES ShiftRows byte permutation.
 * roundsm16 applies it before vaesenclast so that the ShiftRows step inside
 * that instruction is cancelled and only the byte substitution remains.
 */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
0706
0707
/*
 * 4-bit mask. roundsm16 broadcasts this dword with vbroadcastss to obtain
 * 0x0f in every byte, which filter_8bit needs as mask4bit.
 */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* Back to code. */
.text
0714
/*
 * __camellia_enc_blk16 - run the encryption rounds on 16 blocks.
 *
 * input:
 *	%rdi (CTX):	context (key_table, key_length)
 *	%rax:		scratch buffer, 256 bytes
 *			(AB half at %rax, CD half at %rax + 128)
 *	%xmm0..%xmm15:	16 blocks as loaded by inpack16_pre
 * output:
 *	%xmm0..%xmm15:	16 processed blocks; callers store them with
 *			write_output in the order %xmm7..%xmm0, %xmm15..%xmm8
 * clobbers:
 *	%rcx, %r8, %r9, flags, the scratch buffer
 *
 * A key_length of 16 takes three groups of six rounds and the final key
 * word at key_table index 24; any other key_length takes a fourth group and
 * the final key word at index 32.
 */
.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	FRAME_BEGIN

	/* %rcx = CD half of the scratch buffer */
	leaq	8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	/* round group 1: keys 2..7 */
	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* mixing layer with the subkeys at key_table[8..9] */
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	/* round group 2: keys 10..15 */
	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	/* mixing layer with the subkeys at key_table[16..17] */
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	/* round group 3: keys 18..23 */
	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	/* %r8 = index of the final key word; movl leaves the flags alone */
	cmpl	$16, key_length(CTX);
	movl	$24, %r8d;
	jne	.Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu	0 * 16(%rcx), %xmm8;
	vmovdqu	1 * 16(%rcx), %xmm9;
	vmovdqu	2 * 16(%rcx), %xmm10;
	vmovdqu	3 * 16(%rcx), %xmm11;
	vmovdqu	4 * 16(%rcx), %xmm12;
	vmovdqu	5 * 16(%rcx), %xmm13;
	vmovdqu	6 * 16(%rcx), %xmm14;
	vmovdqu	7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	/* longer keys: one more mixing layer and round group (keys 26..31) */
	movl	$32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp	.Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)
0801
/*
 * __camellia_dec_blk16 - run the decryption rounds on 16 blocks.
 *
 * input:
 *	%rdi (CTX):	context (key_table)
 *	%rax:		scratch buffer, 256 bytes
 *			(AB half at %rax, CD half at %rax + 128)
 *	%r8d:		32 selects the long-key schedule (extra round group);
 *			callers pass 24 for key_length == 16, 32 otherwise
 *	%xmm0..%xmm15:	16 blocks as loaded by inpack16_pre
 * output:
 *	%xmm0..%xmm15:	16 processed blocks; callers store them with
 *			write_output in the order %xmm7..%xmm0, %xmm15..%xmm8
 * clobbers:
 *	%rcx, %r9, flags, the scratch buffer
 *
 * Round keys are consumed from high to low index, and the two subkey pairs
 * handed to fls16 are swapped relative to encryption.
 */
.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	FRAME_BEGIN

	/* %rcx = CD half of the scratch buffer */
	leaq	8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl	$32, %r8d;
	je	.Ldec_max32;

.Ldec_max24:
	/* keys 23..18 */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	/* mixing layer with the subkeys at key_table[16..17] */
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	/* keys 15..10 */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	/* mixing layer with the subkeys at key_table[8..9] */
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	/* keys 7..2 */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu	0 * 16(%rcx), %xmm8;
	vmovdqu	1 * 16(%rcx), %xmm9;
	vmovdqu	2 * 16(%rcx), %xmm10;
	vmovdqu	3 * 16(%rcx), %xmm11;
	vmovdqu	4 * 16(%rcx), %xmm12;
	vmovdqu	5 * 16(%rcx), %xmm13;
	vmovdqu	6 * 16(%rcx), %xmm14;
	vmovdqu	7 * 16(%rcx), %xmm15;

	/* final key word is key_table[0] */
	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	/* long keys: extra round group (keys 31..26) and mixing layer first */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp	.Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)
0886
/*
 * camellia_ecb_enc_16way - ECB-encrypt 16 blocks.
 *
 * input:
 *	%rdi:	ctx, CTX
 *	%rsi:	dst (16 blocks)
 *	%rdx:	src (16 blocks)
 */
SYM_FUNC_START(camellia_ecb_enc_16way)
	FRAME_BEGIN

	/* load all 16 blocks and apply the first key word, key_table[0] */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/*
	 * src is fully loaded into registers, so dst can serve as the
	 * 256-byte scratch buffer (also when src == dst).
	 */
	movq	%rsi, %rax;

	call	__camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
0911
/*
 * camellia_ecb_dec_16way - ECB-decrypt 16 blocks.
 *
 * input:
 *	%rdi:	ctx, CTX
 *	%rsi:	dst (16 blocks)
 *	%rdx:	src (16 blocks)
 */
SYM_FUNC_START(camellia_ecb_dec_16way)
	FRAME_BEGIN

	/*
	 * %r8d = index of the last key word: 24 when key_length == 16,
	 * otherwise 32. The movl instructions do not touch the flags.
	 */
	movl	$32, %r8d;
	movl	$24, %eax;
	cmpl	$16, key_length(CTX);
	cmovel	%eax, %r8d;

	/* load all 16 blocks and apply key_table[%r8] */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * src is fully loaded into registers, so dst can serve as the
	 * 256-byte scratch buffer (also when src == dst).
	 */
	movq	%rsi, %rax;

	call	__camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
0941
/*
 * camellia_cbc_dec_16way - CBC-decrypt 16 blocks.
 *
 * input:
 *	%rdi:	ctx, CTX
 *	%rsi:	dst (16 blocks)
 *	%rdx:	src (16 blocks)
 *
 * Output block n (n = 1..15) is XORed with src block n - 1. Output block 0
 * is stored without chaining; presumably the caller applies the IV to it.
 */
SYM_FUNC_START(camellia_cbc_dec_16way)
	FRAME_BEGIN

	/*
	 * %r8d = index of the last key word: 24 when key_length == 16,
	 * otherwise 32. The movl instructions do not touch the flags.
	 */
	movl	$32, %r8d;
	movl	$24, %eax;
	cmpl	$16, key_length(CTX);
	cmovel	%eax, %r8d;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * src is read again below for the chaining XOR, so dst must not be
	 * used as scratch (it may alias src); take 256 bytes of stack instead.
	 */
	subq	$(16 * 16), %rsp;
	movq	%rsp, %rax;

	call	__camellia_dec_blk16;

	addq	$(16 * 16), %rsp;

	/* chain: output block n ^= src block n - 1 */
	vpxor	(0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor	(1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor	(2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor	(3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor	(4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor	(5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor	(6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor	(7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor	(8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor	(9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor	(10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor	(11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor	(12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor	(13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor	(14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_16way)