0001
0002
0003
0004
0005
0006
0007
0008 #include <linux/linkage.h>
0009
/*
 * Assembler preamble and the eight external lookup tables.
 *
 * The camellia_sp* symbols are defined outside this file.  The round
 * macros in this file index them with a zero-extended byte scaled by 8
 * and XOR the selected 64-bit entry into the block state.
 */
.file "camellia-x86_64-asm_64.S"
.text

.extern camellia_sp10011110
.extern camellia_sp22000222
.extern camellia_sp03303033
.extern camellia_sp00444404
.extern camellia_sp02220222
.extern camellia_sp30333033
.extern camellia_sp44044404
.extern camellia_sp11101110

/* Short local names for the external tables. */
#define sp10011110	camellia_sp10011110
#define sp22000222	camellia_sp22000222
#define sp03303033	camellia_sp03303033
#define sp00444404	camellia_sp00444404
#define sp02220222	camellia_sp02220222
#define sp30333033	camellia_sp30333033
#define sp44044404	camellia_sp44044404
#define sp11101110	camellia_sp11101110
0030
/*
 * Context layout, as byte offsets from CTX: the subkey table starts at
 * offset 0 and the key length field follows it directly.
 */
#define CAMELLIA_TABLE_BYTE_LEN	272

#define key_table	0
#define key_length	CAMELLIA_TABLE_BYTE_LEN

/* Context pointer and current input/output pointer. */
#define CTX	%rdi
#define RIO	%rsi
#define RIOd	%esi

/*
 * Block state.  Index 0 is the first block, index 1 the second block of
 * the 2-way routines.  All four live in legacy registers so that both
 * the low byte (bl) and the second byte (bh) can be addressed directly.
 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah

#define RCD0	%rcx
#define RCD0d	%ecx
#define RCD0bl	%cl
#define RCD0bh	%ch

#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh

#define RCD1	%rdx
#define RCD1d	%edx
#define RCD1bl	%dl
#define RCD1bh	%dh

/*
 * Scratch registers.  RT0 is the same register as RIO (%rsi), so RIO is
 * only valid while no macro that uses RT0 is running.
 */
#define RT0	%rsi
#define RT0d	%esi

#define RT1	%r12
#define RT1d	%r12d

#define RT2	%r8
#define RT2d	%r8d
#define RT2bl	%r8b

/* Values parked in caller-saved registers for the duration of a call. */
#define RXOR	%r9
#define RXORd	%r9d
#define RXORbl	%r9b

#define RR12	%r10
#define RDST	%r11
0078
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consume the two lowest bytes of `ab` as table indices:
 *
 *	dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]	(64-bit entries)
 *	ab   = ab rotated right by 16 bits
 *
 * `ab` must be an alias that has `bl` and `bh` forms; tmp1/tmp2 must have
 * a `d` (32-bit) form and are clobbered.  The bh byte is moved into tmp1;
 * every caller in this file passes RT0 (%esi) there, a register that
 * needs no REX prefix (AH/BH/CH/DH cannot be combined with one).
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl	ab ## bh, tmp1 ## d; \
	movzbl	ab ## bl, tmp2 ## d; \
	rorq	$16, ab; \
	xorq	T0(, tmp2, 8), dst; \
	xorq	T1(, tmp1, 8), dst;
0085
0086
0087
0088
/*
 * roundsm(ab, subkey, cd) - one round on block 0.
 *
 * Loads 64-bit subkey number `subkey` into RT2, then feeds all eight
 * bytes of ab0 through the lookup tables, two bytes per xor2ror16 step.
 * The lookups are accumulated alternately into cd0 and RT2 and merged at
 * the end, so the net effect is
 *
 *	cd0 ^= subkey ^ (XOR of the eight table entries)
 *
 * ab0 is rotated by 4 * 16 = 64 bits in total and therefore ends up
 * unchanged.  Clobbers RT0, RT1 and RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq	(key_table + (subkey) * 8)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq	RT2, cd ## 0;
0098
/*
 * fls(l, r, kl, kr) - keyed mixing layer between round groups, block 0.
 *
 * Works on the 32-bit halves of l0 and r0 with subkeys number kl and kr
 * (lo = bits 0..31, hi = bits 32..63):
 *
 *	l.hi ^= rol32(l.lo & kl.lo, 1)
 *	r.lo ^= r.hi | kr.hi
 *	l.lo ^= l.hi | kl.hi
 *	r.hi ^= rol32(r.lo & kr.lo, 1)
 *
 * (presumably Camellia's FL on l and FL^-1 on r - confirm against the
 * cipher specification).  Clobbers RT0, RT1 and RT2.
 */
#define fls(l, r, kl, kr) \
	/* l.hi ^= rol32(l.lo & kl.lo, 1) */ \
	movl	(key_table + (kl) * 8)(CTX), RT0d; \
	andl	l ## 0d, RT0d; \
	roll	$1, RT0d; \
	shlq	$32, RT0; \
	xorq	RT0, l ## 0; \
	/* r.lo ^= r.hi | kr.hi */ \
	movq	(key_table + (kr) * 8)(CTX), RT1; \
	orq	r ## 0, RT1; \
	shrq	$32, RT1; \
	xorq	RT1, r ## 0; \
	\
	/* l.lo ^= l.hi | kl.hi */ \
	movq	(key_table + (kl) * 8)(CTX), RT2; \
	orq	l ## 0, RT2; \
	shrq	$32, RT2; \
	xorq	RT2, l ## 0; \
	/* r.hi ^= rol32(r.lo & kr.lo, 1) */ \
	movl	(key_table + (kr) * 8)(CTX), RT0d; \
	andl	r ## 0d, RT0d; \
	roll	$1, RT0d; \
	shlq	$32, RT0; \
	xorq	RT0, r ## 0;
0119
/*
 * enc_rounds(i) - six consecutive rounds on block 0 using subkeys
 * i + 2 .. i + 7 in ascending order.  The roles of RAB and RCD swap
 * after every round.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/*
 * enc_fls(i) - mixing layer for encryption: subkey i is applied to RAB,
 * subkey i + 1 to RCD.
 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);
0130
/*
 * enc_inpack() - load one 16-byte block from (RIO) for encryption.
 *
 * Bytes 0..7 go to RAB0 and bytes 8..15 to RCD0; each quadword is
 * byte-swapped and has its 32-bit halves exchanged.  Only RAB0 is XORed
 * with subkey 0 here.
 */
#define enc_inpack() \
	movq	(RIO), RAB0; \
	bswapq	RAB0; \
	rolq	$32, RAB0; \
	movq	8(RIO), RCD0; \
	bswapq	RCD0; \
	rorq	$32, RCD0; \
	xorq	key_table(CTX), RAB0;
0139
/*
 * enc_outunpack(op, max) - write the encrypted block to (RIO).
 *
 * `op` is pasted with `q` to form the store instruction (mov or xor in
 * this file); `max` is a register holding the index of the final subkey.
 * RCD0 is XORed with that subkey and stored at bytes 0..7, RAB0 is
 * stored at bytes 8..15, both after undoing the half-swap and byte-swap
 * applied on input.
 */
#define enc_outunpack(op, max) \
	xorq	key_table(CTX, max, 8), RCD0; \
	rorq	$32, RCD0; \
	bswapq	RCD0; \
	op ## q	RCD0, (RIO); \
	rolq	$32, RAB0; \
	bswapq	RAB0; \
	op ## q	RAB0, 8(RIO);
0148
/*
 * dec_rounds(i) - six consecutive rounds on block 0 using subkeys
 * i + 7 .. i + 2 in descending order (the reverse of enc_rounds).
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls(i) - mixing layer for decryption: subkey i + 1 is applied to
 * RAB and subkey i to RCD (swapped relative to enc_fls).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);
0159
/*
 * dec_inpack(max) - load one 16-byte block from (RIO) for decryption.
 *
 * Same byte handling as enc_inpack, but RAB0 is XORed with the subkey
 * whose index is held in register `max` instead of subkey 0.
 */
#define dec_inpack(max) \
	movq	(RIO), RAB0; \
	bswapq	RAB0; \
	rolq	$32, RAB0; \
	movq	8(RIO), RCD0; \
	bswapq	RCD0; \
	rorq	$32, RCD0; \
	xorq	key_table(CTX, max, 8), RAB0;
0168
/*
 * dec_outunpack() - write the decrypted block to (RIO).
 *
 * RCD0 is XORed with subkey 0 and stored at bytes 0..7; RAB0 is stored
 * at bytes 8..15.  Both are converted back to memory byte order first.
 */
#define dec_outunpack() \
	xorq	key_table(CTX), RCD0; \
	rorq	$32, RCD0; \
	bswapq	RCD0; \
	movq	RCD0, (RIO); \
	rolq	$32, RAB0; \
	bswapq	RAB0; \
	movq	RAB0, 8(RIO);
0177
/*
 * __camellia_enc_blk - encrypt one 16-byte block
 *
 * input:
 *	%rdi: ctx, CTX (subkey table at key_table, key length at key_length)
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor (low byte non-zero: result is XORed into dst,
 *	      otherwise it is stored to dst)
 *
 * Three groups of six rounds are always executed; when key_length is not
 * 16 a fourth mixing layer and round group follow.
 *
 * No stack is used.  The callee-saved %r12 (RT1) is parked in RR12 and
 * restored before returning.  Clobbers %rax, %rcx, %rsi, %r8-%r11, flags.
 */
SYM_FUNC_START(__camellia_enc_blk)
	movq	%r12, RR12;

	movq	%rcx, RXOR;
	movq	%rsi, RDST;		/* RIO/RT0 reuse %rsi, so keep dst aside */
	movq	%rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl	$24, RT1d;		/* max: index of the final subkey */

	/*
	 * Compare the whole 32-bit field, exactly as the decrypt routines
	 * do, so both directions always agree on the number of rounds.
	 */
	cmpl	$16, key_length(CTX);
	je	.L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl	$32, RT1d;		/* max */

.L__enc_done:
	testb	RXORbl, RXORbl;
	movq	RDST, RIO;		/* mov leaves the flags for jnz intact */

	jnz	.L__enc_xor;

	enc_outunpack(mov, RT1);

	movq	RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq	RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)
0224
/*
 * camellia_dec_blk - decrypt one 16-byte block
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * max (RT2) is 24 when key_length equals 16 and 32 otherwise; it selects
 * the subkey used on input and whether the extra round group that uses
 * subkeys 24..31 is run.
 *
 * No stack is used.  The callee-saved %r12 (RT1) is parked in RR12 and
 * restored before returning.
 */
SYM_FUNC_START(camellia_dec_blk)
	/* Load both candidates first; mov does not disturb the flags. */
	movl	$24, RXORd;
	movl	$32, RT2d;
	cmpl	$16, key_length(CTX);
	cmovel	RXORd, RT2d;		/* max */

	movq	%r12, RR12;
	movq	%rsi, RDST;		/* must precede the write to RIO (%rsi) */
	movq	%rdx, RIO;

	dec_inpack(RT2);

	cmpb	$24, RT2bl;
	je	.L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq	RDST, RIO;

	dec_outunpack();

	movq	RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)
0262
0263
0264
0265
/*
 * roundsm2(ab, subkey, cd) - one round on both blocks of the 2-way path.
 *
 * Block 0 is handled as in the 1-way round: the lookups alternate
 * between cd0 and RT2 (which starts out holding the subkey) and RT2 is
 * folded into cd0 afterwards.  For block 1 the subkey is XORed into cd1
 * up front and all four lookup steps accumulate directly into cd1.
 *
 * ab0 and ab1 are each rotated by 64 bits in total and so end up
 * unchanged.  Clobbers RT0, RT1 and RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq	(key_table + (subkey) * 8)(CTX), RT2; \
	xorq	RT2, cd ## 1; \
	\
	/* block 0 */ \
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	/* block 1, with the final merge for block 0 interleaved */ \
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
	xorq	RT2, cd ## 0; \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
0280
/*
 * fls2(l, r, kl, kr) - keyed mixing layer for both blocks (2-way path).
 *
 * For each block n in {0, 1}, with lo = bits 0..31 and hi = bits 32..63:
 *
 *	l<n>.hi ^= rol32(l<n>.lo & kl.lo, 1)
 *	r<n>.lo ^= r<n>.hi | kr.hi
 *	l<n>.lo ^= l<n>.hi | kl.hi
 *	r<n>.hi ^= rol32(r<n>.lo & kr.lo, 1)
 *
 * The steps of the two blocks are interleaved and the scratch registers
 * RT0, RT1 and RT2 are used in rotation; all three are clobbered.
 */
#define fls2(l, r, kl, kr) \
	/* block 0: l.hi ^= rol32(l.lo & kl.lo, 1) */ \
	movl	(key_table + (kl) * 8)(CTX), RT0d; \
	andl	l ## 0d, RT0d; \
	roll	$1, RT0d; \
	shlq	$32, RT0; \
	xorq	RT0, l ## 0; \
	/* block 0: r.lo ^= r.hi | kr.hi */ \
	movq	(key_table + (kr) * 8)(CTX), RT1; \
	orq	r ## 0, RT1; \
	shrq	$32, RT1; \
	xorq	RT1, r ## 0; \
	\
	/* block 1: l.hi ^= rol32(l.lo & kl.lo, 1) */ \
	movl	(key_table + (kl) * 8)(CTX), RT2d; \
	andl	l ## 1d, RT2d; \
	roll	$1, RT2d; \
	shlq	$32, RT2; \
	xorq	RT2, l ## 1; \
	/* block 1: r.lo ^= r.hi | kr.hi */ \
	movq	(key_table + (kr) * 8)(CTX), RT0; \
	orq	r ## 1, RT0; \
	shrq	$32, RT0; \
	xorq	RT0, r ## 1; \
	\
	/* block 0: l.lo ^= l.hi | kl.hi */ \
	movq	(key_table + (kl) * 8)(CTX), RT1; \
	orq	l ## 0, RT1; \
	shrq	$32, RT1; \
	xorq	RT1, l ## 0; \
	/* block 0: r.hi ^= rol32(r.lo & kr.lo, 1) */ \
	movl	(key_table + (kr) * 8)(CTX), RT2d; \
	andl	r ## 0d, RT2d; \
	roll	$1, RT2d; \
	shlq	$32, RT2; \
	xorq	RT2, r ## 0; \
	\
	/* block 1: l.lo ^= l.hi | kl.hi */ \
	movq	(key_table + (kl) * 8)(CTX), RT0; \
	orq	l ## 1, RT0; \
	shrq	$32, RT0; \
	xorq	RT0, l ## 1; \
	/* block 1: r.hi ^= rol32(r.lo & kr.lo, 1) */ \
	movl	(key_table + (kr) * 8)(CTX), RT1d; \
	andl	r ## 1d, RT1d; \
	roll	$1, RT1d; \
	shlq	$32, RT1; \
	xorq	RT1, r ## 1;
0321
/*
 * enc_rounds2(i) - six consecutive 2-way rounds using subkeys
 * i + 2 .. i + 7 in ascending order, swapping RAB and RCD each round.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/*
 * enc_fls2(i) - 2-way mixing layer for encryption: subkey i is applied
 * to RAB, subkey i + 1 to RCD.
 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);
0332
/*
 * enc_inpack2() - load two consecutive 16-byte blocks from (RIO).
 *
 * Block 0 occupies bytes 0..15 (RAB0, RCD0) and block 1 bytes 16..31
 * (RAB1, RCD1).  Every quadword is byte-swapped and rotated by 32 bits;
 * RAB0 and RAB1 are then XORed with subkey 0.
 */
#define enc_inpack2() \
	/* block 0 */ \
	movq	(RIO), RAB0; \
	bswapq	RAB0; \
	rorq	$32, RAB0; \
	movq	8(RIO), RCD0; \
	bswapq	RCD0; \
	rolq	$32, RCD0; \
	xorq	key_table(CTX), RAB0; \
	\
	/* block 1 */ \
	movq	16(RIO), RAB1; \
	bswapq	RAB1; \
	rorq	$32, RAB1; \
	movq	24(RIO), RCD1; \
	bswapq	RCD1; \
	rolq	$32, RCD1; \
	xorq	key_table(CTX), RAB1;
0349
/*
 * enc_outunpack2(op, max) - write two encrypted blocks to (RIO).
 *
 * `op` is pasted with `q` to form the store instruction (mov or xor in
 * this file); `max` is a register holding the index of the final subkey.
 * For each block the RCD register is XORed with that subkey and stored
 * first, followed by the RAB register: block 0 at bytes 0..15, block 1
 * at bytes 16..31.
 */
#define enc_outunpack2(op, max) \
	/* block 0 */ \
	xorq	key_table(CTX, max, 8), RCD0; \
	rolq	$32, RCD0; \
	bswapq	RCD0; \
	op ## q	RCD0, (RIO); \
	rorq	$32, RAB0; \
	bswapq	RAB0; \
	op ## q	RAB0, 8(RIO); \
	\
	/* block 1 */ \
	xorq	key_table(CTX, max, 8), RCD1; \
	rolq	$32, RCD1; \
	bswapq	RCD1; \
	op ## q	RCD1, 16(RIO); \
	rorq	$32, RAB1; \
	bswapq	RAB1; \
	op ## q	RAB1, 24(RIO);
0366
/*
 * dec_rounds2(i) - six consecutive 2-way rounds using subkeys
 * i + 7 .. i + 2 in descending order (the reverse of enc_rounds2).
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2(i) - 2-way mixing layer for decryption: subkey i + 1 is
 * applied to RAB and subkey i to RCD (swapped relative to enc_fls2).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);
0377
/*
 * dec_inpack2(max) - load two consecutive 16-byte blocks from (RIO) for
 * decryption.
 *
 * Same byte handling as enc_inpack2, but RAB0 and RAB1 are XORed with
 * the subkey whose index is held in register `max` instead of subkey 0.
 */
#define dec_inpack2(max) \
	/* block 0 */ \
	movq	(RIO), RAB0; \
	bswapq	RAB0; \
	rorq	$32, RAB0; \
	movq	8(RIO), RCD0; \
	bswapq	RCD0; \
	rolq	$32, RCD0; \
	xorq	key_table(CTX, max, 8), RAB0; \
	\
	/* block 1 */ \
	movq	16(RIO), RAB1; \
	bswapq	RAB1; \
	rorq	$32, RAB1; \
	movq	24(RIO), RCD1; \
	bswapq	RCD1; \
	rolq	$32, RCD1; \
	xorq	key_table(CTX, max, 8), RAB1;
0394
/*
 * dec_outunpack2() - write two decrypted blocks to (RIO).
 *
 * For each block the RCD register is XORed with subkey 0 and stored
 * first, followed by the RAB register: block 0 at bytes 0..15, block 1
 * at bytes 16..31.
 */
#define dec_outunpack2() \
	/* block 0 */ \
	xorq	key_table(CTX), RCD0; \
	rolq	$32, RCD0; \
	bswapq	RCD0; \
	movq	RCD0, (RIO); \
	rorq	$32, RAB0; \
	bswapq	RAB0; \
	movq	RAB0, 8(RIO); \
	\
	/* block 1 */ \
	xorq	key_table(CTX), RCD1; \
	rolq	$32, RCD1; \
	bswapq	RCD1; \
	movq	RCD1, 16(RIO); \
	rorq	$32, RAB1; \
	bswapq	RAB1; \
	movq	RAB1, 24(RIO);
0411
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks
 *
 * input:
 *	%rdi: ctx, CTX (subkey table at key_table, key length at key_length)
 *	%rsi: dst (32 bytes)
 *	%rdx: src (32 bytes)
 *	%rcx: bool xor (low byte non-zero: results are XORed into dst,
 *	      otherwise they are stored to dst)
 *
 * Three groups of six rounds are always executed; when key_length is not
 * 16 a fourth mixing layer and round group follow.
 *
 * Callee-saved registers: %rbx (RAB1) is saved on the stack because RXOR
 * is occupied by the xor flag; %r12 (RT1) is parked in RR12.
 */
SYM_FUNC_START(__camellia_enc_blk_2way)
	pushq	%rbx;

	movq	%r12, RR12;
	movq	%rcx, RXOR;
	movq	%rsi, RDST;		/* RIO/RT0 reuse %rsi, so keep dst aside */
	movq	%rdx, RIO;		/* src; %rdx becomes RCD1 in enc_inpack2 */

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl	$24, RT2d;		/* max: index of the final subkey */

	/*
	 * Compare the whole 32-bit field, exactly as the decrypt routines
	 * do, so both directions always agree on the number of rounds.
	 */
	cmpl	$16, key_length(CTX);
	je	.L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl	$32, RT2d;		/* max */

.L__enc2_done:
	testb	RXORbl, RXORbl;
	movq	RDST, RIO;		/* mov leaves the flags for jnz intact */
	jnz	.L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq	RR12, %r12;
	popq	%rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq	RR12, %r12;
	popq	%rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
0460
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst (32 bytes)
 *	%rdx: src (32 bytes)
 *
 * max (RT2) is 24 when key_length equals 16 and 32 otherwise; it selects
 * the subkey used on input and whether the extra round group that uses
 * subkeys 24..31 is run.
 *
 * No stack is used: once RXOR has served as the temporary for the
 * constant 24 it holds the callee-saved %rbx (RAB1), and %r12 (RT1) is
 * parked in RR12.  Both are restored before returning.
 */
SYM_FUNC_START(camellia_dec_blk_2way)
	/* Load both candidates first; mov does not disturb the flags. */
	movl	$24, RXORd;
	movl	$32, RT2d;
	cmpl	$16, key_length(CTX);
	cmovel	RXORd, RT2d;		/* max */

	movq	%rbx, RXOR;		/* RXOR is free again after the cmov */
	movq	%r12, RR12;
	movq	%rsi, RDST;		/* must precede the write to RIO (%rsi) */
	movq	%rdx, RIO;

	dec_inpack2(RT2);

	cmpb	$24, RT2bl;
	je	.L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq	RDST, RIO;

	dec_outunpack2();

	movq	RR12, %r12;
	movq	RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)