/*
 * SM4 Cipher Algorithm, using ARMv8 NEON
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros */

#define PREPARE						\
	adr_l	x5, crypto_sm4_sbox;			\
	ld1	{v16.16b-v19.16b}, [x5], #64;		\
	ld1	{v20.16b-v23.16b}, [x5], #64;		\
	ld1	{v24.16b-v27.16b}, [x5], #64;		\
	ld1	{v28.16b-v31.16b}, [x5];
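
/*
 * PREPARE pins the whole 256-byte crypto_sm4_sbox table in v16-v31 so
 * the round macros can substitute 16 bytes at a time: tbl resolves
 * indices 0..63 against the first 64-byte quarter of the table, and
 * each following tbx step handles the next quarter after the indices
 * are rebased by 64 (out-of-range lanes are left untouched).
 */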

#define transpose_4x4(s0, s1, s2, s3)			\
	zip1	RTMP0.4s, s0.4s, s1.4s;			\
	zip1	RTMP1.4s, s2.4s, s3.4s;			\
	zip2	RTMP2.4s, s0.4s, s1.4s;			\
	zip2	RTMP3.4s, s2.4s, s3.4s;			\
	zip1	s0.2d, RTMP0.2d, RTMP1.2d;		\
	zip2	s1.2d, RTMP0.2d, RTMP1.2d;		\
	zip1	s2.2d, RTMP2.2d, RTMP3.2d;		\
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;

#define rotate_clockwise_90(s0, s1, s2, s3)		\
	zip1	RTMP0.4s, s1.4s, s0.4s;			\
	zip2	RTMP1.4s, s1.4s, s0.4s;			\
	zip1	RTMP2.4s, s3.4s, s2.4s;			\
	zip2	RTMP3.4s, s3.4s, s2.4s;			\
	zip1	s0.2d, RTMP2.2d, RTMP0.2d;		\
	zip2	s1.2d, RTMP2.2d, RTMP0.2d;		\
	zip1	s2.2d, RTMP3.2d, RTMP1.2d;		\
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;
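
/*
 * transpose_4x4 moves the four blocks into word-sliced form, so that
 * vector bN holds word N of all four blocks and each ROUND macro can
 * process one SM4 word of four blocks at once.  rotate_clockwise_90
 * converts back after the 32 rounds, with the reversed zip operand
 * order also performing the final word reversal the SM4 specification
 * requires.
 */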

#define ROUND4(round, s0, s1, s2, s3)			\
	dup	RX0.4s, RKEY.s[round];			\
							\
	/* rk ^ s1 ^ s2 ^ s3 */				\
	eor	RTMP1.16b, s2.16b, s3.16b;		\
	eor	RX0.16b, RX0.16b, s1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP1.16b;		\
							\
	/* sbox, non-linear part */			\
	movi	RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
							\
	/* linear part */				\
	shl	RTMP1.4s, RTMP0.4s, #8;			\
	shl	RTMP2.4s, RTMP0.4s, #16;		\
	shl	RTMP3.4s, RTMP0.4s, #24;		\
	sri	RTMP1.4s, RTMP0.4s, #(32-8);		\
	sri	RTMP2.4s, RTMP0.4s, #(32-16);		\
	sri	RTMP3.4s, RTMP0.4s, #(32-24);		\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
	eor	RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl	RTMP2.4s, RTMP1.4s, #2;			\
	sri	RTMP2.4s, RTMP1.4s, #(32-2);		\
	eor	RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */				\
	eor	s0.16b, s0.16b, RTMP3.16b;
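
/*
 * One ROUND4 invocation is a single SM4 round over four blocks in
 * parallel: s0 ^= L(Sbox(s1 ^ s2 ^ s3 ^ rk)), where the linear
 * transform L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^
 * rol32(x, 24) is realized by the shl/sri rotates and eors above.
 */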

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)			\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
							\
	transpose_4x4(b0, b1, b2, b3);			\
							\
	/* 8 iterations * 4 rounds = 32 SM4 rounds */	\
	mov	x6, #8;					\
4:							\
	ld1	{RKEY.4s}, [x0], #16;			\
	subs	x6, x6, #1;				\
							\
	ROUND4(0, b0, b1, b2, b3);			\
	ROUND4(1, b1, b2, b3, b0);			\
	ROUND4(2, b2, b3, b0, b1);			\
	ROUND4(3, b3, b0, b1, b2);			\
							\
	bne	4b;					\
							\
	rotate_clockwise_90(b0, b1, b2, b3);		\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
							\
	/* repoint to rkey: 32 keys * 4 bytes = 128 */	\
	sub	x0, x0, #128;

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)	\
	/* rk ^ s1 ^ s2 ^ s3 */				\
	dup	RX0.4s, RKEY.s[round];			\
	eor	RTMP0.16b, s2.16b, s3.16b;		\
	mov	RX1.16b, RX0.16b;			\
	eor	RTMP1.16b, t2.16b, t3.16b;		\
	eor	RX0.16b, RX0.16b, s1.16b;		\
	eor	RX1.16b, RX1.16b, t1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP0.16b;		\
	eor	RX1.16b, RX1.16b, RTMP1.16b;		\
							\
	/* sbox, non-linear part */			\
	movi	RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl	RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
							\
	/* linear part */				\
	shl	RX0.4s, RTMP0.4s, #8;			\
	shl	RX1.4s, RTMP1.4s, #8;			\
	shl	RTMP2.4s, RTMP0.4s, #16;		\
	shl	RTMP3.4s, RTMP1.4s, #16;		\
	sri	RX0.4s, RTMP0.4s, #(32 - 8);		\
	sri	RX1.4s, RTMP1.4s, #(32 - 8);		\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 16);		\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 16);		\
	/* RX0/RX1 = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RX0.16b, RX0.16b, RTMP0.16b;		\
	eor	RX1.16b, RX1.16b, RTMP1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP2.16b;		\
	eor	RX1.16b, RX1.16b, RTMP3.16b;		\
	/* RTMP0/1 = x ^ rol32(x, 24) ^ rol32(RX0/1, 2) */ \
	shl	RTMP2.4s, RTMP0.4s, #24;		\
	shl	RTMP3.4s, RTMP1.4s, #24;		\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 24);		\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 24);		\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl	RTMP2.4s, RX0.4s, #2;			\
	shl	RTMP3.4s, RX1.4s, #2;			\
	sri	RTMP2.4s, RX0.4s, #(32 - 2);		\
	sri	RTMP3.4s, RX1.4s, #(32 - 2);		\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
							\
	/* s0/t0 ^= RTMP0/1 */				\
	eor	s0.16b, s0.16b, RTMP0.16b;		\
	eor	t0.16b, t0.16b, RTMP1.16b;
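
/*
 * ROUND8 is ROUND4 interleaved over two independent groups of four
 * blocks (s0..s3 and t0..t3).  Alternating the two dependency chains
 * helps hide the latency of the tbl/tbx lookups and the rotate
 * sequences of each group.
 */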

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)	\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
	rev32	b4.16b, b4.16b;				\
	rev32	b5.16b, b5.16b;				\
	rev32	b6.16b, b6.16b;				\
	rev32	b7.16b, b7.16b;				\
							\
	transpose_4x4(b0, b1, b2, b3);			\
	transpose_4x4(b4, b5, b6, b7);			\
							\
	/* 8 iterations * 4 rounds = 32 SM4 rounds */	\
	mov	x6, #8;					\
8:							\
	ld1	{RKEY.4s}, [x0], #16;			\
	subs	x6, x6, #1;				\
							\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);	\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);	\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);	\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);	\
							\
	bne	8b;					\
							\
	rotate_clockwise_90(b0, b1, b2, b3);		\
	rotate_clockwise_90(b4, b5, b6, b7);		\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
	rev32	b4.16b, b4.16b;				\
	rev32	b5.16b, b5.16b;				\
	rev32	b6.16b, b6.16b;				\
	rev32	b7.16b, b7.16b;				\
							\
	/* repoint to rkey */				\
	sub	x0, x0, #128;


.align 3
SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..4)
	 */
	PREPARE;

	ld1	{v0.16b}, [x2], #16;
	/* pad missing blocks with copies of block 0 */
	mov	v1.16b, v0.16b;
	mov	v2.16b, v0.16b;
	mov	v3.16b, v0.16b;
	cmp	w3, #2;
	blt	.Lblk4_load_input_done;
	ld1	{v1.16b}, [x2], #16;
	beq	.Lblk4_load_input_done;
	ld1	{v2.16b}, [x2], #16;
	cmp	w3, #3;
	beq	.Lblk4_load_input_done;
	ld1	{v3.16b}, [x2];

.Lblk4_load_input_done:
	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	st1	{v0.16b}, [x1], #16;
	cmp	w3, #2;
	blt	.Lblk4_store_output_done;
	st1	{v1.16b}, [x1], #16;
	beq	.Lblk4_store_output_done;
	st1	{v2.16b}, [x1], #16;
	cmp	w3, #3;
	beq	.Lblk4_store_output_done;
	st1	{v3.16b}, [x1];

.Lblk4_store_output_done:
	ret;
SYM_FUNC_END(__sm4_neon_crypt_blk1_4)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk1_8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..8)
	 */
	cmp	w3, #5;
	blt	__sm4_neon_crypt_blk1_4;

	PREPARE;

	ld1	{v0.16b-v3.16b}, [x2], #64;
	ld1	{v4.16b}, [x2], #16;
	/* pad missing blocks with copies of block 4 */
	mov	v5.16b, v4.16b;
	mov	v6.16b, v4.16b;
	mov	v7.16b, v4.16b;
	beq	.Lblk8_load_input_done;
	ld1	{v5.16b}, [x2], #16;
	cmp	w3, #7;
	blt	.Lblk8_load_input_done;
	ld1	{v6.16b}, [x2], #16;
	beq	.Lblk8_load_input_done;
	ld1	{v7.16b}, [x2];

.Lblk8_load_input_done:
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	cmp	w3, #6;
	st1	{v0.16b-v3.16b}, [x1], #64;
	st1	{v4.16b}, [x1], #16;
	blt	.Lblk8_store_output_done;
	st1	{v5.16b}, [x1], #16;
	beq	.Lblk8_store_output_done;
	st1	{v6.16b}, [x1], #16;
	cmp	w3, #7;
	beq	.Lblk8_store_output_done;
	st1	{v7.16b}, [x1];

.Lblk8_store_output_done:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk1_8)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks (multiples of 8)
	 */
	PREPARE;

.Lcrypt_loop_blk:
	subs	w3, w3, #8;
	bmi	.Lcrypt_end;

	ld1	{v0.16b-v3.16b}, [x2], #64;
	ld1	{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1	{v0.16b-v3.16b}, [x1], #64;
	st1	{v4.16b-v7.16b}, [x1], #64;

	b	.Lcrypt_loop_blk;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk8)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ld1	{RIV.16b}, [x3];

.Lcbc_loop_blk:
	subs	w4, w4, #8;
	bmi	.Lcbc_end;

	ld1	{v0.16b-v3.16b}, [x2], #64;
	ld1	{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	/* reload the ciphertext as the CBC chain values */
	sub	x2, x2, #64;
	eor	v0.16b, v0.16b, RIV.16b;
	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v1.16b, v1.16b, RTMP0.16b;
	eor	v2.16b, v2.16b, RTMP1.16b;
	eor	v3.16b, v3.16b, RTMP2.16b;
	st1	{v0.16b-v3.16b}, [x1], #64;

	eor	v4.16b, v4.16b, RTMP3.16b;
	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v5.16b, v5.16b, RTMP0.16b;
	eor	v6.16b, v6.16b, RTMP1.16b;
	eor	v7.16b, v7.16b, RTMP2.16b;

	/* the last ciphertext block becomes the next IV */
	mov	RIV.16b, RTMP3.16b;
	st1	{v4.16b-v7.16b}, [x1], #64;

	b	.Lcbc_loop_blk;

.Lcbc_end:
	/* store new IV */
	st1	{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cbc_dec_blk8)

.align 3
SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	/* v0 holds the IV, followed by ciphertext blocks 0..6 */
	ld1	{v0.16b}, [x3];

.Lcfb_loop_blk:
	subs	w4, w4, #8;
	bmi	.Lcfb_end;

	ld1	{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1	{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub	x2, x2, #48;
	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v0.16b, v0.16b, RTMP0.16b;
	eor	v1.16b, v1.16b, RTMP1.16b;
	eor	v2.16b, v2.16b, RTMP2.16b;
	eor	v3.16b, v3.16b, RTMP3.16b;
	st1	{v0.16b-v3.16b}, [x1], #64;

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v4.16b, v4.16b, RTMP0.16b;
	eor	v5.16b, v5.16b, RTMP1.16b;
	eor	v6.16b, v6.16b, RTMP2.16b;
	eor	v7.16b, v7.16b, RTMP3.16b;
	st1	{v4.16b-v7.16b}, [x1], #64;

	/* the last ciphertext block becomes the next IV */
	mov	v0.16b, RTMP3.16b;

	b	.Lcfb_loop_blk;

.Lcfb_end:
	/* store new IV */
	st1	{v0.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cfb_dec_blk8)

.align 3
SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ldp	x7, x8, [x3];
	rev	x7, x7;
	rev	x8, x8;

.Lctr_loop_blk:
	subs	w4, w4, #8;
	bmi	.Lctr_end;

#define inc_le128(vctr)				\
	mov	vctr.d[1], x8;			\
	mov	vctr.d[0], x7;			\
	adds	x8, x8, #1;			\
	adc	x7, x7, xzr;			\
	rev64	vctr.16b, vctr.16b;
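
/*
 * inc_le128 materializes the current counter into a vector and then
 * advances it.  The 128-bit big-endian counter lives byte-swapped to
 * native order in x7 (high half) and x8 (low half): adds/adc propagate
 * the increment's carry from x8 into x7, and rev64 byte-swaps both
 * 64-bit lanes so the vector again holds the counter in big-endian
 * block order.
 */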

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v0.16b, v0.16b, RTMP0.16b;
	eor	v1.16b, v1.16b, RTMP1.16b;
	eor	v2.16b, v2.16b, RTMP2.16b;
	eor	v3.16b, v3.16b, RTMP3.16b;
	st1	{v0.16b-v3.16b}, [x1], #64;

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor	v4.16b, v4.16b, RTMP0.16b;
	eor	v5.16b, v5.16b, RTMP1.16b;
	eor	v6.16b, v6.16b, RTMP2.16b;
	eor	v7.16b, v7.16b, RTMP3.16b;
	st1	{v4.16b-v7.16b}, [x1], #64;

	b	.Lctr_loop_blk;

.Lctr_end:
	/* store new CTR */
	rev	x7, x7;
	rev	x8, x8;
	stp	x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_neon_ctr_enc_blk8)