/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is an optimized implementation of AES-128/192/256 in CTR mode (and,
 * via the aes_xctr_* entry points below, XCTR mode). It requires support
 * for the Intel(R) AES-NI and AVX instruction set extensions.
 *
 * This work was inspired by the AES CTR mode optimization published in the
 * Intel Optimized IPSEC Cryptographic library. Additional information on it
 * can be found at:
 *    https://github.com/intel/intel-ipsec-mb
 */
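
/*
 * For orientation, a rough C-level sketch of what the routines below
 * compute (illustrative only; the helper names are hypothetical and not
 * part of any kernel API):
 *
 *	for (i = 0; i < nblocks; i++) {
 *		// CTR:  the 16-byte IV is a big-endian 128-bit counter;
 *		//       block i encrypts big_endian(counter + i).
 *		// XCTR: block i encrypts iv ^ le128(byte_ctr / 16 + 1 + i),
 *		//       a little-endian block number XORed into the IV.
 *		build_counter_block(block, iv, i);	// mode-specific
 *		aes_encrypt_block(round_keys, block, keystream);
 *		xor_16_bytes(&out[16 * i], &in[16 * i], keystream);
 *	}
 *
 * The code below does this eight blocks at a time ("by8"), keeping the
 * blocks and most of the round keys in XMM registers.
 */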

#include <linux/linkage.h>

#define VMOVDQ      vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
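
/*
 * Register usage: xdata0-xdata7 hold up to eight counter/keystream blocks
 * in flight (hence "by8"); xkey0, xkey4, xkey8 and xkey12 hold the round
 * keys that stay resident across the main loop; xkeyA and xkeyB are scratch
 * registers used to stream the remaining round keys and, at the end of
 * do_aes, to load the input blocks being XORed.
 */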
#define xdata0      %xmm0
#define xdata1      %xmm1
#define xdata2      %xmm2
#define xdata3      %xmm3
#define xdata4      %xmm4
#define xdata5      %xmm5
#define xdata6      %xmm6
#define xdata7      %xmm7
#define xcounter    %xmm8   // CTR mode only
#define xiv         %xmm8   // XCTR mode only
#define xbyteswap   %xmm9   // CTR mode only
#define xtmp        %xmm9   // XCTR mode only
#define xkey0       %xmm10
#define xkey4       %xmm11
#define xkey8       %xmm12
#define xkey12      %xmm13
#define xkeyA       %xmm14
#define xkeyB       %xmm15

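/*
 * Function arguments, per the x86_64 SysV calling convention: (in, iv,
 * keys, out, num_bytes[, byte_ctr]) arrive in %rdi, %rsi, %rdx, %rcx, %r8
 * and, for the XCTR entry points only, %r9.
 */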
#define p_in        %rdi
#define p_iv        %rsi
#define p_keys      %rdx
#define p_out       %rcx
#define num_bytes   %r8
#define counter     %r9     // XCTR mode only
#define tmp         %r10
#define DDQ_DATA    0
#define XDATA       1
#define KEY_128     1
#define KEY_192     2
#define KEY_256     3

.section .rodata
.align 16

byteswap_const:
    .octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
    .octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
    .octa 0x00000000000000010000000000000000
ddq_add_1:
    .octa 0x00000000000000000000000000000001
ddq_add_2:
    .octa 0x00000000000000000000000000000002
ddq_add_3:
    .octa 0x00000000000000000000000000000003
ddq_add_4:
    .octa 0x00000000000000000000000000000004
ddq_add_5:
    .octa 0x00000000000000000000000000000005
ddq_add_6:
    .octa 0x00000000000000000000000000000006
ddq_add_7:
    .octa 0x00000000000000000000000000000007
ddq_add_8:
    .octa 0x00000000000000000000000000000008
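
/*
 * byteswap_const is the vpshufb mask that reverses the byte order of a
 * 128-bit value, converting between the big-endian counter stored in the
 * CTR IV and the little-endian form used for the arithmetic below.
 * ddq_add_1 .. ddq_add_8 add a small constant to the low 64-bit lane only;
 * ddq_low_msk and ddq_high_add_1 detect a roll-over of that lane and
 * propagate the carry into the high lane.  In C terms (sketch only):
 *
 *	lo += n;
 *	if (lo == 0)		// low half rolled over to exactly zero
 *		hi += 1;	// carry into the high half
 */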

.text

/* generate a unique variable for ddq_add_x */

/* generate a unique variable for xmm register */
.macro setxdata n
    var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' */

.macro club name, id
.altmacro
    .if \name == XDATA
        setxdata %\id
    .endif
.noaltmacro
.endm
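
/*
 * With .altmacro in effect, "club XDATA, id" evaluates 'id' and points
 * var_xdata at the corresponding data register, so the unrolled .rept loops
 * below can address xdata0..xdata7 generically; e.g. "club XDATA, 3" leaves
 * var_xdata aliased to %xmm3.
 */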

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 * Encrypt 'num_in_par' counter blocks and XOR them with the data at
 * p_in/p_out; 'xctr' selects XCTR rather than CTR counter generation.
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len, xctr
    .set by, \b
    .set load_keys, \k
    .set klen, \key_len

    .if (load_keys)
        vmovdqa 0*16(p_keys), xkey0
    .endif

    .if \xctr
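        /*
         * XCTR: the block-number counter lives in a general purpose
         * register.  Build (counter + 1) .. (counter + by) as little-endian
         * 128-bit values in the data registers, then XOR each with the IV
         * to form the blocks to be encrypted.
         */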
        movq counter, xtmp
        .set i, 0
        .rept (by)
            club XDATA, i
            vpaddq  (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
            .set i, (i +1)
        .endr
        .set i, 0
        .rept (by)
            club    XDATA, i
            vpxor   xiv, var_xdata, var_xdata
            .set i, (i +1)
        .endr
    .else
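        /*
         * CTR: xcounter holds the next counter value, byte-swapped to
         * little-endian.  Block 0 is just the counter swapped back to
         * big-endian; blocks 1..by-1 add i to the low 64-bit lane and, if
         * that lane rolls over to zero, carry 1 into the high lane before
         * being byte-swapped back for encryption.
         */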
        vpshufb xbyteswap, xcounter, xdata0
        .set i, 1
        .rept (by - 1)
            club XDATA, i
            vpaddq  (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
            vptest  ddq_low_msk(%rip), var_xdata
            jnz 1f
            vpaddq  ddq_high_add_1(%rip), var_xdata, var_xdata
            vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
            1:
            vpshufb xbyteswap, var_xdata, var_xdata
            .set i, (i +1)
        .endr
    .endif

    vmovdqa 1*16(p_keys), xkeyA

    vpxor   xkey0, xdata0, xdata0
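
    /*
     * Advance the saved counter past the 'by' blocks being processed:
     * XCTR just bumps the block counter in a general purpose register,
     * CTR adds 'by' to xcounter with the same low-lane/high-lane carry
     * handling as above.
     */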
    .if \xctr
        add $by, counter
    .else
        vpaddq  (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
        vptest  ddq_low_msk(%rip), xcounter
        jnz 1f
        vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
        1:
    .endif

    .set i, 1
    .rept (by - 1)
        club XDATA, i
        vpxor   xkey0, var_xdata, var_xdata
        .set i, (i +1)
    .endr

    vmovdqa 2*16(p_keys), xkeyB

    .set i, 0
    .rept by
        club XDATA, i
        vaesenc xkeyA, var_xdata, var_xdata     /* key 1 */
        .set i, (i +1)
    .endr

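    /*
     * From here on the remaining round keys are streamed through xkeyA and
     * xkeyB, typically one round ahead of use; rounds whose key is kept
     * resident (xkey4, xkey8, xkey12) skip the load when load_keys is
     * clear, and which rounds those are depends on the key length.
     */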
    .if (klen == KEY_128)
        .if (load_keys)
            vmovdqa 3*16(p_keys), xkey4
        .endif
    .else
        vmovdqa 3*16(p_keys), xkeyA
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        vaesenc xkeyB, var_xdata, var_xdata     /* key 2 */
        .set i, (i +1)
    .endr

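    /*
     * p_in is advanced here, in the middle of the rounds, so the input
     * loads at the end of the macro use negative offsets (i*16 - 16*by).
     */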
    add $(16*by), p_in

    .if (klen == KEY_128)
        vmovdqa 4*16(p_keys), xkeyB
    .else
        .if (load_keys)
            vmovdqa 4*16(p_keys), xkey4
        .endif
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        /* key 3 */
        .if (klen == KEY_128)
            vaesenc xkey4, var_xdata, var_xdata
        .else
            vaesenc xkeyA, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    vmovdqa 5*16(p_keys), xkeyA

    .set i, 0
    .rept by
        club XDATA, i
        /* key 4 */
        .if (klen == KEY_128)
            vaesenc xkeyB, var_xdata, var_xdata
        .else
            vaesenc xkey4, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    .if (klen == KEY_128)
        .if (load_keys)
            vmovdqa 6*16(p_keys), xkey8
        .endif
    .else
        vmovdqa 6*16(p_keys), xkeyB
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        vaesenc xkeyA, var_xdata, var_xdata     /* key 5 */
        .set i, (i +1)
    .endr

    vmovdqa 7*16(p_keys), xkeyA

    .set i, 0
    .rept by
        club XDATA, i
        /* key 6 */
        .if (klen == KEY_128)
            vaesenc xkey8, var_xdata, var_xdata
        .else
            vaesenc xkeyB, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    .if (klen == KEY_128)
        vmovdqa 8*16(p_keys), xkeyB
    .else
        .if (load_keys)
            vmovdqa 8*16(p_keys), xkey8
        .endif
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        vaesenc xkeyA, var_xdata, var_xdata     /* key 7 */
        .set i, (i +1)
    .endr

    .if (klen == KEY_128)
        .if (load_keys)
            vmovdqa 9*16(p_keys), xkey12
        .endif
    .else
        vmovdqa 9*16(p_keys), xkeyA
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        /* key 8 */
        .if (klen == KEY_128)
            vaesenc xkeyB, var_xdata, var_xdata
        .else
            vaesenc xkey8, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    vmovdqa 10*16(p_keys), xkeyB

    .set i, 0
    .rept by
        club XDATA, i
        /* key 9 */
        .if (klen == KEY_128)
            vaesenc xkey12, var_xdata, var_xdata
        .else
            vaesenc xkeyA, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    .if (klen != KEY_128)
        vmovdqa 11*16(p_keys), xkeyA
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        /* key 10 */
        .if (klen == KEY_128)
            vaesenclast xkeyB, var_xdata, var_xdata
        .else
            vaesenc xkeyB, var_xdata, var_xdata
        .endif
        .set i, (i +1)
    .endr

    .if (klen != KEY_128)
        .if (load_keys)
            vmovdqa 12*16(p_keys), xkey12
        .endif

        .set i, 0
        .rept by
            club XDATA, i
            vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
            .set i, (i +1)
        .endr

        .if (klen == KEY_256)
            vmovdqa 13*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
            club XDATA, i
            .if (klen == KEY_256)
                /* key 12 */
                vaesenc xkey12, var_xdata, var_xdata
            .else
                vaesenclast xkey12, var_xdata, var_xdata
            .endif
            .set i, (i +1)
        .endr

        .if (klen == KEY_256)
            vmovdqa 14*16(p_keys), xkeyB

            .set i, 0
            .rept by
                club XDATA, i
                /* key 13 */
                vaesenc xkeyA, var_xdata, var_xdata
                .set i, (i +1)
            .endr

            .set i, 0
            .rept by
                club XDATA, i
                /* key 14 */
                vaesenclast xkeyB, var_xdata, var_xdata
                .set i, (i +1)
            .endr
        .endif
    .endif

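    /*
     * XOR the encrypted counter blocks with the input and store the result.
     * xkeyA and xkeyB are reused as plain temporaries for the input data;
     * p_in has already been advanced, hence the negative offsets.  Blocks
     * are handled two at a time, with a single trailing block when 'by' is
     * odd.
     */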
    .set i, 0
    .rept (by / 2)
        .set j, (i+1)
        VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
        VMOVDQ  (j*16 - 16*by)(p_in), xkeyB
        club XDATA, i
        vpxor   xkeyA, var_xdata, var_xdata
        club XDATA, j
        vpxor   xkeyB, var_xdata, var_xdata
        .set i, (i+2)
    .endr

    .if (i < by)
        VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
        club XDATA, i
        vpxor   xkeyA, var_xdata, var_xdata
    .endif

    .set i, 0
    .rept by
        club XDATA, i
        VMOVDQ  var_xdata, i*16(p_out)
        .set i, (i+1)
    .endr
.endm

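/*
 * Wrappers around do_aes: the tail paths below use do_aes_load, which also
 * (re)loads the resident round keys, while the 8-blocks-at-a-time main loop
 * uses do_aes_noload and relies on keys already loaded, either by a
 * preceding do_aes_load or by the preload at .Lmult_of_8_blks.
 */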
.macro do_aes_load val, key_len, xctr
    do_aes \val, 1, \key_len, \xctr
.endm

.macro do_aes_noload val, key_len, xctr
    do_aes \val, 0, \key_len, \xctr
.endm

/* main body of the AES CTR/XCTR "by8" routines */

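/*
 * do_aes_ctrmain key_len, xctr
 *
 * Shared body of the six entry points below.  Inputs shorter than one
 * block fall straight through to the return path.  Otherwise the
 * IV/counter is loaded, any whole blocks left over beyond a multiple of
 * eight (num_bytes & (7*16)) are handled first by one of the .LeqN tail
 * paths with do_aes_load, and the rest is processed eight blocks per
 * iteration in .Lmain_loop2 with the round keys kept resident in XMM
 * registers.
 */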
.macro do_aes_ctrmain key_len, xctr
    cmp $16, num_bytes
    jb  .Ldo_return2\xctr\key_len

    .if \xctr
        shr $4, counter
        vmovdqu (p_iv), xiv
    .else
        vmovdqa byteswap_const(%rip), xbyteswap
        vmovdqu (p_iv), xcounter
        vpshufb xbyteswap, xcounter, xcounter
    .endif

    mov num_bytes, tmp
    and $(7*16), tmp
    jz  .Lmult_of_8_blks\xctr\key_len

    /* 1 to 7 whole blocks remain beyond a multiple of eight: tmp = 16..112 */
    cmp $(4*16), tmp
    jg  .Lgt4\xctr\key_len
    je  .Leq4\xctr\key_len

.Llt4\xctr\key_len:
    cmp $(2*16), tmp
    jg  .Leq3\xctr\key_len
    je  .Leq2\xctr\key_len

.Leq1\xctr\key_len:
    do_aes_load 1, \key_len, \xctr
    add $(1*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
    do_aes_load 2, \key_len, \xctr
    add $(2*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
    do_aes_load 3, \key_len, \xctr
    add $(3*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
    do_aes_load 4, \key_len, \xctr
    add $(4*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
    cmp $(6*16), tmp
    jg  .Leq7\xctr\key_len
    je  .Leq6\xctr\key_len

.Leq5\xctr\key_len:
    do_aes_load 5, \key_len, \xctr
    add $(5*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
    do_aes_load 6, \key_len, \xctr
    add $(6*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
    do_aes_load 7, \key_len, \xctr
    add $(7*16), p_out
    and $(~7*16), num_bytes
    jz  .Ldo_return2\xctr\key_len
    jmp .Lmain_loop2\xctr\key_len

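/*
 * Reached when num_bytes has no 1-7 block remainder: preload the four round
 * keys that stay resident across the main loop.  For AES-128 the resident
 * keys are rounds 0, 3, 6 and 9; for AES-192/256 they are rounds 0, 4, 8
 * and 12, matching the xkey4/xkey8/xkey12 names.
 */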
.Lmult_of_8_blks\xctr\key_len:
    .if (\key_len != KEY_128)
        vmovdqa 0*16(p_keys), xkey0
        vmovdqa 4*16(p_keys), xkey4
        vmovdqa 8*16(p_keys), xkey8
        vmovdqa 12*16(p_keys), xkey12
    .else
        vmovdqa 0*16(p_keys), xkey0
        vmovdqa 3*16(p_keys), xkey4
        vmovdqa 6*16(p_keys), xkey8
        vmovdqa 9*16(p_keys), xkey12
    .endif
.align 16
.Lmain_loop2\xctr\key_len:
    /* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) here */
    do_aes_noload   8, \key_len, \xctr
    add $(8*16), p_out
    sub $(8*16), num_bytes
    jne .Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
    .if !\xctr
        /* return updated IV */
        vpshufb xbyteswap, xcounter, xcounter
        vmovdqu xcounter, (p_iv)
    .endif
    RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *          unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *          unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *          unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *  u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *  u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *  u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
    /* call the aes main loop */
    do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
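
/*
 * For reference, a hedged sketch of how C glue code might declare and call
 * one of these entry points (the real declarations and call sites live in
 * the glue code elsewhere in the kernel; variable names here are purely
 * illustrative):
 *
 *	asmlinkage void aes_ctr_enc_256_avx_by8(void *in, void *iv,
 *				void *keys, void *out, unsigned int num_bytes);
 *
 *	kernel_fpu_begin();
 *	aes_ctr_enc_256_avx_by8(src, iv, ctx->key_enc, dst, nbytes);
 *	kernel_fpu_end();
 *
 * Since all XMM registers are clobbered, the caller is responsible for the
 * FPU section (kernel_fpu_begin/kernel_fpu_end), as the comments above note.
 */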