x86/crypto/aesni-intel_asm.S

0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Implement AES algorithm in Intel AES-NI instructions.
0004  *
0005  * The white paper of AES-NI instructions can be downloaded from:
0006  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
0007  *
0008  * Copyright (C) 2008, Intel Corp.
0009  *    Author: Huang Ying <ying.huang@intel.com>
0010  *            Vinodh Gopal <vinodh.gopal@intel.com>
0011  *            Kahraman Akdemir
0012  *
0013  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
0014  * interface for 64-bit kernels.
0015  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
0016  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
0017  *             Adrian Hoban <adrian.hoban@intel.com>
0018  *             James Guilford (james.guilford@intel.com)
0019  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
0020  *             Tadeusz Struk (tadeusz.struk@intel.com)
0021  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
0022  *    Copyright (c) 2010, Intel Corporation.
0023  *
0024  * Ported x86_64 version to x86:
0025  *    Author: Mathias Krause <minipli@googlemail.com>
0026  */
0027
0028 #include <linux/linkage.h>
0029 #include <asm/frame.h>
0030 #include <asm/nospec-branch.h>
0031
0032 /*
0033  * The following macros are used to move an (un)aligned 16 byte value to/from
0034  * an XMM register.  This can done for either FP or integer values, for FP use
0035  * movaps (move aligned packed single) or integer use movdqa (move double quad
0036  * aligned).  It doesn't make a performance difference which instruction is used
0037  * since Nehalem (original Core i7) was released.  However, the movaps is a byte
0038  * shorter, so that is the one we'll use for now. (same for unaligned).
0039  */
0040 #define MOVADQ  movaps
0041 #define MOVUDQ  movups
0042
0043 #ifdef __x86_64__
0044
0045 # constants in mergeable sections, linker can reorder and merge
0046 .section    .rodata.cst16.POLY, "aM", @progbits, 16
0047 .align 16
0048 POLY:   .octa 0xC2000000000000000000000000000001
0049 .section    .rodata.cst16.TWOONE, "aM", @progbits, 16
0050 .align 16
0051 TWOONE: .octa 0x00000001000000000000000000000001
0052
0053 .section    .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
0054 .align 16
0055 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
0056 .section    .rodata.cst16.MASK1, "aM", @progbits, 16
0057 .align 16
0058 MASK1:      .octa 0x0000000000000000ffffffffffffffff
0059 .section    .rodata.cst16.MASK2, "aM", @progbits, 16
0060 .align 16
0061 MASK2:      .octa 0xffffffffffffffff0000000000000000
0062 .section    .rodata.cst16.ONE, "aM", @progbits, 16
0063 .align 16
0064 ONE:        .octa 0x00000000000000000000000000000001
0065 .section    .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
0066 .align 16
0067 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
0068 .section    .rodata.cst16.dec, "aM", @progbits, 16
0069 .align 16
0070 dec:        .octa 0x1
0071 .section    .rodata.cst16.enc, "aM", @progbits, 16
0072 .align 16
0073 enc:        .octa 0x2
0074
0075 # order of these constants should not change.
0076 # more specifically, ALL_F should follow SHIFT_MASK,
0077 # and zero should follow ALL_F
0078 .section    .rodata, "a", @progbits
0079 .align 16
0080 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
0081 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
0082             .octa 0x00000000000000000000000000000000
0083
0084 .text
0085
0086
0087 #define STACK_OFFSET    8*3
0088
0089 #define AadHash 16*0
0090 #define AadLen 16*1
0091 #define InLen (16*1)+8
0092 #define PBlockEncKey 16*2
0093 #define OrigIV 16*3
0094 #define CurCount 16*4
0095 #define PBlockLen 16*5
0096 #define HashKey     16*6    // store HashKey <<1 mod poly here
0097 #define HashKey_2   16*7    // store HashKey^2 <<1 mod poly here
0098 #define HashKey_3   16*8    // store HashKey^3 <<1 mod poly here
0099 #define HashKey_4   16*9    // store HashKey^4 <<1 mod poly here
0100 #define HashKey_k   16*10   // store XOR of High 64 bits and Low 64
0101                 // bits of  HashKey <<1 mod poly here
0102                 //(for Karatsuba purposes)
0103 #define HashKey_2_k 16*11   // store XOR of High 64 bits and Low 64
0104                 // bits of  HashKey^2 <<1 mod poly here
0105                 // (for Karatsuba purposes)
0106 #define HashKey_3_k 16*12   // store XOR of High 64 bits and Low 64
0107                 // bits of  HashKey^3 <<1 mod poly here
0108                 // (for Karatsuba purposes)
0109 #define HashKey_4_k 16*13   // store XOR of High 64 bits and Low 64
0110                 // bits of  HashKey^4 <<1 mod poly here
0111                 // (for Karatsuba purposes)
0112
0113 #define arg1 rdi
0114 #define arg2 rsi
0115 #define arg3 rdx
0116 #define arg4 rcx
0117 #define arg5 r8
0118 #define arg6 r9
0119 #define arg7 STACK_OFFSET+8(%rsp)
0120 #define arg8 STACK_OFFSET+16(%rsp)
0121 #define arg9 STACK_OFFSET+24(%rsp)
0122 #define arg10 STACK_OFFSET+32(%rsp)
0123 #define arg11 STACK_OFFSET+40(%rsp)
0124 #define keysize 2*15*16(%arg1)
0125 #endif
0126
0127
0128 #define STATE1  %xmm0
0129 #define STATE2  %xmm4
0130 #define STATE3  %xmm5
0131 #define STATE4  %xmm6
0132 #define STATE   STATE1
0133 #define IN1 %xmm1
0134 #define IN2 %xmm7
0135 #define IN3 %xmm8
0136 #define IN4 %xmm9
0137 #define IN  IN1
0138 #define KEY %xmm2
0139 #define IV  %xmm3
0140
0141 #define BSWAP_MASK %xmm10
0142 #define CTR %xmm11
0143 #define INC %xmm12
0144
0145 #define GF128MUL_MASK %xmm7
0146
0147 #ifdef __x86_64__
0148 #define AREG    %rax
0149 #define KEYP    %rdi
0150 #define OUTP    %rsi
0151 #define UKEYP   OUTP
0152 #define INP %rdx
0153 #define LEN %rcx
0154 #define IVP %r8
0155 #define KLEN    %r9d
0156 #define T1  %r10
0157 #define TKEYP   T1
0158 #define T2  %r11
0159 #define TCTR_LOW T2
0160 #else
0161 #define AREG    %eax
0162 #define KEYP    %edi
0163 #define OUTP    AREG
0164 #define UKEYP   OUTP
0165 #define INP %edx
0166 #define LEN %esi
0167 #define IVP %ebp
0168 #define KLEN    %ebx
0169 #define T1  %ecx
0170 #define TKEYP   T1
0171 #endif
0172
0173 .macro FUNC_SAVE
0174     push    %r12
0175     push    %r13
0176     push    %r14
0177 #
0178 # states of %xmm registers %xmm6:%xmm15 not saved
0179 # all %xmm registers are clobbered
0180 #
0181 .endm
0182
0183
0184 .macro FUNC_RESTORE
0185     pop %r14
0186     pop %r13
0187     pop %r12
0188 .endm
0189
0190 # Precompute hashkeys.
0191 # Input: Hash subkey.
0192 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
0193 # once per key.
0194 # clobbers r12, and tmp xmm registers.
0195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
0196     mov \SUBKEY, %r12
0197     movdqu  (%r12), \TMP3
0198     movdqa  SHUF_MASK(%rip), \TMP2
0199     pshufb  \TMP2, \TMP3
0200
0201     # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
0202
0203     movdqa  \TMP3, \TMP2
0204     psllq   $1, \TMP3
0205     psrlq   $63, \TMP2
0206     movdqa  \TMP2, \TMP1
0207     pslldq  $8, \TMP2
0208     psrldq  $8, \TMP1
0209     por \TMP2, \TMP3
0210
0211     # reduce HashKey<<1
0212
0213     pshufd  $0x24, \TMP1, \TMP2
0214     pcmpeqd TWOONE(%rip), \TMP2
0215     pand    POLY(%rip), \TMP2
0216     pxor    \TMP2, \TMP3
0217     movdqu  \TMP3, HashKey(%arg2)
0218
0219     movdqa     \TMP3, \TMP5
0220     pshufd     $78, \TMP3, \TMP1
0221     pxor       \TMP3, \TMP1
0222     movdqu     \TMP1, HashKey_k(%arg2)
0223
0224     GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
0225 # TMP5 = HashKey^2<<1 (mod poly)
0226     movdqu     \TMP5, HashKey_2(%arg2)
0227 # HashKey_2 = HashKey^2<<1 (mod poly)
0228     pshufd     $78, \TMP5, \TMP1
0229     pxor       \TMP5, \TMP1
0230     movdqu     \TMP1, HashKey_2_k(%arg2)
0231
0232     GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
0233 # TMP5 = HashKey^3<<1 (mod poly)
0234     movdqu     \TMP5, HashKey_3(%arg2)
0235     pshufd     $78, \TMP5, \TMP1
0236     pxor       \TMP5, \TMP1
0237     movdqu     \TMP1, HashKey_3_k(%arg2)
0238
0239     GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
0240 # TMP5 = HashKey^3<<1 (mod poly)
0241     movdqu     \TMP5, HashKey_4(%arg2)
0242     pshufd     $78, \TMP5, \TMP1
0243     pxor       \TMP5, \TMP1
0244     movdqu     \TMP1, HashKey_4_k(%arg2)
0245 .endm
0246
0247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
0248 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
0249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
0250     mov \AADLEN, %r11
0251     mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
0252     xor %r11d, %r11d
0253     mov %r11, InLen(%arg2) # ctx_data.in_length = 0
0254     mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
0255     mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
0256     mov \Iv, %rax
0257     movdqu (%rax), %xmm0
0258     movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
0259
0260     movdqa  SHUF_MASK(%rip), %xmm2
0261     pshufb %xmm2, %xmm0
0262     movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
0263
0264     PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
0265     movdqu HashKey(%arg2), %xmm13
0266
0267     CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
0268     %xmm4, %xmm5, %xmm6
0269 .endm
0270
0271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
0272 # struct has been initialized by GCM_INIT.
0273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
0274 # Clobbers rax, r10-r13, and xmm0-xmm15
0275 .macro GCM_ENC_DEC operation
0276     movdqu AadHash(%arg2), %xmm8
0277     movdqu HashKey(%arg2), %xmm13
0278     add %arg5, InLen(%arg2)
0279
0280     xor %r11d, %r11d # initialise the data pointer offset as zero
0281     PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
0282
0283     sub %r11, %arg5     # sub partial block data used
0284     mov %arg5, %r13     # save the number of bytes
0285
0286     and $-16, %r13      # %r13 = %r13 - (%r13 mod 16)
0287     mov %r13, %r12
0288     # Encrypt/Decrypt first few blocks
0289
0290     and $(3<<4), %r12
0291     jz  _initial_num_blocks_is_0_\@
0292     cmp $(2<<4), %r12
0293     jb  _initial_num_blocks_is_1_\@
0294     je  _initial_num_blocks_is_2_\@
0295 _initial_num_blocks_is_3_\@:
0296     INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
0298     sub $48, %r13
0299     jmp _initial_blocks_\@
0300 _initial_num_blocks_is_2_\@:
0301     INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
0303     sub $32, %r13
0304     jmp _initial_blocks_\@
0305 _initial_num_blocks_is_1_\@:
0306     INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
0308     sub $16, %r13
0309     jmp _initial_blocks_\@
0310 _initial_num_blocks_is_0_\@:
0311     INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
0313 _initial_blocks_\@:
0314
0315     # Main loop - Encrypt/Decrypt remaining blocks
0316
0317     test    %r13, %r13
0318     je  _zero_cipher_left_\@
0319     sub $64, %r13
0320     je  _four_cipher_left_\@
0321 _crypt_by_4_\@:
0322     GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
0323     %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
0324     %xmm7, %xmm8, enc
0325     add $64, %r11
0326     sub $64, %r13
0327     jne _crypt_by_4_\@
0328 _four_cipher_left_\@:
0329     GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
0330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
0331 _zero_cipher_left_\@:
0332     movdqu %xmm8, AadHash(%arg2)
0333     movdqu %xmm0, CurCount(%arg2)
0334
0335     mov %arg5, %r13
0336     and $15, %r13           # %r13 = arg5 (mod 16)
0337     je  _multiple_of_16_bytes_\@
0338
0339     mov %r13, PBlockLen(%arg2)
0340
0341     # Handle the last <16 Byte block separately
0342     paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
0343     movdqu %xmm0, CurCount(%arg2)
0344     movdqa SHUF_MASK(%rip), %xmm10
0345     pshufb %xmm10, %xmm0
0346
0347     ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
0348     movdqu %xmm0, PBlockEncKey(%arg2)
0349
0350     cmp $16, %arg5
0351     jge _large_enough_update_\@
0352
0353     lea (%arg4,%r11,1), %r10
0354     mov %r13, %r12
0355     READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
0356     jmp _data_read_\@
0357
0358 _large_enough_update_\@:
0359     sub $16, %r11
0360     add %r13, %r11
0361
0362     # receive the last <16 Byte block
0363     movdqu  (%arg4, %r11, 1), %xmm1
0364
0365     sub %r13, %r11
0366     add $16, %r11
0367
0368     lea SHIFT_MASK+16(%rip), %r12
0369     # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
0370     # (r13 is the number of bytes in plaintext mod 16)
0371     sub %r13, %r12
0372     # get the appropriate shuffle mask
0373     movdqu  (%r12), %xmm2
0374     # shift right 16-r13 bytes
0375     pshufb  %xmm2, %xmm1
0376
0377 _data_read_\@:
0378     lea ALL_F+16(%rip), %r12
0379     sub %r13, %r12
0380
0381 .ifc \operation, dec
0382     movdqa  %xmm1, %xmm2
0383 .endif
0384     pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
0385     movdqu  (%r12), %xmm1
0386     # get the appropriate mask to mask out top 16-r13 bytes of xmm0
0387     pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
0388 .ifc \operation, dec
0389     pand    %xmm1, %xmm2
0390     movdqa SHUF_MASK(%rip), %xmm10
0391     pshufb %xmm10 ,%xmm2
0392
0393     pxor %xmm2, %xmm8
0394 .else
0395     movdqa SHUF_MASK(%rip), %xmm10
0396     pshufb %xmm10,%xmm0
0397
0398     pxor    %xmm0, %xmm8
0399 .endif
0400
0401     movdqu %xmm8, AadHash(%arg2)
0402 .ifc \operation, enc
0403     # GHASH computation for the last <16 byte block
0404     movdqa SHUF_MASK(%rip), %xmm10
0405     # shuffle xmm0 back to output as ciphertext
0406     pshufb %xmm10, %xmm0
0407 .endif
0408
0409     # Output %r13 bytes
0410     movq %xmm0, %rax
0411     cmp $8, %r13
0412     jle _less_than_8_bytes_left_\@
0413     mov %rax, (%arg3 , %r11, 1)
0414     add $8, %r11
0415     psrldq $8, %xmm0
0416     movq %xmm0, %rax
0417     sub $8, %r13
0418 _less_than_8_bytes_left_\@:
0419     mov %al,  (%arg3, %r11, 1)
0420     add $1, %r11
0421     shr $8, %rax
0422     sub $1, %r13
0423     jne _less_than_8_bytes_left_\@
0424 _multiple_of_16_bytes_\@:
0425 .endm
0426
0427 # GCM_COMPLETE Finishes update of tag of last partial block
0428 # Output: Authorization Tag (AUTH_TAG)
0429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
0430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
0431     movdqu AadHash(%arg2), %xmm8
0432     movdqu HashKey(%arg2), %xmm13
0433
0434     mov PBlockLen(%arg2), %r12
0435
0436     test %r12, %r12
0437     je _partial_done\@
0438
0439     GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
0440
0441 _partial_done\@:
0442     mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
0443     shl $3, %r12          # convert into number of bits
0444     movd    %r12d, %xmm15         # len(A) in %xmm15
0445     mov InLen(%arg2), %r12
0446     shl     $3, %r12                  # len(C) in bits (*128)
0447     movq    %r12, %xmm1
0448
0449     pslldq  $8, %xmm15        # %xmm15 = len(A)||0x0000000000000000
0450     pxor    %xmm1, %xmm15         # %xmm15 = len(A)||len(C)
0451     pxor    %xmm15, %xmm8
0452     GHASH_MUL   %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
0453     # final GHASH computation
0454     movdqa SHUF_MASK(%rip), %xmm10
0455     pshufb %xmm10, %xmm8
0456
0457     movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
0458     ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
0459     pxor    %xmm8, %xmm0
0460 _return_T_\@:
0461     mov \AUTHTAG, %r10                     # %r10 = authTag
0462     mov \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
0463     cmp $16, %r11
0464     je  _T_16_\@
0465     cmp $8, %r11
0466     jl  _T_4_\@
0467 _T_8_\@:
0468     movq    %xmm0, %rax
0469     mov %rax, (%r10)
0470     add $8, %r10
0471     sub $8, %r11
0472     psrldq  $8, %xmm0
0473     test    %r11, %r11
0474     je  _return_T_done_\@
0475 _T_4_\@:
0476     movd    %xmm0, %eax
0477     mov %eax, (%r10)
0478     add $4, %r10
0479     sub $4, %r11
0480     psrldq  $4, %xmm0
0481     test    %r11, %r11
0482     je  _return_T_done_\@
0483 _T_123_\@:
0484     movd    %xmm0, %eax
0485     cmp $2, %r11
0486     jl  _T_1_\@
0487     mov %ax, (%r10)
0488     cmp $2, %r11
0489     je  _return_T_done_\@
0490     add $2, %r10
0491     sar $16, %eax
0492 _T_1_\@:
0493     mov %al, (%r10)
0494     jmp _return_T_done_\@
0495 _T_16_\@:
0496     movdqu  %xmm0, (%r10)
0497 _return_T_done_\@:
0498 .endm
0499
0500 #ifdef __x86_64__
0501 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
0502 *
0503 *
0504 * Input: A and B (128-bits each, bit-reflected)
0505 * Output: C = A*B*x mod poly, (i.e. >>1 )
0506 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
0507 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
0508 *
0509 */
0510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
0511     movdqa    \GH, \TMP1
0512     pshufd    $78, \GH, \TMP2
0513     pshufd    $78, \HK, \TMP3
0514     pxor      \GH, \TMP2            # TMP2 = a1+a0
0515     pxor      \HK, \TMP3            # TMP3 = b1+b0
0516     pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
0517     pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
0518     pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
0519     pxor      \GH, \TMP2
0520     pxor      \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
0521     movdqa    \TMP2, \TMP3
0522     pslldq    $8, \TMP3             # left shift TMP3 2 DWs
0523     psrldq    $8, \TMP2             # right shift TMP2 2 DWs
0524     pxor      \TMP3, \GH
0525     pxor      \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
0526
0527         # first phase of the reduction
0528
0529     movdqa    \GH, \TMP2
0530     movdqa    \GH, \TMP3
0531     movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
0532                     # in in order to perform
0533                     # independent shifts
0534     pslld     $31, \TMP2            # packed right shift <<31
0535     pslld     $30, \TMP3            # packed right shift <<30
0536     pslld     $25, \TMP4            # packed right shift <<25
0537     pxor      \TMP3, \TMP2          # xor the shifted versions
0538     pxor      \TMP4, \TMP2
0539     movdqa    \TMP2, \TMP5
0540     psrldq    $4, \TMP5             # right shift TMP5 1 DW
0541     pslldq    $12, \TMP2            # left shift TMP2 3 DWs
0542     pxor      \TMP2, \GH
0543
0544         # second phase of the reduction
0545
0546     movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
0547                     # in in order to perform
0548                     # independent shifts
0549     movdqa    \GH,\TMP3
0550     movdqa    \GH,\TMP4
0551     psrld     $1,\TMP2              # packed left shift >>1
0552     psrld     $2,\TMP3              # packed left shift >>2
0553     psrld     $7,\TMP4              # packed left shift >>7
0554     pxor      \TMP3,\TMP2       # xor the shifted versions
0555     pxor      \TMP4,\TMP2
0556     pxor      \TMP5, \TMP2
0557     pxor      \TMP2, \GH
0558     pxor      \TMP1, \GH            # result is in TMP1
0559 .endm
0560
0561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
0562 # where 0 < DLEN < 16
0563 # Clobbers %rax, DLEN and XMM1
0564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
0565         cmp $8, \DLEN
0566         jl _read_lt8_\@
0567         mov (\DPTR), %rax
0568         movq %rax, \XMMDst
0569         sub $8, \DLEN
0570         jz _done_read_partial_block_\@
0571     xor %eax, %eax
0572 _read_next_byte_\@:
0573         shl $8, %rax
0574         mov 7(\DPTR, \DLEN, 1), %al
0575         dec \DLEN
0576         jnz _read_next_byte_\@
0577         movq %rax, \XMM1
0578     pslldq $8, \XMM1
0579         por \XMM1, \XMMDst
0580     jmp _done_read_partial_block_\@
0581 _read_lt8_\@:
0582     xor %eax, %eax
0583 _read_next_byte_lt8_\@:
0584         shl $8, %rax
0585         mov -1(\DPTR, \DLEN, 1), %al
0586         dec \DLEN
0587         jnz _read_next_byte_lt8_\@
0588         movq %rax, \XMMDst
0589 _done_read_partial_block_\@:
0590 .endm
0591
0592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
0593 # clobbers r10-11, xmm14
0594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
0595     TMP6 TMP7
0596     MOVADQ     SHUF_MASK(%rip), %xmm14
0597     mov    \AAD, %r10       # %r10 = AAD
0598     mov    \AADLEN, %r11        # %r11 = aadLen
0599     pxor       \TMP7, \TMP7
0600     pxor       \TMP6, \TMP6
0601
0602     cmp    $16, %r11
0603     jl     _get_AAD_rest\@
0604 _get_AAD_blocks\@:
0605     movdqu     (%r10), \TMP7
0606     pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
0607     pxor       \TMP7, \TMP6
0608     GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
0609     add    $16, %r10
0610     sub    $16, %r11
0611     cmp    $16, %r11
0612     jge    _get_AAD_blocks\@
0613
0614     movdqu     \TMP6, \TMP7
0615
0616     /* read the last <16B of AAD */
0617 _get_AAD_rest\@:
0618     test       %r11, %r11
0619     je     _get_AAD_done\@
0620
0621     READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
0622     pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
0623     pxor       \TMP6, \TMP7
0624     GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
0625     movdqu \TMP7, \TMP6
0626
0627 _get_AAD_done\@:
0628     movdqu \TMP6, AadHash(%arg2)
0629 .endm
0630
0631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
0632 # between update calls.
0633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
0634 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
0635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
0636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
0637     AAD_HASH operation
0638     mov     PBlockLen(%arg2), %r13
0639     test    %r13, %r13
0640     je  _partial_block_done_\@  # Leave Macro if no partial blocks
0641     # Read in input data without over reading
0642     cmp $16, \PLAIN_CYPH_LEN
0643     jl  _fewer_than_16_bytes_\@
0644     movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
0645     jmp _data_read_\@
0646
0647 _fewer_than_16_bytes_\@:
0648     lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
0649     mov \PLAIN_CYPH_LEN, %r12
0650     READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
0651
0652     mov PBlockLen(%arg2), %r13
0653
0654 _data_read_\@:              # Finished reading in data
0655
0656     movdqu  PBlockEncKey(%arg2), %xmm9
0657     movdqu  HashKey(%arg2), %xmm13
0658
0659     lea SHIFT_MASK(%rip), %r12
0660
0661     # adjust the shuffle mask pointer to be able to shift r13 bytes
0662     # r16-r13 is the number of bytes in plaintext mod 16)
0663     add %r13, %r12
0664     movdqu  (%r12), %xmm2       # get the appropriate shuffle mask
0665     pshufb  %xmm2, %xmm9        # shift right r13 bytes
0666
0667 .ifc \operation, dec
0668     movdqa  %xmm1, %xmm3
0669     pxor    %xmm1, %xmm9        # Cyphertext XOR E(K, Yn)
0670
0671     mov \PLAIN_CYPH_LEN, %r10
0672     add %r13, %r10
0673     # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0674     sub $16, %r10
0675     # Determine if if partial block is not being filled and
0676     # shift mask accordingly
0677     jge _no_extra_mask_1_\@
0678     sub %r10, %r12
0679 _no_extra_mask_1_\@:
0680
0681     movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
0682     # get the appropriate mask to mask out bottom r13 bytes of xmm9
0683     pand    %xmm1, %xmm9        # mask out bottom r13 bytes of xmm9
0684
0685     pand    %xmm1, %xmm3
0686     movdqa  SHUF_MASK(%rip), %xmm10
0687     pshufb  %xmm10, %xmm3
0688     pshufb  %xmm2, %xmm3
0689     pxor    %xmm3, \AAD_HASH
0690
0691     test    %r10, %r10
0692     jl  _partial_incomplete_1_\@
0693
0694     # GHASH computation for the last <16 Byte block
0695     GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0696     xor %eax, %eax
0697
0698     mov %rax, PBlockLen(%arg2)
0699     jmp _dec_done_\@
0700 _partial_incomplete_1_\@:
0701     add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
0702 _dec_done_\@:
0703     movdqu  \AAD_HASH, AadHash(%arg2)
0704 .else
0705     pxor    %xmm1, %xmm9            # Plaintext XOR E(K, Yn)
0706
0707     mov \PLAIN_CYPH_LEN, %r10
0708     add %r13, %r10
0709     # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0710     sub $16, %r10
0711     # Determine if if partial block is not being filled and
0712     # shift mask accordingly
0713     jge _no_extra_mask_2_\@
0714     sub %r10, %r12
0715 _no_extra_mask_2_\@:
0716
0717     movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
0718     # get the appropriate mask to mask out bottom r13 bytes of xmm9
0719     pand    %xmm1, %xmm9
0720
0721     movdqa  SHUF_MASK(%rip), %xmm1
0722     pshufb  %xmm1, %xmm9
0723     pshufb  %xmm2, %xmm9
0724     pxor    %xmm9, \AAD_HASH
0725
0726     test    %r10, %r10
0727     jl  _partial_incomplete_2_\@
0728
0729     # GHASH computation for the last <16 Byte block
0730     GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0731     xor %eax, %eax
0732
0733     mov %rax, PBlockLen(%arg2)
0734     jmp _encode_done_\@
0735 _partial_incomplete_2_\@:
0736     add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
0737 _encode_done_\@:
0738     movdqu  \AAD_HASH, AadHash(%arg2)
0739
0740     movdqa  SHUF_MASK(%rip), %xmm10
0741     # shuffle xmm9 back to output as ciphertext
0742     pshufb  %xmm10, %xmm9
0743     pshufb  %xmm2, %xmm9
0744 .endif
0745     # output encrypted Bytes
0746     test    %r10, %r10
0747     jl  _partial_fill_\@
0748     mov %r13, %r12
0749     mov $16, %r13
0750     # Set r13 to be the number of bytes to write out
0751     sub %r12, %r13
0752     jmp _count_set_\@
0753 _partial_fill_\@:
0754     mov \PLAIN_CYPH_LEN, %r13
0755 _count_set_\@:
0756     movdqa  %xmm9, %xmm0
0757     movq    %xmm0, %rax
0758     cmp $8, %r13
0759     jle _less_than_8_bytes_left_\@
0760
0761     mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0762     add $8, \DATA_OFFSET
0763     psrldq  $8, %xmm0
0764     movq    %xmm0, %rax
0765     sub $8, %r13
0766 _less_than_8_bytes_left_\@:
0767     movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0768     add $1, \DATA_OFFSET
0769     shr $8, %rax
0770     sub $1, %r13
0771     jne _less_than_8_bytes_left_\@
0772 _partial_block_done_\@:
0773 .endm # PARTIAL_BLOCK
0774
0775 /*
0776 * if a = number of total plaintext bytes
0777 * b = floor(a/16)
0778 * num_initial_blocks = b mod 4
0779 * encrypt the initial num_initial_blocks blocks and apply ghash on
0780 * the ciphertext
0781 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
0782 * are clobbered
0783 * arg1, %arg2, %arg3 are used as a pointer only, not modified
0784 */
0785
0786
0787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
0788     XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
0789     MOVADQ      SHUF_MASK(%rip), %xmm14
0790
0791     movdqu AadHash(%arg2), %xmm\i           # XMM0 = Y0
0792
0793     # start AES for num_initial_blocks blocks
0794
0795     movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
0796
0797 .if (\i == 5) || (\i == 6) || (\i == 7)
0798
0799     MOVADQ      ONE(%RIP),\TMP1
0800     MOVADQ      0(%arg1),\TMP2
0801 .irpc index, \i_seq
0802     paddd       \TMP1, \XMM0                 # INCR Y0
0803 .ifc \operation, dec
0804         movdqa     \XMM0, %xmm\index
0805 .else
0806     MOVADQ      \XMM0, %xmm\index
0807 .endif
0808     pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
0809     pxor        \TMP2, %xmm\index
0810 .endr
0811     lea 0x10(%arg1),%r10
0812     mov keysize,%eax
0813     shr $2,%eax             # 128->4, 192->6, 256->8
0814     add $5,%eax               # 128->9, 192->11, 256->13
0815
0816 aes_loop_initial_\@:
0817     MOVADQ  (%r10),\TMP1
0818 .irpc   index, \i_seq
0819     aesenc  \TMP1, %xmm\index
0820 .endr
0821     add $16,%r10
0822     sub $1,%eax
0823     jnz aes_loop_initial_\@
0824
0825     MOVADQ  (%r10), \TMP1
0826 .irpc index, \i_seq
0827     aesenclast \TMP1, %xmm\index         # Last Round
0828 .endr
0829 .irpc index, \i_seq
0830     movdqu     (%arg4 , %r11, 1), \TMP1
0831     pxor       \TMP1, %xmm\index
0832     movdqu     %xmm\index, (%arg3 , %r11, 1)
0833     # write back plaintext/ciphertext for num_initial_blocks
0834     add    $16, %r11
0835
0836 .ifc \operation, dec
0837     movdqa     \TMP1, %xmm\index
0838 .endif
0839     pshufb     %xmm14, %xmm\index
0840
0841         # prepare plaintext/ciphertext for GHASH computation
0842 .endr
0843 .endif
0844
0845         # apply GHASH on num_initial_blocks blocks
0846
0847 .if \i == 5
0848         pxor       %xmm5, %xmm6
0849     GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0850         pxor       %xmm6, %xmm7
0851     GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0852         pxor       %xmm7, %xmm8
0853     GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0854 .elseif \i == 6
0855         pxor       %xmm6, %xmm7
0856     GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0857         pxor       %xmm7, %xmm8
0858     GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0859 .elseif \i == 7
0860         pxor       %xmm7, %xmm8
0861     GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0862 .endif
0863     cmp    $64, %r13
0864     jl  _initial_blocks_done\@
0865     # no need for precomputed values
0866 /*
0867 *
0868 * Precomputations for HashKey parallel with encryption of first 4 blocks.
0869 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
0870 */
0871     MOVADQ     ONE(%RIP),\TMP1
0872     paddd      \TMP1, \XMM0              # INCR Y0
0873     MOVADQ     \XMM0, \XMM1
0874     pshufb  %xmm14, \XMM1        # perform a 16 byte swap
0875
0876     paddd      \TMP1, \XMM0              # INCR Y0
0877     MOVADQ     \XMM0, \XMM2
0878     pshufb  %xmm14, \XMM2        # perform a 16 byte swap
0879
0880     paddd      \TMP1, \XMM0              # INCR Y0
0881     MOVADQ     \XMM0, \XMM3
0882     pshufb %xmm14, \XMM3        # perform a 16 byte swap
0883
0884     paddd      \TMP1, \XMM0              # INCR Y0
0885     MOVADQ     \XMM0, \XMM4
0886     pshufb %xmm14, \XMM4        # perform a 16 byte swap
0887
0888     MOVADQ     0(%arg1),\TMP1
0889     pxor       \TMP1, \XMM1
0890     pxor       \TMP1, \XMM2
0891     pxor       \TMP1, \XMM3
0892     pxor       \TMP1, \XMM4
0893 .irpc index, 1234 # do 4 rounds
0894     movaps 0x10*\index(%arg1), \TMP1
0895     aesenc     \TMP1, \XMM1
0896     aesenc     \TMP1, \XMM2
0897     aesenc     \TMP1, \XMM3
0898     aesenc     \TMP1, \XMM4
0899 .endr
0900 .irpc index, 56789 # do next 5 rounds
0901     movaps 0x10*\index(%arg1), \TMP1
0902     aesenc     \TMP1, \XMM1
0903     aesenc     \TMP1, \XMM2
0904     aesenc     \TMP1, \XMM3
0905     aesenc     \TMP1, \XMM4
0906 .endr
0907     lea    0xa0(%arg1),%r10
0908     mov    keysize,%eax
0909     shr    $2,%eax          # 128->4, 192->6, 256->8
0910     sub    $4,%eax          # 128->0, 192->2, 256->4
0911     jz     aes_loop_pre_done\@
0912
0913 aes_loop_pre_\@:
0914     MOVADQ     (%r10),\TMP2
0915 .irpc   index, 1234
0916     aesenc     \TMP2, %xmm\index
0917 .endr
0918     add    $16,%r10
0919     sub    $1,%eax
0920     jnz    aes_loop_pre_\@
0921
0922 aes_loop_pre_done\@:
0923     MOVADQ     (%r10), \TMP2
0924     aesenclast \TMP2, \XMM1
0925     aesenclast \TMP2, \XMM2
0926     aesenclast \TMP2, \XMM3
0927     aesenclast \TMP2, \XMM4
0928     movdqu     16*0(%arg4 , %r11 , 1), \TMP1
0929     pxor       \TMP1, \XMM1
0930 .ifc \operation, dec
0931     movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
0932     movdqa     \TMP1, \XMM1
0933 .endif
0934     movdqu     16*1(%arg4 , %r11 , 1), \TMP1
0935     pxor       \TMP1, \XMM2
0936 .ifc \operation, dec
0937     movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
0938     movdqa     \TMP1, \XMM2
0939 .endif
0940     movdqu     16*2(%arg4 , %r11 , 1), \TMP1
0941     pxor       \TMP1, \XMM3
0942 .ifc \operation, dec
0943     movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
0944     movdqa     \TMP1, \XMM3
0945 .endif
0946     movdqu     16*3(%arg4 , %r11 , 1), \TMP1
0947     pxor       \TMP1, \XMM4
0948 .ifc \operation, dec
0949     movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
0950     movdqa     \TMP1, \XMM4
0951 .else
0952     movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
0953     movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
0954     movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
0955     movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
0956 .endif
0957
0958     add    $64, %r11
0959     pshufb %xmm14, \XMM1 # perform a 16 byte swap
0960     pxor       \XMMDst, \XMM1
0961 # combine GHASHed value with the corresponding ciphertext
0962     pshufb %xmm14, \XMM2 # perform a 16 byte swap
0963     pshufb %xmm14, \XMM3 # perform a 16 byte swap
0964     pshufb %xmm14, \XMM4 # perform a 16 byte swap
0965
0966 _initial_blocks_done\@:
0967
0968 .endm
0969
0970 /*
0971 * encrypt 4 blocks at a time
0972 * ghash the 4 previously encrypted ciphertext blocks
0973 * arg1, %arg3, %arg4 are used as pointers only, not modified
0974 * %r11 is the data offset value
0975 */
0976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
0977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
0978
0979     movdqa    \XMM1, \XMM5
0980     movdqa    \XMM2, \XMM6
0981     movdqa    \XMM3, \XMM7
0982     movdqa    \XMM4, \XMM8
0983
0984         movdqa    SHUF_MASK(%rip), %xmm15
0985         # multiply TMP5 * HashKey using karatsuba
0986
0987     movdqa    \XMM5, \TMP4
0988     pshufd    $78, \XMM5, \TMP6
0989     pxor      \XMM5, \TMP6
0990     paddd     ONE(%rip), \XMM0      # INCR CNT
0991     movdqu    HashKey_4(%arg2), \TMP5
0992     pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
0993     movdqa    \XMM0, \XMM1
0994     paddd     ONE(%rip), \XMM0      # INCR CNT
0995     movdqa    \XMM0, \XMM2
0996     paddd     ONE(%rip), \XMM0      # INCR CNT
0997     movdqa    \XMM0, \XMM3
0998     paddd     ONE(%rip), \XMM0      # INCR CNT
0999     movdqa    \XMM0, \XMM4
1000     pshufb %xmm15, \XMM1    # perform a 16 byte swap
1001     pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002     pshufb %xmm15, \XMM2    # perform a 16 byte swap
1003     pshufb %xmm15, \XMM3    # perform a 16 byte swap
1004     pshufb %xmm15, \XMM4    # perform a 16 byte swap
1005
1006     pxor      (%arg1), \XMM1
1007     pxor      (%arg1), \XMM2
1008     pxor      (%arg1), \XMM3
1009     pxor      (%arg1), \XMM4
1010     movdqu    HashKey_4_k(%arg2), \TMP5
1011     pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012     movaps 0x10(%arg1), \TMP1
1013     aesenc    \TMP1, \XMM1              # Round 1
1014     aesenc    \TMP1, \XMM2
1015     aesenc    \TMP1, \XMM3
1016     aesenc    \TMP1, \XMM4
1017     movaps 0x20(%arg1), \TMP1
1018     aesenc    \TMP1, \XMM1              # Round 2
1019     aesenc    \TMP1, \XMM2
1020     aesenc    \TMP1, \XMM3
1021     aesenc    \TMP1, \XMM4
1022     movdqa    \XMM6, \TMP1
1023     pshufd    $78, \XMM6, \TMP2
1024     pxor      \XMM6, \TMP2
1025     movdqu    HashKey_3(%arg2), \TMP5
1026     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027     movaps 0x30(%arg1), \TMP3
1028     aesenc    \TMP3, \XMM1              # Round 3
1029     aesenc    \TMP3, \XMM2
1030     aesenc    \TMP3, \XMM3
1031     aesenc    \TMP3, \XMM4
1032     pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033     movaps 0x40(%arg1), \TMP3
1034     aesenc    \TMP3, \XMM1              # Round 4
1035     aesenc    \TMP3, \XMM2
1036     aesenc    \TMP3, \XMM3
1037     aesenc    \TMP3, \XMM4
1038     movdqu    HashKey_3_k(%arg2), \TMP5
1039     pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040     movaps 0x50(%arg1), \TMP3
1041     aesenc    \TMP3, \XMM1              # Round 5
1042     aesenc    \TMP3, \XMM2
1043     aesenc    \TMP3, \XMM3
1044     aesenc    \TMP3, \XMM4
1045     pxor      \TMP1, \TMP4
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047     pxor      \XMM6, \XMM5
1048     pxor      \TMP2, \TMP6
1049     movdqa    \XMM7, \TMP1
1050     pshufd    $78, \XMM7, \TMP2
1051     pxor      \XMM7, \TMP2
1052     movdqu    HashKey_2(%arg2), \TMP5
1053
1054         # Multiply TMP5 * HashKey using karatsuba
1055
1056     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057     movaps 0x60(%arg1), \TMP3
1058     aesenc    \TMP3, \XMM1              # Round 6
1059     aesenc    \TMP3, \XMM2
1060     aesenc    \TMP3, \XMM3
1061     aesenc    \TMP3, \XMM4
1062     pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063     movaps 0x70(%arg1), \TMP3
1064     aesenc    \TMP3, \XMM1              # Round 7
1065     aesenc    \TMP3, \XMM2
1066     aesenc    \TMP3, \XMM3
1067     aesenc    \TMP3, \XMM4
1068     movdqu    HashKey_2_k(%arg2), \TMP5
1069     pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070     movaps 0x80(%arg1), \TMP3
1071     aesenc    \TMP3, \XMM1              # Round 8
1072     aesenc    \TMP3, \XMM2
1073     aesenc    \TMP3, \XMM3
1074     aesenc    \TMP3, \XMM4
1075     pxor      \TMP1, \TMP4
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077     pxor      \XMM7, \XMM5
1078     pxor      \TMP2, \TMP6
1079
1080         # Multiply XMM8 * HashKey
1081         # XMM8 and TMP5 hold the values for the two operands
1082
1083     movdqa    \XMM8, \TMP1
1084     pshufd    $78, \XMM8, \TMP2
1085     pxor      \XMM8, \TMP2
1086     movdqu    HashKey(%arg2), \TMP5
1087     pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088     movaps 0x90(%arg1), \TMP3
1089     aesenc    \TMP3, \XMM1             # Round 9
1090     aesenc    \TMP3, \XMM2
1091     aesenc    \TMP3, \XMM3
1092     aesenc    \TMP3, \XMM4
1093     pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094     lea   0xa0(%arg1),%r10
1095     mov   keysize,%eax
1096     shr   $2,%eax           # 128->4, 192->6, 256->8
1097     sub   $4,%eax           # 128->0, 192->2, 256->4
1098     jz    aes_loop_par_enc_done\@
1099
1100 aes_loop_par_enc\@:
1101     MOVADQ    (%r10),\TMP3
1102 .irpc   index, 1234
1103     aesenc    \TMP3, %xmm\index
1104 .endr
1105     add   $16,%r10
1106     sub   $1,%eax
1107     jnz   aes_loop_par_enc\@
1108
1109 aes_loop_par_enc_done\@:
1110     MOVADQ    (%r10), \TMP3
1111     aesenclast \TMP3, \XMM1           # Round 10
1112     aesenclast \TMP3, \XMM2
1113     aesenclast \TMP3, \XMM3
1114     aesenclast \TMP3, \XMM4
1115     movdqu    HashKey_k(%arg2), \TMP5
1116     pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117     movdqu    (%arg4,%r11,1), \TMP3
1118     pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119     movdqu    16(%arg4,%r11,1), \TMP3
1120     pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121     movdqu    32(%arg4,%r11,1), \TMP3
1122     pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123     movdqu    48(%arg4,%r11,1), \TMP3
1124     pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129     pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130     pshufb %xmm15, \XMM2    # perform a 16 byte swap
1131     pshufb %xmm15, \XMM3    # perform a 16 byte swap
1132     pshufb %xmm15, \XMM4    # perform a 16 byte swap
1133
1134     pxor      \TMP4, \TMP1
1135     pxor      \XMM8, \XMM5
1136     pxor      \TMP6, \TMP2
1137     pxor      \TMP1, \TMP2
1138     pxor      \XMM5, \TMP2
1139     movdqa    \TMP2, \TMP3
1140     pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1141     psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1142     pxor      \TMP3, \XMM5
1143     pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1144
1145         # first phase of reduction
1146
1147     movdqa    \XMM5, \TMP2
1148     movdqa    \XMM5, \TMP3
1149     movdqa    \XMM5, \TMP4
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151     pslld     $31, \TMP2                   # packed right shift << 31
1152     pslld     $30, \TMP3                   # packed right shift << 30
1153     pslld     $25, \TMP4                   # packed right shift << 25
1154     pxor      \TMP3, \TMP2                 # xor the shifted versions
1155     pxor      \TMP4, \TMP2
1156     movdqa    \TMP2, \TMP5
1157     psrldq    $4, \TMP5                    # right shift T5 1 DW
1158     pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159     pxor      \TMP2, \XMM5
1160
1161         # second phase of reduction
1162
1163     movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164     movdqa    \XMM5,\TMP3
1165     movdqa    \XMM5,\TMP4
1166     psrld     $1, \TMP2                    # packed left shift >>1
1167     psrld     $2, \TMP3                    # packed left shift >>2
1168     psrld     $7, \TMP4                    # packed left shift >>7
1169     pxor      \TMP3,\TMP2              # xor the shifted versions
1170     pxor      \TMP4,\TMP2
1171     pxor      \TMP5, \TMP2
1172     pxor      \TMP2, \XMM5
1173     pxor      \TMP1, \XMM5                 # result is in TMP1
1174
1175     pxor      \XMM5, \XMM1
1176 .endm
1177
1178 /*
1179 * decrypt 4 blocks at a time
1180 * ghash the 4 previously decrypted ciphertext blocks
1181 * arg1, %arg3, %arg4 are used as pointers only, not modified
1182 * %r11 is the data offset value
1183 */
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187     movdqa    \XMM1, \XMM5
1188     movdqa    \XMM2, \XMM6
1189     movdqa    \XMM3, \XMM7
1190     movdqa    \XMM4, \XMM8
1191
1192         movdqa    SHUF_MASK(%rip), %xmm15
1193         # multiply TMP5 * HashKey using karatsuba
1194
1195     movdqa    \XMM5, \TMP4
1196     pshufd    $78, \XMM5, \TMP6
1197     pxor      \XMM5, \TMP6
1198     paddd     ONE(%rip), \XMM0      # INCR CNT
1199     movdqu    HashKey_4(%arg2), \TMP5
1200     pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201     movdqa    \XMM0, \XMM1
1202     paddd     ONE(%rip), \XMM0      # INCR CNT
1203     movdqa    \XMM0, \XMM2
1204     paddd     ONE(%rip), \XMM0      # INCR CNT
1205     movdqa    \XMM0, \XMM3
1206     paddd     ONE(%rip), \XMM0      # INCR CNT
1207     movdqa    \XMM0, \XMM4
1208     pshufb %xmm15, \XMM1    # perform a 16 byte swap
1209     pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210     pshufb %xmm15, \XMM2    # perform a 16 byte swap
1211     pshufb %xmm15, \XMM3    # perform a 16 byte swap
1212     pshufb %xmm15, \XMM4    # perform a 16 byte swap
1213
1214     pxor      (%arg1), \XMM1
1215     pxor      (%arg1), \XMM2
1216     pxor      (%arg1), \XMM3
1217     pxor      (%arg1), \XMM4
1218     movdqu    HashKey_4_k(%arg2), \TMP5
1219     pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220     movaps 0x10(%arg1), \TMP1
1221     aesenc    \TMP1, \XMM1              # Round 1
1222     aesenc    \TMP1, \XMM2
1223     aesenc    \TMP1, \XMM3
1224     aesenc    \TMP1, \XMM4
1225     movaps 0x20(%arg1), \TMP1
1226     aesenc    \TMP1, \XMM1              # Round 2
1227     aesenc    \TMP1, \XMM2
1228     aesenc    \TMP1, \XMM3
1229     aesenc    \TMP1, \XMM4
1230     movdqa    \XMM6, \TMP1
1231     pshufd    $78, \XMM6, \TMP2
1232     pxor      \XMM6, \TMP2
1233     movdqu    HashKey_3(%arg2), \TMP5
1234     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235     movaps 0x30(%arg1), \TMP3
1236     aesenc    \TMP3, \XMM1              # Round 3
1237     aesenc    \TMP3, \XMM2
1238     aesenc    \TMP3, \XMM3
1239     aesenc    \TMP3, \XMM4
1240     pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241     movaps 0x40(%arg1), \TMP3
1242     aesenc    \TMP3, \XMM1              # Round 4
1243     aesenc    \TMP3, \XMM2
1244     aesenc    \TMP3, \XMM3
1245     aesenc    \TMP3, \XMM4
1246     movdqu    HashKey_3_k(%arg2), \TMP5
1247     pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248     movaps 0x50(%arg1), \TMP3
1249     aesenc    \TMP3, \XMM1              # Round 5
1250     aesenc    \TMP3, \XMM2
1251     aesenc    \TMP3, \XMM3
1252     aesenc    \TMP3, \XMM4
1253     pxor      \TMP1, \TMP4
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255     pxor      \XMM6, \XMM5
1256     pxor      \TMP2, \TMP6
1257     movdqa    \XMM7, \TMP1
1258     pshufd    $78, \XMM7, \TMP2
1259     pxor      \XMM7, \TMP2
1260     movdqu    HashKey_2(%arg2), \TMP5
1261
1262         # Multiply TMP5 * HashKey using karatsuba
1263
1264     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265     movaps 0x60(%arg1), \TMP3
1266     aesenc    \TMP3, \XMM1              # Round 6
1267     aesenc    \TMP3, \XMM2
1268     aesenc    \TMP3, \XMM3
1269     aesenc    \TMP3, \XMM4
1270     pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271     movaps 0x70(%arg1), \TMP3
1272     aesenc    \TMP3, \XMM1              # Round 7
1273     aesenc    \TMP3, \XMM2
1274     aesenc    \TMP3, \XMM3
1275     aesenc    \TMP3, \XMM4
1276     movdqu    HashKey_2_k(%arg2), \TMP5
1277     pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278     movaps 0x80(%arg1), \TMP3
1279     aesenc    \TMP3, \XMM1              # Round 8
1280     aesenc    \TMP3, \XMM2
1281     aesenc    \TMP3, \XMM3
1282     aesenc    \TMP3, \XMM4
1283     pxor      \TMP1, \TMP4
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285     pxor      \XMM7, \XMM5
1286     pxor      \TMP2, \TMP6
1287
1288         # Multiply XMM8 * HashKey
1289         # XMM8 and TMP5 hold the values for the two operands
1290
1291     movdqa    \XMM8, \TMP1
1292     pshufd    $78, \XMM8, \TMP2
1293     pxor      \XMM8, \TMP2
1294     movdqu    HashKey(%arg2), \TMP5
1295     pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296     movaps 0x90(%arg1), \TMP3
1297     aesenc    \TMP3, \XMM1             # Round 9
1298     aesenc    \TMP3, \XMM2
1299     aesenc    \TMP3, \XMM3
1300     aesenc    \TMP3, \XMM4
1301     pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302     lea   0xa0(%arg1),%r10
1303     mov   keysize,%eax
1304     shr   $2,%eax               # 128->4, 192->6, 256->8
1305     sub   $4,%eax           # 128->0, 192->2, 256->4
1306     jz    aes_loop_par_dec_done\@
1307
1308 aes_loop_par_dec\@:
1309     MOVADQ    (%r10),\TMP3
1310 .irpc   index, 1234
1311     aesenc    \TMP3, %xmm\index
1312 .endr
1313     add   $16,%r10
1314     sub   $1,%eax
1315     jnz   aes_loop_par_dec\@
1316
1317 aes_loop_par_dec_done\@:
1318     MOVADQ    (%r10), \TMP3
1319     aesenclast \TMP3, \XMM1           # last round
1320     aesenclast \TMP3, \XMM2
1321     aesenclast \TMP3, \XMM3
1322     aesenclast \TMP3, \XMM4
1323     movdqu    HashKey_k(%arg2), \TMP5
1324     pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325     movdqu    (%arg4,%r11,1), \TMP3
1326     pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327     movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328     movdqa    \TMP3, \XMM1
1329     movdqu    16(%arg4,%r11,1), \TMP3
1330     pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331     movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332     movdqa    \TMP3, \XMM2
1333     movdqu    32(%arg4,%r11,1), \TMP3
1334     pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335     movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336     movdqa    \TMP3, \XMM3
1337     movdqu    48(%arg4,%r11,1), \TMP3
1338     pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339     movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340     movdqa    \TMP3, \XMM4
1341     pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342     pshufb %xmm15, \XMM2    # perform a 16 byte swap
1343     pshufb %xmm15, \XMM3    # perform a 16 byte swap
1344     pshufb %xmm15, \XMM4    # perform a 16 byte swap
1345
1346     pxor      \TMP4, \TMP1
1347     pxor      \XMM8, \XMM5
1348     pxor      \TMP6, \TMP2
1349     pxor      \TMP1, \TMP2
1350     pxor      \XMM5, \TMP2
1351     movdqa    \TMP2, \TMP3
1352     pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1353     psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1354     pxor      \TMP3, \XMM5
1355     pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1356
1357         # first phase of reduction
1358
1359     movdqa    \XMM5, \TMP2
1360     movdqa    \XMM5, \TMP3
1361     movdqa    \XMM5, \TMP4
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363     pslld     $31, \TMP2                   # packed right shift << 31
1364     pslld     $30, \TMP3                   # packed right shift << 30
1365     pslld     $25, \TMP4                   # packed right shift << 25
1366     pxor      \TMP3, \TMP2                 # xor the shifted versions
1367     pxor      \TMP4, \TMP2
1368     movdqa    \TMP2, \TMP5
1369     psrldq    $4, \TMP5                    # right shift T5 1 DW
1370     pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371     pxor      \TMP2, \XMM5
1372
1373         # second phase of reduction
1374
1375     movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376     movdqa    \XMM5,\TMP3
1377     movdqa    \XMM5,\TMP4
1378     psrld     $1, \TMP2                    # packed left shift >>1
1379     psrld     $2, \TMP3                    # packed left shift >>2
1380     psrld     $7, \TMP4                    # packed left shift >>7
1381     pxor      \TMP3,\TMP2              # xor the shifted versions
1382     pxor      \TMP4,\TMP2
1383     pxor      \TMP5, \TMP2
1384     pxor      \TMP2, \XMM5
1385     pxor      \TMP1, \XMM5                 # result is in TMP1
1386
1387     pxor      \XMM5, \XMM1
1388 .endm
1389
1390 /* GHASH the last 4 ciphertext blocks. */
1391 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394         # Multiply TMP6 * HashKey (using Karatsuba)
1395
1396     movdqa    \XMM1, \TMP6
1397     pshufd    $78, \XMM1, \TMP2
1398     pxor      \XMM1, \TMP2
1399     movdqu    HashKey_4(%arg2), \TMP5
1400     pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401     pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402     movdqu    HashKey_4_k(%arg2), \TMP4
1403     pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404     movdqa    \XMM1, \XMMDst
1405     movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407         # Multiply TMP1 * HashKey (using Karatsuba)
1408
1409     movdqa    \XMM2, \TMP1
1410     pshufd    $78, \XMM2, \TMP2
1411     pxor      \XMM2, \TMP2
1412     movdqu    HashKey_3(%arg2), \TMP5
1413     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414     pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415     movdqu    HashKey_3_k(%arg2), \TMP4
1416     pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417     pxor      \TMP1, \TMP6
1418     pxor      \XMM2, \XMMDst
1419     pxor      \TMP2, \XMM1
1420 # results accumulated in TMP6, XMMDst, XMM1
1421
1422         # Multiply TMP1 * HashKey (using Karatsuba)
1423
1424     movdqa    \XMM3, \TMP1
1425     pshufd    $78, \XMM3, \TMP2
1426     pxor      \XMM3, \TMP2
1427     movdqu    HashKey_2(%arg2), \TMP5
1428     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429     pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430     movdqu    HashKey_2_k(%arg2), \TMP4
1431     pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432     pxor      \TMP1, \TMP6
1433     pxor      \XMM3, \XMMDst
1434     pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436         # Multiply TMP1 * HashKey (using Karatsuba)
1437     movdqa    \XMM4, \TMP1
1438     pshufd    $78, \XMM4, \TMP2
1439     pxor      \XMM4, \TMP2
1440     movdqu    HashKey(%arg2), \TMP5
1441     pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1442     pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443     movdqu    HashKey_k(%arg2), \TMP4
1444     pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445     pxor      \TMP1, \TMP6
1446     pxor      \XMM4, \XMMDst
1447     pxor      \XMM1, \TMP2
1448     pxor      \TMP6, \TMP2
1449     pxor      \XMMDst, \TMP2
1450     # middle section of the temp results combined as in karatsuba algorithm
1451     movdqa    \TMP2, \TMP4
1452     pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1453     psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1454     pxor      \TMP4, \XMMDst
1455     pxor      \TMP2, \TMP6
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457     # first phase of the reduction
1458     movdqa    \XMMDst, \TMP2
1459     movdqa    \XMMDst, \TMP3
1460     movdqa    \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462     pslld     $31, \TMP2                # packed right shifting << 31
1463     pslld     $30, \TMP3                # packed right shifting << 30
1464     pslld     $25, \TMP4                # packed right shifting << 25
1465     pxor      \TMP3, \TMP2              # xor the shifted versions
1466     pxor      \TMP4, \TMP2
1467     movdqa    \TMP2, \TMP7
1468     psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469     pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470     pxor      \TMP2, \XMMDst
1471
1472         # second phase of the reduction
1473     movdqa    \XMMDst, \TMP2
1474     # make 3 copies of XMMDst for doing 3 shift operations
1475     movdqa    \XMMDst, \TMP3
1476     movdqa    \XMMDst, \TMP4
1477     psrld     $1, \TMP2                 # packed left shift >> 1
1478     psrld     $2, \TMP3                 # packed left shift >> 2
1479     psrld     $7, \TMP4                 # packed left shift >> 7
1480     pxor      \TMP3, \TMP2              # xor the shifted versions
1481     pxor      \TMP4, \TMP2
1482     pxor      \TMP7, \TMP2
1483     pxor      \TMP2, \XMMDst
1484     pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485 .endm
1486
1487
1488 /* Encryption of a single block
1489 * uses eax & r10
1490 */
1491
1492 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494     pxor        (%arg1), \XMM0
1495     mov     keysize,%eax
1496     shr     $2,%eax         # 128->4, 192->6, 256->8
1497     add     $5,%eax         # 128->9, 192->11, 256->13
1498     lea     16(%arg1), %r10   # get first expanded key address
1499
1500 _esb_loop_\@:
1501     MOVADQ      (%r10),\TMP1
1502     aesenc      \TMP1,\XMM0
1503     add     $16,%r10
1504     sub     $1,%eax
1505     jnz     _esb_loop_\@
1506
1507     MOVADQ      (%r10),\TMP1
1508     aesenclast  \TMP1,\XMM0
1509 .endm
1510 /*****************************************************************************
1511 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512 *                   struct gcm_context_data *data
1513 *                                      // Context data
1514 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1515 *                   const u8 *in,      // Ciphertext input
1516 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1517 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1522 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524 *                                      // given authentication tag and only return the plaintext if they match.
1525 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526 *                                      // (most likely), 12 or 8.
1527 *
1528 * Assumptions:
1529 *
1530 * keys:
1531 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532 *       set of 11 keys in the data structure void *aes_ctx
1533 *
1534 * iv:
1535 *       0                   1                   2                   3
1536 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 *       |                             Salt  (From the SA)               |
1539 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540 *       |                     Initialization Vector                     |
1541 *       |         (This is the sequence number from IPSec header)       |
1542 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 *       |                              0x1                              |
1544 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 *
1546 *
1547 *
1548 * AAD:
1549 *       AAD padded to 128 bits with 0
1550 *       for example, assume AAD is a u32 vector
1551 *
1552 *       if AAD is 8 bytes:
1553 *       AAD[3] = {A0, A1};
1554 *       padded AAD in xmm register = {A1 A0 0 0}
1555 *
1556 *       0                   1                   2                   3
1557 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559 *       |                               SPI (A1)                        |
1560 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561 *       |                     32-bit Sequence Number (A0)               |
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                              0x0                              |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *
1566 *                                       AAD Format with 32-bit Sequence Number
1567 *
1568 *       if AAD is 12 bytes:
1569 *       AAD[3] = {A0, A1, A2};
1570 *       padded AAD in xmm register = {A2 A1 A0 0}
1571 *
1572 *       0                   1                   2                   3
1573 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577 *       |                               SPI (A2)                        |
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1580 *       |                                                               |
1581 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 *       |                              0x0                              |
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *
1585 *                        AAD Format with 64-bit Extended Sequence Number
1586 *
1587 * poly = x^128 + x^127 + x^126 + x^121 + 1
1588 *
1589 *****************************************************************************/
1590 SYM_FUNC_START(aesni_gcm_dec)
1591     FUNC_SAVE
1592
1593     GCM_INIT %arg6, arg7, arg8, arg9
1594     GCM_ENC_DEC dec
1595     GCM_COMPLETE arg10, arg11
1596     FUNC_RESTORE
1597     RET
1598 SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601 /*****************************************************************************
1602 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603 *                    struct gcm_context_data *data
1604 *                                        // Context data
1605 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606 *                    const u8 *in,       // Plaintext input
1607 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1613 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614 *                    u8 *auth_tag,       // Authenticated Tag output.
1615 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616 *                                        // 12 or 8.
1617 *
1618 * Assumptions:
1619 *
1620 * keys:
1621 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1622 *       first set of 11 keys in the data structure void *aes_ctx
1623 *
1624 *
1625 * iv:
1626 *       0                   1                   2                   3
1627 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629 *       |                             Salt  (From the SA)               |
1630 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 *       |                     Initialization Vector                     |
1632 *       |         (This is the sequence number from IPSec header)       |
1633 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 *       |                              0x1                              |
1635 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 *
1637 *
1638 *
1639 * AAD:
1640 *       AAD padded to 128 bits with 0
1641 *       for example, assume AAD is a u32 vector
1642 *
1643 *       if AAD is 8 bytes:
1644 *       AAD[3] = {A0, A1};
1645 *       padded AAD in xmm register = {A1 A0 0 0}
1646 *
1647 *       0                   1                   2                   3
1648 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650 *       |                               SPI (A1)                        |
1651 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652 *       |                     32-bit Sequence Number (A0)               |
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                              0x0                              |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *
1657 *                                 AAD Format with 32-bit Sequence Number
1658 *
1659 *       if AAD is 12 bytes:
1660 *       AAD[3] = {A0, A1, A2};
1661 *       padded AAD in xmm register = {A2 A1 A0 0}
1662 *
1663 *       0                   1                   2                   3
1664 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666 *       |                               SPI (A2)                        |
1667 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1669 *       |                                                               |
1670 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 *       |                              0x0                              |
1672 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 *
1674 *                         AAD Format with 64-bit Extended Sequence Number
1675 *
1676 * poly = x^128 + x^127 + x^126 + x^121 + 1
1677 ***************************************************************************/
1678 SYM_FUNC_START(aesni_gcm_enc)
1679     FUNC_SAVE
1680
1681     GCM_INIT %arg6, arg7, arg8, arg9
1682     GCM_ENC_DEC enc
1683
1684     GCM_COMPLETE arg10, arg11
1685     FUNC_RESTORE
1686     RET
1687 SYM_FUNC_END(aesni_gcm_enc)
1688
1689 /*****************************************************************************
1690 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691 *                     struct gcm_context_data *data,
1692 *                                         // context data
1693 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1698 *                     u64 aad_len)        // Length of AAD in bytes.
1699 */
1700 SYM_FUNC_START(aesni_gcm_init)
1701     FUNC_SAVE
1702     GCM_INIT %arg3, %arg4,%arg5, %arg6
1703     FUNC_RESTORE
1704     RET
1705 SYM_FUNC_END(aesni_gcm_init)
1706
1707 /*****************************************************************************
1708 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709 *                    struct gcm_context_data *data,
1710 *                                        // context data
1711 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712 *                    const u8 *in,       // Plaintext input
1713 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1714 */
1715 SYM_FUNC_START(aesni_gcm_enc_update)
1716     FUNC_SAVE
1717     GCM_ENC_DEC enc
1718     FUNC_RESTORE
1719     RET
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722 /*****************************************************************************
1723 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724 *                    struct gcm_context_data *data,
1725 *                                        // context data
1726 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1727 *                    const u8 *in,       // Plaintext input
1728 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1729 */
1730 SYM_FUNC_START(aesni_gcm_dec_update)
1731     FUNC_SAVE
1732     GCM_ENC_DEC dec
1733     FUNC_RESTORE
1734     RET
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737 /*****************************************************************************
1738 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739 *                    struct gcm_context_data *data,
1740 *                                        // context data
1741 *                    u8 *auth_tag,       // Authenticated Tag output.
1742 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743 *                                        // 12 or 8.
1744 */
1745 SYM_FUNC_START(aesni_gcm_finalize)
1746     FUNC_SAVE
1747     GCM_COMPLETE %arg3 %arg4
1748     FUNC_RESTORE
1749     RET
1750 SYM_FUNC_END(aesni_gcm_finalize)
1751
1752 #endif
1753
1754 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755     pshufd $0b11111111, %xmm1, %xmm1
1756     shufps $0b00010000, %xmm0, %xmm4
1757     pxor %xmm4, %xmm0
1758     shufps $0b10001100, %xmm0, %xmm4
1759     pxor %xmm4, %xmm0
1760     pxor %xmm1, %xmm0
1761     movaps %xmm0, (TKEYP)
1762     add $0x10, TKEYP
1763     RET
1764 SYM_FUNC_END(_key_expansion_256a)
1765 SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1768     pshufd $0b01010101, %xmm1, %xmm1
1769     shufps $0b00010000, %xmm0, %xmm4
1770     pxor %xmm4, %xmm0
1771     shufps $0b10001100, %xmm0, %xmm4
1772     pxor %xmm4, %xmm0
1773     pxor %xmm1, %xmm0
1774
1775     movaps %xmm2, %xmm5
1776     movaps %xmm2, %xmm6
1777     pslldq $4, %xmm5
1778     pshufd $0b11111111, %xmm0, %xmm3
1779     pxor %xmm3, %xmm2
1780     pxor %xmm5, %xmm2
1781
1782     movaps %xmm0, %xmm1
1783     shufps $0b01000100, %xmm0, %xmm6
1784     movaps %xmm6, (TKEYP)
1785     shufps $0b01001110, %xmm2, %xmm1
1786     movaps %xmm1, 0x10(TKEYP)
1787     add $0x20, TKEYP
1788     RET
1789 SYM_FUNC_END(_key_expansion_192a)
1790
1791 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1792     pshufd $0b01010101, %xmm1, %xmm1
1793     shufps $0b00010000, %xmm0, %xmm4
1794     pxor %xmm4, %xmm0
1795     shufps $0b10001100, %xmm0, %xmm4
1796     pxor %xmm4, %xmm0
1797     pxor %xmm1, %xmm0
1798
1799     movaps %xmm2, %xmm5
1800     pslldq $4, %xmm5
1801     pshufd $0b11111111, %xmm0, %xmm3
1802     pxor %xmm3, %xmm2
1803     pxor %xmm5, %xmm2
1804
1805     movaps %xmm0, (TKEYP)
1806     add $0x10, TKEYP
1807     RET
1808 SYM_FUNC_END(_key_expansion_192b)
1809
1810 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1811     pshufd $0b10101010, %xmm1, %xmm1
1812     shufps $0b00010000, %xmm2, %xmm4
1813     pxor %xmm4, %xmm2
1814     shufps $0b10001100, %xmm2, %xmm4
1815     pxor %xmm4, %xmm2
1816     pxor %xmm1, %xmm2
1817     movaps %xmm2, (TKEYP)
1818     add $0x10, TKEYP
1819     RET
1820 SYM_FUNC_END(_key_expansion_256b)
1821
1822 /*
1823  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824  *                   unsigned int key_len)
1825  */
1826 SYM_FUNC_START(aesni_set_key)
1827     FRAME_BEGIN
1828 #ifndef __x86_64__
1829     pushl KEYP
1830     movl (FRAME_OFFSET+8)(%esp), KEYP   # ctx
1831     movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1832     movl (FRAME_OFFSET+16)(%esp), %edx  # key_len
1833 #endif
1834     movups (UKEYP), %xmm0       # user key (first 16 bytes)
1835     movaps %xmm0, (KEYP)
1836     lea 0x10(KEYP), TKEYP       # key addr
1837     movl %edx, 480(KEYP)
1838     pxor %xmm4, %xmm4       # xmm4 is assumed 0 in _key_expansion_x
1839     cmp $24, %dl
1840     jb .Lenc_key128
1841     je .Lenc_key192
1842     movups 0x10(UKEYP), %xmm2   # other user key
1843     movaps %xmm2, (TKEYP)
1844     add $0x10, TKEYP
1845     aeskeygenassist $0x1, %xmm2, %xmm1  # round 1
1846     call _key_expansion_256a
1847     aeskeygenassist $0x1, %xmm0, %xmm1
1848     call _key_expansion_256b
1849     aeskeygenassist $0x2, %xmm2, %xmm1  # round 2
1850     call _key_expansion_256a
1851     aeskeygenassist $0x2, %xmm0, %xmm1
1852     call _key_expansion_256b
1853     aeskeygenassist $0x4, %xmm2, %xmm1  # round 3
1854     call _key_expansion_256a
1855     aeskeygenassist $0x4, %xmm0, %xmm1
1856     call _key_expansion_256b
1857     aeskeygenassist $0x8, %xmm2, %xmm1  # round 4
1858     call _key_expansion_256a
1859     aeskeygenassist $0x8, %xmm0, %xmm1
1860     call _key_expansion_256b
1861     aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1862     call _key_expansion_256a
1863     aeskeygenassist $0x10, %xmm0, %xmm1
1864     call _key_expansion_256b
1865     aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1866     call _key_expansion_256a
1867     aeskeygenassist $0x20, %xmm0, %xmm1
1868     call _key_expansion_256b
1869     aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1870     call _key_expansion_256a
1871     jmp .Ldec_key
1872 .Lenc_key192:
1873     movq 0x10(UKEYP), %xmm2     # other user key
1874     aeskeygenassist $0x1, %xmm2, %xmm1  # round 1
1875     call _key_expansion_192a
1876     aeskeygenassist $0x2, %xmm2, %xmm1  # round 2
1877     call _key_expansion_192b
1878     aeskeygenassist $0x4, %xmm2, %xmm1  # round 3
1879     call _key_expansion_192a
1880     aeskeygenassist $0x8, %xmm2, %xmm1  # round 4
1881     call _key_expansion_192b
1882     aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1883     call _key_expansion_192a
1884     aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1885     call _key_expansion_192b
1886     aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1887     call _key_expansion_192a
1888     aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
1889     call _key_expansion_192b
1890     jmp .Ldec_key
1891 .Lenc_key128:
1892     aeskeygenassist $0x1, %xmm0, %xmm1  # round 1
1893     call _key_expansion_128
1894     aeskeygenassist $0x2, %xmm0, %xmm1  # round 2
1895     call _key_expansion_128
1896     aeskeygenassist $0x4, %xmm0, %xmm1  # round 3
1897     call _key_expansion_128
1898     aeskeygenassist $0x8, %xmm0, %xmm1  # round 4
1899     call _key_expansion_128
1900     aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
1901     call _key_expansion_128
1902     aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
1903     call _key_expansion_128
1904     aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
1905     call _key_expansion_128
1906     aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
1907     call _key_expansion_128
1908     aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
1909     call _key_expansion_128
1910     aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
1911     call _key_expansion_128
1912 .Ldec_key:
1913     sub $0x10, TKEYP
1914     movaps (KEYP), %xmm0
1915     movaps (TKEYP), %xmm1
1916     movaps %xmm0, 240(TKEYP)
1917     movaps %xmm1, 240(KEYP)
1918     add $0x10, KEYP
1919     lea 240-16(TKEYP), UKEYP
1920 .align 4
1921 .Ldec_key_loop:
1922     movaps (KEYP), %xmm0
1923     aesimc %xmm0, %xmm1
1924     movaps %xmm1, (UKEYP)
1925     add $0x10, KEYP
1926     sub $0x10, UKEYP
1927     cmp TKEYP, KEYP
1928     jb .Ldec_key_loop
1929     xor AREG, AREG
1930 #ifndef __x86_64__
1931     popl KEYP
1932 #endif
1933     FRAME_END
1934     RET
1935 SYM_FUNC_END(aesni_set_key)
1936
1937 /*
1938  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1939  */
1940 SYM_FUNC_START(aesni_enc)
1941     FRAME_BEGIN
1942 #ifndef __x86_64__
1943     pushl KEYP
1944     pushl KLEN
1945     movl (FRAME_OFFSET+12)(%esp), KEYP  # ctx
1946     movl (FRAME_OFFSET+16)(%esp), OUTP  # dst
1947     movl (FRAME_OFFSET+20)(%esp), INP   # src
1948 #endif
1949     movl 480(KEYP), KLEN        # key length
1950     movups (INP), STATE     # input
1951     call _aesni_enc1
1952     movups STATE, (OUTP)        # output
1953 #ifndef __x86_64__
1954     popl KLEN
1955     popl KEYP
1956 #endif
1957     FRAME_END
1958     RET
1959 SYM_FUNC_END(aesni_enc)
1960
1961 /*
1962  * _aesni_enc1:     internal ABI
1963  * input:
1964  *  KEYP:       key struct pointer
1965  *  KLEN:       round count
1966  *  STATE:      initial state (input)
1967  * output:
1968  *  STATE:      finial state (output)
1969  * changed:
1970  *  KEY
1971  *  TKEYP (T1)
1972  */
1973 SYM_FUNC_START_LOCAL(_aesni_enc1)
1974     movaps (KEYP), KEY      # key
1975     mov KEYP, TKEYP
1976     pxor KEY, STATE     # round 0
1977     add $0x30, TKEYP
1978     cmp $24, KLEN
1979     jb .Lenc128
1980     lea 0x20(TKEYP), TKEYP
1981     je .Lenc192
1982     add $0x20, TKEYP
1983     movaps -0x60(TKEYP), KEY
1984     aesenc KEY, STATE
1985     movaps -0x50(TKEYP), KEY
1986     aesenc KEY, STATE
1987 .align 4
1988 .Lenc192:
1989     movaps -0x40(TKEYP), KEY
1990     aesenc KEY, STATE
1991     movaps -0x30(TKEYP), KEY
1992     aesenc KEY, STATE
1993 .align 4
1994 .Lenc128:
1995     movaps -0x20(TKEYP), KEY
1996     aesenc KEY, STATE
1997     movaps -0x10(TKEYP), KEY
1998     aesenc KEY, STATE
1999     movaps (TKEYP), KEY
2000     aesenc KEY, STATE
2001     movaps 0x10(TKEYP), KEY
2002     aesenc KEY, STATE
2003     movaps 0x20(TKEYP), KEY
2004     aesenc KEY, STATE
2005     movaps 0x30(TKEYP), KEY
2006     aesenc KEY, STATE
2007     movaps 0x40(TKEYP), KEY
2008     aesenc KEY, STATE
2009     movaps 0x50(TKEYP), KEY
2010     aesenc KEY, STATE
2011     movaps 0x60(TKEYP), KEY
2012     aesenc KEY, STATE
2013     movaps 0x70(TKEYP), KEY
2014     aesenclast KEY, STATE
2015     RET
2016 SYM_FUNC_END(_aesni_enc1)
2017
2018 /*
2019  * _aesni_enc4: internal ABI
2020  * input:
2021  *  KEYP:       key struct pointer
2022  *  KLEN:       round count
2023  *  STATE1:     initial state (input)
2024  *  STATE2
2025  *  STATE3
2026  *  STATE4
2027  * output:
2028  *  STATE1:     finial state (output)
2029  *  STATE2
2030  *  STATE3
2031  *  STATE4
2032  * changed:
2033  *  KEY
2034  *  TKEYP (T1)
2035  */
2036 SYM_FUNC_START_LOCAL(_aesni_enc4)
2037     movaps (KEYP), KEY      # key
2038     mov KEYP, TKEYP
2039     pxor KEY, STATE1        # round 0
2040     pxor KEY, STATE2
2041     pxor KEY, STATE3
2042     pxor KEY, STATE4
2043     add $0x30, TKEYP
2044     cmp $24, KLEN
2045     jb .L4enc128
2046     lea 0x20(TKEYP), TKEYP
2047     je .L4enc192
2048     add $0x20, TKEYP
2049     movaps -0x60(TKEYP), KEY
2050     aesenc KEY, STATE1
2051     aesenc KEY, STATE2
2052     aesenc KEY, STATE3
2053     aesenc KEY, STATE4
2054     movaps -0x50(TKEYP), KEY
2055     aesenc KEY, STATE1
2056     aesenc KEY, STATE2
2057     aesenc KEY, STATE3
2058     aesenc KEY, STATE4
2059 #.align 4
2060 .L4enc192:
2061     movaps -0x40(TKEYP), KEY
2062     aesenc KEY, STATE1
2063     aesenc KEY, STATE2
2064     aesenc KEY, STATE3
2065     aesenc KEY, STATE4
2066     movaps -0x30(TKEYP), KEY
2067     aesenc KEY, STATE1
2068     aesenc KEY, STATE2
2069     aesenc KEY, STATE3
2070     aesenc KEY, STATE4
2071 #.align 4
2072 .L4enc128:
2073     movaps -0x20(TKEYP), KEY
2074     aesenc KEY, STATE1
2075     aesenc KEY, STATE2
2076     aesenc KEY, STATE3
2077     aesenc KEY, STATE4
2078     movaps -0x10(TKEYP), KEY
2079     aesenc KEY, STATE1
2080     aesenc KEY, STATE2
2081     aesenc KEY, STATE3
2082     aesenc KEY, STATE4
2083     movaps (TKEYP), KEY
2084     aesenc KEY, STATE1
2085     aesenc KEY, STATE2
2086     aesenc KEY, STATE3
2087     aesenc KEY, STATE4
2088     movaps 0x10(TKEYP), KEY
2089     aesenc KEY, STATE1
2090     aesenc KEY, STATE2
2091     aesenc KEY, STATE3
2092     aesenc KEY, STATE4
2093     movaps 0x20(TKEYP), KEY
2094     aesenc KEY, STATE1
2095     aesenc KEY, STATE2
2096     aesenc KEY, STATE3
2097     aesenc KEY, STATE4
2098     movaps 0x30(TKEYP), KEY
2099     aesenc KEY, STATE1
2100     aesenc KEY, STATE2
2101     aesenc KEY, STATE3
2102     aesenc KEY, STATE4
2103     movaps 0x40(TKEYP), KEY
2104     aesenc KEY, STATE1
2105     aesenc KEY, STATE2
2106     aesenc KEY, STATE3
2107     aesenc KEY, STATE4
2108     movaps 0x50(TKEYP), KEY
2109     aesenc KEY, STATE1
2110     aesenc KEY, STATE2
2111     aesenc KEY, STATE3
2112     aesenc KEY, STATE4
2113     movaps 0x60(TKEYP), KEY
2114     aesenc KEY, STATE1
2115     aesenc KEY, STATE2
2116     aesenc KEY, STATE3
2117     aesenc KEY, STATE4
2118     movaps 0x70(TKEYP), KEY
2119     aesenclast KEY, STATE1      # last round
2120     aesenclast KEY, STATE2
2121     aesenclast KEY, STATE3
2122     aesenclast KEY, STATE4
2123     RET
2124 SYM_FUNC_END(_aesni_enc4)
2125
2126 /*
2127  * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2128  */
2129 SYM_FUNC_START(aesni_dec)
2130     FRAME_BEGIN
2131 #ifndef __x86_64__
2132     pushl KEYP
2133     pushl KLEN
2134     movl (FRAME_OFFSET+12)(%esp), KEYP  # ctx
2135     movl (FRAME_OFFSET+16)(%esp), OUTP  # dst
2136     movl (FRAME_OFFSET+20)(%esp), INP   # src
2137 #endif
2138     mov 480(KEYP), KLEN     # key length
2139     add $240, KEYP
2140     movups (INP), STATE     # input
2141     call _aesni_dec1
2142     movups STATE, (OUTP)        #output
2143 #ifndef __x86_64__
2144     popl KLEN
2145     popl KEYP
2146 #endif
2147     FRAME_END
2148     RET
2149 SYM_FUNC_END(aesni_dec)
2150
2151 /*
2152  * _aesni_dec1:     internal ABI
2153  * input:
2154  *  KEYP:       key struct pointer
2155  *  KLEN:       key length
2156  *  STATE:      initial state (input)
2157  * output:
2158  *  STATE:      finial state (output)
2159  * changed:
2160  *  KEY
2161  *  TKEYP (T1)
2162  */
2163 SYM_FUNC_START_LOCAL(_aesni_dec1)
2164     movaps (KEYP), KEY      # key
2165     mov KEYP, TKEYP
2166     pxor KEY, STATE     # round 0
2167     add $0x30, TKEYP
2168     cmp $24, KLEN
2169     jb .Ldec128
2170     lea 0x20(TKEYP), TKEYP
2171     je .Ldec192
2172     add $0x20, TKEYP
2173     movaps -0x60(TKEYP), KEY
2174     aesdec KEY, STATE
2175     movaps -0x50(TKEYP), KEY
2176     aesdec KEY, STATE
2177 .align 4
2178 .Ldec192:
2179     movaps -0x40(TKEYP), KEY
2180     aesdec KEY, STATE
2181     movaps -0x30(TKEYP), KEY
2182     aesdec KEY, STATE
2183 .align 4
2184 .Ldec128:
2185     movaps -0x20(TKEYP), KEY
2186     aesdec KEY, STATE
2187     movaps -0x10(TKEYP), KEY
2188     aesdec KEY, STATE
2189     movaps (TKEYP), KEY
2190     aesdec KEY, STATE
2191     movaps 0x10(TKEYP), KEY
2192     aesdec KEY, STATE
2193     movaps 0x20(TKEYP), KEY
2194     aesdec KEY, STATE
2195     movaps 0x30(TKEYP), KEY
2196     aesdec KEY, STATE
2197     movaps 0x40(TKEYP), KEY
2198     aesdec KEY, STATE
2199     movaps 0x50(TKEYP), KEY
2200     aesdec KEY, STATE
2201     movaps 0x60(TKEYP), KEY
2202     aesdec KEY, STATE
2203     movaps 0x70(TKEYP), KEY
2204     aesdeclast KEY, STATE
2205     RET
2206 SYM_FUNC_END(_aesni_dec1)
2207
2208 /*
2209  * _aesni_dec4: internal ABI
2210  * input:
2211  *  KEYP:       key struct pointer
2212  *  KLEN:       key length
2213  *  STATE1:     initial state (input)
2214  *  STATE2
2215  *  STATE3
2216  *  STATE4
2217  * output:
2218  *  STATE1:     finial state (output)
2219  *  STATE2
2220  *  STATE3
2221  *  STATE4
2222  * changed:
2223  *  KEY
2224  *  TKEYP (T1)
2225  */
2226 SYM_FUNC_START_LOCAL(_aesni_dec4)
2227     movaps (KEYP), KEY      # key
2228     mov KEYP, TKEYP
2229     pxor KEY, STATE1        # round 0
2230     pxor KEY, STATE2
2231     pxor KEY, STATE3
2232     pxor KEY, STATE4
2233     add $0x30, TKEYP
2234     cmp $24, KLEN
2235     jb .L4dec128
2236     lea 0x20(TKEYP), TKEYP
2237     je .L4dec192
2238     add $0x20, TKEYP
2239     movaps -0x60(TKEYP), KEY
2240     aesdec KEY, STATE1
2241     aesdec KEY, STATE2
2242     aesdec KEY, STATE3
2243     aesdec KEY, STATE4
2244     movaps -0x50(TKEYP), KEY
2245     aesdec KEY, STATE1
2246     aesdec KEY, STATE2
2247     aesdec KEY, STATE3
2248     aesdec KEY, STATE4
2249 .align 4
2250 .L4dec192:
2251     movaps -0x40(TKEYP), KEY
2252     aesdec KEY, STATE1
2253     aesdec KEY, STATE2
2254     aesdec KEY, STATE3
2255     aesdec KEY, STATE4
2256     movaps -0x30(TKEYP), KEY
2257     aesdec KEY, STATE1
2258     aesdec KEY, STATE2
2259     aesdec KEY, STATE3
2260     aesdec KEY, STATE4
2261 .align 4
2262 .L4dec128:
2263     movaps -0x20(TKEYP), KEY
2264     aesdec KEY, STATE1
2265     aesdec KEY, STATE2
2266     aesdec KEY, STATE3
2267     aesdec KEY, STATE4
2268     movaps -0x10(TKEYP), KEY
2269     aesdec KEY, STATE1
2270     aesdec KEY, STATE2
2271     aesdec KEY, STATE3
2272     aesdec KEY, STATE4
2273     movaps (TKEYP), KEY
2274     aesdec KEY, STATE1
2275     aesdec KEY, STATE2
2276     aesdec KEY, STATE3
2277     aesdec KEY, STATE4
2278     movaps 0x10(TKEYP), KEY
2279     aesdec KEY, STATE1
2280     aesdec KEY, STATE2
2281     aesdec KEY, STATE3
2282     aesdec KEY, STATE4
2283     movaps 0x20(TKEYP), KEY
2284     aesdec KEY, STATE1
2285     aesdec KEY, STATE2
2286     aesdec KEY, STATE3
2287     aesdec KEY, STATE4
2288     movaps 0x30(TKEYP), KEY
2289     aesdec KEY, STATE1
2290     aesdec KEY, STATE2
2291     aesdec KEY, STATE3
2292     aesdec KEY, STATE4
2293     movaps 0x40(TKEYP), KEY
2294     aesdec KEY, STATE1
2295     aesdec KEY, STATE2
2296     aesdec KEY, STATE3
2297     aesdec KEY, STATE4
2298     movaps 0x50(TKEYP), KEY
2299     aesdec KEY, STATE1
2300     aesdec KEY, STATE2
2301     aesdec KEY, STATE3
2302     aesdec KEY, STATE4
2303     movaps 0x60(TKEYP), KEY
2304     aesdec KEY, STATE1
2305     aesdec KEY, STATE2
2306     aesdec KEY, STATE3
2307     aesdec KEY, STATE4
2308     movaps 0x70(TKEYP), KEY
2309     aesdeclast KEY, STATE1      # last round
2310     aesdeclast KEY, STATE2
2311     aesdeclast KEY, STATE3
2312     aesdeclast KEY, STATE4
2313     RET
2314 SYM_FUNC_END(_aesni_dec4)
2315
2316 /*
2317  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2318  *            size_t len)
2319  */
2320 SYM_FUNC_START(aesni_ecb_enc)
2321     FRAME_BEGIN
2322 #ifndef __x86_64__
2323     pushl LEN
2324     pushl KEYP
2325     pushl KLEN
2326     movl (FRAME_OFFSET+16)(%esp), KEYP  # ctx
2327     movl (FRAME_OFFSET+20)(%esp), OUTP  # dst
2328     movl (FRAME_OFFSET+24)(%esp), INP   # src
2329     movl (FRAME_OFFSET+28)(%esp), LEN   # len
2330 #endif
2331     test LEN, LEN       # check length
2332     jz .Lecb_enc_ret
2333     mov 480(KEYP), KLEN
2334     cmp $16, LEN
2335     jb .Lecb_enc_ret
2336     cmp $64, LEN
2337     jb .Lecb_enc_loop1
2338 .align 4
2339 .Lecb_enc_loop4:
2340     movups (INP), STATE1
2341     movups 0x10(INP), STATE2
2342     movups 0x20(INP), STATE3
2343     movups 0x30(INP), STATE4
2344     call _aesni_enc4
2345     movups STATE1, (OUTP)
2346     movups STATE2, 0x10(OUTP)
2347     movups STATE3, 0x20(OUTP)
2348     movups STATE4, 0x30(OUTP)
2349     sub $64, LEN
2350     add $64, INP
2351     add $64, OUTP
2352     cmp $64, LEN
2353     jge .Lecb_enc_loop4
2354     cmp $16, LEN
2355     jb .Lecb_enc_ret
2356 .align 4
2357 .Lecb_enc_loop1:
2358     movups (INP), STATE1
2359     call _aesni_enc1
2360     movups STATE1, (OUTP)
2361     sub $16, LEN
2362     add $16, INP
2363     add $16, OUTP
2364     cmp $16, LEN
2365     jge .Lecb_enc_loop1
2366 .Lecb_enc_ret:
2367 #ifndef __x86_64__
2368     popl KLEN
2369     popl KEYP
2370     popl LEN
2371 #endif
2372     FRAME_END
2373     RET
2374 SYM_FUNC_END(aesni_ecb_enc)
2375
2376 /*
2377  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2378  *            size_t len);
2379  */
2380 SYM_FUNC_START(aesni_ecb_dec)
2381     FRAME_BEGIN
2382 #ifndef __x86_64__
2383     pushl LEN
2384     pushl KEYP
2385     pushl KLEN
2386     movl (FRAME_OFFSET+16)(%esp), KEYP  # ctx
2387     movl (FRAME_OFFSET+20)(%esp), OUTP  # dst
2388     movl (FRAME_OFFSET+24)(%esp), INP   # src
2389     movl (FRAME_OFFSET+28)(%esp), LEN   # len
2390 #endif
2391     test LEN, LEN
2392     jz .Lecb_dec_ret
2393     mov 480(KEYP), KLEN
2394     add $240, KEYP
2395     cmp $16, LEN
2396     jb .Lecb_dec_ret
2397     cmp $64, LEN
2398     jb .Lecb_dec_loop1
2399 .align 4
2400 .Lecb_dec_loop4:
2401     movups (INP), STATE1
2402     movups 0x10(INP), STATE2
2403     movups 0x20(INP), STATE3
2404     movups 0x30(INP), STATE4
2405     call _aesni_dec4
2406     movups STATE1, (OUTP)
2407     movups STATE2, 0x10(OUTP)
2408     movups STATE3, 0x20(OUTP)
2409     movups STATE4, 0x30(OUTP)
2410     sub $64, LEN
2411     add $64, INP
2412     add $64, OUTP
2413     cmp $64, LEN
2414     jge .Lecb_dec_loop4
2415     cmp $16, LEN
2416     jb .Lecb_dec_ret
2417 .align 4
2418 .Lecb_dec_loop1:
2419     movups (INP), STATE1
2420     call _aesni_dec1
2421     movups STATE1, (OUTP)
2422     sub $16, LEN
2423     add $16, INP
2424     add $16, OUTP
2425     cmp $16, LEN
2426     jge .Lecb_dec_loop1
2427 .Lecb_dec_ret:
2428 #ifndef __x86_64__
2429     popl KLEN
2430     popl KEYP
2431     popl LEN
2432 #endif
2433     FRAME_END
2434     RET
2435 SYM_FUNC_END(aesni_ecb_dec)
2436
2437 /*
2438  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2439  *            size_t len, u8 *iv)
2440  */
2441 SYM_FUNC_START(aesni_cbc_enc)
2442     FRAME_BEGIN
2443 #ifndef __x86_64__
2444     pushl IVP
2445     pushl LEN
2446     pushl KEYP
2447     pushl KLEN
2448     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
2449     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
2450     movl (FRAME_OFFSET+28)(%esp), INP   # src
2451     movl (FRAME_OFFSET+32)(%esp), LEN   # len
2452     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
2453 #endif
2454     cmp $16, LEN
2455     jb .Lcbc_enc_ret
2456     mov 480(KEYP), KLEN
2457     movups (IVP), STATE # load iv as initial state
2458 .align 4
2459 .Lcbc_enc_loop:
2460     movups (INP), IN    # load input
2461     pxor IN, STATE
2462     call _aesni_enc1
2463     movups STATE, (OUTP)    # store output
2464     sub $16, LEN
2465     add $16, INP
2466     add $16, OUTP
2467     cmp $16, LEN
2468     jge .Lcbc_enc_loop
2469     movups STATE, (IVP)
2470 .Lcbc_enc_ret:
2471 #ifndef __x86_64__
2472     popl KLEN
2473     popl KEYP
2474     popl LEN
2475     popl IVP
2476 #endif
2477     FRAME_END
2478     RET
2479 SYM_FUNC_END(aesni_cbc_enc)
2480
2481 /*
2482  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2483  *            size_t len, u8 *iv)
2484  */
2485 SYM_FUNC_START(aesni_cbc_dec)
2486     FRAME_BEGIN
2487 #ifndef __x86_64__
2488     pushl IVP
2489     pushl LEN
2490     pushl KEYP
2491     pushl KLEN
2492     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
2493     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
2494     movl (FRAME_OFFSET+28)(%esp), INP   # src
2495     movl (FRAME_OFFSET+32)(%esp), LEN   # len
2496     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
2497 #endif
2498     cmp $16, LEN
2499     jb .Lcbc_dec_just_ret
2500     mov 480(KEYP), KLEN
2501     add $240, KEYP
2502     movups (IVP), IV
2503     cmp $64, LEN
2504     jb .Lcbc_dec_loop1
2505 .align 4
2506 .Lcbc_dec_loop4:
2507     movups (INP), IN1
2508     movaps IN1, STATE1
2509     movups 0x10(INP), IN2
2510     movaps IN2, STATE2
2511 #ifdef __x86_64__
2512     movups 0x20(INP), IN3
2513     movaps IN3, STATE3
2514     movups 0x30(INP), IN4
2515     movaps IN4, STATE4
2516 #else
2517     movups 0x20(INP), IN1
2518     movaps IN1, STATE3
2519     movups 0x30(INP), IN2
2520     movaps IN2, STATE4
2521 #endif
2522     call _aesni_dec4
2523     pxor IV, STATE1
2524 #ifdef __x86_64__
2525     pxor IN1, STATE2
2526     pxor IN2, STATE3
2527     pxor IN3, STATE4
2528     movaps IN4, IV
2529 #else
2530     pxor IN1, STATE4
2531     movaps IN2, IV
2532     movups (INP), IN1
2533     pxor IN1, STATE2
2534     movups 0x10(INP), IN2
2535     pxor IN2, STATE3
2536 #endif
2537     movups STATE1, (OUTP)
2538     movups STATE2, 0x10(OUTP)
2539     movups STATE3, 0x20(OUTP)
2540     movups STATE4, 0x30(OUTP)
2541     sub $64, LEN
2542     add $64, INP
2543     add $64, OUTP
2544     cmp $64, LEN
2545     jge .Lcbc_dec_loop4
2546     cmp $16, LEN
2547     jb .Lcbc_dec_ret
2548 .align 4
2549 .Lcbc_dec_loop1:
2550     movups (INP), IN
2551     movaps IN, STATE
2552     call _aesni_dec1
2553     pxor IV, STATE
2554     movups STATE, (OUTP)
2555     movaps IN, IV
2556     sub $16, LEN
2557     add $16, INP
2558     add $16, OUTP
2559     cmp $16, LEN
2560     jge .Lcbc_dec_loop1
2561 .Lcbc_dec_ret:
2562     movups IV, (IVP)
2563 .Lcbc_dec_just_ret:
2564 #ifndef __x86_64__
2565     popl KLEN
2566     popl KEYP
2567     popl LEN
2568     popl IVP
2569 #endif
2570     FRAME_END
2571     RET
2572 SYM_FUNC_END(aesni_cbc_dec)
2573
2574 /*
2575  * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2576  *            size_t len, u8 *iv)
2577  */
2578 SYM_FUNC_START(aesni_cts_cbc_enc)
2579     FRAME_BEGIN
2580 #ifndef __x86_64__
2581     pushl IVP
2582     pushl LEN
2583     pushl KEYP
2584     pushl KLEN
2585     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
2586     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
2587     movl (FRAME_OFFSET+28)(%esp), INP   # src
2588     movl (FRAME_OFFSET+32)(%esp), LEN   # len
2589     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
2590     lea .Lcts_permute_table, T1
2591 #else
2592     lea .Lcts_permute_table(%rip), T1
2593 #endif
2594     mov 480(KEYP), KLEN
2595     movups (IVP), STATE
2596     sub $16, LEN
2597     mov T1, IVP
2598     add $32, IVP
2599     add LEN, T1
2600     sub LEN, IVP
2601     movups (T1), %xmm4
2602     movups (IVP), %xmm5
2603
2604     movups (INP), IN1
2605     add LEN, INP
2606     movups (INP), IN2
2607
2608     pxor IN1, STATE
2609     call _aesni_enc1
2610
2611     pshufb %xmm5, IN2
2612     pxor STATE, IN2
2613     pshufb %xmm4, STATE
2614     add OUTP, LEN
2615     movups STATE, (LEN)
2616
2617     movaps IN2, STATE
2618     call _aesni_enc1
2619     movups STATE, (OUTP)
2620
2621 #ifndef __x86_64__
2622     popl KLEN
2623     popl KEYP
2624     popl LEN
2625     popl IVP
2626 #endif
2627     FRAME_END
2628     RET
2629 SYM_FUNC_END(aesni_cts_cbc_enc)
2630
2631 /*
2632  * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2633  *            size_t len, u8 *iv)
2634  */
2635 SYM_FUNC_START(aesni_cts_cbc_dec)
2636     FRAME_BEGIN
2637 #ifndef __x86_64__
2638     pushl IVP
2639     pushl LEN
2640     pushl KEYP
2641     pushl KLEN
2642     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
2643     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
2644     movl (FRAME_OFFSET+28)(%esp), INP   # src
2645     movl (FRAME_OFFSET+32)(%esp), LEN   # len
2646     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
2647     lea .Lcts_permute_table, T1
2648 #else
2649     lea .Lcts_permute_table(%rip), T1
2650 #endif
2651     mov 480(KEYP), KLEN
2652     add $240, KEYP
2653     movups (IVP), IV
2654     sub $16, LEN
2655     mov T1, IVP
2656     add $32, IVP
2657     add LEN, T1
2658     sub LEN, IVP
2659     movups (T1), %xmm4
2660
2661     movups (INP), STATE
2662     add LEN, INP
2663     movups (INP), IN1
2664
2665     call _aesni_dec1
2666     movaps STATE, IN2
2667     pshufb %xmm4, STATE
2668     pxor IN1, STATE
2669
2670     add OUTP, LEN
2671     movups STATE, (LEN)
2672
2673     movups (IVP), %xmm0
2674     pshufb %xmm0, IN1
2675     pblendvb IN2, IN1
2676     movaps IN1, STATE
2677     call _aesni_dec1
2678
2679     pxor IV, STATE
2680     movups STATE, (OUTP)
2681
2682 #ifndef __x86_64__
2683     popl KLEN
2684     popl KEYP
2685     popl LEN
2686     popl IVP
2687 #endif
2688     FRAME_END
2689     RET
2690 SYM_FUNC_END(aesni_cts_cbc_dec)
2691
2692 .pushsection .rodata
2693 .align 16
2694 .Lcts_permute_table:
2695     .byte       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696     .byte       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697     .byte       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698     .byte       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699     .byte       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700     .byte       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701 #ifdef __x86_64__
2702 .Lbswap_mask:
2703     .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704 #endif
2705 .popsection
2706
2707 #ifdef __x86_64__
2708 /*
2709  * _aesni_inc_init: internal ABI
2710  *  setup registers used by _aesni_inc
2711  * input:
2712  *  IV
2713  * output:
2714  *  CTR:    == IV, in little endian
2715  *  TCTR_LOW: == lower qword of CTR
2716  *  INC:    == 1, in little endian
2717  *  BSWAP_MASK == endian swapping mask
2718  */
2719 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720     movaps .Lbswap_mask, BSWAP_MASK
2721     movaps IV, CTR
2722     pshufb BSWAP_MASK, CTR
2723     mov $1, TCTR_LOW
2724     movq TCTR_LOW, INC
2725     movq CTR, TCTR_LOW
2726     RET
2727 SYM_FUNC_END(_aesni_inc_init)
2728
2729 /*
2730  * _aesni_inc:      internal ABI
2731  *  Increase IV by 1, IV is in big endian
2732  * input:
2733  *  IV
2734  *  CTR:    == IV, in little endian
2735  *  TCTR_LOW: == lower qword of CTR
2736  *  INC:    == 1, in little endian
2737  *  BSWAP_MASK == endian swapping mask
2738  * output:
2739  *  IV: Increase by 1
2740  * changed:
2741  *  CTR:    == output IV, in little endian
2742  *  TCTR_LOW: == lower qword of CTR
2743  */
2744 SYM_FUNC_START_LOCAL(_aesni_inc)
2745     paddq INC, CTR
2746     add $1, TCTR_LOW
2747     jnc .Linc_low
2748     pslldq $8, INC
2749     paddq INC, CTR
2750     psrldq $8, INC
2751 .Linc_low:
2752     movaps CTR, IV
2753     pshufb BSWAP_MASK, IV
2754     RET
2755 SYM_FUNC_END(_aesni_inc)
2756
2757 /*
2758  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2759  *            size_t len, u8 *iv)
2760  */
2761 SYM_FUNC_START(aesni_ctr_enc)
2762     FRAME_BEGIN
2763     cmp $16, LEN
2764     jb .Lctr_enc_just_ret
2765     mov 480(KEYP), KLEN
2766     movups (IVP), IV
2767     call _aesni_inc_init
2768     cmp $64, LEN
2769     jb .Lctr_enc_loop1
2770 .align 4
2771 .Lctr_enc_loop4:
2772     movaps IV, STATE1
2773     call _aesni_inc
2774     movups (INP), IN1
2775     movaps IV, STATE2
2776     call _aesni_inc
2777     movups 0x10(INP), IN2
2778     movaps IV, STATE3
2779     call _aesni_inc
2780     movups 0x20(INP), IN3
2781     movaps IV, STATE4
2782     call _aesni_inc
2783     movups 0x30(INP), IN4
2784     call _aesni_enc4
2785     pxor IN1, STATE1
2786     movups STATE1, (OUTP)
2787     pxor IN2, STATE2
2788     movups STATE2, 0x10(OUTP)
2789     pxor IN3, STATE3
2790     movups STATE3, 0x20(OUTP)
2791     pxor IN4, STATE4
2792     movups STATE4, 0x30(OUTP)
2793     sub $64, LEN
2794     add $64, INP
2795     add $64, OUTP
2796     cmp $64, LEN
2797     jge .Lctr_enc_loop4
2798     cmp $16, LEN
2799     jb .Lctr_enc_ret
2800 .align 4
2801 .Lctr_enc_loop1:
2802     movaps IV, STATE
2803     call _aesni_inc
2804     movups (INP), IN
2805     call _aesni_enc1
2806     pxor IN, STATE
2807     movups STATE, (OUTP)
2808     sub $16, LEN
2809     add $16, INP
2810     add $16, OUTP
2811     cmp $16, LEN
2812     jge .Lctr_enc_loop1
2813 .Lctr_enc_ret:
2814     movups IV, (IVP)
2815 .Lctr_enc_just_ret:
2816     FRAME_END
2817     RET
2818 SYM_FUNC_END(aesni_ctr_enc)
2819
2820 #endif
2821
2822 .section    .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823 .align 16
2824 .Lgf128mul_x_ble_mask:
2825     .octa 0x00000000000000010000000000000087
2826 .previous
2827
2828 /*
2829  * _aesni_gf128mul_x_ble:       internal ABI
2830  *  Multiply in GF(2^128) for XTS IVs
2831  * input:
2832  *  IV: current IV
2833  *  GF128MUL_MASK == mask with 0x87 and 0x01
2834  * output:
2835  *  IV: next IV
2836  * changed:
2837  *  CTR:    == temporary value
2838  */
2839 #define _aesni_gf128mul_x_ble() \
2840     pshufd $0x13, IV, KEY; \
2841     paddq IV, IV; \
2842     psrad $31, KEY; \
2843     pand GF128MUL_MASK, KEY; \
2844     pxor KEY, IV;
2845
2846 /*
2847  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848  *            const u8 *src, unsigned int len, le128 *iv)
2849  */
2850 SYM_FUNC_START(aesni_xts_encrypt)
2851     FRAME_BEGIN
2852 #ifndef __x86_64__
2853     pushl IVP
2854     pushl LEN
2855     pushl KEYP
2856     pushl KLEN
2857     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
2858     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
2859     movl (FRAME_OFFSET+28)(%esp), INP   # src
2860     movl (FRAME_OFFSET+32)(%esp), LEN   # len
2861     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
2862     movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863 #else
2864     movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865 #endif
2866     movups (IVP), IV
2867
2868     mov 480(KEYP), KLEN
2869
2870 .Lxts_enc_loop4:
2871     sub $64, LEN
2872     jl .Lxts_enc_1x
2873
2874     movdqa IV, STATE1
2875     movdqu 0x00(INP), IN
2876     pxor IN, STATE1
2877     movdqu IV, 0x00(OUTP)
2878
2879     _aesni_gf128mul_x_ble()
2880     movdqa IV, STATE2
2881     movdqu 0x10(INP), IN
2882     pxor IN, STATE2
2883     movdqu IV, 0x10(OUTP)
2884
2885     _aesni_gf128mul_x_ble()
2886     movdqa IV, STATE3
2887     movdqu 0x20(INP), IN
2888     pxor IN, STATE3
2889     movdqu IV, 0x20(OUTP)
2890
2891     _aesni_gf128mul_x_ble()
2892     movdqa IV, STATE4
2893     movdqu 0x30(INP), IN
2894     pxor IN, STATE4
2895     movdqu IV, 0x30(OUTP)
2896
2897     call _aesni_enc4
2898
2899     movdqu 0x00(OUTP), IN
2900     pxor IN, STATE1
2901     movdqu STATE1, 0x00(OUTP)
2902
2903     movdqu 0x10(OUTP), IN
2904     pxor IN, STATE2
2905     movdqu STATE2, 0x10(OUTP)
2906
2907     movdqu 0x20(OUTP), IN
2908     pxor IN, STATE3
2909     movdqu STATE3, 0x20(OUTP)
2910
2911     movdqu 0x30(OUTP), IN
2912     pxor IN, STATE4
2913     movdqu STATE4, 0x30(OUTP)
2914
2915     _aesni_gf128mul_x_ble()
2916
2917     add $64, INP
2918     add $64, OUTP
2919     test LEN, LEN
2920     jnz .Lxts_enc_loop4
2921
2922 .Lxts_enc_ret_iv:
2923     movups IV, (IVP)
2924
2925 .Lxts_enc_ret:
2926 #ifndef __x86_64__
2927     popl KLEN
2928     popl KEYP
2929     popl LEN
2930     popl IVP
2931 #endif
2932     FRAME_END
2933     RET
2934
2935 .Lxts_enc_1x:
2936     add $64, LEN
2937     jz .Lxts_enc_ret_iv
2938     sub $16, LEN
2939     jl .Lxts_enc_cts4
2940
2941 .Lxts_enc_loop1:
2942     movdqu (INP), STATE
2943     pxor IV, STATE
2944     call _aesni_enc1
2945     pxor IV, STATE
2946     _aesni_gf128mul_x_ble()
2947
2948     test LEN, LEN
2949     jz .Lxts_enc_out
2950
2951     add $16, INP
2952     sub $16, LEN
2953     jl .Lxts_enc_cts1
2954
2955     movdqu STATE, (OUTP)
2956     add $16, OUTP
2957     jmp .Lxts_enc_loop1
2958
2959 .Lxts_enc_out:
2960     movdqu STATE, (OUTP)
2961     jmp .Lxts_enc_ret_iv
2962
2963 .Lxts_enc_cts4:
2964     movdqa STATE4, STATE
2965     sub $16, OUTP
2966
2967 .Lxts_enc_cts1:
2968 #ifndef __x86_64__
2969     lea .Lcts_permute_table, T1
2970 #else
2971     lea .Lcts_permute_table(%rip), T1
2972 #endif
2973     add LEN, INP        /* rewind input pointer */
2974     add $16, LEN        /* # bytes in final block */
2975     movups (INP), IN1
2976
2977     mov T1, IVP
2978     add $32, IVP
2979     add LEN, T1
2980     sub LEN, IVP
2981     add OUTP, LEN
2982
2983     movups (T1), %xmm4
2984     movaps STATE, IN2
2985     pshufb %xmm4, STATE
2986     movups STATE, (LEN)
2987
2988     movups (IVP), %xmm0
2989     pshufb %xmm0, IN1
2990     pblendvb IN2, IN1
2991     movaps IN1, STATE
2992
2993     pxor IV, STATE
2994     call _aesni_enc1
2995     pxor IV, STATE
2996
2997     movups STATE, (OUTP)
2998     jmp .Lxts_enc_ret
2999 SYM_FUNC_END(aesni_xts_encrypt)
3000
3001 /*
3002  * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003  *            const u8 *src, unsigned int len, le128 *iv)
3004  */
3005 SYM_FUNC_START(aesni_xts_decrypt)
3006     FRAME_BEGIN
3007 #ifndef __x86_64__
3008     pushl IVP
3009     pushl LEN
3010     pushl KEYP
3011     pushl KLEN
3012     movl (FRAME_OFFSET+20)(%esp), KEYP  # ctx
3013     movl (FRAME_OFFSET+24)(%esp), OUTP  # dst
3014     movl (FRAME_OFFSET+28)(%esp), INP   # src
3015     movl (FRAME_OFFSET+32)(%esp), LEN   # len
3016     movl (FRAME_OFFSET+36)(%esp), IVP   # iv
3017     movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018 #else
3019     movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020 #endif
3021     movups (IVP), IV
3022
3023     mov 480(KEYP), KLEN
3024     add $240, KEYP
3025
3026     test $15, LEN
3027     jz .Lxts_dec_loop4
3028     sub $16, LEN
3029
3030 .Lxts_dec_loop4:
3031     sub $64, LEN
3032     jl .Lxts_dec_1x
3033
3034     movdqa IV, STATE1
3035     movdqu 0x00(INP), IN
3036     pxor IN, STATE1
3037     movdqu IV, 0x00(OUTP)
3038
3039     _aesni_gf128mul_x_ble()
3040     movdqa IV, STATE2
3041     movdqu 0x10(INP), IN
3042     pxor IN, STATE2
3043     movdqu IV, 0x10(OUTP)
3044
3045     _aesni_gf128mul_x_ble()
3046     movdqa IV, STATE3
3047     movdqu 0x20(INP), IN
3048     pxor IN, STATE3
3049     movdqu IV, 0x20(OUTP)
3050
3051     _aesni_gf128mul_x_ble()
3052     movdqa IV, STATE4
3053     movdqu 0x30(INP), IN
3054     pxor IN, STATE4
3055     movdqu IV, 0x30(OUTP)
3056
3057     call _aesni_dec4
3058
3059     movdqu 0x00(OUTP), IN
3060     pxor IN, STATE1
3061     movdqu STATE1, 0x00(OUTP)
3062
3063     movdqu 0x10(OUTP), IN
3064     pxor IN, STATE2
3065     movdqu STATE2, 0x10(OUTP)
3066
3067     movdqu 0x20(OUTP), IN
3068     pxor IN, STATE3
3069     movdqu STATE3, 0x20(OUTP)
3070
3071     movdqu 0x30(OUTP), IN
3072     pxor IN, STATE4
3073     movdqu STATE4, 0x30(OUTP)
3074
3075     _aesni_gf128mul_x_ble()
3076
3077     add $64, INP
3078     add $64, OUTP
3079     test LEN, LEN
3080     jnz .Lxts_dec_loop4
3081
3082 .Lxts_dec_ret_iv:
3083     movups IV, (IVP)
3084
3085 .Lxts_dec_ret:
3086 #ifndef __x86_64__
3087     popl KLEN
3088     popl KEYP
3089     popl LEN
3090     popl IVP
3091 #endif
3092     FRAME_END
3093     RET
3094
3095 .Lxts_dec_1x:
3096     add $64, LEN
3097     jz .Lxts_dec_ret_iv
3098
3099 .Lxts_dec_loop1:
3100     movdqu (INP), STATE
3101
3102     add $16, INP
3103     sub $16, LEN
3104     jl .Lxts_dec_cts1
3105
3106     pxor IV, STATE
3107     call _aesni_dec1
3108     pxor IV, STATE
3109     _aesni_gf128mul_x_ble()
3110
3111     test LEN, LEN
3112     jz .Lxts_dec_out
3113
3114     movdqu STATE, (OUTP)
3115     add $16, OUTP
3116     jmp .Lxts_dec_loop1
3117
3118 .Lxts_dec_out:
3119     movdqu STATE, (OUTP)
3120     jmp .Lxts_dec_ret_iv
3121
3122 .Lxts_dec_cts1:
3123     movdqa IV, STATE4
3124     _aesni_gf128mul_x_ble()
3125
3126     pxor IV, STATE
3127     call _aesni_dec1
3128     pxor IV, STATE
3129
3130 #ifndef __x86_64__
3131     lea .Lcts_permute_table, T1
3132 #else
3133     lea .Lcts_permute_table(%rip), T1
3134 #endif
3135     add LEN, INP        /* rewind input pointer */
3136     add $16, LEN        /* # bytes in final block */
3137     movups (INP), IN1
3138
3139     mov T1, IVP
3140     add $32, IVP
3141     add LEN, T1
3142     sub LEN, IVP
3143     add OUTP, LEN
3144
3145     movups (T1), %xmm4
3146     movaps STATE, IN2
3147     pshufb %xmm4, STATE
3148     movups STATE, (LEN)
3149
3150     movups (IVP), %xmm0
3151     pshufb %xmm0, IN1
3152     pblendvb IN2, IN1
3153     movaps IN1, STATE
3154
3155     pxor STATE4, STATE
3156     call _aesni_dec1
3157     pxor STATE4, STATE
3158
3159     movups STATE, (OUTP)
3160     jmp .Lxts_dec_ret
3161 SYM_FUNC_END(aesni_xts_decrypt)