0001 ########################################################################
0002 # Copyright (c) 2013, Intel Corporation
0003 #
0004 # This software is available to you under a choice of one of two
0005 # licenses.  You may choose to be licensed under the terms of the GNU
0006 # General Public License (GPL) Version 2, available from the file
0007 # COPYING in the main directory of this source tree, or the
0008 # OpenIB.org BSD license below:
0009 #
0010 # Redistribution and use in source and binary forms, with or without
0011 # modification, are permitted provided that the following conditions are
0012 # met:
0013 #
0014 # * Redistributions of source code must retain the above copyright
0015 #   notice, this list of conditions and the following disclaimer.
0016 #
0017 # * Redistributions in binary form must reproduce the above copyright
0018 #   notice, this list of conditions and the following disclaimer in the
0019 #   documentation and/or other materials provided with the
0020 #   distribution.
0021 #
0022 # * Neither the name of the Intel Corporation nor the names of its
0023 #   contributors may be used to endorse or promote products derived from
0024 #   this software without specific prior written permission.
0025 #
0026 #
0027 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
0028 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0029 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
0030 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
0031 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0032 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
0033 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
0034 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
0035 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
0036 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0037 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0038 ########################################################################
0039 ##
0040 ## Authors:
0041 ##  Erdinc Ozturk <erdinc.ozturk@intel.com>
0042 ##  Vinodh Gopal <vinodh.gopal@intel.com>
0043 ##  James Guilford <james.guilford@intel.com>
0044 ##  Tim Chen <tim.c.chen@linux.intel.com>
0045 ##
0046 ## References:
0047 ##       This code was derived and highly optimized from the code described in the paper:
0048 ##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
0049 ##          on Intel Architecture Processors. August, 2010
0050 ##       The details of the implementation are explained in:
0051 ##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
0052 ##          on Intel Architecture Processors. October, 2012.
0053 ##
0054 ## Assumptions:
0055 ##
0056 ##
0057 ##
0058 ## iv:
0059 ##       0                   1                   2                   3
0060 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0061 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0062 ##       |                             Salt  (From the SA)               |
0063 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0064 ##       |                     Initialization Vector                     |
0065 ##       |         (This is the sequence number from IPSec header)       |
0066 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0067 ##       |                              0x1                              |
0068 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0069 ##
0070 ##
0071 ##
0072 ## AAD:
0073 ##       AAD padded to 128 bits with 0
0074 ##       for example, assume AAD is a u32 vector
0075 ##
0076 ##       if AAD is 8 bytes:
0077 ##       AAD[2] = {A0, A1};
0078 ##       padded AAD in xmm register = {A1 A0 0 0}
0079 ##
0080 ##       0                   1                   2                   3
0081 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0082 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0083 ##       |                               SPI (A1)                        |
0084 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0085 ##       |                     32-bit Sequence Number (A0)               |
0086 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0087 ##       |                              0x0                              |
0088 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0089 ##
0090 ##                                       AAD Format with 32-bit Sequence Number
0091 ##
0092 ##       if AAD is 12 bytes:
0093 ##       AAD[3] = {A0, A1, A2};
0094 ##       padded AAD in xmm register = {A2 A1 A0 0}
0095 ##
0096 ##       0                   1                   2                   3
0097 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0098 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0099 ##       |                               SPI (A2)                        |
0100 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0101 ##       |                 64-bit Extended Sequence Number {A1,A0}       |
0102 ##       |                                                               |
0103 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0104 ##       |                              0x0                              |
0105 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0106 ##
0107 ##        AAD Format with 64-bit Extended Sequence Number
0108 ##
0109 ##
0110 ## aadLen:
0111 ##       From the definition in the spec, aadLen can only be 8 or 12 bytes.
0112 ##   The code additionally supports an aadLen of 16 bytes.
0113 ##
0114 ## TLen:
0115 ##       From the definition in the spec, TLen can only be 8, 12 or 16 bytes.
0116 ##
0117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
0118 ## Throughout the code, one-tab and two-tab indentations are used: one tab is
0119 ## for the GHASH part, two tabs are for the AES part.
0120 ##
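##
## Worked example of the iv block (illustrative values only, not a
## test vector): with a 4-byte salt 0x00112233 from the SA and an
## 8-byte IPSec sequence-number IV 0x4455667788990011, the 16-byte
## block handed in as "iv" is laid out in memory as
##
##       00 11 22 33 44 55 66 77 88 99 00 11 00 00 00 01
##
## i.e. the salt, then the 8-byte IV, then the 32-bit block counter
## preloaded with 1.  INIT saves this block as OrigIV and CurCount;
## GCM_COMPLETE later encrypts OrigIV to obtain E(K, Y0), which is
## XORed with the final GHASH value to form the tag.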
0121 
0122 #include <linux/linkage.h>
0123 
0124 # constants in mergeable sections, linker can reorder and merge
0125 .section    .rodata.cst16.POLY, "aM", @progbits, 16
0126 .align 16
0127 POLY:            .octa     0xC2000000000000000000000000000001
0128 
0129 .section    .rodata.cst16.POLY2, "aM", @progbits, 16
0130 .align 16
0131 POLY2:           .octa     0xC20000000000000000000001C2000000
0132 
0133 .section    .rodata.cst16.TWOONE, "aM", @progbits, 16
0134 .align 16
0135 TWOONE:          .octa     0x00000001000000000000000000000001
0136 
0137 .section    .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
0138 .align 16
0139 SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
0140 
0141 .section    .rodata.cst16.ONE, "aM", @progbits, 16
0142 .align 16
0143 ONE:             .octa     0x00000000000000000000000000000001
0144 
0145 .section    .rodata.cst16.ONEf, "aM", @progbits, 16
0146 .align 16
0147 ONEf:            .octa     0x01000000000000000000000000000000
0148 
0149 # The order of these constants must not change.
0150 # More specifically, ALL_F must follow SHIFT_MASK, and the zero block must follow ALL_F.
0151 .section    .rodata, "a", @progbits
0152 .align 16
0153 SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
0154 ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
0155                  .octa     0x00000000000000000000000000000000
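# Example of why the ordering matters (sketch): in the tail handling of
# GCM_ENC_DEC, %r12 is set to SHIFT_MASK+16-r13 (r13 = bytes in the last
# partial block) and the byte mask is loaded with
#
#       vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
#
# i.e. from address ALL_F+16-r13.  The load therefore picks up the last
# r13 bytes of ALL_F (0xff) followed by 16-r13 bytes from the all-zero
# block above, which is exactly the mask used to clear the unused top
# 16-r13 bytes of the partial block.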
0156 
0157 .section .rodata
0158 .align 16
0159 .type aad_shift_arr, @object
0160 .size aad_shift_arr, 272
0161 aad_shift_arr:
0162         .octa     0xffffffffffffffffffffffffffffffff
0163         .octa     0xffffffffffffffffffffffffffffff0C
0164         .octa     0xffffffffffffffffffffffffffff0D0C
0165         .octa     0xffffffffffffffffffffffffff0E0D0C
0166         .octa     0xffffffffffffffffffffffff0F0E0D0C
0167         .octa     0xffffffffffffffffffffff0C0B0A0908
0168         .octa     0xffffffffffffffffffff0D0C0B0A0908
0169         .octa     0xffffffffffffffffff0E0D0C0B0A0908
0170         .octa     0xffffffffffffffff0F0E0D0C0B0A0908
0171         .octa     0xffffffffffffff0C0B0A090807060504
0172         .octa     0xffffffffffff0D0C0B0A090807060504
0173         .octa     0xffffffffff0E0D0C0B0A090807060504
0174         .octa     0xffffffff0F0E0D0C0B0A090807060504
0175         .octa     0xffffff0C0B0A09080706050403020100
0176         .octa     0xffff0D0C0B0A09080706050403020100
0177         .octa     0xff0E0D0C0B0A09080706050403020100
0178         .octa     0x0F0E0D0C0B0A09080706050403020100
0179 
0180 
0181 .text
0182 
0183 
0184 #define AadHash 16*0
0185 #define AadLen 16*1
0186 #define InLen (16*1)+8
0187 #define PBlockEncKey 16*2
0188 #define OrigIV 16*3
0189 #define CurCount 16*4
0190 #define PBlockLen 16*5
0191 
0192 HashKey        = 16*6   # store HashKey <<1 mod poly here
0193 HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
0194 HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
0195 HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
0196 HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
0197 HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
0198 HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
0199 HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
0200 HashKey_k      = 16*14   # store XOR of the high and low halves of HashKey <<1 mod poly here (for Karatsuba purposes)
0201 HashKey_2_k    = 16*15   # store XOR of the high and low halves of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
0202 HashKey_3_k    = 16*16   # store XOR of the high and low halves of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
0203 HashKey_4_k    = 16*17   # store XOR of the high and low halves of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
0204 HashKey_5_k    = 16*18   # store XOR of the high and low halves of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
0205 HashKey_6_k    = 16*19   # store XOR of the high and low halves of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
0206 HashKey_7_k    = 16*20   # store XOR of the high and low halves of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
0207 HashKey_8_k    = 16*21   # store XOR of the high and low halves of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
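# Taken together, the offsets above describe the per-request context that
# arg2 points to: 16-byte slots 0-5 hold the running state (AadHash,
# AadLen/InLen, PBlockEncKey, OrigIV, CurCount, PBlockLen) and slots 6-21
# hold the precomputed hash-key powers and their Karatsuba halves, so the
# context structure is assumed to span at least 16*22 = 352 bytes.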
0208 
0209 #define arg1 %rdi
0210 #define arg2 %rsi
0211 #define arg3 %rdx
0212 #define arg4 %rcx
0213 #define arg5 %r8
0214 #define arg6 %r9
0215 #define keysize 2*15*16(arg1)
0216 
0217 i = 0
0218 j = 0
0219 
0220 out_order = 0
0221 in_order = 1
0222 DEC = 0
0223 ENC = 1
0224 
0225 .macro define_reg r n
0226 reg_\r = %xmm\n
0227 .endm
0228 
0229 .macro setreg
0230 .altmacro
0231 define_reg i %i
0232 define_reg j %j
0233 .noaltmacro
0234 .endm
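# The .altmacro pass above turns the assembly-time counters i and j into
# register names.  For example (sketch), with i = 5 the sequence
#
#       setreg
#       vmovdqu AadHash(arg2), reg_i
#
# evaluates "define_reg i %i" as "define_reg i 5", which defines
# reg_i = %xmm5, so the load assembles as
#
#       vmovdqu AadHash(arg2), %xmm5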
0235 
0236 TMP1 =   16*0    # Temporary storage for AAD
0237 TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
0238 TMP3 =   16*2    # Temporary storage for AES State 3
0239 TMP4 =   16*3    # Temporary storage for AES State 4
0240 TMP5 =   16*4    # Temporary storage for AES State 5
0241 TMP6 =   16*5    # Temporary storage for AES State 6
0242 TMP7 =   16*6    # Temporary storage for AES State 7
0243 TMP8 =   16*7    # Temporary storage for AES State 8
0244 
0245 VARIABLE_OFFSET = 16*8
0246 
0247 ################################
0248 # Utility Macros
0249 ################################
0250 
0251 .macro FUNC_SAVE
0252         push    %r12
0253         push    %r13
0254         push    %r15
0255 
0256     push    %rbp
0257     mov %rsp, %rbp
0258 
0259         sub     $VARIABLE_OFFSET, %rsp
0260         and     $~63, %rsp                    # align rsp to 64 bytes
0261 .endm
0262 
0263 .macro FUNC_RESTORE
0264         mov     %rbp, %rsp
0265     pop %rbp
0266 
0267         pop     %r15
0268         pop     %r13
0269         pop     %r12
0270 .endm
0271 
0272 # Encryption of a single block
0273 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
0274                 vpxor    (arg1), \XMM0, \XMM0
0275                i = 1
0276                setreg
0277 .rep \REP
0278                 vaesenc  16*i(arg1), \XMM0, \XMM0
0279                i = (i+1)
0280                setreg
0281 .endr
0282                 vaesenclast 16*i(arg1), \XMM0, \XMM0
0283 .endm
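# For an AES-128 key schedule (\REP = 9) the macro above expands to the
# initial whitening XOR, nine vaesenc rounds and a final vaesenclast,
# with the round keys taken from consecutive 16-byte slots at arg1, e.g.
# for \XMM0 = %xmm9 (sketch):
#
#       vpxor           (arg1), %xmm9, %xmm9
#       vaesenc     16*1(arg1), %xmm9, %xmm9
#       vaesenc     16*2(arg1), %xmm9, %xmm9
#       # ... rounds 3 through 9 continue the same pattern ...
#       vaesenclast 16*10(arg1), %xmm9, %xmm9
#
# AES-192 and AES-256 schedules would use \REP = 11 and 13 (11/13 middle
# rounds plus the last round).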
0284 
0285 # Combined body for the GCM encrypt and decrypt functions.
0286 # Clobbers all xmm registers
0287 # and r10, r11, r12, r13, r15, rax
0288 .macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
0289         vmovdqu AadHash(arg2), %xmm8
0290         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
0291         add arg5, InLen(arg2)
0292 
0293         # initialize the data pointer offset as zero
0294         xor     %r11d, %r11d
0295 
0296         PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
0297         sub %r11, arg5
0298 
0299         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
0300         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
0301 
0302         mov     %r13, %r12
0303         shr     $4, %r12
0304         and     $7, %r12
0305         jz      _initial_num_blocks_is_0\@
0306 
0307         cmp     $7, %r12
0308         je      _initial_num_blocks_is_7\@
0309         cmp     $6, %r12
0310         je      _initial_num_blocks_is_6\@
0311         cmp     $5, %r12
0312         je      _initial_num_blocks_is_5\@
0313         cmp     $4, %r12
0314         je      _initial_num_blocks_is_4\@
0315         cmp     $3, %r12
0316         je      _initial_num_blocks_is_3\@
0317         cmp     $2, %r12
0318         je      _initial_num_blocks_is_2\@
0319 
0320         jmp     _initial_num_blocks_is_1\@
0321 
0322 _initial_num_blocks_is_7\@:
0323         \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0324         sub     $16*7, %r13
0325         jmp     _initial_blocks_encrypted\@
0326 
0327 _initial_num_blocks_is_6\@:
0328         \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0329         sub     $16*6, %r13
0330         jmp     _initial_blocks_encrypted\@
0331 
0332 _initial_num_blocks_is_5\@:
0333         \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0334         sub     $16*5, %r13
0335         jmp     _initial_blocks_encrypted\@
0336 
0337 _initial_num_blocks_is_4\@:
0338         \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0339         sub     $16*4, %r13
0340         jmp     _initial_blocks_encrypted\@
0341 
0342 _initial_num_blocks_is_3\@:
0343         \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0344         sub     $16*3, %r13
0345         jmp     _initial_blocks_encrypted\@
0346 
0347 _initial_num_blocks_is_2\@:
0348         \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0349         sub     $16*2, %r13
0350         jmp     _initial_blocks_encrypted\@
0351 
0352 _initial_num_blocks_is_1\@:
0353         \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0354         sub     $16*1, %r13
0355         jmp     _initial_blocks_encrypted\@
0356 
0357 _initial_num_blocks_is_0\@:
0358         \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0359 
0360 
0361 _initial_blocks_encrypted\@:
0362         test    %r13, %r13
0363         je      _zero_cipher_left\@
0364 
0365         sub     $128, %r13
0366         je      _eight_cipher_left\@
0367 
0368 
0369 
0370 
0371         vmovd   %xmm9, %r15d
0372         and     $255, %r15d
0373         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0374 
0375 
0376 _encrypt_by_8_new\@:
0377         cmp     $(255-8), %r15d
0378         jg      _encrypt_by_8\@
0379 
0380 
0381 
0382         add     $8, %r15b
0383         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
0384         add     $128, %r11
0385         sub     $128, %r13
0386         jne     _encrypt_by_8_new\@
0387 
0388         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0389         jmp     _eight_cipher_left\@
0390 
0391 _encrypt_by_8\@:
0392         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0393         add     $8, %r15b
0394         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
0395         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0396         add     $128, %r11
0397         sub     $128, %r13
0398         jne     _encrypt_by_8_new\@
0399 
0400         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0401 
0402 
0403 
0404 
0405 _eight_cipher_left\@:
0406         \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
0407 
0408 
0409 _zero_cipher_left\@:
0410         vmovdqu %xmm14, AadHash(arg2)
0411         vmovdqu %xmm9, CurCount(arg2)
0412 
0413         # check for 0 length
0414         mov     arg5, %r13
0415         and     $15, %r13                            # r13 = (arg5 mod 16)
0416 
0417         je      _multiple_of_16_bytes\@
0418 
0419         # handle the last <16 Byte block separately
0420 
0421         mov %r13, PBlockLen(arg2)
0422 
0423         vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
0424         vmovdqu %xmm9, CurCount(arg2)
0425         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0426 
0427         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
0428         vmovdqu %xmm9, PBlockEncKey(arg2)
0429 
0430         cmp $16, arg5
0431         jge _large_enough_update\@
0432 
0433         lea (arg4,%r11,1), %r10
0434         mov %r13, %r12
0435 
0436         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
0437 
0438         lea     SHIFT_MASK+16(%rip), %r12
0439         sub     %r13, %r12                           # adjust the shuffle mask pointer to be
0440                                                      # able to shift 16-r13 bytes (r13 is the
0441                                                      # number of bytes in plaintext mod 16)
0442 
0443         jmp _final_ghash_mul\@
0444 
0445 _large_enough_update\@:
0446         sub $16, %r11
0447         add %r13, %r11
0448 
0449         # receive the last <16 Byte block
0450         vmovdqu (arg4, %r11, 1), %xmm1
0451 
0452         sub %r13, %r11
0453         add $16, %r11
0454 
0455         lea SHIFT_MASK+16(%rip), %r12
0456         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
0457         # (r13 is the number of bytes in plaintext mod 16)
0458         sub %r13, %r12
0459         # get the appropriate shuffle mask
0460         vmovdqu (%r12), %xmm2
0461         # shift right 16-r13 bytes
0462         vpshufb  %xmm2, %xmm1, %xmm1
0463 
0464 _final_ghash_mul\@:
0465         .if  \ENC_DEC ==  DEC
0466         vmovdqa %xmm1, %xmm2
0467         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
0468         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
0469                                                      # mask out top 16-r13 bytes of xmm9
0470         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
0471         vpand   %xmm1, %xmm2, %xmm2
0472         vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
0473         vpxor   %xmm2, %xmm14, %xmm14
0474 
0475         vmovdqu %xmm14, AadHash(arg2)
0476         .else
0477         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
0478         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
0479                                                      # mask out top 16-r13 bytes of xmm9
0480         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
0481         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0482         vpxor   %xmm9, %xmm14, %xmm14
0483 
0484         vmovdqu %xmm14, AadHash(arg2)
0485         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
0486         .endif
0487 
0488 
0489         #############################
0490         # output r13 Bytes
0491         vmovq   %xmm9, %rax
0492         cmp     $8, %r13
0493         jle     _less_than_8_bytes_left\@
0494 
0495         mov     %rax, (arg3 , %r11)
0496         add     $8, %r11
0497         vpsrldq $8, %xmm9, %xmm9
0498         vmovq   %xmm9, %rax
0499         sub     $8, %r13
0500 
0501 _less_than_8_bytes_left\@:
0502         movb    %al, (arg3 , %r11)
0503         add     $1, %r11
0504         shr     $8, %rax
0505         sub     $1, %r13
0506         jne     _less_than_8_bytes_left\@
0507         #############################
0508 
0509 _multiple_of_16_bytes\@:
0510 .endm
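# In other words, GCM_ENC_DEC first folds in any partial block carried
# over from a previous update call (PARTIAL_BLOCK), then encrypts the
# first (len/16) mod 8 blocks (INITIAL_BLOCKS) so that the remaining
# whole blocks are a multiple of 8, runs the interleaved AES+GHASH
# 8-block loop (GHASH_8_ENCRYPT_8_PARALLEL) over the bulk of the data,
# folds the last 8 ciphertext blocks into the hash (GHASH_LAST_8), and
# finally handles a trailing <16-byte block, recording its length in
# PBlockLen so a later update or finalize call can complete it.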
0511 
0512 
0513 # GCM_COMPLETE finishes the tag computation, folding in any last partial block
0514 # Output: Authentication Tag (AUTH_TAG)
0515 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
0516 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
0517         vmovdqu AadHash(arg2), %xmm14
0518         vmovdqu HashKey(arg2), %xmm13
0519 
0520         mov PBlockLen(arg2), %r12
0521         test %r12, %r12
0522         je _partial_done\@
0523 
0524         # GHASH computation for the last <16 Byte block
0525         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0526 
0527 _partial_done\@:
0528         mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
0529         shl     $3, %r12                             # convert into number of bits
0530         vmovd   %r12d, %xmm15                        # len(A) in xmm15
0531 
0532         mov InLen(arg2), %r12
0533         shl     $3, %r12                        # len(C) in bits (*8)
0534         vmovq   %r12, %xmm1
0535         vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
0536         vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
0537 
0538         vpxor   %xmm15, %xmm14, %xmm14
0539         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
0540         vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
0541 
0542         vmovdqu OrigIV(arg2), %xmm9
0543 
0544         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
0545 
0546         vpxor   %xmm14, %xmm9, %xmm9
0547 
0548 
0549 
0550 _return_T\@:
0551         mov     \AUTH_TAG, %r10              # r10 = authTag
0552         mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
0553 
0554         cmp     $16, %r11
0555         je      _T_16\@
0556 
0557         cmp     $8, %r11
0558         jl      _T_4\@
0559 
0560 _T_8\@:
0561         vmovq   %xmm9, %rax
0562         mov     %rax, (%r10)
0563         add     $8, %r10
0564         sub     $8, %r11
0565         vpsrldq $8, %xmm9, %xmm9
0566         test    %r11, %r11
0567         je     _return_T_done\@
0568 _T_4\@:
0569         vmovd   %xmm9, %eax
0570         mov     %eax, (%r10)
0571         add     $4, %r10
0572         sub     $4, %r11
0573         vpsrldq     $4, %xmm9, %xmm9
0574         test    %r11, %r11
0575         je     _return_T_done\@
0576 _T_123\@:
0577         vmovd     %xmm9, %eax
0578         cmp     $2, %r11
0579         jl     _T_1\@
0580         mov     %ax, (%r10)
0581         cmp     $2, %r11
0582         je     _return_T_done\@
0583         add     $2, %r10
0584         sar     $16, %eax
0585 _T_1\@:
0586         mov     %al, (%r10)
0587         jmp     _return_T_done\@
0588 
0589 _T_16\@:
0590         vmovdqu %xmm9, (%r10)
0591 
0592 _return_T_done\@:
0593 .endm
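# Worked example of the final length block (illustrative values): for an
# aadLen of 12 bytes and a 64-byte message,
#
#       len(A) = 12*8 = 96  bits = 0x60
#       len(C) = 64*8 = 512 bits = 0x200
#
# so after the vpslldq/vpxor sequence xmm15 holds len(A) in its high
# qword and len(C) in its low qword, i.e. 0x0000000000000060 :
# 0x0000000000000200.  That value is XORed into the hash, run through
# one more GHASH_MUL, byte-swapped, and masked with E(K, Y0) to give
# the tag.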
0594 
0595 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
0596 
0597     mov     \AAD, %r10                      # r10 = AAD
0598     mov     \AADLEN, %r12                      # r12 = aadLen
0599 
0600 
0601     mov     %r12, %r11
0602 
0603     vpxor   \T8, \T8, \T8
0604     vpxor   \T7, \T7, \T7
0605     cmp     $16, %r11
0606     jl      _get_AAD_rest8\@
0607 _get_AAD_blocks\@:
0608     vmovdqu (%r10), \T7
0609     vpshufb SHUF_MASK(%rip), \T7, \T7
0610     vpxor   \T7, \T8, \T8
0611     \GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
0612     add     $16, %r10
0613     sub     $16, %r12
0614     sub     $16, %r11
0615     cmp     $16, %r11
0616     jge     _get_AAD_blocks\@
0617     vmovdqu \T8, \T7
0618     test    %r11, %r11
0619     je      _get_AAD_done\@
0620 
0621     vpxor   \T7, \T7, \T7
0622 
0623     /* read the last <16B of AAD. since we have at least 4B of
0624     data right after the AAD (the ICV, and maybe some CT), we can
0625     read 4B/8B blocks safely, and then get rid of the extra stuff */
0626 _get_AAD_rest8\@:
0627     cmp     $4, %r11
0628     jle     _get_AAD_rest4\@
0629     movq    (%r10), \T1
0630     add     $8, %r10
0631     sub     $8, %r11
0632     vpslldq $8, \T1, \T1
0633     vpsrldq $8, \T7, \T7
0634     vpxor   \T1, \T7, \T7
0635     jmp     _get_AAD_rest8\@
0636 _get_AAD_rest4\@:
0637     test    %r11, %r11
0638     jle      _get_AAD_rest0\@
0639     mov     (%r10), %eax
0640     movq    %rax, \T1
0641     add     $4, %r10
0642     sub     $4, %r11
0643     vpslldq $12, \T1, \T1
0644     vpsrldq $4, \T7, \T7
0645     vpxor   \T1, \T7, \T7
0646 _get_AAD_rest0\@:
0647     /* finalize: shift out the extra bytes we read, and align
0648     left. since pslldq can only shift by an immediate, we use
0649     vpshufb and an array of shuffle masks */
0650     movq    %r12, %r11
0651     salq    $4, %r11
0652     vmovdqu  aad_shift_arr(%r11), \T1
0653     vpshufb \T1, \T7, \T7
0654 _get_AAD_rest_final\@:
0655     vpshufb SHUF_MASK(%rip), \T7, \T7
0656     vpxor   \T8, \T7, \T7
0657     \GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
0658 
0659 _get_AAD_done\@:
0660         vmovdqu \T7, AadHash(arg2)
0661 .endm
0662 
0663 .macro INIT GHASH_MUL PRECOMPUTE
0664         mov arg6, %r11
0665         mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
0666         xor %r11d, %r11d
0667         mov %r11, InLen(arg2) # ctx_data.in_length = 0
0668 
0669         mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
0670         mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
0671         mov arg3, %rax
0672         movdqu (%rax), %xmm0
0673         movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
0674 
0675         vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
0676         movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
0677 
0678         vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
0679 
0680         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
0681         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
0682         vmovdqa  %xmm6, %xmm2
0683         vpsllq   $1, %xmm6, %xmm6
0684         vpsrlq   $63, %xmm2, %xmm2
0685         vmovdqa  %xmm2, %xmm1
0686         vpslldq  $8, %xmm2, %xmm2
0687         vpsrldq  $8, %xmm1, %xmm1
0688         vpor     %xmm2, %xmm6, %xmm6
0689         #reduction
0690         vpshufd  $0b00100100, %xmm1, %xmm2
0691         vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
0692         vpand    POLY(%rip), %xmm2, %xmm2
0693         vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
0694         #######################################################################
0695         vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
0696 
0697         CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
0698 
0699         \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
0700 .endm
0701 
0702 
0703 # Reads DLEN bytes starting at DPTR and stores in XMMDst
0704 # where 0 < DLEN < 16
0705 # Clobbers %rax, DLEN
0706 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
0707         vpxor \XMMDst, \XMMDst, \XMMDst
0708 
0709         cmp $8, \DLEN
0710         jl _read_lt8_\@
0711         mov (\DPTR), %rax
0712         vpinsrq $0, %rax, \XMMDst, \XMMDst
0713         sub $8, \DLEN
0714         jz _done_read_partial_block_\@
0715         xor %eax, %eax
0716 _read_next_byte_\@:
0717         shl $8, %rax
0718         mov 7(\DPTR, \DLEN, 1), %al
0719         dec \DLEN
0720         jnz _read_next_byte_\@
0721         vpinsrq $1, %rax, \XMMDst, \XMMDst
0722         jmp _done_read_partial_block_\@
0723 _read_lt8_\@:
0724         xor %eax, %eax
0725 _read_next_byte_lt8_\@:
0726         shl $8, %rax
0727         mov -1(\DPTR, \DLEN, 1), %al
0728         dec \DLEN
0729         jnz _read_next_byte_lt8_\@
0730         vpinsrq $0, %rax, \XMMDst, \XMMDst
0731 _done_read_partial_block_\@:
0732 .endm
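# Example of the two paths (sketch): for \DLEN = 12 the macro loads the
# first 8 bytes with a single mov into the low qword, then byte-loops
# over offsets 11 down to 8 building the remaining 4 bytes in %rax and
# inserts them as the high qword; for \DLEN = 5 it takes the _read_lt8_
# path and byte-loops over offsets 4 down to 0, inserting the result as
# the low qword.  In both cases the bytes land in little-endian order,
# as a full 16-byte load would place them, and the unread lanes of
# \XMMDst stay zero from the initial vpxor.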
0733 
0734 # PARTIAL_BLOCK: Handles the encryption/decryption and GHASH update of a
0735 # partial block carried over between update calls.
0736 # Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
0737 # Outputs encrypted bytes, and updates hash and partial-block info in the gcm context data
0738 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
0739 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
0740         AAD_HASH ENC_DEC
0741         mov     PBlockLen(arg2), %r13
0742         test    %r13, %r13
0743         je  _partial_block_done_\@  # Leave Macro if no partial blocks
0744         # Read in input data without over reading
0745         cmp $16, \PLAIN_CYPH_LEN
0746         jl  _fewer_than_16_bytes_\@
0747         vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
0748         jmp _data_read_\@
0749 
0750 _fewer_than_16_bytes_\@:
0751         lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
0752         mov \PLAIN_CYPH_LEN, %r12
0753         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
0754 
0755         mov PBlockLen(arg2), %r13
0756 
0757 _data_read_\@:              # Finished reading in data
0758 
0759         vmovdqu PBlockEncKey(arg2), %xmm9
0760         vmovdqu HashKey(arg2), %xmm13
0761 
0762         lea SHIFT_MASK(%rip), %r12
0763 
0764         # adjust the shuffle mask pointer to be able to shift r13 bytes
0765         # (r13 is the length of the partial block carried over from the previous call)
0766         add %r13, %r12
0767         vmovdqu (%r12), %xmm2       # get the appropriate shuffle mask
0768         vpshufb %xmm2, %xmm9, %xmm9     # shift right r13 bytes
0769 
0770 .if  \ENC_DEC ==  DEC
0771         vmovdqa %xmm1, %xmm3
0772         pxor    %xmm1, %xmm9        # Cyphertext XOR E(K, Yn)
0773 
0774         mov \PLAIN_CYPH_LEN, %r10
0775         add %r13, %r10
0776         # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
0777         sub $16, %r10
0778         # Determine if the partial block is not being filled and
0779         # shift the mask accordingly
0780         jge _no_extra_mask_1_\@
0781         sub %r10, %r12
0782 _no_extra_mask_1_\@:
0783 
0784         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0785         # get the appropriate mask to mask out bottom r13 bytes of xmm9
0786         vpand   %xmm1, %xmm9, %xmm9     # mask out bottom r13 bytes of xmm9
0787 
0788         vpand   %xmm1, %xmm3, %xmm3
0789         vmovdqa SHUF_MASK(%rip), %xmm10
0790         vpshufb %xmm10, %xmm3, %xmm3
0791         vpshufb %xmm2, %xmm3, %xmm3
0792         vpxor   %xmm3, \AAD_HASH, \AAD_HASH
0793 
0794         test    %r10, %r10
0795         jl  _partial_incomplete_1_\@
0796 
0797         # GHASH computation for the last <16 Byte block
0798         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0799         xor %eax,%eax
0800 
0801         mov %rax, PBlockLen(arg2)
0802         jmp _dec_done_\@
0803 _partial_incomplete_1_\@:
0804         add \PLAIN_CYPH_LEN, PBlockLen(arg2)
0805 _dec_done_\@:
0806         vmovdqu \AAD_HASH, AadHash(arg2)
0807 .else
0808         vpxor   %xmm1, %xmm9, %xmm9         # Plaintext XOR E(K, Yn)
0809 
0810         mov \PLAIN_CYPH_LEN, %r10
0811         add %r13, %r10
0812         # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
0813         sub $16, %r10
0814         # Determine if the partial block is not being filled and
0815         # shift the mask accordingly
0816         jge _no_extra_mask_2_\@
0817         sub %r10, %r12
0818 _no_extra_mask_2_\@:
0819 
0820         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0821         # get the appropriate mask to mask out bottom r13 bytes of xmm9
0822         vpand   %xmm1, %xmm9, %xmm9
0823 
0824         vmovdqa SHUF_MASK(%rip), %xmm1
0825         vpshufb %xmm1, %xmm9, %xmm9
0826         vpshufb %xmm2, %xmm9, %xmm9
0827         vpxor   %xmm9, \AAD_HASH, \AAD_HASH
0828 
0829         test    %r10, %r10
0830         jl  _partial_incomplete_2_\@
0831 
0832         # GHASH computation for the last <16 Byte block
0833         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0834         xor %eax,%eax
0835 
0836         mov %rax, PBlockLen(arg2)
0837         jmp _encode_done_\@
0838 _partial_incomplete_2_\@:
0839         add \PLAIN_CYPH_LEN, PBlockLen(arg2)
0840 _encode_done_\@:
0841         vmovdqu \AAD_HASH, AadHash(arg2)
0842 
0843         vmovdqa SHUF_MASK(%rip), %xmm10
0844         # shuffle xmm9 back to output as ciphertext
0845         vpshufb %xmm10, %xmm9, %xmm9
0846         vpshufb %xmm2, %xmm9, %xmm9
0847 .endif
0848         # output encrypted Bytes
0849         test    %r10, %r10
0850         jl  _partial_fill_\@
0851         mov %r13, %r12
0852         mov $16, %r13
0853         # Set r13 to be the number of bytes to write out
0854         sub %r12, %r13
0855         jmp _count_set_\@
0856 _partial_fill_\@:
0857         mov \PLAIN_CYPH_LEN, %r13
0858 _count_set_\@:
0859         vmovdqa %xmm9, %xmm0
0860         vmovq   %xmm0, %rax
0861         cmp $8, %r13
0862         jle _less_than_8_bytes_left_\@
0863 
0864         mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0865         add $8, \DATA_OFFSET
0866         psrldq  $8, %xmm0
0867         vmovq   %xmm0, %rax
0868         sub $8, %r13
0869 _less_than_8_bytes_left_\@:
0870         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0871         add $1, \DATA_OFFSET
0872         shr $8, %rax
0873         sub $1, %r13
0874         jne _less_than_8_bytes_left_\@
0875 _partial_block_done_\@:
0876 .endm # PARTIAL_BLOCK
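# PBlockLen is the carry between update calls: a non-zero value means
# PBlockEncKey already holds E(K, Yn) for a partially consumed counter
# block, so the macro above tops that block up first (emitting at most
# 16 - PBlockLen bytes) and GCM_ENC_DEC then subtracts the consumed
# bytes from the length before resuming full-block processing.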
0877 
0878 ###############################################################################
0879 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
0880 # Input: A and B (128-bits each, bit-reflected)
0881 # Output: C = A*B*x mod poly, (i.e. >>1 )
0882 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
0883 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
0884 ###############################################################################
0885 .macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
0886 
0887         vpshufd         $0b01001110, \GH, \T2
0888         vpshufd         $0b01001110, \HK, \T3
0889         vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
0890         vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
0891 
0892         vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
0893         vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
0894         vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
0895         vpxor           \GH, \T2,\T2
0896         vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
0897 
0898         vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
0899         vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
0900         vpxor           \T3, \GH, \GH
0901         vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
0902 
0903         #first phase of the reduction
0904         vpslld  $31, \GH, \T2                   # packed left shifting << 31
0905         vpslld  $30, \GH, \T3                   # packed left shifting << 30
0906         vpslld  $25, \GH, \T4                   # packed left shifting << 25
0907 
0908         vpxor   \T3, \T2, \T2                   # xor the shifted versions
0909         vpxor   \T4, \T2, \T2
0910 
0911         vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
0912 
0913         vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
0914         vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
0915 
0916         #second phase of the reduction
0917 
0918         vpsrld  $1,\GH, \T2                     # packed right shifting >> 1
0919         vpsrld  $2,\GH, \T3                     # packed right shifting >> 2
0920         vpsrld  $7,\GH, \T4                     # packed right shifting >> 7
0921         vpxor   \T3, \T2, \T2                   # xor the shifted versions
0922         vpxor   \T4, \T2, \T2
0923 
0924         vpxor   \T5, \T2, \T2
0925         vpxor   \T2, \GH, \GH
0926         vpxor   \T1, \GH, \GH                   # the result is in GH
0927 
0928 
0929 .endm
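# The multiply above is Karatsuba on 64-bit halves: with GH = a1:a0 and
# HK = b1:b0 it computes a1*b1, a0*b0 and (a1+a0)*(b1+b0) with three
# vpclmulqdq, and recovers the middle term as
#
#       a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
#
# (addition is XOR in GF(2)), giving the 256-bit product in <T1:GH>.
# The two reduction phases then fold that product modulo
# x^128 + x^127 + x^126 + x^121 + 1 using shifts and XORs instead of a
# carry-less multiply by the polynomial.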
0930 
0931 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
0932 
0933         # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
0934         vmovdqa  \HK, \T5
0935 
0936         vpshufd  $0b01001110, \T5, \T1
0937         vpxor    \T5, \T1, \T1
0938         vmovdqu  \T1, HashKey_k(arg2)
0939 
0940         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
0941         vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
0942         vpshufd  $0b01001110, \T5, \T1
0943         vpxor    \T5, \T1, \T1
0944         vmovdqu  \T1, HashKey_2_k(arg2)
0945 
0946         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
0947         vmovdqu  \T5, HashKey_3(arg2)
0948         vpshufd  $0b01001110, \T5, \T1
0949         vpxor    \T5, \T1, \T1
0950         vmovdqu  \T1, HashKey_3_k(arg2)
0951 
0952         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
0953         vmovdqu  \T5, HashKey_4(arg2)
0954         vpshufd  $0b01001110, \T5, \T1
0955         vpxor    \T5, \T1, \T1
0956         vmovdqu  \T1, HashKey_4_k(arg2)
0957 
0958         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
0959         vmovdqu  \T5, HashKey_5(arg2)
0960         vpshufd  $0b01001110, \T5, \T1
0961         vpxor    \T5, \T1, \T1
0962         vmovdqu  \T1, HashKey_5_k(arg2)
0963 
0964         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
0965         vmovdqu  \T5, HashKey_6(arg2)
0966         vpshufd  $0b01001110, \T5, \T1
0967         vpxor    \T5, \T1, \T1
0968         vmovdqu  \T1, HashKey_6_k(arg2)
0969 
0970         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
0971         vmovdqu  \T5, HashKey_7(arg2)
0972         vpshufd  $0b01001110, \T5, \T1
0973         vpxor    \T5, \T1, \T1
0974         vmovdqu  \T1, HashKey_7_k(arg2)
0975 
0976         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
0977         vmovdqu  \T5, HashKey_8(arg2)
0978         vpshufd  $0b01001110, \T5, \T1
0979         vpxor    \T5, \T1, \T1
0980         vmovdqu  \T1, HashKey_8_k(arg2)
0981 
0982 .endm
0983 
0984 ## if a = number of total plaintext bytes
0985 ## b = floor(a/16)
0986 ## num_initial_blocks = b mod 8
0987 ## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
0988 ## r10, r11, r12, rax are clobbered
0989 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
0990 
0991 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
0992     i = (8-\num_initial_blocks)
0993     setreg
0994         vmovdqu AadHash(arg2), reg_i
0995 
0996     # start AES for num_initial_blocks blocks
0997     vmovdqu CurCount(arg2), \CTR
0998 
0999     i = (9-\num_initial_blocks)
1000     setreg
1001 .rep \num_initial_blocks
1002                 vpaddd  ONE(%rip), \CTR, \CTR       # INCR Y0
1003                 vmovdqa \CTR, reg_i
1004                 vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
1005     i = (i+1)
1006     setreg
1007 .endr
1008 
1009     vmovdqa  (arg1), \T_key
1010     i = (9-\num_initial_blocks)
1011     setreg
1012 .rep \num_initial_blocks
1013                 vpxor   \T_key, reg_i, reg_i
1014     i = (i+1)
1015     setreg
1016 .endr
1017 
1018        j = 1
1019        setreg
1020 .rep \REP
1021        vmovdqa  16*j(arg1), \T_key
1022     i = (9-\num_initial_blocks)
1023     setreg
1024 .rep \num_initial_blocks
1025         vaesenc \T_key, reg_i, reg_i
1026     i = (i+1)
1027     setreg
1028 .endr
1029 
1030        j = (j+1)
1031        setreg
1032 .endr
1033 
1034     vmovdqa  16*j(arg1), \T_key
1035     i = (9-\num_initial_blocks)
1036     setreg
1037 .rep \num_initial_blocks
1038         vaesenclast      \T_key, reg_i, reg_i
1039     i = (i+1)
1040     setreg
1041 .endr
1042 
1043     i = (9-\num_initial_blocks)
1044     setreg
1045 .rep \num_initial_blocks
1046                 vmovdqu (arg4, %r11), \T1
1047                 vpxor   \T1, reg_i, reg_i
1048                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1049                 add     $16, %r11
1050 .if  \ENC_DEC == DEC
1051                 vmovdqa \T1, reg_i
1052 .endif
1053                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1054     i = (i+1)
1055     setreg
1056 .endr
1057 
1058 
1059     i = (8-\num_initial_blocks)
1060     j = (9-\num_initial_blocks)
1061     setreg
1062 
1063 .rep \num_initial_blocks
1064         vpxor    reg_i, reg_j, reg_j
1065         GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1066     i = (i+1)
1067     j = (j+1)
1068     setreg
1069 .endr
1070         # XMM8 has the combined result here
1071 
1072         vmovdqa  \XMM8, TMP1(%rsp)
1073         vmovdqa  \XMM8, \T3
1074 
1075         cmp     $128, %r13
1076         jl      _initial_blocks_done\@                  # no need for precomputed constants
1077 
1078 ###############################################################################
1079 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1080                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1081                 vmovdqa  \CTR, \XMM1
1082                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1083 
1084                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1085                 vmovdqa  \CTR, \XMM2
1086                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1087 
1088                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1089                 vmovdqa  \CTR, \XMM3
1090                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1091 
1092                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1093                 vmovdqa  \CTR, \XMM4
1094                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1095 
1096                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1097                 vmovdqa  \CTR, \XMM5
1098                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1099 
1100                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1101                 vmovdqa  \CTR, \XMM6
1102                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1103 
1104                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1105                 vmovdqa  \CTR, \XMM7
1106                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1107 
1108                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1109                 vmovdqa  \CTR, \XMM8
1110                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1111 
1112                 vmovdqa  (arg1), \T_key
1113                 vpxor    \T_key, \XMM1, \XMM1
1114                 vpxor    \T_key, \XMM2, \XMM2
1115                 vpxor    \T_key, \XMM3, \XMM3
1116                 vpxor    \T_key, \XMM4, \XMM4
1117                 vpxor    \T_key, \XMM5, \XMM5
1118                 vpxor    \T_key, \XMM6, \XMM6
1119                 vpxor    \T_key, \XMM7, \XMM7
1120                 vpxor    \T_key, \XMM8, \XMM8
1121 
1122                i = 1
1123                setreg
1124 .rep    \REP       # do REP rounds
1125                 vmovdqa  16*i(arg1), \T_key
1126                 vaesenc  \T_key, \XMM1, \XMM1
1127                 vaesenc  \T_key, \XMM2, \XMM2
1128                 vaesenc  \T_key, \XMM3, \XMM3
1129                 vaesenc  \T_key, \XMM4, \XMM4
1130                 vaesenc  \T_key, \XMM5, \XMM5
1131                 vaesenc  \T_key, \XMM6, \XMM6
1132                 vaesenc  \T_key, \XMM7, \XMM7
1133                 vaesenc  \T_key, \XMM8, \XMM8
1134                i = (i+1)
1135                setreg
1136 .endr
1137 
1138                 vmovdqa  16*i(arg1), \T_key
1139                 vaesenclast  \T_key, \XMM1, \XMM1
1140                 vaesenclast  \T_key, \XMM2, \XMM2
1141                 vaesenclast  \T_key, \XMM3, \XMM3
1142                 vaesenclast  \T_key, \XMM4, \XMM4
1143                 vaesenclast  \T_key, \XMM5, \XMM5
1144                 vaesenclast  \T_key, \XMM6, \XMM6
1145                 vaesenclast  \T_key, \XMM7, \XMM7
1146                 vaesenclast  \T_key, \XMM8, \XMM8
1147 
1148                 vmovdqu  (arg4, %r11), \T1
1149                 vpxor    \T1, \XMM1, \XMM1
1150                 vmovdqu  \XMM1, (arg3 , %r11)
1151                 .if   \ENC_DEC == DEC
1152                 vmovdqa  \T1, \XMM1
1153                 .endif
1154 
1155                 vmovdqu  16*1(arg4, %r11), \T1
1156                 vpxor    \T1, \XMM2, \XMM2
1157                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
1158                 .if   \ENC_DEC == DEC
1159                 vmovdqa  \T1, \XMM2
1160                 .endif
1161 
1162                 vmovdqu  16*2(arg4, %r11), \T1
1163                 vpxor    \T1, \XMM3, \XMM3
1164                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
1165                 .if   \ENC_DEC == DEC
1166                 vmovdqa  \T1, \XMM3
1167                 .endif
1168 
1169                 vmovdqu  16*3(arg4, %r11), \T1
1170                 vpxor    \T1, \XMM4, \XMM4
1171                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
1172                 .if   \ENC_DEC == DEC
1173                 vmovdqa  \T1, \XMM4
1174                 .endif
1175 
1176                 vmovdqu  16*4(arg4, %r11), \T1
1177                 vpxor    \T1, \XMM5, \XMM5
1178                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
1179                 .if   \ENC_DEC == DEC
1180                 vmovdqa  \T1, \XMM5
1181                 .endif
1182 
1183                 vmovdqu  16*5(arg4, %r11), \T1
1184                 vpxor    \T1, \XMM6, \XMM6
1185                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
1186                 .if   \ENC_DEC == DEC
1187                 vmovdqa  \T1, \XMM6
1188                 .endif
1189 
1190                 vmovdqu  16*6(arg4, %r11), \T1
1191                 vpxor    \T1, \XMM7, \XMM7
1192                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
1193                 .if   \ENC_DEC == DEC
1194                 vmovdqa  \T1, \XMM7
1195                 .endif
1196 
1197                 vmovdqu  16*7(arg4, %r11), \T1
1198                 vpxor    \T1, \XMM8, \XMM8
1199                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
1200                 .if   \ENC_DEC == DEC
1201                 vmovdqa  \T1, \XMM8
1202                 .endif
1203 
1204                 add     $128, %r11
1205 
1206                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1207                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1208                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1209                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1210                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1211                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1212                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1213                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1214                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1215 
1216 ###############################################################################
1217 
1218 _initial_blocks_done\@:
1219 
1220 .endm
1221 
1222 # encrypt 8 blocks at a time
1223 # ghash the 8 previously encrypted ciphertext blocks
1224 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
1225 # r11 is the data offset value
1226 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1227 
1228         vmovdqa \XMM1, \T2
1229         vmovdqa \XMM2, TMP2(%rsp)
1230         vmovdqa \XMM3, TMP3(%rsp)
1231         vmovdqa \XMM4, TMP4(%rsp)
1232         vmovdqa \XMM5, TMP5(%rsp)
1233         vmovdqa \XMM6, TMP6(%rsp)
1234         vmovdqa \XMM7, TMP7(%rsp)
1235         vmovdqa \XMM8, TMP8(%rsp)
1236 
1237 .if \loop_idx == in_order
1238                 vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1239                 vpaddd  ONE(%rip), \XMM1, \XMM2
1240                 vpaddd  ONE(%rip), \XMM2, \XMM3
1241                 vpaddd  ONE(%rip), \XMM3, \XMM4
1242                 vpaddd  ONE(%rip), \XMM4, \XMM5
1243                 vpaddd  ONE(%rip), \XMM5, \XMM6
1244                 vpaddd  ONE(%rip), \XMM6, \XMM7
1245                 vpaddd  ONE(%rip), \XMM7, \XMM8
1246                 vmovdqa \XMM8, \CTR
1247 
1248                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1249                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1250                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1251                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1252                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1253                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1254                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1255                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1256 .else
1257                 vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1258                 vpaddd  ONEf(%rip), \XMM1, \XMM2
1259                 vpaddd  ONEf(%rip), \XMM2, \XMM3
1260                 vpaddd  ONEf(%rip), \XMM3, \XMM4
1261                 vpaddd  ONEf(%rip), \XMM4, \XMM5
1262                 vpaddd  ONEf(%rip), \XMM5, \XMM6
1263                 vpaddd  ONEf(%rip), \XMM6, \XMM7
1264                 vpaddd  ONEf(%rip), \XMM7, \XMM8
1265                 vmovdqa \XMM8, \CTR
1266 .endif
1267 
1268 
1269         #######################################################################
1270 
1271                 vmovdqu (arg1), \T1
1272                 vpxor   \T1, \XMM1, \XMM1
1273                 vpxor   \T1, \XMM2, \XMM2
1274                 vpxor   \T1, \XMM3, \XMM3
1275                 vpxor   \T1, \XMM4, \XMM4
1276                 vpxor   \T1, \XMM5, \XMM5
1277                 vpxor   \T1, \XMM6, \XMM6
1278                 vpxor   \T1, \XMM7, \XMM7
1279                 vpxor   \T1, \XMM8, \XMM8
1280 
1281         #######################################################################
1282 
1283 
1284 
1285 
1286 
1287                 vmovdqu 16*1(arg1), \T1
1288                 vaesenc \T1, \XMM1, \XMM1
1289                 vaesenc \T1, \XMM2, \XMM2
1290                 vaesenc \T1, \XMM3, \XMM3
1291                 vaesenc \T1, \XMM4, \XMM4
1292                 vaesenc \T1, \XMM5, \XMM5
1293                 vaesenc \T1, \XMM6, \XMM6
1294                 vaesenc \T1, \XMM7, \XMM7
1295                 vaesenc \T1, \XMM8, \XMM8
1296 
1297                 vmovdqu 16*2(arg1), \T1
1298                 vaesenc \T1, \XMM1, \XMM1
1299                 vaesenc \T1, \XMM2, \XMM2
1300                 vaesenc \T1, \XMM3, \XMM3
1301                 vaesenc \T1, \XMM4, \XMM4
1302                 vaesenc \T1, \XMM5, \XMM5
1303                 vaesenc \T1, \XMM6, \XMM6
1304                 vaesenc \T1, \XMM7, \XMM7
1305                 vaesenc \T1, \XMM8, \XMM8
1306 
1307 
1308         #######################################################################
1309 
1310         vmovdqu         HashKey_8(arg2), \T5
1311         vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1312         vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1313 
1314         vpshufd         $0b01001110, \T2, \T6
1315         vpxor           \T2, \T6, \T6
1316 
1317         vmovdqu         HashKey_8_k(arg2), \T5
1318         vpclmulqdq      $0x00, \T5, \T6, \T6
1319 
1320                 vmovdqu 16*3(arg1), \T1
1321                 vaesenc \T1, \XMM1, \XMM1
1322                 vaesenc \T1, \XMM2, \XMM2
1323                 vaesenc \T1, \XMM3, \XMM3
1324                 vaesenc \T1, \XMM4, \XMM4
1325                 vaesenc \T1, \XMM5, \XMM5
1326                 vaesenc \T1, \XMM6, \XMM6
1327                 vaesenc \T1, \XMM7, \XMM7
1328                 vaesenc \T1, \XMM8, \XMM8
1329 
1330         vmovdqa         TMP2(%rsp), \T1
1331         vmovdqu         HashKey_7(arg2), \T5
1332         vpclmulqdq      $0x11, \T5, \T1, \T3
1333         vpxor           \T3, \T4, \T4
1334         vpclmulqdq      $0x00, \T5, \T1, \T3
1335         vpxor           \T3, \T7, \T7
1336 
1337         vpshufd         $0b01001110, \T1, \T3
1338         vpxor           \T1, \T3, \T3
1339         vmovdqu         HashKey_7_k(arg2), \T5
1340         vpclmulqdq      $0x10, \T5, \T3, \T3
1341         vpxor           \T3, \T6, \T6
1342 
1343                 vmovdqu 16*4(arg1), \T1
1344                 vaesenc \T1, \XMM1, \XMM1
1345                 vaesenc \T1, \XMM2, \XMM2
1346                 vaesenc \T1, \XMM3, \XMM3
1347                 vaesenc \T1, \XMM4, \XMM4
1348                 vaesenc \T1, \XMM5, \XMM5
1349                 vaesenc \T1, \XMM6, \XMM6
1350                 vaesenc \T1, \XMM7, \XMM7
1351                 vaesenc \T1, \XMM8, \XMM8
1352 
1353         #######################################################################
1354 
1355         vmovdqa         TMP3(%rsp), \T1
1356         vmovdqu         HashKey_6(arg2), \T5
1357         vpclmulqdq      $0x11, \T5, \T1, \T3
1358         vpxor           \T3, \T4, \T4
1359         vpclmulqdq      $0x00, \T5, \T1, \T3
1360         vpxor           \T3, \T7, \T7
1361 
1362         vpshufd         $0b01001110, \T1, \T3
1363         vpxor           \T1, \T3, \T3
1364         vmovdqu         HashKey_6_k(arg2), \T5
1365         vpclmulqdq      $0x10, \T5, \T3, \T3
1366         vpxor           \T3, \T6, \T6
1367 
1368                 vmovdqu 16*5(arg1), \T1
1369                 vaesenc \T1, \XMM1, \XMM1
1370                 vaesenc \T1, \XMM2, \XMM2
1371                 vaesenc \T1, \XMM3, \XMM3
1372                 vaesenc \T1, \XMM4, \XMM4
1373                 vaesenc \T1, \XMM5, \XMM5
1374                 vaesenc \T1, \XMM6, \XMM6
1375                 vaesenc \T1, \XMM7, \XMM7
1376                 vaesenc \T1, \XMM8, \XMM8
1377 
1378         vmovdqa         TMP4(%rsp), \T1
1379         vmovdqu         HashKey_5(arg2), \T5
1380         vpclmulqdq      $0x11, \T5, \T1, \T3
1381         vpxor           \T3, \T4, \T4
1382         vpclmulqdq      $0x00, \T5, \T1, \T3
1383         vpxor           \T3, \T7, \T7
1384 
1385         vpshufd         $0b01001110, \T1, \T3
1386         vpxor           \T1, \T3, \T3
1387         vmovdqu         HashKey_5_k(arg2), \T5
1388         vpclmulqdq      $0x10, \T5, \T3, \T3
1389         vpxor           \T3, \T6, \T6
1390 
1391                 vmovdqu 16*6(arg1), \T1
1392                 vaesenc \T1, \XMM1, \XMM1
1393                 vaesenc \T1, \XMM2, \XMM2
1394                 vaesenc \T1, \XMM3, \XMM3
1395                 vaesenc \T1, \XMM4, \XMM4
1396                 vaesenc \T1, \XMM5, \XMM5
1397                 vaesenc \T1, \XMM6, \XMM6
1398                 vaesenc \T1, \XMM7, \XMM7
1399                 vaesenc \T1, \XMM8, \XMM8
1400 
1401 
1402         vmovdqa         TMP5(%rsp), \T1
1403         vmovdqu         HashKey_4(arg2), \T5
1404         vpclmulqdq      $0x11, \T5, \T1, \T3
1405         vpxor           \T3, \T4, \T4
1406         vpclmulqdq      $0x00, \T5, \T1, \T3
1407         vpxor           \T3, \T7, \T7
1408 
1409         vpshufd         $0b01001110, \T1, \T3
1410         vpxor           \T1, \T3, \T3
1411         vmovdqu         HashKey_4_k(arg2), \T5
1412         vpclmulqdq      $0x10, \T5, \T3, \T3
1413         vpxor           \T3, \T6, \T6
1414 
1415                 vmovdqu 16*7(arg1), \T1
1416                 vaesenc \T1, \XMM1, \XMM1
1417                 vaesenc \T1, \XMM2, \XMM2
1418                 vaesenc \T1, \XMM3, \XMM3
1419                 vaesenc \T1, \XMM4, \XMM4
1420                 vaesenc \T1, \XMM5, \XMM5
1421                 vaesenc \T1, \XMM6, \XMM6
1422                 vaesenc \T1, \XMM7, \XMM7
1423                 vaesenc \T1, \XMM8, \XMM8
1424 
1425         vmovdqa         TMP6(%rsp), \T1
1426         vmovdqu         HashKey_3(arg2), \T5
1427         vpclmulqdq      $0x11, \T5, \T1, \T3
1428         vpxor           \T3, \T4, \T4
1429         vpclmulqdq      $0x00, \T5, \T1, \T3
1430         vpxor           \T3, \T7, \T7
1431 
1432         vpshufd         $0b01001110, \T1, \T3
1433         vpxor           \T1, \T3, \T3
1434         vmovdqu         HashKey_3_k(arg2), \T5
1435         vpclmulqdq      $0x10, \T5, \T3, \T3
1436         vpxor           \T3, \T6, \T6
1437 
1438 
1439                 vmovdqu 16*8(arg1), \T1
1440                 vaesenc \T1, \XMM1, \XMM1
1441                 vaesenc \T1, \XMM2, \XMM2
1442                 vaesenc \T1, \XMM3, \XMM3
1443                 vaesenc \T1, \XMM4, \XMM4
1444                 vaesenc \T1, \XMM5, \XMM5
1445                 vaesenc \T1, \XMM6, \XMM6
1446                 vaesenc \T1, \XMM7, \XMM7
1447                 vaesenc \T1, \XMM8, \XMM8
1448 
1449         vmovdqa         TMP7(%rsp), \T1
1450         vmovdqu         HashKey_2(arg2), \T5
1451         vpclmulqdq      $0x11, \T5, \T1, \T3
1452         vpxor           \T3, \T4, \T4
1453         vpclmulqdq      $0x00, \T5, \T1, \T3
1454         vpxor           \T3, \T7, \T7
1455 
1456         vpshufd         $0b01001110, \T1, \T3
1457         vpxor           \T1, \T3, \T3
1458         vmovdqu         HashKey_2_k(arg2), \T5
1459         vpclmulqdq      $0x10, \T5, \T3, \T3
1460         vpxor           \T3, \T6, \T6
1461 
1462         #######################################################################
1463 
1464                 vmovdqu 16*9(arg1), \T5
1465                 vaesenc \T5, \XMM1, \XMM1
1466                 vaesenc \T5, \XMM2, \XMM2
1467                 vaesenc \T5, \XMM3, \XMM3
1468                 vaesenc \T5, \XMM4, \XMM4
1469                 vaesenc \T5, \XMM5, \XMM5
1470                 vaesenc \T5, \XMM6, \XMM6
1471                 vaesenc \T5, \XMM7, \XMM7
1472                 vaesenc \T5, \XMM8, \XMM8
1473 
1474         vmovdqa         TMP8(%rsp), \T1
1475         vmovdqu         HashKey(arg2), \T5
1476         vpclmulqdq      $0x11, \T5, \T1, \T3
1477         vpxor           \T3, \T4, \T4
1478         vpclmulqdq      $0x00, \T5, \T1, \T3
1479         vpxor           \T3, \T7, \T7
1480 
1481         vpshufd         $0b01001110, \T1, \T3
1482         vpxor           \T1, \T3, \T3
1483         vmovdqu         HashKey_k(arg2), \T5
1484         vpclmulqdq      $0x10, \T5, \T3, \T3
1485         vpxor           \T3, \T6, \T6
1486 
1487         vpxor           \T4, \T6, \T6
1488         vpxor           \T7, \T6, \T6
1489 
1490                 vmovdqu 16*10(arg1), \T5
1491 
1492         i = 11
1493         setreg
1494 .rep (\REP-9)
1495 
1496         vaesenc \T5, \XMM1, \XMM1
1497         vaesenc \T5, \XMM2, \XMM2
1498         vaesenc \T5, \XMM3, \XMM3
1499         vaesenc \T5, \XMM4, \XMM4
1500         vaesenc \T5, \XMM5, \XMM5
1501         vaesenc \T5, \XMM6, \XMM6
1502         vaesenc \T5, \XMM7, \XMM7
1503         vaesenc \T5, \XMM8, \XMM8
1504 
1505         vmovdqu 16*i(arg1), \T5
1506         i = i + 1
1507         setreg
1508 .endr
1509 
1510     i = 0
1511     j = 1
1512     setreg
1513 .rep 8
1514         vpxor   16*i(arg4, %r11), \T5, \T2
1515                 .if \ENC_DEC == ENC
1516                 vaesenclast     \T2, reg_j, reg_j
1517                 .else
1518                 vaesenclast     \T2, reg_j, \T3
1519                 vmovdqu 16*i(arg4, %r11), reg_j
1520                 vmovdqu \T3, 16*i(arg3, %r11)
1521                 .endif
1522     i = (i+1)
1523     j = (j+1)
1524     setreg
1525 .endr
1526     #######################################################################
1527 
1528 
1529     vpslldq $8, \T6, \T3                # shift-L T3 2 DWs
1530     vpsrldq $8, \T6, \T6                # shift-R T6 2 DWs
1531     vpxor   \T3, \T7, \T7
1532     vpxor   \T4, \T6, \T6               # accumulate the results in T6:T7
1533 
1534 
1535 
1536     #######################################################################
1537     #first phase of the reduction
1538     #######################################################################
1539         vpslld  $31, \T7, \T2                           # packed left shifting << 31
1540         vpslld  $30, \T7, \T3                           # packed left shifting << 30
1541         vpslld  $25, \T7, \T4                           # packed left shifting << 25
1542 
1543         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1544         vpxor   \T4, \T2, \T2
1545 
1546         vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1547 
1548         vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1549         vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1550     #######################################################################
1551                 .if \ENC_DEC == ENC
1552         vmovdqu  \XMM1, 16*0(arg3,%r11)     # Write to the Ciphertext buffer
1553         vmovdqu  \XMM2, 16*1(arg3,%r11)     # Write to the Ciphertext buffer
1554         vmovdqu  \XMM3, 16*2(arg3,%r11)     # Write to the Ciphertext buffer
1555         vmovdqu  \XMM4, 16*3(arg3,%r11)     # Write to the Ciphertext buffer
1556         vmovdqu  \XMM5, 16*4(arg3,%r11)     # Write to the Ciphertext buffer
1557         vmovdqu  \XMM6, 16*5(arg3,%r11)     # Write to the Ciphertext buffer
1558         vmovdqu  \XMM7, 16*6(arg3,%r11)     # Write to the Ciphertext buffer
1559         vmovdqu  \XMM8, 16*7(arg3,%r11)     # Write to the Ciphertext buffer
1560                 .endif
1561 
1562     #######################################################################
1563     #second phase of the reduction
1564         vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
1565         vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
1566         vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
1567         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1568         vpxor   \T4, \T2, \T2
1569 
1570         vpxor   \T1, \T2, \T2
1571         vpxor   \T2, \T7, \T7
1572         vpxor   \T7, \T6, \T6                           # the result is in T6
1573     #######################################################################
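    # The two phases above together reduce the 256-bit product modulo the GHASH
    # polynomial x^128 + x^127 + x^126 + x^121 + 1 on bit-reflected operands:
    # the dword shift counts 1, 2 and 7 (and their 32-bit complements 31, 30
    # and 25) come from the x^127, x^126 and x^121 terms, while the byte-wise
    # vpslldq/vpsrldq steps handle the bits that cross dword lanes.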
1574 
1575         vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
1576         vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
1577         vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
1578         vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
1579         vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
1580         vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
1581         vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
1582         vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
1583 
1584 
1585     vpxor   \T6, \XMM1, \XMM1
1586 
1587 
1588 
1589 .endm
1590 
1591 
1592 # GHASH the last 8 ciphertext blocks.
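# In outline: block i (bit-reflected) is multiplied by HashKey^(9-i) and the
# products are accumulated with one Karatsuba multiply per block:
#       A*B = Ah*Bh*x^128 + ((Ah^Al)*(Bh^Bl) ^ Ah*Bh ^ Al*Bl)*x^64 + Al*Bl
# T6 collects the high products (imm 0x11), T7 the low products (imm 0x00) and
# XMM1 the middle terms, computed against the precomputed HashKey_i_k values
# (the XORed halves of each key power).  The 256-bit sum is then reduced
# modulo the GHASH polynomial in the two phases at the end of the macro.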
1593 .macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1594 
1595         ## Karatsuba Method
1596 
1597 
1598         vpshufd         $0b01001110, \XMM1, \T2
1599         vpxor           \XMM1, \T2, \T2
1600         vmovdqu         HashKey_8(arg2), \T5
1601         vpclmulqdq      $0x11, \T5, \XMM1, \T6
1602         vpclmulqdq      $0x00, \T5, \XMM1, \T7
1603 
1604         vmovdqu         HashKey_8_k(arg2), \T3
1605         vpclmulqdq      $0x00, \T3, \T2, \XMM1
1606 
1607         ######################
1608 
1609         vpshufd         $0b01001110, \XMM2, \T2
1610         vpxor           \XMM2, \T2, \T2
1611         vmovdqu         HashKey_7(arg2), \T5
1612         vpclmulqdq      $0x11, \T5, \XMM2, \T4
1613         vpxor           \T4, \T6, \T6
1614 
1615         vpclmulqdq      $0x00, \T5, \XMM2, \T4
1616         vpxor           \T4, \T7, \T7
1617 
1618         vmovdqu         HashKey_7_k(arg2), \T3
1619         vpclmulqdq      $0x00, \T3, \T2, \T2
1620         vpxor           \T2, \XMM1, \XMM1
1621 
1622         ######################
1623 
1624         vpshufd         $0b01001110, \XMM3, \T2
1625         vpxor           \XMM3, \T2, \T2
1626         vmovdqu         HashKey_6(arg2), \T5
1627         vpclmulqdq      $0x11, \T5, \XMM3, \T4
1628         vpxor           \T4, \T6, \T6
1629 
1630         vpclmulqdq      $0x00, \T5, \XMM3, \T4
1631         vpxor           \T4, \T7, \T7
1632 
1633         vmovdqu         HashKey_6_k(arg2), \T3
1634         vpclmulqdq      $0x00, \T3, \T2, \T2
1635         vpxor           \T2, \XMM1, \XMM1
1636 
1637         ######################
1638 
1639         vpshufd         $0b01001110, \XMM4, \T2
1640         vpxor           \XMM4, \T2, \T2
1641         vmovdqu         HashKey_5(arg2), \T5
1642         vpclmulqdq      $0x11, \T5, \XMM4, \T4
1643         vpxor           \T4, \T6, \T6
1644 
1645         vpclmulqdq      $0x00, \T5, \XMM4, \T4
1646         vpxor           \T4, \T7, \T7
1647 
1648         vmovdqu         HashKey_5_k(arg2), \T3
1649         vpclmulqdq      $0x00, \T3, \T2, \T2
1650         vpxor           \T2, \XMM1, \XMM1
1651 
1652         ######################
1653 
1654         vpshufd         $0b01001110, \XMM5, \T2
1655         vpxor           \XMM5, \T2, \T2
1656         vmovdqu         HashKey_4(arg2), \T5
1657         vpclmulqdq      $0x11, \T5, \XMM5, \T4
1658         vpxor           \T4, \T6, \T6
1659 
1660         vpclmulqdq      $0x00, \T5, \XMM5, \T4
1661         vpxor           \T4, \T7, \T7
1662 
1663         vmovdqu         HashKey_4_k(arg2), \T3
1664         vpclmulqdq      $0x00, \T3, \T2, \T2
1665         vpxor           \T2, \XMM1, \XMM1
1666 
1667         ######################
1668 
1669         vpshufd         $0b01001110, \XMM6, \T2
1670         vpxor           \XMM6, \T2, \T2
1671         vmovdqu         HashKey_3(arg2), \T5
1672         vpclmulqdq      $0x11, \T5, \XMM6, \T4
1673         vpxor           \T4, \T6, \T6
1674 
1675         vpclmulqdq      $0x00, \T5, \XMM6, \T4
1676         vpxor           \T4, \T7, \T7
1677 
1678         vmovdqu         HashKey_3_k(arg2), \T3
1679         vpclmulqdq      $0x00, \T3, \T2, \T2
1680         vpxor           \T2, \XMM1, \XMM1
1681 
1682         ######################
1683 
1684         vpshufd         $0b01001110, \XMM7, \T2
1685         vpxor           \XMM7, \T2, \T2
1686         vmovdqu         HashKey_2(arg2), \T5
1687         vpclmulqdq      $0x11, \T5, \XMM7, \T4
1688         vpxor           \T4, \T6, \T6
1689 
1690         vpclmulqdq      $0x00, \T5, \XMM7, \T4
1691         vpxor           \T4, \T7, \T7
1692 
1693         vmovdqu         HashKey_2_k(arg2), \T3
1694         vpclmulqdq      $0x00, \T3, \T2, \T2
1695         vpxor           \T2, \XMM1, \XMM1
1696 
1697         ######################
1698 
1699         vpshufd         $0b01001110, \XMM8, \T2
1700         vpxor           \XMM8, \T2, \T2
1701         vmovdqu         HashKey(arg2), \T5
1702         vpclmulqdq      $0x11, \T5, \XMM8, \T4
1703         vpxor           \T4, \T6, \T6
1704 
1705         vpclmulqdq      $0x00, \T5, \XMM8, \T4
1706         vpxor           \T4, \T7, \T7
1707 
1708         vmovdqu         HashKey_k(arg2), \T3
1709         vpclmulqdq      $0x00, \T3, \T2, \T2
1710 
1711         vpxor           \T2, \XMM1, \XMM1
1712         vpxor           \T6, \XMM1, \XMM1
1713         vpxor           \T7, \XMM1, \T2
1714 
1715 
1716 
1717 
1718         vpslldq $8, \T2, \T4
1719         vpsrldq $8, \T2, \T2
1720 
1721         vpxor   \T4, \T7, \T7
1722         vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1723                 # the accumulated carry-less multiplications
1724 
1725         #######################################################################
1726         #first phase of the reduction
1727         vpslld  $31, \T7, \T2   # packed left shifting << 31
1728         vpslld  $30, \T7, \T3   # packed left shifting << 30
1729         vpslld  $25, \T7, \T4   # packed left shifting << 25
1730 
1731         vpxor   \T3, \T2, \T2   # xor the shifted versions
1732         vpxor   \T4, \T2, \T2
1733 
1734         vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1735 
1736         vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1737         vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1738         #######################################################################
1739 
1740 
1741         #second phase of the reduction
1742         vpsrld  $1, \T7, \T2    # packed right shifting >> 1
1743         vpsrld  $2, \T7, \T3    # packed right shifting >> 2
1744         vpsrld  $7, \T7, \T4    # packed right shifting >> 7
1745         vpxor   \T3, \T2, \T2   # xor the shifted versions
1746         vpxor   \T4, \T2, \T2
1747 
1748         vpxor   \T1, \T2, \T2
1749         vpxor   \T2, \T7, \T7
1750         vpxor   \T7, \T6, \T6   # the result is in T6
1751 
1752 .endm
1753 
1754 #############################################################
1755 #void   aesni_gcm_init_avx_gen2
1756 #        (gcm_data     *my_ctx_data,
1757 #         gcm_context_data *data,
1758 #        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1759 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
1760 #           (from Security Association) concatenated with 8 byte
1761 #           Initialisation Vector (from IPSec ESP Payload)
1762 #           concatenated with 0x00000001. 16-byte aligned pointer. */
1763 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1764 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1765 #############################################################
1766 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1767         FUNC_SAVE
1768         INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1769         FUNC_RESTORE
1770         RET
1771 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1772 
1773 ###############################################################################
1774 #void   aesni_gcm_enc_update_avx_gen2(
1775 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1776 #        gcm_context_data *data,
1777 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1778 #        const   u8 *in, /* Plaintext input */
1779 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1780 ###############################################################################
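# keysize (the key length in bytes) selects the REP value passed to
# GCM_ENC_DEC, i.e. the number of full AES rounds before the final
# vaesenclast.  A sketch of the mapping, assuming the usual AES round counts:
#       16 bytes (AES-128) -> 10 rounds ->  9 vaesenc + vaesenclast
#       24 bytes (AES-192) -> 12 rounds -> 11 vaesenc + vaesenclast
#       32 bytes (AES-256) -> 14 rounds -> 13 vaesenc + vaesenclast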
1781 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1782         FUNC_SAVE
1783         mov     keysize, %eax
1784         cmp     $32, %eax
1785         je      key_256_enc_update
1786         cmp     $16, %eax
1787         je      key_128_enc_update
1788         # must be 192
1789         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1790         FUNC_RESTORE
1791         RET
1792 key_128_enc_update:
1793         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1794         FUNC_RESTORE
1795         RET
1796 key_256_enc_update:
1797         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1798         FUNC_RESTORE
1799         RET
1800 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1801 
1802 ###############################################################################
1803 #void   aesni_gcm_dec_update_avx_gen2(
1804 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1805 #        gcm_context_data *data,
1806 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1807 #        const   u8 *in, /* Ciphertext input */
1808 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1809 ###############################################################################
1810 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1811         FUNC_SAVE
1812         mov     keysize,%eax
1813         cmp     $32, %eax
1814         je      key_256_dec_update
1815         cmp     $16, %eax
1816         je      key_128_dec_update
1817         # must be 192
1818         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1819         FUNC_RESTORE
1820         RET
1821 key_128_dec_update:
1822         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1823         FUNC_RESTORE
1824         RET
1825 key_256_dec_update:
1826         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1827         FUNC_RESTORE
1828         RET
1829 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1830 
1831 ###############################################################################
1832 #void   aesni_gcm_finalize_avx_gen2(
1833 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1834 #        gcm_context_data *data,
1835 #        u8      *auth_tag, /* Authenticated Tag output. */
1836 #        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1837 #               Valid values are 16 (most likely), 12 or 8. */
1838 ###############################################################################
1839 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1840         FUNC_SAVE
1841         mov keysize,%eax
1842         cmp     $32, %eax
1843         je      key_256_finalize
1844         cmp     $16, %eax
1845         je      key_128_finalize
1846         # must be 192
1847         GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1848         FUNC_RESTORE
1849         RET
1850 key_128_finalize:
1851         GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1852         FUNC_RESTORE
1853         RET
1854 key_256_finalize:
1855         GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1856         FUNC_RESTORE
1857         RET
1858 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1859 
1860 ###############################################################################
1861 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1862 # Input: A and B (128-bits each, bit-reflected)
1863 # Output: C = A*B*x mod poly, (i.e. >>1 )
1864 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1865 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1866 ###############################################################################
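# Compared with the shift/XOR reduction in the *_AVX macros above, this gen4
# variant reduces with two extra vpclmulqdq operations against the constant
# POLY2 (defined earlier in this file), which encodes the reduction polynomial.
# Roughly: the low 128 bits of the 256-bit product are folded twice by
# multiplying with POLY2, and the folded low half is finally XORed with the
# high half (T1), leaving the reduced result in GH.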
1867 .macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1868 
1869         vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1870         vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1871         vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1872         vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1873         vpxor           \T3, \GH, \GH
1874 
1875 
1876         vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1877         vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1878 
1879         vpxor           \T3, \T1, \T1
1880         vpxor           \T2, \GH, \GH
1881 
1882         #######################################################################
1883         #first phase of the reduction
1884         vmovdqa         POLY2(%rip), \T3
1885 
1886         vpclmulqdq      $0x01, \GH, \T3, \T2
1887         vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1888 
1889         vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1890         #######################################################################
1891         #second phase of the reduction
1892         vpclmulqdq      $0x00, \GH, \T3, \T2
1893         vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1894 
1895         vpclmulqdq      $0x10, \GH, \T3, \GH
1896         vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1897 
1898         vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1899         #######################################################################
1900         vpxor           \T1, \GH, \GH          # the result is in GH
1901 
1902 
1903 .endm
1904 
1905 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1906 
1907         # Precompute HashKey^2 .. HashKey^8 (each <<1 mod poly); the gen4 path derives the Karatsuba halves on the fly, so no HashKey_i_k values are stored here
1908         vmovdqa  \HK, \T5
1909         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1910         vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1911 
1912         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1913         vmovdqu  \T5, HashKey_3(arg2)
1914 
1915         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1916         vmovdqu  \T5, HashKey_4(arg2)
1917 
1918         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1919         vmovdqu  \T5, HashKey_5(arg2)
1920 
1921         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1922         vmovdqu  \T5, HashKey_6(arg2)
1923 
1924         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1925         vmovdqu  \T5, HashKey_7(arg2)
1926 
1927         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1928         vmovdqu  \T5, HashKey_8(arg2)
1929 
1930 .endm
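# A rough C-style sketch of what PRECOMPUTE_AVX2 stores (illustrative only;
# gf128_mul stands for the bit-reflected multiply performed by GHASH_MUL_AVX2
# and HashKey_i(arg2) for the fixed offsets in the context structure):
#
#       Hi = H;                           /* H = HashKey<<1 mod poly */
#       for (i = 2; i <= 8; i++) {
#               Hi = gf128_mul(Hi, H);    /* HashKey^i<<1 mod poly */
#               store(HashKey_i(arg2), Hi);
#       }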
1931 
1932 ## if a = number of total plaintext bytes
1933 ## b = floor(a/16)
1934 ## num_initial_blocks = b mod 8
1935 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1936 ## r10, r11, r12, rax are clobbered
1937 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
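## In outline: the macro CTR-encrypts those num_initial_blocks blocks one at a
## time, XORs them with the input, writes the output and folds each block into
## the running GHASH state loaded from AadHash(arg2).  If at least 128 bytes
## remain (%r13 >= 128), it also encrypts a first batch of 8 counter blocks
## against the next 128 bytes of data so the 8-blocks-at-a-time main loop
## starts with its GHASH pipeline primed.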
1938 
1939 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1940     i = (8-\num_initial_blocks)
1941     setreg
1942     vmovdqu AadHash(arg2), reg_i
1943 
1944     # start AES for num_initial_blocks blocks
1945     vmovdqu CurCount(arg2), \CTR
1946 
1947     i = (9-\num_initial_blocks)
1948     setreg
1949 .rep \num_initial_blocks
1950                 vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1951                 vmovdqa \CTR, reg_i
1952                 vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1953     i = (i+1)
1954     setreg
1955 .endr
1956 
1957     vmovdqa  (arg1), \T_key
1958     i = (9-\num_initial_blocks)
1959     setreg
1960 .rep \num_initial_blocks
1961                 vpxor   \T_key, reg_i, reg_i
1962     i = (i+1)
1963     setreg
1964 .endr
1965 
1966     j = 1
1967     setreg
1968 .rep \REP
1969     vmovdqa  16*j(arg1), \T_key
1970     i = (9-\num_initial_blocks)
1971     setreg
1972 .rep \num_initial_blocks
1973         vaesenc \T_key, reg_i, reg_i
1974     i = (i+1)
1975     setreg
1976 .endr
1977 
1978     j = (j+1)
1979     setreg
1980 .endr
1981 
1982 
1983     vmovdqa  16*j(arg1), \T_key
1984     i = (9-\num_initial_blocks)
1985     setreg
1986 .rep \num_initial_blocks
1987         vaesenclast      \T_key, reg_i, reg_i
1988     i = (i+1)
1989     setreg
1990 .endr
1991 
1992     i = (9-\num_initial_blocks)
1993     setreg
1994 .rep \num_initial_blocks
1995                 vmovdqu (arg4, %r11), \T1
1996                 vpxor   \T1, reg_i, reg_i
1997                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
1998                                # num_initial_blocks blocks
1999                 add     $16, %r11
2000 .if  \ENC_DEC == DEC
2001                 vmovdqa \T1, reg_i
2002 .endif
2003                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2004     i = (i+1)
2005     setreg
2006 .endr
2007 
2008 
2009     i = (8-\num_initial_blocks)
2010     j = (9-\num_initial_blocks)
2011     setreg
2012 
2013 .rep \num_initial_blocks
2014         vpxor    reg_i, reg_j, reg_j
2015         GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2016     i = (i+1)
2017     j = (j+1)
2018     setreg
2019 .endr
2020         # XMM8 has the combined result here
2021 
2022         vmovdqa  \XMM8, TMP1(%rsp)
2023         vmovdqa  \XMM8, \T3
2024 
2025         cmp     $128, %r13
2026         jl      _initial_blocks_done\@                  # no need for precomputed constants
2027 
2028 ###############################################################################
2029 # Prepare and CTR-encrypt the first batch of 8 counter blocks for the 8-blocks-at-a-time main loop
2030                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2031                 vmovdqa  \CTR, \XMM1
2032                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2033 
2034                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2035                 vmovdqa  \CTR, \XMM2
2036                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2037 
2038                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2039                 vmovdqa  \CTR, \XMM3
2040                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2041 
2042                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2043                 vmovdqa  \CTR, \XMM4
2044                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2045 
2046                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2047                 vmovdqa  \CTR, \XMM5
2048                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2049 
2050                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2051                 vmovdqa  \CTR, \XMM6
2052                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2053 
2054                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2055                 vmovdqa  \CTR, \XMM7
2056                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2057 
2058                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2059                 vmovdqa  \CTR, \XMM8
2060                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2061 
2062                 vmovdqa  (arg1), \T_key
2063                 vpxor    \T_key, \XMM1, \XMM1
2064                 vpxor    \T_key, \XMM2, \XMM2
2065                 vpxor    \T_key, \XMM3, \XMM3
2066                 vpxor    \T_key, \XMM4, \XMM4
2067                 vpxor    \T_key, \XMM5, \XMM5
2068                 vpxor    \T_key, \XMM6, \XMM6
2069                 vpxor    \T_key, \XMM7, \XMM7
2070                 vpxor    \T_key, \XMM8, \XMM8
2071 
2072         i = 1
2073         setreg
2074 .rep    \REP       # do REP rounds
2075                 vmovdqa  16*i(arg1), \T_key
2076                 vaesenc  \T_key, \XMM1, \XMM1
2077                 vaesenc  \T_key, \XMM2, \XMM2
2078                 vaesenc  \T_key, \XMM3, \XMM3
2079                 vaesenc  \T_key, \XMM4, \XMM4
2080                 vaesenc  \T_key, \XMM5, \XMM5
2081                 vaesenc  \T_key, \XMM6, \XMM6
2082                 vaesenc  \T_key, \XMM7, \XMM7
2083                 vaesenc  \T_key, \XMM8, \XMM8
2084         i = (i+1)
2085         setreg
2086 .endr
2087 
2088 
2089                 vmovdqa  16*i(arg1), \T_key
2090                 vaesenclast  \T_key, \XMM1, \XMM1
2091                 vaesenclast  \T_key, \XMM2, \XMM2
2092                 vaesenclast  \T_key, \XMM3, \XMM3
2093                 vaesenclast  \T_key, \XMM4, \XMM4
2094                 vaesenclast  \T_key, \XMM5, \XMM5
2095                 vaesenclast  \T_key, \XMM6, \XMM6
2096                 vaesenclast  \T_key, \XMM7, \XMM7
2097                 vaesenclast  \T_key, \XMM8, \XMM8
2098 
2099                 vmovdqu  (arg4, %r11), \T1
2100                 vpxor    \T1, \XMM1, \XMM1
2101                 vmovdqu  \XMM1, (arg3 , %r11)
2102                 .if   \ENC_DEC == DEC
2103                 vmovdqa  \T1, \XMM1
2104                 .endif
2105 
2106                 vmovdqu  16*1(arg4, %r11), \T1
2107                 vpxor    \T1, \XMM2, \XMM2
2108                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
2109                 .if   \ENC_DEC == DEC
2110                 vmovdqa  \T1, \XMM2
2111                 .endif
2112 
2113                 vmovdqu  16*2(arg4, %r11), \T1
2114                 vpxor    \T1, \XMM3, \XMM3
2115                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
2116                 .if   \ENC_DEC == DEC
2117                 vmovdqa  \T1, \XMM3
2118                 .endif
2119 
2120                 vmovdqu  16*3(arg4, %r11), \T1
2121                 vpxor    \T1, \XMM4, \XMM4
2122                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
2123                 .if   \ENC_DEC == DEC
2124                 vmovdqa  \T1, \XMM4
2125                 .endif
2126 
2127                 vmovdqu  16*4(arg4, %r11), \T1
2128                 vpxor    \T1, \XMM5, \XMM5
2129                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
2130                 .if   \ENC_DEC == DEC
2131                 vmovdqa  \T1, \XMM5
2132                 .endif
2133 
2134                 vmovdqu  16*5(arg4, %r11), \T1
2135                 vpxor    \T1, \XMM6, \XMM6
2136                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
2137                 .if   \ENC_DEC == DEC
2138                 vmovdqa  \T1, \XMM6
2139                 .endif
2140 
2141                 vmovdqu  16*6(arg4, %r11), \T1
2142                 vpxor    \T1, \XMM7, \XMM7
2143                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
2144                 .if   \ENC_DEC == DEC
2145                 vmovdqa  \T1, \XMM7
2146                 .endif
2147 
2148                 vmovdqu  16*7(arg4, %r11), \T1
2149                 vpxor    \T1, \XMM8, \XMM8
2150                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
2151                 .if   \ENC_DEC == DEC
2152                 vmovdqa  \T1, \XMM8
2153                 .endif
2154 
2155                 add     $128, %r11
2156 
2157                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2158                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2159                                # the corresponding ciphertext
2160                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2161                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2162                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2163                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2164                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2165                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2166                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2167 
2168 ###############################################################################
2169 
2170 _initial_blocks_done\@:
2171 
2172 
2173 .endm
2174 
2175 
2176 
2177 # encrypt 8 blocks at a time
2178 # ghash the 8 previously encrypted ciphertext blocks
2179 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
2180 # r11 is the data offset value
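# In outline: the AES rounds for 8 fresh counter blocks are interleaved with
# the GHASH of the 8 ciphertext blocks produced by the previous iteration
# (saved in T2 and TMP2..TMP8).  Each saved block is multiplied by its matching
# HashKey power using four vpclmulqdq operations (schoolbook on the 64-bit
# halves, no HashKey_i_k table), the partial products are accumulated across
# the temporaries, and a single POLY2-based reduction at the end produces the
# value that is XORed into the new XMM1.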
2181 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2182 
2183         vmovdqa \XMM1, \T2
2184         vmovdqa \XMM2, TMP2(%rsp)
2185         vmovdqa \XMM3, TMP3(%rsp)
2186         vmovdqa \XMM4, TMP4(%rsp)
2187         vmovdqa \XMM5, TMP5(%rsp)
2188         vmovdqa \XMM6, TMP6(%rsp)
2189         vmovdqa \XMM7, TMP7(%rsp)
2190         vmovdqa \XMM8, TMP8(%rsp)
2191 
2192 .if \loop_idx == in_order
2193                 vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2194                 vpaddd  ONE(%rip), \XMM1, \XMM2
2195                 vpaddd  ONE(%rip), \XMM2, \XMM3
2196                 vpaddd  ONE(%rip), \XMM3, \XMM4
2197                 vpaddd  ONE(%rip), \XMM4, \XMM5
2198                 vpaddd  ONE(%rip), \XMM5, \XMM6
2199                 vpaddd  ONE(%rip), \XMM6, \XMM7
2200                 vpaddd  ONE(%rip), \XMM7, \XMM8
2201                 vmovdqa \XMM8, \CTR
2202 
2203                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2204                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2205                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2206                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2207                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2208                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2209                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2210                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2211 .else
2212                 vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2213                 vpaddd  ONEf(%rip), \XMM1, \XMM2
2214                 vpaddd  ONEf(%rip), \XMM2, \XMM3
2215                 vpaddd  ONEf(%rip), \XMM3, \XMM4
2216                 vpaddd  ONEf(%rip), \XMM4, \XMM5
2217                 vpaddd  ONEf(%rip), \XMM5, \XMM6
2218                 vpaddd  ONEf(%rip), \XMM6, \XMM7
2219                 vpaddd  ONEf(%rip), \XMM7, \XMM8
2220                 vmovdqa \XMM8, \CTR
2221 .endif
2222 
2223 
2224         #######################################################################
2225 
2226                 vmovdqu (arg1), \T1
2227                 vpxor   \T1, \XMM1, \XMM1
2228                 vpxor   \T1, \XMM2, \XMM2
2229                 vpxor   \T1, \XMM3, \XMM3
2230                 vpxor   \T1, \XMM4, \XMM4
2231                 vpxor   \T1, \XMM5, \XMM5
2232                 vpxor   \T1, \XMM6, \XMM6
2233                 vpxor   \T1, \XMM7, \XMM7
2234                 vpxor   \T1, \XMM8, \XMM8
2235 
2236         #######################################################################
2237 
2238 
2239 
2240 
2241 
2242                 vmovdqu 16*1(arg1), \T1
2243                 vaesenc \T1, \XMM1, \XMM1
2244                 vaesenc \T1, \XMM2, \XMM2
2245                 vaesenc \T1, \XMM3, \XMM3
2246                 vaesenc \T1, \XMM4, \XMM4
2247                 vaesenc \T1, \XMM5, \XMM5
2248                 vaesenc \T1, \XMM6, \XMM6
2249                 vaesenc \T1, \XMM7, \XMM7
2250                 vaesenc \T1, \XMM8, \XMM8
2251 
2252                 vmovdqu 16*2(arg1), \T1
2253                 vaesenc \T1, \XMM1, \XMM1
2254                 vaesenc \T1, \XMM2, \XMM2
2255                 vaesenc \T1, \XMM3, \XMM3
2256                 vaesenc \T1, \XMM4, \XMM4
2257                 vaesenc \T1, \XMM5, \XMM5
2258                 vaesenc \T1, \XMM6, \XMM6
2259                 vaesenc \T1, \XMM7, \XMM7
2260                 vaesenc \T1, \XMM8, \XMM8
2261 
2262 
2263         #######################################################################
2264 
2265         vmovdqu         HashKey_8(arg2), \T5
2266         vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2267         vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2268         vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2269         vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2270         vpxor           \T5, \T6, \T6
2271 
2272                 vmovdqu 16*3(arg1), \T1
2273                 vaesenc \T1, \XMM1, \XMM1
2274                 vaesenc \T1, \XMM2, \XMM2
2275                 vaesenc \T1, \XMM3, \XMM3
2276                 vaesenc \T1, \XMM4, \XMM4
2277                 vaesenc \T1, \XMM5, \XMM5
2278                 vaesenc \T1, \XMM6, \XMM6
2279                 vaesenc \T1, \XMM7, \XMM7
2280                 vaesenc \T1, \XMM8, \XMM8
2281 
2282         vmovdqa         TMP2(%rsp), \T1
2283         vmovdqu         HashKey_7(arg2), \T5
2284         vpclmulqdq      $0x11, \T5, \T1, \T3
2285         vpxor           \T3, \T4, \T4
2286 
2287         vpclmulqdq      $0x00, \T5, \T1, \T3
2288         vpxor           \T3, \T7, \T7
2289 
2290         vpclmulqdq      $0x01, \T5, \T1, \T3
2291         vpxor           \T3, \T6, \T6
2292 
2293         vpclmulqdq      $0x10, \T5, \T1, \T3
2294         vpxor           \T3, \T6, \T6
2295 
2296                 vmovdqu 16*4(arg1), \T1
2297                 vaesenc \T1, \XMM1, \XMM1
2298                 vaesenc \T1, \XMM2, \XMM2
2299                 vaesenc \T1, \XMM3, \XMM3
2300                 vaesenc \T1, \XMM4, \XMM4
2301                 vaesenc \T1, \XMM5, \XMM5
2302                 vaesenc \T1, \XMM6, \XMM6
2303                 vaesenc \T1, \XMM7, \XMM7
2304                 vaesenc \T1, \XMM8, \XMM8
2305 
2306         #######################################################################
2307 
2308         vmovdqa         TMP3(%rsp), \T1
2309         vmovdqu         HashKey_6(arg2), \T5
2310         vpclmulqdq      $0x11, \T5, \T1, \T3
2311         vpxor           \T3, \T4, \T4
2312 
2313         vpclmulqdq      $0x00, \T5, \T1, \T3
2314         vpxor           \T3, \T7, \T7
2315 
2316         vpclmulqdq      $0x01, \T5, \T1, \T3
2317         vpxor           \T3, \T6, \T6
2318 
2319         vpclmulqdq      $0x10, \T5, \T1, \T3
2320         vpxor           \T3, \T6, \T6
2321 
2322                 vmovdqu 16*5(arg1), \T1
2323                 vaesenc \T1, \XMM1, \XMM1
2324                 vaesenc \T1, \XMM2, \XMM2
2325                 vaesenc \T1, \XMM3, \XMM3
2326                 vaesenc \T1, \XMM4, \XMM4
2327                 vaesenc \T1, \XMM5, \XMM5
2328                 vaesenc \T1, \XMM6, \XMM6
2329                 vaesenc \T1, \XMM7, \XMM7
2330                 vaesenc \T1, \XMM8, \XMM8
2331 
2332         vmovdqa         TMP4(%rsp), \T1
2333         vmovdqu         HashKey_5(arg2), \T5
2334         vpclmulqdq      $0x11, \T5, \T1, \T3
2335         vpxor           \T3, \T4, \T4
2336 
2337         vpclmulqdq      $0x00, \T5, \T1, \T3
2338         vpxor           \T3, \T7, \T7
2339 
2340         vpclmulqdq      $0x01, \T5, \T1, \T3
2341         vpxor           \T3, \T6, \T6
2342 
2343         vpclmulqdq      $0x10, \T5, \T1, \T3
2344         vpxor           \T3, \T6, \T6
2345 
2346                 vmovdqu 16*6(arg1), \T1
2347                 vaesenc \T1, \XMM1, \XMM1
2348                 vaesenc \T1, \XMM2, \XMM2
2349                 vaesenc \T1, \XMM3, \XMM3
2350                 vaesenc \T1, \XMM4, \XMM4
2351                 vaesenc \T1, \XMM5, \XMM5
2352                 vaesenc \T1, \XMM6, \XMM6
2353                 vaesenc \T1, \XMM7, \XMM7
2354                 vaesenc \T1, \XMM8, \XMM8
2355 
2356 
2357         vmovdqa         TMP5(%rsp), \T1
2358         vmovdqu         HashKey_4(arg2), \T5
2359         vpclmulqdq      $0x11, \T5, \T1, \T3
2360         vpxor           \T3, \T4, \T4
2361 
2362         vpclmulqdq      $0x00, \T5, \T1, \T3
2363         vpxor           \T3, \T7, \T7
2364 
2365         vpclmulqdq      $0x01, \T5, \T1, \T3
2366         vpxor           \T3, \T6, \T6
2367 
2368         vpclmulqdq      $0x10, \T5, \T1, \T3
2369         vpxor           \T3, \T6, \T6
2370 
2371                 vmovdqu 16*7(arg1), \T1
2372                 vaesenc \T1, \XMM1, \XMM1
2373                 vaesenc \T1, \XMM2, \XMM2
2374                 vaesenc \T1, \XMM3, \XMM3
2375                 vaesenc \T1, \XMM4, \XMM4
2376                 vaesenc \T1, \XMM5, \XMM5
2377                 vaesenc \T1, \XMM6, \XMM6
2378                 vaesenc \T1, \XMM7, \XMM7
2379                 vaesenc \T1, \XMM8, \XMM8
2380 
2381         vmovdqa         TMP6(%rsp), \T1
2382         vmovdqu         HashKey_3(arg2), \T5
2383         vpclmulqdq      $0x11, \T5, \T1, \T3
2384         vpxor           \T3, \T4, \T4
2385 
2386         vpclmulqdq      $0x00, \T5, \T1, \T3
2387         vpxor           \T3, \T7, \T7
2388 
2389         vpclmulqdq      $0x01, \T5, \T1, \T3
2390         vpxor           \T3, \T6, \T6
2391 
2392         vpclmulqdq      $0x10, \T5, \T1, \T3
2393         vpxor           \T3, \T6, \T6
2394 
2395                 vmovdqu 16*8(arg1), \T1
2396                 vaesenc \T1, \XMM1, \XMM1
2397                 vaesenc \T1, \XMM2, \XMM2
2398                 vaesenc \T1, \XMM3, \XMM3
2399                 vaesenc \T1, \XMM4, \XMM4
2400                 vaesenc \T1, \XMM5, \XMM5
2401                 vaesenc \T1, \XMM6, \XMM6
2402                 vaesenc \T1, \XMM7, \XMM7
2403                 vaesenc \T1, \XMM8, \XMM8
2404 
2405         vmovdqa         TMP7(%rsp), \T1
2406         vmovdqu         HashKey_2(arg2), \T5
2407         vpclmulqdq      $0x11, \T5, \T1, \T3
2408         vpxor           \T3, \T4, \T4
2409 
2410         vpclmulqdq      $0x00, \T5, \T1, \T3
2411         vpxor           \T3, \T7, \T7
2412 
2413         vpclmulqdq      $0x01, \T5, \T1, \T3
2414         vpxor           \T3, \T6, \T6
2415 
2416         vpclmulqdq      $0x10, \T5, \T1, \T3
2417         vpxor           \T3, \T6, \T6
2418 
2419 
2420         #######################################################################
2421 
2422                 vmovdqu 16*9(arg1), \T5
2423                 vaesenc \T5, \XMM1, \XMM1
2424                 vaesenc \T5, \XMM2, \XMM2
2425                 vaesenc \T5, \XMM3, \XMM3
2426                 vaesenc \T5, \XMM4, \XMM4
2427                 vaesenc \T5, \XMM5, \XMM5
2428                 vaesenc \T5, \XMM6, \XMM6
2429                 vaesenc \T5, \XMM7, \XMM7
2430                 vaesenc \T5, \XMM8, \XMM8
2431 
2432         vmovdqa         TMP8(%rsp), \T1
2433         vmovdqu         HashKey(arg2), \T5
2434 
2435         vpclmulqdq      $0x00, \T5, \T1, \T3
2436         vpxor           \T3, \T7, \T7
2437 
2438         vpclmulqdq      $0x01, \T5, \T1, \T3
2439         vpxor           \T3, \T6, \T6
2440 
2441         vpclmulqdq      $0x10, \T5, \T1, \T3
2442         vpxor           \T3, \T6, \T6
2443 
2444         vpclmulqdq      $0x11, \T5, \T1, \T3
2445         vpxor           \T3, \T4, \T1
2446 
2447 
2448                 vmovdqu 16*10(arg1), \T5
2449 
2450         i = 11
2451         setreg
2452 .rep (\REP-9)
2453         vaesenc \T5, \XMM1, \XMM1
2454         vaesenc \T5, \XMM2, \XMM2
2455         vaesenc \T5, \XMM3, \XMM3
2456         vaesenc \T5, \XMM4, \XMM4
2457         vaesenc \T5, \XMM5, \XMM5
2458         vaesenc \T5, \XMM6, \XMM6
2459         vaesenc \T5, \XMM7, \XMM7
2460         vaesenc \T5, \XMM8, \XMM8
2461 
2462         vmovdqu 16*i(arg1), \T5
2463         i = i + 1
2464         setreg
2465 .endr
2466 
2467     i = 0
2468     j = 1
2469     setreg
2470 .rep 8
2471         vpxor   16*i(arg4, %r11), \T5, \T2
2472                 .if \ENC_DEC == ENC
2473                 vaesenclast     \T2, reg_j, reg_j
2474                 .else
2475                 vaesenclast     \T2, reg_j, \T3
2476                 vmovdqu 16*i(arg4, %r11), reg_j
2477                 vmovdqu \T3, 16*i(arg3, %r11)
2478                 .endif
2479     i = (i+1)
2480     j = (j+1)
2481     setreg
2482 .endr
2483     #######################################################################
2484 
2485 
2486     vpslldq $8, \T6, \T3                # shift-L T3 2 DWs
2487     vpsrldq $8, \T6, \T6                # shift-R T6 2 DWs
2488     vpxor   \T3, \T7, \T7
2489     vpxor   \T6, \T1, \T1               # accumulate the results in T1:T7
2490 
2491 
2492 
2493     #######################################################################
2494     #first phase of the reduction
2495     vmovdqa         POLY2(%rip), \T3
2496 
2497     vpclmulqdq  $0x01, \T7, \T3, \T2
2498     vpslldq     $8, \T2, \T2            # shift-L T2 2 DWs
2499 
2500     vpxor       \T2, \T7, \T7           # first phase of the reduction complete
2501     #######################################################################
2502                 .if \ENC_DEC == ENC
2503         vmovdqu  \XMM1, 16*0(arg3,%r11)     # Write to the Ciphertext buffer
2504         vmovdqu  \XMM2, 16*1(arg3,%r11)     # Write to the Ciphertext buffer
2505         vmovdqu  \XMM3, 16*2(arg3,%r11)     # Write to the Ciphertext buffer
2506         vmovdqu  \XMM4, 16*3(arg3,%r11)     # Write to the Ciphertext buffer
2507         vmovdqu  \XMM5, 16*4(arg3,%r11)     # Write to the Ciphertext buffer
2508         vmovdqu  \XMM6, 16*5(arg3,%r11)     # Write to the Ciphertext buffer
2509         vmovdqu  \XMM7, 16*6(arg3,%r11)     # Write to the Ciphertext buffer
2510         vmovdqu  \XMM8, 16*7(arg3,%r11)     # Write to the Ciphertext buffer
2511                 .endif
2512 
2513     #######################################################################
2514     #second phase of the reduction
2515     vpclmulqdq  $0x00, \T7, \T3, \T2
2516     vpsrldq     $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2517 
2518     vpclmulqdq  $0x10, \T7, \T3, \T4
2519     vpslldq     $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2520 
2521     vpxor       \T2, \T4, \T4           # second phase of the reduction complete
2522     #######################################################################
2523     vpxor       \T4, \T1, \T1           # the result is in T1
2524 
2525         vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
2526         vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
2527         vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
2528         vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
2529         vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
2530         vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
2531         vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
2532         vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
2533 
2534 
2535     vpxor   \T1, \XMM1, \XMM1
2536 
2537 
2538 
2539 .endm
2540 
2541 
2542 # GHASH the last 8 ciphertext blocks.
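# Same Karatsuba accumulation as GHASH_LAST_8_AVX above, except that the XORed
# halves of each HashKey power are computed on the fly with vpshufd/vpxor
# rather than loaded from a precomputed HashKey_i_k table, and the final
# reduction uses the POLY2 carry-less-multiply form.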
2543 .macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2544 
2545         ## Karatsuba Method
2546 
2547         vmovdqu         HashKey_8(arg2), \T5
2548 
2549         vpshufd         $0b01001110, \XMM1, \T2
2550         vpshufd         $0b01001110, \T5, \T3
2551         vpxor           \XMM1, \T2, \T2
2552         vpxor           \T5, \T3, \T3
2553 
2554         vpclmulqdq      $0x11, \T5, \XMM1, \T6
2555         vpclmulqdq      $0x00, \T5, \XMM1, \T7
2556 
2557         vpclmulqdq      $0x00, \T3, \T2, \XMM1
2558 
2559         ######################
2560 
2561         vmovdqu         HashKey_7(arg2), \T5
2562         vpshufd         $0b01001110, \XMM2, \T2
2563         vpshufd         $0b01001110, \T5, \T3
2564         vpxor           \XMM2, \T2, \T2
2565         vpxor           \T5, \T3, \T3
2566 
2567         vpclmulqdq      $0x11, \T5, \XMM2, \T4
2568         vpxor           \T4, \T6, \T6
2569 
2570         vpclmulqdq      $0x00, \T5, \XMM2, \T4
2571         vpxor           \T4, \T7, \T7
2572 
2573         vpclmulqdq      $0x00, \T3, \T2, \T2
2574 
2575         vpxor           \T2, \XMM1, \XMM1
2576 
2577         ######################
2578 
2579         vmovdqu         HashKey_6(arg2), \T5
2580         vpshufd         $0b01001110, \XMM3, \T2
2581         vpshufd         $0b01001110, \T5, \T3
2582         vpxor           \XMM3, \T2, \T2
2583         vpxor           \T5, \T3, \T3
2584 
2585         vpclmulqdq      $0x11, \T5, \XMM3, \T4
2586         vpxor           \T4, \T6, \T6
2587 
2588         vpclmulqdq      $0x00, \T5, \XMM3, \T4
2589         vpxor           \T4, \T7, \T7
2590 
2591         vpclmulqdq      $0x00, \T3, \T2, \T2
2592 
2593         vpxor           \T2, \XMM1, \XMM1
2594 
2595         ######################
2596 
2597         vmovdqu         HashKey_5(arg2), \T5
2598         vpshufd         $0b01001110, \XMM4, \T2
2599         vpshufd         $0b01001110, \T5, \T3
2600         vpxor           \XMM4, \T2, \T2
2601         vpxor           \T5, \T3, \T3
2602 
2603         vpclmulqdq      $0x11, \T5, \XMM4, \T4
2604         vpxor           \T4, \T6, \T6
2605 
2606         vpclmulqdq      $0x00, \T5, \XMM4, \T4
2607         vpxor           \T4, \T7, \T7
2608 
2609         vpclmulqdq      $0x00, \T3, \T2, \T2
2610 
2611         vpxor           \T2, \XMM1, \XMM1
2612 
2613         ######################
2614 
2615         vmovdqu         HashKey_4(arg2), \T5
2616         vpshufd         $0b01001110, \XMM5, \T2
2617         vpshufd         $0b01001110, \T5, \T3
2618         vpxor           \XMM5, \T2, \T2
2619         vpxor           \T5, \T3, \T3
2620 
2621         vpclmulqdq      $0x11, \T5, \XMM5, \T4
2622         vpxor           \T4, \T6, \T6
2623 
2624         vpclmulqdq      $0x00, \T5, \XMM5, \T4
2625         vpxor           \T4, \T7, \T7
2626 
2627         vpclmulqdq      $0x00, \T3, \T2, \T2
2628 
2629         vpxor           \T2, \XMM1, \XMM1
2630 
2631         ######################
2632 
2633         vmovdqu         HashKey_3(arg2), \T5
2634         vpshufd         $0b01001110, \XMM6, \T2
2635         vpshufd         $0b01001110, \T5, \T3
2636         vpxor           \XMM6, \T2, \T2
2637         vpxor           \T5, \T3, \T3
2638 
2639         vpclmulqdq      $0x11, \T5, \XMM6, \T4
2640         vpxor           \T4, \T6, \T6
2641 
2642         vpclmulqdq      $0x00, \T5, \XMM6, \T4
2643         vpxor           \T4, \T7, \T7
2644 
2645         vpclmulqdq      $0x00, \T3, \T2, \T2
2646 
2647         vpxor           \T2, \XMM1, \XMM1
2648 
2649         ######################
2650 
2651         vmovdqu         HashKey_2(arg2), \T5
2652         vpshufd         $0b01001110, \XMM7, \T2
2653         vpshufd         $0b01001110, \T5, \T3
2654         vpxor           \XMM7, \T2, \T2
2655         vpxor           \T5, \T3, \T3
2656 
2657         vpclmulqdq      $0x11, \T5, \XMM7, \T4
2658         vpxor           \T4, \T6, \T6
2659 
2660         vpclmulqdq      $0x00, \T5, \XMM7, \T4
2661         vpxor           \T4, \T7, \T7
2662 
2663         vpclmulqdq      $0x00, \T3, \T2, \T2
2664 
2665         vpxor           \T2, \XMM1, \XMM1
2666 
2667         ######################
2668 
2669         vmovdqu         HashKey(arg2), \T5
2670         vpshufd         $0b01001110, \XMM8, \T2
2671         vpshufd         $0b01001110, \T5, \T3
2672         vpxor           \XMM8, \T2, \T2
2673         vpxor           \T5, \T3, \T3
2674 
2675         vpclmulqdq      $0x11, \T5, \XMM8, \T4
2676         vpxor           \T4, \T6, \T6
2677 
2678         vpclmulqdq      $0x00, \T5, \XMM8, \T4
2679         vpxor           \T4, \T7, \T7
2680 
2681         vpclmulqdq      $0x00, \T3, \T2, \T2
2682 
2683         vpxor           \T2, \XMM1, \XMM1
2684         vpxor           \T6, \XMM1, \XMM1
2685         vpxor           \T7, \XMM1, \T2
2686 
2687 
2688 
2689 
2690         vpslldq $8, \T2, \T4
2691         vpsrldq $8, \T2, \T2
2692 
2693         vpxor   \T4, \T7, \T7
2694         vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2695                            # accumulated carry-less multiplications
2696 
2697         #######################################################################
2698         #first phase of the reduction
2699         vmovdqa         POLY2(%rip), \T3
2700 
2701         vpclmulqdq      $0x01, \T7, \T3, \T2
2702         vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2703 
2704         vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2705         #######################################################################
2706 
2707 
2708         #second phase of the reduction
2709         vpclmulqdq      $0x00, \T7, \T3, \T2
2710         vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2711 
2712         vpclmulqdq      $0x10, \T7, \T3, \T4
2713         vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2714 
2715         vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2716         #######################################################################
2717         vpxor           \T4, \T6, \T6              # the result is in T6
2718 .endm
2719 
2720 
2721 
2722 #############################################################
2723 #void   aesni_gcm_init_avx_gen4
2724 #        (gcm_data     *my_ctx_data,
2725 #         gcm_context_data *data,
2726 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
2727 #           (from Security Association) concatenated with 8 byte
2728 #           Initialisation Vector (from IPSec ESP Payload)
2729 #           concatenated with 0x00000001. 16-byte aligned pointer. */
2730 #        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2731 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2732 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2733 #############################################################
2734 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2735         FUNC_SAVE
2736         INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2737         FUNC_RESTORE
2738         RET
2739 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2740 
2741 ###############################################################################
2742 #void   aesni_gcm_enc_update_avx_gen4(
2743 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2744 #        gcm_context_data *data,
2745 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2746 #        const   u8 *in, /* Plaintext input */
2747 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2748 ###############################################################################
2749 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2750         FUNC_SAVE
2751         mov     keysize,%eax
2752         cmp     $32, %eax
2753         je      key_256_enc_update4
2754         cmp     $16, %eax
2755         je      key_128_enc_update4
2756         # must be 192
2757         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2758         FUNC_RESTORE
2759         RET
2760 key_128_enc_update4:
2761         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2762         FUNC_RESTORE
2763         RET
2764 key_256_enc_update4:
2765         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2766         FUNC_RESTORE
2767         RET
2768 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2769 
2770 ###############################################################################
2771 #void   aesni_gcm_dec_update_avx_gen4(
2772 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2773 #        gcm_context_data *data,
2774 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2775 #        const   u8 *in, /* Ciphertext input */
2776 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2777 ###############################################################################
2778 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2779         FUNC_SAVE
2780         mov     keysize,%eax
2781         cmp     $32, %eax
2782         je      key_256_dec_update4
2783         cmp     $16, %eax
2784         je      key_128_dec_update4
2785         # must be 192
2786         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2787         FUNC_RESTORE
2788         RET
2789 key_128_dec_update4:
2790         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2791         FUNC_RESTORE
2792         RET
2793 key_256_dec_update4:
2794         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2795         FUNC_RESTORE
2796         RET
2797 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2798 
2799 ###############################################################################
2800 #void   aesni_gcm_finalize_avx_gen4(
2801 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2802 #        gcm_context_data *data,
2803 #        u8      *auth_tag, /* Authenticated Tag output. */
2804 #        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2805 #                              Valid values are 16 (most likely), 12 or 8. */
2806 ###############################################################################
2807 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2808         FUNC_SAVE
2809         mov keysize,%eax
2810         cmp     $32, %eax
2811         je      key_256_finalize4
2812         cmp     $16, %eax
2813         je      key_128_finalize4
2814         # must be 192
2815         GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2816         FUNC_RESTORE
2817         RET
2818 key_128_finalize4:
2819         GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2820         FUNC_RESTORE
2821         RET
2822 key_256_finalize4:
2823         GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2824         FUNC_RESTORE
2825         RET
2826 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)