0001 ########################################################################
0002 # Copyright (c) 2013, Intel Corporation
0003 #
0004 # This software is available to you under a choice of one of two
0005 # licenses. You may choose to be licensed under the terms of the GNU
0006 # General Public License (GPL) Version 2, available from the file
0007 # COPYING in the main directory of this source tree, or the
0008 # OpenIB.org BSD license below:
0009 #
0010 # Redistribution and use in source and binary forms, with or without
0011 # modification, are permitted provided that the following conditions are
0012 # met:
0013 #
0014 # * Redistributions of source code must retain the above copyright
0015 # notice, this list of conditions and the following disclaimer.
0016 #
0017 # * Redistributions in binary form must reproduce the above copyright
0018 # notice, this list of conditions and the following disclaimer in the
0019 # documentation and/or other materials provided with the
0020 # distribution.
0021 #
0022 # * Neither the name of the Intel Corporation nor the names of its
0023 # contributors may be used to endorse or promote products derived from
0024 # this software without specific prior written permission.
0025 #
0026 #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
0028 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0029 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
0030 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
0031 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0032 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
0035 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
0036 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0037 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0038 ########################################################################
0039 ##
0040 ## Authors:
0041 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
0042 ## Vinodh Gopal <vinodh.gopal@intel.com>
0043 ## James Guilford <james.guilford@intel.com>
0044 ## Tim Chen <tim.c.chen@linux.intel.com>
0045 ##
0046 ## References:
## This code was derived and highly optimized from the code described in the
## paper: Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010.
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
0053 ##
0054 ## Assumptions:
0055 ##
0056 ##
0057 ##
0058 ## iv:
0059 ## 0 1 2 3
0060 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0061 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0062 ## | Salt (From the SA) |
0063 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0064 ## | Initialization Vector |
0065 ## | (This is the sequence number from IPSec header) |
0066 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0067 ## | 0x1 |
0068 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0069 ##
0070 ##
0071 ##
0072 ## AAD:
0073 ## AAD padded to 128 bits with 0
0074 ## for example, assume AAD is a u32 vector
0075 ##
0076 ## if AAD is 8 bytes:
## AAD[2] = {A0, A1};
0078 ## padded AAD in xmm register = {A1 A0 0 0}
0079 ##
0080 ## 0 1 2 3
0081 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0082 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0083 ## | SPI (A1) |
0084 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0085 ## | 32-bit Sequence Number (A0) |
0086 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0087 ## | 0x0 |
0088 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0089 ##
0090 ## AAD Format with 32-bit Sequence Number
0091 ##
0092 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
0094 ## padded AAD in xmm register = {A2 A1 A0 0}
0095 ##
0096 ## 0 1 2 3
0097 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0098 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0099 ## | SPI (A2) |
0100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0101 ## | 64-bit Extended Sequence Number {A1,A0} |
0102 ## | |
0103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0104 ## | 0x0 |
0105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
0106 ##
0107 ## AAD Format with 64-bit Extended Sequence Number
0108 ##
0109 ##
0110 ## aadLen:
0111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
0112 ## The code additionally supports aadLen of length 16 bytes.
0113 ##
0114 ## TLen:
0115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
0116 ##
0117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation is used: one tab is
## for the GHASH part, two tabs are for the AES part.
0120 ##
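##
## Note on the layouts above: the INIT code below loads the 16 bytes at the
## iv pointer directly as the pre-counter block (OrigIV / CurCount), i.e.
## conceptually (C-style sketch, field names illustrative only):
##
##        struct pre_counter_block {
##                u32 salt;       /* from the SA                 */
##                u8  iv[8];      /* IPSec sequence number       */
##                u32 block_ctr;  /* 0x1, stored big-endian      */
##        };
##
## and CALC_AAD_HASH consumes the AAD zero-padded to whole 16-byte blocks,
## exactly as drawn in the two AAD diagrams.
##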
0121
0122 #include <linux/linkage.h>
0123
0124 # constants in mergeable sections, linker can reorder and merge
0125 .section .rodata.cst16.POLY, "aM", @progbits, 16
0126 .align 16
0127 POLY: .octa 0xC2000000000000000000000000000001
0128
0129 .section .rodata.cst16.POLY2, "aM", @progbits, 16
0130 .align 16
0131 POLY2: .octa 0xC20000000000000000000001C2000000
0132
0133 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
0134 .align 16
0135 TWOONE: .octa 0x00000001000000000000000000000001
0136
0137 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
0138 .align 16
0139 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
0140
0141 .section .rodata.cst16.ONE, "aM", @progbits, 16
0142 .align 16
0143 ONE: .octa 0x00000000000000000000000000000001
0144
0145 .section .rodata.cst16.ONEf, "aM", @progbits, 16
0146 .align 16
0147 ONEf: .octa 0x01000000000000000000000000000000
0148
0149 # order of these constants should not change.
0150 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
0151 .section .rodata, "a", @progbits
0152 .align 16
0153 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
0154 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
0155 .octa 0x00000000000000000000000000000000
0156
0157 .section .rodata
0158 .align 16
0159 .type aad_shift_arr, @object
0160 .size aad_shift_arr, 272
0161 aad_shift_arr:
0162 .octa 0xffffffffffffffffffffffffffffffff
0163 .octa 0xffffffffffffffffffffffffffffff0C
0164 .octa 0xffffffffffffffffffffffffffff0D0C
0165 .octa 0xffffffffffffffffffffffffff0E0D0C
0166 .octa 0xffffffffffffffffffffffff0F0E0D0C
0167 .octa 0xffffffffffffffffffffff0C0B0A0908
0168 .octa 0xffffffffffffffffffff0D0C0B0A0908
0169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
0170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
0171 .octa 0xffffffffffffff0C0B0A090807060504
0172 .octa 0xffffffffffff0D0C0B0A090807060504
0173 .octa 0xffffffffff0E0D0C0B0A090807060504
0174 .octa 0xffffffff0F0E0D0C0B0A090807060504
0175 .octa 0xffffff0C0B0A09080706050403020100
0176 .octa 0xffff0D0C0B0A09080706050403020100
0177 .octa 0xff0E0D0C0B0A09080706050403020100
0178 .octa 0x0F0E0D0C0B0A09080706050403020100
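
# aad_shift_arr is indexed by 16*(aadLen mod 16) in CALC_AAD_HASH below: each
# 16-byte entry is a vpshufb mask that moves the remaining AAD bytes into
# place and zeroes the rest (mask bytes of 0xff produce zero output bytes
# under vpshufb).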
0179
0180
0181 .text
0182
0183
0184 #define AadHash 16*0
0185 #define AadLen 16*1
0186 #define InLen (16*1)+8
0187 #define PBlockEncKey 16*2
0188 #define OrigIV 16*3
0189 #define CurCount 16*4
0190 #define PBlockLen 16*5
0191
0192 HashKey = 16*6 # store HashKey <<1 mod poly here
0193 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
0194 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
0195 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
0196 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
0197 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
0198 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
0199 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
0200 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
0201 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
0202 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
0203 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
0204 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
0205 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
0206 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
0207 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
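
# All of the offsets above (AadHash ... HashKey_8_k) are byte offsets into the
# gcm_context_data block passed in arg2.  Each "_k" slot holds the XOR of the
# high and low 64-bit halves of the corresponding HashKey^i; the 8-block GHASH
# code (GHASH_8_ENCRYPT_8_PARALLEL_AVX / GHASH_LAST_8_AVX) uses it as the
# pre-added (b1+b0) operand of the middle Karatsuba product.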
0208
0209 #define arg1 %rdi
0210 #define arg2 %rsi
0211 #define arg3 %rdx
0212 #define arg4 %rcx
0213 #define arg5 %r8
0214 #define arg6 %r9
0215 #define keysize 2*15*16(arg1)
0216
0217 i = 0
0218 j = 0
0219
0220 out_order = 0
0221 in_order = 1
0222 DEC = 0
0223 ENC = 1
0224
0225 .macro define_reg r n
0226 reg_\r = %xmm\n
0227 .endm
0228
0229 .macro setreg
0230 .altmacro
0231 define_reg i %i
0232 define_reg j %j
0233 .noaltmacro
0234 .endm
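
# define_reg/setreg use .altmacro so the symbolic counters i and j can be
# turned into register names: after "i = 3" and "setreg", reg_i expands to
# %xmm3.  This is what lets the .rep loops below index xmm registers by
# number.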
0235
0236 TMP1 = 16*0 # Temporary storage for AAD
0237 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
0238 TMP3 = 16*2 # Temporary storage for AES State 3
0239 TMP4 = 16*3 # Temporary storage for AES State 4
0240 TMP5 = 16*4 # Temporary storage for AES State 5
0241 TMP6 = 16*5 # Temporary storage for AES State 6
0242 TMP7 = 16*6 # Temporary storage for AES State 7
0243 TMP8 = 16*7 # Temporary storage for AES State 8
0244
0245 VARIABLE_OFFSET = 16*8
0246
0247 ################################
0248 # Utility Macros
0249 ################################
0250
0251 .macro FUNC_SAVE
0252 push %r12
0253 push %r13
0254 push %r15
0255
0256 push %rbp
0257 mov %rsp, %rbp
0258
0259 sub $VARIABLE_OFFSET, %rsp
0260 and $~63, %rsp # align rsp to 64 bytes
0261 .endm
0262
0263 .macro FUNC_RESTORE
0264 mov %rbp, %rsp
0265 pop %rbp
0266
0267 pop %r15
0268 pop %r13
0269 pop %r12
0270 .endm
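
# FUNC_SAVE aligns %rsp down to a 64-byte boundary after reserving
# VARIABLE_OFFSET bytes, so the original stack pointer cannot be recovered by
# simple arithmetic; it is saved in %rbp instead, and FUNC_RESTORE restores it
# from there before popping the callee-saved registers.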
0271
0272 # Encryption of a single block
0273 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
0274 vpxor (arg1), \XMM0, \XMM0
0275 i = 1
0276 setreg
0277 .rep \REP
0278 vaesenc 16*i(arg1), \XMM0, \XMM0
0279 i = (i+1)
0280 setreg
0281 .endr
0282 vaesenclast 16*i(arg1), \XMM0, \XMM0
0283 .endm
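
# REP is the number of middle AES rounds: 9, 11 or 13 for 128-, 192- and
# 256-bit keys respectively (see the keysize dispatch in the update/finalize
# functions below).  A single block is processed roughly as in this C-style
# sketch (illustrative only; rk[] is the expanded key schedule at arg1):
#
#       state ^= rk[0];
#       for (r = 1; r <= REP; r++)
#               state = aesenc(state, rk[r]);
#       state = aesenclast(state, rk[REP + 1]);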
0284
0285 # combined for GCM encrypt and decrypt functions
0286 # clobbering all xmm registers
0287 # clobbering r10, r11, r12, r13, r15, rax
0288 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
0289 vmovdqu AadHash(arg2), %xmm8
0290 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
0291 add arg5, InLen(arg2)
0292
0293 # initialize the data pointer offset as zero
0294 xor %r11d, %r11d
0295
0296 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
0297 sub %r11, arg5
0298
0299 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
0300 and $-16, %r13 # r13 = r13 - (r13 mod 16)
0301
0302 mov %r13, %r12
0303 shr $4, %r12
0304 and $7, %r12
0305 jz _initial_num_blocks_is_0\@
0306
0307 cmp $7, %r12
0308 je _initial_num_blocks_is_7\@
0309 cmp $6, %r12
0310 je _initial_num_blocks_is_6\@
0311 cmp $5, %r12
0312 je _initial_num_blocks_is_5\@
0313 cmp $4, %r12
0314 je _initial_num_blocks_is_4\@
0315 cmp $3, %r12
0316 je _initial_num_blocks_is_3\@
0317 cmp $2, %r12
0318 je _initial_num_blocks_is_2\@
0319
0320 jmp _initial_num_blocks_is_1\@
0321
0322 _initial_num_blocks_is_7\@:
0323 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0324 sub $16*7, %r13
0325 jmp _initial_blocks_encrypted\@
0326
0327 _initial_num_blocks_is_6\@:
0328 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0329 sub $16*6, %r13
0330 jmp _initial_blocks_encrypted\@
0331
0332 _initial_num_blocks_is_5\@:
0333 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0334 sub $16*5, %r13
0335 jmp _initial_blocks_encrypted\@
0336
0337 _initial_num_blocks_is_4\@:
0338 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0339 sub $16*4, %r13
0340 jmp _initial_blocks_encrypted\@
0341
0342 _initial_num_blocks_is_3\@:
0343 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0344 sub $16*3, %r13
0345 jmp _initial_blocks_encrypted\@
0346
0347 _initial_num_blocks_is_2\@:
0348 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0349 sub $16*2, %r13
0350 jmp _initial_blocks_encrypted\@
0351
0352 _initial_num_blocks_is_1\@:
0353 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0354 sub $16*1, %r13
0355 jmp _initial_blocks_encrypted\@
0356
0357 _initial_num_blocks_is_0\@:
0358 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
0359
0360
0361 _initial_blocks_encrypted\@:
0362 test %r13, %r13
0363 je _zero_cipher_left\@
0364
0365 sub $128, %r13
0366 je _eight_cipher_left\@
0367
0368
0369
0370
0371 vmovd %xmm9, %r15d
0372 and $255, %r15d
0373 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0374
0375
0376 _encrypt_by_8_new\@:
0377 cmp $(255-8), %r15d
0378 jg _encrypt_by_8\@
0379
0380
0381
0382 add $8, %r15b
0383 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
0384 add $128, %r11
0385 sub $128, %r13
0386 jne _encrypt_by_8_new\@
0387
0388 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0389 jmp _eight_cipher_left\@
0390
0391 _encrypt_by_8\@:
0392 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0393 add $8, %r15b
0394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
0395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0396 add $128, %r11
0397 sub $128, %r13
0398 jne _encrypt_by_8_new\@
0399
0400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0401
0402
0403
0404
0405 _eight_cipher_left\@:
0406 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
0407
0408
0409 _zero_cipher_left\@:
0410 vmovdqu %xmm14, AadHash(arg2)
0411 vmovdqu %xmm9, CurCount(arg2)
0412
0413 # check for 0 length
0414 mov arg5, %r13
0415 and $15, %r13 # r13 = (arg5 mod 16)
0416
0417 je _multiple_of_16_bytes\@
0418
0419 # handle the last <16 Byte block separately
0420
0421 mov %r13, PBlockLen(arg2)
0422
0423 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
0424 vmovdqu %xmm9, CurCount(arg2)
0425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0426
0427 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
0428 vmovdqu %xmm9, PBlockEncKey(arg2)
0429
0430 cmp $16, arg5
0431 jge _large_enough_update\@
0432
0433 lea (arg4,%r11,1), %r10
0434 mov %r13, %r12
0435
0436 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
0437
0438 lea SHIFT_MASK+16(%rip), %r12
0439 sub %r13, %r12 # adjust the shuffle mask pointer to be
0440 # able to shift 16-r13 bytes (r13 is the
0441 # number of bytes in plaintext mod 16)
0442
0443 jmp _final_ghash_mul\@
0444
0445 _large_enough_update\@:
0446 sub $16, %r11
0447 add %r13, %r11
0448
0449 # receive the last <16 Byte block
0450 vmovdqu (arg4, %r11, 1), %xmm1
0451
0452 sub %r13, %r11
0453 add $16, %r11
0454
0455 lea SHIFT_MASK+16(%rip), %r12
0456 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
0457 # (r13 is the number of bytes in plaintext mod 16)
0458 sub %r13, %r12
0459 # get the appropriate shuffle mask
0460 vmovdqu (%r12), %xmm2
0461 # shift right 16-r13 bytes
0462 vpshufb %xmm2, %xmm1, %xmm1
0463
0464 _final_ghash_mul\@:
0465 .if \ENC_DEC == DEC
0466 vmovdqa %xmm1, %xmm2
0467 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
0468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
0469 # mask out top 16-r13 bytes of xmm9
0470 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
0471 vpand %xmm1, %xmm2, %xmm2
0472 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
0473 vpxor %xmm2, %xmm14, %xmm14
0474
0475 vmovdqu %xmm14, AadHash(arg2)
0476 .else
0477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
0478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
0479 # mask out top 16-r13 bytes of xmm9
0480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
0481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
0482 vpxor %xmm9, %xmm14, %xmm14
0483
0484 vmovdqu %xmm14, AadHash(arg2)
0485 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
0486 .endif
0487
0488
0489 #############################
0490 # output r13 Bytes
0491 vmovq %xmm9, %rax
0492 cmp $8, %r13
0493 jle _less_than_8_bytes_left\@
0494
0495 mov %rax, (arg3 , %r11)
0496 add $8, %r11
0497 vpsrldq $8, %xmm9, %xmm9
0498 vmovq %xmm9, %rax
0499 sub $8, %r13
0500
0501 _less_than_8_bytes_left\@:
0502 movb %al, (arg3 , %r11)
0503 add $1, %r11
0504 shr $8, %rax
0505 sub $1, %r13
0506 jne _less_than_8_bytes_left\@
0507 #############################
0508
0509 _multiple_of_16_bytes\@:
0510 .endm
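
# GCM_ENC_DEC processes one update call: it first drains any partial block
# left over from a previous call (PARTIAL_BLOCK), then handles 0-7 blocks via
# INITIAL_BLOCKS so the remaining full-block length is a multiple of 128
# bytes, runs the stitched 8-blocks-at-a-time AES+GHASH loop, and finally
# encrypts and hashes the trailing <16-byte fragment, saving its state
# (PBlockLen, PBlockEncKey, AadHash, CurCount) for the next call.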
0511
0512
# GCM_COMPLETE finishes the tag computation, folding in any last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
0516 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
0517 vmovdqu AadHash(arg2), %xmm14
0518 vmovdqu HashKey(arg2), %xmm13
0519
0520 mov PBlockLen(arg2), %r12
0521 test %r12, %r12
0522 je _partial_done\@
0523
0524 #GHASH computation for the last <16 Byte block
0525 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0526
0527 _partial_done\@:
0528 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
0529 shl $3, %r12 # convert into number of bits
0530 vmovd %r12d, %xmm15 # len(A) in xmm15
0531
0532 mov InLen(arg2), %r12
shl $3, %r12                           # len(C) in bits (*8)
0534 vmovq %r12, %xmm1
0535 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
0536 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
0537
0538 vpxor %xmm15, %xmm14, %xmm14
0539 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
0540 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
0541
0542 vmovdqu OrigIV(arg2), %xmm9
0543
0544 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
0545
0546 vpxor %xmm14, %xmm9, %xmm9
0547
0548
0549
0550 _return_T\@:
0551 mov \AUTH_TAG, %r10 # r10 = authTag
0552 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
0553
0554 cmp $16, %r11
0555 je _T_16\@
0556
0557 cmp $8, %r11
0558 jl _T_4\@
0559
0560 _T_8\@:
0561 vmovq %xmm9, %rax
0562 mov %rax, (%r10)
0563 add $8, %r10
0564 sub $8, %r11
0565 vpsrldq $8, %xmm9, %xmm9
0566 test %r11, %r11
0567 je _return_T_done\@
0568 _T_4\@:
0569 vmovd %xmm9, %eax
0570 mov %eax, (%r10)
0571 add $4, %r10
0572 sub $4, %r11
0573 vpsrldq $4, %xmm9, %xmm9
0574 test %r11, %r11
0575 je _return_T_done\@
0576 _T_123\@:
0577 vmovd %xmm9, %eax
0578 cmp $2, %r11
0579 jl _T_1\@
0580 mov %ax, (%r10)
0581 cmp $2, %r11
0582 je _return_T_done\@
0583 add $2, %r10
0584 sar $16, %eax
0585 _T_1\@:
0586 mov %al, (%r10)
0587 jmp _return_T_done\@
0588
0589 _T_16\@:
0590 vmovdqu %xmm9, (%r10)
0591
0592 _return_T_done\@:
0593 .endm
0594
0595 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
0596
0597 mov \AAD, %r10 # r10 = AAD
0598 mov \AADLEN, %r12 # r12 = aadLen
0599
0600
0601 mov %r12, %r11
0602
0603 vpxor \T8, \T8, \T8
0604 vpxor \T7, \T7, \T7
0605 cmp $16, %r11
0606 jl _get_AAD_rest8\@
0607 _get_AAD_blocks\@:
0608 vmovdqu (%r10), \T7
0609 vpshufb SHUF_MASK(%rip), \T7, \T7
0610 vpxor \T7, \T8, \T8
0611 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
0612 add $16, %r10
0613 sub $16, %r12
0614 sub $16, %r11
0615 cmp $16, %r11
0616 jge _get_AAD_blocks\@
0617 vmovdqu \T8, \T7
0618 test %r11, %r11
0619 je _get_AAD_done\@
0620
0621 vpxor \T7, \T7, \T7
0622
0623
0624
0625
0626 _get_AAD_rest8\@:
0627 cmp $4, %r11
0628 jle _get_AAD_rest4\@
0629 movq (%r10), \T1
0630 add $8, %r10
0631 sub $8, %r11
0632 vpslldq $8, \T1, \T1
0633 vpsrldq $8, \T7, \T7
0634 vpxor \T1, \T7, \T7
0635 jmp _get_AAD_rest8\@
0636 _get_AAD_rest4\@:
0637 test %r11, %r11
0638 jle _get_AAD_rest0\@
0639 mov (%r10), %eax
0640 movq %rax, \T1
0641 add $4, %r10
0642 sub $4, %r11
0643 vpslldq $12, \T1, \T1
0644 vpsrldq $4, \T7, \T7
0645 vpxor \T1, \T7, \T7
0646 _get_AAD_rest0\@:
0647
0648
0649
0650 movq %r12, %r11
0651 salq $4, %r11
0652 vmovdqu aad_shift_arr(%r11), \T1
0653 vpshufb \T1, \T7, \T7
0654 _get_AAD_rest_final\@:
0655 vpshufb SHUF_MASK(%rip), \T7, \T7
0656 vpxor \T8, \T7, \T7
0657 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
0658
0659 _get_AAD_done\@:
0660 vmovdqu \T7, AadHash(arg2)
0661 .endm
0662
0663 .macro INIT GHASH_MUL PRECOMPUTE
0664 mov arg6, %r11
0665 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
0666 xor %r11d, %r11d
0667 mov %r11, InLen(arg2) # ctx_data.in_length = 0
0668
0669 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
0670 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
0671 mov arg3, %rax
0672 movdqu (%rax), %xmm0
0673 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
0674
0675 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
0676 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
0677
0678 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
0679
0680 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
0681 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
0682 vmovdqa %xmm6, %xmm2
0683 vpsllq $1, %xmm6, %xmm6
0684 vpsrlq $63, %xmm2, %xmm2
0685 vmovdqa %xmm2, %xmm1
0686 vpslldq $8, %xmm2, %xmm2
0687 vpsrldq $8, %xmm1, %xmm1
0688 vpor %xmm2, %xmm6, %xmm6
0689 #reduction
0690 vpshufd $0b00100100, %xmm1, %xmm2
0691 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
0692 vpand POLY(%rip), %xmm2, %xmm2
0693 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
0694 #######################################################################
0695 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
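
# A note on the computation above (descriptive only): the byte-swapped
# HashKey H is doubled in GF(2^128) by shifting the whole 128-bit value left
# by one bit; vpcmpeqd against TWOONE builds a mask that is all-ones exactly
# when the bit shifted out of the top was set, and ANDing that mask with POLY
# conditionally adds the reduction polynomial, giving HashKey<<1 mod poly.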
0696
0697 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
0698
0699 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
0700 .endm
0701
0702
0703 # Reads DLEN bytes starting at DPTR and stores in XMMDst
0704 # where 0 < DLEN < 16
0705 # Clobbers %rax, DLEN
0706 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
0707 vpxor \XMMDst, \XMMDst, \XMMDst
0708
0709 cmp $8, \DLEN
0710 jl _read_lt8_\@
0711 mov (\DPTR), %rax
0712 vpinsrq $0, %rax, \XMMDst, \XMMDst
0713 sub $8, \DLEN
0714 jz _done_read_partial_block_\@
0715 xor %eax, %eax
0716 _read_next_byte_\@:
0717 shl $8, %rax
0718 mov 7(\DPTR, \DLEN, 1), %al
0719 dec \DLEN
0720 jnz _read_next_byte_\@
0721 vpinsrq $1, %rax, \XMMDst, \XMMDst
0722 jmp _done_read_partial_block_\@
0723 _read_lt8_\@:
0724 xor %eax, %eax
0725 _read_next_byte_lt8_\@:
0726 shl $8, %rax
0727 mov -1(\DPTR, \DLEN, 1), %al
0728 dec \DLEN
0729 jnz _read_next_byte_lt8_\@
0730 vpinsrq $0, %rax, \XMMDst, \XMMDst
0731 _done_read_partial_block_\@:
0732 .endm
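
# READ_PARTIAL_BLOCK never touches bytes past DPTR+DLEN: it reads one 8-byte
# chunk when DLEN >= 8 and assembles the remaining 1-7 bytes one at a time
# into %rax, highest address first, roughly as in this C-style sketch
# (illustrative only):
#
#       u64 hi = 0;
#       for (i = dlen - 1; i >= 8; i--)        /* bytes 8 .. dlen-1 */
#               hi = (hi << 8) | p[i];
#       /* the low half is read directly when dlen >= 8,
#          otherwise it is built the same way from bytes 0 .. dlen-1 */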
0733
# PARTIAL_BLOCK: Handles encryption/decryption and the GHASH update for
# partial blocks carried between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
0739 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
0740 AAD_HASH ENC_DEC
0741 mov PBlockLen(arg2), %r13
0742 test %r13, %r13
0743 je _partial_block_done_\@ # Leave Macro if no partial blocks
0744 # Read in input data without over reading
0745 cmp $16, \PLAIN_CYPH_LEN
0746 jl _fewer_than_16_bytes_\@
0747 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
0748 jmp _data_read_\@
0749
0750 _fewer_than_16_bytes_\@:
0751 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
0752 mov \PLAIN_CYPH_LEN, %r12
0753 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
0754
0755 mov PBlockLen(arg2), %r13
0756
0757 _data_read_\@: # Finished reading in data
0758
0759 vmovdqu PBlockEncKey(arg2), %xmm9
0760 vmovdqu HashKey(arg2), %xmm13
0761
0762 lea SHIFT_MASK(%rip), %r12
0763
# adjust the shuffle mask pointer to be able to shift r13 bytes
# (r13 is the number of bytes already present in the partial block)
0766 add %r13, %r12
0767 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
0768 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
0769
0770 .if \ENC_DEC == DEC
0771 vmovdqa %xmm1, %xmm3
0772 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
0773
0774 mov \PLAIN_CYPH_LEN, %r10
0775 add %r13, %r10
0776 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0777 sub $16, %r10
# Determine if the partial block is not being completely filled and
# adjust the shift mask accordingly
0780 jge _no_extra_mask_1_\@
0781 sub %r10, %r12
0782 _no_extra_mask_1_\@:
0783
0784 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0785 # get the appropriate mask to mask out bottom r13 bytes of xmm9
0786 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
0787
0788 vpand %xmm1, %xmm3, %xmm3
0789 vmovdqa SHUF_MASK(%rip), %xmm10
0790 vpshufb %xmm10, %xmm3, %xmm3
0791 vpshufb %xmm2, %xmm3, %xmm3
0792 vpxor %xmm3, \AAD_HASH, \AAD_HASH
0793
0794 test %r10, %r10
0795 jl _partial_incomplete_1_\@
0796
0797 # GHASH computation for the last <16 Byte block
0798 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0799 xor %eax,%eax
0800
0801 mov %rax, PBlockLen(arg2)
0802 jmp _dec_done_\@
0803 _partial_incomplete_1_\@:
0804 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
0805 _dec_done_\@:
0806 vmovdqu \AAD_HASH, AadHash(arg2)
0807 .else
0808 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
0809
0810 mov \PLAIN_CYPH_LEN, %r10
0811 add %r13, %r10
0812 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0813 sub $16, %r10
# Determine if the partial block is not being completely filled and
# adjust the shift mask accordingly
0816 jge _no_extra_mask_2_\@
0817 sub %r10, %r12
0818 _no_extra_mask_2_\@:
0819
0820 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0821 # get the appropriate mask to mask out bottom r13 bytes of xmm9
0822 vpand %xmm1, %xmm9, %xmm9
0823
0824 vmovdqa SHUF_MASK(%rip), %xmm1
0825 vpshufb %xmm1, %xmm9, %xmm9
0826 vpshufb %xmm2, %xmm9, %xmm9
0827 vpxor %xmm9, \AAD_HASH, \AAD_HASH
0828
0829 test %r10, %r10
0830 jl _partial_incomplete_2_\@
0831
0832 # GHASH computation for the last <16 Byte block
0833 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0834 xor %eax,%eax
0835
0836 mov %rax, PBlockLen(arg2)
0837 jmp _encode_done_\@
0838 _partial_incomplete_2_\@:
0839 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
0840 _encode_done_\@:
0841 vmovdqu \AAD_HASH, AadHash(arg2)
0842
0843 vmovdqa SHUF_MASK(%rip), %xmm10
0844 # shuffle xmm9 back to output as ciphertext
0845 vpshufb %xmm10, %xmm9, %xmm9
0846 vpshufb %xmm2, %xmm9, %xmm9
0847 .endif
0848 # output encrypted Bytes
0849 test %r10, %r10
0850 jl _partial_fill_\@
0851 mov %r13, %r12
0852 mov $16, %r13
0853 # Set r13 to be the number of bytes to write out
0854 sub %r12, %r13
0855 jmp _count_set_\@
0856 _partial_fill_\@:
0857 mov \PLAIN_CYPH_LEN, %r13
0858 _count_set_\@:
0859 vmovdqa %xmm9, %xmm0
0860 vmovq %xmm0, %rax
0861 cmp $8, %r13
0862 jle _less_than_8_bytes_left_\@
0863
0864 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0865 add $8, \DATA_OFFSET
0866 psrldq $8, %xmm0
0867 vmovq %xmm0, %rax
0868 sub $8, %r13
0869 _less_than_8_bytes_left_\@:
0870 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0871 add $1, \DATA_OFFSET
0872 shr $8, %rax
0873 sub $1, %r13
0874 jne _less_than_8_bytes_left_\@
0875 _partial_block_done_\@:
0876 .endm # PARTIAL_BLOCK
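
# PARTIAL_BLOCK stitches data that straddles update calls: PBlockLen bytes of
# the previous call's block are already accounted for and PBlockEncKey holds
# the corresponding E(K, Yn).  The macro XORs the new bytes against the unused
# part of that keystream block, folds the block into AAD_HASH once 16 bytes
# have accumulated, and leaves PBlockLen/AadHash updated for the next call.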
0877
0878 ###############################################################################
0879 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
0880 # Input: A and B (128-bits each, bit-reflected)
0881 # Output: C = A*B*x mod poly, (i.e. >>1 )
0882 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
0883 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
0884 ###############################################################################
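# The multiplication below uses one level of Karatsuba: with A = a1:a0 and
# B = b1:b0 (64-bit halves), the middle term a1*b0 + a0*b1 is obtained as
# (a1+a0)*(b1+b0) - a1*b1 - a0*b0, where "+"/"-" are XOR in GF(2), so only
# three vpclmulqdq instructions are needed instead of four.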
0885 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
0886
0887 vpshufd $0b01001110, \GH, \T2
0888 vpshufd $0b01001110, \HK, \T3
0889 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
0890 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
0891
0892 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
0893 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
0894 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
0895 vpxor \GH, \T2,\T2
0896 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
0897
0898 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
0899 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
0900 vpxor \T3, \GH, \GH
0901 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
0902
0903 #first phase of the reduction
vpslld $31, \GH, \T2                            # packed left shift << 31
vpslld $30, \GH, \T3                            # packed left shift << 30
vpslld $25, \GH, \T4                            # packed left shift << 25
0907
0908 vpxor \T3, \T2, \T2 # xor the shifted versions
0909 vpxor \T4, \T2, \T2
0910
0911 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
0912
0913 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
0914 vpxor \T2, \GH, \GH # first phase of the reduction complete
0915
0916 #second phase of the reduction
0917
vpsrld $1, \GH, \T2                             # packed right shift >> 1
vpsrld $2, \GH, \T3                             # packed right shift >> 2
vpsrld $7, \GH, \T4                             # packed right shift >> 7
0921 vpxor \T3, \T2, \T2 # xor the shifted versions
0922 vpxor \T4, \T2, \T2
0923
0924 vpxor \T5, \T2, \T2
0925 vpxor \T2, \GH, \GH
0926 vpxor \T1, \GH, \GH # the result is in GH
0927
0928
0929 .endm
0930
0931 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
0932
# HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
0934 vmovdqa \HK, \T5
0935
0936 vpshufd $0b01001110, \T5, \T1
0937 vpxor \T5, \T1, \T1
0938 vmovdqu \T1, HashKey_k(arg2)
0939
0940 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
0941 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
0942 vpshufd $0b01001110, \T5, \T1
0943 vpxor \T5, \T1, \T1
0944 vmovdqu \T1, HashKey_2_k(arg2)
0945
0946 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
0947 vmovdqu \T5, HashKey_3(arg2)
0948 vpshufd $0b01001110, \T5, \T1
0949 vpxor \T5, \T1, \T1
0950 vmovdqu \T1, HashKey_3_k(arg2)
0951
0952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
0953 vmovdqu \T5, HashKey_4(arg2)
0954 vpshufd $0b01001110, \T5, \T1
0955 vpxor \T5, \T1, \T1
0956 vmovdqu \T1, HashKey_4_k(arg2)
0957
0958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
0959 vmovdqu \T5, HashKey_5(arg2)
0960 vpshufd $0b01001110, \T5, \T1
0961 vpxor \T5, \T1, \T1
0962 vmovdqu \T1, HashKey_5_k(arg2)
0963
0964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
0965 vmovdqu \T5, HashKey_6(arg2)
0966 vpshufd $0b01001110, \T5, \T1
0967 vpxor \T5, \T1, \T1
0968 vmovdqu \T1, HashKey_6_k(arg2)
0969
0970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
0971 vmovdqu \T5, HashKey_7(arg2)
0972 vpshufd $0b01001110, \T5, \T1
0973 vpxor \T5, \T1, \T1
0974 vmovdqu \T1, HashKey_7_k(arg2)
0975
0976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
0977 vmovdqu \T5, HashKey_8(arg2)
0978 vpshufd $0b01001110, \T5, \T1
0979 vpxor \T5, \T1, \T1
0980 vmovdqu \T1, HashKey_8_k(arg2)
0981
0982 .endm
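
# Precomputing HashKey^1 .. HashKey^8 (all <<1 mod poly) lets the main loop
# hash eight ciphertext blocks with independent multiplies, using
#   ((((X1)*H + X2)*H + ...)*H + X8)*H = X1*H^8 + X2*H^7 + ... + X8*H^1
# which is what GHASH_8_ENCRYPT_8_PARALLEL_AVX and GHASH_LAST_8_AVX implement.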
0983
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, arg4 are used as pointers only, not modified
0990
0991 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
0992 i = (8-\num_initial_blocks)
0993 setreg
0994 vmovdqu AadHash(arg2), reg_i
0995
0996 # start AES for num_initial_blocks blocks
0997 vmovdqu CurCount(arg2), \CTR
0998
0999 i = (9-\num_initial_blocks)
1000 setreg
1001 .rep \num_initial_blocks
1002 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1003 vmovdqa \CTR, reg_i
1004 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1005 i = (i+1)
1006 setreg
1007 .endr
1008
1009 vmovdqa (arg1), \T_key
1010 i = (9-\num_initial_blocks)
1011 setreg
1012 .rep \num_initial_blocks
1013 vpxor \T_key, reg_i, reg_i
1014 i = (i+1)
1015 setreg
1016 .endr
1017
1018 j = 1
1019 setreg
1020 .rep \REP
1021 vmovdqa 16*j(arg1), \T_key
1022 i = (9-\num_initial_blocks)
1023 setreg
1024 .rep \num_initial_blocks
1025 vaesenc \T_key, reg_i, reg_i
1026 i = (i+1)
1027 setreg
1028 .endr
1029
1030 j = (j+1)
1031 setreg
1032 .endr
1033
1034 vmovdqa 16*j(arg1), \T_key
1035 i = (9-\num_initial_blocks)
1036 setreg
1037 .rep \num_initial_blocks
1038 vaesenclast \T_key, reg_i, reg_i
1039 i = (i+1)
1040 setreg
1041 .endr
1042
1043 i = (9-\num_initial_blocks)
1044 setreg
1045 .rep \num_initial_blocks
1046 vmovdqu (arg4, %r11), \T1
1047 vpxor \T1, reg_i, reg_i
1048 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1049 add $16, %r11
1050 .if \ENC_DEC == DEC
1051 vmovdqa \T1, reg_i
1052 .endif
1053 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1054 i = (i+1)
1055 setreg
1056 .endr
1057
1058
1059 i = (8-\num_initial_blocks)
1060 j = (9-\num_initial_blocks)
1061 setreg
1062
1063 .rep \num_initial_blocks
1064 vpxor reg_i, reg_j, reg_j
1065 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1066 i = (i+1)
1067 j = (j+1)
1068 setreg
1069 .endr
1070 # XMM8 has the combined result here
1071
1072 vmovdqa \XMM8, TMP1(%rsp)
1073 vmovdqa \XMM8, \T3
1074
1075 cmp $128, %r13
1076 jl _initial_blocks_done\@ # no need for precomputed constants
1077
1078 ###############################################################################
# prepare and encrypt the next 8 counter blocks (full 8-block stride)
1080 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1081 vmovdqa \CTR, \XMM1
1082 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1083
1084 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1085 vmovdqa \CTR, \XMM2
1086 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1087
1088 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1089 vmovdqa \CTR, \XMM3
1090 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1091
1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1093 vmovdqa \CTR, \XMM4
1094 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1095
1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1097 vmovdqa \CTR, \XMM5
1098 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1099
1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1101 vmovdqa \CTR, \XMM6
1102 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1103
1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1105 vmovdqa \CTR, \XMM7
1106 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1107
1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1109 vmovdqa \CTR, \XMM8
1110 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1111
1112 vmovdqa (arg1), \T_key
1113 vpxor \T_key, \XMM1, \XMM1
1114 vpxor \T_key, \XMM2, \XMM2
1115 vpxor \T_key, \XMM3, \XMM3
1116 vpxor \T_key, \XMM4, \XMM4
1117 vpxor \T_key, \XMM5, \XMM5
1118 vpxor \T_key, \XMM6, \XMM6
1119 vpxor \T_key, \XMM7, \XMM7
1120 vpxor \T_key, \XMM8, \XMM8
1121
1122 i = 1
1123 setreg
1124 .rep \REP # do REP rounds
1125 vmovdqa 16*i(arg1), \T_key
1126 vaesenc \T_key, \XMM1, \XMM1
1127 vaesenc \T_key, \XMM2, \XMM2
1128 vaesenc \T_key, \XMM3, \XMM3
1129 vaesenc \T_key, \XMM4, \XMM4
1130 vaesenc \T_key, \XMM5, \XMM5
1131 vaesenc \T_key, \XMM6, \XMM6
1132 vaesenc \T_key, \XMM7, \XMM7
1133 vaesenc \T_key, \XMM8, \XMM8
1134 i = (i+1)
1135 setreg
1136 .endr
1137
1138 vmovdqa 16*i(arg1), \T_key
1139 vaesenclast \T_key, \XMM1, \XMM1
1140 vaesenclast \T_key, \XMM2, \XMM2
1141 vaesenclast \T_key, \XMM3, \XMM3
1142 vaesenclast \T_key, \XMM4, \XMM4
1143 vaesenclast \T_key, \XMM5, \XMM5
1144 vaesenclast \T_key, \XMM6, \XMM6
1145 vaesenclast \T_key, \XMM7, \XMM7
1146 vaesenclast \T_key, \XMM8, \XMM8
1147
1148 vmovdqu (arg4, %r11), \T1
1149 vpxor \T1, \XMM1, \XMM1
1150 vmovdqu \XMM1, (arg3 , %r11)
1151 .if \ENC_DEC == DEC
1152 vmovdqa \T1, \XMM1
1153 .endif
1154
1155 vmovdqu 16*1(arg4, %r11), \T1
1156 vpxor \T1, \XMM2, \XMM2
1157 vmovdqu \XMM2, 16*1(arg3 , %r11)
1158 .if \ENC_DEC == DEC
1159 vmovdqa \T1, \XMM2
1160 .endif
1161
1162 vmovdqu 16*2(arg4, %r11), \T1
1163 vpxor \T1, \XMM3, \XMM3
1164 vmovdqu \XMM3, 16*2(arg3 , %r11)
1165 .if \ENC_DEC == DEC
1166 vmovdqa \T1, \XMM3
1167 .endif
1168
1169 vmovdqu 16*3(arg4, %r11), \T1
1170 vpxor \T1, \XMM4, \XMM4
1171 vmovdqu \XMM4, 16*3(arg3 , %r11)
1172 .if \ENC_DEC == DEC
1173 vmovdqa \T1, \XMM4
1174 .endif
1175
1176 vmovdqu 16*4(arg4, %r11), \T1
1177 vpxor \T1, \XMM5, \XMM5
1178 vmovdqu \XMM5, 16*4(arg3 , %r11)
1179 .if \ENC_DEC == DEC
1180 vmovdqa \T1, \XMM5
1181 .endif
1182
1183 vmovdqu 16*5(arg4, %r11), \T1
1184 vpxor \T1, \XMM6, \XMM6
1185 vmovdqu \XMM6, 16*5(arg3 , %r11)
1186 .if \ENC_DEC == DEC
1187 vmovdqa \T1, \XMM6
1188 .endif
1189
1190 vmovdqu 16*6(arg4, %r11), \T1
1191 vpxor \T1, \XMM7, \XMM7
1192 vmovdqu \XMM7, 16*6(arg3 , %r11)
1193 .if \ENC_DEC == DEC
1194 vmovdqa \T1, \XMM7
1195 .endif
1196
1197 vmovdqu 16*7(arg4, %r11), \T1
1198 vpxor \T1, \XMM8, \XMM8
1199 vmovdqu \XMM8, 16*7(arg3 , %r11)
1200 .if \ENC_DEC == DEC
1201 vmovdqa \T1, \XMM8
1202 .endif
1203
1204 add $128, %r11
1205
1206 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1207 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1208 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1209 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1210 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1211 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1212 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1213 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1214 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1215
1216 ###############################################################################
1217
1218 _initial_blocks_done\@:
1219
1220 .endm
1221
1222 # encrypt 8 blocks at a time
1223 # ghash the 8 previously encrypted ciphertext blocks
1224 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
1225 # r11 is the data offset value
1226 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1227
1228 vmovdqa \XMM1, \T2
1229 vmovdqa \XMM2, TMP2(%rsp)
1230 vmovdqa \XMM3, TMP3(%rsp)
1231 vmovdqa \XMM4, TMP4(%rsp)
1232 vmovdqa \XMM5, TMP5(%rsp)
1233 vmovdqa \XMM6, TMP6(%rsp)
1234 vmovdqa \XMM7, TMP7(%rsp)
1235 vmovdqa \XMM8, TMP8(%rsp)
1236
1237 .if \loop_idx == in_order
1238 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1239 vpaddd ONE(%rip), \XMM1, \XMM2
1240 vpaddd ONE(%rip), \XMM2, \XMM3
1241 vpaddd ONE(%rip), \XMM3, \XMM4
1242 vpaddd ONE(%rip), \XMM4, \XMM5
1243 vpaddd ONE(%rip), \XMM5, \XMM6
1244 vpaddd ONE(%rip), \XMM6, \XMM7
1245 vpaddd ONE(%rip), \XMM7, \XMM8
1246 vmovdqa \XMM8, \CTR
1247
1248 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1249 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1250 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1251 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1252 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1253 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1254 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1255 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1256 .else
1257 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1258 vpaddd ONEf(%rip), \XMM1, \XMM2
1259 vpaddd ONEf(%rip), \XMM2, \XMM3
1260 vpaddd ONEf(%rip), \XMM3, \XMM4
1261 vpaddd ONEf(%rip), \XMM4, \XMM5
1262 vpaddd ONEf(%rip), \XMM5, \XMM6
1263 vpaddd ONEf(%rip), \XMM6, \XMM7
1264 vpaddd ONEf(%rip), \XMM7, \XMM8
1265 vmovdqa \XMM8, \CTR
1266 .endif
1267
1268
1269 #######################################################################
1270
1271 vmovdqu (arg1), \T1
1272 vpxor \T1, \XMM1, \XMM1
1273 vpxor \T1, \XMM2, \XMM2
1274 vpxor \T1, \XMM3, \XMM3
1275 vpxor \T1, \XMM4, \XMM4
1276 vpxor \T1, \XMM5, \XMM5
1277 vpxor \T1, \XMM6, \XMM6
1278 vpxor \T1, \XMM7, \XMM7
1279 vpxor \T1, \XMM8, \XMM8
1280
1281 #######################################################################
1282
1283
1284
1285
1286
1287 vmovdqu 16*1(arg1), \T1
1288 vaesenc \T1, \XMM1, \XMM1
1289 vaesenc \T1, \XMM2, \XMM2
1290 vaesenc \T1, \XMM3, \XMM3
1291 vaesenc \T1, \XMM4, \XMM4
1292 vaesenc \T1, \XMM5, \XMM5
1293 vaesenc \T1, \XMM6, \XMM6
1294 vaesenc \T1, \XMM7, \XMM7
1295 vaesenc \T1, \XMM8, \XMM8
1296
1297 vmovdqu 16*2(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1306
1307
1308 #######################################################################
1309
1310 vmovdqu HashKey_8(arg2), \T5
1311 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1312 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1313
1314 vpshufd $0b01001110, \T2, \T6
1315 vpxor \T2, \T6, \T6
1316
1317 vmovdqu HashKey_8_k(arg2), \T5
1318 vpclmulqdq $0x00, \T5, \T6, \T6
1319
1320 vmovdqu 16*3(arg1), \T1
1321 vaesenc \T1, \XMM1, \XMM1
1322 vaesenc \T1, \XMM2, \XMM2
1323 vaesenc \T1, \XMM3, \XMM3
1324 vaesenc \T1, \XMM4, \XMM4
1325 vaesenc \T1, \XMM5, \XMM5
1326 vaesenc \T1, \XMM6, \XMM6
1327 vaesenc \T1, \XMM7, \XMM7
1328 vaesenc \T1, \XMM8, \XMM8
1329
1330 vmovdqa TMP2(%rsp), \T1
1331 vmovdqu HashKey_7(arg2), \T5
1332 vpclmulqdq $0x11, \T5, \T1, \T3
1333 vpxor \T3, \T4, \T4
1334 vpclmulqdq $0x00, \T5, \T1, \T3
1335 vpxor \T3, \T7, \T7
1336
1337 vpshufd $0b01001110, \T1, \T3
1338 vpxor \T1, \T3, \T3
1339 vmovdqu HashKey_7_k(arg2), \T5
1340 vpclmulqdq $0x10, \T5, \T3, \T3
1341 vpxor \T3, \T6, \T6
1342
1343 vmovdqu 16*4(arg1), \T1
1344 vaesenc \T1, \XMM1, \XMM1
1345 vaesenc \T1, \XMM2, \XMM2
1346 vaesenc \T1, \XMM3, \XMM3
1347 vaesenc \T1, \XMM4, \XMM4
1348 vaesenc \T1, \XMM5, \XMM5
1349 vaesenc \T1, \XMM6, \XMM6
1350 vaesenc \T1, \XMM7, \XMM7
1351 vaesenc \T1, \XMM8, \XMM8
1352
1353 #######################################################################
1354
1355 vmovdqa TMP3(%rsp), \T1
1356 vmovdqu HashKey_6(arg2), \T5
1357 vpclmulqdq $0x11, \T5, \T1, \T3
1358 vpxor \T3, \T4, \T4
1359 vpclmulqdq $0x00, \T5, \T1, \T3
1360 vpxor \T3, \T7, \T7
1361
1362 vpshufd $0b01001110, \T1, \T3
1363 vpxor \T1, \T3, \T3
1364 vmovdqu HashKey_6_k(arg2), \T5
1365 vpclmulqdq $0x10, \T5, \T3, \T3
1366 vpxor \T3, \T6, \T6
1367
1368 vmovdqu 16*5(arg1), \T1
1369 vaesenc \T1, \XMM1, \XMM1
1370 vaesenc \T1, \XMM2, \XMM2
1371 vaesenc \T1, \XMM3, \XMM3
1372 vaesenc \T1, \XMM4, \XMM4
1373 vaesenc \T1, \XMM5, \XMM5
1374 vaesenc \T1, \XMM6, \XMM6
1375 vaesenc \T1, \XMM7, \XMM7
1376 vaesenc \T1, \XMM8, \XMM8
1377
1378 vmovdqa TMP4(%rsp), \T1
1379 vmovdqu HashKey_5(arg2), \T5
1380 vpclmulqdq $0x11, \T5, \T1, \T3
1381 vpxor \T3, \T4, \T4
1382 vpclmulqdq $0x00, \T5, \T1, \T3
1383 vpxor \T3, \T7, \T7
1384
1385 vpshufd $0b01001110, \T1, \T3
1386 vpxor \T1, \T3, \T3
1387 vmovdqu HashKey_5_k(arg2), \T5
1388 vpclmulqdq $0x10, \T5, \T3, \T3
1389 vpxor \T3, \T6, \T6
1390
1391 vmovdqu 16*6(arg1), \T1
1392 vaesenc \T1, \XMM1, \XMM1
1393 vaesenc \T1, \XMM2, \XMM2
1394 vaesenc \T1, \XMM3, \XMM3
1395 vaesenc \T1, \XMM4, \XMM4
1396 vaesenc \T1, \XMM5, \XMM5
1397 vaesenc \T1, \XMM6, \XMM6
1398 vaesenc \T1, \XMM7, \XMM7
1399 vaesenc \T1, \XMM8, \XMM8
1400
1401
1402 vmovdqa TMP5(%rsp), \T1
1403 vmovdqu HashKey_4(arg2), \T5
1404 vpclmulqdq $0x11, \T5, \T1, \T3
1405 vpxor \T3, \T4, \T4
1406 vpclmulqdq $0x00, \T5, \T1, \T3
1407 vpxor \T3, \T7, \T7
1408
1409 vpshufd $0b01001110, \T1, \T3
1410 vpxor \T1, \T3, \T3
1411 vmovdqu HashKey_4_k(arg2), \T5
1412 vpclmulqdq $0x10, \T5, \T3, \T3
1413 vpxor \T3, \T6, \T6
1414
1415 vmovdqu 16*7(arg1), \T1
1416 vaesenc \T1, \XMM1, \XMM1
1417 vaesenc \T1, \XMM2, \XMM2
1418 vaesenc \T1, \XMM3, \XMM3
1419 vaesenc \T1, \XMM4, \XMM4
1420 vaesenc \T1, \XMM5, \XMM5
1421 vaesenc \T1, \XMM6, \XMM6
1422 vaesenc \T1, \XMM7, \XMM7
1423 vaesenc \T1, \XMM8, \XMM8
1424
1425 vmovdqa TMP6(%rsp), \T1
1426 vmovdqu HashKey_3(arg2), \T5
1427 vpclmulqdq $0x11, \T5, \T1, \T3
1428 vpxor \T3, \T4, \T4
1429 vpclmulqdq $0x00, \T5, \T1, \T3
1430 vpxor \T3, \T7, \T7
1431
1432 vpshufd $0b01001110, \T1, \T3
1433 vpxor \T1, \T3, \T3
1434 vmovdqu HashKey_3_k(arg2), \T5
1435 vpclmulqdq $0x10, \T5, \T3, \T3
1436 vpxor \T3, \T6, \T6
1437
1438
1439 vmovdqu 16*8(arg1), \T1
1440 vaesenc \T1, \XMM1, \XMM1
1441 vaesenc \T1, \XMM2, \XMM2
1442 vaesenc \T1, \XMM3, \XMM3
1443 vaesenc \T1, \XMM4, \XMM4
1444 vaesenc \T1, \XMM5, \XMM5
1445 vaesenc \T1, \XMM6, \XMM6
1446 vaesenc \T1, \XMM7, \XMM7
1447 vaesenc \T1, \XMM8, \XMM8
1448
1449 vmovdqa TMP7(%rsp), \T1
1450 vmovdqu HashKey_2(arg2), \T5
1451 vpclmulqdq $0x11, \T5, \T1, \T3
1452 vpxor \T3, \T4, \T4
1453 vpclmulqdq $0x00, \T5, \T1, \T3
1454 vpxor \T3, \T7, \T7
1455
1456 vpshufd $0b01001110, \T1, \T3
1457 vpxor \T1, \T3, \T3
1458 vmovdqu HashKey_2_k(arg2), \T5
1459 vpclmulqdq $0x10, \T5, \T3, \T3
1460 vpxor \T3, \T6, \T6
1461
1462 #######################################################################
1463
1464 vmovdqu 16*9(arg1), \T5
1465 vaesenc \T5, \XMM1, \XMM1
1466 vaesenc \T5, \XMM2, \XMM2
1467 vaesenc \T5, \XMM3, \XMM3
1468 vaesenc \T5, \XMM4, \XMM4
1469 vaesenc \T5, \XMM5, \XMM5
1470 vaesenc \T5, \XMM6, \XMM6
1471 vaesenc \T5, \XMM7, \XMM7
1472 vaesenc \T5, \XMM8, \XMM8
1473
1474 vmovdqa TMP8(%rsp), \T1
1475 vmovdqu HashKey(arg2), \T5
1476 vpclmulqdq $0x11, \T5, \T1, \T3
1477 vpxor \T3, \T4, \T4
1478 vpclmulqdq $0x00, \T5, \T1, \T3
1479 vpxor \T3, \T7, \T7
1480
1481 vpshufd $0b01001110, \T1, \T3
1482 vpxor \T1, \T3, \T3
1483 vmovdqu HashKey_k(arg2), \T5
1484 vpclmulqdq $0x10, \T5, \T3, \T3
1485 vpxor \T3, \T6, \T6
1486
1487 vpxor \T4, \T6, \T6
1488 vpxor \T7, \T6, \T6
1489
1490 vmovdqu 16*10(arg1), \T5
1491
1492 i = 11
1493 setreg
1494 .rep (\REP-9)
1495
1496 vaesenc \T5, \XMM1, \XMM1
1497 vaesenc \T5, \XMM2, \XMM2
1498 vaesenc \T5, \XMM3, \XMM3
1499 vaesenc \T5, \XMM4, \XMM4
1500 vaesenc \T5, \XMM5, \XMM5
1501 vaesenc \T5, \XMM6, \XMM6
1502 vaesenc \T5, \XMM7, \XMM7
1503 vaesenc \T5, \XMM8, \XMM8
1504
1505 vmovdqu 16*i(arg1), \T5
1506 i = i + 1
1507 setreg
1508 .endr
1509
1510 i = 0
1511 j = 1
1512 setreg
1513 .rep 8
1514 vpxor 16*i(arg4, %r11), \T5, \T2
1515 .if \ENC_DEC == ENC
1516 vaesenclast \T2, reg_j, reg_j
1517 .else
1518 vaesenclast \T2, reg_j, \T3
1519 vmovdqu 16*i(arg4, %r11), reg_j
1520 vmovdqu \T3, 16*i(arg3, %r11)
1521 .endif
1522 i = (i+1)
1523 j = (j+1)
1524 setreg
1525 .endr
1526 #######################################################################
1527
1528
vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
1531 vpxor \T3, \T7, \T7
1532 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1533
1534
1535
1536 #######################################################################
1537 #first phase of the reduction
1538 #######################################################################
vpslld $31, \T7, \T2                            # packed left shift << 31
vpslld $30, \T7, \T3                            # packed left shift << 30
vpslld $25, \T7, \T4                            # packed left shift << 25
1542
1543 vpxor \T3, \T2, \T2 # xor the shifted versions
1544 vpxor \T4, \T2, \T2
1545
1546 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1547
1548 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1549 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1550 #######################################################################
1551 .if \ENC_DEC == ENC
1552 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1553 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1554 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1555 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1556 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1557 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1558 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1559 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1560 .endif
1561
1562 #######################################################################
1563 #second phase of the reduction
vpsrld $1, \T7, \T2                             # packed right shift >> 1
vpsrld $2, \T7, \T3                             # packed right shift >> 2
vpsrld $7, \T7, \T4                             # packed right shift >> 7
1567 vpxor \T3, \T2, \T2 # xor the shifted versions
1568 vpxor \T4, \T2, \T2
1569
1570 vpxor \T1, \T2, \T2
1571 vpxor \T2, \T7, \T7
1572 vpxor \T7, \T6, \T6 # the result is in T6
1573 #######################################################################
1574
1575 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1576 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1577 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1578 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1579 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1580 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1581 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1582 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1583
1584
1585 vpxor \T6, \XMM1, \XMM1
1586
1587
1588
1589 .endm
1590
1591
# GHASH the last 8 ciphertext blocks.
1593 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1594
1595 ## Karatsuba Method
1596
1597
1598 vpshufd $0b01001110, \XMM1, \T2
1599 vpxor \XMM1, \T2, \T2
1600 vmovdqu HashKey_8(arg2), \T5
1601 vpclmulqdq $0x11, \T5, \XMM1, \T6
1602 vpclmulqdq $0x00, \T5, \XMM1, \T7
1603
1604 vmovdqu HashKey_8_k(arg2), \T3
1605 vpclmulqdq $0x00, \T3, \T2, \XMM1
1606
1607 ######################
1608
1609 vpshufd $0b01001110, \XMM2, \T2
1610 vpxor \XMM2, \T2, \T2
1611 vmovdqu HashKey_7(arg2), \T5
1612 vpclmulqdq $0x11, \T5, \XMM2, \T4
1613 vpxor \T4, \T6, \T6
1614
1615 vpclmulqdq $0x00, \T5, \XMM2, \T4
1616 vpxor \T4, \T7, \T7
1617
1618 vmovdqu HashKey_7_k(arg2), \T3
1619 vpclmulqdq $0x00, \T3, \T2, \T2
1620 vpxor \T2, \XMM1, \XMM1
1621
1622 ######################
1623
1624 vpshufd $0b01001110, \XMM3, \T2
1625 vpxor \XMM3, \T2, \T2
1626 vmovdqu HashKey_6(arg2), \T5
1627 vpclmulqdq $0x11, \T5, \XMM3, \T4
1628 vpxor \T4, \T6, \T6
1629
1630 vpclmulqdq $0x00, \T5, \XMM3, \T4
1631 vpxor \T4, \T7, \T7
1632
1633 vmovdqu HashKey_6_k(arg2), \T3
1634 vpclmulqdq $0x00, \T3, \T2, \T2
1635 vpxor \T2, \XMM1, \XMM1
1636
1637 ######################
1638
1639 vpshufd $0b01001110, \XMM4, \T2
1640 vpxor \XMM4, \T2, \T2
1641 vmovdqu HashKey_5(arg2), \T5
1642 vpclmulqdq $0x11, \T5, \XMM4, \T4
1643 vpxor \T4, \T6, \T6
1644
1645 vpclmulqdq $0x00, \T5, \XMM4, \T4
1646 vpxor \T4, \T7, \T7
1647
1648 vmovdqu HashKey_5_k(arg2), \T3
1649 vpclmulqdq $0x00, \T3, \T2, \T2
1650 vpxor \T2, \XMM1, \XMM1
1651
1652 ######################
1653
1654 vpshufd $0b01001110, \XMM5, \T2
1655 vpxor \XMM5, \T2, \T2
1656 vmovdqu HashKey_4(arg2), \T5
1657 vpclmulqdq $0x11, \T5, \XMM5, \T4
1658 vpxor \T4, \T6, \T6
1659
1660 vpclmulqdq $0x00, \T5, \XMM5, \T4
1661 vpxor \T4, \T7, \T7
1662
1663 vmovdqu HashKey_4_k(arg2), \T3
1664 vpclmulqdq $0x00, \T3, \T2, \T2
1665 vpxor \T2, \XMM1, \XMM1
1666
1667 ######################
1668
1669 vpshufd $0b01001110, \XMM6, \T2
1670 vpxor \XMM6, \T2, \T2
1671 vmovdqu HashKey_3(arg2), \T5
1672 vpclmulqdq $0x11, \T5, \XMM6, \T4
1673 vpxor \T4, \T6, \T6
1674
1675 vpclmulqdq $0x00, \T5, \XMM6, \T4
1676 vpxor \T4, \T7, \T7
1677
1678 vmovdqu HashKey_3_k(arg2), \T3
1679 vpclmulqdq $0x00, \T3, \T2, \T2
1680 vpxor \T2, \XMM1, \XMM1
1681
1682 ######################
1683
1684 vpshufd $0b01001110, \XMM7, \T2
1685 vpxor \XMM7, \T2, \T2
1686 vmovdqu HashKey_2(arg2), \T5
1687 vpclmulqdq $0x11, \T5, \XMM7, \T4
1688 vpxor \T4, \T6, \T6
1689
1690 vpclmulqdq $0x00, \T5, \XMM7, \T4
1691 vpxor \T4, \T7, \T7
1692
1693 vmovdqu HashKey_2_k(arg2), \T3
1694 vpclmulqdq $0x00, \T3, \T2, \T2
1695 vpxor \T2, \XMM1, \XMM1
1696
1697 ######################
1698
1699 vpshufd $0b01001110, \XMM8, \T2
1700 vpxor \XMM8, \T2, \T2
1701 vmovdqu HashKey(arg2), \T5
1702 vpclmulqdq $0x11, \T5, \XMM8, \T4
1703 vpxor \T4, \T6, \T6
1704
1705 vpclmulqdq $0x00, \T5, \XMM8, \T4
1706 vpxor \T4, \T7, \T7
1707
1708 vmovdqu HashKey_k(arg2), \T3
1709 vpclmulqdq $0x00, \T3, \T2, \T2
1710
1711 vpxor \T2, \XMM1, \XMM1
1712 vpxor \T6, \XMM1, \XMM1
1713 vpxor \T7, \XMM1, \T2
1714
1715
1716
1717
1718 vpslldq $8, \T2, \T4
1719 vpsrldq $8, \T2, \T2
1720
1721 vpxor \T4, \T7, \T7
1722 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1723 # the accumulated carry-less multiplications
1724
1725 #######################################################################
1726 #first phase of the reduction
vpslld $31, \T7, \T2                            # packed left shift << 31
vpslld $30, \T7, \T3                            # packed left shift << 30
vpslld $25, \T7, \T4                            # packed left shift << 25
1730
1731 vpxor \T3, \T2, \T2 # xor the shifted versions
1732 vpxor \T4, \T2, \T2
1733
1734 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1735
1736 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1737 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1738 #######################################################################
1739
1740
1741 #second phase of the reduction
vpsrld $1, \T7, \T2                             # packed right shift >> 1
vpsrld $2, \T7, \T3                             # packed right shift >> 2
vpsrld $7, \T7, \T4                             # packed right shift >> 7
1745 vpxor \T3, \T2, \T2 # xor the shifted versions
1746 vpxor \T4, \T2, \T2
1747
1748 vpxor \T1, \T2, \T2
1749 vpxor \T2, \T7, \T7
1750 vpxor \T7, \T6, \T6 # the result is in T6
1751
1752 .endm
1753
1754 #############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data         *my_ctx_data,
#         gcm_context_data *data,
#         u8       *iv,           /* 16-byte pre-counter block j0: salt, IV, 0x1 (see layout above) */
#         u8       *hash_subkey,  /* H, the Hash sub key */
#         const u8 *aad,          /* Additional Authentication Data (AAD) */
#         u64      aad_len)       /* Length of AAD in bytes */
1765 #############################################################
1766 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1767 FUNC_SAVE
1768 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1769 FUNC_RESTORE
1770 RET
1771 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1772
1773 ###############################################################################
1774 #void aesni_gcm_enc_update_avx_gen2(
1775 # gcm_data *my_ctx_data,
1776 # gcm_context_data *data,
1777 # u8 *out,
1778 # const u8 *in,
1779 # u64 plaintext_len)
1780 ###############################################################################
1781 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1782 FUNC_SAVE
1783 mov keysize, %eax
1784 cmp $32, %eax
1785 je key_256_enc_update
1786 cmp $16, %eax
1787 je key_128_enc_update
1788 # must be 192
1789 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1790 FUNC_RESTORE
1791 RET
1792 key_128_enc_update:
1793 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1794 FUNC_RESTORE
1795 RET
1796 key_256_enc_update:
1797 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1798 FUNC_RESTORE
1799 RET
1800 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1801
1802 ###############################################################################
1803 #void aesni_gcm_dec_update_avx_gen2(
1804 # gcm_data *my_ctx_data,
1805 # gcm_context_data *data,
1806 # u8 *out,
1807 # const u8 *in,
1808 # u64 plaintext_len)
1809 ###############################################################################
1810 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1811 FUNC_SAVE
1812 mov keysize,%eax
1813 cmp $32, %eax
1814 je key_256_dec_update
1815 cmp $16, %eax
1816 je key_128_dec_update
1817 # must be 192
1818 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1819 FUNC_RESTORE
1820 RET
1821 key_128_dec_update:
1822 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1823 FUNC_RESTORE
1824 RET
1825 key_256_dec_update:
1826 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1827 FUNC_RESTORE
1828 RET
1829 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1830
1831 ###############################################################################
1832 #void aesni_gcm_finalize_avx_gen2(
1833 # gcm_data *my_ctx_data,
1834 # gcm_context_data *data,
1835 # u8 *auth_tag,
1836 # u64 auth_tag_len)
1838 ###############################################################################
1839 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1840 FUNC_SAVE
1841 mov keysize,%eax
1842 cmp $32, %eax
1843 je key_256_finalize
1844 cmp $16, %eax
1845 je key_128_finalize
1846 # must be 192
1847 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1848 FUNC_RESTORE
1849 RET
1850 key_128_finalize:
1851 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1852 FUNC_RESTORE
1853 RET
1854 key_256_finalize:
1855 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1856 FUNC_RESTORE
1857 RET
1858 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
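###############################################################################
# A typical caller drives the three avx_gen2 entry points above in sequence.
# Illustrative sketch only (argument names follow the prototype comments
# above; the real call sites live in the accompanying C glue code):
#
#   aesni_gcm_init_avx_gen2(my_ctx_data, data, iv, hash_subkey, aad, aad_len);
#   aesni_gcm_enc_update_avx_gen2(my_ctx_data, data, out, in, len);  /* may repeat */
#   aesni_gcm_finalize_avx_gen2(my_ctx_data, data, auth_tag, auth_tag_len);
###############################################################################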
1859
1860 ###############################################################################
1861 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1862 # Input: A and B (128-bits each, bit-reflected)
1863 # Output: C = A*B*x mod poly (i.e. >> 1)
1864 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1865 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1866 ###############################################################################
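# The 256-bit carry-less product is built from four 64x64 vpclmulqdq results:
# a0*b0 (low half), a1*b1 (high half) and the two cross terms a1*b0 + a0*b1,
# whose 128-bit sum is split by the vpslldq/vpsrldq pair below and folded into
# the low and high halves before the POLY2-based reduction.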
1867 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1868
1869 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1870 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1871 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1872 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1873 vpxor \T3, \GH, \GH
1874
1875
1876 vpsrldq $8, \GH, \T3 # shift-R GH 2 DWs (into T3)
1877 vpslldq $8, \GH, \GH # shift-L GH 2 DWs
1878
1879 vpxor \T3, \T1, \T1
1880 vpxor \T2, \GH, \GH
1881
1882 #######################################################################
1883 #first phase of the reduction
1884 vmovdqa POLY2(%rip), \T3
1885
1886 vpclmulqdq $0x01, \GH, \T3, \T2
1887 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1888
1889 vpxor \T2, \GH, \GH # first phase of the reduction complete
1890 #######################################################################
1891 #second phase of the reduction
1892 vpclmulqdq $0x00, \GH, \T3, \T2
1893 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1894
1895 vpclmulqdq $0x10, \GH, \T3, \GH
1896 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1897
1898 vpxor \T2, \GH, \GH # second phase of the reduction complete
1899 #######################################################################
1900 vpxor \T1, \GH, \GH # the result is in GH
1901
1902
1903 .endm
1904
1905 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1906
1907 # Precompute HashKey^2 .. HashKey^8 (each <<1 mod poly) for the 8-block parallel GHASH
1908 vmovdqa \HK, \T5
1909 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1910 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1911
1912 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1913 vmovdqu \T5, HashKey_3(arg2)
1914
1915 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1916 vmovdqu \T5, HashKey_4(arg2)
1917
1918 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1919 vmovdqu \T5, HashKey_5(arg2)
1920
1921 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1922 vmovdqu \T5, HashKey_6(arg2)
1923
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1925 vmovdqu \T5, HashKey_7(arg2)
1926
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1928 vmovdqu \T5, HashKey_8(arg2)
1929
1930 .endm
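# Together with HashKey itself, the powers HashKey^2..HashKey^8 stored above
# let GHASH_8_ENCRYPT_8_PARALLEL_AVX2 hash eight ciphertext blocks with
# independent multiplications instead of a serial Horner chain, using
#
#   Y_new = (...((Y ^ C1)*H ^ C2)*H ... ^ C8)*H
#         = Y*H^8 ^ C1*H^8 ^ C2*H^7 ^ ... ^ C8*H
#
# which is why block 1 of each batch (into which the running hash Y has been
# folded) is multiplied by HashKey_8 and block 8 by HashKey.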
1931
1932 ## if a = number of total plaintext bytes
1933 ## b = floor(a/16)
1934 ## num_initial_blocks = b mod 8
1935 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1936 ## r10, r11, r12, rax are clobbered
1937 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
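## The macro below CTR-encrypts the num_initial_blocks (0..7) leading blocks
## in registers xmm(9-n)..xmm8, GHASHes them serially into the running hash
## (which ends up in xmm8) and, if at least 128 bytes of input remain, also
## encrypts and writes out a first full batch of 8 counter blocks so the
## 8-block main loop starts with ciphertext ready to hash.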
1938
1939 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1940 i = (8-\num_initial_blocks)
1941 setreg
1942 vmovdqu AadHash(arg2), reg_i
1943
1944 # start AES for num_initial_blocks blocks
1945 vmovdqu CurCount(arg2), \CTR
1946
1947 i = (9-\num_initial_blocks)
1948 setreg
1949 .rep \num_initial_blocks
1950 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1951 vmovdqa \CTR, reg_i
1952 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1953 i = (i+1)
1954 setreg
1955 .endr
1956
1957 vmovdqa (arg1), \T_key
1958 i = (9-\num_initial_blocks)
1959 setreg
1960 .rep \num_initial_blocks
1961 vpxor \T_key, reg_i, reg_i
1962 i = (i+1)
1963 setreg
1964 .endr
1965
1966 j = 1
1967 setreg
1968 .rep \REP
1969 vmovdqa 16*j(arg1), \T_key
1970 i = (9-\num_initial_blocks)
1971 setreg
1972 .rep \num_initial_blocks
1973 vaesenc \T_key, reg_i, reg_i
1974 i = (i+1)
1975 setreg
1976 .endr
1977
1978 j = (j+1)
1979 setreg
1980 .endr
1981
1982
1983 vmovdqa 16*j(arg1), \T_key
1984 i = (9-\num_initial_blocks)
1985 setreg
1986 .rep \num_initial_blocks
1987 vaesenclast \T_key, reg_i, reg_i
1988 i = (i+1)
1989 setreg
1990 .endr
1991
1992 i = (9-\num_initial_blocks)
1993 setreg
1994 .rep \num_initial_blocks
1995 vmovdqu (arg4, %r11), \T1
1996 vpxor \T1, reg_i, reg_i
1997 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1998 # num_initial_blocks blocks
1999 add $16, %r11
2000 .if \ENC_DEC == DEC
2001 vmovdqa \T1, reg_i
2002 .endif
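# (for decryption the GHASH input must be the ciphertext, i.e. the original
# input block just saved in \T1, not the decrypted output)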
2003 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2004 i = (i+1)
2005 setreg
2006 .endr
2007
2008
2009 i = (8-\num_initial_blocks)
2010 j = (9-\num_initial_blocks)
2011 setreg
2012
2013 .rep \num_initial_blocks
2014 vpxor reg_i, reg_j, reg_j
2015 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2016 i = (i+1)
2017 j = (j+1)
2018 setreg
2019 .endr
2020 # XMM8 has the combined result here
2021
2022 vmovdqa \XMM8, TMP1(%rsp)
2023 vmovdqa \XMM8, \T3
2024
2025 cmp $128, %r13
2026 jl _initial_blocks_done\@ # less than 128 bytes left: the 8-block main loop will not run, so skip priming it
2027
2028 ###############################################################################
2029 # prepare 8 counter blocks and encrypt them to prime the 8-block main loop
2030 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2031 vmovdqa \CTR, \XMM1
2032 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2033
2034 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2035 vmovdqa \CTR, \XMM2
2036 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2037
2038 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2039 vmovdqa \CTR, \XMM3
2040 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2041
2042 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2043 vmovdqa \CTR, \XMM4
2044 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2045
2046 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2047 vmovdqa \CTR, \XMM5
2048 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2049
2050 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2051 vmovdqa \CTR, \XMM6
2052 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2053
2054 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2055 vmovdqa \CTR, \XMM7
2056 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2057
2058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2059 vmovdqa \CTR, \XMM8
2060 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2061
2062 vmovdqa (arg1), \T_key
2063 vpxor \T_key, \XMM1, \XMM1
2064 vpxor \T_key, \XMM2, \XMM2
2065 vpxor \T_key, \XMM3, \XMM3
2066 vpxor \T_key, \XMM4, \XMM4
2067 vpxor \T_key, \XMM5, \XMM5
2068 vpxor \T_key, \XMM6, \XMM6
2069 vpxor \T_key, \XMM7, \XMM7
2070 vpxor \T_key, \XMM8, \XMM8
2071
2072 i = 1
2073 setreg
2074 .rep \REP # do REP rounds
2075 vmovdqa 16*i(arg1), \T_key
2076 vaesenc \T_key, \XMM1, \XMM1
2077 vaesenc \T_key, \XMM2, \XMM2
2078 vaesenc \T_key, \XMM3, \XMM3
2079 vaesenc \T_key, \XMM4, \XMM4
2080 vaesenc \T_key, \XMM5, \XMM5
2081 vaesenc \T_key, \XMM6, \XMM6
2082 vaesenc \T_key, \XMM7, \XMM7
2083 vaesenc \T_key, \XMM8, \XMM8
2084 i = (i+1)
2085 setreg
2086 .endr
2087
2088
2089 vmovdqa 16*i(arg1), \T_key
2090 vaesenclast \T_key, \XMM1, \XMM1
2091 vaesenclast \T_key, \XMM2, \XMM2
2092 vaesenclast \T_key, \XMM3, \XMM3
2093 vaesenclast \T_key, \XMM4, \XMM4
2094 vaesenclast \T_key, \XMM5, \XMM5
2095 vaesenclast \T_key, \XMM6, \XMM6
2096 vaesenclast \T_key, \XMM7, \XMM7
2097 vaesenclast \T_key, \XMM8, \XMM8
2098
2099 vmovdqu (arg4, %r11), \T1
2100 vpxor \T1, \XMM1, \XMM1
2101 vmovdqu \XMM1, (arg3 , %r11)
2102 .if \ENC_DEC == DEC
2103 vmovdqa \T1, \XMM1
2104 .endif
2105
2106 vmovdqu 16*1(arg4, %r11), \T1
2107 vpxor \T1, \XMM2, \XMM2
2108 vmovdqu \XMM2, 16*1(arg3 , %r11)
2109 .if \ENC_DEC == DEC
2110 vmovdqa \T1, \XMM2
2111 .endif
2112
2113 vmovdqu 16*2(arg4, %r11), \T1
2114 vpxor \T1, \XMM3, \XMM3
2115 vmovdqu \XMM3, 16*2(arg3 , %r11)
2116 .if \ENC_DEC == DEC
2117 vmovdqa \T1, \XMM3
2118 .endif
2119
2120 vmovdqu 16*3(arg4, %r11), \T1
2121 vpxor \T1, \XMM4, \XMM4
2122 vmovdqu \XMM4, 16*3(arg3 , %r11)
2123 .if \ENC_DEC == DEC
2124 vmovdqa \T1, \XMM4
2125 .endif
2126
2127 vmovdqu 16*4(arg4, %r11), \T1
2128 vpxor \T1, \XMM5, \XMM5
2129 vmovdqu \XMM5, 16*4(arg3 , %r11)
2130 .if \ENC_DEC == DEC
2131 vmovdqa \T1, \XMM5
2132 .endif
2133
2134 vmovdqu 16*5(arg4, %r11), \T1
2135 vpxor \T1, \XMM6, \XMM6
2136 vmovdqu \XMM6, 16*5(arg3 , %r11)
2137 .if \ENC_DEC == DEC
2138 vmovdqa \T1, \XMM6
2139 .endif
2140
2141 vmovdqu 16*6(arg4, %r11), \T1
2142 vpxor \T1, \XMM7, \XMM7
2143 vmovdqu \XMM7, 16*6(arg3 , %r11)
2144 .if \ENC_DEC == DEC
2145 vmovdqa \T1, \XMM7
2146 .endif
2147
2148 vmovdqu 16*7(arg4, %r11), \T1
2149 vpxor \T1, \XMM8, \XMM8
2150 vmovdqu \XMM8, 16*7(arg3 , %r11)
2151 .if \ENC_DEC == DEC
2152 vmovdqa \T1, \XMM8
2153 .endif
2154
2155 add $128, %r11
2156
2157 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2158 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2159 # the corresponding ciphertext
2160 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2161 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2162 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2163 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2164 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2165 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2166 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2167
2168 ###############################################################################
2169
2170 _initial_blocks_done\@:
2171
2172
2173 .endm
2174
2175
2176
2177 # encrypt 8 blocks at a time
2178 # ghash the 8 previously encrypted ciphertext blocks
2179 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
2180 # r11 is the data offset value
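# The AES rounds for the current 8 counter blocks are interleaved with the
# vpclmulqdq work that GHASHes the previous 8 (byte-reflected) ciphertext
# blocks, which arrive in XMM1..XMM8 and are first parked in \T2 and
# TMP2..TMP8 on the stack, so the AES and carry-less-multiply units can hide
# each other's latency.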
2181 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2182
2183 vmovdqa \XMM1, \T2
2184 vmovdqa \XMM2, TMP2(%rsp)
2185 vmovdqa \XMM3, TMP3(%rsp)
2186 vmovdqa \XMM4, TMP4(%rsp)
2187 vmovdqa \XMM5, TMP5(%rsp)
2188 vmovdqa \XMM6, TMP6(%rsp)
2189 vmovdqa \XMM7, TMP7(%rsp)
2190 vmovdqa \XMM8, TMP8(%rsp)
2191
2192 .if \loop_idx == in_order
2193 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2194 vpaddd ONE(%rip), \XMM1, \XMM2
2195 vpaddd ONE(%rip), \XMM2, \XMM3
2196 vpaddd ONE(%rip), \XMM3, \XMM4
2197 vpaddd ONE(%rip), \XMM4, \XMM5
2198 vpaddd ONE(%rip), \XMM5, \XMM6
2199 vpaddd ONE(%rip), \XMM6, \XMM7
2200 vpaddd ONE(%rip), \XMM7, \XMM8
2201 vmovdqa \XMM8, \CTR
2202
2203 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2204 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2205 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2206 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2207 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2208 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2209 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2210 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2211 .else
2212 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2213 vpaddd ONEf(%rip), \XMM1, \XMM2
2214 vpaddd ONEf(%rip), \XMM2, \XMM3
2215 vpaddd ONEf(%rip), \XMM3, \XMM4
2216 vpaddd ONEf(%rip), \XMM4, \XMM5
2217 vpaddd ONEf(%rip), \XMM5, \XMM6
2218 vpaddd ONEf(%rip), \XMM6, \XMM7
2219 vpaddd ONEf(%rip), \XMM7, \XMM8
2220 vmovdqa \XMM8, \CTR
2221 .endif
2222
2223
2224 #######################################################################
2225
2226 vmovdqu (arg1), \T1
2227 vpxor \T1, \XMM1, \XMM1
2228 vpxor \T1, \XMM2, \XMM2
2229 vpxor \T1, \XMM3, \XMM3
2230 vpxor \T1, \XMM4, \XMM4
2231 vpxor \T1, \XMM5, \XMM5
2232 vpxor \T1, \XMM6, \XMM6
2233 vpxor \T1, \XMM7, \XMM7
2234 vpxor \T1, \XMM8, \XMM8
2235
2236 #######################################################################
2237
2238
2239
2240
2241
2242 vmovdqu 16*1(arg1), \T1
2243 vaesenc \T1, \XMM1, \XMM1
2244 vaesenc \T1, \XMM2, \XMM2
2245 vaesenc \T1, \XMM3, \XMM3
2246 vaesenc \T1, \XMM4, \XMM4
2247 vaesenc \T1, \XMM5, \XMM5
2248 vaesenc \T1, \XMM6, \XMM6
2249 vaesenc \T1, \XMM7, \XMM7
2250 vaesenc \T1, \XMM8, \XMM8
2251
2252 vmovdqu 16*2(arg1), \T1
2253 vaesenc \T1, \XMM1, \XMM1
2254 vaesenc \T1, \XMM2, \XMM2
2255 vaesenc \T1, \XMM3, \XMM3
2256 vaesenc \T1, \XMM4, \XMM4
2257 vaesenc \T1, \XMM5, \XMM5
2258 vaesenc \T1, \XMM6, \XMM6
2259 vaesenc \T1, \XMM7, \XMM7
2260 vaesenc \T1, \XMM8, \XMM8
2261
2262
2263 #######################################################################
2264
2265 vmovdqu HashKey_8(arg2), \T5
2266 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2267 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2268 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2269 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2270 vpxor \T5, \T6, \T6
2271
2272 vmovdqu 16*3(arg1), \T1
2273 vaesenc \T1, \XMM1, \XMM1
2274 vaesenc \T1, \XMM2, \XMM2
2275 vaesenc \T1, \XMM3, \XMM3
2276 vaesenc \T1, \XMM4, \XMM4
2277 vaesenc \T1, \XMM5, \XMM5
2278 vaesenc \T1, \XMM6, \XMM6
2279 vaesenc \T1, \XMM7, \XMM7
2280 vaesenc \T1, \XMM8, \XMM8
2281
2282 vmovdqa TMP2(%rsp), \T1
2283 vmovdqu HashKey_7(arg2), \T5
2284 vpclmulqdq $0x11, \T5, \T1, \T3
2285 vpxor \T3, \T4, \T4
2286
2287 vpclmulqdq $0x00, \T5, \T1, \T3
2288 vpxor \T3, \T7, \T7
2289
2290 vpclmulqdq $0x01, \T5, \T1, \T3
2291 vpxor \T3, \T6, \T6
2292
2293 vpclmulqdq $0x10, \T5, \T1, \T3
2294 vpxor \T3, \T6, \T6
2295
2296 vmovdqu 16*4(arg1), \T1
2297 vaesenc \T1, \XMM1, \XMM1
2298 vaesenc \T1, \XMM2, \XMM2
2299 vaesenc \T1, \XMM3, \XMM3
2300 vaesenc \T1, \XMM4, \XMM4
2301 vaesenc \T1, \XMM5, \XMM5
2302 vaesenc \T1, \XMM6, \XMM6
2303 vaesenc \T1, \XMM7, \XMM7
2304 vaesenc \T1, \XMM8, \XMM8
2305
2306 #######################################################################
2307
2308 vmovdqa TMP3(%rsp), \T1
2309 vmovdqu HashKey_6(arg2), \T5
2310 vpclmulqdq $0x11, \T5, \T1, \T3
2311 vpxor \T3, \T4, \T4
2312
2313 vpclmulqdq $0x00, \T5, \T1, \T3
2314 vpxor \T3, \T7, \T7
2315
2316 vpclmulqdq $0x01, \T5, \T1, \T3
2317 vpxor \T3, \T6, \T6
2318
2319 vpclmulqdq $0x10, \T5, \T1, \T3
2320 vpxor \T3, \T6, \T6
2321
2322 vmovdqu 16*5(arg1), \T1
2323 vaesenc \T1, \XMM1, \XMM1
2324 vaesenc \T1, \XMM2, \XMM2
2325 vaesenc \T1, \XMM3, \XMM3
2326 vaesenc \T1, \XMM4, \XMM4
2327 vaesenc \T1, \XMM5, \XMM5
2328 vaesenc \T1, \XMM6, \XMM6
2329 vaesenc \T1, \XMM7, \XMM7
2330 vaesenc \T1, \XMM8, \XMM8
2331
2332 vmovdqa TMP4(%rsp), \T1
2333 vmovdqu HashKey_5(arg2), \T5
2334 vpclmulqdq $0x11, \T5, \T1, \T3
2335 vpxor \T3, \T4, \T4
2336
2337 vpclmulqdq $0x00, \T5, \T1, \T3
2338 vpxor \T3, \T7, \T7
2339
2340 vpclmulqdq $0x01, \T5, \T1, \T3
2341 vpxor \T3, \T6, \T6
2342
2343 vpclmulqdq $0x10, \T5, \T1, \T3
2344 vpxor \T3, \T6, \T6
2345
2346 vmovdqu 16*6(arg1), \T1
2347 vaesenc \T1, \XMM1, \XMM1
2348 vaesenc \T1, \XMM2, \XMM2
2349 vaesenc \T1, \XMM3, \XMM3
2350 vaesenc \T1, \XMM4, \XMM4
2351 vaesenc \T1, \XMM5, \XMM5
2352 vaesenc \T1, \XMM6, \XMM6
2353 vaesenc \T1, \XMM7, \XMM7
2354 vaesenc \T1, \XMM8, \XMM8
2355
2356
2357 vmovdqa TMP5(%rsp), \T1
2358 vmovdqu HashKey_4(arg2), \T5
2359 vpclmulqdq $0x11, \T5, \T1, \T3
2360 vpxor \T3, \T4, \T4
2361
2362 vpclmulqdq $0x00, \T5, \T1, \T3
2363 vpxor \T3, \T7, \T7
2364
2365 vpclmulqdq $0x01, \T5, \T1, \T3
2366 vpxor \T3, \T6, \T6
2367
2368 vpclmulqdq $0x10, \T5, \T1, \T3
2369 vpxor \T3, \T6, \T6
2370
2371 vmovdqu 16*7(arg1), \T1
2372 vaesenc \T1, \XMM1, \XMM1
2373 vaesenc \T1, \XMM2, \XMM2
2374 vaesenc \T1, \XMM3, \XMM3
2375 vaesenc \T1, \XMM4, \XMM4
2376 vaesenc \T1, \XMM5, \XMM5
2377 vaesenc \T1, \XMM6, \XMM6
2378 vaesenc \T1, \XMM7, \XMM7
2379 vaesenc \T1, \XMM8, \XMM8
2380
2381 vmovdqa TMP6(%rsp), \T1
2382 vmovdqu HashKey_3(arg2), \T5
2383 vpclmulqdq $0x11, \T5, \T1, \T3
2384 vpxor \T3, \T4, \T4
2385
2386 vpclmulqdq $0x00, \T5, \T1, \T3
2387 vpxor \T3, \T7, \T7
2388
2389 vpclmulqdq $0x01, \T5, \T1, \T3
2390 vpxor \T3, \T6, \T6
2391
2392 vpclmulqdq $0x10, \T5, \T1, \T3
2393 vpxor \T3, \T6, \T6
2394
2395 vmovdqu 16*8(arg1), \T1
2396 vaesenc \T1, \XMM1, \XMM1
2397 vaesenc \T1, \XMM2, \XMM2
2398 vaesenc \T1, \XMM3, \XMM3
2399 vaesenc \T1, \XMM4, \XMM4
2400 vaesenc \T1, \XMM5, \XMM5
2401 vaesenc \T1, \XMM6, \XMM6
2402 vaesenc \T1, \XMM7, \XMM7
2403 vaesenc \T1, \XMM8, \XMM8
2404
2405 vmovdqa TMP7(%rsp), \T1
2406 vmovdqu HashKey_2(arg2), \T5
2407 vpclmulqdq $0x11, \T5, \T1, \T3
2408 vpxor \T3, \T4, \T4
2409
2410 vpclmulqdq $0x00, \T5, \T1, \T3
2411 vpxor \T3, \T7, \T7
2412
2413 vpclmulqdq $0x01, \T5, \T1, \T3
2414 vpxor \T3, \T6, \T6
2415
2416 vpclmulqdq $0x10, \T5, \T1, \T3
2417 vpxor \T3, \T6, \T6
2418
2419
2420 #######################################################################
2421
2422 vmovdqu 16*9(arg1), \T5
2423 vaesenc \T5, \XMM1, \XMM1
2424 vaesenc \T5, \XMM2, \XMM2
2425 vaesenc \T5, \XMM3, \XMM3
2426 vaesenc \T5, \XMM4, \XMM4
2427 vaesenc \T5, \XMM5, \XMM5
2428 vaesenc \T5, \XMM6, \XMM6
2429 vaesenc \T5, \XMM7, \XMM7
2430 vaesenc \T5, \XMM8, \XMM8
2431
2432 vmovdqa TMP8(%rsp), \T1
2433 vmovdqu HashKey(arg2), \T5
2434
2435 vpclmulqdq $0x00, \T5, \T1, \T3
2436 vpxor \T3, \T7, \T7
2437
2438 vpclmulqdq $0x01, \T5, \T1, \T3
2439 vpxor \T3, \T6, \T6
2440
2441 vpclmulqdq $0x10, \T5, \T1, \T3
2442 vpxor \T3, \T6, \T6
2443
2444 vpclmulqdq $0x11, \T5, \T1, \T3
2445 vpxor \T3, \T4, \T1
2446
2447
2448 vmovdqu 16*10(arg1), \T5
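# The .rep below runs only for AES-192/256 (\REP of 11 or 13): it applies the
# remaining middle rounds and leaves the last round key in \T5. The final
# vaesenclast is then merged with the CTR XOR in the following loop by folding
# the input text into \T5 first.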
2449
2450 i = 11
2451 setreg
2452 .rep (\REP-9)
2453 vaesenc \T5, \XMM1, \XMM1
2454 vaesenc \T5, \XMM2, \XMM2
2455 vaesenc \T5, \XMM3, \XMM3
2456 vaesenc \T5, \XMM4, \XMM4
2457 vaesenc \T5, \XMM5, \XMM5
2458 vaesenc \T5, \XMM6, \XMM6
2459 vaesenc \T5, \XMM7, \XMM7
2460 vaesenc \T5, \XMM8, \XMM8
2461
2462 vmovdqu 16*i(arg1), \T5
2463 i = i + 1
2464 setreg
2465 .endr
2466
2467 i = 0
2468 j = 1
2469 setreg
2470 .rep 8
2471 vpxor 16*i(arg4, %r11), \T5, \T2
2472 .if \ENC_DEC == ENC
2473 vaesenclast \T2, reg_j, reg_j
2474 .else
2475 vaesenclast \T2, reg_j, \T3
2476 vmovdqu 16*i(arg4, %r11), reg_j
2477 vmovdqu \T3, 16*i(arg3, %r11)
2478 .endif
2479 i = (i+1)
2480 j = (j+1)
2481 setreg
2482 .endr
2483 #######################################################################
2484
2485
2486 vpslldq $8, \T6, \T3 # shift-L T6 2 DWs (into T3)
2487 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2488 vpxor \T3, \T7, \T7
2489 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2490
2491
2492
2493 #######################################################################
2494 #first phase of the reduction
2495 vmovdqa POLY2(%rip), \T3
2496
2497 vpclmulqdq $0x01, \T7, \T3, \T2
2498 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2499
2500 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2501 #######################################################################
2502 .if \ENC_DEC == ENC
2503 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2504 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2505 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2506 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2507 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2508 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2509 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2510 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2511 .endif
2512
2513 #######################################################################
2514 #second phase of the reduction
2515 vpclmulqdq $0x00, \T7, \T3, \T2
2516 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2517
2518 vpclmulqdq $0x10, \T7, \T3, \T4
2519 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2520
2521 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2522 #######################################################################
2523 vpxor \T4, \T1, \T1 # the result is in T1
2524
2525 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2526 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2527 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2528 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2529 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2530 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2531 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2532 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2533
2534
2535 vpxor \T1, \XMM1, \XMM1
2536
2537
2538
2539 .endm
2540
2541
2542 # GHASH the last 8 ciphertext blocks.
2543 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2544
2545 ## Karatsuba Method
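## Per block, Karatsuba replaces the four 64x64 products of the schoolbook
## form with three: a1*b1, a0*b0 and (a1^a0)*(b1^b0); the middle term is then
## (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0. Below, \T6 accumulates the high products,
## \T7 the low products and \XMM1 the (a1^a0)*(b1^b0) products; they are
## recombined just before the reduction.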
2546
2547 vmovdqu HashKey_8(arg2), \T5
2548
2549 vpshufd $0b01001110, \XMM1, \T2
2550 vpshufd $0b01001110, \T5, \T3
2551 vpxor \XMM1, \T2, \T2
2552 vpxor \T5, \T3, \T3
2553
2554 vpclmulqdq $0x11, \T5, \XMM1, \T6
2555 vpclmulqdq $0x00, \T5, \XMM1, \T7
2556
2557 vpclmulqdq $0x00, \T3, \T2, \XMM1
2558
2559 ######################
2560
2561 vmovdqu HashKey_7(arg2), \T5
2562 vpshufd $0b01001110, \XMM2, \T2
2563 vpshufd $0b01001110, \T5, \T3
2564 vpxor \XMM2, \T2, \T2
2565 vpxor \T5, \T3, \T3
2566
2567 vpclmulqdq $0x11, \T5, \XMM2, \T4
2568 vpxor \T4, \T6, \T6
2569
2570 vpclmulqdq $0x00, \T5, \XMM2, \T4
2571 vpxor \T4, \T7, \T7
2572
2573 vpclmulqdq $0x00, \T3, \T2, \T2
2574
2575 vpxor \T2, \XMM1, \XMM1
2576
2577 ######################
2578
2579 vmovdqu HashKey_6(arg2), \T5
2580 vpshufd $0b01001110, \XMM3, \T2
2581 vpshufd $0b01001110, \T5, \T3
2582 vpxor \XMM3, \T2, \T2
2583 vpxor \T5, \T3, \T3
2584
2585 vpclmulqdq $0x11, \T5, \XMM3, \T4
2586 vpxor \T4, \T6, \T6
2587
2588 vpclmulqdq $0x00, \T5, \XMM3, \T4
2589 vpxor \T4, \T7, \T7
2590
2591 vpclmulqdq $0x00, \T3, \T2, \T2
2592
2593 vpxor \T2, \XMM1, \XMM1
2594
2595 ######################
2596
2597 vmovdqu HashKey_5(arg2), \T5
2598 vpshufd $0b01001110, \XMM4, \T2
2599 vpshufd $0b01001110, \T5, \T3
2600 vpxor \XMM4, \T2, \T2
2601 vpxor \T5, \T3, \T3
2602
2603 vpclmulqdq $0x11, \T5, \XMM4, \T4
2604 vpxor \T4, \T6, \T6
2605
2606 vpclmulqdq $0x00, \T5, \XMM4, \T4
2607 vpxor \T4, \T7, \T7
2608
2609 vpclmulqdq $0x00, \T3, \T2, \T2
2610
2611 vpxor \T2, \XMM1, \XMM1
2612
2613 ######################
2614
2615 vmovdqu HashKey_4(arg2), \T5
2616 vpshufd $0b01001110, \XMM5, \T2
2617 vpshufd $0b01001110, \T5, \T3
2618 vpxor \XMM5, \T2, \T2
2619 vpxor \T5, \T3, \T3
2620
2621 vpclmulqdq $0x11, \T5, \XMM5, \T4
2622 vpxor \T4, \T6, \T6
2623
2624 vpclmulqdq $0x00, \T5, \XMM5, \T4
2625 vpxor \T4, \T7, \T7
2626
2627 vpclmulqdq $0x00, \T3, \T2, \T2
2628
2629 vpxor \T2, \XMM1, \XMM1
2630
2631 ######################
2632
2633 vmovdqu HashKey_3(arg2), \T5
2634 vpshufd $0b01001110, \XMM6, \T2
2635 vpshufd $0b01001110, \T5, \T3
2636 vpxor \XMM6, \T2, \T2
2637 vpxor \T5, \T3, \T3
2638
2639 vpclmulqdq $0x11, \T5, \XMM6, \T4
2640 vpxor \T4, \T6, \T6
2641
2642 vpclmulqdq $0x00, \T5, \XMM6, \T4
2643 vpxor \T4, \T7, \T7
2644
2645 vpclmulqdq $0x00, \T3, \T2, \T2
2646
2647 vpxor \T2, \XMM1, \XMM1
2648
2649 ######################
2650
2651 vmovdqu HashKey_2(arg2), \T5
2652 vpshufd $0b01001110, \XMM7, \T2
2653 vpshufd $0b01001110, \T5, \T3
2654 vpxor \XMM7, \T2, \T2
2655 vpxor \T5, \T3, \T3
2656
2657 vpclmulqdq $0x11, \T5, \XMM7, \T4
2658 vpxor \T4, \T6, \T6
2659
2660 vpclmulqdq $0x00, \T5, \XMM7, \T4
2661 vpxor \T4, \T7, \T7
2662
2663 vpclmulqdq $0x00, \T3, \T2, \T2
2664
2665 vpxor \T2, \XMM1, \XMM1
2666
2667 ######################
2668
2669 vmovdqu HashKey(arg2), \T5
2670 vpshufd $0b01001110, \XMM8, \T2
2671 vpshufd $0b01001110, \T5, \T3
2672 vpxor \XMM8, \T2, \T2
2673 vpxor \T5, \T3, \T3
2674
2675 vpclmulqdq $0x11, \T5, \XMM8, \T4
2676 vpxor \T4, \T6, \T6
2677
2678 vpclmulqdq $0x00, \T5, \XMM8, \T4
2679 vpxor \T4, \T7, \T7
2680
2681 vpclmulqdq $0x00, \T3, \T2, \T2
2682
2683 vpxor \T2, \XMM1, \XMM1
2684 vpxor \T6, \XMM1, \XMM1
2685 vpxor \T7, \XMM1, \T2
2686
2687
2688
2689
2690 vpslldq $8, \T2, \T4
2691 vpsrldq $8, \T2, \T2
2692
2693 vpxor \T4, \T7, \T7
2694 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2695 # accumulated carry-less multiplications
2696
2697 #######################################################################
2698 #first phase of the reduction
2699 vmovdqa POLY2(%rip), \T3
2700
2701 vpclmulqdq $0x01, \T7, \T3, \T2
2702 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2703
2704 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2705 #######################################################################
2706
2707
2708 #second phase of the reduction
2709 vpclmulqdq $0x00, \T7, \T3, \T2
2710 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2711
2712 vpclmulqdq $0x10, \T7, \T3, \T4
2713 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2714
2715 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2716 #######################################################################
2717 vpxor \T4, \T6, \T6 # the result is in T6
2718 .endm
2719
2720
2721
2722 #############################################################
2723 #void aesni_gcm_init_avx_gen4
2724 # (gcm_data *my_ctx_data,
2725 # gcm_context_data *data,
2726 # u8 *iv,
2727 # u8 *hash_subkey,
2731 # const u8 *aad,
2732 # u64 aad_len)
2733 #############################################################
2734 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2735 FUNC_SAVE
2736 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2737 FUNC_RESTORE
2738 RET
2739 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
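# The avx_gen4 entry points (this one above and the update/finalize routines
# below) mirror their avx_gen2 counterparts; the only difference is which
# macro variants (*_AVX vs *_AVX2) are passed to INIT, GCM_ENC_DEC and
# GCM_COMPLETE.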
2740
2741 ###############################################################################
2742 #void aesni_gcm_enc_update_avx_gen4(
2743 # gcm_data *my_ctx_data,
2744 # gcm_context_data *data,
2745 # u8 *out,
2746 # const u8 *in,
2747 # u64 plaintext_len)
2748 ###############################################################################
2749 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2750 FUNC_SAVE
2751 mov keysize,%eax
2752 cmp $32, %eax
2753 je key_256_enc_update4
2754 cmp $16, %eax
2755 je key_128_enc_update4
2756 # must be 192
2757 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2758 FUNC_RESTORE
2759 RET
2760 key_128_enc_update4:
2761 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2762 FUNC_RESTORE
2763 RET
2764 key_256_enc_update4:
2765 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2766 FUNC_RESTORE
2767 RET
2768 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2769
2770 ###############################################################################
2771 #void aesni_gcm_dec_update_avx_gen4(
2772 # gcm_data *my_ctx_data,
2773 # gcm_context_data *data,
2774 # u8 *out,
2775 # const u8 *in,
2776 # u64 plaintext_len)
2777 ###############################################################################
2778 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2779 FUNC_SAVE
2780 mov keysize,%eax
2781 cmp $32, %eax
2782 je key_256_dec_update4
2783 cmp $16, %eax
2784 je key_128_dec_update4
2785 # must be 192
2786 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2787 FUNC_RESTORE
2788 RET
2789 key_128_dec_update4:
2790 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2791 FUNC_RESTORE
2792 RET
2793 key_256_dec_update4:
2794 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2795 FUNC_RESTORE
2796 RET
2797 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2798
2799 ###############################################################################
2800 #void aesni_gcm_finalize_avx_gen4(
2801 # gcm_data *my_ctx_data,
2802 # gcm_context_data *data,
2803 # u8 *auth_tag,
2804 # u64 auth_tag_len)
2806 ###############################################################################
2807 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2808 FUNC_SAVE
2809 mov keysize,%eax
2810 cmp $32, %eax
2811 je key_256_finalize4
2812 cmp $16, %eax
2813 je key_128_finalize4
2814 # must be 192
2815 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2816 FUNC_RESTORE
2817 RET
2818 key_128_finalize4:
2819 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2820 FUNC_RESTORE
2821 RET
2822 key_256_finalize4:
2823 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2824 FUNC_RESTORE
2825 RET
2826 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)