0028 #include <linux/linkage.h>
0029 #include <asm/frame.h>
0030 #include <asm/nospec-branch.h>
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040 #define MOVADQ movaps
0041 #define MOVUDQ movups
0042
0043 #ifdef __x86_64__
0044
0045 # constants in mergeable sections, linker can reorder and merge
0046 .section .rodata.cst16.POLY, "aM", @progbits, 16
0047 .align 16
0048 POLY: .octa 0xC2000000000000000000000000000001
0049 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
0050 .align 16
0051 TWOONE: .octa 0x00000001000000000000000000000001
0052
0053 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
0054 .align 16
0055 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
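# Loaded into a register, SHUF_MASK reads as bytes 0x0f,0x0e,...,0x00, so a
# pshufb with it reverses all 16 bytes (the "16 byte swap" used below to move
# between the little-endian counter/data view and the byte-reflected GHASH view).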
0056 .section .rodata.cst16.MASK1, "aM", @progbits, 16
0057 .align 16
0058 MASK1: .octa 0x0000000000000000ffffffffffffffff
0059 .section .rodata.cst16.MASK2, "aM", @progbits, 16
0060 .align 16
0061 MASK2: .octa 0xffffffffffffffff0000000000000000
0062 .section .rodata.cst16.ONE, "aM", @progbits, 16
0063 .align 16
0064 ONE: .octa 0x00000000000000000000000000000001
0065 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
0066 .align 16
0067 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
0068 .section .rodata.cst16.dec, "aM", @progbits, 16
0069 .align 16
0070 dec: .octa 0x1
0071 .section .rodata.cst16.enc, "aM", @progbits, 16
0072 .align 16
0073 enc: .octa 0x2
0074
0075 # order of these constants should not change.
0076 # more specifically, ALL_F should follow SHIFT_MASK,
0077 # and zero should follow ALL_F
0078 .section .rodata, "a", @progbits
0079 .align 16
0080 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
0081 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
0082 .octa 0x00000000000000000000000000000000
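# Worked example of why this ordering matters: the tail handling below loads a
# byte mask from ALL_F+16-r13.  With r13 = 5 that read starts 5 bytes before
# the zero block, giving five 0xff bytes followed by eleven 0x00 bytes, so a
# pand keeps only the 5 valid low bytes of the final partial block.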
0083
0084 .text
0085
0086
0087 #define STACK_OFFSET 8*3
0088
0089 #define AadHash 16*0
0090 #define AadLen 16*1
0091 #define InLen (16*1)+8
0092 #define PBlockEncKey 16*2
0093 #define OrigIV 16*3
0094 #define CurCount 16*4
0095 #define PBlockLen 16*5
0096 #define HashKey 16*6 // store HashKey <<1 mod poly here
0097 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
0098 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
0099 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
#define HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
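# The offsets above describe the layout of the gcm_context_data block pointed
# to by %arg2.  A rough C view, inferred only from these offsets and the field
# names used in the comments below (the authoritative definition lives in the
# C glue code):
#
#	struct gcm_context_data {
#		u8  aad_hash[16];		/* AadHash		*/
#		u64 aad_length;			/* AadLen		*/
#		u64 in_length;			/* InLen		*/
#		u8  partial_block_enc_key[16];	/* PBlockEncKey		*/
#		u8  orig_IV[16];		/* OrigIV		*/
#		u8  current_counter[16];	/* CurCount		*/
#		u64 partial_block_length;	/* PBlockLen (16-byte slot) */
#		u64 unused;
#		u8  hash_keys[8][16];		/* HashKey .. HashKey_4_k */
#	};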
0112
0113 #define arg1 rdi
0114 #define arg2 rsi
0115 #define arg3 rdx
0116 #define arg4 rcx
0117 #define arg5 r8
0118 #define arg6 r9
0119 #define arg7 STACK_OFFSET+8(%rsp)
0120 #define arg8 STACK_OFFSET+16(%rsp)
0121 #define arg9 STACK_OFFSET+24(%rsp)
0122 #define arg10 STACK_OFFSET+32(%rsp)
0123 #define arg11 STACK_OFFSET+40(%rsp)
0124 #define keysize 2*15*16(%arg1)
0125 #endif
0126
0127
0128 #define STATE1 %xmm0
0129 #define STATE2 %xmm4
0130 #define STATE3 %xmm5
0131 #define STATE4 %xmm6
0132 #define STATE STATE1
0133 #define IN1 %xmm1
0134 #define IN2 %xmm7
0135 #define IN3 %xmm8
0136 #define IN4 %xmm9
0137 #define IN IN1
0138 #define KEY %xmm2
0139 #define IV %xmm3
0140
0141 #define BSWAP_MASK %xmm10
0142 #define CTR %xmm11
0143 #define INC %xmm12
0144
0145 #define GF128MUL_MASK %xmm7
0146
0147 #ifdef __x86_64__
0148 #define AREG %rax
0149 #define KEYP %rdi
0150 #define OUTP %rsi
0151 #define UKEYP OUTP
0152 #define INP %rdx
0153 #define LEN %rcx
0154 #define IVP %r8
0155 #define KLEN %r9d
0156 #define T1 %r10
0157 #define TKEYP T1
0158 #define T2 %r11
0159 #define TCTR_LOW T2
0160 #else
0161 #define AREG %eax
0162 #define KEYP %edi
0163 #define OUTP AREG
0164 #define UKEYP OUTP
0165 #define INP %edx
0166 #define LEN %esi
0167 #define IVP %ebp
0168 #define KLEN %ebx
0169 #define T1 %ecx
0170 #define TKEYP T1
0171 #endif
0172
0173 .macro FUNC_SAVE
0174 push %r12
0175 push %r13
0176 push %r14
0177 #
0178 # states of %xmm registers %xmm6:%xmm15 not saved
0179 # all %xmm registers are clobbered
0180 #
0181 .endm
0182
0183
0184 .macro FUNC_RESTORE
0185 pop %r14
0186 pop %r13
0187 pop %r12
0188 .endm
0189
0190 # Precompute hashkeys.
0191 # Input: Hash subkey.
0192 # Output: HashKeys stored in gcm_context_data. Only needs to be called
0193 # once per key.
0194 # clobbers r12, and tmp xmm registers.
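# "HashKey<<1 mod poly" is the hash subkey doubled in GF(2^128), kept in the
# byte-reflected representation.  A minimal C sketch of the same computation,
# assuming h.hi:h.lo holds the byte-swapped subkey (illustration only, not
# part of the build):
#
#	carry = h.hi >> 63;			/* old bit 127 */
#	h.hi  = (h.hi << 1) | (h.lo >> 63);
#	h.lo  = (h.lo << 1);
#	if (carry) {				/* fold in POLY */
#		h.hi ^= 0xC200000000000000ULL;
#		h.lo ^= 1;
#	}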
0195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
0196 mov \SUBKEY, %r12
0197 movdqu (%r12), \TMP3
0198 movdqa SHUF_MASK(%rip), \TMP2
0199 pshufb \TMP2, \TMP3
0200
0201 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
0202
0203 movdqa \TMP3, \TMP2
0204 psllq $1, \TMP3
0205 psrlq $63, \TMP2
0206 movdqa \TMP2, \TMP1
0207 pslldq $8, \TMP2
0208 psrldq $8, \TMP1
0209 por \TMP2, \TMP3
0210
0211 # reduce HashKey<<1
0212
0213 pshufd $0x24, \TMP1, \TMP2
0214 pcmpeqd TWOONE(%rip), \TMP2
0215 pand POLY(%rip), \TMP2
0216 pxor \TMP2, \TMP3
0217 movdqu \TMP3, HashKey(%arg2)
0218
0219 movdqa \TMP3, \TMP5
0220 pshufd $78, \TMP3, \TMP1
0221 pxor \TMP3, \TMP1
0222 movdqu \TMP1, HashKey_k(%arg2)
0223
0224 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
0225 # TMP5 = HashKey^2<<1 (mod poly)
0226 movdqu \TMP5, HashKey_2(%arg2)
0227 # HashKey_2 = HashKey^2<<1 (mod poly)
0228 pshufd $78, \TMP5, \TMP1
0229 pxor \TMP5, \TMP1
0230 movdqu \TMP1, HashKey_2_k(%arg2)
0231
0232 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
0233 # TMP5 = HashKey^3<<1 (mod poly)
0234 movdqu \TMP5, HashKey_3(%arg2)
0235 pshufd $78, \TMP5, \TMP1
0236 pxor \TMP5, \TMP1
0237 movdqu \TMP1, HashKey_3_k(%arg2)
0238
0239 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
0241 movdqu \TMP5, HashKey_4(%arg2)
0242 pshufd $78, \TMP5, \TMP1
0243 pxor \TMP5, \TMP1
0244 movdqu \TMP1, HashKey_4_k(%arg2)
0245 .endm
0246
0247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
0248 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
0249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
0250 mov \AADLEN, %r11
0251 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
0252 xor %r11d, %r11d
0253 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
0254 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
0255 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
0256 mov \Iv, %rax
0257 movdqu (%rax), %xmm0
0258 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
0259
0260 movdqa SHUF_MASK(%rip), %xmm2
0261 pshufb %xmm2, %xmm0
0262 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
0263
0264 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
0265 movdqu HashKey(%arg2), %xmm13
0266
0267 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
0268 %xmm4, %xmm5, %xmm6
0269 .endm
0270
0271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
0272 # struct has been initialized by GCM_INIT.
0273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
0274 # Clobbers rax, r10-r13, and xmm0-xmm15
0275 .macro GCM_ENC_DEC operation
0276 movdqu AadHash(%arg2), %xmm8
0277 movdqu HashKey(%arg2), %xmm13
0278 add %arg5, InLen(%arg2)
0279
0280 xor %r11d, %r11d # initialise the data pointer offset as zero
0281 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
0282
0283 sub %r11, %arg5 # sub partial block data used
0284 mov %arg5, %r13 # save the number of bytes
0285
0286 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
0287 mov %r13, %r12
0288 # Encrypt/Decrypt first few blocks
0289
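	# %r13 holds the number of whole-block bytes; %r13 & 0x30 is
	# (number of blocks mod 4) * 16.  E.g. 7 full blocks gives
	# 0x70 & 0x30 = 0x30, so 3 blocks are handled here and the main
	# loop below always operates on a multiple of 4 blocks.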
0290 and $(3<<4), %r12
0291 jz _initial_num_blocks_is_0_\@
0292 cmp $(2<<4), %r12
0293 jb _initial_num_blocks_is_1_\@
0294 je _initial_num_blocks_is_2_\@
0295 _initial_num_blocks_is_3_\@:
0296 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
0298 sub $48, %r13
0299 jmp _initial_blocks_\@
0300 _initial_num_blocks_is_2_\@:
0301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
0303 sub $32, %r13
0304 jmp _initial_blocks_\@
0305 _initial_num_blocks_is_1_\@:
0306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
0308 sub $16, %r13
0309 jmp _initial_blocks_\@
0310 _initial_num_blocks_is_0_\@:
0311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
0312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
0313 _initial_blocks_\@:
0314
0315 # Main loop - Encrypt/Decrypt remaining blocks
0316
0317 test %r13, %r13
0318 je _zero_cipher_left_\@
0319 sub $64, %r13
0320 je _four_cipher_left_\@
0321 _crypt_by_4_\@:
0322 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
0323 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
0324 %xmm7, %xmm8, enc
0325 add $64, %r11
0326 sub $64, %r13
0327 jne _crypt_by_4_\@
0328 _four_cipher_left_\@:
0329 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
0330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
0331 _zero_cipher_left_\@:
0332 movdqu %xmm8, AadHash(%arg2)
0333 movdqu %xmm0, CurCount(%arg2)
0334
0335 mov %arg5, %r13
0336 and $15, %r13 # %r13 = arg5 (mod 16)
0337 je _multiple_of_16_bytes_\@
0338
0339 mov %r13, PBlockLen(%arg2)
0340
0341 # Handle the last <16 Byte block separately
0342 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
0343 movdqu %xmm0, CurCount(%arg2)
0344 movdqa SHUF_MASK(%rip), %xmm10
0345 pshufb %xmm10, %xmm0
0346
0347 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
0348 movdqu %xmm0, PBlockEncKey(%arg2)
0349
0350 cmp $16, %arg5
0351 jge _large_enough_update_\@
0352
0353 lea (%arg4,%r11,1), %r10
0354 mov %r13, %r12
0355 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
0356 jmp _data_read_\@
0357
0358 _large_enough_update_\@:
0359 sub $16, %r11
0360 add %r13, %r11
0361
0362 # receive the last <16 Byte block
0363 movdqu (%arg4, %r11, 1), %xmm1
0364
0365 sub %r13, %r11
0366 add $16, %r11
0367
0368 lea SHIFT_MASK+16(%rip), %r12
0369 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
0370 # (r13 is the number of bytes in plaintext mod 16)
0371 sub %r13, %r12
0372 # get the appropriate shuffle mask
0373 movdqu (%r12), %xmm2
0374 # shift right 16-r13 bytes
0375 pshufb %xmm2, %xmm1
0376
0377 _data_read_\@:
0378 lea ALL_F+16(%rip), %r12
0379 sub %r13, %r12
0380
0381 .ifc \operation, dec
0382 movdqa %xmm1, %xmm2
0383 .endif
0384 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
0385 movdqu (%r12), %xmm1
0386 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
0387 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
0388 .ifc \operation, dec
0389 pand %xmm1, %xmm2
0390 movdqa SHUF_MASK(%rip), %xmm10
0391 pshufb %xmm10 ,%xmm2
0392
0393 pxor %xmm2, %xmm8
0394 .else
0395 movdqa SHUF_MASK(%rip), %xmm10
0396 pshufb %xmm10,%xmm0
0397
0398 pxor %xmm0, %xmm8
0399 .endif
0400
0401 movdqu %xmm8, AadHash(%arg2)
0402 .ifc \operation, enc
0403 # GHASH computation for the last <16 byte block
0404 movdqa SHUF_MASK(%rip), %xmm10
0405 # shuffle xmm0 back to output as ciphertext
0406 pshufb %xmm10, %xmm0
0407 .endif
0408
0409 # Output %r13 bytes
0410 movq %xmm0, %rax
0411 cmp $8, %r13
0412 jle _less_than_8_bytes_left_\@
0413 mov %rax, (%arg3 , %r11, 1)
0414 add $8, %r11
0415 psrldq $8, %xmm0
0416 movq %xmm0, %rax
0417 sub $8, %r13
0418 _less_than_8_bytes_left_\@:
0419 mov %al, (%arg3, %r11, 1)
0420 add $1, %r11
0421 shr $8, %rax
0422 sub $1, %r13
0423 jne _less_than_8_bytes_left_\@
0424 _multiple_of_16_bytes_\@:
0425 .endm
0426
# GCM_COMPLETE finishes the GHASH over any remaining partial block and
# computes the final tag.
# Output: Authentication Tag (AUTH_TAG)
0429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
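# For reference, this computes the standard GCM tag (lengths in bits):
#	T = E(K, Y0) XOR GHASH_H(A || 0-pad || C || 0-pad || len64(A) || len64(C))
# truncated to AUTHTAGLEN bytes.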
0430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
0431 movdqu AadHash(%arg2), %xmm8
0432 movdqu HashKey(%arg2), %xmm13
0433
0434 mov PBlockLen(%arg2), %r12
0435
0436 test %r12, %r12
0437 je _partial_done\@
0438
0439 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
0440
0441 _partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
0443 shl $3, %r12 # convert into number of bits
0444 movd %r12d, %xmm15 # len(A) in %xmm15
0445 mov InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*8)
0447 movq %r12, %xmm1
0448
0449 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
0450 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
0451 pxor %xmm15, %xmm8
0452 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
0453 # final GHASH computation
0454 movdqa SHUF_MASK(%rip), %xmm10
0455 pshufb %xmm10, %xmm8
0456
0457 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
0458 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
0459 pxor %xmm8, %xmm0
0460 _return_T_\@:
0461 mov \AUTHTAG, %r10 # %r10 = authTag
0462 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
0463 cmp $16, %r11
0464 je _T_16_\@
0465 cmp $8, %r11
0466 jl _T_4_\@
0467 _T_8_\@:
0468 movq %xmm0, %rax
0469 mov %rax, (%r10)
0470 add $8, %r10
0471 sub $8, %r11
0472 psrldq $8, %xmm0
0473 test %r11, %r11
0474 je _return_T_done_\@
0475 _T_4_\@:
0476 movd %xmm0, %eax
0477 mov %eax, (%r10)
0478 add $4, %r10
0479 sub $4, %r11
0480 psrldq $4, %xmm0
0481 test %r11, %r11
0482 je _return_T_done_\@
0483 _T_123_\@:
0484 movd %xmm0, %eax
0485 cmp $2, %r11
0486 jl _T_1_\@
0487 mov %ax, (%r10)
0488 cmp $2, %r11
0489 je _return_T_done_\@
0490 add $2, %r10
0491 sar $16, %eax
0492 _T_1_\@:
0493 mov %al, (%r10)
0494 jmp _return_T_done_\@
0495 _T_16_\@:
0496 movdqu %xmm0, (%r10)
0497 _return_T_done_\@:
0498 .endm
0499
0500 #ifdef __x86_64__
0501
0502
0503
0504
0505
0506
0507
0508
0509
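# GHASH_MUL: multiply GH by HK in GF(2^128) (byte-reflected representation)
# and reduce, leaving the result in GH.  The carry-less multiply uses the
# Karatsuba split
#	A = A1*x^64 + A0,  B = B1*x^64 + B0
#	A*B = A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0
# (+ is XOR), followed by reduction modulo x^128 + x^7 + x^2 + x + 1, which
# the two shift/xor "phases" below implement.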
0510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
0511 movdqa \GH, \TMP1
0512 pshufd $78, \GH, \TMP2
0513 pshufd $78, \HK, \TMP3
0514 pxor \GH, \TMP2 # TMP2 = a1+a0
0515 pxor \HK, \TMP3 # TMP3 = b1+b0
0516 pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
0517 pclmulqdq $0x00, \HK, \GH # GH = a0*b0
0518 pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
0519 pxor \GH, \TMP2
0520 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
0521 movdqa \TMP2, \TMP3
0522 pslldq $8, \TMP3 # left shift TMP3 2 DWs
0523 psrldq $8, \TMP2 # right shift TMP2 2 DWs
0524 pxor \TMP3, \GH
	pxor	\TMP2, \TMP1	# TMP1:GH holds the result of GH*HK
0526
0527 # first phase of the reduction
0528
0529 movdqa \GH, \TMP2
0530 movdqa \GH, \TMP3
	movdqa	\GH, \TMP4	# copy GH into TMP2, TMP3 and TMP4
				# in order to perform
				# independent shifts
	pslld	$31, \TMP2	# packed left shift <<31
	pslld	$30, \TMP3	# packed left shift <<30
	pslld	$25, \TMP4	# packed left shift <<25
0537 pxor \TMP3, \TMP2 # xor the shifted versions
0538 pxor \TMP4, \TMP2
0539 movdqa \TMP2, \TMP5
0540 psrldq $4, \TMP5 # right shift TMP5 1 DW
0541 pslldq $12, \TMP2 # left shift TMP2 3 DWs
0542 pxor \TMP2, \GH
0543
0544 # second phase of the reduction
0545
	movdqa	\GH,\TMP2	# copy GH into TMP2, TMP3 and TMP4
				# in order to perform
				# independent shifts
	movdqa	\GH,\TMP3
	movdqa	\GH,\TMP4
	psrld	$1,\TMP2	# packed right shift >>1
	psrld	$2,\TMP3	# packed right shift >>2
	psrld	$7,\TMP4	# packed right shift >>7
0554 pxor \TMP3,\TMP2 # xor the shifted versions
0555 pxor \TMP4,\TMP2
0556 pxor \TMP5, \TMP2
0557 pxor \TMP2, \GH
	pxor	\TMP1, \GH	# result is in GH
0559 .endm
0560
0561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
0562 # where 0 < DLEN < 16
0563 # Clobbers %rax, DLEN and XMM1
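# Roughly equivalent to the following C, except that the macro never reads
# past DPTR+DLEN (illustration only):
#	u8 buf[16] = { 0 };
#	memcpy(buf, dptr, dlen);	/* 0 < dlen < 16 */
#	/* XMMDst = the 16 bytes of buf */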
0564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
0565 cmp $8, \DLEN
0566 jl _read_lt8_\@
0567 mov (\DPTR), %rax
0568 movq %rax, \XMMDst
0569 sub $8, \DLEN
0570 jz _done_read_partial_block_\@
0571 xor %eax, %eax
0572 _read_next_byte_\@:
0573 shl $8, %rax
0574 mov 7(\DPTR, \DLEN, 1), %al
0575 dec \DLEN
0576 jnz _read_next_byte_\@
0577 movq %rax, \XMM1
0578 pslldq $8, \XMM1
0579 por \XMM1, \XMMDst
0580 jmp _done_read_partial_block_\@
0581 _read_lt8_\@:
0582 xor %eax, %eax
0583 _read_next_byte_lt8_\@:
0584 shl $8, %rax
0585 mov -1(\DPTR, \DLEN, 1), %al
0586 dec \DLEN
0587 jnz _read_next_byte_lt8_\@
0588 movq %rax, \XMMDst
0589 _done_read_partial_block_\@:
0590 .endm
0591
0592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
0593 # clobbers r10-11, xmm14
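# In pseudo-code, with H = \HASHKEY and bswap() the 16-byte reversal:
#	hash = 0
#	while (aadlen >= 16)
#		hash = GHASH_MUL(hash ^ bswap(next 16 AAD bytes), H)
#	if (aadlen)
#		hash = GHASH_MUL(hash ^ bswap(zero-padded remainder), H)
#	AadHash = hash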
0594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
0595 TMP6 TMP7
0596 MOVADQ SHUF_MASK(%rip), %xmm14
0597 mov \AAD, %r10 # %r10 = AAD
0598 mov \AADLEN, %r11 # %r11 = aadLen
0599 pxor \TMP7, \TMP7
0600 pxor \TMP6, \TMP6
0601
0602 cmp $16, %r11
0603 jl _get_AAD_rest\@
0604 _get_AAD_blocks\@:
0605 movdqu (%r10), \TMP7
0606 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
0607 pxor \TMP7, \TMP6
0608 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
0609 add $16, %r10
0610 sub $16, %r11
0611 cmp $16, %r11
0612 jge _get_AAD_blocks\@
0613
0614 movdqu \TMP6, \TMP7
0615
0616
0617 _get_AAD_rest\@:
0618 test %r11, %r11
0619 je _get_AAD_done\@
0620
0621 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
0622 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
0623 pxor \TMP6, \TMP7
0624 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
0625 movdqu \TMP7, \TMP6
0626
0627 _get_AAD_done\@:
0628 movdqu \TMP6, AadHash(%arg2)
0629 .endm
0630
0631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
0632 # between update calls.
0633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates the hash and partial-block info in
# gcm_context_data
0635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
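# Example across update calls: an update of 20 bytes leaves one full block
# processed and PBlockLen = 4 (with the keystream block saved in
# PBlockEncKey).  The next update enters here, consumes up to 12 bytes of its
# input to finish that block, folds it into the hash, and only then falls
# through to the main block loop.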
0636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
0637 AAD_HASH operation
0638 mov PBlockLen(%arg2), %r13
0639 test %r13, %r13
0640 je _partial_block_done_\@ # Leave Macro if no partial blocks
0641 # Read in input data without over reading
0642 cmp $16, \PLAIN_CYPH_LEN
0643 jl _fewer_than_16_bytes_\@
0644 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
0645 jmp _data_read_\@
0646
0647 _fewer_than_16_bytes_\@:
0648 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
0649 mov \PLAIN_CYPH_LEN, %r12
0650 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
0651
0652 mov PBlockLen(%arg2), %r13
0653
0654 _data_read_\@: # Finished reading in data
0655
0656 movdqu PBlockEncKey(%arg2), %xmm9
0657 movdqu HashKey(%arg2), %xmm13
0658
0659 lea SHIFT_MASK(%rip), %r12
0660
0661 # adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes already in the partial block)
0663 add %r13, %r12
0664 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
0665 pshufb %xmm2, %xmm9 # shift right r13 bytes
0666
0667 .ifc \operation, dec
0668 movdqa %xmm1, %xmm3
0669 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
0670
0671 mov \PLAIN_CYPH_LEN, %r10
0672 add %r13, %r10
0673 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0674 sub $16, %r10
	# Determine if the partial block is not being completely filled and
	# adjust the shift mask accordingly
0677 jge _no_extra_mask_1_\@
0678 sub %r10, %r12
0679 _no_extra_mask_1_\@:
0680
0681 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0682 # get the appropriate mask to mask out bottom r13 bytes of xmm9
0683 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
0684
0685 pand %xmm1, %xmm3
0686 movdqa SHUF_MASK(%rip), %xmm10
0687 pshufb %xmm10, %xmm3
0688 pshufb %xmm2, %xmm3
0689 pxor %xmm3, \AAD_HASH
0690
0691 test %r10, %r10
0692 jl _partial_incomplete_1_\@
0693
0694 # GHASH computation for the last <16 Byte block
0695 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0696 xor %eax, %eax
0697
0698 mov %rax, PBlockLen(%arg2)
0699 jmp _dec_done_\@
0700 _partial_incomplete_1_\@:
0701 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
0702 _dec_done_\@:
0703 movdqu \AAD_HASH, AadHash(%arg2)
0704 .else
0705 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
0706
0707 mov \PLAIN_CYPH_LEN, %r10
0708 add %r13, %r10
0709 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
0710 sub $16, %r10
	# Determine if the partial block is not being completely filled and
	# adjust the shift mask accordingly
0713 jge _no_extra_mask_2_\@
0714 sub %r10, %r12
0715 _no_extra_mask_2_\@:
0716
0717 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
0718 # get the appropriate mask to mask out bottom r13 bytes of xmm9
0719 pand %xmm1, %xmm9
0720
0721 movdqa SHUF_MASK(%rip), %xmm1
0722 pshufb %xmm1, %xmm9
0723 pshufb %xmm2, %xmm9
0724 pxor %xmm9, \AAD_HASH
0725
0726 test %r10, %r10
0727 jl _partial_incomplete_2_\@
0728
0729 # GHASH computation for the last <16 Byte block
0730 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
0731 xor %eax, %eax
0732
0733 mov %rax, PBlockLen(%arg2)
0734 jmp _encode_done_\@
0735 _partial_incomplete_2_\@:
0736 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
0737 _encode_done_\@:
0738 movdqu \AAD_HASH, AadHash(%arg2)
0739
0740 movdqa SHUF_MASK(%rip), %xmm10
0741 # shuffle xmm9 back to output as ciphertext
0742 pshufb %xmm10, %xmm9
0743 pshufb %xmm2, %xmm9
0744 .endif
0745 # output encrypted Bytes
0746 test %r10, %r10
0747 jl _partial_fill_\@
0748 mov %r13, %r12
0749 mov $16, %r13
0750 # Set r13 to be the number of bytes to write out
0751 sub %r12, %r13
0752 jmp _count_set_\@
0753 _partial_fill_\@:
0754 mov \PLAIN_CYPH_LEN, %r13
0755 _count_set_\@:
0756 movdqa %xmm9, %xmm0
0757 movq %xmm0, %rax
0758 cmp $8, %r13
0759 jle _less_than_8_bytes_left_\@
0760
0761 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0762 add $8, \DATA_OFFSET
0763 psrldq $8, %xmm0
0764 movq %xmm0, %rax
0765 sub $8, %r13
0766 _less_than_8_bytes_left_\@:
0767 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
0768 add $1, \DATA_OFFSET
0769 shr $8, %rax
0770 sub $1, %r13
0771 jne _less_than_8_bytes_left_\@
0772 _partial_block_done_\@:
0773 .endm # PARTIAL_BLOCK
0774
0775
0776
0777
0778
0779
0780
0781
0782
0783
0784
0785
0786
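# INITIAL_BLOCKS_ENC_DEC handles the 0-3 blocks needed before the 4-wide main
# loop.  \i names the xmm register that receives the current AadHash and
# \i_seq lists the registers holding the initial blocks: e.g. 3 initial blocks
# use \i=5, \i_seq=678 (hash in %xmm5, blocks in %xmm6-%xmm8); 0 initial
# blocks use \i=8 with a dummy \i_seq.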
0787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
0788 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
0789 MOVADQ SHUF_MASK(%rip), %xmm14
0790
	movdqu AadHash(%arg2), %xmm\i		# load the current AAD hash
0792
0793 # start AES for num_initial_blocks blocks
0794
0795 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
0796
0797 .if (\i == 5) || (\i == 6) || (\i == 7)
0798
0799 MOVADQ ONE(%RIP),\TMP1
0800 MOVADQ 0(%arg1),\TMP2
0801 .irpc index, \i_seq
0802 paddd \TMP1, \XMM0 # INCR Y0
0803 .ifc \operation, dec
0804 movdqa \XMM0, %xmm\index
0805 .else
0806 MOVADQ \XMM0, %xmm\index
0807 .endif
0808 pshufb %xmm14, %xmm\index # perform a 16 byte swap
0809 pxor \TMP2, %xmm\index
0810 .endr
0811 lea 0x10(%arg1),%r10
0812 mov keysize,%eax
0813 shr $2,%eax # 128->4, 192->6, 256->8
0814 add $5,%eax # 128->9, 192->11, 256->13
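	# worked example: AES-128 stores key_length = 16, so 16>>2 = 4 and
	# 4+5 = 9 aesenc rounds in the loop below, plus the final aesenclast
	# for 10 rounds total (11 and 13 aesenc rounds for AES-192/256).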
0815
0816 aes_loop_initial_\@:
0817 MOVADQ (%r10),\TMP1
0818 .irpc index, \i_seq
0819 aesenc \TMP1, %xmm\index
0820 .endr
0821 add $16,%r10
0822 sub $1,%eax
0823 jnz aes_loop_initial_\@
0824
0825 MOVADQ (%r10), \TMP1
0826 .irpc index, \i_seq
0827 aesenclast \TMP1, %xmm\index # Last Round
0828 .endr
0829 .irpc index, \i_seq
0830 movdqu (%arg4 , %r11, 1), \TMP1
0831 pxor \TMP1, %xmm\index
0832 movdqu %xmm\index, (%arg3 , %r11, 1)
0833 # write back plaintext/ciphertext for num_initial_blocks
0834 add $16, %r11
0835
0836 .ifc \operation, dec
0837 movdqa \TMP1, %xmm\index
0838 .endif
0839 pshufb %xmm14, %xmm\index
0840
0841 # prepare plaintext/ciphertext for GHASH computation
0842 .endr
0843 .endif
0844
0845 # apply GHASH on num_initial_blocks blocks
0846
0847 .if \i == 5
0848 pxor %xmm5, %xmm6
0849 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0850 pxor %xmm6, %xmm7
0851 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0852 pxor %xmm7, %xmm8
0853 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0854 .elseif \i == 6
0855 pxor %xmm6, %xmm7
0856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0857 pxor %xmm7, %xmm8
0858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0859 .elseif \i == 7
0860 pxor %xmm7, %xmm8
0861 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
0862 .endif
0863 cmp $64, %r13
0864 jl _initial_blocks_done\@
0865 # no need for precomputed values
0866
0867
0868
0869
0870
0871 MOVADQ ONE(%RIP),\TMP1
0872 paddd \TMP1, \XMM0 # INCR Y0
0873 MOVADQ \XMM0, \XMM1
0874 pshufb %xmm14, \XMM1 # perform a 16 byte swap
0875
0876 paddd \TMP1, \XMM0 # INCR Y0
0877 MOVADQ \XMM0, \XMM2
0878 pshufb %xmm14, \XMM2 # perform a 16 byte swap
0879
0880 paddd \TMP1, \XMM0 # INCR Y0
0881 MOVADQ \XMM0, \XMM3
0882 pshufb %xmm14, \XMM3 # perform a 16 byte swap
0883
0884 paddd \TMP1, \XMM0 # INCR Y0
0885 MOVADQ \XMM0, \XMM4
0886 pshufb %xmm14, \XMM4 # perform a 16 byte swap
0887
0888 MOVADQ 0(%arg1),\TMP1
0889 pxor \TMP1, \XMM1
0890 pxor \TMP1, \XMM2
0891 pxor \TMP1, \XMM3
0892 pxor \TMP1, \XMM4
0893 .irpc index, 1234 # do 4 rounds
0894 movaps 0x10*\index(%arg1), \TMP1
0895 aesenc \TMP1, \XMM1
0896 aesenc \TMP1, \XMM2
0897 aesenc \TMP1, \XMM3
0898 aesenc \TMP1, \XMM4
0899 .endr
0900 .irpc index, 56789 # do next 5 rounds
0901 movaps 0x10*\index(%arg1), \TMP1
0902 aesenc \TMP1, \XMM1
0903 aesenc \TMP1, \XMM2
0904 aesenc \TMP1, \XMM3
0905 aesenc \TMP1, \XMM4
0906 .endr
0907 lea 0xa0(%arg1),%r10
0908 mov keysize,%eax
0909 shr $2,%eax # 128->4, 192->6, 256->8
0910 sub $4,%eax # 128->0, 192->2, 256->4
0911 jz aes_loop_pre_done\@
0912
0913 aes_loop_pre_\@:
0914 MOVADQ (%r10),\TMP2
0915 .irpc index, 1234
0916 aesenc \TMP2, %xmm\index
0917 .endr
0918 add $16,%r10
0919 sub $1,%eax
0920 jnz aes_loop_pre_\@
0921
0922 aes_loop_pre_done\@:
0923 MOVADQ (%r10), \TMP2
0924 aesenclast \TMP2, \XMM1
0925 aesenclast \TMP2, \XMM2
0926 aesenclast \TMP2, \XMM3
0927 aesenclast \TMP2, \XMM4
0928 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
0929 pxor \TMP1, \XMM1
0930 .ifc \operation, dec
0931 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
0932 movdqa \TMP1, \XMM1
0933 .endif
0934 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
0935 pxor \TMP1, \XMM2
0936 .ifc \operation, dec
0937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
0938 movdqa \TMP1, \XMM2
0939 .endif
0940 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
0941 pxor \TMP1, \XMM3
0942 .ifc \operation, dec
0943 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
0944 movdqa \TMP1, \XMM3
0945 .endif
0946 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
0947 pxor \TMP1, \XMM4
0948 .ifc \operation, dec
0949 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
0950 movdqa \TMP1, \XMM4
0951 .else
0952 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
0953 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
0954 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
0955 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
0956 .endif
0957
0958 add $64, %r11
0959 pshufb %xmm14, \XMM1 # perform a 16 byte swap
0960 pxor \XMMDst, \XMM1
0961 # combine GHASHed value with the corresponding ciphertext
0962 pshufb %xmm14, \XMM2 # perform a 16 byte swap
0963 pshufb %xmm14, \XMM3 # perform a 16 byte swap
0964 pshufb %xmm14, \XMM4 # perform a 16 byte swap
0965
0966 _initial_blocks_done\@:
0967
0968 .endm
0969
0970
0971
0972
0973
0974
0975
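# GHASH_4_ENCRYPT_4_PARALLEL_enc: encrypt 4 counter blocks while GHASHing the
# 4 previous (already byte-reflected) blocks held in \XMM1-\XMM4 against
# HashKey^4..HashKey^1.  The new ciphertext is written out and byte-reflected
# into \XMM1-\XMM4 for the next iteration, with the reduced hash folded into
# \XMM1.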
0976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
0977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
0978
0979 movdqa \XMM1, \XMM5
0980 movdqa \XMM2, \XMM6
0981 movdqa \XMM3, \XMM7
0982 movdqa \XMM4, \XMM8
0983
0984 movdqa SHUF_MASK(%rip), %xmm15
	# multiply XMM5 by HashKey_4 (in TMP5) using Karatsuba
0986
0987 movdqa \XMM5, \TMP4
0988 pshufd $78, \XMM5, \TMP6
0989 pxor \XMM5, \TMP6
0990 paddd ONE(%rip), \XMM0 # INCR CNT
0991 movdqu HashKey_4(%arg2), \TMP5
0992 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
0993 movdqa \XMM0, \XMM1
0994 paddd ONE(%rip), \XMM0 # INCR CNT
0995 movdqa \XMM0, \XMM2
0996 paddd ONE(%rip), \XMM0 # INCR CNT
0997 movdqa \XMM0, \XMM3
0998 paddd ONE(%rip), \XMM0 # INCR CNT
0999 movdqa \XMM0, \XMM4
1000 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1001 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1002 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1003 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1004 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1005
1006 pxor (%arg1), \XMM1
1007 pxor (%arg1), \XMM2
1008 pxor (%arg1), \XMM3
1009 pxor (%arg1), \XMM4
1010 movdqu HashKey_4_k(%arg2), \TMP5
1011 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1012 movaps 0x10(%arg1), \TMP1
1013 aesenc \TMP1, \XMM1 # Round 1
1014 aesenc \TMP1, \XMM2
1015 aesenc \TMP1, \XMM3
1016 aesenc \TMP1, \XMM4
1017 movaps 0x20(%arg1), \TMP1
1018 aesenc \TMP1, \XMM1 # Round 2
1019 aesenc \TMP1, \XMM2
1020 aesenc \TMP1, \XMM3
1021 aesenc \TMP1, \XMM4
1022 movdqa \XMM6, \TMP1
1023 pshufd $78, \XMM6, \TMP2
1024 pxor \XMM6, \TMP2
1025 movdqu HashKey_3(%arg2), \TMP5
1026 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1027 movaps 0x30(%arg1), \TMP3
1028 aesenc \TMP3, \XMM1 # Round 3
1029 aesenc \TMP3, \XMM2
1030 aesenc \TMP3, \XMM3
1031 aesenc \TMP3, \XMM4
1032 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1033 movaps 0x40(%arg1), \TMP3
1034 aesenc \TMP3, \XMM1 # Round 4
1035 aesenc \TMP3, \XMM2
1036 aesenc \TMP3, \XMM3
1037 aesenc \TMP3, \XMM4
1038 movdqu HashKey_3_k(%arg2), \TMP5
1039 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1040 movaps 0x50(%arg1), \TMP3
1041 aesenc \TMP3, \XMM1 # Round 5
1042 aesenc \TMP3, \XMM2
1043 aesenc \TMP3, \XMM3
1044 aesenc \TMP3, \XMM4
1045 pxor \TMP1, \TMP4
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047 pxor \XMM6, \XMM5
1048 pxor \TMP2, \TMP6
1049 movdqa \XMM7, \TMP1
1050 pshufd $78, \XMM7, \TMP2
1051 pxor \XMM7, \TMP2
1052 movdqu HashKey_2(%arg2), \TMP5
1053
	# Multiply XMM7 by HashKey_2 (in TMP5) using Karatsuba
1055
1056 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1057 movaps 0x60(%arg1), \TMP3
1058 aesenc \TMP3, \XMM1 # Round 6
1059 aesenc \TMP3, \XMM2
1060 aesenc \TMP3, \XMM3
1061 aesenc \TMP3, \XMM4
1062 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1063 movaps 0x70(%arg1), \TMP3
1064 aesenc \TMP3, \XMM1 # Round 7
1065 aesenc \TMP3, \XMM2
1066 aesenc \TMP3, \XMM3
1067 aesenc \TMP3, \XMM4
1068 movdqu HashKey_2_k(%arg2), \TMP5
1069 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1070 movaps 0x80(%arg1), \TMP3
1071 aesenc \TMP3, \XMM1 # Round 8
1072 aesenc \TMP3, \XMM2
1073 aesenc \TMP3, \XMM3
1074 aesenc \TMP3, \XMM4
1075 pxor \TMP1, \TMP4
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077 pxor \XMM7, \XMM5
1078 pxor \TMP2, \TMP6
1079
1080 # Multiply XMM8 * HashKey
1081 # XMM8 and TMP5 hold the values for the two operands
1082
1083 movdqa \XMM8, \TMP1
1084 pshufd $78, \XMM8, \TMP2
1085 pxor \XMM8, \TMP2
1086 movdqu HashKey(%arg2), \TMP5
1087 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1088 movaps 0x90(%arg1), \TMP3
1089 aesenc \TMP3, \XMM1 # Round 9
1090 aesenc \TMP3, \XMM2
1091 aesenc \TMP3, \XMM3
1092 aesenc \TMP3, \XMM4
1093 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1094 lea 0xa0(%arg1),%r10
1095 mov keysize,%eax
1096 shr $2,%eax # 128->4, 192->6, 256->8
1097 sub $4,%eax # 128->0, 192->2, 256->4
1098 jz aes_loop_par_enc_done\@
1099
1100 aes_loop_par_enc\@:
1101 MOVADQ (%r10),\TMP3
1102 .irpc index, 1234
1103 aesenc \TMP3, %xmm\index
1104 .endr
1105 add $16,%r10
1106 sub $1,%eax
1107 jnz aes_loop_par_enc\@
1108
1109 aes_loop_par_enc_done\@:
1110 MOVADQ (%r10), \TMP3
1111 aesenclast \TMP3, \XMM1 # Round 10
1112 aesenclast \TMP3, \XMM2
1113 aesenclast \TMP3, \XMM3
1114 aesenclast \TMP3, \XMM4
1115 movdqu HashKey_k(%arg2), \TMP5
1116 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1117 movdqu (%arg4,%r11,1), \TMP3
1118 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1119 movdqu 16(%arg4,%r11,1), \TMP3
1120 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1121 movdqu 32(%arg4,%r11,1), \TMP3
1122 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1123 movdqu 48(%arg4,%r11,1), \TMP3
1124 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1125 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1126 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1127 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1128 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1129 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1130 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1131 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1132 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1133
1134 pxor \TMP4, \TMP1
1135 pxor \XMM8, \XMM5
1136 pxor \TMP6, \TMP2
1137 pxor \TMP1, \TMP2
1138 pxor \XMM5, \TMP2
1139 movdqa \TMP2, \TMP3
1140 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1141 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1142 pxor \TMP3, \XMM5
1143 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1144
1145 # first phase of reduction
1146
1147 movdqa \XMM5, \TMP2
1148 movdqa \XMM5, \TMP3
1149 movdqa \XMM5, \TMP4
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2	# packed left shift << 31
	pslld	$30, \TMP3	# packed left shift << 30
	pslld	$25, \TMP4	# packed left shift << 25
1154 pxor \TMP3, \TMP2 # xor the shifted versions
1155 pxor \TMP4, \TMP2
1156 movdqa \TMP2, \TMP5
1157 psrldq $4, \TMP5 # right shift T5 1 DW
1158 pslldq $12, \TMP2 # left shift T2 3 DWs
1159 pxor \TMP2, \XMM5
1160
1161 # second phase of reduction
1162
1163 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164 movdqa \XMM5,\TMP3
1165 movdqa \XMM5,\TMP4
	psrld	$1, \TMP2	# packed right shift >>1
	psrld	$2, \TMP3	# packed right shift >>2
	psrld	$7, \TMP4	# packed right shift >>7
1169 pxor \TMP3,\TMP2 # xor the shifted versions
1170 pxor \TMP4,\TMP2
1171 pxor \TMP5, \TMP2
1172 pxor \TMP2, \XMM5
	pxor	\TMP1, \XMM5	# result is in XMM5
1174
1175 pxor \XMM5, \XMM1
1176 .endm
1177
1178
1179
1180
1181
1182
1183
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187 movdqa \XMM1, \XMM5
1188 movdqa \XMM2, \XMM6
1189 movdqa \XMM3, \XMM7
1190 movdqa \XMM4, \XMM8
1191
1192 movdqa SHUF_MASK(%rip), %xmm15
	# multiply XMM5 by HashKey_4 (in TMP5) using Karatsuba
1194
1195 movdqa \XMM5, \TMP4
1196 pshufd $78, \XMM5, \TMP6
1197 pxor \XMM5, \TMP6
1198 paddd ONE(%rip), \XMM0 # INCR CNT
1199 movdqu HashKey_4(%arg2), \TMP5
1200 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1201 movdqa \XMM0, \XMM1
1202 paddd ONE(%rip), \XMM0 # INCR CNT
1203 movdqa \XMM0, \XMM2
1204 paddd ONE(%rip), \XMM0 # INCR CNT
1205 movdqa \XMM0, \XMM3
1206 paddd ONE(%rip), \XMM0 # INCR CNT
1207 movdqa \XMM0, \XMM4
1208 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1209 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1210 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1211 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1212 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1213
1214 pxor (%arg1), \XMM1
1215 pxor (%arg1), \XMM2
1216 pxor (%arg1), \XMM3
1217 pxor (%arg1), \XMM4
1218 movdqu HashKey_4_k(%arg2), \TMP5
1219 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1220 movaps 0x10(%arg1), \TMP1
1221 aesenc \TMP1, \XMM1 # Round 1
1222 aesenc \TMP1, \XMM2
1223 aesenc \TMP1, \XMM3
1224 aesenc \TMP1, \XMM4
1225 movaps 0x20(%arg1), \TMP1
1226 aesenc \TMP1, \XMM1 # Round 2
1227 aesenc \TMP1, \XMM2
1228 aesenc \TMP1, \XMM3
1229 aesenc \TMP1, \XMM4
1230 movdqa \XMM6, \TMP1
1231 pshufd $78, \XMM6, \TMP2
1232 pxor \XMM6, \TMP2
1233 movdqu HashKey_3(%arg2), \TMP5
1234 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1235 movaps 0x30(%arg1), \TMP3
1236 aesenc \TMP3, \XMM1 # Round 3
1237 aesenc \TMP3, \XMM2
1238 aesenc \TMP3, \XMM3
1239 aesenc \TMP3, \XMM4
1240 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1241 movaps 0x40(%arg1), \TMP3
1242 aesenc \TMP3, \XMM1 # Round 4
1243 aesenc \TMP3, \XMM2
1244 aesenc \TMP3, \XMM3
1245 aesenc \TMP3, \XMM4
1246 movdqu HashKey_3_k(%arg2), \TMP5
1247 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1248 movaps 0x50(%arg1), \TMP3
1249 aesenc \TMP3, \XMM1 # Round 5
1250 aesenc \TMP3, \XMM2
1251 aesenc \TMP3, \XMM3
1252 aesenc \TMP3, \XMM4
1253 pxor \TMP1, \TMP4
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255 pxor \XMM6, \XMM5
1256 pxor \TMP2, \TMP6
1257 movdqa \XMM7, \TMP1
1258 pshufd $78, \XMM7, \TMP2
1259 pxor \XMM7, \TMP2
1260 movdqu HashKey_2(%arg2), \TMP5
1261
	# Multiply XMM7 by HashKey_2 (in TMP5) using Karatsuba
1263
1264 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1265 movaps 0x60(%arg1), \TMP3
1266 aesenc \TMP3, \XMM1 # Round 6
1267 aesenc \TMP3, \XMM2
1268 aesenc \TMP3, \XMM3
1269 aesenc \TMP3, \XMM4
1270 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1271 movaps 0x70(%arg1), \TMP3
1272 aesenc \TMP3, \XMM1 # Round 7
1273 aesenc \TMP3, \XMM2
1274 aesenc \TMP3, \XMM3
1275 aesenc \TMP3, \XMM4
1276 movdqu HashKey_2_k(%arg2), \TMP5
1277 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1278 movaps 0x80(%arg1), \TMP3
1279 aesenc \TMP3, \XMM1 # Round 8
1280 aesenc \TMP3, \XMM2
1281 aesenc \TMP3, \XMM3
1282 aesenc \TMP3, \XMM4
1283 pxor \TMP1, \TMP4
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285 pxor \XMM7, \XMM5
1286 pxor \TMP2, \TMP6
1287
1288 # Multiply XMM8 * HashKey
1289 # XMM8 and TMP5 hold the values for the two operands
1290
1291 movdqa \XMM8, \TMP1
1292 pshufd $78, \XMM8, \TMP2
1293 pxor \XMM8, \TMP2
1294 movdqu HashKey(%arg2), \TMP5
1295 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1296 movaps 0x90(%arg1), \TMP3
1297 aesenc \TMP3, \XMM1 # Round 9
1298 aesenc \TMP3, \XMM2
1299 aesenc \TMP3, \XMM3
1300 aesenc \TMP3, \XMM4
1301 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1302 lea 0xa0(%arg1),%r10
1303 mov keysize,%eax
1304 shr $2,%eax # 128->4, 192->6, 256->8
1305 sub $4,%eax # 128->0, 192->2, 256->4
1306 jz aes_loop_par_dec_done\@
1307
1308 aes_loop_par_dec\@:
1309 MOVADQ (%r10),\TMP3
1310 .irpc index, 1234
1311 aesenc \TMP3, %xmm\index
1312 .endr
1313 add $16,%r10
1314 sub $1,%eax
1315 jnz aes_loop_par_dec\@
1316
1317 aes_loop_par_dec_done\@:
1318 MOVADQ (%r10), \TMP3
1319 aesenclast \TMP3, \XMM1 # last round
1320 aesenclast \TMP3, \XMM2
1321 aesenclast \TMP3, \XMM3
1322 aesenclast \TMP3, \XMM4
1323 movdqu HashKey_k(%arg2), \TMP5
1324 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1325 movdqu (%arg4,%r11,1), \TMP3
1326 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1327 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1328 movdqa \TMP3, \XMM1
1329 movdqu 16(%arg4,%r11,1), \TMP3
1330 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1331 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1332 movdqa \TMP3, \XMM2
1333 movdqu 32(%arg4,%r11,1), \TMP3
1334 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1335 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1336 movdqa \TMP3, \XMM3
1337 movdqu 48(%arg4,%r11,1), \TMP3
1338 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1339 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1340 movdqa \TMP3, \XMM4
1341 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1342 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1343 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1344 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1345
1346 pxor \TMP4, \TMP1
1347 pxor \XMM8, \XMM5
1348 pxor \TMP6, \TMP2
1349 pxor \TMP1, \TMP2
1350 pxor \XMM5, \TMP2
1351 movdqa \TMP2, \TMP3
1352 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1353 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1354 pxor \TMP3, \XMM5
1355 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1356
1357 # first phase of reduction
1358
1359 movdqa \XMM5, \TMP2
1360 movdqa \XMM5, \TMP3
1361 movdqa \XMM5, \TMP4
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2	# packed left shift << 31
	pslld	$30, \TMP3	# packed left shift << 30
	pslld	$25, \TMP4	# packed left shift << 25
1366 pxor \TMP3, \TMP2 # xor the shifted versions
1367 pxor \TMP4, \TMP2
1368 movdqa \TMP2, \TMP5
1369 psrldq $4, \TMP5 # right shift T5 1 DW
1370 pslldq $12, \TMP2 # left shift T2 3 DWs
1371 pxor \TMP2, \XMM5
1372
1373 # second phase of reduction
1374
1375 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376 movdqa \XMM5,\TMP3
1377 movdqa \XMM5,\TMP4
	psrld	$1, \TMP2	# packed right shift >>1
	psrld	$2, \TMP3	# packed right shift >>2
	psrld	$7, \TMP4	# packed right shift >>7
1381 pxor \TMP3,\TMP2 # xor the shifted versions
1382 pxor \TMP4,\TMP2
1383 pxor \TMP5, \TMP2
1384 pxor \TMP2, \XMM5
	pxor	\TMP1, \XMM5	# result is in XMM5
1386
1387 pxor \XMM5, \XMM1
1388 .endm
1389
1390
1391 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
	# Multiply XMM1 * HashKey_4 (using Karatsuba)
1395
1396 movdqa \XMM1, \TMP6
1397 pshufd $78, \XMM1, \TMP2
1398 pxor \XMM1, \TMP2
1399 movdqu HashKey_4(%arg2), \TMP5
1400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1402 movdqu HashKey_4_k(%arg2), \TMP4
1403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1404 movdqa \XMM1, \XMMDst
1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1406
	# Multiply XMM2 * HashKey_3 (using Karatsuba)
1408
1409 movdqa \XMM2, \TMP1
1410 pshufd $78, \XMM2, \TMP2
1411 pxor \XMM2, \TMP2
1412 movdqu HashKey_3(%arg2), \TMP5
1413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1415 movdqu HashKey_3_k(%arg2), \TMP4
1416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1417 pxor \TMP1, \TMP6
1418 pxor \XMM2, \XMMDst
1419 pxor \TMP2, \XMM1
1420 # results accumulated in TMP6, XMMDst, XMM1
1421
	# Multiply XMM3 * HashKey_2 (using Karatsuba)
1423
1424 movdqa \XMM3, \TMP1
1425 pshufd $78, \XMM3, \TMP2
1426 pxor \XMM3, \TMP2
1427 movdqu HashKey_2(%arg2), \TMP5
1428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1430 movdqu HashKey_2_k(%arg2), \TMP4
1431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1432 pxor \TMP1, \TMP6
1433 pxor \XMM3, \XMMDst
1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1435
	# Multiply XMM4 * HashKey (using Karatsuba)
1437 movdqa \XMM4, \TMP1
1438 pshufd $78, \XMM4, \TMP2
1439 pxor \XMM4, \TMP2
1440 movdqu HashKey(%arg2), \TMP5
1441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1443 movdqu HashKey_k(%arg2), \TMP4
1444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1445 pxor \TMP1, \TMP6
1446 pxor \XMM4, \XMMDst
1447 pxor \XMM1, \TMP2
1448 pxor \TMP6, \TMP2
1449 pxor \XMMDst, \TMP2
	# middle section of the temp results combined as in the Karatsuba algorithm
1451 movdqa \TMP2, \TMP4
1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1454 pxor \TMP4, \XMMDst
1455 pxor \TMP2, \TMP6
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457 # first phase of the reduction
1458 movdqa \XMMDst, \TMP2
1459 movdqa \XMMDst, \TMP3
1460 movdqa \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2	# packed left shifting << 31
	pslld	$30, \TMP3	# packed left shifting << 30
	pslld	$25, \TMP4	# packed left shifting << 25
1465 pxor \TMP3, \TMP2 # xor the shifted versions
1466 pxor \TMP4, \TMP2
1467 movdqa \TMP2, \TMP7
1468 psrldq $4, \TMP7 # right shift TMP7 1 DW
1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1470 pxor \TMP2, \XMMDst
1471
1472 # second phase of the reduction
1473 movdqa \XMMDst, \TMP2
1474 # make 3 copies of XMMDst for doing 3 shift operations
1475 movdqa \XMMDst, \TMP3
1476 movdqa \XMMDst, \TMP4
	psrld	$1, \TMP2	# packed right shift >> 1
	psrld	$2, \TMP3	# packed right shift >> 2
	psrld	$7, \TMP4	# packed right shift >> 7
1480 pxor \TMP3, \TMP2 # xor the shifted versions
1481 pxor \TMP4, \TMP2
1482 pxor \TMP7, \TMP2
1483 pxor \TMP2, \XMMDst
1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1485 .endm
1486
1487
1488
1489
1490
1491
1492 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494 pxor (%arg1), \XMM0
1495 mov keysize,%eax
1496 shr $2,%eax # 128->4, 192->6, 256->8
1497 add $5,%eax # 128->9, 192->11, 256->13
1498 lea 16(%arg1), %r10 # get first expanded key address
1499
1500 _esb_loop_\@:
1501 MOVADQ (%r10),\TMP1
1502 aesenc \TMP1,\XMM0
1503 add $16,%r10
1504 sub $1,%eax
1505 jnz _esb_loop_\@
1506
1507 MOVADQ (%r10),\TMP1
1508 aesenclast \TMP1,\XMM0
1509 .endm
1510
1590 SYM_FUNC_START(aesni_gcm_dec)
1591 FUNC_SAVE
1592
1593 GCM_INIT %arg6, arg7, arg8, arg9
1594 GCM_ENC_DEC dec
1595 GCM_COMPLETE arg10, arg11
1596 FUNC_RESTORE
1597 RET
1598 SYM_FUNC_END(aesni_gcm_dec)
1599
1678 SYM_FUNC_START(aesni_gcm_enc)
1679 FUNC_SAVE
1680
1681 GCM_INIT %arg6, arg7, arg8, arg9
1682 GCM_ENC_DEC enc
1683
1684 GCM_COMPLETE arg10, arg11
1685 FUNC_RESTORE
1686 RET
1687 SYM_FUNC_END(aesni_gcm_enc)
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700 SYM_FUNC_START(aesni_gcm_init)
1701 FUNC_SAVE
1702 GCM_INIT %arg3, %arg4,%arg5, %arg6
1703 FUNC_RESTORE
1704 RET
1705 SYM_FUNC_END(aesni_gcm_init)
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715 SYM_FUNC_START(aesni_gcm_enc_update)
1716 FUNC_SAVE
1717 GCM_ENC_DEC enc
1718 FUNC_RESTORE
1719 RET
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730 SYM_FUNC_START(aesni_gcm_dec_update)
1731 FUNC_SAVE
1732 GCM_ENC_DEC dec
1733 FUNC_RESTORE
1734 RET
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745 SYM_FUNC_START(aesni_gcm_finalize)
1746 FUNC_SAVE
1747 GCM_COMPLETE %arg3 %arg4
1748 FUNC_RESTORE
1749 RET
1750 SYM_FUNC_END(aesni_gcm_finalize)
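# Typical call sequence from the C glue code, as implied by the macros above
# (shown only as a sketch; the exact prototypes live on the C side):
#	aesni_gcm_init()	- GCM_INIT: store IV, derive HashKey^1..4, hash AAD
#	aesni_gcm_enc_update()	- GCM_ENC_DEC enc: called any number of times
#	  (or aesni_gcm_dec_update() for decryption)
#	aesni_gcm_finalize()	- GCM_COMPLETE: fold in the lengths, emit the tag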
1751
1752 #endif
1753
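# The _key_expansion_* helpers below implement the standard AES key schedule.
# For AES-128 (Nk = 4 words) each call produces the next four words:
#	W[i] = W[i-4] ^ SubWord(RotWord(W[i-1])) ^ Rcon	(i % 4 == 0)
#	W[i] = W[i-4] ^ W[i-1]				(otherwise)
# aeskeygenassist supplies the SubWord/RotWord/Rcon term; the shufps/pxor
# sequence accumulates the running XOR of the previous words.  The 192/256-bit
# variants differ only in Nk and an extra SubWord-only step (_key_expansion_256b).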
1754 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755 pshufd $0b11111111, %xmm1, %xmm1
1756 shufps $0b00010000, %xmm0, %xmm4
1757 pxor %xmm4, %xmm0
1758 shufps $0b10001100, %xmm0, %xmm4
1759 pxor %xmm4, %xmm0
1760 pxor %xmm1, %xmm0
1761 movaps %xmm0, (TKEYP)
1762 add $0x10, TKEYP
1763 RET
1764 SYM_FUNC_END(_key_expansion_256a)
1765 SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1770 pxor %xmm4, %xmm0
1771 shufps $0b10001100, %xmm0, %xmm4
1772 pxor %xmm4, %xmm0
1773 pxor %xmm1, %xmm0
1774
1775 movaps %xmm2, %xmm5
1776 movaps %xmm2, %xmm6
1777 pslldq $4, %xmm5
1778 pshufd $0b11111111, %xmm0, %xmm3
1779 pxor %xmm3, %xmm2
1780 pxor %xmm5, %xmm2
1781
1782 movaps %xmm0, %xmm1
1783 shufps $0b01000100, %xmm0, %xmm6
1784 movaps %xmm6, (TKEYP)
1785 shufps $0b01001110, %xmm2, %xmm1
1786 movaps %xmm1, 0x10(TKEYP)
1787 add $0x20, TKEYP
1788 RET
1789 SYM_FUNC_END(_key_expansion_192a)
1790
1791 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1792 pshufd $0b01010101, %xmm1, %xmm1
1793 shufps $0b00010000, %xmm0, %xmm4
1794 pxor %xmm4, %xmm0
1795 shufps $0b10001100, %xmm0, %xmm4
1796 pxor %xmm4, %xmm0
1797 pxor %xmm1, %xmm0
1798
1799 movaps %xmm2, %xmm5
1800 pslldq $4, %xmm5
1801 pshufd $0b11111111, %xmm0, %xmm3
1802 pxor %xmm3, %xmm2
1803 pxor %xmm5, %xmm2
1804
1805 movaps %xmm0, (TKEYP)
1806 add $0x10, TKEYP
1807 RET
1808 SYM_FUNC_END(_key_expansion_192b)
1809
1810 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1811 pshufd $0b10101010, %xmm1, %xmm1
1812 shufps $0b00010000, %xmm2, %xmm4
1813 pxor %xmm4, %xmm2
1814 shufps $0b10001100, %xmm2, %xmm4
1815 pxor %xmm4, %xmm2
1816 pxor %xmm1, %xmm2
1817 movaps %xmm2, (TKEYP)
1818 add $0x10, TKEYP
1819 RET
1820 SYM_FUNC_END(_key_expansion_256b)
1821
1822
1823
1824
1825
1826 SYM_FUNC_START(aesni_set_key)
1827 FRAME_BEGIN
1828 #ifndef __x86_64__
1829 pushl KEYP
1830 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1831 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1832 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1833 #endif
1834 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1835 movaps %xmm0, (KEYP)
1836 lea 0x10(KEYP), TKEYP # key addr
1837 movl %edx, 480(KEYP)
1838 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1839 cmp $24, %dl
1840 jb .Lenc_key128
1841 je .Lenc_key192
1842 movups 0x10(UKEYP), %xmm2 # other user key
1843 movaps %xmm2, (TKEYP)
1844 add $0x10, TKEYP
1845 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1846 call _key_expansion_256a
1847 aeskeygenassist $0x1, %xmm0, %xmm1
1848 call _key_expansion_256b
1849 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1850 call _key_expansion_256a
1851 aeskeygenassist $0x2, %xmm0, %xmm1
1852 call _key_expansion_256b
1853 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1854 call _key_expansion_256a
1855 aeskeygenassist $0x4, %xmm0, %xmm1
1856 call _key_expansion_256b
1857 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1858 call _key_expansion_256a
1859 aeskeygenassist $0x8, %xmm0, %xmm1
1860 call _key_expansion_256b
1861 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1862 call _key_expansion_256a
1863 aeskeygenassist $0x10, %xmm0, %xmm1
1864 call _key_expansion_256b
1865 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1866 call _key_expansion_256a
1867 aeskeygenassist $0x20, %xmm0, %xmm1
1868 call _key_expansion_256b
1869 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1870 call _key_expansion_256a
1871 jmp .Ldec_key
1872 .Lenc_key192:
1873 movq 0x10(UKEYP), %xmm2 # other user key
1874 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1875 call _key_expansion_192a
1876 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1877 call _key_expansion_192b
1878 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1879 call _key_expansion_192a
1880 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1881 call _key_expansion_192b
1882 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1883 call _key_expansion_192a
1884 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1885 call _key_expansion_192b
1886 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1887 call _key_expansion_192a
1888 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
1889 call _key_expansion_192b
1890 jmp .Ldec_key
1891 .Lenc_key128:
1892 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
1893 call _key_expansion_128
1894 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
1895 call _key_expansion_128
1896 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
1897 call _key_expansion_128
1898 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
1899 call _key_expansion_128
1900 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
1901 call _key_expansion_128
1902 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
1903 call _key_expansion_128
1904 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
1905 call _key_expansion_128
1906 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
1907 call _key_expansion_128
1908 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
1909 call _key_expansion_128
1910 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
1911 call _key_expansion_128
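	# Now build the decryption schedule for the Equivalent Inverse Cipher:
	# the encryption round keys are copied in reverse order, with aesimc
	# (InvMixColumns) applied to every round key except the first and last.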
1912 .Ldec_key:
1913 sub $0x10, TKEYP
1914 movaps (KEYP), %xmm0
1915 movaps (TKEYP), %xmm1
1916 movaps %xmm0, 240(TKEYP)
1917 movaps %xmm1, 240(KEYP)
1918 add $0x10, KEYP
1919 lea 240-16(TKEYP), UKEYP
1920 .align 4
1921 .Ldec_key_loop:
1922 movaps (KEYP), %xmm0
1923 aesimc %xmm0, %xmm1
1924 movaps %xmm1, (UKEYP)
1925 add $0x10, KEYP
1926 sub $0x10, UKEYP
1927 cmp TKEYP, KEYP
1928 jb .Ldec_key_loop
1929 xor AREG, AREG
1930 #ifndef __x86_64__
1931 popl KEYP
1932 #endif
1933 FRAME_END
1934 RET
1935 SYM_FUNC_END(aesni_set_key)
1936
1937
1938
1939
1940 SYM_FUNC_START(aesni_enc)
1941 FRAME_BEGIN
1942 #ifndef __x86_64__
1943 pushl KEYP
1944 pushl KLEN
1945 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1946 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1947 movl (FRAME_OFFSET+20)(%esp), INP # src
1948 #endif
1949 movl 480(KEYP), KLEN # key length
1950 movups (INP), STATE # input
1951 call _aesni_enc1
1952 movups STATE, (OUTP) # output
1953 #ifndef __x86_64__
1954 popl KLEN
1955 popl KEYP
1956 #endif
1957 FRAME_END
1958 RET
1959 SYM_FUNC_END(aesni_enc)
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
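# _aesni_enc1: encrypt one block in STATE using the schedule at KEYP; KLEN
# selects the key size.  TKEYP is advanced by 0x30/0x50/0x70 so that the same
# -0x60..0x70 offsets below walk 10, 12 or 14 rounds for AES-128/192/256.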
1973 SYM_FUNC_START_LOCAL(_aesni_enc1)
1974 movaps (KEYP), KEY # key
1975 mov KEYP, TKEYP
1976 pxor KEY, STATE # round 0
1977 add $0x30, TKEYP
1978 cmp $24, KLEN
1979 jb .Lenc128
1980 lea 0x20(TKEYP), TKEYP
1981 je .Lenc192
1982 add $0x20, TKEYP
1983 movaps -0x60(TKEYP), KEY
1984 aesenc KEY, STATE
1985 movaps -0x50(TKEYP), KEY
1986 aesenc KEY, STATE
1987 .align 4
1988 .Lenc192:
1989 movaps -0x40(TKEYP), KEY
1990 aesenc KEY, STATE
1991 movaps -0x30(TKEYP), KEY
1992 aesenc KEY, STATE
1993 .align 4
1994 .Lenc128:
1995 movaps -0x20(TKEYP), KEY
1996 aesenc KEY, STATE
1997 movaps -0x10(TKEYP), KEY
1998 aesenc KEY, STATE
1999 movaps (TKEYP), KEY
2000 aesenc KEY, STATE
2001 movaps 0x10(TKEYP), KEY
2002 aesenc KEY, STATE
2003 movaps 0x20(TKEYP), KEY
2004 aesenc KEY, STATE
2005 movaps 0x30(TKEYP), KEY
2006 aesenc KEY, STATE
2007 movaps 0x40(TKEYP), KEY
2008 aesenc KEY, STATE
2009 movaps 0x50(TKEYP), KEY
2010 aesenc KEY, STATE
2011 movaps 0x60(TKEYP), KEY
2012 aesenc KEY, STATE
2013 movaps 0x70(TKEYP), KEY
2014 aesenclast KEY, STATE
2015 RET
2016 SYM_FUNC_END(_aesni_enc1)
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036 SYM_FUNC_START_LOCAL(_aesni_enc4)
2037 movaps (KEYP), KEY # key
2038 mov KEYP, TKEYP
2039 pxor KEY, STATE1 # round 0
2040 pxor KEY, STATE2
2041 pxor KEY, STATE3
2042 pxor KEY, STATE4
2043 add $0x30, TKEYP
2044 cmp $24, KLEN
2045 jb .L4enc128
2046 lea 0x20(TKEYP), TKEYP
2047 je .L4enc192
2048 add $0x20, TKEYP
2049 movaps -0x60(TKEYP), KEY
2050 aesenc KEY, STATE1
2051 aesenc KEY, STATE2
2052 aesenc KEY, STATE3
2053 aesenc KEY, STATE4
2054 movaps -0x50(TKEYP), KEY
2055 aesenc KEY, STATE1
2056 aesenc KEY, STATE2
2057 aesenc KEY, STATE3
2058 aesenc KEY, STATE4
2059 #.align 4
2060 .L4enc192:
2061 movaps -0x40(TKEYP), KEY
2062 aesenc KEY, STATE1
2063 aesenc KEY, STATE2
2064 aesenc KEY, STATE3
2065 aesenc KEY, STATE4
2066 movaps -0x30(TKEYP), KEY
2067 aesenc KEY, STATE1
2068 aesenc KEY, STATE2
2069 aesenc KEY, STATE3
2070 aesenc KEY, STATE4
2071 #.align 4
2072 .L4enc128:
2073 movaps -0x20(TKEYP), KEY
2074 aesenc KEY, STATE1
2075 aesenc KEY, STATE2
2076 aesenc KEY, STATE3
2077 aesenc KEY, STATE4
2078 movaps -0x10(TKEYP), KEY
2079 aesenc KEY, STATE1
2080 aesenc KEY, STATE2
2081 aesenc KEY, STATE3
2082 aesenc KEY, STATE4
2083 movaps (TKEYP), KEY
2084 aesenc KEY, STATE1
2085 aesenc KEY, STATE2
2086 aesenc KEY, STATE3
2087 aesenc KEY, STATE4
2088 movaps 0x10(TKEYP), KEY
2089 aesenc KEY, STATE1
2090 aesenc KEY, STATE2
2091 aesenc KEY, STATE3
2092 aesenc KEY, STATE4
2093 movaps 0x20(TKEYP), KEY
2094 aesenc KEY, STATE1
2095 aesenc KEY, STATE2
2096 aesenc KEY, STATE3
2097 aesenc KEY, STATE4
2098 movaps 0x30(TKEYP), KEY
2099 aesenc KEY, STATE1
2100 aesenc KEY, STATE2
2101 aesenc KEY, STATE3
2102 aesenc KEY, STATE4
2103 movaps 0x40(TKEYP), KEY
2104 aesenc KEY, STATE1
2105 aesenc KEY, STATE2
2106 aesenc KEY, STATE3
2107 aesenc KEY, STATE4
2108 movaps 0x50(TKEYP), KEY
2109 aesenc KEY, STATE1
2110 aesenc KEY, STATE2
2111 aesenc KEY, STATE3
2112 aesenc KEY, STATE4
2113 movaps 0x60(TKEYP), KEY
2114 aesenc KEY, STATE1
2115 aesenc KEY, STATE2
2116 aesenc KEY, STATE3
2117 aesenc KEY, STATE4
2118 movaps 0x70(TKEYP), KEY
2119 aesenclast KEY, STATE1 # last round
2120 aesenclast KEY, STATE2
2121 aesenclast KEY, STATE3
2122 aesenclast KEY, STATE4
2123 RET
2124 SYM_FUNC_END(_aesni_enc4)
2125
2126
2127
2128
2129 SYM_FUNC_START(aesni_dec)
2130 FRAME_BEGIN
2131 #ifndef __x86_64__
2132 pushl KEYP
2133 pushl KLEN
2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2136 movl (FRAME_OFFSET+20)(%esp), INP # src
2137 #endif
2138 mov 480(KEYP), KLEN # key length
2139 add $240, KEYP
2140 movups (INP), STATE # input
2141 call _aesni_dec1
2142 movups STATE, (OUTP) #output
2143 #ifndef __x86_64__
2144 popl KLEN
2145 popl KEYP
2146 #endif
2147 FRAME_END
2148 RET
2149 SYM_FUNC_END(aesni_dec)
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE # round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)
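
# _aesni_dec4: internal helper, description reconstructed from the code below.
# Decrypt four blocks in parallel with the same expanded key.
# input:
#	KEYP:	pointer to the expanded decryption round keys
#	KLEN:	key length in bytes (16/24/32)
#	STATE1..STATE4:	four ciphertext blocks
# output:
#	STATE1..STATE4:	four decrypted blocks
# changed:
#	KEY, TKEYP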
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE1 # round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1 # last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
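
# aesni_ecb_enc: ECB encryption. Processes four blocks per iteration while at
# least 64 bytes remain, then falls back to one block at a time; a trailing
# partial block (len not a multiple of 16) is left untouched.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_ecb_enc(const void *ctx, u8 *dst, const u8 *src,
#			   unsigned int len)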
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP # dst
	movl (FRAME_OFFSET+24)(%esp), INP # src
	movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
	test LEN, LEN # check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
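
# aesni_ecb_dec: ECB decryption, same structure as aesni_ecb_enc but using
# the decryption key schedule (ctx + 240) and _aesni_dec4/_aesni_dec1.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_ecb_dec(const void *ctx, u8 *dst, const u8 *src,
#			   unsigned int len)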
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP # dst
	movl (FRAME_OFFSET+24)(%esp), INP # src
	movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
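
# aesni_cbc_enc: CBC encryption. Each plaintext block is XORed with the
# previous ciphertext block (the IV for the first block) before encryption,
# and the last ciphertext block is written back through IVP as the next IV.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_cbc_enc(const void *ctx, u8 *dst, const u8 *src,
#			   unsigned int len, u8 *iv)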
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE # load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN # load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP) # store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
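
# aesni_cbc_dec: CBC decryption, four blocks per iteration when at least
# 64 bytes remain. Each decrypted block is XORed with the previous
# ciphertext block (the IV for the first one), and the last ciphertext block
# is stored back through IVP. On 32-bit builds there are not enough XMM
# registers to keep all four ciphertext blocks live, so two of them are
# reloaded from the source buffer after _aesni_dec4 returns.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_cbc_dec(const void *ctx, u8 *dst, const u8 *src,
#			   unsigned int len, u8 *iv)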
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
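
# aesni_cts_cbc_enc: CBC with ciphertext stealing, encrypt side. Handles the
# final chunk of a message (one full block plus a possibly shorter block):
# shuffle masks built from .Lcts_permute_table truncate and swap the last two
# ciphertext blocks as CTS requires.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_cts_cbc_enc(const void *ctx, u8 *dst, const u8 *src,
#			       unsigned int len, u8 *iv)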
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1
	add LEN, INP
	movups (INP), IN2

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
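
# aesni_cts_cbc_dec: CBC with ciphertext stealing, decrypt side, mirroring
# aesni_cts_cbc_enc but using the decryption key schedule (ctx + 240) and
# reassembling the stolen bytes with pshufb/pblendvb before the second
# _aesni_dec1 call.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_cts_cbc_dec(const void *ctx, u8 *dst, const u8 *src,
#			       unsigned int len, u8 *iv)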
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE
	add LEN, INP
	movups (INP), IN1

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
.Lcts_permute_table:
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection

#ifdef __x86_64__
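
# _aesni_inc_init: internal helper, description reconstructed from the code
# below. Sets up the registers used by _aesni_inc.
# input:
#	IV:		counter block, big endian
# output:
#	BSWAP_MASK:	byte-swap shuffle mask
#	CTR:		IV converted to little endian
#	TCTR_LOW:	low 64 bits of CTR
#	INC:		the constant 1, little endian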
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)
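
# _aesni_inc: internal helper, description reconstructed from the code below.
# Advance the counter by one. CTR is kept in little-endian form: the low
# qword is bumped via INC, a scalar copy in TCTR_LOW tracks the carry, and on
# overflow INC is shifted up so the high qword is incremented as well.
# input:
#	CTR, TCTR_LOW, INC, BSWAP_MASK (as set up by _aesni_inc_init)
# output:
#	IV:	next counter block, big endian
# changed:
#	CTR, TCTR_LOW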
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)
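
# aesni_ctr_enc: CTR mode encryption (x86_64 only). The counter in IV is
# advanced per block by _aesni_inc, encrypted to produce the keystream and
# XORed with the input; four blocks are processed per iteration while at
# least 64 bytes remain. The final counter value is written back through IVP.
# Presumed C prototype (arguments passed in registers on x86_64):
#	void aesni_ctr_enc(const void *ctx, u8 *dst, const u8 *src,
#			   unsigned int len, u8 *iv)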
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif

.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous
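
# _aesni_gf128mul_x_ble: multiply the 128-bit tweak in IV by x in GF(2^128),
# little-endian block convention, as used for XTS tweak generation
# (description reconstructed from the macro below). The bit shifted out of
# each qword is folded back in via the 0x87/0x01 constants held in
# GF128MUL_MASK (.Lgf128mul_x_ble_mask).
# input:
#	IV:	current tweak
#	GF128MUL_MASK
# output:
#	IV:	next tweak
# changed:
#	KEY (used as a temporary)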
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, KEY; \
	paddq IV, IV; \
	psrad $31, KEY; \
	pand GF128MUL_MASK, KEY; \
	pxor KEY, IV;
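
# aesni_xts_encrypt: XTS encryption. Each block is XORed with the current
# tweak before and after encryption, and the tweak is advanced with
# _aesni_gf128mul_x_ble(). In the four-block loop the tweaks are parked in
# the output buffer between the two XOR passes. A trailing partial block is
# handled with ciphertext stealing via .Lcts_permute_table.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_xts_encrypt(const void *ctx, u8 *dst, const u8 *src,
#			       unsigned int len, u8 *iv)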
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	sub $64, LEN
	jl .Lxts_enc_1x

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_enc4

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_enc_loop4

.Lxts_enc_ret_iv:
	movups IV, (IVP)

.Lxts_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_enc_1x:
	add $64, LEN
	jz .Lxts_enc_ret_iv
	sub $16, LEN
	jl .Lxts_enc_cts4

.Lxts_enc_loop1:
	movdqu (INP), STATE
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_enc_out

	add $16, INP
	sub $16, LEN
	jl .Lxts_enc_cts1

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_enc_loop1

.Lxts_enc_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_enc_ret_iv

.Lxts_enc_cts4:
	movdqa STATE4, STATE
	sub $16, OUTP

.Lxts_enc_cts1:
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP
	add $16, LEN
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE

	movups STATE, (OUTP)
	jmp .Lxts_enc_ret
SYM_FUNC_END(aesni_xts_encrypt)
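
# aesni_xts_decrypt: XTS decryption, mirroring aesni_xts_encrypt but using
# the decryption key schedule (ctx + 240). When the length is not a multiple
# of 16, one full block is held back up front so that the trailing partial
# block can be handled with ciphertext stealing, which needs the tweaks in
# swapped order on the decrypt side.
# Presumed C prototype, inferred from the 32-bit argument loads below:
#	void aesni_xts_decrypt(const void *ctx, u8 *dst, const u8 *src,
#			       unsigned int len, u8 *iv)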
SYM_FUNC_START(aesni_xts_decrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP # dst
	movl (FRAME_OFFSET+28)(%esp), INP # src
	movl (FRAME_OFFSET+32)(%esp), LEN # len
	movl (FRAME_OFFSET+36)(%esp), IVP # iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
	add $240, KEYP

	test $15, LEN
	jz .Lxts_dec_loop4
	sub $16, LEN

.Lxts_dec_loop4:
	sub $64, LEN
	jl .Lxts_dec_1x

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_dec4

	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_dec_loop4

.Lxts_dec_ret_iv:
	movups IV, (IVP)

.Lxts_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_dec_1x:
	add $64, LEN
	jz .Lxts_dec_ret_iv

.Lxts_dec_loop1:
	movdqu (INP), STATE

	add $16, INP
	sub $16, LEN
	jl .Lxts_dec_cts1

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_dec_out

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_dec_loop1

.Lxts_dec_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_dec_ret_iv

.Lxts_dec_cts1:
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble()

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE

#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP
	add $16, LEN
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE

	movups STATE, (OUTP)
	jmp .Lxts_dec_ret
SYM_FUNC_END(aesni_xts_decrypt)