//
// Accelerated GHASH implementation with ARMv8 PMULL instructions.
//
#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

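	// The aliases below overlap v8-v19: the t/k/perm/sh/ss registers are
	// only used on the PMULL(p8) fallback path, while the aliases below
	// (and the KS/INP/K aliases further down) are only used on PMULL(p64)
	// paths, so the overlapping assignments are never live at the same
	// time.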
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

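	// Fallback for cores without the 64x64->128 bit PMULL instruction:
	// the product is built out of 8x8 bit polynomial multiplies. Byte
	// rotated copies of one operand (A1-A3 here, B1-B4 precomputed in
	// __pmull_pre_p8) are multiplied against the other operand, and the
	// partial products are then masked, shifted into place and XORed
	// together in __pmull_p8_tail.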
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

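	// Load H^2..H^4 (stored at x3 + 16) and precompute the folded
	// (hi ^ lo) halves of H/H^2 and H^3/H^4 that feed the Karatsuba
	// middle-term multiplies, plus the reduction constant 0xe1 << 57.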
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

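	// The p8 fallback precomputes everything that depends only on the
	// hash key: the masks used to isolate partial products, the tbl
	// permutation vectors that rotate a 16-byte vector by 1-3 bytes,
	// and byte rotated copies of SHASH and SHASH2 (B1-B4).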
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
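	// The double-width product (low half in XL, high half in XH, with
	// the Karatsuba middle term in XM) is reduced modulo the GHASH
	// polynomial x^128 + x^7 + x^2 + x + 1. In the bit-reflected GHASH
	// representation this takes two carryless multiplications by the
	// constant 0xe1 << 57 kept in MASK.
	//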
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
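	// The multiplications by 0xe1 << 57 are replaced by equivalent
	// shift-and-XOR sequences (left shifts by 57, 62 and 63, and right
	// shifts by 1, 2 and 7), so the reduction needs no PMULL at all.
	//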
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

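	// GHASH the input as a sequence of 16-byte blocks. The p64 variant
	// aggregates four blocks per iteration: with + denoting XOR and .
	// denoting multiplication in GF(2^128),
	//
	//   ((((X + B0).H + B1).H + B2).H + B3).H
	//	= (X + B0).H^4 + B1.H^3 + B2.H^2 + B3.H
	//
	// so the four products can be summed and reduced only once.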
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	// do the head block first, if supplied
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	// multiply XL by SHASH in GF(2^128)
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	//
	// GHASH update entry points:
	//   w0: number of 16-byte blocks, x1: 128-bit digest, x2: source,
	//   x3: hashed subkey(s), x4: optional partial 'head' block
	//
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
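	// K0-K5 are the first six round keys; KK/KL/KM are loaded from the
	// end of the schedule (rk + rounds * 16 - 32) so they always hold
	// the last three round keys, whatever the key size. K6-K9 are
	// (re)loaded on demand by enc_block and pmull_gcm_enc_4x.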

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm
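	// The key size is recovered from the round count: bit 2 of \rounds
	// is set for 12 and 14 rounds (192/256-bit keys) and bit 1
	// distinguishes 14 from 12, which is what the tbnz/tbz tests above
	// and in pmull_gcm_enc_4x rely on.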

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f			// tag only?

	ldr		w8, [x5, #12]		// load lower counter
CPU_LE(	rev		w8, w8		)

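	// Main loop: each iteration handles up to four 16-byte blocks.
	// w9 is the number of blocks processed this time around, and w8 is
	// the lower 32 bits of the counter, bumped by w9 up front so that
	// pmull_gcm_enc_4x can derive the four counter values from it.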
0:	mov		w9, #4			// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4		// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	//
	// Fewer than 64 bytes remain: populate INP0-INP3 right to left with
	// the x19 tail bytes and any preceding full blocks, using permute
	// vectors from .Lpermute_table and overlapping loads instead of
	// branches. Note that this may read up to 15 bytes before the start
	// of the input buffer.
	//
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

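	// Compute the final tag: GHASH the lengths block passed on the
	// stack, encrypt the original counter block (lower word set to
	// be32 1), and XOR the two. For decryption the result is compared
	// against the supplied tag, truncated to authsize bytes.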
3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f			// output tag?

	ld1		{INP3.16b}, [x10]	// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)	// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]		// store tag
	.else
	ldp		x11, x12, [sp, #40]	// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]	// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]	// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b	// compare tags
	mvn		XL.16b, XL.16b		// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
	sminv		b0, XL.16b		// signed minimum across XL
	smov		w0, v0.b[0]		// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]		// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	//
	// GCM encrypt entry point:
	//   x0: number of bytes, x1: output, x2: input, x3: hashed subkey
	//   and H^2..H^4, x4: GHASH digest, x5: counter block, x6: AES round
	//   keys, x7: number of rounds; a pointer to the lengths block
	//   (which also receives the tag) is passed on the stack.
	//
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	//
	// GCM decrypt entry point: same arguments as pmull_gcm_encrypt,
	// with the expected tag and the authsize as additional stack
	// arguments; returns 0 in w0 if the tag matches and -1 otherwise.
	//
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

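	// Fold INP0-INP3 into the GHASH accumulator XL using H^4..H.
	// w9 holds the number of valid blocks; for fewer than four, the
	// out-of-line prologue zeroes the partial accumulators and enters
	// the multiply chain at the right point (.Lgh1-.Lgh3).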
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

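	// Generate four blocks of keystream: build counter blocks for the
	// counter values w8-4 .. w8-1 (stored big-endian in the last word),
	// run them through the AES rounds in parallel, and XOR the result
	// into INP0-INP3.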
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

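	// 64-byte table of two repeats of [16 x 0xff | 0x00..0x0f]. Loading
	// 16 bytes at different offsets yields tbl permute vectors that
	// shift data bytes into place and map the remaining lanes to zero
	// (index 0xff); this drives both the partial-block handling and the
	// truncation of the tag to authsize bytes.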
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous