/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

    SHASH       .req    v0
    SHASH2      .req    v1
    T1      .req    v2
    T2      .req    v3
    MASK        .req    v4
    XM      .req    v5
    XL      .req    v6
    XH      .req    v7
    IN1     .req    v7

    k00_16      .req    v8
    k32_48      .req    v9

    t3      .req    v10
    t4      .req    v11
    t5      .req    v12
    t6      .req    v13
    t7      .req    v14
    t8      .req    v15
    t9      .req    v16

    perm1       .req    v17
    perm2       .req    v18
    perm3       .req    v19

    sh1     .req    v20
    sh2     .req    v21
    sh3     .req    v22
    sh4     .req    v23

    ss1     .req    v24
    ss2     .req    v25
    ss3     .req    v26
    ss4     .req    v27

    XL2     .req    v8
    XM2     .req    v9
    XH2     .req    v10
    XL3     .req    v11
    XM3     .req    v12
    XH3     .req    v13
    TT3     .req    v14
    TT4     .req    v15
    HH      .req    v16
    HH3     .req    v17
    HH4     .req    v18
    HH34        .req    v19
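    // Note: the 4-way aliases above reuse v8-v19, which hold values
    // used only by the p8 fallback path; the p8 and 4-way p64 code
    // paths are never live at the same time, so the two sets of names
    // cannot clash.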

    .text
    .arch       armv8-a+crypto

    .macro      __pmull_p64, rd, rn, rm
    pmull       \rd\().1q, \rn\().1d, \rm\().1d
    .endm

    .macro      __pmull2_p64, rd, rn, rm
    pmull2      \rd\().1q, \rn\().2d, \rm\().2d
    .endm

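    // Fallback 64x64->128 bit polynomial multiply, for cores that only
    // implement the 8x8->16 bit form of PMULL. Each operand is
    // byte-rotated by 1, 2 and 3 (A1-A3), multiplied against the other
    // operand and its byte-rotated copies (B1-B4, precomputed in
    // sh1-sh4/ss1-ss4), and the partial products are masked, shifted
    // and XORed back together to reconstruct the full carryless
    // product. This is the usual NEON decomposition of a wide
    // polynomial multiply into 8-bit PMULLs.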
    .macro      __pmull_p8, rq, ad, bd
    ext     t3.8b, \ad\().8b, \ad\().8b, #1     // A1
    ext     t5.8b, \ad\().8b, \ad\().8b, #2     // A2
    ext     t7.8b, \ad\().8b, \ad\().8b, #3     // A3

    __pmull_p8_\bd  \rq, \ad
    .endm

    .macro      __pmull2_p8, rq, ad, bd
    tbl     t3.16b, {\ad\().16b}, perm1.16b     // A1
    tbl     t5.16b, {\ad\().16b}, perm2.16b     // A2
    tbl     t7.16b, {\ad\().16b}, perm3.16b     // A3

    __pmull2_p8_\bd \rq, \ad
    .endm

    .macro      __pmull_p8_SHASH, rq, ad
    __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
    .endm

    .macro      __pmull_p8_SHASH2, rq, ad
    __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
    .endm

    .macro      __pmull2_p8_SHASH, rq, ad
    __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
    .endm

    .macro      __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
    pmull\t     t3.8h, t3.\nb, \bd          // F = A1*B
    pmull\t     t4.8h, \ad, \b1\().\nb          // E = A*B1
    pmull\t     t5.8h, t5.\nb, \bd          // H = A2*B
    pmull\t     t6.8h, \ad, \b2\().\nb          // G = A*B2
    pmull\t     t7.8h, t7.\nb, \bd          // J = A3*B
    pmull\t     t8.8h, \ad, \b3\().\nb          // I = A*B3
    pmull\t     t9.8h, \ad, \b4\().\nb          // K = A*B4
    pmull\t     \rq\().8h, \ad, \bd         // D = A*B

    eor     t3.16b, t3.16b, t4.16b          // L = E + F
    eor     t5.16b, t5.16b, t6.16b          // M = G + H
    eor     t7.16b, t7.16b, t8.16b          // N = I + J

    uzp1        t4.2d, t3.2d, t5.2d
    uzp2        t3.2d, t3.2d, t5.2d
    uzp1        t6.2d, t7.2d, t9.2d
    uzp2        t7.2d, t7.2d, t9.2d

    // t3 = (L) (P0 + P1) << 8
    // t5 = (M) (P2 + P3) << 16
    eor     t4.16b, t4.16b, t3.16b
    and     t3.16b, t3.16b, k32_48.16b

    // t7 = (N) (P4 + P5) << 24
    // t9 = (K) (P6 + P7) << 32
    eor     t6.16b, t6.16b, t7.16b
    and     t7.16b, t7.16b, k00_16.16b

    eor     t4.16b, t4.16b, t3.16b
    eor     t6.16b, t6.16b, t7.16b

    zip2        t5.2d, t4.2d, t3.2d
    zip1        t3.2d, t4.2d, t3.2d
    zip2        t9.2d, t6.2d, t7.2d
    zip1        t7.2d, t6.2d, t7.2d

    ext     t3.16b, t3.16b, t3.16b, #15
    ext     t5.16b, t5.16b, t5.16b, #14
    ext     t7.16b, t7.16b, t7.16b, #13
    ext     t9.16b, t9.16b, t9.16b, #12

    eor     t3.16b, t3.16b, t5.16b
    eor     t7.16b, t7.16b, t9.16b
    eor     \rq\().16b, \rq\().16b, t3.16b
    eor     \rq\().16b, \rq\().16b, t7.16b
    .endm

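    //
    // Load the precomputed powers of H from the key structure and fold
    // each power's high and low halves together, so that the Karatsuba
    // middle term (a1 + a0)(b1 + b0) can be computed with a single
    // PMULL per block. Judging by the offsets used below, the key
    // structure holds H at offset 0 followed by H^2, H^3 and H^4.
    //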
    .macro      __pmull_pre_p64
    add     x8, x3, #16
    ld1     {HH.2d-HH4.2d}, [x8]

    trn1        SHASH2.2d, SHASH.2d, HH.2d
    trn2        T1.2d, SHASH.2d, HH.2d
    eor     SHASH2.16b, SHASH2.16b, T1.16b

    trn1        HH34.2d, HH3.2d, HH4.2d
    trn2        T1.2d, HH3.2d, HH4.2d
    eor     HH34.16b, HH34.16b, T1.16b

    movi        MASK.16b, #0xe1
    shl     MASK.2d, MASK.2d, #57
    .endm

    .macro      __pmull_pre_p8
    ext     SHASH2.16b, SHASH.16b, SHASH.16b, #8
    eor     SHASH2.16b, SHASH2.16b, SHASH.16b

    // k00_16 := 0x0000000000000000_000000000000ffff
    // k32_48 := 0x00000000ffffffff_0000ffffffffffff
    movi        k32_48.2d, #0xffffffff
    mov     k32_48.h[2], k32_48.h[0]
    ushr        k00_16.2d, k32_48.2d, #32

    // prepare the permutation vectors
    mov_q       x5, 0x080f0e0d0c0b0a09
    movi        T1.8b, #8
    dup     perm1.2d, x5
    eor     perm1.16b, perm1.16b, T1.16b
    ushr        perm2.2d, perm1.2d, #8
    ushr        perm3.2d, perm1.2d, #16
    ushr        T1.2d, perm1.2d, #24
    sli     perm2.2d, perm1.2d, #56
    sli     perm3.2d, perm1.2d, #48
    sli     T1.2d, perm1.2d, #40

    // precompute loop invariants
    tbl     sh1.16b, {SHASH.16b}, perm1.16b
    tbl     sh2.16b, {SHASH.16b}, perm2.16b
    tbl     sh3.16b, {SHASH.16b}, perm3.16b
    tbl     sh4.16b, {SHASH.16b}, T1.16b
    ext     ss1.8b, SHASH2.8b, SHASH2.8b, #1
    ext     ss2.8b, SHASH2.8b, SHASH2.8b, #2
    ext     ss3.8b, SHASH2.8b, SHASH2.8b, #3
    ext     ss4.8b, SHASH2.8b, SHASH2.8b, #4
    .endm

    //
    // PMULL (64x64->128) based reduction for CPUs that can do
    // it in a single instruction.
    //
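    // The 256-bit product in XH:XM:XL is folded modulo the GHASH
    // polynomial g(x) = x^128 + x^7 + x^2 + x + 1. In the
    // bit-reflected representation used throughout, the low-order
    // terms of g(x) appear as the byte 0xe1 that MASK is built from,
    // so each folding step is a single PMULL by MASK plus an EOR.
    //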
    .macro      __pmull_reduce_p64
    pmull       T2.1q, XL.1d, MASK.1d
    eor     XM.16b, XM.16b, T1.16b

    mov     XH.d[0], XM.d[1]
    mov     XM.d[1], XL.d[0]

    eor     XL.16b, XM.16b, T2.16b
    ext     T2.16b, XL.16b, XL.16b, #8
    pmull       XL.1q, XL.1d, MASK.1d
    .endm

    //
    // Alternative reduction for CPUs that lack support for the
    // 64x64->128 PMULL instruction
    //
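    // The PMULL-by-MASK folding steps of the p64 variant are
    // synthesized here from shifts and XORs: the left shifts by 57,
    // 62 and 63 correspond to the nonzero low-order terms of the
    // reflected polynomial, and the trailing right shifts (by 1, a
    // further 6, then 1) perform the second folding pass.
    //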
    .macro      __pmull_reduce_p8
    eor     XM.16b, XM.16b, T1.16b

    mov     XL.d[1], XM.d[0]
    mov     XH.d[0], XM.d[1]

    shl     T1.2d, XL.2d, #57
    shl     T2.2d, XL.2d, #62
    eor     T2.16b, T2.16b, T1.16b
    shl     T1.2d, XL.2d, #63
    eor     T2.16b, T2.16b, T1.16b
    ext     T1.16b, XL.16b, XH.16b, #8
    eor     T2.16b, T2.16b, T1.16b

    mov     XL.d[1], T2.d[0]
    mov     XH.d[0], T2.d[1]

    ushr        T2.2d, XL.2d, #1
    eor     XH.16b, XH.16b, XL.16b
    eor     XL.16b, XL.16b, T2.16b
    ushr        T2.2d, T2.2d, #6
    ushr        XL.2d, XL.2d, #1
    .endm

    .macro      __pmull_ghash, pn
    ld1     {SHASH.2d}, [x3]
    ld1     {XL.2d}, [x1]

    __pmull_pre_\pn

    /* do the head block first, if supplied */
    cbz     x4, 0f
    ld1     {T1.2d}, [x4]
    mov     x4, xzr
    b       3f

0:  .ifc        \pn, p64
    tbnz        w0, #0, 2f      // skip until #blocks is a
    tbnz        w0, #1, 2f      // round multiple of 4

1:  ld1     {XM3.16b-TT4.16b}, [x2], #64

    sub     w0, w0, #4

    rev64       T1.16b, XM3.16b
    rev64       T2.16b, XH3.16b
    rev64       TT4.16b, TT4.16b
    rev64       TT3.16b, TT3.16b

    ext     IN1.16b, TT4.16b, TT4.16b, #8
    ext     XL3.16b, TT3.16b, TT3.16b, #8

    eor     TT4.16b, TT4.16b, IN1.16b
    pmull2      XH2.1q, SHASH.2d, IN1.2d    // a1 * b1
    pmull       XL2.1q, SHASH.1d, IN1.1d    // a0 * b0
    pmull       XM2.1q, SHASH2.1d, TT4.1d   // (a1 + a0)(b1 + b0)

    eor     TT3.16b, TT3.16b, XL3.16b
    pmull2      XH3.1q, HH.2d, XL3.2d       // a1 * b1
    pmull       XL3.1q, HH.1d, XL3.1d       // a0 * b0
    pmull2      XM3.1q, SHASH2.2d, TT3.2d   // (a1 + a0)(b1 + b0)

    ext     IN1.16b, T2.16b, T2.16b, #8
    eor     XL2.16b, XL2.16b, XL3.16b
    eor     XH2.16b, XH2.16b, XH3.16b
    eor     XM2.16b, XM2.16b, XM3.16b

    eor     T2.16b, T2.16b, IN1.16b
    pmull2      XH3.1q, HH3.2d, IN1.2d      // a1 * b1
    pmull       XL3.1q, HH3.1d, IN1.1d      // a0 * b0
    pmull       XM3.1q, HH34.1d, T2.1d      // (a1 + a0)(b1 + b0)

    eor     XL2.16b, XL2.16b, XL3.16b
    eor     XH2.16b, XH2.16b, XH3.16b
    eor     XM2.16b, XM2.16b, XM3.16b

    ext     IN1.16b, T1.16b, T1.16b, #8
    ext     TT3.16b, XL.16b, XL.16b, #8
    eor     XL.16b, XL.16b, IN1.16b
    eor     T1.16b, T1.16b, TT3.16b

    pmull2      XH.1q, HH4.2d, XL.2d        // a1 * b1
    eor     T1.16b, T1.16b, XL.16b
    pmull       XL.1q, HH4.1d, XL.1d        // a0 * b0
    pmull2      XM.1q, HH34.2d, T1.2d       // (a1 + a0)(b1 + b0)

    eor     XL.16b, XL.16b, XL2.16b
    eor     XH.16b, XH.16b, XH2.16b
    eor     XM.16b, XM.16b, XM2.16b

    eor     T2.16b, XL.16b, XH.16b
    ext     T1.16b, XL.16b, XH.16b, #8
    eor     XM.16b, XM.16b, T2.16b

    __pmull_reduce_p64

    eor     T2.16b, T2.16b, XH.16b
    eor     XL.16b, XL.16b, T2.16b

    cbz     w0, 5f
    b       1b
    .endif

2:  ld1     {T1.2d}, [x2], #16
    sub     w0, w0, #1

3:  /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64       T1.16b, T1.16b  )

    ext     T2.16b, XL.16b, XL.16b, #8
    ext     IN1.16b, T1.16b, T1.16b, #8
    eor     T1.16b, T1.16b, T2.16b
    eor     XL.16b, XL.16b, IN1.16b

    __pmull2_\pn    XH, XL, SHASH           // a1 * b1
    eor     T1.16b, T1.16b, XL.16b
    __pmull_\pn     XL, XL, SHASH           // a0 * b0
    __pmull_\pn XM, T1, SHASH2          // (a1 + a0)(b1 + b0)

4:  eor     T2.16b, XL.16b, XH.16b
    ext     T1.16b, XL.16b, XH.16b, #8
    eor     XM.16b, XM.16b, T2.16b

    __pmull_reduce_\pn

    eor     T2.16b, T2.16b, XH.16b
    eor     XL.16b, XL.16b, T2.16b

    cbnz        w0, 0b

5:  st1     {XL.2d}, [x1]
    ret
    .endm

    /*
     * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
     *             struct ghash_key const *k, const char *head)
     */
SYM_FUNC_START(pmull_ghash_update_p64)
    __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
    __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)
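
    /*
     * Illustrative sketch (not lifted from the glue code; local names
     * are made up) of how the C side is expected to call into these
     * routines, selecting the p64 variant only when the full PMULL
     * instruction is available:
     *
     *	kernel_neon_begin();
     *	if (cpu_have_named_feature(PMULL))
     *		pmull_ghash_update_p64(blocks, dg, src, key, head);
     *	else
     *		pmull_ghash_update_p8(blocks, dg, src, key, head);
     *	kernel_neon_end();
     *
     * dg[] holds the 128-bit GHASH state, and head, when non-NULL,
     * points to a single block to be hashed before src.
     */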

    KS0     .req    v8
    KS1     .req    v9
    KS2     .req    v10
    KS3     .req    v11

    INP0        .req    v21
    INP1        .req    v22
    INP2        .req    v23
    INP3        .req    v24

    K0      .req    v25
    K1      .req    v26
    K2      .req    v27
    K3      .req    v28
    K4      .req    v12
    K5      .req    v13
    K6      .req    v4
    K7      .req    v5
    K8      .req    v14
    K9      .req    v15
    KK      .req    v29
    KL      .req    v30
    KM      .req    v31

    .macro      load_round_keys, rounds, rk, tmp
    add     \tmp, \rk, #64
    ld1     {K0.4s-K3.4s}, [\rk]
    ld1     {K4.4s-K5.4s}, [\tmp]
    add     \tmp, \rk, \rounds, lsl #4
    sub     \tmp, \tmp, #32
    ld1     {KK.4s-KM.4s}, [\tmp]
    .endm
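    // K0-K5 always hold the first six round keys; KK/KL/KM hold the
    // last three, since \rk + \rounds * 16 - 32 points at round key
    // (\rounds - 2). E.g. for AES-128 (\rounds == 10), KK/KL/KM are
    // loaded with round keys 8-10.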

    .macro      enc_round, state, key
    aese        \state\().16b, \key\().16b
    aesmc       \state\().16b, \state\().16b
    .endm

    .macro      enc_qround, s0, s1, s2, s3, key
    enc_round   \s0, \key
    enc_round   \s1, \key
    enc_round   \s2, \key
    enc_round   \s3, \key
    .endm

    .macro      enc_block, state, rounds, rk, tmp
    add     \tmp, \rk, #96
    ld1     {K6.4s-K7.4s}, [\tmp], #32
    .irp        key, K0, K1, K2, K3, K4, K5
    enc_round   \state, \key
    .endr

    tbnz        \rounds, #2, .Lnot128_\@
.Lout256_\@:
    enc_round   \state, K6
    enc_round   \state, K7

.Lout192_\@:
    enc_round   \state, KK
    aese        \state\().16b, KL.16b
    eor     \state\().16b, \state\().16b, KM.16b

    .subsection 1
.Lnot128_\@:
    ld1     {K8.4s-K9.4s}, [\tmp], #32
    enc_round   \state, K6
    enc_round   \state, K7
    ld1     {K6.4s-K7.4s}, [\tmp]
    enc_round   \state, K8
    enc_round   \state, K9
    tbz     \rounds, #1, .Lout192_\@
    b       .Lout256_\@
    .previous
    .endm

    .align      6
    .macro      pmull_gcm_do_crypt, enc
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #24]

    load_round_keys x7, x6, x8

    ld1     {SHASH.2d}, [x3], #16
    ld1     {HH.2d-HH4.2d}, [x3]

    trn1        SHASH2.2d, SHASH.2d, HH.2d
    trn2        T1.2d, SHASH.2d, HH.2d
    eor     SHASH2.16b, SHASH2.16b, T1.16b

    trn1        HH34.2d, HH3.2d, HH4.2d
    trn2        T1.2d, HH3.2d, HH4.2d
    eor     HH34.16b, HH34.16b, T1.16b

    ld1     {XL.2d}, [x4]

    cbz     x0, 3f              // tag only?

    ldr     w8, [x5, #12]           // load lower counter
CPU_LE( rev     w8, w8      )

0:  mov     w9, #4              // max blocks per round
    add     x10, x0, #0xf
    lsr     x10, x10, #4            // remaining blocks

    subs        x0, x0, #64
    csel        w9, w10, w9, mi
    add     w8, w8, w9

    bmi     1f
    ld1     {INP0.16b-INP3.16b}, [x2], #64
    .subsection 1
    /*
     * Populate the four input registers right to left with up to 63 bytes
     * of data, using overlapping loads to avoid branches.
     *
     *                INP0     INP1     INP2     INP3
     *  1 byte     |        |        |        |x       |
     * 16 bytes    |        |        |        |xxxxxxxx|
     * 17 bytes    |        |        |xxxxxxxx|x       |
     * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
     * etc etc
     *
     * Note that this code may read up to 15 bytes before the start of
     * the input. It is up to the calling code to ensure this is safe if
     * this happens in the first iteration of the loop (i.e., when the
     * input size is < 16 bytes)
     */
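    /*
     * Worked example, 17 bytes remaining: x19 = 1 and x11 = 15, only
     * x16 ends up nonzero, so INP2 is loaded with bytes 0-15, INP3
     * with the overlapping bytes 1-16, and the permute vector in T1
     * then moves byte 16 to the front of INP3 and zeroes the rest.
     * INP0 and INP1 receive don't-care data, as only w9 == 2 blocks
     * are consumed this iteration.
     */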
1:  mov     x15, #16
    ands        x19, x0, #0xf
    csel        x19, x19, x15, ne
    adr_l       x17, .Lpermute_table + 16

    sub     x11, x15, x19
    add     x12, x17, x11
    sub     x17, x17, x11
    ld1     {T1.16b}, [x12]
    sub     x10, x1, x11
    sub     x11, x2, x11

    cmp     x0, #-16
    csel        x14, x15, xzr, gt
    cmp     x0, #-32
    csel        x15, x15, xzr, gt
    cmp     x0, #-48
    csel        x16, x19, xzr, gt
    csel        x1, x1, x10, gt
    csel        x2, x2, x11, gt

    ld1     {INP0.16b}, [x2], x14
    ld1     {INP1.16b}, [x2], x15
    ld1     {INP2.16b}, [x2], x16
    ld1     {INP3.16b}, [x2]
    tbl     INP3.16b, {INP3.16b}, T1.16b
    b       2f
    .previous

2:  .if     \enc == 0
    bl      pmull_gcm_ghash_4x
    .endif

    bl      pmull_gcm_enc_4x

    tbnz        x0, #63, 6f
    st1     {INP0.16b-INP3.16b}, [x1], #64
    .if     \enc == 1
    bl      pmull_gcm_ghash_4x
    .endif
    bne     0b

3:  ldp     x19, x10, [sp, #24]
    cbz     x10, 5f             // output tag?

    ld1     {INP3.16b}, [x10]       // load lengths[]
    mov     w9, #1
    bl      pmull_gcm_ghash_4x

    mov     w11, #(0x1 << 24)       // BE '1U'
    ld1     {KS0.16b}, [x5]
    mov     KS0.s[3], w11

    enc_block   KS0, x7, x6, x12

    ext     XL.16b, XL.16b, XL.16b, #8
    rev64       XL.16b, XL.16b
    eor     XL.16b, XL.16b, KS0.16b

    .if     \enc == 1
    st1     {XL.16b}, [x10]         // store tag
    .else
    ldp     x11, x12, [sp, #40]     // load tag pointer and authsize
    adr_l       x17, .Lpermute_table
    ld1     {KS0.16b}, [x11]        // load supplied tag
    add     x17, x17, x12
    ld1     {KS1.16b}, [x17]        // load permute vector

    cmeq        XL.16b, XL.16b, KS0.16b     // compare tags
    mvn     XL.16b, XL.16b          // -1 for fail, 0 for pass
    tbl     XL.16b, {XL.16b}, KS1.16b   // keep authsize bytes only
    sminv       b0, XL.16b          // signed minimum across XL
    smov        w0, v0.b[0]         // return b0
    .endif

4:  ldp     x29, x30, [sp], #32
    ret

5:
CPU_LE( rev     w8, w8      )
    str     w8, [x5, #12]           // store lower counter
    st1     {XL.2d}, [x4]
    b       4b

6:  ld1     {T1.16b-T2.16b}, [x17], #32 // permute vectors
    sub     x17, x17, x19, lsl #1

    cmp     w9, #1
    beq     7f
    .subsection 1
7:  ld1     {INP2.16b}, [x1]
    tbx     INP2.16b, {INP3.16b}, T1.16b
    mov     INP3.16b, INP2.16b
    b       8f
    .previous

    st1     {INP0.16b}, [x1], x14
    st1     {INP1.16b}, [x1], x15
    st1     {INP2.16b}, [x1], x16
    tbl     INP3.16b, {INP3.16b}, T1.16b
    tbx     INP3.16b, {INP2.16b}, T2.16b
8:  st1     {INP3.16b}, [x1]

    .if     \enc == 1
    ld1     {T1.16b}, [x17]
    tbl     INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
    bl      pmull_gcm_ghash_4x
    .endif
    b       3b
    .endm

    /*
     * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
     *            struct ghash_key const *k, u64 dg[], u8 ctr[],
     *            u32 const rk[], int rounds, u8 tag[])
     */
SYM_FUNC_START(pmull_gcm_encrypt)
    pmull_gcm_do_crypt  1
SYM_FUNC_END(pmull_gcm_encrypt)

    /*
     * int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
     *            struct ghash_key const *k, u64 dg[], u8 ctr[],
     *            u32 const rk[], int rounds, const u8 l[],
     *            const u8 tag[], u64 authsize)
     */
SYM_FUNC_START(pmull_gcm_decrypt)
    pmull_gcm_do_crypt  0
SYM_FUNC_END(pmull_gcm_decrypt)

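    /*
     * Sketch of the expected final decryption call from C (local
     * names here are illustrative, not lifted from the glue code):
     *
     *	be128 lengths = { cpu_to_be64(assoclen * 8),
     *			  cpu_to_be64(cryptlen * 8) };
     *	int ret;
     *
     *	kernel_neon_begin();
     *	ret = pmull_gcm_decrypt(nbytes, dst, src, k, dg, ctr, rk,
     *				nrounds, (u8 *)&lengths, otag, authsize);
     *	kernel_neon_end();
     *
     * ret is 0 if the recomputed tag matches the first authsize bytes
     * of otag, and -1 otherwise (see the cmeq/sminv sequence above).
     */

    //
    // GHASH the blocks in INP0-INP3 into XL, using aggregated
    // reduction:
    //
    //   X' = (X + I0).H^4 + I1.H^3 + I2.H^2 + I3.H
    //
    // so that a single modular reduction covers up to four blocks. w9
    // holds the actual block count (1-4); for fewer than four blocks,
    // the code is entered at the power of H matching the count.
    //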
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
    movi        MASK.16b, #0xe1
    shl     MASK.2d, MASK.2d, #57

    rev64       T1.16b, INP0.16b
    rev64       T2.16b, INP1.16b
    rev64       TT3.16b, INP2.16b
    rev64       TT4.16b, INP3.16b

    ext     XL.16b, XL.16b, XL.16b, #8

    tbz     w9, #2, 0f          // <4 blocks?
    .subsection 1
0:  movi        XH2.16b, #0
    movi        XM2.16b, #0
    movi        XL2.16b, #0

    tbz     w9, #0, 1f          // 2 blocks?
    tbz     w9, #1, 2f          // 1 block?

    eor     T2.16b, T2.16b, XL.16b
    ext     T1.16b, T2.16b, T2.16b, #8
    b       .Lgh3

1:  eor     TT3.16b, TT3.16b, XL.16b
    ext     T2.16b, TT3.16b, TT3.16b, #8
    b       .Lgh2

2:  eor     TT4.16b, TT4.16b, XL.16b
    ext     IN1.16b, TT4.16b, TT4.16b, #8
    b       .Lgh1
    .previous

    eor     T1.16b, T1.16b, XL.16b
    ext     IN1.16b, T1.16b, T1.16b, #8

    pmull2      XH2.1q, HH4.2d, IN1.2d      // a1 * b1
    eor     T1.16b, T1.16b, IN1.16b
    pmull       XL2.1q, HH4.1d, IN1.1d      // a0 * b0
    pmull2      XM2.1q, HH34.2d, T1.2d      // (a1 + a0)(b1 + b0)

    ext     T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor     T2.16b, T2.16b, T1.16b
    pmull2      XH.1q, HH3.2d, T1.2d        // a1 * b1
    pmull       XL.1q, HH3.1d, T1.1d        // a0 * b0
    pmull       XM.1q, HH34.1d, T2.1d       // (a1 + a0)(b1 + b0)

    eor     XH2.16b, XH2.16b, XH.16b
    eor     XL2.16b, XL2.16b, XL.16b
    eor     XM2.16b, XM2.16b, XM.16b

    ext     T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor     TT3.16b, TT3.16b, T2.16b
    pmull2      XH.1q, HH.2d, T2.2d     // a1 * b1
    pmull       XL.1q, HH.1d, T2.1d     // a0 * b0
    pmull2      XM.1q, SHASH2.2d, TT3.2d    // (a1 + a0)(b1 + b0)

    eor     XH2.16b, XH2.16b, XH.16b
    eor     XL2.16b, XL2.16b, XL.16b
    eor     XM2.16b, XM2.16b, XM.16b

    ext     IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor     TT4.16b, TT4.16b, IN1.16b
    pmull       XL.1q, SHASH.1d, IN1.1d     // a0 * b0
    pmull2      XH.1q, SHASH.2d, IN1.2d     // a1 * b1
    pmull       XM.1q, SHASH2.1d, TT4.1d    // (a1 + a0)(b1 + b0)

    eor     XH.16b, XH.16b, XH2.16b
    eor     XL.16b, XL.16b, XL2.16b
    eor     XM.16b, XM.16b, XM2.16b

    eor     T2.16b, XL.16b, XH.16b
    ext     T1.16b, XL.16b, XH.16b, #8
    eor     XM.16b, XM.16b, T2.16b

    __pmull_reduce_p64

    eor     T2.16b, T2.16b, XH.16b
    eor     XL.16b, XL.16b, T2.16b

    ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

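    //
    // Generate the keystream for up to four counter blocks in KS0-KS3
    // and XOR it into INP0-INP3. On entry, w8 has already been
    // advanced past this round's blocks, so the per-block counter
    // values are recovered by subtracting 4, 3, 2 and 1 before
    // byte-swapping them into the big-endian counter words.
    //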
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
    ld1     {KS0.16b}, [x5]         // load upper counter
    sub     w10, w8, #4
    sub     w11, w8, #3
    sub     w12, w8, #2
    sub     w13, w8, #1
    rev     w10, w10
    rev     w11, w11
    rev     w12, w12
    rev     w13, w13
    mov     KS1.16b, KS0.16b
    mov     KS2.16b, KS0.16b
    mov     KS3.16b, KS0.16b
    ins     KS0.s[3], w10           // set lower counter
    ins     KS1.s[3], w11
    ins     KS2.s[3], w12
    ins     KS3.s[3], w13

    add     x10, x6, #96            // round key pointer
    ld1     {K6.4s-K7.4s}, [x10], #32
    .irp        key, K0, K1, K2, K3, K4, K5
    enc_qround  KS0, KS1, KS2, KS3, \key
    .endr

    tbnz        x7, #2, .Lnot128
    .subsection 1
.Lnot128:
    ld1     {K8.4s-K9.4s}, [x10], #32
    .irp        key, K6, K7
    enc_qround  KS0, KS1, KS2, KS3, \key
    .endr
    ld1     {K6.4s-K7.4s}, [x10]
    .irp        key, K8, K9
    enc_qround  KS0, KS1, KS2, KS3, \key
    .endr
    tbz     x7, #1, .Lout192
    b       .Lout256
    .previous

.Lout256:
    .irp        key, K6, K7
    enc_qround  KS0, KS1, KS2, KS3, \key
    .endr

.Lout192:
    enc_qround  KS0, KS1, KS2, KS3, KK

    aese        KS0.16b, KL.16b
    aese        KS1.16b, KL.16b
    aese        KS2.16b, KL.16b
    aese        KS3.16b, KL.16b

    eor     KS0.16b, KS0.16b, KM.16b
    eor     KS1.16b, KS1.16b, KM.16b
    eor     KS2.16b, KS2.16b, KM.16b
    eor     KS3.16b, KS3.16b, KM.16b

    eor     INP0.16b, INP0.16b, KS0.16b
    eor     INP1.16b, INP1.16b, KS1.16b
    eor     INP2.16b, INP2.16b, KS2.16b
    eor     INP3.16b, INP3.16b, KS3.16b

    ret
SYM_FUNC_END(pmull_gcm_enc_4x)

    .section    ".rodata", "a"
    .align      6
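    // Sliding window over 0xff padding and the identity permutation:
    // indexing into this table at varying offsets yields the TBL/TBX
    // vectors used above to shift partial blocks into place and to
    // mask the computed tag down to authsize bytes.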
.Lpermute_table:
    .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    .byte        0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
    .byte        0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
    .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    .byte       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    .byte        0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
    .byte        0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
    .previous