/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto
 * Extensions. It works on 8 blocks at a time by precomputing the first 8 key
 * powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation allows
 * us to split finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving
 * the multiplications and polynomial reductions.
 */
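
/*
 * Illustrative sketch (Python, not part of this file): since reduction mod
 * g(x) is linear over XOR, reducing the XOR of the eight polynomial products
 * once gives the same result as performing eight full finite field
 * multiplications.  clmul(), polymod(), and gfmul() are hypothetical
 * reference helpers, not routines defined below:
 *
 *     G = (1 << 128) | (1 << 127) | (1 << 126) | (1 << 121) | 1
 *
 *     def clmul(a, b):          # carry-less multiply (polynomials over GF(2))
 *         r = 0
 *         while b:
 *             if b & 1:
 *                 r ^= a
 *             a <<= 1
 *             b >>= 1
 *         return r
 *
 *     def polymod(p):           # p(x) mod g(x), by polynomial long division
 *         while p.bit_length() > 128:
 *             p ^= G << (p.bit_length() - 129)
 *         return p
 *
 *     def gfmul(a, b):          # field multiply (ignoring Montgomery form)
 *         return polymod(clmul(a, b))
 *
 *     import secrets
 *     h = secrets.randbits(128)
 *     m = [secrets.randbits(128) for _ in range(8)]
 *     h_pows = [h]
 *     for _ in range(7):
 *         h_pows.insert(0, gfmul(h_pows[0], h))   # h_pows = [h^8, ..., h^1]
 *
 *     p, acc = 0, 0
 *     for h_i, m_i in zip(h_pows, m):
 *         p ^= clmul(h_i, m_i)      # step 1: polynomial multiplications only
 *         acc ^= gfmul(h_i, m_i)    # reference: one reduction per product
 *     assert polymod(p) == acc      # step 2: a single reduction matches
 *
 * The real code below additionally keeps all values in Montgomery form (an
 * extra factor of x^128); see montgomery_reduction.
 */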

#include <linux/linkage.h>
#define STRIDE_BLOCKS 8

KEY_POWERS  .req    x0
MSG     .req    x1
BLOCKS_LEFT .req    x2
ACCUMULATOR .req    x3
KEY_START   .req    x10
EXTRA_BYTES .req    x11
TMP .req    x13

M0  .req    v0
M1  .req    v1
M2  .req    v2
M3  .req    v3
M4  .req    v4
M5  .req    v5
M6  .req    v6
M7  .req    v7
KEY8    .req    v8
KEY7    .req    v9
KEY6    .req    v10
KEY5    .req    v11
KEY4    .req    v12
KEY3    .req    v13
KEY2    .req    v14
KEY1    .req    v15
PL  .req    v16
PH  .req    v17
TMP_V   .req    v18
LO  .req    v20
MI  .req    v21
HI  .req    v22
SUM .req    v23
GSTAR   .req    v24

    .text

    .arch   armv8-a+crypto
    .align  4

.Lgstar:
    .quad   0xc200000000000000, 0xc200000000000000

/*
 * Computes the product of two 128-bit polynomials in X and Y and XORs the
 * components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *  X = [X_1 : X_0]
 *  Y = [Y_1 : Y_0]
 *
 * We compute:
 *  LO += X_0 * Y_0
 *  MI += (X_0 + X_1) * (Y_0 + Y_1)
 *  HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * Karatsuba multiplication is used instead of schoolbook multiplication
 * because it was found to be slightly faster on ARM64 CPUs.
 */
.macro karatsuba1 X Y
    X .req \X
    Y .req \Y
    ext v25.16b, X.16b, X.16b, #8
    ext v26.16b, Y.16b, Y.16b, #8
    eor v25.16b, v25.16b, X.16b
    eor v26.16b, v26.16b, Y.16b
    pmull2  v28.1q, X.2d, Y.2d
    pmull   v29.1q, X.1d, Y.1d
    pmull   v27.1q, v25.1d, v26.1d
    eor HI.16b, HI.16b, v28.16b
    eor LO.16b, LO.16b, v29.16b
    eor MI.16b, MI.16b, v27.16b
    .unreq X
    .unreq Y
.endm
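
/*
 * Illustrative sketch (Python, not part of this file) of the Karatsuba split
 * above, assuming the hypothetical clmul() helper from the earlier sketch.
 * x1:x0 and y1:y0 mirror X_1:X_0 and Y_1:Y_0:
 *
 *     import secrets
 *     mask = (1 << 64) - 1
 *     x, y = secrets.randbits(128), secrets.randbits(128)
 *     x0, x1 = x & mask, x >> 64
 *     y0, y1 = y & mask, y >> 64
 *
 *     lo = clmul(x0, y0)            # pmull:  X_0 * Y_0
 *     hi = clmul(x1, y1)            # pmull2: X_1 * Y_1
 *     mi = clmul(x0 ^ x1, y0 ^ y1)  # pmull:  (X_0 + X_1) * (Y_0 + Y_1)
 *
 *     # The 256-bit product is hi*x^128 + (lo + mi + hi)*x^64 + lo, which
 *     # expands to the four-word extraction given in the comment above.
 *     assert clmul(x, y) == (hi << 128) ^ ((lo ^ mi ^ hi) << 64) ^ lo
 *
 * Three pmulls per block instead of schoolbook's four (X_0*Y_0, X_0*Y_1,
 * X_1*Y_0, X_1*Y_1) is what makes this variant slightly faster here.
 */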

/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.
 */
.macro karatsuba1_store X Y
    X .req \X
    Y .req \Y
    ext v25.16b, X.16b, X.16b, #8
    ext v26.16b, Y.16b, Y.16b, #8
    eor v25.16b, v25.16b, X.16b
    eor v26.16b, v26.16b, Y.16b
    pmull2  HI.1q, X.2d, Y.2d
    pmull   LO.1q, X.1d, Y.1d
    pmull   MI.1q, v25.1d, v26.1d
    .unreq X
    .unreq Y
.endm

/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 * [PH : PL] =
 *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 */
.macro karatsuba2
    // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
    eor v4.16b, HI.16b, MI.16b
    // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
    eor v4.16b, v4.16b, LO.16b
    // v5 = [HI_0 : LO_1]
    ext v5.16b, LO.16b, HI.16b, #8
    // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
    eor v4.16b, v4.16b, v5.16b
    // HI = [HI_0 : HI_1]
    ext HI.16b, HI.16b, HI.16b, #8
    // LO = [LO_0 : LO_1]
    ext LO.16b, LO.16b, LO.16b, #8
    // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
    ext PH.16b, v4.16b, HI.16b, #8
    // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
    ext PL.16b, LO.16b, v4.16b, #8
.endm

/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and
 * g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
 * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128.  To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
 * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *   T = T_1 : T_0 = g*(x) * P_0
 *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves an XOR instruction by computing P_1 + T_0 :
 * P_0 + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and
 * T_0 : T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
 */
.macro montgomery_reduction dest
    DEST .req \dest
    // TMP_V = T_1 : T_0 = P_0 * g*(x)
    pmull   TMP_V.1q, PL.1d, GSTAR.1d
    // TMP_V = T_0 : T_1
    ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
    // TMP_V = P_1 + T_0 : P_0 + T_1
    eor TMP_V.16b, PL.16b, TMP_V.16b
    // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
    eor PH.16b, PH.16b, TMP_V.16b
    // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
    pmull2  TMP_V.1q, TMP_V.2d, GSTAR.2d
    // DEST = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
    eor DEST.16b, PH.16b, TMP_V.16b
    .unreq DEST
.endm
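
/*
 * Illustrative sketch (Python, not part of this file) of the reduction
 * above, reusing the hypothetical clmul() and polymod() helpers from the
 * earlier sketches.  mont_reduce() mirrors the T/V folding exactly; the
 * assert checks that it computes p(x) / x^128 mod g(x), i.e. multiplying the
 * result back by x^128 recovers p(x) mod g(x):
 *
 *     GSTAR = 0xc200000000000000        # bits 64-127 of g(x)
 *     mask = (1 << 64) - 1
 *
 *     def mont_reduce(p):               # p = P_3 : P_2 : P_1 : P_0
 *         p0, p1 = p & mask, (p >> 64) & mask
 *         p2, p3 = (p >> 128) & mask, p >> 192
 *         t = clmul(p0, GSTAR)          # T = T_1 : T_0
 *         t0, t1 = t & mask, t >> 64
 *         v = clmul(p1 ^ t0, GSTAR)     # V = V_1 : V_0
 *         v0, v1 = v & mask, v >> 64
 *         return ((p3 ^ p1 ^ t0 ^ v1) << 64) | (p2 ^ p0 ^ t1 ^ v0)
 *
 *     import secrets
 *     p = secrets.randbits(256)
 *     assert polymod(mont_reduce(p) << 128) == polymod(p)
 */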

/*
 * Computes POLYVAL on 8 blocks.
 *
 * If reduce is set, also computes the Montgomery reduction of the
 * previous full_stride call and XORs it into the first message block,
 * computing (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 *
 * Sets PL, PH.
 */
.macro full_stride reduce
    eor     LO.16b, LO.16b, LO.16b
    eor     MI.16b, MI.16b, MI.16b
    eor     HI.16b, HI.16b, HI.16b

    ld1     {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
    ld1     {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

    karatsuba1 M7 KEY1
    .if \reduce
    pmull   TMP_V.1q, PL.1d, GSTAR.1d
    .endif

    karatsuba1 M6 KEY2
    .if \reduce
    ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
    .endif

    karatsuba1 M5 KEY3
    .if \reduce
    eor TMP_V.16b, PL.16b, TMP_V.16b
    .endif

    karatsuba1 M4 KEY4
    .if \reduce
    eor PH.16b, PH.16b, TMP_V.16b
    .endif

    karatsuba1 M3 KEY5
    .if \reduce
    pmull2  TMP_V.1q, TMP_V.2d, GSTAR.2d
    .endif

    karatsuba1 M2 KEY6
    .if \reduce
    eor SUM.16b, PH.16b, TMP_V.16b
    .endif

    karatsuba1 M1 KEY7
    eor M0.16b, M0.16b, SUM.16b

    karatsuba1 M0 KEY8
    karatsuba2
.endm
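
/*
 * Illustrative model (Python, not part of this file) of one full_stride pass
 * with \reduce set, reusing the hypothetical clmul() and mont_reduce()
 * helpers from the earlier sketches.  keys = [h^8, ..., h^1]; prev is the
 * unreduced 256-bit PH : PL left by the previous stride:
 *
 *     def full_stride(msg8, keys, prev):
 *         m = list(msg8)
 *         m[0] ^= mont_reduce(prev)      # fold the previous result into m_0
 *         p = 0
 *         for m_i, k_i in zip(m, keys):  # eight Karatsuba multiplications
 *             p ^= clmul(m_i, k_i)
 *         return p                       # new PH : PL, reduced next stride
 *
 * In the assembly, the six reduction instructions are spread between the
 * karatsuba1 invocations so they execute alongside the multiplications.
 */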

/*
 * Handle any extra blocks after the full_stride loop.
 */
.macro partial_stride
    add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
    sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
    ld1 {KEY1.16b}, [KEY_POWERS], #16

    ld1 {TMP_V.16b}, [MSG], #16
    eor SUM.16b, SUM.16b, TMP_V.16b
    karatsuba1_store KEY1 SUM
    sub BLOCKS_LEFT, BLOCKS_LEFT, #1

    tst BLOCKS_LEFT, #4
    beq .Lpartial4BlocksDone
    ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
    ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
    karatsuba1 M0 KEY8
    karatsuba1 M1 KEY7
    karatsuba1 M2 KEY6
    karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
    tst BLOCKS_LEFT, #2
    beq .Lpartial2BlocksDone
    ld1 {M0.16b, M1.16b}, [MSG], #32
    ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
    karatsuba1 M0 KEY8
    karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
    tst BLOCKS_LEFT, #1
    beq .LpartialDone
    ld1 {M0.16b}, [MSG], #16
    ld1 {KEY8.16b}, [KEY_POWERS], #16
    karatsuba1 M0 KEY8
.LpartialDone:
    karatsuba2
    montgomery_reduction SUM
.endm
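
/*
 * Illustrative model (Python, not part of this file) of the key-pointer
 * setup above: with n = BLOCKS_LEFT (1 <= n <= 7) and the key powers stored
 * as h^8, ..., h^1 at 16-byte intervals, KEY_POWERS must point at h^n so
 * that the loads above consume exactly h^n, ..., h^1:
 *
 *     STRIDE_BLOCKS = 8
 *     def first_partial_key(key_start, blocks_left):
 *         # add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
 *         # sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
 *         return key_start + (STRIDE_BLOCKS << 4) - (blocks_left << 4)
 *
 *     # 3 leftover blocks: skip h^8 ... h^4, start at h^3 (offset 5 * 16)
 *     assert first_partial_key(0, 3) == 5 * 16
 */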

/*
 * Perform Montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
 * If op1, op2 are in Montgomery form, this computes the Montgomery
 * form of op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(pmull_polyval_mul)
    adr TMP, .Lgstar
    ld1 {GSTAR.2d}, [TMP]
    ld1 {v0.16b}, [x0]
    ld1 {v1.16b}, [x1]
    karatsuba1_store v0 v1
    karatsuba2
    montgomery_reduction SUM
    st1 {SUM.16b}, [x0]
    ret
SYM_FUNC_END(pmull_polyval_mul)
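
/*
 * Illustrative sketch (Python, not part of this file) of the Montgomery-form
 * property stated above, reusing the hypothetical clmul(), polymod(),
 * gfmul(), and mont_reduce() helpers from the earlier sketches:
 *
 *     def to_mont(a):                   # a * x^128 mod g(x)
 *         return polymod(a << 128)
 *
 *     def polyval_mul(a, b):            # a * b * x^{-128} mod g(x)
 *         return mont_reduce(clmul(a, b))
 *
 *     import secrets
 *     a, b = secrets.randbits(128), secrets.randbits(128)
 *     assert polyval_mul(to_mont(a), to_mont(b)) == to_mont(gfmul(a, b))
 */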

/*
 * Perform polynomial evaluation as specified by POLYVAL.  This computes:
 *  h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
 *               size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(pmull_polyval_update)
    adr TMP, .Lgstar
    mov KEY_START, KEY_POWERS
    ld1 {GSTAR.2d}, [TMP]
    ld1 {SUM.16b}, [ACCUMULATOR]
    subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
    blt .LstrideLoopExit
    ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
    ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
    full_stride 0
    subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
    blt .LstrideLoopExitReduce
.LstrideLoop:
    full_stride 1
    subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
    bge .LstrideLoop
.LstrideLoopExitReduce:
    montgomery_reduction SUM
.LstrideLoopExit:
    adds    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
    beq .LskipPartial
    partial_stride
.LskipPartial:
    st1 {SUM.16b}, [ACCUMULATOR]
    ret
SYM_FUNC_END(pmull_polyval_update)
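
/*
 * Illustrative reference model (Python, not part of this file) of this
 * function's semantics, reusing the hypothetical polyval_mul() from the
 * sketch above.  Blocks are interpreted as little-endian integers, as
 * POLYVAL specifies; folding one block at a time is equivalent to the h^n
 * formula in the comment above:
 *
 *     def polyval_update(h, acc, blocks):
 *         # h^n * acc + h^n * m_0 + ... + h^1 * m_{n-1}, n = len(blocks)
 *         for m in blocks:
 *             acc = polyval_mul(acc ^ m, h)
 *         return acc
 *
 *     def polyval_update_bytes(h16, acc16, msg):
 *         h = int.from_bytes(h16, "little")
 *         acc = int.from_bytes(acc16, "little")
 *         blocks = [int.from_bytes(msg[i:i + 16], "little")
 *                   for i in range(0, len(msg), 16)]
 *         return polyval_update(h, acc, blocks).to_bytes(16, "little")
 *
 * The assembly computes the same value while consuming up to eight blocks
 * per reduction via the precomputed key powers.
 */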