/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
 * instructions. It works on 8 blocks at a time, by precomputing the first 8
 * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
 * allows us to split finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define STRIDE_BLOCKS 8

#define GSTAR %xmm7
#define PL %xmm8
#define PH %xmm9
#define TMP_XMM %xmm11
#define LO %xmm12
#define HI %xmm13
#define MI %xmm14
#define SUM %xmm15

#define KEY_POWERS %rdi
#define MSG %rsi
#define BLOCKS_LEFT %rdx
#define ACCUMULATOR %rcx
#define TMP %rax

.section .rodata.cst16.gstar, "aM", @progbits, 16
.align 16

.Lgstar:
        .quad 0xc200000000000000, 0xc200000000000000

.text

/*
 * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
 * count pointed to by MSG and KEY_POWERS.
 */
.macro schoolbook1 count
        .set i, 0
        .rept (\count)
                schoolbook1_iteration i 0
                .set i, (i +1)
        .endr
.endm

/*
 * Computes the product of two 128-bit polynomials at the memory locations
 * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
 * the 256-bit product into LO, MI, HI.
 *
 * Given:
 *      X = [X_1 : X_0]
 *      Y = [Y_1 : Y_0]
 *
 * We compute:
 *      LO += X_0 * Y_0
 *      MI += X_0 * Y_1 + X_1 * Y_0
 *      HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *      [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
 * extra multiplication of SUM and h^8.
 */
.macro schoolbook1_iteration i xor_sum
        movups (16*\i)(MSG), %xmm0
        .if (\i == 0 && \xor_sum == 1)
                pxor SUM, %xmm0
        .endif
        vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
        vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
        vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
        vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
        vpxor %xmm2, MI, MI
        vpxor %xmm1, LO, LO
        vpxor %xmm4, HI, HI
        vpxor %xmm3, MI, MI
.endm
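
/*
 * For intuition, the LO/MI/HI accumulation above corresponds to the C-style
 * sketch below. The clmul64() helper is purely illustrative (a stand-in for
 * one PCLMULQDQ carryless 64x64 -> 128-bit multiply, not a kernel interface):
 *
 *      struct u128 { u64 lo, hi; };
 *
 *      static struct u128 clmul64(u64 a, u64 b)
 *      {
 *              struct u128 r = { 0, 0 };
 *              int i;
 *
 *              for (i = 0; i < 64; i++)
 *                      if ((b >> i) & 1) {
 *                              r.lo ^= a << i;
 *                              if (i)
 *                                      r.hi ^= a >> (64 - i);
 *                      }
 *              return r;
 *      }
 *
 *      // One iteration with X = [x1 : x0], Y = [y1 : y0]:
 *      //      LO ^= clmul64(x0, y0)
 *      //      MI ^= clmul64(x0, y1) ^ clmul64(x1, y0)
 *      //      HI ^= clmul64(x1, y1)
 */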

/*
 * Performs the same computation as schoolbook1_iteration, except we expect the
 * arguments to already be loaded into xmm0 and xmm1 and we set the result
 * registers LO, MI, and HI directly rather than XOR'ing into them.
 */
.macro schoolbook1_noload
        vpclmulqdq $0x01, %xmm0, %xmm1, MI
        vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
        vpclmulqdq $0x00, %xmm0, %xmm1, LO
        vpclmulqdq $0x11, %xmm0, %xmm1, HI
        vpxor %xmm2, MI, MI
.endm

/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
 */
.macro schoolbook2
        vpslldq $8, MI, PL
        vpsrldq $8, MI, PH
        pxor LO, PL
        pxor HI, PH
.endm

/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
 * x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
 * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *      T = T_1 : T_0 = g*(x) * P_0
 *      V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *      p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves an XOR instruction by computing P_1 + T_0 :
 * P_0 + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and
 * T_0 : T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
 */
.macro montgomery_reduction dest
        vpclmulqdq $0x00, PL, GSTAR, TMP_XMM    # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
        pshufd $0b01001110, TMP_XMM, TMP_XMM    # TMP_XMM = T_0 : T_1
        pxor PL, TMP_XMM                        # TMP_XMM = P_1 + T_0 : P_0 + T_1
        pxor TMP_XMM, PH                        # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
        pclmulqdq $0x11, GSTAR, TMP_XMM         # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
        vpxor TMP_XMM, PH, \dest
.endm
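
/*
 * The same two folds written as a C-style sketch, reusing the illustrative
 * clmul64() helper from the schoolbook comment above (again, not a kernel
 * interface). GSTAR holds g*(x) = 0xc200000000000000 in both 64-bit lanes:
 *
 *      // Input: [p3 : p2 : p1 : p0] = PH : PL, the 256-bit product.
 *      struct u128 t = clmul64(0xc200000000000000, p0); // T = g*(x) * P_0
 *      u64 m1 = p1 ^ t.lo;                               // P_1 + T_0
 *      u64 m0 = p0 ^ t.hi;                               // P_0 + T_1
 *      struct u128 v = clmul64(0xc200000000000000, m1);  // V = g*(x) * (P_1 + T_0)
 *      u64 dest_hi = p3 ^ m1 ^ v.hi;                     // P_3 + P_1 + T_0 + V_1
 *      u64 dest_lo = p2 ^ m0 ^ v.lo;                     // P_2 + P_0 + T_1 + V_0
 */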

/*
 * Compute schoolbook multiplication for 8 blocks
 *      m_0h^8 + ... + m_7h^1
 *
 * If reduce is set, also computes the Montgomery reduction of the
 * previous full_stride call and XORs it with the first message block:
 *      (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 */
.macro full_stride reduce
        pxor LO, LO
        pxor HI, HI
        pxor MI, MI

        schoolbook1_iteration 7 0
        .if \reduce
        vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
        .endif

        schoolbook1_iteration 6 0
        .if \reduce
        pshufd $0b01001110, TMP_XMM, TMP_XMM
        .endif

        schoolbook1_iteration 5 0
        .if \reduce
        pxor PL, TMP_XMM
        .endif

        schoolbook1_iteration 4 0
        .if \reduce
        pxor TMP_XMM, PH
        .endif

        schoolbook1_iteration 3 0
        .if \reduce
        pclmulqdq $0x11, GSTAR, TMP_XMM
        .endif

        schoolbook1_iteration 2 0
        .if \reduce
        vpxor TMP_XMM, PH, SUM
        .endif

        schoolbook1_iteration 1 0

        schoolbook1_iteration 0 1

        addq $(8*16), MSG
        schoolbook2
.endm

/*
 * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
 */
.macro partial_stride
        /* Point KEY_POWERS at h^BLOCKS_LEFT, i.e. offset 16*(STRIDE_BLOCKS - BLOCKS_LEFT). */
        mov BLOCKS_LEFT, TMP
        shlq $4, TMP
        addq $(16*STRIDE_BLOCKS), KEY_POWERS
        subq TMP, KEY_POWERS

        movups (MSG), %xmm0
        pxor SUM, %xmm0
        movaps (KEY_POWERS), %xmm1
        schoolbook1_noload
        dec BLOCKS_LEFT
        addq $16, MSG
        addq $16, KEY_POWERS

        test $4, BLOCKS_LEFT
        jz .Lpartial4BlocksDone
        schoolbook1 4
        addq $(4*16), MSG
        addq $(4*16), KEY_POWERS
.Lpartial4BlocksDone:
        test $2, BLOCKS_LEFT
        jz .Lpartial2BlocksDone
        schoolbook1 2
        addq $(2*16), MSG
        addq $(2*16), KEY_POWERS
.Lpartial2BlocksDone:
        test $1, BLOCKS_LEFT
        jz .LpartialDone
        schoolbook1 1
.LpartialDone:
        schoolbook2
        montgomery_reduction SUM
.endm

/*
 * Perform Montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
 * If op1, op2 are in Montgomery form, this computes the Montgomery
 * form of op1*op2.
 *
 * void clmul_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(clmul_polyval_mul)
        FRAME_BEGIN
        vmovdqa .Lgstar(%rip), GSTAR
        movups (%rdi), %xmm0
        movups (%rsi), %xmm1
        schoolbook1_noload
        schoolbook2
        montgomery_reduction SUM
        movups SUM, (%rdi)
        FRAME_END
        RET
SYM_FUNC_END(clmul_polyval_mul)
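
/*
 * A sketch of how a caller could fill the h^8, ..., h^1 key power table using
 * clmul_polyval_mul(). This is illustrative only (the actual precomputation
 * lives in the C glue code); it assumes the layout expected by partial_stride
 * above, with h^8 at index 0 and h^1 at index 7:
 *
 *      u8 key_powers[STRIDE_BLOCKS][16];
 *      int i;
 *
 *      memcpy(key_powers[STRIDE_BLOCKS - 1], h, 16);   // h^1
 *      for (i = STRIDE_BLOCKS - 2; i >= 0; i--) {
 *              memcpy(key_powers[i], key_powers[i + 1], 16);
 *              clmul_polyval_mul(key_powers[i], h);    // h^(STRIDE_BLOCKS - i)
 *      }
 */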

/*
 * Perform polynomial evaluation as specified by POLYVAL. This computes:
 *      h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * rdi - pointer to precomputed key powers h^8 ... h^1
 * rsi - pointer to message blocks
 * rdx - number of blocks to hash
 * rcx - pointer to the accumulator
 *
 * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
 *      const u8 *in, size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(clmul_polyval_update)
        FRAME_BEGIN
        vmovdqa .Lgstar(%rip), GSTAR
        movups (ACCUMULATOR), SUM
        subq $STRIDE_BLOCKS, BLOCKS_LEFT
        js .LstrideLoopExit
        full_stride 0
        subq $STRIDE_BLOCKS, BLOCKS_LEFT
        js .LstrideLoopExitReduce
.LstrideLoop:
        full_stride 1
        subq $STRIDE_BLOCKS, BLOCKS_LEFT
        jns .LstrideLoop
.LstrideLoopExitReduce:
        montgomery_reduction SUM
.LstrideLoopExit:
        add $STRIDE_BLOCKS, BLOCKS_LEFT
        jz .LskipPartial
        partial_stride
.LskipPartial:
        movups SUM, (ACCUMULATOR)
        FRAME_END
        RET
SYM_FUNC_END(clmul_polyval_update)
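
/*
 * The overall control flow of clmul_polyval_update, as an illustrative C-style
 * sketch (schoolbook() and reduce() are stand-ins for the macros above; in the
 * assembly, the reduction of each full stride is interleaved with the
 * multiplications of the next stride rather than done as a separate step):
 *
 *      sum = *accumulator;
 *      while (nblocks >= STRIDE_BLOCKS) {
 *              sum = reduce(schoolbook(key_powers, in, sum, STRIDE_BLOCKS));
 *              in += STRIDE_BLOCKS * 16;
 *              nblocks -= STRIDE_BLOCKS;
 *      }
 *      if (nblocks)    // 1 to 7 remaining blocks
 *              sum = reduce(schoolbook(key_powers + STRIDE_BLOCKS - nblocks,
 *                                      in, sum, nblocks));
 *      *accumulator = sum;
 */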