// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>
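
/*
 * The field arithmetic below works in GF(2^255 - 19), with a field element
 * held as four 64-bit limbs in little-endian order. The inline asm relies
 * on the BMI2/ADX instructions (mulx, adcx, adox) to drive two independent
 * carry chains in parallel.
 */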
static __always_inline u64 eq_mask(u64 a, u64 b)
{
    u64 x = a ^ b;
    u64 minus_x = ~x + (u64)1U;
    u64 x_or_minus_x = x | minus_x;
    u64 xnx = x_or_minus_x >> (u32)63U;
    return xnx - (u64)1U;
}

static __always_inline u64 gte_mask(u64 a, u64 b)
{
    u64 x = a;
    u64 y = b;
    u64 x_xor_y = x ^ y;
    u64 x_sub_y = x - y;
    u64 x_sub_y_xor_y = x_sub_y ^ y;
    u64 q = x_xor_y | x_sub_y_xor_y;
    u64 x_xor_q = x ^ q;
    u64 x_xor_q_ = x_xor_q >> (u32)63U;
    return x_xor_q_ - (u64)1U;
}
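
/*
 * eq_mask() and gte_mask() return an all-ones or all-zero 64-bit mask
 * rather than a boolean, so callers can select between values without
 * branching: a constant-time select of a (mask set) or b is
 * (a & mask) | (b & ~mask). store_felem() below uses these masks to
 * subtract the prime conditionally, with no data-dependent branch.
 */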

/* Computes the addition of the four-limb f1 with the 64-bit value f2
 * and returns the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
    u64 carry_r;

    asm volatile(
        /* Clear registers to propagate the carry bit */
        "  xor %%r8d, %%r8d;"
        "  xor %%r9d, %%r9d;"
        "  xor %%r10d, %%r10d;"
        "  xor %%r11d, %%r11d;"
        "  xor %k1, %k1;"

        /* Begin addition chain */
        "  addq 0(%3), %0;"
        "  movq %0, 0(%2);"
        "  adcxq 8(%3), %%r8;"
        "  movq %%r8, 8(%2);"
        "  adcxq 16(%3), %%r9;"
        "  movq %%r9, 16(%2);"
        "  adcxq 24(%3), %%r10;"
        "  movq %%r10, 24(%2);"

        /* Return the carry bit in a register */
        "  adcx %%r11, %1;"
        : "+&r"(f2), "=&r"(carry_r)
        : "r"(out), "r"(f1)
        : "%r8", "%r9", "%r10", "%r11", "memory", "cc");

    return carry_r;
}
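
/*
 * For reference, a plain-C sketch of what the asm above computes (the asm
 * instead keeps the running carry in CF via the adcx chain):
 *
 *    unsigned __int128 acc = (unsigned __int128)f1[0] + f2;
 *    out[0] = (u64)acc;
 *    for (i = 1; i < 4; i++) {
 *        acc = (unsigned __int128)f1[i] + (u64)(acc >> 64);
 *        out[i] = (u64)acc;
 *    }
 *    return (u64)(acc >> 64); // carry out of the top limb
 */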

/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
    asm volatile(
        /* Compute the raw addition of f1 + f2 */
        "  movq 0(%0), %%r8;"
        "  addq 0(%2), %%r8;"
        "  movq 8(%0), %%r9;"
        "  adcxq 8(%2), %%r9;"
        "  movq 16(%0), %%r10;"
        "  adcxq 16(%2), %%r10;"
        "  movq 24(%0), %%r11;"
        "  adcxq 24(%2), %%r11;"

        /* Wrap the result back into the field */

        /* Step 1: Compute carry*38 */
        "  mov $0, %%rax;"
        "  mov $38, %0;"
        "  cmovc %0, %%rax;"

        /* Step 2: Add carry*38 to the original sum */
        "  xor %%ecx, %%ecx;"
        "  add %%rax, %%r8;"
        "  adcx %%rcx, %%r9;"
        "  movq %%r9, 8(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 16(%1);"
        "  adcx %%rcx, %%r11;"
        "  movq %%r11, 24(%1);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %0, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 0(%1);"
        : "+&r"(f2)
        : "r"(out), "r"(f1)
        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
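
/*
 * Why the multiplier 38: with p = 2^255 - 19, 2^256 = 2*(p + 19) = 2p + 38,
 * so 2^256 reduces to 38 mod p. A carry out of the fourth limb therefore
 * folds back into the low limb as carry*38, and one further fold of the
 * resulting carry bit suffices, since that second addition cannot overflow
 * again. The same folding closes fsub(), fmul() and the squarings below.
 */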

/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
    asm volatile(
        /* Compute the raw subtraction of f1-f2 */
        "  movq 0(%1), %%r8;"
        "  subq 0(%2), %%r8;"
        "  movq 8(%1), %%r9;"
        "  sbbq 8(%2), %%r9;"
        "  movq 16(%1), %%r10;"
        "  sbbq 16(%2), %%r10;"
        "  movq 24(%1), %%r11;"
        "  sbbq 24(%2), %%r11;"

        /* Wrap the result back into the field */

        /* Step 1: Compute carry*38 */
        "  mov $0, %%rax;"
        "  mov $38, %%rcx;"
        "  cmovc %%rcx, %%rax;"

        /* Step 2: Subtract carry*38 from the original difference */
        "  sub %%rax, %%r8;"
        "  sbb $0, %%r9;"
        "  sbb $0, %%r10;"
        "  sbb $0, %%r11;"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rcx, %%rax;"
        "  sub %%rax, %%r8;"

        /* Store the result */
        "  movq %%r8, 0(%0);"
        "  movq %%r9, 8(%0);"
        "  movq %%r10, 16(%0);"
        "  movq %%r11, 24(%0);"
        :
        : "r"(out), "r"(f1), "r"(f2)
        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
0156 
0157 /* Computes a field multiplication: out <- f1 * f2
0158  * Uses the 8-element buffer tmp for intermediate results */
0159 static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
0160 {
0161     asm volatile(
0162 
0163         /* Compute the raw multiplication: tmp <- src1 * src2 */
0164 
0165         /* Compute src1[0] * src2 */
0166         "  movq 0(%0), %%rdx;"
0167         "  mulxq 0(%1), %%r8, %%r9;"
0168         "  xor %%r10d, %%r10d;"
0169         "  movq %%r8, 0(%2);"
0170         "  mulxq 8(%1), %%r10, %%r11;"
0171         "  adox %%r9, %%r10;"
0172         "  movq %%r10, 8(%2);"
0173         "  mulxq 16(%1), %%rbx, %%r13;"
0174         "  adox %%r11, %%rbx;"
0175         "  mulxq 24(%1), %%r14, %%rdx;"
0176         "  adox %%r13, %%r14;"
0177         "  mov $0, %%rax;"
0178         "  adox %%rdx, %%rax;"
0179 
0180         /* Compute src1[1] * src2 */
0181         "  movq 8(%0), %%rdx;"
0182         "  mulxq 0(%1), %%r8, %%r9;"
0183         "  xor %%r10d, %%r10d;"
0184         "  adcxq 8(%2), %%r8;"
0185         "  movq %%r8, 8(%2);"
0186         "  mulxq 8(%1), %%r10, %%r11;"
0187         "  adox %%r9, %%r10;"
0188         "  adcx %%rbx, %%r10;"
0189         "  movq %%r10, 16(%2);"
0190         "  mulxq 16(%1), %%rbx, %%r13;"
0191         "  adox %%r11, %%rbx;"
0192         "  adcx %%r14, %%rbx;"
0193         "  mov $0, %%r8;"
0194         "  mulxq 24(%1), %%r14, %%rdx;"
0195         "  adox %%r13, %%r14;"
0196         "  adcx %%rax, %%r14;"
0197         "  mov $0, %%rax;"
0198         "  adox %%rdx, %%rax;"
0199         "  adcx %%r8, %%rax;"
0200 
0201         /* Compute src1[2] * src2 */
0202         "  movq 16(%0), %%rdx;"
0203         "  mulxq 0(%1), %%r8, %%r9;"
0204         "  xor %%r10d, %%r10d;"
0205         "  adcxq 16(%2), %%r8;"
0206         "  movq %%r8, 16(%2);"
0207         "  mulxq 8(%1), %%r10, %%r11;"
0208         "  adox %%r9, %%r10;"
0209         "  adcx %%rbx, %%r10;"
0210         "  movq %%r10, 24(%2);"
0211         "  mulxq 16(%1), %%rbx, %%r13;"
0212         "  adox %%r11, %%rbx;"
0213         "  adcx %%r14, %%rbx;"
0214         "  mov $0, %%r8;"
0215         "  mulxq 24(%1), %%r14, %%rdx;"
0216         "  adox %%r13, %%r14;"
0217         "  adcx %%rax, %%r14;"
0218         "  mov $0, %%rax;"
0219         "  adox %%rdx, %%rax;"
0220         "  adcx %%r8, %%rax;"
0221 
0222         /* Compute src1[3] * src2 */
0223         "  movq 24(%0), %%rdx;"
0224         "  mulxq 0(%1), %%r8, %%r9;"
0225         "  xor %%r10d, %%r10d;"
0226         "  adcxq 24(%2), %%r8;"
0227         "  movq %%r8, 24(%2);"
0228         "  mulxq 8(%1), %%r10, %%r11;"
0229         "  adox %%r9, %%r10;"
0230         "  adcx %%rbx, %%r10;"
0231         "  movq %%r10, 32(%2);"
0232         "  mulxq 16(%1), %%rbx, %%r13;"
0233         "  adox %%r11, %%rbx;"
0234         "  adcx %%r14, %%rbx;"
0235         "  movq %%rbx, 40(%2);"
0236         "  mov $0, %%r8;"
0237         "  mulxq 24(%1), %%r14, %%rdx;"
0238         "  adox %%r13, %%r14;"
0239         "  adcx %%rax, %%r14;"
0240         "  movq %%r14, 48(%2);"
0241         "  mov $0, %%rax;"
0242         "  adox %%rdx, %%rax;"
0243         "  adcx %%r8, %%rax;"
0244         "  movq %%rax, 56(%2);"
0245 
0246         /* Line up pointers */
0247         "  mov %2, %0;"
0248         "  mov %3, %2;"
0249 
0250         /* Wrap the result back into the field */
0251 
0252         /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
0253         "  mov $38, %%rdx;"
0254         "  mulxq 32(%0), %%r8, %%r13;"
0255         "  xor %k1, %k1;"
0256         "  adoxq 0(%0), %%r8;"
0257         "  mulxq 40(%0), %%r9, %%rbx;"
0258         "  adcx %%r13, %%r9;"
0259         "  adoxq 8(%0), %%r9;"
0260         "  mulxq 48(%0), %%r10, %%r13;"
0261         "  adcx %%rbx, %%r10;"
0262         "  adoxq 16(%0), %%r10;"
0263         "  mulxq 56(%0), %%r11, %%rax;"
0264         "  adcx %%r13, %%r11;"
0265         "  adoxq 24(%0), %%r11;"
0266         "  adcx %1, %%rax;"
0267         "  adox %1, %%rax;"
0268         "  imul %%rdx, %%rax;"
0269 
0270         /* Step 2: Fold the carry back into dst */
0271         "  add %%rax, %%r8;"
0272         "  adcx %1, %%r9;"
0273         "  movq %%r9, 8(%2);"
0274         "  adcx %1, %%r10;"
0275         "  movq %%r10, 16(%2);"
0276         "  adcx %1, %%r11;"
0277         "  movq %%r11, 24(%2);"
0278 
0279         /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
0280         "  mov $0, %%rax;"
0281         "  cmovc %%rdx, %%rax;"
0282         "  add %%rax, %%r8;"
0283         "  movq %%r8, 0(%2);"
0284         : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
0285         : "r"(out)
0286         : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
0287           "%r14", "memory", "cc");
0288 }
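
/*
 * Reduction sketch: the schoolbook multiply leaves a 512-bit product
 * t[0..7] in tmp. Since 2^256 reduces to 38 mod p, the result is
 * t[4..7]*38 + t[0..3] (mod p); the mulx instructions against the
 * constant 38 compute exactly that, with one extra fold of the final
 * carry at the end.
 */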

/* Computes two field multiplications:
 *   out[0] <- f1[0] * f2[0]
 *   out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results: */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
    asm volatile(

        /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

        /* Compute src1[0] * src2 */
        "  movq 0(%0), %%rdx;"
        "  mulxq 0(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  movq %%r8, 0(%2);"
        "  mulxq 8(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  movq %%r10, 8(%2);"
        "  mulxq 16(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  mulxq 24(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"

        /* Compute src1[1] * src2 */
        "  movq 8(%0), %%rdx;"
        "  mulxq 0(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 8(%2), %%r8;"
        "  movq %%r8, 8(%2);"
        "  mulxq 8(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 16(%2);"
        "  mulxq 16(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  mov $0, %%r8;"
        "  mulxq 24(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"

        /* Compute src1[2] * src2 */
        "  movq 16(%0), %%rdx;"
        "  mulxq 0(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 16(%2), %%r8;"
        "  movq %%r8, 16(%2);"
        "  mulxq 8(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 24(%2);"
        "  mulxq 16(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  mov $0, %%r8;"
        "  mulxq 24(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"

        /* Compute src1[3] * src2 */
        "  movq 24(%0), %%rdx;"
        "  mulxq 0(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 24(%2), %%r8;"
        "  movq %%r8, 24(%2);"
        "  mulxq 8(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 32(%2);"
        "  mulxq 16(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  movq %%rbx, 40(%2);"
        "  mov $0, %%r8;"
        "  mulxq 24(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  movq %%r14, 48(%2);"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"
        "  movq %%rax, 56(%2);"

        /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

        /* Compute src1[0] * src2 */
        "  movq 32(%0), %%rdx;"
        "  mulxq 32(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  movq %%r8, 64(%2);"
        "  mulxq 40(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  movq %%r10, 72(%2);"
        "  mulxq 48(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  mulxq 56(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"

        /* Compute src1[1] * src2 */
        "  movq 40(%0), %%rdx;"
        "  mulxq 32(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 72(%2), %%r8;"
        "  movq %%r8, 72(%2);"
        "  mulxq 40(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 80(%2);"
        "  mulxq 48(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  mov $0, %%r8;"
        "  mulxq 56(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"

        /* Compute src1[2] * src2 */
        "  movq 48(%0), %%rdx;"
        "  mulxq 32(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 80(%2), %%r8;"
        "  movq %%r8, 80(%2);"
        "  mulxq 40(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 88(%2);"
        "  mulxq 48(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  mov $0, %%r8;"
        "  mulxq 56(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"

        /* Compute src1[3] * src2 */
        "  movq 56(%0), %%rdx;"
        "  mulxq 32(%1), %%r8, %%r9;"
        "  xor %%r10d, %%r10d;"
        "  adcxq 88(%2), %%r8;"
        "  movq %%r8, 88(%2);"
        "  mulxq 40(%1), %%r10, %%r11;"
        "  adox %%r9, %%r10;"
        "  adcx %%rbx, %%r10;"
        "  movq %%r10, 96(%2);"
        "  mulxq 48(%1), %%rbx, %%r13;"
        "  adox %%r11, %%rbx;"
        "  adcx %%r14, %%rbx;"
        "  movq %%rbx, 104(%2);"
        "  mov $0, %%r8;"
        "  mulxq 56(%1), %%r14, %%rdx;"
        "  adox %%r13, %%r14;"
        "  adcx %%rax, %%r14;"
        "  movq %%r14, 112(%2);"
        "  mov $0, %%rax;"
        "  adox %%rdx, %%rax;"
        "  adcx %%r8, %%rax;"
        "  movq %%rax, 120(%2);"

        /* Line up pointers */
        "  mov %2, %0;"
        "  mov %3, %2;"

        /* Wrap the results back into the field */

        /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
        "  mov $38, %%rdx;"
        "  mulxq 32(%0), %%r8, %%r13;"
        "  xor %k1, %k1;"
        "  adoxq 0(%0), %%r8;"
        "  mulxq 40(%0), %%r9, %%rbx;"
        "  adcx %%r13, %%r9;"
        "  adoxq 8(%0), %%r9;"
        "  mulxq 48(%0), %%r10, %%r13;"
        "  adcx %%rbx, %%r10;"
        "  adoxq 16(%0), %%r10;"
        "  mulxq 56(%0), %%r11, %%rax;"
        "  adcx %%r13, %%r11;"
        "  adoxq 24(%0), %%r11;"
        "  adcx %1, %%rax;"
        "  adox %1, %%rax;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %1, %%r9;"
        "  movq %%r9, 8(%2);"
        "  adcx %1, %%r10;"
        "  movq %%r10, 16(%2);"
        "  adcx %1, %%r11;"
        "  movq %%r11, 24(%2);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 0(%2);"

        /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
        "  mov $38, %%rdx;"
        "  mulxq 96(%0), %%r8, %%r13;"
        "  xor %k1, %k1;"
        "  adoxq 64(%0), %%r8;"
        "  mulxq 104(%0), %%r9, %%rbx;"
        "  adcx %%r13, %%r9;"
        "  adoxq 72(%0), %%r9;"
        "  mulxq 112(%0), %%r10, %%r13;"
        "  adcx %%rbx, %%r10;"
        "  adoxq 80(%0), %%r10;"
        "  mulxq 120(%0), %%r11, %%rax;"
        "  adcx %%r13, %%r11;"
        "  adoxq 88(%0), %%r11;"
        "  adcx %1, %%rax;"
        "  adox %1, %%rax;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %1, %%r9;"
        "  movq %%r9, 40(%2);"
        "  adcx %1, %%r10;"
        "  movq %%r10, 48(%2);"
        "  adcx %1, %%r11;"
        "  movq %%r11, 56(%2);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 32(%2);"
        : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
        : "r"(out)
        : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
          "%r14", "memory", "cc");
}
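
/*
 * fmul2() (and fsqr2() below) batch the two independent operations on an
 * (x, z) pair into a single asm block; the ladder-step code further down
 * leans on this to process both halves of the formulas in one call.
 */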

/* Computes the field multiplication of the four-limb f1 with the 64-bit
 * value f2; requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
    register u64 f2_r asm("rdx") = f2;

    asm volatile(
        /* Compute the raw multiplication of f1*f2 */
        "  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
        "  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
        "  add %%rcx, %%r9;"
        "  mov $0, %%rcx;"
        "  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
        "  adcx %%rbx, %%r10;"
        "  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
        "  adcx %%r13, %%r11;"
        "  adcx %%rcx, %%rax;"

        /* Wrap the result back into the field */

        /* Step 1: Compute carry*38 */
        "  mov $38, %%rdx;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %%rcx, %%r9;"
        "  movq %%r9, 8(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 16(%1);"
        "  adcx %%rcx, %%r11;"
        "  movq %%r11, 24(%1);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 0(%1);"
        : "+&r"(f2_r)
        : "r"(out), "r"(f1)
        : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
          "memory", "cc");
}
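
/*
 * The 2^17 bound on f2 keeps the fold safe: each mulx high half is smaller
 * than f2, so the top limb accumulated in rax stays below 2^17 and the
 * imul by 38 (giving rax*38 < 2^23) cannot overflow 64 bits.
 */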

/* Conditionally swaps the contents of the 8-limb buffers p1 and p2
 * (p1 <-> p2 when bit is 1) in constant time */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
    asm volatile(
        /* Transfer bit into CF flag */
        "  add $18446744073709551615, %0;"

        /* cswap p1[0], p2[0] */
        "  movq 0(%1), %%r8;"
        "  movq 0(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 0(%1);"
        "  movq %%r9, 0(%2);"

        /* cswap p1[1], p2[1] */
        "  movq 8(%1), %%r8;"
        "  movq 8(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 8(%1);"
        "  movq %%r9, 8(%2);"

        /* cswap p1[2], p2[2] */
        "  movq 16(%1), %%r8;"
        "  movq 16(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 16(%1);"
        "  movq %%r9, 16(%2);"

        /* cswap p1[3], p2[3] */
        "  movq 24(%1), %%r8;"
        "  movq 24(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 24(%1);"
        "  movq %%r9, 24(%2);"

        /* cswap p1[4], p2[4] */
        "  movq 32(%1), %%r8;"
        "  movq 32(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 32(%1);"
        "  movq %%r9, 32(%2);"

        /* cswap p1[5], p2[5] */
        "  movq 40(%1), %%r8;"
        "  movq 40(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 40(%1);"
        "  movq %%r9, 40(%2);"

        /* cswap p1[6], p2[6] */
        "  movq 48(%1), %%r8;"
        "  movq 48(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 48(%1);"
        "  movq %%r9, 48(%2);"

        /* cswap p1[7], p2[7] */
        "  movq 56(%1), %%r8;"
        "  movq 56(%2), %%r9;"
        "  mov %%r8, %%r10;"
        "  cmovc %%r9, %%r8;"
        "  cmovc %%r10, %%r9;"
        "  movq %%r8, 56(%1);"
        "  movq %%r9, 56(%2);"
        : "+&r"(bit)
        : "r"(p1), "r"(p2)
        : "%r8", "%r9", "%r10", "memory", "cc");
}
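
/*
 * The leading "add $18446744073709551615" (2^64 - 1) sets CF exactly when
 * bit is non-zero, since bit + (2^64 - 1) wraps past 2^64 only for
 * bit >= 1; each cmovc pair then swaps one limb, touching the same memory
 * locations whether or not the swap takes place.
 */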

/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
    asm volatile(
        /* Compute the raw multiplication: tmp <- f * f */

        /* Step 1: Compute all partial products */
        "  movq 0(%0), %%rdx;" /* f[0] */
        "  mulxq 8(%0), %%r8, %%r14;"
        "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
        "  mulxq 16(%0), %%r9, %%r10;"
        "  adcx %%r14, %%r9;" /* f[2]*f[0] */
        "  mulxq 24(%0), %%rax, %%rcx;"
        "  adcx %%rax, %%r10;" /* f[3]*f[0] */
        "  movq 24(%0), %%rdx;" /* f[3] */
        "  mulxq 8(%0), %%r11, %%rbx;"
        "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
        "  mulxq 16(%0), %%rax, %%r13;"
        "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
        "  movq 8(%0), %%rdx;"
        "  adcx %%r15, %%r13;" /* f1 */
        "  mulxq 16(%0), %%rax, %%rcx;"
        "  mov $0, %%r14;" /* f[2]*f[1] */

        /* Step 2: Compute two parallel carry chains */
        "  xor %%r15d, %%r15d;"
        "  adox %%rax, %%r10;"
        "  adcx %%r8, %%r8;"
        "  adox %%rcx, %%r11;"
        "  adcx %%r9, %%r9;"
        "  adox %%r15, %%rbx;"
        "  adcx %%r10, %%r10;"
        "  adox %%r15, %%r13;"
        "  adcx %%r11, %%r11;"
        "  adox %%r15, %%r14;"
        "  adcx %%rbx, %%rbx;"
        "  adcx %%r13, %%r13;"
        "  adcx %%r14, %%r14;"

        /* Step 3: Compute intermediate squares */
        "  movq 0(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
        "  movq %%rax, 0(%1);"
        "  add %%rcx, %%r8;"
        "  movq %%r8, 8(%1);"
        "  movq 8(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
        "  adcx %%rax, %%r9;"
        "  movq %%r9, 16(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 24(%1);"
        "  movq 16(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
        "  adcx %%rax, %%r11;"
        "  movq %%r11, 32(%1);"
        "  adcx %%rcx, %%rbx;"
        "  movq %%rbx, 40(%1);"
        "  movq 24(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
        "  adcx %%rax, %%r13;"
        "  movq %%r13, 48(%1);"
        "  adcx %%rcx, %%r14;"
        "  movq %%r14, 56(%1);"

        /* Line up pointers */
        "  mov %1, %0;"
        "  mov %2, %1;"

        /* Wrap the result back into the field */

        /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
        "  mov $38, %%rdx;"
        "  mulxq 32(%0), %%r8, %%r13;"
        "  xor %%ecx, %%ecx;"
        "  adoxq 0(%0), %%r8;"
        "  mulxq 40(%0), %%r9, %%rbx;"
        "  adcx %%r13, %%r9;"
        "  adoxq 8(%0), %%r9;"
        "  mulxq 48(%0), %%r10, %%r13;"
        "  adcx %%rbx, %%r10;"
        "  adoxq 16(%0), %%r10;"
        "  mulxq 56(%0), %%r11, %%rax;"
        "  adcx %%r13, %%r11;"
        "  adoxq 24(%0), %%r11;"
        "  adcx %%rcx, %%rax;"
        "  adox %%rcx, %%rax;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %%rcx, %%r9;"
        "  movq %%r9, 8(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 16(%1);"
        "  adcx %%rcx, %%r11;"
        "  movq %%r11, 24(%1);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 0(%1);"
        : "+&r"(f), "+&r"(tmp)
        : "r"(out)
        : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
          "%r13", "%r14", "%r15", "memory", "cc");
}
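
/*
 * The squaring computes each off-diagonal product f[i]*f[j] (i < j) only
 * once in step 1, doubles that whole row with the "adcx r, r" chain of
 * step 2, and finally interleaves the diagonal squares f[i]^2 in step 3.
 */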

/* Computes two field squarings:
 *   out[0] <- f[0] * f[0]
 *   out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
    asm volatile(
        /* Step 1: Compute all partial products */
        "  movq 0(%0), %%rdx;" /* f[0] */
        "  mulxq 8(%0), %%r8, %%r14;"
        "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
        "  mulxq 16(%0), %%r9, %%r10;"
        "  adcx %%r14, %%r9;" /* f[2]*f[0] */
        "  mulxq 24(%0), %%rax, %%rcx;"
        "  adcx %%rax, %%r10;" /* f[3]*f[0] */
        "  movq 24(%0), %%rdx;" /* f[3] */
        "  mulxq 8(%0), %%r11, %%rbx;"
        "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
        "  mulxq 16(%0), %%rax, %%r13;"
        "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
        "  movq 8(%0), %%rdx;"
        "  adcx %%r15, %%r13;" /* f1 */
        "  mulxq 16(%0), %%rax, %%rcx;"
        "  mov $0, %%r14;" /* f[2]*f[1] */

        /* Step 2: Compute two parallel carry chains */
        "  xor %%r15d, %%r15d;"
        "  adox %%rax, %%r10;"
        "  adcx %%r8, %%r8;"
        "  adox %%rcx, %%r11;"
        "  adcx %%r9, %%r9;"
        "  adox %%r15, %%rbx;"
        "  adcx %%r10, %%r10;"
        "  adox %%r15, %%r13;"
        "  adcx %%r11, %%r11;"
        "  adox %%r15, %%r14;"
        "  adcx %%rbx, %%rbx;"
        "  adcx %%r13, %%r13;"
        "  adcx %%r14, %%r14;"

        /* Step 3: Compute intermediate squares */
        "  movq 0(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
        "  movq %%rax, 0(%1);"
        "  add %%rcx, %%r8;"
        "  movq %%r8, 8(%1);"
        "  movq 8(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
        "  adcx %%rax, %%r9;"
        "  movq %%r9, 16(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 24(%1);"
        "  movq 16(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
        "  adcx %%rax, %%r11;"
        "  movq %%r11, 32(%1);"
        "  adcx %%rcx, %%rbx;"
        "  movq %%rbx, 40(%1);"
        "  movq 24(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
        "  adcx %%rax, %%r13;"
        "  movq %%r13, 48(%1);"
        "  adcx %%rcx, %%r14;"
        "  movq %%r14, 56(%1);"

        /* Step 1: Compute all partial products */
        "  movq 32(%0), %%rdx;" /* f[0] */
        "  mulxq 40(%0), %%r8, %%r14;"
        "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
        "  mulxq 48(%0), %%r9, %%r10;"
        "  adcx %%r14, %%r9;" /* f[2]*f[0] */
        "  mulxq 56(%0), %%rax, %%rcx;"
        "  adcx %%rax, %%r10;" /* f[3]*f[0] */
        "  movq 56(%0), %%rdx;" /* f[3] */
        "  mulxq 40(%0), %%r11, %%rbx;"
        "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
        "  mulxq 48(%0), %%rax, %%r13;"
        "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
        "  movq 40(%0), %%rdx;"
        "  adcx %%r15, %%r13;" /* f1 */
        "  mulxq 48(%0), %%rax, %%rcx;"
        "  mov $0, %%r14;" /* f[2]*f[1] */

        /* Step 2: Compute two parallel carry chains */
        "  xor %%r15d, %%r15d;"
        "  adox %%rax, %%r10;"
        "  adcx %%r8, %%r8;"
        "  adox %%rcx, %%r11;"
        "  adcx %%r9, %%r9;"
        "  adox %%r15, %%rbx;"
        "  adcx %%r10, %%r10;"
        "  adox %%r15, %%r13;"
        "  adcx %%r11, %%r11;"
        "  adox %%r15, %%r14;"
        "  adcx %%rbx, %%rbx;"
        "  adcx %%r13, %%r13;"
        "  adcx %%r14, %%r14;"

        /* Step 3: Compute intermediate squares */
        "  movq 32(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
        "  movq %%rax, 64(%1);"
        "  add %%rcx, %%r8;"
        "  movq %%r8, 72(%1);"
        "  movq 40(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
        "  adcx %%rax, %%r9;"
        "  movq %%r9, 80(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 88(%1);"
        "  movq 48(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
        "  adcx %%rax, %%r11;"
        "  movq %%r11, 96(%1);"
        "  adcx %%rcx, %%rbx;"
        "  movq %%rbx, 104(%1);"
        "  movq 56(%0), %%rdx;"
        "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
        "  adcx %%rax, %%r13;"
        "  movq %%r13, 112(%1);"
        "  adcx %%rcx, %%r14;"
        "  movq %%r14, 120(%1);"

        /* Line up pointers */
        "  mov %1, %0;"
        "  mov %2, %1;"

        /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
        "  mov $38, %%rdx;"
        "  mulxq 32(%0), %%r8, %%r13;"
        "  xor %%ecx, %%ecx;"
        "  adoxq 0(%0), %%r8;"
        "  mulxq 40(%0), %%r9, %%rbx;"
        "  adcx %%r13, %%r9;"
        "  adoxq 8(%0), %%r9;"
        "  mulxq 48(%0), %%r10, %%r13;"
        "  adcx %%rbx, %%r10;"
        "  adoxq 16(%0), %%r10;"
        "  mulxq 56(%0), %%r11, %%rax;"
        "  adcx %%r13, %%r11;"
        "  adoxq 24(%0), %%r11;"
        "  adcx %%rcx, %%rax;"
        "  adox %%rcx, %%rax;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %%rcx, %%r9;"
        "  movq %%r9, 8(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 16(%1);"
        "  adcx %%rcx, %%r11;"
        "  movq %%r11, 24(%1);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 0(%1);"

        /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
        "  mov $38, %%rdx;"
        "  mulxq 96(%0), %%r8, %%r13;"
        "  xor %%ecx, %%ecx;"
        "  adoxq 64(%0), %%r8;"
        "  mulxq 104(%0), %%r9, %%rbx;"
        "  adcx %%r13, %%r9;"
        "  adoxq 72(%0), %%r9;"
        "  mulxq 112(%0), %%r10, %%r13;"
        "  adcx %%rbx, %%r10;"
        "  adoxq 80(%0), %%r10;"
        "  mulxq 120(%0), %%r11, %%rax;"
        "  adcx %%r13, %%r11;"
        "  adoxq 88(%0), %%r11;"
        "  adcx %%rcx, %%rax;"
        "  adox %%rcx, %%rax;"
        "  imul %%rdx, %%rax;"

        /* Step 2: Fold the carry back into dst */
        "  add %%rax, %%r8;"
        "  adcx %%rcx, %%r9;"
        "  movq %%r9, 40(%1);"
        "  adcx %%rcx, %%r10;"
        "  movq %%r10, 48(%1);"
        "  adcx %%rcx, %%r11;"
        "  movq %%r11, 56(%1);"

        /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
        "  mov $0, %%rax;"
        "  cmovc %%rdx, %%rax;"
        "  add %%rax, %%r8;"
        "  movq %%r8, 32(%1);"
        : "+&r"(f), "+&r"(tmp)
        : "r"(out)
        : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
          "%r13", "%r14", "%r15", "memory", "cc");
}

static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
    u64 *nq = p01_tmp1;
    u64 *nq_p1 = p01_tmp1 + (u32)8U;
    u64 *tmp1 = p01_tmp1 + (u32)16U;
    u64 *x1 = q;
    u64 *x2 = nq;
    u64 *z2 = nq + (u32)4U;
    u64 *z3 = nq_p1 + (u32)4U;
    u64 *a = tmp1;
    u64 *b = tmp1 + (u32)4U;
    u64 *ab = tmp1;
    u64 *dc = tmp1 + (u32)8U;
    u64 *x3;
    u64 *z31;
    u64 *d0;
    u64 *c0;
    u64 *a1;
    u64 *b1;
    u64 *d;
    u64 *c;
    u64 *ab1;
    u64 *dc1;
    fadd(a, x2, z2);
    fsub(b, x2, z2);
    x3 = nq_p1;
    z31 = nq_p1 + (u32)4U;
    d0 = dc;
    c0 = dc + (u32)4U;
    fadd(c0, x3, z31);
    fsub(d0, x3, z31);
    fmul2(dc, dc, ab, tmp2);
    fadd(x3, d0, c0);
    fsub(z31, d0, c0);
    a1 = tmp1;
    b1 = tmp1 + (u32)4U;
    d = tmp1 + (u32)8U;
    c = tmp1 + (u32)12U;
    ab1 = tmp1;
    dc1 = tmp1 + (u32)8U;
    fsqr2(dc1, ab1, tmp2);
    fsqr2(nq_p1, nq_p1, tmp2);
    a1[0U] = c[0U];
    a1[1U] = c[1U];
    a1[2U] = c[2U];
    a1[3U] = c[3U];
    fsub(c, d, c);
    fmul_scalar(b1, c, (u64)121665U);
    fadd(b1, b1, d);
    fmul2(nq, dc1, ab1, tmp2);
    fmul(z3, z3, x1, tmp2);
}
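
/*
 * This is the combined differential addition-and-doubling step of the
 * Montgomery ladder, in the shape of the RFC 7748 formulas: with
 * A = x2+z2, B = x2-z2, C = x3+z3 and D = x3-z3 it computes DA = D*A and
 * CB = C*B, then
 *   x3' = (DA+CB)^2,  z3' = x1*(DA-CB)^2,
 *   x2' = AA*BB,      z2' = E*(AA + a24*E),
 * where AA = A^2, BB = B^2, E = AA-BB and a24 = 121665 = (486662 - 2)/4.
 */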

static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
    u64 *x2 = nq;
    u64 *z2 = nq + (u32)4U;
    u64 *a = tmp1;
    u64 *b = tmp1 + (u32)4U;
    u64 *d = tmp1 + (u32)8U;
    u64 *c = tmp1 + (u32)12U;
    u64 *ab = tmp1;
    u64 *dc = tmp1 + (u32)8U;
    fadd(a, x2, z2);
    fsub(b, x2, z2);
    fsqr2(dc, ab, tmp2);
    a[0U] = c[0U];
    a[1U] = c[1U];
    a[2U] = c[2U];
    a[3U] = c[3U];
    fsub(c, d, c);
    fmul_scalar(b, c, (u64)121665U);
    fadd(b, b, d);
    fmul2(nq, dc, ab, tmp2);
}

static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
    u64 tmp2[16U] = { 0U };
    u64 p01_tmp1_swap[33U] = { 0U };
    u64 *p0 = p01_tmp1_swap;
    u64 *p01 = p01_tmp1_swap;
    u64 *p03 = p01;
    u64 *p11 = p01 + (u32)8U;
    u64 *x0;
    u64 *z0;
    u64 *p01_tmp1;
    u64 *p01_tmp11;
    u64 *nq10;
    u64 *nq_p11;
    u64 *swap1;
    u64 sw0;
    u64 *nq1;
    u64 *tmp1;
    memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
    x0 = p03;
    z0 = p03 + (u32)4U;
    x0[0U] = (u64)1U;
    x0[1U] = (u64)0U;
    x0[2U] = (u64)0U;
    x0[3U] = (u64)0U;
    z0[0U] = (u64)0U;
    z0[1U] = (u64)0U;
    z0[2U] = (u64)0U;
    z0[3U] = (u64)0U;
    p01_tmp1 = p01_tmp1_swap;
    p01_tmp11 = p01_tmp1_swap;
    nq10 = p01_tmp1_swap;
    nq_p11 = p01_tmp1_swap + (u32)8U;
    swap1 = p01_tmp1_swap + (u32)32U;
    cswap2((u64)1U, nq10, nq_p11);
    point_add_and_double(init1, p01_tmp11, tmp2);
    swap1[0U] = (u64)1U;
    {
        u32 i;
        for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
            u64 *p01_tmp12 = p01_tmp1_swap;
            u64 *swap2 = p01_tmp1_swap + (u32)32U;
            u64 *nq2 = p01_tmp12;
            u64 *nq_p12 = p01_tmp12 + (u32)8U;
            u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
            u64 sw = swap2[0U] ^ bit;
            cswap2(sw, nq2, nq_p12);
            point_add_and_double(init1, p01_tmp12, tmp2);
            swap2[0U] = bit;
        }
    }
    sw0 = swap1[0U];
    cswap2(sw0, nq10, nq_p11);
    nq1 = p01_tmp1;
    tmp1 = p01_tmp1 + (u32)16U;
    point_double(nq1, tmp1, tmp2);
    point_double(nq1, tmp1, tmp2);
    point_double(nq1, tmp1, tmp2);
    memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

    memzero_explicit(tmp2, sizeof(tmp2));
    memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
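
/*
 * Note how the fixed ladder structure performs the usual X25519 scalar
 * clamping implicitly, without modifying the key bytes: bit 255 is never
 * read, the unconditional cswap2/ladder step before the loop treats
 * bit 254 as set, the loop walks bits 253 down to 3, and the three
 * trailing point_double() calls process bits 2..0 as if they were clear.
 */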

static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
    u32 i;
    fsqr(o, inp, tmp);
    for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
        fsqr(o, o, tmp);
}

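/*
 * Computes o <- i^(p-2) mod p = i^(2^255 - 21), the field inverse via
 * Fermat's little theorem, using a fixed addition chain of 254 squarings
 * and 11 multiplications.
 */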
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
    u64 t1[16U] = { 0U };
    u64 *a0 = t1;
    u64 *b = t1 + (u32)4U;
    u64 *c = t1 + (u32)8U;
    u64 *t00 = t1 + (u32)12U;
    u64 *tmp1 = tmp;
    u64 *a;
    u64 *t0;
    fsquare_times(a0, i, tmp1, (u32)1U);
    fsquare_times(t00, a0, tmp1, (u32)2U);
    fmul(b, t00, i, tmp);
    fmul(a0, b, a0, tmp);
    fsquare_times(t00, a0, tmp1, (u32)1U);
    fmul(b, t00, b, tmp);
    fsquare_times(t00, b, tmp1, (u32)5U);
    fmul(b, t00, b, tmp);
    fsquare_times(t00, b, tmp1, (u32)10U);
    fmul(c, t00, b, tmp);
    fsquare_times(t00, c, tmp1, (u32)20U);
    fmul(t00, t00, c, tmp);
    fsquare_times(t00, t00, tmp1, (u32)10U);
    fmul(b, t00, b, tmp);
    fsquare_times(t00, b, tmp1, (u32)50U);
    fmul(c, t00, b, tmp);
    fsquare_times(t00, c, tmp1, (u32)100U);
    fmul(t00, t00, c, tmp);
    fsquare_times(t00, t00, tmp1, (u32)50U);
    fmul(t00, t00, b, tmp);
    fsquare_times(t00, t00, tmp1, (u32)5U);
    a = t1;
    t0 = t1 + (u32)12U;
    fmul(o, t0, a, tmp);
}

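/*
 * Fully reduces f to its canonical representative below p and stores it:
 * the top bit is folded in twice as 19 (using 2^255 = p + 19), after which
 * f < 2^255, and a final masked subtraction removes one more p when f >= p,
 * with the limb pattern tested in constant time via gte_mask()/eq_mask().
 */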
static void store_felem(u64 *b, u64 *f)
{
    u64 f30 = f[3U];
    u64 top_bit0 = f30 >> (u32)63U;
    u64 f31;
    u64 top_bit;
    u64 f0;
    u64 f1;
    u64 f2;
    u64 f3;
    u64 m0;
    u64 m1;
    u64 m2;
    u64 m3;
    u64 mask;
    u64 f0_;
    u64 f1_;
    u64 f2_;
    u64 f3_;
    u64 o0;
    u64 o1;
    u64 o2;
    u64 o3;
    f[3U] = f30 & (u64)0x7fffffffffffffffU;
    add_scalar(f, f, (u64)19U * top_bit0);
    f31 = f[3U];
    top_bit = f31 >> (u32)63U;
    f[3U] = f31 & (u64)0x7fffffffffffffffU;
    add_scalar(f, f, (u64)19U * top_bit);
    f0 = f[0U];
    f1 = f[1U];
    f2 = f[2U];
    f3 = f[3U];
    m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
    m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
    m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
    m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
    mask = ((m0 & m1) & m2) & m3;
    f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
    f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
    f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
    f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
    o0 = f0_;
    o1 = f1_;
    o2 = f2_;
    o3 = f3_;
    b[0U] = o0;
    b[1U] = o1;
    b[2U] = o2;
    b[3U] = o3;
}

static void encode_point(u8 *o, const u64 *i)
{
    const u64 *x = i;
    const u64 *z = i + (u32)4U;
    u64 tmp[4U] = { 0U };
    u64 tmp_w[16U] = { 0U };
    finv(tmp, z, tmp_w);
    fmul(tmp, tmp, x, tmp_w);
    store_felem((u64 *)o, tmp);
}

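/*
 * X25519 on raw byte strings: the public-key u-coordinate is loaded as four
 * little-endian limbs (the unaligned u64 loads are fine on x86) and its top
 * bit is masked off per RFC 7748; the ladder then starts from the pair
 * (1:0), (u:1), and the result is normalized to affine form by encode_point().
 */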
static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
    u64 init1[8U] = { 0U };
    u64 tmp[4U] = { 0U };
    u64 tmp3;
    u64 *x;
    u64 *z;
    {
        u32 i;
        for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
            u64 *os = tmp;
            const u8 *bj = pub + i * (u32)8U;
            u64 u = *(u64 *)bj;
            u64 r = u;
            u64 x0 = r;
            os[i] = x0;
        }
    }
    tmp3 = tmp[3U];
    tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
    x = init1;
    z = init1 + (u32)4U;
    z[0U] = (u64)1U;
    z[1U] = (u64)0U;
    z[2U] = (u64)0U;
    z[3U] = (u64)0U;
    x[0U] = tmp[0U];
    x[1U] = tmp[1U];
    x[2U] = tmp[2U];
    x[3U] = tmp[3U];
    montgomery_ladder(init1, priv, init1);
    encode_point(out, init1);
}

/* The below constants were generated using this sage script:
 *
 * #!/usr/bin/env sage
 * import sys
 * from sage.all import *
 * def limbs(n):
 *  n = int(n)
 *  l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
 *  return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
 * print("static const u64 table_ladder[] = {")
 * p = ec.lift_x(9)
 * for i in range(252):
 *  l = (p[0] + p[2]) / (p[0] - p[2])
 *  print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
 *  p = p * 2
 * print("};")
 *
 */

static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };

static const u64 table_ladder[] = {
    0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
    0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
    0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
    0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
    0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
    0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
    0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
    0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
    0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
    0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
    0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
    0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
    0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
    0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
    0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
    0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
    0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
    0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
    0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
    0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
    0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
    0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
    0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
    0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
    0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
    0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
    0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
    0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
    0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
    0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
    0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
    0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
    0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
    0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
    0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
    0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
    0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
    0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
    0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
    0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
    0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
    0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
    0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
    0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
    0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
    0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
    0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
    0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
    0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
    0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
    0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
    0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
    0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
    0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
    0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
    0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
    0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
    0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
    0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
    0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
    0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
    0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
    0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
    0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
    0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
    0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
    0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
    0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
    0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
    0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
    0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
    0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
    0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
    0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
    0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
    0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
    0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
    0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
    0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
    0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
    0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
    0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
    0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
    0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
    0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
    0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
    0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
    0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
    0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
    0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
    0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
    0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
    0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
    0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
    0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
    0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
    0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
    0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
    0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
    0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
    0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
    0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
    0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
    0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
    0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
    0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
    0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
    0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
    0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
    0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
    0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
    0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
    0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
    0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
    0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
    0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
    0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
    0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
    0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
    0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
    0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
    0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
    0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
    0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
    0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
    0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
    0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
    0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
    0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
    0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
    0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
    0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
    0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
    0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
    0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
    0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
    0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
    0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
    0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
    0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
    0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
    0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
    0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
    0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
    0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
    0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
    0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
    0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
    0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
    0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
    0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
    0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
    0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
    0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
    0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
    0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
    0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
1438     0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
1439     0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
1440     0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
1441     0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
1442     0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
1443     0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
1444     0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
1445     0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
1446     0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
1447     0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
1448     0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
1449     0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
1450     0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
1451     0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
1452     0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
1453     0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
1454     0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
1455     0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
1456     0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
1457     0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
1458     0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
1459     0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
1460     0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
1461     0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
1462     0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
1463     0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
1464     0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
1465     0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
1466     0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
1467     0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
1468     0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
1469     0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
1470     0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
1471     0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
1472     0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
1473     0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
1474     0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
1475     0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
1476     0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
1477     0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
1478     0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
1479     0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
1480     0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
1481     0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
1482     0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
1483     0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
1484     0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
1485     0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
1486     0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
1487     0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1488     0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1489     0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1490     0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1491     0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1492     0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1493     0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1494     0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1495     0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1496     0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1497     0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1498     0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1499     0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1500     0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1501     0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1502     0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1503     0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1504     0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1505     0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1506     0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1507     0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1508     0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1509     0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1510     0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1511     0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1512     0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1513     0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1514     0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1515     0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1516     0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1517     0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1518     0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1519     0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1520     0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1521     0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1522     0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1523     0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1524     0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1525     0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1526     0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1527     0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1528     0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1529     0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1530     0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1531     0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1532     0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1533 };
1534 
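/*
 * Layout note (added for clarity): table_ladder appears to hold one
 * precomputed field element per processed scalar bit, stored as four
 * 64-bit limbs in little-endian limb order, which is why the ladder in
 * curve25519_ever64_base() below indexes it as &table_ladder[4 * k]
 * for bit number k.
 */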
1535 static void curve25519_ever64_base(u8 *out, const u8 *priv)
1536 {
1537     u64 swap = 1;
1538     int i, j, k;
1539     u64 tmp[16 + 32 + 4];
1540     u64 *x1 = &tmp[0];
1541     u64 *z1 = &tmp[4];
1542     u64 *x2 = &tmp[8];
1543     u64 *z2 = &tmp[12];
1544     u64 *xz1 = &tmp[0];
1545     u64 *xz2 = &tmp[8];
1546     u64 *a = &tmp[0 + 16];
1547     u64 *b = &tmp[4 + 16];
1548     u64 *c = &tmp[8 + 16];
1549     u64 *ab = &tmp[0 + 16];
1550     u64 *abcd = &tmp[0 + 16];
1551     u64 *ef = &tmp[16 + 16];
1552     u64 *efgh = &tmp[16 + 16];
1553     u64 *key = &tmp[0 + 16 + 32];
1554 
1555     memcpy(key, priv, 32);
1556     ((u8 *)key)[0] &= 248;	/* clear bits 0-2: scalar is a multiple of the cofactor 8 */
1557     ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;	/* clear bit 255, set bit 254 */
1558 
1559     x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
1560     z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
1561     z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
1562     memcpy(x2, p_minus_s, sizeof(p_minus_s));
1563 
1564     j = 3;	/* bits 0-2 of the clamped key are always zero */
1565     for (i = 0; i < 4; ++i) {
1566         while (j < (const int[]){ 64, 64, 64, 63 }[i]) {	/* bit 255 is cleared by clamping */
1567             u64 bit = (key[i] >> j) & 1;
1568             k = (64 * i + j - 3);
1569             swap = swap ^ bit;	/* swap only when this bit differs from the last */
1570             cswap2(swap, xz1, xz2);	/* constant-time conditional swap */
1571             swap = bit;
1572             fsub(b, x1, z1);
1573             fadd(a, x1, z1);
1574             fmul(c, &table_ladder[4 * k], b, ef);	/* fold in the precomputed constant for bit k */
1575             fsub(b, a, c);
1576             fadd(a, a, c);
1577             fsqr2(ab, ab, efgh);
1578             fmul2(xz1, xz2, ab, efgh);
1579             ++j;
1580         }
1581         j = 0;
1582     }
1583 
1584     point_double(xz1, abcd, efgh);
1585     point_double(xz1, abcd, efgh);
1586     point_double(xz1, abcd, efgh);
1587     encode_point(out, xz1);
1588 
1589     memzero_explicit(tmp, sizeof(tmp));
1590 }
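/*
 * For reference: the two masking statements at the top of the function
 * above implement the scalar clamping from RFC 7748, section 5.  A
 * minimal standalone sketch of the same operation (the helper name is
 * illustrative only, not part of the kernel API):
 */
static inline void curve25519_clamp_sketch(u8 key[CURVE25519_KEY_SIZE])
{
	key[0] &= 248;		/* clear bits 0-2: scalar becomes a multiple of 8 */
	key[31] &= 127;		/* clear bit 255 */
	key[31] |= 64;		/* set bit 254, fixing the scalar's top bit */
}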
1591 
1592 static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
1593 
1594 void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
1595              const u8 secret[CURVE25519_KEY_SIZE],
1596              const u8 basepoint[CURVE25519_KEY_SIZE])
1597 {
1598     if (static_branch_likely(&curve25519_use_bmi2_adx))
1599         curve25519_ever64(mypublic, secret, basepoint);
1600     else
1601         curve25519_generic(mypublic, secret, basepoint);
1602 }
1603 EXPORT_SYMBOL(curve25519_arch);
1604 
1605 void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
1606               const u8 secret[CURVE25519_KEY_SIZE])
1607 {
1608     if (static_branch_likely(&curve25519_use_bmi2_adx))
1609         curve25519_ever64_base(pub, secret);
1610     else
1611         curve25519_generic(pub, secret, curve25519_base_point);
1612 }
1613 EXPORT_SYMBOL(curve25519_base_arch);
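/*
 * Usage sketch (illustrative only, never called here): a complete
 * Diffie-Hellman exchange driven through the two exported helpers.
 * Both sides end up with the same 32 shared bytes.
 */
static void __maybe_unused curve25519_exchange_sketch(void)
{
	u8 a_secret[CURVE25519_KEY_SIZE], a_public[CURVE25519_KEY_SIZE];
	u8 b_secret[CURVE25519_KEY_SIZE], b_public[CURVE25519_KEY_SIZE];
	u8 a_shared[CURVE25519_KEY_SIZE], b_shared[CURVE25519_KEY_SIZE];

	curve25519_generate_secret(a_secret);	/* random, pre-clamped secret */
	curve25519_generate_secret(b_secret);

	curve25519_base_arch(a_public, a_secret);	/* public = secret * basepoint */
	curve25519_base_arch(b_public, b_secret);

	curve25519_arch(a_shared, a_secret, b_public);
	curve25519_arch(b_shared, b_secret, a_public);

	/* a_shared and b_shared now contain identical bytes. */
}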
1614 
1615 static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
1616                  unsigned int len)
1617 {
1618     u8 *secret = kpp_tfm_ctx(tfm);
1619 
1620     if (!len)
1621         curve25519_generate_secret(secret);
1622     else if (len == CURVE25519_KEY_SIZE &&
1623          crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
1624         memcpy(secret, buf, CURVE25519_KEY_SIZE);
1625     else
1626         return -EINVAL;
1627     return 0;
1628 }
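/*
 * Semantics worth noting: a zero-length buffer asks the driver to
 * generate a fresh random secret, an explicit key must be exactly
 * CURVE25519_KEY_SIZE bytes and must differ from curve25519_null_point
 * (the all-zero value), and anything else is rejected with -EINVAL.
 */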
1629 
1630 static int curve25519_generate_public_key(struct kpp_request *req)
1631 {
1632     struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1633     const u8 *secret = kpp_tfm_ctx(tfm);
1634     u8 buf[CURVE25519_KEY_SIZE];
1635     int copied, nbytes;
1636 
1637     if (req->src)
1638         return -EINVAL;
1639 
1640     curve25519_base_arch(buf, secret);
1641 
1642     /* the caller may request fewer bytes than we computed */
1643     nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1644     copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1645                                 nbytes),
1646                      buf, nbytes);
1647     if (copied != nbytes)
1648         return -EINVAL;
1649     return 0;
1650 }
1651 
1652 static int curve25519_compute_shared_secret(struct kpp_request *req)
1653 {
1654     struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1655     const u8 *secret = kpp_tfm_ctx(tfm);
1656     u8 public_key[CURVE25519_KEY_SIZE];
1657     u8 buf[CURVE25519_KEY_SIZE];
1658     int copied, nbytes;
1659 
1660     if (!req->src)
1661         return -EINVAL;
1662 
1663     copied = sg_copy_to_buffer(req->src,
1664                    sg_nents_for_len(req->src,
1665                             CURVE25519_KEY_SIZE),
1666                    public_key, CURVE25519_KEY_SIZE);
1667     if (copied != CURVE25519_KEY_SIZE)
1668         return -EINVAL;
1669 
1670     curve25519_arch(buf, secret, public_key);
1671 
1672     /* the caller may request fewer bytes than we computed */
1673     nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1674     copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1675                                 nbytes),
1676                      buf, nbytes);
1677     if (copied != nbytes)
1678         return -EINVAL;
1679     return 0;
1680 }
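/*
 * How a kernel consumer might drive this driver through the generic kpp
 * API (a hedged sketch: the function name is illustrative, error paths
 * are compressed, and the three buffers are assumed to live in linear,
 * kmalloc'd memory so they can be wrapped in scatterlists).  No request
 * callback is set because this software implementation completes
 * synchronously.
 */
static int __maybe_unused curve25519_kpp_sketch(const u8 *secret,
						const u8 *peer_public,
						u8 *shared)
{
	struct crypto_kpp *tfm;
	struct kpp_request *req;
	struct scatterlist src, dst;
	int err;

	tfm = crypto_alloc_kpp("curve25519", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_kpp_set_secret(tfm, secret, CURVE25519_KEY_SIZE);
	if (err)
		goto free_tfm;

	req = kpp_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto free_tfm;
	}

	sg_init_one(&src, peer_public, CURVE25519_KEY_SIZE);
	sg_init_one(&dst, shared, CURVE25519_KEY_SIZE);
	kpp_request_set_input(req, &src, CURVE25519_KEY_SIZE);
	kpp_request_set_output(req, &dst, CURVE25519_KEY_SIZE);

	err = crypto_kpp_compute_shared_secret(req);

	kpp_request_free(req);
free_tfm:
	crypto_free_kpp(tfm);
	return err;
}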
1681 
1682 static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
1683 {
1684     return CURVE25519_KEY_SIZE;
1685 }
1686 
1687 static struct kpp_alg curve25519_alg = {
1688     .base.cra_name      = "curve25519",
1689     .base.cra_driver_name   = "curve25519-x86",
1690     .base.cra_priority  = 200,
1691     .base.cra_module    = THIS_MODULE,
1692     .base.cra_ctxsize   = CURVE25519_KEY_SIZE,
1693 
1694     .set_secret     = curve25519_set_secret,
1695     .generate_public_key    = curve25519_generate_public_key,
1696     .compute_shared_secret  = curve25519_compute_shared_secret,
1697     .max_size       = curve25519_max_size,
1698 };
1699 
1701 static int __init curve25519_mod_init(void)
1702 {
1703     if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
1704         static_branch_enable(&curve25519_use_bmi2_adx);
1705     else
1706         return 0;
1707     return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
1708         crypto_register_kpp(&curve25519_alg) : 0;
1709 }
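/*
 * Note that when the CPU lacks BMI2 or ADX, the static key stays
 * disabled and no kpp algorithm is registered: curve25519_arch() and
 * curve25519_base_arch() above remain usable and simply fall back to
 * curve25519_generic().
 */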
1710 
1711 static void __exit curve25519_mod_exit(void)
1712 {
1713     if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
1714         static_branch_likely(&curve25519_use_bmi2_adx))
1715         crypto_unregister_kpp(&curve25519_alg);
1716 }
1717 
1718 module_init(curve25519_mod_init);
1719 module_exit(curve25519_mod_exit);
1720 
1721 MODULE_ALIAS_CRYPTO("curve25519");
1722 MODULE_ALIAS_CRYPTO("curve25519-x86");
1723 MODULE_LICENSE("GPL v2");
1724 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");