//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//  Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

    .text
    .arch       armv8-a+crypto

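    // Argument registers, as passed in by the AAPCS64 calling convention
    // for the C prototypes at the bottom of this file: init_crc in w0,
    // buf in x1, len in x2.  fold_consts_ptr lives in x3, a caller-clobbered
    // temporary.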
    init_crc    .req    w0
    buf     .req    x1
    len     .req    x2
    fold_consts_ptr .req    x3

    fold_consts .req    v10

    ad      .req    v14

    k00_16      .req    v15
    k32_48      .req    v16

    t3      .req    v17
    t4      .req    v18
    t5      .req    v19
    t6      .req    v20
    t7      .req    v21
    t8      .req    v22
    t9      .req    v23

    perm1       .req    v24
    perm2       .req    v25
    perm3       .req    v26
    perm4       .req    v27

    bd1     .req    v28
    bd2     .req    v29
    bd3     .req    v30
    bd4     .req    v31

    .macro      __pmull_init_p64
    .endm

    .macro      __pmull_pre_p64, bd
    .endm

    .macro      __pmull_init_p8
    // k00_16 := 0x0000000000000000_000000000000ffff
    // k32_48 := 0x00000000ffffffff_0000ffffffffffff
    movi        k32_48.2d, #0xffffffff
    mov     k32_48.h[2], k32_48.h[0]
    ushr        k00_16.2d, k32_48.2d, #32

    // prepare the permutation vectors
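    // perm1..perm4 are TBL index vectors that rotate each 64-bit half of a
    // register by 1..4 byte positions; __pmull_pre_p8 applies them to the
    // constant operand to build the shifted copies bd1..bd4 consumed by
    // __pmull_p8_core.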
    mov_q       x5, 0x080f0e0d0c0b0a09
    movi        perm4.8b, #8
    dup     perm1.2d, x5
    eor     perm1.16b, perm1.16b, perm4.16b
    ushr        perm2.2d, perm1.2d, #8
    ushr        perm3.2d, perm1.2d, #16
    ushr        perm4.2d, perm1.2d, #24
    sli     perm2.2d, perm1.2d, #56
    sli     perm3.2d, perm1.2d, #48
    sli     perm4.2d, perm1.2d, #40
    .endm

    .macro      __pmull_pre_p8, bd
    tbl     bd1.16b, {\bd\().16b}, perm1.16b
    tbl     bd2.16b, {\bd\().16b}, perm2.16b
    tbl     bd3.16b, {\bd\().16b}, perm3.16b
    tbl     bd4.16b, {\bd\().16b}, perm4.16b
    .endm

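// Carryless-multiply helper for CPUs without 64-bit PMULL: it computes the
// cross products of 8-bit PMULL partial results for the operands in ad and
// fold_consts (via the pre-rotated copies bd1-bd4), then masks, realigns and
// XORs them so that the __pmull_p8 macro only has to add the A*B base
// product.  The .L__pmull_p8_core entry handles the low 64-bit halves,
// .L__pmull_p8_core2 the high halves (PMULL2 forms).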
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
    ext     t4.8b, ad.8b, ad.8b, #1         // A1
    ext     t5.8b, ad.8b, ad.8b, #2         // A2
    ext     t6.8b, ad.8b, ad.8b, #3         // A3

    pmull       t4.8h, t4.8b, fold_consts.8b        // F = A1*B
    pmull       t8.8h, ad.8b, bd1.8b            // E = A*B1
    pmull       t5.8h, t5.8b, fold_consts.8b        // H = A2*B
    pmull       t7.8h, ad.8b, bd2.8b            // G = A*B2
    pmull       t6.8h, t6.8b, fold_consts.8b        // J = A3*B
    pmull       t9.8h, ad.8b, bd3.8b            // I = A*B3
    pmull       t3.8h, ad.8b, bd4.8b            // K = A*B4
    b       0f

.L__pmull_p8_core2:
    tbl     t4.16b, {ad.16b}, perm1.16b     // A1
    tbl     t5.16b, {ad.16b}, perm2.16b     // A2
    tbl     t6.16b, {ad.16b}, perm3.16b     // A3

    pmull2      t4.8h, t4.16b, fold_consts.16b      // F = A1*B
    pmull2      t8.8h, ad.16b, bd1.16b          // E = A*B1
    pmull2      t5.8h, t5.16b, fold_consts.16b      // H = A2*B
    pmull2      t7.8h, ad.16b, bd2.16b          // G = A*B2
    pmull2      t6.8h, t6.16b, fold_consts.16b      // J = A3*B
    pmull2      t9.8h, ad.16b, bd3.16b          // I = A*B3
    pmull2      t3.8h, ad.16b, bd4.16b          // K = A*B4

0:  eor     t4.16b, t4.16b, t8.16b          // L = E + F
    eor     t5.16b, t5.16b, t7.16b          // M = G + H
    eor     t6.16b, t6.16b, t9.16b          // N = I + J

    uzp1        t8.2d, t4.2d, t5.2d
    uzp2        t4.2d, t4.2d, t5.2d
    uzp1        t7.2d, t6.2d, t3.2d
    uzp2        t6.2d, t6.2d, t3.2d

    // t4 = (L) (P0 + P1) << 8
    // t5 = (M) (P2 + P3) << 16
    eor     t8.16b, t8.16b, t4.16b
    and     t4.16b, t4.16b, k32_48.16b

    // t6 = (N) (P4 + P5) << 24
    // t7 = (K) (P6 + P7) << 32
    eor     t7.16b, t7.16b, t6.16b
    and     t6.16b, t6.16b, k00_16.16b

    eor     t8.16b, t8.16b, t4.16b
    eor     t7.16b, t7.16b, t6.16b

    zip2        t5.2d, t8.2d, t4.2d
    zip1        t4.2d, t8.2d, t4.2d
    zip2        t3.2d, t7.2d, t6.2d
    zip1        t6.2d, t7.2d, t6.2d

    ext     t4.16b, t4.16b, t4.16b, #15
    ext     t5.16b, t5.16b, t5.16b, #14
    ext     t6.16b, t6.16b, t6.16b, #13
    ext     t3.16b, t3.16b, t3.16b, #12

    eor     t4.16b, t4.16b, t5.16b
    eor     t6.16b, t6.16b, t3.16b
    ret
SYM_FUNC_END(__pmull_p8_core)

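    // Emulated 64x64 -> 128-bit carryless multiply: rq := ad * bd.  The
    // second operand must be fold_consts, because only its pre-rotated
    // copies bd1-bd4 (set up by __pmull_pre_p8) are available to
    // __pmull_p8_core.  Pass i=2 to multiply the high 64-bit halves instead
    // of the low ones.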
    .macro      __pmull_p8, rq, ad, bd, i
    .ifnc       \bd, fold_consts
    .err
    .endif
    mov     ad.16b, \ad\().16b
    .ifb        \i
    pmull       \rq\().8h, \ad\().8b, \bd\().8b     // D = A*B
    .else
    pmull2      \rq\().8h, \ad\().16b, \bd\().16b   // D = A*B
    .endif

    bl      .L__pmull_p8_core\i

    eor     \rq\().16b, \rq\().16b, t4.16b
    eor     \rq\().16b, \rq\().16b, t6.16b
    .endm

    // Fold reg1, reg2 into the next 32 data bytes, storing the result back
    // into reg1, reg2.
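    // Each 16-byte register is kept congruent (mod G) to the part of the
    // message it has absorbed.  Its old contents sit 128 bytes (1024 bits)
    // ahead of the newly loaded data, so folding multiplies them by x^1024:
    //   new = old_hi * (x^(8*128+64) mod G) + old_lo * (x^(8*128) mod G) + data
    // using the constant pair from .Lfold_across_128_bytes_consts.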
    .macro      fold_32_bytes, p, reg1, reg2
    ldp     q11, q12, [buf], #0x20

    __pmull_\p  v8, \reg1, fold_consts, 2
    __pmull_\p  \reg1, \reg1, fold_consts

CPU_LE( rev64       v11.16b, v11.16b        )
CPU_LE( rev64       v12.16b, v12.16b        )

    __pmull_\p  v9, \reg2, fold_consts, 2
    __pmull_\p  \reg2, \reg2, fold_consts

CPU_LE( ext     v11.16b, v11.16b, v11.16b, #8   )
CPU_LE( ext     v12.16b, v12.16b, v12.16b, #8   )

    eor     \reg1\().16b, \reg1\().16b, v8.16b
    eor     \reg2\().16b, \reg2\().16b, v9.16b
    eor     \reg1\().16b, \reg1\().16b, v11.16b
    eor     \reg2\().16b, \reg2\().16b, v12.16b
    .endm

    // Fold src_reg into dst_reg, optionally loading the next fold constants
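    // (i.e. dst_reg += src_lo * consts.d[0] + src_hi * consts.d[1], folding
    // src_reg forward by the distance encoded in the current constant pair).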
    .macro      fold_16_bytes, p, src_reg, dst_reg, load_next_consts
    __pmull_\p  v8, \src_reg, fold_consts
    __pmull_\p  \src_reg, \src_reg, fold_consts, 2
    .ifnb       \load_next_consts
    ld1     {fold_consts.2d}, [fold_consts_ptr], #16
    __pmull_pre_\p  fold_consts
    .endif
    eor     \dst_reg\().16b, \dst_reg\().16b, v8.16b
    eor     \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
    .endm

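    // With the Crypto Extensions, a 64x64 -> 128-bit carryless multiply is a
    // single PMULL (low halves) or PMULL2 (high halves) instruction, so the
    // p64 init/pre macros above are empty and no fixup is needed here.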
    .macro      __pmull_p64, rd, rn, rm, n
    .ifb        \n
    pmull       \rd\().1q, \rn\().1d, \rm\().1d
    .else
    pmull2      \rd\().1q, \rn\().2d, \rm\().2d
    .endif
    .endm

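    // Main CRC-T10DIF body, parameterised by the multiply implementation \p
    // (p64 or p8).  Outline: fold 128 bytes per iteration across v0-v7,
    // fold those eight registers down into v7, fold 16 bytes at a time,
    // handle any 1..15 byte tail, and finally reduce the 128-bit remainder
    // in v7 to the 16-bit CRC.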
    .macro      crc_t10dif_pmull, p
    __pmull_init_\p

    // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
    cmp     len, #256
    b.lt        .Lless_than_256_bytes_\@

    adr_l       fold_consts_ptr, .Lfold_across_128_bytes_consts

    // Load the first 128 data bytes.  Byte swapping is necessary to make
    // the bit order match the polynomial coefficient order.
    ldp     q0, q1, [buf]
    ldp     q2, q3, [buf, #0x20]
    ldp     q4, q5, [buf, #0x40]
    ldp     q6, q7, [buf, #0x60]
    add     buf, buf, #0x80
CPU_LE( rev64       v0.16b, v0.16b          )
CPU_LE( rev64       v1.16b, v1.16b          )
CPU_LE( rev64       v2.16b, v2.16b          )
CPU_LE( rev64       v3.16b, v3.16b          )
CPU_LE( rev64       v4.16b, v4.16b          )
CPU_LE( rev64       v5.16b, v5.16b          )
CPU_LE( rev64       v6.16b, v6.16b          )
CPU_LE( rev64       v7.16b, v7.16b          )
CPU_LE( ext     v0.16b, v0.16b, v0.16b, #8  )
CPU_LE( ext     v1.16b, v1.16b, v1.16b, #8  )
CPU_LE( ext     v2.16b, v2.16b, v2.16b, #8  )
CPU_LE( ext     v3.16b, v3.16b, v3.16b, #8  )
CPU_LE( ext     v4.16b, v4.16b, v4.16b, #8  )
CPU_LE( ext     v5.16b, v5.16b, v5.16b, #8  )
CPU_LE( ext     v6.16b, v6.16b, v6.16b, #8  )
CPU_LE( ext     v7.16b, v7.16b, v7.16b, #8  )

    // XOR the first 16 data *bits* with the initial CRC value.
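    // After the byte reversal above, the highest 16-bit lane of v0 (h[7])
    // holds the first two message bytes, which is where the CRC state of
    // the preceding data belongs.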
    movi        v8.16b, #0
    mov     v8.h[7], init_crc
    eor     v0.16b, v0.16b, v8.16b

    // Load the constants for folding across 128 bytes.
    ld1     {fold_consts.2d}, [fold_consts_ptr]
    __pmull_pre_\p  fold_consts

    // Subtract 128 for the 128 data bytes just consumed.  Subtract another
    // 128 to simplify the termination condition of the following loop.
    sub     len, len, #256

    // While >= 128 data bytes remain (not counting v0-v7), fold the 128
    // bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
    fold_32_bytes   \p, v0, v1
    fold_32_bytes   \p, v2, v3
    fold_32_bytes   \p, v4, v5
    fold_32_bytes   \p, v6, v7

    subs        len, len, #128
    b.ge        .Lfold_128_bytes_loop_\@

    // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

    // Fold across 64 bytes.
    add     fold_consts_ptr, fold_consts_ptr, #16
    ld1     {fold_consts.2d}, [fold_consts_ptr], #16
    __pmull_pre_\p  fold_consts
    fold_16_bytes   \p, v0, v4
    fold_16_bytes   \p, v1, v5
    fold_16_bytes   \p, v2, v6
    fold_16_bytes   \p, v3, v7, 1
    // Fold across 32 bytes.
    fold_16_bytes   \p, v4, v6
    fold_16_bytes   \p, v5, v7, 1
    // Fold across 16 bytes.
    fold_16_bytes   \p, v6, v7

    // Add 128 to get the correct number of data bytes remaining in 0...127
    // (not counting v7), following the previous extra subtraction by 128.
    // Then subtract 16 to simplify the termination condition of the
    // following loop.
    adds        len, len, #(128-16)

    // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
    // into them, storing the result back into v7.
    b.lt        .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
    __pmull_\p  v8, v7, fold_consts
    __pmull_\p  v7, v7, fold_consts, 2
    eor     v7.16b, v7.16b, v8.16b
    ldr     q0, [buf], #16
CPU_LE( rev64       v0.16b, v0.16b          )
CPU_LE( ext     v0.16b, v0.16b, v0.16b, #8  )
    eor     v7.16b, v7.16b, v0.16b
    subs        len, len, #16
    b.ge        .Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
    // Add 16 to get the correct number of data bytes remaining in 0...15
    // (not counting v7), following the previous extra subtraction by 16.
    adds        len, len, #16
    b.eq        .Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
    // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
    // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
    // do this without needing a fold constant for each possible 'len',
    // redivide the bytes into a first chunk of 'len' bytes and a second
    // chunk of 16 bytes, then fold the first chunk into the second.

    // v0 = last 16 original data bytes
    add     buf, buf, len
    ldr     q0, [buf, #-16]
CPU_LE( rev64       v0.16b, v0.16b          )
CPU_LE( ext     v0.16b, v0.16b, v0.16b, #8  )

    // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
    adr_l       x4, .Lbyteshift_table + 16
    sub     x4, x4, len
    ld1     {v2.16b}, [x4]
    tbl     v1.16b, {v7.16b}, v2.16b

    // v3 = first chunk: v7 right-shifted by '16-len' bytes.
    movi        v3.16b, #0x80
    eor     v2.16b, v2.16b, v3.16b
    tbl     v3.16b, {v7.16b}, v2.16b

    // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
    sshr        v2.16b, v2.16b, #7

    // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
    // then '16-len' bytes from v1 (high-order bytes).
    bsl     v2.16b, v1.16b, v0.16b

    // Fold the first chunk into the second chunk, storing the result in v7.
    __pmull_\p  v0, v3, fold_consts
    __pmull_\p  v7, v3, fold_consts, 2
    eor     v7.16b, v7.16b, v0.16b
    eor     v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
    // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

    movi        v2.16b, #0      // init zero register

    // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
    ld1     {fold_consts.2d}, [fold_consts_ptr], #16
    __pmull_pre_\p  fold_consts

    // Fold the high 64 bits into the low 64 bits, while also multiplying by
    // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
    // whose low 48 bits are 0.
    ext     v0.16b, v2.16b, v7.16b, #8
    __pmull_\p  v7, v7, fold_consts, 2  // high bits * x^48 * (x^80 mod G(x))
    eor     v0.16b, v0.16b, v7.16b  // + low bits * x^64

    // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
    // value congruent to x^64 * M(x) and whose low 48 bits are 0.
    ext     v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
    mov     v0.s[3], v2.s[0]    // zero high 32 bits
    __pmull_\p  v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
    eor     v0.16b, v0.16b, v1.16b  // + low bits

    // Load G(x) and floor(x^48 / G(x)).
    ld1     {fold_consts.2d}, [fold_consts_ptr]
    __pmull_pre_\p  fold_consts

    // Use Barrett reduction to compute the final CRC value.
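    // Multiplying the high 32 bits by the precomputed floor(x^48 / G(x))
    // estimates the quotient of the division by G(x); multiplying that
    // estimate by G(x) and XORing it back in cancels everything above the
    // 16-bit remainder, so no polynomial division is needed.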
    __pmull_\p  v1, v0, fold_consts, 2  // high 32 bits * floor(x^48 / G(x))
    ushr        v1.2d, v1.2d, #32   // /= x^32
    __pmull_\p  v1, v1, fold_consts // *= G(x)
    ushr        v0.2d, v0.2d, #48
    eor     v0.16b, v0.16b, v1.16b  // + low 16 nonzero bits
    // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

    umov        w0, v0.h[0]
    .ifc        \p, p8
    ldp     x29, x30, [sp], #16
    .endif
    ret

.Lless_than_256_bytes_\@:
    // Checksumming a buffer of length 16...255 bytes

    adr_l       fold_consts_ptr, .Lfold_across_16_bytes_consts

    // Load the first 16 data bytes.
    ldr     q7, [buf], #0x10
CPU_LE( rev64       v7.16b, v7.16b          )
CPU_LE( ext     v7.16b, v7.16b, v7.16b, #8  )

    // XOR the first 16 data *bits* with the initial CRC value.
    movi        v0.16b, #0
    mov     v0.h[7], init_crc
    eor     v7.16b, v7.16b, v0.16b

    // Load the fold-across-16-bytes constants.
    ld1     {fold_consts.2d}, [fold_consts_ptr], #16
    __pmull_pre_\p  fold_consts

    cmp     len, #16
    b.eq        .Lreduce_final_16_bytes_\@  // len == 16
    subs        len, len, #32
    b.ge        .Lfold_16_bytes_loop_\@     // 32 <= len <= 255
    add     len, len, #16
    b       .Lhandle_partial_segment_\@ // 17 <= len <= 31
    .endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
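// This variant uses only the 8-bit polynomial multiply of base Advanced
// SIMD, so it also works on CPUs without the Crypto Extensions.  It sets up
// a stack frame because the p8 code path calls __pmull_p8_core via bl.
//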
SYM_FUNC_START(crc_t10dif_pmull_p8)
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp
    crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

    .align      5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
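// This variant requires the 64-bit PMULL instructions provided by the
// Crypto Extensions (hence the .arch armv8-a+crypto directive above).
//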
SYM_FUNC_START(crc_t10dif_pmull_p64)
    crc_t10dif_pmull    p64
SYM_FUNC_END(crc_t10dif_pmull_p64)

    .section    ".rodata", "a"
    .align      4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
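// Each x^N mod G(x) value below can be reproduced by starting from
// x^16 mod G(x) = 0x8bb7 and multiplying by x a further N-16 times, i.e.
// shifting left one bit and XORing with 0x18bb7 whenever bit 16 becomes set.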
.Lfold_across_128_bytes_consts:
    .quad       0x0000000000006123  // x^(8*128)    mod G(x)
    .quad       0x0000000000002295  // x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
    .quad       0x0000000000001069  // x^(4*128)    mod G(x)
    .quad       0x000000000000dd31  // x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
    .quad       0x000000000000857d  // x^(2*128)    mod G(x)
    .quad       0x0000000000007acc  // x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
    .quad       0x000000000000a010  // x^(1*128)    mod G(x)
    .quad       0x0000000000001faa  // x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
    .quad       0x1368000000000000  // x^48 * (x^48 mod G(x))
    .quad       0x2d56000000000000  // x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
    .quad       0x0000000000018bb7  // G(x)
    .quad       0x00000001f65a57f8  // floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
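// For example, len = 2 selects the window starting at offset 14:
// { 0x8e, 0x8f, 0x0, 0x1, ..., 0xd }.  The two 0x8x entries make TBL emit
// zero bytes, shifting v7 left by two byte positions; XORed with 0x80 the
// window becomes { 0xe, 0xf, 0x80, ... }, which instead extracts the top
// two bytes of v7 (a right shift by 14 bytes).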
.Lbyteshift_table:
    .byte        0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
    .byte       0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
    .byte        0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
    .byte        0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0