Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
0003  *
0004  * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
0005  * downloaded from:
0006  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
0007  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
0008  *
0009  * Copyright (C) 2012 Intel Corporation.
0010  *
0011  * Authors:
0012  *  Wajdi Feghali <wajdi.k.feghali@intel.com>
0013  *  James Guilford <james.guilford@intel.com>
0014  *  David Cote <david.m.cote@intel.com>
0015  *  Tim Chen <tim.c.chen@linux.intel.com>
0016  *
0017  * This software is available to you under a choice of one of two
0018  * licenses.  You may choose to be licensed under the terms of the GNU
0019  * General Public License (GPL) Version 2, available from the file
0020  * COPYING in the main directory of this source tree, or the
0021  * OpenIB.org BSD license below:
0022  *
0023  *     Redistribution and use in source and binary forms, with or
0024  *     without modification, are permitted provided that the following
0025  *     conditions are met:
0026  *
0027  *      - Redistributions of source code must retain the above
0028  *        copyright notice, this list of conditions and the following
0029  *        disclaimer.
0030  *
0031  *      - Redistributions in binary form must reproduce the above
0032  *        copyright notice, this list of conditions and the following
0033  *        disclaimer in the documentation and/or other materials
0034  *        provided with the distribution.
0035  *
0036  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0037  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0038  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0039  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0040  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0041  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0042  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0043  * SOFTWARE.
0044  */
0045 
0046 #include <linux/linkage.h>
0047 #include <asm/nospec-branch.h>
0048 
0049 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
0050 
0051 .macro LABEL stem num       # emit a label named by joining both arguments
0052 \stem\num\():
0053 .endm
0054 
0055 .macro JMPTBL_ENTRY slot    # one 8-byte table slot holding the address of a crc_ label
0056 .quad crc_\slot
0057 .endm
0058 
0059 .macro JNC_LESS_THAN bound  # jump to the matching less_than_ label when CF is clear
0060     jnc less_than_\bound
0061 .endm
0062 
0063 # Define threshold where buffers are considered "small" and routed to more
0064 # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
0065 # SMALL_SIZE can be no larger than 255.
0066 
0067 #define SMALL_SIZE 200
0068 
0069 .if (SMALL_SIZE > 255)
0070 .error "SMALL_SIZE must be < 256"
0071 .endif
0072 
0073 # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
0074 # In: %rdi = buffer, %esi = len (used as an unsigned 32-bit count), %edx = crc_init.  Out: %eax = CRC.
0075 .text
0076 SYM_FUNC_START(crc_pcl)
0077 #define    bufp     rdi
0078 #define    bufp_dw  %edi
0079 #define    bufp_w   %di
0080 #define    bufp_b   %dil
0081 #define    bufptmp  %rcx
0082 #define    block_0  %rcx
0083 #define    block_1  %rdx
0084 #define    block_2  %r11
0085 #define    len      %rsi
0086 #define    len_dw   %esi
0087 #define    len_w    %si
0088 #define    len_b    %sil
0089 #define    crc_init_arg %rdx
0090 #define    tmp      %rbx
0091 #define    crc_init %r8
0092 #define    crc_init_dw  %r8d
0093 #define    crc1     %r9
0094 #define    crc2     %r10
0095 # bufp is defined without a leading %: operands spell it %bufp, JMP_NOSPEC takes it bare
0096     pushq   %rbx
0097     pushq   %rdi
0098     pushq   %rsi
0099 
0100     ## Move crc_init out of %rdx, which mul and block_1 reuse below
0101     mov     crc_init_arg, crc_init
0102     ## len is a 32-bit argument: the upper half of %rsi is unspecified, so only len_dw is read below
0103     ################################################################
0104     ## 1) ALIGN:
0105     ################################################################
0106 
0107     mov     %bufp, bufptmp      # bufptmp = buf
0108     neg     %bufp
0109     and     $7, %bufp       # bufp = (-buf) & 7 = number of bytes needed
0110                     # to reach 8-byte alignment
0111     je      proc_block      # Skip if aligned
0112 
0113     ## If len is less than 8 and we're unaligned, we need to jump
0114     ## to special code to avoid reading beyond the end of the buffer
0115     cmp     $8, len_dw      # 32-bit compare: ignore the upper half of len
0116     jae     do_align
0117     # less_than_8 expects length in upper 3 bits of len_dw
0118     # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
0119     shl     $32-3+1, len_dw
0120     jmp     less_than_8_post_shl1
0121 
0122 do_align:
0123     #### Calculate CRC of unaligned bytes of the buffer (if any)
0124     movq    (bufptmp), tmp      # load a quadword from the buffer
0125     add     %bufp, bufptmp      # align buffer pointer for quadword
0126                     # processing
0127     sub     bufp_dw, len_dw     # update buffer length; the 32-bit sub zero-extends len
0128 align_loop:
0129     crc32b  %bl, crc_init_dw    # compute crc32 of 1-byte
0130     shr     $8, tmp         # get next byte
0131     dec     %bufp
0132     jne     align_loop
0133 
0134 proc_block:
0135 
0136     ################################################################
0137     ## 2) PROCESS  BLOCKS:
0138     ################################################################
0139 
0140     ## compute num of bytes to be processed
0141     movl    len_dw, %ebx        # tmp = byte count, zero-extended to 64 bits
0142 
0143     cmpl    $128*24, len_dw
0144     jae     full_block
0145 
0146 continue_block:
0147     cmpl    $SMALL_SIZE, len_dw
0148     jb      small
0149 
0150     ## len < 128*24
0151     movq    $2731, %rax     # 2731 = ceil(2^16 / 24)
0152     mul     len_dw          # edx:eax = eax * len_dw (overwrites %edx)
0153     shrq    $16, %rax
0154 
0155     ## eax contains floor(bytes / 24) = num 24-byte chunks to do
0156 
0157     ## process rax 24-byte chunks (128 >= rax >= 0)
0158 
0159     ## compute end address of each block
0160     ## block 0 (base addr + RAX * 8)
0161     ## block 1 (base addr + RAX * 16)
0162     ## block 2 (base addr + RAX * 24)
0163     lea     (bufptmp, %rax, 8), block_0
0164     lea     (block_0, %rax, 8), block_1
0165     lea     (block_1, %rax, 8), block_2
0166 
0167     xor     crc1, crc1
0168     xor     crc2, crc2
0169 
0170     ## branch into array: jump_table entry number rax holds the address of label crc_<rax>
0171     mov jump_table(,%rax,8), %bufp
0172     JMP_NOSPEC bufp
0173 
0174     ################################################################
0175     ## 2a) PROCESS FULL BLOCKS:
0176     ################################################################
0177 full_block:
0178     movl    $128,%eax
0179     lea     128*8*2(block_0), block_1
0180     lea     128*8*3(block_0), block_2
0181     add     $128*8*1, block_0
0182 
0183     xor     crc1,crc1
0184     xor     crc2,crc2
0185 
0186     # Fall through into top of crc array (crc_128)
0187 
0188     ################################################################
0189     ## 3) CRC Array:
0190     ################################################################
0191 
0192 crc_array:
0193     i=128
0194 .rept 128-1
0195 .altmacro
0196 LABEL crc_ %i
0197 .noaltmacro
0198     ENDBR
0199     crc32q   -i*8(block_0), crc_init
0200     crc32q   -i*8(block_1), crc1
0201     crc32q   -i*8(block_2), crc2
0202     i=(i-1)
0203 .endr
0204     ## i == 1 here: the last step is emitted outside the .rept so its block_2 word can be deferred
0205 .altmacro
0206 LABEL crc_ %i
0207 .noaltmacro
0208     ENDBR
0209     crc32q   -i*8(block_0), crc_init
0210     crc32q   -i*8(block_1), crc1
0211 # SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
0212 
0213     mov     block_2, block_0    # block_0 shares %rcx with bufptmp: it now holds the end address of block 2
0214 
0215     ################################################################
0216     ## 4) Combine three results:
0217     ################################################################
0218 
0219     lea (K_table-8)(%rip), %bufp        # first entry is for idx 1
0220     shlq    $3, %rax            # rax *= 8
0221     pmovzxdq (%bufp,%rax), %xmm0        # 2 consts: K1:K2
0222     leal    (%eax,%eax,2), %eax     # rax *= 3 (total *24)
0223     subq    %rax, tmp           # tmp -= bytes just processed (rax = chunks * 24)
0224 
0225     movq    crc_init, %xmm1         # CRC for block 1
0226     pclmulqdq $0x00, %xmm0, %xmm1       # Multiply by K2
0227 
0228     movq    crc1, %xmm2         # CRC for block 2
0229     pclmulqdq $0x10, %xmm0, %xmm2       # Multiply by K1
0230 
0231     pxor    %xmm2,%xmm1
0232     movq    %xmm1, %rax
0233     xor     -i*8(block_2), %rax     # fold in the block_2 word skipped above
0234     mov     crc2, crc_init
0235     crc32   %rax, crc_init
0236 
0237     ################################################################
0238     ## 5) Check for end:
0239     ################################################################
0240 
0241 LABEL crc_ 0
0242     ENDBR
0243     mov     tmp, len            # len = bytes still to process
0244     cmp     $128*24, tmp
0245     jae     full_block
0246     cmp     $24, tmp
0247     jae     continue_block
0248 
0249 less_than_24:
0250     shl     $32-4, len_dw           # less_than_16 expects length
0251                         # in upper 4 bits of len_dw
0252     jnc     less_than_16
0253     crc32q  (bufptmp), crc_init
0254     crc32q  8(bufptmp), crc_init
0255     jz      do_return           # ZF is still the one set by the shl above
0256     add     $16, bufptmp
0257     # len is less than 8 if we got here
0258     # less_than_8 expects length in upper 3 bits of len_dw
0259     # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
0260     shl     $2, len_dw
0261     jmp     less_than_8_post_shl1
0262 
0263     #######################################################################
0264     ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
0265     #######################################################################
0266 small:
0267     shl $32-8, len_dw       # Prepare len_dw for less_than_256
0268     j=256
0269 .rept 5                 # j = {256, 128, 64, 32, 16}
0270 .altmacro
0271 LABEL less_than_ %j         # less_than_j: Length should be in
0272                     # upper lg(j) bits of len_dw
0273     j=(j/2)
0274     shl     $1, len_dw      # Get next MSB
0275     JNC_LESS_THAN %j
0276 .noaltmacro
0277     i=0
0278 .rept (j/8)
0279     crc32q  i(bufptmp), crc_init    # Compute crc32 of 8-byte data
0280     i=i+8
0281 .endr
0282     jz      do_return       # Return if remaining length is zero
0283     add     $j, bufptmp     # Advance buf
0284 .endr
0285 
0286 less_than_8:                # Length should be stored in
0287                     # upper 3 bits of len_dw
0288     shl     $1, len_dw
0289 less_than_8_post_shl1:
0290     jnc     less_than_4
0291     crc32l  (bufptmp), crc_init_dw  # CRC of 4 bytes
0292     jz      do_return       # return if remaining data is zero
0293     add     $4, bufptmp
0294 less_than_4:                # Length should be stored in
0295                     # upper 2 bits of len_dw
0296     shl     $1, len_dw
0297     jnc     less_than_2
0298     crc32w  (bufptmp), crc_init_dw  # CRC of 2 bytes
0299     jz      do_return       # return if remaining data is zero
0300     add     $2, bufptmp
0301 less_than_2:                # Length should be stored in the MSB
0302                     # of len_dw
0303     shl     $1, len_dw
0304     jnc     less_than_1
0305     crc32b  (bufptmp), crc_init_dw  # CRC of 1 byte
0306 less_than_1:                # Length should be zero
0307 do_return:
0308     movq    crc_init, %rax      # return value; the caller reads %eax
0309     popq    %rsi
0310     popq    %rdi
0311     popq    %rbx
0312         RET
0313 SYM_FUNC_END(crc_pcl)
0314 
0315 .section    .rodata, "a", @progbits
0316         ################################################################
0317         ## jump table        Table is 129 entries x 8 bytes each (.quad)
0318         ################################################################
0319 .align 8
0320 jump_table:
0321     i=0
0322 .rept 129
0323 .altmacro
0324 JMPTBL_ENTRY %i
0325 .noaltmacro
0326     i=i+1
0327 .endr
0328 
0329 
0330     ################################################################
0331     ## PCLMULQDQ tables: constant pairs (K1:K2) used by the combine step of crc_pcl
0332     ## Table is 128 entries x 2 .long values (8 bytes) each; the first entry is for index 1
0333     ################################################################
0334 .align 8
0335 K_table:
0336     .long 0x493c7d27, 0x00000001
0337     .long 0xba4fc28e, 0x493c7d27
0338     .long 0xddc0152b, 0xf20c0dfe
0339     .long 0x9e4addf8, 0xba4fc28e
0340     .long 0x39d3b296, 0x3da6d0cb
0341     .long 0x0715ce53, 0xddc0152b
0342     .long 0x47db8317, 0x1c291d04
0343     .long 0x0d3b6092, 0x9e4addf8
0344     .long 0xc96cfdc0, 0x740eef02
0345     .long 0x878a92a7, 0x39d3b296
0346     .long 0xdaece73e, 0x083a6eec
0347     .long 0xab7aff2a, 0x0715ce53
0348     .long 0x2162d385, 0xc49f4f67
0349     .long 0x83348832, 0x47db8317
0350     .long 0x299847d5, 0x2ad91c30
0351     .long 0xb9e02b86, 0x0d3b6092
0352     .long 0x18b33a4e, 0x6992cea2
0353     .long 0xb6dd949b, 0xc96cfdc0
0354     .long 0x78d9ccb7, 0x7e908048
0355     .long 0xbac2fd7b, 0x878a92a7
0356     .long 0xa60ce07b, 0x1b3d8f29
0357     .long 0xce7f39f4, 0xdaece73e
0358     .long 0x61d82e56, 0xf1d0f55e
0359     .long 0xd270f1a2, 0xab7aff2a
0360     .long 0xc619809d, 0xa87ab8a8
0361     .long 0x2b3cac5d, 0x2162d385
0362     .long 0x65863b64, 0x8462d800
0363     .long 0x1b03397f, 0x83348832
0364     .long 0xebb883bd, 0x71d111a8
0365     .long 0xb3e32c28, 0x299847d5
0366     .long 0x064f7f26, 0xffd852c6
0367     .long 0xdd7e3b0c, 0xb9e02b86
0368     .long 0xf285651c, 0xdcb17aa4
0369     .long 0x10746f3c, 0x18b33a4e
0370     .long 0xc7a68855, 0xf37c5aee
0371     .long 0x271d9844, 0xb6dd949b
0372     .long 0x8e766a0c, 0x6051d5a2
0373     .long 0x93a5f730, 0x78d9ccb7
0374     .long 0x6cb08e5c, 0x18b0d4ff
0375     .long 0x6b749fb2, 0xbac2fd7b
0376     .long 0x1393e203, 0x21f3d99c
0377     .long 0xcec3662e, 0xa60ce07b
0378     .long 0x96c515bb, 0x8f158014
0379     .long 0xe6fc4e6a, 0xce7f39f4
0380     .long 0x8227bb8a, 0xa00457f7
0381     .long 0xb0cd4768, 0x61d82e56
0382     .long 0x39c7ff35, 0x8d6d2c43
0383     .long 0xd7a4825c, 0xd270f1a2
0384     .long 0x0ab3844b, 0x00ac29cf
0385     .long 0x0167d312, 0xc619809d
0386     .long 0xf6076544, 0xe9adf796
0387     .long 0x26f6a60a, 0x2b3cac5d
0388     .long 0xa741c1bf, 0x96638b34
0389     .long 0x98d8d9cb, 0x65863b64
0390     .long 0x49c3cc9c, 0xe0e9f351
0391     .long 0x68bce87a, 0x1b03397f
0392     .long 0x57a3d037, 0x9af01f2d
0393     .long 0x6956fc3b, 0xebb883bd
0394     .long 0x42d98888, 0x2cff42cf
0395     .long 0x3771e98f, 0xb3e32c28
0396     .long 0xb42ae3d9, 0x88f25a3a
0397     .long 0x2178513a, 0x064f7f26
0398     .long 0xe0ac139e, 0x4e36f0b0
0399     .long 0x170076fa, 0xdd7e3b0c
0400     .long 0x444dd413, 0xbd6f81f8
0401     .long 0x6f345e45, 0xf285651c
0402     .long 0x41d17b64, 0x91c9bd4b
0403     .long 0xff0dba97, 0x10746f3c
0404     .long 0xa2b73df1, 0x885f087b
0405     .long 0xf872e54c, 0xc7a68855
0406     .long 0x1e41e9fc, 0x4c144932
0407     .long 0x86d8e4d2, 0x271d9844
0408     .long 0x651bd98b, 0x52148f02
0409     .long 0x5bb8f1bc, 0x8e766a0c
0410     .long 0xa90fd27a, 0xa3c6f37a
0411     .long 0xb3af077a, 0x93a5f730
0412     .long 0x4984d782, 0xd7c0557f
0413     .long 0xca6ef3ac, 0x6cb08e5c
0414     .long 0x234e0b26, 0x63ded06a
0415     .long 0xdd66cbbb, 0x6b749fb2
0416     .long 0x4597456a, 0x4d56973c
0417     .long 0xe9e28eb4, 0x1393e203
0418     .long 0x7b3ff57a, 0x9669c9df
0419     .long 0xc9c8b782, 0xcec3662e
0420     .long 0x3f70cc6f, 0xe417f38a
0421     .long 0x93e106a4, 0x96c515bb
0422     .long 0x62ec6c6d, 0x4b9e0f71
0423     .long 0xd813b325, 0xe6fc4e6a
0424     .long 0x0df04680, 0xd104b8fc
0425     .long 0x2342001e, 0x8227bb8a
0426     .long 0x0a2a8d7e, 0x5b397730
0427     .long 0x6d9a4957, 0xb0cd4768
0428     .long 0xe8b6368b, 0xe78eb416
0429     .long 0xd2c3ed1a, 0x39c7ff35
0430     .long 0x995a5724, 0x61ff0e01
0431     .long 0x9ef68d35, 0xd7a4825c
0432     .long 0x0c139b31, 0x8d96551c
0433     .long 0xf2271e60, 0x0ab3844b
0434     .long 0x0b0bf8ca, 0x0bf80dd2
0435     .long 0x2664fd8b, 0x0167d312
0436     .long 0xed64812d, 0x8821abed
0437     .long 0x02ee03b2, 0xf6076544
0438     .long 0x8604ae0f, 0x6a45d2b2
0439     .long 0x363bd6b3, 0x26f6a60a
0440     .long 0x135c83fd, 0xd8d26619
0441     .long 0x5fabe670, 0xa741c1bf
0442     .long 0x35ec3279, 0xde87806c
0443     .long 0x00bcf5f6, 0x98d8d9cb
0444     .long 0x8ae00689, 0x14338754
0445     .long 0x17f27698, 0x49c3cc9c
0446     .long 0x58ca5f00, 0x5bd2011f
0447     .long 0xaa7c7ad5, 0x68bce87a
0448     .long 0xb5cfca28, 0xdd07448e
0449     .long 0xded288f8, 0x57a3d037
0450     .long 0x59f229bc, 0xdde8f5b9
0451     .long 0x6d390dec, 0x6956fc3b
0452     .long 0x37170390, 0xa3e3e02c
0453     .long 0x6353c1cc, 0x42d98888
0454     .long 0xc4584f5c, 0xd73c7bea
0455     .long 0xf48642e9, 0x3771e98f
0456     .long 0x531377e2, 0x80ff0093
0457     .long 0xdd35bc8d, 0xb42ae3d9
0458     .long 0xb25b29f2, 0x8fe4c34d
0459     .long 0x9a5ede41, 0x2178513a
0460     .long 0xa563905d, 0xdf99fc11
0461     .long 0x45cddf4e, 0xe0ac139e
0462     .long 0xacfa3103, 0x6c23e841
0463     .long 0xa51b6135, 0x170076fa