/* SPDX-License-Identifier: GPL-2.0 */
/* checksum.S: Sparc V9 optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996, 2000 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *  Linux/Alpha checksum c-code
 *      Linux/ix86 inline checksum assembly
 *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *  David Mosberger-Tang for optimized reference c-code
 *  BSD4.4 portable checksum routine
 */

#include <asm/export.h>
    .text

csum_partial_fix_alignment:
    /* We checked for zero length already, so there must be
     * at least one byte.
     */
    be,pt       %icc, 1f
     nop
    ldub        [%o0 + 0x00], %o4   /* consume the leading odd byte */
    add     %o0, 1, %o0
    sub     %o1, 1, %o1
1:  andcc       %o0, 0x2, %g0
    be,pn       %icc, csum_partial_post_align
     cmp        %o1, 2
    blu,pn      %icc, csum_partial_end_cruft
     nop
    lduh        [%o0 + 0x00], %o5   /* consume a halfword to reach 32-bit alignment */
    add     %o0, 2, %o0
    sub     %o1, 2, %o1
    ba,pt       %xcc, csum_partial_post_align
     add        %o5, %o4, %o4

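    /* csum_partial accumulates a folded one's-complement sum over
     * buff[0..len-1] into the caller-supplied 32-bit sum.  The usual
     * kernel-side C prototype is roughly:
     *
     *   __wsum csum_partial(const void *buff, int len, __wsum sum);
     *
     * A rough, unoptimized C sketch of the computation (ignoring the
     * alignment handling above) would be:
     *
     *   u64 acc = 0;
     *   while (len >= 4) { acc += *(u32 *)buff; buff += 4; len -= 4; }
     *   if (len >= 2)    { acc += *(u16 *)buff; buff += 2; len -= 2; }
     *   if (len)           acc += (u64)*(u8 *)buff << 8;
     *   ... fold acc down to 16 bits, add into sum with end-around carry ...
     *
     * The code below does the same work with a 64-byte unrolled inner
     * loop and prefetching.
     */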
    .align      32
    .globl      csum_partial
    .type       csum_partial,#function
    EXPORT_SYMBOL(csum_partial)
csum_partial:       /* %o0=buff, %o1=len, %o2=sum */
    prefetch    [%o0 + 0x000], #n_reads
    clr     %o4
    prefetch    [%o0 + 0x040], #n_reads
    brz,pn      %o1, csum_partial_finish
     andcc      %o0, 0x3, %g0

    /* We "remember" in %g7 whether the lowest bit of the address
     * was set, because if it was we have to swap the upper and
     * lower 8-bit fields of the sum we calculate.
     */
    bne,pn      %icc, csum_partial_fix_alignment
     andcc      %o0, 0x1, %g7
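    /* This relies on a property of the one's-complement sum noted in
     * RFC 1071: byte-swapping every 16-bit word before summing simply
     * byte-swaps the final 16-bit sum.  So for an odd start address we
     * can sum the data as if it were halfword aligned and, when %g7 is
     * set, swap the two 8-bit halves of the result at the very end.
     */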

csum_partial_post_align:
    prefetch    [%o0 + 0x080], #n_reads
    andncc      %o1, 0x3f, %o3

    prefetch    [%o0 + 0x0c0], #n_reads
    sub     %o1, %o3, %o1
    brz,pn      %o3, 2f
     prefetch   [%o0 + 0x100], #n_reads

    /* So that we don't need to use the non-pairing
     * add-with-carry instructions we accumulate 32-bit
     * values into a 64-bit register.  At the end of the
     * loop we fold it down to 32 bits and then to 16 bits.
     */
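    /* Sketch of the unrolled loop below, in C-like pseudo-code
     * (illustrative only):
     *
     *   while (o3) {                        // o3 = len & ~0x3f
     *           for (i = 0; i < 16; i++)    // sixteen 32-bit loads
     *                   o4 += ((u32 *)o0)[i];
     *           o0 += 0x40;
     *           o3 -= 0x40;
     *   }
     *
     * %o4 is 64 bits wide, so these additions cannot overflow for any
     * length representable in a 32-bit len; the carries are recovered
     * later when %o4 is folded down to 32 and then 16 bits.
     */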
    prefetch    [%o0 + 0x140], #n_reads
1:  lduw        [%o0 + 0x00], %o5
    lduw        [%o0 + 0x04], %g1
    lduw        [%o0 + 0x08], %g2
    add     %o4, %o5, %o4
    lduw        [%o0 + 0x0c], %g3
    add     %o4, %g1, %o4
    lduw        [%o0 + 0x10], %o5
    add     %o4, %g2, %o4
    lduw        [%o0 + 0x14], %g1
    add     %o4, %g3, %o4
    lduw        [%o0 + 0x18], %g2
    add     %o4, %o5, %o4
    lduw        [%o0 + 0x1c], %g3
    add     %o4, %g1, %o4
    lduw        [%o0 + 0x20], %o5
    add     %o4, %g2, %o4
    lduw        [%o0 + 0x24], %g1
    add     %o4, %g3, %o4
    lduw        [%o0 + 0x28], %g2
    add     %o4, %o5, %o4
    lduw        [%o0 + 0x2c], %g3
    add     %o4, %g1, %o4
    lduw        [%o0 + 0x30], %o5
    add     %o4, %g2, %o4
    lduw        [%o0 + 0x34], %g1
    add     %o4, %g3, %o4
    lduw        [%o0 + 0x38], %g2
    add     %o4, %o5, %o4
    lduw        [%o0 + 0x3c], %g3
    add     %o4, %g1, %o4
    prefetch    [%o0 + 0x180], #n_reads
    add     %o4, %g2, %o4
    subcc       %o3, 0x40, %o3
    add     %o0, 0x40, %o0
    bne,pt      %icc, 1b
     add        %o4, %g3, %o4

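    /* At most 63 bytes remain here.  The loop below consumes the
     * 4-byte-aligned part (len & 0x3c); the final 0-3 bytes are
     * handled by csum_partial_end_cruft.
     */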
2:  and     %o1, 0x3c, %o3
    brz,pn      %o3, 2f
     sub        %o1, %o3, %o1
1:  lduw        [%o0 + 0x00], %o5
    subcc       %o3, 0x4, %o3
    add     %o0, 0x4, %o0
    bne,pt      %icc, 1b
     add        %o4, %o5, %o4

2:
    /* fold 64-->32 */
    srlx        %o4, 32, %o5
    srl     %o4, 0, %o4
    add     %o4, %o5, %o4
    srlx        %o4, 32, %o5
    srl     %o4, 0, %o4
    add     %o4, %o5, %o4

    /* fold 32-->16 */
    sethi       %hi(0xffff0000), %g1
    srl     %o4, 16, %o5
    andn        %o4, %g1, %g2
    add     %o5, %g2, %o4
    srl     %o4, 16, %o5
    andn        %o4, %g1, %g2
    add     %o5, %g2, %o4

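    /* The two fold steps above are equivalent to, roughly:
     *
     *   o4 = (o4 >> 32) + (u32)o4;          // 64 -> 32 bits (twice,
     *   o4 = (o4 >> 32) + (u32)o4;          //  to absorb the carry)
     *   o4 = (o4 >> 16) + (o4 & 0xffff);    // 32 -> 16 bits (twice,
     *   o4 = (o4 >> 16) + (o4 & 0xffff);    //  to absorb the carry)
     *
     * leaving a 16-bit partial sum in %o4.
     */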
csum_partial_end_cruft:
    /* %o4 has the 16-bit sum we have calculated so far.  */
    cmp     %o1, 2
    blu,pt      %icc, 1f
     nop
    lduh        [%o0 + 0x00], %o5   /* trailing halfword */
    sub     %o1, 2, %o1
    add     %o0, 2, %o0
    add     %o4, %o5, %o4
1:  brz,pt      %o1, 1f
     nop
    ldub        [%o0 + 0x00], %o5   /* trailing byte: it is the high    */
    sub     %o1, 1, %o1             /* (big-endian) half of the final   */
    add     %o0, 1, %o0             /* 16-bit word, hence the shift     */
    sllx        %o5, 8, %o5
    add     %o4, %o5, %o4
1:
    /* fold 32-->16 */
    sethi       %hi(0xffff0000), %g1
    srl     %o4, 16, %o5
    andn        %o4, %g1, %g2
    add     %o5, %g2, %o4
    srl     %o4, 16, %o5
    andn        %o4, %g1, %g2
    add     %o5, %g2, %o4

1:  brz,pt      %g7, 1f
     nop

    /* We started with an odd byte, byte-swap the result.  */
    srl     %o4, 8, %o5
    and     %o4, 0xff, %g1
    sll     %g1, 8, %g1
    or      %o5, %g1, %o4

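    /* i.e. sum16 = ((sum16 & 0xff) << 8) | (sum16 >> 8), undoing the
     * byte transposition introduced by starting the summation on an
     * odd byte (see the note on %g7 above).
     */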
1:  addcc       %o2, %o4, %o2
    addc        %g0, %o2, %o2

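    /* The 16-bit partial sum is added into the caller-supplied 32-bit
     * sum: addcc computes the sum and sets the carry flag, and addc
     * folds that carry back in, giving the end-around carry that
     * one's-complement arithmetic requires.  The srl in the return
     * delay slot zero-extends the 32-bit result for the caller.
     */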
csum_partial_finish:
    retl
     srl        %o2, 0, %o0