/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
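/*
 * Strategy: accumulate the buffer as 64-bit doublewords with adde, so the
 * carry out of every add is chained through XER[CA], then fold the 64-bit
 * total down to a 32-bit checksum at .Lcsum_finish.
 */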
_GLOBAL(__csum_partial)
    addic   r0,r5,0         /* clear carry */

    srdi.   r6,r4,3         /* less than 8 bytes? */
    beq .Lcsum_tail_word

    /*
     * If only halfword aligned, align to a double word. Since odd
     * aligned addresses should be rare and they would require more
     * work to calculate the correct checksum, we ignore that case
     * and take the potential slowdown of unaligned loads.
     */
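    /*
     * r6 below is the number of halfwords the buffer sits past a
     * doubleword boundary, so 4 - r6 halfword loads (at most 3, i.e.
     * 6 bytes) bring r3 up to alignment.  That is always safe here
     * because this point is only reached with len >= 8.
     */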
    rldicl. r6,r3,64-1,64-2     /* r6 = (r3 >> 1) & 0x3 */
    beq .Lcsum_aligned

    li  r7,4
    sub r6,r7,r6
    mtctr   r6

1:
    lhz r6,0(r3)        /* align to doubleword */
    subi    r4,r4,2
    addi    r3,r3,2
    adde    r0,r0,r6
    bdnz    1b

.Lcsum_aligned:
    /*
     * We unroll the loop such that each iteration is 64 bytes with an
     * entry and exit limb of 64 bytes, meaning a minimum size of
     * 128 bytes.
     */
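    /*
     * ctr = len/64 - 1: the loop is software pipelined.  The first
     * 32 bytes of the first block are loaded before entering it, each
     * iteration finishes one 64-byte block while preloading the start
     * of the next, and the exit limb after the loop finishes the last
     * block.
     */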
    srdi.   r6,r4,7
    beq .Lcsum_tail_doublewords     /* len < 128 */

    srdi    r6,r4,6
    subi    r6,r6,1
    mtctr   r6

    stdu    r1,-STACKFRAMESIZE(r1)
    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

    ld  r6,0(r3)
    ld  r9,8(r3)

    ld  r10,16(r3)
    ld  r11,24(r3)

    /*
     * On POWER6 and POWER7 back to back adde instructions take 2 cycles
     * because of the XER dependency. This means the fastest this loop can
     * go is 16 cycles per iteration. The scheduling of the loop below has
     * been shown to hit this on both POWER6 and POWER7.
     */
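    /*
     * The loads interleaved between the adde instructions are independent
     * of the CA chain, so they can issue in the shadow of the serialized
     * adds and their latency stays hidden.
     */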
    .align 5
2:
    adde    r0,r0,r6
    ld  r12,32(r3)
    ld  r14,40(r3)

    adde    r0,r0,r9
    ld  r15,48(r3)
    ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10

    adde    r0,r0,r11

    adde    r0,r0,r12

    adde    r0,r0,r14

    adde    r0,r0,r15
    ld  r6,0(r3)
    ld  r9,8(r3)

    adde    r0,r0,r16
    ld  r10,16(r3)
    ld  r11,24(r3)
    bdnz    2b


    adde    r0,r0,r6
    ld  r12,32(r3)
    ld  r14,40(r3)

    adde    r0,r0,r9
    ld  r15,48(r3)
    ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
    adde    r0,r0,r11
    adde    r0,r0,r12
    adde    r0,r0,r14
    adde    r0,r0,r15
    adde    r0,r0,r16

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE

    andi.   r4,r4,63

.Lcsum_tail_doublewords:        /* Up to 127 bytes to go */
    srdi.   r6,r4,3
    beq .Lcsum_tail_word

    mtctr   r6
3:
    ld  r6,0(r3)
    addi    r3,r3,8
    adde    r0,r0,r6
    bdnz    3b

    andi.   r4,r4,7

.Lcsum_tail_word:           /* Up to 7 bytes to go */
    srdi.   r6,r4,2
    beq .Lcsum_tail_halfword

    lwz r6,0(r3)
    addi    r3,r3,4
    adde    r0,r0,r6
    subi    r4,r4,4

.Lcsum_tail_halfword:           /* Up to 3 bytes to go */
    srdi.   r6,r4,1
    beq .Lcsum_tail_byte

    lhz r6,0(r3)
    addi    r3,r3,2
    adde    r0,r0,r6
    subi    r4,r4,2

.Lcsum_tail_byte:           /* Up to 1 byte to go */
    andi.   r6,r4,1
    beq .Lcsum_finish

    lbz r6,0(r3)
#ifdef __BIG_ENDIAN__
    sldi    r9,r6,8         /* Pad the byte out to 16 bits */
    adde    r0,r0,r9
#else
    adde    r0,r0,r6
#endif

.Lcsum_finish:
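    /*
     * addze folds in the carry left over from the last adde.  r0 is then
     * a 64-bit accumulator: rotating it by 32 and adding gives
     * high + low + (carry out of the low half) in the upper 32 bits, so
     * the final shift right by 32 is an end-around-carry fold of the two
     * halves down to the 32-bit result in r3.
     */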
    addze   r0,r0           /* add in final carry */
    rldicl  r4,r0,32,0      /* fold two 32 bit halves together */
    add r3,r4,r0
    srdi    r3,r3,32
    blr
EXPORT_SYMBOL(__csum_partial)


    .macro srcnr
100:
    EX_TABLE(100b,.Lerror_nr)
    .endm

    .macro source
150:
    EX_TABLE(150b,.Lerror)
    .endm

    .macro dstnr
200:
    EX_TABLE(200b,.Lerror_nr)
    .endm

    .macro dest
250:
    EX_TABLE(250b,.Lerror)
    .endm
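/*
 * Each of the macros above plants a local label on the load or store that
 * follows it on the same line and records an EX_TABLE fixup for that
 * instruction: faults on "source"/"dest" accesses branch to .Lerror,
 * faults on "srcnr"/"dstnr" accesses branch to .Lerror_nr.
 */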

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
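/*
 * The all-ones seed is what lets 0 serve as the error return: with a
 * starting value of 0xffffffff the checksum of a successfully copied
 * block should never itself be 0, so the bare "return 0" in
 * .Lerror/.Lerror_nr below unambiguously reports a faulting access.
 */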
_GLOBAL(csum_partial_copy_generic)
    li  r6,-1
    addic   r0,r6,0         /* clear carry */

    srdi.   r6,r5,3         /* less than 8 bytes? */
    beq .Lcopy_tail_word

    /*
     * If only halfword aligned, align to a double word. Since odd
     * aligned addresses should be rare and they would require more
     * work to calculate the correct checksum, we ignore that case
     * and take the potential slowdown of unaligned loads.
     *
     * If the source and destination are relatively unaligned we only
     * align the source. This keeps things simple.
     */
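    /*
     * Same halfword alignment loop as in __csum_partial, except each
     * halfword is also stored to dst so the two pointers stay in step.
     */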
    rldicl. r6,r3,64-1,64-2     /* r6 = (r3 >> 1) & 0x3 */
    beq .Lcopy_aligned

    li  r9,4
    sub r6,r9,r6
    mtctr   r6

1:
srcnr;  lhz r6,0(r3)        /* align to doubleword */
    subi    r5,r5,2
    addi    r3,r3,2
    adde    r0,r0,r6
dstnr;  sth r6,0(r4)
    addi    r4,r4,2
    bdnz    1b

.Lcopy_aligned:
    /*
     * We unroll the loop such that each iteration is 64 bytes with an
     * entry and exit limb of 64 bytes, meaning a minimum size of
     * 128 bytes.
     */
    srdi.   r6,r5,7
    beq .Lcopy_tail_doublewords     /* len < 128 */

    srdi    r6,r5,6
    subi    r6,r6,1
    mtctr   r6

    stdu    r1,-STACKFRAMESIZE(r1)
    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

source; ld  r6,0(r3)
source; ld  r9,8(r3)

source; ld  r10,16(r3)
source; ld  r11,24(r3)

    /*
     * On POWER6 and POWER7 back to back adde instructions take 2 cycles
     * because of the XER dependency. This means the fastest this loop can
     * go is 16 cycles per iteration. The scheduling of the loop below has
     * been shown to hit this on both POWER6 and POWER7.
     */
    .align 5
2:
    adde    r0,r0,r6
source; ld  r12,32(r3)
source; ld  r14,40(r3)

    adde    r0,r0,r9
source; ld  r15,48(r3)
source; ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
dest;   std r6,0(r4)
dest;   std r9,8(r4)

    adde    r0,r0,r11
dest;   std r10,16(r4)
dest;   std r11,24(r4)

    adde    r0,r0,r12
dest;   std r12,32(r4)
dest;   std r14,40(r4)

    adde    r0,r0,r14
dest;   std r15,48(r4)
dest;   std r16,56(r4)
    addi    r4,r4,64

    adde    r0,r0,r15
source; ld  r6,0(r3)
source; ld  r9,8(r3)

    adde    r0,r0,r16
source; ld  r10,16(r3)
source; ld  r11,24(r3)
    bdnz    2b


    adde    r0,r0,r6
source; ld  r12,32(r3)
source; ld  r14,40(r3)

    adde    r0,r0,r9
source; ld  r15,48(r3)
source; ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
dest;   std r6,0(r4)
dest;   std r9,8(r4)

    adde    r0,r0,r11
dest;   std r10,16(r4)
dest;   std r11,24(r4)

    adde    r0,r0,r12
dest;   std r12,32(r4)
dest;   std r14,40(r4)

    adde    r0,r0,r14
dest;   std r15,48(r4)
dest;   std r16,56(r4)
    addi    r4,r4,64

    adde    r0,r0,r15
    adde    r0,r0,r16

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE

    andi.   r5,r5,63

.Lcopy_tail_doublewords:        /* Up to 127 bytes to go */
    srdi.   r6,r5,3
    beq .Lcopy_tail_word

    mtctr   r6
3:
srcnr;  ld  r6,0(r3)
    addi    r3,r3,8
    adde    r0,r0,r6
dstnr;  std r6,0(r4)
    addi    r4,r4,8
    bdnz    3b

    andi.   r5,r5,7

.Lcopy_tail_word:           /* Up to 7 bytes to go */
    srdi.   r6,r5,2
    beq .Lcopy_tail_halfword

srcnr;  lwz r6,0(r3)
    addi    r3,r3,4
    adde    r0,r0,r6
dstnr;  stw r6,0(r4)
    addi    r4,r4,4
    subi    r5,r5,4

.Lcopy_tail_halfword:           /* Up to 3 bytes to go */
    srdi.   r6,r5,1
    beq .Lcopy_tail_byte

srcnr;  lhz r6,0(r3)
    addi    r3,r3,2
    adde    r0,r0,r6
dstnr;  sth r6,0(r4)
    addi    r4,r4,2
    subi    r5,r5,2

.Lcopy_tail_byte:           /* Up to 1 byte to go */
    andi.   r6,r5,1
    beq .Lcopy_finish

srcnr;  lbz r6,0(r3)
#ifdef __BIG_ENDIAN__
    sldi    r9,r6,8         /* Pad the byte out to 16 bits */
    adde    r0,r0,r9
#else
    adde    r0,r0,r6
#endif
dstnr;  stb r6,0(r4)

.Lcopy_finish:
    addze   r0,r0           /* add in final carry */
    rldicl  r4,r0,32,0      /* fold two 32 bit halves together */
    add r3,r4,r0
    srdi    r3,r3,32
    blr

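/*
 * Fault handlers registered by the source/dest and srcnr/dstnr macros.
 * .Lerror is used for faults taken inside the unrolled loop, where the
 * stack frame with r14-r16 saved is live and must be unwound; .Lerror_nr
 * covers accesses made outside that frame.  Both report failure by
 * returning 0.
 */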
.Lerror:
    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE
.Lerror_nr:
    li  r3,0
    blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *             const struct in6_addr *daddr,
 *             __u32 len, __u8 proto, __wsum sum)
 */
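/*
 * The two 64-bit halves of saddr and daddr are summed with the carries
 * chained through CA, len + proto + sum are added in, and the total is
 * folded 64 -> 32 -> 16 bits and complemented.  On little-endian the
 * doubleword loads byte-swap the network-order addresses, so the rotldi
 * shifts len + proto up by one byte to put their bytes in the same byte
 * lanes that htonl(len) and htonl(proto) would occupy in that
 * byte-swapped view.
 */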

_GLOBAL(csum_ipv6_magic)
    ld  r8, 0(r3)
    ld  r9, 8(r3)
    add r5, r5, r6
    addc    r0, r8, r9
    ld  r10, 0(r4)
    ld  r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
    rotldi  r5, r5, 8
#endif
    adde    r0, r0, r10
    add r5, r5, r7
    adde    r0, r0, r11
    adde    r0, r0, r5
    addze   r0, r0
    rotldi  r3, r0, 32      /* fold two 32 bit halves together */
    add r3, r0, r3
    srdi    r0, r3, 32
    rotlwi  r3, r0, 16      /* fold two 16 bit halves together */
    add r3, r0, r3
    not r3, r3
    rlwinm  r3, r3, 16, 16, 31
    blr
EXPORT_SYMBOL(csum_ipv6_magic)