/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
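/*
 * Strategy: accumulate the buffer as 64-bit doublewords with adde, so the
 * carry out of every add is chained through XER[CA], then fold the 64-bit
 * total down to a 32-bit checksum at .Lcsum_finish.
 */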
_GLOBAL(__csum_partial)
    addic   r0,r5,0         /* clear carry */

    srdi.   r6,r4,3         /* less than 8 bytes? */
    beq .Lcsum_tail_word

    /*
     * If only halfword aligned, align to a double word. Since odd
     * aligned addresses should be rare and they would require more
     * work to calculate the correct checksum, we ignore that case
     * and take the potential slowdown of unaligned loads.
     */
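    /*
     * r6 below is the number of halfwords the buffer sits past a
     * doubleword boundary, so 4 - r6 halfword loads (at most 3, i.e.
     * 6 bytes) bring r3 up to alignment.  That is always safe here
     * because this point is only reached with len >= 8.
     */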
    rldicl. r6,r3,64-1,64-2     /* r6 = (r3 >> 1) & 0x3 */
    beq .Lcsum_aligned

    li  r7,4
    sub r6,r7,r6
    mtctr   r6

1:
    lhz r6,0(r3)        /* align to doubleword */
    subi    r4,r4,2
    addi    r3,r3,2
    adde    r0,r0,r6
    bdnz    1b

.Lcsum_aligned:
    /*
     * We unroll the loop such that each iteration is 64 bytes with an
     * entry and exit limb of 64 bytes, meaning a minimum size of
     * 128 bytes.
     */
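    /*
     * ctr = len/64 - 1: the loop is software pipelined.  The first
     * 32 bytes of the first block are loaded before entering it, each
     * iteration finishes one 64-byte block while preloading the start
     * of the next, and the exit limb after the loop finishes the last
     * block.
     */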
    srdi.   r6,r4,7
    beq .Lcsum_tail_doublewords     /* len < 128 */

    srdi    r6,r4,6
    subi    r6,r6,1
    mtctr   r6

    stdu    r1,-STACKFRAMESIZE(r1)
    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

    ld  r6,0(r3)
    ld  r9,8(r3)

    ld  r10,16(r3)
    ld  r11,24(r3)

    /*
     * On POWER6 and POWER7 back to back adde instructions take 2 cycles
     * because of the XER dependency. This means the fastest this loop can
     * go is 16 cycles per iteration. The scheduling of the loop below has
     * been shown to hit this on both POWER6 and POWER7.
     */
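    /*
     * The loads interleaved between the adde instructions are independent
     * of the CA chain, so they can issue in the shadow of the serialized
     * adds and their latency stays hidden.
     */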
    .align 5
2:
    adde    r0,r0,r6
    ld  r12,32(r3)
    ld  r14,40(r3)

    adde    r0,r0,r9
    ld  r15,48(r3)
    ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10

    adde    r0,r0,r11

    adde    r0,r0,r12

    adde    r0,r0,r14

    adde    r0,r0,r15
    ld  r6,0(r3)
    ld  r9,8(r3)

    adde    r0,r0,r16
    ld  r10,16(r3)
    ld  r11,24(r3)
    bdnz    2b


    adde    r0,r0,r6
    ld  r12,32(r3)
    ld  r14,40(r3)

    adde    r0,r0,r9
    ld  r15,48(r3)
    ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
    adde    r0,r0,r11
    adde    r0,r0,r12
    adde    r0,r0,r14
    adde    r0,r0,r15
    adde    r0,r0,r16

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE

    andi.   r4,r4,63

.Lcsum_tail_doublewords:        /* Up to 127 bytes to go */
    srdi.   r6,r4,3
    beq .Lcsum_tail_word

    mtctr   r6
3:
    ld  r6,0(r3)
    addi    r3,r3,8
    adde    r0,r0,r6
    bdnz    3b

    andi.   r4,r4,7

.Lcsum_tail_word:           /* Up to 7 bytes to go */
    srdi.   r6,r4,2
    beq .Lcsum_tail_halfword

    lwz r6,0(r3)
    addi    r3,r3,4
    adde    r0,r0,r6
    subi    r4,r4,4

.Lcsum_tail_halfword:           /* Up to 3 bytes to go */
    srdi.   r6,r4,1
    beq .Lcsum_tail_byte

    lhz r6,0(r3)
    addi    r3,r3,2
    adde    r0,r0,r6
    subi    r4,r4,2

.Lcsum_tail_byte:           /* Up to 1 byte to go */
    andi.   r6,r4,1
    beq .Lcsum_finish

    lbz r6,0(r3)
#ifdef __BIG_ENDIAN__
    sldi    r9,r6,8         /* Pad the byte out to 16 bits */
    adde    r0,r0,r9
#else
    adde    r0,r0,r6
#endif

.Lcsum_finish:
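    /*
     * addze folds in the carry left over from the last adde.  r0 is then
     * a 64-bit accumulator: rotating it by 32 and adding gives
     * high + low + (carry out of the low half) in the upper 32 bits, so
     * the final shift right by 32 is an end-around-carry fold of the two
     * halves down to the 32-bit result in r3.
     */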
    addze   r0,r0           /* add in final carry */
    rldicl  r4,r0,32,0      /* fold two 32 bit halves together */
    add r3,r4,r0
    srdi    r3,r3,32
    blr
EXPORT_SYMBOL(__csum_partial)


    .macro srcnr
100:
    EX_TABLE(100b,.Lerror_nr)
    .endm

    .macro source
150:
    EX_TABLE(150b,.Lerror)
    .endm

    .macro dstnr
200:
    EX_TABLE(200b,.Lerror_nr)
    .endm

    .macro dest
250:
    EX_TABLE(250b,.Lerror)
    .endm
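/*
 * Each of the macros above plants a local label on the load or store that
 * follows it on the same line and records an EX_TABLE fixup for that
 * instruction: faults on "source"/"dest" accesses branch to .Lerror,
 * faults on "srcnr"/"dstnr" accesses branch to .Lerror_nr.
 */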

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
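/*
 * The all-ones seed is what lets 0 serve as the error return: with a
 * starting value of 0xffffffff the checksum of a successfully copied
 * block should never itself be 0, so the bare "return 0" in
 * .Lerror/.Lerror_nr below unambiguously reports a faulting access.
 */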
_GLOBAL(csum_partial_copy_generic)
    li  r6,-1
    addic   r0,r6,0         /* clear carry */

    srdi.   r6,r5,3         /* less than 8 bytes? */
    beq .Lcopy_tail_word

    /*
     * If only halfword aligned, align to a double word. Since odd
     * aligned addresses should be rare and they would require more
     * work to calculate the correct checksum, we ignore that case
     * and take the potential slowdown of unaligned loads.
     *
     * If the source and destination are relatively unaligned we only
     * align the source. This keeps things simple.
     */
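    /*
     * Same halfword alignment loop as in __csum_partial, except each
     * halfword is also stored to dst so the two pointers stay in step.
     */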
    rldicl. r6,r3,64-1,64-2     /* r6 = (r3 >> 1) & 0x3 */
    beq .Lcopy_aligned

    li  r9,4
    sub r6,r9,r6
    mtctr   r6

1:
srcnr;  lhz r6,0(r3)        /* align to doubleword */
    subi    r5,r5,2
    addi    r3,r3,2
    adde    r0,r0,r6
dstnr;  sth r6,0(r4)
    addi    r4,r4,2
    bdnz    1b

.Lcopy_aligned:
    /*
     * We unroll the loop such that each iteration is 64 bytes with an
     * entry and exit limb of 64 bytes, meaning a minimum size of
     * 128 bytes.
     */
    srdi.   r6,r5,7
    beq .Lcopy_tail_doublewords     /* len < 128 */

    srdi    r6,r5,6
    subi    r6,r6,1
    mtctr   r6

    stdu    r1,-STACKFRAMESIZE(r1)
    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

source; ld  r6,0(r3)
source; ld  r9,8(r3)

source; ld  r10,16(r3)
source; ld  r11,24(r3)

    /*
     * On POWER6 and POWER7 back to back adde instructions take 2 cycles
     * because of the XER dependency. This means the fastest this loop can
     * go is 16 cycles per iteration. The scheduling of the loop below has
     * been shown to hit this on both POWER6 and POWER7.
     */
    .align 5
2:
    adde    r0,r0,r6
source; ld  r12,32(r3)
source; ld  r14,40(r3)

    adde    r0,r0,r9
source; ld  r15,48(r3)
source; ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
dest;   std r6,0(r4)
dest;   std r9,8(r4)

    adde    r0,r0,r11
dest;   std r10,16(r4)
dest;   std r11,24(r4)

    adde    r0,r0,r12
dest;   std r12,32(r4)
dest;   std r14,40(r4)

    adde    r0,r0,r14
dest;   std r15,48(r4)
dest;   std r16,56(r4)
    addi    r4,r4,64

    adde    r0,r0,r15
source; ld  r6,0(r3)
source; ld  r9,8(r3)

    adde    r0,r0,r16
source; ld  r10,16(r3)
source; ld  r11,24(r3)
    bdnz    2b


    adde    r0,r0,r6
source; ld  r12,32(r3)
source; ld  r14,40(r3)

    adde    r0,r0,r9
source; ld  r15,48(r3)
source; ld  r16,56(r3)
    addi    r3,r3,64

    adde    r0,r0,r10
dest;   std r6,0(r4)
dest;   std r9,8(r4)

    adde    r0,r0,r11
dest;   std r10,16(r4)
dest;   std r11,24(r4)

    adde    r0,r0,r12
dest;   std r12,32(r4)
dest;   std r14,40(r4)

    adde    r0,r0,r14
dest;   std r15,48(r4)
dest;   std r16,56(r4)
    addi    r4,r4,64

    adde    r0,r0,r15
    adde    r0,r0,r16

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE

    andi.   r5,r5,63

.Lcopy_tail_doublewords:        /* Up to 127 bytes to go */
    srdi.   r6,r5,3
    beq .Lcopy_tail_word

    mtctr   r6
3:
srcnr;  ld  r6,0(r3)
    addi    r3,r3,8
    adde    r0,r0,r6
dstnr;  std r6,0(r4)
    addi    r4,r4,8
    bdnz    3b

    andi.   r5,r5,7

.Lcopy_tail_word:           /* Up to 7 bytes to go */
    srdi.   r6,r5,2
    beq .Lcopy_tail_halfword

srcnr;  lwz r6,0(r3)
    addi    r3,r3,4
    adde    r0,r0,r6
dstnr;  stw r6,0(r4)
    addi    r4,r4,4
    subi    r5,r5,4

.Lcopy_tail_halfword:           /* Up to 3 bytes to go */
    srdi.   r6,r5,1
    beq .Lcopy_tail_byte

srcnr;  lhz r6,0(r3)
    addi    r3,r3,2
    adde    r0,r0,r6
dstnr;  sth r6,0(r4)
    addi    r4,r4,2
    subi    r5,r5,2

.Lcopy_tail_byte:           /* Up to 1 byte to go */
    andi.   r6,r5,1
    beq .Lcopy_finish

srcnr;  lbz r6,0(r3)
#ifdef __BIG_ENDIAN__
    sldi    r9,r6,8         /* Pad the byte out to 16 bits */
    adde    r0,r0,r9
#else
    adde    r0,r0,r6
#endif
dstnr;  stb r6,0(r4)

.Lcopy_finish:
    addze   r0,r0           /* add in final carry */
    rldicl  r4,r0,32,0      /* fold two 32 bit halves together */
    add r3,r4,r0
    srdi    r3,r3,32
    blr

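/*
 * Fault handlers registered by the source/dest and srcnr/dstnr macros.
 * .Lerror is used for faults taken inside the unrolled loop, where the
 * stack frame with r14-r16 saved is live and must be unwound; .Lerror_nr
 * covers accesses made outside that frame.  Both report failure by
 * returning 0.
 */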
.Lerror:
    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    addi    r1,r1,STACKFRAMESIZE
.Lerror_nr:
    li  r3,0
    blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *             const struct in6_addr *daddr,
 *             __u32 len, __u8 proto, __wsum sum)
 */
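/*
 * The two 64-bit halves of saddr and daddr are summed with the carries
 * chained through CA, len + proto + sum are added in, and the total is
 * folded 64 -> 32 -> 16 bits and complemented.  On little-endian the
 * doubleword loads byte-swap the network-order addresses, so the rotldi
 * shifts len + proto up by one byte to put their bytes in the same byte
 * lanes that htonl(len) and htonl(proto) would occupy in that
 * byte-swapped view.
 */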

_GLOBAL(csum_ipv6_magic)
    ld  r8, 0(r3)
    ld  r9, 8(r3)
    add r5, r5, r6
    addc    r0, r8, r9
    ld  r10, 0(r4)
    ld  r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
    rotldi  r5, r5, 8
#endif
    adde    r0, r0, r10
    add r5, r5, r7
    adde    r0, r0, r11
    adde    r0, r0, r5
    addze   r0, r0
    rotldi  r3, r0, 32      /* fold two 32 bit halves together */
    add r3, r0, r3
    srdi    r0, r3, 32
    rotlwi  r3, r0, 16      /* fold two 16 bit halves together */
    add r3, r0, r3
    not r3, r3
    rlwinm  r3, r3, 16, 16, 31
    blr
EXPORT_SYMBOL(csum_ipv6_magic)