/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif
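/*
 * SELFTEST_CASE is 0 by default. The powerpc copyloops selftests are
 * presumed to build this file in userspace with SELFTEST_CASE predefined;
 * the test_feature symbol derived from it below then stands in for the
 * runtime CPU feature fixup when deciding whether to take the VMX branch.
 */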

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)      lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)      lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
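/*
 * Note on the endian variants above: lvsl/lvsr build the permute control
 * vector from the low bits of the source address, and on little-endian
 * vperm selects bytes from the opposite end, so the LE build uses lvsr and
 * swaps the vperm inputs to get the same merged result as the BE build.
 */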

_GLOBAL(memcpy_power7)
    cmpldi  r5,16
    cmpldi  cr1,r5,4096
    std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
    blt .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
    bgt cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
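    /*
     * Copies of 16 to 4096 bytes, and copies of any size when Altivec
     * cannot be used, fall through to the scalar path below.
     */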

.Lnonvmx_copy:
    /* Get the source 8B aligned */
    neg r6,r4
    mtocrf  0x01,r6
    clrldi  r6,r6,(64-3)
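    /*
     * r6 = (-src) & 7 is the number of bytes needed to reach 8B source
     * alignment; mtocrf 0x01 places its low nibble in CR7 so each bf below
     * tests one bit: cr7*4+3 is the 1 byte step, +2 the 2 byte step and
     * +1 the 4 byte step. e.g. src ending in 0x5 gives r6 = 3 (0b011),
     * so copy 1 byte then 2 bytes.
     */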

    bf  cr7*4+3,1f
    lbz r0,0(r4)
    addi    r4,r4,1
    stb r0,0(r3)
    addi    r3,r3,1

1:  bf  cr7*4+2,2f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

2:  bf  cr7*4+1,3f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

3:  sub r5,r5,r6
    cmpldi  r5,128
    blt 5f

    mflr    r0
    stdu    r1,-STACKFRAMESIZE(r1)
    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)
    std r17,STK_REG(R17)(r1)
    std r18,STK_REG(R18)(r1)
    std r19,STK_REG(R19)(r1)
    std r20,STK_REG(R20)(r1)
    std r21,STK_REG(R21)(r1)
    std r22,STK_REG(R22)(r1)
    std r0,STACKFRAMESIZE+16(r1)

    srdi    r6,r5,7
    mtctr   r6
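    /*
     * r6 = len / 128 is the number of whole cachelines and drives the bdnz
     * loop below via CTR. The nonvolatile r14-r21 saved above give the
     * loop sixteen GPRs, enough to hold a full 128B line between the loads
     * and the stores.
     */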

    /* Now do cacheline (128B) sized loads and stores. */
    .align  5
4:
    ld  r0,0(r4)
    ld  r6,8(r4)
    ld  r7,16(r4)
    ld  r8,24(r4)
    ld  r9,32(r4)
    ld  r10,40(r4)
    ld  r11,48(r4)
    ld  r12,56(r4)
    ld  r14,64(r4)
    ld  r15,72(r4)
    ld  r16,80(r4)
    ld  r17,88(r4)
    ld  r18,96(r4)
    ld  r19,104(r4)
    ld  r20,112(r4)
    ld  r21,120(r4)
    addi    r4,r4,128
    std r0,0(r3)
    std r6,8(r3)
    std r7,16(r3)
    std r8,24(r3)
    std r9,32(r3)
    std r10,40(r3)
    std r11,48(r3)
    std r12,56(r3)
    std r14,64(r3)
    std r15,72(r3)
    std r16,80(r3)
    std r17,88(r3)
    std r18,96(r3)
    std r19,104(r3)
    std r20,112(r3)
    std r21,120(r3)
    addi    r3,r3,128
    bdnz    4b

    clrldi  r5,r5,(64-7)

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)
    ld  r17,STK_REG(R17)(r1)
    ld  r18,STK_REG(R18)(r1)
    ld  r19,STK_REG(R19)(r1)
    ld  r20,STK_REG(R20)(r1)
    ld  r21,STK_REG(R21)(r1)
    ld  r22,STK_REG(R22)(r1)
    addi    r1,r1,STACKFRAMESIZE

    /* Up to 127B to go */
5:  srdi    r6,r5,4
    mtocrf  0x01,r6
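    /*
     * Fewer than 128 bytes remain. r6 = remainder / 16, so the CR7 bits
     * tested below correspond to a 64B block (cr7*4+1), a 32B block (+2)
     * and a 16B block (+3); the final sub-16B tail falls into
     * .Lshort_copy.
     */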

6:  bf  cr7*4+1,7f
    ld  r0,0(r4)
    ld  r6,8(r4)
    ld  r7,16(r4)
    ld  r8,24(r4)
    ld  r9,32(r4)
    ld  r10,40(r4)
    ld  r11,48(r4)
    ld  r12,56(r4)
    addi    r4,r4,64
    std r0,0(r3)
    std r6,8(r3)
    std r7,16(r3)
    std r8,24(r3)
    std r9,32(r3)
    std r10,40(r3)
    std r11,48(r3)
    std r12,56(r3)
    addi    r3,r3,64

    /* Up to 63B to go */
7:  bf  cr7*4+2,8f
    ld  r0,0(r4)
    ld  r6,8(r4)
    ld  r7,16(r4)
    ld  r8,24(r4)
    addi    r4,r4,32
    std r0,0(r3)
    std r6,8(r3)
    std r7,16(r3)
    std r8,24(r3)
    addi    r3,r3,32

    /* Up to 31B to go */
8:  bf  cr7*4+3,9f
    ld  r0,0(r4)
    ld  r6,8(r4)
    addi    r4,r4,16
    std r0,0(r3)
    std r6,8(r3)
    addi    r3,r3,16

9:  clrldi  r5,r5,(64-4)

    /* Up to 15B to go */
.Lshort_copy:
    mtocrf  0x01,r5
    bf  cr7*4+0,12f
    lwz r0,0(r4)    /* Less chance of a reject with word ops */
    lwz r6,4(r4)
    addi    r4,r4,8
    stw r0,0(r3)
    stw r6,4(r3)
    addi    r3,r3,8

12: bf  cr7*4+1,13f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

13: bf  cr7*4+2,14f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

14: bf  cr7*4+3,15f
    lbz r0,0(r4)
    stb r0,0(r3)

15: ld  r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
    blr

.Lunwind_stack_nonvmx_copy:
    addi    r1,r1,STACKFRAMESIZE
    b   .Lnonvmx_copy
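    /*
     * Reached when enter_vmx_ops reports that the vector unit cannot be
     * used (see the cr1 test below): pop the frame created for the VMX
     * attempt and redo the copy with the scalar code above.
     */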

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
    mflr    r0
    std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
    std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
    std r0,16(r1)
    stdu    r1,-STACKFRAMESIZE(r1)
    bl  enter_vmx_ops
    cmpwi   cr1,r3,0
    ld  r0,STACKFRAMESIZE+16(r1)
    ld  r3,STK_REG(R31)(r1)
    ld  r4,STK_REG(R30)(r1)
    ld  r5,STK_REG(R29)(r1)
    mtlr    r0
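    /*
     * enter_vmx_ops returns 0 when the vector unit cannot be used; the
     * result is kept in cr1 and only checked after the stream prefetch
     * below has been started.
     */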

    /*
     * We prefetch both the source and destination using enhanced touch
     * instructions. We use a stream ID of 0 for the load side and
     * 1 for the store side.
     */
    clrrdi  r6,r4,7
    clrrdi  r9,r3,7
    ori r9,r9,1     /* stream=1 */

    srdi    r7,r5,7     /* length in cachelines, capped at 0x3FF */
    cmpldi  r7,0x3FF
    ble 1f
    li  r7,0x3FF
1:  lis r0,0x0E00   /* depth=7 */
    sldi    r7,r7,7
    or  r7,r7,r0
    ori r10,r7,1    /* stream=1 */

    lis r8,0x8000   /* GO=1 */
    clrldi  r8,r8,32

    dcbt    0,r6,0b01000
    dcbt    0,r7,0b01010
    dcbtst  0,r9,0b01000
    dcbtst  0,r10,0b01010
    eieio
    dcbt    0,r8,0b01010    /* GO */
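    /*
     * The TH values above select the enhanced ("data stream") touch forms
     * of dcbt/dcbtst: r6/r9 hold the cacheline-aligned start addresses
     * with the stream ID in the low bits, r7/r10 hold the unit count
     * (capped at 0x3FF cachelines) plus the prefetch depth, and the final
     * dcbt on r8 has the GO bit set to start the described streams; the
     * eieio orders the stream descriptions before the GO.
     */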

    beq cr1,.Lunwind_stack_nonvmx_copy

    /*
     * If source and destination are not relatively aligned we use a
     * slower permute loop.
     */
    xor r6,r4,r3
    rldicl. r6,r6,0,(64-4)
    bne .Lvmx_unaligned_copy
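    /*
     * r6 is the xor of source and destination; rldicl. keeps its low 4
     * bits and sets CR0. A non-zero result means the pointers differ
     * modulo 16, so once the destination is 16B aligned the source still
     * is not, and every load has to be re-aligned with vperm.
     */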

    /* Get the destination 16B aligned */
    neg r6,r3
    mtocrf  0x01,r6
    clrldi  r6,r6,(64-4)

    bf  cr7*4+3,1f
    lbz r0,0(r4)
    addi    r4,r4,1
    stb r0,0(r3)
    addi    r3,r3,1

1:  bf  cr7*4+2,2f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

2:  bf  cr7*4+1,3f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

3:  bf  cr7*4+0,4f
    ld  r0,0(r4)
    addi    r4,r4,8
    std r0,0(r3)
    addi    r3,r3,8

4:  sub r5,r5,r6

    /* Get the destination 128B aligned */
    neg r6,r3
    srdi    r7,r6,4
    mtocrf  0x01,r7
    clrldi  r6,r6,(64-7)

    li  r9,16
    li  r10,32
    li  r11,48

    bf  cr7*4+3,5f
    lvx v1,0,r4
    addi    r4,r4,16
    stvx    v1,0,r3
    addi    r3,r3,16

5:  bf  cr7*4+2,6f
    lvx v1,0,r4
    lvx v0,r4,r9
    addi    r4,r4,32
    stvx    v1,0,r3
    stvx    v0,r3,r9
    addi    r3,r3,32

6:  bf  cr7*4+1,7f
    lvx v3,0,r4
    lvx v2,r4,r9
    lvx v1,r4,r10
    lvx v0,r4,r11
    addi    r4,r4,64
    stvx    v3,0,r3
    stvx    v2,r3,r9
    stvx    v1,r3,r10
    stvx    v0,r3,r11
    addi    r3,r3,64

7:  sub r5,r5,r6
    srdi    r6,r5,7

    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

    li  r12,64
    li  r14,80
    li  r15,96
    li  r16,112

    mtctr   r6

    /*
     * Now do cacheline sized loads and stores. By this stage the
     * cacheline stores are also cacheline aligned.
     */
    .align  5
8:
    lvx v7,0,r4
    lvx v6,r4,r9
    lvx v5,r4,r10
    lvx v4,r4,r11
    lvx v3,r4,r12
    lvx v2,r4,r14
    lvx v1,r4,r15
    lvx v0,r4,r16
    addi    r4,r4,128
    stvx    v7,0,r3
    stvx    v6,r3,r9
    stvx    v5,r3,r10
    stvx    v4,r3,r11
    stvx    v3,r3,r12
    stvx    v2,r3,r14
    stvx    v1,r3,r15
    stvx    v0,r3,r16
    addi    r3,r3,128
    bdnz    8b

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)

    /* Up to 127B to go */
    clrldi  r5,r5,(64-7)
    srdi    r6,r5,4
    mtocrf  0x01,r6

    bf  cr7*4+1,9f
    lvx v3,0,r4
    lvx v2,r4,r9
    lvx v1,r4,r10
    lvx v0,r4,r11
    addi    r4,r4,64
    stvx    v3,0,r3
    stvx    v2,r3,r9
    stvx    v1,r3,r10
    stvx    v0,r3,r11
    addi    r3,r3,64

9:  bf  cr7*4+2,10f
    lvx v1,0,r4
    lvx v0,r4,r9
    addi    r4,r4,32
    stvx    v1,0,r3
    stvx    v0,r3,r9
    addi    r3,r3,32

10: bf  cr7*4+3,11f
    lvx v1,0,r4
    addi    r4,r4,16
    stvx    v1,0,r3
    addi    r3,r3,16

    /* Up to 15B to go */
11: clrldi  r5,r5,(64-4)
    mtocrf  0x01,r5
    bf  cr7*4+0,12f
    ld  r0,0(r4)
    addi    r4,r4,8
    std r0,0(r3)
    addi    r3,r3,8

12: bf  cr7*4+1,13f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

13: bf  cr7*4+2,14f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

14: bf  cr7*4+3,15f
    lbz r0,0(r4)
    stb r0,0(r3)

15: addi    r1,r1,STACKFRAMESIZE
    ld  r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
    b   exit_vmx_ops        /* tail call optimise */
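    /*
     * LR still holds the original caller's return address (restored by
     * the mtlr above), so this plain branch lets exit_vmx_ops return
     * straight to the caller with r3 = original destination, assuming
     * exit_vmx_ops passes its first argument through as the return value.
     */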

.Lvmx_unaligned_copy:
    /* Get the destination 16B aligned */
    neg r6,r3
    mtocrf  0x01,r6
    clrldi  r6,r6,(64-4)

    bf  cr7*4+3,1f
    lbz r0,0(r4)
    addi    r4,r4,1
    stb r0,0(r3)
    addi    r3,r3,1

1:  bf  cr7*4+2,2f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

2:  bf  cr7*4+1,3f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

3:  bf  cr7*4+0,4f
    lwz r0,0(r4)    /* Less chance of a reject with word ops */
    lwz r7,4(r4)
    addi    r4,r4,8
    stw r0,0(r3)
    stw r7,4(r3)
    addi    r3,r3,8

4:  sub r5,r5,r6

    /* Get the destination 128B aligned */
    neg r6,r3
    srdi    r7,r6,4
    mtocrf  0x01,r7
    clrldi  r6,r6,(64-7)

    li  r9,16
    li  r10,32
    li  r11,48

    LVS(v16,0,r4)       /* Setup permute control vector */
    lvx v0,0,r4
    addi    r4,r4,16
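    /*
     * v16 is the permute control derived from the source misalignment and
     * v0 is primed with the first (aligned-down) source quadword. Each
     * step below loads the next quadword and VPERM merges the tail of the
     * previous one with the head of the new one into an aligned 16B store,
     * so r4 always runs one quadword ahead of the data actually consumed
     * (unwound at label 11 below).
     */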

    bf  cr7*4+3,5f
    lvx v1,0,r4
    VPERM(v8,v0,v1,v16)
    addi    r4,r4,16
    stvx    v8,0,r3
    addi    r3,r3,16
    vor v0,v1,v1
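    /*
     * vor vD,vS,vS is a vector register move: keep the quadword just
     * loaded as the "previous" input for the next VPERM merge.
     */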

5:  bf  cr7*4+2,6f
    lvx v1,0,r4
    VPERM(v8,v0,v1,v16)
    lvx v0,r4,r9
    VPERM(v9,v1,v0,v16)
    addi    r4,r4,32
    stvx    v8,0,r3
    stvx    v9,r3,r9
    addi    r3,r3,32

6:  bf  cr7*4+1,7f
    lvx v3,0,r4
    VPERM(v8,v0,v3,v16)
    lvx v2,r4,r9
    VPERM(v9,v3,v2,v16)
    lvx v1,r4,r10
    VPERM(v10,v2,v1,v16)
    lvx v0,r4,r11
    VPERM(v11,v1,v0,v16)
    addi    r4,r4,64
    stvx    v8,0,r3
    stvx    v9,r3,r9
    stvx    v10,r3,r10
    stvx    v11,r3,r11
    addi    r3,r3,64

7:  sub r5,r5,r6
    srdi    r6,r5,7

    std r14,STK_REG(R14)(r1)
    std r15,STK_REG(R15)(r1)
    std r16,STK_REG(R16)(r1)

    li  r12,64
    li  r14,80
    li  r15,96
    li  r16,112

    mtctr   r6

    /*
     * Now do cacheline sized loads and stores. By this stage the
     * cacheline stores are also cacheline aligned.
     */
    .align  5
8:
    lvx v7,0,r4
    VPERM(v8,v0,v7,v16)
    lvx v6,r4,r9
    VPERM(v9,v7,v6,v16)
    lvx v5,r4,r10
    VPERM(v10,v6,v5,v16)
    lvx v4,r4,r11
    VPERM(v11,v5,v4,v16)
    lvx v3,r4,r12
    VPERM(v12,v4,v3,v16)
    lvx v2,r4,r14
    VPERM(v13,v3,v2,v16)
    lvx v1,r4,r15
    VPERM(v14,v2,v1,v16)
    lvx v0,r4,r16
    VPERM(v15,v1,v0,v16)
    addi    r4,r4,128
    stvx    v8,0,r3
    stvx    v9,r3,r9
    stvx    v10,r3,r10
    stvx    v11,r3,r11
    stvx    v12,r3,r12
    stvx    v13,r3,r14
    stvx    v14,r3,r15
    stvx    v15,r3,r16
    addi    r3,r3,128
    bdnz    8b

    ld  r14,STK_REG(R14)(r1)
    ld  r15,STK_REG(R15)(r1)
    ld  r16,STK_REG(R16)(r1)

    /* Up to 127B to go */
    clrldi  r5,r5,(64-7)
    srdi    r6,r5,4
    mtocrf  0x01,r6

    bf  cr7*4+1,9f
    lvx v3,0,r4
    VPERM(v8,v0,v3,v16)
    lvx v2,r4,r9
    VPERM(v9,v3,v2,v16)
    lvx v1,r4,r10
    VPERM(v10,v2,v1,v16)
    lvx v0,r4,r11
    VPERM(v11,v1,v0,v16)
    addi    r4,r4,64
    stvx    v8,0,r3
    stvx    v9,r3,r9
    stvx    v10,r3,r10
    stvx    v11,r3,r11
    addi    r3,r3,64

9:  bf  cr7*4+2,10f
    lvx v1,0,r4
    VPERM(v8,v0,v1,v16)
    lvx v0,r4,r9
    VPERM(v9,v1,v0,v16)
    addi    r4,r4,32
    stvx    v8,0,r3
    stvx    v9,r3,r9
    addi    r3,r3,32

10: bf  cr7*4+3,11f
    lvx v1,0,r4
    VPERM(v8,v0,v1,v16)
    addi    r4,r4,16
    stvx    v8,0,r3
    addi    r3,r3,16

    /* Up to 15B to go */
11: clrldi  r5,r5,(64-4)
    addi    r4,r4,-16   /* Unwind the +16 load offset */
    mtocrf  0x01,r5
    bf  cr7*4+0,12f
    lwz r0,0(r4)    /* Less chance of a reject with word ops */
    lwz r6,4(r4)
    addi    r4,r4,8
    stw r0,0(r3)
    stw r6,4(r3)
    addi    r3,r3,8

12: bf  cr7*4+1,13f
    lwz r0,0(r4)
    addi    r4,r4,4
    stw r0,0(r3)
    addi    r3,r3,4

13: bf  cr7*4+2,14f
    lhz r0,0(r4)
    addi    r4,r4,2
    sth r0,0(r3)
    addi    r3,r3,2

14: bf  cr7*4+3,15f
    lbz r0,0(r4)
    stb r0,0(r3)

15: addi    r1,r1,STACKFRAMESIZE
    ld  r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
    b   exit_vmx_ops        /* tail call optimise */
#endif /* CONFIG_ALTIVEC */