/*
 * Optimised memcpy() for POWER7, with a VMX (Altivec) fast path for
 * large copies.
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE 0
#endif

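/*
 * On little-endian the permute control vector comes from lvsr and the
 * vperm source operands are swapped, so the unaligned-copy code below
 * produces correct results on both endiannesses.
 */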
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

_GLOBAL(memcpy_power7)
	cmpldi r5,16
	cmpldi cr1,r5,4096
	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
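	/* Get the source 8B aligned */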
	neg r6,r4
	mtocrf 0x01,r6
	clrldi r6,r6,(64-3)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	sub r5,r5,r6
	cmpldi r5,128
	blt 5f

	mflr r0
	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)
	std r17,STK_REG(R17)(r1)
	std r18,STK_REG(R18)(r1)
	std r19,STK_REG(R19)(r1)
	std r20,STK_REG(R20)(r1)
	std r21,STK_REG(R21)(r1)
	std r22,STK_REG(R22)(r1)
	std r0,STACKFRAMESIZE+16(r1)

	srdi r6,r5,7
	mtctr r6

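	/* Now do cacheline (128B) sized loads and stores. */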
	.align 5
4:
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	ld r14,64(r4)
	ld r15,72(r4)
	ld r16,80(r4)
	ld r17,88(r4)
	ld r18,96(r4)
	ld r19,104(r4)
	ld r20,112(r4)
	ld r21,120(r4)
	addi r4,r4,128
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	std r14,64(r3)
	std r15,72(r3)
	std r16,80(r3)
	std r17,88(r3)
	std r18,96(r3)
	std r19,104(r3)
	std r20,112(r3)
	std r21,120(r3)
	addi r3,r3,128
	bdnz 4b

	clrldi r5,r5,(64-7)

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r22,STK_REG(R22)(r1)
	addi r1,r1,STACKFRAMESIZE

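	/* Up to 127B to go */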
5:	srdi r6,r5,4
	mtocrf 0x01,r6

6:	bf cr7*4+1,7f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	addi r4,r4,64
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	addi r3,r3,64

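	/* Up to 63B to go */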
7:	bf cr7*4+2,8f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	addi r4,r4,32
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	addi r3,r3,32

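	/* Up to 31B to go */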
8:	bf cr7*4+3,9f
	ld r0,0(r4)
	ld r6,8(r4)
	addi r4,r4,16
	std r0,0(r3)
	std r6,8(r3)
	addi r3,r3,16

9:	clrldi r5,r5,(64-4)

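	/* Up to 15B to go */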
.Lshort_copy:
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	lwz r0,0(r4)
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE
	b .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr r0
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std r0,16(r1)
	stdu r1,-STACKFRAMESIZE(r1)
	bl enter_vmx_ops
	cmpwi cr1,r3,0
	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STK_REG(R31)(r1)
	ld r4,STK_REG(R30)(r1)
	ld r5,STK_REG(R29)(r1)
	mtlr r0

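	/*
	 * Prefetch both the source and destination using enhanced-touch
	 * instructions, with stream ID 0 for the load side and stream
	 * ID 1 for the store side.
	 */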
	clrrdi r6,r4,7
	clrrdi r9,r3,7
	ori r9,r9,1		/* stream=1 */

	srdi r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi r7,0x3FF
	ble 1f
	li r7,0x3FF
1:	lis r0,0x0E00		/* depth=7 */
	sldi r7,r7,7
	or r7,r7,r0
	ori r10,r7,1		/* stream=1 */

	lis r8,0x8000		/* GO=1 */
	clrldi r8,r8,32

	dcbt 0,r6,0b01000
	dcbt 0,r7,0b01010
	dcbtst 0,r9,0b01000
	dcbtst 0,r10,0b01010
	eieio
	dcbt 0,r8,0b01010	/* GO */

	beq cr1,.Lunwind_stack_nonvmx_copy

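	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */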
	xor r6,r4,r3
	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy

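	/* Get the destination 16B aligned */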
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

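	/* Get the destination 128B aligned */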
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	bf cr7*4+3,5f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

5:	bf cr7*4+2,6f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

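	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */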
	.align 5
8:
	lvx v7,0,r4
	lvx v6,r4,r9
	lvx v5,r4,r10
	lvx v4,r4,r11
	lvx v3,r4,r12
	lvx v2,r4,r14
	lvx v1,r4,r15
	lvx v0,r4,r16
	addi r4,r4,128
	stvx v7,0,r3
	stvx v6,r3,r9
	stvx v5,r3,r10
	stvx v4,r3,r11
	stvx v3,r3,r12
	stvx v2,r3,r14
	stvx v1,r3,r15
	stvx v0,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

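	/* Up to 127B to go */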
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

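	/* Up to 15B to go */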
11:	clrldi r5,r5,(64-4)
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b exit_vmx_ops		/* tail call optimise */

.Lvmx_unaligned_copy:
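	/* Get the destination 16B aligned */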
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	lwz r0,0(r4)
	lwz r7,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r7,4(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

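	/* Get the destination 128B aligned */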
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx v0,0,r4
	addi r4,r4,16

	bf cr7*4+3,5f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
	stvx v8,0,r3
	addi r3,r3,16
	vor v0,v1,v1

5:	bf cr7*4+2,6f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
	stvx v8,0,r3
	stvx v9,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

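	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */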
	.align 5
8:
	lvx v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi r4,r4,128
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	stvx v12,r3,r12
	stvx v13,r3,r14
	stvx v14,r3,r15
	stvx v15,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

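	/* Up to 127B to go */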
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
	stvx v8,0,r3
	stvx v9,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
	stvx v8,0,r3
	addi r3,r3,16

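	/* Up to 15B to go */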
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16		/* Unwind the +16 load offset */
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	lwz r0,0(r4)
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */