/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Optimized memcmp() for 64-bit PowerPC.
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31
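
/*
 * rA..rC are volatile scratch registers; rD..rH map to the nonvolatile
 * r27..r31, which the unrolled 32-byte loop saves to the stack before use.
 */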
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
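
/*
 * On little endian the LH/LW/LD loads are byte-reversed (lhbrx/lwbrx/ldbrx),
 * so a loaded chunk always compares as if interpreted big endian: the byte
 * at the lowest address is the most significant, which is exactly the order
 * memcmp() requires.  A C sketch of the idea (illustrative only; cmp_chunk8()
 * is a hypothetical helper, assuming a little-endian host and GCC/Clang):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static int cmp_chunk8(const void *a, const void *b)
 *	{
 *		uint64_t x, y;
 *
 *		memcpy(&x, a, 8);
 *		memcpy(&y, b, 8);
 *		x = __builtin_bswap64(x);	// what ldbrx does on LE
 *		y = __builtin_bswap64(y);
 *		return (x > y) - (x < y);	// unsigned compare, like cmpld
 *	}
 */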

#define VMX_THRESH	4096	/* minimum length for the VMX paths */
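
/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS call enter_vmx_ops()/exit_vmx_ops() while
 * preserving the memcmp() arguments (r3, r4, r5) and the link register.
 * ENTER_VMX_OPS also compares the enter_vmx_ops() return value against 0
 * into cr1; callers branch on cr1 to fall back to the non-VMX path when
 * VMX cannot be used.
 */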
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                  ^
 *                               _vaddr
 *
 * _vmask is the permute mask generated by LVS.
 * _v1st_qw is the 1st aligned quadword of the current address, already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned quadword of _vaddr, loaded by this macro.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is placed in _v_res.
 *   for example: 0x0123456789abcdef for big endian
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
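
/*
 * Roughly, the macro computes the following (a C sketch for illustration
 * only; load_unaligned16() is a hypothetical helper, not a kernel function):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void load_unaligned16(const unsigned char *p, unsigned char out[16])
 *	{
 *		const unsigned char *base =
 *			(const unsigned char *)((uintptr_t)p & ~(uintptr_t)15);
 *		size_t off = p - base;
 *		unsigned char lo[16], hi[16];
 *
 *		memcpy(lo, base, 16);		// _v1st_qw, already loaded
 *		memcpy(hi, base + 16, 16);	// _v2nd_qw, loaded by lvx
 *		for (int i = 0; i < 16; i++)	// VPERM with the LVS mask
 *			out[i] = (off + i < 16) ? lo[off + i] : hi[off + i - 16];
 *	}
 *
 * Unlike the C model, the two lvx loads are 16-byte aligned and therefore
 * can never cross a page boundary.
 */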

/*
 * memcmp() handles two cases:
 * 1) src and dst have the same offset from an 8-byte boundary; the handlers
 *    are named .Lsameoffset_xxxx.
 * 2) src and dst have different offsets from an 8-byte boundary; the
 *    handlers are named .Ldiffoffset_xxxx.
 */
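/*
 * Reference semantics this routine must preserve (a minimal, portable C
 * sketch, not the kernel's implementation):
 *
 *	#include <stddef.h>
 *
 *	static int memcmp_ref(const void *s1, const void *s2, size_t n)
 *	{
 *		const unsigned char *a = s1, *b = s2;
 *
 *		for (size_t i = 0; i < n; i++) {
 *			if (a[i] != b[i])
 *				return a[i] - b[i];	// sign of 1st difference
 *		}
 *		return 0;
 *	}
 */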
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if src and dst do not share the same offset
	 * from an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/*
	 * Also fall back to the short loop when comparing fewer than
	 * 8 bytes at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

/* Compare byte by byte, unrolled four times; CTR counts single bytes */
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3			/* prefetch both streams */
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the bytes up to the next 8-byte boundary first, so the
	 * rest of the comparison can run on 8-byte aligned accesses.
	 */
	andi.	r6,r3,7

	/* Compare the first, unaligned, doubleword: load the doubleword at
	 * (src & ~7UL) and shift out the leading garbage bits before
	 * comparing.
	 */
	rlwinm	r6,r3,3,26,28		/* r6 = (r3 & 7) * 8, the bit offset */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6		/* discard the bytes before the buffers */
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* bit offset back to a byte count */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8			/* bytes handled in this step */
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.  Use the .Llong loop when 32 bytes or
	 * more remain.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* Compare 1 ~ 31 bytes; at least the r3 address is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3			/* r0 = number of whole doublewords */
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61		/* r5 = r5 & 7, the leftover bytes */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare.  At least s1 is aligned
	 * to 8 bytes, but s2 may not be.  We must make sure s2 + 7 doesn't
	 * cross a page boundary, otherwise we might read past the end of the
	 * buffer and trigger a page fault.  We use 4K as the conservative
	 * minimum page size.  If we detect that case we fall back to the
	 * byte-by-byte loop.
	 *
	 * Otherwise the next doubleword is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8		/* number of unwanted trailing bytes */
	slwi	r6,r6,3		/* ... converted to a bit count */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6	/* shift the unwanted bits out */
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
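
	/*
	 * The page-boundary test above, as a C sketch (illustrative only;
	 * assumes the same conservative 4K page size the code uses):
	 *
	 *	#include <stdint.h>
	 *
	 *	// Loading 8 bytes from s2 is safe only when s2 + 7 stays
	 *	// within the same 4K page, i.e. (s2 & 0xfff) <= 0xff8.
	 *	static int can_load_dword(uintptr_t s2)
	 *	{
	 *		return (s2 & 0xfff) <= 0xff8;
	 *	}
	 */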

.Lnon_zero:
	mr	r3,rC		/* return the byte difference from .Lshort */
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try the VMX loop if the length is at least 4K bytes */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 address is aligned to 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)	/* save the nonvolatile GPRs used as rD..rH */
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5		/* CTR counts 32-byte blocks */
	mtctr	r0
	andi.	r5,r5,31	/* r5 = bytes left over after the loop */

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32
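
	/*
	 * The loop below is software-pipelined: each iteration loads the
	 * next 32 bytes while comparing the 32 bytes loaded by the previous
	 * iteration, spreading the results across cr0, cr1, cr6 and cr7 so
	 * the loads and compares overlap.
	 */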

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Finish the compares still outstanding from the last iteration */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)	/* restore the saved GPRs */
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort		/* handle the remaining 1 ~ 31 bytes */

.Lfirst32:
	/* CTR expired on the first 32 bytes; their compares are still pending */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

	/* A difference was found: return +1 or -1 from the unsigned compare */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:
	/* As .LcmpAB, for paths that did not save r27..r31 */
	li	r3,1
	bgtlr
	li	r3,-1
	blr
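
/*
 * Note: the doubleword paths return only the sign of the first difference
 * (+1 or -1), while .Lshort returns the actual byte difference; either is a
 * valid memcmp() result, since callers may only rely on the sign.
 */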

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with the src/dst addresses at the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * memcmp() tends to fail early, within the first 32 bytes.  Before
	 * using VMX instructions, and paying the VMX register save/restore
	 * penalty, compare the first 32 bytes with GPRs so the common
	 * early-mismatch case stays cheap.
	 */
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp	/* VMX unavailable: use the GPR loop */

3:
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is at least 4KB.  Align further, to 16 bytes. */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f

	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* Compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59	/* r5 = r5 & 31, the tail length */
	li	off16,16

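	/*
	 * The VCMPEQU*_RC macros set cr6 from the vector compare: the "all
	 * elements equal" result lands in cr6's LT bit, so "bnl cr6" (branch
	 * if LT is clear) leaves the loop as soon as any element pair differs.
	 */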
	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	/* the difference is in the second quadword: advance past the first */
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* locate the difference within the last 16 bytes using GPR loads */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm.	r6,r3,3,26,28		/* r6 = (r3 & 7) * 8, sets cr0 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4			/* unaligned load of s2 */
	sld	rA,rA,r6		/* clear the bytes before the buffer */
	srd	rA,rA,r6
	srd	rB,rB,r6		/* drop s2 bytes beyond those of s1 */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* bit offset back to a byte count */
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8			/* bytes handled in this step */
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only use VMX when the size is at least 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Pre-check the first 32 bytes with GPRs before enabling VMX
	 * operations, as in the same-offset path above.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* Align s1 to a 16-byte boundary first */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3		/* permute masks for the two streams */
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16	/* bytes consumed by the head compare */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4		/* preload the 1st quadword of s2 */
	LVS	v4,0,r4		/* permute mask for the unaligned s2 */
	srdi	r6,r5,5		/* CTR counts 32-byte blocks */
	clrldi	r5,r5,59	/* r5 = r5 & 31, the tail length */
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first quadword of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8	/* the 2nd quadword becomes the next 1st */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* The difference lies within the next 16 bytes; r3/r4 still point at
	 * the differing block, so let the scalar .Lcmp_lt32bytes path
	 * pinpoint it and compute the return value.
	 */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)