/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8    r6
#define off16   r7
#define off24   r8

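/* Scratch registers for the compare loops; rD..rH map to non-volatile
 * GPRs (r27..r31), which the .Llong path saves and restores on the stack.
 */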
#define rA  r9
#define rB  r10
#define rC  r11
#define rD  r27
#define rE  r28
#define rF  r29
#define rG  r30
#define rH  r31

#ifdef __LITTLE_ENDIAN__
#define LH  lhbrx
#define LW  lwbrx
#define LD  ldbrx
#define LVS lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
    vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH  lhzx
#define LW  lwzx
#define LD  ldx
#define LVS lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
    vperm _VRT,_VRA,_VRB,_VRC
#endif

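/* Minimum compare length (bytes) for which the VMX path is attempted;
 * below this the enter/exit_vmx_ops overhead is not worth paying.
 */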
#define VMX_THRESH 4096
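/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS save the volatile argument registers (r3-r5)
 * and LR, create a stack frame and call the C helpers enter_vmx_ops() /
 * exit_vmx_ops(), then restore the saved state. ENTER_VMX_OPS also compares
 * the helper's return value with zero into cr1, so callers can branch to a
 * non-VMX fallback when VMX cannot be used.
 */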
#define ENTER_VMX_OPS   \
    mflr    r0; \
    std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
    std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
    std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
    std     r0,16(r1); \
    stdu    r1,-STACKFRAMESIZE(r1); \
    bl      enter_vmx_ops; \
    cmpwi   cr1,r3,0; \
    ld      r0,STACKFRAMESIZE+16(r1); \
    ld      r3,STK_REG(R31)(r1); \
    ld      r4,STK_REG(R30)(r1); \
    ld      r5,STK_REG(R29)(r1); \
    addi    r1,r1,STACKFRAMESIZE; \
    mtlr    r0

#define EXIT_VMX_OPS \
    mflr    r0; \
    std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
    std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
    std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
    std     r0,16(r1); \
    stdu    r1,-STACKFRAMESIZE(r1); \
    bl      exit_vmx_ops; \
    ld      r0,STACKFRAMESIZE+16(r1); \
    ld      r3,STK_REG(R31)(r1); \
    ld      r4,STK_REG(R30)(r1); \
    ld      r5,STK_REG(R29)(r1); \
    addi    r1,r1,STACKFRAMESIZE; \
    mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 * handlers are named .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 * handlers are named .Ldiffoffset_xxxx
 */
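/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 * r3 = s1, r4 = s2, r5 = n; the result (<0, 0, >0) is returned in r3.
 */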
_GLOBAL_TOC(memcmp)
    cmpdi   cr1,r5,0

    /* Use the short loop if the src/dst addresses do not have the same
     * offset relative to an 8-byte alignment boundary.
     */
    xor r6,r3,r4
    andi.   r6,r6,7

    /* Fall back to the short loop when comparing fewer than 8 bytes
     * at aligned addresses.
     */
    cmpdi   cr6,r5,7

    beq cr1,.Lzero
    bgt cr6,.Lno_short

.Lshort:
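    /* Byte-at-a-time compare, unrolled 4x; CTR holds the remaining byte
     * count and bdz exits as soon as it reaches zero.
     */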
    mtctr   r5
1:  lbz rA,0(r3)
    lbz rB,0(r4)
    subf.   rC,rB,rA
    bne .Lnon_zero
    bdz .Lzero

    lbz rA,1(r3)
    lbz rB,1(r4)
    subf.   rC,rB,rA
    bne .Lnon_zero
    bdz .Lzero

    lbz rA,2(r3)
    lbz rB,2(r4)
    subf.   rC,rB,rA
    bne .Lnon_zero
    bdz .Lzero

    lbz rA,3(r3)
    lbz rB,3(r4)
    subf.   rC,rB,rA
    bne .Lnon_zero

    addi    r3,r3,4
    addi    r4,r4,4

    bdnz    1b

.Lzero:
    li  r3,0
    blr

.Lno_short:
    dcbt    0,r3
    dcbt    0,r4
    bne .Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
    /* Attempt to compare the bytes that are not 8-byte aligned so that
     * the rest of the comparison can run on an 8-byte aligned basis.
     */
    andi.   r6,r3,7

    /* Try to compare the first double word which is not 8 bytes aligned:
     * load the first double word at (src & ~7UL) and shift left the
     * appropriate bits before the comparison.
     */
    rlwinm  r6,r3,3,26,28   /* r6 = (r3 & 7) * 8 */
    beq     .Lsameoffset_8bytes_aligned
    clrrdi  r3,r3,3
    clrrdi  r4,r4,3
    LD  rA,0,r3
    LD  rB,0,r4
    sld rA,rA,r6
    sld rB,rB,r6
    cmpld   cr0,rA,rB
    srwi    r6,r6,3
    bne cr0,.LcmpAB_lightweight
    subfic  r6,r6,8
    subf.   r5,r6,r5
    addi    r3,r3,8
    addi    r4,r4,8
    beq .Lzero

.Lsameoffset_8bytes_aligned:
    /* Now we are 8-byte aligned. Use the .Llong loop if the remaining
     * length is 32 bytes or more.
     */
    cmpdi   cr6,r5,31
    bgt cr6,.Llong

.Lcmp_lt32bytes:
    /* compare 1 ~ 31 bytes; at least the r3 address is 8-byte aligned now */
    cmpdi   cr5,r5,7
    srdi    r0,r5,3
    ble cr5,.Lcmp_rest_lt8bytes

    /* handle 8 ~ 31 bytes */
    clrldi  r5,r5,61
    mtctr   r0
2:
    LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr0,rA,rB
    addi    r3,r3,8
    addi    r4,r4,8
    bne cr0,.LcmpAB_lightweight
    bdnz    2b

    cmpwi   r5,0
    beq .Lzero

.Lcmp_rest_lt8bytes:
    /*
     * Here we have less than 8 bytes to compare. At least s1 is aligned to
     * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
     * page boundary, otherwise we might read past the end of the buffer and
     * trigger a page fault. We use 4K as the conservative minimum page
     * size. If we detect that case we go to the byte-by-byte loop.
     *
     * Otherwise the next double word is loaded from s1 and s2, and shifted
     * right to compare the appropriate bits.
     */
    clrldi  r6,r4,(64-12)   // r6 = r4 & 0xfff
    cmpdi   r6,0xff8
    bgt .Lshort

    subfic  r6,r5,8
    slwi    r6,r6,3
    LD  rA,0,r3
    LD  rB,0,r4
    srd rA,rA,r6
    srd rB,rB,r6
    cmpld   cr0,rA,rB
    bne cr0,.LcmpAB_lightweight
    b   .Lzero

.Lnon_zero:
    mr  r3,rC
    blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
    /* Try the VMX loop if the length is 4K or greater */
    cmpldi  cr6,r5,VMX_THRESH
    bge cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
    /* At least the s1 address is 8-byte aligned */
    li  off8,8
    li  off16,16
    li  off24,24

    std r31,-8(r1)
    std r30,-16(r1)
    std r29,-24(r1)
    std r28,-32(r1)
    std r27,-40(r1)

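    /* Compare 32 bytes per iteration. Loads for the next block are issued
     * while the compares for the previous one are still outstanding, using
     * cr0/cr1/cr6/cr7 to track four pending 8-byte comparisons.
     */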
    srdi    r0,r5,5
    mtctr   r0
    andi.   r5,r5,31

    LD  rA,0,r3
    LD  rB,0,r4

    LD  rC,off8,r3
    LD  rD,off8,r4

    LD  rE,off16,r3
    LD  rF,off16,r4

    LD  rG,off24,r3
    LD  rH,off24,r4
    cmpld   cr0,rA,rB

    addi    r3,r3,32
    addi    r4,r4,32

    bdz .Lfirst32

    LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr1,rC,rD

    LD  rC,off8,r3
    LD  rD,off8,r4
    cmpld   cr6,rE,rF

    LD  rE,off16,r3
    LD  rF,off16,r4
    cmpld   cr7,rG,rH
    bne cr0,.LcmpAB

    LD  rG,off24,r3
    LD  rH,off24,r4
    cmpld   cr0,rA,rB
    bne cr1,.LcmpCD

    addi    r3,r3,32
    addi    r4,r4,32

    bdz .Lsecond32

    .balign 16

1:  LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr1,rC,rD
    bne cr6,.LcmpEF

    LD  rC,off8,r3
    LD  rD,off8,r4
    cmpld   cr6,rE,rF
    bne cr7,.LcmpGH

    LD  rE,off16,r3
    LD  rF,off16,r4
    cmpld   cr7,rG,rH
    bne cr0,.LcmpAB

    LD  rG,off24,r3
    LD  rH,off24,r4
    cmpld   cr0,rA,rB
    bne cr1,.LcmpCD

    addi    r3,r3,32
    addi    r4,r4,32

    bdnz    1b

.Lsecond32:
    cmpld   cr1,rC,rD
    bne cr6,.LcmpEF

    cmpld   cr6,rE,rF
    bne cr7,.LcmpGH

    cmpld   cr7,rG,rH
    bne cr0,.LcmpAB

    bne cr1,.LcmpCD
    bne cr6,.LcmpEF
    bne cr7,.LcmpGH

.Ltail:
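    /* Restore the non-volatile GPRs; any remaining 1..31 bytes are handled
     * by the short loop.
     */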
    ld  r31,-8(r1)
    ld  r30,-16(r1)
    ld  r29,-24(r1)
    ld  r28,-32(r1)
    ld  r27,-40(r1)

    cmpdi   r5,0
    beq .Lzero
    b   .Lshort

.Lfirst32:
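    /* CTR expired after the very first 32-byte block: resolve the four
     * outstanding compares and fall through to the tail.
     */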
    cmpld   cr1,rC,rD
    cmpld   cr6,rE,rF
    cmpld   cr7,rG,rH

    bne cr0,.LcmpAB
    bne cr1,.LcmpCD
    bne cr6,.LcmpEF
    bne cr7,.LcmpGH

    b   .Ltail

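/* Translate the first mismatching doubleword compare (recorded in
 * cr0/cr1/cr6/cr7) into the +1/-1 return value. The compares are unsigned
 * (cmpld), matching memcmp()'s unsigned byte ordering.
 */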
.LcmpAB:
    li  r3,1
    bgt cr0,.Lout
    li  r3,-1
    b   .Lout

.LcmpCD:
    li  r3,1
    bgt cr1,.Lout
    li  r3,-1
    b   .Lout

.LcmpEF:
    li  r3,1
    bgt cr6,.Lout
    li  r3,-1
    b   .Lout

.LcmpGH:
    li  r3,1
    bgt cr7,.Lout
    li  r3,-1

.Lout:
    ld  r31,-8(r1)
    ld  r30,-16(r1)
    ld  r29,-24(r1)
    ld  r28,-32(r1)
    ld  r27,-40(r1)
    blr

.LcmpAB_lightweight:   /* skip the non-volatile GPR restore */
    li  r3,1
    bgtlr
    li  r3,-1
    blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
    /* Entered with src/dst addresses that have the same offset relative
     * to an 8-byte alignment boundary.
     *
     * There is an optimization based on the following observation:
     * memcmp() tends to fail early, within the first 32 bytes.
     * Before using VMX instructions, which incur a 32x128-bit VMX
     * register load/restore penalty, we compare the first 32 bytes
     * so that we catch the ~80% of cases that fail there.
     */

    li  r0,4
    mtctr   r0
.Lsameoffset_prechk_32B_loop:
    LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr0,rA,rB
    addi    r3,r3,8
    addi    r4,r4,8
    bne     cr0,.LcmpAB_lightweight
    addi    r5,r5,-8
    bdnz    .Lsameoffset_prechk_32B_loop

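    /* The first 32 bytes matched, so it is worth paying the VMX entry cost;
     * cr1 reflects enter_vmx_ops()'s return value (zero means fall back).
     */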
    ENTER_VMX_OPS
    beq     cr1,.Llong_novmx_cmp

3:
    /* Check whether r4 has the same offset as r3 relative to a
     * 16-byte boundary.
     */
    xor r0,r3,r4
    andi.   r0,r0,0xf
    bne .Ldiffoffset_vmx_cmp_start

    /* The length is no less than ~4KB. Align to 16 bytes before the
     * VMX loop.
     */
    andi.   rA,r3,8
    LD  rA,0,r3
    beq 4f
    LD  rB,0,r4
    cmpld   cr0,rA,rB
    addi    r3,r3,8
    addi    r4,r4,8
    addi    r5,r5,-8

    beq cr0,4f
    /* save and restore cr0 across EXIT_VMX_OPS */
    mfocrf  r5,128
    EXIT_VMX_OPS
    mtocrf  128,r5
    b   .LcmpAB_lightweight

4:
    /* compare 32 bytes per loop iteration */
    srdi    r0,r5,5
    mtctr   r0
    clrldi  r5,r5,59
    li  off16,16

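    /* vcmpequd. sets the "all elements equal" bit in cr6; bnl cr6 branches
     * when at least one doubleword differs.
     */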
.balign 16
5:
    lvx     v0,0,r3
    lvx     v1,0,r4
    VCMPEQUD_RC(v0,v0,v1)
    bnl cr6,7f
    lvx     v0,off16,r3
    lvx     v1,off16,r4
    VCMPEQUD_RC(v0,v0,v1)
    bnl cr6,6f
    addi    r3,r3,32
    addi    r4,r4,32
    bdnz    5b

    EXIT_VMX_OPS
    cmpdi   r5,0
    beq .Lzero
    b   .Lcmp_lt32bytes

6:
    addi    r3,r3,16
    addi    r4,r4,16

7:
    /* the difference is within these 16 bytes; locate it with scalar loads */
    EXIT_VMX_OPS
    LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr0,rA,rB
    li  off8,8
    bne cr0,.LcmpAB_lightweight

    LD  rA,off8,r3
    LD  rB,off8,r4
    cmpld   cr0,rA,rB
    bne cr0,.LcmpAB_lightweight
    b   .Lzero
#endif

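/*
 * src and dst have different offsets relative to an 8-byte boundary:
 * align s1 (r3) to 8 bytes and use unaligned loads for s2 (r4).
 */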
.Ldiffoffset_8bytes_make_align_start:
    /* now try to align s1 with 8 bytes */
    rlwinm  r6,r3,3,26,28   /* r6 = (r3 & 7) * 8 */
    beq     .Ldiffoffset_align_s1_8bytes

    clrrdi  r3,r3,3
    LD  rA,0,r3
    LD  rB,0,r4  /* unaligned load */
    sld rA,rA,r6
    srd rA,rA,r6
    srd rB,rB,r6
    cmpld   cr0,rA,rB
    srwi    r6,r6,3
    bne cr0,.LcmpAB_lightweight

    subfic  r6,r6,8
    subf.   r5,r6,r5
    addi    r3,r3,8
    add r4,r4,r6

    beq .Lzero

.Ldiffoffset_align_s1_8bytes:
    /* now s1 is aligned to 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
    /* only do VMX ops when the size is 4K bytes or greater */
    cmpdi   cr5,r5,VMX_THRESH
    bge cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


    cmpdi   cr5,r5,31
    ble cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
    b   .Llong_novmx_cmp
#else
    b   .Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
    /* perform a 32-byte pre-check before enabling VMX operations */
    li  r0,4
    mtctr   r0
.Ldiffoffset_prechk_32B_loop:
    LD  rA,0,r3
    LD  rB,0,r4
    cmpld   cr0,rA,rB
    addi    r3,r3,8
    addi    r4,r4,8
    bne     cr0,.LcmpAB_lightweight
    addi    r5,r5,-8
    bdnz    .Ldiffoffset_prechk_32B_loop

    ENTER_VMX_OPS
    beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
    /* First try to align r3 to 16 bytes */
    andi.   r6,r3,0xf
    li  off16,16
    beq     .Ldiffoffset_vmx_s1_16bytes_align

    LVS v3,0,r3
    LVS v4,0,r4

    lvx     v5,0,r3
    lvx     v6,0,r4
    LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
    LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

    VCMPEQUB_RC(v7,v9,v10)
    bnl cr6,.Ldiffoffset_vmx_diff_found

    subfic  r6,r6,16
    subf    r5,r6,r5
    add     r3,r3,r6
    add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
    /* now s1 is aligned to 16 bytes */
    lvx     v6,0,r4
    LVS v4,0,r4
    srdi    r6,r5,5  /* loop for 32 bytes each */
    clrldi  r5,r5,59
    mtctr   r6

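    /* v6 carries the previously loaded aligned quadword of r4 across
     * iterations, so each LD_VSR_CROSS16B only needs one new load; the
     * "vor v6,v8,v8" copies forward the quadword just loaded.
     */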
.balign 16
.Ldiffoffset_vmx_32bytesloop:
    /* the first qw of r4 was saved in v6 */
    lvx v9,0,r3
    LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
    VCMPEQUB_RC(v7,v9,v10)
    vor v6,v8,v8
    bnl cr6,.Ldiffoffset_vmx_diff_found

    addi    r3,r3,16
    addi    r4,r4,16

    lvx v9,0,r3
    LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
    VCMPEQUB_RC(v7,v9,v10)
    vor v6,v8,v8
    bnl cr6,.Ldiffoffset_vmx_diff_found

    addi    r3,r3,16
    addi    r4,r4,16

    bdnz    .Ldiffoffset_vmx_32bytesloop

    EXIT_VMX_OPS

    cmpdi   r5,0
    beq .Lzero
    b   .Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
    EXIT_VMX_OPS
    /* in any case, the diff lies within the next 16 bytes */
    li  r5,16
    b   .Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)