Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * Copyright (c) 2013-2022, Arm Limited.
0004  *
0005  * Adapted from the original at:
0006  * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
0007  */
0008 
0009 #include <linux/linkage.h>
0010 #include <asm/assembler.h>
0011 
/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

/* Spell local labels as L(name) -> .Lname (non-exported in GAS).  */
#define L(label) .L ## label

/* Byte-replicated constants for the word-at-a-time NUL-byte test:
   (x - REP8_01) & ~(x | REP8_7f) has bit 7 set in exactly the bytes
   of x that are zero (see the comment above L(loop_aligned)).  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1        x0
#define src2        x1
#define limit       x2
#define result      x0

/* Internal variables.  */
#define data1       x3
#define data1w      w3
#define data2       x4
#define data2w      w4
#define has_nul     x5
#define diff        x6
#define syndrome    x7
#define tmp1        x8
#define tmp2        x9
#define tmp3        x10
#define zeroones    x11
#define pos     x12
#define mask        x13
#define endloop     x14
/* count and offset alias registers whose primary roles (mask/pos) are
   dead at the points where count/offset are live.  */
#define count       mask
#define offset      pos
#define neg_offset  x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
   */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
0060 
/* int strncmp(const char *src1, const char *src2, size_t limit)
 *
 * In:   src1  (x0) - first NUL-terminated string
 *       src2  (x1) - second NUL-terminated string
 *       limit (x2) - maximum number of bytes to compare
 * Out:  result (x0) - 0 if the strings are equal up to 'limit' or up to
 *       the first NUL; otherwise the difference between the first pair
 *       of mismatching bytes, each zero-extended (chars are unsigned).
 * Clobbers: x3-x15 (see register #defines above) and NZCV.
 * MTE compatible per the file's stated assumptions.
 */
SYM_FUNC_START(__pi_strncmp)
    cbz limit, L(ret0)      /* limit == 0: equal by definition.  */
    eor tmp1, src1, src2    /* Low 3 bits clear iff mutually 8-aligned.  */
    mov zeroones, #REP8_01
    tst tmp1, #7
    and count, src1, #7     /* Bytes src1 sits past 8-byte alignment.  */
    b.ne    L(misaligned8)
    cbnz    count, L(mutual_align)

    /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
       (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
       can be done in parallel across the entire word.  */
    .p2align 4
L(loop_aligned):
    ldr data1, [src1], #8
    ldr data2, [src2], #8
L(start_realigned):
    subs    limit, limit, #8
    sub tmp1, data1, zeroones
    orr tmp2, data1, #REP8_7f
    eor diff, data1, data2  /* Non-zero if differences found.  */
    csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
    bics    has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
    ccmp    endloop, #0, #0, eq
    /* Loop while no NUL byte, no difference, and limit not exhausted.  */
    b.eq    L(loop_aligned)
    /* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
    orr syndrome, diff, has_nul
    add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
    /* Limit was reached. Check if the NUL byte or the difference
       is before the limit. */
    rev syndrome, syndrome  /* LE: byte-reverse so clz finds the FIRST event.  */
    rev data1, data1
    clz pos, syndrome       /* Bit index of the first diff or NUL byte.  */
    rev data2, data2
    lsl data1, data1, pos   /* Move the critical byte to the top.  */
    cmp limit, pos, lsr #3  /* Is that byte index within the limit?  */
    lsl data2, data2, pos
    /* But we need to zero-extend (char is unsigned) the value and then
       perform a signed 32-bit subtraction.  */
    lsr data1, data1, #56
    sub result, data1, data2, lsr #56
    csel result, result, xzr, hi    /* Event at/past the limit -> equal (0).  */
    ret
#else
    /* Not reached the limit, must have found the end or a diff.  */
    tbz limit, #63, L(not_limit)    /* limit still non-negative after subs.  */
    add tmp1, limit, 8      /* tmp1 = valid bytes in the final dword.  */
    cbz limit, L(not_limit)

    lsl limit, tmp1, #3 /* Bytes -> bits.  */
    mov mask, #~0
    lsr mask, mask, limit   /* BE: ones over the bytes past the limit.  */
    bic data1, data1, mask  /* Zero the out-of-limit bytes in both words.  */
    bic data2, data2, mask

    /* Make sure that the NUL byte is marked in the syndrome.  */
    orr has_nul, has_nul, mask

L(not_limit):
    /* For big-endian we cannot use the trick with the syndrome value
       as carry-propagation can corrupt the upper bits if the trailing
       bytes in the string contain 0x01.  */
    /* However, if there is no NUL byte in the dword, we can generate
       the result directly.  We can't just subtract the bytes as the
       MSB might be significant.  */
    cbnz    has_nul, 1f
    cmp data1, data2        /* BE: register order equals memory order.  */
    cset    result, ne
    cneg    result, result, lo
    ret
1:
    /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
    rev tmp3, data1
    sub tmp1, tmp3, zeroones
    orr tmp2, tmp3, #REP8_7f
    bic has_nul, tmp1, tmp2
    rev has_nul, has_nul
    orr syndrome, diff, has_nul
    clz pos, syndrome
    /* The most-significant-non-zero bit of the syndrome marks either the
       first bit that is different, or the top bit of the first zero byte.
       Shifting left now will bring the critical information into the
       top bits.  */
L(end_quick):
    lsl data1, data1, pos
    lsl data2, data2, pos
    /* But we need to zero-extend (char is unsigned) the value and then
       perform a signed 32-bit subtraction.  */
    lsr data1, data1, #56
    sub result, data1, data2, lsr #56
    ret
#endif

L(mutual_align):
    /* Sources are mutually aligned, but are not currently at an
       alignment boundary.  Round down the addresses and then mask off
       the bytes that precede the start point.
       We also need to adjust the limit calculations, but without
       overflowing if the limit is near ULONG_MAX.  */
    bic src1, src1, #7
    bic src2, src2, #7
    ldr data1, [src1], #8
    neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
    ldr data2, [src2], #8
    mov tmp2, #~0
    LS_FW   tmp2, tmp2, tmp3    /* Shift (count & 63).  */
    /* Adjust the limit and ensure it doesn't overflow.  */
    adds    limit, limit, count
    csinv   limit, limit, xzr, lo   /* Saturate to ULONG_MAX on carry-out.  */
    orr data1, data1, tmp2  /* Pre-start bytes become 0xff: equal, non-NUL.  */
    orr data2, data2, tmp2
    b   L(start_realigned)

    .p2align 4
    /* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
    cmp limit, #16
    b.hs    L(try_misaligned_words)

L(byte_loop):
    /* Perhaps we can do better than this.  */
    ldrb    data1w, [src1], #1
    ldrb    data2w, [src2], #1
    subs    limit, limit, #1
    ccmp    data1w, #1, #0, hi  /* NZCV = 0b0000.  */
    ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
    /* Loop while limit remains, byte is non-NUL, and bytes are equal.  */
    b.eq    L(byte_loop)
L(done):
    sub result, data1, data2
    ret
    /* Align the SRC1 to a dword by doing a bytewise compare and then do
       the dword loop.  */
L(try_misaligned_words):
    cbz count, L(src1_aligned)

    neg count, count
    and count, count, #7    /* Bytes needed to align src1 to a dword.  */
    sub limit, limit, count

L(page_end_loop):
    ldrb    data1w, [src1], #1
    ldrb    data2w, [src2], #1
    cmp data1w, #1          /* CS iff byte is non-NUL.  */
    ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
    b.ne    L(done)         /* Taken on NUL or on a byte difference.  */
    subs    count, count, #1
    b.hi    L(page_end_loop)

    /* The following diagram explains the comparison of misaligned strings.
       The bytes are shown in natural order. For little-endian, it is
       reversed in the registers. The "x" bytes are before the string.
       The "|" separates data that is loaded at one time.
       src1     | a a a a a a a a | b b b c c c c c | . . .
       src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .

       After shifting in each step, the data looks like this:
                    STEP_A              STEP_B              STEP_C
       data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
       data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

       The bytes with "0" are eliminated from the syndrome via mask.

       Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
       time from SRC2. The comparison happens in 3 steps. After each step
       the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
    /* Calculate offset from 8 byte alignment to string start in bits. No
       need to mask offset since shifts are ignoring upper bits. */
    lsl offset, src2, #3
    bic src2, src2, #0xf    /* Align src2 down to 16 for ldp loads.  */
    mov mask, -1
    neg neg_offset, offset
    ldr data1, [src1], #8
    ldp tmp1, tmp2, [src2], #16
    LS_BK   mask, mask, neg_offset
    and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
    /* Skip the first compare if data in tmp1 is irrelevant. */
    tbnz    offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
    /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
    LS_FW   data2, tmp1, offset
    LS_BK   tmp1, tmp2, neg_offset
    subs    limit, limit, #8
    orr data2, data2, tmp1  /* 8 bytes from SRC2 combined from two regs.*/
    sub has_nul, data1, zeroones
    eor diff, data1, data2  /* Non-zero if differences found.  */
    orr tmp3, data1, #REP8_7f
    csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
    bic has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
    orr tmp3, endloop, has_nul
    cbnz    tmp3, L(full_check)

    ldr data1, [src1], #8
L(misaligned_mid_loop):
    /* STEP_B: Compare first part of data1 to second part of tmp2. */
    LS_FW   data2, tmp2, offset
#ifdef __AARCH64EB__
    /* For big-endian we do a byte reverse to avoid carry-propagation
    problem described above. This way we can reuse the has_nul in the
    next step and also use syndrome value trick at the end. */
    rev tmp3, data1
    #define data1_fixed tmp3
#else
    #define data1_fixed data1
#endif
    sub has_nul, data1_fixed, zeroones
    orr tmp3, data1_fixed, #REP8_7f
    eor diff, data2, data1  /* Non-zero if differences found.  */
    bic has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
    rev has_nul, has_nul
#endif
    cmp limit, neg_offset, lsr #3
    orr syndrome, diff, has_nul
    bic syndrome, syndrome, mask    /* Ignore later bytes. */
    csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
    cbnz    tmp3, L(syndrome_check)

    /* STEP_C: Compare second part of data1 to first part of tmp1. */
    ldp tmp1, tmp2, [src2], #16
    cmp limit, #8
    LS_BK   data2, tmp1, neg_offset
    eor diff, data2, data1  /* Non-zero if differences found.  */
    orr syndrome, diff, has_nul
    and syndrome, syndrome, mask    /* Ignore earlier bytes. */
    csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
    cbnz    tmp3, L(syndrome_check)

    ldr data1, [src1], #8
    sub limit, limit, #8
    b   L(loop_misaligned)

#ifdef  __AARCH64EB__
L(syndrome_check):
    clz pos, syndrome
    cmp pos, limit, lsl #3  /* Is the event before the byte limit?  */
    b.lo    L(end_quick)
#endif

L(ret0):
    mov result, #0
    ret
SYM_FUNC_END(__pi_strncmp)
/* Weak alias so architectures/ftrace can override; __pi_ name is the
   position-independent entry used by early boot code.
   NOTE(review): the __pi_ rationale is kernel convention — confirm
   against arch/arm64 linkage docs.  */
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)