Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0-only */
0002 /*
0003  * Copyright (c) 2013-2021, Arm Limited.
0004  *
0005  * Adapted from the original at:
0006  * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
0007  */
0008 
0009 #include <linux/linkage.h>
0010 #include <asm/assembler.h>
0011 #include <asm/mte-def.h>
0012 
0013 /* Assumptions:
0014  *
0015  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
0016  */
0017 
/* L(x) expands to an assembler-local label (.Lx), kept out of the symbol table. */
0018 #define L(label) .L ## label
0019 
0020 /* Arguments and results.  */
0021 #define srcin       x0
0022 #define len     x0
0023 
0024 /* Locals and temporaries.  */
0025 #define src     x1
0026 #define data1       x2
0027 #define data2       x3
/* NOTE: has_nul1/has_nul2 deliberately alias tmp1/tmp2 (x4/x5); the two
   groups are never live at the same time, so only x1-x8 are used overall. */
0028 #define has_nul1    x4
0029 #define has_nul2    x5
0030 #define tmp1        x4
0031 #define tmp2        x5
0032 #define tmp3        x6
0033 #define tmp4        x7
0034 #define zeroones    x8
0035 
0036     /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
0037        (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
0038        can be done in parallel across the entire word. A faster check
0039        (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
0040        false hits for characters 129..255.  */
0041 
/* 0x01 / 0x7f / 0x80 replicated into every byte of a 64-bit word, for the
   per-word syndrome computation described above. */
0042 #define REP8_01 0x0101010101010101
0043 #define REP8_7f 0x7f7f7f7f7f7f7f7f
0044 #define REP8_80 0x8080808080808080
0045 
0046 /*
0047  * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
0048  * (16-byte) granularity, and we must ensure that no access straddles this
0049  * alignment boundary.
0050  */
0051 #ifdef CONFIG_KASAN_HW_TAGS
0052 #define MIN_PAGE_SIZE MTE_GRANULE_SIZE
0053 #else
0054 #define MIN_PAGE_SIZE 4096
0055 #endif
0056 
0057     /* Since strings are short on average, we check the first 16 bytes
0058        of the string for a NUL character.  In order to do an unaligned ldp
0059        safely we have to do a page cross check first.  If there is a NUL
0060        byte we calculate the length from the 2 8-byte words using
0061        conditional select to reduce branch mispredictions (it is unlikely
0062        strlen will be repeatedly called on strings with the same length).
0063 
0064        If the string is longer than 16 bytes, we align src so don't need
0065        further page cross checks, and process 32 bytes per iteration
0066        using the fast NUL check.  If we encounter non-ASCII characters,
0067        fallback to a second loop using the full NUL check.
0068 
0069        If the page cross check fails, we read 16 bytes from an aligned
0070        address, remove any characters before the string, and continue
0071        in the main loop using aligned loads.  Since strings crossing a
0072        page in the first 16 bytes are rare (probability of
0073        16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
0074 
0075        AArch64 systems have a minimum page size of 4k.  We don't bother
0076        checking for larger page sizes - the cost of setting up the correct
0077        page size is just not worth the extra gain from a small reduction in
0078        the cases taking the slow path.  Note that we only care about
0079        whether the first fetch, which may be misaligned, crosses a page
0080        boundary.  */
0081 
/*
 * size_t strlen(const char *srcin)
 *
 * In:       x0 (srcin) = pointer to NUL-terminated string
 * Out:      x0 (len)   = number of bytes before the first NUL
 * Clobbers: x1-x8 and the condition flags; no stack usage.
 */
0082 SYM_FUNC_START(__pi_strlen)
     /* Would an unaligned 16-byte load at srcin straddle a page (or, under
        KASAN_HW_TAGS, an MTE granule) boundary?  If so, take the rare
        aligned-load path instead.  */
0083     and tmp1, srcin, MIN_PAGE_SIZE - 1
0084     mov zeroones, REP8_01
0085     cmp tmp1, MIN_PAGE_SIZE - 16
0086     b.gt    L(page_cross)
0087     ldp data1, data2, [srcin]
0088 #ifdef __AARCH64EB__
0089     /* For big-endian, carry propagation (if the final byte in the
0090        string is 0x01) means we cannot use has_nul1/2 directly.
0091        Since we expect strings to be small and early-exit,
0092        byte-swap the data now so has_nul1/2 will be correct.  */
0093     rev data1, data1
0094     rev data2, data2
0095 #endif
     /* Exact syndrome (X - 0x01..01) & ~(X | 0x7f..7f): 0x80 in every byte
        of data1/data2 that is NUL, 0 elsewhere (see header comment).  */
0096     sub tmp1, data1, zeroones
0097     orr tmp2, data1, REP8_7f
0098     sub tmp3, data2, zeroones
0099     orr tmp4, data2, REP8_7f
0100     bics    has_nul1, tmp1, tmp2
0101     bic has_nul2, tmp3, tmp4
     /* ccmp folds the two tests: if has_nul1 == 0 it compares has_nul2
        with 0 (setting C); otherwise it forces NZCV = 0.  Net effect:
        Z set iff both syndromes are zero, C set iff has_nul1 == 0.  */
0102     ccmp    has_nul2, 0, 0, eq
0103     beq L(main_loop_entry)
0104
0105     /* Enter with C = has_nul1 == 0.  */
     /* cc (C clear) => NUL is in the first word, base offset 0; otherwise
        it is in the second word, base offset 8.  rev + clz convert the
        0x80 syndrome bit into 8 * byte-index of the first NUL.  */
0106     csel    has_nul1, has_nul1, has_nul2, cc
0107     mov len, 8
0108     rev has_nul1, has_nul1
0109     clz tmp1, has_nul1
0110     csel    len, xzr, len, cc
0111     add len, len, tmp1, lsr 3
0112     ret
0113
0114     /* The inner loop processes 32 bytes per iteration and uses the fast
0115        NUL check.  If we encounter non-ASCII characters, use a second
0116        loop with the accurate NUL check.  */
0117     .p2align 4
0118 L(main_loop_entry):
     /* Align down and bias by -16 so the first pre-indexed ldp below loads
        from the aligned address 16 bytes past the block containing srcin
        (the first 16 bytes were already checked above).  */
0119     bic src, srcin, 15
0120     sub src, src, 16
0121 L(main_loop):
0122     ldp data1, data2, [src, 32]!
0123 L(page_cross_entry):
     /* Fast check: 0x80 bits of (X - 0x01..01).  Exact for ASCII data but
        gives false hits for bytes 0x81..0xff (see header comment).  */
0124     sub tmp1, data1, zeroones
0125     sub tmp3, data2, zeroones
0126     orr tmp2, tmp1, tmp3
0127     tst tmp2, zeroones, lsl 7
0128     bne 1f
0129     ldp data1, data2, [src, 16]
0130     sub tmp1, data1, zeroones
0131     sub tmp3, data2, zeroones
0132     orr tmp2, tmp1, tmp3
0133     tst tmp2, zeroones, lsl 7
0134     beq L(main_loop)
     /* The hit was in the second 16-byte pair: advance src so it points at
        the pair held in data1/data2 before falling into the exact check.  */
0135     add src, src, 16
0136 1:
0137     /* The fast check failed, so do the slower, accurate NUL check.  */
0138     orr tmp2, data1, REP8_7f
0139     orr tmp4, data2, REP8_7f
0140     bics    has_nul1, tmp1, tmp2
0141     bic has_nul2, tmp3, tmp4
0142     ccmp    has_nul2, 0, 0, eq
0143     beq L(nonascii_loop)
0144
0145     /* Enter with C = has_nul1 == 0.  */
0146 L(tail):
0147 #ifdef __AARCH64EB__
0148     /* For big-endian, carry propagation (if the final byte in the
0149        string is 0x01) means we cannot use has_nul1/2 directly.  The
0150        easiest way to get the correct byte is to byte-swap the data
0151        and calculate the syndrome a second time.  */
0152     csel    data1, data1, data2, cc
0153     rev data1, data1
0154     sub tmp1, data1, zeroones
0155     orr tmp2, data1, REP8_7f
0156     bic has_nul1, tmp1, tmp2
0157 #else
0158     csel    has_nul1, has_nul1, has_nul2, cc
0159 #endif
     /* len = bytes before the 16-byte block containing the NUL; add 8 when
        the NUL is in the second word (C set), then add the byte index
        recovered from the syndrome via rev + clz.  */
0160     sub len, src, srcin
0161     rev has_nul1, has_nul1
0162     add tmp2, len, 8
0163     clz tmp1, has_nul1
0164     csel    len, len, tmp2, cc
0165     add len, len, tmp1, lsr 3
0166     ret
0167
     /* Slow path for non-ASCII data: two 16-byte loads per iteration, each
        using the exact (no false hit) syndrome.  */
0168 L(nonascii_loop):
0169     ldp data1, data2, [src, 16]!
0170     sub tmp1, data1, zeroones
0171     orr tmp2, data1, REP8_7f
0172     sub tmp3, data2, zeroones
0173     orr tmp4, data2, REP8_7f
0174     bics    has_nul1, tmp1, tmp2
0175     bic has_nul2, tmp3, tmp4
0176     ccmp    has_nul2, 0, 0, eq
0177     bne L(tail)
0178     ldp data1, data2, [src, 16]!
0179     sub tmp1, data1, zeroones
0180     orr tmp2, data1, REP8_7f
0181     sub tmp3, data2, zeroones
0182     orr tmp4, data2, REP8_7f
0183     bics    has_nul1, tmp1, tmp2
0184     bic has_nul2, tmp3, tmp4
0185     ccmp    has_nul2, 0, 0, eq
0186     beq L(nonascii_loop)
0187     b   L(tail)
0188
0189     /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
0190        srcin to 0x7f, so we ignore any NUL bytes before the string.
0191        Then continue in the aligned loop.  */
0192 L(page_cross):
0193     bic src, srcin, 15
0194     ldp data1, data2, [src]
     /* tmp1 = 8 * (srcin mod 8); the shifts below use only the low 6 bits.  */
0195     lsl tmp1, srcin, 3
0196     mov tmp4, -1
0197 #ifdef __AARCH64EB__
0198     /* Big-endian.  Early bytes are at MSB.  */
0199     lsr tmp1, tmp4, tmp1    /* Shift (tmp1 & 63).  */
0200 #else
0201     /* Little-endian.  Early bytes are at LSB.  */
0202     lsl tmp1, tmp4, tmp1    /* Shift (tmp1 & 63).  */
0203 #endif
     /* After the orr, ~tmp1 holds 0x7f in every byte that precedes srcin
        and 0x00 elsewhere; orn then forces those early bytes to >= 0x7f.
        0x7f (not 0xff) matters: 0x7f passes the *fast* check used at
        L(page_cross_entry), whereas 0xff would be a false hit.  */
0204     orr tmp1, tmp1, REP8_80
0205     orn data1, data1, tmp1
0206     orn tmp2, data2, tmp1
     /* If srcin lies in the second word of the block (bit 3 set), the whole
        first word precedes the string: replace it with all-ones and apply
        the mask to the second word instead.  */
0207     tst srcin, 8
0208     csel    data1, data1, tmp4, eq
0209     csel    data2, data2, tmp2, eq
0210     b   L(page_cross_entry)
0211 SYM_FUNC_END(__pi_strlen)
/* Publish the __pi_ (position-independent) implementation as a weak "strlen"
   alias and export it for modules.  NOTE(review): the NOKASAN export variant
   is presumably paired with the KASAN_HW_TAGS handling above -- confirm
   against the kernel's export-symbol documentation.  */
0212 SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
0213 EXPORT_SYMBOL_NOKASAN(strlen)