/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro and re-licensed under GPLv2 for the Linux kernel. The original
 * code can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
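/*
 * Note: the ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 operations
 * used below are not architectural instructions but assembler macros that
 * the file including this template is expected to define (for example as
 * plain or user-access loads/stores with post-increment), together with
 * the entry point and the return.
 */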
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
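	/*
	 * Example: if src ends in 0x...3, then tmp2 = (-src) & 15 = 13, and
	 * the 13 leading bytes copied below bring src up to 16-byte
	 * alignment.
	 */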
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading data from src to dst in increasing address
	 * order. This way, the risk of overwriting the source data is
	 * eliminated when the distance between src and dst is less than 16.
	 * The memory accesses here are aligned.
	 */
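	/*
	 * tmp2 is at most 15 here: bit 0 selects a 1-byte copy, bit 1 a
	 * 2-byte copy, bit 2 a 4-byte copy and bit 3 an 8-byte copy, so
	 * exactly tmp2 bytes are consumed.
	 */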
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
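	/*
	 * count & 0x30 gives the number of whole 16-byte blocks left in the
	 * tail: 0x30 falls through all three ldp1/stp1 pairs (48 bytes),
	 * 0x20 branches to 1f (32 bytes) and 0x10 branches to 2f (16 bytes).
	 */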
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) and winding src back to an aligned
	 * address, as the original cortex-strings memcpy does. If that
	 * original scheme were kept here, memmove would need to satisfy the
	 * precondition that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and to decouple memcpy from
	 * memmove, the original scheme was dropped.
	 */
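	/*
	 * Fewer than 16 bytes remain: bits 3, 2, 1 and 0 of count select an
	 * 8-, 4-, 2- and 1-byte copy respectively, in increasing address
	 * order.
	 */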
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

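	/*
	 * count is still biased by -128 from the subs above; since 128 is a
	 * multiple of 64, count & 0x3f below still equals the number of
	 * tail bytes left for .Ltail63.
	 */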
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
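	/*
	 * count was already reduced by 128 at .Lcpy_over64: 64 bytes for
	 * this pre-load and 64 for the final stores after the loop, so each
	 * loop iteration only subtracts the 64 bytes it stores.
	 */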
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
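	/*
	 * The loop exits with the last 64 loaded bytes still in A/B/C/D;
	 * store them before handling any remaining tail bytes.
	 */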
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: