/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin   x0
#define src x1
#define count   x2
#define dst x3
#define srcend  x4
#define dstend  x5
#define A_l x6
#define A_lw    w6
#define A_h x7
#define B_l x8
#define B_lw    w8
#define B_h x9
#define C_l x10
#define C_lw    w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1    x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
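
/* The small and medium paths below perform all of their loads before any
   store, so they are safe for overlapping buffers without an explicit check;
   only the large-copy path needs the overlap test at L(copy_long).  */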

SYM_FUNC_START(__pi_memcpy)
    add srcend, src, count
    add dstend, dstin, count
    cmp count, 128
    b.hi    L(copy_long)
    cmp count, 32
    b.hi    L(copy32_128)

    /* Small copies: 0..32 bytes.  */
    cmp count, 16
    b.lo    L(copy16)
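    /* 16..32 bytes: copy 16 bytes from the start and 16 from the end.  For
       counts below 32 the two blocks overlap in the middle, which simply
       stores the same bytes twice.  */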
    ldp A_l, A_h, [src]
    ldp D_l, D_h, [srcend, -16]
    stp A_l, A_h, [dstin]
    stp D_l, D_h, [dstend, -16]
    ret

    /* Copy 8-15 bytes.  */
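    /* Bit 3 of count distinguishes 8-15 bytes (handled here with one
       doubleword from the start and one from the end, which may overlap)
       from 0-7 bytes, which branch on to L(copy8).  */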
L(copy16):
    tbz count, 3, L(copy8)
    ldr A_l, [src]
    ldr A_h, [srcend, -8]
    str A_l, [dstin]
    str A_h, [dstend, -8]
    ret

    .p2align 3
    /* Copy 4-7 bytes.  */
L(copy8):
    tbz count, 2, L(copy4)
    ldr A_lw, [src]
    ldr B_lw, [srcend, -4]
    str A_lw, [dstin]
    str B_lw, [dstend, -4]
    ret

    /* Copy 0..3 bytes using a branchless sequence.  */
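    /* tmp1 = count / 2.  Copying the bytes at offsets 0, count/2 and
       count - 1 covers every length from 1 to 3: for count 1 all three
       land on the same byte, for count 2 on bytes 0 and 1, and for
       count 3 on bytes 0, 1 and 2.  */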
L(copy4):
    cbz count, L(copy0)
    lsr tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    C_lw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    C_lw, [dstend, -1]
L(copy0):
    ret

    .p2align 4
    /* Medium copies: 33..128 bytes.  */
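    /* Load the first 32 and the last 32 bytes up front.  For 33..64 bytes
       these four pairs cover the whole buffer (overlapping in the middle)
       and are stored directly; larger copies continue at L(copy128).  */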
L(copy32_128):
    ldp A_l, A_h, [src]
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [srcend, -32]
    ldp D_l, D_h, [srcend, -16]
    cmp count, 64
    b.hi    L(copy128)
    stp A_l, A_h, [dstin]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstend, -32]
    stp D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy 65..128 bytes.  */
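    /* The first 64 bytes come from the start of the buffer and the last 32
       from the end; when count exceeds 96, two more pairs loaded from
       [srcend, -64] fill the remaining gap in the middle.  */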
L(copy128):
    ldp E_l, E_h, [src, 32]
    ldp F_l, F_h, [src, 48]
    cmp count, 96
    b.ls    L(copy96)
    ldp G_l, G_h, [srcend, -64]
    ldp H_l, H_h, [srcend, -48]
    stp G_l, G_h, [dstend, -64]
    stp H_l, H_h, [dstend, -48]
L(copy96):
    stp A_l, A_h, [dstin]
    stp B_l, B_h, [dstin, 16]
    stp E_l, E_h, [dstin, 32]
    stp F_l, F_h, [dstin, 48]
    stp C_l, C_h, [dstend, -32]
    stp D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy more than 128 bytes.  */
L(copy_long):
    /* Use backwards copy if there is an overlap.  */
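    /* tmp1 = dstin - src.  Zero means source and destination are identical,
       so return immediately.  If the distance is smaller than count
       (unsigned), the destination starts inside the source and a forward
       copy would overwrite bytes before reading them, so copy backwards.
       In every other case, including dst below src, a forward copy is
       safe.  */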
    sub tmp1, dstin, src
    cbz tmp1, L(copy0)
    cmp tmp1, count
    b.lo    L(copy_long_backwards)

    /* Copy 16 bytes and then align dst to 16-byte alignment.  */
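    /* dst is dstin rounded down to a 16-byte boundary and src is moved back
       by the same amount, so every later store uses an aligned destination.
       The first 16 bytes are copied from the original pointers; the loop's
       first store at [dst, 16] may rewrite up to 15 of those bytes with the
       same data.  */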

    ldp D_l, D_h, [src]
    and tmp1, dstin, 15
    bic dst, dstin, 15
    sub src, src, tmp1
    add count, count, tmp1  /* Count is now 16 too large.  */
    ldp A_l, A_h, [src, 16]
    stp D_l, D_h, [dstin]
    ldp B_l, B_h, [src, 32]
    ldp C_l, C_h, [src, 48]
    ldp D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  /* Test and readjust count.  */
    b.ls    L(copy64_from_end)

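    /* Software pipelined: each iteration stores the four pairs loaded on
       the previous iteration while loading the next four.  On exit, A..D
       still hold 64 bytes that have not been stored yet.  */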
L(loop64):
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [src, 16]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [src, 32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [src, 48]
    stp D_l, D_h, [dst, 64]!
    ldp D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    L(loop64)

    /* Write the last iteration and copy 64 bytes from the end.  */
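    /* Store the 64 bytes still held in A..D, interleaved with loads of the
       final 64 bytes of the source, which are then stored at the end of the
       destination.  */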
L(copy64_from_end):
    ldp E_l, E_h, [srcend, -64]
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [srcend, -48]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [srcend, -16]
    stp D_l, D_h, [dst, 64]
    stp E_l, E_h, [dstend, -64]
    stp A_l, A_h, [dstend, -48]
    stp B_l, B_h, [dstend, -32]
    stp C_l, C_h, [dstend, -16]
    ret

    .p2align 4

    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dst to 16-byte alignment.  */
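    /* Mirror image of the forward path: the last 16 bytes are copied from
       the original pointers, dstend is aligned down to a 16-byte boundary,
       and the 64-byte loop then walks towards the start of the buffer,
       finishing with 64 bytes copied from the start.  */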
L(copy_long_backwards):
    ldp D_l, D_h, [srcend, -16]
    and tmp1, dstend, 15
    sub srcend, srcend, tmp1
    sub count, count, tmp1
    ldp A_l, A_h, [srcend, -16]
    stp D_l, D_h, [dstend, -16]
    ldp B_l, B_h, [srcend, -32]
    ldp C_l, C_h, [srcend, -48]
    ldp D_l, D_h, [srcend, -64]!
    sub dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start)

L(loop64_backwards):
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [srcend, -16]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [srcend, -48]
    stp D_l, D_h, [dstend, -64]!
    ldp D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    L(loop64_backwards)

    /* Write the last iteration and copy 64 bytes from the start.  */
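    /* Mirror of L(copy64_from_end): store the data still held in registers,
       interleaved with loads of the first 64 bytes of the source, which are
       then stored at dstin.  */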
L(copy64_from_start):
    ldp G_l, G_h, [src, 48]
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [src, 32]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [src, 16]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [src]
    stp D_l, D_h, [dstend, -64]
    stp G_l, G_h, [dstin, 48]
    stp A_l, A_h, [dstin, 32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin]
    ret
SYM_FUNC_END(__pi_memcpy)

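/* memcpy and memmove share the single implementation above.  The weak
   aliases allow the generic names to be overridden (for example by
   instrumented versions), while the double-underscore names always refer
   to this implementation.  */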
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)