/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from the memmove_64.c file.
 *  - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
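/*
 * Overall strategy (as implemented below): depending on CPU features,
 * length, direction of overlap, and relative alignment, the copy is done
 * with rep movsb (ERMS/FSRM), rep movsq, a 32-bytes-per-iteration register
 * loop, or a handful of straight-line moves for sizes below 32 bytes.
 */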
SYM_FUNC_START(__memmove)

    mov %rdi, %rax

    /* Decide forward/backward copy mode */
    cmp %rdi, %rsi
    jge .Lmemmove_begin_forward
    mov %rsi, %r8
    add %rdx, %r8
    cmp %rdi, %r8
    jg 2f
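    /*
     * Roughly, in C-like pseudocode (copy_forward/copy_backward are just
     * names for the two paths below):
     *
     *    if (src >= dst || src + count <= dst)
     *        copy_forward();     // no destructive overlap
     *    else
     *        copy_backward();    // dst lies inside [src, src + count)
     */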
    /* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
    ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
    ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS
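    /*
     * The ALTERNATIVEs are patched at boot:
     *  - FSRM (Fast Short REP MOVSB): the 32-byte length check is patched
     *    out, since rep movsb is fast even for tiny copies.
     *  - ERMS (Enhanced REP MOVSB): jump straight to the rep movsb path.
     * Without either feature, fall through to the movsq/register code.
     */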
    /*
     * movsq has a high startup latency, so handle small sizes with
     * general-purpose registers instead.
     */
    cmp $680, %rdx
    jb 3f
    /*
     * movsq is only a win when source and destination share the same
     * alignment.
     */

    cmpb %dil, %sil
    je 4f
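    /*
     * 680 looks like an empirically chosen cut-over point: above it, and
     * only when %dil == %sil (the pointers have the same low 8 bits, hence
     * the same alignment), rep movsq is used; otherwise the 32-byte
     * register loop below does the work.
     */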
3:
    sub $0x20, %rdx
    /*
     * We gobble 32 bytes forward in each loop.
     */
5:
    sub $0x20, %rdx
    movq 0*8(%rsi), %r11
    movq 1*8(%rsi), %r10
    movq 2*8(%rsi), %r9
    movq 3*8(%rsi), %r8
    leaq 4*8(%rsi), %rsi

    movq %r11, 0*8(%rdi)
    movq %r10, 1*8(%rdi)
    movq %r9, 2*8(%rdi)
    movq %r8, 3*8(%rdi)
    leaq 4*8(%rdi), %rdi
    jae 5b
    addq $0x20, %rdx
    jmp 1f
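    /*
     * Loop bookkeeping for the block above: %rdx is biased down by 0x20
     * before entering the loop, so the "sub; jae" pair keeps iterating as
     * long as another full 32 bytes remain. Once the subtraction borrows,
     * the addq restores the true residual count (0..31 bytes), which is
     * finished off at label 1 below.
     */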
    /*
     * Copy forward with rep movsq.
     */
    .p2align 4
4:
    movq %rdx, %rcx
    movq -8(%rsi, %rdx), %r11
    lea -8(%rdi, %rdx), %r10
    shrq $3, %rcx
    rep movsq
    movq %r11, (%r10)
    jmp 13f
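    /*
     * Note on the block above: rep movsq copies count & ~7 bytes. The last
     * (possibly misaligned) qword of the source is loaded into %r11 before
     * the string move, because on an overlapping forward copy the stores
     * may already have overwritten it. Storing %r11 afterwards covers the
     * 0..7 remaining tail bytes (and harmlessly rewrites the last qword
     * when count is a multiple of 8).
     */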
.Lmemmove_end_forward:

    /*
     * Copy backward with rep movsq.
     */
    .p2align 4
7:
    movq %rdx, %rcx
    movq (%rsi), %r11
    movq %rdi, %r10
    leaq -8(%rsi, %rdx), %rsi
    leaq -8(%rdi, %rdx), %rdi
    shrq $3, %rcx
    std
    rep movsq
    cld
    movq %r11, (%r10)
    jmp 13f
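    /*
     * Mirror of the forward movsq path: the direction flag (std/cld) makes
     * rep movsq walk downwards from the last qword of each buffer. The
     * first qword of the source is saved in %r11 up front, since a
     * backward overlapping copy can clobber it, and is stored last to
     * cover the 0..7 residual bytes at the head.
     */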
    /*
     * Prepare for the backward copy.
     */
    .p2align 4
2:
    cmp $0x20, %rdx
    jb 1f
    cmp $680, %rdx
    jb 6f
    cmp %dil, %sil
    je 7b
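    /*
     * Same dispatch as the forward path: fewer than 32 bytes goes to the
     * small-copy code at label 1, at least 680 bytes with matching
     * alignment uses the backward rep movsq above (7b), and everything
     * else falls through to the backward register loop.
     */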
6:
    /*
     * Point both source and destination at the tail of the buffers.
     */
    addq %rdx, %rsi
    addq %rdx, %rdi
    subq $0x20, %rdx
    /*
     * We gobble 32 bytes backward in each loop.
     */
8:
    subq $0x20, %rdx
    movq -1*8(%rsi), %r11
    movq -2*8(%rsi), %r10
    movq -3*8(%rsi), %r9
    movq -4*8(%rsi), %r8
    leaq -4*8(%rsi), %rsi

    movq %r11, -1*8(%rdi)
    movq %r10, -2*8(%rdi)
    movq %r9, -3*8(%rdi)
    movq %r8, -4*8(%rdi)
    leaq -4*8(%rdi), %rdi
    jae 8b
    /*
     * Step the pointers back to the head of the remaining bytes.
     */
    addq $0x20, %rdx
    subq %rdx, %rsi
    subq %rdx, %rdi
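    /*
     * The backward loop uses the same biased-counter trick as the forward
     * one: copy 32 bytes per iteration from the end, stop once the counter
     * borrows, then restore %rdx to the residual 0..31 bytes and move
     * %rsi/%rdi back to the start of that head region so the common
     * small-copy code at label 1 can finish it.
     */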
1:
    cmpq $16, %rdx
    jb 9f
    /*
     * Copy 16 to 31 bytes.
     */
    movq 0*8(%rsi), %r11
    movq 1*8(%rsi), %r10
    movq -2*8(%rsi, %rdx), %r9
    movq -1*8(%rsi, %rdx), %r8
    movq %r11, 0*8(%rdi)
    movq %r10, 1*8(%rdi)
    movq %r9, -2*8(%rdi, %rdx)
    movq %r8, -1*8(%rdi, %rdx)
    jmp 13f
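    /*
     * Trick used here and in the cases below: load the first and the last
     * chunk of the region into registers before doing any store. The two
     * chunks may overlap in the middle, but because every load happens
     * before every store, the result is correct even when src and dst
     * overlap. The same pattern repeats for 8..15, 4..7 and 2..3 bytes.
     */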
    .p2align 4
9:
    cmpq $8, %rdx
    jb 10f
    /*
     * Copy 8 to 15 bytes.
     */
    movq 0*8(%rsi), %r11
    movq -1*8(%rsi, %rdx), %r10
    movq %r11, 0*8(%rdi)
    movq %r10, -1*8(%rdi, %rdx)
    jmp 13f
10:
    cmpq $4, %rdx
    jb 11f
    /*
     * Copy 4 to 7 bytes.
     */
    movl (%rsi), %r11d
    movl -4(%rsi, %rdx), %r10d
    movl %r11d, (%rdi)
    movl %r10d, -4(%rdi, %rdx)
    jmp 13f
11:
    cmp $2, %rdx
    jb 12f
    /*
     * Copy 2 to 3 bytes.
     */
    movw (%rsi), %r11w
    movw -2(%rsi, %rdx), %r10w
    movw %r11w, (%rdi)
    movw %r10w, -2(%rdi, %rdx)
    jmp 13f
12:
    cmp $1, %rdx
    jb 13f
    /*
     * Copy the final byte.
     */
    movb (%rsi), %r11b
    movb %r11b, (%rdi)
13:
    RET

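    /*
     * ERMS path: on CPUs with Enhanced REP MOVSB (and, with FSRM, even for
     * short lengths) a plain forward rep movsb is the fastest option. It
     * is only reached when the dispatch above has established that a
     * forward byte-by-byte copy is safe.
     */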
.Lmemmove_erms:
    movq %rdx, %rcx
    rep movsb
    RET
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)

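/*
 * memmove is declared as a weak alias of __memmove, so a stronger
 * definition (e.g. an instrumented one provided by a sanitizer) can
 * override the exported memmove while __memmove remains callable.
 */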
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)