/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * that have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed into a jmp to memcpy_erms, which does the copy with a
 * single REP; MOVSB.
 */

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
SYM_FUNC_START(__memcpy)
    ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                  "jmp memcpy_erms", X86_FEATURE_ERMS

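    /*
     * Fast-string (REP_GOOD) path: copy count/8 qwords with REP MOVSQ,
     * then the remaining count%8 bytes with REP MOVSB. %rax is loaded
     * first so the original destination is returned to the caller.
     */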
    movq %rdi, %rax
    movq %rdx, %rcx
    shrq $3, %rcx
    andl $7, %edx
    rep movsq
    movl %edx, %ecx
    rep movsb
    RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

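/*
 * memcpy is only a weak alias of __memcpy so that another definition
 * (for example an instrumented C version) can take precedence while
 * __memcpy stays available.
 */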
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
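    /*
     * With ERMS, a single REP MOVSB copies all %rdx bytes; no
     * qword/byte split is needed. %rax again returns the original
     * destination.
     */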
    movq %rdi, %rax
    movq %rdx, %rcx
    rep movsb
    RET
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START_LOCAL(memcpy_orig)
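    /*
     * Open-coded fallback for CPUs without fast REP string operations.
     * Copies shorter than 32 bytes go straight to the tail handler.
     */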
    movq %rdi, %rax

    cmpq $0x20, %rdx
    jb .Lhandle_tail

    /*
     * Check whether a memory false dependence between source and
     * destination could occur, and pick the copy direction accordingly.
     */
    cmp  %dil, %sil
    jl .Lcopy_backward
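    /*
     * Bias the count by 32 up front: the SUB at the top of the loop
     * then sets CF exactly when fewer than 32 bytes will remain after
     * the block below is copied, and the MOVs/LEAs in between do not
     * modify the flags, so the JAE at the bottom still tests that SUB.
     */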
    subq $0x20, %rdx
.Lcopy_forward_loop:
    subq $0x20, %rdx

    /*
     * Move in blocks of 4x8 bytes:
     */
    movq 0*8(%rsi), %r8
    movq 1*8(%rsi), %r9
    movq 2*8(%rsi), %r10
    movq 3*8(%rsi), %r11
    leaq 4*8(%rsi), %rsi

    movq %r8,   0*8(%rdi)
    movq %r9,   1*8(%rdi)
    movq %r10,  2*8(%rdi)
    movq %r11,  3*8(%rdi)
    leaq 4*8(%rdi), %rdi
    jae  .Lcopy_forward_loop
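    /* Undo the bias: %edx is again the true tail count (0..31). */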
    addl $0x20, %edx
    jmp  .Lhandle_tail

.Lcopy_backward:
    /*
     * Copy backward: point %rsi/%rdi at the end of the buffers and
     * bias the count by 32, mirroring the forward loop.
     */
    addq %rdx,  %rsi
    addq %rdx,  %rdi
    subq $0x20, %rdx
    /*
     * At most 3 ALU operations in one cycle, so append NOPs to keep
     * the loop within the same 16-byte chunk.
     */
    .p2align 4
.Lcopy_backward_loop:
    subq $0x20, %rdx
    movq -1*8(%rsi),    %r8
    movq -2*8(%rsi),    %r9
    movq -3*8(%rsi),    %r10
    movq -4*8(%rsi),    %r11
    leaq -4*8(%rsi),    %rsi
    movq %r8,       -1*8(%rdi)
    movq %r9,       -2*8(%rdi)
    movq %r10,      -3*8(%rdi)
    movq %r11,      -4*8(%rdi)
    leaq -4*8(%rdi),    %rdi
    jae  .Lcopy_backward_loop

    /*
     * Undo the bias and move the pointers back to the start of the
     * buffers; the %edx bytes that remain to be copied are the
     * leading ones.
     */
    addl $0x20, %edx
    subq %rdx,  %rsi
    subq %rdx,  %rdi
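    /*
     * Each tail case below uses the same overlapping-window trick:
     * load from both ends of the remaining range first, then store
     * both ends. The two windows may overlap, so every length in the
     * bracket is covered without further branching. Rough C sketch of
     * the 8..15 byte case (illustrative only, alignment glossed over;
     * char *src, char *dst, 8 <= len <= 15):
     *
     *    u64 head = *(u64 *)src;
     *    u64 tail = *(u64 *)(src + len - 8);
     *    *(u64 *)dst             = head;
     *    *(u64 *)(dst + len - 8) = tail;
     */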
.Lhandle_tail:
    cmpl $16,   %edx
    jb   .Lless_16bytes

    /*
     * Move 16 to 31 bytes of data: the first and the last 16 bytes of
     * the range, which may overlap.
     */
    movq 0*8(%rsi), %r8
    movq 1*8(%rsi), %r9
    movq -2*8(%rsi, %rdx),  %r10
    movq -1*8(%rsi, %rdx),  %r11
    movq %r8,   0*8(%rdi)
    movq %r9,   1*8(%rdi)
    movq %r10,  -2*8(%rdi, %rdx)
    movq %r11,  -1*8(%rdi, %rdx)
    RET
    .p2align 4
.Lless_16bytes:
    cmpl $8,    %edx
    jb   .Lless_8bytes
    /*
     * Move 8 to 15 bytes of data: the first and the last 8 bytes,
     * which may overlap.
     */
    movq 0*8(%rsi), %r8
    movq -1*8(%rsi, %rdx),  %r9
    movq %r8,   0*8(%rdi)
    movq %r9,   -1*8(%rdi, %rdx)
    RET
    .p2align 4
.Lless_8bytes:
    cmpl $4,    %edx
    jb   .Lless_3bytes

    /*
     * Move 4 to 7 bytes of data: the first and the last 4 bytes,
     * which may overlap.
     */
    movl (%rsi), %ecx
    movl -4(%rsi, %rdx), %r8d
    movl %ecx, (%rdi)
    movl %r8d, -4(%rdi, %rdx)
    RET
    .p2align 4
.Lless_3bytes:
    subl $1, %edx
    jb .Lend
    /*
     * Move 1 to 3 bytes of data.
     */
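    /*
     * The SUB above already set the flags and left %edx holding the
     * offset of the last byte: CF meant a zero count (handled by the
     * JB), ZF means the count was exactly one. MOVZBL does not modify
     * the flags, so the JZ below still tests that SUB. For 2 or 3
     * bytes, the first, the second and the last byte are stored; for
     * a count of 2 the last byte is simply the second one again.
     */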
    movzbl (%rsi), %ecx
    jz .Lstore_1byte
    movzbq 1(%rsi), %r8
    movzbq (%rsi, %rdx), %r9
    movb %r8b, 1(%rdi)
    movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
    movb %cl, (%rdi)

.Lend:
    RET
SYM_FUNC_END(memcpy_orig)

.popsection