/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * fast string instructions to get better performance than the original
 * function. The code is simpler and shorter than the original function
 * as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
SYM_FUNC_START(__memset)
    /*
     * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature;
     * use it when available. Failing that, if fast string operations
     * are usable (REP_GOOD), fall through to the rep stosq path below.
     *
     * Otherwise, jump to the original open-coded memset function.
     */
    ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
              "jmp memset_erms", X86_FEATURE_ERMS
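    /*
     * Conceptually the ALTERNATIVE_2 selects one of three
     * implementations. A rough C sketch of the decision (illustrative
     * only; cpu_has() is a stand-in, and the real selection is done by
     * patching the jmp at boot rather than branching at runtime):
     *
     *	if (cpu_has(X86_FEATURE_ERMS))
     *		goto memset_erms;	// single rep stosb
     *	else if (cpu_has(X86_FEATURE_REP_GOOD))
     *		;			// fall through: rep stosq below
     *	else
     *		goto memset_orig;	// open-coded loops
     */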

    movq %rdi,%r9
    movq %rdx,%rcx
    andl $7,%edx
    shrq $3,%rcx
    /* expand byte value  */
    movzbl %sil,%esi
    movabs $0x0101010101010101,%rax
    imulq %rsi,%rax
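    /*
     * Worked example of the expansion above: for value 0xAB,
     * 0xAB * 0x0101010101010101 = 0xABABABABABABABAB, i.e. the byte
     * replicated into all eight lanes of %rax.
     */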
    rep stosq
    movl %edx,%ecx
    rep stosb
    movq %r9,%rax
    RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
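
/*
 * A minimal C sketch of the fast-string path above (the function name
 * is illustrative, not kernel code; assumes <stdint.h> types). The
 * first loop models "rep stosq" (count / 8 qword stores), the second
 * models the trailing "rep stosb" (count % 8 byte stores):
 *
 *	void *memset_fast_string(void *dst, int c, size_t n)
 *	{
 *		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
 *		uint64_t *q = dst;
 *		uint8_t *b;
 *		size_t i;
 *
 *		for (i = 0; i < n / 8; i++)	// rep stosq
 *			*q++ = pattern;
 *		b = (uint8_t *)q;
 *		for (i = 0; i < (n & 7); i++)	// rep stosb
 *			*b++ = (uint8_t)c;
 *		return dst;	// original destination, as in %rax
 *	}
 */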

SYM_FUNC_ALIAS_WEAK(memset, __memset)
EXPORT_SYMBOL(memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
SYM_FUNC_START_LOCAL(memset_erms)
    movq %rdi,%r9
    movb %sil,%al
    movq %rdx,%rcx
    rep stosb
    movq %r9,%rax
    RET
SYM_FUNC_END(memset_erms)
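
/*
 * Rough C equivalent of memset_erms (illustrative only): with ERMS the
 * CPU makes a single "rep stosb" fast at all sizes, so no qword
 * expansion or tail handling is needed. In C terms it degenerates to a
 * plain byte loop:
 *
 *	void *memset_erms_sketch(void *dst, int c, size_t n)
 *	{
 *		uint8_t *p = dst;
 *
 *		while (n--)
 *			*p++ = (uint8_t)c;
 *		return dst;	// original destination, as in %rax
 *	}
 */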

SYM_FUNC_START_LOCAL(memset_orig)
    movq %rdi,%r10

    /* expand byte value  */
    movzbl %sil,%ecx
    movabs $0x0101010101010101,%rax
    imulq  %rcx,%rax

    /* align dst */
    movl  %edi,%r9d
    andl  $7,%r9d
    jnz  .Lbad_alignment
.Lafter_bad_alignment:

    movq  %rdx,%rcx
    shrq  $6,%rcx
    jz   .Lhandle_tail

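    /* main loop: eight qword stores per iteration, 64 bytes at a time */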
    .p2align 4
.Lloop_64:
    decq  %rcx
    movq  %rax,(%rdi)
    movq  %rax,8(%rdi)
    movq  %rax,16(%rdi)
    movq  %rax,24(%rdi)
    movq  %rax,32(%rdi)
    movq  %rax,40(%rdi)
    movq  %rax,48(%rdi)
    movq  %rax,56(%rdi)
    leaq  64(%rdi),%rdi
    jnz    .Lloop_64

    /*
     * Handle the tail in simple loops; these should be faster than
     * hard-to-predict jump tables.
     */
    .p2align 4
.Lhandle_tail:
    movl    %edx,%ecx
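    /* keep count mod 64, rounded down to whole qwords (63 & ~7 == 56) */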
    andl    $63&(~7),%ecx
    jz      .Lhandle_7
    shrl    $3,%ecx
    .p2align 4
.Lloop_8:
    decl   %ecx
    movq  %rax,(%rdi)
    leaq  8(%rdi),%rdi
    jnz    .Lloop_8

.Lhandle_7:
    andl    $7,%edx
    jz      .Lende
    .p2align 4
.Lloop_1:
    decl    %edx
    movb    %al,(%rdi)
    leaq    1(%rdi),%rdi
    jnz     .Lloop_1

.Lende:
    movq    %r10,%rax
    RET

.Lbad_alignment:
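    /*
     * dst is not 8-byte aligned. If at most 7 bytes remain, the byte
     * loop above handles everything. Otherwise write one unaligned
     * qword, advance dst by 8 - (dst & 7) to the next 8-byte boundary,
     * shrink the count to match, and redo the 64-byte loop setup.
     */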
    cmpq $7,%rdx
    jbe .Lhandle_7
    movq %rax,(%rdi)    /* unaligned store */
    movq $8,%r8
    subq %r9,%r8
    addq %r8,%rdi
    subq %r8,%rdx
    jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)
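
/*
 * A C sketch of memset_orig's overall strategy (illustrative only;
 * assumes <stdint.h> and <string.h>): align the destination with one
 * unaligned qword store, run the unrolled 64-byte loop, then finish
 * with qword and byte tail loops.
 *
 *	void *memset_orig_sketch(void *dst, int c, size_t n)
 *	{
 *		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
 *		uint8_t *p = dst;
 *		size_t head = (uintptr_t)p & 7;
 *
 *		if (head && n > 7) {		// .Lbad_alignment
 *			memcpy(p, &pattern, 8);	// unaligned qword store
 *			p += 8 - head;
 *			n -= 8 - head;
 *		}
 *		while (n >= 64) {		// .Lloop_64 (8x unrolled in asm)
 *			for (int i = 0; i < 8; i++)
 *				memcpy(p + 8 * i, &pattern, 8);
 *			p += 64;
 *			n -= 64;
 *		}
 *		while (n >= 8) {		// .Lloop_8
 *			memcpy(p, &pattern, 8);
 *			p += 8;
 *			n -= 8;
 *		}
 *		while (n--)			// .Lloop_1
 *			*p++ = (uint8_t)c;
 *		return dst;	// original destination, as in %rax
 *	}
 */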