/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

    .file   "M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *  void *
 *  memcpy(void *s, const void *s0, size_t n)
 *  {
 *      if (n != 0) {
 *          char *s1 = s;
 *          const char *s2 = s0;
 *          do {
 *              *s1++ = *s2++;
 *          } while (--n != 0);
 *      }
 *      return (s);
 *  }
 *
 *
 * SPARC T7/M7 Flow:
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *     copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if (src is on long word boundary) {
 *     if (count < MED_MAX) {
 * finish_long:                    src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                  src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)  x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)   x
#endif

#ifndef EX_ST
#define EX_ST(x,y)  x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)   x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)    type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)    type src, [addr]
#endif

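/*
 * Illustrative expansion: with the default definitions above, a
 * guarded load such as
 *
 *  EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
 *
 * reduces to the bare instruction
 *
 *  ldx [%o1], %o4
 *
 * User-copy builds are expected to define EX_LD/EX_ST so that the
 * access is instead wrapped in an exception-table entry whose fixup
 * routine (memcpy_retl_o2 here) reports how many bytes remained
 * uncopied when a fault occurred.
 */
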
/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used", which means that if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S, which
 * marks the cache line as "most recently used", for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI   ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI   0x80        /* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI   ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI   0x80        /* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)    stxa src, [addr] STORE_MRU_ASI
#endif
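
/*
 * Sketch of the per-cache-line store pattern these macros produce in
 * the BIS loop below (one 64-byte line holds eight 8-byte words
 * w0..w7):
 *
 *  STORE_INIT_MRU(w0) ... STORE_INIT_MRU(w6)   ! line stays MRU
 *  STORE_INIT(w7)                              ! final word: mark LRU
 *
 * so a line remains cache-resident while it is being filled and is
 * demoted only once its last word has been written.
 */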

#ifndef FUNC_NAME
#define FUNC_NAME   M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE  64
#define SHORTCOPY   3
#define SHORTCHECK  14
#define SHORT_LONG  64  /* max copy for short longword-aligned case */
                /* must be at least 64 */
#define SMALL_MAX   128
#define MED_UMAX    1024    /* max copy for medium un-aligned case */
#define MED_WMAX    1024    /* max copy for medium word-aligned case */
#define MED_MAX     1024    /* max copy for medium longword-aligned case */
#define ST_CHUNK    24  /* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE   24  /* distance for aligned prefetch loop */

    .register   %g2,#scratch

    .section    ".text"
    .global     FUNC_NAME
    .type       FUNC_NAME, #function
    .align      16
FUNC_NAME:
    srlx            %o2, 31, %g2    ! any of bits 63..31 set means a
    cmp             %g2, 0          ! length of 2 GB or more, which is
    tne             %xcc, 5         ! treated as a bug: trap
    PREAMBLE
    mov     %o0, %g1    ! save %o0
    brz,pn          %o2, .Lsmallx
     cmp            %o2, 3
    ble,pn          %icc, .Ltiny_cp
     cmp            %o2, 19
    ble,pn          %icc, .Lsmall_cp
     or             %o0, %o1, %g2   ! %g2 = src | dst, for alignment checks
    cmp             %o2, SMALL_MAX
    bl,pn           %icc, .Lmedium_cp
     nop

.Lmedium:
    neg %o0, %o5
    andcc   %o5, 7, %o5     ! bytes till DST 8 byte aligned
    brz,pt  %o5, .Ldst_aligned_on_8

    ! %o5 has the bytes to be written in partial store.
     sub    %o2, %o5, %o2
    sub %o1, %o0, %o1       ! %o1 gets the difference
7:                  ! dst aligning loop
    add %o1, %o0, %o4
    EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte
    subcc   %o5, 1, %o5
    EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
    bgu,pt  %xcc, 7b
     add    %o0, 1, %o0     ! advance dst
    add %o1, %o0, %o1       ! restore %o1
.Ldst_aligned_on_8:
    andcc   %o1, 7, %o5
    brnz,pt %o5, .Lsrc_dst_unaligned_on_8
     nop

.Lsrc_dst_aligned_on_8:
    ! check if we are copying MED_MAX or more bytes
    set MED_MAX, %o3
    cmp %o2, %o3            ! limit to store buffer size
    bgu,pn  %xcc, .Llarge_align8_copy
     nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
    subcc   %o2, 63, %o2        ! adjust length to allow cc test
    ble,pn  %xcc, .Lmedl63      ! skip big loop if less than 64 bytes
     nop
.Lmedl64:
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)  ! load
    subcc   %o2, 64, %o2        ! decrement length count
    EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)  ! and store
    EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64
    EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
    EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
    EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
    EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
    EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
    EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
    EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
    EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
    add %o1, 64, %o1        ! increase src ptr by 64
    EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
    EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
    add %o0, 64, %o0        ! increase dst ptr by 64
    EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
    EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
    bgu,pt  %xcc, .Lmedl64      ! repeat if at least 64 bytes left
     EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
    addcc   %o2, 32, %o2        ! adjust remaining count
    ble,pt  %xcc, .Lmedl31      ! to skip if 31 or fewer bytes left
     nop
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)  ! load
    sub %o2, 32, %o2        ! decrement length count
    EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)  ! and store
    EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32
    add %o1, 32, %o1        ! increase src ptr by 32
    EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
    EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
    add %o0, 32, %o0        ! increase dst ptr by 32
    EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
    EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
    EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
    addcc   %o2, 16, %o2        ! adjust remaining count
    ble,pt  %xcc, .Lmedl15      ! skip if 15 or fewer bytes left
     nop
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
    add %o1, 16, %o1        ! increase src ptr by 16
    EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
    sub %o2, 16, %o2        ! decrease count by 16
    EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
    add %o0, 16, %o0        ! increase dst ptr by 16
    EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
    addcc   %o2, 15, %o2        ! restore count
    bz,pt   %xcc, .Lsmallx      ! exit if finished
     cmp    %o2, 8
    blt,pt  %xcc, .Lmedw7       ! skip if 7 or fewer bytes left
     tst    %o2
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)  ! load 8 bytes
    add %o1, 8, %o1     ! increase src ptr by 8
    add %o0, 8, %o0     ! increase dst ptr by 8
    subcc   %o2, 8, %o2     ! decrease count by 8
    bnz,pn  %xcc, .Lmedw7
     EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)   ! and store 8
    retl
     mov    EX_RETVAL(%g1), %o0 ! restore %o0

    .align 16
.Lsrc_dst_unaligned_on_8:
    ! DST is 8-byte aligned, src is not
2:
    andcc   %o1, 0x3, %o5       ! test word alignment
    bnz,pt  %xcc, .Lunalignsetup    ! branch to skip if not word aligned
     nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large-data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
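/*
 * The .Lmedw32 loop below pairs two 32-bit loads into a single 64-bit
 * store. A C-level sketch of one inner step (illustrative only; SPARC
 * is big-endian, so the first word becomes the high half):
 *
 *  uint64_t hi = *(uint32_t *)(src);       // ld   [%o1], %o4
 *  uint64_t lo = *(uint32_t *)(src + 4);   // ld   [%o1+4], %o4
 *  *(uint64_t *)(dst) = (hi << 32) | lo;   // sllx, or, stx
 *
 * halving the number of stores relative to a plain word copy.
 */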
    set MED_WMAX, %o3
    cmp %o2, %o3            ! limit to store buffer size
    bge,pt  %xcc, .Lunalignrejoin   ! otherwise rejoin main loop
     nop

    subcc   %o2, 31, %o2        ! adjust length to allow cc test
                    ! for end of loop
    ble,pt  %xcc, .Lmedw31      ! skip big loop if less than 32 bytes
.Lmedw32:
    EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
    sllx    %o4, 32, %o5
    EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
    or  %o4, %o5, %o5
    EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
    subcc   %o2, 32, %o2        ! decrement length count
    EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
    sllx    %o4, 32, %o5
    EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
    or  %o4, %o5, %o5
    EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
    add %o1, 32, %o1        ! increase src ptr by 32
    EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
    sllx    %o4, 32, %o5
    EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
    or  %o4, %o5, %o5
    EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
    add %o0, 32, %o0        ! increase dst ptr by 32
    EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
    sllx    %o4, 32, %o5
    EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
    or  %o4, %o5, %o5
    bgu,pt  %xcc, .Lmedw32      ! repeat if at least 32 bytes left
     EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
    addcc   %o2, 31, %o2        ! restore count

    bz,pt   %xcc, .Lsmallx  ! exit if finished
     nop
    cmp %o2, 16
    blt,pt  %xcc, .Lmedw15
     nop
    EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
    sllx    %o4, 32, %o5
    subcc   %o2, 16, %o2        ! decrement length count
    EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
    or  %o4, %o5, %o5
    EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
    add %o1, 16, %o1        ! increase src ptr by 16
    EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
    add %o0, 16, %o0        ! increase dst ptr by 16
    sllx    %o4, 32, %o5
    EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
    or  %o4, %o5, %o5
    EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
    bz,pt   %xcc, .Lsmallx  ! exit if finished
     cmp    %o2, 8
    blt,pn  %xcc, .Lmedw7       ! skip if 7 or fewer bytes left
     tst    %o2
    EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)   ! load 4 bytes
    subcc   %o2, 8, %o2     ! decrease count by 8
    EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
    add %o1, 8, %o1     ! increase src ptr by 8
    EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)  ! load 4 bytes
    add %o0, 8, %o0     ! increase dst ptr by 8
    EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
    bz,pt   %xcc, .Lsmallx  ! exit if finished
.Lmedw7:                ! count is ge 1, less than 8
    cmp %o2, 4          ! check for 4 bytes left
    blt,pn  %xcc, .Lsmallleft3  ! skip if 3 or fewer bytes left
     nop
    EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)   ! load 4 bytes
    add %o1, 4, %o1     ! increase src ptr by 4
    add %o0, 4, %o0     ! increase dst ptr by 4
    subcc   %o2, 4, %o2     ! decrease count by 4
    bnz .Lsmallleft3
     EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
    retl
     mov    EX_RETVAL(%g1), %o0

    .align 16
.Llarge_align8_copy:            ! Src and dst share 8 byte alignment
    ! align dst to 64 byte boundary
    andcc   %o0, 0x3f, %o3      ! %o3 == 0 means dst is 64 byte aligned
    brz,pn  %o3, .Laligned_to_64
     andcc  %o0, 8, %o3     ! odd long words to move?
    brz,pt  %o3, .Laligned_to_16
     nop
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
    sub %o2, 8, %o2
    add %o1, 8, %o1     ! increment src ptr
    add %o0, 8, %o0     ! increment dst ptr
    EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
    andcc   %o0, 16, %o3        ! pair of long words to move?
    brz,pt  %o3, .Laligned_to_32
     nop
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
    sub %o2, 16, %o2
    EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
    add %o1, 16, %o1        ! increment src ptr
    EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
    add %o0, 16, %o0        ! increment dst ptr
    EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
    andcc   %o0, 32, %o3        ! four long words to move?
    brz,pt  %o3, .Laligned_to_64
     nop
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
    sub %o2, 32, %o2
    EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
    EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
    EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
    EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
    EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
    add %o1, 32, %o1        ! increment src ptr
    EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
    add %o0, 32, %o0        ! increment dst ptr
    EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!   Use block init store (BIS) instructions to avoid fetching cache
!   lines from memory. Use ST_CHUNK stores to the first element of each
!   cache line (similar to prefetching) to avoid overfilling the STQ or
!   miss buffers. This gives existing cache lines time to be moved out
!   of L1/L2/L3 cache. Initial stores use the MRU version of BIS to keep
!   the cache line in cache until we are ready to store the final
!   element of the cache line. The last element is then stored using the
!   LRU version of BIS.
!
    andn    %o2, 0x3f, %o5      ! %o5 is multiple of block size
    and %o2, 0x3f, %o2      ! residue bytes in %o2
!
!   We use STORE_MRU_ASI for the first seven stores to each cache line,
!   followed by STORE_ASI (mark as LRU) for the last store. That
!   mixed approach reduces the probability that the cache line is removed
!   before we finish setting it, while minimizing the effects on
!   other cached values during a large memcpy.
!
!   ST_CHUNK batches up the initial BIS operations for several cache
!   lines so that multiple requests are not blocked by an overflowing
!   store miss buffer. Then the matching stores for all those
!   BIS operations are executed.
!

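!
!   Two-phase shape of the code below (illustrative C-level sketch;
!   store_init_mru/store_init stand for the STORE_INIT_MRU/STORE_INIT
!   macros, and `line' is a 64-byte destination cache line):
!
!   for (i = 0; i < ST_CHUNK; i++)          /* .Lalign_loop_start */
!       store_init_mru(&line[i].w[0]);      /* open each line early */
!   for (i = 0; i < ST_CHUNK; i++) {        /* .Lalign_loop_rest */
!       store_init_mru(&line[i].w[1..6]);   /* fill words 1-6 */
!       store_init(&line[i].w[7]);          /* last word: mark LRU */
!   }
!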
    sub %o0, 8, %o0     ! adjust %o0 for ASI alignment
.Lalign_loop:
    cmp %o5, ST_CHUNK*64
    blu,pt  %xcc, .Lalign_loop_fin
     mov    ST_CHUNK, %o3
.Lalign_loop_start:
    prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
    subcc   %o3, 1, %o3
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
    add %o1, 64, %o1
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    bgu %xcc, .Lalign_loop_start
     add    %o0, 56, %o0

    mov ST_CHUNK, %o3
    sllx    %o3, 6, %o4     ! ST_CHUNK*64
    sub %o1, %o4, %o1       ! reset %o1
    sub %o0, %o4, %o0       ! reset %o0

.Lalign_loop_rest:
    EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
    add %o0, 16, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    subcc   %o3, 1, %o3
    EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
    add %o1, 64, %o1
    add %o0, 8, %o0
    EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
    add %o0, 8, %o0
    EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
    sub %o5, 64, %o5
    bgu %xcc, .Lalign_loop_rest
    ! mark cache line as LRU
     EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

    cmp %o5, ST_CHUNK*64
    bgu,pt  %xcc, .Lalign_loop_start
     mov    ST_CHUNK, %o3

    cmp %o5, 0
    beq .Lalign_done
     nop
.Lalign_loop_fin:
    EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
    EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
    EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
    EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
    subcc   %o5, 64, %o5
    EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
    EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
    EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
    EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
    EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
    EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
    EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
    add %o1, 64, %o1
    EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
    add %o0, 64, %o0
    EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
    bgu %xcc, .Lalign_loop_fin
     EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
    add %o0, 8, %o0     ! restore %o0 from ASI alignment
    membar  #StoreStore
    sub %o2, 63, %o2        ! adjust length to allow cc test
    ba  .Lmedl63        ! in .Lmedl63
     nop

    .align 16
    ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
    mov %g1, %o3    ! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
    VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
    VISEntryHalf
#endif
    mov %o3, %g1    ! restore %g1

    set MED_UMAX, %o3
    cmp %o2, %o3        ! check for medium unaligned limit
    bge,pt  %xcc, .Lunalign_large
     prefetch [%o1 + (4 * BLOCK_SIZE)], 20
    andn    %o2, 0x3f, %o5      ! %o5 is multiple of block size
    and %o2, 0x3f, %o2      ! residue bytes in %o2
    cmp %o2, 8          ! ensure we do not load beyond
    bgt .Lunalign_adjust    ! end of source buffer
     andn   %o1, 0x7, %o4       ! %o4 has long word aligned src address
    add %o2, 64, %o2        ! adjust to leave loop
    sub %o5, 64, %o5        ! early if necessary
.Lunalign_adjust:
    alignaddr %o1, %g0, %g0     ! generate %gsr
    add %o1, %o5, %o1       ! advance %o1 to after blocks
    EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
    EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
    faligndata %f0, %f2, %f16
    EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
    subcc   %o5, BLOCK_SIZE, %o5
    EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
    faligndata %f2, %f4, %f18
    EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
    EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
    faligndata %f4, %f6, %f20
    EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
    EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
    faligndata %f6, %f8, %f22
    EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
    EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
    faligndata %f8, %f10, %f24
    EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
    EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
    faligndata %f10, %f12, %f26
    EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
    add %o4, BLOCK_SIZE, %o4
    EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
    faligndata %f12, %f14, %f28
    EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
    EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
    faligndata %f14, %f0, %f30
    EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
    add %o0, BLOCK_SIZE, %o0
    bgu,pt  %xcc, .Lunalign_loop
     prefetch [%o4 + (5 * BLOCK_SIZE)], 20
    ba  .Lunalign_done
     nop

.Lunalign_large:
    andcc   %o0, 0x3f, %o3      ! is dst 64-byte block aligned?
    bz  %xcc, .Lunalignsrc
     sub    %o3, 64, %o3        ! %o3 will be multiple of 8
    neg %o3         ! bytes until dest is 64 byte aligned
    sub %o2, %o3, %o2       ! update cnt with bytes to be moved
    ! Move bytes according to source alignment
    andcc   %o1, 0x1, %o5
    bnz %xcc, .Lunalignbyte ! check for byte alignment
     nop
    andcc   %o1, 2, %o5     ! check for half word alignment
    bnz %xcc, .Lunalignhalf
     nop
    ! Src is word aligned
.Lunalignword:
    EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)    ! load 4 bytes
    add %o1, 8, %o1     ! increase src ptr by 8
    EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)  ! and store 4
    subcc   %o3, 8, %o3     ! decrease count by 8
    EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
    add %o0, 8, %o0     ! increase dst ptr by 8
    bnz %xcc, .Lunalignword
     EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
    ba  .Lunalignsrc
     nop

    ! Src is half-word aligned
.Lunalignhalf:
    EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)  ! load 2 bytes
    sllx    %o4, 32, %o5        ! shift left
    EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
    or  %o4, %o5, %o5
    sllx    %o5, 16, %o5
    EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
    or  %o4, %o5, %o5
    EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
    add %o1, 8, %o1
    subcc   %o3, 8, %o3
    bnz %xcc, .Lunalignhalf
     add    %o0, 8, %o0
    ba  .Lunalignsrc
     nop

    ! Src is byte aligned
.Lunalignbyte:
    sub %o0, %o1, %o0       ! share pointer advance
.Lunalignbyte_loop:
    EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
    sllx    %o4, 56, %o5
    EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
    sllx    %o4, 40, %o4
    or  %o4, %o5, %o5
    EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
    sllx    %o4, 24, %o4
    or  %o4, %o5, %o5
    EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
    sllx    %o4,  8, %o4
    or  %o4, %o5, %o5
    EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
    or  %o4, %o5, %o5
    add %o0, %o1, %o0
    EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
    sub %o0, %o1, %o0
    subcc   %o3, 8, %o3
    bnz %xcc, .Lunalignbyte_loop
     add    %o1, 8, %o1
    add %o0, %o1, %o0       ! restore pointer

    ! Destination is now block (64 byte) aligned
.Lunalignsrc:
    andn    %o2, 0x3f, %o5      ! %o5 is multiple of block size
    and %o2, 0x3f, %o2      ! residue bytes in %o2
    add %o2, 64, %o2        ! ensure we do not load beyond
    sub %o5, 64, %o5        ! end of source buffer

    andn    %o1, 0x7, %o4       ! %o4 has long word aligned src address
    alignaddr %o1, %g0, %g0     ! generate %gsr
    add %o1, %o5, %o1       ! advance %o1 to after blocks

    EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
    add %o4, 8, %o4
.Lunalign_sloop:
    EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
    faligndata %f14, %f16, %f0
    EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
    faligndata %f16, %f18, %f2
    EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
    faligndata %f18, %f20, %f4
    EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
    subcc   %o5, 64, %o5
    EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
    faligndata %f20, %f22, %f6
    EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
    EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
    faligndata %f22, %f24, %f8
    EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
    EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
    faligndata %f24, %f26, %f10
    EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
    EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
    faligndata %f26, %f28, %f12
    EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
    add %o4, 64, %o4
    EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
    faligndata %f28, %f30, %f14
    EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
    EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
    add %o0, 64, %o0
    EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
    fsrc2   %f30, %f14
    bgu,pt  %xcc, .Lunalign_sloop
     prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
    ! Handle trailing bytes, 64 to 127
    ! Dest long word aligned, Src not long word aligned
    cmp %o2, 15
    bleu    %xcc, .Lunalign_short

     andn   %o2, 0x7, %o5       ! %o5 is multiple of 8
    and %o2, 0x7, %o2       ! residue bytes in %o2
    add %o2, 8, %o2
    sub %o5, 8, %o5     ! ensure we do not load past end of src
    andn    %o1, 0x7, %o4       ! %o4 has long word aligned src address
    add %o1, %o5, %o1       ! advance %o1 to after multiple of 8
    EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partial word
.Lunalign_by8:
    EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
    add %o4, 8, %o4
    faligndata %f0, %f2, %f16
    subcc   %o5, 8, %o5
    EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
    fsrc2   %f2, %f0
    bgu,pt  %xcc, .Lunalign_by8
     add    %o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
    VISExitHalfFast
#else
    VISExitHalf
#endif
    ba  .Lsmallrest
     nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switches) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
    or  %o0, %o1, %g2
#endif
.Lmedium_cp:
    LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
    andcc   %g2, 0x7, %g0
    bne,pn  %xcc, .Lmedium_unaligned_cp
     nop

.Lmedium_noprefetch_cp:
    andncc  %o2, 0x20 - 1, %o5
    be,pn   %xcc, 2f
     sub    %o2, %o5, %o2
1:  EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
    EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
    add %o1, 0x20, %o1
    subcc   %o5, 0x20, %o5
    EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
    EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
    EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
    EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
    bne,pt  %xcc, 1b
     add    %o0, 0x20, %o0
2:  andcc   %o2, 0x18, %o5
    be,pt   %xcc, 3f
     sub    %o2, %o5, %o2
1:  EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
    add %o1, 0x08, %o1
    add %o0, 0x08, %o0
    subcc   %o5, 0x08, %o5
    bne,pt  %xcc, 1b
     EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:  brz,pt  %o2, .Lexit_cp
     cmp    %o2, 0x04
    bl,pn   %xcc, .Ltiny_cp
     nop
    EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
    add %o1, 0x04, %o1
    add %o0, 0x04, %o0
    subcc   %o2, 0x04, %o2
    bne,pn  %xcc, .Ltiny_cp
     EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
    ba,a,pt %xcc, .Lexit_cp

.Lmedium_unaligned_cp:
    /* First get dest 8 byte aligned.  */
    sub %g0, %o0, %o3
    and %o3, 0x7, %o3
    brz,pt  %o3, 2f
     sub    %o2, %o3, %o2

1:  EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
    add %o1, 1, %o1
    subcc   %o3, 1, %o3
    add %o0, 1, %o0
    bne,pt  %xcc, 1b
     EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
    and %o1, 0x7, %o3
    brz,pn  %o3, .Lmedium_noprefetch_cp
     sll    %o3, 3, %o3
    mov 64, %g2
    sub %g2, %o3, %g2
    andn    %o1, 0x7, %o1
    EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
    sllx    %o4, %o3, %o4
    andn    %o2, 0x08 - 1, %o5
    sub %o2, %o5, %o2

1:  EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
    add %o1, 0x08, %o1
    subcc   %o5, 0x08, %o5
    srlx    %g3, %g2, %g7
    or  %g7, %o4, %g7
    EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
    add %o0, 0x08, %o0
    bne,pt  %xcc, 1b
     sllx   %g3, %o3, %o4
    srl %o3, 3, %o3
    add %o1, %o3, %o1
    brz,pn  %o2, .Lexit_cp
     nop
    ba,pt   %xcc, .Lsmall_unaligned_cp

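/*
 * The loop above is the classic shift-and-merge copy, sketched in C
 * below (illustrative only; the names are not from the source). shl is
 * (src & 7) * 8 and is nonzero on this path, so shr = 64 - shl is a
 * valid shift count; all loads use 8-byte-aligned addresses, so nothing
 * is read past the end of the source buffer:
 *
 *  uint64_t prev = *asrc++ << shl;     // sllx %o4, %o3, %o4
 *  while (n >= 8) {
 *      uint64_t cur = *asrc++;         // ldx  [%o1 + 0x08], %g3
 *      *dst++ = prev | (cur >> shr);   // srlx, or, stx
 *      prev = cur << shl;              // sllx %g3, %o3, %o4
 *      n -= 8;
 *  }
 */
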
.Ltiny_cp:
    EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
    subcc   %o2, 1, %o2
    be,pn   %xcc, .Lexit_cp
     EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
    EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
    subcc   %o2, 1, %o2
    be,pn   %xcc, .Lexit_cp
     EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
    EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
    ba,pt   %xcc, .Lexit_cp
     EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
    andcc   %g2, 0x3, %g0
    bne,pn  %xcc, .Lsmall_unaligned_cp
     andn   %o2, 0x4 - 1, %o5
    sub %o2, %o5, %o2
1:
    EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
    add %o1, 0x04, %o1
    subcc   %o5, 0x04, %o5
    add %o0, 0x04, %o0
    bne,pt  %xcc, 1b
     EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
    brz,pt  %o2, .Lexit_cp
     nop
    ba,a,pt %xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:  EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
    add %o1, 1, %o1
    add %o0, 1, %o0
    subcc   %o2, 1, %o2
    bne,pt  %xcc, 1b
     EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
    ba,a,pt %xcc, .Lexit_cp

.Lsmallrest:
    tst %o2
    bz,pt   %xcc, .Lsmallx
     cmp    %o2, 4
    blt,pn  %xcc, .Lsmallleft3
     nop
    sub %o2, 3, %o2
.Lsmallnotalign4:
    EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
    subcc   %o2, 4, %o2     ! reduce count by 4
    EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
    EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
    add %o1, 4, %o1     ! advance SRC by 4
    EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
    EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
    add %o0, 4, %o0     ! advance DST by 4
    EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
    EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
    bgu,pt  %xcc, .Lsmallnotalign4  ! loop until 3 or fewer bytes remain
     EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
    addcc   %o2, 3, %o2     ! restore count
    bz,pt   %xcc, .Lsmallx
.Lsmallleft3:               ! 1, 2, or 3 bytes remain
    subcc   %o2, 1, %o2
    EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)  ! load one byte
    bz,pt   %xcc, .Lsmallx
     EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1) ! store one byte
    EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)   ! load second byte
    subcc   %o2, 1, %o2
    bz,pt   %xcc, .Lsmallx
     EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
    EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)   ! load third byte
    EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)   ! store third byte
.Lsmallx:
    retl
     mov    EX_RETVAL(%g1), %o0
.Lsmallfin:
    tst %o2
    bnz,pn  %xcc, .Lsmallleft3
     nop
    retl
     mov    EX_RETVAL(%g1), %o0 ! restore %o0
.Lexit_cp:
    retl
     mov    EX_RETVAL(%g1), %o0
    .size  FUNC_NAME, .-FUNC_NAME