Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /* NGmemcpy.S: Niagara optimized memcpy.
0003  *
0004  * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
0005  */
0006 
0007 #ifdef __KERNEL__
0008 #include <linux/linkage.h>
0009 #include <asm/asi.h>
0010 #include <asm/thread_info.h>
/* In-kernel build: spare scratch register and the %asi value to put back
 * after the block-init store loops have rewritten %asi (user copies run
 * with ASI_AIUS).
 */
0011 #define GLOBAL_SPARE    %g7
0012 #define RESTORE_ASI(TMP)    \
0013     wr  %g0, ASI_AIUS, %asi
0014 #else
/* Userland build: different spare register; restore the default ASI_PNF. */
0015 #define GLOBAL_SPARE    %g5
0016 #define RESTORE_ASI(TMP)    \
0017     wr  %g0, ASI_PNF, %asi
0018 #endif
0019 
/* Size of the register-window save area passed to 'save': 128 bytes for
 * 64-bit v9 frames, 64 for 32-bit frames.
 */
0020 #ifdef __sparc_v9__
0021 #define SAVE_AMOUNT 128
0022 #else
0023 #define SAVE_AMOUNT 64
0024 #endif
0025 
/* ASI written into %asi for the STORE_INIT block-initializing stores. */
0026 #ifndef STORE_ASI
0027 #define STORE_ASI   ASI_BLK_INIT_QUAD_LDD_P
0028 #endif
0029 
/* EX_LD/EX_ST wrap a load/store 'x' with an exception-table fixup 'y'
 * (one of the NG_ret_* stubs).  The defaults below strip the fixup for
 * the plain, non-user-copy memcpy build.
 */
0030 #ifndef EX_LD
0031 #define EX_LD(x,y)  x
0032 #endif
0033 
0034 #ifndef EX_ST
0035 #define EX_ST(x,y)  x
0036 #endif
0037 
0038 #ifndef LOAD
0039 #ifndef MEMCPY_DEBUG
0040 #define LOAD(type,addr,dest)    type [addr], dest
0041 #else
/* Debug build: force loads through an explicit alternate-space ASI 0x80. */
0042 #define LOAD(type,addr,dest)    type##a [addr] 0x80, dest
0043 #endif
0044 #endif
0045 
/* 16-byte "twin" load: one ldda fills the register pair starting at
 * dest0 (dest1 is the implied odd register of the pair).
 */
0046 #ifndef LOAD_TWIN
0047 #define LOAD_TWIN(addr_reg,dest0,dest1) \
0048     ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
0049 #endif
0050 
0051 #ifndef STORE
0052 #define STORE(type,src,addr)    type src, [addr]
0053 #endif
0054 
/* Cache-line-initializing store through %asi on Niagara; falls back to a
 * plain stx when simulating on non-Niagara cpus.
 */
0055 #ifndef STORE_INIT
0056 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
0057 #define STORE_INIT(src,addr)    stxa src, [addr] %asi
0058 #else
0059 #define STORE_INIT(src,addr)    stx src, [addr + 0x00]
0060 #endif
0061 #endif
0062 
0063 #ifndef FUNC_NAME
0064 #define FUNC_NAME   NGmemcpy
0065 #endif
0066 
/* Hook for wrapper variants to insert code at function entry
 * (empty for the plain build).
 */
0067 #ifndef PREAMBLE
0068 #define PREAMBLE
0069 #endif
0070 
0071 #ifndef XCC
0072 #define XCC xcc
0073 #endif
0074 
0075     .register   %g2,#scratch
0076     .register   %g3,#scratch
0077 
0078     .text
/* The __restore_asi exit path and the NG_ret_* fault-fixup stubs below
 * are assembled only when no wrapper has provided its own EX_RETVAL.
 */
0079 #ifndef EX_RETVAL
0080 #define EX_RETVAL(x)    x
0081 __restore_asi:  ! common exit for the fixup stubs: put the default ASI
                 ! back, then return with %i0 (bytes not copied) -> caller %o0.
                 ! The wr must come BEFORE ret: only one delay slot follows
                 ! ret, and it must hold the window-restoring 'restore'.
0082     wr  %g0, ASI_AIUS, %asi
0083     ret
0084      restore
0085 ENTRY(NG_ret_i2_plus_i4_plus_1)
     add %i4, 1, %i4          ! align loop did subcc %i4,1 before faulting
0086     ba,pt   %xcc, __restore_asi
0087      add    %i2, %i4, %i0     ! not-copied = %i2 + %i4 + 1, per stub name
0088 ENDPROC(NG_ret_i2_plus_i4_plus_1)
/* Fault fixups: each stub computes the number of bytes NOT copied, as
 * encoded in its name (e.g. i2_plus_g1_minus_8 => %i2 + %g1 - 8), leaves
 * it in %i0 and exits through __restore_asi.  The offsets mirror how far
 * a given load/store in the copy loops is past the already-accounted
 * progress in %g1/%i4.
 */
0089 ENTRY(NG_ret_i2_plus_g1)
0090     ba,pt   %xcc, __restore_asi
0091      add    %i2, %g1, %i0
0092 ENDPROC(NG_ret_i2_plus_g1)
0093 ENTRY(NG_ret_i2_plus_g1_minus_8)
0094     sub %g1, 8, %g1
0095     ba,pt   %xcc, __restore_asi
0096      add    %i2, %g1, %i0
0097 ENDPROC(NG_ret_i2_plus_g1_minus_8)
0098 ENTRY(NG_ret_i2_plus_g1_minus_16)
0099     sub %g1, 16, %g1
0100     ba,pt   %xcc, __restore_asi
0101      add    %i2, %g1, %i0
0102 ENDPROC(NG_ret_i2_plus_g1_minus_16)
0103 ENTRY(NG_ret_i2_plus_g1_minus_24)
0104     sub %g1, 24, %g1
0105     ba,pt   %xcc, __restore_asi
0106      add    %i2, %g1, %i0
0107 ENDPROC(NG_ret_i2_plus_g1_minus_24)
0108 ENTRY(NG_ret_i2_plus_g1_minus_32)
0109     sub %g1, 32, %g1
0110     ba,pt   %xcc, __restore_asi
0111      add    %i2, %g1, %i0
0112 ENDPROC(NG_ret_i2_plus_g1_minus_32)
0113 ENTRY(NG_ret_i2_plus_g1_minus_40)
0114     sub %g1, 40, %g1
0115     ba,pt   %xcc, __restore_asi
0116      add    %i2, %g1, %i0
0117 ENDPROC(NG_ret_i2_plus_g1_minus_40)
0118 ENTRY(NG_ret_i2_plus_g1_minus_48)
0119     sub %g1, 48, %g1
0120     ba,pt   %xcc, __restore_asi
0121      add    %i2, %g1, %i0
0122 ENDPROC(NG_ret_i2_plus_g1_minus_48)
0123 ENTRY(NG_ret_i2_plus_g1_minus_56)
0124     sub %g1, 56, %g1
0125     ba,pt   %xcc, __restore_asi
0126      add    %i2, %g1, %i0
0127 ENDPROC(NG_ret_i2_plus_g1_minus_56)
0128 ENTRY(NG_ret_i2_plus_i4)
0129     ba,pt   %xcc, __restore_asi
0130      add    %i2, %i4, %i0
0131 ENDPROC(NG_ret_i2_plus_i4)
0132 ENTRY(NG_ret_i2_plus_i4_minus_8)
0133     sub %i4, 8, %i4
0134     ba,pt   %xcc, __restore_asi
0135      add    %i2, %i4, %i0
0136 ENDPROC(NG_ret_i2_plus_i4_minus_8)
0137 ENTRY(NG_ret_i2_plus_8)
0138     ba,pt   %xcc, __restore_asi
0139      add    %i2, 8, %i0
0140 ENDPROC(NG_ret_i2_plus_8)
0141 ENTRY(NG_ret_i2_plus_4)
0142     ba,pt   %xcc, __restore_asi
0143      add    %i2, 4, %i0
0144 ENDPROC(NG_ret_i2_plus_4)
0145 ENTRY(NG_ret_i2_plus_1)
0146     ba,pt   %xcc, __restore_asi
0147      add    %i2, 1, %i0
0148 ENDPROC(NG_ret_i2_plus_1)
0149 ENTRY(NG_ret_i2_plus_g1_plus_1)
0150     add %g1, 1, %g1
0151     ba,pt   %xcc, __restore_asi
0152      add    %i2, %g1, %i0
0153 ENDPROC(NG_ret_i2_plus_g1_plus_1)
0154 ENTRY(NG_ret_i2)
0155     ba,pt   %xcc, __restore_asi
0156      mov    %i2, %i0
0157 ENDPROC(NG_ret_i2)
0158 ENTRY(NG_ret_i2_and_7_plus_i4)
0159     and %i2, 7, %i2
0160     ba,pt   %xcc, __restore_asi
0161      add    %i2, %i4, %i0
0162 ENDPROC(NG_ret_i2_and_7_plus_i4)
0163 #endif
0164 
/* void *FUNC_NAME(void *dst, const void *src, size_t len) -- returns dst.
 *
 * Dispatch: tiny copies (len < 16) go to 80f/90f, mid-size (len < 128)
 * to 70f, and everything else uses the 64-byte block-init store loops,
 * selected by the source's alignment: 16-byte aligned -> 50f, 8-byte
 * aligned -> 10:, otherwise the shift-and-mask loops at 8:/9:.
 */
0165     .align      64
0166 
0167     .globl  FUNC_NAME
0168     .type   FUNC_NAME,#function
0169 FUNC_NAME:  /* %i0=dst, %i1=src, %i2=len */
0170     PREAMBLE
0171     save        %sp, -SAVE_AMOUNT, %sp
0172     srlx        %i2, 31, %g2
0173     cmp     %g2, 0
0174     tne     %xcc, 5     ! trap if any of bits 63..31 of len are set
0175     mov     %i0, %o0
0176     cmp     %i2, 0
0177     be,pn       %XCC, 85f
0178      or     %o0, %i1, %i3
0179     cmp     %i2, 16
0180     blu,a,pn    %XCC, 80f
0181      or     %i3, %i2, %i3
0182 
0183     /* 2 blocks (128 bytes) is the minimum we can do the block
0184      * copy with.  We need to ensure that we'll iterate at least
0185      * once in the block copy loop.  At worst we'll need to align
0186      * the destination to a 64-byte boundary which can chew up
0187      * to (64 - 1) bytes from the length before we perform the
0188      * block copy loop.
0189      */
0190     cmp     %i2, (2 * 64)
0191     blu,pt      %XCC, 70f
0192      andcc      %i3, 0x7, %g0
0193 
0194     /* %o0: dst
0195      * %i1: src
0196      * %i2: len  (known to be >= 128)
0197      *
0198      * The block copy loops will use %i4/%i5,%g2/%g3 as
0199      * temporaries while copying the data.
0200      */
0201 
0202     LOAD(prefetch, %i1, #one_read)
0203     wr      %g0, STORE_ASI, %asi
0204 
0205     /* Align destination on 64-byte boundary.  */
0206     andcc       %o0, (64 - 1), %i4
0207     be,pt       %XCC, 2f
0208      sub        %i4, 64, %i4
0209     sub     %g0, %i4, %i4   ! bytes to align dst
0210     sub     %i2, %i4, %i2
0211 1:  subcc       %i4, 1, %i4
0212     EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
0213     EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
0214     add     %i1, 1, %i1
0215     bne,pt      %XCC, 1b
0216     add     %o0, 1, %o0
0217 
0218     /* If the source is on a 16-byte boundary we can do
0219      * the direct block copy loop.  If it is 8-byte aligned
0220      * we can do the 16-byte loads offset by -8 bytes and the
0221      * init stores offset by one register.
0222      *
0223      * If the source is not even 8-byte aligned, we need to do
0224      * shifting and masking (basically integer faligndata).
0225      *
0226      * The careful bit with init stores is that if we store
0227      * to any part of the cache line we have to store the whole
0228      * cacheline else we can end up with corrupt L2 cache line
0229      * contents.  Since the loop works on 64-bytes of 64-byte
0230      * aligned store data at a time, this is easy to ensure.
0231      */
0232 2:
0233     andcc       %i1, (16 - 1), %i4
0234     andn        %i2, (64 - 1), %g1  ! block copy loop iterator
0235     be,pt       %XCC, 50f
0236      sub        %i2, %g1, %i2       ! final sub-block copy bytes
0237 
0238     cmp     %i4, 8
0239     be,pt       %XCC, 10f
0240      sub        %i1, %i4, %i1
0241 
0242     /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
0243     and     %i4, 0x7, GLOBAL_SPARE
0244     sll     GLOBAL_SPARE, 3, GLOBAL_SPARE   ! misalignment in bits
0245     mov     64, %i5
0246     EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
0247     sub     %i5, GLOBAL_SPARE, %i5          ! complementary shift
0248     mov     16, %o4
0249     mov     32, %o5
0250     mov     48, %o7
0251     mov     64, %i3
0252 
0253     bg,pn       %XCC, 9f
0254      nop
0255 
0256 #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
0257     sllx        WORD1, POST_SHIFT, WORD1; \
0258     srlx        WORD2, PRE_SHIFT, TMP; \
0259     sllx        WORD2, POST_SHIFT, WORD2; \
0260     or      WORD1, TMP, WORD1; \
0261     srlx        WORD3, PRE_SHIFT, TMP; \
0262     or      WORD2, TMP, WORD2;
0263 
0264 8:  EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
0265     MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
0266     LOAD(prefetch, %i1 + %i3, #one_read)
0267 
0268     EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
0269     EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0270 
0271     EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
0272     MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
0273 
0274     EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0275     EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0276 
0277     EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0278     MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
0279 
0280     EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0281     EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0282 
0283     EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
0284     add     %i1, 64, %i1
0285     MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
0286 
0287     EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0288     EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0289 
0290     subcc       %g1, 64, %g1
0291     bne,pt      %XCC, 8b
0292      add        %o0, 64, %o0
0293 
0294     ba,pt       %XCC, 60f
0295      add        %i1, %i4, %i1
0296 
0297 9:  EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
0298     MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
0299     LOAD(prefetch, %i1 + %i3, #one_read)
0300 
0301     EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
0302     EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0303 
0304     EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
0305     MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
0306 
0307     EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0308     EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0309 
0310     EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0311     MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
0312 
0313     EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0314     EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0315 
0316     EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
0317     add     %i1, 64, %i1
0318     MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
0319 
0320     EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0321     EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0322 
0323     subcc       %g1, 64, %g1
0324     bne,pt      %XCC, 9b
0325      add        %o0, 64, %o0
0326 
0327     ba,pt       %XCC, 60f
0328      add        %i1, %i4, %i1
0329 
0330 10: /* Destination is 64-byte aligned, source was only 8-byte
0331      * aligned but it has been subtracted by 8 and we perform
0332      * one twin load ahead, then add 8 back into source when
0333      * we finish the loop.
0334      */
0335     EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
0336     mov 16, %o7
0337     mov 32, %g2
0338     mov 48, %g3
0339     mov 64, %o1
0340 1:  EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
0341     LOAD(prefetch, %i1 + %o1, #one_read)
0342     EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)   ! initializes cache line
0343     EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0344     EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
0345     EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0346     EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0347     EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0348     EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0349     EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0350     EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
0351     add     %i1, 64, %i1
0352     EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0353     EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0354     subcc       %g1, 64, %g1
0355     bne,pt      %XCC, 1b
0356      add        %o0, 64, %o0
0357 
0358     ba,pt       %XCC, 60f
0359      add        %i1, 0x8, %i1
0360 
0361 50: /* Destination is 64-byte aligned, and source is 16-byte
0362      * aligned.
0363      */
0364     mov 16, %o7
0365     mov 32, %g2
0366     mov 48, %g3
0367     mov 64, %o1
0368 1:  EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
0369     EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
0370     LOAD(prefetch, %i1 + %o1, #one_read)
0371     EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)   ! initializes cache line
0372     EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0373     EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
0374     EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0375     EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0376     EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0377     add %i1, 64, %i1
0378     EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0379     EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0380     EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0381     EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0382     subcc   %g1, 64, %g1
0383     bne,pt  %XCC, 1b
0384      add    %o0, 64, %o0
0385     /* fall through */
0386 
0387 60:     ! block copy done: drain init-stores, restore %asi, copy tail
0388     membar      #Sync
0389 
0390     /* %i2 contains any final bytes still needed to be copied
0391      * over. If anything is left, we copy it one byte at a time.
0392      */
0393     RESTORE_ASI(%i3)
0394     brz,pt      %i2, 85f
0395      sub        %o0, %i1, %i3
0396     ba,a,pt     %XCC, 90f
0397      nop
0398 
0399     .align      64
0400 70: /* 16 < len <= 64 */
0401     bne,pn      %XCC, 75f
0402      sub        %o0, %i1, %i3
0403 
0404 72:     ! 8-byte-aligned tail copy, %i3 = dst - src throughout
0405     andn        %i2, 0xf, %i4
0406     and     %i2, 0xf, %i2
0407 1:  subcc       %i4, 0x10, %i4
0408     EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
0409     add     %i1, 0x08, %i1
0410     EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
0411     sub     %i1, 0x08, %i1
0412     EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
0413     add     %i1, 0x8, %i1
0414     EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
0415     bgu,pt      %XCC, 1b
0416      add        %i1, 0x8, %i1
0417 73: andcc       %i2, 0x8, %g0
0418     be,pt       %XCC, 1f
0419      nop
0420     sub     %i2, 0x8, %i2
0421     EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
0422     EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
0423     add     %i1, 0x8, %i1
0424 1:  andcc       %i2, 0x4, %g0
0425     be,pt       %XCC, 1f
0426      nop
0427     sub     %i2, 0x4, %i2
0428     EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
0429     EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
0430     add     %i1, 0x4, %i1
0431 1:  cmp     %i2, 0
0432     be,pt       %XCC, 85f
0433      nop
0434     ba,pt       %xcc, 90f
0435      nop
0436 
0437 75:     ! dst|src not 8-byte aligned: byte-align dst first
0438     andcc       %o0, 0x7, %g1
0439     sub     %g1, 0x8, %g1
0440     be,pn       %icc, 2f
0441      sub        %g0, %g1, %g1
0442     sub     %i2, %g1, %i2
0443 
0444 1:  subcc       %g1, 1, %g1
0445     EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
0446     EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
0447     bgu,pt      %icc, 1b
0448      add        %i1, 1, %i1
0449 
0450 2:  add     %i1, %i3, %o0
0451     andcc       %i1, 0x7, %g1
0452     bne,pt      %icc, 8f
0453      sll        %g1, 3, %g1
0454 
0455     cmp     %i2, 16
0456     bgeu,pt     %icc, 72b
0457      nop
0458     ba,a,pt     %xcc, 73b
0459 
0460 8:  mov     64, %i3     ! src still unaligned: shift-and-merge 8-byte words
0461     andn        %i1, 0x7, %i1
0462     EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
0463     sub     %i3, %g1, %i3
0464     andn        %i2, 0x7, %i4
0465     sllx        %g2, %g1, %g2
0466 1:  add     %i1, 0x8, %i1
0467     EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
0468     subcc       %i4, 0x8, %i4
0469     srlx        %g3, %i3, %i5
0470     or      %i5, %g2, %i5
0471     EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
0472     add     %o0, 0x8, %o0
0473     bgu,pt      %icc, 1b
0474      sllx       %g3, %g1, %g2
0475 
0476     srl     %g1, 3, %g1
0477     andcc       %i2, 0x7, %i2
0478     be,pn       %icc, 85f
0479      add        %i1, %g1, %i1
0480     ba,pt       %xcc, 90f
0481      sub        %o0, %i1, %i3
0482 
0483     .align      64
0484 80: /* 0 < len <= 16 */
0485     andcc       %i3, 0x3, %g0
0486     bne,pn      %XCC, 90f
0487      sub        %o0, %i1, %i3
0488 
0489 1:
0490     subcc       %i2, 4, %i2
0491     EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
0492     EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
0493     bgu,pt      %XCC, 1b
0494      add        %i1, 4, %i1
0495 
0496 85: ret
0497      restore    EX_RETVAL(%i0), %g0, %o0
0498 
0499     .align      32
0500 90:
0501     subcc       %i2, 1, %i2
0502     EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
0503     EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
0504     bgu,pt      %XCC, 90b
0505      add        %i1, 1, %i1
0506     ret
0507      restore    EX_RETVAL(%i0), %g0, %o0
0508 
0509     .size       FUNC_NAME, .-FUNC_NAME