/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *  void *
 *  memset(void *sp1, int c, size_t n)
 *  {
 *      if (n != 0) {
 *          char *sp = sp1;
 *          do {
 *              *sp++ = (char)c;
 *          } while (--n != 0);
 *      }
 *      return (sp1);
 *  }
 *
 * The algorithm is as follows:
 *
 *  For small stores of 7 or fewer bytes, the bytes are stored individually.
 *
 *  For stores of fewer than 32 bytes, align the address on a 4-byte
 *  boundary. Then store as many 4-byte chunks as possible, followed by
 *  any trailing bytes.
 *
 *  For sizes of 32 bytes or more, align the address on an 8-byte boundary.
 *  if (count >= 64) {
 *      store 8-byte chunks to align the address on a 64-byte boundary
 *      if (value to be set is zero && count >= MIN_ZERO) {
 *          Using BIS stores, set the first long word of each
 *          64-byte cache line to zero, which will also clear the
 *          other seven long words of the cache line.
 *      }
 *      else if (count >= MIN_LOOP) {
 *          Using BIS stores, set the first long word of each of
 *          ST_CHUNK cache lines (64 bytes each) before the main
 *          loop is entered.
 *          In the main loop, continue pre-setting the first long
 *          word of each cache line ST_CHUNK lines in advance while
 *          setting the other seven long words (56 bytes) of each
 *          cache line until fewer than ST_CHUNK*64 bytes remain.
 *          Then set the remaining seven long words of each cache
 *          line that has already had its first long word set.
 *      }
 *      store remaining data in 64-byte chunks until fewer than
 *      64 bytes remain.
 *  }
 *  Store as many 8-byte chunks as possible, followed by any trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore. The
 * benefit of the BIS store must be balanced against the cost of the
 * membar operation.
 */
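
/*
 * Illustration only (not part of the original source): a rough C-level
 * sketch of the size dispatch described above. The function name
 * m7_memset_sketch is an assumption for readability, and plain loads and
 * stores are used throughout; the BIS/ASI behaviour of the real code has
 * no portable C equivalent, so the large-block paths collapse into the
 * 8-byte store loop here. Strict-aliasing concerns are ignored for
 * clarity.
 *
 *  #include <stddef.h>
 *  #include <stdint.h>
 *
 *  void *m7_memset_sketch(void *sp1, int c, size_t n)
 *  {
 *      unsigned char *sp = sp1;
 *      uint64_t pat = (uint8_t)c * 0x0101010101010101ULL;
 *
 *      if (n <= 7) {                            // .wrchar
 *          while (n--)
 *              *sp++ = (unsigned char)c;
 *          return sp1;
 *      }
 *      if (n < 32) {                            // .wdalign / .wrword
 *          while ((uintptr_t)sp & 3) {
 *              *sp++ = (unsigned char)c;
 *              n--;
 *          }
 *          for (; n >= 4; n -= 4, sp += 4)
 *              *(uint32_t *)sp = (uint32_t)pat;
 *      } else {                                 // .dbalign / .blkwr
 *          while ((uintptr_t)sp & 7) {
 *              *sp++ = (unsigned char)c;
 *              n--;
 *          }
 *          for (; n >= 8; n -= 8, sp += 8)      // BIS paths (.wrzero,
 *              *(uint64_t *)sp = pat;           // .wr_loop) end up here
 *      }
 *      while (n--)                              // trailing bytes
 *          *sp++ = (unsigned char)c;
 *      return sp1;
 *  }
 */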

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#include <asm/asi.h>
#include <asm/page.h>

#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P


#define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
#define MIN_LOOP        16320
#define MIN_ZERO        512

    .section    ".text"
    .align      32

/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 */
    .globl      M7clear_page
    .globl      M7clear_user_page
M7clear_page:       /* clear_page(dest) */
M7clear_user_page:
    set     PAGE_SIZE, %o1
    /* fall through into bzero code */

    .size       M7clear_page,.-M7clear_page
    .size       M7clear_user_page,.-M7clear_user_page

/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 */
    .globl      M7bzero
M7bzero:        /* bzero(dest, size) */
    mov     %o1, %o2
    mov     0, %o1
    /* fall through into memset code */

    .size       M7bzero,.-M7bzero

    .global     M7memset
    .type       M7memset, #function
    .register   %g3, #scratch
M7memset:
    mov     %o0, %o5                ! copy sp1 before using it
    cmp     %o2, 7                  ! if small counts, just write bytes
    bleu,pn %xcc, .wrchar
     and     %o1, 0xff, %o1          ! o1 is (char)c

    sll     %o1, 8, %o3
    or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
    sll     %o1, 16, %o3
    cmp     %o2, 32
    blu,pn  %xcc, .wdalign
     or      %o1, %o3, %o1           ! now o1 has 4 bytes of c

    sllx    %o1, 32, %o3
    or      %o1, %o3, %o1           ! now o1 has 8 bytes of c
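
/*
 * Illustration only (not in the original source): the shift/or sequence
 * above replicates the low byte of c across all eight bytes of %o1.
 * In C terms, with a uint64_t pat standing in for %o1:
 *
 *  pat  = c & 0xff;
 *  pat |= pat << 8;        // 2 bytes of c
 *  pat |= pat << 16;       // 4 bytes of c
 *  pat |= pat << 32;       // 8 bytes of c
 */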

.dbalign:
    andcc   %o5, 7, %o3             ! is sp1 aligned on an 8-byte boundary?
    bz,pt   %xcc, .blkalign         ! already long word aligned
     sub     %o3, 8, %o3             ! -(bytes till long word aligned)

    add     %o2, %o3, %o2           ! update o2 with new count
    ! Set -(%o3) bytes till sp1 long word aligned
1:  stb     %o1, [%o5]              ! there is at least 1 byte to set
    inccc   %o3                     ! byte clearing loop
    bl,pt   %xcc, 1b
     inc     %o5

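/*
 * Illustration only (not in the original source): C equivalent of the
 * byte-alignment step above, with sp and count standing in for %o5 and
 * %o2. %o3 holds a negative byte count that is incremented up to zero:
 *
 *  long pad = (long)((uintptr_t)sp & 7) - 8;   // -(bytes until aligned)
 *  count += pad;                               // pad is negative
 *  do {
 *      *sp++ = (char)c;
 *  } while (++pad < 0);
 */
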
    ! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
    cmp     %o2, 64                 ! check if there are 64 bytes to set
    blu,pn  %xcc, .wrshort
     mov     %o2, %o3

    andcc   %o5, 63, %o3            ! is sp1 block aligned?
    bz,pt   %xcc, .blkwr            ! now block aligned
     sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
    add     %o2, %o3, %o2           ! o2 is the remainder

    ! Store -(%o3) bytes till dst is block (64 byte) aligned.
    ! Use long word stores.
    ! Recall that dst is already long word aligned
1:
    addcc   %o3, 8, %o3
    stx     %o1, [%o5]
    bl,pt   %xcc, 1b
     add     %o5, 8, %o5
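
/*
 * Illustration only (not in the original source): the loop above uses
 * the same negative-count trick, but with 8-byte stores to reach a
 * 64-byte boundary (pad = (dst & 63) - 64, already folded into the
 * remaining count):
 *
 *  do {
 *      *(uint64_t *)dst = pat;
 *      dst += 8;
 *  } while ((pad += 8) < 0);
 */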

    ! Now sp1 is block aligned
.blkwr:
    andn    %o2, 63, %o4            ! calculate size of blocks in bytes
    brz,pn  %o1, .wrzero            ! special case if c == 0
     and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.

    set     MIN_LOOP, %g1
    cmp     %o4, %g1                ! check there are enough bytes to set
    blu,pn  %xcc, .short_set        ! to justify cost of membar
                                    ! must be > pre-cleared lines
     nop

    ! initial cache-clearing stores
    ! get store pipeline moving
    rd      %asi, %g3               ! save %asi to be restored later
    wr      %g0, ASI_STBIMRU_P, %asi

    ! Primary memset loop for large memsets
.wr_loop:
    sub     %o5, 8, %o5             ! adjust %o5 for ASI store alignment
    mov     ST_CHUNK, %g1
.wr_loop_start:
    stxa    %o1, [%o5+8]%asi
    subcc   %g1, 4, %g1
    stxa    %o1, [%o5+8+64]%asi
    add     %o5, 256, %o5
    stxa    %o1, [%o5+8-128]%asi
    bgu     %xcc, .wr_loop_start
     stxa    %o1, [%o5+8-64]%asi

    sub     %o5, ST_CHUNK*64, %o5   ! reset %o5
    mov     ST_CHUNK, %g1

.wr_loop_rest:
    stxa    %o1, [%o5+8+8]%asi
    sub     %o4, 64, %o4
    stxa    %o1, [%o5+16+8]%asi
    subcc   %g1, 1, %g1
    stxa    %o1, [%o5+24+8]%asi
    stxa    %o1, [%o5+32+8]%asi
    stxa    %o1, [%o5+40+8]%asi
    add     %o5, 64, %o5
    stxa    %o1, [%o5-8]%asi
    bgu     %xcc, .wr_loop_rest
     stxa    %o1, [%o5]ASI_STBI_P

    ! If more than ST_CHUNK*64 bytes remain to set, continue
    ! setting the first long word of each cache line in advance
    ! to keep the store pipeline moving.

    cmp     %o4, ST_CHUNK*64
    bge,pt  %xcc, .wr_loop_start
     mov     ST_CHUNK, %g1

    brz,a,pn %o4, .asi_done
     add     %o5, 8, %o5             ! restore %o5 offset

.wr_loop_small:
    stxa    %o1, [%o5+8]%asi
    stxa    %o1, [%o5+8+8]%asi
    stxa    %o1, [%o5+16+8]%asi
    stxa    %o1, [%o5+24+8]%asi
    stxa    %o1, [%o5+32+8]%asi
    subcc   %o4, 64, %o4
    stxa    %o1, [%o5+40+8]%asi
    add     %o5, 64, %o5
    stxa    %o1, [%o5-8]%asi
    bgu,pt  %xcc, .wr_loop_small
     stxa    %o1, [%o5]ASI_STBI_P

    ba      .asi_done
     add     %o5, 8, %o5             ! restore %o5 offset
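
/*
 * Illustration only (not in the original source): the three loops above
 * implement the pre-set/fill scheme from the header comment. Treating
 * the destination as uint64_t line[][8] (one row per 64-byte cache
 * line), the structure is roughly:
 *
 *  while (remaining >= ST_CHUNK * 64) {
 *      for (i = 0; i < ST_CHUNK; i++)          // .wr_loop_start:
 *          line[i][0] = pat;                   //   BIS-MRU stores
 *      for (i = 0; i < ST_CHUNK; i++)          // .wr_loop_rest:
 *          for (j = 1; j < 8; j++)             //   words 1..7; word 7
 *              line[i][j] = pat;               //   uses ASI_STBI_P
 *      line += ST_CHUNK;
 *      remaining -= ST_CHUNK * 64;
 *  }
 *  for (; remaining >= 64; remaining -= 64, line++)
 *      for (j = 0; j < 8; j++)                 // .wr_loop_small:
 *          line[0][j] = pat;                   //   full-line stores
 */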

    ! Special case loop for zero fill memsets
    ! For each 64 byte cache line, single STBI to first element
    ! clears line
.wrzero:
    cmp     %o4, MIN_ZERO           ! check if enough bytes to set
                                    ! to pay %asi + membar cost
    blu     %xcc, .short_set
     nop
    sub     %o4, 256, %o4

.wrzero_loop:
    mov     64, %g3
    stxa    %o1, [%o5]ASI_STBI_P
    subcc   %o4, 256, %o4
    stxa    %o1, [%o5+%g3]ASI_STBI_P
    add     %o5, 256, %o5
    sub     %g3, 192, %g3
    stxa    %o1, [%o5+%g3]ASI_STBI_P
    add     %g3, 64, %g3
    bge,pt  %xcc, .wrzero_loop
     stxa    %o1, [%o5+%g3]ASI_STBI_P
    add     %o4, 256, %o4

    brz,pn  %o4, .bsi_done
     nop

.wrzero_small:
    stxa    %o1, [%o5]ASI_STBI_P
    subcc   %o4, 64, %o4
    bgu,pt  %xcc, .wrzero_small
     add     %o5, 64, %o5
    ba,a    .bsi_done
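
/*
 * Illustration only (not in the original source): conceptually the
 * zero-fill path issues a single block-init store to the first long
 * word of each 64-byte cache line; the BIS store clears the other
 * seven long words of the line, so no further stores are needed.
 * block_init_store_zero() below is a hypothetical stand-in for the
 * "stxa %o1, [dst] ASI_STBI_P" stores above (%o1 is zero on this path):
 *
 *  for (; remaining >= 64; remaining -= 64, dst += 64)
 *      block_init_store_zero(dst);     // clears the whole line
 */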

.asi_done:
    wr      %g3, 0x0, %asi          ! restore saved %asi
.bsi_done:
    membar  #StoreStore             ! required by use of Block Store Init

.short_set:
    cmp     %o4, 64                 ! check if 64 bytes to set
    blu     %xcc, 5f
     nop
4:                                      ! set final blocks of 64 bytes
    stx     %o1, [%o5]
    stx     %o1, [%o5+8]
    stx     %o1, [%o5+16]
    stx     %o1, [%o5+24]
    subcc   %o4, 64, %o4
    stx     %o1, [%o5+32]
    stx     %o1, [%o5+40]
    add     %o5, 64, %o5
    stx     %o1, [%o5-16]
    bgu,pt  %xcc, 4b
     stx     %o1, [%o5-8]

5:
    ! Set the remaining long words
.wrshort:
    subcc   %o3, 8, %o3             ! Can we store any long words?
    blu,pn  %xcc, .wrchars
     and     %o2, 7, %o2             ! calc bytes left after long words
6:
    subcc   %o3, 8, %o3
    stx     %o1, [%o5]              ! store the long words
    bgeu,pt %xcc, 6b
     add     %o5, 8, %o5

.wrchars:                               ! check for extra chars
    brnz    %o2, .wrfin
     nop
    retl
     nop

.wdalign:
    andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary?
    bz,pn   %xcc, .wrword
     andn    %o2, 3, %o3             ! create word sized count in %o3

    dec     %o2                     ! decrement count
    stb     %o1, [%o5]              ! clear a byte
    b       .wdalign
     inc     %o5                     ! next byte

.wrword:
    subcc   %o3, 4, %o3
    st      %o1, [%o5]              ! 4-byte writing loop
    bnz,pt  %xcc, .wrword
     add     %o5, 4, %o5

    and     %o2, 3, %o2             ! leftover count, if any

.wrchar:
    ! Set the remaining bytes, if any
    brz     %o2, .exit
     nop
.wrfin:
    deccc   %o2
    stb     %o1, [%o5]
    bgu,pt  %xcc, .wrfin
     inc     %o5
.exit:
    retl                            ! %o0 was preserved
     nop

    .size       M7memset,.-M7memset