Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev6-stxcpy.S
0004  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
0005  *
0006  * Copy a null-terminated string from SRC to DST.
0007  *
0008  * This is an internal routine used by strcpy, stpcpy, and strcat.
0009  * As such, it uses special linkage conventions to make implementation
0010  * of these public functions more efficient.
0011  *
0012  * On input:
0013  *  t9 = return address
0014  *  a0 = DST
0015  *  a1 = SRC
0016  *
0017  * On output:
0018  *  t12 = bitmask (with one bit set) indicating the last byte written
0019  *  a0  = unaligned address of the last *word* written
0020  *
0021  * Furthermore, v0, a3-a5, and t11 are untouched (t12 holds the output above).
0022  *
0023  * Much of the information about 21264 scheduling/coding comes from:
0024  *  Compiler Writer's Guide for the Alpha 21264
0025  *  abbreviated as 'CWG' in other comments here
0026  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0027  * Scheduling notation:
0028  *  E   - either cluster
0029  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0030  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0031  * Try not to change the actual algorithm if possible for consistency.
0032  */
0033 
0034 #include <asm/regdef.h>
0035 
0036     .set noat
0037     .set noreorder
0038 
0039     .text
0040 
0041 /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
0042    doesn't like putting the entry point for a procedure somewhere in the
0043    middle of the procedure descriptor.  Work around this by putting the
0044    aligned copy in its own procedure descriptor */
0045 
0046 
0047     .ent stxcpy_aligned
0048     .align 4
0049 stxcpy_aligned:
0050     .frame sp, 0, t9
0051     .prologue 0
0052 
0053     /* On entry to this basic block:
0054        t0 == the first destination word for masking back in
0055        t1 == the first source word.
            a0 and a1 agree in their low three address bits (co-aligned),
            and a1 has already been advanced by 8 past the first source
            word, so its low bits still give the starting byte offset.  */
0056 
0057     /* Create the 1st output word and detect 0's in the 1st input word.  */
0058     lda t2, -1      # E : build a mask against false zero
0059     mskqh   t2, a1, t2  # U :   detection in the src word (stall)
0060     mskqh   t1, a1, t3  # U : t3 = src bytes at or above the start offset
0061     ornot   t1, t2, t2  # E : force bytes below the offset nonzero (stall)
0062 
0063     mskql   t0, a1, t0  # U : assemble the first output word
0064     cmpbge  zero, t2, t8    # E : bits set iff null found
0065     or  t0, t3, t1  # E : t1 = kept dst bytes | valid src bytes (stall)
0066     bne t8, $a_eos  # U : (stall)
0067 
0068     /* On entry to this basic block:
0069        t0 == the first destination word for masking back in
0070        t1 == a source word not containing a null.  */
0071     /* Nops here to separate store quads from load quads */
0072 
0073 $a_loop:
0074     stq_u   t1, 0(a0)   # L : store a full word with no null in it
0075     addq    a0, 8, a0   # E :
0076     nop
0077     nop
0078 
0079     ldq_u   t1, 0(a1)   # L : Latency=3
0080     addq    a1, 8, a1   # E :
0081     cmpbge  zero, t1, t8    # E : t8 bit i set iff src byte i is zero (3 cycle stall)
0082     beq t8, $a_loop # U : (stall for t8)
0083 
0084     /* Take care of the final (partial) word store.
0085        On entry to this basic block we have:
0086        t1 == the source word containing the null
0087        t8 == the cmpbge mask that found it.  */
0088 $a_eos:
0089     negq    t8, t6      # E : find low bit set
0090     and t8, t6, t12 # E : t12 = lowest set bit, marking the first null (stall)
0091     /* For the sake of the cache, don't read a destination word
0092        if we're not going to need it.  */
0093     and t12, 0x80, t6   # E : is the null in byte 7? (stall)
0094     bne t6, 1f      # U : yes, the whole word gets stored (stall)
0095 
0096     /* We're doing a partial word store and so need to combine
0097        our source and original destination words.  */
0098     ldq_u   t0, 0(a0)   # L : Latency=3
0099     subq    t12, 1, t6  # E : t6 = mask of bytes below the null
0100     zapnot  t1, t6, t1  # U : clear src bytes >= null (stall)
0101     or  t12, t6, t8 # E : t8 = mask of bytes <= null (stall)
0102 
0103     zap t0, t8, t0  # U : clear dst bytes <= null
0104     or  t0, t1, t1  # E : (stall)
0105     nop
0106     nop
0107 
0108 1:  stq_u   t1, 0(a0)   # L : a0 is left at this last word written
0109     ret (t9)        # L0 : Latency=3
0110     nop
0111     nop
0112 
0113     .end stxcpy_aligned
0114 
0115     .align 4
0116     .ent __stxcpy
0117     .globl __stxcpy
0118 __stxcpy:
0119     .frame sp, 0, t9
0120     .prologue 0
0121 
0122     /* Are source and destination co-aligned, i.e. do a0 and a1 agree
            in their low three address bits?  */
0123     xor a0, a1, t0  # E :
0124     unop            # E :
0125     and t0, 7, t0   # E : (stall)
0126     bne t0, $unaligned  # U : (stall)
0127 
0128     /* We are co-aligned; take care of a partial first word.  */
0129     ldq_u   t1, 0(a1)       # L : load first src word
0130     and a0, 7, t0       # E : take care not to load a word ...
0131     addq    a1, 8, a1       # E : step a1 past the first src word
0132     beq t0, stxcpy_aligned  # U : ... if we wont need it (stall)
0133 
0134     ldq_u   t0, 0(a0)   # L : first dest word, for masking back in
0135     br  stxcpy_aligned  # L0 : Latency=3
0136     nop
0137     nop
0138 
0139 
0140 /* The source and destination are not co-aligned.  Align the destination
0141    and cope.  We have to be very careful about not reading too much and
0142    causing a SEGV.  */
0143 
0144     .align 4
0145 $u_head:
0146     /* We know just enough now to be able to assemble the first
0147        full source word.  We can still find a zero at the end of it
0148        that prevents us from outputting the whole thing.
0149 
0150        On entry to this basic block:
0151        t0 == the first dest word, for masking back in, if needed else 0
0152        t1 == the low bits of the first source word
0153        t6 == bytemask that is -1 in dest word bytes */
0154 
0155     ldq_u   t2, 8(a1)   # L :
0156     addq    a1, 8, a1   # E :
0157     extql   t1, a1, t1  # U : (stall on a1)
0158     extqh   t2, a1, t4  # U : (stall on a1)
0159 
0160     mskql   t0, a0, t0  # U : keep dest bytes below the a0 offset
0161     or  t1, t4, t1  # E :
0162     mskqh   t1, a0, t1  # U : (stall on t1)
0163     or  t0, t1, t1  # E : (stall on t1)
0164 
0165     or  t1, t6, t6  # E : hide preserved dest bytes from the null test
0166     cmpbge  zero, t6, t8    # E : (stall)
0167     lda t6, -1      # E : for masking just below
0168     bne t8, $u_final    # U : (stall)
0169 
0170     mskql   t6, a1, t6      # U : mask out the bits we have
0171     or  t6, t2, t2      # E :   already extracted before (stall)
0172     cmpbge  zero, t2, t8        # E :   testing eos (stall)
0173     bne t8, $u_late_head_exit   # U : (stall)
0174 
0175     /* Finally, we've got all the stupid leading edge cases taken care
0176        of and we can set up to enter the main loop.  */
0177 
0178     stq_u   t1, 0(a0)   # L : store first output word
0179     addq    a0, 8, a0   # E :
0180     extql   t2, a1, t0  # U : position ho-bits of lo word
0181     ldq_u   t2, 8(a1)   # L : read next high-order source word
0182 
0183     addq    a1, 8, a1   # E :
0184     cmpbge  zero, t2, t8    # E : (stall for t2)
0185     nop         # E :
0186     bne t8, $u_eos  # U : (stall)
0187 
0188     /* Unaligned copy main loop.  In order to avoid reading too much,
0189        the loop is structured to detect zeros in aligned source words.
0190        This has, unfortunately, effectively pulled half of a loop
0191        iteration out into the head and half into the tail, but it does
0192        prevent nastiness from accumulating in the very thing we want
0193        to run as fast as possible.
0194 
0195        On entry to this basic block:
0196        t0 == the shifted high-order bits from the previous source word
0197        t2 == the unshifted current source word
0198 
0199        We further know that t2 does not contain a null terminator.  */
0200 
0201     .align 3
0202 $u_loop:
0203     extqh   t2, a1, t1  # U : extract high bits for current word
0204     addq    a1, 8, a1   # E : (stall)
0205     extql   t2, a1, t3  # U : extract low bits for next time (stall)
0206     addq    a0, 8, a0   # E :
0207 
0208     or  t0, t1, t1  # E : current dst word now complete
0209     ldq_u   t2, 0(a1)   # L : Latency=3 load high word for next time
0210     stq_u   t1, -8(a0)  # L : save the current word (stall)
0211     mov t3, t0      # E :
0212 
0213     cmpbge  zero, t2, t8    # E : test new word for eos
0214     beq t8, $u_loop # U : (stall)
0215     nop
0216     nop
0217 
0218     /* We've found a zero somewhere in the source word we just read.
0219        If it resides in the lower half, we have one (probably partial)
0220        word to write out, and if it resides in the upper half, we
0221        have one full and one partial word left to write out.
0222 
0223        On entry to this basic block:
0224        t0 == the shifted high-order bits from the previous source word
0225        t2 == the unshifted current source word.  */
0226 $u_eos:
0227     extqh   t2, a1, t1  # U :
0228     or  t0, t1, t1  # E : first (partial) source word complete (stall)
0229     cmpbge  zero, t1, t8    # E : is the null in this first bit? (stall)
0230     bne t8, $u_final    # U : (stall)
0231 
0232 $u_late_head_exit:
0233     stq_u   t1, 0(a0)   # L : the null was in the high-order bits
0234     addq    a0, 8, a0   # E :
0235     extql   t2, a1, t1  # U :
0236     cmpbge  zero, t1, t8    # E : (stall)
0237 
0238     /* Take care of a final (probably partial) result word.
0239        On entry to this basic block:
0240        t1 == assembled source word
0241        t8 == cmpbge mask that found the null.  */
0242 $u_final:
0243     negq    t8, t6      # E : isolate low bit set
0244     and t6, t8, t12 # E : t12 = lowest set bit, marking the first null (stall)
0245     and t12, 0x80, t6   # E : avoid dest word load if we can (stall)
0246     bne t6, 1f      # U : (stall)
0247 
0248     ldq_u   t0, 0(a0)   # L :
0249     subq    t12, 1, t6  # E : t6 = mask of bytes below the null
0250     or  t6, t12, t8 # E : t8 = mask of bytes <= null (stall)
0251     zapnot  t1, t6, t1  # U : kill source bytes >= null (stall)
0252 
0253     zap t0, t8, t0  # U : kill dest bytes <= null (2 cycle data stall)
0254     or  t0, t1, t1  # E : (stall)
0255     nop
0256     nop
0257 
0258 1:  stq_u   t1, 0(a0)   # L :
0259     ret (t9)        # L0 : Latency=3
0260     nop
0261     nop
0262 
0263     /* Unaligned copy entry point.  */
0264     .align 4
0265 $unaligned:
0266 
0267     ldq_u   t1, 0(a1)   # L : load first source word
0268     and a0, 7, t4   # E : find dest misalignment
0269     and a1, 7, t5   # E : find src misalignment
0270     /* Conditionally load the first destination word and a bytemask
0271        with 0xff indicating that the destination byte is sacrosanct.  */
0272     mov zero, t0    # E :
0273 
0274     mov zero, t6    # E :
0275     beq t4, 1f      # U : dest aligned, so t0 and t6 stay zero
0276     ldq_u   t0, 0(a0)   # L :
0277     lda t6, -1      # E :
0278 
0279     mskql   t6, a0, t6  # U : t6 = 0xff in bytes below the a0 offset
0280     nop
0281     nop
0282     nop
0283 1:
0284     subq    a1, t4, a1  # E : sub dest misalignment from src addr
0285     /* If source misalignment is larger than dest misalignment, we need
0286        extra startup checks to avoid SEGV.  */
0287     cmplt   t4, t5, t12 # E : t12 = (dest misalign < src misalign)
0288     beq t12, $u_head    # U : not larger, no extra checks needed
0289     lda t2, -1      # E : mask out leading garbage in source
0290 
0291     mskqh   t2, t5, t2  # U : t2 = 0xff in bytes at or above the src offset
0292     ornot   t1, t2, t3  # E : force leading garbage bytes nonzero (stall)
0293     cmpbge  zero, t3, t8    # E : is there a zero? (stall)
0294     beq t8, $u_head # U : (stall)
0295 
0296     /* At this point we've found a zero in the first partial word of
0297        the source.  We need to isolate the valid source data and mask
0298        it into the original destination data.  (Incidentally, we know
0299        that we'll need at least one byte of that original dest word.) */
0300 
0301     ldq_u   t0, 0(a0)   # L :
0302     negq    t8, t6      # E : build bitmask of bytes <= zero
0303     and t6, t8, t12 # E : (stall)
0304     and a1, 7, t5   # E : t5 = src minus dest misalignment
0305 
0306     subq    t12, 1, t6  # E :
0307     or  t6, t12, t8 # E : (stall)
0308     srl t12, t5, t12    # U : adjust final null return value
0309     zapnot  t2, t8, t2  # U : prepare source word; mirror changes (stall)
0310 
0311     and t1, t2, t1  # E : to source validity mask
0312     extql   t2, a1, t2  # U :
0313     extql   t1, a1, t1  # U : (stall)
0314     andnot  t0, t2, t0  # E : zero place for source to reside (stall)
0315 
0316     or  t0, t1, t1  # E : and put it there
0317     stq_u   t1, 0(a0)   # L : (stall)
0318     ret (t9)        # L0 : Latency=3
0319     nop
0320 
0321     .end __stxcpy
0322