Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev67-strchr.S
0004  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
0005  *
0006  * Return the address of a given character within a null-terminated
0007  * string, or null if it is not found.
0008  *
0009  * Much of the information about 21264 scheduling/coding comes from:
0010  *  Compiler Writer's Guide for the Alpha 21264
0011  *  abbreviated as 'CWG' in other comments here
0012  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0013  * Scheduling notation:
0014  *  E   - either cluster
0015  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0016  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0017  * Try not to change the actual algorithm if possible for consistency.
0018  */
0019 #include <asm/export.h>
0020 #include <asm/regdef.h>
0021 
0022     .set noreorder      # ask the assembler to keep the hand-scheduled order below
0023     .set noat           # forbid macro expansions that clobber $at
0024 
0025     .align 4            # 2^4 = 16-byte alignment for the entry point
0026     .globl strchr
0027     .ent strchr
0028 strchr:
0029     .frame sp, 0, ra    # zero-size frame; return address stays in ra
0030     .prologue 0         # 0: the procedure value register is not used
0031     /* In: a0 = string, a1 = character.  Out: v0 = address of first match, or 0 if none. */
0032     ldq_u   t0, 0(a0)   # L : load the aligned quadword holding the first byte, Latency=3
0033     and a1, 0xff, t3    # E : 00000000000000ch (keep only the low byte of c)
0034     insbl   a1, 1, t5   # U : 000000000000ch00
0035     insbl   a1, 7, a2   # U : ch00000000000000
0036 
0037     insbl   t3, 6, a3   # U : 00ch000000000000
0038     or  t5, t3, a1  # E : 000000000000chch
0039     andnot  a0, 7, v0   # E : v0 = a0 rounded down to 8 bytes (our loop pointer)
0040     lda t4, -1      # E : all ones, raw material for the garbage mask
0041 
0042     mskqh   t4, a0, t4  # U : zero the mask bytes that lie before the string start
0043     or  a2, a3, a2  # E : chch000000000000
0044     inswl   a1, 2, t5   # E : 00000000chch0000
0045     inswl   a1, 4, a3   # E : 0000chch00000000
0046 
0047     or  a1, a2, a1  # E : chch00000000chch
0048     or  a3, t5, t5  # E : 0000chchchch0000
0049     cmpbge  zero, t0, t2    # E : t2 bit i set iff byte i == zero
0050     cmpbge  zero, t4, t4    # E : t4 bit i set iff byte i is garbage (before the string)
0051 
0052     /* This quad is _very_ serialized (each op needs the previous result).  Lots of stalling happens */
0053     or  t5, a1, a1  # E : chchchchchchchch
0054     xor t0, a1, t1  # E : make bytes == c zero
0055     cmpbge  zero, t1, t3    # E : t3 bit i set iff byte i == c
0056     or  t2, t3, t0  # E : bits set iff char match or zero match
0057 
0058     andnot  t0, t4, t0  # E : clear garbage bits (hits before the string start)
0059     cttz    t0, a2      # U0 : speculative (in case we get a match): byte index of first hit
0060     nop         # E :
0061     bne t0, $found  # U : hit in the first quad, go resolve it
0062 
0063     /*
0064      * Main loop: v0 is 8-byte aligned, so each pass reads one whole
0065      * quadword.  Yuk: this stalls like crazy waiting for the load.  Not
0066      * much can be done unless it's unrolled multiple times - is that safe
0067      * in kernel space?  Or would exception handling recovery code do it?
0068      */
0069 $loop:  ldq t0, 8(v0)   # L : next aligned quadword, Latency=3
0070     addq    v0, 8, v0   # E : v0 now addresses the quad just loaded
0071     xor t0, a1, t1  # E : make bytes == c zero
0072     cmpbge  zero, t0, t2    # E : bits set iff byte == 0
0073 
0074     cmpbge  zero, t1, t3    # E : bits set iff byte == c
0075     or  t2, t3, t0  # E : bits set iff char match or zero match
0076     cttz    t3, a2      # U0 : speculative (in case we get a match): byte index of first char match
0077     beq t0, $loop   # U : neither c nor the terminator in this quad, keep scanning
0078 
0079 $found: negq    t0, t1      # E : t1 = -t0 ...
0080     and     t0, t1, t0  # E : ... so t0 & -t0 clears all but the least set bit
0081     and t0, t3, t1  # E : nonzero iff the first hit is a byte equal to c
0082     addq    v0, a2, v0  # E : quad address + byte index from cttz above
0083 
0084     cmoveq  t1, $31, v0 # E : first hit was not c: return 0.  Two mapping slots, latency = 2
0085     nop         # E : filler (presumably pads out the four-instruction group)
0086     nop         # E : filler (presumably pads out the four-instruction group)
0087     ret         # L0 : v0 = address of the match, or 0
0088 
0089     .end strchr
0090     EXPORT_SYMBOL(strchr)