Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev67-strrchr.S
0004  * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
0005  *
0006  * Finds the last occurrence of a byte in a 0-terminated string.
0007  * Optimized for the Alpha architecture:
0008  *
0009  *  - memory accessed as aligned quadwords only
0010  *  - uses cmpbge to compare 8 bytes in parallel
0011  *
0012  * Much of the information about 21264 scheduling/coding comes from:
0013  *  Compiler Writer's Guide for the Alpha 21264
0014  *  abbreviated as 'CWG' in other comments here
0015  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0016  * Scheduling notation:
0017  *  E   - either cluster
0018  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0019  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0020  */
0021 
0022 #include <asm/export.h>
0023 #include <asm/regdef.h>
0024 
0025     .set noreorder      # code below is hand-scheduled in groups of four
0026     .set noat
0027 
0028     .align 4
0029     .ent strrchr        # strrchr: find the last byte equal to (a1 & 0xff)
0030     .globl strrchr      #   in the 0-terminated string starting at a0
0031 strrchr:                # returns v0 = address of that byte, or 0 if none
0032     .frame sp, 0, ra    # frame size 0, return address in ra
0033     .prologue 0
0034 
0035     and a1, 0xff, t2    # E : 00000000000000ch
0036     insbl   a1, 1, t4   # U : 000000000000ch00
0037     insbl   a1, 2, t5   # U : 0000000000ch0000
0038     ldq_u   t0, 0(a0)   # L : load first (aligned) quadword Latency=3
0039 
0040     mov zero, t6    # E : t6 is last match aligned addr (0 = none yet)
0041     or  t2, t4, a1  # E : 000000000000chch
0042     sll t5, 8, t3   # U : 00000000ch000000
0043     mov zero, t8    # E : t8 is last match byte compare mask (0 = none yet)
0044 
0045     andnot  a0, 7, v0   # E : align source addr
0046     or  t5, t3, t3  # E : 00000000chch0000
0047     sll a1, 32, t2  # U : 0000chch00000000
0048     sll a1, 48, t4  # U : chch000000000000
0049 
0050     or  t4, a1, a1  # E : chch00000000chch
0051     or  t2, t3, t2  # E : 0000chchchch0000
0052     or  a1, t2, a1  # E : chchchchchchchch (search byte in all 8 lanes)
0053     lda t5, -1      # E : build garbage mask (start from all ones)
0054 
0055     cmpbge  zero, t0, t1    # E : bits set iff byte == zero
0056     mskqh   t5, a0, t4  # E : Complete garbage mask: 00 in bytes before a0
0057     xor t0, a1, t2  # E : make bytes == c zero
0058     cmpbge  zero, t4, t4    # E : bits set iff byte is garbage
0059 
0060     cmpbge  zero, t2, t3    # E : bits set iff byte == c
0061     andnot  t1, t4, t1  # E : clear garbage from null test
0062     andnot  t3, t4, t3  # E : clear garbage from char test
0063     bne t1, $eos    # U : did we already hit the terminator?
0064 
0065     /* Character search main loop: on entry t3 = match mask for the quadword at v0 */
0066 $loop:
0067     ldq t0, 8(v0)   # L : load next quadword
0068     cmovne  t3, v0, t6  # E : previous quadword matched: remember its address
0069     nop         #   : Latency=2, extra map slot (keep nop with cmov)
0070     nop         #   : filler, completes the group of four
0071 
0072     cmovne  t3, t3, t8  # E : ... and its match mask. Latency=2, extra map slot
0073     nop         #   : keep with cmovne
0074     addq    v0, 8, v0   # E : v0 = address of the quadword just loaded
0075     xor t0, a1, t2  # E : make bytes == c zero
0076 
0077     cmpbge  zero, t0, t1    # E : bits set iff byte == zero
0078     cmpbge  zero, t2, t3    # E : bits set iff byte == c
0079     beq t1, $loop   # U : if we have not seen a null, loop
0080     nop         #   : filler, completes the group of four
0081 
0082     /* Mask out character matches after terminator (t1 = null mask, t3 = match mask, both for quadword at v0) */
0083 $eos:
0084     negq    t1, t4      # E : isolate first null byte match
0085     and t1, t4, t4  # E : t4 = lowest set bit of t1
0086     subq    t4, 1, t5   # E : build a mask of the bytes up to...
0087     or  t4, t5, t4  # E : ... and including the null
0088 
0089     and t3, t4, t3  # E : mask out char matches after null
0090     cmovne  t3, t3, t8  # E : save it, if match found Latency=2, extra map slot
0091     nop         #   : Keep with cmovne
0092     nop         #   : filler, completes the group of four
0093 
0094     cmovne  t3, v0, t6  # E : final quadword matched: it holds the last match
0095     nop         #   : Keep with cmovne
0096     /* Locate the address of the last matched character: highest set bit of t8 is its byte index */
0097     ctlz    t8, t2      # U0 : Latency=3 (0x40 for t8=0)
0098     nop         #   : filler, completes the group of four
0099 
0100     cmoveq  t8, 0x3f, t2    # E : Compensate for case when no match is seen
0101     nop         # E : hide the cmov latency (2) behind ctlz latency
0102     lda t5, 0x3f($31)   # E : t5 = 63
0103     subq    t5, t2, t5  # E : Normalize leading zero count: byte offset, 0 if no match
0104 
0105     addq    t6, t5, v0  # E : and add to quadword address (0 + 0 if no match)
0106     ret         # L0 : Latency=3
0107     nop         #   : filler, completes the group of four
0108     nop         #   : filler, completes the group of four
0109 
0110     .end strrchr
0111     EXPORT_SYMBOL(strrchr)