Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev6-memchr.S
0004  *
0005  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
0006  *
0007  * Finds characters in a memory area.  Optimized for the Alpha:
0008  *
0009  *    - memory accessed as aligned quadwords only
0010  *    - uses cmpbge to compare 8 bytes in parallel
0011  *    - does binary search to find the matching byte (a 0 byte
0012  *      after the xor with the search byte) in the last
0013  *      quadword (HAKMEM needed 12 instructions to do this
0014  *      instead of the 9 instructions that binary search needs).
0015  *
0016  * For correctness consider that:
0017  *
0018  *    - only minimum number of quadwords may be accessed
0019  *    - the third argument is an unsigned long
0020  *
0021  * Much of the information about 21264 scheduling/coding comes from:
0022  *  Compiler Writer's Guide for the Alpha 21264
0023  *  abbreviated as 'CWG' in other comments here
0024  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0025  * Scheduling notation:
0026  *  E   - either cluster
0027  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0028  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0029  * Try not to change the actual algorithm if possible for consistency.
0030  */
0031 #include <asm/export.h>
0032         .set noreorder
0033         .set noat
0034 
0035     .align  4
0036     .globl memchr
0037     .ent memchr
     /*
      * memchr: return in $0 the address of the first byte equal to the
      * low byte of $17 among the $18 bytes starting at $16, or 0 when
      * no such byte is found.
      *
      * Register usage, as read from the code below:
      *   $16     start address (never written)
      *   $17     search byte, replicated into all eight byte lanes
      *   $18     length on entry; later reused as the address of the
      *           last quadword to access and as the remaining byte count
      *   $0      address that corresponds to byte 0 of the quadword
      *           being examined; on return the result pointer or 0
      *   $1, $4  quadword data (they alternate in the unrolled loop)
      *   $2      per-byte match mask produced by cmpbge
      *   $3      all ones, the source for the byte masks; scratch once
      *           $found_it is reached
      *   $5      $16 + cropped length, one past the last byte to search
      *   $6, $7  scratch
      *
      * The blank-line-separated groups of four instructions and the
      * E/U/L annotations follow the 21264 scheduling notation given in
      * the file header; the nops appear to be padding for that plan, so
      * instruction order should not be changed casually.
      */
0038 memchr:
0039     .frame $30,0,$26,0
0040     .prologue 0
0041 
0042     # Hack -- if someone passes in (size_t)-1, hoping to just
0043     # search til the end of the address space, we will overflow
0044     # below when we find the address of the last byte.  Given
0045     # that we will never have a 56-bit address space, cropping
0046     # the length is the easiest way to avoid trouble.
         /*
          * NOTE(review): only $5 receives the cropped length.  The zero
          * test (beq) and the small-length test (cmpult) below still
          * read the uncropped $18 -- confirm that this is intended.
          */
0047     zap $18, 0x80, $5   # U : Bound length
0048     beq $18, $not_found # U :
0049         ldq_u   $1, 0($16)  # L : load first quadword Latency=3
0050     and $17, 0xff, $17  # E : L L U U : 00000000000000ch
0051 
         /*
          * The insbl/sll/or steps below replicate the search byte into
          * every byte of $17, interleaved with setting up $3 (all ones),
          * $4 (short-length flag) and $5 (end address).
          */
0052     insbl   $17, 1, $2  # U : 000000000000ch00
0053     cmpult  $18, 9, $4  # E : small (<= 1 quad, i.e. $18 <= 8) string?
0054     or  $2, $17, $17    # E : 000000000000chch
0055         lda     $3, -1($31) # E : U L L U
0056 
0057     sll $17, 16, $2 # U : 00000000chch0000
0058     addq    $16, $5, $5 # E : Max search address
0059     or  $2, $17, $17    # E : 00000000chchchch
0060     sll $17, 32, $2 # U : U L L U : chchchch00000000
0061 
0062     or  $2, $17, $17    # E : chchchchchchchch
0063     extql   $1, $16, $7 # U : $7 is upper bits
0064     beq $4, $first_quad # U :
0065     ldq_u   $6, -1($5)  # L : L U U L : eight or less bytes to search Latency=3
0066 
         /*
          * Short case ($18 <= 8): $6 was just loaded from the quadword
          * that holds the last byte to search (-1($5)).  Merge it with $7
          * so that $1 holds the eight bytes starting at $16, and make $0
          * the (possibly unaligned) start address.
          */
0067     extqh   $6, $16, $6 # U : 2 cycle stall for $6
0068     mov $16, $0     # E :
0069     nop         # E :
0070     or  $7, $6, $1  # E : L U L U $1 = quadword starting at $16
0071 
0072     # Deal with the case where at most 8 bytes remain to be searched
0073     # in $1.  E.g.:
0074     #   $18 = 6
0075     #   $1 = ????c6c5c4c3c2c1
         /*
          * The xor turns every byte of $1 that equals the search byte
          * into a zero byte; cmpbge against $31 (zero) then sets one bit
          * in $2 for each zero byte.  $3 (all ones) shifted right by
          * -$18 (srl uses only the low six bits of the count) leaves the
          * low $18 bits set, and the and discards matches that lie
          * beyond the bytes still to be searched.
          */
0076 $last_quad:
0077     negq    $18, $6     # E :
0078         xor $17, $1, $1 # E :
0079     srl $3, $6, $6  # U : $6 = mask of $18 bits set
0080         cmpbge  $31, $1, $2 # E : L U L U
0081 
0082     nop
0083     nop
0084     and $2, $6, $2  # E :
0085         beq     $2, $not_found  # U : U L U L
0086 
         /*
          * $found_it: $2 is a nonzero match mask (every branch to this
          * label is a bne on $2, and the fall-through follows the beq
          * above).  The index of its lowest set bit is the byte offset
          * of the first match; it is added to $0 to form the result.
          */
0087 $found_it:
0088 #ifdef CONFIG_ALPHA_EV67
0089     /*
0090      * Since we are guaranteed to have set one of the bits, we don't
0091      * have to worry about coming back with a 0x40 out of cttz...
0092      */
0093     cttz    $2, $3      # U0 :
0094     addq    $0, $3, $0  # E : All done
0095     nop         # E :
0096     ret         # L0 : L U L U
0097 #else
0098     /*
0099      * Slow and clunky.  It can probably be improved.
0100      * An exercise left for others.
0101      */
         /*
          * negq/and isolates the lowest set bit of $2.  Each of the three
          * cmoveq steps then adds 4, 2 or 1 to $0 when that bit is absent
          * from the mask 0x0f, 0x33 or 0x55 respectively, which builds
          * the bit index one binary digit at a time.
          */
0102         negq    $2, $3      # E :
0103         and     $2, $3, $2  # E :
0104         and     $2, 0x0f, $1    # E :
0105         addq    $0, 4, $3   # E :
0106 
0107         cmoveq  $1, $3, $0  # E : Latency 2, extra map cycle
0108     nop         # E : keep with cmov
0109         and     $2, 0x33, $1    # E :
0110         addq    $0, 2, $3   # E : U L U L : 2 cycle stall on $0
0111 
0112         cmoveq  $1, $3, $0  # E : Latency 2, extra map cycle
0113     nop         # E : keep with cmov
0114         and     $2, 0x55, $1    # E :
0115         addq    $0, 1, $3   # E : U L U L : 2 cycle stall on $0
0116 
0117         cmoveq  $1, $3, $0  # E : Latency 2, extra map cycle
0118     nop
0119     nop
0120     ret         # L0 : L U L U
0121 #endif
0122 
0123     # Deal with the case where $18 > 8 bytes remain to be
0124     # searched.  $16 may not be aligned.
         /*
          * $1 still holds the aligned quadword loaded with ldq_u at entry.
          * $0 becomes $16 rounded down to a quadword boundary, and the
          * bytes that precede $16 are forced to ff after the xor so that
          * they cannot be reported as matches.
          */
0125     .align 4
0126 $first_quad:
0127     andnot  $16, 0x7, $0    # E :
0128         insqh   $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
0129         xor $1, $17, $1 # E :
0130     or  $1, $2, $1  # E : U L U L $1 = ====ffffffffffff
0131 
0132         cmpbge  $31, $1, $2 # E :
0133         bne     $2, $found_it   # U :
0134     # At least one byte left to process.
0135     ldq $1, 8($0)   # L :
0136     subq    $5, 1, $18  # E : U L U L
0137 
0138     addq    $0, 8, $0   # E :
0139     # Make $18 point to last quad to be accessed (the
0140     # last quad may or may not be partial).
0141     andnot  $18, 0x7, $18   # E :
0142     cmpult  $0, $18, $2 # E :
0143     beq $2, $final  # U : U L U L
0144 
0145     # At least two quads remain to be accessed.
0146 
0147     subq    $18, $0, $4 # E : $4 <- byte distance to last quad (8 * nr quads)
0148     and $4, 8, $4   # E : odd number of quads?
0149     bne $4, $odd_quad_count # U :
0150     # At least three quads remain to be accessed
0151     mov $1, $4      # E : L U L U : move prefetched value to correct reg
0152 
         /*
          * Main loop, unrolled twice.  Each half compares one already
          * loaded quadword ($4 in the first half, $1 in the second)
          * against $17 while the load of the following quadword is
          * issued, then advances $0 by 8.  The code above enters at
          * $odd_quad_count when bit 3 of the byte distance is set.  The
          * loop repeats while the next quadword address is still below
          * $18, the address of the last quadword; that last quadword is
          * left in $1 for $final.
          */
0153     .align  4
0154 $unrolled_loop:
0155     ldq $1, 8($0)   # L : prefetch $1
0156     xor $17, $4, $2 # E :
0157     cmpbge  $31, $2, $2 # E :
0158     bne $2, $found_it   # U : U L U L
0159 
0160     addq    $0, 8, $0   # E :
0161     nop         # E :
0162     nop         # E :
0163     nop         # E :
0164 
0165 $odd_quad_count:
0166     xor $17, $1, $2 # E :
0167     ldq $4, 8($0)   # L : prefetch $4
0168     cmpbge  $31, $2, $2 # E :
0169     addq    $0, 8, $6   # E :
0170 
0171     bne $2, $found_it   # U :
0172     cmpult  $6, $18, $6 # E :
0173     addq    $0, 8, $0   # E :
0174     nop         # E :
0175 
0176     bne $6, $unrolled_loop # U :
0177     mov $4, $1      # E : move prefetched value into $1
0178     nop         # E :
0179     nop         # E :
0180 
         /*
          * $final: $1 holds the quadword at $0, which is expected to be
          * the last one to access.  A nonzero remaining byte count is
          * handed to $last_quad; otherwise execution falls through to
          * $not_found.
          */
0181 $final: subq    $5, $0, $18 # E : $18 <- number of bytes left to do
0182     nop         # E :
0183     nop         # E :
0184     bne $18, $last_quad # U :
0185 
         /* No match: return 0 ($31 always reads as zero). */
0186 $not_found:
0187     mov $31, $0     # E :
0188     nop         # E :
0189     nop         # E :
0190     ret         # L0 :
0191 
0192         .end memchr
0193     EXPORT_SYMBOL(memchr)