0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 # memchr -- locate the first occurrence of a byte in a memory area.
0019 #
0020 # Register use, as read from the code below:
0021 #   $16  start address of the area (need not be quad aligned)
0022 #   $17  byte to look for; only the low 8 bits are used
0023 #   $18  number of bytes to search
0024 #   $0   result: address of the first match, or 0 when none is found
0025 #   $1-$7 serve as temporaries, $17/$18 are overwritten, no stack is used.
0026 #
0027 # Memory is read one aligned quadword at a time (ldq_u / ldq) and all
0028 # eight bytes are tested at once with xor + cmpbge against zero.
0029 # NOTE(review): the E/U/L tags in trailing comments look like 21264 (EV6)
0030 # issue-slot annotations; the instruction order is hand tuned, keep it.
0031 #include <asm/export.h>
0032 .set noreorder
0033 .set noat
0034
0035 .align 4
0036 .globl memchr
0037 .ent memchr
0038 memchr:
0039 .frame $30,0,$26,0
0040 .prologue 0
0041
0042 # Hack -- if someone passes in (size_t)-1, hoping to just
0043 # search until the end of the address space, we will overflow
0044 # below when we find the address of the last byte. Given
0045 # that we will never have a 56-bit address space, cropping
0046 # the length is the easiest way to avoid trouble.
0047 zap $18, 0x80, $5 # U : Bound length: $5 = $18 with its top byte cleared
0048 beq $18, $not_found # U : zero length, nothing to search
0049 ldq_u $1, 0($16) # L : load first quadword Latency=3
0050 and $17, 0xff, $17 # E : L L U U : 00000000000000ch
0051
0052 insbl $17, 1, $2 # U : 000000000000ch00
0053 cmpult $18, 9, $4 # E : $4 = (length <= 8): short string (at most 1 quad)?
0054 or $2, $17, $17 # E : 000000000000chch
0055 lda $3, -1($31) # E : U L L U : $3 = all ones, source for the masks below
0056
0057 sll $17, 16, $2 # U : 00000000chch0000
0058 addq $16, $5, $5 # E : Max search address: $5 = one past the last byte
0059 or $2, $17, $17 # E : 00000000chchchch
0060 sll $17, 32, $2 # U : U L L U : chchchch00000000
0061
0062 or $2, $17, $17 # E : chchchchchchchch
0063 extql $1, $16, $7 # U : $7 = bytes of the first quad from $16 up, shifted down
0064 beq $4, $first_quad # U : more than 8 bytes, take the general path
0065 ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search; quad holding the last byte Latency=3
0066
0067 extqh $6, $16, $6 # U : 2 cycle stall for $6; bytes from the second quad, shifted up
0068 mov $16, $0 # E : $0 = address of byte 0 of the data assembled in $1
0069 nop # E :
0070 or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
0071
0072 # Deal with the case where at most 8 bytes remain to be searched
0073 # in $1. E.g.:
0074 # $18 = 6
0075 # $1 = ????c6c5c4c3c2c1
0076 $last_quad:
0077 negq $18, $6 # E : $6 = -$18, used as the shift count below
0078 xor $17, $1, $1 # E : bytes equal to the search byte become zero
0079 srl $3, $6, $6 # U : $6 = mask of $18 bits set
0080 cmpbge $31, $1, $2 # E : L U L U : bit i of $2 set when byte i of $1 is zero
0081
0082 nop
0083 nop
0084 and $2, $6, $2 # E : discard matches beyond the remaining length
0085 beq $2, $not_found # U : U L U L
0086
0087 $found_it:
0088 #ifdef CONFIG_ALPHA_EV67
0089 # Entered with $2 = nonzero match mask and $0 = address of byte 0.
0090 # cttz gives the index of the lowest set bit, which is the offset of
0091 # the first matching byte; a zero input cannot occur on this path.
0092
0093 cttz $2, $3 # U0 : $3 = offset of the first matching byte
0094 addq $0, $3, $0 # E : All done
0095 nop # E :
0096 ret # L0 : L U L U
0097 #else
0098 # Fallback without cttz: isolate the lowest set bit of $2, then
0099 # binary-search its position, adding 4, 2 and 1 to $0 whenever the
0100 # bit is absent from the masks 0x0f, 0x33 and 0x55 respectively.
0101
0102 negq $2, $3 # E : $3 = -$2
0103 and $2, $3, $2 # E : $2 = lowest set bit only
0104 and $2, 0x0f, $1 # E : nonzero when the match is in bytes 0-3
0105 addq $0, 4, $3 # E : candidate $0 + 4
0106
0107 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; match in bytes 4-7
0108 nop # E : keep with cmov
0109 and $2, 0x33, $1 # E : nonzero when the match is in bytes 0,1,4,5
0110 addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
0111
0112 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; match in bytes 2,3,6,7
0113 nop # E : keep with cmov
0114 and $2, 0x55, $1 # E : nonzero when the match is in an even byte
0115 addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
0116
0117 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; match in an odd byte
0118 nop
0119 nop
0120 ret # L0 : L U L U
0121 #endif
0122
0123 # Deal with the case where $18 > 8 bytes remain to be
0124 # searched. $16 may not be aligned.
0125 .align 4
0126 $first_quad:
0127 andnot $16, 0x7, $0 # E : $0 = $16 rounded down to a quad boundary
0128 insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
0129 xor $1, $17, $1 # E : matching bytes become zero
0130 or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff (bytes before $16 cannot match)
0131
0132 cmpbge $31, $1, $2 # E : $2 = mask of matching bytes in the first quad
0133 bne $2, $found_it # U :
0134 # At least one byte left to process.
0135 ldq $1, 8($0) # L : fetch the second quad
0136 subq $5, 1, $18 # E : U L U L : $18 = address of the last byte
0137
0138 addq $0, 8, $0 # E : $0 = address of the quad now held in $1
0139 # Make $18 point to last quad to be accessed (the
0140 # last quad may or may not be partial).
0141 andnot $18, 0x7, $18 # E :
0142 cmpult $0, $18, $2 # E : any quad before the last one left?
0143 beq $2, $final # U : U L U L
0144
0145 # At least two quads remain to be accessed.
0146
0147 subq $18, $0, $4 # E : $4 <- byte distance from $0 to the last quad
0148 and $4, 8, $4 # E : odd number of quads?
0149 bne $4, $odd_quad_count # U :
0150 # At least three quads remain to be accessed
0151 mov $1, $4 # E : L U L U : move prefetched value to correct reg
0152
0153 .align 4
0154 $unrolled_loop:
0155 ldq $1, 8($0) # L : prefetch $1
0156 xor $17, $4, $2 # E : test the quad in $4 (located at $0)
0157 cmpbge $31, $2, $2 # E :
0158 bne $2, $found_it # U : U L U L
0159
0160 addq $0, 8, $0 # E : advance to the quad held in $1
0161 nop # E :
0162 nop # E :
0163 nop # E :
0164
0165 $odd_quad_count:
0166 xor $17, $1, $2 # E : test the quad in $1 (located at $0)
0167 ldq $4, 8($0) # L : prefetch $4
0168 cmpbge $31, $2, $2 # E :
0169 addq $0, 8, $6 # E : $6 = address of the prefetched quad
0170
0171 bne $2, $found_it # U :
0172 cmpult $6, $18, $6 # E : prefetched quad still before the last quad?
0173 addq $0, 8, $0 # E : advance to the quad held in $4
0174 nop # E :
0175
0176 bne $6, $unrolled_loop # U :
0177 mov $4, $1 # E : move prefetched value into $1
0178 nop # E :
0179 nop # E :
0180
0181 $final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
0182 nop # E :
0183 nop # E :
0184 bne $18, $last_quad # U : search the final quad, held in $1
0185
0186 $not_found:
0187 mov $31, $0 # E : result = 0, byte not present
0188 nop # E :
0189 nop # E :
0190 ret # L0 :
0191
0192 .end memchr
0193 EXPORT_SYMBOL(memchr)