0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034 #include <asm/regdef.h>
0035
0036 .set noat
0037 .set noreorder
0038
0039 .text
0040
0041
0042
0043
0044
0045
0046
0047 .ent stxcpy_aligned
0048 .align 4
0049 stxcpy_aligned:
0050 .frame sp, 0, t9
0051 .prologue 0
0052
0053 # Co-aligned copy: src and dst share the same low three address bits.
0054 # On entry to this basic block (both set up by __stxcpy):
0055 # t0 == the first destination word (only loaded when dst is misaligned)
0056 # t1 == the first source word; a1 has already been advanced past it
0057 # Build the first output word and look for a null in the first source word.
0058 lda t2, -1 # E : build a mask against false zero
0059 mskqh t2, a1, t2 # U : detection in the src word (stall)
0060 mskqh t1, a1, t3 # U : t3 = src word without the bytes before the string start
0061 ornot t1, t2, t2 # E : bytes before the string start forced nonzero (stall)
0062
0063 mskql t0, a1, t0 # U : assemble the first output word
0064 cmpbge zero, t2, t8 # E : bits set iff null found
0065 or t0, t3, t1 # E : kept dst bytes merged with src bytes (stall)
0066 bne t8, $a_eos # U : (stall)
0067
0068 # Main co-aligned loop: store the pending word, then fetch and test the next.
0069 # On entry to this basic block:
0070 # t1 == an output word that contains no null
0071 # a0 == address for that word, a1 == address of the next source word
0072 # NOTE(review): the two nops look like scheduling padding between the store and the load.
0073 $a_loop:
0074 stq_u t1, 0(a0) # L :
0075 addq a0, 8, a0 # E :
0076 nop
0077 nop
0078
0079 ldq_u t1, 0(a1) # L : Latency=3
0080 addq a1, 8, a1 # E :
0081 cmpbge zero, t1, t8 # E : (3 cycle stall)
0082 beq t8, $a_loop # U : (stall for t8)
0083
0084 # Final (possibly partial) word store.
0085 # On entry to this basic block:
0086 # t1 == the word containing the null
0087 # t8 == the cmpbge mask that found it, one bit per zero byte
0088 $a_eos:
0089 negq t8, t6 # E : find low bit set
0090 and t8, t6, t12 # E : t12 = t8 & -t8, the first null byte (stall)
0091
0092 # Null in byte 7 (mask bit 0x80): the whole word gets stored, so skip the dst load.
0093 and t12, 0x80, t6 # E : (stall)
0094 bne t6, 1f # U : (stall)
0095
0096 # Partial store: src bytes below the null, original dst bytes above it,
0097 # and the null byte itself comes out as zero.
0098 ldq_u t0, 0(a0) # L : Latency=3
0099 subq t12, 1, t6 # E : t6 = mask bits below the null
0100 zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
0101 or t12, t6, t8 # E : t8 = mask bits up to and including the null (stall)
0102
0103 zap t0, t8, t0 # E : clear dst bytes <= null
0104 or t0, t1, t1 # E : (stall)
0105 nop
0106 nop
0107
0108 1: stq_u t1, 0(a0) # L :
0109 ret (t9) # L0 : Latency=3
0110 nop
0111 nop
0112
0113 .end stxcpy_aligned
0114 # __stxcpy: copy the null-terminated string at a1 to a0, terminator included.
0115 .align 4
0116 .ent __stxcpy
0117 .globl __stxcpy
0118 __stxcpy:
0119 .frame sp, 0, t9
0120 .prologue 0
0121 # In: a0 = dst, a1 = src, t9 = return address (every exit is ret (t9)).
0122 # Out: t12 = one-bit byte mask locating the null in the last word stored; a0 = address of that store.
0123 xor a0, a1, t0 # E : compare the low address bits of dst and src
0124 unop # E :
0125 and t0, 7, t0 # E : (stall)
0126 bne t0, $unaligned # U : offsets differ (stall)
0127
0128 # Co-aligned: load the first source word and hand off to stxcpy_aligned.
0129 ldq_u t1, 0(a1) # L : load first src word
0130 and a0, 7, t0 # E : take care not to load a word ...
0131 addq a1, 8, a1 # E :
0132 beq t0, stxcpy_aligned # U : ... if we wont need it (stall)
0133
0134 ldq_u t0, 0(a0) # L : dst is misaligned, fetch the first dst word for merging
0135 br stxcpy_aligned # L0 : Latency=3
0136 nop
0137 nop
0138
0139
0140 # Source and destination are not co-aligned. Output words are aligned to
0141 # the destination and each one is assembled from two source words with
0142 # extql/extqh. Entered from $unaligned below.
0143
0144 .align 4
0145 $u_head:
0146 # Assemble the first full output word. A null may still turn up in it.
0147
0148 # On entry to this basic block:
0149 # t0 == the first dest word when dst is misaligned, otherwise 0
0150 # t1 == the first source word, as loaded (not yet shifted)
0151 # t6 == byte mask with 0xff in the dest bytes below the dst offset
0152 #       (the bytes that must be kept), or 0 when dst is aligned
0153 # a1 == src minus the dst misalignment
0154
0155 ldq_u t2, 8(a1) # L :
0156 addq a1, 8, a1 # E :
0157 extql t1, a1, t1 # U : (stall on a1)
0158 extqh t2, a1, t4 # U : (stall on a1)
0159
0160 mskql t0, a0, t0 # U : keep only the dest bytes below the dst offset
0161 or t1, t4, t1 # E : source bytes shifted into dest position
0162 mskqh t1, a0, t1 # U : (stall on t1)
0163 or t0, t1, t1 # E : first output word (stall on t1)
0164
0165 or t1, t6, t6 # E : kept dest bytes forced nonzero for the null test
0166 cmpbge zero, t6, t8 # E : (stall)
0167 lda t6, -1 # E : for masking just below
0168 bne t8, $u_final # U : null is inside the first output word (stall)
0169
0170 mskql t6, a1, t6 # U : mask out the bits we have
0171 or t6, t2, t2 # E : already extracted before (stall)
0172 cmpbge zero, t2, t8 # E : testing eos (stall)
0173 bne t8, $u_late_head_exit # U : (stall)
0174
0175 # No null in the first output word or in the rest of the second source
0176 # word: store the first word and set up for the main loop.
0177
0178 stq_u t1, 0(a0) # L : store first output word
0179 addq a0, 8, a0 # E :
0180 extql t2, a1, t0 # U : position ho-bits of lo word
0181 ldq_u t2, 8(a1) # U : read next high-order source word
0182
0183 addq a1, 8, a1 # E :
0184 cmpbge zero, t2, t8 # E : (stall for t2)
0185 nop # E :
0186 bne t8, $u_eos # U : (stall)
0187
0188 # Unaligned copy main loop. The null test is done on each whole source
0189 # word as loaded by ldq_u, before any further source word is fetched.
0190 # That splits the work on a word between this loop, the head above
0191 # and the tail below.
0192
0193 # On entry to this basic block:
0194 # t0 == leftover bytes of the previous source word, already shifted down
0195 # t2 == the current source word, unshifted
0196 # a0 == address of the output word about to be completed
0197
0198 # t2 is known not to contain a null terminator.
0199
0200
0201 .align 3
0202 $u_loop:
0203 extqh t2, a1, t1 # U : extract high bits for current word
0204 addq a1, 8, a1 # E : (stall)
0205 extql t2, a1, t3 # U : extract low bits for next time (stall)
0206 addq a0, 8, a0 # E :
0207
0208 or t0, t1, t1 # E : current dst word now complete
0209 ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
0210 stq_u t1, -8(a0) # L : save the current word (stall)
0211 mov t3, t0 # E :
0212
0213 cmpbge zero, t2, t8 # E : test new word for eos
0214 beq t8, $u_loop # U : (stall)
0215 nop
0216 nop
0217
0218 # A null was found in the source word just loaded (t2). Either it falls in
0219 # the output word completed from t0 plus part of t2, leaving one (probably
0220 # partial) word to write, or it lies in the remaining bytes of t2, leaving
0221 # one full word and then one partial word to write.
0222
0223 # On entry to this basic block:
0224 # t0 == leftover bytes of the previous source word, already shifted down
0225 # t2 == the current source word, unshifted
0226 $u_eos:
0227 extqh t2, a1, t1 # U :
0228 or t0, t1, t1 # E : first (partial) source word complete (stall)
0229 cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
0230 bne t8, $u_final # U : (stall)
0231
0232 $u_late_head_exit:
0233 stq_u t1, 0(a0) # L : the null was in the high-order bits
0234 addq a0, 8, a0 # E :
0235 extql t2, a1, t1 # U : remaining bytes of t2 form the last word
0236 cmpbge zero, t1, t8 # E : (stall)
0237
0238 # Final (probably partial) output word.
0239 # On entry to this basic block:
0240 # t1 == assembled source word
0241 # t8 == cmpbge mask that found the null
0242 $u_final:
0243 negq t8, t6 # E : isolate low bit set
0244 and t6, t8, t12 # E : t12 = t8 & -t8, the first null byte (stall)
0245 and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
0246 bne t6, 1f # U : null in byte 7, store the whole word (stall)
0247
0248 ldq_u t0, 0(a0) # E :
0249 subq t12, 1, t6 # E : t6 = mask bits below the null
0250 or t6, t12, t8 # E : t8 = mask bits up to and including the null (stall)
0251 zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
0252
0253 zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
0254 or t0, t1, t1 # E : (stall)
0255 nop
0256 nop
0257
0258 1: stq_u t1, 0(a0) # L :
0259 ret (t9) # L0 : Latency=3
0260 nop
0261 nop
0262
0263 # Unaligned copy entry point, reached from __stxcpy when the src and dst offsets differ.
0264 .align 4
0265 $unaligned:
0266
0267 ldq_u t1, 0(a1) # L : load first source word
0268 and a0, 7, t4 # E : find dest misalignment
0269 and a1, 7, t5 # E : find src misalignment
0270 # Conditionally load the first destination word and build a byte mask with
0271 # 0xff in the destination bytes that must be preserved.
0272 mov zero, t0 # E :
0273
0274 mov zero, t6 # E :
0275 beq t4, 1f # U : dst aligned, nothing to preserve
0276 ldq_u t0, 0(a0) # L :
0277 lda t6, -1 # E :
0278
0279 mskql t6, a0, t6 # U :
0280 nop
0281 nop
0282 nop
0283 1:
0284 subq a1, t4, a1 # E : sub dest misalignment from src addr
0285 # When the source misalignment exceeds the dest misalignment, test the first
0286 # source word for a null before $u_head goes on to read the word at 8(a1).
0287 cmplt t4, t5, t12 # E :
0288 beq t12, $u_head # U :
0289 lda t2, -1 # E : mask out leading garbage in source
0290
0291 mskqh t2, t5, t2 # U :
0292 ornot t1, t2, t3 # E : (stall)
0293 cmpbge zero, t3, t8 # E : is there a zero? (stall)
0294 beq t8, $u_head # U : (stall)
0295
0296 # A null was found in the first (partial) source word. Isolate the valid
0297 # source bytes, shift them down to the destination offset and merge them
0298 # into the original destination word. Fewer than 8 bytes are copied on
0299 # this path, so some original dest bytes always survive, hence the
0300 # unconditional ldq_u of the dest word.
0301 ldq_u t0, 0(a0) # L :
0302 negq t8, t6 # E : build bitmask of bytes <= zero
0303 and t6, t8, t12 # E : (stall)
0304 and a1, 7, t5 # E : t5 = src offset minus dst offset
0305
0306 subq t12, 1, t6 # E :
0307 or t6, t12, t8 # E : (stall)
0308 srl t12, t5, t12 # U : adjust final null return value
0309 zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
0310
0311 and t1, t2, t1 # E : to source validity mask
0312 extql t2, a1, t2 # U :
0313 extql t1, a1, t1 # U : (stall)
0314 andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
0315
0316 or t0, t1, t1 # e1 : and put it there
0317 stq_u t1, 0(a0) # .. e0 : (stall)
0318 ret (t9) # e1 :
0319 nop
0320
0321 .end __stxcpy
0322