0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042 #include <asm/regdef.h>
0043
0044 .set noat
0045 .set noreorder
0046
0047 .text
0048
0049
0050
0051
0052
0053
0054
0055 .ent stxncpy_aligned
0056 .align 4
0057 stxncpy_aligned:
0058 .frame sp, 0, t9, 0
0059 .prologue 0
0060 # stxncpy_aligned: word-at-a-time copy for the case where a0 (dst) and a1 (src) have the same offset within a quadword.
0061 # On entry: t0 = first destination word (its bytes below the offset are merged back in), t1 = first source word,
0062 # a1 = source pointer already advanced by 8 by __stxncpy (only its low 3 bits are used by the msk instructions here),
0063 # a2 = word counter and t10 = one-bit mask of the last counted byte, both as set up by __stxncpy; t9 = return address.
0064 # NOTE(review): the E/U/L/L0 tags in the trailing comments look like pipeline issue-slot annotations - confirm.
0065 # Build the first output word and look for a null byte in the first source word.
0066 lda t2, -1 # E : build a mask against false zero
0067 mskqh t2, a1, t2 # U : detection in the src word (stall)
0068 mskqh t1, a1, t3 # U : t3 = source bytes at or above the alignment offset
0069 ornot t1, t2, t2 # E : t2 = src word with the bytes below the offset forced to 0xff (stall)
0070
0071 mskql t0, a1, t0 # U : assemble the first output word
0072 cmpbge zero, t2, t8 # E : bits set iff null found
0073 or t0, t3, t0 # E : (stall)
0074 beq a2, $a_eoc # U : word counter is zero: the count ends inside this first word
0075
0076 bne t8, $a_eos # U : a null was found in the first word
0077 nop
0078 nop
0079 nop
0080
0081
0082 # Main aligned loop.
0083 # On entry to this basic block: t0 = an output word that contains no terminator, so it is stored whole.
0084 # Each pass stores t0, fetches the next source word into t0 and tests it for a null (t8) and for end of count (a2).
0085 # NOTE(review): the nops presumably pad each group to four instructions for fetch alignment - confirm.
0086
0087
0088
0089 $a_loop:
0090 stq_u t0, 0(a0) # L : store the completed word
0091 addq a0, 8, a0 # E : advance dst
0092 subq a2, 1, a2 # E : one fewer word left in the count
0093 nop
0094
0095 ldq_u t0, 0(a1) # L : next source word
0096 addq a1, 8, a1 # E : advance src
0097 cmpbge zero, t0, t8 # E : t8 = one bit per null byte in t0
0098 beq a2, $a_eoc # U : the count ends inside this word
0099
0100 beq t8, $a_loop # U : no null: keep copying
0101 nop
0102 nop
0103 nop
0104
0105
0106 # Final (possibly partial) word store.
0107 # On entry to this basic block: t0 = the source word that contains the terminator,
0108 # t8 = cmpbge mask of its null bytes, with the end-of-count bit (t10) OR-ed in when arriving via $a_eoc.
0109 # t12 is left holding the single bit of the last byte written.
0110
0111
0112 $a_eos:
0113 negq t8, t12 # E : find low bit set
0114 and t8, t12, t12 # E : (stall)
0115 # If the terminator is byte 7 (bit 0x80) the whole word is overwritten,
0116 # so the destination word does not need to be read.
0117 and t12, 0x80, t6 # E : (stall)
0118 bne t6, 1f # U : (stall)
0119
0120 # Partial word: keep the source bytes up to and including the terminator
0121 # and the original destination bytes above it.
0122 ldq_u t1, 0(a0) # L : original destination word
0123 subq t12, 1, t6 # E :
0124 or t12, t6, t8 # E : t8 = byte mask covering every byte up to the terminator (stall)
0125 zapnot t0, t8, t0 # U : clear src bytes > null (stall)
0126
0127 zap t1, t8, t1 # .. e1 : clear dst bytes <= null
0128 or t0, t1, t0 # e1 : (stall)
0129 nop
0130 nop
0131
0132 1: stq_u t0, 0(a0) # L : store the final word
0133 ret (t9) # L0 : Latency=3
0134 nop
0135 nop
0136
0137 # End of count: add the end-of-count bit to the null-detection mask, then finish as for a null.
0138 $a_eoc:
0139 or t10, t8, t8 # E :
0140 br $a_eos # L0 : Latency=3
0141 nop
0142 nop
0143
0144 .end stxncpy_aligned
0145
0146 .align 4
0147 .ent __stxncpy
0148 .globl __stxncpy
0149 __stxncpy:
0150 .frame sp, 0, t9, 0
0151 .prologue 0
0152 # __stxncpy: copy the null-terminated string at a1 (src) to a0 (dst) a quadword at a time, stopping after the null or after a2 (count) bytes, whichever comes first; returns through t9.
0153 # First question: are src and dst co-aligned (same offset within a quadword)? NOTE(review): a count of 0 looks unsupported - with an aligned dst the subq below wraps the biased count; confirm callers never pass 0.
0154 xor a0, a1, t1 # E : low 3 bits differ iff src and dst have different offsets
0155 and a0, 7, t0 # E : find dest misalignment
0156 and t1, 7, t1 # E : (stall)
0157 addq a2, t0, a2 # E : bias count by dest misalignment (stall)
0158
0159 subq a2, 1, a2 # E : a2 = index of the last counted byte, measured from the start of the first dest word
0160 and a2, 7, t2 # E : t2 = position of that byte within its word (stall)
0161 srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall)
0162 addq zero, 1, t10 # E :
0163
0164 sll t10, t2, t10 # U : t10 = bitmask of last count byte
0165 bne t1, $unaligned # U : offsets differ: take the shifting path
0166 # Co-aligned: fetch the first source word and, only when dst is not quadword-aligned, the first destination word.
0167 ldq_u t1, 0(a1) # L : load first src word
0168 addq a1, 8, a1 # E :
0169
0170 beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
0171 ldq_u t0, 0(a0) # L : first dest word, merged back in by stxncpy_aligned
0172 nop
0173 nop
0174
0175 br stxncpy_aligned # .. e1 :
0176 nop
0177 nop
0178 nop
0179
0180
0181
0182 # Everything below handles src and dst with different offsets: each output word
0183 # is assembled from two neighbouring source words with extql / extqh.
0184
0185
0186 .align 4
0187 $u_head:
0188 # Assemble the first full output word; a terminator inside it can still
0189 # prevent the whole word from being written.
0190 #
0191 # On entry to this basic block:
0192 # t0 = the first destination word, unmasked (0 when dst is quadword-aligned)
0193 # t1 = the low bits of the first source word, already shifted into place
0194 # t6 = byte mask that is 0xff in the destination bytes that must be preserved
0195 # (t0 and t6 are set up at $unaligned)
0196
0197 ldq_u t2, 8(a1) # L : Latency=3 load second src word
0198 addq a1, 8, a1 # E :
0199 mskql t0, a0, t0 # U : mask trailing garbage in dst
0200 extqh t2, a1, t4 # U : (3 cycle stall on t2)
0201
0202 or t1, t4, t1 # E : first aligned src word complete (stall)
0203 mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
0204 or t0, t1, t0 # E : first output word complete (stall)
0205 or t0, t6, t6 # E : mask original data for zero test (stall)
0206
0207 cmpbge zero, t6, t8 # E : t8 = null bytes among the source bytes of the first output word
0208 beq a2, $u_eocfin # U : the count ends inside the first output word
0209 lda t6, -1 # E :
0210 nop
0211
0212 bne t8, $u_final # U : null inside the first output word
0213 mskql t6, a1, t6 # U : mask out bits already seen
0214 stq_u t0, 0(a0) # L : store first output word
0215 or t6, t2, t2 # E : consumed bytes of t2 can no longer look like nulls (stall)
0216
0217 cmpbge zero, t2, t8 # E : find nulls in second partial
0218 addq a0, 8, a0 # E :
0219 subq a2, 1, a2 # E :
0220 bne t8, $u_late_head_exit # U : null in the unconsumed part of the second source word
0221
0222 # Set up the loop invariant: t1 = leftover high bytes of the current source word moved down,
0223 # t0 = low bytes of the next source word moved up, t2 = that next source word.
0224 extql t2, a1, t1 # U : position hi-bits of lo word
0225 beq a2, $u_eoc # U : the count ends inside the next output word
0226 ldq_u t2, 8(a1) # L : read next high-order source word
0227 addq a1, 8, a1 # E :
0228
0229 extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
0230 cmpbge zero, t2, t8 # E : t8 = null bytes in the new source word
0231 nop
0232 bne t8, $u_eos # U :
0233
0234
0235
0236 # Unaligned copy loop.
0237 #
0238 # On entry to this basic block:
0239 # t0 = the bytes of the next output word taken from the newest source word (t2)
0240 # t1 = the bytes of the next output word left over from the previous source word
0241 # t2 = the newest source word, known to contain no null
0242 #
0243 # Each pass completes and stores one output word, then fetches one more
0244 # source word, unless the word counter a2 reaches zero first ($u_eoc).
0245
0246
0247
0248 .align 4
0249 $u_loop:
0250 or t0, t1, t0 # E : current dst word now complete
0251 subq a2, 1, a2 # E : decrement word count
0252 extql t2, a1, t1 # U : extract low bits for next time
0253 addq a0, 8, a0 # E :
0254
0255 stq_u t0, -8(a0) # U : save the current word
0256 beq a2, $u_eoc # U : the count ends inside the next output word
0257 ldq_u t2, 8(a1) # U : Latency=3 load high word for next time
0258 addq a1, 8, a1 # E :
0259
0260 extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
0261 cmpbge zero, t2, t8 # E : test new word for eos
0262 nop
0263 beq t8, $u_loop # U : no null yet: keep going
0264
0265
0266 # A null was found somewhere in the newest source word (t2). It belongs either to the
0267 # output word being assembled now or to the one after it.
0268 #
0269 # On entry to this basic block:
0270 # t0 = bytes of the current output word taken from t2
0271 # t1 = bytes of the current output word left over from the previous source word
0272 # t2 = the source word that contains the null
0273
0274 $u_eos:
0275 or t0, t1, t0 # E : first (partial) source word complete
0276 nop
0277 cmpbge zero, t0, t8 # E : is the null in this first bit? (stall)
0278 bne t8, $u_final # U : (stall)
0279
0280 stq_u t0, 0(a0) # L : the null was in the high-order bits
0281 addq a0, 8, a0 # E :
0282 subq a2, 1, a2 # E :
0283 nop
0284
0285 $u_late_head_exit:
0286 extql t2, a1, t0 # U : t0 = remaining bytes of the source word, moved to the low end
0287 cmpbge zero, t0, t8 # E : t8 = null bytes in that final word
0288 or t8, t10, t6 # E : (stall)
0289 cmoveq a2, t6, t8 # E : Latency=2, extra map slot (stall)
0290
0291 # Common exit: store the final (possibly partial) word.
0292 # On entry: t0 = final word to store, t8 = cmpbge null mask, with the end-of-count
0293 # bit t10 already OR-ed in on the paths where the count ends in this word.
0294 # t12 is left holding the single bit of the last byte written.
0295 $u_final:
0296 negq t8, t6 # E : isolate low bit set
0297 and t6, t8, t12 # E : (stall)
0298 and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
0299 bne t6, 1f # U : (stall)
0300
0301 ldq_u t1, 0(a0) # L : original destination word
0302 subq t12, 1, t6 # E :
0303 or t6, t12, t8 # E : t8 = byte mask covering every byte up to the terminator (stall)
0304 zapnot t0, t8, t0 # U : kill source bytes > null
0305
0306 zap t1, t8, t1 # U : kill dest bytes <= null
0307 or t0, t1, t0 # E : (stall)
0308 nop
0309 nop
0310
0311 1: stq_u t0, 0(a0) # L : store the final word
0312 ret (t9) # L0 : Latency=3
0313
0314 # The count ends inside the next output word. t1 holds the bytes left over from the current
0315 # source word; when they already reach the last counted byte (t10 shifted by the source offset
0316 # still fits in 8 bits) the next source word is not loaded at all.
0317 $u_eoc:
0318 and a1, 7, t6 # E : avoid final load if possible
0319 sll t10, t6, t6 # U : (stall)
0320 and t6, 0xff, t6 # E : (stall)
0321 bne t6, 1f # U : (stall)
0322
0323 ldq_u t2, 8(a1) # L : load final src word
0324 nop
0325 extqh t2, a1, t0 # U : extract low bits for last word (stall)
0326 or t1, t0, t1 # E : (stall)
0327
0328 1: cmpbge zero, t1, t8 # E : t8 = null bytes in the final word
0329 mov t1, t0 # E : $u_final expects the word in t0
0330
0331 $u_eocfin: # end-of-count, final word
0332 or t10, t8, t8 # E : add the end-of-count bit to the null mask
0333 br $u_final # L0 : Latency=3
0334
0335 # Entry for src and dst with different offsets within a quadword.
0336 .align 4
0337 $unaligned:
0338
0339 ldq_u t1, 0(a1) # L : load first source word
0340 and a0, 7, t4 # E : find dest misalignment
0341 and a1, 7, t5 # E : find src misalignment
0342 # Conditionally load the first destination word, and build in t6 a byte mask that is
0343 # 0xff for each destination byte below the dst offset (bytes that must be preserved).
0344 mov zero, t0 # E : default when dst is aligned: no dest word
0345
0346 mov zero, t6 # E : default when dst is aligned: nothing to preserve
0347 beq t4, 1f # U : dst is quadword-aligned
0348 ldq_u t0, 0(a0) # L : first destination word
0349 lda t6, -1 # E :
0350
0351 mskql t6, a0, t6 # U : keep 0xff only in the bytes below the dst offset
0352 nop
0353 nop
0354 subq a1, t4, a1 # E : sub dest misalignment from src addr
0355
0356 # When the source misalignment (t5) is larger than the destination misalignment (t4), the first
0357 # source word is checked for a terminator before $u_head reads the word at 8(a1).
0358 # NOTE(review): presumably so a string ending in the first source word never causes a read of the next quadword - confirm.
0359 1: cmplt t4, t5, t12 # E : t12 = 1 iff dest misalignment < src misalignment
0360 extql t1, a1, t1 # U : shift src into place
0361 lda t2, -1 # E : for creating masks later
0362 beq t12, $u_head # U : (stall)
0363
0364 extql t2, a1, t2 # U : t2 = 0xff in the byte positions that t1 filled from the source word
0365 cmpbge zero, t1, t8 # E : is there a zero?
0366 andnot t2, t6, t2 # E : dest mask for a single word copy
0367 or t8, t10, t5 # E : test for end-of-count too
0368
0369 cmpbge zero, t2, t3 # E : t3 = byte positions outside the copy mask
0370 cmoveq a2, t5, t8 # E : Latency=2, extra map slot
0371 nop # E : keep with cmoveq
0372 andnot t8, t3, t8 # E : (stall)
0373
0374 beq t8, $u_head # U : no terminator among the valid bytes: continue with the general path
0375
0376 # A terminator (null, or end of count) lies within the first partial source word:
0377 # isolate the valid source bytes and merge them into the original destination word.
0378
0379 ldq_u t0, 0(a0) # L : original destination word
0380 negq t8, t6 # E : build bitmask of bytes <= zero
0381 mskqh t1, t4, t1 # U : clear the source bytes below the dst offset
0382
0383 and t6, t8, t12 # E : t12 = single bit of the last byte written
0384 subq t12, 1, t6 # E : (stall)
0385 or t6, t12, t8 # E : (stall)
0386 zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
0387
0388 zapnot t1, t8, t1 # U : to source validity mask
0389 andnot t0, t2, t0 # E : zero place for source to reside
0390 or t0, t1, t0 # E : and put it there (stall both t0, t1)
0391 stq_u t0, 0(a0) # L : (stall)
0392
0393 ret (t9) # L0 : Latency=3
0394 nop
0395 nop
0396 nop
0397
0398 .end __stxncpy