0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030 #include <asm/export.h>
0031 .set noat # assembler macros may not use the temporary register $at ($28) - see the GAS Alpha .set documentation
0032 .set noreorder # ask the assembler to keep instructions in the order written
0033 .text
0034 .globl memset # alias of ___memset, assigned at the end of this file
0035 .globl __memset # alias of ___memset, assigned at the end of this file
0036 .globl ___memset # byte fill
0037 .globl __memset16 # 16-bit pattern fill
0038 .globl __constant_c_memset # fill with a 64-bit pattern taken from $17 unchanged
0039
0040 .ent ___memset
0041 .align 5 # 2^5 = 32-byte alignment of the entry point
0042 ___memset:
0043 .frame $30,0,$26,0 # zero-size frame on $30, return address in $26
0044 .prologue 0
0045 # ___memset: fill $18 bytes starting at $16 with the low byte of $17.
0046 # In: $16 = destination, $17 = fill value (bits 7:0 used), $18 = byte count (signed; <= 0 writes nothing).
0047 # Out: $0 = original $16; returns via $26. Overwrites $1-$7 and may overwrite $16-$18.
0048 # Flow: replicate the byte into all eight lanes of $17, merge a partial leading quadword,
0049 # store whole quadwords (64-byte unrolled wh64 loop once 16 or more quads remain, else loop_b),
0050 # then merge a partial trailing quadword. A write confined to one quadword uses within_quad_b.
0051 # The tag after each comment marker (E, U, L, L0, L1) is a per-instruction scheduling note; presumably
0052 # EV6 issue-slot notation. NOTE(review): confirm against the 21264 Compiler Writers Guide.
0053 and $17,255,$1 # E : 00000000000000ch
0054 insbl $17,1,$2 # U : 000000000000ch00
0055 bis $16,$16,$0 # E : return value
0056 ble $18,end_b # U : count <= 0: nothing to store
0057
0058 addq $18,$16,$6 # E : $6 = end address, one past the last byte to write
0059 bis $1,$2,$17 # E : 000000000000chch
0060 insbl $1,2,$3 # U : 0000000000ch0000
0061 insbl $1,3,$4 # U : 00000000ch000000
0062
0063 or $3,$4,$3 # E : 00000000chch0000
0064 inswl $17,4,$5 # U : 0000chch00000000
0065 xor $16,$6,$1 # E : will complete write be within one quadword?
0066 inswl $17,6,$2 # U : chch000000000000
0067
0068 or $17,$3,$17 # E : 00000000chchchch
0069 or $2,$5,$2 # E : chchchch00000000
0070 bic $1,7,$1 # E : zero iff start and end address share one aligned quadword
0071 and $16,7,$3 # E : Target addr misalignment
0072
0073 or $17,$2,$17 # E : chchchchchchchch
0074 beq $1,within_quad_b # U : whole write sits inside one quadword
0075 nop # E :
0076 beq $3,aligned_b # U : target is 0mod8
0077
0078 # Misaligned head: the write starts inside a quadword and runs past its end.
0079 # Merge the fill pattern into bytes ($16 & 7)..7 of that quadword, then advance
0080 # $16 to the next 8-byte boundary and reduce $18 by the bytes just covered.
0081 ldq_u $4,0($16) # L : Fetch first partial
0082 bis $16,$16,$5 # E : Save the address
0083 insql $17,$16,$2 # U : Insert new bytes
0084 subq $3,8,$3 # E : $3 = misalignment - 8, i.e. minus the number of head bytes
0085
0086 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
0087 mskql $4,$16,$4 # U : keep only the old bytes below the start offset
0088 subq $16,$3,$16 # E : $16 is new aligned destination
0089 bis $2,$4,$1 # E : Final bytes
0090
0091 nop
0092 stq_u $1,0($5) # L : Store result
0093 nop
0094 nop
0095
0096 .align 4
0097 aligned_b:
0098 # Reached with $16 8-byte aligned (either it was on entry or the head code above advanced it).
0099 # Split the remaining count into whole quadwords ($3) and trailing bytes ($18).
0100
0101
0102
0103 sra $18,3,$3 # U : Number of remaining quads to write
0104 and $18,7,$18 # E : Number of trailing bytes to write
0105 bis $16,$16,$5 # E : Save dest address
0106 beq $3,no_quad_b # U : tail stuff only
0107
0108 # Register state at this point:
0109 # $16 current destination (8-byte aligned)
0110 # $5 copy of $16, used below as the running store pointer
0111 # $6 end address (one past the last byte to write)
0112 # $18 number of trailing bytes (0..7)
0113 # $3 number of whole quadwords to store (nonzero)
0114 # $17 fill pattern replicated to 64 bits
0115 # With 16 or more quads the unrolled wh64 loop is used; otherwise branch to loop_b.
0116 # $1 = ($16 & 0x3f) - 0x40 is a negative byte counter that reaches zero when the
0117 # store pointer arrives at the next 64-byte boundary.
0118
0119 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
0120 subq $3, 16, $4 # E : Only try to unroll if >= 16 quads (128 bytes)
0121 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
0122 blt $4, loop_b # U : fewer than 16 quads: simple loop
0123
0124 # At least 16 quads remain, so after storing at most 8 single quads to reach a
0125 # 64-byte boundary there are still at least 8 left for one trip of the unrolled loop.
0126 # NOTE(review): $16 is 8-byte aligned here, so $1 lies in -64..-8 and the beq below
0127 # looks never taken; a destination that is already 0mod64 appears to run
0128 # $alignmod64_b eight times. Harmless for correctness - confirm before changing.
0129
0130 nop # E :
0131 nop # E :
0132 nop # E :
0133 beq $1, $bigalign_b # U :
0134
0135 $alignmod64_b:
0136 stq $17, 0($5) # L : store one quad of pattern
0137 subq $3, 1, $3 # E : For consistency later
0138 addq $1, 8, $1 # E : Increment towards zero for alignment
0139 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
0140
0141 nop
0142 nop
0143 addq $5, 8, $5 # E : Inc address
0144 blt $1, $alignmod64_b # U : repeat until the 64-byte boundary is reached
0145
0146 $bigalign_b:
0147 # Unrolled loop: one trip stores 8 quadwords (64 bytes).
0148 # $3 quadwords left to store (at least 8 on entry)
0149 # $5 store pointer, 64-byte aligned
0150 # $4 address passed to wh64 at the top of the trip
0151 # $17 fill pattern
0152 # $2, $7 scratch
0153 # Each trip precomputes the next wh64 address: $5 + 128 when at least 24 quads
0154 # remain at the top of the trip, otherwise $5 + 64 (the block the next trip stores).
0155 # The loop repeats while at least 16 quads remained at the top of the trip, i.e.
0156 # while 8 or more are left after it.
0157 # wh64 is a write hint for an aligned 64-byte block (see the Alpha Architecture Handbook).
0158 # NOTE(review): presumably safe only because every hinted block is then fully overwritten - confirm.
0159
0160 $do_wh64_b:
0161 wh64 ($4) # L1 : memory subsystem write hint
0162 subq $3, 24, $2 # E : For determining future wh64 addresses
0163 stq $17, 0($5) # L :
0164 nop # E :
0165
0166 addq $5, 128, $4 # E : speculative target of next wh64
0167 stq $17, 8($5) # L :
0168 stq $17, 16($5) # L :
0169 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
0170
0171 stq $17, 24($5) # L :
0172 stq $17, 32($5) # L :
0173 cmovlt $2, $7, $4 # E : fewer than 24 quads left: use the fallback address (Latency 2, extra mapping cycle)
0174 nop
0175
0176 stq $17, 40($5) # L :
0177 stq $17, 48($5) # L :
0178 subq $3, 16, $2 # E : Repeat the loop at least once more?
0179 nop
0180
0181 stq $17, 56($5) # L :
0182 addq $5, 64, $5 # E : advance the store pointer by one trip
0183 subq $3, 8, $3 # E : eight quads done
0184 bge $2, $do_wh64_b # U :
0185
0186 nop
0187 nop
0188 nop
0189 beq $3, no_quad_b # U : Might have finished already
0190
0191 .align 4
0192
0193 # loop_b: plain one-quadword-per-iteration loop. Used for fewer than 16 quads and
0194 # for the 1..7 quads left over after the unrolled loop. $3 is nonzero on entry.
0195
0196 loop_b:
0197 stq $17,0($5) # L :
0198 subq $3,1,$3 # E : Decrement number quads left
0199 addq $5,8,$5 # E : Inc address
0200 bne $3,loop_b # U : more?
0201
0202 no_quad_b:
0203 # Tail: $18 = number of bytes still to write (0..7) in the quadword at $5.
0204 # Keep the old bytes at and above offset ($6 & 7) and fill the ones below it.
0205
0206 nop # E :
0207 beq $18,end_b # U : All done?
0208 ldq $7,0($5) # L : fetch the final quadword
0209 mskqh $7,$6,$2 # U : Mask final quad
0210
0211 insqh $17,$6,$4 # U : New bits
0212 bis $2,$4,$1 # E : Put it all together
0213 stq $1,0($5) # L : And back to memory
0214 ret $31,($26),1 # L0 :
0215 # Whole write lies inside one quadword: replace only bytes ($16 & 7) up to but not including ($6 & 7).
0216 within_quad_b:
0217 ldq_u $1,0($16) # L : fetch the quadword containing the write
0218 insql $17,$16,$2 # U : New bits
0219 mskql $1,$16,$4 # U : Clear old
0220 bis $2,$4,$2 # E : New result
0221
0222 mskql $2,$6,$4 # U : keep the merged bytes below the end offset
0223 mskqh $1,$6,$2 # U : keep the old bytes at and above the end offset
0224 bis $2,$4,$1 # E :
0225 stq_u $1,0($16) # L :
0226 # Common exit; also the target when the count is <= 0.
0227 end_b:
0228 nop
0229 nop
0230 nop
0231 ret $31,($26),1 # L0 :
0232 .end ___memset
0233 EXPORT_SYMBOL(___memset)
0234
0235 # __constant_c_memset: fill $18 bytes at $16 using the 64-bit pattern in $17 unchanged.
0236 # In: $16 = destination, $17 = 64-bit fill pattern, $18 = byte count (signed; <= 0 writes nothing).
0237 # NOTE(review): presumably callers pass the fill byte already replicated into all 8 bytes - confirm.
0238 # Out: $0 = original $16; returns via $26. Overwrites $1-$7 and may overwrite $16 and $18.
0239 # Head, quadword loops and tail mirror ___memset; only the byte replication is absent.
0240 .align 4
0241 .ent __constant_c_memset
0242 __constant_c_memset:
0243 .frame $30,0,$26,0
0244 .prologue 0
0245
0246 addq $18,$16,$6 # E : $6 = end address, one past the last byte to write
0247 bis $16,$16,$0 # E : return value
0248 xor $16,$6,$1 # E : will complete write be within one quadword?
0249 ble $18,end # U : count <= 0: nothing to store
0250
0251 bic $1,7,$1 # E : zero iff start and end address share one aligned quadword
0252 beq $1,within_one_quad # U :
0253 and $16,7,$3 # E : Target addr misalignment
0254 beq $3,aligned # U : target is 0mod8
0255
0256 # Misaligned head: the write starts inside a quadword and runs past its end.
0257 # Merge the pattern into bytes ($16 & 7)..7 of that quadword, then advance $16
0258 # to the next 8-byte boundary and reduce $18 by the bytes just covered.
0259 ldq_u $4,0($16) # L : Fetch first partial
0260 bis $16,$16,$5 # E : Save the address
0261 insql $17,$16,$2 # U : Insert new bytes
0262 subq $3,8,$3 # E : $3 = misalignment - 8, i.e. minus the number of head bytes
0263
0264 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
0265 mskql $4,$16,$4 # U : keep only the old bytes below the start offset
0266 subq $16,$3,$16 # E : $16 is new aligned destination
0267 bis $2,$4,$1 # E : Final bytes
0268
0269 nop
0270 stq_u $1,0($5) # L : Store result
0271 nop
0272 nop
0273
0274 .align 4
0275 aligned:
0276 # Reached with $16 8-byte aligned (either it was on entry or the head code above advanced it).
0277 # Split the remaining count into whole quadwords ($3) and trailing bytes ($18).
0278
0279
0280
0281 sra $18,3,$3 # U : Number of remaining quads to write
0282 and $18,7,$18 # E : Number of trailing bytes to write
0283 bis $16,$16,$5 # E : Save dest address
0284 beq $3,no_quad # U : tail stuff only
0285
0286 # Register state at this point:
0287 # $16 current destination (8-byte aligned)
0288 # $5 copy of $16, used below as the running store pointer
0289 # $6 end address (one past the last byte to write)
0290 # $18 number of trailing bytes (0..7)
0291 # $3 number of whole quadwords to store (nonzero)
0292 # $17 64-bit fill pattern
0293 # With 16 or more quads the unrolled wh64 loop is used; otherwise branch to loop.
0294 # $1 = ($16 & 0x3f) - 0x40 is a negative byte counter that reaches zero when the
0295 # store pointer arrives at the next 64-byte boundary.
0296
0297 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
0298 subq $3, 16, $4 # E : Only try to unroll if >= 16 quads (128 bytes)
0299 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
0300 blt $4, loop # U : fewer than 16 quads: simple loop
0301
0302 # At least 16 quads remain, so after storing at most 8 single quads to reach a
0303 # 64-byte boundary there are still at least 8 left for one trip of the unrolled loop.
0304 # NOTE(review): $16 is 8-byte aligned here, so $1 lies in -64..-8 and the beq below
0305 # looks never taken; a destination that is already 0mod64 appears to run
0306 # $alignmod64 eight times. Harmless for correctness - confirm before changing.
0307
0308 nop # E :
0309 nop # E :
0310 nop # E :
0311 beq $1, $bigalign # U :
0312
0313 $alignmod64:
0314 stq $17, 0($5) # L : store one quad of pattern
0315 subq $3, 1, $3 # E : For consistency later
0316 addq $1, 8, $1 # E : Increment towards zero for alignment
0317 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
0318
0319 nop
0320 nop
0321 addq $5, 8, $5 # E : Inc address
0322 blt $1, $alignmod64 # U : repeat until the 64-byte boundary is reached
0323
0324 $bigalign:
0325 # Unrolled loop: one trip stores 8 quadwords (64 bytes).
0326 # $3 quadwords left to store (at least 8 on entry)
0327 # $5 store pointer, 64-byte aligned
0328 # $4 address passed to wh64 at the top of the trip
0329 # $17 fill pattern
0330 # $2, $7 scratch
0331 # Each trip precomputes the next wh64 address: $5 + 128 when at least 24 quads
0332 # remain at the top of the trip, otherwise $5 + 64 (the block the next trip stores).
0333 # The loop repeats while at least 16 quads remained at the top of the trip, i.e.
0334 # while 8 or more are left after it.
0335 # wh64 is a write hint for an aligned 64-byte block (see the Alpha Architecture Handbook).
0336 # NOTE(review): presumably safe only because every hinted block is then fully overwritten - confirm.
0337
0338 $do_wh64:
0339 wh64 ($4) # L1 : memory subsystem write hint
0340 subq $3, 24, $2 # E : For determining future wh64 addresses
0341 stq $17, 0($5) # L :
0342 nop # E :
0343
0344 addq $5, 128, $4 # E : speculative target of next wh64
0345 stq $17, 8($5) # L :
0346 stq $17, 16($5) # L :
0347 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
0348
0349 stq $17, 24($5) # L :
0350 stq $17, 32($5) # L :
0351 cmovlt $2, $7, $4 # E : fewer than 24 quads left: use the fallback address (Latency 2, extra mapping cycle)
0352 nop
0353
0354 stq $17, 40($5) # L :
0355 stq $17, 48($5) # L :
0356 subq $3, 16, $2 # E : Repeat the loop at least once more?
0357 nop
0358
0359 stq $17, 56($5) # L :
0360 addq $5, 64, $5 # E : advance the store pointer by one trip
0361 subq $3, 8, $3 # E : eight quads done
0362 bge $2, $do_wh64 # U :
0363
0364 nop
0365 nop
0366 nop
0367 beq $3, no_quad # U : Might have finished already
0368
0369 .align 4
0370
0371 # loop: plain one-quadword-per-iteration loop. Used for fewer than 16 quads and
0372 # for the 1..7 quads left over after the unrolled loop. $3 is nonzero on entry.
0373
0374 loop:
0375 stq $17,0($5) # L :
0376 subq $3,1,$3 # E : Decrement number quads left
0377 addq $5,8,$5 # E : Inc address
0378 bne $3,loop # U : more?
0379
0380 no_quad:
0381 # Tail: $18 = number of bytes still to write (0..7) in the quadword at $5.
0382 # Keep the old bytes at and above offset ($6 & 7) and fill the ones below it.
0383
0384 nop # E :
0385 beq $18,end # U : All done?
0386 ldq $7,0($5) # L : fetch the final quadword
0387 mskqh $7,$6,$2 # U : Mask final quad
0388
0389 insqh $17,$6,$4 # U : New bits
0390 bis $2,$4,$1 # E : Put it all together
0391 stq $1,0($5) # L : And back to memory
0392 ret $31,($26),1 # L0 :
0393 # Whole write lies inside one quadword: replace only bytes ($16 & 7) up to but not including ($6 & 7).
0394 within_one_quad:
0395 ldq_u $1,0($16) # L : fetch the quadword containing the write
0396 insql $17,$16,$2 # U : New bits
0397 mskql $1,$16,$4 # U : Clear old
0398 bis $2,$4,$2 # E : New result
0399
0400 mskql $2,$6,$4 # U : keep the merged bytes below the end offset
0401 mskqh $1,$6,$2 # U : keep the old bytes at and above the end offset
0402 bis $2,$4,$1 # E :
0403 stq_u $1,0($16) # L :
0404 # Common exit; also the target when the count is <= 0.
0405 end:
0406 nop
0407 nop
0408 nop
0409 ret $31,($26),1 # L0 :
0410 .end __constant_c_memset
0411 EXPORT_SYMBOL(__constant_c_memset)
0412
0413 # __memset16: fill $18 bytes at $16 with the low 16 bits of $17 repeated.
0414 # In: $16 = destination, $17 = 16-bit value, $18 = count in BYTES (it is added to $16 and shifted right by 3 to count quads).
0415 # NOTE(review): the pattern is merged at byte offsets, so $16 presumably must be 2-byte aligned - confirm with callers.
0416 # Out: $0 = original $16; returns via $26. Overwrites $1-$7 and may overwrite $16-$18.
0417 .align 5
0418 .ent __memset16
0419
0420 __memset16:
0421 .frame $30,0,$26,0
0422 .prologue 0
0423
0424 inswl $17,0,$5 # U : 000000000000c1c2
0425 inswl $17,2,$2 # U : 00000000c1c20000
0426 bis $16,$16,$0 # E : return value
0427 addq $18,$16,$6 # E : $6 = end address, one past the last byte to write
0428
0429 ble $18, end_w # U : count <= 0: nothing to store
0430 inswl $17,4,$3 # U : 0000c1c200000000
0431 inswl $17,6,$4 # U : c1c2000000000000
0432 xor $16,$6,$1 # E : will complete write be within one quadword?
0433
0434 or $2,$5,$2 # E : 00000000c1c2c1c2
0435 or $3,$4,$17 # E : c1c2c1c200000000
0436 bic $1,7,$1 # E : zero iff start and end address share one aligned quadword
0437 and $16,7,$3 # E : Target addr misalignment
0438
0439 or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
0440 beq $1,within_quad_w # U : whole write sits inside one quadword
0441 nop
0442 beq $3,aligned_w # U : target is 0mod8
0443
0444 # Misaligned head: the write starts inside a quadword and runs past its end.
0445 # Merge the pattern into bytes ($16 & 7)..7 of that quadword, then advance $16
0446 # to the next 8-byte boundary and reduce $18 by the bytes just covered.
0447 ldq_u $4,0($16) # L : Fetch first partial
0448 bis $16,$16,$5 # E : Save the address
0449 insql $17,$16,$2 # U : Insert new bytes
0450 subq $3,8,$3 # E : $3 = misalignment - 8, i.e. minus the number of head bytes
0451
0452 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
0453 mskql $4,$16,$4 # U : keep only the old bytes below the start offset
0454 subq $16,$3,$16 # E : $16 is new aligned destination
0455 bis $2,$4,$1 # E : Final bytes
0456
0457 nop
0458 stq_u $1,0($5) # L : Store result
0459 nop
0460 nop
0461
0462 .align 4
0463 aligned_w:
0464 # Reached with $16 8-byte aligned (either it was on entry or the head code above advanced it).
0465 # Split the remaining count into whole quadwords ($3) and trailing bytes ($18).
0466
0467
0468
0469 sra $18,3,$3 # U : Number of remaining quads to write
0470 and $18,7,$18 # E : Number of trailing bytes to write
0471 bis $16,$16,$5 # E : Save dest address
0472 beq $3,no_quad_w # U : tail stuff only
0473
0474 # Register state at this point:
0475 # $16 current destination (8-byte aligned)
0476 # $5 copy of $16, used below as the running store pointer
0477 # $6 end address (one past the last byte to write)
0478 # $18 number of trailing bytes (0..7)
0479 # $3 number of whole quadwords to store (nonzero)
0480 # $17 16-bit pattern replicated to 64 bits
0481 # With 16 or more quads the unrolled wh64 loop is used; otherwise branch to loop_w.
0482 # $1 = ($16 & 0x3f) - 0x40 is a negative byte counter that reaches zero when the
0483 # store pointer arrives at the next 64-byte boundary.
0484
0485 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
0486 subq $3, 16, $4 # E : Only try to unroll if >= 16 quads (128 bytes)
0487 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
0488 blt $4, loop_w # U : fewer than 16 quads: simple loop
0489
0490 # At least 16 quads remain, so after storing at most 8 single quads to reach a
0491 # 64-byte boundary there are still at least 8 left for one trip of the unrolled loop.
0492 # NOTE(review): $16 is 8-byte aligned here, so $1 lies in -64..-8 and the beq below
0493 # looks never taken; a destination that is already 0mod64 appears to run
0494 # $alignmod64_w eight times. Harmless for correctness - confirm before changing.
0495
0496 nop # E :
0497 nop # E :
0498 nop # E :
0499 beq $1, $bigalign_w # U :
0500
0501 $alignmod64_w:
0502 stq $17, 0($5) # L : store one quad of pattern
0503 subq $3, 1, $3 # E : For consistency later
0504 addq $1, 8, $1 # E : Increment towards zero for alignment
0505 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
0506
0507 nop
0508 nop
0509 addq $5, 8, $5 # E : Inc address
0510 blt $1, $alignmod64_w # U : repeat until the 64-byte boundary is reached
0511
0512 $bigalign_w:
0513 # Unrolled loop: one trip stores 8 quadwords (64 bytes).
0514 # $3 quadwords left to store (at least 8 on entry)
0515 # $5 store pointer, 64-byte aligned
0516 # $4 address passed to wh64 at the top of the trip
0517 # $17 fill pattern
0518 # $2, $7 scratch
0519 # Each trip precomputes the next wh64 address: $5 + 128 when at least 24 quads
0520 # remain at the top of the trip, otherwise $5 + 64 (the block the next trip stores).
0521 # The loop repeats while at least 16 quads remained at the top of the trip, i.e.
0522 # while 8 or more are left after it.
0523 # wh64 is a write hint for an aligned 64-byte block (see the Alpha Architecture Handbook).
0524 # NOTE(review): presumably safe only because every hinted block is then fully overwritten - confirm.
0525
0526 $do_wh64_w:
0527 wh64 ($4) # L1 : memory subsystem write hint
0528 subq $3, 24, $2 # E : For determining future wh64 addresses
0529 stq $17, 0($5) # L :
0530 nop # E :
0531
0532 addq $5, 128, $4 # E : speculative target of next wh64
0533 stq $17, 8($5) # L :
0534 stq $17, 16($5) # L :
0535 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
0536
0537 stq $17, 24($5) # L :
0538 stq $17, 32($5) # L :
0539 cmovlt $2, $7, $4 # E : fewer than 24 quads left: use the fallback address (Latency 2, extra mapping cycle)
0540 nop
0541
0542 stq $17, 40($5) # L :
0543 stq $17, 48($5) # L :
0544 subq $3, 16, $2 # E : Repeat the loop at least once more?
0545 nop
0546
0547 stq $17, 56($5) # L :
0548 addq $5, 64, $5 # E : advance the store pointer by one trip
0549 subq $3, 8, $3 # E : eight quads done
0550 bge $2, $do_wh64_w # U :
0551
0552 nop
0553 nop
0554 nop
0555 beq $3, no_quad_w # U : Might have finished already
0556
0557 .align 4
0558
0559 # loop_w: plain one-quadword-per-iteration loop. Used for fewer than 16 quads and
0560 # for the 1..7 quads left over after the unrolled loop. $3 is nonzero on entry.
0561
0562 loop_w:
0563 stq $17,0($5) # L :
0564 subq $3,1,$3 # E : Decrement number quads left
0565 addq $5,8,$5 # E : Inc address
0566 bne $3,loop_w # U : more?
0567
0568 no_quad_w:
0569 # Tail: $18 = number of bytes still to write (0..7) in the quadword at $5.
0570 # Keep the old bytes at and above offset ($6 & 7) and fill the ones below it.
0571
0572 nop # E :
0573 beq $18,end_w # U : All done?
0574 ldq $7,0($5) # L : fetch the final quadword
0575 mskqh $7,$6,$2 # U : Mask final quad
0576
0577 insqh $17,$6,$4 # U : New bits
0578 bis $2,$4,$1 # E : Put it all together
0579 stq $1,0($5) # L : And back to memory
0580 ret $31,($26),1 # L0 :
0581 # Whole write lies inside one quadword: replace only bytes ($16 & 7) up to but not including ($6 & 7).
0582 within_quad_w:
0583 ldq_u $1,0($16) # L : fetch the quadword containing the write
0584 insql $17,$16,$2 # U : New bits
0585 mskql $1,$16,$4 # U : Clear old
0586 bis $2,$4,$2 # E : New result
0587
0588 mskql $2,$6,$4 # U : keep the merged bytes below the end offset
0589 mskqh $1,$6,$2 # U : keep the old bytes at and above the end offset
0590 bis $2,$4,$1 # E :
0591 stq_u $1,0($16) # L :
0592 # Common exit; also the target when the count is <= 0.
0593 end_w:
0594 nop
0595 nop
0596 nop
0597 ret $31,($26),1 # L0 :
0598
0599 .end __memset16
0600 EXPORT_SYMBOL(__memset16)
0601 # memset and __memset are assembler aliases for ___memset; each name is passed to EXPORT_SYMBOL separately.
0602 memset = ___memset
0603 __memset = ___memset
0604 EXPORT_SYMBOL(memset)
0605 EXPORT_SYMBOL(__memset)