alpha/lib/ev6-memset.S

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev6-memset.S
0004  *
0005  * This is an efficient (and relatively small) implementation of the C library
0006  * "memset()" function for the 21264 implementation of Alpha.
0007  *
0008  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
0009  *
0010  * Much of the information about 21264 scheduling/coding comes from:
0011  *  Compiler Writer's Guide for the Alpha 21264
0012  *  abbreviated as 'CWG' in other comments here
0013  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0014  * Scheduling notation:
0015  *  E   - either cluster
0016  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0017  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0018  * The algorithm for the leading and trailing quadwords remains the same,
0019  * however the loop has been unrolled to enable better memory throughput,
0020  * and the code has been replicated for each of the entry points: __memset
0021  * and __memset16 to permit better scheduling to eliminate the stalling
0022  * encountered during the mask replication.
0023  * A future enhancement might be to put in a byte store loop for really
0024  * small (say < 32 bytes) memset()s.  Whether or not that change would be
0025  * a win in the kernel would depend upon the contextual usage.
0026  * WARNING: Maintaining this is going to be more work than the above version,
0027  * as fixes will need to be made in multiple places.  The performance gain
0028  * is worth it.
0029  */
0030 #include <asm/export.h>
0031     .set noat                   /* assembler must not use the $at temporary behind our back */
0032     .set noreorder              /* keep instructions in the order written; the E/U/L slotting notes rely on it */
0033 .text
0034     .globl memset               /* alias of ___memset, defined at the end of this file */
0035     .globl __memset             /* alias of ___memset, defined at the end of this file */
0036     .globl ___memset            /* byte fill: value replicated from the low 8 bits of $17 */
0037     .globl __memset16           /* 16-bit fill: value replicated from the low 16 bits of $17 */
0038     .globl __constant_c_memset  /* fill with the quadword pattern already held in $17 */
0039
0040     .ent ___memset
0041 .align 5
0042 ___memset:
0043     .frame $30,0,$26,0
0044     .prologue 0
0045
0046     /*
          * ___memset: fill $18 bytes starting at $16 with the low byte
          * of $17.  The symbols memset and __memset are defined as
          * aliases of ___memset at the end of this file.
          *
          * On entry:  $16 = destination address
          *            $17 = fill value (only the low 8 bits are used)
          *            $18 = byte count; nothing is stored if it is <= 0
          * On return: $0  = the original destination address
          * Registers $1-$7 are used as temporaries and $16-$18 are
          * overwritten.  The return is through $26; no stack is used.
          *
          * Layout: build the 8-byte pattern in $17, patch the leading
          * partial quadword, store whole quadwords (unrolled by 8 with
          * wh64 hints when at least 16 quadwords are present), then patch
          * the trailing partial quadword.
          *
0047      * Serious stalling happens.  The only way to mitigate this is to
0048      * undertake a major re-write to interleave the constant materialization
0049      * with other parts of the fall-through code.  This is important, even
0050      * though it makes maintenance tougher.
0051      * Do this later.
0052      */
0053     and $17,255,$1      # E : 00000000000000ch
0054     insbl $17,1,$2      # U : 000000000000ch00
0055     bis $16,$16,$0      # E : return value
0056     ble $18,end_b       # U : zero length requested?
0057
0058     addq $18,$16,$6     # E : $6 = end address (dest + count)
0059     bis $1,$2,$17   # E : 000000000000chch
0060     insbl   $1,2,$3     # U : 0000000000ch0000
0061     insbl   $1,3,$4     # U : 00000000ch000000
0062
0063     or  $3,$4,$3    # E : 00000000chch0000
0064     inswl   $17,4,$5    # U : 0000chch00000000
0065     xor $16,$6,$1   # E : will complete write be within one quadword?
0066     inswl   $17,6,$2    # U : chch000000000000
0067
0068     or  $17,$3,$17  # E : 00000000chchchch
0069     or  $2,$5,$2    # E : chchchch00000000
0070     bic $1,7,$1     # E : zero iff start and end share a quadword
0071     and $16,7,$3    # E : Target addr misalignment
0072
0073     or  $17,$2,$17  # E : chchchchchchchch
0074     beq $1,within_quad_b # U : whole fill sits inside one quadword
0075     nop         # E :
0076     beq $3,aligned_b    # U : target is 0mod8
0077
0078     /*
0079      * Target address is misaligned, and won't fit within a quadword.
          * Merge the pattern into bytes ($16 & 7)..7 of the first quadword,
          * then advance $16 to the next quadword boundary and reduce the
          * count by the number of bytes just written.
0080      */
0081     ldq_u $4,0($16)     # L : Fetch first partial
0082     bis $16,$16,$5      # E : Save the address
0083     insql $17,$16,$2    # U : Insert new bytes
0084     subq $3,8,$3        # E : Invert (for addressing uses)
0085
0086     addq $18,$3,$18     # E : $18 is new count ($3 is negative)
0087     mskql $4,$16,$4     # U : clear relevant parts of the quad
0088     subq $16,$3,$16     # E : $16 is new aligned destination
0089     bis $2,$4,$1        # E : Final bytes
0090
0091     nop
0092     stq_u $1,0($5)      # L : Store result
0093     nop
0094     nop
0095
0096 .align 4
0097 aligned_b:
0098     /*
0099      * We are now guaranteed to be quad aligned, with at least
0100      * one partial quad to write.
0101      */
0102
0103     sra $18,3,$3        # U : Number of remaining quads to write
0104     and $18,7,$18       # E : Number of trailing bytes to write
0105     bis $16,$16,$5      # E : Save dest address
0106     beq $3,no_quad_b    # U : tail stuff only
0107
0108     /*
0109      * it's worth the effort to unroll this and use wh64 if possible
0110      * Lifted a bunch of code from clear_user.S
0111      * At this point, entry values are:
0112      * $16  Current destination address
0113      * $5   A copy of $16
0114      * $6   The end address (original destination + count)
0115      * $18  Number trailer bytes
0116      * $3   Number quads to write
0117      */
0118
0119     and $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
0120     subq    $3, 16, $4  # E : Only try to unroll if >= 128 bytes (16 quads)
0121     subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
0122     blt $4, loop_b  # U : fewer than 16 quads, use the simple loop
0123
0124     /*
0125      * We know we've got at least 16 quads, minimum of one trip
0126      * through unrolled loop.  Do a quad at a time to get us 0mod64
0127      * aligned.
          *
          * $2 still holds ($16 & 0x3f).  When it is zero the destination
          * is already 0mod64, so the alignment loop is skipped and $4 is
          * preset to $5 as the first wh64 address.  The biased counter
          * $1 = $2 - 0x40 lies in -0x40..-8 at this point and can never
          * be zero, so it cannot serve as the "already aligned" test.
0128      */
0129
0130     nop         # E :
0131     nop         # E :
0132     bis $5,$5,$4    # E : first wh64 address when no alignment is needed
0133     beq $2, $bigalign_b # U : already 0mod64
0134
0135 $alignmod64_b:
0136     stq $17, 0($5)  # L :
0137     subq    $3, 1, $3   # E : For consistency later
0138     addq    $1, 8, $1   # E : Increment towards zero for alignment
0139     addq    $5, 8, $4   # E : Initial wh64 address (filler instruction)
0140
0141     nop
0142     nop
0143     addq    $5, 8, $5   # E : Inc address
0144     blt $1, $alignmod64_b # U :
0145
0146 $bigalign_b:
0147     /*
0148      * $3 - number quads left to go
0149      * $5 - target address (aligned 0mod64)
0150      * $17 - mask of stuff to store
          * $4 - address for the first wh64 (equal to $5 on entry)
0151      * Scratch registers available: $7, $2, $1 ($4 is rewritten each trip)
0152      * we know that we'll be taking a minimum of one trip through the loop
0153      * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
0154      * Assumes the wh64 needs to be for 2 trips through the loop in the future
0155      * The wh64 is issued on for the starting destination address for trip +2
0156      * through the loop, and if there are less than two trips left, the target
0157      * address will be for the current trip.
0158      */
0159
0160 $do_wh64_b:
0161     wh64    ($4)        # L1 : memory subsystem write hint
0162     subq    $3, 24, $2  # E : For determining future wh64 addresses
0163     stq $17, 0($5)  # L :
0164     nop         # E :
0165
0166     addq    $5, 128, $4 # E : speculative target of next wh64
0167     stq $17, 8($5)  # L :
0168     stq $17, 16($5) # L :
0169     addq    $5, 64, $7  # E : Fallback address for wh64 (== next trip addr)
0170
0171     stq $17, 24($5) # L :
0172     stq $17, 32($5) # L :
0173     cmovlt  $2, $7, $4  # E : Latency 2, extra mapping cycle
0174     nop
0175
0176     stq $17, 40($5) # L :
0177     stq $17, 48($5) # L :
0178     subq    $3, 16, $2  # E : Repeat the loop at least once more?
0179     nop
0180
0181     stq $17, 56($5) # L :
0182     addq    $5, 64, $5  # E :
0183     subq    $3, 8, $3   # E :
0184     bge $2, $do_wh64_b  # U : at least 16 quads were left before this trip
0185
0186     nop
0187     nop
0188     nop
0189     beq $3, no_quad_b   # U : Might have finished already
0190
0191 .align 4
0192     /*
0193      * Simple loop for trailing quadwords, or for small amounts
0194      * of data (where we can't use an unrolled loop and wh64)
0195      */
0196 loop_b:
0197     stq $17,0($5)       # L :
0198     subq $3,1,$3        # E : Decrement number quads left
0199     addq $5,8,$5        # E : Inc address
0200     bne $3,loop_b       # U : more?
0201
0202 no_quad_b:
0203     /*
0204      * Write 0..7 trailing bytes.
          * $5 is the quadword holding the tail and ($6 & 7) is the number
          * of tail bytes: keep the old bytes from that offset upward and
          * put the pattern in the bytes below it.
0205      */
0206     nop         # E :
0207     beq $18,end_b       # U : All done?
0208     ldq $7,0($5)        # L :
0209     mskqh $7,$6,$2      # U : Mask final quad
0210
0211     insqh $17,$6,$4     # U : New bits
0212     bis $2,$4,$1        # E : Put it all together
0213     stq $1,0($5)        # L : And back to memory
0214     ret $31,($26),1     # L0 :
0215
          /*
          * The whole fill lies inside one quadword: merge the pattern into
          * bytes ($16 & 7) up to, but not including, byte ($6 & 7).
          */
0216 within_quad_b:
0217     ldq_u $1,0($16)     # L :
0218     insql $17,$16,$2    # U : New bits
0219     mskql $1,$16,$4     # U : Clear old
0220     bis $2,$4,$2        # E : New result
0221
0222     mskql $2,$6,$4      # U : drop pattern bytes at and above the end offset
0223     mskqh $1,$6,$2      # U : old bytes at and above the end offset
0224     bis $2,$4,$1        # E :
0225     stq_u $1,0($16)     # L :
0226
0227 end_b:
0228     nop
0229     nop
0230     nop
0231     ret $31,($26),1     # L0 :
0232     .end ___memset
0233     EXPORT_SYMBOL(___memset)
0234
0235     /*
0236      * This is the original body of code, prior to replication and
0237      * rescheduling.  Leave it here, as there may be calls to this
0238      * entry point.
0239      */
0240 .align 4
0241     .ent __constant_c_memset
0242 __constant_c_memset:
0243     .frame $30,0,$26,0
0244     .prologue 0
0245
          /*
          * __constant_c_memset: fill $18 bytes starting at $16 with the
          * quadword pattern in $17.  Unlike ___memset, $17 is stored
          * unmodified, so the caller has to supply the value already
          * replicated across all 8 bytes.
          *
          * On entry:  $16 = destination address
          *            $17 = 64-bit fill pattern
          *            $18 = byte count; nothing is stored if it is <= 0
          * On return: $0  = the original destination address
          * Registers $1-$7 are used as temporaries and $16, $18 are
          * overwritten.  The return is through $26; no stack is used.
          */
0246     addq $18,$16,$6     # E : $6 = end address (dest + count)
0247     bis $16,$16,$0      # E : return value
0248     xor $16,$6,$1       # E : will complete write be within one quadword?
0249     ble $18,end     # U : zero length requested?
0250
0251     bic $1,7,$1     # E : zero iff start and end share a quadword
0252     beq $1,within_one_quad  # U :
0253     and $16,7,$3        # E : Target addr misalignment
0254     beq $3,aligned      # U : target is 0mod8
0255
0256     /*
0257      * Target address is misaligned, and won't fit within a quadword.
          * Merge the pattern into bytes ($16 & 7)..7 of the first quadword,
          * then advance $16 to the next quadword boundary and reduce the
          * count by the number of bytes just written.
0258      */
0259     ldq_u $4,0($16)     # L : Fetch first partial
0260     bis $16,$16,$5      # E : Save the address
0261     insql $17,$16,$2    # U : Insert new bytes
0262     subq $3,8,$3        # E : Invert (for addressing uses)
0263
0264     addq $18,$3,$18     # E : $18 is new count ($3 is negative)
0265     mskql $4,$16,$4     # U : clear relevant parts of the quad
0266     subq $16,$3,$16     # E : $16 is new aligned destination
0267     bis $2,$4,$1        # E : Final bytes
0268
0269     nop
0270     stq_u $1,0($5)      # L : Store result
0271     nop
0272     nop
0273
0274 .align 4
0275 aligned:
0276     /*
0277      * We are now guaranteed to be quad aligned, with at least
0278      * one partial quad to write.
0279      */
0280
0281     sra $18,3,$3        # U : Number of remaining quads to write
0282     and $18,7,$18       # E : Number of trailing bytes to write
0283     bis $16,$16,$5      # E : Save dest address
0284     beq $3,no_quad      # U : tail stuff only
0285
0286     /*
0287      * it's worth the effort to unroll this and use wh64 if possible
0288      * Lifted a bunch of code from clear_user.S
0289      * At this point, entry values are:
0290      * $16  Current destination address
0291      * $5   A copy of $16
0292      * $6   The end address (original destination + count)
0293      * $18  Number trailer bytes
0294      * $3   Number quads to write
0295      */
0296
0297     and $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
0298     subq    $3, 16, $4  # E : Only try to unroll if >= 128 bytes (16 quads)
0299     subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
0300     blt $4, loop    # U : fewer than 16 quads, use the simple loop
0301
0302     /*
0303      * We know we've got at least 16 quads, minimum of one trip
0304      * through unrolled loop.  Do a quad at a time to get us 0mod64
0305      * aligned.
          *
          * $2 still holds ($16 & 0x3f).  When it is zero the destination
          * is already 0mod64, so the alignment loop is skipped and $4 is
          * preset to $5 as the first wh64 address.  The biased counter
          * $1 = $2 - 0x40 lies in -0x40..-8 at this point and can never
          * be zero, so it cannot serve as the "already aligned" test.
0306      */
0307
0308     nop         # E :
0309     nop         # E :
0310     bis $5,$5,$4    # E : first wh64 address when no alignment is needed
0311     beq $2, $bigalign   # U : already 0mod64
0312
0313 $alignmod64:
0314     stq $17, 0($5)  # L :
0315     subq    $3, 1, $3   # E : For consistency later
0316     addq    $1, 8, $1   # E : Increment towards zero for alignment
0317     addq    $5, 8, $4   # E : Initial wh64 address (filler instruction)
0318
0319     nop
0320     nop
0321     addq    $5, 8, $5   # E : Inc address
0322     blt $1, $alignmod64 # U :
0323
0324 $bigalign:
0325     /*
0326      * $3 - number quads left to go
0327      * $5 - target address (aligned 0mod64)
0328      * $17 - mask of stuff to store
          * $4 - address for the first wh64 (equal to $5 on entry)
0329      * Scratch registers available: $7, $2, $1 ($4 is rewritten each trip)
0330      * we know that we'll be taking a minimum of one trip through the loop
0331      * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
0332      * Assumes the wh64 needs to be for 2 trips through the loop in the future
0333      * The wh64 is issued on for the starting destination address for trip +2
0334      * through the loop, and if there are less than two trips left, the target
0335      * address will be for the current trip.
0336      */
0337
0338 $do_wh64:
0339     wh64    ($4)        # L1 : memory subsystem write hint
0340     subq    $3, 24, $2  # E : For determining future wh64 addresses
0341     stq $17, 0($5)  # L :
0342     nop         # E :
0343
0344     addq    $5, 128, $4 # E : speculative target of next wh64
0345     stq $17, 8($5)  # L :
0346     stq $17, 16($5) # L :
0347     addq    $5, 64, $7  # E : Fallback address for wh64 (== next trip addr)
0348
0349     stq $17, 24($5) # L :
0350     stq $17, 32($5) # L :
0351     cmovlt  $2, $7, $4  # E : Latency 2, extra mapping cycle
0352     nop
0353
0354     stq $17, 40($5) # L :
0355     stq $17, 48($5) # L :
0356     subq    $3, 16, $2  # E : Repeat the loop at least once more?
0357     nop
0358
0359     stq $17, 56($5) # L :
0360     addq    $5, 64, $5  # E :
0361     subq    $3, 8, $3   # E :
0362     bge $2, $do_wh64    # U : at least 16 quads were left before this trip
0363
0364     nop
0365     nop
0366     nop
0367     beq $3, no_quad # U : Might have finished already
0368
0369 .align 4
0370     /*
0371      * Simple loop for trailing quadwords, or for small amounts
0372      * of data (where we can't use an unrolled loop and wh64)
0373      */
0374 loop:
0375     stq $17,0($5)       # L :
0376     subq $3,1,$3        # E : Decrement number quads left
0377     addq $5,8,$5        # E : Inc address
0378     bne $3,loop     # U : more?
0379
0380 no_quad:
0381     /*
0382      * Write 0..7 trailing bytes.
          * $5 is the quadword holding the tail and ($6 & 7) is the number
          * of tail bytes: keep the old bytes from that offset upward and
          * put the pattern in the bytes below it.
0383      */
0384     nop         # E :
0385     beq $18,end     # U : All done?
0386     ldq $7,0($5)        # L :
0387     mskqh $7,$6,$2      # U : Mask final quad
0388
0389     insqh $17,$6,$4     # U : New bits
0390     bis $2,$4,$1        # E : Put it all together
0391     stq $1,0($5)        # L : And back to memory
0392     ret $31,($26),1     # L0 :
0393
          /*
          * The whole fill lies inside one quadword: merge the pattern into
          * bytes ($16 & 7) up to, but not including, byte ($6 & 7).
          */
0394 within_one_quad:
0395     ldq_u $1,0($16)     # L :
0396     insql $17,$16,$2    # U : New bits
0397     mskql $1,$16,$4     # U : Clear old
0398     bis $2,$4,$2        # E : New result
0399
0400     mskql $2,$6,$4      # U : drop pattern bytes at and above the end offset
0401     mskqh $1,$6,$2      # U : old bytes at and above the end offset
0402     bis $2,$4,$1        # E :
0403     stq_u $1,0($16)     # L :
0404
0405 end:
0406     nop
0407     nop
0408     nop
0409     ret $31,($26),1     # L0 :
0410     .end __constant_c_memset
0411     EXPORT_SYMBOL(__constant_c_memset)
0412
0413     /*
0414      * This is a replica of the __constant_c_memset code, rescheduled
0415      * to mask stalls.  Note that the label names also had to change
          * (every local label here carries a _w suffix).
0416      */
0417     .align 5
0418     .ent __memset16
0419
0420 __memset16:
0421     .frame $30,0,$26,0
0422     .prologue 0
0423
          /*
          * __memset16: fill $18 bytes starting at $16 with the 16-bit
          * value in the low word of $17, replicated four times per
          * quadword (shown as c1c2 in the comments below).
          *
          * On entry:  $16 = destination address
          *            $17 = fill value (only the low 16 bits are used)
          *            $18 = count in bytes; nothing is stored if it is <= 0
          * On return: $0  = the original destination address
          * Registers $1-$7 are used as temporaries and $16-$18 are
          * overwritten.  The return is through $26; no stack is used.
          */
0424     inswl $17,0,$5      # U : 000000000000c1c2
0425     inswl $17,2,$2      # U : 00000000c1c20000
0426     bis $16,$16,$0      # E : return value
0427     addq    $18,$16,$6  # E : $6 = end address (dest + count)
0428
0429     ble $18, end_w      # U : zero length requested?
0430     inswl   $17,4,$3    # U : 0000c1c200000000
0431     inswl   $17,6,$4    # U : c1c2000000000000
0432     xor $16,$6,$1   # E : will complete write be within one quadword?
0433
0434     or  $2,$5,$2    # E : 00000000c1c2c1c2
0435     or  $3,$4,$17   # E : c1c2c1c200000000
0436     bic $1,7,$1     # E : zero iff start and end share a quadword
0437     and $16,7,$3    # E : Target addr misalignment
0438
0439     or  $17,$2,$17  # E : c1c2c1c2c1c2c1c2
0440     beq $1,within_quad_w    # U : whole fill sits inside one quadword
0441     nop
0442     beq $3,aligned_w    # U : target is 0mod8
0443
0444     /*
0445      * Target address is misaligned, and won't fit within a quadword.
          * Merge the pattern into bytes ($16 & 7)..7 of the first quadword,
          * then advance $16 to the next quadword boundary and reduce the
          * count by the number of bytes just written.
0446      */
0447     ldq_u $4,0($16)     # L : Fetch first partial
0448     bis $16,$16,$5      # E : Save the address
0449     insql $17,$16,$2    # U : Insert new bytes
0450     subq $3,8,$3        # E : Invert (for addressing uses)
0451
0452     addq $18,$3,$18     # E : $18 is new count ($3 is negative)
0453     mskql $4,$16,$4     # U : clear relevant parts of the quad
0454     subq $16,$3,$16     # E : $16 is new aligned destination
0455     bis $2,$4,$1        # E : Final bytes
0456
0457     nop
0458     stq_u $1,0($5)      # L : Store result
0459     nop
0460     nop
0461
0462 .align 4
0463 aligned_w:
0464     /*
0465      * We are now guaranteed to be quad aligned, with at least
0466      * one partial quad to write.
0467      */
0468
0469     sra $18,3,$3        # U : Number of remaining quads to write
0470     and $18,7,$18       # E : Number of trailing bytes to write
0471     bis $16,$16,$5      # E : Save dest address
0472     beq $3,no_quad_w    # U : tail stuff only
0473
0474     /*
0475      * it's worth the effort to unroll this and use wh64 if possible
0476      * Lifted a bunch of code from clear_user.S
0477      * At this point, entry values are:
0478      * $16  Current destination address
0479      * $5   A copy of $16
0480      * $6   The end address (original destination + count)
0481      * $18  Number trailer bytes
0482      * $3   Number quads to write
0483      */
0484
0485     and $16, 0x3f, $2   # E : Forward work (only useful for unrolled loop)
0486     subq    $3, 16, $4  # E : Only try to unroll if >= 128 bytes (16 quads)
0487     subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64)
0488     blt $4, loop_w  # U : fewer than 16 quads, use the simple loop
0489
0490     /*
0491      * We know we've got at least 16 quads, minimum of one trip
0492      * through unrolled loop.  Do a quad at a time to get us 0mod64
0493      * aligned.
          *
          * $2 still holds ($16 & 0x3f).  When it is zero the destination
          * is already 0mod64, so the alignment loop is skipped and $4 is
          * preset to $5 as the first wh64 address.  The biased counter
          * $1 = $2 - 0x40 lies in -0x40..-8 at this point and can never
          * be zero, so it cannot serve as the "already aligned" test.
0494      */
0495
0496     nop         # E :
0497     nop         # E :
0498     bis $5,$5,$4    # E : first wh64 address when no alignment is needed
0499     beq $2, $bigalign_w # U : already 0mod64
0500
0501 $alignmod64_w:
0502     stq $17, 0($5)  # L :
0503     subq    $3, 1, $3   # E : For consistency later
0504     addq    $1, 8, $1   # E : Increment towards zero for alignment
0505     addq    $5, 8, $4   # E : Initial wh64 address (filler instruction)
0506
0507     nop
0508     nop
0509     addq    $5, 8, $5   # E : Inc address
0510     blt $1, $alignmod64_w   # U :
0511
0512 $bigalign_w:
0513     /*
0514      * $3 - number quads left to go
0515      * $5 - target address (aligned 0mod64)
0516      * $17 - mask of stuff to store
          * $4 - address for the first wh64 (equal to $5 on entry)
0517      * Scratch registers available: $7, $2, $1 ($4 is rewritten each trip)
0518      * we know that we'll be taking a minimum of one trip through the loop
0519      * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
0520      * Assumes the wh64 needs to be for 2 trips through the loop in the future
0521      * The wh64 is issued on for the starting destination address for trip +2
0522      * through the loop, and if there are less than two trips left, the target
0523      * address will be for the current trip.
0524      */
0525
0526 $do_wh64_w:
0527     wh64    ($4)        # L1 : memory subsystem write hint
0528     subq    $3, 24, $2  # E : For determining future wh64 addresses
0529     stq $17, 0($5)  # L :
0530     nop         # E :
0531
0532     addq    $5, 128, $4 # E : speculative target of next wh64
0533     stq $17, 8($5)  # L :
0534     stq $17, 16($5) # L :
0535     addq    $5, 64, $7  # E : Fallback address for wh64 (== next trip addr)
0536
0537     stq $17, 24($5) # L :
0538     stq $17, 32($5) # L :
0539     cmovlt  $2, $7, $4  # E : Latency 2, extra mapping cycle
0540     nop
0541
0542     stq $17, 40($5) # L :
0543     stq $17, 48($5) # L :
0544     subq    $3, 16, $2  # E : Repeat the loop at least once more?
0545     nop
0546
0547     stq $17, 56($5) # L :
0548     addq    $5, 64, $5  # E :
0549     subq    $3, 8, $3   # E :
0550     bge $2, $do_wh64_w  # U : at least 16 quads were left before this trip
0551
0552     nop
0553     nop
0554     nop
0555     beq $3, no_quad_w   # U : Might have finished already
0556
0557 .align 4
0558     /*
0559      * Simple loop for trailing quadwords, or for small amounts
0560      * of data (where we can't use an unrolled loop and wh64)
0561      */
0562 loop_w:
0563     stq $17,0($5)       # L :
0564     subq $3,1,$3        # E : Decrement number quads left
0565     addq $5,8,$5        # E : Inc address
0566     bne $3,loop_w       # U : more?
0567
0568 no_quad_w:
0569     /*
0570      * Write 0..7 trailing bytes.
          * $5 is the quadword holding the tail and ($6 & 7) is the number
          * of tail bytes: keep the old bytes from that offset upward and
          * put the pattern in the bytes below it.
0571      */
0572     nop         # E :
0573     beq $18,end_w       # U : All done?
0574     ldq $7,0($5)        # L :
0575     mskqh $7,$6,$2      # U : Mask final quad
0576
0577     insqh $17,$6,$4     # U : New bits
0578     bis $2,$4,$1        # E : Put it all together
0579     stq $1,0($5)        # L : And back to memory
0580     ret $31,($26),1     # L0 :
0581
          /*
          * The whole fill lies inside one quadword: merge the pattern into
          * bytes ($16 & 7) up to, but not including, byte ($6 & 7).
          */
0582 within_quad_w:
0583     ldq_u $1,0($16)     # L :
0584     insql $17,$16,$2    # U : New bits
0585     mskql $1,$16,$4     # U : Clear old
0586     bis $2,$4,$2        # E : New result
0587
0588     mskql $2,$6,$4      # U : drop pattern bytes at and above the end offset
0589     mskqh $1,$6,$2      # U : old bytes at and above the end offset
0590     bis $2,$4,$1        # E :
0591     stq_u $1,0($16)     # L :
0592
0593 end_w:
0594     nop
0595     nop
0596     nop
0597     ret $31,($26),1     # L0 :
0598
0599     .end __memset16
0600     EXPORT_SYMBOL(__memset16)
0601
0602 memset = ___memset          /* memset is a second name for the ___memset entry point */
0603 __memset = ___memset        /* __memset likewise resolves to ___memset */
0604     EXPORT_SYMBOL(memset)   /* EXPORT_SYMBOL is supplied by <asm/export.h>; presumably exports the name for module use - confirm there */
0605     EXPORT_SYMBOL(__memset)