/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

    It would be nice if EV6 handled every resource overflow efficiently,
    but for some it doesn't, including store queue overflows: these cause
    a trap and a restart of the pipe.

    To get around this we sometimes use (to borrow a term from a VSSAD
    researcher) "aeration".  The idea is to slow the rate at which the
    processor receives valid instructions by inserting nops in the fetch
    path.  In doing so, you can prevent the overflow and actually make
    the code run faster.  You can, of course, take advantage of the fact
    that the processor can fetch at most 4 aligned instructions per cycle.
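
    (In the loop below, each blank-line-separated group of four
    instructions is one aligned fetch block, so a group consisting
    entirely of unops costs exactly one fetch cycle of pure aeration.)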

    I inserted enough nops to force it to take 10 cycles to fetch the
    loop code.  In theory, EV6 should be able to execute this loop in
    9 cycles but I was not able to get it to run that fast -- the initial
    conditions were such that I could not reach this optimum rate on
    (chaotic) EV6.  I wrote the code such that everything would issue
    in order.

   Second Problem: Dcache index matches.
   -------------------------------------

    If you are going to use this routine on random aligned pages, there
    is a 25% chance that the pages will be at the same dcache indices.
    Without care, this results in many nasty memory traps.

    The solution is to schedule the prefetches to avoid the memory
    conflicts.  I schedule the wh64 prefetches farther ahead of the
    read prefetches to avoid this problem.
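
    (Concretely, in the code below the write-hint pointer in $19 runs
    10 cache lines -- 640 bytes -- ahead of the store pointer in $16,
    while the read prefetches run 5 lines ahead of the load pointer
    in $17.)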

   Third Problem: Needs more prefetching.
   --------------------------------------

    In order to improve the code I added deeper prefetching to take the
    most advantage of EV6's bandwidth.

    I also prefetched the read stream.  Note that adding the read prefetch
    forced me to add another cycle to the inner-most kernel -- up to 11
    from the original 8 cycles per iteration.  We could improve performance
    further by unrolling the loop and doing multiple prefetches per cycle.
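
    (At 4 aligned instructions per fetch cycle, the 44-instruction
    inner loop below works out to exactly those 11 cycles per
    64-byte line.)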

   I think that the code below will be very robust and fast for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but my guess is that this matters
   less than the dcache miss case.  */
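
/* In C terms, the routine below is equivalent to the following sketch,
   assuming the Alpha's 8 KB page size and the standard calling
   convention, which passes the first two arguments in $16 and $17:

	#include <string.h>

	void copy_page(void *to, void *from)
	{
		memcpy(to, from, 8192);	/* copy one whole 8 KB page */
	}

   Throughout the assembly, $16 is therefore the destination pointer
   and $17 the source pointer.  */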

#include <asm/export.h>
    .text
    .align 4
    .global copy_page
    .ent copy_page
copy_page:
    .prologue 0

    /* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
    wh64    ($16)
    ldl $31,0($17)
    ldl $31,64($17)
    lda $1,1*64($16)

    wh64    ($1)
    ldl $31,128($17)
    ldl $31,192($17)
    lda $1,2*64($16)

    wh64    ($1)
    ldl $31,256($17)
    lda $18,118
    lda $1,3*64($16)

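    /* Loop count: an 8 KB page is 128 64-byte cache lines; the main
       loop below copies 118 of them ($18 above) and the cleanup loop
       handles the final 10.  */
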
    wh64    ($1)
    nop
    lda $1,4*64($16)
    lda $2,5*64($16)

    wh64    ($1)
    wh64    ($2)
    lda $1,6*64($16)
    lda $2,7*64($16)

    wh64    ($1)
    wh64    ($2)
    lda $1,8*64($16)
    lda $2,9*64($16)

    wh64    ($1)
    wh64    ($2)
    lda $19,10*64($16)
    nop

    /* Main prefetching/write-hinting loop.  */
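    /* Register roles: $16 = destination, $17 = source, $18 = line
       count, $19 = write-hint pointer running 10 lines ahead.  */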
1:  ldq $0,0($17)
    ldq $1,8($17)
    unop
    unop

    unop
    unop
    ldq $2,16($17)
    ldq $3,24($17)

    ldq $4,32($17)
    ldq $5,40($17)
    unop
    unop

    unop
    unop
    ldq $6,48($17)
    ldq $7,56($17)

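    /* Read prefetch, 5 cache lines ahead of the current source.  */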
    ldl $31,320($17)
    unop
    unop
    unop

    /* This gives the extra cycle of aeration above the minimum.  */
    unop
    unop
    unop
    unop

    wh64    ($19)
    unop
    unop
    unop

    stq $0,0($16)
    subq    $18,1,$18
    stq $1,8($16)
    unop

    unop
    stq $2,16($16)
    addq    $17,64,$17
    stq $3,24($16)

    stq $4,32($16)
    stq $5,40($16)
    addq    $19,64,$19
    unop

    stq $6,48($16)
    stq $7,56($16)
    addq    $16,64,$16
    bne $18, 1b

    /* Prefetch the final 5 cache lines of the read stream.  */
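    /* Reset the line counter: 10 cache lines remain below.  */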
    lda $18,10
    ldl $31,320($17)
    ldl $31,384($17)
    ldl $31,448($17)

    ldl $31,512($17)
    ldl $31,576($17)
    nop
    nop

    /* Non-prefetching, non-write-hinting cleanup loop for the
       final 10 cache lines.  */
2:  ldq $0,0($17)
    ldq $1,8($17)
    ldq $2,16($17)
    ldq $3,24($17)

    ldq $4,32($17)
    ldq $5,40($17)
    ldq $6,48($17)
    ldq $7,56($17)

    stq $0,0($16)
    subq    $18,1,$18
    stq $1,8($16)
    addq    $17,64,$17

    stq $2,16($16)
    stq $3,24($16)
    stq $4,32($16)
    stq $5,40($16)

    stq $6,48($16)
    stq $7,56($16)
    addq    $16,64,$16
    bne $18, 2b

    ret
    nop
    unop
    nop

    .end copy_page
    EXPORT_SYMBOL(copy_page)