Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 /*
0003  * arch/alpha/lib/ev6-copy_user.S
0004  *
0005  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
0006  *
0007  * Copy to/from user space, handling exceptions as we go..  This
0008  * isn't exactly pretty.
0009  *
0010  * This is essentially the same as "memcpy()", but with a few twists.
0011  * Notably, we have to make sure that $0 is always up-to-date and
0012  * contains the right "bytes left to copy" value (and that it is updated
0013  * only _after_ a successful copy). There is also some rather minor
0014  * exception setup stuff..
0015  *
0016  * Much of the information about 21264 scheduling/coding comes from:
0017  *  Compiler Writer's Guide for the Alpha 21264
0018  *  abbreviated as 'CWG' in other comments here
0019  *  ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
0020  * Scheduling notation:
0021  *  E   - either cluster
0022  *  U   - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
0023  *  L   - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
0024  */
0025 
0026 #include <asm/export.h>
0027 /* EXI: allow an exception for an insn (used in this file around the loads from $17); exit via $exitin if we get one.  */
0028 #define EXI(x,y...)         \
0029     99: x,##y;          /* the guarded insn itself, marked by local label 99 */ \
0030     .section __ex_table,"a";    /* the table entry is emitted out of line in __ex_table */ \
0031     .long 99b - .;          /* 32-bit offset from this table entry back to the guarded insn */ \
0032     lda $31, $exitin-99b($31);  /* displacement = $exitin - insn; NOTE(review): presumably read as data by the fault fixup code, never executed - confirm */ \
0033     .previous   /* return to the section that was active before the entry */
0034 
0035 #define EXO(x,y...)         /* EXO: same as EXI, but the fixup target is $exitout (used in this file around the stores to $16) */ \
0036     99: x,##y;          /* the guarded insn itself, marked by local label 99 */ \
0037     .section __ex_table,"a";    /* the table entry is emitted out of line in __ex_table */ \
0038     .long 99b - .;          /* 32-bit offset from this table entry back to the guarded insn */ \
0039     lda $31, $exitout-99b($31); /* displacement = $exitout - insn; NOTE(review): presumably read as data by the fault fixup code, never executed - confirm */ \
0040     .previous   /* return to the section that was active before the entry */
0041 
0042     .set noat
0043     .align 4
0044     .globl __copy_user
0045     .ent __copy_user
0046                 # Pipeline info: Slotting & Comments
0047 __copy_user:        # in: $16 = dest, $17 = source, $18 = byte count; on return $0 = bytes left uncopied
0048     .prologue 0
0049     mov $18, $0     # .. .. .. E    : $0 = bytes left to copy, starts at the full count
0050     subq $18, 32, $1    # .. .. E. ..   : Is this going to be a small copy?
0051     nop         # .. E  .. ..
0052     beq $18, $zerolength    # U  .. .. ..   : U L U L
0053 
0054     and $16,7,$3        # .. .. .. E    : is leading dest misalignment
0055     ble $1, $onebyteloop    # .. .. U  ..   : 1st branch : small amount of data (32 bytes or fewer)
0056     beq $3, $destaligned    # .. U  .. ..   : 2nd (one cycle fetcher stall)
0057     subq $3, 8, $3      # E  .. .. ..   : L U U L : trip counter (negative, counted up to zero)
0058 /*
0059  * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
0060  * This loop aligns the destination a byte at a time
0061  * We know we have at least one trip through this loop
0062  */
0063 $aligndest:
0064     EXI( ldbu $1,0($17) )   # .. .. .. L    : Keep loads separate from stores
0065     addq $16,1,$16      # .. .. E  ..   : Section 3.8 in the CWG
0066     addq $3,1,$3        # .. E  .. ..   :
0067     nop         # E  .. .. ..   : U L U L
0068 
0069 /*
0070  * the -1 is to compensate for the inc($16) done in a previous quadpack
0071  * which allows us zero dependencies within either quadpack in the loop
0072  */
0073     EXO( stb $1,-1($16) )   # .. .. .. L    :
0074     addq $17,1,$17      # .. .. E  ..   : Section 3.8 in the CWG
0075     subq $0,1,$0        # .. E  .. ..   : count changes only after the byte is stored
0076     bne $3, $aligndest  # U  .. .. ..   : U L U L
0077 
0078 /*
0079  * If we fell through into here, we have a minimum of 33 - 7 = 26 bytes
0080  * If we arrived via branch, we have a minimum of 33 bytes
0081  */
0082 $destaligned:
0083     and $17,7,$1        # .. .. .. E    : Check _current_ source alignment
0084     bic $0,7,$4     # .. .. E  ..   : number bytes as a quadword loop
0085     EXI( ldq_u $3,0($17) )  # .. L  .. ..   : Forward fetch for fallthrough code
0086     beq $1,$quadaligned # U  .. .. ..   : U L U L
0087 
0088 /*
0089  * In the worst case, we've just executed an ldq_u here from 0($17)
0090  * and we'll repeat it once if we take the branch
0091  */
0092 
0093 /* Misaligned quadword loop - not unrolled.  Leave it that way. */
0094 $misquad:
0095     EXI( ldq_u $2,8($17) )  # .. .. .. L    : fetch the following source quadword
0096     subq $4,8,$4        # .. .. E  ..   :
0097     extql $3,$17,$3     # .. U  .. ..   : piece taken from the current quadword
0098     extqh $2,$17,$1     # U  .. .. ..   : U U L L : piece taken from the following quadword
0099 
0100     bis $3,$1,$1        # .. .. .. E    : merge both pieces into one dest quadword
0101     EXO( stq $1,0($16) )    # .. .. L  ..   :
0102     addq $17,8,$17      # .. E  .. ..   :
0103     subq $0,8,$0        # E  .. .. ..   : U L L U
0104 
0105     addq $16,8,$16      # .. .. .. E    :
0106     bis $2,$2,$3        # .. .. E  ..   : the quadword just loaded becomes the current one next trip
0107     nop         # .. E  .. ..   :
0108     bne $4,$misquad     # U  .. .. ..   : U L U L
0109 
0110     nop         # .. .. .. E
0111     nop         # .. .. E  ..
0112     nop         # .. E  .. ..
0113     beq $0,$zerolength  # U  .. .. ..   : U L U L : done when no tail bytes remain
0114 
0115 /* We know we have at least one trip through the byte loop */
0116     EXI ( ldbu $2,0($17) )  # .. .. .. L    : No loads in the same quad
0117     addq $16,1,$16      # .. .. E  ..   : as the store (Section 3.8 in CWG)
0118     nop         # .. E  .. ..   :
0119     br $31, $dirtyentry # L0 .. .. ..   : L U U L
0120 /* Do the trailing byte loop load, then hop into the store part of the loop */
0121 
0122 /*
0123  * A minimum of (33 - 7) bytes to do a quad at a time.
0124  * Based upon the usage context, it's worth the effort to unroll this loop
0125  * $0 - number of bytes to be moved
0126  * $4 - number of bytes to move as quadwords
0127  * $16 is current destination address
0128  * $17 is current source address
0129  */
0130 $quadaligned:
0131     subq    $4, 32, $2  # .. .. .. E    : do not unroll for small stuff
0132     nop         # .. .. E  ..
0133     nop         # .. E  .. ..
0134     blt $2, $onequad    # U  .. .. ..   : U L U L
0135 
0136 /*
0137  * There is a significant assumption here that the source and destination
0138  * addresses differ by more than 32 bytes.  In this particular case, a
0139  * sparsity of registers further bounds this to be a minimum of 8 bytes.
0140  * But if this isn't met, then the output result will be incorrect.
0141  * Furthermore, due to a lack of available registers, we really can't
0142  * unroll this to be an 8x loop (which would enable us to use the wh64
0143  * instruction memory hint instruction).
0144  */
0145 $unroll4:
0146     EXI( ldq $1,0($17) )    # .. .. .. L
0147     EXI( ldq $2,8($17) )    # .. .. L  ..
0148     subq    $4,32,$4    # .. E  .. ..   : this trip accounts for 32 quadword bytes
0149     nop         # E  .. .. ..   : U U L L
0150 
0151     addq    $17,16,$17  # .. .. .. E
0152     EXO( stq $1,0($16) )    # .. .. L  ..
0153     EXO( stq $2,8($16) )    # .. L  .. ..
0154     subq    $0,16,$0    # E  .. .. ..   : U L L U
0155 
0156     addq    $16,16,$16  # .. .. .. E
0157     EXI( ldq $1,0($17) )    # .. .. L  ..
0158     EXI( ldq $2,8($17) )    # .. L  .. ..
0159     subq    $4, 32, $3  # E  .. .. ..   : U U L L : is there enough for another trip?
0160 
0161     EXO( stq $1,0($16) )    # .. .. .. L
0162     EXO( stq $2,8($16) )    # .. .. L  ..
0163     subq    $0,16,$0    # .. E  .. ..
0164     addq    $17,16,$17  # E  .. .. ..   : U L L U
0165 
0166     nop         # .. .. .. E
0167     nop         # .. .. E  ..
0168     addq    $16,16,$16  # .. E  .. ..
0169     bgt $3,$unroll4 # U  .. .. ..   : U L U L
0170 
0171     nop         # padding; presumably keeps the branch last in its group, as in the annotated groups above
0172     nop         # padding
0173     nop         # padding
0174     beq $4, $noquads    # no whole quadwords left after the unrolled loop
0175 
0176 $onequad:
0177     EXI( ldq $1,0($17) )    # load one quadword from the source
0178     subq    $4,8,$4     # 8 fewer quadword bytes to go
0179     addq    $17,8,$17   # advance the source address
0180     nop         # padding
0181 
0182     EXO( stq $1,0($16) )    # store the quadword to the destination
0183     subq    $0,8,$0     # count changes only after the store
0184     addq    $16,8,$16   # advance the destination address
0185     bne $4,$onequad     # loop while quadword bytes remain
0186 
0187 $noquads:
0188     nop         # padding
0189     nop         # padding
0190     nop         # padding
0191     beq $0,$zerolength  # done when no tail bytes remain, else fall into the byte loop
0192 
0193 /*
0194  * For small copies (or the tail of a larger copy), do a very simple byte loop.
0195  * There's no point in doing a lot of complex alignment calculations to try to
0196  * do quadword stuff for a small amount of data.
0197  *  $0 - remaining number of bytes left to copy
0198  *  $16 - current dest addr
0199  *  $17 - current source addr
0200  */
0201 
0202 $onebyteloop:
0203     EXI ( ldbu $2,0($17) )  # .. .. .. L    : No loads in the same quad
0204     addq $16,1,$16      # .. .. E  ..   : as the store (Section 3.8 in CWG)
0205     nop         # .. E  .. ..   :
0206     nop         # E  .. .. ..   : U L U L
0207 
0208 $dirtyentry:
0209 /*
0210  * the -1 is to compensate for the inc($16) done in a previous quadpack
0211  * which allows us zero dependencies within either quadpack in the loop
0212  */
0213     EXO ( stb $2,-1($16) )  # .. .. .. L    :
0214     addq $17,1,$17      # .. .. E  ..   : advance the source address
0215     subq $0,1,$0        # .. E  .. ..   : change count _after_ copy
0216     bgt $0,$onebyteloop # U  .. .. ..   : U L U L
0217 
0218 $zerolength:
0219 $exitin:
0220 $exitout:           # Shared exit: normal completion and the fixup target named by both exception macros; $0 = bytes not copied
0221     nop         # .. .. .. E
0222     nop         # .. .. E  ..
0223     nop         # .. E  .. ..
0224     ret $31,($26),1     # L0 .. .. ..   : L U L U
0225 
0226     .end __copy_user
0227     EXPORT_SYMBOL(__copy_user)