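/*
 * Optimized memcpy() for Alpha.
 *
 * The wh64 write hint and the per-instruction scheduling notes
 * (E = either cluster, U = upper subcluster, L = lower subcluster,
 * L0/L1 = specific subclusters) indicate tuning for an EV6-class
 * (21264) pipeline.
 *
 * On entry: $16 = dest, $17 = src, $18 = byte count, $26 = return
 * address.  The original dest is returned in $0.
 */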
#include <asm/export.h>
        .set noreorder
        .set noat

        .align 4
        .globl memcpy
        .ent memcpy
memcpy:
        .frame $30,0,$26,0
        .prologue 0

        mov $16, $0 # E : copy dest to return
        ble $18, $nomoredata # U : done with the copy?
        xor $16, $17, $1 # E : are source and dest alignments the same?
        and $1, 7, $1 # E : are they the same mod 8?

        bne $1, $misaligned # U : Nope - gotta do this the slow way

        and $16, 7, $1 # E : Are both 0mod8?
        beq $1, $both_0mod8 # U : Yes
        nop # E :
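
/*
 * Source and dest share the same misalignment mod 8: copy one byte at
 * a time until dest (and therefore src) reaches a 0mod8 boundary.
 */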
$head_align:
        ldbu $1, 0($17) # L : grab a byte
        subq $18, 1, $18 # E : count--
        addq $17, 1, $17 # E : src++
        stb $1, 0($16) # L :
        addq $16, 1, $16 # E : dest++
        and $16, 7, $1 # E : Are we at 0mod8 yet?
        ble $18, $nomoredata # U : done with the copy?
        bne $1, $head_align # U :

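/*
 * Both pointers are now quad-aligned.  With fewer than 128 bytes left
 * the unrolled loop is skipped; otherwise single quads are copied
 * until dest is 64-byte aligned so wh64 can cover whole blocks.
 */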
$both_0mod8:
        cmple $18, 127, $1 # E : Can we unroll the loop?
        bne $1, $no_unroll # U :
        and $16, 63, $1 # E : get mod64 alignment
        beq $1, $do_unroll # U : no single quads to fiddle

$single_head_quad:
        ldq $1, 0($17) # L : get 8 bytes
        subq $18, 8, $18 # E : count -= 8
        addq $17, 8, $17 # E : src += 8
        nop # E :

        stq $1, 0($16) # L : store
        addq $16, 8, $16 # E : dest += 8
        and $16, 63, $1 # E : get mod64 alignment
        bne $1, $single_head_quad # U : still not fully aligned

$do_unroll:
        addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
        cmple $18, 127, $1 # E : Can we go through the unrolled loop?
        bne $1, $tail_quads # U : Nope
        nop # E :

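/*
 * Main loop: 64 bytes per trip, as two groups of four quads.  wh64
 * hints that the 64 bytes at ($7) will be completely overwritten, so
 * they need not be read in first; near the end of the copy, cmovlt
 * swaps in a fallback address so the hint never reaches past the
 * region being written.
 */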
$unroll_body:
        wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
                  # ($7) are about to be over-written
        ldq $6, 0($17) # L0 : bytes 0..7
        nop # E :
        nop # E :

        ldq $4, 8($17) # L : bytes 8..15
        ldq $5, 16($17) # L : bytes 16..23
        addq $7, 64, $7 # E : Update next wh64 address
        nop # E :

        ldq $3, 24($17) # L : bytes 24..31
        addq $16, 64, $1 # E : fallback value for wh64
        nop # E :
        nop # E :

        addq $17, 32, $17 # E : src += 32 bytes
        stq $6, 0($16) # L : bytes 0..7
        nop # E :
        nop # E :

        stq $4, 8($16) # L : bytes 8..15
        stq $5, 16($16) # L : bytes 16..23
        subq $18, 192, $2 # E : At least two more trips to go?
        nop # E :

        stq $3, 24($16) # L : bytes 24..31
        addq $16, 32, $16 # E : dest += 32 bytes
        nop # E :
        nop # E :

        ldq $6, 0($17) # L : bytes 0..7
        ldq $4, 8($17) # L : bytes 8..15
        cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
                          # fallback wh64 address if < 2 more trips
        nop # E :

        ldq $5, 16($17) # L : bytes 16..23
        ldq $3, 24($17) # L : bytes 24..31
        addq $16, 32, $16 # E : dest += 32
        subq $18, 64, $18 # E : count -= 64

        addq $17, 32, $17 # E : src += 32
        stq $6, -32($16) # L : bytes 0..7
        stq $4, -24($16) # L : bytes 8..15
        cmple $18, 63, $1 # E : At least one more trip?

        stq $5, -16($16) # L : bytes 16..23
        stq $3, -8($16) # L : bytes 24..31
        nop # E :
        beq $1, $unroll_body

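/*
 * Unrolled loop done (or skipped): move any remaining whole quads,
 * one per iteration.
 */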
$tail_quads:
$no_unroll:
        .align 4
        subq $18, 8, $18 # E : At least a quad left?
        blt $18, $less_than_8 # U : Nope
        nop # E :
        nop # E :

$move_a_quad:
        ldq $1, 0($17) # L : fetch 8
        subq $18, 8, $18 # E : count -= 8
        addq $17, 8, $17 # E : src += 8
        nop # E :

        stq $1, 0($16) # L : store 8
        addq $16, 8, $16 # E : dest += 8
        bge $18, $move_a_quad # U :
        nop # E :

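/*
 * Fewer than 8 bytes remain: undo the last subtract and copy the
 * trailing bytes one at a time.
 */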
$less_than_8:
        .align 4
        addq $18, 8, $18 # E : add back for trailing bytes
        ble $18, $nomoredata # U : All-done
        nop # E :
        nop # E :

$tail_bytes:
        subq $18, 1, $18 # E : count--
        ldbu $1, 0($17) # L : fetch a byte
        addq $17, 1, $17 # E : src++
        nop # E :

        stb $1, 0($16) # L : store a byte
        addq $16, 1, $16 # E : dest++
        bgt $18, $tail_bytes # U : more to be done?
        nop # E :

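        /* Exit replicated here rather than branching to $nomoredata. */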
        ret $31, ($26), 1 # L0 :
        nop # E :
        nop # E :
        nop # E :

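/*
 * Source and dest alignments differ mod 8.  Byte-copy until dest is
 * 0mod8, then use unaligned loads (ldq_u) with extql/extqh merging so
 * all stores are aligned quads.
 */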
$misaligned:
        mov $0, $4 # E : dest temp
        and $0, 7, $1 # E : dest alignment mod8
        beq $1, $dest_0mod8 # U : life doesn't totally suck
        nop

$aligndest:
        ble $18, $nomoredata # U :
        ldbu $1, 0($17) # L : fetch a byte
        subq $18, 1, $18 # E : count--
        addq $17, 1, $17 # E : src++

        stb $1, 0($4) # L : store it
        addq $4, 1, $4 # E : dest++
        and $4, 7, $1 # E : dest 0mod8 yet?
        bne $1, $aligndest # U : go until we are aligned.

$dest_0mod8:
        subq $18, 8, $18 # E : At least a quad left?
        blt $18, $misalign_tail # U : Nope
        ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
        nop # E :

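/*
 * Rotating-load loop: fetch the next unaligned quad; extql takes the
 * needed bytes from the previous (seed) load, extqh takes the rest
 * from the new one, and bis merges them into one aligned quad to store.
 */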
$mis_quad:
        ldq_u $16, 8($17) # L : Fetch next 8
        extql $3, $17, $3 # U : masking
        extqh $16, $17, $1 # U : masking
        bis $3, $1, $1 # E : merged bytes to store

        subq $18, 8, $18 # E : count -= 8
        addq $17, 8, $17 # E : src += 8
        stq $1, 0($4) # L : store 8 (aligned)
        mov $16, $3 # E : "rotate" source data

        addq $4, 8, $4 # E : dest += 8
        bge $18, $mis_quad # U : More quads to move
        nop
        nop

$misalign_tail:
        addq $18, 8, $18 # E : account for tail stuff
        ble $18, $nomoredata # U :
        nop
        nop

$misalign_byte:
        ldbu $1, 0($17) # L : fetch 1
        subq $18, 1, $18 # E : count--
        addq $17, 1, $17 # E : src++
        nop # E :

        stb $1, 0($4) # L : store
        addq $4, 1, $4 # E : dest++
        bgt $18, $misalign_byte # U : more to go?
        nop

$nomoredata:
        ret $31, ($26), 1 # L0 :
        nop # E :
        nop # E :
        nop # E :

        .end memcpy
        EXPORT_SYMBOL(memcpy)

__memcpy = memcpy
.globl __memcpy