0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016 #include <asm/asm.h>
0017 #include <asm/asm-offsets.h>
0018 #include <asm/export.h>
0019 #include <asm/regdef.h>
0020 /* Symbolic names for the three argument registers used throughout this file. */
0021 #define dst a0 /* destination pointer */
0022 #define src a1 /* source pointer */
0023 #define len a2 /* byte count; counted down as data is copied */
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070 /* EXC(inst_reg, addr, handler): emit the instruction "inst_reg, addr" at local */
0071 /* label 9 and add a (9b, handler) pair to the __ex_table section, then switch */
0072 /* back to the previous section.  The first argument carries mnemonic and register */
0073 /* together.  PTR_WD is defined elsewhere (presumably a pointer-sized word -- confirm). */
0074 #define EXC(inst_reg,addr,handler) \
0075 9: inst_reg, addr; \
0076 .section __ex_table,"a"; \
0077 PTR_WD 9b, handler; \
0078 .previous
0079
0080 /* Width-specific aliases: each generic operation name used by the copy code */
0081 /* maps to the doubleword MIPS instruction.  One "unit" is NBYTES = 8 bytes and */
0082 /* LOG_NBYTES = 3 is its base-2 logarithm (used as a shift count). */
0083
0084 #define LOAD ld
0085 #define LOADL ldl
0086 #define LOADR ldr
0087 #define STOREL sdl
0088 #define STORER sdr
0089 #define STORE sd
0090 #define ADD daddu
0091 #define SUB dsubu
0092 #define SRL dsrl
0093 #define SRA dsra
0094 #define SLL dsll
0095 #define SLLV dsllv
0096 #define SRLV dsrlv
0097 #define NBYTES 8
0098 #define LOG_NBYTES 3
0099 /* Discard any earlier definitions of t0-t3 and bind t0..t7 to the fixed */
0100 /* register numbers $8..$15 for the rest of this file. */
0101 /* NOTE(review): the motive is not visible here (presumably the included */
0102 /* <asm/regdef.h> names these registers differently) -- confirm. */
0103
0104
0105 #undef t0
0106 #undef t1
0107 #undef t2
0108 #undef t3
0109 #define t0 $8
0110 #define t1 $9
0111 #define t2 $10
0112 #define t3 $11
0113 #define t4 $12
0114 #define t5 $13
0115 #define t6 $14
0116 #define t7 $15
0117 /* Choose, per endianness, which left/right partial-doubleword instruction is used at FIRST(unit) and which at REST(unit). */
0118 #ifdef CONFIG_CPU_LITTLE_ENDIAN
0119 #define LDFIRST LOADR /* issued at FIRST(unit)(src) */
0120 #define LDREST LOADL /* issued at REST(unit)(src) */
0121 #define STFIRST STORER /* STFIRST, STREST and SHIFT_DISCARD have no users in the visible code */
0122 #define STREST STOREL
0123 #define SHIFT_DISCARD SLLV
0124 #else /* CONFIG_CPU_LITTLE_ENDIAN not defined: left/right roles are swapped */
0125 #define LDFIRST LOADL
0126 #define LDREST LOADR
0127 #define STFIRST STOREL
0128 #define STREST STORER
0129 #define SHIFT_DISCARD SRLV
0130 #endif /* CONFIG_CPU_LITTLE_ENDIAN */
0131 /* Byte offsets of unit number "unit" relative to a base pointer. */
0132 #define FIRST(unit) ((unit)*NBYTES) /* offset of the first byte of the unit */
0133 #define REST(unit) (FIRST(unit)+NBYTES-1) /* offset of the last byte of the unit */
0134 #define UNIT(unit) FIRST(unit) /* offset used by the whole-unit LOAD/STORE */
0135
0136 #define ADDRMASK (NBYTES-1) /* low address bits; a nonzero AND result means not NBYTES-aligned */
0137 /* Assembler mode for all code below. */
0138 .text
0139 .set noreorder /* the instruction written after each branch/jump is its delay slot */
0140 .set noat /* AT is used explicitly by this file (read in l_exc) */
0141
0142 /* memcpy, __raw_copy_from_user and __raw_copy_to_user: one shared copy body. */
0143 /* In:  dst (a0), src (a1), len (a2); len may be 0 and either pointer may be unaligned. */
0144 /* Out: memcpy sets v0 = dst; every normal exit leaves len == 0.  The fixup */
0145 /*      handlers named in the EXC() entries return with a leftover byte count in len. */
0146 /* Scratch: t0-t3, plus rem (t8) on the unaligned-source path. */
0147
0148 .align 5
0149 LEAF(memcpy)
0150 EXPORT_SYMBOL(memcpy)
0151 move v0, dst /* return value of memcpy; the entry labels below sit after this instruction */
0152 __memcpy:
0153 FEXPORT(__raw_copy_from_user)
0154 EXPORT_SYMBOL(__raw_copy_from_user)
0155 FEXPORT(__raw_copy_to_user)
0156 EXPORT_SYMBOL(__raw_copy_to_user)
0157 /* Dispatch on length and source alignment.  This file is assembled with */
0158 /* .set noreorder, so the instruction following each branch is its delay slot */
0159 /* and runs whether or not the branch is taken. */
0160
0161 #
0162 # Octeon doesn't care if the destination is unaligned. The hardware
0163 # can fix it faster than we can special case the assembly.
0164 #
0165 pref 0, 0(src)
0166 sltu t0, len, NBYTES # Check if < 1 word
0167 bnez t0, copy_bytes_checklen
0168 and t0, src, ADDRMASK # Check if src unaligned (delay slot)
0169 bnez t0, src_unaligned
0170 sltu t0, len, 4*NBYTES # Check if < 4 words (delay slot)
0171 bnez t0, less_than_4units
0172 sltu t0, len, 8*NBYTES # Check if < 8 words (delay slot)
0173 bnez t0, less_than_8units
0174 sltu t0, len, 16*NBYTES # Check if < 16 words (delay slot)
0175 bnez t0, cleanup_both_aligned
0176 sltu t0, len, 128+1 # Check if len < 129 (delay slot)
0177 bnez t0, 1f # Skip prefetch if len is too short
0178 sltu t0, len, 256+1 # Check if len < 257 (delay slot)
0179 bnez t0, 1f # Skip prefetch if len is too short
0180 pref 0, 128(src) # We must not prefetch invalid addresses (delay slot, reached only when len >= 129)
0181 #
0182 # This is where we loop if there are more than 256 bytes left
0183 2: pref 0, 256(src) # We must not prefetch invalid addresses
0184 #
0185 # This is where we loop if we can't prefetch anymore
0186 1:
0187 EXC( LOAD t0, UNIT(0)(src), l_exc) /* one iteration moves 16 units as four load-4/store-4 groups */
0188 EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
0189 EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
0190 EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
0191 SUB len, len, 16*NBYTES /* whole iteration charged up front; s_exc_p<N>u adds N units back on a store fault */
0192 EXC( STORE t0, UNIT(0)(dst), s_exc_p16u)
0193 EXC( STORE t1, UNIT(1)(dst), s_exc_p15u)
0194 EXC( STORE t2, UNIT(2)(dst), s_exc_p14u)
0195 EXC( STORE t3, UNIT(3)(dst), s_exc_p13u)
0196 EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
0197 EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
0198 EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
0199 EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
0200 EXC( STORE t0, UNIT(4)(dst), s_exc_p12u)
0201 EXC( STORE t1, UNIT(5)(dst), s_exc_p11u)
0202 EXC( STORE t2, UNIT(6)(dst), s_exc_p10u)
0203 ADD src, src, 16*NBYTES /* from here on the loads use negative UNIT() offsets */
0204 EXC( STORE t3, UNIT(7)(dst), s_exc_p9u)
0205 ADD dst, dst, 16*NBYTES /* from here on the stores use negative UNIT() offsets */
0206 EXC( LOAD t0, UNIT(-8)(src), l_exc_copy_rewind16) /* src and dst are already advanced, so this handler rewinds both first */
0207 EXC( LOAD t1, UNIT(-7)(src), l_exc_copy_rewind16)
0208 EXC( LOAD t2, UNIT(-6)(src), l_exc_copy_rewind16)
0209 EXC( LOAD t3, UNIT(-5)(src), l_exc_copy_rewind16)
0210 EXC( STORE t0, UNIT(-8)(dst), s_exc_p8u)
0211 EXC( STORE t1, UNIT(-7)(dst), s_exc_p7u)
0212 EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
0213 EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
0214 EXC( LOAD t0, UNIT(-4)(src), l_exc_copy_rewind16)
0215 EXC( LOAD t1, UNIT(-3)(src), l_exc_copy_rewind16)
0216 EXC( LOAD t2, UNIT(-2)(src), l_exc_copy_rewind16)
0217 EXC( LOAD t3, UNIT(-1)(src), l_exc_copy_rewind16)
0218 EXC( STORE t0, UNIT(-4)(dst), s_exc_p4u)
0219 EXC( STORE t1, UNIT(-3)(dst), s_exc_p3u)
0220 EXC( STORE t2, UNIT(-2)(dst), s_exc_p2u)
0221 EXC( STORE t3, UNIT(-1)(dst), s_exc_p1u)
0222 sltu t0, len, 256+1 # See if we can prefetch more
0223 beqz t0, 2b
0224 sltu t0, len, 128 # See if we can loop more time (delay slot)
0225 beqz t0, 1b
0226 nop
0227 #
0228 # Jump here if there are less than 16*NBYTES left.
0229 # Copies one 8-unit group when at least 8*NBYTES remain, then falls through.
0230 cleanup_both_aligned:
0231 beqz len, done
0232 sltu t0, len, 8*NBYTES /* delay slot */
0233 bnez t0, less_than_8units
0234 nop
0235 EXC( LOAD t0, UNIT(0)(src), l_exc)
0236 EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
0237 EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
0238 EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
0239 SUB len, len, 8*NBYTES /* charged up front; s_exc_p<N>u adds N units back on a store fault */
0240 EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
0241 EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
0242 EXC( STORE t2, UNIT(2)(dst), s_exc_p6u)
0243 EXC( STORE t3, UNIT(3)(dst), s_exc_p5u)
0244 EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
0245 EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
0246 EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
0247 EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
0248 EXC( STORE t0, UNIT(4)(dst), s_exc_p4u)
0249 EXC( STORE t1, UNIT(5)(dst), s_exc_p3u)
0250 EXC( STORE t2, UNIT(6)(dst), s_exc_p2u)
0251 EXC( STORE t3, UNIT(7)(dst), s_exc_p1u)
0252 ADD src, src, 8*NBYTES
0253 beqz len, done
0254 ADD dst, dst, 8*NBYTES /* delay slot */
0255 #
0256 # Jump here if there are less than 8*NBYTES left.
0257 # Copies one 4-unit group when at least 4*NBYTES remain, then falls through.
0258 less_than_8units:
0259 sltu t0, len, 4*NBYTES
0260 bnez t0, less_than_4units
0261 nop
0262 EXC( LOAD t0, UNIT(0)(src), l_exc)
0263 EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
0264 EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
0265 EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
0266 SUB len, len, 4*NBYTES /* charged up front; s_exc_p<N>u adds N units back on a store fault */
0267 EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
0268 EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
0269 EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
0270 EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
0271 ADD src, src, 4*NBYTES
0272 beqz len, done
0273 ADD dst, dst, 4*NBYTES /* delay slot */
0274 #
0275 # Jump here if there are less than 4*NBYTES left. This means
0276 # we may need to copy up to 3 NBYTES words.
0277 #
0278 less_than_4units:
0279 sltu t0, len, 1*NBYTES
0280 bnez t0, copy_bytes_checklen
0281 nop
0282 #
0283 # 1) Copy NBYTES, then check length again
0284 #
0285 EXC( LOAD t0, 0(src), l_exc)
0286 SUB len, len, NBYTES
0287 sltu t1, len, 8 /* literal 8 equals NBYTES: is less than one unit left? */
0288 EXC( STORE t0, 0(dst), s_exc_p1u)
0289 ADD src, src, NBYTES
0290 bnez t1, copy_bytes_checklen
0291 ADD dst, dst, NBYTES /* delay slot */
0292 #
0293 # 2) Copy NBYTES, then check length again
0294 #
0295 EXC( LOAD t0, 0(src), l_exc)
0296 SUB len, len, NBYTES
0297 sltu t1, len, 8 /* literal 8 equals NBYTES */
0298 EXC( STORE t0, 0(dst), s_exc_p1u)
0299 ADD src, src, NBYTES
0300 bnez t1, copy_bytes_checklen
0301 ADD dst, dst, NBYTES /* delay slot */
0302 #
0303 # 3) Copy NBYTES, then go to the byte tail (fewer than NBYTES can remain)
0304 #
0305 EXC( LOAD t0, 0(src), l_exc)
0306 SUB len, len, NBYTES
0307 ADD src, src, NBYTES
0308 ADD dst, dst, NBYTES
0309 b copy_bytes_checklen
0310 EXC( STORE t0, -8(dst), s_exc_p1u) /* delay slot; dst is already advanced, hence offset -8 (= -NBYTES) */
0311
0312 src_unaligned: /* reached when src is not NBYTES-aligned and len >= NBYTES */
0313 #define rem t8 /* length at which the current loop must stop */
0314 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
0315 beqz t0, cleanup_src_unaligned
0316 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES (delay slot)
0317 1:
0318 /* Each unit is assembled from an LDFIRST/LDREST pair at FIRST(n)/REST(n). */
0319 /* The LDFIRSTs of two units are issued together and then their LDRESTs, so */
0320 /* the two partial loads into one register are never back to back. */
0321 /* NOTE(review): the reason (presumably instruction issue/pipeline */
0322 /* behaviour) is not visible in this file -- confirm. */
0323 /* The stores are whole-unit STOREs even though dst may be unaligned. */
0324 EXC( LDFIRST t0, FIRST(0)(src), l_exc)
0325 EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
0326 SUB len, len, 4*NBYTES /* charged up front; s_exc_p<N>u adds N units back on a store fault */
0327 EXC( LDREST t0, REST(0)(src), l_exc_copy)
0328 EXC( LDREST t1, REST(1)(src), l_exc_copy)
0329 EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
0330 EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
0331 EXC( LDREST t2, REST(2)(src), l_exc_copy)
0332 EXC( LDREST t3, REST(3)(src), l_exc_copy)
0333 ADD src, src, 4*NBYTES
0334 EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
0335 EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
0336 EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
0337 EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
0338 bne len, rem, 1b /* repeat until only the sub-4-unit remainder is left */
0339 ADD dst, dst, 4*NBYTES /* delay slot */
0340
0341 cleanup_src_unaligned: /* unaligned source, fewer than 4 units left */
0342 beqz len, done
0343 and rem, len, NBYTES-1 # rem = len % NBYTES (delay slot)
0344 beq rem, len, copy_bytes /* len < NBYTES and nonzero: only the byte tail remains */
0345 nop
0346 1:
0347 EXC( LDFIRST t0, FIRST(0)(src), l_exc)
0348 EXC( LDREST t0, REST(0)(src), l_exc_copy)
0349 SUB len, len, NBYTES
0350 EXC( STORE t0, 0(dst), s_exc_p1u)
0351 ADD src, src, NBYTES
0352 bne len, rem, 1b /* one unit per pass until len == rem */
0353 ADD dst, dst, NBYTES /* delay slot */
0354
0355 copy_bytes_checklen: /* byte tail, entered with len < NBYTES (possibly 0) */
0356 beqz len, done
0357 nop
0358 copy_bytes:
0359 /* 0 < len < NBYTES here.  COPY_BYTE(N) loads byte N, decrements len, and stores the byte in the delay slot of the exit branch. */
0360 #define COPY_BYTE(N) \
0361 EXC( lb t0, N(src), l_exc); \
0362 SUB len, len, 1; \
0363 beqz len, done; \
0364 EXC( sb t0, N(dst), s_exc_p1)
0365 /* The store executes whether or not the branch to done is taken. */
0366 COPY_BYTE(0)
0367 COPY_BYTE(1)
0368 COPY_BYTE(2)
0369 COPY_BYTE(3)
0370 COPY_BYTE(4)
0371 COPY_BYTE(5)
0372 EXC( lb t0, NBYTES-2(src), l_exc) /* seventh and last possible byte: no length test needed */
0373 SUB len, len, 1
0374 jr ra
0375 EXC( sb t0, NBYTES-2(dst), s_exc_p1) /* delay slot of the return */
0376 done:
0377 jr ra
0378 nop
0379 END(memcpy)
0380
0381 l_exc_copy_rewind16:
0382 /* Used by loads issued after src/dst were advanced: undo the 16*NBYTES step, then fall into l_exc_copy. */
0383 SUB src, src, 16*NBYTES
0384 SUB dst, dst, 16*NBYTES
0385 l_exc_copy:
0386 /* Fixup for a faulting load that is not the first of its group: salvage */
0387 /* the readable bytes.  t0 is loaded from the THREAD_BUADDR field of the */
0388 /* structure found through TI_TASK($28); bytes are then copied one at a */
0389 /* time until src reaches t0, after which control falls into l_exc. */
0390 /* The loop tests src against t0 only after incrementing, so it relies on */
0391 /* src < t0 at entry; the lb has its own EXC() entry and leaves for l_exc */
0392 /* if it faults before that point. */
0393 /* NOTE(review): THREAD_BUADDR is presumably the faulting address recorded */
0394 /* by the fault handler, and $28 the current thread_info pointer -- confirm. */
0395
0396 LOAD t0, TI_TASK($28)
0397 LOAD t0, THREAD_BUADDR(t0)
0398 1:
0399 EXC( lb t1, 0(src), l_exc)
0400 ADD src, src, 1
0401 sb t1, 0(dst) # can't fault -- we're copy_from_user
0402 bne src, t0, 1b
0403 ADD dst, dst, 1 /* delay slot */
0404 l_exc: /* load-fault exit: report the number of bytes not copied in len */
0405 LOAD t0, TI_TASK($28)
0406 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
0407 SUB len, AT, t0 # len number of uncopied bytes; AT is never written in this file, presumably the caller sets it to src + len -- confirm
0408 jr ra
0409 nop
0410 /* SEXC(n) defines the store-fault handler s_exc_p<n>u: it returns through ra */
0411 /* with n*NBYTES added back to len (the ADD sits in the delay slot of the jr). */
0412 #define SEXC(n) \
0413 s_exc_p ## n ## u: \
0414 jr ra; \
0415 ADD len, len, n*NBYTES
0416
0417 SEXC(16)
0418 SEXC(15)
0419 SEXC(14)
0420 SEXC(13)
0421 SEXC(12)
0422 SEXC(11)
0423 SEXC(10)
0424 SEXC(9)
0425 SEXC(8)
0426 SEXC(7)
0427 SEXC(6)
0428 SEXC(5)
0429 SEXC(4)
0430 SEXC(3)
0431 SEXC(2)
0432 SEXC(1)
0433 /* Store-fault handlers for the byte tail. */
0434 s_exc_p1: /* a byte store faulted after len was decremented: add the byte back */
0435 jr ra
0436 ADD len, len, 1 /* delay slot */
0437 s_exc: /* returns with len unchanged; no EXC() entry in the visible code names it */
0438 jr ra
0439 nop
0440 /* memmove: a0 = dst, a1 = src, a2 = len; v0 is set to the incoming dst.  Non-overlapping ranges go to __memcpy, overlapping ones fall through to __rmemcpy. */
0441 .align 5
0442 LEAF(memmove)
0443 EXPORT_SYMBOL(memmove)
0444 ADD t0, a0, a2 /* t0 = dst + len */
0445 ADD t1, a1, a2 /* t1 = src + len */
0446 sltu t0, a1, t0 # t0 = (src < dst + len); 0 means dst + len <= src -> memcpy
0447 sltu t1, a0, t1 # t1 = (dst < src + len); 0 means dst >= src + len -> memcpy
0448 and t0, t1 /* nonzero only when both tests indicate overlap */
0449 beqz t0, __memcpy /* no overlap: enter the memcpy body after its own v0 setup */
0450 move v0, a0 /* delay slot: return value, set on both paths */
0451 beqz a2, r_out /* len == 0: return; the delay slot is the first instruction of __rmemcpy */
0452 END(memmove)
0453 /* __rmemcpy: byte-at-a-time copy for overlapping ranges; memmove falls through into it. */
0454 /* a0 = dst, a1 = src, a2 = len.  Both loops decrement before testing, so len must be nonzero here. */
0455 LEAF(__rmemcpy)
0456 sltu t0, a1, a0 /* t0 = (src < dst) */
0457 beqz t0, r_end_bytes_up # src >= dst
0458 nop
0459 ADD a0, a2 # dst = dst + len
0460 ADD a1, a2 # src = src + len
0461 /* src < dst: copy from the last byte down to the first. */
0462 r_end_bytes:
0463 lb t0, -1(a1)
0464 SUB a2, a2, 0x1
0465 sb t0, -1(a0)
0466 SUB a1, a1, 0x1
0467 bnez a2, r_end_bytes
0468 SUB a0, a0, 0x1 /* delay slot */
0469
0470 r_out:
0471 jr ra
0472 move a2, zero /* delay slot: return with a2 = 0 */
0473 /* src >= dst: copy from the first byte upward. */
0474 r_end_bytes_up:
0475 lb t0, (a1)
0476 SUB a2, a2, 0x1
0477 sb t0, (a0)
0478 ADD a1, a1, 0x1
0479 bnez a2, r_end_bytes_up
0480 ADD a0, a0, 0x1 /* delay slot */
0481
0482 jr ra
0483 move a2, zero /* delay slot: return with a2 = 0 */
0484 END(__rmemcpy)