0001
0002
0003
0004
0005
0006 /* Build selection: the spare global register and the %asi value written back by RESTORE_ASI after the block copy. */
0007 #ifdef __KERNEL__
0008 #include <linux/linkage.h>
0009 #include <asm/asi.h>
0010 #include <asm/thread_info.h>
0011 #define GLOBAL_SPARE %g7
0012 #define RESTORE_ASI(TMP) \
0013 wr %g0, ASI_AIUS, %asi
0014 #else /* !__KERNEL__: the TMP argument of RESTORE_ASI is unused in both variants */
0015 #define GLOBAL_SPARE %g5
0016 #define RESTORE_ASI(TMP) \
0017 wr %g0, ASI_PNF, %asi
0018 #endif /* __KERNEL__ */
0019 /* SAVE_AMOUNT: stack frame size handed to the save instruction at function entry. */
0020 #ifdef __sparc_v9__
0021 #define SAVE_AMOUNT 128
0022 #else
0023 #define SAVE_AMOUNT 64
0024 #endif
0025 /* STORE_ASI: value written to %asi before the 64-byte block loops; may be predefined by whoever includes this file. */
0026 #ifndef STORE_ASI
0027 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
0028 #endif
0029 /* EX_LD(x,y): wrapper around a load x that also names a label y; this default emits x and drops y. */
0030 #ifndef EX_LD
0031 #define EX_LD(x,y) x
0032 #endif
0033 /* EX_ST(x,y): the same wrapper for stores; this default emits x and drops y. */
0034 #ifndef EX_ST
0035 #define EX_ST(x,y) x
0036 #endif
0037 /* LOAD(type,addr,dest): one load of the given type; with MEMCPY_DEBUG it becomes the alternate-space form with ASI 0x80. */
0038 #ifndef LOAD
0039 #ifndef MEMCPY_DEBUG
0040 #define LOAD(type,addr,dest) type [addr], dest
0041 #else
0042 #define LOAD(type,addr,dest) type##a [addr] 0x80, dest
0043 #endif
0044 #endif
0045 /* LOAD_TWIN: ldda through ASI_BLK_INIT_QUAD_LDD_P into dest0; dest1 does not appear in the expansion. */
0046 #ifndef LOAD_TWIN
0047 #define LOAD_TWIN(addr_reg,dest0,dest1) \
0048 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
0049 #endif
0050 /* STORE(type,src,addr): one plain store of the given type. */
0051 #ifndef STORE
0052 #define STORE(type,src,addr) type src, [addr]
0053 #endif
0054 /* STORE_INIT(src,addr): stxa through %asi, or a plain stx when SIMULATE_NIAGARA_ON_NON_NIAGARA is defined. */
0055 #ifndef STORE_INIT
0056 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
0057 #define STORE_INIT(src,addr) stxa src, [addr] %asi
0058 #else
0059 #define STORE_INIT(src,addr) stx src, [addr + 0x00]
0060 #endif
0061 #endif
0062 /* FUNC_NAME: symbol under which the copy routine is emitted; NGmemcpy unless predefined. */
0063 #ifndef FUNC_NAME
0064 #define FUNC_NAME NGmemcpy
0065 #endif
0066 /* PREAMBLE: optional text placed at function entry, ahead of the save; empty by default. */
0067 #ifndef PREAMBLE
0068 #define PREAMBLE
0069 #endif
0070 /* XCC: condition-code register name substituted into the branches written as %XCC; xcc by default. */
0071 #ifndef XCC
0072 #define XCC xcc
0073 #endif
0074
0075 .register %g2,#scratch
0076 .register %g3,#scratch
0077
0078 .text
0079 #ifndef EX_RETVAL
0080 #define EX_RETVAL(x) x
0081 __restore_asi: /* common tail of the NG_ret_* stubs: reset %asi, pop the register window, return with the stub's %i0 */
0082 wr %g0, ASI_AIUS, %asi /* has to precede the ret: the ret delay slot holds exactly one instruction and that must be the restore */
0083 ret
0084 restore /* delay slot of ret: pops the window, so the stub's %i0 becomes the caller's %o0 */
0085 ENTRY(NG_ret_i2_plus_i4_plus_1) /* EX_LD/EX_ST label of the dst-alignment byte loop: %i0 = %i2 + %i4 + 1 */
add %i4, 1, %i4 /* that loop decrements %i4 before touching memory, so add the current byte back (%i5 is not set up yet there) */
0086 ba,pt %xcc, __restore_asi
0087 add %i2, %i4, %i0 /* branch delay slot: remaining length plus outstanding alignment bytes */
0088 ENDPROC(NG_ret_i2_plus_i4_plus_1)
0089 ENTRY(NG_ret_i2_plus_g1) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 */
0090 ba,pt %xcc, __restore_asi
0091 add %i2, %g1, %i0 /* branch delay slot */
0092 ENDPROC(NG_ret_i2_plus_g1)
0093 ENTRY(NG_ret_i2_plus_g1_minus_8) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 8 */
0094 sub %g1, 8, %g1 /* adjusts %g1 in place */
0095 ba,pt %xcc, __restore_asi
0096 add %i2, %g1, %i0 /* branch delay slot */
0097 ENDPROC(NG_ret_i2_plus_g1_minus_8)
0098 ENTRY(NG_ret_i2_plus_g1_minus_16) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 16 */
0099 sub %g1, 16, %g1 /* adjusts %g1 in place */
0100 ba,pt %xcc, __restore_asi
0101 add %i2, %g1, %i0 /* branch delay slot */
0102 ENDPROC(NG_ret_i2_plus_g1_minus_16)
0103 ENTRY(NG_ret_i2_plus_g1_minus_24) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 24 */
0104 sub %g1, 24, %g1 /* adjusts %g1 in place */
0105 ba,pt %xcc, __restore_asi
0106 add %i2, %g1, %i0 /* branch delay slot */
0107 ENDPROC(NG_ret_i2_plus_g1_minus_24)
0108 ENTRY(NG_ret_i2_plus_g1_minus_32) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 32 */
0109 sub %g1, 32, %g1 /* adjusts %g1 in place */
0110 ba,pt %xcc, __restore_asi
0111 add %i2, %g1, %i0 /* branch delay slot */
0112 ENDPROC(NG_ret_i2_plus_g1_minus_32)
0113 ENTRY(NG_ret_i2_plus_g1_minus_40) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 40 */
0114 sub %g1, 40, %g1 /* adjusts %g1 in place */
0115 ba,pt %xcc, __restore_asi
0116 add %i2, %g1, %i0 /* branch delay slot */
0117 ENDPROC(NG_ret_i2_plus_g1_minus_40)
0118 ENTRY(NG_ret_i2_plus_g1_minus_48) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 48 */
0119 sub %g1, 48, %g1 /* adjusts %g1 in place */
0120 ba,pt %xcc, __restore_asi
0121 add %i2, %g1, %i0 /* branch delay slot */
0122 ENDPROC(NG_ret_i2_plus_g1_minus_48)
0123 ENTRY(NG_ret_i2_plus_g1_minus_56) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 - 56 */
0124 sub %g1, 56, %g1 /* adjusts %g1 in place */
0125 ba,pt %xcc, __restore_asi
0126 add %i2, %g1, %i0 /* branch delay slot */
0127 ENDPROC(NG_ret_i2_plus_g1_minus_56)
0128 ENTRY(NG_ret_i2_plus_i4) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %i4 */
0129 ba,pt %xcc, __restore_asi
0130 add %i2, %i4, %i0 /* branch delay slot */
0131 ENDPROC(NG_ret_i2_plus_i4)
0132 ENTRY(NG_ret_i2_plus_i4_minus_8) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %i4 - 8 */
0133 sub %i4, 8, %i4 /* adjusts %i4 in place */
0134 ba,pt %xcc, __restore_asi
0135 add %i2, %i4, %i0 /* branch delay slot */
0136 ENDPROC(NG_ret_i2_plus_i4_minus_8)
0137 ENTRY(NG_ret_i2_plus_8) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + 8 */
0138 ba,pt %xcc, __restore_asi
0139 add %i2, 8, %i0 /* branch delay slot */
0140 ENDPROC(NG_ret_i2_plus_8)
0141 ENTRY(NG_ret_i2_plus_4) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + 4 */
0142 ba,pt %xcc, __restore_asi
0143 add %i2, 4, %i0 /* branch delay slot */
0144 ENDPROC(NG_ret_i2_plus_4)
0145 ENTRY(NG_ret_i2_plus_1) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + 1 */
0146 ba,pt %xcc, __restore_asi
0147 add %i2, 1, %i0 /* branch delay slot */
0148 ENDPROC(NG_ret_i2_plus_1)
0149 ENTRY(NG_ret_i2_plus_g1_plus_1) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 + %g1 + 1 */
0150 add %g1, 1, %g1 /* adjusts %g1 in place */
0151 ba,pt %xcc, __restore_asi
0152 add %i2, %g1, %i0 /* branch delay slot */
0153 ENDPROC(NG_ret_i2_plus_g1_plus_1)
0154 ENTRY(NG_ret_i2) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = %i2 */
0155 ba,pt %xcc, __restore_asi
0156 mov %i2, %i0 /* branch delay slot */
0157 ENDPROC(NG_ret_i2)
0158 ENTRY(NG_ret_i2_and_7_plus_i4) /* label named in EX_LD/EX_ST annotations (presumably the fault path - confirm): %i0 = (%i2 & 7) + %i4 */
0159 and %i2, 7, %i2 /* masks %i2 in place */
0160 ba,pt %xcc, __restore_asi
0161 add %i2, %i4, %i0 /* branch delay slot */
0162 ENDPROC(NG_ret_i2_and_7_plus_i4)
0163 #endif
0164 /* FUNC_NAME (NGmemcpy by default): copy len bytes from src to dst.  After the save: %i0 = dst, %i1 = src, %i2 = len. */
0165 .align 64
0166 /* Returns EX_RETVAL(%i0) in the caller's %o0; with the identity EX_RETVAL that is the dst argument. */
0167 .globl FUNC_NAME
0168 .type FUNC_NAME,#function
0169 FUNC_NAME:
0170 PREAMBLE
0171 save %sp, -SAVE_AMOUNT, %sp
0172 srlx %i2, 31, %g2 /* %g2 = len >> 31 */
0173 cmp %g2, 0
0174 tne %xcc, 5 /* software trap 5 when any of len bits 31..63 is set */
0175 mov %i0, %o0 /* %o0 = running dst pointer; %i0 is left alone for the return value */
0176 cmp %i2, 0
0177 be,pn %XCC, 85f /* len == 0: nothing to copy */
0178 or %o0, %i1, %i3 /* delay slot: %i3 = dst | src, used as alignment probe */
0179 cmp %i2, 16
0180 blu,a,pn %XCC, 80f /* len < 16: small-copy path */
0181 or %i3, %i2, %i3 /* annulled delay slot, executed only when the branch is taken: %i3 |= len */
0182
0183
0184
0185
0186
0187
0188
0189 /* len >= 16 from here.  Below 2 * 64 bytes the block-store path is skipped. */
0190 cmp %i2, (2 * 64)
0191 blu,pt %XCC, 70f
0192 andcc %i3, 0x7, %g0 /* delay slot: flags = (dst | src) & 7, consumed by the bne at 70: */
0193
0194
0195
0196
0197
0198
0199
0200
0201 /* Block path (len >= 128): select STORE_ASI, then byte-copy until dst is 64-byte aligned. */
0202 LOAD(prefetch, %i1, #one_read)
0203 wr %g0, STORE_ASI, %asi
0204
0205
0206 andcc %o0, (64 - 1), %i4
0207 be,pt %XCC, 2f /* dst already 64-byte aligned */
0208 sub %i4, 64, %i4
0209 sub %g0, %i4, %i4 ! bytes to align dst
0210 sub %i2, %i4, %i2 /* len no longer includes the alignment bytes */
0211 1: subcc %i4, 1, %i4 /* decremented before the byte moves; the annotation label adds the 1 back */
0212 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
0213 EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
0214 add %i1, 1, %i1
0215 bne,pt %XCC, 1b
0216 add %o0, 1, %o0
0217
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231 /* dst is 64-byte aligned.  Split len into whole 64-byte blocks (%g1) and a tail (%i2), then dispatch on src alignment. */
0232 2:
0233 andcc %i1, (16 - 1), %i4 /* %i4 = src offset inside its 16-byte unit */
0234 andn %i2, (64 - 1), %g1 ! block copy loop iterator
0235 be,pt %XCC, 50f /* offset 0: straight twin-load loop */
0236 sub %i2, %g1, %i2 ! final sub-block copy bytes
0237
0238 cmp %i4, 8
0239 be,pt %XCC, 10f /* offset exactly 8: no shifting required */
0240 sub %i1, %i4, %i1 /* delay slot: round src down to a 16-byte boundary */
0241
0242 /* Any other offset: output words are merged from neighbouring input words with shifts. */
0243 and %i4, 0x7, GLOBAL_SPARE
0244 sll GLOBAL_SPARE, 3, GLOBAL_SPARE /* left shift count = (offset & 7) * 8 bits */
0245 mov 64, %i5
0246 EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
0247 sub %i5, GLOBAL_SPARE, %i5 /* right shift count = 64 - left shift count */
0248 mov 16, %o4
0249 mov 32, %o5
0250 mov 48, %o7
0251 mov 64, %i3
0252
0253 bg,pn %XCC, 9f /* flags still come from the cmp %i4, 8 above: offset > 8 */
0254 nop
0255 /* MIX_THREE_WORDS: WORD1 = WORD1 << POST | WORD2 >> PRE; WORD2 = WORD2 << POST | WORD3 >> PRE; TMP is clobbered. */
0256 #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
0257 sllx WORD1, POST_SHIFT, WORD1; \
0258 srlx WORD2, PRE_SHIFT, TMP; \
0259 sllx WORD2, POST_SHIFT, WORD2; \
0260 or WORD1, TMP, WORD1; \
0261 srlx WORD3, PRE_SHIFT, TMP; \
0262 or WORD2, TMP, WORD2;
0263 /* Offset 1..7: every 64-byte block is assembled starting from %g2; %g1 drops by 64 only after the last store. */
0264 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
0265 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
0266 LOAD(prefetch, %i1 + %i3, #one_read)
0267
0268 EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
0269 EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0270
0271 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
0272 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
0273
0274 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0275 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0276
0277 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0278 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
0279
0280 EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0281 EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0282
0283 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
0284 add %i1, 64, %i1
0285 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
0286
0287 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0288 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0289
0290 subcc %g1, 64, %g1
0291 bne,pt %XCC, 8b
0292 add %o0, 64, %o0
0293
0294 ba,pt %XCC, 60f
0295 add %i1, %i4, %i1 /* delay slot: undo the round-down so src is the true position again */
0296 /* Offset 9..15: same scheme, but the first useful input word is %g3. */
0297 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
0298 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
0299 LOAD(prefetch, %i1 + %i3, #one_read)
0300
0301 EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
0302 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0303
0304 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
0305 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
0306
0307 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0308 EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0309
0310 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0311 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
0312
0313 EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0314 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0315
0316 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
0317 add %i1, 64, %i1
0318 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
0319
0320 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0321 EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0322
0323 subcc %g1, 64, %g1
0324 bne,pt %XCC, 9b
0325 add %o0, 64, %o0
0326
0327 ba,pt %XCC, 60f
0328 add %i1, %i4, %i1 /* delay slot: undo the round-down of src */
0329
0330 10:
0331
0332
0333
0334 /* Offset exactly 8: whole words are stored unshifted, starting with the second word of the first twin load. */
0335 EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
0336 mov 16, %o7
0337 mov 32, %g2
0338 mov 48, %g3
0339 mov 64, %o1
0340 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
0341 LOAD(prefetch, %i1 + %o1, #one_read)
0342 EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line
0343 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0344 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
0345 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0346 EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0347 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0348 EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0349 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0350 EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
0351 add %i1, 64, %i1
0352 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0353 EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0354 subcc %g1, 64, %g1
0355 bne,pt %XCC, 1b
0356 add %o0, 64, %o0
0357
0358 ba,pt %XCC, 60f
0359 add %i1, 0x8, %i1 /* delay slot: put the 8-byte offset back onto src */
0360
0361 50:
0362
0363 /* src 16-byte aligned: four twin loads and eight stores per 64-byte block, no merging. */
0364 mov 16, %o7
0365 mov 32, %g2
0366 mov 48, %g3
0367 mov 64, %o1
0368 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
0369 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
0370 LOAD(prefetch, %i1 + %o1, #one_read)
0371 EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line
0372 EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
0373 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
0374 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
0375 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
0376 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
0377 add %i1, 64, %i1
0378 EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
0379 EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
0380 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
0381 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
0382 subcc %g1, 64, %g1
0383 bne,pt %XCC, 1b
0384 add %o0, 64, %o0
0385
0386 /* All block loops meet here: synchronize, put %asi back, then finish the tail (%i2 bytes) in the byte loop. */
0387 60:
0388 membar #Sync
0389
0390
0391
0392
0393 RESTORE_ASI(%i3)
0394 brz,pt %i2, 85f /* no tail bytes: done */
0395 sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src, so [%i1 + %i3] addresses dst */
0396 ba,a,pt %XCC, 90f
0397 nop
0398
0399 .align 64
0400 70: /* 16 <= len < 128; flags hold (dst | src) & 7 from the andcc in the delay slot of the branch here */
0401 bne,pn %XCC, 75f /* dst or src not 8-byte aligned */
0402 sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src */
0403
0404 72: /* src and dst both 8-byte aligned, at least 16 bytes left: copy 16 bytes per iteration */
0405 andn %i2, 0xf, %i4 /* %i4 = bytes handled by this loop (multiple of 16) */
0406 and %i2, 0xf, %i2 /* %i2 = leftover 0..15 bytes */
0407 1: EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
0408 add %i1, 0x08, %i1
0409 EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
0410 sub %i1, 0x08, %i1
0411 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
0412 add %i1, 0x8, %i1
0413 EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
0414 subcc %i4, 0x10, %i4 /* decrement only after both stores, so %i4 still counts this chunk whenever an annotated access is reached; the add/sub above leave the flags alone */
0415 bgu,pt %XCC, 1b
0416 add %i1, 0x8, %i1
0417 73: andcc %i2, 0x8, %g0 /* leftover: one 8-byte word, then one 4-byte word, then single bytes */
0418 be,pt %XCC, 1f
0419 nop
0420 sub %i2, 0x8, %i2
0421 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
0422 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
0423 add %i1, 0x8, %i1
0424 1: andcc %i2, 0x4, %g0
0425 be,pt %XCC, 1f
0426 nop
0427 sub %i2, 0x4, %i2
0428 EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
0429 EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
0430 add %i1, 0x4, %i1
0431 1: cmp %i2, 0
0432 be,pt %XCC, 85f
0433 nop
0434 ba,pt %xcc, 90f
0435 nop
0436
0437 75: /* medium copy with a misaligned pointer: first bring dst to an 8-byte boundary one byte at a time */
0438 andcc %o0, 0x7, %g1
0439 sub %g1, 0x8, %g1
0440 be,pn %icc, 2f /* flags from the andcc: dst already aligned */
0441 sub %g0, %g1, %g1 /* delay slot: %g1 = 8 - (dst & 7) */
0442 sub %i2, %g1, %i2
0443
0444 1: subcc %g1, 1, %g1
0445 EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
0446 EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
0447 bgu,pt %icc, 1b
0448 add %i1, 1, %i1
0449
0450 2: add %i1, %i3, %o0 /* recompute the dst pointer from src + (dst - src) */
0451 andcc %i1, 0x7, %g1
0452 bne,pt %icc, 8f /* src still misaligned: shifted-merge loop */
0453 sll %g1, 3, %g1 /* delay slot: %g1 = (src & 7) * 8, the left shift count */
0454
0455 cmp %i2, 16
0456 bgeu,pt %icc, 72b
0457 nop
0458 ba,a,pt %xcc, 73b
0459
0460 8: mov 64, %i3
0461 andn %i1, 0x7, %i1 /* round src down to 8 bytes */
0462 EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
0463 sub %i3, %g1, %i3 /* %i3 = 64 - left shift = right shift count */
0464 andn %i2, 0x7, %i4 /* %i4 = bytes handled by the loop (multiple of 8) */
0465 sllx %g2, %g1, %g2 /* %g2 = high part of the first output word */
0466 1: add %i1, 0x8, %i1
0467 EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
0468 srlx %g3, %i3, %i5
0469 or %i5, %g2, %i5
0470 EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
0471 subcc %i4, 0x8, %i4 /* decrement only after the store, so %i4 still counts this word at both annotated accesses */
0472 add %o0, 0x8, %o0
0473 bgu,pt %icc, 1b
0474 sllx %g3, %g1, %g2 /* delay slot: carry the low part into the next output word */
0475
0476 srl %g1, 3, %g1 /* back from a bit count to the byte offset */
0477 andcc %i2, 0x7, %i2
0478 be,pn %icc, 85f
0479 add %i1, %g1, %i1 /* delay slot: src returns to its true, unrounded position */
0480 ba,pt %xcc, 90f
0481 sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src for the byte loop */
0482
0483 .align 64
0484 80: /* len < 16; %i3 = dst | src | len */
0485 andcc %i3, 0x3, %g0
0486 bne,pn %XCC, 90f /* something is not a multiple of 4: byte loop */
0487 sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src */
0488
0489 1:
0490 subcc %i2, 4, %i2
0491 EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
0492 EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
0493 bgu,pt %XCC, 1b
0494 add %i1, 4, %i1
0495
0496 85: ret
0497 restore EX_RETVAL(%i0), %g0, %o0
0498
0499 .align 32
0500 90: /* byte-at-a-time loop; expects %i2 > 0 and %i3 = dst - src */
0501 subcc %i2, 1, %i2
0502 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
0503 EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
0504 bgu,pt %XCC, 90b
0505 add %i1, 1, %i1
0506 ret
0507 restore EX_RETVAL(%i0), %g0, %o0
0508
0509 .size FUNC_NAME, .-FUNC_NAME