#define CHIP_NAVI10 26
#define CHIP_SIENNA_CICHLID 30
#define CHIP_PLUM_BONITO 36

#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)

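// Worked example of the feature gates above (a reading aid, derived purely
// from the #defines): for ASIC_FAMILY == CHIP_NAVI10 (26) they evaluate to
// NO_SQC_STORE=0, HAVE_XNACK=1, HAVE_SENDMSG_RTN=0, HAVE_BUFFER_LDS_LOAD=1;
// for CHIP_PLUM_BONITO (36) to NO_SQC_STORE=1, HAVE_XNACK=0,
// HAVE_SENDMSG_RTN=1, HAVE_BUFFER_LDS_LOAD=0.
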
var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000
var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12
#endif

var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000

var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31
var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000
var S_SAVE_PC_HI_HT_MASK = 0x01000000
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31

var s_sgpr_save_num = 108

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp12
var s_save_trapsts = ttmp15
var s_save_xnack_mask = s_save_trapsts
var s_wave_size = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_trapsts
var s_save_tmp = ttmp14
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_trapsts

var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_flat_scratch = s_restore_tmp
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_status = ttmp14
var s_restore_trapsts = ttmp15
var s_restore_xnack_mask = ttmp13
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
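
// Note on aliasing (a reading aid, inferred from the assignments above):
// several symbolic names share one physical register, e.g. s_save_trapsts,
// s_save_xnack_mask, s_save_alloc_size and s_save_ttmps_hi all live in
// ttmp15, and s_restore_tmp doubles as s_restore_mem_offset_save,
// s_restore_flat_scratch and s_restore_ttmps_lo. Their live ranges must not
// overlap, which constrains the ordering of the save/restore sequences below.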

shader main
asic(DEFAULT)
type(CS)
wave_size(32)

s_branch L_SKIP_RESTORE //not a restore: may be a regular trap or a save

L_JUMP_TO_RESTORE:
s_branch L_RESTORE

L_SKIP_RESTORE:
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC

// Clear SPI_PRIO: do not save with elevated priority.
// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK

s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)

s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
s_cbranch_scc0 L_NOT_HALTED

L_HALTED:
// Host trap may occur while wave is halted.
s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_SAVE:
s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
s_cbranch_scc1 L_SAVE

// Wave is halted but neither host trap nor SAVECTX is raised.
// Caused by instruction fetch memory violation.
// Spin wait until context saved to prevent interrupt storm.
s_sleep 0x10
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_branch L_CHECK_SAVE

L_NOT_HALTED:
// Let the second-level handler deal with non-SAVECTX exceptions or traps.
// Any concurrent SAVECTX will be handled upon re-entry once halted.

// Check non-maskable exceptions. memory_violation, illegal_instruction
// and xnack_error exceptions always cause the wave to enter the trap
// handler.
s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
s_cbranch_scc1 L_FETCH_2ND_TRAP

// Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
// Maskable exceptions only cause the wave to enter the trap handler if
// their respective bit in mode.excp_en is set.
s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
s_cbranch_scc0 L_CHECK_TRAP_ID

s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
s_cbranch_scc0 L_NOT_ADDR_WATCH
s_bitset1_b32 ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
s_getreg_b32 ttmp3, hwreg(HW_REG_MODE)
s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
s_and_b32 ttmp2, ttmp2, ttmp3
s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
// Check trap_id != 0
s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
s_cbranch_scc1 L_FETCH_2ND_TRAP

if SINGLE_STEP_MISSED_WORKAROUND
// Prioritize single step exception over context save.
// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
s_getreg_b32 ttmp2, hwreg(HW_REG_MODE)
s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
s_cbranch_scc1 L_FETCH_2ND_TRAP
end

s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
s_cbranch_scc1 L_SAVE

L_FETCH_2ND_TRAP:
#if HAVE_XNACK
save_and_clear_ib_sts(ttmp14, ttmp15)
#endif

// Read second-level TBA/TMA from first-level TMA and jump if available.
// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
// ttmp12 holds SQ_WAVE_STATUS
#if HAVE_SENDMSG_RTN
s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
s_waitcnt lgkmcnt(0)
#else
s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
#endif
s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag
s_waitcnt lgkmcnt(0)
s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
s_or_b32 ttmp11, ttmp11, ttmp2

s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
s_waitcnt lgkmcnt(0)
s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
s_waitcnt lgkmcnt(0)

s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler

L_NO_NEXT_TRAP:
// If not caused by trap then halt wave to prevent re-entry.
s_and_b32 ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
s_cbranch_scc1 L_TRAP_CASE
s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

// If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
// Rewind the PC to prevent this from occurring.
s_sub_u32 ttmp0, ttmp0, 0x8
s_subb_u32 ttmp1, ttmp1, 0x0

s_branch L_EXIT_TRAP

L_TRAP_CASE:
// Host trap will not cause trap re-entry.
s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
s_cbranch_scc1 L_EXIT_TRAP

// Advance past trap instruction to prevent re-entry.
s_add_u32 ttmp0, ttmp0, 0x4
s_addc_u32 ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
s_and_b32 ttmp1, ttmp1, 0xFFFF

#if HAVE_XNACK
restore_ib_sts(ttmp14, ttmp15)
#endif

// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status

s_rfe_b64 [ttmp0, ttmp1]

L_SAVE:
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_mov_b32 s_save_tmp, 0
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit

#if HAVE_XNACK
save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
#endif

s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

#if HAVE_SENDMSG_RTN
s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
#else
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
#endif

#if ASIC_FAMILY < CHIP_SIENNA_CICHLID
L_SLEEP:
// Sleeping for 1 (64 clocks) is not enough with 8 waves per SIMD and hangs
// the SQ: the 7th and 8th waves cannot win arbitration to execute while the
// other waves sit in this sleep loop waiting for wrexec != 0.
s_sleep 0x2
s_cbranch_execz L_SLEEP
#else
s_waitcnt lgkmcnt(0)
#endif

// Save first_wave flag so we can clear high bits of save address.
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if NO_SQC_STORE
// Trap temporaries must be saved via VGPR but all VGPRs are in use.
// There is no ttmp space to hold the resource constant for VGPR save.
// Save v0 by itself since it requires only two SGPRs.
s_mov_b32 s_save_ttmps_lo, exec_lo
s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0xFFFFFFFF
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1
v_mov_b32 v0, 0x0
s_mov_b32 exec_lo, s_save_ttmps_lo
s_mov_b32 exec_hi, s_save_ttmps_hi
#endif

// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
get_wave_size(s_save_ttmps_hi)
get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
get_svgpr_size_bytes(s_save_ttmps_hi)
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0
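
// Save-area layout implied by the offset arithmetic above (illustrative,
// derived from the get_*_size_bytes helpers; numbers assume a wave32 wave
// with vgpr_size=3, i.e. 16 VGPRs, and no shared VGPRs):
//   VGPRs  at 0x0,   16 * 32 lanes * 4 bytes = 0x800
//   SGPRs  at 0x800, get_sgpr_size_bytes() = 512 bytes
//   HWREGs at 0xA00, get_hwreg_size_bytes() = 128 bytes
//   ttmps  at 0xA40  (HWREG base + 0x40)
//   LDS    at 0xA80 onwards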

#if NO_SQC_STORE
v_writelane_b32 v0, ttmp4, 0x4
v_writelane_b32 v0, ttmp5, 0x5
v_writelane_b32 v0, ttmp6, 0x6
v_writelane_b32 v0, ttmp7, 0x7
v_writelane_b32 v0, ttmp8, 0x8
v_writelane_b32 v0, ttmp9, 0x9
v_writelane_b32 v0, ttmp10, 0xA
v_writelane_b32 v0, ttmp11, 0xB
v_writelane_b32 v0, ttmp13, 0xD
v_writelane_b32 v0, exec_lo, 0xE
v_writelane_b32 v0, exec_hi, 0xF

s_mov_b32 exec_lo, 0x3FFF
s_mov_b32 exec_hi, 0x0
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1
v_readlane_b32 ttmp14, v0, 0xE
v_readlane_b32 ttmp15, v0, 0xF
s_mov_b32 exec_lo, ttmp14
s_mov_b32 exec_hi, ttmp15
#else
s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
#endif

s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), though not necessarily initialized
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

s_mov_b32 s_save_m0, m0

s_mov_b32 s_save_mem_offset, 0x0
get_wave_size(s_wave_size)

#if HAVE_XNACK
// Save and clear vector XNACK state late to free up SGPRs.
s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif

s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPRs are allocated in 4-GPR granularity

#if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
s_branch L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPRs are allocated in 4-GPR granularity

#if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

L_SAVE_HWREG:
// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
get_svgpr_size_bytes(s_save_tmp)
s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

#if NO_SQC_STORE
v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource
v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource
v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store
s_mov_b32 m0, 0x0 //Next lane of v2 to write to
#endif
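
// How the NO_SQC_STORE path stores HWREGs (a reading aid, inferred from
// write_hwreg_to_mem and the stores below): each scalar value is copied
// into one lane of v2 via v_writelane_b32 (m0 selects the next lane), and
// once all registers are packed, a single buffer_store_dword with
// exec_lo = 0xFFFF writes the 16 lanes as 64 contiguous bytes.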

write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

// Not used on Sienna_Cichlid but keep layout same for debugger.
write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)
write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
s_mov_b32 exec_lo, 0xFFFF
s_mov_b32 exec_hi, 0x0
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
s_mov_b32 exec_lo, 0xFFFFFFFF
#endif

// Save SGPRs before the LDS save so that s0-s4 can be used during it.

// SGPR SR memory offset : size(VGPR)+size(SVGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
get_svgpr_size_bytes(s_save_tmp)
s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

#if NO_SQC_STORE
s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into
#else
// back up s_save_buf_rsrc0 to s_save_xnack_mask, since write_16sgpr_to_mem will change rsrc0
s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0
s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif

s_mov_b32 m0, 0x0 //SGPR initial index value =0
s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
// SGPRs are allocated in 16-SGPR granularity
s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]

write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled?
s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE

buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
s_mov_b32 ttmp13, 0x0
v_mov_b32 v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

s_add_u32 m0, m0, 16 //next sgpr index
s_cmp_lt_u32 m0, 96 //scc = (m0 < 96, i.e. still within the first 96 SGPRs) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete?

//save the remaining 12 SGPRs
s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#else
// restore s_save_buf_rsrc0,1
s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
#endif

L_SAVE_LDS:
// Change EXEC to all threads...
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE

s_barrier //LDS is used? wait for other waves in the same TG
s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
s_cbranch_scc0 L_SAVE_LDS_DONE

// first wave does the LDS save;

s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes

// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
get_svgpr_size_bytes(s_save_tmp)
s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

//compute per-lane byte offset (lane_id * 4) into v0
v_mbcnt_lo_u32_b32 v0, -1, 0
v_mbcnt_hi_u32_b32 v0, -1, v0
v_mul_u32_u24 v0, 4, v0

s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
s_cbranch_scc1 L_SAVE_LDS_W64

L_SAVE_LDS_W32:
s_mov_b32 s3, 128
s_nop 0
s_nop 0
s_nop 0
L_SAVE_LDS_LOOP_W32:
ds_read_b32 v1, v0
s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

s_add_u32 m0, m0, s3 //every buffer_store_dword does 128 bytes in wave32
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?

s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
s_mov_b32 s3, 256
s_nop 0
s_nop 0
s_nop 0
L_SAVE_LDS_LOOP_W64:
ds_read_b32 v1, v0
s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

s_add_u32 m0, m0, s3 //every buffer_store_dword does 256 bytes in wave64
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?

L_SAVE_LDS_DONE:

L_SAVE_VGPR:
// VGPR SR memory offset: 0
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
s_mov_b32 s_save_mem_offset, (0+128*4) //offset for the remaining VGPRs (v0-v3 already saved)
s_mov_b32 exec_hi, 0x00000000
s_branch L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
s_mov_b32 s_save_mem_offset, (0+256*4) //offset for the remaining VGPRs (v0-v3 already saved)
s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
//determine whether it is wave32 or wave64
s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_VGPR_WAVE64

s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPRs are allocated in 4-GPR granularity

// VGPR store using dw burst
s_mov_b32 m0, 0x4 //VGPR initial index value =4
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]

buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete?

s_branch L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPR store using dw burst
s_mov_b32 m0, 0x4 //VGPR initial index value =4
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_SHARED_VGPR

L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]

buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?

L_SAVE_SHARED_VGPR:
//save shared VGPRs (new for gfx10)
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_VGPR_END
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
//m0 now holds the normal VGPR count; add the shared_vgpr count to get the total.
//shared_vgpr save starts from index m0
s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
s_branch L_END_PGM

L_RESTORE:

s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

//determine whether it is wave32 or wave64
get_wave_size(s_restore_size)

s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
s_cbranch_scc0 L_RESTORE_VGPR

L_RESTORE_LDS:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes

// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
get_svgpr_size_bytes(s_restore_tmp)
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
#if HAVE_BUFFER_LDS_LOAD
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
#else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
s_waitcnt vmcnt(0)
ds_store_addtid_b32 v0
#endif
s_add_u32 m0, m0, 128 //every pass restores 128 bytes in wave32
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
s_branch L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
#if HAVE_BUFFER_LDS_LOAD
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
#else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
s_waitcnt vmcnt(0)
ds_store_addtid_b32 v0
#endif
s_add_u32 m0, m0, 256 //every pass restores 256 bytes in wave64
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?


L_RESTORE_VGPR:
// VGPR SR memory offset : 0
s_mov_b32 s_restore_mem_offset, 0x0
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
s_mov_b32 exec_hi, 0x00000000
s_branch L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
//determine whether it is wave32 or wave64
s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64

s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0-v3 are restored last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
s_cmp_lt_u32 m0, s_restore_alloc_size
s_cbranch_scc0 L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
v_movreld_b32 v2, v2
v_movreld_b32 v3, v3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0-v3) is complete?

buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
s_waitcnt vmcnt(0)

s_branch L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0-v3 are restored last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
s_cmp_lt_u32 m0, s_restore_alloc_size
s_cbranch_scc0 L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
v_movreld_b32 v2, v2
v_movreld_b32 v3, v3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0-v3) is complete?

L_RESTORE_SHARED_VGPR:
//restore shared VGPRs (new for gfx10)
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
//m0 now holds the normal VGPR count; add the shared_vgpr count to get the total.
//shared_vgpr restore starts from index m0
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
s_waitcnt vmcnt(0)
v_movreld_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR restore is complete?

s_mov_b32 exec_hi, 0xFFFFFFFF //restore exec_hi before restoring v0-v3

L_RESTORE_V0:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
s_waitcnt vmcnt(0)


// 108 SGPRs are restored as one 4-dword, one 8-dword and six 16-dword reads
// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
get_svgpr_size_bytes(s_restore_tmp)
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 are not saved

s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

s_mov_b32 m0, s_sgpr_save_num
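
// Restore order (a reading aid, inferred from the reads below): each
// read_*sgpr_from_mem call first decrements s_restore_mem_offset, so the
// blocks are consumed from the end of the SGPR area backwards: s[104:107]
// (4 dwords), then s[96:103] (8 dwords), then six 16-dword blocks down to
// s[0:15]. 4 + 8 + 16*6 = 108 = s_sgpr_save_num.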

read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
s_waitcnt lgkmcnt(0)

s_sub_u32 m0, m0, 4 //m0 = 104; restore s[104:107]
s_nop 0 //hazard: SALU M0 => S_MOVREL

s_movreld_b64 s0, s0 //s[0+m0] = s0
s_movreld_b64 s2, s2

read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
s_waitcnt lgkmcnt(0)

s_sub_u32 m0, m0, 8 //m0 = 96; restore s[96:103]
s_nop 0 //hazard: SALU M0 => S_MOVREL

s_movreld_b64 s0, s0 //s[0+m0] = s0
s_movreld_b64 s2, s2
s_movreld_b64 s4, s4
s_movreld_b64 s6, s6

L_RESTORE_SGPR_LOOP:
read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
s_waitcnt lgkmcnt(0)

s_sub_u32 m0, m0, 16 //restore 16 SGPRs per pass, working down towards s[0]
s_nop 0 //hazard: SALU M0 => S_MOVREL

s_movreld_b64 s0, s0 //s[0+m0] = s0
s_movreld_b64 s2, s2
s_movreld_b64 s4, s4
s_movreld_b64 s6, s6
s_movreld_b64 s8, s8
s_movreld_b64 s10, s10
s_movreld_b64 s12, s12
s_movreld_b64 s14, s14

s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
s_cbranch_scc0 L_RESTORE_SGPR_LOOP

// s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
// Clear DEBUG_EN before and restore MODE after the barrier.
s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0
s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG


L_RESTORE_HWREG:
// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
get_svgpr_size_bytes(s_restore_tmp)
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
s_waitcnt lgkmcnt(0)

s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS

s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi

s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0

#if HAVE_XNACK
s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
#endif

s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
get_svgpr_size_bytes(s_restore_ttmps_hi)
s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
s_waitcnt lgkmcnt(0)

#if HAVE_XNACK
restore_ib_sts(s_restore_tmp, s_restore_m0)
#endif

s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //do it here so STATUS is not affected
s_and_b64 exec, exec, exec //Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc //Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status //SCC is included; it was changed by the preceding SALU ops

s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution

L_END_PGM:
s_endpgm
end

function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
// Copy into VGPR for later TCP store.
v_writelane_b32 v2, s, m0
s_add_u32 m0, m0, 0x1
#else
s_mov_b32 exec_lo, m0
s_mov_b32 m0, s_mem_offset
s_buffer_store_dword s, s_rsrc, m0 glc:1
s_add_u32 s_mem_offset, s_mem_offset, 4
s_mov_b32 m0, exec_lo
#endif
end

function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
// Copy into VGPR for later TCP store.
for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
v_writelane_b32 v2, s[sgpr_idx], ttmp13
s_add_u32 ttmp13, ttmp13, 0x1
end
#else
s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
// Copy into VGPR for later TCP store.
for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
v_writelane_b32 v2, s[sgpr_idx], ttmp13
s_add_u32 ttmp13, ttmp13, 0x1
end
#else
s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
s_add_u32 s_rsrc[0], s_rsrc[0], 4*12
s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*16
s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*8
s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_sub_u32 s_mem_offset, s_mem_offset, 4*4
s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1
end

function get_lds_size_bytes(s_lds_size_byte)
s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes (granularity 64DW)
end
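
// Example of the arithmetic above (derived from the shift, not from
// hardware docs): lds_size is in units of 64 dwords, so lds_size = 2
// means 2 * 64 DW * 4 bytes = 512 bytes, i.e. lds_size << 8.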

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
s_bitcmp1_b32 s_size, S_WAVE_SIZE
s_cbranch_scc1 L_ENABLE_SHIFT_W64
s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
s_branch L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
L_SHIFT_DONE:
end
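
// Example (derived from the shifts above): vgpr_size = 3 encodes
// (3 + 1) * 4 = 16 VGPRs, so a wave32 wave occupies 16 * 32 * 4 =
// 2048 bytes (4 << 9) and a wave64 wave 16 * 64 * 4 = 4096 bytes (4 << 10).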

function get_svgpr_size_bytes(s_svgpr_size_byte)
s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) //SVGPR size in bytes = shared_vgpr_size * 8 VGPRs * 32 lanes * 4 bytes
end
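
// Example (derived from the shift above, consistent with the 128-byte
// stride in the shared-VGPR save/restore loops): shared_vgpr_size = 1
// encodes 8 shared VGPRs, each stored as 32 lanes * 4 bytes = 128 bytes,
// for 1024 bytes total (1 << 10).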

function get_sgpr_size_bytes
return 512
end

function get_hwreg_size_bytes
return 128
end

function get_wave_size(s_reg)
s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
end
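
// Note (inferred from S_WAVE_SIZE = 25): get_wave_size places the
// IB_STS2.WAVE64 flag in bit 25 of s_reg, so callers test for wave64 with
// "s_lshr_b32 m0, s_reg, S_WAVE_SIZE" / "s_and_b32 m0, m0, 1" or with
// "s_bitcmp1_b32 s_reg, S_WAVE_SIZE", as seen throughout this handler.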

function save_and_clear_ib_sts(tmp1, tmp2)
// Preserve and clear scalar XNACK state before issuing scalar loads.
// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
// unused space ttmp11[31:24].
s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
s_getreg_b32 tmp1, hwreg(HW_REG_IB_STS)
s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
s_or_b32 ttmp11, ttmp11, tmp2
s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
s_lshl_b32 tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
s_or_b32 ttmp11, ttmp11, tmp2
s_andn2_b32 tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1
end

function restore_ib_sts(tmp1, tmp2)
s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
s_and_b32 tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
s_lshr_b32 tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
s_and_b32 tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
s_or_b32 tmp1, tmp1, tmp2
s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1
end