#define CHIP_VEGAM 18
#define CHIP_ARCTURUS 23
#define CHIP_ALDEBARAN 25

var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised


var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000
var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0
var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1
var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3
var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29
var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000
var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24

#if ASIC_FAMILY >= CHIP_ALDEBARAN
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 6
var SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SHIFT = 12
var SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SIZE = 6
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
#endif

var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000

var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800

var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000
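// Summary of ttmp11 usage (derived from the masks above):
//   ttmp11[31:26] - saved copy of IB_STS.RCNT/FIRST_REPLAY (space unused by SPI debug data)
//   ttmp11[23]    - debug-trap-enabled flag read from the TMA record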


var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000
var S_SAVE_PC_HI_HT_MASK = 0x01000000
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi

var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_tmp = ttmp14
var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo = ttmp6
var s_save_xnack_mask_hi = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_status = ttmp12
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp //no conflict
var s_save_ttmps_hi = s_save_trapsts //no conflict


var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi

var s_restore_mem_offset = ttmp12
var s_restore_tmp2 = ttmp13
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp //no conflict
var s_restore_accvgpr_offset_save = ttmp7

var s_restore_m0 = s_restore_alloc_size //no conflict

var s_restore_mode = s_restore_accvgpr_offset_save

var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_status = ttmp14
var s_restore_trapsts = ttmp15
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_ttmps_lo = s_restore_tmp //no conflict
var s_restore_ttmps_hi = s_restore_alloc_size //no conflict



shader main
    asic(DEFAULT)
    type(CS)


    s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE //restore

L_SKIP_RESTORE:

    s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC

    // Clear SPI_PRIO: do not save with elevated priority.
    // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
    s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK

    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)

    s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
    s_cbranch_scc0 L_NOT_HALTED

L_HALTED:
    // Host trap may occur while wave is halted.
    s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_SAVE:
    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
    s_cbranch_scc1 L_SAVE //this is the operation for save

    // Wave is halted but neither host trap nor SAVECTX is raised.
    // Caused by instruction fetch memory violation.
    // Spin wait until context saved to prevent interrupt storm.
    s_sleep 0x10
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_branch L_CHECK_SAVE

L_NOT_HALTED:
    // Let second-level handle non-SAVECTX exception or trap.
    // Any concurrent SAVECTX will be handled upon re-entry once halted.

    // Check non-maskable exceptions. memory_violation, illegal_instruction
    // and xnack_error exceptions always cause the wave to enter the trap
    // handler.
    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

    // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
    // Maskable exceptions only cause the wave to enter the trap handler if
    // their respective bit in mode.excp_en is set.
    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
    s_cbranch_scc0 L_CHECK_TRAP_ID

    s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
    s_cbranch_scc0 L_NOT_ADDR_WATCH
    s_bitset1_b32 ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
    s_getreg_b32 ttmp3, hwreg(HW_REG_MODE)
    s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
    s_and_b32 ttmp2, ttmp2, ttmp3
    s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
    // Check trap_id != 0
    s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

if SINGLE_STEP_MISSED_WORKAROUND
    // Prioritize single step exception over context save.
    // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
    s_getreg_b32 ttmp2, hwreg(HW_REG_MODE)
    s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP
end

    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
    s_cbranch_scc1 L_SAVE

L_FETCH_2ND_TRAP:
    // Preserve and clear scalar XNACK state before issuing scalar reads.
    save_and_clear_ib_sts(ttmp14)

    // Read second-level TBA/TMA from first-level TMA and jump if available.
    // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
    // ttmp12 holds SQ_WAVE_STATUS
    s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
    s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
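    // TMA_LO/HI hold the TMA address in units of 256 bytes; the shift below
    // recovers the byte address.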
    s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

    s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag
    s_waitcnt lgkmcnt(0)
    s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
    s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
    s_or_b32 ttmp11, ttmp11, ttmp2

    s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
    s_waitcnt lgkmcnt(0)
    s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
    s_waitcnt lgkmcnt(0)

    s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
    s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
    s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler

L_NO_NEXT_TRAP:
    // If not caused by trap then halt wave to prevent re-entry.
    s_and_b32 ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
    s_cbranch_scc1 L_TRAP_CASE
    s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

    // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
    // Rewind the PC to prevent this from occurring.
    s_sub_u32 ttmp0, ttmp0, 0x8
    s_subb_u32 ttmp1, ttmp1, 0x0

    s_branch L_EXIT_TRAP

L_TRAP_CASE:
    // Host trap will not cause trap re-entry.
    s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
    s_cbranch_scc1 L_EXIT_TRAP

    // Advance past trap instruction to prevent re-entry.
    s_add_u32 ttmp0, ttmp0, 0x4
    s_addc_u32 ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
    s_and_b32 ttmp1, ttmp1, 0xFFFF

    restore_ib_sts(ttmp14)

    // Restore SQ_WAVE_STATUS.
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
    set_status_without_spi_prio(s_save_status, ttmp2)

    s_rfe_b64 [ttmp0, ttmp1]

// ********* End handling of non-CWSR traps *******************



L_SAVE:
    s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]

    s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit

    save_and_clear_ib_sts(s_save_tmp)

    s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32 s_save_exec_hi, exec_hi
    s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

    s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC

    // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
    s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp

L_SLEEP:
    s_sleep 0x2 //s_sleep 1 (64 clocks) is not enough for 8 waves per SIMD and can hang SQ: the 7th/8th waves never win arbitration to execute instructions while the other waves spin in this loop waiting for EXEC != 0

    s_cbranch_execz L_SLEEP

    // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
    get_vgpr_size_bytes(s_save_ttmps_lo)
    get_sgpr_size_bytes(s_save_ttmps_hi)
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
    s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
    s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
    s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
    ack_sqc_store_workaround()
    s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
    ack_sqc_store_workaround()
    s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
    ack_sqc_store_workaround()
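    // Save-area layout implied by the offset computations in this handler:
    //   [0]                                  VGPRs (ARCH, then ACC on Arcturus+)
    //   [size(VGPR)]                         SGPRs
    //   [size(VGPR)+size(SGPR)]              HWREGs (128 bytes; ttmps stored at +0x50..0x74)
    //   [size(VGPR)+size(SGPR)+size(HWREG)]  LDS (first wave in the threadgroup only)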

    s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
    s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
    s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

    //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
    s_mov_b32 s_save_m0, m0 //save M0

    s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0


    //////////////////////////////

L_SAVE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS

    //s_save_trapsts conflicts with s_save_alloc_size
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS

    write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
    write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI

    //using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2
    s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)


    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
    s_mov_b32 s_save_exec_hi, 0x0
    s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]


    // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
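    // Worked example: sgpr_size=3 -> (3+1)*16 = 64 SGPRs to save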

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // back up s_save_buf_rsrc0/1 to s_save_xnack_mask_lo/hi, since write_16sgpr_to_mem will change rsrc0
    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

    s_mov_b32 m0, 0x0 //SGPR initial index value = 0
    s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
    // SGPR is allocated in 16 SGPR granularity
    s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
    s_add_u32 m0, m0, 16 //next sgpr index
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
    // restore s_save_buf_rsrc0/1
    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo


    // Save the first 4 VGPRs, so they can be used during the LDS save (each wave allocates at least 4 VGPRs)
    /////////////////////////////////////////////////////////////////////////////////////

    s_mov_b32 s_save_mem_offset, 0
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF
    s_mov_b32 xnack_mask_lo, 0x0
    s_mov_b32 xnack_mask_hi, 0x0

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR Allocated in 4-GPR granularity

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP

    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
    s_branch L_SAVE_LDS

L_SAVE_FIRST_VGPRS_WITH_TCP:
end

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)


    //////////////////////////////

L_SAVE_LDS:

    // Change EXEC to all threads...
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
    s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE

    s_barrier //LDS is used? wait for other waves in the same TG
    s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
    s_cbranch_scc0 L_SAVE_LDS_DONE

    // first wave does the LDS save

    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_mov_b32 m0, 0x0 //lds_offset initial value = 0

    v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
    v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_LDS_WITH_TCP

    v_lshlrev_b32 v2, 2, v3
L_SAVE_LDS_LOOP_SQC:
    ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
    s_waitcnt lgkmcnt(0)

    write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)

    v_add_u32 v2, 0x200, v2
    v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
    s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC

    s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP:
end

    v_mul_i32_i24 v2, v3, 8 // tid*8
    v_mov_b32 v3, 256*2
    s_mov_b32 m0, 0x10000 //LDS access limit in m0 for the ds_read below (64KB)
    s_mov_b32 s0, s_save_buf_rsrc3
    s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT

L_SAVE_LDS_LOOP_VECTOR:
    ds_read_b64 v[0:1], v2 //read two dwords from LDS at byte address v2
    s_waitcnt lgkmcnt(0)
    buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
    // s_waitcnt vmcnt(0)
    // v_add_u32 v2, vcc[0:1], v2, v3
    v_add_u32 v2, v2, v3
    v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
    s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

    // restore rsrc3
    s_mov_b32 s_save_buf_rsrc3, s0

L_SAVE_LDS_DONE:


    //////////////////////////////////////////////////////////////////////////////////////
L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    // TODO rearrange the RSRC words to use swizzle for VGPR save...

    s_mov_b32 s_save_mem_offset, (0+256*4) //offset for the remaining VGPRs (v0-v3 were saved earlier)
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF

    get_num_arch_vgprs(s_save_alloc_size)
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR store using dw burst
    s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (v0-v3 already saved)
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_VGPR_END

    s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
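    // While gpr indexing is on, m0 reads back as 0x1000|index, so the loop
    // bound compared against m0 below must carry the same 0x1000 bias: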
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_VGPR_LOOP

L_SAVE_VGPR_LOOP_SQC:
    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC

    s_set_gpr_idx_off
    s_branch L_SAVE_VGPR_END
end

L_SAVE_VGPR_LOOP:
    v_mov_b32 v0, v0 //v0 = v[0+m0]
    v_mov_b32 v1, v1 //v1 = v[1+m0]
    v_mov_b32 v2, v2 //v2 = v[2+m0]
    v_mov_b32 v3, v3 //v3 = v[3+m0]

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
    s_set_gpr_idx_off

L_SAVE_VGPR_END:

#if ASIC_FAMILY >= CHIP_ARCTURUS
    // Save ACC VGPRs

#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // ACC VGPR count may differ from ARCH VGPR count.
    get_num_acc_vgprs(s_save_alloc_size, s_save_tmp)
    s_and_b32 s_save_alloc_size, s_save_alloc_size, s_save_alloc_size //scc = (s_save_alloc_size != 0)
    s_cbranch_scc0 L_SAVE_ACCVGPR_END
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
#endif

    s_mov_b32 m0, 0x0 //VGPR initial index value = 0
    s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP

L_SAVE_ACCVGPR_LOOP_SQC:
    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0]
    end

    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC

    s_set_gpr_idx_off
    s_branch L_SAVE_ACCVGPR_END
end

L_SAVE_ACCVGPR_LOOP:
    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0]
    end

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP
    s_set_gpr_idx_off

L_SAVE_ACCVGPR_END:
#endif

    s_branch L_END_PGM



L_RESTORE:

    s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
    s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

    // s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0

    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_RESTORE_VGPR

    //////////////////////////////
L_RESTORE_LDS:

    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
    s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
    s_mov_b32 m0, 0x0 //lds_offset initial value = 0

L_RESTORE_LDS_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
    s_add_u32 m0, m0, 256*2 // 128 DW
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?


    //////////////////////////////
L_RESTORE_VGPR:
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
    s_mov_b32 exec_hi, 0xFFFFFFFF
    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // Restore ARCH VGPRs 4-N, then all ACC VGPRs, then ARCH VGPRs 0-3.
    get_num_arch_vgprs(s_restore_alloc_size)
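    // gpr indexing mode 0x8 (enabled below) makes m0 read back as 0x8000|index,
    // so the loop bound compared against m0 must carry the same 0x8000 bias: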
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later

    // ARCH VGPRs at offset: 0
    s_mov_b32 s_restore_mem_offset, 0x0
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0-v3 are restored last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32 m0, 4 //VGPR initial index value = 4
    s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8

L_RESTORE_VGPR_LOOP:
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset)
    v_mov_b32 v0, v0 //v[0+m0] = v0
    v_mov_b32 v1, v1
    v_mov_b32 v2, v2
    v_mov_b32 v3, v3
    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0-v3) is complete?

#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // ACC VGPR count may differ from ARCH VGPR count.
    get_num_acc_vgprs(s_restore_alloc_size, s_restore_tmp2)
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, s_restore_alloc_size //scc = (s_restore_alloc_size != 0)
    s_cbranch_scc0 L_RESTORE_ACCVGPR_END
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
#endif

#if ASIC_FAMILY >= CHIP_ARCTURUS
    // ACC VGPRs at offset: size(ARCH VGPRs)
    s_mov_b32 m0, 0
    s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8

L_RESTORE_ACCVGPR_LOOP:
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset)

    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_write acc[vgpr], v[vgpr]
    end

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_ACCVGPR_LOOP //ACC VGPR restore is complete?
L_RESTORE_ACCVGPR_END:
#endif

    s_set_gpr_idx_off

    // Restore VGPRs 0-3 last, now that they are no longer needed as temporaries.
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset_save)


    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 //restore SGPRs from S[n] down to S[0], in groups of 16
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_mov_b32 m0, s_restore_alloc_size

L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
    s_waitcnt lgkmcnt(0) //ensure data ready

    s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
    s_nop 0 // hazard SALU M0=> S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6
    s_movreld_b64 s8, s8
    s_movreld_b64 s10, s10
    s_movreld_b64 s12, s12
    s_movreld_b64 s14, s14

    s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore is complete?

    //////////////////////////////
L_RESTORE_HWREG:

    // HWREG SR memory offset : size(VGPR)+size(SGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE

    s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS

    s_mov_b32 m0, s_restore_m0
    s_mov_b32 exec_lo, s_restore_exec_lo
    s_mov_b32 exec_hi, s_restore_exec_hi

    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
    s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
    //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
    s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

    // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
    get_vgpr_size_bytes(s_restore_ttmps_lo)
    get_sgpr_size_bytes(s_restore_ttmps_hi)
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
    s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
    s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
    s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
    s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
    s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
    s_waitcnt lgkmcnt(0)

    restore_ib_sts(s_restore_tmp)

    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
    set_status_without_spi_prio(s_restore_status, s_restore_tmp) //also restores SCC, which was clobbered by the preceding SALU ops

    s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time

    s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution



L_END_PGM:
    s_endpgm

end


//Only used when saving hwregs to memory
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
    s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
    s_mov_b32 m0, s_mem_offset
    s_buffer_store_dword s, s_rsrc, m0 glc:1
    ack_sqc_store_workaround()
    s_add_u32 s_mem_offset, s_mem_offset, 4
    s_mov_b32 m0, exec_lo
end


// HWREGs are saved before SGPRs, so all HWREGs can be used as scratch here.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)

    s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
    ack_sqc_store_workaround()
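    // The four stores above use fixed offsets 0..48, so advance the rsrc
    // base address by the 64 bytes just written: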
    s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
    s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
end


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
    s_sub_u32 s_mem_offset, s_mem_offset, 4*16
end

function check_if_tcp_store_ok
    // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
    s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK
    s_cbranch_scc1 L_TCP_STORE_CHECK_DONE

    s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
    s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp //scc = (TRAPSTS.XNACK_ERROR == 0), i.e. TCP stores are safe

L_TCP_STORE_CHECK_DONE:
end

function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
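    // Each buffer_store_dword writes one dword per lane: 64 lanes * 4 bytes
    // = 256 bytes per VGPR, hence the offset:256 steps between the stores.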
    buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
    buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
    buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
end

function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
    buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
    buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
    buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
    buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
    s_waitcnt vmcnt(0)
end

function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset)
    s_mov_b32 s4, 0

L_WRITE_VGPR_LANE_LOOP:
    for var lane = 0; lane < 4; ++ lane
        v_readlane_b32 s[lane], v, s4
        s_add_u32 s4, s4, 1
    end

    s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
    ack_sqc_store_workaround()

    s_add_u32 s_mem_offset, s_mem_offset, 0x10
    s_cmp_eq_u32 s4, 0x40 //all 64 lanes done?
    s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
end

function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset)
    for var vgpr = 0; vgpr < n_vgprs; ++ vgpr
        write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset)
    end
end

function get_lds_size_bytes(s_lds_size_byte)
    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
    s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
    s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes
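    // Worked example: lds_size=2 -> 2 * 64 DW * 4 bytes = 512 bytes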
end

function get_vgpr_size_bytes(s_vgpr_size_byte)
    s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
    s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 bytes (non-zero value) //FIXME for GFX, zero is possible
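    // Worked example: vgpr_size=3 -> (3+1)*4 = 16 VGPRs -> 16 * 64 * 4 = 4096 bytes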

#if ASIC_FAMILY >= CHIP_ARCTURUS
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs
#endif
end

function get_sgpr_size_bytes(s_sgpr_size_byte)
    s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
    s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
    s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes (non-zero value)
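    // Worked example: sgpr_size=1 -> (1+1)*16 = 32 SGPRs -> 128 bytes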
end

function get_hwreg_size_bytes
    return 128 //HWREG size 128 bytes
end

function get_num_arch_vgprs(s_num_arch_vgprs)
#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // VGPR count includes ACC VGPRs, use ACC VGPR offset for ARCH VGPR count.
    s_getreg_b32 s_num_arch_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SHIFT,SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SIZE)
#else
    s_getreg_b32 s_num_arch_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
#endif

    // Number of VGPRs = (vgpr_size + 1) * 4
    s_add_u32 s_num_arch_vgprs, s_num_arch_vgprs, 1
    s_lshl_b32 s_num_arch_vgprs, s_num_arch_vgprs, 2
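    // Worked example: field value 3 -> (3+1)*4 = 16 ARCH VGPRs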
end

#if ASIC_FAMILY >= CHIP_ALDEBARAN
function get_num_acc_vgprs(s_num_acc_vgprs, s_tmp)
    // VGPR count = (GPR_ALLOC.VGPR_SIZE + 1) * 8
    s_getreg_b32 s_num_acc_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_num_acc_vgprs, s_num_acc_vgprs, 1
    s_lshl_b32 s_num_acc_vgprs, s_num_acc_vgprs, 3

    // ACC VGPR count = VGPR count - ARCH VGPR count.
    get_num_arch_vgprs(s_tmp)
    s_sub_u32 s_num_acc_vgprs, s_num_acc_vgprs, s_tmp
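    // Worked example: vgpr_size=3 -> (3+1)*8 = 32 total VGPRs; with 16 ARCH
    // VGPRs that leaves 16 ACC VGPRs.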
end
#endif

function ack_sqc_store_workaround
    if ACK_SQC_STORE
        s_waitcnt lgkmcnt(0)
    end
end

function set_status_without_spi_prio(status, tmp)
    // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
    s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
    s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
    s_nop 0x2 // avoid S_SETREG => S_SETREG hazard
    s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
end

function save_and_clear_ib_sts(tmp)
    // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
    s_getreg_b32 tmp, hwreg(HW_REG_IB_STS)
    s_and_b32 tmp, tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_lshl_b32 tmp, tmp, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) //shift by 26-15=11: bits [20:15] -> [31:26]
    s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
    s_or_b32 ttmp11, ttmp11, tmp
    s_setreg_imm32_b32 hwreg(HW_REG_IB_STS), 0x0
end

function restore_ib_sts(tmp)
    s_lshr_b32 tmp, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
    s_and_b32 tmp, tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_setreg_b32 hwreg(HW_REG_IB_STS), tmp
end