0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
// Bit-field layout of hardware registers (STATUS, LDS_ALLOC, GPR_ALLOC,
// TRAPSTS, IB_STS) and buffer-resource words, as read/written via
// s_getreg_b32/s_setreg_b32 below.
0030 var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
0031 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
0032 var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
0033 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
// PRE/POST_SPI_PRIO split STATUS into the bits below (bit 0) and above
// (bits 3..31) the SPI_PRIO field, so STATUS can be restored without
// touching SPI_PRIO (see set_status_without_spi_prio).
0034 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0
0035 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1
0036 var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3
0037 var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29
0038
0039 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
0040 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
0041 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
0042 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
0043 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
0044 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
0045
0046 var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
0047 var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
0048 var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
0049 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
0050 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
// PRE/POST_SAVECTX split TRAPSTS around the SAVECTX bit (bit 10) so the
// restore path can rewrite TRAPSTS without clobbering SAVECTX.
0051 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
0052 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
0053 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
0054 var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
0055 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
0056 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
0057
0058 var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
0059 var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME
0060 var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
0061 var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
0062 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
0063
0064 var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
0065 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
0066
0067
0068
// Save-path buffer resource descriptor constants.
0069 var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
0070 var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
0071
// SPI passes ATC/Mtype/FirstWaveInTG to the handler in spi_init_hi (exec_hi).
0072 var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
0073 var S_SAVE_SPI_INIT_ATC_SHIFT = 27
0074 var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
0075 var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
0076 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
0077 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
0078
// RCNT and FIRST_REPLAY from IB_STS are stashed in the unused upper bits of
// the saved PC_HI (PC itself is only 48 bits wide).
0079 var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
0080 var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
0081 var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
0082 var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
0083
// Register aliases used by the SAVE path. Only ttmp0-11, tma, tba, and
// xnack_mask survive entry to the trap handler, so several aliases share
// physical registers; the //conflict notes record which pairs must not be
// live at the same time.
0084 var s_save_spi_init_lo = exec_lo
0085 var s_save_spi_init_hi = exec_hi
0086
0087 //tba_lo and tba_hi need to be saved/restored
0088 var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
0089 var s_save_pc_hi = ttmp1
0090 var s_save_exec_lo = ttmp2
0091 var s_save_exec_hi = ttmp3
0092 var s_save_status = ttmp4
0093 var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
0094 var s_save_xnack_mask_lo = ttmp6
0095 var s_save_xnack_mask_hi = ttmp7
0096 var s_save_buf_rsrc0 = ttmp8
0097 var s_save_buf_rsrc1 = ttmp9
0098 var s_save_buf_rsrc2 = ttmp10
0099 var s_save_buf_rsrc3 = ttmp11
0100
0101 var s_save_mem_offset = tma_lo
0102 var s_save_alloc_size = s_save_trapsts //conflict
0103 var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
0104 var s_save_m0 = tma_hi
0105
0106
// Restore-path constants mirror the save-path values so the context image
// is read back with the same descriptor layout it was written with.
0107 var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
0108 var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
0109
0110 var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
0111 var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
0112 var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
0113 var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
0114 var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
0115 var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
0116
0117 var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
0118 var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
0119 var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
0120 var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
0121
// Register aliases used by the RESTORE path.
0122 var s_restore_spi_init_lo = exec_lo
0123 var s_restore_spi_init_hi = exec_hi
0124
0125 var s_restore_mem_offset = ttmp2
0126 var s_restore_alloc_size = ttmp3
0127 var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
0128 var s_restore_mem_offset_save = s_restore_tmp //no conflict
0129
0130 var s_restore_m0 = s_restore_alloc_size //no conflict
0131
0132 var s_restore_mode = ttmp7
0133
0134 var s_restore_pc_lo = ttmp0
0135 var s_restore_pc_hi = ttmp1
0136 var s_restore_exec_lo = tma_lo //no conflict
0137 var s_restore_exec_hi = tma_hi //no conflict
0138 var s_restore_status = ttmp4
0139 var s_restore_trapsts = ttmp5
0140 var s_restore_xnack_mask_lo = xnack_mask_lo
0141 var s_restore_xnack_mask_hi = xnack_mask_hi
0142 var s_restore_buf_rsrc0 = ttmp8
0143 var s_restore_buf_rsrc1 = ttmp9
0144 var s_restore_buf_rsrc2 = ttmp10
0145 var s_restore_buf_rsrc3 = ttmp11
0146
0147
0148
0149
0150
0151
// Trap-handler entry point for VI (gfx8). The handler is entered for three
// reasons, distinguished below: context-save request (TRAPSTS.SAVECTX set),
// context restore (entered at offset of L_JUMP_TO_RESTORE), or an ordinary
// trap/exception, which is forwarded to a second-level handler if one is
// installed at the TMA address.
0152 shader main
0153 asic(VI)
0154 type(CS)
0155
0156
0157 s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
0158
0159 L_JUMP_TO_RESTORE:
0160 s_branch L_RESTORE //restore
0161
0162 L_SKIP_RESTORE:
0163
0164 s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
0165 s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //mask out SPI_PRIO: the scheduler may have raised it and it must not be restored later
0166 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
0167 s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
0168 s_cbranch_scc1 L_SAVE //this is the operation for save
0169
0170 // ********* Handle non-CWSR traps *******************
0171
0172
// The 16 bytes at the TMA address hold the second-level trap handler
// entry: {ttmp8,ttmp9} = entry PC, loaded as a dwordx4.
0173 s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
0174 s_waitcnt lgkmcnt(0)
0175 s_or_b32 ttmp7, ttmp8, ttmp9 //SCC = (entry PC != 0)
0176 s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set
0177 set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
0178 s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
0179
0180 L_NO_NEXT_TRAP:
0181 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
0182 s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
0183 s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
// For S_TRAP the saved PC points at the trap instruction itself;
// advance the 64-bit PC by 4 so the trap is not re-executed on return.
0184 s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
0185 s_addc_u32 ttmp1, ttmp1, 0
0186 L_EXCP_CASE:
0187 s_and_b32 ttmp1, ttmp1, 0xFFFF //keep only PC[47:32]; ttmp1 also carries trapID/HT bits on entry
0188 set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
0189 s_rfe_b64 [ttmp0, ttmp1]
0190
0191 // ********* End handling of non-CWSR traps *******************
0192
0193
0194
0195
0196
// Context-save prologue: acknowledge the SAVECTX request, preserve state
// that memory operations would clobber (XNACK_MASK, IB_STS replay state,
// EXEC), hand-shake with SPI, then build the buffer resource descriptor
// for the context-save area.
0197 L_SAVE:
0198 s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
0199 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
0200
0201 s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
0202 s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation
// Stash IB_STS.RCNT and IB_STS.FIRST_REPLAY into the unused upper bits of
// the saved PC_HI so they survive until restore (see S_SAVE_PC_HI_* vars).
0203 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
0204 s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
0205 s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
0206 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
0207 s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
0208 s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
0209 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
0210 s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
0211
0212 s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
0213
0214
0215 s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
0216 s_mov_b32 s_save_exec_hi, exec_hi
0217 s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
0218
0219 s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
0220
0221 // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
0222 s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
0223 s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
0224
// Spin until SPI writes a non-zero EXEC; SPI's write delivers the
// spi_init values (save-area base/flags) in exec_lo/exec_hi.
0225 L_SLEEP:
0226 s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
0227
0228 s_cbranch_execz L_SLEEP
0229
0230
// Build the save-buffer resource descriptor from the SPI-provided init
// values: base address, 4-byte stride, ATC and MTYPE bits.
0231 s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
0232 s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
0233 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
0234 s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
0235 s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
0236 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
0237 s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
0238 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
0239 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
0240 s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
0241 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
0242
0243 //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
0244 s_mov_b32 s_save_m0, m0 //save M0
0245
0246
0247 s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
0248
0249
0250
0251
0252
0253 //////////////////////////////
// Save the hardware-register image (M0, PC, EXEC, STATUS, TRAPSTS,
// XNACK_MASK, MODE, TBA) to the save area at offset
// size(VGPR)+size(SGPR). write_hwreg_to_mem advances s_save_mem_offset
// by 4 per call, so the dwords land in the order written below.
0254
0255 L_SAVE_HWREG:
0256 // HWREG SR memory offset : size(VGPR)+size(SGPR)
0257 get_vgpr_size_bytes(s_save_mem_offset)
0258 get_sgpr_size_bytes(s_save_tmp)
0259 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
0260
0261
// NOTE(review): the 0x4 write is immediately overwritten by 0x1000000;
// the first s_mov looks like dead code kept from an earlier revision.
0262 s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
0263 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0264
0265
0266 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
0267 write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
0268 write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
0269 write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
0270 write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
0271 write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
0272
0273 //s_save_trapsts conflicts with s_save_alloc_size
0274 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
0275 write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
0276
0277 write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
0278 write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
0279
0280 //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
0281 s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE (s_save_m0 reused as scratch; M0 itself was already written above)
0282 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
0283 write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO
0284 write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI
0285
0286
0287
0288
0289 // save first_wave bit in s_save_exec_hi unused bit.26
0290 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
0291 //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26]
0292 s_mov_b32 s_save_exec_hi, 0x0
0293 s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
0294
0295
0296
0297 // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
0298 //////////////////////////////
// SGPRs are saved 16 at a time: s_movrels copies s[m0+i] into s[i],
// then write_16sgpr_to_mem stores s0..s15 and bumps rsrc0 by 64 bytes.
0299
0300 // SGPR SR memory offset : size(VGPR)
0301 get_vgpr_size_bytes(s_save_mem_offset)
0302 // TODO, change RSRC word to rearrange memory layout for SGPRS
0303
0304 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
0305 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
0306 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
0307
// NOTE(review): the shifted NUM_RECORDS value is immediately overwritten
// by the fixed 0x1000000, same pattern as in L_SAVE_HWREG.
0308 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
0309 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0310
0311 // backup s_save_buf_rsrc0,1 to s_save_xnack_mask_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
0312 //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
0313 s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 //safe: XNACK_MASK was already written to memory in L_SAVE_HWREG
0314 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
0315 s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
0316
0317 s_mov_b32 m0, 0x0 //SGPR initial index value =0
0318 L_SAVE_SGPR_LOOP:
0319 // SGPR is allocated in 16 SGPR granularity
0320 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
0321 s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
0322 s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
0323 s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
0324 s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
0325 s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
0326 s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
0327 s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
0328
0329 write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
0330 s_add_u32 m0, m0, 16 //next sgpr index
0331 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
0332 s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
0333 // restore s_save_buf_rsrc0,1
0334 //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
0335 s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
0336
0337
0338
0339
0340
0341 // each wave will alloc 4 vgprs at least...
0342 /////////////////////////////////////////////////////////////////////////////////////
// Save v0-v3 first at offset 0 so they can be used as scratch by the LDS
// save below; the remaining VGPRs are saved later in L_SAVE_VGPR starting
// at offset 256*4.
0343
0344 s_mov_b32 s_save_mem_offset, 0
0345 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
0346 s_mov_b32 exec_hi, 0xFFFFFFFF
0347
0348 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0349
0350 // VGPR Allocated in 4-GPR granularity
// With ADD_TID_ENABLE and 4-byte stride, each lane's dword goes to
// base + offset + tid*4, so one buffer_store_dword covers 256 bytes/VGPR.
0351
0352 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0353 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
0354 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
0355 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
0356
0357
0358
0359
0360 //////////////////////////////
// LDS save. Only the first wave of the thread group (flag stashed in
// s_save_exec_hi bit 26 above) writes LDS, after an s_barrier ensures all
// waves in the TG have reached this point. The copy is done vector-style:
// each lane reads 8 bytes of LDS and stores them via MTBUF.
0361
0362 L_SAVE_LDS:
0363
0364 // Change EXEC to all threads...
0365 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
0366 s_mov_b32 exec_hi, 0xFFFFFFFF
0367
0368 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
0369 s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
0370 s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE
0371
0372 s_barrier //LDS is used? wait for other waves in the same TG
0373 //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
0374 s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
0375 s_cbranch_scc0 L_SAVE_LDS_DONE
0376
0377 // first wave do LDS save;
0378
0379 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
0380 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
0381 s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
0382
0383 // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
0384 //
0385 get_vgpr_size_bytes(s_save_mem_offset)
0386 get_sgpr_size_bytes(s_save_tmp)
0387 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
0388 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
0389
0390
0391 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0392 s_mov_b32 m0, 0x0 //lds_offset initial value = 0
0393
0394
// v3 = lane id (0..63); v2 = lane id * 8 = per-lane LDS byte address;
// each iteration the whole wave copies 64 lanes * 8 bytes = 512 bytes.
0395 v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
0396 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
0397 v_mul_i32_i24 v2, v3, 8 // tid*8
0398 v_mov_b32 v3, 256*2 //per-iteration stride in bytes
0399 s_mov_b32 m0, 0x10000 //NOTE(review): presumably sets LDS access window via M0 — confirm against sp3/ISA docs
0400 s_mov_b32 s0, s_save_buf_rsrc3 //backup rsrc3; restored after the loop
0401 s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
0402 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
0403
0404 L_SAVE_LDS_LOOP_VECTOR:
0405 ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
0406 s_waitcnt lgkmcnt(0)
0407 buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
0408 // s_waitcnt vmcnt(0)
0409 v_add_u32 v2, vcc[0:1], v2, v3 //advance per-lane LDS address by 512 bytes
0410 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
0411 s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
0412
0413 // restore rsrc3
0414 s_mov_b32 s_save_buf_rsrc3, s0
0415
0416 L_SAVE_LDS_DONE:
0417
0418
0419
0420 //////////////////////////////////////////////////////////////////////////////////////
// Save the remaining VGPRs (v4 and up) in bursts of 4, using the M0
// gpr-index mode (src relative, 0x1) so v_mov_b32 v0,v0 reads v[0+m0].
// v0-v3 were already stored at offset 0 before the LDS save.
0421 L_SAVE_VGPR:
0422 // VGPR SR memory offset: 0
0423 // TODO rearrange the RSRC words to use swizzle for VGPR save...
0424
0425 s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
0426 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
0427 s_mov_b32 exec_hi, 0xFFFFFFFF
0428
0429 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
0430 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
0431 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
0432 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
0433 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0434
0435 // VGPR store using dw burst
0436 s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (v0-v3 were saved earlier)
0437 s_cmp_lt_u32 m0, s_save_alloc_size
0438 s_cbranch_scc0 L_SAVE_VGPR_END //only 4 VGPRs allocated: nothing more to save
0439
0440
0441 s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
0442 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
0443
0444 L_SAVE_VGPR_LOOP:
0445 v_mov_b32 v0, v0 //v0 = v[0+m0]
0446 v_mov_b32 v1, v1 //v1 = v[1+m0]
0447 v_mov_b32 v2, v2 //v2 = v[2+m0]
0448 v_mov_b32 v3, v3 //v3 = v[3+m0]
0449
0450 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0451 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
0452 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
0453 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
0454
0455 s_add_u32 m0, m0, 4 //next vgpr index
0456 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
0457 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
0458 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
0459 s_set_gpr_idx_off
0460
0461 L_SAVE_VGPR_END:
0462 s_branch L_END_PGM
0463
0464
0465
0466
0467
0468
0469
// Context restore entry: rebuild the buffer resource descriptor for the
// context image (mirror of the save-path setup), then restore LDS (first
// wave of the TG only), VGPRs, SGPRs, and finally the hardware registers.
0470 L_RESTORE:
0471
0472 s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
0473 s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
0474 s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
0475 s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
0476 s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
0477 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
0478 s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
0479 s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
0480 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
0481 s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
0482 s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
0483
0484
0485 // s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
0486
0487
// Only the first wave in the TG restores LDS; others skip to VGPRs.
0488 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
0489 s_cbranch_scc0 L_RESTORE_VGPR
0490
0491
0492 //////////////////////////////
// Restore LDS using buffer loads with lds:1, which write the loaded data
// directly into LDS at the offset held in M0; 128 dwords (512 bytes) per
// iteration (two 64-DW loads).
0493 L_RESTORE_LDS:
0494
0495 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
0496 s_mov_b32 exec_hi, 0xFFFFFFFF
0497
0498 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
0499 s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
0500 s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
0501 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
0502 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
0503 s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
0504
0505 // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
0506 //
0507 get_vgpr_size_bytes(s_restore_mem_offset)
0508 get_sgpr_size_bytes(s_restore_tmp)
0509 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
0510 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
0511
0512
0513 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0514 s_mov_b32 m0, 0x0 //lds_offset initial value = 0
0515
0516 L_RESTORE_LDS_LOOP:
0517 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
0518 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
0519 s_add_u32 m0, m0, 256*2 // 128 DW
0520 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
0521 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
0522 s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
0523
0524
0525
0526 //////////////////////////////
// VGPR restore: v4..vN are restored in a burst loop using M0 gpr-index
// destination mode (0x8), so v_mov_b32 v0,v0 writes v[0+m0]. v0-v3 are
// restored LAST (from the saved start offset) because the loop uses them
// as load staging registers.
0527 L_RESTORE_VGPR:
0528 // VGPR SR memory offset : 0
0529 s_mov_b32 s_restore_mem_offset, 0x0
0530 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
0531 s_mov_b32 exec_hi, 0xFFFFFFFF
0532
0533 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
0534 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
0535 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
0536 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
0537 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0538
0539 // VGPR load using dw burst
0540 s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // keep offset 0 for the final v0-v3 restore
0541 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
0542 s_mov_b32 m0, 4 //VGPR initial index value = 4; v0-v3 are restored after the loop
0543 s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
0544 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
0545
0546 L_RESTORE_VGPR_LOOP:
0547 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
0548 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
0549 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
0550 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
0551 s_waitcnt vmcnt(0) //ensure data ready
0552 v_mov_b32 v0, v0 //v[0+m0] = v0
0553 v_mov_b32 v1, v1 //v[1+m0] = v1
0554 v_mov_b32 v2, v2 //v[2+m0] = v2
0555 v_mov_b32 v3, v3 //v[3+m0] = v3
0556 s_add_u32 m0, m0, 4 //next vgpr index
0557 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
0558 s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
0559 s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
0560 s_set_gpr_idx_off
0561
// Finally restore v0-v3 themselves from the start of the VGPR area.
0562 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
0563 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
0564 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
0565 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
0566 s_waitcnt vmcnt(0)
0567
0568
0569 //////////////////////////////
// SGPR restore: walks the SGPR image BACKWARDS, 16 SGPRs per iteration
// (read_16sgpr_from_mem decrements the memory offset by 64 bytes), using
// s_movreld to scatter s0..s15 to s[m0..m0+15]. Restoring from S[n] down
// to S[0] means s0-s15 themselves are overwritten only on the final pass.
0570
0571 // SGPR SR memory offset : size(VGPR)
0572 get_vgpr_size_bytes(s_restore_mem_offset)
0573 get_sgpr_size_bytes(s_restore_tmp)
0574 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
0575 s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group
0576 // TODO, change RSRC word to rearrange memory layout for SGPRS
0577
0578 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
0579 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
0580 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
0581
0582 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
0583 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0584
0585
0586
0587
0588 s_mov_b32 m0, s_restore_alloc_size //start one group past the end; decremented before each scatter
0589
0590 L_RESTORE_SGPR_LOOP:
0591 read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
0592 s_waitcnt lgkmcnt(0) //ensure data ready
0593
0594 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
0595
0596 s_movreld_b64 s0, s0 //s[0+m0] = s0
0597 s_movreld_b64 s2, s2 //s[2+m0] = s2
0598 s_movreld_b64 s4, s4
0599 s_movreld_b64 s6, s6
0600 s_movreld_b64 s8, s8
0601 s_movreld_b64 s10, s10
0602 s_movreld_b64 s12, s12
0603 s_movreld_b64 s14, s14
0604
0605 s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
0606 s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
0607
0608
0609 //////////////////////////////
// Restore the hardware-register image (written by L_SAVE_HWREG, same
// order), then rebuild TRAPSTS, MODE, IB_STS and STATUS, and return to
// the interrupted program with s_rfe_restore_b64.
0610 L_RESTORE_HWREG:
0611
0612 // HWREG SR memory offset : size(VGPR)+size(SGPR)
0613 get_vgpr_size_bytes(s_restore_mem_offset)
0614 get_sgpr_size_bytes(s_restore_tmp)
0615 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
0616
0617
// NOTE(review): 0x4 is immediately overwritten by 0x1000000, mirroring
// the save path.
0618 s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
0619 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
0620
0621 read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
0622 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
0623 read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
0624 read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
0625 read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
0626 read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
0627 read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
0628 read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
0629 read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
0630 read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
0631 read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO
0632 read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI
0633
0634 s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
0635
0636 s_mov_b32 m0, s_restore_m0
0637 s_mov_b32 exec_lo, s_restore_exec_lo
0638 s_mov_b32 exec_hi, s_restore_exec_hi
0639
// Restore TRAPSTS in two pieces so the SAVECTX bit (bit 10) is left
// untouched; it may have been set again by an external save request
// during this restore.
0640 s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
0641 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
0642 s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
0643 s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
0644 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
0645 //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
0646 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
0647 //reuse s_restore_m0 as a temp register
// Reassemble IB_STS from the RCNT/FIRST_REPLAY bits stashed in PC_HI.
0648 s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
0649 s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
0650 s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
0651 s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
0652 s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
0653 s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
0654 s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
0655 s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
0656 s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
// Leave the INST_ATC bit from the saved STATUS in s_restore_m0[0]; it is
// consumed by s_rfe_restore_b64 below.
0657 s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
0658 s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
0659 s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
0660
0661 s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
0662 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
0663 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
0664 set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu
0665
0666 s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
0667
0668 // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
0669 s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
0670
0671
0672
0673
0674
// SAVE path terminates the wave here; a fresh wave is launched later to
// run the RESTORE path.
0675 L_END_PGM:
0676 s_endpgm
0677
0678 end
0679
0680
0681
0682
0683
0684
0685 //Only for save hwreg to mem
// Stores one SGPR dword to save-area offset s_mem_offset and advances the
// offset by 4. M0 is used as the store offset operand and is preserved by
// parking it in exec_lo (exec is already dead at this point in SAVE).
0686 function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
0687 s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
0688 s_mov_b32 m0, s_mem_offset
0689 s_buffer_store_dword s, s_rsrc, m0 glc:1
0690 s_add_u32 s_mem_offset, s_mem_offset, 4
0691 s_mov_b32 m0, exec_lo //restore M0
0692 end
0693
0694
0695 // HWREG are saved before SGPRs, so all HWREG could be use.
// Stores s0..s15 (four dwordx4 stores) and advances the descriptor base
// address (rsrc words 0/1) by 64 bytes for the next group. Note this
// mutates s_rsrc, which is why the caller backs up rsrc0/1 first.
0696 function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
0697
0698 s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
0699 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
0700 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
0701 s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
0702 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
0703 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
0704 end
0705
0706
// Loads one dword from save-area offset s_mem_offset into s and advances
// the offset by 4; caller must s_waitcnt lgkmcnt(0) before using s.
0707 function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
0708 s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
0709 s_add_u32 s_mem_offset, s_mem_offset, 4
0710 end
0711
// Loads 16 SGPRs in one dwordx16 and steps the offset BACKWARDS by 64
// bytes — the SGPR restore loop walks the image from S[n] down to S[0].
0712 function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
0713 s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
0714 s_sub_u32 s_mem_offset, s_mem_offset, 4*16
0715 end
0716
0717
0718
// Returns (in s_lds_size_byte) the wave's LDS allocation in bytes:
// LDS_ALLOC.lds_size * 64 dwords * 4 bytes (<< 8).
0719 function get_lds_size_bytes(s_lds_size_byte)
0720 // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
0721 s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
0722 s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
0723 end
0724
// Returns (in s_vgpr_size_byte) the wave's VGPR save-area size in bytes:
// (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (<< (2+8)).
0725 function get_vgpr_size_bytes(s_vgpr_size_byte)
0726 s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
0727 s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
0728 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
0729 end
0730
// Returns (in s_sgpr_size_byte) the wave's SGPR save-area size in bytes:
// (sgpr_size + 1) * 16 SGPRs * 4 bytes (<< 6).
0731 function get_sgpr_size_bytes(s_sgpr_size_byte)
0732 s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
0733 s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
0734 s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value)
0735 end
0736
// Fixed size reserved in the save area for the HWREG image (12 dwords
// are actually written; the region is padded to 128 bytes).
0737 function get_hwreg_size_bytes
0738 return 128 //HWREG size 128 bytes
0739 end
0740
// Writes STATUS from 'status' in two s_setreg pieces (bits [31:3] then
// bit [0]) so the SPI_PRIO field (bits [2:1]) is never written.
// Clobbers 'tmp'. The s_nop avoids the back-to-back S_SETREG hazard.
0741 function set_status_without_spi_prio(status, tmp)
0742 // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
0743 s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
0744 s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
0745 s_nop 0x2 // avoid S_SETREG => S_SETREG hazard
0746 s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
0747 end