Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright 2018 Advanced Micro Devices, Inc.
0003  *
0004  * Permission is hereby granted, free of charge, to any person obtaining a
0005  * copy of this software and associated documentation files (the "Software"),
0006  * to deal in the Software without restriction, including without limitation
0007  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0008  * and/or sell copies of the Software, and to permit persons to whom the
0009  * Software is furnished to do so, subject to the following conditions:
0010  *
0011  * The above copyright notice and this permission notice shall be included in
0012  * all copies or substantial portions of the Software.
0013  *
0014  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0015  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0016  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0017  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0018  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0019  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0020  * OTHER DEALINGS IN THE SOFTWARE.
0021  */
0022 
0023 /* To compile this assembly code:
0024  *
0025  * Navi1x:
0026  *   cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
0027  *   sp3 nv1x.sp3 -hex nv1x.hex
0028  *
0029  * gfx10:
0030  *   cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
0031  *   sp3 gfx10.sp3 -hex gfx10.hex
0032  *
0033  * gfx11:
0034  *   cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3
0035  *   sp3 gfx11.sp3 -hex gfx11.hex
0036  */
0037 
// ASIC family codes. ASIC_FAMILY is supplied on the cpp command line with
// -DASIC_FAMILY=<code> (see the compile instructions in the header above).
0038 #define CHIP_NAVI10 26
0039 #define CHIP_SIENNA_CICHLID 30
0040 #define CHIP_PLUM_BONITO 36
0041 
// Per-ASIC feature selection derived from ASIC_FAMILY:
// NO_SQC_STORE       - scalar (SQC) stores are not used; SGPR/HWREG state is
//                      staged through VGPR lanes and stored via TCP instead.
// HAVE_XNACK         - SHADER_XNACK_MASK / IB_STS replay state exists and
//                      must be saved and cleared around the trap.
// HAVE_SENDMSG_RTN   - s_sendmsg_rtn_b64 is available for SPI queries.
// HAVE_BUFFER_LDS_LOAD - presumably buffer loads can target LDS directly
//                      (used by the restore path, not visible in this chunk)
//                      -- confirm against the full source.
0042 #define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
0043 #define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
0044 #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
0045 #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
0046 
// Enable the workaround for a lost single-step exception: when SAVECTX is
// raised concurrently, the MODE.DEBUG_EN exception can be missed, so the
// first-level handler defers to the second-level handler in that case.
0047 var SINGLE_STEP_MISSED_WORKAROUND       = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
0048 
// ---- SQ_WAVE_STATUS bit fields ----
0049 var SQ_WAVE_STATUS_SPI_PRIO_MASK        = 0x00000006
0050 var SQ_WAVE_STATUS_HALT_MASK            = 0x2000
0051 var SQ_WAVE_STATUS_ECC_ERR_MASK         = 0x20000
0052 
// ---- SQ_WAVE_LDS_ALLOC / GPR_ALLOC / IB_STS2 bit fields ----
0053 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT        = 12
0054 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
0055 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE        = 8
0056 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
0057 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
0058 var SQ_WAVE_IB_STS2_WAVE64_SHIFT        = 11
0059 var SQ_WAVE_IB_STS2_WAVE64_SIZE         = 1
0060 
// GPR_ALLOC.VGPR_SIZE moved from bit 8 to bit 12 on gfx11 (PLUM_BONITO).
0061 #if ASIC_FAMILY < CHIP_PLUM_BONITO
0062 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT       = 8
0063 #else
0064 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT       = 12
0065 #endif
0066 
// ---- SQ_WAVE_TRAPSTS bit fields ----
0067 var SQ_WAVE_TRAPSTS_SAVECTX_MASK        = 0x400
0068 var SQ_WAVE_TRAPSTS_EXCP_MASK           = 0x1FF
0069 var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT       = 10
0070 var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK     = 0x80
0071 var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT        = 7
0072 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK       = 0x100
0073 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT      = 8
0074 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK        = 0x3FF
0075 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT       = 0x0
0076 var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE        = 10
0077 var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK       = 0xFFFFF800
0078 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT      = 11
0079 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE       = 21
0080 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK       = 0x800
0081 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK        = 0x7000
0082 
// ---- SQ_WAVE_MODE bit fields ----
0083 var SQ_WAVE_MODE_EXCP_EN_SHIFT          = 12
0084 var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT   = 19
0085 
// ---- SQ_WAVE_IB_STS replay bit fields (XNACK-capable ASICs) ----
0086 var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT       = 15
0087 var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT        = 25
0088 var SQ_WAVE_IB_STS_REPLAY_W64H_MASK     = 0x02000000
0089 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK   = 0x003F8000
0090 
0091 var SQ_WAVE_MODE_DEBUG_EN_MASK          = 0x800
0092 
// Layout of state stashed in ttmp11 by the trap handler.
0093 // bits [31:24] unused by SPI debug data
0094 var TTMP11_SAVE_REPLAY_W64H_SHIFT       = 31
0095 var TTMP11_SAVE_REPLAY_W64H_MASK        = 0x80000000
0096 var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT     = 24
0097 var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK      = 0x7F000000
0098 var TTMP11_DEBUG_TRAP_ENABLED_SHIFT     = 23
0099 var TTMP11_DEBUG_TRAP_ENABLED_MASK      = 0x800000
0100 
// ---- Context-save constants ----
0101 // SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
0102 // when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
0103 var S_SAVE_BUF_RSRC_WORD1_STRIDE        = 0x00040000
0104 var S_SAVE_BUF_RSRC_WORD3_MISC          = 0x10807FAC
0105 var S_SAVE_PC_HI_TRAP_ID_MASK           = 0x00FF0000
0106 var S_SAVE_PC_HI_HT_MASK            = 0x01000000
0107 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK     = 0x04000000
0108 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT        = 26
0109 
// The first-wave flag is relocated into the top bit of the saved PC_HI.
0110 var S_SAVE_PC_HI_FIRST_WAVE_MASK        = 0x80000000
0111 var S_SAVE_PC_HI_FIRST_WAVE_SHIFT       = 31
0112 
0113 var s_sgpr_save_num             = 108
0114 
// ---- Register aliases for the save path ----
// NOTE: s_save_trapsts, s_save_xnack_mask, s_save_alloc_size and
// s_save_ttmps_hi all alias ttmp15, and s_save_ttmps_lo aliases
// s_save_tmp (ttmp14); their live ranges must not overlap.
// s_save_spi_init_lo/hi alias EXEC, which SPI loads with the
// save-buffer address.
0115 var s_save_spi_init_lo              = exec_lo
0116 var s_save_spi_init_hi              = exec_hi
0117 var s_save_pc_lo                = ttmp0
0118 var s_save_pc_hi                = ttmp1
0119 var s_save_exec_lo              = ttmp2
0120 var s_save_exec_hi              = ttmp3
0121 var s_save_status               = ttmp12
0122 var s_save_trapsts              = ttmp15
0123 var s_save_xnack_mask               = s_save_trapsts
0124 var s_wave_size                 = ttmp7
0125 var s_save_buf_rsrc0                = ttmp8
0126 var s_save_buf_rsrc1                = ttmp9
0127 var s_save_buf_rsrc2                = ttmp10
0128 var s_save_buf_rsrc3                = ttmp11
0129 var s_save_mem_offset               = ttmp4
0130 var s_save_alloc_size               = s_save_trapsts
0131 var s_save_tmp                  = ttmp14
0132 var s_save_m0                   = ttmp5
0133 var s_save_ttmps_lo             = s_save_tmp
0134 var s_save_ttmps_hi             = s_save_trapsts
0135 
// ---- Context-restore constants (mirror the save-path values) ----
0136 var S_RESTORE_BUF_RSRC_WORD1_STRIDE     = S_SAVE_BUF_RSRC_WORD1_STRIDE
0137 var S_RESTORE_BUF_RSRC_WORD3_MISC       = S_SAVE_BUF_RSRC_WORD3_MISC
0138 
0139 var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      = 0x04000000
0140 var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     = 26
// Bit position of the wave32/wave64 flag within s_wave_size/s_restore_size.
0141 var S_WAVE_SIZE                 = 25
0142 
// ---- Register aliases for the restore path ----
// NOTE: s_restore_mem_offset_save, s_restore_flat_scratch and
// s_restore_ttmps_lo all alias s_restore_tmp (ttmp2), and s_restore_m0 /
// s_restore_ttmps_hi alias s_restore_alloc_size (ttmp3); their live
// ranges must not overlap.
0143 var s_restore_spi_init_lo           = exec_lo
0144 var s_restore_spi_init_hi           = exec_hi
0145 var s_restore_mem_offset            = ttmp12
0146 var s_restore_alloc_size            = ttmp3
0147 var s_restore_tmp               = ttmp2
0148 var s_restore_mem_offset_save           = s_restore_tmp
0149 var s_restore_m0                = s_restore_alloc_size
0150 var s_restore_mode              = ttmp7
0151 var s_restore_flat_scratch          = s_restore_tmp
0152 var s_restore_pc_lo             = ttmp0
0153 var s_restore_pc_hi             = ttmp1
0154 var s_restore_exec_lo               = ttmp4
0155 var s_restore_exec_hi               = ttmp5
0156 var s_restore_status                = ttmp14
0157 var s_restore_trapsts               = ttmp15
0158 var s_restore_xnack_mask            = ttmp13
0159 var s_restore_buf_rsrc0             = ttmp8
0160 var s_restore_buf_rsrc1             = ttmp9
0161 var s_restore_buf_rsrc2             = ttmp10
0162 var s_restore_buf_rsrc3             = ttmp11
0163 var s_restore_size              = ttmp6
0164 var s_restore_ttmps_lo              = s_restore_tmp
0165 var s_restore_ttmps_hi              = s_restore_alloc_size
0166 
// ===== Trap handler entry points =====
// Offset 0 is the trap/save entry. The instruction at L_JUMP_TO_RESTORE
// (offset +4) jumps to the context-restore path; presumably the driver
// starts restored waves at that offset -- confirm against the driver side.
0167 shader main
0168     asic(DEFAULT)
0169     type(CS)
0170     wave_size(32)
0171 
0172     s_branch    L_SKIP_RESTORE                      //NOT restore. might be a regular trap or save
0173 
0174 L_JUMP_TO_RESTORE:
0175     s_branch    L_RESTORE
0176 
// ===== First-level trap triage =====
// Classify why the wave entered the trap handler and dispatch to:
//   L_SAVE           - context save (SAVECTX requested),
//   L_FETCH_2ND_TRAP - host trap / exception / debug handled at 2nd level,
//   or spin-wait while halted until one of the above applies.
0177 L_SKIP_RESTORE:
0178     s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)         //save STATUS since we will change SCC
0179 
0180     // Clear SPI_PRIO: do not save with elevated priority.
0181     // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
0182     s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
0183 
0184     s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
0185 
0186     s_and_b32       ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK      // SCC = (STATUS.HALT != 0)
0187     s_cbranch_scc0  L_NOT_HALTED
0188 
0189 L_HALTED:
0190     // Host trap may occur while wave is halted.
0191     s_and_b32   ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK       // SCC = (trap_id != 0)
0192     s_cbranch_scc1  L_FETCH_2ND_TRAP
0193 
0194 L_CHECK_SAVE:
0195     s_and_b32   ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    // SCC = (SAVECTX requested)
0196     s_cbranch_scc1  L_SAVE
0197 
0198     // Wave is halted but neither host trap nor SAVECTX is raised.
0199     // Caused by instruction fetch memory violation.
0200     // Spin wait until context saved to prevent interrupt storm.
0201     s_sleep     0x10
0202     s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
0203     s_branch    L_CHECK_SAVE
0204 
0205 L_NOT_HALTED:
0206     // Let second-level handle non-SAVECTX exception or trap.
0207     // Any concurrent SAVECTX will be handled upon re-entry once halted.
0208 
0209     // Check non-maskable exceptions. memory_violation, illegal_instruction
0210     // and xnack_error exceptions always cause the wave to enter the trap
0211     // handler.
0212     s_and_b32   ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
0213     s_cbranch_scc1  L_FETCH_2ND_TRAP
0214 
0215     // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
0216     // Maskable exceptions only cause the wave to enter the trap handler if
0217     // their respective bit in mode.excp_en is set.
0218     s_and_b32   ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
0219     s_cbranch_scc0  L_CHECK_TRAP_ID
0220 
0221     s_and_b32   ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
0222     s_cbranch_scc0  L_NOT_ADDR_WATCH
0223     s_bitset1_b32   ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch
0224 
0225 L_NOT_ADDR_WATCH:
// Align the pending-exception bits with their MODE.EXCP_EN enables and
// trap to the second level only if at least one pending exception is enabled.
0226     s_getreg_b32    ttmp3, hwreg(HW_REG_MODE)
0227     s_lshl_b32  ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
0228     s_and_b32   ttmp2, ttmp2, ttmp3
0229     s_cbranch_scc1  L_FETCH_2ND_TRAP
0230 
0231 L_CHECK_TRAP_ID:
0232     // Check trap_id != 0
0233     s_and_b32   ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
0234     s_cbranch_scc1  L_FETCH_2ND_TRAP
0235 
0236 if SINGLE_STEP_MISSED_WORKAROUND
0237     // Prioritize single step exception over context save.
0238     // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
0239     s_getreg_b32    ttmp2, hwreg(HW_REG_MODE)
0240     s_and_b32   ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
0241     s_cbranch_scc1  L_FETCH_2ND_TRAP
0242 end
0243 
0244     s_and_b32   ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    // SCC = (SAVECTX requested)
0245     s_cbranch_scc1  L_SAVE
0246 
// ===== Second-level trap dispatch =====
// Locate the second-level handler via the first-level TMA and jump to it;
// if none is installed, resolve the trap locally and return with s_rfe.
0247 L_FETCH_2ND_TRAP:
0248 #if HAVE_XNACK
// Macro defined elsewhere in this file; appears to stash IB_STS replay
// state into ttmp11 (see TTMP11_SAVE_* constants) and clear IB_STS --
// confirm against the full source.
0249     save_and_clear_ib_sts(ttmp14, ttmp15)
0250 #endif
0251 
0252     // Read second-level TBA/TMA from first-level TMA and jump if available.
0253     // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
0254     // ttmp12 holds SQ_WAVE_STATUS
0255 #if HAVE_SENDMSG_RTN
0256     s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
0257     s_waitcnt       lgkmcnt(0)
0258 #else
0259     s_getreg_b32    ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
0260     s_getreg_b32    ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
0261 #endif
0262     s_lshl_b64  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8     // TMA appears to hold the address >> 8; convert to byte address -- confirm
0263 
0264     s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 glc:1         // debug trap enabled flag
0265     s_waitcnt       lgkmcnt(0)
// Publish the debug-trap-enabled flag in ttmp11 bit 23 for later readers.
0266     s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
0267     s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
0268     s_or_b32        ttmp11, ttmp11, ttmp2
0269 
0270     s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1     // second-level TBA
0271     s_waitcnt   lgkmcnt(0)
0272     s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1       // second-level TMA
0273     s_waitcnt   lgkmcnt(0)
0274 
0275     s_and_b64   [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]      // SCC = (second-level TBA != 0)
0276     s_cbranch_scc0  L_NO_NEXT_TRAP                      // second-level trap handler not been set
0277     s_setpc_b64 [ttmp2, ttmp3]                      // jump to second-level trap handler
0278 
0279 L_NO_NEXT_TRAP:
0280     // If not caused by trap then halt wave to prevent re-entry.
0281     s_and_b32   ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
0282     s_cbranch_scc1  L_TRAP_CASE
0283     s_or_b32    s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
0284 
0285     // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
0286     // Rewind the PC to prevent this from occurring.
0287     s_sub_u32   ttmp0, ttmp0, 0x8
0288     s_subb_u32  ttmp1, ttmp1, 0x0
0289 
0290     s_branch    L_EXIT_TRAP
0291 
0292 L_TRAP_CASE:
0293     // Host trap will not cause trap re-entry.
0294     s_and_b32   ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
0295     s_cbranch_scc1  L_EXIT_TRAP
0296 
0297     // Advance past trap instruction to prevent re-entry.
0298     s_add_u32   ttmp0, ttmp0, 0x4
0299     s_addc_u32  ttmp1, ttmp1, 0x0
0300 
0301 L_EXIT_TRAP:
0302     s_and_b32   ttmp1, ttmp1, 0xFFFF                    // keep only PC[47:32]; upper ttmp1 bits held trap metadata
0303 
0304 #if HAVE_XNACK
// Counterpart of save_and_clear_ib_sts, defined elsewhere in this file.
0305     restore_ib_sts(ttmp14, ttmp15)
0306 #endif
0307 
0308     // Restore SQ_WAVE_STATUS.
0309     s_and_b64   exec, exec, exec                    // Restore STATUS.EXECZ, not writable by s_setreg_b32
0310     s_and_b64   vcc, vcc, vcc                       // Restore STATUS.VCCZ, not writable by s_setreg_b32
0311     s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status
0312 
0313     s_rfe_b64   [ttmp0, ttmp1]
0314 
// ===== Context save: handshake with SPI, then save v0 and the ttmps =====
0315 L_SAVE:
0316     s_and_b32   s_save_pc_hi, s_save_pc_hi, 0x0000ffff          //pc[47:32]
0317     s_mov_b32   s_save_tmp, 0
0318     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
0319 
0320 #if HAVE_XNACK
// Macro defined elsewhere in this file (see L_FETCH_2ND_TRAP usage).
0321     save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
0322 #endif
0323 
0324     /* inform SPI the readiness and wait for SPI's go signal */
0325     s_mov_b32   s_save_exec_lo, exec_lo                 //save EXEC and use EXEC for the go signal from SPI
0326     s_mov_b32   s_save_exec_hi, exec_hi
0327     s_mov_b64   exec, 0x0                       //clear EXEC to get ready to receive
0328 
0329 #if HAVE_SENDMSG_RTN
0330     s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
0331 #else
0332     s_sendmsg   sendmsg(MSG_SAVEWAVE)                   //send SPI a message and wait for SPI's write to EXEC
0333 #endif
0334 
0335 #if ASIC_FAMILY < CHIP_SIENNA_CICHLID
0336 L_SLEEP:
0337     // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
0338     // SQ hang, since the 7,8th wave could not get arbit to exec inst, while
0339     // other waves are stuck into the sleep-loop and waiting for wrexec!=0
0340     s_sleep     0x2
0341     s_cbranch_execz L_SLEEP
0342 #else
0343     s_waitcnt   lgkmcnt(0)
0344 #endif
0345 
0346     // Save first_wave flag so we can clear high bits of save address.
// Relocate the flag from SPI_INIT bit 26 to PC_HI bit 31.
0347     s_and_b32   s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
0348     s_lshl_b32  s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
0349     s_or_b32    s_save_pc_hi, s_save_pc_hi, s_save_tmp
0350 
0351 #if NO_SQC_STORE
0352     // Trap temporaries must be saved via VGPR but all VGPRs are in use.
0353     // There is no ttmp space to hold the resource constant for VGPR save.
0354     // Save v0 by itself since it requires only two SGPRs.
// EXEC now holds the save-buffer address delivered by SPI
// (s_save_spi_init_lo/hi alias exec_lo/hi).
0355     s_mov_b32   s_save_ttmps_lo, exec_lo
0356     s_and_b32   s_save_ttmps_hi, exec_hi, 0xFFFF
0357     s_mov_b32   exec_lo, 0xFFFFFFFF
0358     s_mov_b32   exec_hi, 0xFFFFFFFF
0359     global_store_dword_addtid   v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1
0360     v_mov_b32   v0, 0x0
0361     s_mov_b32   exec_lo, s_save_ttmps_lo
0362     s_mov_b32   exec_hi, s_save_ttmps_hi
0363 #endif
0364 
0365     // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
0366     // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
// get_wave_size / get_*_size_bytes are macros defined elsewhere in this file.
0367     get_wave_size(s_save_ttmps_hi)
0368     get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
0369     get_svgpr_size_bytes(s_save_ttmps_hi)
0370     s_add_u32   s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
0371     s_and_b32   s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
0372     s_add_u32   s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
0373     s_add_u32   s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
0374     s_addc_u32  s_save_ttmps_hi, s_save_ttmps_hi, 0x0
0375 
0376 #if NO_SQC_STORE
// Stage the ttmps into lanes 4-13 of v0; lanes 14-15 keep a copy of EXEC
// so it can be restored after the store.
0377     v_writelane_b32 v0, ttmp4, 0x4
0378     v_writelane_b32 v0, ttmp5, 0x5
0379     v_writelane_b32 v0, ttmp6, 0x6
0380     v_writelane_b32 v0, ttmp7, 0x7
0381     v_writelane_b32 v0, ttmp8, 0x8
0382     v_writelane_b32 v0, ttmp9, 0x9
0383     v_writelane_b32 v0, ttmp10, 0xA
0384     v_writelane_b32 v0, ttmp11, 0xB
0385     v_writelane_b32 v0, ttmp13, 0xD
0386     v_writelane_b32 v0, exec_lo, 0xE
0387     v_writelane_b32 v0, exec_hi, 0xF
0388 
0389     s_mov_b32   exec_lo, 0x3FFF                     // enable lanes 0-13 only; lanes 14-15 (saved EXEC) are not stored
0390     s_mov_b32   exec_hi, 0x0
0391     global_store_dword_addtid   v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1
0392     v_readlane_b32  ttmp14, v0, 0xE
0393     v_readlane_b32  ttmp15, v0, 0xF
0394     s_mov_b32   exec_lo, ttmp14
0395     s_mov_b32   exec_hi, ttmp15
0396 #else
0397     s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
0398     s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
0399     s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
0400 #endif
0401 
0402     /* setup Resource Constants for the buffer stores below */
0403     s_mov_b32   s_save_buf_rsrc0, s_save_spi_init_lo            //base_addr_lo
0404     s_and_b32   s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF    //base_addr_hi
0405     s_or_b32    s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
0406     s_mov_b32   s_save_buf_rsrc2, 0                 //NUM_RECORDS initial value = 0 (in bytes), not necessarily initialized
0407     s_mov_b32   s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
0408 
0409     s_mov_b32   s_save_m0, m0
0410 
0411     /* global mem offset */
0412     s_mov_b32   s_save_mem_offset, 0x0
0413     get_wave_size(s_wave_size)
0414 
0415 #if HAVE_XNACK
0416     // Save and clear vector XNACK state late to free up SGPRs.
0417     s_getreg_b32    s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
0418     s_setreg_imm32_b32  hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
0419 #endif
0420 
0421     /* save first 4 VGPRs, needed for SGPR save */
// Select the wave32 or wave64 path from bit S_WAVE_SIZE of s_wave_size.
0422     s_mov_b32   exec_lo, 0xFFFFFFFF                 //need every thread from now on
0423     s_lshr_b32  m0, s_wave_size, S_WAVE_SIZE
0424     s_and_b32   m0, m0, 1
0425     s_cmp_eq_u32    m0, 1
0426     s_cbranch_scc1  L_ENABLE_SAVE_4VGPR_EXEC_HI
0427     s_mov_b32   exec_hi, 0x00000000
0428     s_branch    L_SAVE_4VGPR_WAVE32
0429 L_ENABLE_SAVE_4VGPR_EXEC_HI:
0430     s_mov_b32   exec_hi, 0xFFFFFFFF
0431     s_branch    L_SAVE_4VGPR_WAVE64
0432 L_SAVE_4VGPR_WAVE32:
0433     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0434 
0435     // VGPR Allocated in 4-GPR granularity
// One VGPR slot = 32 lanes * 4 bytes = 128 bytes in wave32.
0436 
0437 #if !NO_SQC_STORE
0438     buffer_store_dword  v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0439 #endif
0440     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
0441     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
0442     buffer_store_dword  v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
0443     s_branch    L_SAVE_HWREG
0444 
0445 L_SAVE_4VGPR_WAVE64:
0446     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0447 
0448     // VGPR Allocated in 4-GPR granularity
// One VGPR slot = 64 lanes * 4 bytes = 256 bytes in wave64.
0449 
0450 #if !NO_SQC_STORE
0451     buffer_store_dword  v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0452 #endif
0453     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
0454     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
0455     buffer_store_dword  v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
0456 
0457     /* save HW registers */
0458 
0459 L_SAVE_HWREG:
0460     // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
0461     get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
0462     get_svgpr_size_bytes(s_save_tmp)
0463     s_add_u32   s_save_mem_offset, s_save_mem_offset, s_save_tmp
0464     s_add_u32   s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
0465 
0466     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0467 
0468 #if NO_SQC_STORE
0469     v_mov_b32   v0, 0x0                         //Offset[31:0] from buffer resource
0470     v_mov_b32   v1, 0x0                         //Offset[63:32] from buffer resource
0471     v_mov_b32   v2, 0x0                         //Set of SGPRs for TCP store
0472     s_mov_b32   m0, 0x0                         //Next lane of v2 to write to
0473 #endif
0474 
// write_hwreg_to_mem is a macro defined elsewhere in this file; with
// NO_SQC_STORE it appears to stage each dword into a lane of v2 (m0 tracks
// the next lane) -- confirm in full source.  Save order:
// M0, PC_LO, PC_HI, EXEC_LO, EXEC_HI, STATUS, TRAPSTS, XNACK_MASK, MODE,
// FLAT_SCRATCH_LO, FLAT_SCRATCH_HI.
0475     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
0476     write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
0477     s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK // strip the relocated first_wave flag from the saved PC_HI
0478     write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
0479     write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
0480     write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
0481     write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
0482 
0483     s_getreg_b32    s_save_tmp, hwreg(HW_REG_TRAPSTS)
0484     write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
0485 
0486     // Not used on Sienna_Cichlid but keep layout same for debugger.
0487     write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
0488 
0489     s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)
0490     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
0491 
0492     s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
0493     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
0494 
0495     s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
0496     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
0497 
0498 #if NO_SQC_STORE
0499     // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
0500     s_mov_b32       exec_lo, 0xFFFF
0501     s_mov_b32   exec_hi, 0x0
0502     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0503 
0504     // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
0505     s_mov_b32       exec_lo, 0xFFFFFFFF
0506 #endif
0507 
0508     /* save SGPRs */
0509     // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
0510 
0511     // SGPR SR memory offset : size(VGPR)+size(SVGPR)
0512     get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
0513     get_svgpr_size_bytes(s_save_tmp)
0514     s_add_u32   s_save_mem_offset, s_save_mem_offset, s_save_tmp
0515     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0516 
0517 #if NO_SQC_STORE
0518     s_mov_b32   ttmp13, 0x0                     //next VGPR lane to copy SGPR into
0519 #else
0520     // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
0521     s_mov_b32   s_save_xnack_mask, s_save_buf_rsrc0
0522     s_add_u32   s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
0523     s_addc_u32  s_save_buf_rsrc1, s_save_buf_rsrc1, 0
0524 #endif
0525 
0526     s_mov_b32   m0, 0x0                         //SGPR initial index value =0
0527     s_nop       0x0                         //Manually inserted wait states
// Save s0..s95 in six 16-SGPR batches.  s_movrels_b64 reads s[N+m0], so
// each pass copies the next 16 SGPRs into s0..s15 before writing them out.
// write_16sgpr_to_mem / write_12sgpr_to_mem are macros defined elsewhere.
0528 L_SAVE_SGPR_LOOP:
0529     // SGPR is allocated in 16 SGPR granularity
0530     s_movrels_b64   s0, s0                          //s0 = s[0+m0], s1 = s[1+m0]
0531     s_movrels_b64   s2, s2                          //s2 = s[2+m0], s3 = s[3+m0]
0532     s_movrels_b64   s4, s4                          //s4 = s[4+m0], s5 = s[5+m0]
0533     s_movrels_b64   s6, s6                          //s6 = s[6+m0], s7 = s[7+m0]
0534     s_movrels_b64   s8, s8                          //s8 = s[8+m0], s9 = s[9+m0]
0535     s_movrels_b64   s10, s10                        //s10 = s[10+m0], s11 = s[11+m0]
0536     s_movrels_b64   s12, s12                        //s12 = s[12+m0], s13 = s[13+m0]
0537     s_movrels_b64   s14, s14                        //s14 = s[14+m0], s15 = s[15+m0]
0538 
0539     write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
0540 
0541 #if NO_SQC_STORE
0542     s_cmp_eq_u32    ttmp13, 0x20                        //have 32 VGPR lanes filled?
0543     s_cbranch_scc0  L_SAVE_SGPR_SKIP_TCP_STORE
0544 
0545     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0546     s_add_u32   s_save_mem_offset, s_save_mem_offset, 0x80      // 32 lanes * 4 bytes per flushed batch
0547     s_mov_b32   ttmp13, 0x0
0548     v_mov_b32   v2, 0x0
0549 L_SAVE_SGPR_SKIP_TCP_STORE:
0550 #endif
0551 
0552     s_add_u32   m0, m0, 16                      //next sgpr index
0553     s_cmp_lt_u32    m0, 96                          //scc = (m0 < first 96 SGPR) ? 1 : 0
0554     s_cbranch_scc1  L_SAVE_SGPR_LOOP                    //first 96 SGPR save is complete?
0555 
0556     //save the rest 12 SGPR
0557     s_movrels_b64   s0, s0                          //s0 = s[0+m0], s1 = s[1+m0]
0558     s_movrels_b64   s2, s2                          //s2 = s[2+m0], s3 = s[3+m0]
0559     s_movrels_b64   s4, s4                          //s4 = s[4+m0], s5 = s[5+m0]
0560     s_movrels_b64   s6, s6                          //s6 = s[6+m0], s7 = s[7+m0]
0561     s_movrels_b64   s8, s8                          //s8 = s[8+m0], s9 = s[9+m0]
0562     s_movrels_b64   s10, s10                        //s10 = s[10+m0], s11 = s[11+m0]
0563     write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
0564 
0565 #if NO_SQC_STORE
0566     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0567 #else
0568     // restore s_save_buf_rsrc0,1
0569     s_mov_b32   s_save_buf_rsrc0, s_save_xnack_mask
0570 #endif
0571 
0572     /* save LDS */
0573 
0574 L_SAVE_LDS:
0575     // Change EXEC to all threads...
0576     s_mov_b32   exec_lo, 0xFFFFFFFF                 //need every thread from now on
0577     s_lshr_b32  m0, s_wave_size, S_WAVE_SIZE
0578     s_and_b32   m0, m0, 1
0579     s_cmp_eq_u32    m0, 1
0580     s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI
0581     s_mov_b32   exec_hi, 0x00000000
0582     s_branch    L_SAVE_LDS_NORMAL
0583 L_ENABLE_SAVE_LDS_EXEC_HI:
0584     s_mov_b32   exec_hi, 0xFFFFFFFF
0585 L_SAVE_LDS_NORMAL:
0586     s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
0587     s_and_b32   s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF    //lds_size is zero?
0588     s_cbranch_scc0  L_SAVE_LDS_DONE                     //no lds used? jump to L_SAVE_DONE
0589 
0590     s_barrier                               //LDS is used? wait for other waves in the same TG
0591     s_and_b32   s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK  // SCC = (this is the first wave of the TG)
0592     s_cbranch_scc0  L_SAVE_LDS_DONE
0593 
0594     // first wave do LDS save;
0595 
0596     s_lshl_b32  s_save_alloc_size, s_save_alloc_size, 6         //LDS size in dwords = lds_size * 64dw
0597     s_lshl_b32  s_save_alloc_size, s_save_alloc_size, 2         //LDS size in bytes
0598     s_mov_b32   s_save_buf_rsrc2, s_save_alloc_size         //NUM_RECORDS in bytes
// NOTE: the NUM_RECORDS value written above is overwritten with 0x1000000
// a few instructions below, before any store uses it.
0599 
0600     // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
0601     //
0602     get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
0603     get_svgpr_size_bytes(s_save_tmp)
0604     s_add_u32   s_save_mem_offset, s_save_mem_offset, s_save_tmp
0605     s_add_u32   s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
0606     s_add_u32   s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
0607 
0608     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0609 
0610     //load 0~63*4(byte address) to vgpr v0
0611     v_mbcnt_lo_u32_b32  v0, -1, 0
0612     v_mbcnt_hi_u32_b32  v0, -1, v0
0613     v_mul_u32_u24   v0, 4, v0                       // v0 = lane_id * 4 = per-lane LDS byte address
0614 
0615     s_lshr_b32  m0, s_wave_size, S_WAVE_SIZE
0616     s_and_b32   m0, m0, 1
0617     s_cmp_eq_u32    m0, 1
0618     s_mov_b32   m0, 0x0                         // m0 = LDS byte offset cursor (s_cmp result survives the mov)
0619     s_cbranch_scc1  L_SAVE_LDS_W64
0620 
0621 L_SAVE_LDS_W32:
// s3 = bytes moved per iteration: 32 lanes * 4 bytes = 128 in wave32.
0622     s_mov_b32   s3, 128
0623     s_nop       0
0624     s_nop       0
0625     s_nop       0
0626 L_SAVE_LDS_LOOP_W32:
0627     ds_read_b32 v1, v0
0628     s_waitcnt   0
0629     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0630 
0631     s_add_u32   m0, m0, s3                      //every iteration moves s3 (=128) bytes
0632     s_add_u32   s_save_mem_offset, s_save_mem_offset, s3
0633     v_add_nc_u32    v0, v0, 128                     //mem offset increased by 128 bytes
0634     s_cmp_lt_u32    m0, s_save_alloc_size                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
0635     s_cbranch_scc1  L_SAVE_LDS_LOOP_W32                 //LDS save is complete?
0636 
0637     s_branch    L_SAVE_LDS_DONE
0638 
0639 L_SAVE_LDS_W64:
// s3 = bytes moved per iteration: 64 lanes * 4 bytes = 256 in wave64.
0640     s_mov_b32   s3, 256
0641     s_nop       0
0642     s_nop       0
0643     s_nop       0
0644 L_SAVE_LDS_LOOP_W64:
0645     ds_read_b32 v1, v0
0646     s_waitcnt   0
0647     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0648 
0649     s_add_u32   m0, m0, s3                      //every iteration moves s3 (=256) bytes
0650     s_add_u32   s_save_mem_offset, s_save_mem_offset, s3
0651     v_add_nc_u32    v0, v0, 256                     //mem offset increased by 256 bytes
0652     s_cmp_lt_u32    m0, s_save_alloc_size                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
0653     s_cbranch_scc1  L_SAVE_LDS_LOOP_W64                 //LDS save is complete?
0654 
0655 L_SAVE_LDS_DONE:
0656     /* save VGPRs  - set the Rest VGPRs */
0657 L_SAVE_VGPR:
0658     // VGPR SR memory offset: 0
// v0..v3 were already saved earlier, so the offset starts past their
// slots: 4 * 128 bytes in wave32, 4 * 256 bytes in wave64.
0659     s_mov_b32   exec_lo, 0xFFFFFFFF                 //need every thread from now on
0660     s_lshr_b32  m0, s_wave_size, S_WAVE_SIZE
0661     s_and_b32   m0, m0, 1
0662     s_cmp_eq_u32    m0, 1
0663     s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI
0664     s_mov_b32   s_save_mem_offset, (0+128*4)                // for the rest VGPRs
0665     s_mov_b32   exec_hi, 0x00000000
0666     s_branch    L_SAVE_VGPR_NORMAL
0667 L_ENABLE_SAVE_VGPR_EXEC_HI:
0668     s_mov_b32   s_save_mem_offset, (0+256*4)                // for the rest VGPRs
0669     s_mov_b32   exec_hi, 0xFFFFFFFF
0670 L_SAVE_VGPR_NORMAL:
0671     s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
0672     s_add_u32   s_save_alloc_size, s_save_alloc_size, 1
0673     s_lshl_b32  s_save_alloc_size, s_save_alloc_size, 2         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
0674     //determine it is wave32 or wave64
0675     s_lshr_b32  m0, s_wave_size, S_WAVE_SIZE
0676     s_and_b32   m0, m0, 1
0677     s_cmp_eq_u32    m0, 1
0678     s_cbranch_scc1  L_SAVE_VGPR_WAVE64
0679 
0680     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0681 
0682     // VGPR Allocated in 4-GPR granularity
0683 
0684     // VGPR store using dw burst
0685     s_mov_b32   m0, 0x4                         //VGPR initial index value =4 (v0..v3 already saved)
0686     s_cmp_lt_u32    m0, s_save_alloc_size
0687     s_cbranch_scc0  L_SAVE_VGPR_END
0688 
0689 L_SAVE_VGPR_W32_LOOP:
// Copy v[m0..m0+3] into v0..v3 via relative moves, then store the burst.
0690     v_movrels_b32   v0, v0                          //v0 = v[0+m0]
0691     v_movrels_b32   v1, v1                          //v1 = v[1+m0]
0692     v_movrels_b32   v2, v2                          //v2 = v[2+m0]
0693     v_movrels_b32   v3, v3                          //v3 = v[3+m0]
0694 
0695     buffer_store_dword  v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0696     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
0697     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
0698     buffer_store_dword  v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
0699 
0700     s_add_u32   m0, m0, 4                       //next vgpr index
0701     s_add_u32   s_save_mem_offset, s_save_mem_offset, 128*4     //every buffer_store_dword does 128 bytes
0702     s_cmp_lt_u32    m0, s_save_alloc_size                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
0703     s_cbranch_scc1  L_SAVE_VGPR_W32_LOOP                    //VGPR save is complete?
0704 
0705     s_branch    L_SAVE_VGPR_END
0706 
0707 L_SAVE_VGPR_WAVE64:
0708     s_mov_b32   s_save_buf_rsrc2, 0x1000000             //NUM_RECORDS in bytes
0709 
0710     // VGPR store using dw burst
0711     s_mov_b32   m0, 0x4                         //VGPR initial index value =4 (v0..v3 already saved)
0712     s_cmp_lt_u32    m0, s_save_alloc_size
0713     s_cbranch_scc0  L_SAVE_SHARED_VGPR
0714 
0715 L_SAVE_VGPR_W64_LOOP:
0716     v_movrels_b32   v0, v0                          //v0 = v[0+m0]
0717     v_movrels_b32   v1, v1                          //v1 = v[1+m0]
0718     v_movrels_b32   v2, v2                          //v2 = v[2+m0]
0719     v_movrels_b32   v3, v3                          //v3 = v[3+m0]
0720 
0721     buffer_store_dword  v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0722     buffer_store_dword  v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
0723     buffer_store_dword  v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
0724     buffer_store_dword  v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
0725 
0726     s_add_u32   m0, m0, 4                       //next vgpr index
0727     s_add_u32   s_save_mem_offset, s_save_mem_offset, 256*4     //every buffer_store_dword does 256 bytes
0728     s_cmp_lt_u32    m0, s_save_alloc_size                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
0729     s_cbranch_scc1  L_SAVE_VGPR_W64_LOOP                    //VGPR save is complete?
0730 
0731 L_SAVE_SHARED_VGPR:
0732     //Below part will be the save shared vgpr part (new for gfx10)
0733     s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
0734     s_and_b32   s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF    //shared_vgpr_size is zero?
0735     s_cbranch_scc0  L_SAVE_VGPR_END                     //no shared_vgpr used? jump to L_SAVE_LDS
0736     s_lshl_b32  s_save_alloc_size, s_save_alloc_size, 3         //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
0737     //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
0738     //save shared_vgpr will start from the index of m0
0739     s_add_u32   s_save_alloc_size, s_save_alloc_size, m0
0740     s_mov_b32   exec_lo, 0xFFFFFFFF
0741     s_mov_b32   exec_hi, 0x00000000
0742 L_SAVE_SHARED_VGPR_WAVE64_LOOP:
0743     v_movrels_b32   v0, v0                          //v0 = v[0+m0]
0744     buffer_store_dword  v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
0745     s_add_u32   m0, m0, 1                       //next vgpr index
0746     s_add_u32   s_save_mem_offset, s_save_mem_offset, 128
0747     s_cmp_lt_u32    m0, s_save_alloc_size                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
0748     s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP              //SHARED_VGPR save is complete?
0749 
0750 L_SAVE_VGPR_END:
0751     s_branch    L_END_PGM
0752 
0753 L_RESTORE:
0754     /* Setup Resource Constants */
0755     s_mov_b32   s_restore_buf_rsrc0, s_restore_spi_init_lo      //base_addr_lo
0756     s_and_b32   s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF  //base_addr_hi
0757     s_or_b32    s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
0758     s_mov_b32   s_restore_buf_rsrc2, 0                  //NUM_RECORDS initial value = 0 (in bytes)
0759     s_mov_b32   s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
0760 
0761     //determine it is wave32 or wave64
0762     get_wave_size(s_restore_size)
0763 
0764     s_and_b32   s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK   //scc=1 only for the first wave of the threadgroup
0765     s_cbranch_scc0  L_RESTORE_VGPR                      //only the first wave restores LDS; others skip ahead
0766 
0767     /* restore LDS */
0768 L_RESTORE_LDS:
0769     s_mov_b32   exec_lo, 0xFFFFFFFF                 //need every thread from now on
0770     s_lshr_b32  m0, s_restore_size, S_WAVE_SIZE
0771     s_and_b32   m0, m0, 1
0772     s_cmp_eq_u32    m0, 1                           //wave64 if the S_WAVE_SIZE bit is set
0773     s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI
0774     s_mov_b32   exec_hi, 0x00000000
0775     s_branch    L_RESTORE_LDS_NORMAL
0776 L_ENABLE_RESTORE_LDS_EXEC_HI:
0777     s_mov_b32   exec_hi, 0xFFFFFFFF
0778 L_RESTORE_LDS_NORMAL:
0779     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
0780     s_and_b32   s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //lds_size is zero? (s_and sets scc=0 when the result is 0)
0781     s_cbranch_scc0  L_RESTORE_VGPR                      //no lds used? jump to L_RESTORE_VGPR
0782     s_lshl_b32  s_restore_alloc_size, s_restore_alloc_size, 6       //LDS size in dwords = lds_size * 64dw
0783     s_lshl_b32  s_restore_alloc_size, s_restore_alloc_size, 2       //LDS size in bytes
0784     s_mov_b32   s_restore_buf_rsrc2, s_restore_alloc_size       //NUM_RECORDS in bytes
0785 
0786     // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
0787     //
0788     get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
0789     get_svgpr_size_bytes(s_restore_tmp)
0790     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
0791     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
0792     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
0793 
0794     s_mov_b32   s_restore_buf_rsrc2, 0x1000000              //NUM_RECORDS in bytes
0795 
0796     s_lshr_b32  m0, s_restore_size, S_WAVE_SIZE
0797     s_and_b32   m0, m0, 1
0798     s_cmp_eq_u32    m0, 1
0799     s_mov_b32   m0, 0x0                         //m0 = LDS write offset (s_mov does not clobber scc)
0800     s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64
0801 
0802 L_RESTORE_LDS_LOOP_W32:
0803 #if HAVE_BUFFER_LDS_LOAD
0804     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // load straight into LDS at m0
0805 #else
0806     buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
0807     s_waitcnt   vmcnt(0)
0808     ds_store_addtid_b32     v0                  //store via VGPR: lane i writes LDS[m0 + i*4]
0809 #endif
0810     s_add_u32   m0, m0, 128                     //advance 128 bytes (32 lanes * 4 bytes per iteration)
0811     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 128     //mem offset increased by 128 bytes
0812     s_cmp_lt_u32    m0, s_restore_alloc_size                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
0813     s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32                  //LDS restore is complete?
0814     s_branch    L_RESTORE_VGPR
0815 
0816 L_RESTORE_LDS_LOOP_W64:
0817 #if HAVE_BUFFER_LDS_LOAD
0818     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // load straight into LDS at m0
0819 #else
0820     buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
0821     s_waitcnt   vmcnt(0)
0822     ds_store_addtid_b32     v0                  //store via VGPR: lane i writes LDS[m0 + i*4]
0823 #endif
0824     s_add_u32   m0, m0, 256                     //advance 256 bytes (64 lanes * 4 bytes per iteration)
0825     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 256     //mem offset increased by 256 bytes
0826     s_cmp_lt_u32    m0, s_restore_alloc_size                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
0827     s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64                  //LDS restore is complete?
0828 
0829     /* restore VGPRs */
0830 L_RESTORE_VGPR:
0831     // VGPR SR memory offset : 0
0832     s_mov_b32   s_restore_mem_offset, 0x0
0833     s_mov_b32   exec_lo, 0xFFFFFFFF                 //need every thread from now on
0834     s_lshr_b32  m0, s_restore_size, S_WAVE_SIZE
0835     s_and_b32   m0, m0, 1
0836     s_cmp_eq_u32    m0, 1                           //wave64 if the S_WAVE_SIZE bit is set
0837     s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI
0838     s_mov_b32   exec_hi, 0x00000000
0839     s_branch    L_RESTORE_VGPR_NORMAL
0840 L_ENABLE_RESTORE_VGPR_EXEC_HI:
0841     s_mov_b32   exec_hi, 0xFFFFFFFF
0842 L_RESTORE_VGPR_NORMAL:
0843     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
0844     s_add_u32   s_restore_alloc_size, s_restore_alloc_size, 1
0845     s_lshl_b32  s_restore_alloc_size, s_restore_alloc_size, 2       //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
0846     //determine it is wave32 or wave64
0847     s_lshr_b32  m0, s_restore_size, S_WAVE_SIZE
0848     s_and_b32   m0, m0, 1
0849     s_cmp_eq_u32    m0, 1
0850     s_cbranch_scc1  L_RESTORE_VGPR_WAVE64
0851 
0852     s_mov_b32   s_restore_buf_rsrc2, 0x1000000              //NUM_RECORDS in bytes
0853 
0854     // VGPR load using dw burst
0855     s_mov_b32   s_restore_mem_offset_save, s_restore_mem_offset     // restore start with v4, v0~v3 will be the last
0856     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 128*4
0857     s_mov_b32   m0, 4                           //VGPR initial index value = 4
0858     s_cmp_lt_u32    m0, s_restore_alloc_size
0859     s_cbranch_scc0  L_RESTORE_SGPR
0860 
0861 L_RESTORE_VGPR_WAVE32_LOOP:
0862     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1  //VGPR slot stride 128 bytes = 32 lanes * 4 bytes
0863     buffer_load_dword   v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
0864     buffer_load_dword   v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
0865     buffer_load_dword   v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
0866     s_waitcnt   vmcnt(0)
0867     v_movreld_b32   v0, v0                          //v[0+m0] = v0
0868     v_movreld_b32   v1, v1
0869     v_movreld_b32   v2, v2
0870     v_movreld_b32   v3, v3
0871     s_add_u32   m0, m0, 4                       //next vgpr index
0872     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 128*4   //every buffer_load_dword does 128 bytes
0873     s_cmp_lt_u32    m0, s_restore_alloc_size                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
0874     s_cbranch_scc1  L_RESTORE_VGPR_WAVE32_LOOP              //VGPR restore (except v0~v3) is complete?
0875 
0876     /* VGPR restore on v0 */
0877     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1  //v0~v3 last: they were used as scratch by the loop above
0878     buffer_load_dword   v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
0879     buffer_load_dword   v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
0880     buffer_load_dword   v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
0881     s_waitcnt   vmcnt(0)
0882 
0883     s_branch    L_RESTORE_SGPR
0884 
0885 L_RESTORE_VGPR_WAVE64:
0886     s_mov_b32   s_restore_buf_rsrc2, 0x1000000              //NUM_RECORDS in bytes
0887 
0888     // VGPR load using dw burst
0889     s_mov_b32   s_restore_mem_offset_save, s_restore_mem_offset     // restore start with v4, v0 will be the last
0890     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 256*4
0891     s_mov_b32   m0, 4                           //VGPR initial index value = 4
0892     s_cmp_lt_u32    m0, s_restore_alloc_size
0893     s_cbranch_scc0  L_RESTORE_SHARED_VGPR
0894 
0895 L_RESTORE_VGPR_WAVE64_LOOP:
0896     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1  //VGPR slot stride 256 bytes = 64 lanes * 4 bytes
0897     buffer_load_dword   v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
0898     buffer_load_dword   v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
0899     buffer_load_dword   v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
0900     s_waitcnt   vmcnt(0)
0901     v_movreld_b32   v0, v0                          //v[0+m0] = v0
0902     v_movreld_b32   v1, v1
0903     v_movreld_b32   v2, v2
0904     v_movreld_b32   v3, v3
0905     s_add_u32   m0, m0, 4                       //next vgpr index
0906     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 256*4   //every buffer_load_dword does 256 bytes
0907     s_cmp_lt_u32    m0, s_restore_alloc_size                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
0908     s_cbranch_scc1  L_RESTORE_VGPR_WAVE64_LOOP              //VGPR restore (except v0~v3) is complete?
0909 
0910 L_RESTORE_SHARED_VGPR:
0911     //Below part will be the restore shared vgpr part (new for gfx10)
0912     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)  //shared_vgpr_size
0913     s_and_b32   s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //shared_vgpr_size is zero? (s_and sets scc=0 when the result is 0)
0914     s_cbranch_scc0  L_RESTORE_V0                        //no shared_vgpr used?
0915     s_lshl_b32  s_restore_alloc_size, s_restore_alloc_size, 3       //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
0916     //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
0917     //restore shared_vgpr will start from the index of m0
0918     s_add_u32   s_restore_alloc_size, s_restore_alloc_size, m0
0919     s_mov_b32   exec_lo, 0xFFFFFFFF
0920     s_mov_b32   exec_hi, 0x00000000                 //shared VGPRs are restored with the low 32 lanes only
0921 L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
0922     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
0923     s_waitcnt   vmcnt(0)
0924     v_movreld_b32   v0, v0                          //v[0+m0] = v0
0925     s_add_u32   m0, m0, 1                       //next vgpr index
0926     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, 128     //32 lanes * 4 bytes per shared VGPR
0927     s_cmp_lt_u32    m0, s_restore_alloc_size                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
0928     s_cbranch_scc1  L_RESTORE_SHARED_VGPR_WAVE64_LOOP           //VGPR restore (except v0) is complete?
0929 
0930     s_mov_b32   exec_hi, 0xFFFFFFFF                 //restore back exec_hi before restoring V0!!
0931 
0932     /* VGPR restore on v0 */
0933 L_RESTORE_V0:                               //wave64 only: the wave32 path branched to L_RESTORE_SGPR above (note the 256-byte stride)
0934     buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
0935     buffer_load_dword   v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
0936     buffer_load_dword   v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
0937     buffer_load_dword   v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
0938     s_waitcnt   vmcnt(0)
0940     /* restore SGPRs */
0941     //will be 2+8+16*6
0942     // SGPR SR memory offset : size(VGPR)+size(SVGPR)
0943 L_RESTORE_SGPR:
0944     get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
0945     get_svgpr_size_bytes(s_restore_tmp)
0946     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
0947     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
0948     s_sub_u32   s_restore_mem_offset, s_restore_mem_offset, 20*4    //s108~s127 is not saved
0949 
0950     s_mov_b32   s_restore_buf_rsrc2, 0x1000000              //NUM_RECORDS in bytes
0951 
0952     s_mov_b32   m0, s_sgpr_save_num
0953 
0954     read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)     //helpers decrement the offset first: SGPRs are restored from high addresses downwards
0955     s_waitcnt   lgkmcnt(0)
0956 
0957     s_sub_u32   m0, m0, 4                       // Restore from S[0] to S[104]
0958     s_nop       0                           // hazard SALU M0=> S_MOVREL
0959 
0960     s_movreld_b64   s0, s0                          //s[0+m0] = s0
0961     s_movreld_b64   s2, s2
0962 
0963     read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
0964     s_waitcnt   lgkmcnt(0)
0965 
0966     s_sub_u32   m0, m0, 8                       // Restore from S[0] to S[96]
0967     s_nop       0                           // hazard SALU M0=> S_MOVREL
0968 
0969     s_movreld_b64   s0, s0                          //s[0+m0] = s0
0970     s_movreld_b64   s2, s2
0971     s_movreld_b64   s4, s4
0972     s_movreld_b64   s6, s6
0973 
0974  L_RESTORE_SGPR_LOOP:
0975     read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
0976     s_waitcnt   lgkmcnt(0)
0977 
0978     s_sub_u32   m0, m0, 16                      // Restore from S[n] to S[0]
0979     s_nop       0                           // hazard SALU M0=> S_MOVREL
0980 
0981     s_movreld_b64   s0, s0                          //s[0+m0] = s0
0982     s_movreld_b64   s2, s2
0983     s_movreld_b64   s4, s4
0984     s_movreld_b64   s6, s6
0985     s_movreld_b64   s8, s8
0986     s_movreld_b64   s10, s10
0987     s_movreld_b64   s12, s12
0988     s_movreld_b64   s14, s14
0989 
0990     s_cmp_eq_u32    m0, 0                           //scc = (m0 == 0): all SGPRs restored?
0991     s_cbranch_scc0  L_RESTORE_SGPR_LOOP
0992 
0993     // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
0994     // Clear DEBUG_EN before and restore MODE after the barrier.
0995     s_setreg_imm32_b32  hwreg(HW_REG_MODE), 0
0996     s_barrier                               //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG
0997 
0998     /* restore HW registers */
0999 L_RESTORE_HWREG:
1000     // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
1001     get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1002     get_svgpr_size_bytes(s_restore_tmp)
1003     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
1004     s_add_u32   s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1005 
1006     s_mov_b32   s_restore_buf_rsrc2, 0x1000000              //NUM_RECORDS in bytes
1007 
1008     read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)       //reads are in the same order the save path wrote them
1009     read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
1010     read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1011     read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
1012     read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1013     read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
1014     read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
1015     read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
1016     read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
1017     read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
1018     s_waitcnt   lgkmcnt(0)
1019 
1020     s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch
1021 
1022     read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)     //s_restore_flat_scratch is reused for the HI half
1023     s_waitcnt   lgkmcnt(0)                      //from now on, it is safe to restore STATUS and IB_STS
1024 
1025     s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
1026 
1027     s_mov_b32   m0, s_restore_m0
1028     s_mov_b32   exec_lo, s_restore_exec_lo
1029     s_mov_b32   exec_hi, s_restore_exec_hi
1030 
1031     s_and_b32   s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts      //re-assert the pre-savectx TRAPSTS bits (s_restore_m0 reused as scratch)
1032     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
1033 
1034 #if HAVE_XNACK
1035     s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
1036 #endif
1037 
1038     s_and_b32   s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts     //re-assert the post-savectx TRAPSTS bits
1039     s_lshr_b32  s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
1040     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
1041     s_setreg_b32    hwreg(HW_REG_MODE), s_restore_mode
1042 
1043     // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
1044     // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
1045     get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
1046     get_svgpr_size_bytes(s_restore_ttmps_hi)
1047     s_add_u32   s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
1048     s_add_u32   s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
1049     s_add_u32   s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0        //form a 64-bit address: buffer base + byte offset
1050     s_addc_u32  s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0                //carry into the high half
1051     s_and_b32   s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF              //rsrc1 bits above the address field must be cleared
1052     s_load_dwordx4  [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
1053     s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
1054     s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
1055     s_waitcnt   lgkmcnt(0)
1056 
1057 #if HAVE_XNACK
1058     restore_ib_sts(s_restore_tmp, s_restore_m0)
1059 #endif
1060 
1061     s_and_b32   s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff        //pc[47:32] //Do it here in order not to affect STATUS
1062     s_and_b64   exec, exec, exec                    // Restore STATUS.EXECZ, not writable by s_setreg_b32
1063     s_and_b64   vcc, vcc, vcc                       // Restore STATUS.VCCZ, not writable by s_setreg_b32
1064     s_setreg_b32    hwreg(HW_REG_STATUS), s_restore_status          // SCC is included, which is changed by previous salu
1065 
1066     s_rfe_b64   s_restore_pc_lo                     //Return to the main shader program and resume execution
1067 
1068 L_END_PGM:
1069     s_endpgm
1070 end
1071 
1072 function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)           // save one hwreg value: staged into a v2 lane (NO_SQC_STORE) or stored via SQC
1073 #if NO_SQC_STORE
1074     // Copy into VGPR for later TCP store.
1075     v_writelane_b32 v2, s, m0                   //m0 selects the destination lane
1076     s_add_u32   m0, m0, 0x1
1077 #else
1078     s_mov_b32   exec_lo, m0                     //exec_lo used to preserve m0 across the store
1079     s_mov_b32   m0, s_mem_offset
1080     s_buffer_store_dword    s, s_rsrc, m0 glc:1
1081     s_add_u32   s_mem_offset, s_mem_offset, 4
1082     s_mov_b32   m0, exec_lo
1083 #endif
1084 end
1085 
1086 
1087 function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)          // save 16 SGPRs: staged into v2 lanes (NO_SQC_STORE) or stored via 4x dwordx4
1088 #if NO_SQC_STORE
1089     // Copy into VGPR for later TCP store.
1090     for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
1091         v_writelane_b32 v2, s[sgpr_idx], ttmp13         //ttmp13 tracks the next free lane
1092         s_add_u32   ttmp13, ttmp13, 0x1
1093     end
1094 #else
1095     s_buffer_store_dwordx4  s[0], s_rsrc, 0 glc:1
1096     s_buffer_store_dwordx4  s[4], s_rsrc, 16 glc:1
1097     s_buffer_store_dwordx4  s[8], s_rsrc, 32 glc:1
1098     s_buffer_store_dwordx4  s[12], s_rsrc, 48 glc:1
1099     s_add_u32   s_rsrc[0], s_rsrc[0], 4*16              //legacy path advances the rsrc base address, not s_mem_offset
1100     s_addc_u32  s_rsrc[1], s_rsrc[1], 0x0
1101 #endif
1102 end
1103 
1104 function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)          // save 12 SGPRs: staged into v2 lanes (NO_SQC_STORE) or stored via 3x dwordx4
1105 #if NO_SQC_STORE
1106     // Copy into VGPR for later TCP store.
1107     for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
1108         v_writelane_b32 v2, s[sgpr_idx], ttmp13         //ttmp13 tracks the next free lane
1109         s_add_u32   ttmp13, ttmp13, 0x1
1110     end
1111 #else
1112     s_buffer_store_dwordx4  s[0], s_rsrc, 0 glc:1
1113     s_buffer_store_dwordx4  s[4], s_rsrc, 16 glc:1
1114     s_buffer_store_dwordx4  s[8], s_rsrc, 32 glc:1
1115     s_add_u32   s_rsrc[0], s_rsrc[0], 4*12              //legacy path advances the rsrc base address, not s_mem_offset
1116     s_addc_u32  s_rsrc[1], s_rsrc[1], 0x0
1117 #endif
1118 end
1119 
1120 function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)          // load one hwreg dword, then advance the offset by 4
1121     s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
1122     s_add_u32   s_mem_offset, s_mem_offset, 4
1123 end
1124 
1125 function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)         // step the offset back 64 bytes, then load 16 SGPRs (restore walks the save area backwards)
1126     s_sub_u32   s_mem_offset, s_mem_offset, 4*16
1127     s_buffer_load_dwordx16  s, s_rsrc, s_mem_offset glc:1
1128 end
1129 
1130 function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)          // step the offset back 32 bytes, then load 8 SGPRs (restore walks the save area backwards)
1131     s_sub_u32   s_mem_offset, s_mem_offset, 4*8
1132     s_buffer_load_dwordx8   s, s_rsrc, s_mem_offset glc:1
1133 end
1134 
1135 function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)          // step the offset back 16 bytes, then load 4 SGPRs (restore walks the save area backwards)
1136     s_sub_u32   s_mem_offset, s_mem_offset, 4*4
1137     s_buffer_load_dwordx4   s, s_rsrc, s_mem_offset glc:1
1138 end
1139 
1140 
1141 function get_lds_size_bytes(s_lds_size_byte)                   // LDS allocation size in bytes (granule = 64 dwords = 256 bytes)
1142     s_getreg_b32    s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
1143     s_lshl_b32  s_lds_size_byte, s_lds_size_byte, 8         //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
1144 end
1145 
1146 function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)         // VGPR save-area size in bytes; lane count depends on the wave-size bit in s_size
1147     s_getreg_b32    s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
1148     s_add_u32   s_vgpr_size_byte, s_vgpr_size_byte, 1
1149     s_bitcmp1_b32   s_size, S_WAVE_SIZE                 //scc = the wave64 bit of s_size
1150     s_cbranch_scc1  L_ENABLE_SHIFT_W64
1151     s_lshl_b32  s_vgpr_size_byte, s_vgpr_size_byte, (2+7)       //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
1152     s_branch    L_SHIFT_DONE
1153 L_ENABLE_SHIFT_W64:
1154     s_lshl_b32  s_vgpr_size_byte, s_vgpr_size_byte, (2+8)       //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
1155 L_SHIFT_DONE:
1156 end
1157 
1158 function get_svgpr_size_bytes(s_svgpr_size_byte)               // shared-VGPR save-area size in bytes
1159     s_getreg_b32    s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
1160     s_lshl_b32  s_svgpr_size_byte, s_svgpr_size_byte, (3+7)     //shared_vgpr_size * 8 VGPRs * 128 bytes (32 lanes * 4 bytes)
1161 end
1162 
1163 function get_sgpr_size_bytes                                   // fixed 512-byte SGPR save area (128 dwords)
1164     return 512
1165 end
1166 
1167 function get_hwreg_size_bytes                                  // fixed 128-byte hwreg save area
1168     return 128
1169 end
1170 
1171 function get_wave_size(s_reg)                                  // s_reg bit S_WAVE_SIZE = IB_STS2.WAVE64 (1 = wave64, 0 = wave32)
1172     s_getreg_b32    s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
1173     s_lshl_b32  s_reg, s_reg, S_WAVE_SIZE
1174 end
1175 
1176 function save_and_clear_ib_sts(tmp1, tmp2)
1177     // Preserve and clear scalar XNACK state before issuing scalar loads.
1178     // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
1179     // unused space ttmp11[31:24].
1180     s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)   //clear the destination bits first
1181     s_getreg_b32    tmp1, hwreg(HW_REG_IB_STS)
1182     s_and_b32   tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
1183     s_lshl_b32  tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)    //relocate REPLAY_W64H to its ttmp11 slot
1184     s_or_b32    ttmp11, ttmp11, tmp2
1185     s_and_b32   tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
1186     s_lshl_b32  tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)  //relocate RCNT/FIRST_REPLAY to their ttmp11 slot
1187     s_or_b32    ttmp11, ttmp11, tmp2
1188     s_andn2_b32 tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
1189     s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1              //write back IB_STS with the replay state cleared
1190 end
1191 
1192 function restore_ib_sts(tmp1, tmp2)                            // rebuild IB_STS replay fields from the copy saved in ttmp11 by save_and_clear_ib_sts
1193     s_lshr_b32  tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
1194     s_and_b32   tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK      //tmp2 = RCNT/FIRST_REPLAY back in IB_STS position
1195     s_lshr_b32  tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
1196     s_and_b32   tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK         //tmp1 = REPLAY_W64H back in IB_STS position
1197     s_or_b32    tmp1, tmp1, tmp2
1198     s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1
1199 end