/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * gfx9:
 *   cpp -DASIC_FAMILY=CHIP_VEGAM cwsr_trap_handler_gfx9.asm -P -o gfx9.sp3
 *   sp3 gfx9.sp3 -hex gfx9.hex
 *
 * arcturus:
 *   cpp -DASIC_FAMILY=CHIP_ARCTURUS cwsr_trap_handler_gfx9.asm -P -o arcturus.sp3
 *   sp3 arcturus.sp3 -hex arcturus.hex
 *
 * aldebaran:
 *   cpp -DASIC_FAMILY=CHIP_ALDEBARAN cwsr_trap_handler_gfx9.asm -P -o aldebaran.sp3
 *   sp3 aldebaran.sp3 -hex aldebaran.hex
 */
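
/* Note: the sp3-generated .hex images are what the driver embeds as the CWSR
 * trap handler (in upstream amdkfd, carried as byte arrays, typically via
 * cwsr_trap_handler.h).
 */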

#define CHIP_VEGAM 18
#define CHIP_ARCTURUS 23
#define CHIP_ALDEBARAN 25

var ACK_SQC_STORE           =   1           //workaround for suspected SQC store bug causing incorrect stores under concurrency
var SAVE_AFTER_XNACK_ERROR      =   1           //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
var SINGLE_STEP_MISSED_WORKAROUND   =   1           //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

/**************************************************************************/
/*          variables                     */
/**************************************************************************/
var SQ_WAVE_STATUS_SPI_PRIO_SHIFT  = 1
var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK       = 0x2000
var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT   = 0
var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE    = 1
var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT  = 3
var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE   = 29
var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK    = 0x400000
var SQ_WAVE_STATUS_ECC_ERR_MASK         = 0x20000
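// Bit positions implied by the masks above: SPI_PRIO = STATUS[2:1],
// HALT = STATUS[13], ECC_ERR = STATUS[17], ALLOW_REPLAY = STATUS[22].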

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3         //FIXME  sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24

#if ASIC_FAMILY >= CHIP_ALDEBARAN
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 6
var SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SHIFT = 12
var SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SIZE  = 6
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
#endif

var SQ_WAVE_TRAPSTS_SAVECTX_MASK    =   0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK       =   0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =   10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK =   0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT =  7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =   0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =   8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    =   0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   =   0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    =   10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   =   0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  =   11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   =   21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK   =   0x800
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK    =   0x7000
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK    =   0x10000000
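// Bit positions implied by the masks above: EXCP = TRAPSTS[8:0] (with
// ADDR_WATCH = TRAPSTS[7] and MEM_VIOL = TRAPSTS[8]), SAVECTX = TRAPSTS[10],
// ILLEGAL_INST = TRAPSTS[11], EXCP_HI = TRAPSTS[14:12], XNACK_ERROR = TRAPSTS[28].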

var SQ_WAVE_MODE_EXCP_EN_SHIFT      =   12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT   = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =   15          //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK   = 0x1F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK      =   0x800

var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT =   26          // bits [31:26] unused by SPI debug data
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK  =   0xFC000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT =   23
var TTMP11_DEBUG_TRAP_ENABLED_MASK  =   0x800000

/*  Save        */
var S_SAVE_BUF_RSRC_WORD1_STRIDE    =   0x00040000      //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC      =   0x00807FAC      //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_PC_HI_TRAP_ID_MASK       =   0x00FF0000
var S_SAVE_PC_HI_HT_MASK        =   0x01000000
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK =   0x04000000      //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT    =   26

var s_save_spi_init_lo          =   exec_lo
var s_save_spi_init_hi          =   exec_hi

var s_save_pc_lo        =   ttmp0       //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi        =   ttmp1
var s_save_exec_lo      =   ttmp2
var s_save_exec_hi      =   ttmp3
var s_save_tmp          =   ttmp14
var s_save_trapsts      =   ttmp15      //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo    =   ttmp6
var s_save_xnack_mask_hi    =   ttmp7
var s_save_buf_rsrc0        =   ttmp8
var s_save_buf_rsrc1        =   ttmp9
var s_save_buf_rsrc2        =   ttmp10
var s_save_buf_rsrc3        =   ttmp11
var s_save_status       =   ttmp12
var s_save_mem_offset       =   ttmp4
var s_save_alloc_size       =   s_save_trapsts      //conflict
var s_save_m0           =   ttmp5
var s_save_ttmps_lo     =   s_save_tmp      //no conflict
var s_save_ttmps_hi     =   s_save_trapsts      //no conflict

/*  Restore     */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE     =   S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC       =   S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      =   0x04000000      //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     =   26

var s_restore_spi_init_lo           =   exec_lo
var s_restore_spi_init_hi           =   exec_hi

var s_restore_mem_offset    =   ttmp12
var s_restore_tmp2      =   ttmp13
var s_restore_alloc_size    =   ttmp3
var s_restore_tmp       =   ttmp2
var s_restore_mem_offset_save   =   s_restore_tmp   //no conflict
var s_restore_accvgpr_offset_save = ttmp7

var s_restore_m0        =   s_restore_alloc_size    //no conflict

var s_restore_mode      =   s_restore_accvgpr_offset_save

var s_restore_pc_lo     =   ttmp0
var s_restore_pc_hi     =   ttmp1
var s_restore_exec_lo       =   ttmp4
var s_restore_exec_hi       =   ttmp5
var s_restore_status        =   ttmp14
var s_restore_trapsts       =   ttmp15
var s_restore_xnack_mask_lo =   xnack_mask_lo
var s_restore_xnack_mask_hi =   xnack_mask_hi
var s_restore_buf_rsrc0     =   ttmp8
var s_restore_buf_rsrc1     =   ttmp9
var s_restore_buf_rsrc2     =   ttmp10
var s_restore_buf_rsrc3     =   ttmp11
var s_restore_ttmps_lo      =   s_restore_tmp       //no conflict
var s_restore_ttmps_hi      =   s_restore_alloc_size    //no conflict

/**************************************************************************/
/*          trap handler entry points             */
/**************************************************************************/
/* Shader Main */

shader main
  asic(DEFAULT)
  type(CS)


    s_branch L_SKIP_RESTORE                     //not a restore; may be a regular trap or a save

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE                          //restore

L_SKIP_RESTORE:

    s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                 //save STATUS since we will change SCC

    // Clear SPI_PRIO: do not save with elevated priority.
    // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
    s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK

    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)

    s_and_b32       ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
    s_cbranch_scc0  L_NOT_HALTED

L_HALTED:
    // Host trap may occur while wave is halted.
    s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1  L_FETCH_2ND_TRAP

L_CHECK_SAVE:
    s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
    s_cbranch_scc1  L_SAVE                  //this is the operation for save

    // Wave is halted but neither host trap nor SAVECTX is raised.
    // Caused by instruction fetch memory violation.
    // Spin wait until context saved to prevent interrupt storm.
    s_sleep         0x10
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_branch        L_CHECK_SAVE

L_NOT_HALTED:
    // Let second-level handle non-SAVECTX exception or trap.
    // Any concurrent SAVECTX will be handled upon re-entry once halted.

    // Check non-maskable exceptions. memory_violation, illegal_instruction
    // and xnack_error exceptions always cause the wave to enter the trap
    // handler.
    s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
    s_cbranch_scc1  L_FETCH_2ND_TRAP

    // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
    // Maskable exceptions only cause the wave to enter the trap handler if
    // their respective bit in mode.excp_en is set.
    s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
    s_cbranch_scc0  L_CHECK_TRAP_ID

    s_and_b32       ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
    s_cbranch_scc0  L_NOT_ADDR_WATCH
    s_bitset1_b32   ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
    s_getreg_b32    ttmp3, hwreg(HW_REG_MODE)
    s_lshl_b32      ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
    s_and_b32       ttmp2, ttmp2, ttmp3
    s_cbranch_scc1  L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
    // Check trap_id != 0
    s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1  L_FETCH_2ND_TRAP

if SINGLE_STEP_MISSED_WORKAROUND
    // Prioritize single step exception over context save.
    // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
    s_getreg_b32    ttmp2, hwreg(HW_REG_MODE)
    s_and_b32       ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
    s_cbranch_scc1  L_FETCH_2ND_TRAP
end

    s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
    s_cbranch_scc1  L_SAVE

L_FETCH_2ND_TRAP:
    // Preserve and clear scalar XNACK state before issuing scalar reads.
    save_and_clear_ib_sts(ttmp14)

    // Read second-level TBA/TMA from first-level TMA and jump if available.
    // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
    // ttmp12 holds SQ_WAVE_STATUS
    s_getreg_b32    ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
    s_getreg_b32    ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
    s_lshl_b64      [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
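    // TMA_LO/HI hold a 256-byte-aligned address (bits [47:8]); the shift
    // left by 8 reconstructs the byte address used by the loads below.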

    s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag
    s_waitcnt       lgkmcnt(0)
    s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
    s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
    s_or_b32        ttmp11, ttmp11, ttmp2

    s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
    s_waitcnt       lgkmcnt(0)
    s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
    s_waitcnt       lgkmcnt(0)

    s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
    s_cbranch_scc0  L_NO_NEXT_TRAP // second-level trap handler has not been set
    s_setpc_b64     [ttmp2, ttmp3] // jump to second-level trap handler

L_NO_NEXT_TRAP:
    // If not caused by trap then halt wave to prevent re-entry.
    s_and_b32       ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
    s_cbranch_scc1  L_TRAP_CASE
    s_or_b32        s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

    // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
    // Rewind the PC to prevent this from occurring.
    s_sub_u32       ttmp0, ttmp0, 0x8
    s_subb_u32      ttmp1, ttmp1, 0x0

    s_branch        L_EXIT_TRAP

L_TRAP_CASE:
    // Host trap will not cause trap re-entry.
    s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
    s_cbranch_scc1  L_EXIT_TRAP

    // Advance past trap instruction to prevent re-entry.
    s_add_u32       ttmp0, ttmp0, 0x4
    s_addc_u32      ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
    s_and_b32   ttmp1, ttmp1, 0xFFFF

    restore_ib_sts(ttmp14)

    // Restore SQ_WAVE_STATUS.
    s_and_b64       exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64       vcc, vcc, vcc    // Restore STATUS.VCCZ, not writable by s_setreg_b32
    set_status_without_spi_prio(s_save_status, ttmp2)

    s_rfe_b64       [ttmp0, ttmp1]

    // *********    End handling of non-CWSR traps   *******************

/**************************************************************************/
/*          save routine                      */
/**************************************************************************/

L_SAVE:
    s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]

    s_mov_b32       s_save_tmp, 0                               //clear saveCtx bit
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit

    save_and_clear_ib_sts(s_save_tmp)

    /*      inform SPI of our readiness and wait for SPI's go signal */
    s_mov_b32       s_save_exec_lo, exec_lo                         //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32       s_save_exec_hi, exec_hi
    s_mov_b64       exec,   0x0                                 //clear EXEC to get ready to receive

    s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC

    // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
    s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp

  L_SLEEP:
    s_sleep 0x2            // sleep 1 (64 clk) is not enough for 8 waves per SIMD and can hang SQ: the 7th/8th waves never win arbitration to execute an instruction while the other waves sit in this sleep loop waiting for wrexec != 0

    s_cbranch_execz L_SLEEP

    // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
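    // Within that HWREG area, ttmp4-ttmp7 land at +0x50, ttmp8-ttmp11 at +0x60
    // and ttmp13 at +0x74, matching the store offsets below.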
    get_vgpr_size_bytes(s_save_ttmps_lo)
    get_sgpr_size_bytes(s_save_ttmps_hi)
    s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
    s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
    s_addc_u32      s_save_ttmps_hi, s_save_spi_init_hi, 0x0
    s_and_b32       s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
    s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
    ack_sqc_store_workaround()
    s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
    ack_sqc_store_workaround()
    s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
    ack_sqc_store_workaround()

    /*      setup Resource Constants    */
    s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                          //base_addr_lo
    s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                      //base_addr_hi
    s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_save_buf_rsrc2,   0                                   //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
    s_mov_b32       s_save_buf_rsrc3,   S_SAVE_BUF_RSRC_WORD3_MISC

    //FIXME  right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi  (might need to save them before using them?)
    s_mov_b32       s_save_m0,      m0                                  //save M0

    /*      global mem offset       */
    s_mov_b32       s_save_mem_offset,  0x0                                 //mem offset initial value = 0




    /*      save HW registers   */
    //////////////////////////////

  L_SAVE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SGPR)
       get_vgpr_size_bytes(s_save_mem_offset)
       get_sgpr_size_bytes(s_save_tmp)
       s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp


    s_mov_b32       s_save_buf_rsrc2, 0x4               //NUM_RECORDS   in bytes
    s_mov_b32   s_save_buf_rsrc2,  0x1000000                    //NUM_RECORDS in bytes


    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)          //M0
    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)           //PC
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)     //EXEC
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)      //STATUS

    //s_save_trapsts conflicts with s_save_alloc_size
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)     //TRAPSTS

    write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)      //XNACK_MASK_LO
    write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)      //XNACK_MASK_HI

    //using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2
    s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                           //MODE
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)



    /*      the first wave in the threadgroup    */
    s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first-wave bit
    s_mov_b32        s_save_exec_hi, 0x0
    s_or_b32         s_save_exec_hi, s_save_tmp, s_save_exec_hi              // save first-wave bit to s_save_exec_hi.bits[26]


    /*      save SGPRs  */
    // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)       //sgpr_size
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4             //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
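    // e.g. sgpr_size = 5 -> (5+1)*16 = 96 SGPRs -> 96*4 = 384 bytes of save space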

    s_lshl_b32  s_save_buf_rsrc2,   s_save_alloc_size, 2            //NUM_RECORDS in bytes

    s_mov_b32   s_save_buf_rsrc2,  0x1000000                    //NUM_RECORDS in bytes


    // back up s_save_buf_rsrc0/1 to s_save_xnack_mask_lo/hi, since the write_16sgpr_to_mem function changes rsrc0
    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

    s_mov_b32       m0, 0x0             //SGPR initial index value =0
    s_nop       0x0                 //Manually inserted wait states
  L_SAVE_SGPR_LOOP:
    // SGPR is allocated in 16 SGPR granularity
    s_movrels_b64   s0, s0     //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64   s2, s2     //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64   s4, s4     //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64   s6, s6     //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64   s8, s8     //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64   s10, s10   //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64   s12, s12   //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64   s14, s14   //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
    s_add_u32       m0, m0, 16                              //next sgpr index
    s_cmp_lt_u32    m0, s_save_alloc_size                       //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_SGPR_LOOP                    //SGPR save is complete?
    // restore s_save_buf_rsrc0,1
    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo




    /*      save the first 4 VGPRs, so the LDS save can then use them   */
    // each wave allocates at least 4 VGPRs...
    /////////////////////////////////////////////////////////////////////////////////////

    s_mov_b32       s_save_mem_offset, 0
    s_mov_b32       exec_lo, 0xFFFFFFFF                         //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF
    s_mov_b32       xnack_mask_lo, 0x0
    s_mov_b32       xnack_mask_hi, 0x0

    s_mov_b32   s_save_buf_rsrc2,  0x1000000                    //NUM_RECORDS in bytes


    // VGPR Allocated in 4-GPR granularity

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP

    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
    s_branch L_SAVE_LDS

L_SAVE_FIRST_VGPRS_WITH_TCP:
end

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)

    /*      save LDS    */
    //////////////////////////////

  L_SAVE_LDS:

    // Change EXEC to all threads...
    s_mov_b32       exec_lo, 0xFFFFFFFF   //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)         //lds_size
    s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF            //lds_size is zero?
    s_cbranch_scc0  L_SAVE_LDS_DONE                                        //no lds used? jump to L_SAVE_LDS_DONE

    s_barrier           //LDS is used? wait for other waves in the same TG
    s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK            //exec is still used here
    s_cbranch_scc0  L_SAVE_LDS_DONE

    // first wave does the LDS save;

    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6             //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2             //LDS size in bytes
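    // e.g. lds_size = 2 -> 2*64 = 128 dwords -> 512 bytes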
    s_mov_b32       s_save_buf_rsrc2,  s_save_alloc_size                //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32  s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


    s_mov_b32   s_save_buf_rsrc2,  0x1000000              //NUM_RECORDS in bytes

    s_mov_b32       m0, 0x0                       //lds_offset initial value = 0


      v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
      v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_LDS_WITH_TCP

    v_lshlrev_b32 v2, 2, v3
L_SAVE_LDS_LOOP_SQC:
    ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
    s_waitcnt lgkmcnt(0)

    write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)

    v_add_u32 v2, 0x200, v2
    v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
    s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC

    s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP:
end

      v_mul_i32_i24 v2, v3, 8   // tid*8
      v_mov_b32 v3, 256*2
      s_mov_b32 m0, 0x10000
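      // M0 supplies the LDS address clamp for DS instructions; 0x10000 permits the full 64KB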
      s_mov_b32 s0, s_save_buf_rsrc3
      s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
      s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000   //DFMT

L_SAVE_LDS_LOOP_VECTOR:
      ds_read_b64 v[0:1], v2    //x = LDS[a], byte address
      s_waitcnt lgkmcnt(0)
      buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1  glc:1  slc:1
//  s_waitcnt vmcnt(0)
//  v_add_u32 v2, vcc[0:1], v2, v3
      v_add_u32 v2, v2, v3
      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

      // restore rsrc3
      s_mov_b32 s_save_buf_rsrc3, s0

L_SAVE_LDS_DONE:

    /*      save VGPRs - the rest of the VGPRs    */
    //////////////////////////////////////////////////////////////////////////////////////
  L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    // TODO rearrange the RSRC words to use swizzle for VGPR save...

    s_mov_b32       s_save_mem_offset, (0+256*4)                    // for the remaining VGPRs (v0-v3 already saved)
    s_mov_b32       exec_lo, 0xFFFFFFFF                         //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    get_num_arch_vgprs(s_save_alloc_size)
    s_mov_b32       s_save_buf_rsrc2,  0x1000000                    //NUM_RECORDS in bytes


    // VGPR store using dw burst
    s_mov_b32         m0, 0x4   //VGPR initial index value = 4 (v0-v3 already saved)
    s_cmp_lt_u32      m0, s_save_alloc_size
    s_cbranch_scc0    L_SAVE_VGPR_END


    s_set_gpr_idx_on    m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
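    // index mode 0x1 makes VSRC0 relative to M0, so the "v_mov_b32 vN, vN"
    // copies in L_SAVE_VGPR_LOOP below read v[N+m0]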
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000            //add 0x1000 since we compare m0 against it later

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_VGPR_LOOP

L_SAVE_VGPR_LOOP_SQC:
    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC

    s_set_gpr_idx_off
    s_branch L_SAVE_VGPR_END
end

  L_SAVE_VGPR_LOOP:
    v_mov_b32       v0, v0      //v0 = v[0+m0]
    v_mov_b32       v1, v1      //v1 = v[1+m0]
    v_mov_b32       v2, v2      //v2 = v[2+m0]
    v_mov_b32       v3, v3      //v3 = v[3+m0]

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32       m0, m0, 4                               //next vgpr index
    s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4             //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32    m0, s_save_alloc_size                       //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_VGPR_LOOP                            //VGPR save is complete?
    s_set_gpr_idx_off

L_SAVE_VGPR_END:

#if ASIC_FAMILY >= CHIP_ARCTURUS
    // Save ACC VGPRs

#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // ACC VGPR count may differ from ARCH VGPR count.
    get_num_acc_vgprs(s_save_alloc_size, s_save_tmp)
    s_and_b32       s_save_alloc_size, s_save_alloc_size, s_save_alloc_size
    s_cbranch_scc0  L_SAVE_ACCVGPR_END
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000            //add 0x1000 since we compare m0 against it later
#endif

    s_mov_b32 m0, 0x0 //VGPR initial index value =0
    s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1

if SAVE_AFTER_XNACK_ERROR
    check_if_tcp_store_ok()
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP

L_SAVE_ACCVGPR_LOOP_SQC:
    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_read v[vgpr], acc[vgpr]  // v[N] = acc[N+m0]
    end

    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC

    s_set_gpr_idx_off
    s_branch L_SAVE_ACCVGPR_END
end

L_SAVE_ACCVGPR_LOOP:
    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_read v[vgpr], acc[vgpr]  // v[N] = acc[N+m0]
    end

    write_4vgprs_to_mem(s_save_buf_rsrc0, s_save_mem_offset)

    s_add_u32 m0, m0, 4
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP
    s_set_gpr_idx_off

L_SAVE_ACCVGPR_END:
#endif

    s_branch    L_END_PGM



/**************************************************************************/
/*          restore routine                   */
/**************************************************************************/

L_RESTORE:
    /*      setup Resource Constants    */
    s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                               //base_addr_lo
    s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                           //base_addr_hi
    s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_restore_buf_rsrc2,    0                                           //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32       s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC

    /*      global mem offset       */
//  s_mov_b32       s_restore_mem_offset, 0x0                   //mem offset initial value = 0

    /*      the first wave in the threadgroup    */
    s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0  L_RESTORE_VGPR

    /*      restore LDS */
    //////////////////////////////
  L_RESTORE_LDS:

    s_mov_b32       exec_lo, 0xFFFFFFFF                             //need every thread from now on   //be consistent with SAVE although can be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)      //lds_size
    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF          //lds_size is zero?
    s_cbranch_scc0  L_RESTORE_VGPR                              //no lds used? jump to L_RESTORE_VGPR
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6               //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2               //LDS size in bytes
    s_mov_b32       s_restore_buf_rsrc2,    s_restore_alloc_size                //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()        //FIXME, Check if offset overflow???


    s_mov_b32   s_restore_buf_rsrc2,  0x1000000                     //NUM_RECORDS in bytes
    s_mov_b32       m0, 0x0                                 //lds_offset initial value = 0

  L_RESTORE_LDS_LOOP:
    buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1            // first 64DW
    buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
    s_add_u32       m0, m0, 256*2                       // 128 DW
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2       //mem offset increased by 128DW
    s_cmp_lt_u32    m0, s_restore_alloc_size                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_LDS_LOOP                              //LDS restore is complete?


    /*      restore VGPRs       */
    //////////////////////////////
  L_RESTORE_VGPR:
    s_mov_b32       exec_lo, 0xFFFFFFFF                             //need every thread from now on   //be consistent with SAVE although can be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF
    s_mov_b32       s_restore_buf_rsrc2,  0x1000000                     //NUM_RECORDS in bytes

    // Restore ARCH VGPRs 4-N, then all ACC VGPRs, then ARCH VGPRs 0-3.
    get_num_arch_vgprs(s_restore_alloc_size)
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000              //add 0x8000 since we compare m0 against it later

    // ARCH VGPRs at offset: 0
    s_mov_b32       s_restore_mem_offset, 0x0
    s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset // restore starts at v4; v0-v3 are restored last from this saved offset
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32       m0, 4               //VGPR initial index value = 4
    s_set_gpr_idx_on    m0, 0x8                                 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
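    // index mode 0x8 makes VDST relative to M0, so the "v_mov_b32 vN, vN"
    // copies below write v[N+m0]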

  L_RESTORE_VGPR_LOOP:
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset)
    v_mov_b32       v0, v0                                  //v[0+m0] = v0
    v_mov_b32       v1, v1
    v_mov_b32       v2, v2
    v_mov_b32       v3, v3
    s_add_u32       m0, m0, 4                                   //next vgpr index
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4               //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32    m0, s_restore_alloc_size                            //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_VGPR_LOOP                             //VGPR restore (except v0-v3) is complete?

#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // ACC VGPR count may differ from ARCH VGPR count.
    get_num_acc_vgprs(s_restore_alloc_size, s_restore_tmp2)
    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, s_restore_alloc_size
    s_cbranch_scc0  L_RESTORE_ACCVGPR_END
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000              //add 0x8000 since we compare m0 against it later
#endif

#if ASIC_FAMILY >= CHIP_ARCTURUS
    // ACC VGPRs at offset: size(ARCH VGPRs)
    s_mov_b32       m0, 0
    s_set_gpr_idx_on    m0, 0x8                                 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8

  L_RESTORE_ACCVGPR_LOOP:
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset)

    for var vgpr = 0; vgpr < 4; ++ vgpr
        v_accvgpr_write acc[vgpr], v[vgpr]
    end

    s_add_u32       m0, m0, 4                                   //next vgpr index
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4               //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32    m0, s_restore_alloc_size                            //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_ACCVGPR_LOOP                          //ACC VGPR restore is complete?
  L_RESTORE_ACCVGPR_END:
#endif

    s_set_gpr_idx_off

    // Restore VGPRs 0-3 last, no longer needed.
    read_4vgprs_from_mem(s_restore_buf_rsrc0, s_restore_mem_offset_save)

    /*      restore SGPRs       */
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4     // restore SGPRs from s[n] down to s[0], in groups of 16
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)            //sgpr_size
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4               //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)

    s_lshl_b32  s_restore_buf_rsrc2,    s_restore_alloc_size, 2             //NUM_RECORDS in bytes
    s_mov_b32   s_restore_buf_rsrc2,  0x1000000                     //NUM_RECORDS in bytes

    s_mov_b32 m0, s_restore_alloc_size

 L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)  //PV: further performance improvement can be made
    s_waitcnt       lgkmcnt(0)                                  //ensure data ready

    s_sub_u32 m0, m0, 16    // Restore from S[n] to S[0]
    s_nop 0 // hazard SALU M0=> S_MOVREL

    s_movreld_b64   s0, s0  //s[0+m0] = s0
    s_movreld_b64   s2, s2
    s_movreld_b64   s4, s4
    s_movreld_b64   s6, s6
    s_movreld_b64   s8, s8
    s_movreld_b64   s10, s10
    s_movreld_b64   s12, s12
    s_movreld_b64   s14, s14

    s_cmp_eq_u32    m0, 0       //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0  L_RESTORE_SGPR_LOOP         //loop until all SGPR groups are restored

    /*      restore HW registers    */
    //////////////////////////////
  L_RESTORE_HWREG:


    // HWREG SR memory offset : size(VGPR)+size(SGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp


    s_mov_b32       s_restore_buf_rsrc2, 0x4                            //NUM_RECORDS   in bytes
    s_mov_b32   s_restore_buf_rsrc2,  0x1000000                     //NUM_RECORDS in bytes

    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)            //M0
    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)     //PC
    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)           //EXEC
    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)            //STATUS
    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)           //TRAPSTS
    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)           //XNACK_MASK_LO
    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)           //XNACK_MASK_HI
    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)      //MODE

    s_waitcnt       lgkmcnt(0)                                              //from now on, it is safe to restore STATUS and IB_STS

    s_mov_b32       m0,     s_restore_m0
    s_mov_b32       exec_lo,    s_restore_exec_lo
    s_mov_b32       exec_hi,    s_restore_exec_hi

    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
    //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
    s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode

    // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
    get_vgpr_size_bytes(s_restore_ttmps_lo)
    get_sgpr_size_bytes(s_restore_ttmps_hi)
    s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
    s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
    s_addc_u32      s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
    s_and_b32       s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
    s_load_dwordx4  [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
    s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
    s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
    s_waitcnt       lgkmcnt(0)

    restore_ib_sts(s_restore_tmp)

    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff  //pc[47:32]    //Do it here in order not to affect STATUS
    s_and_b64    exec, exec, exec  // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64    vcc, vcc, vcc  // Restore STATUS.VCCZ, not writable by s_setreg_b32
    set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which was changed by the previous SALU

    s_barrier                           //barrier to ensure LDS is ready before any other wave in the same TG attempts to access it //FIXME not performance-optimal at this time

    s_rfe_b64 s_restore_pc_lo                   //Return to the main shader program and resume execution


/**************************************************************************/
/*          the END                       */
/**************************************************************************/
L_END_PGM:
    s_endpgm

end


/**************************************************************************/
/*          the helper functions                  */
/**************************************************************************/

//Only for saving hwregs to memory
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
    s_mov_b32 exec_lo, m0           //assuming exec_lo is not needed anymore from this point on
    s_mov_b32 m0, s_mem_offset
    s_buffer_store_dword s, s_rsrc, m0  glc:1
    ack_sqc_store_workaround()
    s_add_u32   s_mem_offset, s_mem_offset, 4
    s_mov_b32   m0, exec_lo
end

// HWREGs are saved before SGPRs, so all HWREGs can be used here.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)

    s_buffer_store_dwordx4 s[0], s_rsrc, 0  glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[4], s_rsrc, 16  glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[8], s_rsrc, 32  glc:1
    ack_sqc_store_workaround()
    s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
    ack_sqc_store_workaround()
    s_add_u32   s_rsrc[0], s_rsrc[0], 4*16
    s_addc_u32  s_rsrc[1], s_rsrc[1], 0x0         // +scc
end


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset     glc:1
    s_add_u32       s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset  glc:1
    s_sub_u32       s_mem_offset, s_mem_offset, 4*16
end

function check_if_tcp_store_ok
    // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
    s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK
    s_cbranch_scc1 L_TCP_STORE_CHECK_DONE

    s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
    s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp

L_TCP_STORE_CHECK_DONE:
end

function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
    buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1  offset:256
    buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1  offset:256*2
    buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1  offset:256*3
end

function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
    buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
    buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
    buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
    buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
    s_waitcnt vmcnt(0)
end

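// SQC fallback used when TCP stores may be dropped (see SAVE_AFTER_XNACK_ERROR):
// copy one VGPR out through s0-s3 with v_readlane, four lanes per scalar store,
// until all 64 lanes (s4 == 0x40) have been written.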
function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset)
    s_mov_b32 s4, 0

L_WRITE_VGPR_LANE_LOOP:
    for var lane = 0; lane < 4; ++ lane
        v_readlane_b32 s[lane], v, s4
        s_add_u32 s4, s4, 1
    end

    s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
    ack_sqc_store_workaround()

    s_add_u32 s_mem_offset, s_mem_offset, 0x10
    s_cmp_eq_u32 s4, 0x40
    s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
end

function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset)
    for var vgpr = 0; vgpr < n_vgprs; ++ vgpr
        write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset)
    end
end

function get_lds_size_bytes(s_lds_size_byte)
    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
    s_getreg_b32   s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)      // lds_size
    s_lshl_b32     s_lds_size_byte, s_lds_size_byte, 8              //LDS size in bytes = lds_size * 64 (dwords, 64DW granularity) * 4 (bytes)
end

function get_vgpr_size_bytes(s_vgpr_size_byte)
    s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
    s_add_u32      s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)   //FIXME for GFX, zero is possible
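    // e.g. vgpr_size = 7 -> (7+1) << 10 = 8192 bytes = 32 VGPRs * 64 lanes * 4 bytes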

#if ASIC_FAMILY >= CHIP_ARCTURUS
    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, 1  // Double size for ACC VGPRs
#endif
end

function get_sgpr_size_bytes(s_sgpr_size_byte)
    s_getreg_b32   s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
    s_add_u32      s_sgpr_size_byte, s_sgpr_size_byte, 1
    s_lshl_b32     s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4   (non-zero value)
end

function get_hwreg_size_bytes
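    // 128 bytes = 32 dword slots: the save path writes 10 hwregs (M0, PC lo/hi,
    // EXEC lo/hi, STATUS, TRAPSTS, XNACK_MASK lo/hi, MODE) at the start, the
    // ttmp save slots occupy +0x50..+0x77, and the remainder is padding.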
    return 128 //HWREG size 128 bytes
end

function get_num_arch_vgprs(s_num_arch_vgprs)
#if ASIC_FAMILY >= CHIP_ALDEBARAN
    // VGPR count includes ACC VGPRs, use ACC VGPR offset for ARCH VGPR count.
    s_getreg_b32    s_num_arch_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SHIFT,SQ_WAVE_GPR_ALLOC_ACCV_OFFSET_SIZE)
#else
    s_getreg_b32    s_num_arch_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
#endif

    // Number of VGPRs = (vgpr_size + 1) * 4
    s_add_u32       s_num_arch_vgprs, s_num_arch_vgprs, 1
    s_lshl_b32      s_num_arch_vgprs, s_num_arch_vgprs, 2
end

#if ASIC_FAMILY >= CHIP_ALDEBARAN
function get_num_acc_vgprs(s_num_acc_vgprs, s_tmp)
    // VGPR count = (GPR_ALLOC.VGPR_SIZE + 1) * 8
    s_getreg_b32    s_num_acc_vgprs, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32       s_num_acc_vgprs, s_num_acc_vgprs, 1
    s_lshl_b32      s_num_acc_vgprs, s_num_acc_vgprs, 3

    // ACC VGPR count = VGPR count - ARCH VGPR count.
    get_num_arch_vgprs(s_tmp)
    s_sub_u32       s_num_acc_vgprs, s_num_acc_vgprs, s_tmp
end
#endif

function ack_sqc_store_workaround
    if ACK_SQC_STORE
        s_waitcnt lgkmcnt(0)
    end
end

function set_status_without_spi_prio(status, tmp)
    // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
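    // SPI_PRIO is STATUS[2:1]: write bits [31:3] and bit [0] separately so [2:1] stays untouched.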
    s_lshr_b32      tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
    s_setreg_b32    hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
    s_nop           0x2 // avoid S_SETREG => S_SETREG hazard
    s_setreg_b32    hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
end

function save_and_clear_ib_sts(tmp)
    // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
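    // RCNT/FIRST_REPLAY sit in IB_STS[20:15] (mask 0x1F8000); shifting left by
    // 26-15 = 11 lands them in ttmp11[31:26].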
    s_getreg_b32    tmp, hwreg(HW_REG_IB_STS)
    s_and_b32       tmp, tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_lshl_b32      tmp, tmp, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
    s_andn2_b32     ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
    s_or_b32        ttmp11, ttmp11, tmp
    s_setreg_imm32_b32 hwreg(HW_REG_IB_STS), 0x0
end

function restore_ib_sts(tmp)
    s_lshr_b32      tmp, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
    s_and_b32       tmp, tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_setreg_b32    hwreg(HW_REG_IB_STS), tmp
end