0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023 #include "kfd_priv.h"
0024 #include "kfd_events.h"
0025 #include "soc15_int.h"
0026 #include "kfd_device_queue_manager.h"
0027 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
0028 #include "kfd_smi_events.h"
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
/*
 * Values of the 2-bit ENCODING field of an SQ interrupt word.
 * event_interrupt_wq_v11() extracts the field from context_id1 and
 * switches on these values to pick the matching decoder.
 */
enum SQ_INTERRUPT_WORD_ENCODING {
	SQ_INTERRUPT_WORD_ENCODING_AUTO  = 0x0,
	SQ_INTERRUPT_WORD_ENCODING_INST  = 0x1,
	SQ_INTERRUPT_WORD_ENCODING_ERROR = 0x2,
};
0064
/*
 * Values of the TYPE field of an "error" encoded SQ interrupt word.
 * In event_interrupt_wq_v11() every type other than ILLEGAL_INST and
 * MEMVIOL is routed to the poison-consumption handler.
 */
enum SQ_INTERRUPT_ERROR_TYPE {
	SQ_INTERRUPT_ERROR_TYPE_EDC_FUE      = 0x0,
	SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST = 0x1,
	SQ_INTERRUPT_ERROR_TYPE_MEMVIOL      = 0x2,
	SQ_INTERRUPT_ERROR_TYPE_EDC_FED      = 0x3,
};
0071
0072
/*
 * SQ_INTERRUPT_WORD_AUTO: field layout of an "auto" encoded SQ interrupt.
 * Each field is described by a <reg>__<FIELD>__SHIFT / <reg>__<FIELD>_MASK
 * pair, kept side by side below; the bit range implied by the mask is
 * noted on the right.  The names follow the pattern the REG_GET_FIELD()
 * callers in this file pass (register prefix + field name).
 */

/* context_id0: one flag per bit */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT		0
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK		0x00000001	/* [0] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT			1
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK				0x00000002	/* [1] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF_FULL__SHIFT	2
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF_FULL_MASK	0x00000004	/* [2] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__REG_TIMESTAMP__SHIFT		3
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__REG_TIMESTAMP_MASK		0x00000008	/* [3] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__CMD_TIMESTAMP__SHIFT		4
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__CMD_TIMESTAMP_MASK		0x00000010	/* [4] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_CMD_OVERFLOW__SHIFT		5
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_CMD_OVERFLOW_MASK		0x00000020	/* [5] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_REG_OVERFLOW__SHIFT		6
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_REG_OVERFLOW_MASK		0x00000040	/* [6] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__IMMED_OVERFLOW__SHIFT		7
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__IMMED_OVERFLOW_MASK		0x00000080	/* [7] */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT	8
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK	0x00000100	/* [8] */

/* context_id1 */
#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT			6
#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK			0x000000c0	/* [7:6] */
0094
0095
/*
 * SQ_INTERRUPT_WORD_WAVE: field layout of an "inst" (wave) encoded SQ
 * interrupt.  Shift and mask of each field are listed together; the bit
 * range implied by the mask is noted on the right.  Bit 24 of context_id0
 * is not covered by any mask defined here.
 */

/* context_id0 */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT	0
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK	0x00ffffff	/* [23:0] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SH_ID__SHIFT	25
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SH_ID_MASK	0x02000000	/* [25] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT	26
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK	0x04000000	/* [26] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT	27
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK	0xf8000000	/* [31:27] */

/* context_id1 */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID__SHIFT	0
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID_MASK	0x00000003	/* [1:0] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT	2
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK	0x0000003c	/* [5:2] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT	6
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK	0x000000c0	/* [7:6] */
0111
0112
/*
 * SQ_INTERRUPT_WORD_ERROR: field layout of an "error" encoded SQ
 * interrupt.  Shift and mask of each field are listed together; the bit
 * range implied by the mask is noted on the right.
 */

/* context_id0 */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL__SHIFT	0
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL_MASK	0x001fffff	/* [20:0] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE__SHIFT	21
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE_MASK	0x01e00000	/* [24:21] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SH_ID__SHIFT	25
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SH_ID_MASK	0x02000000	/* [25] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV__SHIFT	26
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV_MASK	0x04000000	/* [26] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID__SHIFT	27
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID_MASK	0xf8000000	/* [31:27] */

/* context_id1 */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID__SHIFT	0
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID_MASK	0x00000003	/* [1:0] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID__SHIFT	2
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID_MASK	0x0000003c	/* [5:2] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING__SHIFT	6
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING_MASK	0x000000c0	/* [7:6] */
0130
0131
0132
0133
0134
0135
/*
 * Accessors for sub-fields of context_id0.
 *
 *   DOORBELL_ID      bits [9:0]
 *   TRAP_CODE        bits [23:10]
 *   CP_BAD_OP_ECODE  bits [25:10]  (masked with 0x3ffffff, then shifted
 *                                   right by the trap-code shift)
 *
 * None of these accessors is referenced by the code visible in this file.
 */
#define KFD_CTXID0_TRAP_CODE_SHIFT		10
#define KFD_CTXID0_TRAP_CODE_MASK		0xfffc00
#define KFD_CTXID0_CP_BAD_OP_ECODE_MASK		0x3ffffff
#define KFD_CTXID0_DOORBELL_ID_MASK		0x0003ff

#define KFD_CTXID0_TRAP_CODE(ctxid0)				\
	(((ctxid0) & KFD_CTXID0_TRAP_CODE_MASK) >>		\
	 KFD_CTXID0_TRAP_CODE_SHIFT)

#define KFD_CTXID0_CP_BAD_OP_ECODE(ctxid0)			\
	(((ctxid0) & KFD_CTXID0_CP_BAD_OP_ECODE_MASK) >>	\
	 KFD_CTXID0_TRAP_CODE_SHIFT)

#define KFD_CTXID0_DOORBELL_ID(ctxid0)				\
	((ctxid0) & KFD_CTXID0_DOORBELL_ID_MASK)
0149
0150 static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
0151 {
0152 pr_debug(
0153 "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
0154 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
0155 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
0156 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL),
0157 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP),
0158 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP),
0159 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW),
0160 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW),
0161 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW),
0162 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR));
0163 }
0164
0165 static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
0166 {
0167 pr_debug(
0168 "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
0169 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
0170 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID),
0171 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV),
0172 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID),
0173 REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID),
0174 REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID));
0175 }
0176
0177 static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
0178 {
0179 pr_warn(
0180 "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
0181 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
0182 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
0183 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID),
0184 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV),
0185 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID),
0186 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID),
0187 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID));
0188 }
0189
0190 static void event_interrupt_poison_consumption_v11(struct kfd_dev *dev,
0191 uint16_t pasid, uint16_t source_id)
0192 {
0193 int ret = -EINVAL;
0194 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
0195
0196 if (!p)
0197 return;
0198
0199
0200 if (atomic_read(&p->poison)) {
0201 kfd_unref_process(p);
0202 return;
0203 }
0204
0205 atomic_set(&p->poison, 1);
0206 kfd_unref_process(p);
0207
0208 switch (source_id) {
0209 case SOC15_INTSRC_SQ_INTERRUPT_MSG:
0210 if (dev->dqm->ops.reset_queues)
0211 ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
0212 break;
0213 case SOC21_INTSRC_SDMA_ECC:
0214 default:
0215 break;
0216 }
0217
0218 kfd_signal_poison_consumed_event(dev, pasid);
0219
0220
0221
0222 if (!ret)
0223 amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
0224 else
0225 amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
0226 }
0227
0228 static bool event_interrupt_isr_v11(struct kfd_dev *dev,
0229 const uint32_t *ih_ring_entry,
0230 uint32_t *patched_ihre,
0231 bool *patched_flag)
0232 {
0233 uint16_t source_id, client_id, pasid, vmid;
0234 const uint32_t *data = ih_ring_entry;
0235 uint32_t context_id0;
0236
0237 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
0238 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
0239
0240 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
0241 if (
0242 (vmid < dev->vm_info.first_vmid_kfd ||
0243 vmid > dev->vm_info.last_vmid_kfd))
0244 return false;
0245
0246 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
0247 context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
0248
0249 if ((source_id == SOC15_INTSRC_CP_END_OF_PIPE) &&
0250 (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG))
0251 return false;
0252
0253 pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
0254 client_id, source_id, vmid, pasid);
0255 pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
0256 data[0], data[1], data[2], data[3],
0257 data[4], data[5], data[6], data[7]);
0258
0259
0260 if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
0261 return false;
0262
0263
0264
0265
0266 return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
0267 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
0268 source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
0269 source_id == SOC21_INTSRC_SDMA_TRAP ||
0270
0271 (((client_id == SOC21_IH_CLIENTID_VMC) ||
0272 ((client_id == SOC21_IH_CLIENTID_GFX) &&
0273 (source_id == UTCL2_1_0__SRCID__FAULT))) &&
0274 !amdgpu_no_queue_eviction_on_vm_fault);
0275 }
0276
0277 static void event_interrupt_wq_v11(struct kfd_dev *dev,
0278 const uint32_t *ih_ring_entry)
0279 {
0280 uint16_t source_id, client_id, ring_id, pasid, vmid;
0281 uint32_t context_id0, context_id1;
0282 uint8_t sq_int_enc, sq_int_errtype, sq_int_priv;
0283 struct kfd_vm_fault_info info = {0};
0284 struct kfd_hsa_memory_exception_data exception_data;
0285
0286 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
0287 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
0288 ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
0289 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
0290 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
0291 context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
0292 context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
0293
0294
0295 if (client_id == SOC21_IH_CLIENTID_VMC ||
0296 ((client_id == SOC21_IH_CLIENTID_GFX) &&
0297 (source_id == UTCL2_1_0__SRCID__FAULT))) {
0298
0299 info.vmid = vmid;
0300 info.mc_id = client_id;
0301 info.page_addr = ih_ring_entry[4] |
0302 (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
0303 info.prot_valid = ring_id & 0x08;
0304 info.prot_read = ring_id & 0x10;
0305 info.prot_write = ring_id & 0x20;
0306
0307 memset(&exception_data, 0, sizeof(exception_data));
0308 exception_data.gpu_id = dev->id;
0309 exception_data.va = (info.page_addr) << PAGE_SHIFT;
0310 exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
0311 exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
0312 exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
0313 exception_data.failure.imprecise = 0;
0314
0315
0316
0317
0318 kfd_smi_event_update_vmfault(dev, pasid);
0319
0320
0321 } else if (client_id == SOC21_IH_CLIENTID_GRBM_CP ||
0322 client_id == SOC21_IH_CLIENTID_GFX) {
0323
0324
0325 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
0326 kfd_signal_event_interrupt(pasid, context_id0, 32);
0327
0328
0329
0330
0331
0332
0333
0334 else if (source_id == SOC21_INTSRC_SDMA_TRAP)
0335 kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
0336 else if (source_id == SOC21_INTSRC_SDMA_ECC) {
0337 event_interrupt_poison_consumption_v11(dev, pasid, source_id);
0338 return;
0339 }
0340
0341
0342 else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
0343 sq_int_enc = REG_GET_FIELD(context_id1,
0344 SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
0345 switch (sq_int_enc) {
0346 case SQ_INTERRUPT_WORD_ENCODING_AUTO:
0347 print_sq_intr_info_auto(context_id0, context_id1);
0348 break;
0349 case SQ_INTERRUPT_WORD_ENCODING_INST:
0350 print_sq_intr_info_inst(context_id0, context_id1);
0351 sq_int_priv = REG_GET_FIELD(context_id0,
0352 SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
0353
0354
0355
0356
0357
0358 break;
0359 case SQ_INTERRUPT_WORD_ENCODING_ERROR:
0360 print_sq_intr_info_error(context_id0, context_id1);
0361 sq_int_errtype = REG_GET_FIELD(context_id0,
0362 SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE);
0363 if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
0364 sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
0365 event_interrupt_poison_consumption_v11(
0366 dev, pasid, source_id);
0367 return;
0368 }
0369 break;
0370 default:
0371 break;
0372 }
0373 kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
0374 }
0375
0376
0377
0378 }
0379 }
0380
0381 const struct kfd_event_interrupt_class event_interrupt_class_v11 = {
0382 .interrupt_isr = event_interrupt_isr_v11,
0383 .interrupt_wq = event_interrupt_wq_v11,
0384 };