0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024 #include "kfd_priv.h"
0025 #include "kfd_events.h"
0026 #include "soc15_int.h"
0027 #include "kfd_device_queue_manager.h"
0028 #include "kfd_smi_events.h"
0029
/*
 * Values of the ENCODING field of an SQ interrupt context-ID word.
 * event_interrupt_wq_v9() extracts the field from context_id0 and switches
 * on these values to choose how the rest of the word is decoded.
 */
enum SQ_INTERRUPT_WORD_ENCODING {
	SQ_INTERRUPT_WORD_ENCODING_AUTO  = 0x0,
	SQ_INTERRUPT_WORD_ENCODING_INST  = 0x1,
	SQ_INTERRUPT_WORD_ENCODING_ERROR = 0x2,
};
0035
/*
 * Values of the ERR_TYPE field of the SQ interrupt data word
 * (KFD_SQ_INT_DATA__ERR_TYPE_*), read for error-encoded SQ interrupts.
 * event_interrupt_wq_v9() routes every type other than ILLEGAL_INST and
 * MEMVIOL to the poison-consumption handler.
 */
enum SQ_INTERRUPT_ERROR_TYPE {
	SQ_INTERRUPT_ERROR_TYPE_EDC_FUE      = 0x0,
	SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST = 0x1,
	SQ_INTERRUPT_ERROR_TYPE_MEMVIOL      = 0x2,
	SQ_INTERRUPT_ERROR_TYPE_EDC_FED      = 0x3,
};
0042
0043
/*
 * SQ_INTERRUPT_WORD_AUTO_CTXID: bit fields of context_id0 that
 * event_interrupt_wq_v9() reads through REG_GET_FIELD() when the SQ
 * interrupt encoding is SQ_INTERRUPT_WORD_ENCODING_AUTO.
 * Each field is listed as its __SHIFT followed by its _MASK.
 */
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT		0
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK			0x00000001

#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT			1
#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK				0x00000002

#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT	2
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK	0x00000004

#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT		3
#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK		0x00000008

#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT		4
#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK		0x00000010

#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT		5
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK		0x00000020

#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT		6
#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK		0x00000040

#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT		7
#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK		0x00000080

#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT	8
#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK	0x00000100

/* Two-bit fields in the upper part of the word. */
#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT			24
#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK			0x03000000

#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT			26
#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK			0x0c000000
0067
0068
/*
 * SQ_INTERRUPT_WORD_WAVE_CTXID: bit fields of context_id0 that
 * event_interrupt_wq_v9() reads through REG_GET_FIELD() for the INST and
 * ERROR encodings; the ENCODING field itself is also read via this layout.
 * Each field is listed as its __SHIFT followed by its _MASK.
 */
#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT	0
#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK		0x00000fff

#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT	12
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK	0x00001000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT	13
#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK		0x00002000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT	14
#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK	0x0003c000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT	18
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK	0x000c0000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT	20
#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK	0x00f00000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT	24
#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK	0x03000000

#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT	26
#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK	0x0c000000
0086
/*
 * Assemble the 24-bit SQ interrupt data word from the two IH context IDs:
 *   result bits  0..11 <- ctx0 bits  0..11
 *   result bits 12..15 <- ctx0 bits 28..31
 *   result bits 16..23 <- ctx1 bits  0..7
 *
 * Every use of a parameter is parenthesized so that expression arguments
 * (e.g. "a | b") expand with the intended precedence.  Note that ctx0 is
 * evaluated twice, so arguments must not have side effects.
 */
#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1)			\
	(((ctx0) & 0xfff) |						\
	 (((ctx0) >> 16) & 0xf000) |					\
	 (((ctx1) << 16) & 0xff0000))
0089
/*
 * ERR_TYPE field of the SQ interrupt data word built by
 * KFD_CONTEXT_ID_GET_SQ_INT_DATA(): four bits starting at bit 20.
 * Values are listed in enum SQ_INTERRUPT_ERROR_TYPE.
 */
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT	20
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK		0xF00000
0092
0093 static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev,
0094 uint16_t pasid, uint16_t client_id)
0095 {
0096 int old_poison, ret = -EINVAL;
0097 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
0098
0099 if (!p)
0100 return;
0101
0102
0103 old_poison = atomic_cmpxchg(&p->poison, 0, 1);
0104 kfd_unref_process(p);
0105 if (old_poison)
0106 return;
0107
0108 switch (client_id) {
0109 case SOC15_IH_CLIENTID_SE0SH:
0110 case SOC15_IH_CLIENTID_SE1SH:
0111 case SOC15_IH_CLIENTID_SE2SH:
0112 case SOC15_IH_CLIENTID_SE3SH:
0113 case SOC15_IH_CLIENTID_UTCL2:
0114 ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
0115 break;
0116 case SOC15_IH_CLIENTID_SDMA0:
0117 case SOC15_IH_CLIENTID_SDMA1:
0118 case SOC15_IH_CLIENTID_SDMA2:
0119 case SOC15_IH_CLIENTID_SDMA3:
0120 case SOC15_IH_CLIENTID_SDMA4:
0121 break;
0122 default:
0123 break;
0124 }
0125
0126 kfd_signal_poison_consumed_event(dev, pasid);
0127
0128
0129
0130
0131 if (!ret) {
0132 dev_warn(dev->adev->dev,
0133 "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
0134 client_id);
0135 amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
0136 } else {
0137 dev_warn(dev->adev->dev,
0138 "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
0139 client_id);
0140 amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
0141 }
0142 }
0143
0144 static bool context_id_expected(struct kfd_dev *dev)
0145 {
0146 switch (KFD_GC_VERSION(dev)) {
0147 case IP_VERSION(9, 0, 1):
0148 return dev->mec_fw_version >= 0x817a;
0149 case IP_VERSION(9, 1, 0):
0150 case IP_VERSION(9, 2, 1):
0151 case IP_VERSION(9, 2, 2):
0152 case IP_VERSION(9, 3, 0):
0153 case IP_VERSION(9, 4, 0):
0154 return dev->mec_fw_version >= 0x17a;
0155 default:
0156
0157
0158
0159 return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 1);
0160 }
0161 }
0162
0163 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
0164 const uint32_t *ih_ring_entry,
0165 uint32_t *patched_ihre,
0166 bool *patched_flag)
0167 {
0168 uint16_t source_id, client_id, pasid, vmid;
0169 const uint32_t *data = ih_ring_entry;
0170
0171
0172 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
0173 if (vmid < dev->vm_info.first_vmid_kfd ||
0174 vmid > dev->vm_info.last_vmid_kfd)
0175 return false;
0176
0177 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
0178 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
0179 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
0180
0181
0182 if (client_id != SOC15_IH_CLIENTID_GRBM_CP &&
0183 client_id != SOC15_IH_CLIENTID_SDMA0 &&
0184 client_id != SOC15_IH_CLIENTID_SDMA1 &&
0185 client_id != SOC15_IH_CLIENTID_SDMA2 &&
0186 client_id != SOC15_IH_CLIENTID_SDMA3 &&
0187 client_id != SOC15_IH_CLIENTID_SDMA4 &&
0188 client_id != SOC15_IH_CLIENTID_SDMA5 &&
0189 client_id != SOC15_IH_CLIENTID_SDMA6 &&
0190 client_id != SOC15_IH_CLIENTID_SDMA7 &&
0191 client_id != SOC15_IH_CLIENTID_VMC &&
0192 client_id != SOC15_IH_CLIENTID_VMC1 &&
0193 client_id != SOC15_IH_CLIENTID_UTCL2 &&
0194 client_id != SOC15_IH_CLIENTID_SE0SH &&
0195 client_id != SOC15_IH_CLIENTID_SE1SH &&
0196 client_id != SOC15_IH_CLIENTID_SE2SH &&
0197 client_id != SOC15_IH_CLIENTID_SE3SH)
0198 return false;
0199
0200
0201
0202
0203
0204 if (!pasid && dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
0205 const uint32_t pasid_mask = 0xffff;
0206
0207 *patched_flag = true;
0208 memcpy(patched_ihre, ih_ring_entry,
0209 dev->device_info.ih_ring_entry_size);
0210
0211 pasid = dev->dqm->vmid_pasid[vmid];
0212
0213
0214 patched_ihre[3] = cpu_to_le32((le32_to_cpu(patched_ihre[3])
0215 & ~pasid_mask) | pasid);
0216 }
0217
0218 pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
0219 client_id, source_id, vmid, pasid);
0220 pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
0221 data[0], data[1], data[2], data[3],
0222 data[4], data[5], data[6], data[7]);
0223
0224
0225 if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
0226 return false;
0227
0228
0229
0230
0231
0232
0233
0234 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) {
0235 uint32_t context_id =
0236 SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
0237
0238 if (context_id == 0 && context_id_expected(dev))
0239 return false;
0240 }
0241
0242
0243
0244
0245 return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
0246 source_id == SOC15_INTSRC_SDMA_TRAP ||
0247 source_id == SOC15_INTSRC_SDMA_ECC ||
0248 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
0249 source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
0250 ((client_id == SOC15_IH_CLIENTID_VMC ||
0251 client_id == SOC15_IH_CLIENTID_VMC1 ||
0252 client_id == SOC15_IH_CLIENTID_UTCL2) &&
0253 !amdgpu_no_queue_eviction_on_vm_fault);
0254 }
0255
0256 static void event_interrupt_wq_v9(struct kfd_dev *dev,
0257 const uint32_t *ih_ring_entry)
0258 {
0259 uint16_t source_id, client_id, pasid, vmid;
0260 uint32_t context_id0, context_id1;
0261 uint32_t sq_intr_err, sq_int_data, encoding;
0262
0263 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
0264 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
0265 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
0266 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
0267 context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
0268 context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
0269
0270 if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
0271 client_id == SOC15_IH_CLIENTID_SE0SH ||
0272 client_id == SOC15_IH_CLIENTID_SE1SH ||
0273 client_id == SOC15_IH_CLIENTID_SE2SH ||
0274 client_id == SOC15_IH_CLIENTID_SE3SH) {
0275 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
0276 kfd_signal_event_interrupt(pasid, context_id0, 32);
0277 else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
0278 sq_int_data = KFD_CONTEXT_ID_GET_SQ_INT_DATA(context_id0, context_id1);
0279 encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING);
0280 switch (encoding) {
0281 case SQ_INTERRUPT_WORD_ENCODING_AUTO:
0282 pr_debug(
0283 "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
0284 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID),
0285 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE),
0286 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT),
0287 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL),
0288 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP),
0289 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP),
0290 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW),
0291 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_REG_OVERFLOW),
0292 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW),
0293 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR));
0294 break;
0295 case SQ_INTERRUPT_WORD_ENCODING_INST:
0296 pr_debug("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
0297 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
0298 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
0299 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
0300 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
0301 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
0302 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
0303 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
0304 sq_int_data);
0305 break;
0306 case SQ_INTERRUPT_WORD_ENCODING_ERROR:
0307 sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
0308 pr_warn("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
0309 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
0310 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
0311 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
0312 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
0313 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
0314 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
0315 REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
0316 sq_intr_err);
0317 if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
0318 sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
0319 event_interrupt_poison_consumption_v9(dev, pasid, client_id);
0320 return;
0321 }
0322 break;
0323 default:
0324 break;
0325 }
0326 kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
0327 } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
0328 kfd_signal_hw_exception_event(pasid);
0329 } else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
0330 client_id == SOC15_IH_CLIENTID_SDMA1 ||
0331 client_id == SOC15_IH_CLIENTID_SDMA2 ||
0332 client_id == SOC15_IH_CLIENTID_SDMA3 ||
0333 client_id == SOC15_IH_CLIENTID_SDMA4 ||
0334 client_id == SOC15_IH_CLIENTID_SDMA5 ||
0335 client_id == SOC15_IH_CLIENTID_SDMA6 ||
0336 client_id == SOC15_IH_CLIENTID_SDMA7) {
0337 if (source_id == SOC15_INTSRC_SDMA_TRAP) {
0338 kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
0339 } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
0340 event_interrupt_poison_consumption_v9(dev, pasid, client_id);
0341 return;
0342 }
0343 } else if (client_id == SOC15_IH_CLIENTID_VMC ||
0344 client_id == SOC15_IH_CLIENTID_VMC1 ||
0345 client_id == SOC15_IH_CLIENTID_UTCL2) {
0346 struct kfd_vm_fault_info info = {0};
0347 uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
0348
0349 if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
0350 amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
0351 event_interrupt_poison_consumption_v9(dev, pasid, client_id);
0352 return;
0353 }
0354
0355 info.vmid = vmid;
0356 info.mc_id = client_id;
0357 info.page_addr = ih_ring_entry[4] |
0358 (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
0359 info.prot_valid = ring_id & 0x08;
0360 info.prot_read = ring_id & 0x10;
0361 info.prot_write = ring_id & 0x20;
0362
0363 kfd_smi_event_update_vmfault(dev, pasid);
0364 kfd_dqm_evict_pasid(dev->dqm, pasid);
0365 kfd_signal_vm_fault_event(dev, pasid, &info);
0366 }
0367 }
0368
0369 const struct kfd_event_interrupt_class event_interrupt_class_v9 = {
0370 .interrupt_isr = event_interrupt_isr_v9,
0371 .interrupt_wq = event_interrupt_wq_v9,
0372 };