#include <linux/sched/mm.h>
#include <linux/stop_machine.h>
#include <linux/string_helpers.h>

#include "display/intel_display.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "gt/intel_gt_regs.h"

#include "i915_drv.h"
#include "i915_file_private.h"
#include "i915_gpu_error.h"
#include "i915_irq.h"
#include "intel_breadcrumbs.h"
#include "intel_engine_pm.h"
#include "intel_engine_regs.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_mchbar_regs.h"
#include "intel_pci_config.h"
#include "intel_reset.h"

#include "uc/intel_guc.h"

#define RESET_MAX_RETRIES 3

#define RESET_UNDER_STOP_MACHINE 0

static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}

static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}

static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
{
	struct drm_i915_file_private *file_priv = ctx->file_priv;
	unsigned long prev_hang;
	unsigned int score;

	if (IS_ERR_OR_NULL(file_priv))
		return;

	score = 0;
	if (banned)
		score = I915_CLIENT_SCORE_CONTEXT_BAN;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		drm_dbg(&ctx->i915->drm,
			"client %s: gained %u ban score, now %u\n",
			ctx->name, score,
			atomic_read(&file_priv->ban_score));
	}
}

static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	if (intel_context_is_closed(rq->context))
		return true;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Contexts marked as unbannable are never banned */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp of this hang, keeping only the last few */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* Ban the context if it is unrecoverable or hangs too frequently */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned)
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));

	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}

static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}

void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	bool banned = false;

	RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty));
	GEM_BUG_ON(__i915_request_is_complete(rq));

	rcu_read_lock();
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		banned = mark_guilty(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();

	if (banned)
		intel_context_ban(rq->context, rq);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
	int err;

	/* Assert the reset request and wait for the device to acknowledge it */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request and wait for it to be deasserted */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* Disable VCP unit clock gating while the media domain is reset */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		GT_TRACE(gt, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		GT_TRACE(gt, "Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		GT_TRACE(gt, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		GT_TRACE(gt, "Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

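/* Reset the hardware domains (GEN6_GDRST) specified by the caller's mask */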
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int err;

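	/*
	 * GEN6_GDRST is not inside the GT power well, so the raw _fw
	 * accessors are sufficient here; no forcewake or FIFO handling is
	 * needed for this register write.
	 */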
	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(uncore,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		GT_TRACE(gt,
			 "Wait for 0x%08x engines reset failed\n",
			 hw_domain_mask);

	return err;
}

static int __gen6_reset_engines(struct intel_gt *gt,
				intel_engine_mask_t engine_mask,
				unsigned int retry)
{
	struct intel_engine_cs *engine;
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			hw_mask |= engine->reset_domain;
		}
	}

	return gen6_hw_domain_reset(gt, hw_mask);
}

static int gen6_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&gt->uncore->lock, flags);
	ret = __gen6_reset_engines(gt, engine_mask, retry);
	spin_unlock_irqrestore(&gt->uncore->lock, flags);

	return ret;
}

static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
{
	int vecs_id;

	GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);

	vecs_id = _VECS((engine->instance) / 2);

	return engine->gt->engine[vecs_id];
}

struct sfc_lock_data {
	i915_reg_t lock_reg;
	i915_reg_t ack_reg;
	i915_reg_t usage_reg;
	u32 lock_bit;
	u32 ack_bit;
	u32 usage_bit;
	u32 reset_bit;
};

static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
				     struct sfc_lock_data *sfc_lock)
{
	switch (engine->class) {
	default:
		MISSING_CASE(engine->class);
		fallthrough;
	case VIDEO_DECODE_CLASS:
		sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base);
		sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
		sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
		sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);

		break;
	case VIDEO_ENHANCEMENT_CLASS:
		sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base);
		sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base);
		sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base);
		sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);

		break;
	}
}

static int gen11_lock_sfc(struct intel_engine_cs *engine,
			  u32 *reset_mask,
			  u32 *unlock_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	struct sfc_lock_data sfc_lock;
	bool lock_obtained, lock_to_other = false;
	int ret;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		fallthrough;
	case VIDEO_ENHANCEMENT_CLASS:
		get_sfc_forced_lock_data(engine, &sfc_lock);

		break;
	default:
		return 0;
	}

	if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
		struct intel_engine_cs *paired_vecs;

		if (engine->class != VIDEO_DECODE_CLASS ||
		    GRAPHICS_VER(engine->i915) != 12)
			return 0;

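		/*
		 * On gen12 the video decode engine may also be driving the
		 * SFC through its HCP pipeline. If the HCP half holds the
		 * SFC, take the forced lock via the paired VECS engine that
		 * shares the same SFC instead.
		 */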
		if (!(intel_uncore_read_fw(uncore,
					   GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) &
		      GEN12_HCP_SFC_USAGE_BIT))
			return 0;

		paired_vecs = find_sfc_paired_vecs_engine(engine);
		get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
		lock_to_other = true;
		*unlock_mask |= paired_vecs->mask;
	} else {
		*unlock_mask |= engine->mask;
	}

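	/*
	 * Request the forced lock so that the engine stops using the SFC,
	 * then wait for it to acknowledge that it is idle and safe to reset.
	 */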
	rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_lock.ack_reg,
					   sfc_lock.ack_bit,
					   sfc_lock.ack_bit,
					   1000, 0, NULL);

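	/*
	 * Was the SFC released while we were taking the lock? Only include
	 * the SFC in the reset mask if we locked it to this engine and the
	 * lock succeeded, or if we were locking it to the paired engine but
	 * the SFC was released before the lock was obtained. Otherwise only
	 * the engine itself needs resetting and the SFC is left alone.
	 */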
	lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
			 sfc_lock.usage_bit) != 0;
	if (lock_obtained == lock_to_other)
		return 0;

	if (ret) {
		ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
		return ret;
	}

	*reset_mask |= sfc_lock.reset_bit;
	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	struct sfc_lock_data sfc_lock = {};

	if (engine->class != VIDEO_DECODE_CLASS &&
	    engine->class != VIDEO_ENHANCEMENT_CLASS)
		return;

	if (engine->class == VIDEO_DECODE_CLASS &&
	    (BIT(engine->instance) & vdbox_sfc_access) == 0)
		return;

	get_sfc_forced_lock_data(engine, &sfc_lock);

	rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
}

static int __gen11_reset_engines(struct intel_gt *gt,
				 intel_engine_mask_t engine_mask,
				 unsigned int retry)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 reset_mask, unlock_mask = 0;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		reset_mask = GEN11_GRDOM_FULL;
	} else {
		reset_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			reset_mask |= engine->reset_domain;
			ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, reset_mask);

sfc_unlock:
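	/*
	 * Drop the SFC forced locks again. unlock_mask tracks every engine
	 * whose SFC we touched in gen11_lock_sfc(), including the paired
	 * engine case, so unlock based on that mask rather than on whether
	 * the reset itself succeeded.
	 */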
	for_each_engine_masked(engine, gt, unlock_mask, tmp)
		gen11_unlock_sfc(engine);

	return ret;
}

static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
		return -ETIMEDOUT;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
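		/*
		 * A catastrophic error is pending: skip the normal
		 * ready-for-reset handshake and instead wait for the
		 * hardware to clear the error bit before resetting.
		 */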
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* The hardware itself clears the catastrophic error bit */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		drm_err(&engine->i915->drm,
			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			engine->name, request,
			intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&gt->uncore->lock, flags);

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * On a retry, proceed with the reset even if the engine did
		 * not report ready; forcing the reset is preferred over
		 * giving up and wedging the device without trying.
		 */
	}

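	/*
	 * On DG2, when a full soft reset is required, reset all of the
	 * individual engines first and only then issue the full domain
	 * reset. This pass is best effort, so its result is ignored.
	 */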
	if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
		__gen11_reset_engines(gt, gt->info.engine_mask, 0);

	if (GRAPHICS_VER(gt->i915) >= 11)
		ret = __gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = __gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	spin_unlock_irqrestore(&gt->uncore->lock, flags);

	return ret;
}

static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}

typedef int (*reset_func)(struct intel_gt *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	if (is_mock_gt(gt))
		return mock_reset;
	else if (GRAPHICS_VER(i915) >= 8)
		return gen8_reset_engines;
	else if (GRAPHICS_VER(i915) >= 6)
		return gen6_reset_engines;
	else if (GRAPHICS_VER(i915) >= 5)
		return ilk_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (GRAPHICS_VER(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

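	/*
	 * Hold forcewake across the reset so the GT power wells stay awake;
	 * if the hardware were allowed to sleep mid-sequence, the reset
	 * request could be dropped and never acknowledged.
	 */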
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(gt, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(const struct intel_gt *gt)
{
	if (!gt->i915->params.reset)
		return false;

	return intel_get_gpu_reset(gt);
}

bool intel_has_reset_engine(const struct intel_gt *gt)
{
	if (gt->i915->params.reset < 2)
		return false;

	return INTEL_INFO(gt->i915)->has_reset_engine;
}

int intel_reset_guc(struct intel_gt *gt)
{
	u32 guc_domain =
		GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GT_UC(gt->i915));

	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(gt, guc_domain);
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

static void reset_prepare_engine(struct intel_engine_cs *engine)
{
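	/*
	 * Keep forcewake held for the whole reset sequence so the engine
	 * cannot drop into a low power state while its context state is
	 * undefined; otherwise the saved power context could be corrupted
	 * and the engine may fail to restart after the reset.
	 */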
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	/* For GuC submission, disable submission before stopping the engines */
	intel_uc_reset_prepare(&gt->uc);

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	return awake;
}

static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	local_bh_disable();
	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);
	local_bh_enable();

	intel_uc_reset(&gt->uc, ALL_ENGINES);

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}

	intel_uc_reset_finish(&gt->uc);
}

static void nop_submit_request(struct i915_request *request)
{
	RQ_TRACE(request, "-EIO\n");

	request = i915_request_mark_eio(request);
	if (request) {
		i915_request_submit(request);
		intel_engine_signal_breadcrumbs(request->engine);

		i915_request_put(request);
	}
}

static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

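	/*
	 * Stop submission to the hardware and, where the reset does not
	 * clobber the display, quiesce the GPU with a full reset before
	 * cancelling the outstanding requests below.
	 */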
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure the submit_request override is visible to all callers
	 * before marking the GT as wedged; anything submitted after this
	 * point is immediately completed with -EIO by nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	local_bh_disable();
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);
	intel_uc_cancel_requests(&gt->uc);
	local_bh_enable();

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

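	/*
	 * Before unwedging, flush every request that was marked with -EIO:
	 * wait for the last request on each active timeline so that no
	 * in-flight work is still referencing state we are about to reset.
	 */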
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies have already been signalled with
		 * -EIO, but the request may still be waiting on an external
		 * fence, so wait for it to complete before continuing.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping the lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915);
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition;
		 * time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request: switch the engines back to their default
	 * submission so new requests are executed again rather than being
	 * completed immediately with -EIO.
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* order the takeover before clearing the flag */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	err = __intel_gt_reset(gt, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = __intel_gt_reset(gt, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}

static int resume(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}
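/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of all stalled engines with guilty requests
 * @reason: user error message for why we are resetting, or NULL
 *
 * Reset the chip after a hang has been detected. On failure the device is
 * marked as wedged. The caller must already hold I915_RESET_BACKOFF.
 */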
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));

	/* Revoke stale user mmaps of the GGTT before the hardware is reset */
	gt_revoke(gt);

	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);
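	/*
	 * Reinitialise the hardware and reload the engine state so that
	 * submission can resume after the reset.
	 */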
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * If the chip cannot be reset now it is unlikely ever to recover, so
	 * taint the kernel to make the failure obvious (particularly in CI)
	 * and fall through to marking the device as wedged, preventing any
	 * further execution on the GPU.
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

static int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	if (intel_engine_uses_guc(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	ret = intel_gt_reset_engine(engine);
	if (ret) {
		/* If we fail here, we expect to fall back to a full GPU reset */
		ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
		goto out;
	}

	/*
	 * The engine is now stopped and idle; replay its queued requests,
	 * marking the request that caused the hang as guilty.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers have been reset to their default
	 * values; reload the engine state and restart submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}
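/**
 * intel_engine_reset - reset an individual GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for the reset, reported to the user; or NULL
 *
 * Reset a single engine to recover from a hang without disturbing the rest
 * of the GPU. Returns zero on success or a negative error code, in which
 * case the caller is expected to fall back to a full GPU reset.
 */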
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	int err;

	local_bh_disable();
	err = __intel_engine_reset_bh(engine, msg);
	local_bh_enable();

	return err;
}

static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_display_prepare_reset(gt->i915);

		/* Flush everyone currently using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_display_finish_reset(gt->i915);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}
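/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: error message format string
 *
 * Do some basic checking of register state at error time, capture the error
 * state for later inspection, and fire a uevent so userspace knows something
 * bad happened. Per-engine resets are attempted where available before
 * escalating to a full GPU reset.
 */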
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * Keep the device awake for the whole error handling sequence; we
	 * may have been called from a context where the GT is not powered.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try an engine-only reset first, where available, and only fall
	 * back to a full GPU reset for the engines that could not be
	 * recovered individually.
	 */
	if (!intel_uc_uses_guc_submission(&gt->uc) &&
	    intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		local_bh_disable();
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			if (__intel_engine_reset_bh(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
		local_bh_enable();
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out;
	}

	/* Make sure intel_gt_reset_trylock() sees I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/*
	 * Prevent any other reset-engine attempt while the full reset is in
	 * progress. With GuC submission the GuC owns the per-engine resets,
	 * so the per-engine bits are left alone in that case.
	 */
	if (!intel_uc_uses_guc_submission(&gt->uc)) {
		for_each_engine(engine, gt, tmp) {
			while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
						&gt->reset.flags))
				wait_on_bit(&gt->reset.flags,
					    I915_RESET_ENGINE + engine->id,
					    TASK_UNINTERRUPTIBLE);
		}
	}

	intel_gt_reset_global(gt, engine_mask, msg);

	if (!intel_uc_uses_guc_submission(&gt->uc)) {
		for_each_engine(engine, gt, tmp)
			clear_bit_unlock(I915_RESET_ENGINE + engine->id,
					 &gt->reset.flags);
	}
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic();
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	might_lock(&gt->reset.backoff_srcu);
	might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
}

void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
__releases(&gt->reset.backoff_srcu)
{
	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
}

int intel_gt_terminally_wedged(struct intel_gt *gt)
{
	might_sleep();

	if (!intel_gt_is_wedged(gt))
		return 0;

	if (intel_gt_has_unrecoverable_error(gt))
		return -EIO;

	/* Reset still in progress? Maybe we will recover? */
	if (wait_event_interruptible(gt->reset.queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &gt->reset.flags)))
		return -EINTR;

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

void intel_gt_set_wedged_on_init(struct intel_gt *gt)
{
	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
		     I915_WEDGED_ON_INIT);
	intel_gt_set_wedged(gt);
	i915_disable_error_state(gt->i915, -ENODEV);
	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);

	/* Wedged on init is non-recoverable */
	add_taint_for_CI(gt->i915, TAINT_WARN);
}

void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
{
	intel_gt_set_wedged(gt);
	i915_disable_error_state(gt->i915, -ENODEV);
	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
	intel_gt_retire_requests(gt);
}

void intel_gt_init_reset(struct intel_gt *gt)
{
	init_waitqueue_head(&gt->reset.queue);
	mutex_init(&gt->reset.mutex);
	init_srcu_struct(&gt->reset.backoff_srcu);

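	/*
	 * While a reset is in progress we must not re-enter the shrinker,
	 * since forward progress during shrinking may depend on the reset
	 * completing. Teach lockdep about that relationship by marking the
	 * reset mutex as taken within the shrinker context.
	 */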
	i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);

	/* no GPU until we are ready! */
	__set_bit(I915_WEDGED, &gt->reset.flags);
}

void intel_gt_fini_reset(struct intel_gt *gt)
{
	cleanup_srcu_struct(&gt->reset.backoff_srcu);
}

static void intel_wedge_me(struct work_struct *work)
{
	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);

	drm_err(&w->gt->i915->drm,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	intel_gt_set_wedged(w->gt);
}

void __intel_init_wedge(struct intel_wedge_me *w,
			struct intel_gt *gt,
			long timeout,
			const char *name)
{
	w->gt = gt;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __intel_fini_wedge(struct intel_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->gt = NULL;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_reset.c"
#include "selftest_hangcheck.c"
#endif