0001
0002
0003
0004
0005
0006 #include <linux/kthread.h>
0007
0008 #include "gem/i915_gem_context.h"
0009 #include "gem/i915_gem_internal.h"
0010
0011 #include "i915_gem_evict.h"
0012 #include "intel_gt.h"
0013 #include "intel_engine_heartbeat.h"
0014 #include "intel_engine_pm.h"
0015 #include "selftest_engine_heartbeat.h"
0016
0017 #include "i915_selftest.h"
0018 #include "selftests/i915_random.h"
0019 #include "selftests/igt_flush_test.h"
0020 #include "selftests/igt_reset.h"
0021 #include "selftests/igt_atomic.h"
0022 #include "selftests/igt_spinner.h"
0023 #include "selftests/intel_scheduler_helpers.h"
0024
0025 #include "selftests/mock_drm.h"
0026
0027 #include "gem/selftests/mock_context.h"
0028 #include "gem/selftests/igt_gem_utils.h"
0029
0030 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
0031
0032 struct hang {
0033 struct intel_gt *gt;
0034 struct drm_i915_gem_object *hws;
0035 struct drm_i915_gem_object *obj;
0036 struct i915_gem_context *ctx;
0037 u32 *seqno;
0038 u32 *batch;
0039 };
0040
0041 static int hang_init(struct hang *h, struct intel_gt *gt)
0042 {
0043 void *vaddr;
0044 int err;
0045
0046 memset(h, 0, sizeof(*h));
0047 h->gt = gt;
0048
0049 h->ctx = kernel_context(gt->i915, NULL);
0050 if (IS_ERR(h->ctx))
0051 return PTR_ERR(h->ctx);
0052
0053 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
0054
0055 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
0056 if (IS_ERR(h->hws)) {
0057 err = PTR_ERR(h->hws);
0058 goto err_ctx;
0059 }
0060
0061 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
0062 if (IS_ERR(h->obj)) {
0063 err = PTR_ERR(h->obj);
0064 goto err_hws;
0065 }
0066
0067 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
0068 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
0069 if (IS_ERR(vaddr)) {
0070 err = PTR_ERR(vaddr);
0071 goto err_obj;
0072 }
0073 h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
0074
0075 vaddr = i915_gem_object_pin_map_unlocked(h->obj,
0076 i915_coherent_map_type(gt->i915, h->obj, false));
0077 if (IS_ERR(vaddr)) {
0078 err = PTR_ERR(vaddr);
0079 goto err_unpin_hws;
0080 }
0081 h->batch = vaddr;
0082
0083 return 0;
0084
0085 err_unpin_hws:
0086 i915_gem_object_unpin_map(h->hws);
0087 err_obj:
0088 i915_gem_object_put(h->obj);
0089 err_hws:
0090 i915_gem_object_put(h->hws);
0091 err_ctx:
0092 kernel_context_close(h->ctx);
0093 return err;
0094 }
0095
0096 static u64 hws_address(const struct i915_vma *hws,
0097 const struct i915_request *rq)
0098 {
0099 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
0100 }
0101
0102 static int move_to_active(struct i915_vma *vma,
0103 struct i915_request *rq,
0104 unsigned int flags)
0105 {
0106 int err;
0107
0108 i915_vma_lock(vma);
0109 err = i915_request_await_object(rq, vma->obj,
0110 flags & EXEC_OBJECT_WRITE);
0111 if (err == 0)
0112 err = i915_vma_move_to_active(vma, rq, flags);
0113 i915_vma_unlock(vma);
0114
0115 return err;
0116 }
0117
0118 static struct i915_request *
0119 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
0120 {
0121 struct intel_gt *gt = h->gt;
0122 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
0123 struct drm_i915_gem_object *obj;
0124 struct i915_request *rq = NULL;
0125 struct i915_vma *hws, *vma;
0126 unsigned int flags;
0127 void *vaddr;
0128 u32 *batch;
0129 int err;
0130
0131 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
0132 if (IS_ERR(obj)) {
0133 i915_vm_put(vm);
0134 return ERR_CAST(obj);
0135 }
0136
0137 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
0138 if (IS_ERR(vaddr)) {
0139 i915_gem_object_put(obj);
0140 i915_vm_put(vm);
0141 return ERR_CAST(vaddr);
0142 }
0143
0144 i915_gem_object_unpin_map(h->obj);
0145 i915_gem_object_put(h->obj);
0146
0147 h->obj = obj;
0148 h->batch = vaddr;
0149
0150 vma = i915_vma_instance(h->obj, vm, NULL);
0151 if (IS_ERR(vma)) {
0152 i915_vm_put(vm);
0153 return ERR_CAST(vma);
0154 }
0155
0156 hws = i915_vma_instance(h->hws, vm, NULL);
0157 if (IS_ERR(hws)) {
0158 i915_vm_put(vm);
0159 return ERR_CAST(hws);
0160 }
0161
0162 err = i915_vma_pin(vma, 0, 0, PIN_USER);
0163 if (err) {
0164 i915_vm_put(vm);
0165 return ERR_PTR(err);
0166 }
0167
0168 err = i915_vma_pin(hws, 0, 0, PIN_USER);
0169 if (err)
0170 goto unpin_vma;
0171
0172 rq = igt_request_alloc(h->ctx, engine);
0173 if (IS_ERR(rq)) {
0174 err = PTR_ERR(rq);
0175 goto unpin_hws;
0176 }
0177
0178 err = move_to_active(vma, rq, 0);
0179 if (err)
0180 goto cancel_rq;
0181
0182 err = move_to_active(hws, rq, 0);
0183 if (err)
0184 goto cancel_rq;
0185
0186 batch = h->batch;
0187 if (GRAPHICS_VER(gt->i915) >= 8) {
0188 *batch++ = MI_STORE_DWORD_IMM_GEN4;
0189 *batch++ = lower_32_bits(hws_address(hws, rq));
0190 *batch++ = upper_32_bits(hws_address(hws, rq));
0191 *batch++ = rq->fence.seqno;
0192 *batch++ = MI_NOOP;
0193
0194 memset(batch, 0, 1024);
0195 batch += 1024 / sizeof(*batch);
0196
0197 *batch++ = MI_NOOP;
0198 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
0199 *batch++ = lower_32_bits(vma->node.start);
0200 *batch++ = upper_32_bits(vma->node.start);
0201 } else if (GRAPHICS_VER(gt->i915) >= 6) {
0202 *batch++ = MI_STORE_DWORD_IMM_GEN4;
0203 *batch++ = 0;
0204 *batch++ = lower_32_bits(hws_address(hws, rq));
0205 *batch++ = rq->fence.seqno;
0206 *batch++ = MI_NOOP;
0207
0208 memset(batch, 0, 1024);
0209 batch += 1024 / sizeof(*batch);
0210
0211 *batch++ = MI_NOOP;
0212 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
0213 *batch++ = lower_32_bits(vma->node.start);
0214 } else if (GRAPHICS_VER(gt->i915) >= 4) {
0215 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
0216 *batch++ = 0;
0217 *batch++ = lower_32_bits(hws_address(hws, rq));
0218 *batch++ = rq->fence.seqno;
0219 *batch++ = MI_NOOP;
0220
0221 memset(batch, 0, 1024);
0222 batch += 1024 / sizeof(*batch);
0223
0224 *batch++ = MI_NOOP;
0225 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
0226 *batch++ = lower_32_bits(vma->node.start);
0227 } else {
0228 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
0229 *batch++ = lower_32_bits(hws_address(hws, rq));
0230 *batch++ = rq->fence.seqno;
0231 *batch++ = MI_NOOP;
0232
0233 memset(batch, 0, 1024);
0234 batch += 1024 / sizeof(*batch);
0235
0236 *batch++ = MI_NOOP;
0237 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
0238 *batch++ = lower_32_bits(vma->node.start);
0239 }
0240 *batch++ = MI_BATCH_BUFFER_END;
0241 intel_gt_chipset_flush(engine->gt);
0242
0243 if (rq->engine->emit_init_breadcrumb) {
0244 err = rq->engine->emit_init_breadcrumb(rq);
0245 if (err)
0246 goto cancel_rq;
0247 }
0248
0249 flags = 0;
0250 if (GRAPHICS_VER(gt->i915) <= 5)
0251 flags |= I915_DISPATCH_SECURE;
0252
0253 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
0254
0255 cancel_rq:
0256 if (err) {
0257 i915_request_set_error_once(rq, err);
0258 i915_request_add(rq);
0259 }
0260 unpin_hws:
0261 i915_vma_unpin(hws);
0262 unpin_vma:
0263 i915_vma_unpin(vma);
0264 i915_vm_put(vm);
0265 return err ? ERR_PTR(err) : rq;
0266 }
0267
0268 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
0269 {
0270 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
0271 }
0272
0273 static void hang_fini(struct hang *h)
0274 {
0275 *h->batch = MI_BATCH_BUFFER_END;
0276 intel_gt_chipset_flush(h->gt);
0277
0278 i915_gem_object_unpin_map(h->obj);
0279 i915_gem_object_put(h->obj);
0280
0281 i915_gem_object_unpin_map(h->hws);
0282 i915_gem_object_put(h->hws);
0283
0284 kernel_context_close(h->ctx);
0285
0286 igt_flush_test(h->gt->i915);
0287 }
0288
0289 static bool wait_until_running(struct hang *h, struct i915_request *rq)
0290 {
0291 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
0292 rq->fence.seqno),
0293 10) &&
0294 wait_for(i915_seqno_passed(hws_seqno(h, rq),
0295 rq->fence.seqno),
0296 1000));
0297 }
0298
0299 static int igt_hang_sanitycheck(void *arg)
0300 {
0301 struct intel_gt *gt = arg;
0302 struct i915_request *rq;
0303 struct intel_engine_cs *engine;
0304 enum intel_engine_id id;
0305 struct hang h;
0306 int err;
0307
0308
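/* Basic check that we can execute our hanging batch */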
0309
0310 err = hang_init(&h, gt);
0311 if (err)
0312 return err;
0313
0314 for_each_engine(engine, gt, id) {
0315 struct intel_wedge_me w;
0316 long timeout;
0317
0318 if (!intel_engine_can_store_dword(engine))
0319 continue;
0320
0321 rq = hang_create_request(&h, engine);
0322 if (IS_ERR(rq)) {
0323 err = PTR_ERR(rq);
0324 pr_err("Failed to create request for %s, err=%d\n",
0325 engine->name, err);
0326 goto fini;
0327 }
0328
0329 i915_request_get(rq);
0330
0331 *h.batch = MI_BATCH_BUFFER_END;
0332 intel_gt_chipset_flush(engine->gt);
0333
0334 i915_request_add(rq);
0335
0336 timeout = 0;
0337 intel_wedge_on_timeout(&w, gt, HZ / 10 /* safety net on hang detection */)
0338 timeout = i915_request_wait(rq, 0,
0339 MAX_SCHEDULE_TIMEOUT);
0340 if (intel_gt_is_wedged(gt))
0341 timeout = -EIO;
0342
0343 i915_request_put(rq);
0344
0345 if (timeout < 0) {
0346 err = timeout;
0347 pr_err("Wait for request failed on %s, err=%d\n",
0348 engine->name, err);
0349 goto fini;
0350 }
0351 }
0352
0353 fini:
0354 hang_fini(&h);
0355 return err;
0356 }
0357
0358 static bool wait_for_idle(struct intel_engine_cs *engine)
0359 {
0360 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
0361 }
0362
0363 static int igt_reset_nop(void *arg)
0364 {
0365 struct intel_gt *gt = arg;
0366 struct i915_gpu_error *global = &gt->i915->gpu_error;
0367 struct intel_engine_cs *engine;
0368 unsigned int reset_count, count;
0369 enum intel_engine_id id;
0370 IGT_TIMEOUT(end_time);
0371 int err = 0;
0372
0373
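/* Submit trivial requests on every engine, then check that a full GT reset completes and is recorded */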
0374
0375 reset_count = i915_reset_count(global);
0376 count = 0;
0377 do {
0378 for_each_engine(engine, gt, id) {
0379 struct intel_context *ce;
0380 int i;
0381
0382 ce = intel_context_create(engine);
0383 if (IS_ERR(ce)) {
0384 err = PTR_ERR(ce);
0385 pr_err("[%s] Create context failed: %d!\n", engine->name, err);
0386 break;
0387 }
0388
0389 for (i = 0; i < 16; i++) {
0390 struct i915_request *rq;
0391
0392 rq = intel_context_create_request(ce);
0393 if (IS_ERR(rq)) {
0394 err = PTR_ERR(rq);
0395 pr_err("[%s] Create request failed: %d!\n",
0396 engine->name, err);
0397 break;
0398 }
0399
0400 i915_request_add(rq);
0401 }
0402
0403 intel_context_put(ce);
0404 }
0405
0406 igt_global_reset_lock(gt);
0407 intel_gt_reset(gt, ALL_ENGINES, NULL);
0408 igt_global_reset_unlock(gt);
0409
0410 if (intel_gt_is_wedged(gt)) {
0411 pr_err("[%s] GT is wedged!\n", engine->name);
0412 err = -EIO;
0413 break;
0414 }
0415
0416 if (i915_reset_count(global) != reset_count + ++count) {
0417 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
0418 engine->name, i915_reset_count(global), reset_count, count);
0419 err = -EINVAL;
0420 break;
0421 }
0422
0423 err = igt_flush_test(gt->i915);
0424 if (err) {
0425 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
0426 break;
0427 }
0428 } while (time_before(jiffies, end_time));
0429 pr_info("%s: %d resets\n", __func__, count);
0430
0431 if (igt_flush_test(gt->i915)) {
0432 pr_err("Post flush failed: %d!\n", err);
0433 err = -EIO;
0434 }
0435
0436 return err;
0437 }
0438
0439 static int igt_reset_nop_engine(void *arg)
0440 {
0441 struct intel_gt *gt = arg;
0442 struct i915_gpu_error *global = &gt->i915->gpu_error;
0443 struct intel_engine_cs *engine;
0444 enum intel_engine_id id;
0445
0446
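/* As above, but using per-engine resets rather than a full GT reset */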
0447
0448 if (!intel_has_reset_engine(gt))
0449 return 0;
0450
0451 for_each_engine(engine, gt, id) {
0452 unsigned int reset_count, reset_engine_count, count;
0453 struct intel_context *ce;
0454 IGT_TIMEOUT(end_time);
0455 int err;
0456
0457 if (intel_engine_uses_guc(engine)) {
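/*
 * With GuC submission the GuC, not the KMD, performs engine
 * resets when it detects a hang, so a manually triggered
 * engine reset cannot be exercised here.
 */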
0458
0459
0460
0461
0462 continue;
0463 }
0464
0465 ce = intel_context_create(engine);
0466 if (IS_ERR(ce)) {
0467 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
0468 return PTR_ERR(ce);
0469 }
0470
0471 reset_count = i915_reset_count(global);
0472 reset_engine_count = i915_reset_engine_count(global, engine);
0473 count = 0;
0474
0475 st_engine_heartbeat_disable(engine);
0476 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
0477 &gt->reset.flags));
0478 do {
0479 int i;
0480
0481 if (!wait_for_idle(engine)) {
0482 pr_err("%s failed to idle before reset\n",
0483 engine->name);
0484 err = -EIO;
0485 break;
0486 }
0487
0488 for (i = 0; i < 16; i++) {
0489 struct i915_request *rq;
0490
0491 rq = intel_context_create_request(ce);
0492 if (IS_ERR(rq)) {
0493 struct drm_printer p =
0494 drm_info_printer(gt->i915->drm.dev);
0495 intel_engine_dump(engine, &p,
0496 "%s(%s): failed to submit request\n",
0497 __func__,
0498 engine->name);
0499
0500 GEM_TRACE("%s(%s): failed to submit request\n",
0501 __func__,
0502 engine->name);
0503 GEM_TRACE_DUMP();
0504
0505 intel_gt_set_wedged(gt);
0506
0507 err = PTR_ERR(rq);
0508 break;
0509 }
0510
0511 i915_request_add(rq);
0512 }
0513 err = intel_engine_reset(engine, NULL);
0514 if (err) {
0515 pr_err("intel_engine_reset(%s) failed, err:%d\n",
0516 engine->name, err);
0517 break;
0518 }
0519
0520 if (i915_reset_count(global) != reset_count) {
0521 pr_err("Full GPU reset recorded! (engine reset expected)\n");
0522 err = -EINVAL;
0523 break;
0524 }
0525
0526 if (i915_reset_engine_count(global, engine) !=
0527 reset_engine_count + ++count) {
0528 pr_err("%s engine reset not recorded!\n",
0529 engine->name);
0530 err = -EINVAL;
0531 break;
0532 }
0533 } while (time_before(jiffies, end_time));
0534 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
0535 st_engine_heartbeat_enable(engine);
0536
0537 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
0538
0539 intel_context_put(ce);
0540 if (igt_flush_test(gt->i915))
0541 err = -EIO;
0542 if (err)
0543 return err;
0544 }
0545
0546 return 0;
0547 }
0548
0549 static void force_reset_timeout(struct intel_engine_cs *engine)
0550 {
0551 engine->reset_timeout.probability = 999;
0552 atomic_set(&engine->reset_timeout.times, -1);
0553 }
0554
0555 static void cancel_reset_timeout(struct intel_engine_cs *engine)
0556 {
0557 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
0558 }
0559
0560 static int igt_reset_fail_engine(void *arg)
0561 {
0562 struct intel_gt *gt = arg;
0563 struct intel_engine_cs *engine;
0564 enum intel_engine_id id;
0565
0566
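/* Check that we can recover from failed (timed out) engine resets */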
0567
0568 if (!intel_has_reset_engine(gt))
0569 return 0;
0570
0571 for_each_engine(engine, gt, id) {
0572 unsigned int count;
0573 struct intel_context *ce;
0574 IGT_TIMEOUT(end_time);
0575 int err;
0576
0577
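/* The reset-timeout fault injection only applies when i915 performs the reset itself */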
0578 if (intel_engine_uses_guc(engine))
0579 continue;
0580
0581 ce = intel_context_create(engine);
0582 if (IS_ERR(ce)) {
0583 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
0584 return PTR_ERR(ce);
0585 }
0586
0587 st_engine_heartbeat_disable(engine);
0588 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
0589 &gt->reset.flags));
0590
0591 force_reset_timeout(engine);
0592 err = intel_engine_reset(engine, NULL);
0593 cancel_reset_timeout(engine);
0594 if (err == 0)
0595 goto skip;
0596
0597 count = 0;
0598 do {
0599 struct i915_request *last = NULL;
0600 int i;
0601
0602 if (!wait_for_idle(engine)) {
0603 pr_err("%s failed to idle before reset\n",
0604 engine->name);
0605 err = -EIO;
0606 break;
0607 }
0608
0609 for (i = 0; i < count % 15; i++) {
0610 struct i915_request *rq;
0611
0612 rq = intel_context_create_request(ce);
0613 if (IS_ERR(rq)) {
0614 struct drm_printer p =
0615 drm_info_printer(gt->i915->drm.dev);
0616 intel_engine_dump(engine, &p,
0617 "%s(%s): failed to submit request\n",
0618 __func__,
0619 engine->name);
0620
0621 GEM_TRACE("%s(%s): failed to submit request\n",
0622 __func__,
0623 engine->name);
0624 GEM_TRACE_DUMP();
0625
0626 intel_gt_set_wedged(gt);
0627 if (last)
0628 i915_request_put(last);
0629
0630 err = PTR_ERR(rq);
0631 goto out;
0632 }
0633
0634 if (last)
0635 i915_request_put(last);
0636 last = i915_request_get(rq);
0637 i915_request_add(rq);
0638 }
0639
0640 if (count & 1) {
0641 err = intel_engine_reset(engine, NULL);
0642 if (err) {
0643 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
0644 engine->name, err);
0645 GEM_TRACE_DUMP();
0646 i915_request_put(last);
0647 break;
0648 }
0649 } else {
0650 force_reset_timeout(engine);
0651 err = intel_engine_reset(engine, NULL);
0652 cancel_reset_timeout(engine);
0653 if (err != -ETIMEDOUT) {
0654 pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
0655 engine->name, err);
0656 i915_request_put(last);
0657 break;
0658 }
0659 }
0660
0661 err = 0;
0662 if (last) {
0663 if (i915_request_wait(last, 0, HZ / 2) < 0) {
0664 struct drm_printer p =
0665 drm_info_printer(gt->i915->drm.dev);
0666
0667 intel_engine_dump(engine, &p,
0668 "%s(%s): failed to complete request\n",
0669 __func__,
0670 engine->name);
0671
0672 GEM_TRACE("%s(%s): failed to complete request\n",
0673 __func__,
0674 engine->name);
0675 GEM_TRACE_DUMP();
0676
0677 err = -EIO;
0678 }
0679 i915_request_put(last);
0680 }
0681 count++;
0682 } while (err == 0 && time_before(jiffies, end_time));
0683 out:
0684 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
0685 skip:
0686 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
0687 st_engine_heartbeat_enable(engine);
0688 intel_context_put(ce);
0689
0690 if (igt_flush_test(gt->i915))
0691 err = -EIO;
0692 if (err)
0693 return err;
0694 }
0695
0696 return 0;
0697 }
0698
0699 static int __igt_reset_engine(struct intel_gt *gt, bool active)
0700 {
0701 struct i915_gpu_error *global = &gt->i915->gpu_error;
0702 struct intel_engine_cs *engine;
0703 enum intel_engine_id id;
0704 struct hang h;
0705 int err = 0;
0706
0707
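/* Check that we can issue an engine reset while the engine is idle or busy with a hanging request */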
0708
0709 if (!intel_has_reset_engine(gt))
0710 return 0;
0711
0712 if (active) {
0713 err = hang_init(&h, gt);
0714 if (err)
0715 return err;
0716 }
0717
0718 for_each_engine(engine, gt, id) {
0719 unsigned int reset_count, reset_engine_count;
0720 unsigned long count;
0721 bool using_guc = intel_engine_uses_guc(engine);
0722 IGT_TIMEOUT(end_time);
0723
0724 if (using_guc && !active)
0725 continue;
0726
0727 if (active && !intel_engine_can_store_dword(engine))
0728 continue;
0729
0730 if (!wait_for_idle(engine)) {
0731 pr_err("%s failed to idle before reset\n",
0732 engine->name);
0733 err = -EIO;
0734 break;
0735 }
0736
0737 reset_count = i915_reset_count(global);
0738 reset_engine_count = i915_reset_engine_count(global, engine);
0739
0740 st_engine_heartbeat_disable(engine);
0741 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
0742 &gt->reset.flags));
0743 count = 0;
0744 do {
0745 struct i915_request *rq = NULL;
0746 struct intel_selftest_saved_policy saved;
0747 int err2;
0748
0749 err = intel_selftest_modify_policy(engine, &saved,
0750 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
0751 if (err) {
0752 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
0753 break;
0754 }
0755
0756 if (active) {
0757 rq = hang_create_request(&h, engine);
0758 if (IS_ERR(rq)) {
0759 err = PTR_ERR(rq);
0760 pr_err("[%s] Create hang request failed: %d!\n",
0761 engine->name, err);
0762 goto restore;
0763 }
0764
0765 i915_request_get(rq);
0766 i915_request_add(rq);
0767
0768 if (!wait_until_running(&h, rq)) {
0769 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
0770
0771 pr_err("%s: Failed to start request %llx, at %x\n",
0772 __func__, rq->fence.seqno, hws_seqno(&h, rq));
0773 intel_engine_dump(engine, &p,
0774 "%s\n", engine->name);
0775
0776 i915_request_put(rq);
0777 err = -EIO;
0778 goto restore;
0779 }
0780 }
0781
0782 if (!using_guc) {
0783 err = intel_engine_reset(engine, NULL);
0784 if (err) {
0785 pr_err("intel_engine_reset(%s) failed, err:%d\n",
0786 engine->name, err);
0787 goto skip;
0788 }
0789 }
0790
0791 if (rq) {
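/* Ensure the reset happened and terminated the hanging request */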
0792
0793 err = intel_selftest_wait_for_rq(rq);
0794 if (err)
0795 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
0796 engine->name, rq->fence.context,
0797 rq->fence.seqno, rq->context->guc_id.id, err);
0798 }
0799
0800 skip:
0801 if (rq)
0802 i915_request_put(rq);
0803
0804 if (i915_reset_count(global) != reset_count) {
0805 pr_err("Full GPU reset recorded! (engine reset expected)\n");
0806 err = -EINVAL;
0807 goto restore;
0808 }
0809
0810
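/* GuC based resets are not recorded in the per-engine reset count */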
0811 if (!using_guc) {
0812 if (i915_reset_engine_count(global, engine) !=
0813 ++reset_engine_count) {
0814 pr_err("%s engine reset not recorded!\n",
0815 engine->name);
0816 err = -EINVAL;
0817 goto restore;
0818 }
0819 }
0820
0821 count++;
0822
0823 restore:
0824 err2 = intel_selftest_restore_policy(engine, &saved);
0825 if (err2)
0826 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
0827 if (err == 0)
0828 err = err2;
0829 if (err)
0830 break;
0831 } while (time_before(jiffies, end_time));
0832 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
0833 st_engine_heartbeat_enable(engine);
0834 pr_info("%s: Completed %lu %s resets\n",
0835 engine->name, count, active ? "active" : "idle");
0836
0837 if (err)
0838 break;
0839
0840 err = igt_flush_test(gt->i915);
0841 if (err) {
0842 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
0843 break;
0844 }
0845 }
0846
0847 if (intel_gt_is_wedged(gt)) {
0848 pr_err("GT is wedged!\n");
0849 err = -EIO;
0850 }
0851
0852 if (active)
0853 hang_fini(&h);
0854
0855 return err;
0856 }
0857
0858 static int igt_reset_idle_engine(void *arg)
0859 {
0860 return __igt_reset_engine(arg, false);
0861 }
0862
0863 static int igt_reset_active_engine(void *arg)
0864 {
0865 return __igt_reset_engine(arg, true);
0866 }
0867
0868 struct active_engine {
0869 struct task_struct *task;
0870 struct intel_engine_cs *engine;
0871 unsigned long resets;
0872 unsigned int flags;
0873 };
0874
0875 #define TEST_ACTIVE BIT(0)
0876 #define TEST_OTHERS BIT(1)
0877 #define TEST_SELF BIT(2)
0878 #define TEST_PRIORITY BIT(3)
0879
0880 static int active_request_put(struct i915_request *rq)
0881 {
0882 int err = 0;
0883
0884 if (!rq)
0885 return 0;
0886
0887 if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
0888 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
0889 rq->engine->name,
0890 rq->fence.context,
0891 rq->fence.seqno);
0892 GEM_TRACE_DUMP();
0893
0894 intel_gt_set_wedged(rq->engine->gt);
0895 err = -EIO;
0896 }
0897
0898 i915_request_put(rq);
0899
0900 return err;
0901 }
0902
0903 static int active_engine(void *data)
0904 {
0905 I915_RND_STATE(prng);
0906 struct active_engine *arg = data;
0907 struct intel_engine_cs *engine = arg->engine;
0908 struct i915_request *rq[8] = {};
0909 struct intel_context *ce[ARRAY_SIZE(rq)];
0910 unsigned long count;
0911 int err = 0;
0912
0913 for (count = 0; count < ARRAY_SIZE(ce); count++) {
0914 ce[count] = intel_context_create(engine);
0915 if (IS_ERR(ce[count])) {
0916 err = PTR_ERR(ce[count]);
0917 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
0918 while (--count)
0919 intel_context_put(ce[count]);
0920 return err;
0921 }
0922 }
0923
0924 count = 0;
0925 while (!kthread_should_stop()) {
0926 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
0927 struct i915_request *old = rq[idx];
0928 struct i915_request *new;
0929
0930 new = intel_context_create_request(ce[idx]);
0931 if (IS_ERR(new)) {
0932 err = PTR_ERR(new);
0933 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
0934 break;
0935 }
0936
0937 rq[idx] = i915_request_get(new);
0938 i915_request_add(new);
0939
0940 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
0941 struct i915_sched_attr attr = {
0942 .priority =
0943 i915_prandom_u32_max_state(512, &prng),
0944 };
0945 engine->sched_engine->schedule(rq[idx], &attr);
0946 }
0947
0948 err = active_request_put(old);
0949 if (err) {
0950 pr_err("[%s] Request put failed: %d!\n", engine->name, err);
0951 break;
0952 }
0953
0954 cond_resched();
0955 }
0956
0957 for (count = 0; count < ARRAY_SIZE(rq); count++) {
0958 int err__ = active_request_put(rq[count]);
0959
0960 if (err__)
0961 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
0962
0963
0964 if (!err)
0965 err = err__;
0966
0967 intel_context_put(ce[count]);
0968 }
0969
0970 return err;
0971 }
0972
0973 static int __igt_reset_engines(struct intel_gt *gt,
0974 const char *test_name,
0975 unsigned int flags)
0976 {
0977 struct i915_gpu_error *global = &gt->i915->gpu_error;
0978 struct intel_engine_cs *engine, *other;
0979 struct active_engine *threads;
0980 enum intel_engine_id id, tmp;
0981 struct hang h;
0982 int err = 0;
0983
0984
0985
0986
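/*
 * Check that resetting one engine does not disrupt the
 * requests being executed concurrently on the other engines.
 */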
0987
0988 if (!intel_has_reset_engine(gt))
0989 return 0;
0990
0991 if (flags & TEST_ACTIVE) {
0992 err = hang_init(&h, gt);
0993 if (err)
0994 return err;
0995
0996 if (flags & TEST_PRIORITY)
0997 h.ctx->sched.priority = 1024;
0998 }
0999
1000 threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
1001 if (!threads)
1002 return -ENOMEM;
1003
1004 for_each_engine(engine, gt, id) {
1005 unsigned long device = i915_reset_count(global);
1006 unsigned long count = 0, reported;
1007 bool using_guc = intel_engine_uses_guc(engine);
1008 IGT_TIMEOUT(end_time);
1009
1010 if (flags & TEST_ACTIVE) {
1011 if (!intel_engine_can_store_dword(engine))
1012 continue;
1013 } else if (using_guc)
1014 continue;
1015
1016 if (!wait_for_idle(engine)) {
1017 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
1018 engine->name, test_name);
1019 err = -EIO;
1020 break;
1021 }
1022
1023 memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
1024 for_each_engine(other, gt, tmp) {
1025 struct task_struct *tsk;
1026
1027 threads[tmp].resets =
1028 i915_reset_engine_count(global, other);
1029
1030 if (other == engine && !(flags & TEST_SELF))
1031 continue;
1032
1033 if (other != engine && !(flags & TEST_OTHERS))
1034 continue;
1035
1036 threads[tmp].engine = other;
1037 threads[tmp].flags = flags;
1038
1039 tsk = kthread_run(active_engine, &threads[tmp],
1040 "igt/%s", other->name);
1041 if (IS_ERR(tsk)) {
1042 err = PTR_ERR(tsk);
1043 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1044 goto unwind;
1045 }
1046
1047 threads[tmp].task = tsk;
1048 get_task_struct(tsk);
1049 }
1050
1051 yield();
1052
1053 st_engine_heartbeat_disable_no_pm(engine);
1054 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
1055 &gt->reset.flags));
1056 do {
1057 struct i915_request *rq = NULL;
1058 struct intel_selftest_saved_policy saved;
1059 int err2;
1060
1061 err = intel_selftest_modify_policy(engine, &saved,
1062 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
1063 if (err) {
1064 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1065 break;
1066 }
1067
1068 if (flags & TEST_ACTIVE) {
1069 rq = hang_create_request(&h, engine);
1070 if (IS_ERR(rq)) {
1071 err = PTR_ERR(rq);
1072 pr_err("[%s] Create hang request failed: %d!\n",
1073 engine->name, err);
1074 goto restore;
1075 }
1076
1077 i915_request_get(rq);
1078 i915_request_add(rq);
1079
1080 if (!wait_until_running(&h, rq)) {
1081 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1082
1083 pr_err("%s: Failed to start request %llx, at %x\n",
1084 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1085 intel_engine_dump(engine, &p,
1086 "%s\n", engine->name);
1087
1088 i915_request_put(rq);
1089 err = -EIO;
1090 goto restore;
1091 }
1092 } else {
1093 intel_engine_pm_get(engine);
1094 }
1095
1096 if (!using_guc) {
1097 err = intel_engine_reset(engine, NULL);
1098 if (err) {
1099 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1100 engine->name, test_name, err);
1101 goto restore;
1102 }
1103 }
1104
1105 if (rq) {
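/* Ensure the reset happened and terminated the hanging request */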
1106
1107 err = intel_selftest_wait_for_rq(rq);
1108 if (err)
1109 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
1110 engine->name, rq->fence.context,
1111 rq->fence.seqno, rq->context->guc_id.id, err);
1112 }
1113
1114 count++;
1115
1116 if (rq) {
1117 if (rq->fence.error != -EIO) {
1118 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1119 engine->name, test_name,
1120 rq->fence.context,
1121 rq->fence.seqno, rq->context->guc_id.id);
1122 i915_request_put(rq);
1123
1124 GEM_TRACE_DUMP();
1125 intel_gt_set_wedged(gt);
1126 err = -EIO;
1127 goto restore;
1128 }
1129
1130 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1131 struct drm_printer p =
1132 drm_info_printer(gt->i915->drm.dev);
1133
1134 pr_err("i915_reset_engine(%s:%s):"
1135 " failed to complete request %llx:%lld after reset\n",
1136 engine->name, test_name,
1137 rq->fence.context,
1138 rq->fence.seqno);
1139 intel_engine_dump(engine, &p,
1140 "%s\n", engine->name);
1141 i915_request_put(rq);
1142
1143 GEM_TRACE_DUMP();
1144 intel_gt_set_wedged(gt);
1145 err = -EIO;
1146 goto restore;
1147 }
1148
1149 i915_request_put(rq);
1150 }
1151
1152 if (!(flags & TEST_ACTIVE))
1153 intel_engine_pm_put(engine);
1154
1155 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1156 struct drm_printer p =
1157 drm_info_printer(gt->i915->drm.dev);
1158
1159 pr_err("i915_reset_engine(%s:%s):"
1160 " failed to idle after reset\n",
1161 engine->name, test_name);
1162 intel_engine_dump(engine, &p,
1163 "%s\n", engine->name);
1164
1165 err = -EIO;
1166 goto restore;
1167 }
1168
1169 restore:
1170 err2 = intel_selftest_restore_policy(engine, &saved);
1171 if (err2)
1172 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
1173 if (err == 0)
1174 err = err2;
1175 if (err)
1176 break;
1177 } while (time_before(jiffies, end_time));
1178 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1179 st_engine_heartbeat_enable_no_pm(engine);
1180
1181 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1182 engine->name, test_name, count);
1183
1184
1185 if (!using_guc) {
1186 reported = i915_reset_engine_count(global, engine);
1187 reported -= threads[engine->id].resets;
1188 if (reported != count) {
1189 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1190 engine->name, test_name, count, reported);
1191 if (!err)
1192 err = -EINVAL;
1193 }
1194 }
1195
1196 unwind:
1197 for_each_engine(other, gt, tmp) {
1198 int ret;
1199
1200 if (!threads[tmp].task)
1201 continue;
1202
1203 ret = kthread_stop(threads[tmp].task);
1204 if (ret) {
1205 pr_err("kthread for other engine %s failed, err=%d\n",
1206 other->name, ret);
1207 if (!err)
1208 err = ret;
1209 }
1210 put_task_struct(threads[tmp].task);
1211
1212
1213 if (!using_guc) {
1214 if (other->uabi_class != engine->uabi_class &&
1215 threads[tmp].resets !=
1216 i915_reset_engine_count(global, other)) {
1217 pr_err("Innocent engine %s was reset (count=%ld)\n",
1218 other->name,
1219 i915_reset_engine_count(global, other) -
1220 threads[tmp].resets);
1221 if (!err)
1222 err = -EINVAL;
1223 }
1224 }
1225 }
1226
1227 if (device != i915_reset_count(global)) {
1228 pr_err("Global reset (count=%ld)!\n",
1229 i915_reset_count(global) - device);
1230 if (!err)
1231 err = -EINVAL;
1232 }
1233
1234 if (err)
1235 break;
1236
1237 err = igt_flush_test(gt->i915);
1238 if (err) {
1239 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1240 break;
1241 }
1242 }
1243 kfree(threads);
1244
1245 if (intel_gt_is_wedged(gt))
1246 err = -EIO;
1247
1248 if (flags & TEST_ACTIVE)
1249 hang_fini(&h);
1250
1251 return err;
1252 }
1253
1254 static int igt_reset_engines(void *arg)
1255 {
1256 static const struct {
1257 const char *name;
1258 unsigned int flags;
1259 } phases[] = {
1260 { "idle", 0 },
1261 { "active", TEST_ACTIVE },
1262 { "others-idle", TEST_OTHERS },
1263 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1264 {
1265 "others-priority",
1266 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1267 },
1268 {
1269 "self-priority",
1270 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1271 },
1272 { }
1273 };
1274 struct intel_gt *gt = arg;
1275 typeof(*phases) *p;
1276 int err;
1277
1278 for (p = phases; p->name; p++) {
1279 if (p->flags & TEST_PRIORITY) {
1280 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1281 continue;
1282 }
1283
1284 err = __igt_reset_engines(arg, p->name, p->flags);
1285 if (err)
1286 return err;
1287 }
1288
1289 return 0;
1290 }
1291
1292 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1293 {
1294 u32 count = i915_reset_count(&gt->i915->gpu_error);
1295
1296 intel_gt_reset(gt, mask, NULL);
1297
1298 return count;
1299 }
1300
1301 static int igt_reset_wait(void *arg)
1302 {
1303 struct intel_gt *gt = arg;
1304 struct i915_gpu_error *global = &gt->i915->gpu_error;
1305 struct intel_engine_cs *engine = gt->engine[RCS0];
1306 struct i915_request *rq;
1307 unsigned int reset_count;
1308 struct hang h;
1309 long timeout;
1310 int err;
1311
1312 if (!engine || !intel_engine_can_store_dword(engine))
1313 return 0;
1314
1315
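/* Check that a waiter on a hung request is woken by a GPU reset */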
1316
1317 igt_global_reset_lock(gt);
1318
1319 err = hang_init(&h, gt);
1320 if (err) {
1321 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1322 goto unlock;
1323 }
1324
1325 rq = hang_create_request(&h, engine);
1326 if (IS_ERR(rq)) {
1327 err = PTR_ERR(rq);
1328 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1329 goto fini;
1330 }
1331
1332 i915_request_get(rq);
1333 i915_request_add(rq);
1334
1335 if (!wait_until_running(&h, rq)) {
1336 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1337
1338 pr_err("%s: Failed to start request %llx, at %x\n",
1339 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1340 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1341
1342 intel_gt_set_wedged(gt);
1343
1344 err = -EIO;
1345 goto out_rq;
1346 }
1347
1348 reset_count = fake_hangcheck(gt, ALL_ENGINES);
1349
1350 timeout = i915_request_wait(rq, 0, 10);
1351 if (timeout < 0) {
1352 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1353 timeout);
1354 err = timeout;
1355 goto out_rq;
1356 }
1357
1358 if (i915_reset_count(global) == reset_count) {
1359 pr_err("No GPU reset recorded!\n");
1360 err = -EINVAL;
1361 goto out_rq;
1362 }
1363
1364 out_rq:
1365 i915_request_put(rq);
1366 fini:
1367 hang_fini(&h);
1368 unlock:
1369 igt_global_reset_unlock(gt);
1370
1371 if (intel_gt_is_wedged(gt))
1372 return -EIO;
1373
1374 return err;
1375 }
1376
1377 struct evict_vma {
1378 struct completion completion;
1379 struct i915_vma *vma;
1380 };
1381
1382 static int evict_vma(void *data)
1383 {
1384 struct evict_vma *arg = data;
1385 struct i915_address_space *vm = arg->vma->vm;
1386 struct drm_mm_node evict = arg->vma->node;
1387 int err;
1388
1389 complete(&arg->completion);
1390
1391 mutex_lock(&vm->mutex);
1392 err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1393 mutex_unlock(&vm->mutex);
1394
1395 return err;
1396 }
1397
1398 static int evict_fence(void *data)
1399 {
1400 struct evict_vma *arg = data;
1401 int err;
1402
1403 complete(&arg->completion);
1404
1405
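/* Mark the fence register as dirty to force an mmio update. */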
1406 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1407 if (err) {
1408 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1409 return err;
1410 }
1411
1412 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1413 if (err) {
1414 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1415 return err;
1416 }
1417
1418 err = i915_vma_pin_fence(arg->vma);
1419 i915_vma_unpin(arg->vma);
1420 if (err) {
1421 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1422 return err;
1423 }
1424
1425 i915_vma_unpin_fence(arg->vma);
1426
1427 return 0;
1428 }
1429
1430 static int __igt_reset_evict_vma(struct intel_gt *gt,
1431 struct i915_address_space *vm,
1432 int (*fn)(void *),
1433 unsigned int flags)
1434 {
1435 struct intel_engine_cs *engine = gt->engine[RCS0];
1436 struct drm_i915_gem_object *obj;
1437 struct task_struct *tsk = NULL;
1438 struct i915_request *rq;
1439 struct evict_vma arg;
1440 struct hang h;
1441 unsigned int pin_flags;
1442 int err;
1443
1444 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1445 return 0;
1446
1447 if (!engine || !intel_engine_can_store_dword(engine))
1448 return 0;
1449
1450
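/* Check that we can recover an eviction stuck behind a hanging request by resetting the GPU */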
1451
1452 err = hang_init(&h, gt);
1453 if (err) {
1454 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1455 return err;
1456 }
1457
1458 obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1459 if (IS_ERR(obj)) {
1460 err = PTR_ERR(obj);
1461 pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1462 goto fini;
1463 }
1464
1465 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1466 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1467 if (err) {
1468 pr_err("Invalid X-tiling settings; err:%d\n", err);
1469 goto out_obj;
1470 }
1471 }
1472
1473 arg.vma = i915_vma_instance(obj, vm, NULL);
1474 if (IS_ERR(arg.vma)) {
1475 err = PTR_ERR(arg.vma);
1476 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1477 goto out_obj;
1478 }
1479
1480 rq = hang_create_request(&h, engine);
1481 if (IS_ERR(rq)) {
1482 err = PTR_ERR(rq);
1483 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1484 goto out_obj;
1485 }
1486
1487 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1488
1489 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1490 pin_flags |= PIN_MAPPABLE;
1491
1492 err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1493 if (err) {
1494 i915_request_add(rq);
1495 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1496 goto out_obj;
1497 }
1498
1499 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1500 err = i915_vma_pin_fence(arg.vma);
1501 if (err) {
1502 pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1503 i915_vma_unpin(arg.vma);
1504 i915_request_add(rq);
1505 goto out_obj;
1506 }
1507 }
1508
1509 i915_vma_lock(arg.vma);
1510 err = i915_request_await_object(rq, arg.vma->obj,
1511 flags & EXEC_OBJECT_WRITE);
1512 if (err == 0) {
1513 err = i915_vma_move_to_active(arg.vma, rq, flags);
1514 if (err)
1515 pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1516 } else {
1517 pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1518 }
1519
1520 i915_vma_unlock(arg.vma);
1521
1522 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1523 i915_vma_unpin_fence(arg.vma);
1524 i915_vma_unpin(arg.vma);
1525
1526 i915_request_get(rq);
1527 i915_request_add(rq);
1528 if (err)
1529 goto out_rq;
1530
1531 if (!wait_until_running(&h, rq)) {
1532 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1533
1534 pr_err("%s: Failed to start request %llx, at %x\n",
1535 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1536 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1537
1538 intel_gt_set_wedged(gt);
1539 goto out_reset;
1540 }
1541
1542 init_completion(&arg.completion);
1543
1544 tsk = kthread_run(fn, &arg, "igt/evict_vma");
1545 if (IS_ERR(tsk)) {
1546 err = PTR_ERR(tsk);
1547 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1548 tsk = NULL;
1549 goto out_reset;
1550 }
1551 get_task_struct(tsk);
1552
1553 wait_for_completion(&arg.completion);
1554
1555 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1556 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1557
1558 pr_err("igt/evict_vma kthread did not wait\n");
1559 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1560
1561 intel_gt_set_wedged(gt);
1562 goto out_reset;
1563 }
1564
1565 out_reset:
1566 igt_global_reset_lock(gt);
1567 fake_hangcheck(gt, rq->engine->mask);
1568 igt_global_reset_unlock(gt);
1569
1570 if (tsk) {
1571 struct intel_wedge_me w;
1572
1573
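/* The reset, even indirectly, should take less than 10ms. */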
1574 intel_wedge_on_timeout(&w, gt, HZ / 10 /* safety net on hang detection */)
1575 err = kthread_stop(tsk);
1576
1577 put_task_struct(tsk);
1578 }
1579
1580 out_rq:
1581 i915_request_put(rq);
1582 out_obj:
1583 i915_gem_object_put(obj);
1584 fini:
1585 hang_fini(&h);
1586 if (intel_gt_is_wedged(gt))
1587 return -EIO;
1588
1589 return err;
1590 }
1591
1592 static int igt_reset_evict_ggtt(void *arg)
1593 {
1594 struct intel_gt *gt = arg;
1595
1596 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1597 evict_vma, EXEC_OBJECT_WRITE);
1598 }
1599
1600 static int igt_reset_evict_ppgtt(void *arg)
1601 {
1602 struct intel_gt *gt = arg;
1603 struct i915_ppgtt *ppgtt;
1604 int err;
1605
1606
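/* Aliasing ppgtt reuses the global GTT, which is already covered by igt_reset_evict_ggtt */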
1607 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1608 return 0;
1609
1610 ppgtt = i915_ppgtt_create(gt, 0);
1611 if (IS_ERR(ppgtt))
1612 return PTR_ERR(ppgtt);
1613
1614 err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1615 evict_vma, EXEC_OBJECT_WRITE);
1616 i915_vm_put(&ppgtt->vm);
1617
1618 return err;
1619 }
1620
1621 static int igt_reset_evict_fence(void *arg)
1622 {
1623 struct intel_gt *gt = arg;
1624
1625 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1626 evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1627 }
1628
1629 static int wait_for_others(struct intel_gt *gt,
1630 struct intel_engine_cs *exclude)
1631 {
1632 struct intel_engine_cs *engine;
1633 enum intel_engine_id id;
1634
1635 for_each_engine(engine, gt, id) {
1636 if (engine == exclude)
1637 continue;
1638
1639 if (!wait_for_idle(engine))
1640 return -EIO;
1641 }
1642
1643 return 0;
1644 }
1645
1646 static int igt_reset_queue(void *arg)
1647 {
1648 struct intel_gt *gt = arg;
1649 struct i915_gpu_error *global = &gt->i915->gpu_error;
1650 struct intel_engine_cs *engine;
1651 enum intel_engine_id id;
1652 struct hang h;
1653 int err;
1654
1655
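/* Check that pending requests queued behind a hang survive the reset */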
1656
1657 igt_global_reset_lock(gt);
1658
1659 err = hang_init(&h, gt);
1660 if (err)
1661 goto unlock;
1662
1663 for_each_engine(engine, gt, id) {
1664 struct intel_selftest_saved_policy saved;
1665 struct i915_request *prev;
1666 IGT_TIMEOUT(end_time);
1667 unsigned int count;
1668 bool using_guc = intel_engine_uses_guc(engine);
1669
1670 if (!intel_engine_can_store_dword(engine))
1671 continue;
1672
1673 if (using_guc) {
1674 err = intel_selftest_modify_policy(engine, &saved,
1675 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1676 if (err) {
1677 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1678 goto fini;
1679 }
1680 }
1681
1682 prev = hang_create_request(&h, engine);
1683 if (IS_ERR(prev)) {
1684 err = PTR_ERR(prev);
1685 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1686 goto restore;
1687 }
1688
1689 i915_request_get(prev);
1690 i915_request_add(prev);
1691
1692 count = 0;
1693 do {
1694 struct i915_request *rq;
1695 unsigned int reset_count;
1696
1697 rq = hang_create_request(&h, engine);
1698 if (IS_ERR(rq)) {
1699 err = PTR_ERR(rq);
1700 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1701 goto restore;
1702 }
1703
1704 i915_request_get(rq);
1705 i915_request_add(rq);
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
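/*
 * Triggering device resets in quick succession while the
 * kernel context may still be executing on other engines can
 * miss a breadcrumb, so idle the other (inactive) engines
 * between resets to keep this selftest deterministic.
 */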
1717 err = wait_for_others(gt, engine);
1718 if (err) {
1719 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1720 __func__, engine->name);
1721 i915_request_put(rq);
1722 i915_request_put(prev);
1723
1724 GEM_TRACE_DUMP();
1725 intel_gt_set_wedged(gt);
1726 goto restore;
1727 }
1728
1729 if (!wait_until_running(&h, prev)) {
1730 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1731
1732 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1733 __func__, engine->name,
1734 prev->fence.seqno, hws_seqno(&h, prev));
1735 intel_engine_dump(engine, &p,
1736 "%s\n", engine->name);
1737
1738 i915_request_put(rq);
1739 i915_request_put(prev);
1740
1741 intel_gt_set_wedged(gt);
1742
1743 err = -EIO;
1744 goto restore;
1745 }
1746
1747 reset_count = fake_hangcheck(gt, BIT(id));
1748
1749 if (prev->fence.error != -EIO) {
1750 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1751 prev->fence.error);
1752 i915_request_put(rq);
1753 i915_request_put(prev);
1754 err = -EINVAL;
1755 goto restore;
1756 }
1757
1758 if (rq->fence.error) {
1759 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1760 rq->fence.error);
1761 i915_request_put(rq);
1762 i915_request_put(prev);
1763 err = -EINVAL;
1764 goto restore;
1765 }
1766
1767 if (i915_reset_count(global) == reset_count) {
1768 pr_err("No GPU reset recorded!\n");
1769 i915_request_put(rq);
1770 i915_request_put(prev);
1771 err = -EINVAL;
1772 goto restore;
1773 }
1774
1775 i915_request_put(prev);
1776 prev = rq;
1777 count++;
1778 } while (time_before(jiffies, end_time));
1779 pr_info("%s: Completed %d queued resets\n",
1780 engine->name, count);
1781
1782 *h.batch = MI_BATCH_BUFFER_END;
1783 intel_gt_chipset_flush(engine->gt);
1784
1785 i915_request_put(prev);
1786
1787 restore:
1788 if (using_guc) {
1789 int err2 = intel_selftest_restore_policy(engine, &saved);
1790
1791 if (err2)
1792 pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1793 __func__, __LINE__, engine->name, err2);
1794 if (err == 0)
1795 err = err2;
1796 }
1797 if (err)
1798 goto fini;
1799
1800 err = igt_flush_test(gt->i915);
1801 if (err) {
1802 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1803 break;
1804 }
1805 }
1806
1807 fini:
1808 hang_fini(&h);
1809 unlock:
1810 igt_global_reset_unlock(gt);
1811
1812 if (intel_gt_is_wedged(gt))
1813 return -EIO;
1814
1815 return err;
1816 }
1817
1818 static int igt_handle_error(void *arg)
1819 {
1820 struct intel_gt *gt = arg;
1821 struct i915_gpu_error *global = &gt->i915->gpu_error;
1822 struct intel_engine_cs *engine = gt->engine[RCS0];
1823 struct hang h;
1824 struct i915_request *rq;
1825 struct i915_gpu_coredump *error;
1826 int err;
1827
1828
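/* Check that intel_gt_handle_error() resets the engine and flags the guilty request with -EIO */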
1829
1830 if (!intel_has_reset_engine(gt))
1831 return 0;
1832
1833 if (!engine || !intel_engine_can_store_dword(engine))
1834 return 0;
1835
1836 err = hang_init(&h, gt);
1837 if (err) {
1838 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1839 return err;
1840 }
1841
1842 rq = hang_create_request(&h, engine);
1843 if (IS_ERR(rq)) {
1844 err = PTR_ERR(rq);
1845 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1846 goto err_fini;
1847 }
1848
1849 i915_request_get(rq);
1850 i915_request_add(rq);
1851
1852 if (!wait_until_running(&h, rq)) {
1853 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1854
1855 pr_err("%s: Failed to start request %llx, at %x\n",
1856 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1857 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1858
1859 intel_gt_set_wedged(gt);
1860
1861 err = -EIO;
1862 goto err_request;
1863 }
1864
1865
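/* Temporarily disable error capture while handling the hang */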
1866 error = xchg(&global->first_error, (void *)-1);
1867
1868 intel_gt_handle_error(gt, engine->mask, 0, NULL);
1869
1870 xchg(&global->first_error, error);
1871
1872 if (rq->fence.error != -EIO) {
1873 pr_err("Guilty request not identified!\n");
1874 err = -EINVAL;
1875 goto err_request;
1876 }
1877
1878 err_request:
1879 i915_request_put(rq);
1880 err_fini:
1881 hang_fini(&h);
1882 return err;
1883 }
1884
1885 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1886 const struct igt_atomic_section *p,
1887 const char *mode)
1888 {
1889 struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1890 int err;
1891
1892 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1893 engine->name, mode, p->name);
1894
1895 if (t->func)
1896 tasklet_disable(t);
1897 if (strcmp(p->name, "softirq"))
1898 local_bh_disable();
1899 p->critical_section_begin();
1900
1901 err = __intel_engine_reset_bh(engine, NULL);
1902
1903 p->critical_section_end();
1904 if (strcmp(p->name, "softirq"))
1905 local_bh_enable();
1906 if (t->func) {
1907 tasklet_enable(t);
1908 tasklet_hi_schedule(t);
1909 }
1910
1911 if (err)
1912 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1913 engine->name, mode, p->name);
1914
1915 return err;
1916 }
1917
1918 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1919 const struct igt_atomic_section *p)
1920 {
1921 struct i915_request *rq;
1922 struct hang h;
1923 int err;
1924
1925 err = __igt_atomic_reset_engine(engine, p, "idle");
1926 if (err)
1927 return err;
1928
1929 err = hang_init(&h, engine->gt);
1930 if (err) {
1931 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1932 return err;
1933 }
1934
1935 rq = hang_create_request(&h, engine);
1936 if (IS_ERR(rq)) {
1937 err = PTR_ERR(rq);
1938 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1939 goto out;
1940 }
1941
1942 i915_request_get(rq);
1943 i915_request_add(rq);
1944
1945 if (wait_until_running(&h, rq)) {
1946 err = __igt_atomic_reset_engine(engine, p, "active");
1947 } else {
1948 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1949 __func__, engine->name,
1950 rq->fence.seqno, hws_seqno(&h, rq));
1951 intel_gt_set_wedged(engine->gt);
1952 err = -EIO;
1953 }
1954
1955 if (err == 0) {
1956 struct intel_wedge_me w;
1957
1958 intel_wedge_on_timeout(&w, engine->gt, HZ / 20)
1959 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1960 if (intel_gt_is_wedged(engine->gt))
1961 err = -EIO;
1962 }
1963
1964 i915_request_put(rq);
1965 out:
1966 hang_fini(&h);
1967 return err;
1968 }
1969
1970 static int igt_reset_engines_atomic(void *arg)
1971 {
1972 struct intel_gt *gt = arg;
1973 const typeof(*igt_atomic_phases) *p;
1974 int err = 0;
1975
1976
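/* Check that engine resets can be performed from atomic (non-sleeping) context */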
1977
1978 if (!intel_has_reset_engine(gt))
1979 return 0;
1980
1981 if (intel_uc_uses_guc_submission(&gt->uc))
1982 return 0;
1983
1984 igt_global_reset_lock(gt);
1985
1986
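/* Start from a clean slate: force a reset before poking around */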
1987 if (!igt_force_reset(gt))
1988 goto unlock;
1989
1990 for (p = igt_atomic_phases; p->name; p++) {
1991 struct intel_engine_cs *engine;
1992 enum intel_engine_id id;
1993
1994 for_each_engine(engine, gt, id) {
1995 err = igt_atomic_reset_engine(engine, p);
1996 if (err)
1997 goto out;
1998 }
1999 }
2000
2001 out:
2002
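/* As we have poked around the reset machinery, force a full reset before continuing */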
2003 igt_force_reset(gt);
2004 unlock:
2005 igt_global_reset_unlock(gt);
2006
2007 return err;
2008 }
2009
2010 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2011 {
2012 static const struct i915_subtest tests[] = {
2013 SUBTEST(igt_hang_sanitycheck),
2014 SUBTEST(igt_reset_nop),
2015 SUBTEST(igt_reset_nop_engine),
2016 SUBTEST(igt_reset_idle_engine),
2017 SUBTEST(igt_reset_active_engine),
2018 SUBTEST(igt_reset_fail_engine),
2019 SUBTEST(igt_reset_engines),
2020 SUBTEST(igt_reset_engines_atomic),
2021 SUBTEST(igt_reset_queue),
2022 SUBTEST(igt_reset_wait),
2023 SUBTEST(igt_reset_evict_ggtt),
2024 SUBTEST(igt_reset_evict_ppgtt),
2025 SUBTEST(igt_reset_evict_fence),
2026 SUBTEST(igt_handle_error),
2027 };
2028 struct intel_gt *gt = to_gt(i915);
2029 intel_wakeref_t wakeref;
2030 int err;
2031
2032 if (!intel_has_gpu_reset(gt))
2033 return 0;
2034
2035 if (intel_gt_is_wedged(gt))
2036 return -EIO;
2037
2038 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2039
2040 err = intel_gt_live_subtests(tests, gt);
2041
2042 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2043
2044 return err;
2045 }