// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_selftest.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"

#include "gem/selftests/mock_context.h"
#include "selftests/igt_flush_test.h"
#include "selftests/mock_drm.h"

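/*
 * Submit @rq at barrier priority and synchronously wait for it, retiring it
 * on success. i915_request_add() is open-coded here so that the timeline
 * mutex taken when the request was created stays held across the wait and
 * is only released at the end.
 */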
static int request_sync(struct i915_request *rq)
{
    struct intel_timeline *tl = i915_request_timeline(rq);
    long timeout;
    int err = 0;

    intel_timeline_get(tl);
    i915_request_get(rq);

    /* Opencode i915_request_add() so we can keep the timeline locked. */
    __i915_request_commit(rq);
    rq->sched.attr.priority = I915_PRIORITY_BARRIER;
    __i915_request_queue_bh(rq);

    timeout = i915_request_wait(rq, 0, HZ / 10);
    if (timeout < 0)
        err = timeout;
    else
        i915_request_retire_upto(rq);

    lockdep_unpin_lock(&tl->mutex, rq->cookie);
    mutex_unlock(&tl->mutex);

    i915_request_put(rq);
    intel_timeline_put(tl);

    return err;
}

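/*
 * Drain the context's timeline: wait on the most recent request and retire
 * everything up to it, repeating until no requests remain.
 */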
static int context_sync(struct intel_context *ce)
{
    struct intel_timeline *tl = ce->timeline;
    int err = 0;

    mutex_lock(&tl->mutex);
    do {
        struct i915_request *rq;
        long timeout;

        if (list_empty(&tl->requests))
            break;

        rq = list_last_entry(&tl->requests, typeof(*rq), link);
        i915_request_get(rq);

        timeout = i915_request_wait(rq, 0, HZ / 10);
        if (timeout < 0)
            err = timeout;
        else
            i915_request_retire_upto(rq);

        i915_request_put(rq);
    } while (!err);
    mutex_unlock(&tl->mutex);

    /* Wait for all barriers to complete (remote CPU) before we check */
    i915_active_unlock_wait(&ce->active);
    return err;
}

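/*
 * live_context_size() inflates engine->context_size by one page before
 * calling us, so the last page of the state object lies just beyond the
 * real context image. Poison that page, run a request on the context plus
 * a kernel request to force a context save, and check the poison survives.
 */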
static int __live_context_size(struct intel_engine_cs *engine)
{
    struct intel_context *ce;
    struct i915_request *rq;
    void *vaddr;
    int err;

    ce = intel_context_create(engine);
    if (IS_ERR(ce))
        return PTR_ERR(ce);

    err = intel_context_pin(ce);
    if (err)
        goto err;

    vaddr = i915_gem_object_pin_map_unlocked(ce->state->obj,
                         i915_coherent_map_type(engine->i915,
                                    ce->state->obj, false));
    if (IS_ERR(vaddr)) {
        err = PTR_ERR(vaddr);
        intel_context_unpin(ce);
        goto err;
    }

    /*
     * Note that execlists also applies a redzone which it checks on
     * context unpin when debugging. We are using the same location
     * and same poison value so that our checks overlap. Despite the
     * redundancy, we want to keep this little selftest so that we
     * get coverage of any and all submission backends, and we can
     * always extend this test to ensure we trick the HW into a
     * compromising position wrt the various sections that need
     * to be written into the context state.
     *
     * TLDR; this overlaps with the execlists redzone.
     */
    vaddr += engine->context_size - I915_GTT_PAGE_SIZE;
    memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);

    rq = intel_context_create_request(ce);
    intel_context_unpin(ce);
    if (IS_ERR(rq)) {
        err = PTR_ERR(rq);
        goto err_unpin;
    }

    err = request_sync(rq);
    if (err)
        goto err_unpin;

    /* Force the context switch */
    rq = intel_engine_create_kernel_request(engine);
    if (IS_ERR(rq)) {
        err = PTR_ERR(rq);
        goto err_unpin;
    }
    err = request_sync(rq);
    if (err)
        goto err_unpin;

    if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) {
        pr_err("%s context overwrote trailing red-zone!", engine->name);
        err = -EINVAL;
    }

err_unpin:
    i915_gem_object_unpin_map(ce->state->obj);
err:
    intel_context_put(ce);
    return err;
}

static int live_context_size(void *arg)
{
    struct intel_gt *gt = arg;
    struct intel_engine_cs *engine;
    enum intel_engine_id id;
    int err = 0;

    /*
     * Check that our context sizes are correct by seeing if the
     * HW tries to write past the end of one.
     */

    for_each_engine(engine, gt, id) {
        struct file *saved;

        if (!engine->context_size)
            continue;

        intel_engine_pm_get(engine);

        /*
         * Hide the old default state -- we lie about the context size
         * and get confused when the default state is smaller than
         * expected. For our do-nothing request, inheriting the
         * active state is sufficient; we are only checking that we
         * don't use more than we planned.
         */
        saved = fetch_and_zero(&engine->default_state);

        /* Overlaps with the execlists redzone */
        engine->context_size += I915_GTT_PAGE_SIZE;

        err = __live_context_size(engine);

        engine->context_size -= I915_GTT_PAGE_SIZE;

        engine->default_state = saved;

        intel_engine_pm_put(engine);

        if (err)
            break;
    }

    return err;
}

static int __live_active_context(struct intel_engine_cs *engine)
{
    unsigned long saved_heartbeat;
    struct intel_context *ce;
    int pass;
    int err;

    /*
     * We keep active contexts alive until after a subsequent context
     * switch as the final write from the context-save will be after
     * we retire the final request. We track when we unpin the context,
     * under the presumption that the final pin is from the last request,
     * and instead of immediately unpinning the context, we add a task
     * to unpin the context from the next idle-barrier.
     *
     * This test makes sure that the context is kept alive until a
     * subsequent idle-barrier (emitted when the engine wakeref hits 0
     * with no more outstanding requests).
     *
     * In GuC submission mode we don't use idle barriers and we instead
     * get a message from the GuC to signal that it is safe to unpin the
     * context from memory.
     */
    if (intel_engine_uses_guc(engine))
        return 0;

    if (intel_engine_pm_is_awake(engine)) {
        pr_err("%s is awake before starting %s!\n",
               engine->name, __func__);
        return -EINVAL;
    }

    ce = intel_context_create(engine);
    if (IS_ERR(ce))
        return PTR_ERR(ce);

    saved_heartbeat = engine->props.heartbeat_interval_ms;
    engine->props.heartbeat_interval_ms = 0;

    for (pass = 0; pass <= 2; pass++) {
        struct i915_request *rq;

        intel_engine_pm_get(engine);

        rq = intel_context_create_request(ce);
        if (IS_ERR(rq)) {
            err = PTR_ERR(rq);
            goto out_engine;
        }

        err = request_sync(rq);
        if (err)
            goto out_engine;

        /* Context will be kept active until after an idle-barrier. */
        if (i915_active_is_idle(&ce->active)) {
            pr_err("context is not active; expected idle-barrier (%s pass %d)\n",
                   engine->name, pass);
            err = -EINVAL;
            goto out_engine;
        }

        if (!intel_engine_pm_is_awake(engine)) {
            pr_err("%s is asleep before idle-barrier\n",
                   engine->name);
            err = -EINVAL;
            goto out_engine;
        }

out_engine:
        intel_engine_pm_put(engine);
        if (err)
            goto err;
    }

    /* Now make sure our idle-barriers are flushed */
    err = intel_engine_flush_barriers(engine);
    if (err)
        goto err;

    /* Wait for the barrier and in the process wait for engine to park */
    err = context_sync(engine->kernel_context);
    if (err)
        goto err;

    if (!i915_active_is_idle(&ce->active)) {
        pr_err("context is still active!");
        err = -EINVAL;
    }

    intel_engine_pm_flush(engine);

    if (intel_engine_pm_is_awake(engine)) {
        struct drm_printer p = drm_debug_printer(__func__);

        intel_engine_dump(engine, &p,
                  "%s is still awake:%d after idle-barriers\n",
                  engine->name,
                  atomic_read(&engine->wakeref.count));
        GEM_TRACE_DUMP();

        err = -EINVAL;
        goto err;
    }

err:
    engine->props.heartbeat_interval_ms = saved_heartbeat;
    intel_context_put(ce);
    return err;
}

static int live_active_context(void *arg)
{
    struct intel_gt *gt = arg;
    struct intel_engine_cs *engine;
    enum intel_engine_id id;
    int err = 0;

    for_each_engine(engine, gt, id) {
        err = __live_active_context(engine);
        if (err)
            break;

        err = igt_flush_test(gt->i915);
        if (err)
            break;
    }

    return err;
}

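/*
 * Submit a request on @ce that operates on @remote's context image (via
 * intel_context_prepare_remote_request()), then wait for it to complete.
 */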
static int __remote_sync(struct intel_context *ce, struct intel_context *remote)
{
    struct i915_request *rq;
    int err;

    err = intel_context_pin(remote);
    if (err)
        return err;

    rq = intel_context_create_request(ce);
    if (IS_ERR(rq)) {
        err = PTR_ERR(rq);
        goto unpin;
    }

    err = intel_context_prepare_remote_request(remote, rq);
    if (err) {
        i915_request_add(rq);
        goto unpin;
    }

    err = request_sync(rq);

unpin:
    intel_context_unpin(remote);
    return err;
}

static int __live_remote_context(struct intel_engine_cs *engine)
{
    struct intel_context *local, *remote;
    unsigned long saved_heartbeat;
    int pass;
    int err;

    /*
     * Check that our idle barriers do not interfere with normal
     * activity tracking. In particular, check that operating
     * on the context image remotely (intel_context_prepare_remote_request),
     * which inserts foreign fences into intel_context.active, does not
     * clobber the idle-barrier.
     *
     * In GuC submission mode we don't use idle barriers.
     */
    if (intel_engine_uses_guc(engine))
        return 0;

    if (intel_engine_pm_is_awake(engine)) {
        pr_err("%s is awake before starting %s!\n",
               engine->name, __func__);
        return -EINVAL;
    }

    remote = intel_context_create(engine);
    if (IS_ERR(remote))
        return PTR_ERR(remote);

    local = intel_context_create(engine);
    if (IS_ERR(local)) {
        err = PTR_ERR(local);
        goto err_remote;
    }

    saved_heartbeat = engine->props.heartbeat_interval_ms;
    engine->props.heartbeat_interval_ms = 0;
    intel_engine_pm_get(engine);

    for (pass = 0; pass <= 2; pass++) {
        err = __remote_sync(local, remote);
        if (err)
            break;

        err = __remote_sync(engine->kernel_context, remote);
        if (err)
            break;

        if (i915_active_is_idle(&remote->active)) {
            pr_err("remote context is not active; expected idle-barrier (%s pass %d)\n",
                   engine->name, pass);
            err = -EINVAL;
            break;
        }
    }

    intel_engine_pm_put(engine);
    engine->props.heartbeat_interval_ms = saved_heartbeat;

    intel_context_put(local);
err_remote:
    intel_context_put(remote);
    return err;
}

static int live_remote_context(void *arg)
{
    struct intel_gt *gt = arg;
    struct intel_engine_cs *engine;
    enum intel_engine_id id;
    int err = 0;

    for_each_engine(engine, gt, id) {
        err = __live_remote_context(engine);
        if (err)
            break;

        err = igt_flush_test(gt->i915);
        if (err)
            break;
    }

    return err;
}

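/* Entry point for the live selftests; skipped if the GT is already wedged. */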
int intel_context_live_selftests(struct drm_i915_private *i915)
{
    static const struct i915_subtest tests[] = {
        SUBTEST(live_context_size),
        SUBTEST(live_active_context),
        SUBTEST(live_remote_context),
    };
    struct intel_gt *gt = to_gt(i915);

    if (intel_gt_is_wedged(gt))
        return 0;

    return intel_gt_live_subtests(tests, gt);
}