0001 /*
0002  * Copyright © 2016 Intel Corporation
0003  *
0004  * Permission is hereby granted, free of charge, to any person obtaining a
0005  * copy of this software and associated documentation files (the "Software"),
0006  * to deal in the Software without restriction, including without limitation
0007  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0008  * and/or sell copies of the Software, and to permit persons to whom the
0009  * Software is furnished to do so, subject to the following conditions:
0010  *
0011  * The above copyright notice and this permission notice (including the next
0012  * paragraph) shall be included in all copies or substantial portions of the
0013  * Software.
0014  *
0015  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0016  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0017  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0018  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
0019  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
0020  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
0021  * IN THE SOFTWARE.
0022  *
0023  */
0024 
0025 #include <linux/prime_numbers.h>
0026 #include <linux/pm_qos.h>
0027 #include <linux/sort.h>
0028 
0029 #include "gem/i915_gem_internal.h"
0030 #include "gem/i915_gem_pm.h"
0031 #include "gem/selftests/mock_context.h"
0032 
0033 #include "gt/intel_engine_heartbeat.h"
0034 #include "gt/intel_engine_pm.h"
0035 #include "gt/intel_engine_user.h"
0036 #include "gt/intel_gt.h"
0037 #include "gt/intel_gt_clock_utils.h"
0038 #include "gt/intel_gt_requests.h"
0039 #include "gt/selftest_engine_heartbeat.h"
0040 
0041 #include "i915_random.h"
0042 #include "i915_selftest.h"
0043 #include "igt_flush_test.h"
0044 #include "igt_live_test.h"
0045 #include "igt_spinner.h"
0046 #include "lib_sw_fence.h"
0047 
0048 #include "mock_drm.h"
0049 #include "mock_gem_device.h"
0050 
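     /* Count the engines exposed to userspace via the uabi engine list. */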
0051 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
0052 {
0053     struct intel_engine_cs *engine;
0054     unsigned int count;
0055 
0056     count = 0;
0057     for_each_uabi_engine(engine, i915)
0058         count++;
0059 
0060     return count;
0061 }
0062 
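     /* Shorthand for the first render engine as seen by userspace. */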
0063 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
0064 {
0065     return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
0066 }
0067 
0068 static int igt_add_request(void *arg)
0069 {
0070     struct drm_i915_private *i915 = arg;
0071     struct i915_request *request;
0072 
0073     /* Basic preliminary test to create a request and let it loose! */
0074 
0075     request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
0076     if (!request)
0077         return -ENOMEM;
0078 
0079     i915_request_add(request);
0080 
0081     return 0;
0082 }
0083 
0084 static int igt_wait_request(void *arg)
0085 {
0086     const long T = HZ / 4;
0087     struct drm_i915_private *i915 = arg;
0088     struct i915_request *request;
0089     int err = -EINVAL;
0090 
0091     /* Submit a request, then wait upon it */
0092 
0093     request = mock_request(rcs0(i915)->kernel_context, T);
0094     if (!request)
0095         return -ENOMEM;
0096 
0097     i915_request_get(request);
0098 
0099     if (i915_request_wait(request, 0, 0) != -ETIME) {
0100         pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
0101         goto out_request;
0102     }
0103 
0104     if (i915_request_wait(request, 0, T) != -ETIME) {
0105         pr_err("request wait succeeded (expected timeout before submit!)\n");
0106         goto out_request;
0107     }
0108 
0109     if (i915_request_completed(request)) {
0110         pr_err("request completed before submit!!\n");
0111         goto out_request;
0112     }
0113 
0114     i915_request_add(request);
0115 
0116     if (i915_request_wait(request, 0, 0) != -ETIME) {
0117         pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
0118         goto out_request;
0119     }
0120 
0121     if (i915_request_completed(request)) {
0122         pr_err("request completed immediately!\n");
0123         goto out_request;
0124     }
0125 
0126     if (i915_request_wait(request, 0, T / 2) != -ETIME) {
0127         pr_err("request wait succeeded (expected timeout!)\n");
0128         goto out_request;
0129     }
0130 
0131     if (i915_request_wait(request, 0, T) == -ETIME) {
0132         pr_err("request wait timed out!\n");
0133         goto out_request;
0134     }
0135 
0136     if (!i915_request_completed(request)) {
0137         pr_err("request not complete after waiting!\n");
0138         goto out_request;
0139     }
0140 
0141     if (i915_request_wait(request, 0, T) == -ETIME) {
0142         pr_err("request wait timed out when already complete!\n");
0143         goto out_request;
0144     }
0145 
0146     err = 0;
0147 out_request:
0148     i915_request_put(request);
0149     mock_device_flush(i915);
0150     return err;
0151 }
0152 
0153 static int igt_fence_wait(void *arg)
0154 {
0155     const long T = HZ / 4;
0156     struct drm_i915_private *i915 = arg;
0157     struct i915_request *request;
0158     int err = -EINVAL;
0159 
0160     /* Submit a request, treat it as a fence and wait upon it */
0161 
0162     request = mock_request(rcs0(i915)->kernel_context, T);
0163     if (!request)
0164         return -ENOMEM;
0165 
0166     if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
0167         pr_err("fence wait success before submit (expected timeout)!\n");
0168         goto out;
0169     }
0170 
0171     i915_request_add(request);
0172 
0173     if (dma_fence_is_signaled(&request->fence)) {
0174         pr_err("fence signaled immediately!\n");
0175         goto out;
0176     }
0177 
0178     if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
0179         pr_err("fence wait success after submit (expected timeout)!\n");
0180         goto out;
0181     }
0182 
0183     if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
0184         pr_err("fence wait timed out (expected success)!\n");
0185         goto out;
0186     }
0187 
0188     if (!dma_fence_is_signaled(&request->fence)) {
0189         pr_err("fence unsignaled after waiting!\n");
0190         goto out;
0191     }
0192 
0193     if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
0194         pr_err("fence wait timed out when complete (expected success)!\n");
0195         goto out;
0196     }
0197 
0198     err = 0;
0199 out:
0200     mock_device_flush(i915);
0201     return err;
0202 }
0203 
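     /*
      * Check that submission order can be rewound: a high priority "vip"
      * request is reordered ahead of an earlier, slower request and must
      * complete while the original request is still pending.
      */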
0204 static int igt_request_rewind(void *arg)
0205 {
0206     struct drm_i915_private *i915 = arg;
0207     struct i915_request *request, *vip;
0208     struct i915_gem_context *ctx[2];
0209     struct intel_context *ce;
0210     int err = -EINVAL;
0211 
0212     ctx[0] = mock_context(i915, "A");
0213     if (!ctx[0]) {
0214         err = -ENOMEM;
0215         goto err_ctx_0;
0216     }
0217 
0218     ce = i915_gem_context_get_engine(ctx[0], RCS0);
0219     GEM_BUG_ON(IS_ERR(ce));
0220     request = mock_request(ce, 2 * HZ);
0221     intel_context_put(ce);
0222     if (!request) {
0223         err = -ENOMEM;
0224         goto err_context_0;
0225     }
0226 
0227     i915_request_get(request);
0228     i915_request_add(request);
0229 
0230     ctx[1] = mock_context(i915, "B");
0231     if (!ctx[1]) {
0232         err = -ENOMEM;
0233         goto err_ctx_1;
0234     }
0235 
0236     ce = i915_gem_context_get_engine(ctx[1], RCS0);
0237     GEM_BUG_ON(IS_ERR(ce));
0238     vip = mock_request(ce, 0);
0239     intel_context_put(ce);
0240     if (!vip) {
0241         err = -ENOMEM;
0242         goto err_context_1;
0243     }
0244 
0245     /* Simulate preemption by manual reordering */
0246     if (!mock_cancel_request(request)) {
0247         pr_err("failed to cancel request (already executed)!\n");
0248         i915_request_add(vip);
0249         goto err_context_1;
0250     }
0251     i915_request_get(vip);
0252     i915_request_add(vip);
0253     rcu_read_lock();
0254     request->engine->submit_request(request);
0255     rcu_read_unlock();
0256 
0257 
0258     if (i915_request_wait(vip, 0, HZ) == -ETIME) {
0259         pr_err("timed out waiting for high priority request\n");
0260         goto err;
0261     }
0262 
0263     if (i915_request_completed(request)) {
0264         pr_err("low priority request already completed\n");
0265         goto err;
0266     }
0267 
0268     err = 0;
0269 err:
0270     i915_request_put(vip);
0271 err_context_1:
0272     mock_context_close(ctx[1]);
0273 err_ctx_1:
0274     i915_request_put(request);
0275 err_context_0:
0276     mock_context_close(ctx[0]);
0277 err_ctx_0:
0278     mock_device_flush(i915);
0279     return err;
0280 }
0281 
0282 struct smoketest {
0283     struct intel_engine_cs *engine;
0284     struct i915_gem_context **contexts;
0285     atomic_long_t num_waits, num_fences;
0286     int ncontexts, max_batch;
0287     struct i915_request *(*request_alloc)(struct intel_context *ce);
0288 };
0289 
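     /*
      * Request constructors used by the smoketests: one backed by the mock
      * device (no hardware), the other creating real requests on a live
      * context.
      */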
0290 static struct i915_request *
0291 __mock_request_alloc(struct intel_context *ce)
0292 {
0293     return mock_request(ce, 0);
0294 }
0295 
0296 static struct i915_request *
0297 __live_request_alloc(struct intel_context *ce)
0298 {
0299     return intel_context_create_request(ce);
0300 }
0301 
0302 static int __igt_breadcrumbs_smoketest(void *arg)
0303 {
0304     struct smoketest *t = arg;
0305     const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
0306     const unsigned int total = 4 * t->ncontexts + 1;
0307     unsigned int num_waits = 0, num_fences = 0;
0308     struct i915_request **requests;
0309     I915_RND_STATE(prng);
0310     unsigned int *order;
0311     int err = 0;
0312 
0313     /*
0314      * A very simple test to catch the most egregious of list handling bugs.
0315      *
0316      * At its heart, we simply create oodles of requests running across
0317      * multiple kthreads and enable signaling on them, for the sole purpose
0318      * of stressing our breadcrumb handling. The only inspection we do is
0319      * that the fences were marked as signaled.
0320      */
0321 
0322     requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
0323     if (!requests)
0324         return -ENOMEM;
0325 
0326     order = i915_random_order(total, &prng);
0327     if (!order) {
0328         err = -ENOMEM;
0329         goto out_requests;
0330     }
0331 
0332     while (!kthread_should_stop()) {
0333         struct i915_sw_fence *submit, *wait;
0334         unsigned int n, count;
0335 
0336         submit = heap_fence_create(GFP_KERNEL);
0337         if (!submit) {
0338             err = -ENOMEM;
0339             break;
0340         }
0341 
0342         wait = heap_fence_create(GFP_KERNEL);
0343         if (!wait) {
0344             i915_sw_fence_commit(submit);
0345             heap_fence_put(submit);
0346             err = -ENOMEM;
0347             break;
0348         }
0349 
0350         i915_random_reorder(order, total, &prng);
0351         count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
0352 
0353         for (n = 0; n < count; n++) {
0354             struct i915_gem_context *ctx =
0355                 t->contexts[order[n] % t->ncontexts];
0356             struct i915_request *rq;
0357             struct intel_context *ce;
0358 
0359             ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
0360             GEM_BUG_ON(IS_ERR(ce));
0361             rq = t->request_alloc(ce);
0362             intel_context_put(ce);
0363             if (IS_ERR(rq)) {
0364                 err = PTR_ERR(rq);
0365                 count = n;
0366                 break;
0367             }
0368 
0369             err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
0370                                    submit,
0371                                    GFP_KERNEL);
0372 
0373             requests[n] = i915_request_get(rq);
0374             i915_request_add(rq);
0375 
0376             if (err >= 0)
0377                 err = i915_sw_fence_await_dma_fence(wait,
0378                                     &rq->fence,
0379                                     0,
0380                                     GFP_KERNEL);
0381 
0382             if (err < 0) {
0383                 i915_request_put(rq);
0384                 count = n;
0385                 break;
0386             }
0387         }
0388 
0389         i915_sw_fence_commit(submit);
0390         i915_sw_fence_commit(wait);
0391 
0392         if (!wait_event_timeout(wait->wait,
0393                     i915_sw_fence_done(wait),
0394                     5 * HZ)) {
0395             struct i915_request *rq = requests[count - 1];
0396 
0397             pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
0398                    atomic_read(&wait->pending), count,
0399                    rq->fence.context, rq->fence.seqno,
0400                    t->engine->name);
0401             GEM_TRACE_DUMP();
0402 
0403             intel_gt_set_wedged(t->engine->gt);
0404             GEM_BUG_ON(!i915_request_completed(rq));
0405             i915_sw_fence_wait(wait);
0406             err = -EIO;
0407         }
0408 
0409         for (n = 0; n < count; n++) {
0410             struct i915_request *rq = requests[n];
0411 
0412             if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
0413                       &rq->fence.flags)) {
0414                 pr_err("%llu:%llu was not signaled!\n",
0415                        rq->fence.context, rq->fence.seqno);
0416                 err = -EINVAL;
0417             }
0418 
0419             i915_request_put(rq);
0420         }
0421 
0422         heap_fence_put(wait);
0423         heap_fence_put(submit);
0424 
0425         if (err < 0)
0426             break;
0427 
0428         num_fences += count;
0429         num_waits++;
0430 
0431         cond_resched();
0432     }
0433 
0434     atomic_long_add(num_fences, &t->num_fences);
0435     atomic_long_add(num_waits, &t->num_waits);
0436 
0437     kfree(order);
0438 out_requests:
0439     kfree(requests);
0440     return err;
0441 }
0442 
0443 static int mock_breadcrumbs_smoketest(void *arg)
0444 {
0445     struct drm_i915_private *i915 = arg;
0446     struct smoketest t = {
0447         .engine = rcs0(i915),
0448         .ncontexts = 1024,
0449         .max_batch = 1024,
0450         .request_alloc = __mock_request_alloc
0451     };
0452     unsigned int ncpus = num_online_cpus();
0453     struct task_struct **threads;
0454     unsigned int n;
0455     int ret = 0;
0456 
0457     /*
0458      * Smoketest our breadcrumb/signal handling for requests across multiple
0459      * threads. A very simple test to only catch the most egregious of bugs.
0460      * See __igt_breadcrumbs_smoketest();
0461      */
0462 
0463     threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
0464     if (!threads)
0465         return -ENOMEM;
0466 
0467     t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
0468     if (!t.contexts) {
0469         ret = -ENOMEM;
0470         goto out_threads;
0471     }
0472 
0473     for (n = 0; n < t.ncontexts; n++) {
0474         t.contexts[n] = mock_context(t.engine->i915, "mock");
0475         if (!t.contexts[n]) {
0476             ret = -ENOMEM;
0477             goto out_contexts;
0478         }
0479     }
0480 
0481     for (n = 0; n < ncpus; n++) {
0482         threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
0483                      &t, "igt/%d", n);
0484         if (IS_ERR(threads[n])) {
0485             ret = PTR_ERR(threads[n]);
0486             ncpus = n;
0487             break;
0488         }
0489 
0490         get_task_struct(threads[n]);
0491     }
0492 
0493     yield(); /* start all threads before we begin */
0494     msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
0495 
0496     for (n = 0; n < ncpus; n++) {
0497         int err;
0498 
0499         err = kthread_stop(threads[n]);
0500         if (err < 0 && !ret)
0501             ret = err;
0502 
0503         put_task_struct(threads[n]);
0504     }
0505     pr_info("Completed %lu waits for %lu fence across %d cpus\n",
0506         atomic_long_read(&t.num_waits),
0507         atomic_long_read(&t.num_fences),
0508         ncpus);
0509 
0510 out_contexts:
0511     for (n = 0; n < t.ncontexts; n++) {
0512         if (!t.contexts[n])
0513             break;
0514         mock_context_close(t.contexts[n]);
0515     }
0516     kfree(t.contexts);
0517 out_threads:
0518     kfree(threads);
0519     return ret;
0520 }
0521 
0522 int i915_request_mock_selftests(void)
0523 {
0524     static const struct i915_subtest tests[] = {
0525         SUBTEST(igt_add_request),
0526         SUBTEST(igt_wait_request),
0527         SUBTEST(igt_fence_wait),
0528         SUBTEST(igt_request_rewind),
0529         SUBTEST(mock_breadcrumbs_smoketest),
0530     };
0531     struct drm_i915_private *i915;
0532     intel_wakeref_t wakeref;
0533     int err = 0;
0534 
0535     i915 = mock_gem_device();
0536     if (!i915)
0537         return -ENOMEM;
0538 
0539     with_intel_runtime_pm(&i915->runtime_pm, wakeref)
0540         err = i915_subtests(tests, i915);
0541 
0542     mock_destroy_device(i915);
0543 
0544     return err;
0545 }
0546 
0547 static int live_nop_request(void *arg)
0548 {
0549     struct drm_i915_private *i915 = arg;
0550     struct intel_engine_cs *engine;
0551     struct igt_live_test t;
0552     int err = -ENODEV;
0553 
0554     /*
0555      * Submit various sized batches of empty requests, to each engine
0556      * (individually), and wait for the batch to complete. We can check
0557      * the overhead of submitting requests to the hardware.
0558      */
0559 
0560     for_each_uabi_engine(engine, i915) {
0561         unsigned long n, prime;
0562         IGT_TIMEOUT(end_time);
0563         ktime_t times[2] = {};
0564 
0565         err = igt_live_test_begin(&t, i915, __func__, engine->name);
0566         if (err)
0567             return err;
0568 
0569         intel_engine_pm_get(engine);
0570         for_each_prime_number_from(prime, 1, 8192) {
0571             struct i915_request *request = NULL;
0572 
0573             times[1] = ktime_get_raw();
0574 
0575             for (n = 0; n < prime; n++) {
0576                 i915_request_put(request);
0577                 request = i915_request_create(engine->kernel_context);
0578                 if (IS_ERR(request))
0579                     return PTR_ERR(request);
0580 
0581                 /*
0582                  * This space is left intentionally blank.
0583                  *
0584                  * We do not actually want to perform any
0585                  * action with this request, we just want
0586                  * to measure the latency in allocation
0587                  * and submission of our breadcrumbs -
0588                  * ensuring that the bare request is sufficient
0589                  * for the system to work (i.e. proper HEAD
0590                  * tracking of the rings, interrupt handling,
0591                  * etc). It also gives us the lowest bounds
0592                  * for latency.
0593                  */
0594 
0595                 i915_request_get(request);
0596                 i915_request_add(request);
0597             }
0598             i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
0599             i915_request_put(request);
0600 
0601             times[1] = ktime_sub(ktime_get_raw(), times[1]);
0602             if (prime == 1)
0603                 times[0] = times[1];
0604 
0605             if (__igt_timeout(end_time, NULL))
0606                 break;
0607         }
0608         intel_engine_pm_put(engine);
0609 
0610         err = igt_live_test_end(&t);
0611         if (err)
0612             return err;
0613 
0614         pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
0615             engine->name,
0616             ktime_to_ns(times[0]),
0617             prime, div64_u64(ktime_to_ns(times[1]), prime));
0618     }
0619 
0620     return err;
0621 }
0622 
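     /*
      * Cancel a spinner request before it has been submitted for execution
      * and check that it still retires promptly with fence error -EINTR.
      */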
0623 static int __cancel_inactive(struct intel_engine_cs *engine)
0624 {
0625     struct intel_context *ce;
0626     struct igt_spinner spin;
0627     struct i915_request *rq;
0628     int err = 0;
0629 
0630     if (igt_spinner_init(&spin, engine->gt))
0631         return -ENOMEM;
0632 
0633     ce = intel_context_create(engine);
0634     if (IS_ERR(ce)) {
0635         err = PTR_ERR(ce);
0636         goto out_spin;
0637     }
0638 
0639     rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
0640     if (IS_ERR(rq)) {
0641         err = PTR_ERR(rq);
0642         goto out_ce;
0643     }
0644 
0645     pr_debug("%s: Cancelling inactive request\n", engine->name);
0646     i915_request_cancel(rq, -EINTR);
0647     i915_request_get(rq);
0648     i915_request_add(rq);
0649 
0650     if (i915_request_wait(rq, 0, HZ / 5) < 0) {
0651         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0652 
0653         pr_err("%s: Failed to cancel inactive request\n", engine->name);
0654         intel_engine_dump(engine, &p, "%s\n", engine->name);
0655         err = -ETIME;
0656         goto out_rq;
0657     }
0658 
0659     if (rq->fence.error != -EINTR) {
0660         pr_err("%s: fence not cancelled (%u)\n",
0661                engine->name, rq->fence.error);
0662         err = -EINVAL;
0663     }
0664 
0665 out_rq:
0666     i915_request_put(rq);
0667 out_ce:
0668     intel_context_put(ce);
0669 out_spin:
0670     igt_spinner_fini(&spin);
0671     if (err)
0672         pr_err("%s: %s error %d\n", __func__, engine->name, err);
0673     return err;
0674 }
0675 
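     /*
      * Cancel a spinner request while it is actively running on the GPU and
      * check that it is terminated with fence error -EINTR.
      */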
0676 static int __cancel_active(struct intel_engine_cs *engine)
0677 {
0678     struct intel_context *ce;
0679     struct igt_spinner spin;
0680     struct i915_request *rq;
0681     int err = 0;
0682 
0683     if (igt_spinner_init(&spin, engine->gt))
0684         return -ENOMEM;
0685 
0686     ce = intel_context_create(engine);
0687     if (IS_ERR(ce)) {
0688         err = PTR_ERR(ce);
0689         goto out_spin;
0690     }
0691 
0692     rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
0693     if (IS_ERR(rq)) {
0694         err = PTR_ERR(rq);
0695         goto out_ce;
0696     }
0697 
0698     pr_debug("%s: Cancelling active request\n", engine->name);
0699     i915_request_get(rq);
0700     i915_request_add(rq);
0701     if (!igt_wait_for_spinner(&spin, rq)) {
0702         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0703 
0704         pr_err("Failed to start spinner on %s\n", engine->name);
0705         intel_engine_dump(engine, &p, "%s\n", engine->name);
0706         err = -ETIME;
0707         goto out_rq;
0708     }
0709     i915_request_cancel(rq, -EINTR);
0710 
0711     if (i915_request_wait(rq, 0, HZ / 5) < 0) {
0712         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0713 
0714         pr_err("%s: Failed to cancel active request\n", engine->name);
0715         intel_engine_dump(engine, &p, "%s\n", engine->name);
0716         err = -ETIME;
0717         goto out_rq;
0718     }
0719 
0720     if (rq->fence.error != -EINTR) {
0721         pr_err("%s: fence not cancelled (%u)\n",
0722                engine->name, rq->fence.error);
0723         err = -EINVAL;
0724     }
0725 
0726 out_rq:
0727     i915_request_put(rq);
0728 out_ce:
0729     intel_context_put(ce);
0730 out_spin:
0731     igt_spinner_fini(&spin);
0732     if (err)
0733         pr_err("%s: %s error %d\n", __func__, engine->name, err);
0734     return err;
0735 }
0736 
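     /*
      * Cancel a request that has already completed and check that the
      * cancellation is a no-op (the fence error remains 0).
      */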
0737 static int __cancel_completed(struct intel_engine_cs *engine)
0738 {
0739     struct intel_context *ce;
0740     struct igt_spinner spin;
0741     struct i915_request *rq;
0742     int err = 0;
0743 
0744     if (igt_spinner_init(&spin, engine->gt))
0745         return -ENOMEM;
0746 
0747     ce = intel_context_create(engine);
0748     if (IS_ERR(ce)) {
0749         err = PTR_ERR(ce);
0750         goto out_spin;
0751     }
0752 
0753     rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
0754     if (IS_ERR(rq)) {
0755         err = PTR_ERR(rq);
0756         goto out_ce;
0757     }
0758     igt_spinner_end(&spin);
0759     i915_request_get(rq);
0760     i915_request_add(rq);
0761 
0762     if (i915_request_wait(rq, 0, HZ / 5) < 0) {
0763         err = -ETIME;
0764         goto out_rq;
0765     }
0766 
0767     pr_debug("%s: Cancelling completed request\n", engine->name);
0768     i915_request_cancel(rq, -EINTR);
0769     if (rq->fence.error) {
0770         pr_err("%s: fence not cancelled (%u)\n",
0771                engine->name, rq->fence.error);
0772         err = -EINVAL;
0773     }
0774 
0775 out_rq:
0776     i915_request_put(rq);
0777 out_ce:
0778     intel_context_put(ce);
0779 out_spin:
0780     igt_spinner_fini(&spin);
0781     if (err)
0782         pr_err("%s: %s error %d\n", __func__, engine->name, err);
0783     return err;
0784 }
0785 
0786 /*
0787  * Test to prove a non-preemptible request can be cancelled and a subsequent
0788  * request on the same context can successfully complete after cancellation.
0789  *
0790  * Testing methodology is to create a non-preemptible request and submit it,
0791  * wait for spinner to start, create a NOP request and submit it, cancel the
0792  * spinner, wait for spinner to complete and verify it failed with an error,
0793  * finally wait for the NOP request to complete and verify it succeeded without
0794  * an error. The preemption timeout is also reduced / restored so the test runs
0795  * in a timely manner.
0796  */
0797 static int __cancel_reset(struct drm_i915_private *i915,
0798               struct intel_engine_cs *engine)
0799 {
0800     struct intel_context *ce;
0801     struct igt_spinner spin;
0802     struct i915_request *rq, *nop;
0803     unsigned long preempt_timeout_ms;
0804     int err = 0;
0805 
0806     if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
0807         !intel_has_reset_engine(engine->gt))
0808         return 0;
0809 
0810     preempt_timeout_ms = engine->props.preempt_timeout_ms;
0811     engine->props.preempt_timeout_ms = 100;
0812 
0813     if (igt_spinner_init(&spin, engine->gt))
0814         goto out_restore;
0815 
0816     ce = intel_context_create(engine);
0817     if (IS_ERR(ce)) {
0818         err = PTR_ERR(ce);
0819         goto out_spin;
0820     }
0821 
0822     rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
0823     if (IS_ERR(rq)) {
0824         err = PTR_ERR(rq);
0825         goto out_ce;
0826     }
0827 
0828     pr_debug("%s: Cancelling active non-preemptable request\n",
0829          engine->name);
0830     i915_request_get(rq);
0831     i915_request_add(rq);
0832     if (!igt_wait_for_spinner(&spin, rq)) {
0833         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0834 
0835         pr_err("Failed to start spinner on %s\n", engine->name);
0836         intel_engine_dump(engine, &p, "%s\n", engine->name);
0837         err = -ETIME;
0838         goto out_rq;
0839     }
0840 
0841     nop = intel_context_create_request(ce);
0842     if (IS_ERR(nop))
0843         goto out_rq;
0844     i915_request_get(nop);
0845     i915_request_add(nop);
0846 
0847     i915_request_cancel(rq, -EINTR);
0848 
0849     if (i915_request_wait(rq, 0, HZ) < 0) {
0850         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0851 
0852         pr_err("%s: Failed to cancel hung request\n", engine->name);
0853         intel_engine_dump(engine, &p, "%s\n", engine->name);
0854         err = -ETIME;
0855         goto out_nop;
0856     }
0857 
0858     if (rq->fence.error != -EINTR) {
0859         pr_err("%s: fence not cancelled (%u)\n",
0860                engine->name, rq->fence.error);
0861         err = -EINVAL;
0862         goto out_nop;
0863     }
0864 
0865     if (i915_request_wait(nop, 0, HZ) < 0) {
0866         struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
0867 
0868         pr_err("%s: Failed to complete nop request\n", engine->name);
0869         intel_engine_dump(engine, &p, "%s\n", engine->name);
0870         err = -ETIME;
0871         goto out_nop;
0872     }
0873 
0874     if (nop->fence.error != 0) {
0875         pr_err("%s: Nop request errored (%u)\n",
0876                engine->name, nop->fence.error);
0877         err = -EINVAL;
0878     }
0879 
0880 out_nop:
0881     i915_request_put(nop);
0882 out_rq:
0883     i915_request_put(rq);
0884 out_ce:
0885     intel_context_put(ce);
0886 out_spin:
0887     igt_spinner_fini(&spin);
0888 out_restore:
0889     engine->props.preempt_timeout_ms = preempt_timeout_ms;
0890     if (err)
0891         pr_err("%s: %s error %d\n", __func__, engine->name, err);
0892     return err;
0893 }
0894 
0895 static int live_cancel_request(void *arg)
0896 {
0897     struct drm_i915_private *i915 = arg;
0898     struct intel_engine_cs *engine;
0899 
0900     /*
0901      * Check cancellation of requests. We expect to be able to immediately
0902      * cancel active requests, even if they are currently on the GPU.
0903      */
0904 
0905     for_each_uabi_engine(engine, i915) {
0906         struct igt_live_test t;
0907         int err, err2;
0908 
0909         if (!intel_engine_has_preemption(engine))
0910             continue;
0911 
0912         err = igt_live_test_begin(&t, i915, __func__, engine->name);
0913         if (err)
0914             return err;
0915 
0916         err = __cancel_inactive(engine);
0917         if (err == 0)
0918             err = __cancel_active(engine);
0919         if (err == 0)
0920             err = __cancel_completed(engine);
0921 
0922         err2 = igt_live_test_end(&t);
0923         if (err)
0924             return err;
0925         if (err2)
0926             return err2;
0927 
0928         /* Expects reset so call outside of igt_live_test_* */
0929         err = __cancel_reset(i915, engine);
0930         if (err)
0931             return err;
0932 
0933         if (igt_flush_test(i915))
0934             return -EIO;
0935     }
0936 
0937     return 0;
0938 }
0939 
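     /*
      * Create a single-page batch containing only MI_BATCH_BUFFER_END,
      * pinned into the global GTT so the same vma can be executed on any
      * engine.
      */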
0940 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
0941 {
0942     struct drm_i915_gem_object *obj;
0943     struct i915_vma *vma;
0944     u32 *cmd;
0945     int err;
0946 
0947     obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
0948     if (IS_ERR(obj))
0949         return ERR_CAST(obj);
0950 
0951     cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
0952     if (IS_ERR(cmd)) {
0953         err = PTR_ERR(cmd);
0954         goto err;
0955     }
0956 
0957     *cmd = MI_BATCH_BUFFER_END;
0958 
0959     __i915_gem_object_flush_map(obj, 0, 64);
0960     i915_gem_object_unpin_map(obj);
0961 
0962     intel_gt_chipset_flush(to_gt(i915));
0963 
0964     vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL);
0965     if (IS_ERR(vma)) {
0966         err = PTR_ERR(vma);
0967         goto err;
0968     }
0969 
0970     err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
0971     if (err)
0972         goto err;
0973 
0974     /* Force the wait now to avoid including it in the benchmark */
0975     err = i915_vma_sync(vma);
0976     if (err)
0977         goto err_pin;
0978 
0979     return vma;
0980 
0981 err_pin:
0982     i915_vma_unpin(vma);
0983 err:
0984     i915_gem_object_put(obj);
0985     return ERR_PTR(err);
0986 }
0987 
0988 static struct i915_request *
0989 empty_request(struct intel_engine_cs *engine,
0990           struct i915_vma *batch)
0991 {
0992     struct i915_request *request;
0993     int err;
0994 
0995     request = i915_request_create(engine->kernel_context);
0996     if (IS_ERR(request))
0997         return request;
0998 
0999     err = engine->emit_bb_start(request,
1000                     batch->node.start,
1001                     batch->node.size,
1002                     I915_DISPATCH_SECURE);
1003     if (err)
1004         goto out_request;
1005 
1006     i915_request_get(request);
1007 out_request:
1008     i915_request_add(request);
1009     return err ? ERR_PTR(err) : request;
1010 }
1011 
1012 static int live_empty_request(void *arg)
1013 {
1014     struct drm_i915_private *i915 = arg;
1015     struct intel_engine_cs *engine;
1016     struct igt_live_test t;
1017     struct i915_vma *batch;
1018     int err = 0;
1019 
1020     /*
1021      * Submit various sized batches of empty requests, to each engine
1022      * (individually), and wait for the batch to complete. We can check
1023      * the overhead of submitting requests to the hardware.
1024      */
1025 
1026     batch = empty_batch(i915);
1027     if (IS_ERR(batch))
1028         return PTR_ERR(batch);
1029 
1030     for_each_uabi_engine(engine, i915) {
1031         IGT_TIMEOUT(end_time);
1032         struct i915_request *request;
1033         unsigned long n, prime;
1034         ktime_t times[2] = {};
1035 
1036         err = igt_live_test_begin(&t, i915, __func__, engine->name);
1037         if (err)
1038             goto out_batch;
1039 
1040         intel_engine_pm_get(engine);
1041 
1042         /* Warmup / preload */
1043         request = empty_request(engine, batch);
1044         if (IS_ERR(request)) {
1045             err = PTR_ERR(request);
1046             intel_engine_pm_put(engine);
1047             goto out_batch;
1048         }
1049         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1050 
1051         for_each_prime_number_from(prime, 1, 8192) {
1052             times[1] = ktime_get_raw();
1053 
1054             for (n = 0; n < prime; n++) {
1055                 i915_request_put(request);
1056                 request = empty_request(engine, batch);
1057                 if (IS_ERR(request)) {
1058                     err = PTR_ERR(request);
1059                     intel_engine_pm_put(engine);
1060                     goto out_batch;
1061                 }
1062             }
1063             i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1064 
1065             times[1] = ktime_sub(ktime_get_raw(), times[1]);
1066             if (prime == 1)
1067                 times[0] = times[1];
1068 
1069             if (__igt_timeout(end_time, NULL))
1070                 break;
1071         }
1072         i915_request_put(request);
1073         intel_engine_pm_put(engine);
1074 
1075         err = igt_live_test_end(&t);
1076         if (err)
1077             goto out_batch;
1078 
1079         pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1080             engine->name,
1081             ktime_to_ns(times[0]),
1082             prime, div64_u64(ktime_to_ns(times[1]), prime));
1083     }
1084 
1085 out_batch:
1086     i915_vma_unpin(batch);
1087     i915_vma_put(batch);
1088     return err;
1089 }
1090 
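     /*
      * Create a batch whose first instruction is a MI_BATCH_BUFFER_START
      * branching back to itself, so it spins indefinitely until
      * recursive_batch_resolve() rewrites that first dword with
      * MI_BATCH_BUFFER_END.
      */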
1091 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
1092 {
1093     struct drm_i915_gem_object *obj;
1094     const int ver = GRAPHICS_VER(i915);
1095     struct i915_vma *vma;
1096     u32 *cmd;
1097     int err;
1098 
1099     obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
1100     if (IS_ERR(obj))
1101         return ERR_CAST(obj);
1102 
1103     vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
1104     if (IS_ERR(vma)) {
1105         err = PTR_ERR(vma);
1106         goto err;
1107     }
1108 
1109     err = i915_vma_pin(vma, 0, 0, PIN_USER);
1110     if (err)
1111         goto err;
1112 
1113     cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1114     if (IS_ERR(cmd)) {
1115         err = PTR_ERR(cmd);
1116         goto err;
1117     }
1118 
1119     if (ver >= 8) {
1120         *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1121         *cmd++ = lower_32_bits(vma->node.start);
1122         *cmd++ = upper_32_bits(vma->node.start);
1123     } else if (ver >= 6) {
1124         *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1125         *cmd++ = lower_32_bits(vma->node.start);
1126     } else {
1127         *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1128         *cmd++ = lower_32_bits(vma->node.start);
1129     }
1130     *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1131 
1132     __i915_gem_object_flush_map(obj, 0, 64);
1133     i915_gem_object_unpin_map(obj);
1134 
1135     intel_gt_chipset_flush(to_gt(i915));
1136 
1137     return vma;
1138 
1139 err:
1140     i915_gem_object_put(obj);
1141     return ERR_PTR(err);
1142 }
1143 
1144 static int recursive_batch_resolve(struct i915_vma *batch)
1145 {
1146     u32 *cmd;
1147 
1148     cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1149     if (IS_ERR(cmd))
1150         return PTR_ERR(cmd);
1151 
1152     *cmd = MI_BATCH_BUFFER_END;
1153 
1154     __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1155     i915_gem_object_unpin_map(batch->obj);
1156 
1157     intel_gt_chipset_flush(batch->vm->gt);
1158 
1159     return 0;
1160 }
1161 
1162 static int live_all_engines(void *arg)
1163 {
1164     struct drm_i915_private *i915 = arg;
1165     const unsigned int nengines = num_uabi_engines(i915);
1166     struct intel_engine_cs *engine;
1167     struct i915_request **request;
1168     struct igt_live_test t;
1169     struct i915_vma *batch;
1170     unsigned int idx;
1171     int err;
1172 
1173     /*
1174      * Check we can submit requests to all engines simultaneously. We
1175      * send a recursive batch to each engine - checking that we don't
1176      * block doing so, and that they don't complete too soon.
1177      */
1178 
1179     request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1180     if (!request)
1181         return -ENOMEM;
1182 
1183     err = igt_live_test_begin(&t, i915, __func__, "");
1184     if (err)
1185         goto out_free;
1186 
1187     batch = recursive_batch(i915);
1188     if (IS_ERR(batch)) {
1189         err = PTR_ERR(batch);
1190         pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1191         goto out_free;
1192     }
1193 
1194     i915_vma_lock(batch);
1195 
1196     idx = 0;
1197     for_each_uabi_engine(engine, i915) {
1198         request[idx] = intel_engine_create_kernel_request(engine);
1199         if (IS_ERR(request[idx])) {
1200             err = PTR_ERR(request[idx]);
1201             pr_err("%s: Request allocation failed with err=%d\n",
1202                    __func__, err);
1203             goto out_request;
1204         }
1205 
1206         err = i915_request_await_object(request[idx], batch->obj, 0);
1207         if (err == 0)
1208             err = i915_vma_move_to_active(batch, request[idx], 0);
1209         GEM_BUG_ON(err);
1210 
1211         err = engine->emit_bb_start(request[idx],
1212                         batch->node.start,
1213                         batch->node.size,
1214                         0);
1215         GEM_BUG_ON(err);
1216         request[idx]->batch = batch;
1217 
1218         i915_request_get(request[idx]);
1219         i915_request_add(request[idx]);
1220         idx++;
1221     }
1222 
1223     i915_vma_unlock(batch);
1224 
1225     idx = 0;
1226     for_each_uabi_engine(engine, i915) {
1227         if (i915_request_completed(request[idx])) {
1228             pr_err("%s(%s): request completed too early!\n",
1229                    __func__, engine->name);
1230             err = -EINVAL;
1231             goto out_request;
1232         }
1233         idx++;
1234     }
1235 
1236     err = recursive_batch_resolve(batch);
1237     if (err) {
1238         pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1239         goto out_request;
1240     }
1241 
1242     idx = 0;
1243     for_each_uabi_engine(engine, i915) {
1244         long timeout;
1245 
1246         timeout = i915_request_wait(request[idx], 0,
1247                         MAX_SCHEDULE_TIMEOUT);
1248         if (timeout < 0) {
1249             err = timeout;
1250             pr_err("%s: error waiting for request on %s, err=%d\n",
1251                    __func__, engine->name, err);
1252             goto out_request;
1253         }
1254 
1255         GEM_BUG_ON(!i915_request_completed(request[idx]));
1256         i915_request_put(request[idx]);
1257         request[idx] = NULL;
1258         idx++;
1259     }
1260 
1261     err = igt_live_test_end(&t);
1262 
1263 out_request:
1264     idx = 0;
1265     for_each_uabi_engine(engine, i915) {
1266         if (request[idx])
1267             i915_request_put(request[idx]);
1268         idx++;
1269     }
1270     i915_vma_unpin(batch);
1271     i915_vma_put(batch);
1272 out_free:
1273     kfree(request);
1274     return err;
1275 }
1276 
1277 static int live_sequential_engines(void *arg)
1278 {
1279     struct drm_i915_private *i915 = arg;
1280     const unsigned int nengines = num_uabi_engines(i915);
1281     struct i915_request **request;
1282     struct i915_request *prev = NULL;
1283     struct intel_engine_cs *engine;
1284     struct igt_live_test t;
1285     unsigned int idx;
1286     int err;
1287 
1288     /*
1289      * Check we can submit requests to all engines sequentially, such
1290      * that each successive request waits for the earlier ones. This
1291      * tests that we don't execute requests out of order, even though
1292      * they are running on independent engines.
1293      */
1294 
1295     request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1296     if (!request)
1297         return -ENOMEM;
1298 
1299     err = igt_live_test_begin(&t, i915, __func__, "");
1300     if (err)
1301         goto out_free;
1302 
1303     idx = 0;
1304     for_each_uabi_engine(engine, i915) {
1305         struct i915_vma *batch;
1306 
1307         batch = recursive_batch(i915);
1308         if (IS_ERR(batch)) {
1309             err = PTR_ERR(batch);
1310             pr_err("%s: Unable to create batch for %s, err=%d\n",
1311                    __func__, engine->name, err);
1312             goto out_free;
1313         }
1314 
1315         i915_vma_lock(batch);
1316         request[idx] = intel_engine_create_kernel_request(engine);
1317         if (IS_ERR(request[idx])) {
1318             err = PTR_ERR(request[idx]);
1319             pr_err("%s: Request allocation failed for %s with err=%d\n",
1320                    __func__, engine->name, err);
1321             goto out_unlock;
1322         }
1323 
1324         if (prev) {
1325             err = i915_request_await_dma_fence(request[idx],
1326                                &prev->fence);
1327             if (err) {
1328                 i915_request_add(request[idx]);
1329                 pr_err("%s: Request await failed for %s with err=%d\n",
1330                        __func__, engine->name, err);
1331                 goto out_unlock;
1332             }
1333         }
1334 
1335         err = i915_request_await_object(request[idx],
1336                         batch->obj, false);
1337         if (err == 0)
1338             err = i915_vma_move_to_active(batch, request[idx], 0);
1339         GEM_BUG_ON(err);
1340 
1341         err = engine->emit_bb_start(request[idx],
1342                         batch->node.start,
1343                         batch->node.size,
1344                         0);
1345         GEM_BUG_ON(err);
1346         request[idx]->batch = batch;
1347 
1348         i915_request_get(request[idx]);
1349         i915_request_add(request[idx]);
1350 
1351         prev = request[idx];
1352         idx++;
1353 
1354 out_unlock:
1355         i915_vma_unlock(batch);
1356         if (err)
1357             goto out_request;
1358     }
1359 
1360     idx = 0;
1361     for_each_uabi_engine(engine, i915) {
1362         long timeout;
1363 
1364         if (i915_request_completed(request[idx])) {
1365             pr_err("%s(%s): request completed too early!\n",
1366                    __func__, engine->name);
1367             err = -EINVAL;
1368             goto out_request;
1369         }
1370 
1371         err = recursive_batch_resolve(request[idx]->batch);
1372         if (err) {
1373             pr_err("%s: failed to resolve batch, err=%d\n",
1374                    __func__, err);
1375             goto out_request;
1376         }
1377 
1378         timeout = i915_request_wait(request[idx], 0,
1379                         MAX_SCHEDULE_TIMEOUT);
1380         if (timeout < 0) {
1381             err = timeout;
1382             pr_err("%s: error waiting for request on %s, err=%d\n",
1383                    __func__, engine->name, err);
1384             goto out_request;
1385         }
1386 
1387         GEM_BUG_ON(!i915_request_completed(request[idx]));
1388         idx++;
1389     }
1390 
1391     err = igt_live_test_end(&t);
1392 
1393 out_request:
1394     idx = 0;
1395     for_each_uabi_engine(engine, i915) {
1396         u32 *cmd;
1397 
1398         if (!request[idx])
1399             break;
1400 
1401         cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1402                                I915_MAP_WC);
1403         if (!IS_ERR(cmd)) {
1404             *cmd = MI_BATCH_BUFFER_END;
1405 
1406             __i915_gem_object_flush_map(request[idx]->batch->obj,
1407                             0, sizeof(*cmd));
1408             i915_gem_object_unpin_map(request[idx]->batch->obj);
1409 
1410             intel_gt_chipset_flush(engine->gt);
1411         }
1412 
1413         i915_vma_put(request[idx]->batch);
1414         i915_request_put(request[idx]);
1415         idx++;
1416     }
1417 out_free:
1418     kfree(request);
1419     return err;
1420 }
1421 
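     /*
      * Worker: repeatedly submit a single request on the engine's kernel
      * context and synchronously wait for it, counting completed
      * submit+wait cycles until the timeout.
      */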
1422 static int __live_parallel_engine1(void *arg)
1423 {
1424     struct intel_engine_cs *engine = arg;
1425     IGT_TIMEOUT(end_time);
1426     unsigned long count;
1427     int err = 0;
1428 
1429     count = 0;
1430     intel_engine_pm_get(engine);
1431     do {
1432         struct i915_request *rq;
1433 
1434         rq = i915_request_create(engine->kernel_context);
1435         if (IS_ERR(rq)) {
1436             err = PTR_ERR(rq);
1437             break;
1438         }
1439 
1440         i915_request_get(rq);
1441         i915_request_add(rq);
1442 
1443         err = 0;
1444         if (i915_request_wait(rq, 0, HZ) < 0)
1445             err = -ETIME;
1446         i915_request_put(rq);
1447         if (err)
1448             break;
1449 
1450         count++;
1451     } while (!__igt_timeout(end_time, NULL));
1452     intel_engine_pm_put(engine);
1453 
1454     pr_info("%s: %lu request + sync\n", engine->name, count);
1455     return err;
1456 }
1457 
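     /*
      * Worker: submit requests back-to-back without waiting on them,
      * counting how many can be queued before the timeout.
      */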
1458 static int __live_parallel_engineN(void *arg)
1459 {
1460     struct intel_engine_cs *engine = arg;
1461     IGT_TIMEOUT(end_time);
1462     unsigned long count;
1463     int err = 0;
1464 
1465     count = 0;
1466     intel_engine_pm_get(engine);
1467     do {
1468         struct i915_request *rq;
1469 
1470         rq = i915_request_create(engine->kernel_context);
1471         if (IS_ERR(rq)) {
1472             err = PTR_ERR(rq);
1473             break;
1474         }
1475 
1476         i915_request_add(rq);
1477         count++;
1478     } while (!__igt_timeout(end_time, NULL));
1479     intel_engine_pm_put(engine);
1480 
1481     pr_info("%s: %lu requests\n", engine->name, count);
1482     return err;
1483 }
1484 
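     /*
      * Simple rendezvous on i915->selftest.counter: each worker checks in
      * with wake_all(), and wait_for_all() returns once every worker has
      * done so, or -ETIME after the selftest timeout.
      */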
1485 static bool wake_all(struct drm_i915_private *i915)
1486 {
1487     if (atomic_dec_and_test(&i915->selftest.counter)) {
1488         wake_up_var(&i915->selftest.counter);
1489         return true;
1490     }
1491 
1492     return false;
1493 }
1494 
1495 static int wait_for_all(struct drm_i915_private *i915)
1496 {
1497     if (wake_all(i915))
1498         return 0;
1499 
1500     if (wait_var_event_timeout(&i915->selftest.counter,
1501                    !atomic_read(&i915->selftest.counter),
1502                    i915_selftest.timeout_jiffies))
1503         return 0;
1504 
1505     return -ETIME;
1506 }
1507 
1508 static int __live_parallel_spin(void *arg)
1509 {
1510     struct intel_engine_cs *engine = arg;
1511     struct igt_spinner spin;
1512     struct i915_request *rq;
1513     int err = 0;
1514 
1515     /*
1516      * Create a spinner running for eternity on each engine. If a second
1517      * spinner is incorrectly placed on the same engine, it will not be
1518      * able to start in time.
1519      */
1520 
1521     if (igt_spinner_init(&spin, engine->gt)) {
1522         wake_all(engine->i915);
1523         return -ENOMEM;
1524     }
1525 
1526     intel_engine_pm_get(engine);
1527     rq = igt_spinner_create_request(&spin,
1528                     engine->kernel_context,
1529                     MI_NOOP); /* no preemption */
1530     intel_engine_pm_put(engine);
1531     if (IS_ERR(rq)) {
1532         err = PTR_ERR(rq);
1533         if (err == -ENODEV)
1534             err = 0;
1535         wake_all(engine->i915);
1536         goto out_spin;
1537     }
1538 
1539     i915_request_get(rq);
1540     i915_request_add(rq);
1541     if (igt_wait_for_spinner(&spin, rq)) {
1542         /* Occupy this engine for the whole test */
1543         err = wait_for_all(engine->i915);
1544     } else {
1545         pr_err("Failed to start spinner on %s\n", engine->name);
1546         err = -EINVAL;
1547     }
1548     igt_spinner_end(&spin);
1549 
1550     if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1551         err = -EIO;
1552     i915_request_put(rq);
1553 
1554 out_spin:
1555     igt_spinner_fini(&spin);
1556     return err;
1557 }
1558 
1559 static int live_parallel_engines(void *arg)
1560 {
1561     struct drm_i915_private *i915 = arg;
1562     static int (* const func[])(void *arg) = {
1563         __live_parallel_engine1,
1564         __live_parallel_engineN,
1565         __live_parallel_spin,
1566         NULL,
1567     };
1568     const unsigned int nengines = num_uabi_engines(i915);
1569     struct intel_engine_cs *engine;
1570     int (* const *fn)(void *arg);
1571     struct task_struct **tsk;
1572     int err = 0;
1573 
1574     /*
1575      * Check we can submit requests to all engines concurrently. This
1576      * tests that we load up the system maximally.
1577      */
1578 
1579     tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1580     if (!tsk)
1581         return -ENOMEM;
1582 
1583     for (fn = func; !err && *fn; fn++) {
1584         char name[KSYM_NAME_LEN];
1585         struct igt_live_test t;
1586         unsigned int idx;
1587 
1588         snprintf(name, sizeof(name), "%ps", *fn);
1589         err = igt_live_test_begin(&t, i915, __func__, name);
1590         if (err)
1591             break;
1592 
1593         atomic_set(&i915->selftest.counter, nengines);
1594 
1595         idx = 0;
1596         for_each_uabi_engine(engine, i915) {
1597             tsk[idx] = kthread_run(*fn, engine,
1598                            "igt/parallel:%s",
1599                            engine->name);
1600             if (IS_ERR(tsk[idx])) {
1601                 err = PTR_ERR(tsk[idx]);
1602                 break;
1603             }
1604             get_task_struct(tsk[idx++]);
1605         }
1606 
1607         yield(); /* start all threads before we kthread_stop() */
1608 
1609         idx = 0;
1610         for_each_uabi_engine(engine, i915) {
1611             int status;
1612 
1613             if (IS_ERR(tsk[idx]))
1614                 break;
1615 
1616             status = kthread_stop(tsk[idx]);
1617             if (status && !err)
1618                 err = status;
1619 
1620             put_task_struct(tsk[idx++]);
1621         }
1622 
1623         if (igt_live_test_end(&t))
1624             err = -EIO;
1625     }
1626 
1627     kfree(tsk);
1628     return err;
1629 }
1630 
1631 static int
1632 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1633 {
1634     struct i915_request *rq;
1635     int ret;
1636 
1637     /*
1638      * Before execlists, all contexts share the same ringbuffer. With
1639      * execlists, each context/engine has a separate ringbuffer and
1640      * for the purposes of this test, inexhaustible.
1641      *
1642      * For the global ringbuffer though, we have to be very careful
1643      * that we do not wrap while preventing the execution of requests
0644      * with an unsignaled fence.
1645      */
1646     if (HAS_EXECLISTS(ctx->i915))
1647         return INT_MAX;
1648 
1649     rq = igt_request_alloc(ctx, engine);
1650     if (IS_ERR(rq)) {
1651         ret = PTR_ERR(rq);
1652     } else {
1653         int sz;
1654 
1655         ret = rq->ring->size - rq->reserved_space;
1656         i915_request_add(rq);
1657 
1658         sz = rq->ring->emit - rq->head;
1659         if (sz < 0)
1660             sz += rq->ring->size;
1661         ret /= sz;
1662         ret /= 2; /* leave half spare, in case of emergency! */
1663     }
1664 
1665     return ret;
1666 }
1667 
1668 static int live_breadcrumbs_smoketest(void *arg)
1669 {
1670     struct drm_i915_private *i915 = arg;
1671     const unsigned int nengines = num_uabi_engines(i915);
1672     const unsigned int ncpus = num_online_cpus();
1673     unsigned long num_waits, num_fences;
1674     struct intel_engine_cs *engine;
1675     struct task_struct **threads;
1676     struct igt_live_test live;
1677     intel_wakeref_t wakeref;
1678     struct smoketest *smoke;
1679     unsigned int n, idx;
1680     struct file *file;
1681     int ret = 0;
1682 
1683     /*
1684      * Smoketest our breadcrumb/signal handling for requests across multiple
1685      * threads. A very simple test to only catch the most egregious of bugs.
1686      * See __igt_breadcrumbs_smoketest();
1687      *
1688      * On real hardware this time.
1689      */
1690 
1691     wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1692 
1693     file = mock_file(i915);
1694     if (IS_ERR(file)) {
1695         ret = PTR_ERR(file);
1696         goto out_rpm;
1697     }
1698 
1699     smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1700     if (!smoke) {
1701         ret = -ENOMEM;
1702         goto out_file;
1703     }
1704 
1705     threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1706     if (!threads) {
1707         ret = -ENOMEM;
1708         goto out_smoke;
1709     }
1710 
1711     smoke[0].request_alloc = __live_request_alloc;
1712     smoke[0].ncontexts = 64;
1713     smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1714                     sizeof(*smoke[0].contexts),
1715                     GFP_KERNEL);
1716     if (!smoke[0].contexts) {
1717         ret = -ENOMEM;
1718         goto out_threads;
1719     }
1720 
1721     for (n = 0; n < smoke[0].ncontexts; n++) {
1722         smoke[0].contexts[n] = live_context(i915, file);
1723         if (IS_ERR(smoke[0].contexts[n])) {
1724             ret = PTR_ERR(smoke[0].contexts[n]);
1725             goto out_contexts;
1726         }
1727     }
1728 
1729     ret = igt_live_test_begin(&live, i915, __func__, "");
1730     if (ret)
1731         goto out_contexts;
1732 
1733     idx = 0;
1734     for_each_uabi_engine(engine, i915) {
1735         smoke[idx] = smoke[0];
1736         smoke[idx].engine = engine;
1737         smoke[idx].max_batch =
1738             max_batches(smoke[0].contexts[0], engine);
1739         if (smoke[idx].max_batch < 0) {
1740             ret = smoke[idx].max_batch;
1741             goto out_flush;
1742         }
1743         /* One ring interleaved between requests from all cpus */
1744         smoke[idx].max_batch /= num_online_cpus() + 1;
1745         pr_debug("Limiting batches to %d requests on %s\n",
1746              smoke[idx].max_batch, engine->name);
1747 
1748         for (n = 0; n < ncpus; n++) {
1749             struct task_struct *tsk;
1750 
1751             tsk = kthread_run(__igt_breadcrumbs_smoketest,
1752                       &smoke[idx], "igt/%d.%d", idx, n);
1753             if (IS_ERR(tsk)) {
1754                 ret = PTR_ERR(tsk);
1755                 goto out_flush;
1756             }
1757 
1758             get_task_struct(tsk);
1759             threads[idx * ncpus + n] = tsk;
1760         }
1761 
1762         idx++;
1763     }
1764 
1765     yield(); /* start all threads before we begin */
1766     msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1767 
1768 out_flush:
1769     idx = 0;
1770     num_waits = 0;
1771     num_fences = 0;
1772     for_each_uabi_engine(engine, i915) {
1773         for (n = 0; n < ncpus; n++) {
1774             struct task_struct *tsk = threads[idx * ncpus + n];
1775             int err;
1776 
1777             if (!tsk)
1778                 continue;
1779 
1780             err = kthread_stop(tsk);
1781             if (err < 0 && !ret)
1782                 ret = err;
1783 
1784             put_task_struct(tsk);
1785         }
1786 
1787         num_waits += atomic_long_read(&smoke[idx].num_waits);
1788         num_fences += atomic_long_read(&smoke[idx].num_fences);
1789         idx++;
1790     }
1791     pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1792         num_waits, num_fences, idx, ncpus);
1793 
1794     ret = igt_live_test_end(&live) ?: ret;
1795 out_contexts:
1796     kfree(smoke[0].contexts);
1797 out_threads:
1798     kfree(threads);
1799 out_smoke:
1800     kfree(smoke);
1801 out_file:
1802     fput(file);
1803 out_rpm:
1804     intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1805 
1806     return ret;
1807 }
1808 
1809 int i915_request_live_selftests(struct drm_i915_private *i915)
1810 {
1811     static const struct i915_subtest tests[] = {
1812         SUBTEST(live_nop_request),
1813         SUBTEST(live_all_engines),
1814         SUBTEST(live_sequential_engines),
1815         SUBTEST(live_parallel_engines),
1816         SUBTEST(live_empty_request),
1817         SUBTEST(live_cancel_request),
1818         SUBTEST(live_breadcrumbs_smoketest),
1819     };
1820 
1821     if (intel_gt_is_wedged(to_gt(i915)))
1822         return 0;
1823 
1824     return i915_subtests(tests, i915);
1825 }
1826 
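/*
 * switch_to_kernel_sync() - drain a context by queueing a kernel-context
 * request behind its last request and waiting on it, then flushing
 * submission until the engine reports idle. Any error already passed in
 * via err is preserved; -ETIME is only reported if nothing else failed.
 */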
1827 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1828 {
1829     struct i915_request *rq;
1830     struct dma_fence *fence;
1831 
1832     rq = intel_engine_create_kernel_request(ce->engine);
1833     if (IS_ERR(rq))
1834         return PTR_ERR(rq);
1835 
1836     fence = i915_active_fence_get(&ce->timeline->last_request);
1837     if (fence) {
1838         i915_request_await_dma_fence(rq, fence);
1839         dma_fence_put(fence);
1840     }
1841 
1842     rq = i915_request_get(rq);
1843     i915_request_add(rq);
1844     if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1845         err = -ETIME;
1846     i915_request_put(rq);
1847 
1848     while (!err && !intel_engine_is_idle(ce->engine))
1849         intel_engine_flush_submission(ce->engine);
1850 
1851     return err;
1852 }
1853 
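/*
 * perf_stats collects per-engine results for the perf selftests: request
 * count, elapsed wall time, busy time (where engine busyness stats are
 * available) and the context's accumulated runtime. perf_series simply
 * carries one pinned context per uabi engine for the series workloads.
 */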
1854 struct perf_stats {
1855     struct intel_engine_cs *engine;
1856     unsigned long count;
1857     ktime_t time;
1858     ktime_t busy;
1859     u64 runtime;
1860 };
1861 
1862 struct perf_series {
1863     struct drm_i915_private *i915;
1864     unsigned int nengines;
1865     struct intel_context *ce[];
1866 };
1867 
1868 static int cmp_u32(const void *A, const void *B)
1869 {
1870     const u32 *a = A, *b = B;
1871 
1872     return *a - *b;
1873 }
1874 
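/*
 * trifilter() - collapse TF_COUNT samples into one filtered value.
 * The samples are sorted and the middle three combined, with the median
 * given double weight:
 *
 *	sum = a[1] + 2 * a[2] + a[3]
 *
 * i.e. the filtered average scaled by 1 << TF_BIAS. Callers therefore
 * report "cycles >> TF_BIAS", and cycles_to_ns() divides by the same
 * factor after converting to nanoseconds.
 */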
1875 static u32 trifilter(u32 *a)
1876 {
1877     u64 sum;
1878 
1879 #define TF_COUNT 5
1880     sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1881 
1882     sum = mul_u32_u32(a[2], 2);
1883     sum += a[1];
1884     sum += a[3];
1885 
1886     GEM_BUG_ON(sum > U32_MAX);
1887     return sum;
1888 #define TF_BIAS 2
1889 }
1890 
1891 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1892 {
1893     u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1894 
1895     return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1896 }
1897 
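/*
 * The emit_*() helpers below build small command sequences by hand:
 * emit_timestamp_store() records the engine's RING_TIMESTAMP register
 * into a GGTT offset, emit_store_dw() writes an immediate dword, and
 * emit_semaphore_poll() spins on MI_SEMAPHORE_WAIT until the dword at
 * the given offset satisfies the requested comparison mode.
 */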
1898 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1899 {
1900     *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1901     *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1902     *cs++ = offset;
1903     *cs++ = 0;
1904 
1905     return cs;
1906 }
1907 
1908 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1909 {
1910     *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1911     *cs++ = offset;
1912     *cs++ = 0;
1913     *cs++ = value;
1914 
1915     return cs;
1916 }
1917 
1918 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1919 {
1920     *cs++ = MI_SEMAPHORE_WAIT |
1921         MI_SEMAPHORE_GLOBAL_GTT |
1922         MI_SEMAPHORE_POLL |
1923         mode;
1924     *cs++ = value;
1925     *cs++ = offset;
1926     *cs++ = 0;
1927 
1928     return cs;
1929 }
1930 
1931 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1932 {
1933     return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1934 }
1935 
1936 static void semaphore_set(u32 *sema, u32 value)
1937 {
1938     WRITE_ONCE(*sema, value);
1939     wmb(); /* flush the update to the cache, and beyond */
1940 }
1941 
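/*
 * hwsp_scratch() hands back a small scratch area (21 zeroed dwords) carved
 * out of the engine's status page, starting 1000 bytes in -- presumably
 * clear of the dwords the driver itself uses. hwsp_offset() converts a CPU
 * pointer into that page into the GGTT address the GPU commands need.
 */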
1942 static u32 *hwsp_scratch(const struct intel_context *ce)
1943 {
1944     return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1945 }
1946 
1947 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1948 {
1949     return (i915_ggtt_offset(ce->engine->status_page.vma) +
1950         offset_in_page(dw));
1951 }
1952 
1953 static int measure_semaphore_response(struct intel_context *ce)
1954 {
1955     u32 *sema = hwsp_scratch(ce);
1956     const u32 offset = hwsp_offset(ce, sema);
1957     u32 elapsed[TF_COUNT], cycles;
1958     struct i915_request *rq;
1959     u32 *cs;
1960     int err;
1961     int i;
1962 
1963     /*
1964      * Measure how many cycles it takes for the HW to detect the change
1965      * in a semaphore value.
1966      *
1967      *    A: read CS_TIMESTAMP from CPU
1968      *    poke semaphore
1969      *    B: read CS_TIMESTAMP on GPU
1970      *
1971      * Semaphore latency: B - A
1972      */
1973 
1974     semaphore_set(sema, -1);
1975 
1976     rq = i915_request_create(ce);
1977     if (IS_ERR(rq))
1978         return PTR_ERR(rq);
1979 
1980     cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1981     if (IS_ERR(cs)) {
1982         i915_request_add(rq);
1983         err = PTR_ERR(cs);
1984         goto err;
1985     }
1986 
1987     cs = emit_store_dw(cs, offset, 0);
1988     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1989         cs = emit_semaphore_poll_until(cs, offset, i);
1990         cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1991         cs = emit_store_dw(cs, offset, 0);
1992     }
1993 
1994     intel_ring_advance(rq, cs);
1995     i915_request_add(rq);
1996 
1997     if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1998         err = -EIO;
1999         goto err;
2000     }
2001 
2002     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2003         preempt_disable();
2004         cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2005         semaphore_set(sema, i);
2006         preempt_enable();
2007 
2008         if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2009             err = -EIO;
2010             goto err;
2011         }
2012 
2013         elapsed[i - 1] = sema[i] - cycles;
2014     }
2015 
2016     cycles = trifilter(elapsed);
2017     pr_info("%s: semaphore response %d cycles, %lluns\n",
2018         ce->engine->name, cycles >> TF_BIAS,
2019         cycles_to_ns(ce->engine, cycles));
2020 
2021     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2022 
2023 err:
2024     intel_gt_set_wedged(ce->engine->gt);
2025     return err;
2026 }
2027 
2028 static int measure_idle_dispatch(struct intel_context *ce)
2029 {
2030     u32 *sema = hwsp_scratch(ce);
2031     const u32 offset = hwsp_offset(ce, sema);
2032     u32 elapsed[TF_COUNT], cycles;
2033     u32 *cs;
2034     int err;
2035     int i;
2036 
2037     /*
2038      * Measure how long it takes for us to submit a request while the
2039      * engine is idle but still resting in our context.
2040      *
2041      *    A: read CS_TIMESTAMP from CPU
2042      *    submit request
2043      *    B: read CS_TIMESTAMP on GPU
2044      *
2045      * Submission latency: B - A
2046      */
2047 
2048     for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2049         struct i915_request *rq;
2050 
2051         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2052         if (err)
2053             return err;
2054 
2055         rq = i915_request_create(ce);
2056         if (IS_ERR(rq)) {
2057             err = PTR_ERR(rq);
2058             goto err;
2059         }
2060 
2061         cs = intel_ring_begin(rq, 4);
2062         if (IS_ERR(cs)) {
2063             i915_request_add(rq);
2064             err = PTR_ERR(cs);
2065             goto err;
2066         }
2067 
2068         cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2069 
2070         intel_ring_advance(rq, cs);
2071 
2072         preempt_disable();
2073         local_bh_disable();
2074         elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2075         i915_request_add(rq);
2076         local_bh_enable();
2077         preempt_enable();
2078     }
2079 
2080     err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2081     if (err)
2082         goto err;
2083 
2084     for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2085         elapsed[i] = sema[i] - elapsed[i];
2086 
2087     cycles = trifilter(elapsed);
2088     pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2089         ce->engine->name, cycles >> TF_BIAS,
2090         cycles_to_ns(ce->engine, cycles));
2091 
2092     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2093 
2094 err:
2095     intel_gt_set_wedged(ce->engine->gt);
2096     return err;
2097 }
2098 
2099 static int measure_busy_dispatch(struct intel_context *ce)
2100 {
2101     u32 *sema = hwsp_scratch(ce);
2102     const u32 offset = hwsp_offset(ce, sema);
2103     u32 elapsed[TF_COUNT + 1], cycles;
2104     u32 *cs;
2105     int err;
2106     int i;
2107 
2108     /*
2109      * Measure how long it takes for us to submit a request while the
2110      * engine is busy, polling on a semaphore in our context. With
2111      * direct submission, this will include the cost of a lite restore.
2112      *
2113      *    A: read CS_TIMESTAMP from CPU
2114      *    submit request
2115      *    B: read CS_TIMESTAMP on GPU
2116      *
2117      * Submission latency: B - A
2118      */
2119 
2120     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2121         struct i915_request *rq;
2122 
2123         rq = i915_request_create(ce);
2124         if (IS_ERR(rq)) {
2125             err = PTR_ERR(rq);
2126             goto err;
2127         }
2128 
2129         cs = intel_ring_begin(rq, 12);
2130         if (IS_ERR(cs)) {
2131             i915_request_add(rq);
2132             err = PTR_ERR(cs);
2133             goto err;
2134         }
2135 
2136         cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2137         cs = emit_semaphore_poll_until(cs, offset, i);
2138         cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2139 
2140         intel_ring_advance(rq, cs);
2141 
2142         if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2143             err = -EIO;
2144             goto err;
2145         }
2146 
2147         preempt_disable();
2148         local_bh_disable();
2149         elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2150         i915_request_add(rq);
2151         local_bh_enable();
2152         semaphore_set(sema, i - 1);
2153         preempt_enable();
2154     }
2155 
2156     wait_for(READ_ONCE(sema[i - 1]), 500);
2157     semaphore_set(sema, i - 1);
2158 
2159     for (i = 1; i <= TF_COUNT; i++) {
2160         GEM_BUG_ON(sema[i] == -1);
2161         elapsed[i - 1] = sema[i] - elapsed[i];
2162     }
2163 
2164     cycles = trifilter(elapsed);
2165     pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2166         ce->engine->name, cycles >> TF_BIAS,
2167         cycles_to_ns(ce->engine, cycles));
2168 
2169     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2170 
2171 err:
2172     intel_gt_set_wedged(ce->engine->gt);
2173     return err;
2174 }
2175 
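/*
 * plug() blocks an engine by submitting a kernel-context request that
 * spins on an MI_SEMAPHORE_WAIT, so that everything queued afterwards
 * piles up behind it until the semaphore is released with
 * semaphore_set().
 */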
2176 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2177 {
2178     const u32 offset =
2179         i915_ggtt_offset(engine->status_page.vma) +
2180         offset_in_page(sema);
2181     struct i915_request *rq;
2182     u32 *cs;
2183 
2184     rq = i915_request_create(engine->kernel_context);
2185     if (IS_ERR(rq))
2186         return PTR_ERR(rq);
2187 
2188     cs = intel_ring_begin(rq, 4);
2189     if (IS_ERR(cs)) {
2190         i915_request_add(rq);
2191         return PTR_ERR(cs);
2192     }
2193 
2194     cs = emit_semaphore_poll(cs, mode, value, offset);
2195 
2196     intel_ring_advance(rq, cs);
2197     i915_request_add(rq);
2198 
2199     return 0;
2200 }
2201 
2202 static int measure_inter_request(struct intel_context *ce)
2203 {
2204     u32 *sema = hwsp_scratch(ce);
2205     const u32 offset = hwsp_offset(ce, sema);
2206     u32 elapsed[TF_COUNT + 1], cycles;
2207     struct i915_sw_fence *submit;
2208     int i, err;
2209 
2210     /*
2211      * Measure how long it takes to advance from one request into the
2212      * next. Between each request we flush the GPU caches to memory,
2213      * update the breadcrumbs, and then invalidate those caches.
2214      * We queue up all the requests to be submitted in one batch so
2215      * it should be one set of contiguous measurements.
2216      *
2217      *    A: read CS_TIMESTAMP on GPU
2218      *    advance request
2219      *    B: read CS_TIMESTAMP on GPU
2220      *
2221      * Request latency: B - A
2222      */
2223 
2224     err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2225     if (err)
2226         return err;
2227 
2228     submit = heap_fence_create(GFP_KERNEL);
2229     if (!submit) {
2230         semaphore_set(sema, 1);
2231         return -ENOMEM;
2232     }
2233 
2234     intel_engine_flush_submission(ce->engine);
2235     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2236         struct i915_request *rq;
2237         u32 *cs;
2238 
2239         rq = i915_request_create(ce);
2240         if (IS_ERR(rq)) {
2241             err = PTR_ERR(rq);
2242             goto err_submit;
2243         }
2244 
2245         err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2246                                submit,
2247                                GFP_KERNEL);
2248         if (err < 0) {
2249             i915_request_add(rq);
2250             goto err_submit;
2251         }
2252 
2253         cs = intel_ring_begin(rq, 4);
2254         if (IS_ERR(cs)) {
2255             i915_request_add(rq);
2256             err = PTR_ERR(cs);
2257             goto err_submit;
2258         }
2259 
2260         cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2261 
2262         intel_ring_advance(rq, cs);
2263         i915_request_add(rq);
2264     }
2265     i915_sw_fence_commit(submit);
2266     intel_engine_flush_submission(ce->engine);
2267     heap_fence_put(submit);
2268 
2269     semaphore_set(sema, 1);
2270     err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2271     if (err)
2272         goto err;
2273 
2274     for (i = 1; i <= TF_COUNT; i++)
2275         elapsed[i - 1] = sema[i + 1] - sema[i];
2276 
2277     cycles = trifilter(elapsed);
2278     pr_info("%s: inter-request latency %d cycles, %lluns\n",
2279         ce->engine->name, cycles >> TF_BIAS,
2280         cycles_to_ns(ce->engine, cycles));
2281 
2282     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2283 
2284 err_submit:
2285     i915_sw_fence_commit(submit);
2286     heap_fence_put(submit);
2287     semaphore_set(sema, 1);
2288 err:
2289     intel_gt_set_wedged(ce->engine->gt);
2290     return err;
2291 }
2292 
2293 static int measure_context_switch(struct intel_context *ce)
2294 {
2295     u32 *sema = hwsp_scratch(ce);
2296     const u32 offset = hwsp_offset(ce, sema);
2297     struct i915_request *fence = NULL;
2298     u32 elapsed[TF_COUNT + 1], cycles;
2299     int i, j, err;
2300     u32 *cs;
2301 
2302     /*
2303      * Measure how long it takes to advance from one request in one
2304      * context to a request in another context. This allows us to
2305      * measure how long the context save/restore takes, along with all
2306      * the inter-context setup we require.
2307      *
2308      *    A: read CS_TIMESTAMP on GPU
2309      *    switch context
2310      *    B: read CS_TIMESTAMP on GPU
2311      *
2312      * Context switch latency: B - A
2313      */
2314 
2315     err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2316     if (err)
2317         return err;
2318 
2319     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2320         struct intel_context *arr[] = {
2321             ce, ce->engine->kernel_context
2322         };
2323         u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2324 
2325         for (j = 0; j < ARRAY_SIZE(arr); j++) {
2326             struct i915_request *rq;
2327 
2328             rq = i915_request_create(arr[j]);
2329             if (IS_ERR(rq)) {
2330                 err = PTR_ERR(rq);
2331                 goto err_fence;
2332             }
2333 
2334             if (fence) {
2335                 err = i915_request_await_dma_fence(rq,
2336                                    &fence->fence);
2337                 if (err) {
2338                     i915_request_add(rq);
2339                     goto err_fence;
2340                 }
2341             }
2342 
2343             cs = intel_ring_begin(rq, 4);
2344             if (IS_ERR(cs)) {
2345                 i915_request_add(rq);
2346                 err = PTR_ERR(cs);
2347                 goto err_fence;
2348             }
2349 
2350             cs = emit_timestamp_store(cs, ce, addr);
2351             addr += sizeof(u32);
2352 
2353             intel_ring_advance(rq, cs);
2354 
2355             i915_request_put(fence);
2356             fence = i915_request_get(rq);
2357 
2358             i915_request_add(rq);
2359         }
2360     }
2361     i915_request_put(fence);
2362     intel_engine_flush_submission(ce->engine);
2363 
2364     semaphore_set(sema, 1);
2365     err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2366     if (err)
2367         goto err;
2368 
2369     for (i = 1; i <= TF_COUNT; i++)
2370         elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2371 
2372     cycles = trifilter(elapsed);
2373     pr_info("%s: context switch latency %d cycles, %lluns\n",
2374         ce->engine->name, cycles >> TF_BIAS,
2375         cycles_to_ns(ce->engine, cycles));
2376 
2377     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2378 
2379 err_fence:
2380     i915_request_put(fence);
2381     semaphore_set(sema, 1);
2382 err:
2383     intel_gt_set_wedged(ce->engine->gt);
2384     return err;
2385 }
2386 
2387 static int measure_preemption(struct intel_context *ce)
2388 {
2389     u32 *sema = hwsp_scratch(ce);
2390     const u32 offset = hwsp_offset(ce, sema);
2391     u32 elapsed[TF_COUNT], cycles;
2392     u32 *cs;
2393     int err;
2394     int i;
2395 
2396     /*
2397      * We measure two latencies while triggering preemption. The first
2398      * latency is how long it takes for us to submit a preempting request.
2399      * The second latency is how long it takes for us to return from the
2400      * preemption back to the original context.
2401      *
2402      *    A: read CS_TIMESTAMP from CPU
2403      *    submit preemption
2404      *    B: read CS_TIMESTAMP on GPU (in preempting context)
2405      *    context switch
2406      *    C: read CS_TIMESTAMP on GPU (in original context)
2407      *
2408      * Preemption dispatch latency: B - A
2409      * Preemption switch latency: C - B
2410      */
2411 
2412     if (!intel_engine_has_preemption(ce->engine))
2413         return 0;
2414 
2415     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2416         u32 addr = offset + 2 * i * sizeof(u32);
2417         struct i915_request *rq;
2418 
2419         rq = i915_request_create(ce);
2420         if (IS_ERR(rq)) {
2421             err = PTR_ERR(rq);
2422             goto err;
2423         }
2424 
2425         cs = intel_ring_begin(rq, 12);
2426         if (IS_ERR(cs)) {
2427             i915_request_add(rq);
2428             err = PTR_ERR(cs);
2429             goto err;
2430         }
2431 
2432         cs = emit_store_dw(cs, addr, -1);
2433         cs = emit_semaphore_poll_until(cs, offset, i);
2434         cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2435 
2436         intel_ring_advance(rq, cs);
2437         i915_request_add(rq);
2438 
2439         if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2440             err = -EIO;
2441             goto err;
2442         }
2443 
2444         rq = i915_request_create(ce->engine->kernel_context);
2445         if (IS_ERR(rq)) {
2446             err = PTR_ERR(rq);
2447             goto err;
2448         }
2449 
2450         cs = intel_ring_begin(rq, 8);
2451         if (IS_ERR(cs)) {
2452             i915_request_add(rq);
2453             err = PTR_ERR(cs);
2454             goto err;
2455         }
2456 
2457         cs = emit_timestamp_store(cs, ce, addr);
2458         cs = emit_store_dw(cs, offset, i);
2459 
2460         intel_ring_advance(rq, cs);
2461         rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2462 
2463         elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2464         i915_request_add(rq);
2465     }
2466 
2467     if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2468         err = -EIO;
2469         goto err;
2470     }
2471 
2472     for (i = 1; i <= TF_COUNT; i++)
2473         elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2474 
2475     cycles = trifilter(elapsed);
2476     pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2477         ce->engine->name, cycles >> TF_BIAS,
2478         cycles_to_ns(ce->engine, cycles));
2479 
2480     for (i = 1; i <= TF_COUNT; i++)
2481         elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2482 
2483     cycles = trifilter(elapsed);
2484     pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2485         ce->engine->name, cycles >> TF_BIAS,
2486         cycles_to_ns(ce->engine, cycles));
2487 
2488     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2489 
2490 err:
2491     intel_gt_set_wedged(ce->engine->gt);
2492     return err;
2493 }
2494 
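/*
 * A trivial dma-fence callback used by measure_completion() to note, from
 * the CPU side, the moment the request's completion has been signalled.
 */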
2495 struct signal_cb {
2496     struct dma_fence_cb base;
2497     bool seen;
2498 };
2499 
2500 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2501 {
2502     struct signal_cb *s = container_of(cb, typeof(*s), base);
2503 
2504     smp_store_mb(s->seen, true); /* be safe, be strong */
2505 }
2506 
2507 static int measure_completion(struct intel_context *ce)
2508 {
2509     u32 *sema = hwsp_scratch(ce);
2510     const u32 offset = hwsp_offset(ce, sema);
2511     u32 elapsed[TF_COUNT], cycles;
2512     u32 *cs;
2513     int err;
2514     int i;
2515 
2516     /*
2517      * Measure how long it takes for the signal (interrupt) to be
2518      * sent by the GPU and then processed by the CPU.
2519      *
2520      *    A: read CS_TIMESTAMP on GPU
2521      *    signal
2522      *    B: read CS_TIMESTAMP from CPU
2523      *
2524      * Completion latency: B - A
2525      */
2526 
2527     for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2528         struct signal_cb cb = { .seen = false };
2529         struct i915_request *rq;
2530 
2531         rq = i915_request_create(ce);
2532         if (IS_ERR(rq)) {
2533             err = PTR_ERR(rq);
2534             goto err;
2535         }
2536 
2537         cs = intel_ring_begin(rq, 12);
2538         if (IS_ERR(cs)) {
2539             i915_request_add(rq);
2540             err = PTR_ERR(cs);
2541             goto err;
2542         }
2543 
2544         cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2545         cs = emit_semaphore_poll_until(cs, offset, i);
2546         cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2547 
2548         intel_ring_advance(rq, cs);
2549 
2550         dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2551         i915_request_add(rq);
2552 
2553         intel_engine_flush_submission(ce->engine);
2554         if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2555             err = -EIO;
2556             goto err;
2557         }
2558 
2559         preempt_disable();
2560         semaphore_set(sema, i);
2561         while (!READ_ONCE(cb.seen))
2562             cpu_relax();
2563 
2564         elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2565         preempt_enable();
2566     }
2567 
2568     err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2569     if (err)
2570         goto err;
2571 
2572     for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2573         GEM_BUG_ON(sema[i + 1] == -1);
2574         elapsed[i] = elapsed[i] - sema[i + 1];
2575     }
2576 
2577     cycles = trifilter(elapsed);
2578     pr_info("%s: completion latency %d cycles, %lluns\n",
2579         ce->engine->name, cycles >> TF_BIAS,
2580         cycles_to_ns(ce->engine, cycles));
2581 
2582     return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2583 
2584 err:
2585     intel_gt_set_wedged(ce->engine->gt);
2586     return err;
2587 }
2588 
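/*
 * rps_pin()/rps_unpin() bracket the measurements with the GPU frequency
 * pinned at maximum (and forcewake held), presumably so that frequency
 * changes do not distort the latency samples.
 */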
2589 static void rps_pin(struct intel_gt *gt)
2590 {
2591     /* Pin the frequency to max */
2592     atomic_inc(&gt->rps.num_waiters);
2593     intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2594 
2595     mutex_lock(&gt->rps.lock);
2596     intel_rps_set(&gt->rps, gt->rps.max_freq);
2597     mutex_unlock(&gt->rps.lock);
2598 }
2599 
2600 static void rps_unpin(struct intel_gt *gt)
2601 {
2602     intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2603     atomic_dec(&gt->rps.num_waiters);
2604 }
2605 
2606 static int perf_request_latency(void *arg)
2607 {
2608     struct drm_i915_private *i915 = arg;
2609     struct intel_engine_cs *engine;
2610     struct pm_qos_request qos;
2611     int err = 0;
2612 
2613     if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2614         return 0;
2615 
2616     cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2617 
2618     for_each_uabi_engine(engine, i915) {
2619         struct intel_context *ce;
2620 
2621         ce = intel_context_create(engine);
2622         if (IS_ERR(ce)) {
2623             err = PTR_ERR(ce);
2624             goto out;
2625         }
2626 
2627         err = intel_context_pin(ce);
2628         if (err) {
2629             intel_context_put(ce);
2630             goto out;
2631         }
2632 
2633         st_engine_heartbeat_disable(engine);
2634         rps_pin(engine->gt);
2635 
2636         if (err == 0)
2637             err = measure_semaphore_response(ce);
2638         if (err == 0)
2639             err = measure_idle_dispatch(ce);
2640         if (err == 0)
2641             err = measure_busy_dispatch(ce);
2642         if (err == 0)
2643             err = measure_inter_request(ce);
2644         if (err == 0)
2645             err = measure_context_switch(ce);
2646         if (err == 0)
2647             err = measure_preemption(ce);
2648         if (err == 0)
2649             err = measure_completion(ce);
2650 
2651         rps_unpin(engine->gt);
2652         st_engine_heartbeat_enable(engine);
2653 
2654         intel_context_unpin(ce);
2655         intel_context_put(ce);
2656         if (err)
2657             goto out;
2658     }
2659 
2660 out:
2661     if (igt_flush_test(i915))
2662         err = -EIO;
2663 
2664     cpu_latency_qos_remove_request(&qos);
2665     return err;
2666 }
2667 
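/*
 * The three series workloads exercise different submission patterns over
 * the per-engine contexts in struct perf_series:
 *   s_sync0 - submit one request and wait for it before issuing the next;
 *   s_sync1 - keep one request in flight, waiting on the previous one;
 *   s_many  - submit as fast as possible without ever waiting.
 */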
2668 static int s_sync0(void *arg)
2669 {
2670     struct perf_series *ps = arg;
2671     IGT_TIMEOUT(end_time);
2672     unsigned int idx = 0;
2673     int err = 0;
2674 
2675     GEM_BUG_ON(!ps->nengines);
2676     do {
2677         struct i915_request *rq;
2678 
2679         rq = i915_request_create(ps->ce[idx]);
2680         if (IS_ERR(rq)) {
2681             err = PTR_ERR(rq);
2682             break;
2683         }
2684 
2685         i915_request_get(rq);
2686         i915_request_add(rq);
2687 
2688         if (i915_request_wait(rq, 0, HZ / 5) < 0)
2689             err = -ETIME;
2690         i915_request_put(rq);
2691         if (err)
2692             break;
2693 
2694         if (++idx == ps->nengines)
2695             idx = 0;
2696     } while (!__igt_timeout(end_time, NULL));
2697 
2698     return err;
2699 }
2700 
2701 static int s_sync1(void *arg)
2702 {
2703     struct perf_series *ps = arg;
2704     struct i915_request *prev = NULL;
2705     IGT_TIMEOUT(end_time);
2706     unsigned int idx = 0;
2707     int err = 0;
2708 
2709     GEM_BUG_ON(!ps->nengines);
2710     do {
2711         struct i915_request *rq;
2712 
2713         rq = i915_request_create(ps->ce[idx]);
2714         if (IS_ERR(rq)) {
2715             err = PTR_ERR(rq);
2716             break;
2717         }
2718 
2719         i915_request_get(rq);
2720         i915_request_add(rq);
2721 
2722         if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2723             err = -ETIME;
2724         i915_request_put(prev);
2725         prev = rq;
2726         if (err)
2727             break;
2728 
2729         if (++idx == ps->nengines)
2730             idx = 0;
2731     } while (!__igt_timeout(end_time, NULL));
2732     i915_request_put(prev);
2733 
2734     return err;
2735 }
2736 
2737 static int s_many(void *arg)
2738 {
2739     struct perf_series *ps = arg;
2740     IGT_TIMEOUT(end_time);
2741     unsigned int idx = 0;
2742 
2743     GEM_BUG_ON(!ps->nengines);
2744     do {
2745         struct i915_request *rq;
2746 
2747         rq = i915_request_create(ps->ce[idx]);
2748         if (IS_ERR(rq))
2749             return PTR_ERR(rq);
2750 
2751         i915_request_add(rq);
2752 
2753         if (++idx == ps->nengines)
2754             idx = 0;
2755     } while (!__igt_timeout(end_time, NULL));
2756 
2757     return 0;
2758 }
2759 
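/*
 * perf_series_engines() runs each of the workloads above from a single
 * thread, cycling through one pinned context per uabi engine, and reports
 * the resulting busyness, context runtime and wall time for every engine.
 */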
2760 static int perf_series_engines(void *arg)
2761 {
2762     struct drm_i915_private *i915 = arg;
2763     static int (* const func[])(void *arg) = {
2764         s_sync0,
2765         s_sync1,
2766         s_many,
2767         NULL,
2768     };
2769     const unsigned int nengines = num_uabi_engines(i915);
2770     struct intel_engine_cs *engine;
2771     int (* const *fn)(void *arg);
2772     struct pm_qos_request qos;
2773     struct perf_stats *stats;
2774     struct perf_series *ps;
2775     unsigned int idx;
2776     int err = 0;
2777 
2778     stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2779     if (!stats)
2780         return -ENOMEM;
2781 
2782     ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2783     if (!ps) {
2784         kfree(stats);
2785         return -ENOMEM;
2786     }
2787 
2788     cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2789 
2790     ps->i915 = i915;
2791     ps->nengines = nengines;
2792 
2793     idx = 0;
2794     for_each_uabi_engine(engine, i915) {
2795         struct intel_context *ce;
2796 
2797         ce = intel_context_create(engine);
2798         if (IS_ERR(ce)) {
2799             err = PTR_ERR(ce);
2800             goto out;
2801         }
2802 
2803         err = intel_context_pin(ce);
2804         if (err) {
2805             intel_context_put(ce);
2806             goto out;
2807         }
2808 
2809         ps->ce[idx++] = ce;
2810     }
2811     GEM_BUG_ON(idx != ps->nengines);
2812 
2813     for (fn = func; *fn && !err; fn++) {
2814         char name[KSYM_NAME_LEN];
2815         struct igt_live_test t;
2816 
2817         snprintf(name, sizeof(name), "%ps", *fn);
2818         err = igt_live_test_begin(&t, i915, __func__, name);
2819         if (err)
2820             break;
2821 
2822         for (idx = 0; idx < nengines; idx++) {
2823             struct perf_stats *p =
2824                 memset(&stats[idx], 0, sizeof(stats[idx]));
2825             struct intel_context *ce = ps->ce[idx];
2826 
2827             p->engine = ps->ce[idx]->engine;
2828             intel_engine_pm_get(p->engine);
2829 
2830             if (intel_engine_supports_stats(p->engine))
2831                 p->busy = intel_engine_get_busy_time(p->engine,
2832                                      &p->time) + 1;
2833             else
2834                 p->time = ktime_get();
2835             p->runtime = -intel_context_get_total_runtime_ns(ce);
2836         }
2837 
2838         err = (*fn)(ps);
2839         if (igt_live_test_end(&t))
2840             err = -EIO;
2841 
2842         for (idx = 0; idx < nengines; idx++) {
2843             struct perf_stats *p = &stats[idx];
2844             struct intel_context *ce = ps->ce[idx];
2845             int integer, decimal;
2846             u64 busy, dt, now;
2847 
2848             if (p->busy)
2849                 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2850                                            &now),
2851                             p->busy - 1);
2852             else
2853                 now = ktime_get();
2854             p->time = ktime_sub(now, p->time);
2855 
2856             err = switch_to_kernel_sync(ce, err);
2857             p->runtime += intel_context_get_total_runtime_ns(ce);
2858             intel_engine_pm_put(p->engine);
2859 
2860             busy = 100 * ktime_to_ns(p->busy);
2861             dt = ktime_to_ns(p->time);
2862             if (dt) {
2863                 integer = div64_u64(busy, dt);
2864                 busy -= integer * dt;
2865                 decimal = div64_u64(100 * busy, dt);
2866             } else {
2867                 integer = 0;
2868                 decimal = 0;
2869             }
2870 
2871             pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2872                 name, p->engine->name, ce->timeline->seqno,
2873                 integer, decimal,
2874                 div_u64(p->runtime, 1000 * 1000),
2875                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2876         }
2877     }
2878 
2879 out:
2880     for (idx = 0; idx < nengines; idx++) {
2881         if (IS_ERR_OR_NULL(ps->ce[idx]))
2882             break;
2883 
2884         intel_context_unpin(ps->ce[idx]);
2885         intel_context_put(ps->ce[idx]);
2886     }
2887     kfree(ps);
2888 
2889     cpu_latency_qos_remove_request(&qos);
2890     kfree(stats);
2891     return err;
2892 }
2893 
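/*
 * The parallel workloads mirror the series ones, but each instance drives
 * a single engine from its own kthread:
 *   p_sync0 - wait for each request before submitting the next;
 *   p_sync1 - wait on the previous request, keeping one in flight;
 *   p_many  - submit continuously without waiting.
 */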
2894 static int p_sync0(void *arg)
2895 {
2896     struct perf_stats *p = arg;
2897     struct intel_engine_cs *engine = p->engine;
2898     struct intel_context *ce;
2899     IGT_TIMEOUT(end_time);
2900     unsigned long count;
2901     bool busy;
2902     int err = 0;
2903 
2904     ce = intel_context_create(engine);
2905     if (IS_ERR(ce))
2906         return PTR_ERR(ce);
2907 
2908     err = intel_context_pin(ce);
2909     if (err) {
2910         intel_context_put(ce);
2911         return err;
2912     }
2913 
2914     if (intel_engine_supports_stats(engine)) {
2915         p->busy = intel_engine_get_busy_time(engine, &p->time);
2916         busy = true;
2917     } else {
2918         p->time = ktime_get();
2919         busy = false;
2920     }
2921 
2922     count = 0;
2923     do {
2924         struct i915_request *rq;
2925 
2926         rq = i915_request_create(ce);
2927         if (IS_ERR(rq)) {
2928             err = PTR_ERR(rq);
2929             break;
2930         }
2931 
2932         i915_request_get(rq);
2933         i915_request_add(rq);
2934 
2935         err = 0;
2936         if (i915_request_wait(rq, 0, HZ) < 0)
2937             err = -ETIME;
2938         i915_request_put(rq);
2939         if (err)
2940             break;
2941 
2942         count++;
2943     } while (!__igt_timeout(end_time, NULL));
2944 
2945     if (busy) {
2946         ktime_t now;
2947 
2948         p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2949                     p->busy);
2950         p->time = ktime_sub(now, p->time);
2951     } else {
2952         p->time = ktime_sub(ktime_get(), p->time);
2953     }
2954 
2955     err = switch_to_kernel_sync(ce, err);
2956     p->runtime = intel_context_get_total_runtime_ns(ce);
2957     p->count = count;
2958 
2959     intel_context_unpin(ce);
2960     intel_context_put(ce);
2961     return err;
2962 }
2963 
2964 static int p_sync1(void *arg)
2965 {
2966     struct perf_stats *p = arg;
2967     struct intel_engine_cs *engine = p->engine;
2968     struct i915_request *prev = NULL;
2969     struct intel_context *ce;
2970     IGT_TIMEOUT(end_time);
2971     unsigned long count;
2972     bool busy;
2973     int err = 0;
2974 
2975     ce = intel_context_create(engine);
2976     if (IS_ERR(ce))
2977         return PTR_ERR(ce);
2978 
2979     err = intel_context_pin(ce);
2980     if (err) {
2981         intel_context_put(ce);
2982         return err;
2983     }
2984 
2985     if (intel_engine_supports_stats(engine)) {
2986         p->busy = intel_engine_get_busy_time(engine, &p->time);
2987         busy = true;
2988     } else {
2989         p->time = ktime_get();
2990         busy = false;
2991     }
2992 
2993     count = 0;
2994     do {
2995         struct i915_request *rq;
2996 
2997         rq = i915_request_create(ce);
2998         if (IS_ERR(rq)) {
2999             err = PTR_ERR(rq);
3000             break;
3001         }
3002 
3003         i915_request_get(rq);
3004         i915_request_add(rq);
3005 
3006         err = 0;
3007         if (prev && i915_request_wait(prev, 0, HZ) < 0)
3008             err = -ETIME;
3009         i915_request_put(prev);
3010         prev = rq;
3011         if (err)
3012             break;
3013 
3014         count++;
3015     } while (!__igt_timeout(end_time, NULL));
3016     i915_request_put(prev);
3017 
3018     if (busy) {
3019         ktime_t now;
3020 
3021         p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3022                     p->busy);
3023         p->time = ktime_sub(now, p->time);
3024     } else {
3025         p->time = ktime_sub(ktime_get(), p->time);
3026     }
3027 
3028     err = switch_to_kernel_sync(ce, err);
3029     p->runtime = intel_context_get_total_runtime_ns(ce);
3030     p->count = count;
3031 
3032     intel_context_unpin(ce);
3033     intel_context_put(ce);
3034     return err;
3035 }
3036 
3037 static int p_many(void *arg)
3038 {
3039     struct perf_stats *p = arg;
3040     struct intel_engine_cs *engine = p->engine;
3041     struct intel_context *ce;
3042     IGT_TIMEOUT(end_time);
3043     unsigned long count;
3044     int err = 0;
3045     bool busy;
3046 
3047     ce = intel_context_create(engine);
3048     if (IS_ERR(ce))
3049         return PTR_ERR(ce);
3050 
3051     err = intel_context_pin(ce);
3052     if (err) {
3053         intel_context_put(ce);
3054         return err;
3055     }
3056 
3057     if (intel_engine_supports_stats(engine)) {
3058         p->busy = intel_engine_get_busy_time(engine, &p->time);
3059         busy = true;
3060     } else {
3061         p->time = ktime_get();
3062         busy = false;
3063     }
3064 
3065     count = 0;
3066     do {
3067         struct i915_request *rq;
3068 
3069         rq = i915_request_create(ce);
3070         if (IS_ERR(rq)) {
3071             err = PTR_ERR(rq);
3072             break;
3073         }
3074 
3075         i915_request_add(rq);
3076         count++;
3077     } while (!__igt_timeout(end_time, NULL));
3078 
3079     if (busy) {
3080         ktime_t now;
3081 
3082         p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3083                     p->busy);
3084         p->time = ktime_sub(now, p->time);
3085     } else {
3086         p->time = ktime_sub(ktime_get(), p->time);
3087     }
3088 
3089     err = switch_to_kernel_sync(ce, err);
3090     p->runtime = intel_context_get_total_runtime_ns(ce);
3091     p->count = count;
3092 
3093     intel_context_unpin(ce);
3094     intel_context_put(ce);
3095     return err;
3096 }
3097 
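/*
 * perf_parallel_engines() spawns one kthread per uabi engine, all running
 * the same workload concurrently until the selftest timeout expires, then
 * collects and prints the per-engine request count, busyness and runtime.
 */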
3098 static int perf_parallel_engines(void *arg)
3099 {
3100     struct drm_i915_private *i915 = arg;
3101     static int (* const func[])(void *arg) = {
3102         p_sync0,
3103         p_sync1,
3104         p_many,
3105         NULL,
3106     };
3107     const unsigned int nengines = num_uabi_engines(i915);
3108     struct intel_engine_cs *engine;
3109     int (* const *fn)(void *arg);
3110     struct pm_qos_request qos;
3111     struct {
3112         struct perf_stats p;
3113         struct task_struct *tsk;
3114     } *engines;
3115     int err = 0;
3116 
3117     engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3118     if (!engines)
3119         return -ENOMEM;
3120 
3121     cpu_latency_qos_add_request(&qos, 0);
3122 
3123     for (fn = func; *fn; fn++) {
3124         char name[KSYM_NAME_LEN];
3125         struct igt_live_test t;
3126         unsigned int idx;
3127 
3128         snprintf(name, sizeof(name), "%ps", *fn);
3129         err = igt_live_test_begin(&t, i915, __func__, name);
3130         if (err)
3131             break;
3132 
3133         atomic_set(&i915->selftest.counter, nengines);
3134 
3135         idx = 0;
3136         for_each_uabi_engine(engine, i915) {
3137             intel_engine_pm_get(engine);
3138 
3139             memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3140             engines[idx].p.engine = engine;
3141 
3142             engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3143                                "igt:%s", engine->name);
3144             if (IS_ERR(engines[idx].tsk)) {
3145                 err = PTR_ERR(engines[idx].tsk);
3146                 intel_engine_pm_put(engine);
3147                 break;
3148             }
3149             get_task_struct(engines[idx++].tsk);
3150         }
3151 
3152         yield(); /* start all threads before we kthread_stop() */
3153 
3154         idx = 0;
3155         for_each_uabi_engine(engine, i915) {
3156             int status;
3157 
3158             if (IS_ERR(engines[idx].tsk))
3159                 break;
3160 
3161             status = kthread_stop(engines[idx].tsk);
3162             if (status && !err)
3163                 err = status;
3164 
3165             intel_engine_pm_put(engine);
3166             put_task_struct(engines[idx++].tsk);
3167         }
3168 
3169         if (igt_live_test_end(&t))
3170             err = -EIO;
3171         if (err)
3172             break;
3173 
3174         idx = 0;
3175         for_each_uabi_engine(engine, i915) {
3176             struct perf_stats *p = &engines[idx].p;
3177             u64 busy = 100 * ktime_to_ns(p->busy);
3178             u64 dt = ktime_to_ns(p->time);
3179             int integer, decimal;
3180 
3181             if (dt) {
3182                 integer = div64_u64(busy, dt);
3183                 busy -= integer * dt;
3184                 decimal = div64_u64(100 * busy, dt);
3185             } else {
3186                 integer = 0;
3187                 decimal = 0;
3188             }
3189 
3190             GEM_BUG_ON(engine != p->engine);
3191             pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3192                 name, engine->name, p->count, integer, decimal,
3193                 div_u64(p->runtime, 1000 * 1000),
3194                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3195             idx++;
3196         }
3197     }
3198 
3199     cpu_latency_qos_remove_request(&qos);
3200     kfree(engines);
3201     return err;
3202 }
3203 
3204 int i915_request_perf_selftests(struct drm_i915_private *i915)
3205 {
3206     static const struct i915_subtest tests[] = {
3207         SUBTEST(perf_request_latency),
3208         SUBTEST(perf_series_engines),
3209         SUBTEST(perf_parallel_engines),
3210     };
3211 
3212     if (intel_gt_is_wedged(to_gt(i915)))
3213         return 0;
3214 
3215     return i915_subtests(tests, i915);
3216 }