0001 // SPDX-License-Identifier: MIT
0002 /*
0003  * Copyright © 2017-2018 Intel Corporation
0004  */
0005 
0006 #include <linux/prime_numbers.h>
0007 #include <linux/string_helpers.h>
0008 
0009 #include "intel_context.h"
0010 #include "intel_engine_heartbeat.h"
0011 #include "intel_engine_pm.h"
0012 #include "intel_engine_regs.h"
0013 #include "intel_gpu_commands.h"
0014 #include "intel_gt.h"
0015 #include "intel_gt_requests.h"
0016 #include "intel_ring.h"
0017 #include "selftest_engine_heartbeat.h"
0018 
0019 #include "../selftests/i915_random.h"
0020 #include "../i915_selftest.h"
0021 
0022 #include "selftests/igt_flush_test.h"
0023 #include "selftests/lib_sw_fence.h"
0024 #include "selftests/mock_gem_device.h"
0025 #include "selftests/mock_timeline.h"
0026 
0027 static struct page *hwsp_page(struct intel_timeline *tl)
0028 {
0029     struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;
0030 
0031     GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
0032     return sg_page(obj->mm.pages->sgl);
0033 }
0034 
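     /*
      * Convert a timeline's HWSP slot into a global index: the kernel
      * address of the backing page plus the slot offset, divided by
      * TIMELINE_SEQNO_BYTES. Two timelines that were accidentally handed
      * the same slot therefore map to the same index, which the radix
      * tree below uses to detect duplicate allocations.
      */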
0035 static unsigned long hwsp_cacheline(struct intel_timeline *tl)
0036 {
0037     unsigned long address = (unsigned long)page_address(hwsp_page(tl));
0038 
0039     return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES;
0040 }
0041 
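     /*
      * Pin a timeline's HWSP for CPU/GPU access. The backing object must
      * be locked under a ww acquire context, so on -EDEADLK we back off
      * and retry rather than fail the selftest.
      */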
0042 static int selftest_tl_pin(struct intel_timeline *tl)
0043 {
0044     struct i915_gem_ww_ctx ww;
0045     int err;
0046 
0047     i915_gem_ww_ctx_init(&ww, false);
0048 retry:
0049     err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww);
0050     if (!err)
0051         err = intel_timeline_pin(tl, &ww);
0052 
0053     if (err == -EDEADLK) {
0054         err = i915_gem_ww_ctx_backoff(&ww);
0055         if (!err)
0056             goto retry;
0057     }
0058     i915_gem_ww_ctx_fini(&ww);
0059     return err;
0060 }
0061 
0062 /* Only half of the seqnos are usable, see __intel_timeline_get_seqno() */
0063 #define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2)
0064 
0065 struct mock_hwsp_freelist {
0066     struct intel_gt *gt;
0067     struct radix_tree_root cachelines;
0068     struct intel_timeline **history;
0069     unsigned long count, max;
0070     struct rnd_state prng;
0071 };
0072 
0073 enum {
0074     SHUFFLE = BIT(0),
0075 };
0076 
0077 static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
0078                    unsigned int idx,
0079                    struct intel_timeline *tl)
0080 {
0081     tl = xchg(&state->history[idx], tl);
0082     if (tl) {
0083         radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
0084         intel_timeline_unpin(tl);
0085         intel_timeline_put(tl);
0086     }
0087 }
0088 
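     /*
      * Create and pin @count new timelines, recording each one's HWSP slot
      * in the radix tree; radix_tree_insert() failing with -EEXIST means two
      * live timelines were given the same slot. The most recent timelines
      * are kept in a fixed-size history (optionally shuffled), and a random
      * number of older entries are then released so their slots return to
      * the HWSP freelist for the next iteration.
      */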
0089 static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
0090                 unsigned int count,
0091                 unsigned int flags)
0092 {
0093     struct intel_timeline *tl;
0094     unsigned int idx;
0095 
0096     while (count--) {
0097         unsigned long cacheline;
0098         int err;
0099 
0100         tl = intel_timeline_create(state->gt);
0101         if (IS_ERR(tl))
0102             return PTR_ERR(tl);
0103 
0104         err = selftest_tl_pin(tl);
0105         if (err) {
0106             intel_timeline_put(tl);
0107             return err;
0108         }
0109 
0110         cacheline = hwsp_cacheline(tl);
0111         err = radix_tree_insert(&state->cachelines, cacheline, tl);
0112         if (err) {
0113             if (err == -EEXIST) {
0114                 pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
0115                        cacheline);
0116             }
0117             intel_timeline_unpin(tl);
0118             intel_timeline_put(tl);
0119             return err;
0120         }
0121 
0122         idx = state->count++ % state->max;
0123         __mock_hwsp_record(state, idx, tl);
0124     }
0125 
0126     if (flags & SHUFFLE)
0127         i915_prandom_shuffle(state->history,
0128                      sizeof(*state->history),
0129                      min(state->count, state->max),
0130                      &state->prng);
0131 
0132     count = i915_prandom_u32_max_state(min(state->count, state->max),
0133                        &state->prng);
0134     while (count--) {
0135         idx = --state->count % state->max;
0136         __mock_hwsp_record(state, idx, NULL);
0137     }
0138 
0139     return 0;
0140 }
0141 
0142 static int mock_hwsp_freelist(void *arg)
0143 {
0144     struct mock_hwsp_freelist state;
0145     struct drm_i915_private *i915;
0146     const struct {
0147         const char *name;
0148         unsigned int flags;
0149     } phases[] = {
0150         { "linear", 0 },
0151         { "shuffled", SHUFFLE },
0152         { },
0153     }, *p;
0154     unsigned int na;
0155     int err = 0;
0156 
0157     i915 = mock_gem_device();
0158     if (!i915)
0159         return -ENOMEM;
0160 
0161     INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
0162     state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
0163 
0164     state.gt = to_gt(i915);
0165 
0166     /*
0167      * Create a bunch of timelines and check that their HWSP cachelines
0168      * do not overlap. Free some, and try again.
0169      */
0170 
0171     state.max = PAGE_SIZE / sizeof(*state.history);
0172     state.count = 0;
0173     state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
0174     if (!state.history) {
0175         err = -ENOMEM;
0176         goto err_put;
0177     }
0178 
0179     for (p = phases; p->name; p++) {
0180         pr_debug("%s(%s)\n", __func__, p->name);
0181         for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
0182             err = __mock_hwsp_timeline(&state, na, p->flags);
0183             if (err)
0184                 goto out;
0185         }
0186     }
0187 
0188 out:
0189     for (na = 0; na < state.max; na++)
0190         __mock_hwsp_record(&state, na, NULL);
0191     kfree(state.history);
0192 err_put:
0193     mock_destroy_device(i915);
0194     return err;
0195 }
0196 
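     /*
      * A single step of the sync-map tests: @expected is what
      * __intel_timeline_sync_is_later() should report for (ctx, seqno)
      * given the steps applied so far, and @set records the seqno in the
      * map afterwards so that later steps observe it.
      */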
0197 struct __igt_sync {
0198     const char *name;
0199     u32 seqno;
0200     bool expected;
0201     bool set;
0202 };
0203 
0204 static int __igt_sync(struct intel_timeline *tl,
0205               u64 ctx,
0206               const struct __igt_sync *p,
0207               const char *name)
0208 {
0209     int ret;
0210 
0211     if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
0212         pr_err("%s: %s(ctx=%llu, seqno=%u) expected sync_is_later() to return %s\n",
0213                name, p->name, ctx, p->seqno, str_yes_no(p->expected));
0214         return -EINVAL;
0215     }
0216 
0217     if (p->set) {
0218         ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
0219         if (ret)
0220             return ret;
0221     }
0222 
0223     return 0;
0224 }
0225 
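     /*
      * Exercise context ids of the form 2^order + offset (offset in
      * {-1, 0, +1}) to poke at the branching points of the underlying
      * sync map. The first pass applies each table entry across all such
      * ids before moving on; the second pass, on a fresh timeline, runs
      * the whole table to completion for one id at a time.
      */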
0226 static int igt_sync(void *arg)
0227 {
0228     const struct __igt_sync pass[] = {
0229         { "unset", 0, false, false },
0230         { "new", 0, false, true },
0231         { "0a", 0, true, true },
0232         { "1a", 1, false, true },
0233         { "1b", 1, true, true },
0234         { "0b", 0, true, false },
0235         { "2a", 2, false, true },
0236         { "4", 4, false, true },
0237         { "INT_MAX", INT_MAX, false, true },
0238         { "INT_MAX-1", INT_MAX-1, true, false },
0239         { "INT_MAX+1", (u32)INT_MAX+1, false, true },
0240         { "INT_MAX", INT_MAX, true, false },
0241         { "UINT_MAX", UINT_MAX, false, true },
0242         { "wrap", 0, false, true },
0243         { "unwrap", UINT_MAX, true, false },
0244         {},
0245     }, *p;
0246     struct intel_timeline tl;
0247     int order, offset;
0248     int ret = -ENODEV;
0249 
0250     mock_timeline_init(&tl, 0);
0251     for (p = pass; p->name; p++) {
0252         for (order = 1; order < 64; order++) {
0253             for (offset = -1; offset <= (order > 1); offset++) {
0254                 u64 ctx = BIT_ULL(order) + offset;
0255 
0256                 ret = __igt_sync(&tl, ctx, p, "1");
0257                 if (ret)
0258                     goto out;
0259             }
0260         }
0261     }
0262     mock_timeline_fini(&tl);
0263 
0264     mock_timeline_init(&tl, 0);
0265     for (order = 1; order < 64; order++) {
0266         for (offset = -1; offset <= (order > 1); offset++) {
0267             u64 ctx = BIT_ULL(order) + offset;
0268 
0269             for (p = pass; p->name; p++) {
0270                 ret = __igt_sync(&tl, ctx, p, "2");
0271                 if (ret)
0272                     goto out;
0273             }
0274         }
0275     }
0276 
0277 out:
0278     mock_timeline_fini(&tl);
0279     return ret;
0280 }
0281 
0282 static unsigned int random_engine(struct rnd_state *rnd)
0283 {
0284     return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
0285 }
0286 
0287 static int bench_sync(void *arg)
0288 {
0289     struct rnd_state prng;
0290     struct intel_timeline tl;
0291     unsigned long end_time, count;
0292     u64 prng32_1M;
0293     ktime_t kt;
0294     int order, last_order;
0295 
0296     mock_timeline_init(&tl, 0);
0297 
0298     /* Lookups from cache are very fast and so the random number generation
0299      * and the loop itself become significant factors in the per-iteration
0300      * timings. We compensate by measuring the overhead of the prng and
0301      * subtracting it from the reported results.
0302      */
0303     prandom_seed_state(&prng, i915_selftest.random_seed);
0304     count = 0;
0305     kt = ktime_get();
0306     end_time = jiffies + HZ/10;
0307     do {
0308         u32 x;
0309 
0310         /* Make sure the compiler doesn't optimise away the prng call */
0311         WRITE_ONCE(x, prandom_u32_state(&prng));
0312 
0313         count++;
0314     } while (!time_after(jiffies, end_time));
0315     kt = ktime_sub(ktime_get(), kt);
0316     pr_debug("%s: %lu random evaluations, %lluns/prng\n",
0317          __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0318     prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);
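         /*
          * prng32_1M is the cost of one 32-bit prng draw in fixed point
          * (nanoseconds << 20). The random-id loops below subtract
          * (count * prng32_1M * 2) >> 20 from their measured time, i.e.
          * they charge two 32-bit draws per iteration, so that only the
          * sync-map cost is reported.
          */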
0319 
0320     /* Benchmark (only) setting random context ids */
0321     prandom_seed_state(&prng, i915_selftest.random_seed);
0322     count = 0;
0323     kt = ktime_get();
0324     end_time = jiffies + HZ/10;
0325     do {
0326         u64 id = i915_prandom_u64_state(&prng);
0327 
0328         __intel_timeline_sync_set(&tl, id, 0);
0329         count++;
0330     } while (!time_after(jiffies, end_time));
0331     kt = ktime_sub(ktime_get(), kt);
0332     kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
0333     pr_info("%s: %lu random insertions, %lluns/insert\n",
0334         __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0335 
0336     /* Benchmark looking up the exact same context ids as we just set */
0337     prandom_seed_state(&prng, i915_selftest.random_seed);
0338     end_time = count;
0339     kt = ktime_get();
0340     while (end_time--) {
0341         u64 id = i915_prandom_u64_state(&prng);
0342 
0343         if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
0344             mock_timeline_fini(&tl);
0345             pr_err("Lookup of %llu failed\n", id);
0346             return -EINVAL;
0347         }
0348     }
0349     kt = ktime_sub(ktime_get(), kt);
0350     kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
0351     pr_info("%s: %lu random lookups, %lluns/lookup\n",
0352         __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0353 
0354     mock_timeline_fini(&tl);
0355     cond_resched();
0356 
0357     mock_timeline_init(&tl, 0);
0358 
0359     /* Benchmark setting the first N (in order) contexts */
0360     count = 0;
0361     kt = ktime_get();
0362     end_time = jiffies + HZ/10;
0363     do {
0364         __intel_timeline_sync_set(&tl, count++, 0);
0365     } while (!time_after(jiffies, end_time));
0366     kt = ktime_sub(ktime_get(), kt);
0367     pr_info("%s: %lu in-order insertions, %lluns/insert\n",
0368         __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0369 
0370     /* Benchmark looking up the exact same context ids as we just set */
0371     end_time = count;
0372     kt = ktime_get();
0373     while (end_time--) {
0374         if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
0375             pr_err("Lookup of %lu failed\n", end_time);
0376             mock_timeline_fini(&tl);
0377             return -EINVAL;
0378         }
0379     }
0380     kt = ktime_sub(ktime_get(), kt);
0381     pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
0382         __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0383 
0384     mock_timeline_fini(&tl);
0385     cond_resched();
0386 
0387     mock_timeline_init(&tl, 0);
0388 
0389     /* Benchmark searching for a random context id and maybe changing it */
0390     prandom_seed_state(&prng, i915_selftest.random_seed);
0391     count = 0;
0392     kt = ktime_get();
0393     end_time = jiffies + HZ/10;
0394     do {
0395         u32 id = random_engine(&prng);
0396         u32 seqno = prandom_u32_state(&prng);
0397 
0398         if (!__intel_timeline_sync_is_later(&tl, id, seqno))
0399             __intel_timeline_sync_set(&tl, id, seqno);
0400 
0401         count++;
0402     } while (!time_after(jiffies, end_time));
0403     kt = ktime_sub(ktime_get(), kt);
0404     kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
0405     pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
0406         __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
0407     mock_timeline_fini(&tl);
0408     cond_resched();
0409 
0410     /* Benchmark searching for a known context id and changing the seqno */
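         /*
          * The ({ ... }) statement expression steps 'order' through the
          * Fibonacci sequence (1, 2, 3, 5, 8, ...), giving a spread of
          * map sizes without walking every order up to 32. Each pass
          * cycles through BIT(order) ids spaced BIT(order) apart while
          * looking for phase changes in the container's behaviour.
          */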
0411     for (last_order = 1, order = 1; order < 32;
0412          ({ int tmp = last_order; last_order = order; order += tmp; })) {
0413         unsigned int mask = BIT(order) - 1;
0414 
0415         mock_timeline_init(&tl, 0);
0416 
0417         count = 0;
0418         kt = ktime_get();
0419         end_time = jiffies + HZ/10;
0420         do {
0421             /* Without assuming too many details of the underlying
0422              * implementation, try to identify its phase-changes
0423              * (if any)!
0424              */
0425             u64 id = (u64)(count & mask) << order;
0426 
0427             __intel_timeline_sync_is_later(&tl, id, 0);
0428             __intel_timeline_sync_set(&tl, id, 0);
0429 
0430             count++;
0431         } while (!time_after(jiffies, end_time));
0432         kt = ktime_sub(ktime_get(), kt);
0433         pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
0434             __func__, count, order,
0435             (long long)div64_ul(ktime_to_ns(kt), count));
0436         mock_timeline_fini(&tl);
0437         cond_resched();
0438     }
0439 
0440     return 0;
0441 }
0442 
0443 int intel_timeline_mock_selftests(void)
0444 {
0445     static const struct i915_subtest tests[] = {
0446         SUBTEST(mock_hwsp_freelist),
0447         SUBTEST(igt_sync),
0448         SUBTEST(bench_sync),
0449     };
0450 
0451     return i915_subtests(tests, NULL);
0452 }
0453 
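     /*
      * Emit a single MI_STORE_DWORD_IMM that writes @value to the GGTT
      * address @addr. The dword layout differs by generation: gen8+ takes
      * a 64-bit (lo, hi) address, gen4-gen7 a zero dword followed by the
      * address, and older parts use the legacy MI_STORE_DWORD_IMM |
      * MI_MEM_VIRTUAL encoding padded out with an MI_NOOP.
      */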
0454 static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
0455 {
0456     u32 *cs;
0457 
0458     cs = intel_ring_begin(rq, 4);
0459     if (IS_ERR(cs))
0460         return PTR_ERR(cs);
0461 
0462     if (GRAPHICS_VER(rq->engine->i915) >= 8) {
0463         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
0464         *cs++ = addr;
0465         *cs++ = 0;
0466         *cs++ = value;
0467     } else if (GRAPHICS_VER(rq->engine->i915) >= 4) {
0468         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
0469         *cs++ = 0;
0470         *cs++ = addr;
0471         *cs++ = value;
0472     } else {
0473         *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
0474         *cs++ = addr;
0475         *cs++ = value;
0476         *cs++ = MI_NOOP;
0477     }
0478 
0479     intel_ring_advance(rq, cs);
0480 
0481     return 0;
0482 }
0483 
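     /*
      * Pin @tl, verify that the HWSP currently reads back the timeline's
      * software seqno (i.e. the slot was initialised correctly), then
      * submit a kernel request on @engine that stores @value into the
      * timeline's HWSP slot. The returned request carries a reference
      * that the caller must put.
      */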
0484 static struct i915_request *
0485 checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
0486 {
0487     struct i915_request *rq;
0488     int err;
0489 
0490     err = selftest_tl_pin(tl);
0491     if (err) {
0492         rq = ERR_PTR(err);
0493         goto out;
0494     }
0495 
0496     if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
0497         pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
0498                *tl->hwsp_seqno, tl->seqno);
0499         intel_timeline_unpin(tl);
0500         return ERR_PTR(-EINVAL);
0501     }
0502 
0503     rq = intel_engine_create_kernel_request(engine);
0504     if (IS_ERR(rq))
0505         goto out_unpin;
0506 
0507     i915_request_get(rq);
0508 
0509     err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
0510     i915_request_add(rq);
0511     if (err) {
0512         i915_request_put(rq);
0513         rq = ERR_PTR(err);
0514     }
0515 
0516 out_unpin:
0517     intel_timeline_unpin(tl);
0518 out:
0519     if (IS_ERR(rq))
0520         pr_err("Failed to write to timeline!\n");
0521     return rq;
0522 }
0523 
0524 static int live_hwsp_engine(void *arg)
0525 {
0526 #define NUM_TIMELINES 4096
0527     struct intel_gt *gt = arg;
0528     struct intel_timeline **timelines;
0529     struct intel_engine_cs *engine;
0530     enum intel_engine_id id;
0531     unsigned long count, n;
0532     int err = 0;
0533 
0534     /*
0535      * Create a bunch of timelines and check we can write
0536      * independently to each of their breadcrumb slots.
0537      */
0538 
0539     timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
0540                    sizeof(*timelines),
0541                    GFP_KERNEL);
0542     if (!timelines)
0543         return -ENOMEM;
0544 
0545     count = 0;
0546     for_each_engine(engine, gt, id) {
0547         if (!intel_engine_can_store_dword(engine))
0548             continue;
0549 
0550         intel_engine_pm_get(engine);
0551 
0552         for (n = 0; n < NUM_TIMELINES; n++) {
0553             struct intel_timeline *tl;
0554             struct i915_request *rq;
0555 
0556             tl = intel_timeline_create(gt);
0557             if (IS_ERR(tl)) {
0558                 err = PTR_ERR(tl);
0559                 break;
0560             }
0561 
0562             rq = checked_tl_write(tl, engine, count);
0563             if (IS_ERR(rq)) {
0564                 intel_timeline_put(tl);
0565                 err = PTR_ERR(rq);
0566                 break;
0567             }
0568 
0569             timelines[count++] = tl;
0570             i915_request_put(rq);
0571         }
0572 
0573         intel_engine_pm_put(engine);
0574         if (err)
0575             break;
0576     }
0577 
0578     if (igt_flush_test(gt->i915))
0579         err = -EIO;
0580 
0581     for (n = 0; n < count; n++) {
0582         struct intel_timeline *tl = timelines[n];
0583 
0584         if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
0585             GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
0586                       n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
0587             GEM_TRACE_DUMP();
0588             err = -EINVAL;
0589         }
0590         intel_timeline_put(tl);
0591     }
0592 
0593     kvfree(timelines);
0594     return err;
0595 #undef NUM_TIMELINES
0596 }
0597 
0598 static int live_hwsp_alternate(void *arg)
0599 {
0600 #define NUM_TIMELINES 4096
0601     struct intel_gt *gt = arg;
0602     struct intel_timeline **timelines;
0603     struct intel_engine_cs *engine;
0604     enum intel_engine_id id;
0605     unsigned long count, n;
0606     int err = 0;
0607 
0608     /*
0609      * Create a bunch of timelines and check we can write
0610      * independently to each of their breadcrumb slots, alternating
0611      * between engines from one timeline to the next.
0612      */
0613 
0614     timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
0615                    sizeof(*timelines),
0616                    GFP_KERNEL);
0617     if (!timelines)
0618         return -ENOMEM;
0619 
0620     count = 0;
0621     for (n = 0; n < NUM_TIMELINES; n++) {
0622         for_each_engine(engine, gt, id) {
0623             struct intel_timeline *tl;
0624             struct i915_request *rq;
0625 
0626             if (!intel_engine_can_store_dword(engine))
0627                 continue;
0628 
0629             tl = intel_timeline_create(gt);
0630             if (IS_ERR(tl)) {
0631                 err = PTR_ERR(tl);
0632                 goto out;
0633             }
0634 
0635             intel_engine_pm_get(engine);
0636             rq = checked_tl_write(tl, engine, count);
0637             intel_engine_pm_put(engine);
0638             if (IS_ERR(rq)) {
0639                 intel_timeline_put(tl);
0640                 err = PTR_ERR(rq);
0641                 goto out;
0642             }
0643 
0644             timelines[count++] = tl;
0645             i915_request_put(rq);
0646         }
0647     }
0648 
0649 out:
0650     if (igt_flush_test(gt->i915))
0651         err = -EIO;
0652 
0653     for (n = 0; n < count; n++) {
0654         struct intel_timeline *tl = timelines[n];
0655 
0656         if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
0657             GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
0658                       n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
0659             GEM_TRACE_DUMP();
0660             err = -EINVAL;
0661         }
0662         intel_timeline_put(tl);
0663     }
0664 
0665     kvfree(timelines);
0666     return err;
0667 #undef NUM_TIMELINES
0668 }
0669 
0670 static int live_hwsp_wrap(void *arg)
0671 {
0672     struct intel_gt *gt = arg;
0673     struct intel_engine_cs *engine;
0674     struct intel_timeline *tl;
0675     enum intel_engine_id id;
0676     int err = 0;
0677 
0678     /*
0679      * Across a seqno wrap, we need to keep the old cacheline alive for
0680      * foreign GPU references.
0681      */
0682 
0683     tl = intel_timeline_create(gt);
0684     if (IS_ERR(tl))
0685         return PTR_ERR(tl);
0686 
0687     if (!tl->has_initial_breadcrumb)
0688         goto out_free;
0689 
0690     err = selftest_tl_pin(tl);
0691     if (err)
0692         goto out_free;
0693 
0694     for_each_engine(engine, gt, id) {
0695         const u32 *hwsp_seqno[2];
0696         struct i915_request *rq;
0697         u32 seqno[2];
0698 
0699         if (!intel_engine_can_store_dword(engine))
0700             continue;
0701 
0702         rq = intel_engine_create_kernel_request(engine);
0703         if (IS_ERR(rq)) {
0704             err = PTR_ERR(rq);
0705             goto out;
0706         }
0707 
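             /*
              * Rewind the timeline to just before the u32 wrap so that the
              * two intel_timeline_get_seqno() calls below straddle the
              * rollover: the second one should move onto a fresh HWSP slot
              * while the first slot must stay valid for the store already
              * emitted against it.
              */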
0708         tl->seqno = -4u;
0709 
0710         mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
0711         err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
0712         mutex_unlock(&tl->mutex);
0713         if (err) {
0714             i915_request_add(rq);
0715             goto out;
0716         }
0717         pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
0718              seqno[0], tl->hwsp_offset);
0719 
0720         err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
0721         if (err) {
0722             i915_request_add(rq);
0723             goto out;
0724         }
0725         hwsp_seqno[0] = tl->hwsp_seqno;
0726 
0727         mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
0728         err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
0729         mutex_unlock(&tl->mutex);
0730         if (err) {
0731             i915_request_add(rq);
0732             goto out;
0733         }
0734         pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
0735              seqno[1], tl->hwsp_offset);
0736 
0737         err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
0738         if (err) {
0739             i915_request_add(rq);
0740             goto out;
0741         }
0742         hwsp_seqno[1] = tl->hwsp_seqno;
0743 
0744         /* With wrap should come a new hwsp */
0745         GEM_BUG_ON(seqno[1] >= seqno[0]);
0746         GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);
0747 
0748         i915_request_add(rq);
0749 
0750         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
0751             pr_err("Wait for timeline writes timed out!\n");
0752             err = -EIO;
0753             goto out;
0754         }
0755 
0756         if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
0757             READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
0758             pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
0759                    *hwsp_seqno[0], *hwsp_seqno[1],
0760                    seqno[0], seqno[1]);
0761             err = -EINVAL;
0762             goto out;
0763         }
0764 
0765         intel_gt_retire_requests(gt); /* recycle HWSP */
0766     }
0767 
0768 out:
0769     if (igt_flush_test(gt->i915))
0770         err = -EIO;
0771 
0772     intel_timeline_unpin(tl);
0773 out_free:
0774     intel_timeline_put(tl);
0775     return err;
0776 }
0777 
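     /*
      * Emit commands that first store the request's expected @seqno at
      * *addr and then copy the live HWSP value at @hwsp alongside it,
      * bouncing it through a CS general-purpose register with LRM/SRM.
      * check_watcher() later walks these (seqno, hwsp) pairs.
      */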
0778 static int emit_read_hwsp(struct i915_request *rq,
0779               u32 seqno, u32 hwsp,
0780               u32 *addr)
0781 {
0782     const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
0783     u32 *cs;
0784 
0785     cs = intel_ring_begin(rq, 12);
0786     if (IS_ERR(cs))
0787         return PTR_ERR(cs);
0788 
0789     *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
0790     *cs++ = *addr;
0791     *cs++ = 0;
0792     *cs++ = seqno;
0793     *addr += 4;
0794 
0795     *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
0796     *cs++ = gpr;
0797     *cs++ = hwsp;
0798     *cs++ = 0;
0799 
0800     *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
0801     *cs++ = gpr;
0802     *cs++ = *addr;
0803     *cs++ = 0;
0804     *addr += 4;
0805 
0806     intel_ring_advance(rq, cs);
0807 
0808     return 0;
0809 }
0810 
0811 struct hwsp_watcher {
0812     struct i915_vma *vma;
0813     struct i915_request *rq;
0814     u32 addr;
0815     u32 *map;
0816 };
0817 
0818 static bool cmp_lt(u32 a, u32 b)
0819 {
0820     return a < b;
0821 }
0822 
0823 static bool cmp_gte(u32 a, u32 b)
0824 {
0825     return a >= b;
0826 }
0827 
0828 static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
0829 {
0830     struct drm_i915_gem_object *obj;
0831     struct i915_vma *vma;
0832 
0833     obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
0834     if (IS_ERR(obj))
0835         return PTR_ERR(obj);
0836 
0837     w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
0838     if (IS_ERR(w->map)) {
0839         i915_gem_object_put(obj);
0840         return PTR_ERR(w->map);
0841     }
0842 
0843     vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
0844     if (IS_ERR(vma)) {
0845         i915_gem_object_put(obj);
0846         return PTR_ERR(vma);
0847     }
0848 
0849     w->vma = vma;
0850     w->addr = i915_ggtt_offset(vma);
0851     return 0;
0852 }
0853 
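     /*
      * Hand the timeline mutex (and its lockdep pin) from one open request
      * to another. The watcher requests are kept open while other requests
      * are built on different timelines, so the locks have to be juggled
      * explicitly to keep lockdep happy.
      */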
0854 static void switch_tl_lock(struct i915_request *from, struct i915_request *to)
0855 {
0856     /* some light mutex juggling required; think co-routines */
0857 
0858     if (from) {
0859         lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie);
0860         mutex_unlock(&from->context->timeline->mutex);
0861     }
0862 
0863     if (to) {
0864         mutex_lock(&to->context->timeline->mutex);
0865         to->cookie = lockdep_pin_lock(&to->context->timeline->mutex);
0866     }
0867 }
0868 
0869 static int create_watcher(struct hwsp_watcher *w,
0870               struct intel_engine_cs *engine,
0871               int ringsz)
0872 {
0873     struct intel_context *ce;
0874 
0875     ce = intel_context_create(engine);
0876     if (IS_ERR(ce))
0877         return PTR_ERR(ce);
0878 
0879     ce->ring_size = ringsz;
0880     w->rq = intel_context_create_request(ce);
0881     intel_context_put(ce);
0882     if (IS_ERR(w->rq))
0883         return PTR_ERR(w->rq);
0884 
0885     w->addr = i915_ggtt_offset(w->vma);
0886 
0887     switch_tl_lock(w->rq, NULL);
0888 
0889     return 0;
0890 }
0891 
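     /*
      * Close and submit the watcher request, wait for it to execute, then
      * verify every (seqno, hwsp) pair it recorded with @op: cmp_lt for
      * the "before" watcher (HWSP not yet caught up to the seqno) and
      * cmp_gte for the "after" watcher.
      */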
0892 static int check_watcher(struct hwsp_watcher *w, const char *name,
0893              bool (*op)(u32 hwsp, u32 seqno))
0894 {
0895     struct i915_request *rq = fetch_and_zero(&w->rq);
0896     u32 offset, end;
0897     int err;
0898 
0899     GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);
0900 
0901     i915_request_get(rq);
0902     switch_tl_lock(NULL, rq);
0903     i915_request_add(rq);
0904 
0905     if (i915_request_wait(rq, 0, HZ) < 0) {
0906         err = -ETIME;
0907         goto out;
0908     }
0909 
0910     err = 0;
0911     offset = 0;
0912     end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
0913     while (offset < end) {
0914         if (!op(w->map[offset + 1], w->map[offset])) {
0915             pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
0916                    name, w->map[offset + 1], w->map[offset]);
0917             err = -EINVAL;
0918         }
0919 
0920         offset += 2;
0921     }
0922 
0923 out:
0924     i915_request_put(rq);
0925     return err;
0926 }
0927 
0928 static void cleanup_watcher(struct hwsp_watcher *w)
0929 {
0930     if (w->rq) {
0931         switch_tl_lock(NULL, w->rq);
0932 
0933         i915_request_add(w->rq);
0934     }
0935 
0936     i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
0937 }
0938 
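     /*
      * Retire whatever has completed on @tl; returns true if the timeline
      * is now idle (no outstanding last request).
      */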
0939 static bool retire_requests(struct intel_timeline *tl)
0940 {
0941     struct i915_request *rq, *rn;
0942 
0943     mutex_lock(&tl->mutex);
0944     list_for_each_entry_safe(rq, rn, &tl->requests, link)
0945         if (!i915_request_retire(rq))
0946             break;
0947     mutex_unlock(&tl->mutex);
0948 
0949     return !i915_active_fence_isset(&tl->last_request);
0950 }
0951 
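     /*
      * Keep submitting requests on @rq's context until the timeline seqno
      * wraps past the original value, then return (a reference to) the
      * first request created after the wrap.
      */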
0952 static struct i915_request *wrap_timeline(struct i915_request *rq)
0953 {
0954     struct intel_context *ce = rq->context;
0955     struct intel_timeline *tl = ce->timeline;
0956     u32 seqno = rq->fence.seqno;
0957 
0958     while (tl->seqno >= seqno) { /* Cause a wrap */
0959         i915_request_put(rq);
0960         rq = intel_context_create_request(ce);
0961         if (IS_ERR(rq))
0962             return rq;
0963 
0964         i915_request_get(rq);
0965         i915_request_add(rq);
0966     }
0967 
0968     i915_request_put(rq);
0969     rq = i915_request_create(ce);
0970     if (IS_ERR(rq))
0971         return rq;
0972 
0973     i915_request_get(rq);
0974     i915_request_add(rq);
0975 
0976     return rq;
0977 }
0978 
0979 static int live_hwsp_read(void *arg)
0980 {
0981     struct intel_gt *gt = arg;
0982     struct hwsp_watcher watcher[2] = {};
0983     struct intel_engine_cs *engine;
0984     struct intel_timeline *tl;
0985     enum intel_engine_id id;
0986     int err = 0;
0987     int i;
0988 
0989     /*
0990      * If we take a reference to the HWSP for reading on the GPU, that
0991      * read may be arbitrarily delayed (either by foreign fence or
0992      * priority saturation) and a wrap can happen within 30 minutes.
0993      * When the GPU read is finally submitted it should be correct,
0994      * even across multiple wraps.
0995      */
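     /*
      * watcher[0] is ordered to read the HWSP before the target request
      * executes, so every value it records must still be below that
      * request's seqno (cmp_lt). watcher[1] reads it afterwards, and since
      * the request's HWSP slot is kept alive across the simulated wraps,
      * it must report at least that seqno (cmp_gte).
      */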
0996 
0997     if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */
0998         return 0;
0999 
1000     tl = intel_timeline_create(gt);
1001     if (IS_ERR(tl))
1002         return PTR_ERR(tl);
1003 
1004     if (!tl->has_initial_breadcrumb)
1005         goto out_free;
1006 
1007     for (i = 0; i < ARRAY_SIZE(watcher); i++) {
1008         err = setup_watcher(&watcher[i], gt);
1009         if (err)
1010             goto out;
1011     }
1012 
1013     for_each_engine(engine, gt, id) {
1014         struct intel_context *ce;
1015         unsigned long count = 0;
1016         IGT_TIMEOUT(end_time);
1017 
1018         /* Create a request we can use for remote reading of the HWSP */
1019         err = create_watcher(&watcher[1], engine, SZ_512K);
1020         if (err)
1021             goto out;
1022 
1023         do {
1024             struct i915_sw_fence *submit;
1025             struct i915_request *rq;
1026             u32 hwsp, dummy;
1027 
1028             submit = heap_fence_create(GFP_KERNEL);
1029             if (!submit) {
1030                 err = -ENOMEM;
1031                 goto out;
1032             }
1033 
1034             err = create_watcher(&watcher[0], engine, SZ_4K);
1035             if (err)
1036                 goto out;
1037 
1038             ce = intel_context_create(engine);
1039             if (IS_ERR(ce)) {
1040                 err = PTR_ERR(ce);
1041                 goto out;
1042             }
1043 
1044             ce->timeline = intel_timeline_get(tl);
1045 
1046             /* Ensure timeline is mapped, done during first pin */
1047             err = intel_context_pin(ce);
1048             if (err) {
1049                 intel_context_put(ce);
1050                 goto out;
1051             }
1052 
1053             /*
1054              * Start at a new wrap, and set seqno right before another wrap,
1055              * saving 30 minutes of nops
1056              */
1057             tl->seqno = -12u + 2 * (count & 3);
1058             __intel_timeline_get_seqno(tl, &dummy);
1059 
1060             rq = i915_request_create(ce);
1061             if (IS_ERR(rq)) {
1062                 err = PTR_ERR(rq);
1063                 intel_context_unpin(ce);
1064                 intel_context_put(ce);
1065                 goto out;
1066             }
1067 
1068             err = i915_sw_fence_await_dma_fence(&rq->submit,
1069                                 &watcher[0].rq->fence, 0,
1070                                 GFP_KERNEL);
1071             if (err < 0) {
1072                 i915_request_add(rq);
1073                 intel_context_unpin(ce);
1074                 intel_context_put(ce);
1075                 goto out;
1076             }
1077 
1078             switch_tl_lock(rq, watcher[0].rq);
1079             err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
1080             if (err == 0)
1081                 err = emit_read_hwsp(watcher[0].rq, /* before */
1082                              rq->fence.seqno, hwsp,
1083                              &watcher[0].addr);
1084             switch_tl_lock(watcher[0].rq, rq);
1085             if (err) {
1086                 i915_request_add(rq);
1087                 intel_context_unpin(ce);
1088                 intel_context_put(ce);
1089                 goto out;
1090             }
1091 
1092             switch_tl_lock(rq, watcher[1].rq);
1093             err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
1094             if (err == 0)
1095                 err = emit_read_hwsp(watcher[1].rq, /* after */
1096                              rq->fence.seqno, hwsp,
1097                              &watcher[1].addr);
1098             switch_tl_lock(watcher[1].rq, rq);
1099             if (err) {
1100                 i915_request_add(rq);
1101                 intel_context_unpin(ce);
1102                 intel_context_put(ce);
1103                 goto out;
1104             }
1105 
1106             i915_request_get(rq);
1107             i915_request_add(rq);
1108 
1109             rq = wrap_timeline(rq);
1110             intel_context_unpin(ce);
1111             intel_context_put(ce);
1112             if (IS_ERR(rq)) {
1113                 err = PTR_ERR(rq);
1114                 goto out;
1115             }
1116 
1117             err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
1118                                 &rq->fence, 0,
1119                                 GFP_KERNEL);
1120             if (err < 0) {
1121                 i915_request_put(rq);
1122                 goto out;
1123             }
1124 
1125             err = check_watcher(&watcher[0], "before", cmp_lt);
1126             i915_sw_fence_commit(submit);
1127             heap_fence_put(submit);
1128             if (err) {
1129                 i915_request_put(rq);
1130                 goto out;
1131             }
1132             count++;
1133 
1134             /* Flush the timeline before manually wrapping again */
1135             if (i915_request_wait(rq,
1136                           I915_WAIT_INTERRUPTIBLE,
1137                           HZ) < 0) {
1138                 err = -ETIME;
1139                 i915_request_put(rq);
1140                 goto out;
1141             }
1142             retire_requests(tl);
1143             i915_request_put(rq);
1144 
1145             /* Single requests are limited to half a ring at most */
1146             if (8 * watcher[1].rq->ring->emit >
1147                 3 * watcher[1].rq->ring->size)
1148                 break;
1149 
1150         } while (!__igt_timeout(end_time, NULL) &&
1151              count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2);
1152 
1153         pr_info("%s: simulated %lu wraps\n", engine->name, count);
1154         err = check_watcher(&watcher[1], "after", cmp_gte);
1155         if (err)
1156             goto out;
1157     }
1158 
1159 out:
1160     for (i = 0; i < ARRAY_SIZE(watcher); i++)
1161         cleanup_watcher(&watcher[i]);
1162 
1163     if (igt_flush_test(gt->i915))
1164         err = -EIO;
1165 
1166 out_free:
1167     intel_timeline_put(tl);
1168     return err;
1169 }
1170 
1171 static int live_hwsp_rollover_kernel(void *arg)
1172 {
1173     struct intel_gt *gt = arg;
1174     struct intel_engine_cs *engine;
1175     enum intel_engine_id id;
1176     int err = 0;
1177 
1178     /*
1179      * Run the host for long enough, and even the kernel context will
1180      * see a seqno rollover.
1181      */
1182 
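         /*
          * Rather than actually running for days, each engine's kernel
          * timeline is idled and its seqno rewound to just before the u32
          * wrap, so the three requests below are expected to straddle the
          * rollover (hence the GEM_BUG_ON on their fence ordering).
          */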
1183     for_each_engine(engine, gt, id) {
1184         struct intel_context *ce = engine->kernel_context;
1185         struct intel_timeline *tl = ce->timeline;
1186         struct i915_request *rq[3] = {};
1187         int i;
1188 
1189         st_engine_heartbeat_disable(engine);
1190         if (intel_gt_wait_for_idle(gt, HZ / 2)) {
1191             err = -EIO;
1192             goto out;
1193         }
1194 
1195         GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
1196         tl->seqno = -2u;
1197         WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1198 
1199         for (i = 0; i < ARRAY_SIZE(rq); i++) {
1200             struct i915_request *this;
1201 
1202             this = i915_request_create(ce);
1203             if (IS_ERR(this)) {
1204                 err = PTR_ERR(this);
1205                 goto out;
1206             }
1207 
1208             pr_debug("%s: create fence.seqno:%d\n",
1209                  engine->name,
1210                  lower_32_bits(this->fence.seqno));
1211 
1212             GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1213 
1214             rq[i] = i915_request_get(this);
1215             i915_request_add(this);
1216         }
1217 
1218         /* We expected a wrap! */
1219         GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1220 
1221         if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1222             pr_err("Wait for timeline wrap timed out!\n");
1223             err = -EIO;
1224             goto out;
1225         }
1226 
1227         for (i = 0; i < ARRAY_SIZE(rq); i++) {
1228             if (!i915_request_completed(rq[i])) {
1229                 pr_err("Pre-wrap request not completed!\n");
1230                 err = -EINVAL;
1231                 goto out;
1232             }
1233         }
1234 
1235 out:
1236         for (i = 0; i < ARRAY_SIZE(rq); i++)
1237             i915_request_put(rq[i]);
1238         st_engine_heartbeat_enable(engine);
1239         if (err)
1240             break;
1241     }
1242 
1243     if (igt_flush_test(gt->i915))
1244         err = -EIO;
1245 
1246     return err;
1247 }
1248 
1249 static int live_hwsp_rollover_user(void *arg)
1250 {
1251     struct intel_gt *gt = arg;
1252     struct intel_engine_cs *engine;
1253     enum intel_engine_id id;
1254     int err = 0;
1255 
1256     /*
1257      * Simulate a long running user context, and force the seqno wrap
1258      * on the user's timeline.
1259      */
1260 
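         /*
          * Same idea as the kernel-context variant, but on a freshly
          * created user context; the timeline is rewound to -4 rather
          * than -2, presumably because a timeline with an initial
          * breadcrumb consumes two seqnos per request.
          */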
1261     for_each_engine(engine, gt, id) {
1262         struct i915_request *rq[3] = {};
1263         struct intel_timeline *tl;
1264         struct intel_context *ce;
1265         int i;
1266 
1267         ce = intel_context_create(engine);
1268         if (IS_ERR(ce))
1269             return PTR_ERR(ce);
1270 
1271         err = intel_context_alloc_state(ce);
1272         if (err)
1273             goto out;
1274 
1275         tl = ce->timeline;
1276         if (!tl->has_initial_breadcrumb)
1277             goto out;
1278 
1279         err = intel_context_pin(ce);
1280         if (err)
1281             goto out;
1282 
1283         tl->seqno = -4u;
1284         WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1285 
1286         for (i = 0; i < ARRAY_SIZE(rq); i++) {
1287             struct i915_request *this;
1288 
1289             this = intel_context_create_request(ce);
1290             if (IS_ERR(this)) {
1291                 err = PTR_ERR(this);
1292                 goto out_unpin;
1293             }
1294 
1295             pr_debug("%s: create fence.seqno:%d\n",
1296                  engine->name,
1297                  lower_32_bits(this->fence.seqno));
1298 
1299             GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1300 
1301             rq[i] = i915_request_get(this);
1302             i915_request_add(this);
1303         }
1304 
1305         /* We expected a wrap! */
1306         GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1307 
1308         if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1309             pr_err("Wait for timeline wrap timed out!\n");
1310             err = -EIO;
1311             goto out_unpin;
1312         }
1313 
1314         for (i = 0; i < ARRAY_SIZE(rq); i++) {
1315             if (!i915_request_completed(rq[i])) {
1316                 pr_err("Pre-wrap request not completed!\n");
1317                 err = -EINVAL;
1318                 goto out_unpin;
1319             }
1320         }
1321 out_unpin:
1322         intel_context_unpin(ce);
1323 out:
1324         for (i = 0; i < ARRAY_SIZE(rq); i++)
1325             i915_request_put(rq[i]);
1326         intel_context_put(ce);
1327         if (err)
1328             break;
1329     }
1330 
1331     if (igt_flush_test(gt->i915))
1332         err = -EIO;
1333 
1334     return err;
1335 }
1336 
1337 static int live_hwsp_recycle(void *arg)
1338 {
1339     struct intel_gt *gt = arg;
1340     struct intel_engine_cs *engine;
1341     enum intel_engine_id id;
1342     unsigned long count;
1343     int err = 0;
1344 
1345     /*
1346      * Check seqno writes into one timeline at a time. We expect to
1347      * recycle the breadcrumb slot between iterations and neither
1348      * want to confuse ourselves or the GPU.
1349      */
1350 
1351     count = 0;
1352     for_each_engine(engine, gt, id) {
1353         IGT_TIMEOUT(end_time);
1354 
1355         if (!intel_engine_can_store_dword(engine))
1356             continue;
1357 
1358         intel_engine_pm_get(engine);
1359 
1360         do {
1361             struct intel_timeline *tl;
1362             struct i915_request *rq;
1363 
1364             tl = intel_timeline_create(gt);
1365             if (IS_ERR(tl)) {
1366                 err = PTR_ERR(tl);
1367                 break;
1368             }
1369 
1370             rq = checked_tl_write(tl, engine, count);
1371             if (IS_ERR(rq)) {
1372                 intel_timeline_put(tl);
1373                 err = PTR_ERR(rq);
1374                 break;
1375             }
1376 
1377             if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1378                 pr_err("Wait for timeline writes timed out!\n");
1379                 i915_request_put(rq);
1380                 intel_timeline_put(tl);
1381                 err = -EIO;
1382                 break;
1383             }
1384 
1385             if (READ_ONCE(*tl->hwsp_seqno) != count) {
1386                 GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
1387                           count, tl->fence_context,
1388                           tl->hwsp_offset, *tl->hwsp_seqno);
1389                 GEM_TRACE_DUMP();
1390                 err = -EINVAL;
1391             }
1392 
1393             i915_request_put(rq);
1394             intel_timeline_put(tl);
1395             count++;
1396 
1397             if (err)
1398                 break;
1399         } while (!__igt_timeout(end_time, NULL));
1400 
1401         intel_engine_pm_put(engine);
1402         if (err)
1403             break;
1404     }
1405 
1406     return err;
1407 }
1408 
1409 int intel_timeline_live_selftests(struct drm_i915_private *i915)
1410 {
1411     static const struct i915_subtest tests[] = {
1412         SUBTEST(live_hwsp_recycle),
1413         SUBTEST(live_hwsp_engine),
1414         SUBTEST(live_hwsp_alternate),
1415         SUBTEST(live_hwsp_wrap),
1416         SUBTEST(live_hwsp_read),
1417         SUBTEST(live_hwsp_rollover_kernel),
1418         SUBTEST(live_hwsp_rollover_user),
1419     };
1420 
1421     if (intel_gt_is_wedged(to_gt(i915)))
1422         return 0;
1423 
1424     return intel_gt_live_subtests(tests, to_gt(i915));
1425 }