// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_lrc.h"
#include "intel_ring.h"

int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
    bool vf_flush_wa = false, dc_flush_wa = false;
    u32 *cs, flags = 0;
    int len;

    flags |= PIPE_CONTROL_CS_STALL;

    if (mode & EMIT_FLUSH) {
        flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
        flags |= PIPE_CONTROL_FLUSH_ENABLE;
    }

    if (mode & EMIT_INVALIDATE) {
        flags |= PIPE_CONTROL_TLB_INVALIDATE;
        flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_STORE_DATA_INDEX;

        /*
         * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
         * pipe control.
         */
        if (GRAPHICS_VER(rq->engine->i915) == 9)
            vf_flush_wa = true;

        /* WaForGAMHang:kbl */
        if (IS_KBL_GRAPHICS_STEP(rq->engine->i915, 0, STEP_C0))
            dc_flush_wa = true;
    }

    len = 6;

    if (vf_flush_wa)
        len += 6;

    if (dc_flush_wa)
        len += 12;

    cs = intel_ring_begin(rq, len);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    if (vf_flush_wa)
        cs = gen8_emit_pipe_control(cs, 0, 0);

    if (dc_flush_wa)
        cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
                        0);

    cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

    if (dc_flush_wa)
        cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

    intel_ring_advance(rq, cs);

    return 0;
}

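/*
 * Flush for the non-render (xCS) engines: a single MI_FLUSH_DW with a
 * post-sync QWord write into the PPHWSP scratch slot, optionally combined
 * with a TLB (and, on the video decode engines, BSD-specific) invalidation.
 */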
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
    u32 cmd, *cs;

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    cmd = MI_FLUSH_DW + 1;

    /*
     * We always require a command barrier so that subsequent
     * commands, such as breadcrumb interrupts, are strictly ordered
     * wrt the contents of the write cache being flushed to memory
     * (and thus being coherent from the CPU).
     */
    cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

    if (mode & EMIT_INVALIDATE) {
        cmd |= MI_INVALIDATE_TLB;
        if (rq->engine->class == VIDEO_DECODE_CLASS)
            cmd |= MI_INVALIDATE_BSD;
    }

    *cs++ = cmd;
    *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
    *cs++ = 0; /* upper addr */
    *cs++ = 0; /* value */
    intel_ring_advance(rq, cs);

    return 0;
}

int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
    if (mode & EMIT_FLUSH) {
        u32 *cs;
        u32 flags = 0;

        flags |= PIPE_CONTROL_CS_STALL;

        flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
        flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
        flags |= PIPE_CONTROL_FLUSH_ENABLE;
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_STORE_DATA_INDEX;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
            return PTR_ERR(cs);

        cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
        intel_ring_advance(rq, cs);
    }

    if (mode & EMIT_INVALIDATE) {
        u32 *cs;
        u32 flags = 0;

        flags |= PIPE_CONTROL_CS_STALL;

        flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TLB_INVALIDATE;
        flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_STORE_DATA_INDEX;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
            return PTR_ERR(cs);

        cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
        intel_ring_advance(rq, cs);
    }

    return 0;
}

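/*
 * Toggle the gen12+ command-streamer pre-parser via MI_ARB_CHECK: bit 8 is
 * the write-enable (mask) bit for bit 0, which carries the new disable state.
 */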
static u32 preparser_disable(bool state)
{
    return MI_ARB_CHECK | 1 << 8 | state;
}

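/*
 * Invalidate the AUX table (compression metadata TLB) by writing AUX_INV to
 * the given engine's *_AUX_NV register with an MI_LOAD_REGISTER_IMM.
 */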
u32 *gen12_emit_aux_table_inv(u32 *cs, const i915_reg_t inv_reg)
{
    *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
    *cs++ = i915_mmio_reg_offset(inv_reg);
    *cs++ = AUX_INV;
    *cs++ = MI_NOOP;

    return cs;
}

int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
    struct intel_engine_cs *engine = rq->engine;

    if (mode & EMIT_FLUSH) {
        u32 flags = 0;
        u32 *cs;

        flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
        flags |= PIPE_CONTROL_FLUSH_L3;
        flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
        /* Wa_1409600907:tgl,adl-p */
        flags |= PIPE_CONTROL_DEPTH_STALL;
        flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
        flags |= PIPE_CONTROL_FLUSH_ENABLE;

        flags |= PIPE_CONTROL_STORE_DATA_INDEX;
        flags |= PIPE_CONTROL_QW_WRITE;

        flags |= PIPE_CONTROL_CS_STALL;

        if (!HAS_3D_PIPELINE(engine->i915))
            flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
        else if (engine->class == COMPUTE_CLASS)
            flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
            return PTR_ERR(cs);

        cs = gen12_emit_pipe_control(cs,
                         PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                         flags, LRC_PPHWSP_SCRATCH_ADDR);
        intel_ring_advance(rq, cs);
    }

    if (mode & EMIT_INVALIDATE) {
        u32 flags = 0;
        u32 *cs, count;

        flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TLB_INVALIDATE;
        flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

        flags |= PIPE_CONTROL_STORE_DATA_INDEX;
        flags |= PIPE_CONTROL_QW_WRITE;

        flags |= PIPE_CONTROL_CS_STALL;

        if (!HAS_3D_PIPELINE(engine->i915))
            flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
        else if (engine->class == COMPUTE_CLASS)
            flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

        if (!HAS_FLAT_CCS(rq->engine->i915))
            count = 8 + 4;
        else
            count = 8;

        cs = intel_ring_begin(rq, count);
        if (IS_ERR(cs))
            return PTR_ERR(cs);

        /*
         * Prevent the pre-parser from skipping past the TLB
         * invalidate and loading a stale page for the batch
         * buffer / request payload.
         */
        *cs++ = preparser_disable(true);

        cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

        if (!HAS_FLAT_CCS(rq->engine->i915)) {
            /* hsdes: 1809175790 */
            cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
        }

        *cs++ = preparser_disable(false);
        intel_ring_advance(rq, cs);
    }

    return 0;
}

int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
    intel_engine_mask_t aux_inv = 0;
    u32 cmd, *cs;

    cmd = 4;
    if (mode & EMIT_INVALIDATE) {
        cmd += 2;

        if (!HAS_FLAT_CCS(rq->engine->i915) &&
            (rq->engine->class == VIDEO_DECODE_CLASS ||
             rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
            aux_inv = rq->engine->mask &
                ~GENMASK(_BCS(I915_MAX_BCS - 1), BCS0);
            if (aux_inv)
                cmd += 4;
        }
    }

    cs = intel_ring_begin(rq, cmd);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    if (mode & EMIT_INVALIDATE)
        *cs++ = preparser_disable(true);

    cmd = MI_FLUSH_DW + 1;

    /*
     * We always require a command barrier so that subsequent
     * commands, such as breadcrumb interrupts, are strictly ordered
     * wrt the contents of the write cache being flushed to memory
     * (and thus being coherent from the CPU).
     */
    cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

    if (mode & EMIT_INVALIDATE) {
        cmd |= MI_INVALIDATE_TLB;
        if (rq->engine->class == VIDEO_DECODE_CLASS)
            cmd |= MI_INVALIDATE_BSD;
    }

    *cs++ = cmd;
    *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
    *cs++ = 0; /* upper addr */
    *cs++ = 0; /* value */

    if (aux_inv) { /* hsdes: 1809175790 */
        if (rq->engine->class == VIDEO_DECODE_CLASS)
            cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
        else
            cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
    }

    if (mode & EMIT_INVALIDATE)
        *cs++ = preparser_disable(false);

    intel_ring_advance(rq, cs);

    return 0;
}

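/* GGTT address of the per-engine preemption semaphore in the HWSP. */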
static u32 preempt_address(struct intel_engine_cs *engine)
{
    return (i915_ggtt_offset(engine->status_page.vma) +
        I915_GEM_HWS_PREEMPT_ADDR);
}

static u32 hwsp_offset(const struct i915_request *rq)
{
    const struct intel_timeline *tl;

    /* Before the request is executed, the timeline is fixed */
    tl = rcu_dereference_protected(rq->timeline,
                       !i915_request_signaled(rq));

    /* See the comment in i915_request_active_seqno(). */
    return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

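/*
 * Write seqno-1 into the request's timeline slot in the HWSP and emit an
 * arbitration point ahead of the user payload; see the comment below on how
 * i915_request_started() relies on this initial breadcrumb.
 */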
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
    u32 *cs;

    GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
    if (!i915_request_timeline(rq)->has_initial_breadcrumb)
        return 0;

    cs = intel_ring_begin(rq, 6);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
    *cs++ = hwsp_offset(rq);
    *cs++ = 0;
    *cs++ = rq->fence.seqno - 1;

    /*
     * Check if we have been preempted before we even get started.
     *
     * After this point i915_request_started() reports true, even if
     * we get preempted and so are no longer running.
     *
     * i915_request_started() is used during preemption processing
     * to decide if the request is currently inside the user payload
     * or spinning on a kernel semaphore (or earlier). For no-preemption
     * requests, we do allow preemption on the semaphore before the user
     * payload, but do not allow preemption once the request is started.
     *
     * i915_request_started() is similarly used during GPU hangs to
     * determine if the user's payload was guilty, and if so, the
     * request is banned. Before the request is started, it is assumed
     * to be unharmed and an innocent victim of another's hang.
     */
    *cs++ = MI_NOOP;
    *cs++ = MI_ARB_CHECK;

    intel_ring_advance(rq, cs);

    /* Record the updated position of the request's payload */
    rq->infix = intel_ring_offset(rq, cs);

    __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

    return 0;
}

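/*
 * Xe_HP (gen12.50+) batch buffer start: in addition to the
 * MI_BATCH_BUFFER_START itself, reload RING_PREDICATE_RESULT from the
 * context's indirect workaround buffer and chain into a small fixup batch
 * afterwards, so that a stray MI_SET_PREDICATE left enabled by the batch
 * cannot stall the ring.
 */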
static int __gen125_emit_bb_start(struct i915_request *rq,
                  u64 offset, u32 len,
                  const unsigned int flags,
                  u32 arb)
{
    struct intel_context *ce = rq->context;
    u32 wa_offset = lrc_indirect_bb(ce);
    u32 *cs;

    cs = intel_ring_begin(rq, 12);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = MI_ARB_ON_OFF | arb;

    *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
        MI_SRM_LRM_GLOBAL_GTT |
        MI_LRI_LRM_CS_MMIO;
    *cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
    *cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
    *cs++ = 0;

    *cs++ = MI_BATCH_BUFFER_START_GEN8 |
        (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
    *cs++ = lower_32_bits(offset);
    *cs++ = upper_32_bits(offset);

    /* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
    *cs++ = MI_BATCH_BUFFER_START_GEN8;
    *cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
    *cs++ = 0;

    *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

    intel_ring_advance(rq, cs);

    return 0;
}

int gen125_emit_bb_start_noarb(struct i915_request *rq,
                   u64 offset, u32 len,
                   const unsigned int flags)
{
    return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
}

int gen125_emit_bb_start(struct i915_request *rq,
             u64 offset, u32 len,
             const unsigned int flags)
{
    return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
                 u64 offset, u32 len,
                 const unsigned int flags)
{
    u32 *cs;

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    /*
     * WaDisableCtxRestoreArbitration:bdw,chv
     *
     * We don't need to perform MI_ARB_ENABLE as often as we do (in
     * particular on all the gens that do not need the w/a at all!);
     * if we took care to make sure that on every switch into this
     * context (both ordinary and for preemption) arbitration was
     * enabled, we would be fine. However, for gen8 there is another
     * w/a that requires us to not preempt inside GPGPU execution, so
     * we keep arbitration disabled for gen8 batches. Arbitration will
     * be re-enabled before we close the request
     * (engine->emit_fini_breadcrumb).
     */
    *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

    /* FIXME(BDW+): Address space and security selectors. */
    *cs++ = MI_BATCH_BUFFER_START_GEN8 |
        (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
    *cs++ = lower_32_bits(offset);
    *cs++ = upper_32_bits(offset);

    intel_ring_advance(rq, cs);

    return 0;
}

int gen8_emit_bb_start(struct i915_request *rq,
               u64 offset, u32 len,
               const unsigned int flags)
{
    u32 *cs;

    if (unlikely(i915_request_has_nopreempt(rq)))
        return gen8_emit_bb_start_noarb(rq, offset, len, flags);

    cs = intel_ring_begin(rq, 6);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

    *cs++ = MI_BATCH_BUFFER_START_GEN8 |
        (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
    *cs++ = lower_32_bits(offset);
    *cs++ = upper_32_bits(offset);

    *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
    *cs++ = MI_NOOP;

    intel_ring_advance(rq, cs);

    return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
    struct intel_ring *ring __maybe_unused = rq->ring;

    /* Can we unwind this request without appearing to go forwards? */
    GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
    /* Ensure there's always at least one preemption point per-request. */
    *cs++ = MI_ARB_CHECK;
    *cs++ = MI_NOOP;
    rq->wa_tail = intel_ring_offset(rq, cs);

    /* Check that entire request is less than half the ring */
    assert_request_valid(rq);

    return cs;
}

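/*
 * Busy-wait on the engine's preempt semaphore in the HWSP, giving the
 * scheduler a point between requests at which it can force a preemption.
 */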
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
    *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
    *cs++ = MI_SEMAPHORE_WAIT |
        MI_SEMAPHORE_GLOBAL_GTT |
        MI_SEMAPHORE_POLL |
        MI_SEMAPHORE_SAD_EQ_SDD;
    *cs++ = 0;
    *cs++ = preempt_address(rq->engine);
    *cs++ = 0;
    *cs++ = MI_NOOP;

    return cs;
}

static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
    *cs++ = MI_USER_INTERRUPT;

    *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
    if (intel_engine_has_semaphores(rq->engine) &&
        !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
        cs = emit_preempt_busywait(rq, cs);

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
    return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
    return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
    cs = gen8_emit_pipe_control(cs,
                    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                    PIPE_CONTROL_DC_FLUSH_ENABLE,
                    0);

    /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
    cs = gen8_emit_ggtt_write_rcs(cs,
                      rq->fence.seqno,
                      hwsp_offset(rq),
                      PIPE_CONTROL_FLUSH_ENABLE |
                      PIPE_CONTROL_CS_STALL);

    return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
    cs = gen8_emit_ggtt_write_rcs(cs,
                      rq->fence.seqno,
                      hwsp_offset(rq),
                      PIPE_CONTROL_CS_STALL |
                      PIPE_CONTROL_TILE_CACHE_FLUSH |
                      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                      PIPE_CONTROL_DC_FLUSH_ENABLE |
                      PIPE_CONTROL_FLUSH_ENABLE);

    return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

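/*
 * Gen12 variant of emit_preempt_busywait() using the longer
 * MI_SEMAPHORE_WAIT_TOKEN encoding; its extra dword takes the place of the
 * trailing MI_NOOP used in the gen8 version.
 */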
static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
    *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
    *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
        MI_SEMAPHORE_GLOBAL_GTT |
        MI_SEMAPHORE_POLL |
        MI_SEMAPHORE_SAD_EQ_SDD;
    *cs++ = 0;
    *cs++ = preempt_address(rq->engine);
    *cs++ = 0;
    *cs++ = 0;

    return cs;
}

/* Wa_14014475959:dg2 */
#define CCS_SEMAPHORE_PPHWSP_OFFSET 0x540
static u32 ccs_semaphore_offset(struct i915_request *rq)
{
    return i915_ggtt_offset(rq->context->state) +
        (LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
}

/* Wa_14014475959:dg2 */
static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
{
    int i;

    *cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
        MI_ATOMIC_MOVE;
    *cs++ = ccs_semaphore_offset(rq);
    *cs++ = 0;
    *cs++ = 1;

    /*
     * When MI_ATOMIC_INLINE_DATA is set, this command must be 11 DW + (1 NOP)
     * to align: 4 DWs above + 8 filler DWs here.
     */
    for (i = 0; i < 8; ++i)
        *cs++ = 0;

    *cs++ = MI_SEMAPHORE_WAIT |
        MI_SEMAPHORE_GLOBAL_GTT |
        MI_SEMAPHORE_POLL |
        MI_SEMAPHORE_SAD_EQ_SDD;
    *cs++ = 0;
    *cs++ = ccs_semaphore_offset(rq);
    *cs++ = 0;

    return cs;
}

static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
    *cs++ = MI_USER_INTERRUPT;

    *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
    if (intel_engine_has_semaphores(rq->engine) &&
        !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
        cs = gen12_emit_preempt_busywait(rq, cs);

    /* Wa_14014475959:dg2 */
    if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
        cs = ccs_emit_wa_busywait(rq, cs);

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
    /* XXX Stalling flush before seqno write; post-sync not */
    cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
    return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
    struct drm_i915_private *i915 = rq->engine->i915;
    u32 flags = (PIPE_CONTROL_CS_STALL |
             PIPE_CONTROL_TILE_CACHE_FLUSH |
             PIPE_CONTROL_FLUSH_L3 |
             PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
             PIPE_CONTROL_DEPTH_CACHE_FLUSH |
             PIPE_CONTROL_DC_FLUSH_ENABLE |
             PIPE_CONTROL_FLUSH_ENABLE);

    if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
        /* Wa_1409600907 */
        flags |= PIPE_CONTROL_DEPTH_STALL;

    if (!HAS_3D_PIPELINE(rq->engine->i915))
        flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
    else if (rq->engine->class == COMPUTE_CLASS)
        flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

    cs = gen12_emit_ggtt_write_rcs(cs,
                       rq->fence.seqno,
                       hwsp_offset(rq),
                       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
                       flags);

    return gen12_emit_fini_breadcrumb_tail(rq, cs);
}