// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

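/*
 * Byte offset of the scratch dword inside the hardware status page, used
 * below as the post-sync write target for MI_FLUSH_DW.
 */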
#define HWS_SCRATCH_ADDR    (I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
    u32 scratch_addr =
        intel_gt_scratch_offset(rq->engine->gt,
                    INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    u32 *cs;

    cs = intel_ring_begin(rq, 6);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = GFX_OP_PIPE_CONTROL(5);
    *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    *cs++ = 0; /* low dword */
    *cs++ = 0; /* high dword */
    *cs++ = MI_NOOP;
    intel_ring_advance(rq, cs);

    cs = intel_ring_begin(rq, 6);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = GFX_OP_PIPE_CONTROL(5);
    *cs++ = PIPE_CONTROL_QW_WRITE;
    *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    *cs++ = 0;
    *cs++ = 0;
    *cs++ = MI_NOOP;
    intel_ring_advance(rq, cs);

    return 0;
}

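/*
 * Full flush for the gen6 render ring: apply the post-sync-nonzero
 * workaround above, then emit a single PIPE_CONTROL carrying the
 * requested flush/invalidate bits with a post-sync write to scratch.
 */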
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
    u32 scratch_addr =
        intel_gt_scratch_offset(rq->engine->gt,
                    INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    u32 *cs, flags = 0;
    int ret;

    /* Force SNB workarounds for PIPE_CONTROL flushes */
    ret = gen6_emit_post_sync_nonzero_flush(rq);
    if (ret)
        return ret;

    /*
     * Just flush everything.  Experiments have shown that reducing the
     * number of bits based on the write domains has little performance
     * impact. And when rearranging requests, the order of flushes is
     * unknown.
     */
    if (mode & EMIT_FLUSH) {
        flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
        /*
         * Ensure that any following seqno writes only happen
         * when the render cache is indeed flushed.
         */
        flags |= PIPE_CONTROL_CS_STALL;
    }
    if (mode & EMIT_INVALIDATE) {
        flags |= PIPE_CONTROL_TLB_INVALIDATE;
        flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
        /*
         * TLB invalidate requires a post-sync write.
         */
        flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
    }

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = flags;
    *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    *cs++ = 0;
    intel_ring_advance(rq, cs);

    return 0;
}

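/*
 * Breadcrumb for the gen6 render ring: the post-sync-nonzero w/a pair,
 * then a flushing PIPE_CONTROL that writes the seqno, followed by a
 * MI_USER_INTERRUPT to signal completion.
 */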
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
    /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    *cs++ = 0;
    *cs++ = 0;

    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = PIPE_CONTROL_QW_WRITE;
    *cs++ = intel_gt_scratch_offset(rq->engine->gt,
                    INTEL_GT_SCRATCH_FIELD_DEFAULT) |
        PIPE_CONTROL_GLOBAL_GTT;
    *cs++ = 0;

    /* Finally we can flush and with it emit the breadcrumb */
    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
         PIPE_CONTROL_DEPTH_CACHE_FLUSH |
         PIPE_CONTROL_DC_FLUSH_ENABLE |
         PIPE_CONTROL_QW_WRITE |
         PIPE_CONTROL_CS_STALL);
    *cs++ = i915_request_active_seqno(rq) |
        PIPE_CONTROL_GLOBAL_GTT;
    *cs++ = rq->fence.seqno;

    *cs++ = MI_USER_INTERRUPT;
    *cs++ = MI_NOOP;

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return cs;
}

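/*
 * Emit a single MI_FLUSH_DW with a post-sync dword store to the HWS
 * scratch slot; callers add invalidate bits via @flags.
 */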
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
    u32 cmd, *cs;

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    cmd = MI_FLUSH_DW;

    /*
     * We always require a command barrier so that subsequent
     * commands, such as breadcrumb interrupts, are strictly ordered
     * wrt the contents of the write cache being flushed to memory
     * (and thus being coherent from the CPU).
     */
    cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

    /*
     * Bspec vol 1c.3 - blitter engine command streamer:
     * "If ENABLED, all TLBs will be invalidated once the flush
     * operation is complete. This bit is only valid when the
     * Post-Sync Operation field is a value of 1h or 3h."
     */
    cmd |= flags;

    *cs++ = cmd;
    *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
    *cs++ = 0;
    *cs++ = MI_NOOP;

    intel_ring_advance(rq, cs);

    return 0;
}

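/* Apply the invalidate flags only when the caller asked for EMIT_INVALIDATE. */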
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
    return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
    return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
    return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

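/*
 * Start a batch buffer. Batches run non-secure by default; the
 * MI_BATCH_NON_SECURE bit is dropped for I915_DISPATCH_SECURE so the
 * batch executes privileged.
 */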
int gen6_emit_bb_start(struct i915_request *rq,
               u64 offset, u32 len,
               unsigned int dispatch_flags)
{
    u32 security;
    u32 *cs;

    security = MI_BATCH_NON_SECURE_I965;
    if (dispatch_flags & I915_DISPATCH_SECURE)
        security = 0;

    cs = intel_ring_begin(rq, 2);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    cs = __gen6_emit_bb_start(cs, offset, security);
    intel_ring_advance(rq, cs);

    return 0;
}

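/*
 * Haswell variant: non-secure batches additionally select PPGTT
 * addressing via MI_BATCH_PPGTT_HSW; secure dispatch clears both bits.
 */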
int
hsw_emit_bb_start(struct i915_request *rq,
          u64 offset, u32 len,
          unsigned int dispatch_flags)
{
    u32 security;
    u32 *cs;

    security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
    if (dispatch_flags & I915_DISPATCH_SECURE)
        security = 0;

    cs = intel_ring_begin(rq, 2);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    cs = __gen6_emit_bb_start(cs, offset, security);
    intel_ring_advance(rq, cs);

    return 0;
}

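/*
 * Emit a PIPE_CONTROL that only stalls the command streamer at the
 * pixel scoreboard, used as a workaround before state-cache
 * invalidation in gen7_emit_flush_rcs().
 */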
static int gen7_stall_cs(struct i915_request *rq)
{
    u32 *cs;

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    *cs++ = 0;
    *cs++ = 0;
    intel_ring_advance(rq, cs);

    return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
    u32 scratch_addr =
        intel_gt_scratch_offset(rq->engine->gt,
                    INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    u32 *cs, flags = 0;

    /*
     * Ensure that any following seqno writes only happen when the render
     * cache is indeed flushed.
     *
     * Workaround: 4th PIPE_CONTROL command (except the ones with only
     * read-cache invalidate bits set) must have the CS_STALL bit set. We
     * don't try to be clever and just set it unconditionally.
     */
    flags |= PIPE_CONTROL_CS_STALL;

    /*
     * CS_STALL suggests at least a post-sync write.
     */
    flags |= PIPE_CONTROL_QW_WRITE;
    flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

    /*
     * Just flush everything.  Experiments have shown that reducing the
     * number of bits based on the write domains has little performance
     * impact.
     */
    if (mode & EMIT_FLUSH) {
        flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
        flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
        flags |= PIPE_CONTROL_FLUSH_ENABLE;
    }
    if (mode & EMIT_INVALIDATE) {
        flags |= PIPE_CONTROL_TLB_INVALIDATE;
        flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
        flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

        /*
         * Workaround: we must issue a pipe_control with CS-stall bit
         * set before a pipe_control command that has the state cache
         * invalidate bit set.
         */
        gen7_stall_cs(rq);
    }

    cs = intel_ring_begin(rq, 4);
    if (IS_ERR(cs))
        return PTR_ERR(cs);

    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = flags;
    *cs++ = scratch_addr;
    *cs++ = 0;
    intel_ring_advance(rq, cs);

    return 0;
}

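/*
 * Breadcrumb for the gen7 render ring: a single flushing PIPE_CONTROL
 * that writes the seqno, followed by a MI_USER_INTERRUPT.
 */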
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
    *cs++ = GFX_OP_PIPE_CONTROL(4);
    *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
         PIPE_CONTROL_DEPTH_CACHE_FLUSH |
         PIPE_CONTROL_DC_FLUSH_ENABLE |
         PIPE_CONTROL_FLUSH_ENABLE |
         PIPE_CONTROL_QW_WRITE |
         PIPE_CONTROL_GLOBAL_GTT_IVB |
         PIPE_CONTROL_CS_STALL);
    *cs++ = i915_request_active_seqno(rq);
    *cs++ = rq->fence.seqno;

    *cs++ = MI_USER_INTERRUPT;
    *cs++ = MI_NOOP;

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return cs;
}

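/*
 * Breadcrumb for the gen6 non-render rings: MI_FLUSH_DW with a
 * post-sync store of the seqno into the status page, then a
 * MI_USER_INTERRUPT.
 */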
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
    GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

    *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
    *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
    *cs++ = rq->fence.seqno;

    *cs++ = MI_USER_INTERRUPT;

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return cs;
}

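/*
 * As gen6_emit_breadcrumb_xcs(), but the seqno store is repeated
 * GEN7_XCS_WA times and followed by an extra MI_FLUSH_DW, apparently to
 * guarantee the write has landed before the interrupt is raised.
 */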
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
    int i;

    GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

    *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
        MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
    *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
    *cs++ = rq->fence.seqno;

    for (i = 0; i < GEN7_XCS_WA; i++) {
        *cs++ = MI_STORE_DWORD_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR;
        *cs++ = rq->fence.seqno;
    }

    *cs++ = MI_FLUSH_DW;
    *cs++ = 0;
    *cs++ = 0;

    *cs++ = MI_USER_INTERRUPT;
    *cs++ = MI_NOOP;

    rq->tail = intel_ring_offset(rq, cs);
    assert_ring_tail_valid(rq->ring, rq->tail);

    return cs;
}
#undef GEN7_XCS_WA

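/*
 * Unmask the engine's user interrupts in RING_IMR and then enable them
 * at the GT level; gen6_irq_disable() re-masks everything except the
 * bits the engine must keep unmasked.
 */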
void gen6_irq_enable(struct intel_engine_cs *engine)
{
    ENGINE_WRITE(engine, RING_IMR,
             ~(engine->irq_enable_mask | engine->irq_keep_mask));

    /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
    ENGINE_POSTING_READ(engine, RING_IMR);

    gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
    ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
    gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

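/*
 * On Haswell the VECS interrupts are routed through the GT PM interrupt
 * mask, hence the gen6_gt_pm_{unmask,mask}_irq() calls instead of the
 * regular GT IMR helpers.
 */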
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
    ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

    /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
    ENGINE_POSTING_READ(engine, RING_IMR);

    gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
    ENGINE_WRITE(engine, RING_IMR, ~0);
    gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}