0001 // SPDX-License-Identifier: MIT
0002 /*
0003  * Copyright © 2014 Intel Corporation
0004  */
0005 
0006 #include "gem/i915_gem_lmem.h"
0007 
0008 #include "gen8_engine_cs.h"
0009 #include "i915_drv.h"
0010 #include "i915_perf.h"
0011 #include "i915_reg.h"
0012 #include "intel_context.h"
0013 #include "intel_engine.h"
0014 #include "intel_engine_regs.h"
0015 #include "intel_gpu_commands.h"
0016 #include "intel_gt.h"
0017 #include "intel_gt_regs.h"
0018 #include "intel_lrc.h"
0019 #include "intel_lrc_reg.h"
0020 #include "intel_ring.h"
0021 #include "shmem_utils.h"
0022 
0023 static void set_offsets(u32 *regs,
0024             const u8 *data,
0025             const struct intel_engine_cs *engine,
0026             bool close)
0027 #define NOP(x) (BIT(7) | (x))
0028 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
0029 #define POSTED BIT(0)
0030 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
0031 #define REG16(x) \
0032     (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
0033     (((x) >> 2) & 0x7f)
0034 #define END 0
0035 {
0036     const u32 base = engine->mmio_base;
0037 
0038     while (*data) {
0039         u8 count, flags;
0040 
0041         if (*data & BIT(7)) { /* skip */
0042             count = *data++ & ~BIT(7);
0043             regs += count;
0044             continue;
0045         }
0046 
0047         count = *data & 0x3f;
0048         flags = *data >> 6;
0049         data++;
0050 
0051         *regs = MI_LOAD_REGISTER_IMM(count);
0052         if (flags & POSTED)
0053             *regs |= MI_LRI_FORCE_POSTED;
0054         if (GRAPHICS_VER(engine->i915) >= 11)
0055             *regs |= MI_LRI_LRM_CS_MMIO;
0056         regs++;
0057 
0058         GEM_BUG_ON(!count);
0059         do {
0060             u32 offset = 0;
0061             u8 v;
0062 
0063             do {
0064                 v = *data++;
0065                 offset <<= 7;
0066                 offset |= v & ~BIT(7);
0067             } while (v & BIT(7));
0068 
0069             regs[0] = base + (offset << 2);
0070             regs += 2;
0071         } while (--count);
0072     }
0073 
0074     if (close) {
0075         /* Close the batch; used mainly by live_lrc_layout() */
0076         *regs = MI_BATCH_BUFFER_END;
0077         if (GRAPHICS_VER(engine->i915) >= 11)
0078             *regs |= BIT(0);
0079     }
0080 }
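/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * how the compact offset tables below decode. Assume a pre-gen11 engine
 * with a hypothetical mmio_base of 0x2000. The table fragment
 *
 *     LRI(2, POSTED), REG16(0x244), REG(0x034),
 *
 * is expanded by set_offsets() into the context image as
 *
 *     dw0: MI_LOAD_REGISTER_IMM(2) | MI_LRI_FORCE_POSTED
 *     dw1: 0x2244   (0x2000 + 0x244, REG16 two-byte offset encoding)
 *     dw2: <value, left as-is>
 *     dw3: 0x2034   (0x2000 + 0x034, REG one-byte offset encoding)
 *     dw4: <value, left as-is>
 *
 * NOP(n) skips n dwords of the image and END terminates the table.
 */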
0081 
0082 static const u8 gen8_xcs_offsets[] = {
0083     NOP(1),
0084     LRI(11, 0),
0085     REG16(0x244),
0086     REG(0x034),
0087     REG(0x030),
0088     REG(0x038),
0089     REG(0x03c),
0090     REG(0x168),
0091     REG(0x140),
0092     REG(0x110),
0093     REG(0x11c),
0094     REG(0x114),
0095     REG(0x118),
0096 
0097     NOP(9),
0098     LRI(9, 0),
0099     REG16(0x3a8),
0100     REG16(0x28c),
0101     REG16(0x288),
0102     REG16(0x284),
0103     REG16(0x280),
0104     REG16(0x27c),
0105     REG16(0x278),
0106     REG16(0x274),
0107     REG16(0x270),
0108 
0109     NOP(13),
0110     LRI(2, 0),
0111     REG16(0x200),
0112     REG(0x028),
0113 
0114     END
0115 };
0116 
0117 static const u8 gen9_xcs_offsets[] = {
0118     NOP(1),
0119     LRI(14, POSTED),
0120     REG16(0x244),
0121     REG(0x034),
0122     REG(0x030),
0123     REG(0x038),
0124     REG(0x03c),
0125     REG(0x168),
0126     REG(0x140),
0127     REG(0x110),
0128     REG(0x11c),
0129     REG(0x114),
0130     REG(0x118),
0131     REG(0x1c0),
0132     REG(0x1c4),
0133     REG(0x1c8),
0134 
0135     NOP(3),
0136     LRI(9, POSTED),
0137     REG16(0x3a8),
0138     REG16(0x28c),
0139     REG16(0x288),
0140     REG16(0x284),
0141     REG16(0x280),
0142     REG16(0x27c),
0143     REG16(0x278),
0144     REG16(0x274),
0145     REG16(0x270),
0146 
0147     NOP(13),
0148     LRI(1, POSTED),
0149     REG16(0x200),
0150 
0151     NOP(13),
0152     LRI(44, POSTED),
0153     REG(0x028),
0154     REG(0x09c),
0155     REG(0x0c0),
0156     REG(0x178),
0157     REG(0x17c),
0158     REG16(0x358),
0159     REG(0x170),
0160     REG(0x150),
0161     REG(0x154),
0162     REG(0x158),
0163     REG16(0x41c),
0164     REG16(0x600),
0165     REG16(0x604),
0166     REG16(0x608),
0167     REG16(0x60c),
0168     REG16(0x610),
0169     REG16(0x614),
0170     REG16(0x618),
0171     REG16(0x61c),
0172     REG16(0x620),
0173     REG16(0x624),
0174     REG16(0x628),
0175     REG16(0x62c),
0176     REG16(0x630),
0177     REG16(0x634),
0178     REG16(0x638),
0179     REG16(0x63c),
0180     REG16(0x640),
0181     REG16(0x644),
0182     REG16(0x648),
0183     REG16(0x64c),
0184     REG16(0x650),
0185     REG16(0x654),
0186     REG16(0x658),
0187     REG16(0x65c),
0188     REG16(0x660),
0189     REG16(0x664),
0190     REG16(0x668),
0191     REG16(0x66c),
0192     REG16(0x670),
0193     REG16(0x674),
0194     REG16(0x678),
0195     REG16(0x67c),
0196     REG(0x068),
0197 
0198     END
0199 };
0200 
0201 static const u8 gen12_xcs_offsets[] = {
0202     NOP(1),
0203     LRI(13, POSTED),
0204     REG16(0x244),
0205     REG(0x034),
0206     REG(0x030),
0207     REG(0x038),
0208     REG(0x03c),
0209     REG(0x168),
0210     REG(0x140),
0211     REG(0x110),
0212     REG(0x1c0),
0213     REG(0x1c4),
0214     REG(0x1c8),
0215     REG(0x180),
0216     REG16(0x2b4),
0217 
0218     NOP(5),
0219     LRI(9, POSTED),
0220     REG16(0x3a8),
0221     REG16(0x28c),
0222     REG16(0x288),
0223     REG16(0x284),
0224     REG16(0x280),
0225     REG16(0x27c),
0226     REG16(0x278),
0227     REG16(0x274),
0228     REG16(0x270),
0229 
0230     END
0231 };
0232 
0233 static const u8 dg2_xcs_offsets[] = {
0234     NOP(1),
0235     LRI(15, POSTED),
0236     REG16(0x244),
0237     REG(0x034),
0238     REG(0x030),
0239     REG(0x038),
0240     REG(0x03c),
0241     REG(0x168),
0242     REG(0x140),
0243     REG(0x110),
0244     REG(0x1c0),
0245     REG(0x1c4),
0246     REG(0x1c8),
0247     REG(0x180),
0248     REG16(0x2b4),
0249     REG(0x120),
0250     REG(0x124),
0251 
0252     NOP(1),
0253     LRI(9, POSTED),
0254     REG16(0x3a8),
0255     REG16(0x28c),
0256     REG16(0x288),
0257     REG16(0x284),
0258     REG16(0x280),
0259     REG16(0x27c),
0260     REG16(0x278),
0261     REG16(0x274),
0262     REG16(0x270),
0263 
0264     END
0265 };
0266 
0267 static const u8 gen8_rcs_offsets[] = {
0268     NOP(1),
0269     LRI(14, POSTED),
0270     REG16(0x244),
0271     REG(0x034),
0272     REG(0x030),
0273     REG(0x038),
0274     REG(0x03c),
0275     REG(0x168),
0276     REG(0x140),
0277     REG(0x110),
0278     REG(0x11c),
0279     REG(0x114),
0280     REG(0x118),
0281     REG(0x1c0),
0282     REG(0x1c4),
0283     REG(0x1c8),
0284 
0285     NOP(3),
0286     LRI(9, POSTED),
0287     REG16(0x3a8),
0288     REG16(0x28c),
0289     REG16(0x288),
0290     REG16(0x284),
0291     REG16(0x280),
0292     REG16(0x27c),
0293     REG16(0x278),
0294     REG16(0x274),
0295     REG16(0x270),
0296 
0297     NOP(13),
0298     LRI(1, 0),
0299     REG(0x0c8),
0300 
0301     END
0302 };
0303 
0304 static const u8 gen9_rcs_offsets[] = {
0305     NOP(1),
0306     LRI(14, POSTED),
0307     REG16(0x244),
0308     REG(0x34),
0309     REG(0x30),
0310     REG(0x38),
0311     REG(0x3c),
0312     REG(0x168),
0313     REG(0x140),
0314     REG(0x110),
0315     REG(0x11c),
0316     REG(0x114),
0317     REG(0x118),
0318     REG(0x1c0),
0319     REG(0x1c4),
0320     REG(0x1c8),
0321 
0322     NOP(3),
0323     LRI(9, POSTED),
0324     REG16(0x3a8),
0325     REG16(0x28c),
0326     REG16(0x288),
0327     REG16(0x284),
0328     REG16(0x280),
0329     REG16(0x27c),
0330     REG16(0x278),
0331     REG16(0x274),
0332     REG16(0x270),
0333 
0334     NOP(13),
0335     LRI(1, 0),
0336     REG(0xc8),
0337 
0338     NOP(13),
0339     LRI(44, POSTED),
0340     REG(0x28),
0341     REG(0x9c),
0342     REG(0xc0),
0343     REG(0x178),
0344     REG(0x17c),
0345     REG16(0x358),
0346     REG(0x170),
0347     REG(0x150),
0348     REG(0x154),
0349     REG(0x158),
0350     REG16(0x41c),
0351     REG16(0x600),
0352     REG16(0x604),
0353     REG16(0x608),
0354     REG16(0x60c),
0355     REG16(0x610),
0356     REG16(0x614),
0357     REG16(0x618),
0358     REG16(0x61c),
0359     REG16(0x620),
0360     REG16(0x624),
0361     REG16(0x628),
0362     REG16(0x62c),
0363     REG16(0x630),
0364     REG16(0x634),
0365     REG16(0x638),
0366     REG16(0x63c),
0367     REG16(0x640),
0368     REG16(0x644),
0369     REG16(0x648),
0370     REG16(0x64c),
0371     REG16(0x650),
0372     REG16(0x654),
0373     REG16(0x658),
0374     REG16(0x65c),
0375     REG16(0x660),
0376     REG16(0x664),
0377     REG16(0x668),
0378     REG16(0x66c),
0379     REG16(0x670),
0380     REG16(0x674),
0381     REG16(0x678),
0382     REG16(0x67c),
0383     REG(0x68),
0384 
0385     END
0386 };
0387 
0388 static const u8 gen11_rcs_offsets[] = {
0389     NOP(1),
0390     LRI(15, POSTED),
0391     REG16(0x244),
0392     REG(0x034),
0393     REG(0x030),
0394     REG(0x038),
0395     REG(0x03c),
0396     REG(0x168),
0397     REG(0x140),
0398     REG(0x110),
0399     REG(0x11c),
0400     REG(0x114),
0401     REG(0x118),
0402     REG(0x1c0),
0403     REG(0x1c4),
0404     REG(0x1c8),
0405     REG(0x180),
0406 
0407     NOP(1),
0408     LRI(9, POSTED),
0409     REG16(0x3a8),
0410     REG16(0x28c),
0411     REG16(0x288),
0412     REG16(0x284),
0413     REG16(0x280),
0414     REG16(0x27c),
0415     REG16(0x278),
0416     REG16(0x274),
0417     REG16(0x270),
0418 
0419     LRI(1, POSTED),
0420     REG(0x1b0),
0421 
0422     NOP(10),
0423     LRI(1, 0),
0424     REG(0x0c8),
0425 
0426     END
0427 };
0428 
0429 static const u8 gen12_rcs_offsets[] = {
0430     NOP(1),
0431     LRI(13, POSTED),
0432     REG16(0x244),
0433     REG(0x034),
0434     REG(0x030),
0435     REG(0x038),
0436     REG(0x03c),
0437     REG(0x168),
0438     REG(0x140),
0439     REG(0x110),
0440     REG(0x1c0),
0441     REG(0x1c4),
0442     REG(0x1c8),
0443     REG(0x180),
0444     REG16(0x2b4),
0445 
0446     NOP(5),
0447     LRI(9, POSTED),
0448     REG16(0x3a8),
0449     REG16(0x28c),
0450     REG16(0x288),
0451     REG16(0x284),
0452     REG16(0x280),
0453     REG16(0x27c),
0454     REG16(0x278),
0455     REG16(0x274),
0456     REG16(0x270),
0457 
0458     LRI(3, POSTED),
0459     REG(0x1b0),
0460     REG16(0x5a8),
0461     REG16(0x5ac),
0462 
0463     NOP(6),
0464     LRI(1, 0),
0465     REG(0x0c8),
0466     NOP(3 + 9 + 1),
0467 
0468     LRI(51, POSTED),
0469     REG16(0x588),
0470     REG16(0x588),
0471     REG16(0x588),
0472     REG16(0x588),
0473     REG16(0x588),
0474     REG16(0x588),
0475     REG(0x028),
0476     REG(0x09c),
0477     REG(0x0c0),
0478     REG(0x178),
0479     REG(0x17c),
0480     REG16(0x358),
0481     REG(0x170),
0482     REG(0x150),
0483     REG(0x154),
0484     REG(0x158),
0485     REG16(0x41c),
0486     REG16(0x600),
0487     REG16(0x604),
0488     REG16(0x608),
0489     REG16(0x60c),
0490     REG16(0x610),
0491     REG16(0x614),
0492     REG16(0x618),
0493     REG16(0x61c),
0494     REG16(0x620),
0495     REG16(0x624),
0496     REG16(0x628),
0497     REG16(0x62c),
0498     REG16(0x630),
0499     REG16(0x634),
0500     REG16(0x638),
0501     REG16(0x63c),
0502     REG16(0x640),
0503     REG16(0x644),
0504     REG16(0x648),
0505     REG16(0x64c),
0506     REG16(0x650),
0507     REG16(0x654),
0508     REG16(0x658),
0509     REG16(0x65c),
0510     REG16(0x660),
0511     REG16(0x664),
0512     REG16(0x668),
0513     REG16(0x66c),
0514     REG16(0x670),
0515     REG16(0x674),
0516     REG16(0x678),
0517     REG16(0x67c),
0518     REG(0x068),
0519     REG(0x084),
0520     NOP(1),
0521 
0522     END
0523 };
0524 
0525 static const u8 xehp_rcs_offsets[] = {
0526     NOP(1),
0527     LRI(13, POSTED),
0528     REG16(0x244),
0529     REG(0x034),
0530     REG(0x030),
0531     REG(0x038),
0532     REG(0x03c),
0533     REG(0x168),
0534     REG(0x140),
0535     REG(0x110),
0536     REG(0x1c0),
0537     REG(0x1c4),
0538     REG(0x1c8),
0539     REG(0x180),
0540     REG16(0x2b4),
0541 
0542     NOP(5),
0543     LRI(9, POSTED),
0544     REG16(0x3a8),
0545     REG16(0x28c),
0546     REG16(0x288),
0547     REG16(0x284),
0548     REG16(0x280),
0549     REG16(0x27c),
0550     REG16(0x278),
0551     REG16(0x274),
0552     REG16(0x270),
0553 
0554     LRI(3, POSTED),
0555     REG(0x1b0),
0556     REG16(0x5a8),
0557     REG16(0x5ac),
0558 
0559     NOP(6),
0560     LRI(1, 0),
0561     REG(0x0c8),
0562 
0563     END
0564 };
0565 
0566 static const u8 dg2_rcs_offsets[] = {
0567     NOP(1),
0568     LRI(15, POSTED),
0569     REG16(0x244),
0570     REG(0x034),
0571     REG(0x030),
0572     REG(0x038),
0573     REG(0x03c),
0574     REG(0x168),
0575     REG(0x140),
0576     REG(0x110),
0577     REG(0x1c0),
0578     REG(0x1c4),
0579     REG(0x1c8),
0580     REG(0x180),
0581     REG16(0x2b4),
0582     REG(0x120),
0583     REG(0x124),
0584 
0585     NOP(1),
0586     LRI(9, POSTED),
0587     REG16(0x3a8),
0588     REG16(0x28c),
0589     REG16(0x288),
0590     REG16(0x284),
0591     REG16(0x280),
0592     REG16(0x27c),
0593     REG16(0x278),
0594     REG16(0x274),
0595     REG16(0x270),
0596 
0597     LRI(3, POSTED),
0598     REG(0x1b0),
0599     REG16(0x5a8),
0600     REG16(0x5ac),
0601 
0602     NOP(6),
0603     LRI(1, 0),
0604     REG(0x0c8),
0605 
0606     END
0607 };
0608 
0609 #undef END
0610 #undef REG16
0611 #undef REG
0612 #undef LRI
0613 #undef NOP
0614 
0615 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
0616 {
0617     /*
0618      * The gen12+ lists only have the registers we program in the basic
0619      * default state. We rely on the context image using relative
0620      * addressing to automatically fix up the register state between the
0621      * physical engines for a virtual engine.
0622      */
0623     GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
0624            !intel_engine_has_relative_mmio(engine));
0625 
0626     if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
0627         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
0628             return dg2_rcs_offsets;
0629         else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
0630             return xehp_rcs_offsets;
0631         else if (GRAPHICS_VER(engine->i915) >= 12)
0632             return gen12_rcs_offsets;
0633         else if (GRAPHICS_VER(engine->i915) >= 11)
0634             return gen11_rcs_offsets;
0635         else if (GRAPHICS_VER(engine->i915) >= 9)
0636             return gen9_rcs_offsets;
0637         else
0638             return gen8_rcs_offsets;
0639     } else {
0640         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
0641             return dg2_xcs_offsets;
0642         else if (GRAPHICS_VER(engine->i915) >= 12)
0643             return gen12_xcs_offsets;
0644         else if (GRAPHICS_VER(engine->i915) >= 9)
0645             return gen9_xcs_offsets;
0646         else
0647             return gen8_xcs_offsets;
0648     }
0649 }
0650 
0651 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
0652 {
0653     if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
0654         return 0x70;
0655     else if (GRAPHICS_VER(engine->i915) >= 12)
0656         return 0x60;
0657     else if (GRAPHICS_VER(engine->i915) >= 9)
0658         return 0x54;
0659     else if (engine->class == RENDER_CLASS)
0660         return 0x58;
0661     else
0662         return -1;
0663 }
0664 
0665 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
0666 {
0667     if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
0668         return 0x84;
0669     else if (GRAPHICS_VER(engine->i915) >= 12)
0670         return 0x74;
0671     else if (GRAPHICS_VER(engine->i915) >= 9)
0672         return 0x68;
0673     else if (engine->class == RENDER_CLASS)
0674         return 0xd8;
0675     else
0676         return -1;
0677 }
0678 
0679 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
0680 {
0681     if (GRAPHICS_VER(engine->i915) >= 12)
0682         return 0x12;
0683     else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
0684         return 0x18;
0685     else
0686         return -1;
0687 }
0688 
0689 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
0690 {
0691     int x;
0692 
0693     x = lrc_ring_wa_bb_per_ctx(engine);
0694     if (x < 0)
0695         return x;
0696 
0697     return x + 2;
0698 }
0699 
0700 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
0701 {
0702     int x;
0703 
0704     x = lrc_ring_indirect_ptr(engine);
0705     if (x < 0)
0706         return x;
0707 
0708     return x + 2;
0709 }
0710 
0711 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
0712 {
0713 
0714     if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
0715         /*
0716          * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
0717          * simply to match the RCS context image layout.
0718          */
0719         return 0xc6;
0720     else if (engine->class != RENDER_CLASS)
0721         return -1;
0722     else if (GRAPHICS_VER(engine->i915) >= 12)
0723         return 0xb6;
0724     else if (GRAPHICS_VER(engine->i915) >= 11)
0725         return 0xaa;
0726     else
0727         return -1;
0728 }
0729 
0730 static u32
0731 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
0732 {
0733     switch (GRAPHICS_VER(engine->i915)) {
0734     default:
0735         MISSING_CASE(GRAPHICS_VER(engine->i915));
0736         fallthrough;
0737     case 12:
0738         return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
0739     case 11:
0740         return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
0741     case 9:
0742         return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
0743     case 8:
0744         return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
0745     }
0746 }
0747 
0748 static void
0749 lrc_setup_indirect_ctx(u32 *regs,
0750                const struct intel_engine_cs *engine,
0751                u32 ctx_bb_ggtt_addr,
0752                u32 size)
0753 {
0754     GEM_BUG_ON(!size);
0755     GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
0756     GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
0757     regs[lrc_ring_indirect_ptr(engine) + 1] =
0758         ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
0759 
0760     GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
0761     regs[lrc_ring_indirect_offset(engine) + 1] =
0762         lrc_ring_indirect_offset_default(engine) << 6;
0763 }
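/*
 * Illustrative sketch (editor's addition): how the two INDIRECT_CTX dwords
 * are packed above. Assume a hypothetical 192-byte, cacheline-aligned
 * batch at GGTT address 0x1000 (CACHELINE_BYTES is 64):
 *
 *     regs[lrc_ring_indirect_ptr(engine) + 1] = 0x1000 | (192 / 64) = 0x1003;
 *     regs[lrc_ring_indirect_offset(engine) + 1] =
 *             lrc_ring_indirect_offset_default(engine) << 6;
 *
 * The size in cachelines fits in the low bits of the first dword because
 * both the address and the size are cacheline aligned.
 */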
0764 
0765 static void init_common_regs(u32 * const regs,
0766                  const struct intel_context *ce,
0767                  const struct intel_engine_cs *engine,
0768                  bool inhibit)
0769 {
0770     u32 ctl;
0771 
0772     ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
0773     ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
0774     if (inhibit)
0775         ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
0776     if (GRAPHICS_VER(engine->i915) < 11)
0777         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
0778                        CTX_CTRL_RS_CTX_ENABLE);
0779     regs[CTX_CONTEXT_CONTROL] = ctl;
0780 
0781     regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
0782 }
0783 
0784 static void init_wa_bb_regs(u32 * const regs,
0785                 const struct intel_engine_cs *engine)
0786 {
0787     const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
0788 
0789     if (wa_ctx->per_ctx.size) {
0790         const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
0791 
0792         GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
0793         regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
0794             (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
0795     }
0796 
0797     if (wa_ctx->indirect_ctx.size) {
0798         lrc_setup_indirect_ctx(regs, engine,
0799                        i915_ggtt_offset(wa_ctx->vma) +
0800                        wa_ctx->indirect_ctx.offset,
0801                        wa_ctx->indirect_ctx.size);
0802     }
0803 }
0804 
0805 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
0806 {
0807     if (i915_vm_is_4lvl(&ppgtt->vm)) {
0808         /* 64b PPGTT (48bit canonical)
0809          * PDP0_DESCRIPTOR contains the base address of the PML4; the
0810          * other PDP descriptors are ignored.
0811          */
0812         ASSIGN_CTX_PML4(ppgtt, regs);
0813     } else {
0814         ASSIGN_CTX_PDP(ppgtt, regs, 3);
0815         ASSIGN_CTX_PDP(ppgtt, regs, 2);
0816         ASSIGN_CTX_PDP(ppgtt, regs, 1);
0817         ASSIGN_CTX_PDP(ppgtt, regs, 0);
0818     }
0819 }
0820 
0821 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
0822 {
0823     if (i915_is_ggtt(vm))
0824         return i915_vm_to_ggtt(vm)->alias;
0825     else
0826         return i915_vm_to_ppgtt(vm);
0827 }
0828 
0829 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
0830 {
0831     int x;
0832 
0833     x = lrc_ring_mi_mode(engine);
0834     if (x != -1) {
0835         regs[x + 1] &= ~STOP_RING;
0836         regs[x + 1] |= STOP_RING << 16;
0837     }
0838 }
0839 
0840 static void __lrc_init_regs(u32 *regs,
0841                 const struct intel_context *ce,
0842                 const struct intel_engine_cs *engine,
0843                 bool inhibit)
0844 {
0845     /*
0846      * A context is actually a big batch buffer with several
0847      * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
0848      * values we are setting here are only for the first context restore:
0849      * on a subsequent save, the GPU will recreate this batchbuffer with new
0850      * values (including all the missing MI_LOAD_REGISTER_IMM commands that
0851      * we are not initializing here).
0852      *
0853      * Must be kept consistent with virtual_update_register_offsets().
0854      */
0855 
0856     if (inhibit)
0857         memset(regs, 0, PAGE_SIZE);
0858 
0859     set_offsets(regs, reg_offsets(engine), engine, inhibit);
0860 
0861     init_common_regs(regs, ce, engine, inhibit);
0862     init_ppgtt_regs(regs, vm_alias(ce->vm));
0863 
0864     init_wa_bb_regs(regs, engine);
0865 
0866     __reset_stop_ring(regs, engine);
0867 }
0868 
0869 void lrc_init_regs(const struct intel_context *ce,
0870            const struct intel_engine_cs *engine,
0871            bool inhibit)
0872 {
0873     __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
0874 }
0875 
0876 void lrc_reset_regs(const struct intel_context *ce,
0877             const struct intel_engine_cs *engine)
0878 {
0879     __reset_stop_ring(ce->lrc_reg_state, engine);
0880 }
0881 
0882 static void
0883 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
0884 {
0885     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
0886         return;
0887 
0888     vaddr += engine->context_size;
0889 
0890     memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
0891 }
0892 
0893 static void
0894 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
0895 {
0896     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
0897         return;
0898 
0899     vaddr += engine->context_size;
0900 
0901     if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
0902         drm_err_once(&engine->i915->drm,
0903                  "%s context redzone overwritten!\n",
0904                  engine->name);
0905 }
0906 
0907 static u32 context_wa_bb_offset(const struct intel_context *ce)
0908 {
0909     return PAGE_SIZE * ce->wa_bb_page;
0910 }
0911 
0912 static u32 *context_indirect_bb(const struct intel_context *ce)
0913 {
0914     void *ptr;
0915 
0916     GEM_BUG_ON(!ce->wa_bb_page);
0917 
0918     ptr = ce->lrc_reg_state;
0919     ptr -= LRC_STATE_OFFSET; /* back to start of context image */
0920     ptr += context_wa_bb_offset(ce);
0921 
0922     return ptr;
0923 }
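/*
 * Illustrative sketch (editor's addition): ce->lrc_reg_state points
 * LRC_STATE_OFFSET bytes into the mapped context image, so the helper above
 * first walks back to the start of the image and then forward to the wa_bb
 * page. With a hypothetical ce->wa_bb_page of 18, the indirect batch lives
 * at byte offset 18 * PAGE_SIZE (0x12000) into the context object, which is
 * also the offset lrc_indirect_bb() adds to the object's GGTT address below.
 */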
0924 
0925 void lrc_init_state(struct intel_context *ce,
0926             struct intel_engine_cs *engine,
0927             void *state)
0928 {
0929     bool inhibit = true;
0930 
0931     set_redzone(state, engine);
0932 
0933     if (engine->default_state) {
0934         shmem_read(engine->default_state, 0,
0935                state, engine->context_size);
0936         __set_bit(CONTEXT_VALID_BIT, &ce->flags);
0937         inhibit = false;
0938     }
0939 
0940     /* Clear the ppHWSP (inc. per-context counters) */
0941     memset(state, 0, PAGE_SIZE);
0942 
0943     /* Clear the indirect wa and storage */
0944     if (ce->wa_bb_page)
0945         memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
0946 
0947     /*
0948      * The second page of the context object contains some registers which
0949      * must be set up prior to the first execution.
0950      */
0951     __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
0952 }
0953 
0954 u32 lrc_indirect_bb(const struct intel_context *ce)
0955 {
0956     return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
0957 }
0958 
0959 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
0960 {
0961     /* If predication is active, this will be noop'ed */
0962     *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
0963     *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
0964     *cs++ = 0;
0965     *cs++ = 0; /* No predication */
0966 
0967     /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
0968     *cs++ = MI_BATCH_BUFFER_END | BIT(15);
0969     *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
0970 
0971     /* Instructions are no longer predicated (disabled), we can proceed */
0972     *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
0973     *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
0974     *cs++ = 0;
0975     *cs++ = 1; /* enable predication before the next BB */
0976 
0977     *cs++ = MI_BATCH_BUFFER_END;
0978     GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
0979 
0980     return cs;
0981 }
0982 
0983 static struct i915_vma *
0984 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
0985 {
0986     struct drm_i915_gem_object *obj;
0987     struct i915_vma *vma;
0988     u32 context_size;
0989 
0990     context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
0991 
0992     if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
0993         context_size += I915_GTT_PAGE_SIZE; /* for redzone */
0994 
0995     if (GRAPHICS_VER(engine->i915) == 12) {
0996         ce->wa_bb_page = context_size / PAGE_SIZE;
0997         context_size += PAGE_SIZE;
0998     }
0999 
1000     if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1001         ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1002         context_size += PARENT_SCRATCH_SIZE;
1003     }
1004 
1005     obj = i915_gem_object_create_lmem(engine->i915, context_size,
1006                       I915_BO_ALLOC_PM_VOLATILE);
1007     if (IS_ERR(obj))
1008         obj = i915_gem_object_create_shmem(engine->i915, context_size);
1009     if (IS_ERR(obj))
1010         return ERR_CAST(obj);
1011 
1012     vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1013     if (IS_ERR(vma)) {
1014         i915_gem_object_put(obj);
1015         return vma;
1016     }
1017 
1018     return vma;
1019 }
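/*
 * Illustrative sketch (editor's addition): the resulting layout of the
 * object allocated above, using a hypothetical engine->context_size that
 * rounds up to 17 pages, on a gen12 engine with CONFIG_DRM_I915_DEBUG_GEM:
 *
 *     pages  0-16: context image (ppHWSP + register state)
 *     page     17: redzone (debug builds only)
 *     page     18: ce->wa_bb_page, the per-context indirect wa batch
 *     afterwards:  PARENT_SCRATCH_SIZE more bytes for a GuC
 *                  parallel-submission parent context
 *
 * The backing object is preferably placed in LMEM, falling back to shmem.
 */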
1020 
1021 static struct intel_timeline *
1022 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1023 {
1024     struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1025 
1026     return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1027 }
1028 
1029 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1030 {
1031     struct intel_ring *ring;
1032     struct i915_vma *vma;
1033     int err;
1034 
1035     GEM_BUG_ON(ce->state);
1036 
1037     vma = __lrc_alloc_state(ce, engine);
1038     if (IS_ERR(vma))
1039         return PTR_ERR(vma);
1040 
1041     ring = intel_engine_create_ring(engine, ce->ring_size);
1042     if (IS_ERR(ring)) {
1043         err = PTR_ERR(ring);
1044         goto err_vma;
1045     }
1046 
1047     if (!page_mask_bits(ce->timeline)) {
1048         struct intel_timeline *tl;
1049 
1050         /*
1051          * Use the static global HWSP for the kernel context, and
1052          * a dynamically allocated cacheline for everyone else.
1053          */
1054         if (unlikely(ce->timeline))
1055             tl = pinned_timeline(ce, engine);
1056         else
1057             tl = intel_timeline_create(engine->gt);
1058         if (IS_ERR(tl)) {
1059             err = PTR_ERR(tl);
1060             goto err_ring;
1061         }
1062 
1063         ce->timeline = tl;
1064     }
1065 
1066     ce->ring = ring;
1067     ce->state = vma;
1068 
1069     return 0;
1070 
1071 err_ring:
1072     intel_ring_put(ring);
1073 err_vma:
1074     i915_vma_put(vma);
1075     return err;
1076 }
1077 
1078 void lrc_reset(struct intel_context *ce)
1079 {
1080     GEM_BUG_ON(!intel_context_is_pinned(ce));
1081 
1082     intel_ring_reset(ce->ring, ce->ring->emit);
1083 
1084     /* Scrub away the garbage */
1085     lrc_init_regs(ce, ce->engine, true);
1086     ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1087 }
1088 
1089 int
1090 lrc_pre_pin(struct intel_context *ce,
1091         struct intel_engine_cs *engine,
1092         struct i915_gem_ww_ctx *ww,
1093         void **vaddr)
1094 {
1095     GEM_BUG_ON(!ce->state);
1096     GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1097 
1098     *vaddr = i915_gem_object_pin_map(ce->state->obj,
1099                      i915_coherent_map_type(ce->engine->i915,
1100                                 ce->state->obj,
1101                                 false) |
1102                      I915_MAP_OVERRIDE);
1103 
1104     return PTR_ERR_OR_ZERO(*vaddr);
1105 }
1106 
1107 int
1108 lrc_pin(struct intel_context *ce,
1109     struct intel_engine_cs *engine,
1110     void *vaddr)
1111 {
1112     ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1113 
1114     if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1115         lrc_init_state(ce, engine, vaddr);
1116 
1117     ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1118     return 0;
1119 }
1120 
1121 void lrc_unpin(struct intel_context *ce)
1122 {
1123     if (unlikely(ce->parallel.last_rq)) {
1124         i915_request_put(ce->parallel.last_rq);
1125         ce->parallel.last_rq = NULL;
1126     }
1127     check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1128               ce->engine);
1129 }
1130 
1131 void lrc_post_unpin(struct intel_context *ce)
1132 {
1133     i915_gem_object_unpin_map(ce->state->obj);
1134 }
1135 
1136 void lrc_fini(struct intel_context *ce)
1137 {
1138     if (!ce->state)
1139         return;
1140 
1141     intel_ring_put(fetch_and_zero(&ce->ring));
1142     i915_vma_put(fetch_and_zero(&ce->state));
1143 }
1144 
1145 void lrc_destroy(struct kref *kref)
1146 {
1147     struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1148 
1149     GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1150     GEM_BUG_ON(intel_context_is_pinned(ce));
1151 
1152     lrc_fini(ce);
1153 
1154     intel_context_fini(ce);
1155     intel_context_free(ce);
1156 }
1157 
1158 static u32 *
1159 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1160 {
1161     *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1162         MI_SRM_LRM_GLOBAL_GTT |
1163         MI_LRI_LRM_CS_MMIO;
1164     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1165     *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1166         CTX_TIMESTAMP * sizeof(u32);
1167     *cs++ = 0;
1168 
1169     *cs++ = MI_LOAD_REGISTER_REG |
1170         MI_LRR_SOURCE_CS_MMIO |
1171         MI_LRI_LRM_CS_MMIO;
1172     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1173     *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1174 
1175     *cs++ = MI_LOAD_REGISTER_REG |
1176         MI_LRR_SOURCE_CS_MMIO |
1177         MI_LRI_LRM_CS_MMIO;
1178     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1179     *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1180 
1181     return cs;
1182 }
1183 
1184 static u32 *
1185 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1186 {
1187     GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1188 
1189     *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1190         MI_SRM_LRM_GLOBAL_GTT |
1191         MI_LRI_LRM_CS_MMIO;
1192     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1193     *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1194         (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1195     *cs++ = 0;
1196 
1197     return cs;
1198 }
1199 
1200 static u32 *
1201 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1202 {
1203     GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1204 
1205     *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1206         MI_SRM_LRM_GLOBAL_GTT |
1207         MI_LRI_LRM_CS_MMIO;
1208     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1209     *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1210         (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1211     *cs++ = 0;
1212 
1213     *cs++ = MI_LOAD_REGISTER_REG |
1214         MI_LRR_SOURCE_CS_MMIO |
1215         MI_LRI_LRM_CS_MMIO;
1216     *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1217     *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1218 
1219     return cs;
1220 }
1221 
1222 /*
1223  * On DG2, an RCS restore hang is seen during context restore of a
1224  * preempted context in GPGPU mode. This is extremely timing dependent.
1225  * To address it, the sw wabb below is implemented for DG2 A steppings.
1226  */
1227 static u32 *
1228 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1229 {
1230     *cs++ = MI_LOAD_REGISTER_IMM(1);
1231     *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1232     *cs++ = 0x21;
1233 
1234     *cs++ = MI_LOAD_REGISTER_REG;
1235     *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1236     *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1237 
1238     *cs++ = MI_LOAD_REGISTER_REG;
1239     *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1240     *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1241 
1242     return cs;
1243 }
1244 
1245 static u32 *
1246 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1247 {
1248     cs = gen12_emit_timestamp_wa(ce, cs);
1249     cs = gen12_emit_cmd_buf_wa(ce, cs);
1250     cs = gen12_emit_restore_scratch(ce, cs);
1251 
1252     /* Wa_22011450934:dg2 */
1253     if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1254         IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1255         cs = dg2_emit_rcs_hang_wabb(ce, cs);
1256 
1257     /* Wa_16013000631:dg2 */
1258     if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1259         IS_DG2_G11(ce->engine->i915))
1260         cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1261 
1262     /* hsdes: 1809175790 */
1263     if (!HAS_FLAT_CCS(ce->engine->i915))
1264         cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
1265 
1266     return cs;
1267 }
1268 
1269 static u32 *
1270 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1271 {
1272     cs = gen12_emit_timestamp_wa(ce, cs);
1273     cs = gen12_emit_restore_scratch(ce, cs);
1274 
1275     /* Wa_16013000631:dg2 */
1276     if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1277         IS_DG2_G11(ce->engine->i915))
1278         if (ce->engine->class == COMPUTE_CLASS)
1279             cs = gen8_emit_pipe_control(cs,
1280                             PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1281                             0);
1282 
1283     /* hsdes: 1809175790 */
1284     if (!HAS_FLAT_CCS(ce->engine->i915)) {
1285         if (ce->engine->class == VIDEO_DECODE_CLASS)
1286             cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
1287         else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1288             cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
1289     }
1290 
1291     return cs;
1292 }
1293 
1294 static void
1295 setup_indirect_ctx_bb(const struct intel_context *ce,
1296               const struct intel_engine_cs *engine,
1297               u32 *(*emit)(const struct intel_context *, u32 *))
1298 {
1299     u32 * const start = context_indirect_bb(ce);
1300     u32 *cs;
1301 
1302     cs = emit(ce, start);
1303     GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1304     while ((unsigned long)cs % CACHELINE_BYTES)
1305         *cs++ = MI_NOOP;
1306 
1307     GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1308     setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1309 
1310     lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1311                    lrc_indirect_bb(ce),
1312                    (cs - start) * sizeof(*cs));
1313 }
1314 
1315 /*
1316  * The context descriptor encodes various attributes of a context,
1317  * including its GTT address and some flags. Because it's fairly
1318  * expensive to calculate, we'll just do it once and cache the result,
1319  * which remains valid until the context is unpinned.
1320  *
1321  * This is what a descriptor looks like, from LSB to MSB::
1322  *
1323  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1324  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1325  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1326  *      bits 53-54:    mbz, reserved for use by hardware
1327  *      bits 55-63:    group ID, currently unused and set to 0
1328  *
1329  * Starting from Gen11, the upper dword of the descriptor has a new format:
1330  *
1331  *      bits 32-36:    reserved
1332  *      bits 37-47:    SW context ID
1333  *      bits 48-53:    engine instance
1334  *      bit 54:        mbz, reserved for use by hardware
1335  *      bits 55-60:    SW counter
1336  *      bits 61-63:    engine class
1337  *
1338  * On Xe_HP, the upper dword of the descriptor has a new format:
1339  *
1340  *      bits 32-37:    virtual function number
1341  *      bit 38:        mbz, reserved for use by hardware
1342  *      bits 39-54:    SW context ID
1343  *      bits 55-57:    reserved
1344  *      bits 58-63:    SW counter
1345  *
1346  * engine info, SW context ID and SW counter need to form a unique number
1347  * (Context ID) per lrc.
1348  */
1349 static u32 lrc_descriptor(const struct intel_context *ce)
1350 {
1351     u32 desc;
1352 
1353     desc = INTEL_LEGACY_32B_CONTEXT;
1354     if (i915_vm_is_4lvl(ce->vm))
1355         desc = INTEL_LEGACY_64B_CONTEXT;
1356     desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1357 
1358     desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1359     if (GRAPHICS_VER(ce->vm->i915) == 8)
1360         desc |= GEN8_CTX_L3LLC_COHERENT;
1361 
1362     return i915_ggtt_offset(ce->state) | desc;
1363 }
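/*
 * Illustrative sketch (editor's addition): assembling a descriptor for a
 * hypothetical context image pinned at GGTT address 0x00800000 on a
 * 4-level (64b) vm. The image is page aligned, so its low 12 bits are free
 * for the flag fields described in the comment above:
 *
 *     desc = (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 *            GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 *     desc |= 0x00800000;     // LRCA in bits 12-31
 *
 * lrc_update_regs() additionally ORs in CTX_DESC_FORCE_RESTORE when
 * (re)pinning the context.
 */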
1364 
1365 u32 lrc_update_regs(const struct intel_context *ce,
1366             const struct intel_engine_cs *engine,
1367             u32 head)
1368 {
1369     struct intel_ring *ring = ce->ring;
1370     u32 *regs = ce->lrc_reg_state;
1371 
1372     GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1373     GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1374 
1375     regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1376     regs[CTX_RING_HEAD] = head;
1377     regs[CTX_RING_TAIL] = ring->tail;
1378     regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1379 
1380     /* RPCS */
1381     if (engine->class == RENDER_CLASS) {
1382         regs[CTX_R_PWR_CLK_STATE] =
1383             intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1384 
1385         i915_oa_init_reg_state(ce, engine);
1386     }
1387 
1388     if (ce->wa_bb_page) {
1389         u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1390 
1391         fn = gen12_emit_indirect_ctx_xcs;
1392         if (ce->engine->class == RENDER_CLASS)
1393             fn = gen12_emit_indirect_ctx_rcs;
1394 
1395         /* Mutually exclusive with the global indirect bb */
1396         GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1397         setup_indirect_ctx_bb(ce, engine, fn);
1398     }
1399 
1400     return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1401 }
1402 
1403 void lrc_update_offsets(struct intel_context *ce,
1404             struct intel_engine_cs *engine)
1405 {
1406     set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1407 }
1408 
1409 void lrc_check_regs(const struct intel_context *ce,
1410             const struct intel_engine_cs *engine,
1411             const char *when)
1412 {
1413     const struct intel_ring *ring = ce->ring;
1414     u32 *regs = ce->lrc_reg_state;
1415     bool valid = true;
1416     int x;
1417 
1418     if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1419         pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1420                engine->name,
1421                regs[CTX_RING_START],
1422                i915_ggtt_offset(ring->vma));
1423         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1424         valid = false;
1425     }
1426 
1427     if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1428         (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1429         pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1430                engine->name,
1431                regs[CTX_RING_CTL],
1432                (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1433         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1434         valid = false;
1435     }
1436 
1437     x = lrc_ring_mi_mode(engine);
1438     if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1439         pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1440                engine->name, regs[x + 1]);
1441         regs[x + 1] &= ~STOP_RING;
1442         regs[x + 1] |= STOP_RING << 16;
1443         valid = false;
1444     }
1445 
1446     WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1447 }
1448 
1449 /*
1450  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
1451  * PIPE_CONTROL instruction. This is required for the flush to happen
1452  * correctly, but there is a slight complication: the WA is applied in a WA
1453  * batch where the values are only initialized once, so we cannot read the
1454  * register at the beginning and reuse it later. Instead we save its value
1455  * to memory, upload a constant value with bit 21 set, and then restore the
1456  * saved value. To simplify the WA, the constant is formed from the default
1457  * value of this register. This shouldn't be a problem because we only
1458  * modify it for a short period and this batch is non-preemptible. We could
1459  * of course use additional instructions that read the actual register value
1460  * at that time and set our bit of interest, but that complicates the WA.
1461  *
1462  * This WA is also required for Gen9 so extracting as a function avoids
1463  * code duplication.
1464  */
1465 static u32 *
1466 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1467 {
1468     /* NB no one else is allowed to scribble over scratch + 256! */
1469     *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1470     *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1471     *batch++ = intel_gt_scratch_offset(engine->gt,
1472                        INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1473     *batch++ = 0;
1474 
1475     *batch++ = MI_LOAD_REGISTER_IMM(1);
1476     *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1477     *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1478 
1479     batch = gen8_emit_pipe_control(batch,
1480                        PIPE_CONTROL_CS_STALL |
1481                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1482                        0);
1483 
1484     *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1485     *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1486     *batch++ = intel_gt_scratch_offset(engine->gt,
1487                        INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1488     *batch++ = 0;
1489 
1490     return batch;
1491 }
1492 
1493 /*
1494  * Typically we only have one indirect_ctx and one per_ctx batch buffer,
1495  * initialized at the beginning and shared across all contexts, but this
1496  * field lets us keep multiple batches at different offsets and select
1497  * between them. At the moment the batch always starts at the beginning of
1498  * the page and we don't have multiple wa_ctx batch buffers.
1499  *
1500  * The number of WAs applied is not known up front; we use this field to
1501  * return the number of DWORDs written.
1502  *
1503  * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are
1504  * added as padding to make it cacheline aligned.
1505  * MI_BATCH_BUFFER_END will be added to the per-ctx batch; the two together
1506  * make a complete batch buffer.
1507  */
1508 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1509 {
1510     /* WaDisableCtxRestoreArbitration:bdw,chv */
1511     *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1512 
1513     /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1514     if (IS_BROADWELL(engine->i915))
1515         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1516 
1517     /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1518     /* Actual scratch location is at a 128-byte offset */
1519     batch = gen8_emit_pipe_control(batch,
1520                        PIPE_CONTROL_FLUSH_L3 |
1521                        PIPE_CONTROL_STORE_DATA_INDEX |
1522                        PIPE_CONTROL_CS_STALL |
1523                        PIPE_CONTROL_QW_WRITE,
1524                        LRC_PPHWSP_SCRATCH_ADDR);
1525 
1526     *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1527 
1528     /* Pad to end of cacheline */
1529     while ((unsigned long)batch % CACHELINE_BYTES)
1530         *batch++ = MI_NOOP;
1531 
1532     /*
1533      * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1534      * execution depends on the length specified in terms of cache lines
1535      * in the register CTX_RCS_INDIRECT_CTX
1536      */
1537 
1538     return batch;
1539 }
1540 
1541 struct lri {
1542     i915_reg_t reg;
1543     u32 value;
1544 };
1545 
1546 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1547 {
1548     GEM_BUG_ON(!count || count > 63);
1549 
1550     *batch++ = MI_LOAD_REGISTER_IMM(count);
1551     do {
1552         *batch++ = i915_mmio_reg_offset(lri->reg);
1553         *batch++ = lri->value;
1554     } while (lri++, --count);
1555     *batch++ = MI_NOOP;
1556 
1557     return batch;
1558 }
1559 
1560 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1561 {
1562     static const struct lri lri[] = {
1563         /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1564         {
1565             COMMON_SLICE_CHICKEN2,
1566             __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1567                        0),
1568         },
1569 
1570         /* BSpec: 11391 */
1571         {
1572             FF_SLICE_CHICKEN,
1573             __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1574                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1575         },
1576 
1577         /* BSpec: 11299 */
1578         {
1579             _3D_CHICKEN3,
1580             __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1581                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1582         }
1583     };
1584 
1585     *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1586 
1587     /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1588     batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1589 
1590     /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1591     batch = gen8_emit_pipe_control(batch,
1592                        PIPE_CONTROL_FLUSH_L3 |
1593                        PIPE_CONTROL_STORE_DATA_INDEX |
1594                        PIPE_CONTROL_CS_STALL |
1595                        PIPE_CONTROL_QW_WRITE,
1596                        LRC_PPHWSP_SCRATCH_ADDR);
1597 
1598     batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1599 
1600     /* WaMediaPoolStateCmdInWABB:bxt,glk */
1601     if (HAS_POOLED_EU(engine->i915)) {
1602         /*
1603          * EU pool configuration is set up along with the golden context
1604          * during context initialization. This value depends on the
1605          * device type (2x6 or 3x6) and needs to be updated based
1606          * on which subslice is disabled, especially for 2x6
1607          * devices; however, it is safe to load the default
1608          * configuration of a 3x6 device instead of masking off
1609          * corresponding bits because HW ignores bits of a disabled
1610          * subslice and drops down to appropriate config. Please
1611          * see render_state_setup() in i915_gem_render_state.c for
1612          * possible configurations, to avoid duplication they are
1613          * not shown here again.
1614          */
1615         *batch++ = GEN9_MEDIA_POOL_STATE;
1616         *batch++ = GEN9_MEDIA_POOL_ENABLE;
1617         *batch++ = 0x00777000;
1618         *batch++ = 0;
1619         *batch++ = 0;
1620         *batch++ = 0;
1621     }
1622 
1623     *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1624 
1625     /* Pad to end of cacheline */
1626     while ((unsigned long)batch % CACHELINE_BYTES)
1627         *batch++ = MI_NOOP;
1628 
1629     return batch;
1630 }
1631 
1632 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1633 
1634 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1635 {
1636     struct drm_i915_gem_object *obj;
1637     struct i915_vma *vma;
1638     int err;
1639 
1640     obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1641     if (IS_ERR(obj))
1642         return PTR_ERR(obj);
1643 
1644     vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1645     if (IS_ERR(vma)) {
1646         err = PTR_ERR(vma);
1647         goto err;
1648     }
1649 
1650     engine->wa_ctx.vma = vma;
1651     return 0;
1652 
1653 err:
1654     i915_gem_object_put(obj);
1655     return err;
1656 }
1657 
1658 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1659 {
1660     i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1661 }
1662 
1663 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1664 
1665 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1666 {
1667     struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1668     struct i915_wa_ctx_bb *wa_bb[] = {
1669         &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1670     };
1671     wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1672     struct i915_gem_ww_ctx ww;
1673     void *batch, *batch_ptr;
1674     unsigned int i;
1675     int err;
1676 
1677     if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1678         return;
1679 
1680     switch (GRAPHICS_VER(engine->i915)) {
1681     case 12:
1682     case 11:
1683         return;
1684     case 9:
1685         wa_bb_fn[0] = gen9_init_indirectctx_bb;
1686         wa_bb_fn[1] = NULL;
1687         break;
1688     case 8:
1689         wa_bb_fn[0] = gen8_init_indirectctx_bb;
1690         wa_bb_fn[1] = NULL;
1691         break;
1692     default:
1693         MISSING_CASE(GRAPHICS_VER(engine->i915));
1694         return;
1695     }
1696 
1697     err = lrc_create_wa_ctx(engine);
1698     if (err) {
1699         /*
1700          * We continue even if we fail to initialize the WA batch
1701          * because we only expect rare glitches, nothing critical
1702          * enough to prevent us from using the GPU.
1703          */
1704         drm_err(&engine->i915->drm,
1705             "Ignoring context switch w/a allocation error:%d\n",
1706             err);
1707         return;
1708     }
1709 
1710     if (!engine->wa_ctx.vma)
1711         return;
1712 
1713     i915_gem_ww_ctx_init(&ww, true);
1714 retry:
1715     err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1716     if (!err)
1717         err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1718     if (err)
1719         goto err;
1720 
1721     batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1722     if (IS_ERR(batch)) {
1723         err = PTR_ERR(batch);
1724         goto err_unpin;
1725     }
1726 
1727     /*
1728      * Emit the two workaround batch buffers, recording the offset from the
1729      * start of the workaround batch buffer object for each and their
1730      * respective sizes.
1731      */
1732     batch_ptr = batch;
1733     for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1734         wa_bb[i]->offset = batch_ptr - batch;
1735         if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1736                           CACHELINE_BYTES))) {
1737             err = -EINVAL;
1738             break;
1739         }
1740         if (wa_bb_fn[i])
1741             batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1742         wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1743     }
1744     GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1745 
1746     __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1747     __i915_gem_object_release_map(wa_ctx->vma->obj);
1748 
1749     /* Verify that we can handle failure to set up the wa_ctx */
1750     if (!err)
1751         err = i915_inject_probe_error(engine->i915, -ENODEV);
1752 
1753 err_unpin:
1754     if (err)
1755         i915_vma_unpin(wa_ctx->vma);
1756 err:
1757     if (err == -EDEADLK) {
1758         err = i915_gem_ww_ctx_backoff(&ww);
1759         if (!err)
1760             goto retry;
1761     }
1762     i915_gem_ww_ctx_fini(&ww);
1763 
1764     if (err) {
1765         i915_vma_put(engine->wa_ctx.vma);
1766 
1767         /* Clear all flags to prevent further use */
1768         memset(wa_ctx, 0, sizeof(*wa_ctx));
1769     }
1770 }
1771 
1772 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1773 {
1774 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1775     stats->runtime.num_underflow++;
1776     stats->runtime.max_underflow =
1777         max_t(u32, stats->runtime.max_underflow, -dt);
1778 #endif
1779 }
1780 
1781 static u32 lrc_get_runtime(const struct intel_context *ce)
1782 {
1783     /*
1784      * We can use either ppHWSP[16] which is recorded before the context
1785      * switch (and so excludes the cost of context switches) or use the
1786      * value from the context image itself, which is saved/restored earlier
1787      * and so includes the cost of the save.
1788      */
1789     return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1790 }
1791 
1792 void lrc_update_runtime(struct intel_context *ce)
1793 {
1794     struct intel_context_stats *stats = &ce->stats;
1795     u32 old;
1796     s32 dt;
1797 
1798     old = stats->runtime.last;
1799     stats->runtime.last = lrc_get_runtime(ce);
1800     dt = stats->runtime.last - old;
1801     if (!dt)
1802         return;
1803 
1804     if (unlikely(dt < 0)) {
1805         CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1806              old, stats->runtime.last, dt);
1807         st_runtime_underflow(stats, dt);
1808         return;
1809     }
1810 
1811     ewma_runtime_add(&stats->runtime.avg, dt);
1812     stats->runtime.total += dt;
1813 }
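/*
 * Illustrative sketch (editor's addition): the saved CTX_TIMESTAMP is a
 * 32-bit value, so the unsigned subtraction above handles wraparound. With
 * a hypothetical old value of 0xfffffff0 and a new value of 0x00000010,
 * dt = (u32)(0x00000010 - 0xfffffff0) = 0x20, a small positive delta rather
 * than a spurious underflow.
 */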
1814 
1815 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1816 #include "selftest_lrc.c"
1817 #endif