Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: MIT
0002 /*
0003  * Copyright © 2019 Intel Corporation
0004  */
0005 
0006 #include "i915_drv.h"
0007 #include "i915_request.h"
0008 
0009 #include "intel_context.h"
0010 #include "intel_engine_heartbeat.h"
0011 #include "intel_engine_pm.h"
0012 #include "intel_engine.h"
0013 #include "intel_gt.h"
0014 #include "intel_reset.h"
0015 
0016 /*
0017  * While the engine is active, we send a periodic pulse along the engine
0018  * to check on its health and to flush any idle-barriers. If that request
0019  * is stuck, and we fail to preempt it, we declare the engine hung and
0020  * issue a reset -- in the hope that restores progress.
0021  */
0022 
0023 static bool next_heartbeat(struct intel_engine_cs *engine)
0024 {
0025     long delay;
0026 
0027     delay = READ_ONCE(engine->props.heartbeat_interval_ms);
0028     if (!delay)
0029         return false;
0030 
0031     delay = msecs_to_jiffies_timeout(delay);
0032     if (delay >= HZ)
0033         delay = round_jiffies_up_relative(delay);
0034     mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
0035 
0036     return true;
0037 }
0038 
0039 static struct i915_request *
0040 heartbeat_create(struct intel_context *ce, gfp_t gfp)
0041 {
0042     struct i915_request *rq;
0043 
0044     intel_context_enter(ce);
0045     rq = __i915_request_create(ce, gfp);
0046     intel_context_exit(ce);
0047 
0048     return rq;
0049 }
0050 
0051 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
0052 {
0053     engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
0054     i915_request_add_active_barriers(rq);
0055     if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
0056         engine->heartbeat.systole = i915_request_get(rq);
0057 }
0058 
0059 static void heartbeat_commit(struct i915_request *rq,
0060                  const struct i915_sched_attr *attr)
0061 {
0062     idle_pulse(rq->engine, rq);
0063 
0064     __i915_request_commit(rq);
0065     __i915_request_queue(rq, attr);
0066 }
0067 
0068 static void show_heartbeat(const struct i915_request *rq,
0069                struct intel_engine_cs *engine)
0070 {
0071     struct drm_printer p = drm_debug_printer("heartbeat");
0072 
0073     if (!rq) {
0074         intel_engine_dump(engine, &p,
0075                   "%s heartbeat not ticking\n",
0076                   engine->name);
0077     } else {
0078         intel_engine_dump(engine, &p,
0079                   "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
0080                   engine->name,
0081                   rq->fence.context,
0082                   rq->fence.seqno,
0083                   rq->sched.attr.priority);
0084     }
0085 }
0086 
0087 static void
0088 reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
0089 {
0090     if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
0091         show_heartbeat(rq, engine);
0092 
0093     if (intel_engine_uses_guc(engine))
0094         /*
0095          * GuC itself is toast or GuC's hang detection
0096          * is disabled. Either way, need to find the
0097          * hang culprit manually.
0098          */
0099         intel_guc_find_hung_context(engine);
0100 
0101     intel_gt_handle_error(engine->gt, engine->mask,
0102                   I915_ERROR_CAPTURE,
0103                   "stopped heartbeat on %s",
0104                   engine->name);
0105 }
0106 
0107 static void heartbeat(struct work_struct *wrk)
0108 {
0109     struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
0110     struct intel_engine_cs *engine =
0111         container_of(wrk, typeof(*engine), heartbeat.work.work);
0112     struct intel_context *ce = engine->kernel_context;
0113     struct i915_request *rq;
0114     unsigned long serial;
0115 
0116     /* Just in case everything has gone horribly wrong, give it a kick */
0117     intel_engine_flush_submission(engine);
0118 
0119     rq = engine->heartbeat.systole;
0120     if (rq && i915_request_completed(rq)) {
0121         i915_request_put(rq);
0122         engine->heartbeat.systole = NULL;
0123     }
0124 
0125     if (!intel_engine_pm_get_if_awake(engine))
0126         return;
0127 
0128     if (intel_gt_is_wedged(engine->gt))
0129         goto out;
0130 
0131     if (i915_sched_engine_disabled(engine->sched_engine)) {
0132         reset_engine(engine, engine->heartbeat.systole);
0133         goto out;
0134     }
0135 
0136     if (engine->heartbeat.systole) {
0137         long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
0138 
0139         /* Safeguard against too-fast worker invocations */
0140         if (!time_after(jiffies,
0141                 rq->emitted_jiffies + msecs_to_jiffies(delay)))
0142             goto out;
0143 
0144         if (!i915_sw_fence_signaled(&rq->submit)) {
0145             /*
0146              * Not yet submitted, system is stalled.
0147              *
0148              * This more often happens for ring submission,
0149              * where all contexts are funnelled into a common
0150              * ringbuffer. If one context is blocked on an
0151              * external fence, not only is it not submitted,
0152              * but all other contexts, including the kernel
0153              * context are stuck waiting for the signal.
0154              */
0155         } else if (engine->sched_engine->schedule &&
0156                rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
0157             /*
0158              * Gradually raise the priority of the heartbeat to
0159              * give high priority work [which presumably desires
0160              * low latency and no jitter] the chance to naturally
0161              * complete before being preempted.
0162              */
0163             attr.priority = 0;
0164             if (rq->sched.attr.priority >= attr.priority)
0165                 attr.priority = I915_PRIORITY_HEARTBEAT;
0166             if (rq->sched.attr.priority >= attr.priority)
0167                 attr.priority = I915_PRIORITY_BARRIER;
0168 
0169             local_bh_disable();
0170             engine->sched_engine->schedule(rq, &attr);
0171             local_bh_enable();
0172         } else {
0173             reset_engine(engine, rq);
0174         }
0175 
0176         rq->emitted_jiffies = jiffies;
0177         goto out;
0178     }
0179 
0180     serial = READ_ONCE(engine->serial);
0181     if (engine->wakeref_serial == serial)
0182         goto out;
0183 
0184     if (!mutex_trylock(&ce->timeline->mutex)) {
0185         /* Unable to lock the kernel timeline, is the engine stuck? */
0186         if (xchg(&engine->heartbeat.blocked, serial) == serial)
0187             intel_gt_handle_error(engine->gt, engine->mask,
0188                           I915_ERROR_CAPTURE,
0189                           "no heartbeat on %s",
0190                           engine->name);
0191         goto out;
0192     }
0193 
0194     rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
0195     if (IS_ERR(rq))
0196         goto unlock;
0197 
0198     heartbeat_commit(rq, &attr);
0199 
0200 unlock:
0201     mutex_unlock(&ce->timeline->mutex);
0202 out:
0203     if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
0204         i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
0205     intel_engine_pm_put(engine);
0206 }
0207 
0208 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
0209 {
0210     if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
0211         return;
0212 
0213     next_heartbeat(engine);
0214 }
0215 
0216 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
0217 {
0218     if (cancel_delayed_work(&engine->heartbeat.work))
0219         i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
0220 }
0221 
0222 void intel_gt_unpark_heartbeats(struct intel_gt *gt)
0223 {
0224     struct intel_engine_cs *engine;
0225     enum intel_engine_id id;
0226 
0227     for_each_engine(engine, gt, id)
0228         if (intel_engine_pm_is_awake(engine))
0229             intel_engine_unpark_heartbeat(engine);
0230 }
0231 
0232 void intel_gt_park_heartbeats(struct intel_gt *gt)
0233 {
0234     struct intel_engine_cs *engine;
0235     enum intel_engine_id id;
0236 
0237     for_each_engine(engine, gt, id)
0238         intel_engine_park_heartbeat(engine);
0239 }
0240 
0241 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
0242 {
0243     INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
0244 }
0245 
0246 static int __intel_engine_pulse(struct intel_engine_cs *engine)
0247 {
0248     struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
0249     struct intel_context *ce = engine->kernel_context;
0250     struct i915_request *rq;
0251 
0252     lockdep_assert_held(&ce->timeline->mutex);
0253     GEM_BUG_ON(!intel_engine_has_preemption(engine));
0254     GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
0255 
0256     rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
0257     if (IS_ERR(rq))
0258         return PTR_ERR(rq);
0259 
0260     __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
0261 
0262     heartbeat_commit(rq, &attr);
0263     GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
0264 
0265     return 0;
0266 }
0267 
0268 static unsigned long set_heartbeat(struct intel_engine_cs *engine,
0269                    unsigned long delay)
0270 {
0271     unsigned long old;
0272 
0273     old = xchg(&engine->props.heartbeat_interval_ms, delay);
0274     if (delay)
0275         intel_engine_unpark_heartbeat(engine);
0276     else
0277         intel_engine_park_heartbeat(engine);
0278 
0279     return old;
0280 }
0281 
0282 int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
0283                    unsigned long delay)
0284 {
0285     struct intel_context *ce = engine->kernel_context;
0286     int err = 0;
0287 
0288     if (!delay && !intel_engine_has_preempt_reset(engine))
0289         return -ENODEV;
0290 
0291     intel_engine_pm_get(engine);
0292 
0293     err = mutex_lock_interruptible(&ce->timeline->mutex);
0294     if (err)
0295         goto out_rpm;
0296 
0297     if (delay != engine->props.heartbeat_interval_ms) {
0298         unsigned long saved = set_heartbeat(engine, delay);
0299 
0300         /* recheck current execution */
0301         if (intel_engine_has_preemption(engine)) {
0302             err = __intel_engine_pulse(engine);
0303             if (err)
0304                 set_heartbeat(engine, saved);
0305         }
0306     }
0307 
0308     mutex_unlock(&ce->timeline->mutex);
0309 
0310 out_rpm:
0311     intel_engine_pm_put(engine);
0312     return err;
0313 }
0314 
0315 int intel_engine_pulse(struct intel_engine_cs *engine)
0316 {
0317     struct intel_context *ce = engine->kernel_context;
0318     int err;
0319 
0320     if (!intel_engine_has_preemption(engine))
0321         return -ENODEV;
0322 
0323     if (!intel_engine_pm_get_if_awake(engine))
0324         return 0;
0325 
0326     err = -EINTR;
0327     if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
0328         err = __intel_engine_pulse(engine);
0329         mutex_unlock(&ce->timeline->mutex);
0330     }
0331 
0332     intel_engine_flush_submission(engine);
0333     intel_engine_pm_put(engine);
0334     return err;
0335 }
0336 
0337 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
0338 {
0339     struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
0340     struct intel_context *ce = engine->kernel_context;
0341     struct i915_request *rq;
0342     int err;
0343 
0344     if (llist_empty(&engine->barrier_tasks))
0345         return 0;
0346 
0347     if (!intel_engine_pm_get_if_awake(engine))
0348         return 0;
0349 
0350     if (mutex_lock_interruptible(&ce->timeline->mutex)) {
0351         err = -EINTR;
0352         goto out_rpm;
0353     }
0354 
0355     rq = heartbeat_create(ce, GFP_KERNEL);
0356     if (IS_ERR(rq)) {
0357         err = PTR_ERR(rq);
0358         goto out_unlock;
0359     }
0360 
0361     heartbeat_commit(rq, &attr);
0362 
0363     err = 0;
0364 out_unlock:
0365     mutex_unlock(&ce->timeline->mutex);
0366 out_rpm:
0367     intel_engine_pm_put(engine);
0368     return err;
0369 }
0370 
0371 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
0372 #include "selftest_engine_heartbeat.c"
0373 #endif