drm/i915: Declare the driver wedged if hangcheck makes no progress
author Chris Wilson <chris@chris-wilson.co.uk>
Sat, 2 Jun 2018 10:48:53 +0000 (11:48 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Thu, 14 Jun 2018 18:20:33 +0000 (19:20 +0100)
Hangcheck is our backup in case the GPU or the driver gets stuck. It
detects when the GPU is not making any progress and issues a GPU reset.
However, if the driver is failing to make any progress, we can get
ourselves into a situation where we continually try resetting the GPU to
no avail. Employ a second timeout such that if we continue to see the
same seqno (the stalled engine has made no progress at all) over the
course of several hangchecks, declare the driver wedged and attempt to
start afresh.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180602104853.17140-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/intel_hangcheck.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 948b973af067815591e31fdbe086201562deb384..99d3272d82d805a0a237ee614e24de9d8e35227b 100644 (file)
@@ -1359,11 +1359,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
                seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
                           engine->hangcheck.seqno, seqno[id],
                           intel_engine_last_submit(engine));
-               seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
+               seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
                           yesno(intel_engine_has_waiter(engine)),
                           yesno(test_bit(engine->id,
                                          &dev_priv->gpu_error.missed_irq_rings)),
-                          yesno(engine->hangcheck.stalled));
+                          yesno(engine->hangcheck.stalled),
+                          yesno(engine->hangcheck.wedged));
 
                spin_lock_irq(&b->rb_lock);
                for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
index 19defe73b1566addbed6c30be53463f2d76c06fb..74dd88d8563e698fd3446ae4e48a34964bf78b5a 100644 (file)
@@ -1006,6 +1006,8 @@ struct i915_gem_mm {
 #define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
 #define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
 
+#define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
+
 enum modeset_restore {
        MODESET_ON_LID_OPEN,
        MODESET_DONE,
index d47e346bd49e97fc45e9aa0cb15563bbdb28c2fd..2fc7a0dd0df9b2bc7a88814f75d98334bfd4d4e9 100644 (file)
@@ -294,6 +294,7 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
        engine->hangcheck.seqno = hc->seqno;
        engine->hangcheck.action = hc->action;
        engine->hangcheck.stalled = hc->stalled;
+       engine->hangcheck.wedged = hc->wedged;
 }
 
 static enum intel_engine_hangcheck_action
@@ -368,6 +369,9 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 
        hc->stalled = time_after(jiffies,
                                 engine->hangcheck.action_timestamp + timeout);
+       hc->wedged = time_after(jiffies,
+                                engine->hangcheck.action_timestamp +
+                                I915_ENGINE_WEDGED_TIMEOUT);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -409,7 +413,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                             gpu_error.hangcheck_work.work);
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
-       unsigned int hung = 0, stuck = 0;
+       unsigned int hung = 0, stuck = 0, wedged = 0;
 
        if (!i915_modparams.enable_hangcheck)
                return;
@@ -440,6 +444,17 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                        if (hc.action != ENGINE_DEAD)
                                stuck |= intel_engine_flag(engine);
                }
+
+               if (engine->hangcheck.wedged)
+                       wedged |= intel_engine_flag(engine);
+       }
+
+       if (wedged) {
+               dev_err(dev_priv->drm.dev,
+                       "GPU recovery timed out,"
+                       " cancelling all in-flight rendering.\n");
+               GEM_TRACE_DUMP();
+               i915_gem_set_wedged(dev_priv);
        }
 
        if (hung)
index 2c35dd3525a6e40748f17e7bb73e85ce32a69d91..4003f3ebe3d1b3132903aa6e0a9165307faca376 100644 (file)
@@ -122,7 +122,8 @@ struct intel_engine_hangcheck {
        int deadlock;
        struct intel_instdone instdone;
        struct i915_request *active_request;
-       bool stalled;
+       bool stalled:1;
+       bool wedged:1;
 };
 
 struct intel_ring {