drm/i915/selftests: Flush old resets between engines
authorChris Wilson <chris@chris-wilson.co.uk>
Mon, 5 Feb 2018 15:24:28 +0000 (15:24 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Mon, 5 Feb 2018 15:27:23 +0000 (15:27 +0000)
When injecting rapid resets, we must be careful to at least wait for the
previous reset to have taken effect and the engine restarted. If we
perform a second reset before that has happened, we will notice that the
engine hasn't recovered and declare it lost, wedging the device and
failing. In practice, since we wait for each hanging batch to start
before injecting the reset, this too-fast-reset condition can only be
triggered when moving onto the next engine in the test, so we need only
wait for the existing reset to complete before switching engines.

v2: Wrap up the wait inside a safety net to bail out in case of angry hw.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180205152431.12163-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/selftests/intel_hangcheck.c

index d1f91a533afa14b708d2487ec731f27ddd7855b7..a42c539c1efeb097e1317a0567a0938ad62c5fde 100644 (file)
@@ -244,6 +244,58 @@ static u32 hws_seqno(const struct hang *h,
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 }
 
+struct wedge_me {
+       struct delayed_work work;
+       struct drm_i915_private *i915;
+       const void *symbol;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+       struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+       pr_err("%pS timed out, cancelling all further testing.\n",
+              w->symbol);
+       i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+                        struct drm_i915_private *i915,
+                        long timeout,
+                        const void *symbol)
+{
+       w->i915 = i915;
+       w->symbol = symbol;
+
+       INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+       schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+       cancel_delayed_work_sync(&w->work);
+       destroy_delayed_work_on_stack(&w->work);
+       w->i915 = NULL;
+}
+
+#define wedge_on_timeout(W, DEV, TIMEOUT)                              \
+       for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
+            (W)->i915;                                                 \
+            __fini_wedge((W)))
+
+static noinline int
+flush_test(struct drm_i915_private *i915, unsigned int flags)
+{
+       struct wedge_me w;
+
+       cond_resched();
+
+       wedge_on_timeout(&w, i915, HZ)
+               i915_gem_wait_for_idle(i915, flags);
+
+       return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+}
+
 static void hang_fini(struct hang *h)
 {
        *h->batch = MI_BATCH_BUFFER_END;
@@ -255,7 +307,7 @@ static void hang_fini(struct hang *h)
        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);
 
-       i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
+       flush_test(h->i915, I915_WAIT_LOCKED);
 }
 
 static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
@@ -487,7 +539,9 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
                if (err)
                        break;
 
-               cond_resched();
+               err = flush_test(i915, 0);
+               if (err)
+                       break;
        }
 
        if (i915_terminally_wedged(&i915->gpu_error))
@@ -726,7 +780,9 @@ unwind:
                if (err)
                        break;
 
-               cond_resched();
+               err = flush_test(i915, 0);
+               if (err)
+                       break;
        }
 
        if (i915_terminally_wedged(&i915->gpu_error))
@@ -952,6 +1008,10 @@ static int igt_reset_queue(void *arg)
                i915_gem_chipset_flush(i915);
 
                i915_gem_request_put(prev);
+
+               err = flush_test(i915, I915_WAIT_LOCKED);
+               if (err)
+                       break;
        }
 
 fini: