drm/i915: Don't claim an unstarted request was guilty
author Chris Wilson <chris@chris-wilson.co.uk>
Fri, 8 Feb 2019 15:37:08 +0000
committer Chris Wilson <chris@chris-wilson.co.uk>
Fri, 8 Feb 2019 16:47:40 +0000
If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.
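
For reference, "started" means the request's initial breadcrumb has
executed: the payload is preceded by a breadcrumb that writes
rq->fence.seqno - 1 (see gen8_emit_init_breadcrumb below). A rough
sketch of the check, assuming the contemporaneous hwsp_seqno() and
i915_seqno_passed() helpers (the exact definition lives in
i915_request.h and may differ):

	static inline bool i915_request_started(const struct i915_request *rq)
	{
		/* Started, but may since have been preempted! */
		return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
	}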

v2: Check for context corruption before trying to restart.
v3: Preserve semaphores on skipping requests (need to keep the timelines
intact).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-7-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/selftests/igt_spinner.c
drivers/gpu/drm/i915/selftests/intel_hangcheck.c

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9df4a04950bdae683f8b67e2c4e7dd..1b567a3f006a14fc418001a71735e5baac52f4dc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
        *cs++ = rq->fence.seqno - 1;
 
        intel_ring_advance(rq, cs);
+
+       /* Record the updated position of the request's payload */
+       rq->infix = intel_ring_offset(rq, cs);
+
        return 0;
 }
 
@@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+       const struct intel_ring *ring = rq->ring;
+       const u32 *regs = rq->hw_context->lrc_reg_state;
+
+       /* Quick spot check for the common signs of context corruption */
+
+       if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+           (RING_CTL_SIZE(ring->size) | RING_VALID))
+               return false;
+
+       if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+               return false;
+
+       return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
        if (!rq)
                goto out_unlock;
 
+       /*
+        * If this request hasn't started yet, e.g. it is waiting on a
+        * semaphore, we need to avoid skipping the request or else we
+        * break the signaling chain. However, if the context is corrupt
+        * the request will not restart and we will be stuck with a wedged
+        * device. It is quite often the case that a reset issued while
+        * the GPU is loading the context image leaves that image
+        * corrupt.
+        *
+        * Otherwise, if we have not started yet, the request should replay
+        * perfectly and we do not need to flag the result as being erroneous.
+        */
+       if (!i915_request_started(rq) && lrc_regs_ok(rq))
+               goto out_unlock;
+
        /*
         * If the request was innocent, we leave the request in the ELSP
         * and will try to replay it on restarting. The context image may
@@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
         * image back to the expected values to skip over the guilty request.
         */
        i915_reset_request(rq, stalled);
-       if (!stalled)
+       if (!stalled && lrc_regs_ok(rq))
                goto out_unlock;
 
        /*
@@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
                       engine->context_size - PAGE_SIZE);
        }
 
-       /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-       rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+       /* Rerun the request; its payload has been neutered (if guilty). */
+       rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
        intel_ring_update_space(rq->ring);
 
        execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
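
An aside on the rewind above: ring offsets wrap at the ring size,
which is a power of two, so a minimal sketch of intel_ring_wrap()
could look as follows (assuming the usual mask-based definition; the
helper itself is not part of this patch):

	static inline u32 intel_ring_wrap(const struct intel_ring *ring, u32 pos)
	{
		/* Ring sizes are powers of two, so wrapping reduces to a mask */
		return pos & (ring->size - 1);
	}

Replaying from rq->head, rather than skipping ahead to rq->postfix as
before, keeps the semaphores and breadcrumbs in the ring intact (the
v3 point above); if the request was guilty, the reset has already
neutered its payload, so the replay completes harmlessly.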
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e8743f2b8db772de4c9d9f7f45465..d0b93a3fbc5455b43b09e0721f7dbe1f2ca3698e 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -144,6 +144,13 @@ igt_spinner_create_request(struct igt_spinner *spin,
 
        i915_gem_chipset_flush(spin->i915);
 
+       if (engine->emit_init_breadcrumb &&
+           rq->timeline->has_initial_breadcrumb) {
+               err = engine->emit_init_breadcrumb(rq);
+               if (err)
+                       goto cancel_rq;
+       }
+
        err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
 cancel_rq:
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac126289077f8e3c34ba91358cba9458180..92475596ff40ab6946eb321cd338c9f1a60ccd57 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -242,6 +242,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        i915_gem_chipset_flush(h->i915);
 
+       if (rq->engine->emit_init_breadcrumb) {
+               err = rq->engine->emit_init_breadcrumb(rq);
+               if (err)
+                       goto cancel_rq;
+       }
+
        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;
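
Both selftests now follow essentially the same pattern: emit the
initial breadcrumb, where the engine provides one, before jumping to
the batch, so that the deliberately hanging request reads as started
and can be blamed by the reset instead of being silently replayed. A
hypothetical helper capturing that pattern (the name is illustrative
only, not from the patch):

	static int igt_emit_init_breadcrumb(struct i915_request *rq)
	{
		/* Mark the payload as started so a reset may judge guilt */
		if (rq->engine->emit_init_breadcrumb &&
		    rq->timeline->has_initial_breadcrumb)
			return rq->engine->emit_init_breadcrumb(rq);

		return 0;
	}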