drm/i915/execlists: Move reset_active() from schedule-out to schedule-in
authorChris Wilson <chris@chris-wilson.co.uk>
Mon, 11 Nov 2019 13:32:05 +0000 (13:32 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Mon, 11 Nov 2019 16:37:05 +0000 (16:37 +0000)
The gem_ctx_persistence/smoketest was detecting an odd coherency issue
inside the LRC context image; that the address of the ring buffer did
not match our associated struct intel_ring. As we set the address into
the context image when we pin the ring buffer into place before the
context is active, that leaves the question of where did it get
overwritten. Either the HW context save occurred after our pin which
would imply that our idle barriers are broken, or we overwrote the
context image ourselves. It is only in reset_active() where we dabble
inside the context image outside of a serialised path from schedule-out;
but we could equally perform the operation inside schedule-in which is
then fully serialised with the context pin -- and remains serialised by
the engine pulse with kill_context(). (The only downside, aside from
doing more work inside the engine->active.lock, was the plan to merge
all the reset paths into doing their context scrubbing on schedule-out
needs more thought.)

Fixes: d12acee84ffb ("drm/i915/execlists: Cancel banned contexts on schedule-out")
Testcase: igt/gem_ctx_persistence/smoketest
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191111133205.11590-3-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/gt/intel_lrc.c

index e57345795c083e167ff2729d970dee5a9a25a032..33ce258d484f72afdf471ae4c1e206aae9d934de 100644 (file)
@@ -1042,6 +1042,59 @@ execlists_check_context(const struct intel_context *ce,
        WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
 }
 
+static void restore_default_state(struct intel_context *ce,
+                                 struct intel_engine_cs *engine)
+{
+       u32 *regs = ce->lrc_reg_state;
+
+       if (engine->pinned_default_state)
+               memcpy(regs, /* skip restoring the vanilla PPHWSP */
+                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+                      engine->context_size - PAGE_SIZE);
+
+       execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+}
+
+static void reset_active(struct i915_request *rq,
+                        struct intel_engine_cs *engine)
+{
+       struct intel_context * const ce = rq->hw_context;
+       u32 head;
+
+       /*
+        * The executing context has been cancelled. We want to prevent
+        * further execution along this context and propagate the error on
+        * to anything depending on its results.
+        *
+        * In __i915_request_submit(), we apply the -EIO and remove the
+        * requests' payloads for any banned requests. But first, we must
+        * rewind the context back to the start of the incomplete request so
+        * that we do not jump back into the middle of the batch.
+        *
+        * We preserve the breadcrumbs and semaphores of the incomplete
+        * requests so that inter-timeline dependencies (i.e other timelines)
+        * remain correctly ordered. And we defer to __i915_request_submit()
+        * so that all asynchronous waits are correctly handled.
+        */
+       GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
+                 __func__, engine->name, rq->fence.context, rq->fence.seqno);
+
+       /* On resubmission of the active request, payload will be scrubbed */
+       if (i915_request_completed(rq))
+               head = rq->tail;
+       else
+               head = active_request(ce->timeline, rq)->head;
+       ce->ring->head = intel_ring_wrap(ce->ring, head);
+       intel_ring_update_space(ce->ring);
+
+       /* Scrub the context image to prevent replaying the previous batch */
+       restore_default_state(ce, engine);
+       __execlists_update_reg_state(ce, engine);
+
+       /* We've switched away, so this should be a no-op, but intent matters */
+       ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
+}
+
 static inline struct intel_engine_cs *
 __execlists_schedule_in(struct i915_request *rq)
 {
@@ -1050,8 +1103,11 @@ __execlists_schedule_in(struct i915_request *rq)
 
        intel_context_get(ce);
 
+       if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
+               reset_active(rq, engine);
+
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
-               execlists_check_context(ce, rq->engine);
+               execlists_check_context(ce, engine);
 
        if (ce->tag) {
                /* Use a fixed tag for OA and friends */
@@ -1102,72 +1158,22 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
                tasklet_schedule(&ve->base.execlists.tasklet);
 }
 
-static void restore_default_state(struct intel_context *ce,
-                                 struct intel_engine_cs *engine)
-{
-       u32 *regs = ce->lrc_reg_state;
-
-       if (engine->pinned_default_state)
-               memcpy(regs, /* skip restoring the vanilla PPHWSP */
-                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
-                      engine->context_size - PAGE_SIZE);
-
-       execlists_init_reg_state(regs, ce, engine, ce->ring, false);
-}
-
-static void reset_active(struct i915_request *rq,
-                        struct intel_engine_cs *engine)
-{
-       struct intel_context * const ce = rq->hw_context;
-       u32 head;
-
-       /*
-        * The executing context has been cancelled. We want to prevent
-        * further execution along this context and propagate the error on
-        * to anything depending on its results.
-        *
-        * In __i915_request_submit(), we apply the -EIO and remove the
-        * requests' payloads for any banned requests. But first, we must
-        * rewind the context back to the start of the incomplete request so
-        * that we do not jump back into the middle of the batch.
-        *
-        * We preserve the breadcrumbs and semaphores of the incomplete
-        * requests so that inter-timeline dependencies (i.e other timelines)
-        * remain correctly ordered. And we defer to __i915_request_submit()
-        * so that all asynchronous waits are correctly handled.
-        */
-       GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
-                 __func__, engine->name, rq->fence.context, rq->fence.seqno);
-
-       /* On resubmission of the active request, payload will be scrubbed */
-       if (i915_request_completed(rq))
-               head = rq->tail;
-       else
-               head = active_request(ce->timeline, rq)->head;
-       ce->ring->head = intel_ring_wrap(ce->ring, head);
-       intel_ring_update_space(ce->ring);
-
-       /* Scrub the context image to prevent replaying the previous batch */
-       restore_default_state(ce, engine);
-       __execlists_update_reg_state(ce, engine);
-
-       /* We've switched away, so this should be a no-op, but intent matters */
-       ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
-}
-
 static inline void
 __execlists_schedule_out(struct i915_request *rq,
                         struct intel_engine_cs * const engine)
 {
        struct intel_context * const ce = rq->hw_context;
 
+       /*
+        * NB process_csb() is not under the engine->active.lock and hence
+        * schedule_out can race with schedule_in meaning that we should
+        * refrain from doing non-trivial work here.
+        */
+
        intel_engine_context_out(engine);
        execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
        intel_gt_pm_put(engine->gt);
 
-       if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
-               reset_active(rq, engine);
-
        /*
         * If this is part of a virtual engine, its next request may
         * have been blocked waiting for access to the active context.