From 12471ba87a08bd1dd0aac18015d7782e02ea02de Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 9 Apr 2016 10:57:55 +0100 Subject: [PATCH] drm/i915: Harden detection of missed interrupts Only declare a missed interrupt if we find that the GPU is idle with waiters and a hangcheck interval has passed in which no new user interrupts have been raised. v2: Clear the stuck interrupt marker between successful batches Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1460195877-20520-3-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 11 ++++--- drivers/gpu/drm/i915/i915_irq.c | 38 +++++++++++++++++-------- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++ 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 919c05ba9932..9640738aabf2 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data) static void i915_ring_seqno_info(struct seq_file *m, struct intel_engine_cs *engine) { - if (engine->get_seqno) { - seq_printf(m, "Current sequence (%s): %x\n", - engine->name, engine->get_seqno(engine)); - } + seq_printf(m, "Current sequence (%s): %x\n", + engine->name, engine->get_seqno(engine)); + seq_printf(m, "Current user interrupts (%s): %x\n", + engine->name, READ_ONCE(engine->user_interrupts)); } static int i915_gem_seqno_info(struct seq_file *m, void *data) @@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) engine->hangcheck.seqno, seqno[id], engine->last_submitted_seqno); + seq_printf(m, "\tuser interrupts = %x [current %x]\n", + engine->hangcheck.user_interrupts, + READ_ONCE(engine->user_interrupts)); seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", (long long)engine->hangcheck.acthd, (long long)acthd[id]); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 3b946e1c7614..679f08c944ef 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine) return; trace_i915_gem_request_notify(engine); + engine->user_interrupts++; wake_up_all(&engine->irq_queue); } @@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd) return HANGCHECK_HUNG; } +static unsigned kick_waiters(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = to_i915(engine->dev); + unsigned user_interrupts = READ_ONCE(engine->user_interrupts); + + if (engine->hangcheck.user_interrupts == user_interrupts && + !test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings)) { + if (!(i915->gpu_error.test_irq_rings & intel_engine_flag(engine))) + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", + engine->name); + else + DRM_INFO("Fake missed irq on %s\n", + engine->name); + wake_up_all(&engine->irq_queue); + } + + return user_interrupts; +} /* * This is called when the chip hasn't reported back with completed * batchbuffers in a long time. We keep track per ring seqno progress and @@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) for_each_engine_id(engine, dev_priv, id) { u64 acthd; u32 seqno; + unsigned user_interrupts; bool busy = true; semaphore_clear_deadlocks(dev_priv); @@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work) acthd = intel_ring_get_active_head(engine); seqno = engine->get_seqno(engine); + /* Reset stuck interrupts between batch advances */ + user_interrupts = 0; + if (engine->hangcheck.seqno == seqno) { if (ring_idle(engine, seqno)) { engine->hangcheck.action = HANGCHECK_IDLE; - if (waitqueue_active(&engine->irq_queue)) { - /* Issue a wake-up to catch stuck h/w. */ - if (!test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings)) { - if (!(dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine))) - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", - engine->name); - else - DRM_INFO("Fake missed irq on %s\n", - engine->name); - wake_up_all(&engine->irq_queue); - } /* Safeguard against driver failure */ + user_interrupts = kick_waiters(engine); engine->hangcheck.score += BUSY; } else busy = false; @@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) engine->hangcheck.score = 0; /* Clear head and subunit states on seqno movement */ - engine->hangcheck.acthd = 0; + acthd = 0; memset(engine->hangcheck.instdone, 0, sizeof(engine->hangcheck.instdone)); @@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) engine->hangcheck.seqno = seqno; engine->hangcheck.acthd = acthd; + engine->hangcheck.user_interrupts = user_interrupts; busy_count += busy; } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 3f04906a081f..29c54cc1ee5c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action { struct intel_ring_hangcheck { u64 acthd; u32 seqno; + unsigned user_interrupts; int score; enum intel_ring_hangcheck_action action; int deadlock; @@ -305,6 +306,7 @@ struct intel_engine_cs { * inspecting request list. */ u32 last_submitted_seqno; + unsigned user_interrupts; bool gpu_caches_dirty; -- 2.30.2