drm/i915: Replace global_seqno with a hangcheck heartbeat seqno
authorChris Wilson <chris@chris-wilson.co.uk>
Tue, 26 Feb 2019 09:49:19 +0000 (09:49 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Tue, 26 Feb 2019 09:55:31 +0000 (09:55 +0000)
To determine whether an engine has 'stuck', we simply check whether or
not is still on the same seqno for several seconds. To keep this simple
mechanism intact over the loss of a global seqno, we can simply add a
new global heartbeat seqno instead. As we cannot know the sequence in
which requests will then be completed, we use a primitive random number
generator instead (with a cycle long enough to not matter over an
interval of a few thousand requests between hangcheck samples).

The alternative to using a dedicated seqno on every request is to issue
a heartbeat request and query its progress through the system. Sadly
this requires us to reduce struct_mutex so that we can issue requests
without requiring that bkl.

v2: And without the extra CS_STALL for the hangcheck seqno -- we don't
need strict serialisation with what comes later, we just need to be sure
we don't write the hangcheck seqno before our batch is flushed.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190226094922.31617-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/intel_engine_cs.c
drivers/gpu/drm/i915/intel_hangcheck.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 37175414ce892a50d49846326efd5e29c836ff3b..545091a5180b82578cf003fabec76bfac6ac157d 100644 (file)
@@ -1295,7 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
        with_intel_runtime_pm(dev_priv, wakeref) {
                for_each_engine(engine, dev_priv, id) {
                        acthd[id] = intel_engine_get_active_head(engine);
-                       seqno[id] = intel_engine_get_seqno(engine);
+                       seqno[id] = intel_engine_get_hangcheck_seqno(engine);
                }
 
                intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
@@ -1315,8 +1315,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
        for_each_engine(engine, dev_priv, id) {
                seq_printf(m, "%s:\n", engine->name);
                seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
-                          engine->hangcheck.seqno, seqno[id],
-                          intel_engine_last_submit(engine),
+                          engine->hangcheck.last_seqno,
+                          seqno[id],
+                          engine->hangcheck.next_seqno,
                           jiffies_to_msecs(jiffies -
                                            engine->hangcheck.action_timestamp));
 
index 81b80f8fd9ea8030491947348220f0f7079d8c9c..57bc5c4fb3ff29318f3d82e852f12faca9b583cc 100644 (file)
@@ -1497,10 +1497,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
        if (i915_reset_failed(engine->i915))
                drm_printf(m, "*** WEDGED ***\n");
 
-       drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
+       drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
                   intel_engine_get_seqno(engine),
                   intel_engine_last_submit(engine),
-                  engine->hangcheck.seqno,
+                  engine->hangcheck.last_seqno,
+                  engine->hangcheck.next_seqno,
                   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
        drm_printf(m, "\tReset count: %d (global %d)\n",
                   i915_reset_engine_count(error, engine),
index 9be033b6f4d2220dd18400e64320cc2d51d5f383..f1d8dfc58049a7c47eb6a3a155820647843155d3 100644 (file)
@@ -133,21 +133,21 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
                                  struct hangcheck *hc)
 {
        hc->acthd = intel_engine_get_active_head(engine);
-       hc->seqno = intel_engine_get_seqno(engine);
+       hc->seqno = intel_engine_get_hangcheck_seqno(engine);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
                                   const struct hangcheck *hc)
 {
        engine->hangcheck.acthd = hc->acthd;
-       engine->hangcheck.seqno = hc->seqno;
+       engine->hangcheck.last_seqno = hc->seqno;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
                     const struct hangcheck *hc)
 {
-       if (engine->hangcheck.seqno != hc->seqno)
+       if (engine->hangcheck.last_seqno != hc->seqno)
                return ENGINE_ACTIVE_SEQNO;
 
        if (intel_engine_is_idle(engine))
index 34a0866959c5799dd96604ca748bc803c0d323c0..0516fc6b965284277e60d2b6108480fa975b47b9 100644 (file)
@@ -178,6 +178,12 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
                I915_GEM_HWS_INDEX_ADDR);
 }
 
+static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+       return (i915_ggtt_offset(engine->status_page.vma) +
+               I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
        return rb_entry(rb, struct i915_priolist, node);
@@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
                                  request->fence.seqno,
                                  request->timeline->hwsp_offset);
 
+       cs = gen8_emit_ggtt_write(cs,
+                                 intel_engine_next_hangcheck_seqno(request->engine),
+                                 intel_hws_hangcheck_address(request->engine));
+
        cs = gen8_emit_ggtt_write(cs,
                                  request->global_seqno,
                                  intel_hws_seqno_address(request->engine));
@@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
                                      PIPE_CONTROL_FLUSH_ENABLE |
                                      PIPE_CONTROL_CS_STALL);
 
+       cs = gen8_emit_ggtt_write_rcs(cs,
+                                     intel_engine_next_hangcheck_seqno(request->engine),
+                                     intel_hws_hangcheck_address(request->engine),
+                                     0);
+
        cs = gen8_emit_ggtt_write_rcs(cs,
                                      request->global_seqno,
                                      intel_hws_seqno_address(request->engine),
index 7f841dba87b3026893b2cb330168cb74246d5804..870184bbd1690c8d80e1a063b8c5c7cedfcdb278 100644 (file)
  */
 #define LEGACY_REQUEST_SIZE 200
 
+static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+       return (i915_ggtt_offset(engine->status_page.vma) +
+               I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
 static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 {
        return (i915_ggtt_offset(engine->status_page.vma) +
@@ -316,6 +322,11 @@ static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = GFX_OP_PIPE_CONTROL(4);
+       *cs++ = PIPE_CONTROL_QW_WRITE;
+       *cs++ = hws_hangcheck_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
        *cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
@@ -422,6 +433,11 @@ static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = rq->timeline->hwsp_offset;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = GFX_OP_PIPE_CONTROL(4);
+       *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB;
+       *cs++ = hws_hangcheck_address(rq->engine);
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_GLOBAL_GTT_IVB |
@@ -447,12 +463,15 @@ static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+       *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->global_seqno;
 
        *cs++ = MI_USER_INTERRUPT;
-       *cs++ = MI_NOOP;
 
        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);
@@ -472,6 +491,10 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+       *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->global_seqno;
@@ -487,6 +510,7 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = 0;
 
        *cs++ = MI_USER_INTERRUPT;
+       *cs++ = MI_NOOP;
 
        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);
@@ -930,11 +954,16 @@ static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = I915_GEM_HWS_SEQNO_ADDR;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = MI_STORE_DWORD_INDEX;
+       *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        *cs++ = MI_STORE_DWORD_INDEX;
        *cs++ = I915_GEM_HWS_INDEX_ADDR;
        *cs++ = rq->global_seqno;
 
        *cs++ = MI_USER_INTERRUPT;
+       *cs++ = MI_NOOP;
 
        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);
@@ -956,6 +985,10 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        *cs++ = I915_GEM_HWS_SEQNO_ADDR;
        *cs++ = rq->fence.seqno;
 
+       *cs++ = MI_STORE_DWORD_INDEX;
+       *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+       *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
        BUILD_BUG_ON(GEN5_WA_STORES < 1);
        for (i = 0; i < GEN5_WA_STORES; i++) {
                *cs++ = MI_STORE_DWORD_INDEX;
@@ -964,7 +997,6 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
        }
 
        *cs++ = MI_USER_INTERRUPT;
-       *cs++ = MI_NOOP;
 
        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);
index 5d45ad4ecca9039817c26abe5c2369c91bf39089..2869aaa9d22597a58e9473d8a05356f93ae180a6 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <linux/hashtable.h>
 #include <linux/irq_work.h>
+#include <linux/random.h>
 #include <linux/seqlock.h>
 
 #include "i915_gem_batch_pool.h"
@@ -119,7 +120,8 @@ struct intel_instdone {
 
 struct intel_engine_hangcheck {
        u64 acthd;
-       u32 seqno;
+       u32 last_seqno;
+       u32 next_seqno;
        unsigned long action_timestamp;
        struct intel_instdone instdone;
 };
@@ -726,6 +728,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_INDEX_ADDR                (I915_GEM_HWS_INDEX * sizeof(u32))
 #define I915_GEM_HWS_PREEMPT           0x32
 #define I915_GEM_HWS_PREEMPT_ADDR      (I915_GEM_HWS_PREEMPT * sizeof(u32))
+#define I915_GEM_HWS_HANGCHECK         0x34
+#define I915_GEM_HWS_HANGCHECK_ADDR    (I915_GEM_HWS_HANGCHECK * sizeof(u32))
 #define I915_GEM_HWS_SEQNO             0x40
 #define I915_GEM_HWS_SEQNO_ADDR                (I915_GEM_HWS_SEQNO * sizeof(u32))
 #define I915_GEM_HWS_SCRATCH           0x80
@@ -1086,4 +1090,17 @@ static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists)
 
 #endif
 
+static inline u32
+intel_engine_next_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+       return engine->hangcheck.next_seqno =
+               next_pseudo_random32(engine->hangcheck.next_seqno);
+}
+
+static inline u32
+intel_engine_get_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+       return intel_read_status_page(engine, I915_GEM_HWS_HANGCHECK);
+}
+
 #endif /* _INTEL_RINGBUFFER_H_ */