drm/i915: Provide a timeout to i915_gem_wait_for_idle()
author Chris Wilson <chris@chris-wilson.co.uk>
Mon, 9 Jul 2018 12:20:42 +0000 (13:20 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Mon, 9 Jul 2018 12:55:41 +0000 (13:55 +0100)
Usually we have no idea about the upper bound we need to wait to catch
up with userspace when idling the device, but in a few situations we
know the system was idle beforehand and can provide a short timeout in
order to very quickly catch a failure, long before hangcheck kicks in.

In the following patches, we will use the timeout to curtail two overly
long waits, where we know we can expect the GPU to complete within a
reasonable time or declare it broken.

In particular, with a broken GPU we expect it to fail during the initial
GPU setup where we do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. This is a case where any GPU hang is
unacceptable, both from a timeliness and practical standpoint.

The other improvement is that in selftests, we do not need to arm an
independent timer to inject a wedge, as we can just limit the timeout on
the wait directly.

v2: Include the timeout parameter in the trace.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180709122044.7028-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_evict.c
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_gem_shrinker.c
drivers/gpu/drm/i915/i915_perf.c
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/selftests/i915_gem_context.c
drivers/gpu/drm/i915/selftests/i915_request.c
drivers/gpu/drm/i915/selftests/igt_flush_test.c

index 544e5e7f011fb7b3b9d9f6475af6431db93f6c85..099f97ef2303905539d59d6d80c6bf53923c29d9 100644 (file)
@@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
 
        err = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_LOCKED |
-                                    I915_WAIT_INTERRUPTIBLE);
+                                    I915_WAIT_INTERRUPTIBLE,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                goto err_unlock;
 
@@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
                if (val & DROP_ACTIVE)
                        ret = i915_gem_wait_for_idle(dev_priv,
                                                     I915_WAIT_INTERRUPTIBLE |
-                                                    I915_WAIT_LOCKED);
+                                                    I915_WAIT_LOCKED,
+                                                    MAX_SCHEDULE_TIMEOUT);
 
                if (val & DROP_RETIRE)
                        i915_retire_requests(dev_priv);
index c790081777083c402948c5346b02f92b6f10c1ec..fcb8f49a9b8a7f114a0f7dfd11ba2acbcec3fe2b 100644 (file)
@@ -3157,7 +3157,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
 void i915_gem_fini(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
-                          unsigned int flags);
+                          unsigned int flags, long timeout);
 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
 void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
index 1a9dab302433ecd0cb9ceeb0081dd816d3c7dc6b..91d705a67d389ada8fcf5e471bbaf4e124d10bc0 100644 (file)
@@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
 
        /* Attempt to reap some mmap space from dead objects */
        do {
-               err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
+               err = i915_gem_wait_for_idle(dev_priv,
+                                            I915_WAIT_INTERRUPTIBLE,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (err)
                        break;
 
@@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
        return ret;
 }
 
-static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
+static long wait_for_timeline(struct i915_timeline *tl,
+                             unsigned int flags, long timeout)
 {
        struct i915_request *rq;
-       long ret;
 
        rq = i915_gem_active_get_unlocked(&tl->last_request);
        if (!rq)
-               return 0;
+               return timeout;
 
        /*
         * "Race-to-idle".
@@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
        if (flags & I915_WAIT_FOR_IDLE_BOOST)
                gen6_rps_boost(rq, NULL);
 
-       ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
+       timeout = i915_request_wait(rq, flags, timeout);
        i915_request_put(rq);
 
-       return ret < 0 ? ret : 0;
+       return timeout;
 }
 
 static int wait_for_engines(struct drm_i915_private *i915)
@@ -3782,10 +3784,12 @@ static int wait_for_engines(struct drm_i915_private *i915)
        return 0;
 }
 
-int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
+int i915_gem_wait_for_idle(struct drm_i915_private *i915,
+                          unsigned int flags, long timeout)
 {
-       GEM_TRACE("flags=%x (%s)\n",
-                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
+       GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
+                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
+                 timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
 
        /* If the device is asleep, we have no requests outstanding */
        if (!READ_ONCE(i915->gt.awake))
@@ -3798,9 +3802,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
                lockdep_assert_held(&i915->drm.struct_mutex);
 
                list_for_each_entry(tl, &i915->gt.timelines, link) {
-                       err = wait_for_timeline(tl, flags);
-                       if (err)
-                               return err;
+                       timeout = wait_for_timeline(tl, flags, timeout);
+                       if (timeout < 0)
+                               return timeout;
                }
 
                err = wait_for_engines(i915);
@@ -3812,12 +3816,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
        } else {
                struct intel_engine_cs *engine;
                enum intel_engine_id id;
-               int err;
 
                for_each_engine(engine, i915, id) {
-                       err = wait_for_timeline(&engine->timeline, flags);
-                       if (err)
-                               return err;
+                       struct i915_timeline *tl = &engine->timeline;
+
+                       timeout = wait_for_timeline(tl, flags, timeout);
+                       if (timeout < 0)
+                               return timeout;
                }
        }
 
@@ -5052,7 +5057,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
                ret = i915_gem_wait_for_idle(dev_priv,
                                             I915_WAIT_INTERRUPTIBLE |
                                             I915_WAIT_LOCKED |
-                                            I915_WAIT_FOR_IDLE_BOOST);
+                                            I915_WAIT_FOR_IDLE_BOOST,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (ret && ret != -EIO)
                        goto err_unlock;
 
@@ -5356,7 +5362,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
        if (err)
                goto err_active;
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                goto err_active;
 
@@ -5421,7 +5429,9 @@ err_active:
        if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
                goto out_ctx;
 
-       if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
+       if (WARN_ON(i915_gem_wait_for_idle(i915,
+                                          I915_WAIT_LOCKED,
+                                          MAX_SCHEDULE_TIMEOUT)))
                goto out_ctx;
 
        i915_gem_contexts_lost(i915);
index 54814a196ee4d3fe99d16a72083a926595eda6d6..02b83a5ed96c9ec7b539bec4bdc88ed3ac1946cd 100644 (file)
@@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
 
        err = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_INTERRUPTIBLE |
-                                    I915_WAIT_LOCKED);
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                return err;
 
index 4db31aaaa9d393a4ebd73d0b44a684b1661fc235..210baf3c8d11294ad5043d2ef098d1caceb8b2de 100644 (file)
@@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
        struct i915_ggtt *ggtt = &dev_priv->ggtt;
 
        if (unlikely(ggtt->do_idle_maps)) {
-               if (i915_gem_wait_for_idle(dev_priv, 0)) {
+               if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
                        DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
                        /* Wait a bit, in hopes it avoids the hang */
                        udelay(10);
index 55e84e71f526d23bb34a71f1db60cd864e151e6c..c61f5b80fee3a15c83dab2d8aa037921c33af4a8 100644 (file)
@@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
         * we will free as much as we can and hope to get a second chance.
         */
        if (flags & I915_SHRINK_ACTIVE)
-               i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+               i915_gem_wait_for_idle(i915,
+                                      I915_WAIT_LOCKED,
+                                      MAX_SCHEDULE_TIMEOUT);
 
        trace_i915_gem_shrink(i915, target, flags);
        i915_retire_requests(i915);
@@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
        unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
 
        do {
-               if (i915_gem_wait_for_idle(i915, 0) == 0 &&
+               if (i915_gem_wait_for_idle(i915,
+                                          0, MAX_SCHEDULE_TIMEOUT) == 0 &&
                    shrinker_lock(i915, unlock))
                        break;
 
@@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
                return NOTIFY_DONE;
 
        /* Force everything onto the inactive lists */
-       ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       ret = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto out;
 
index 447407fee3b89dc5f24c66dda9c626e8120d98e5..6bf10952c7240363fdcd2df907979ef9aef0d247 100644 (file)
@@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
         * So far the best way to work around this issue seems to be draining
         * the GPU from any submitted work.
         */
-       ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
+       ret = i915_gem_wait_for_idle(dev_priv,
+                                    wait_flags,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto out;
 
index 3248369dbcfb248a9ff8478a09f1850d050a31d1..5c2c93cbab12f8ebff29507a00953a24a9a877c6 100644 (file)
@@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
        /* Carefully retire all requests without writing to the rings */
        ret = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_INTERRUPTIBLE |
-                                    I915_WAIT_LOCKED);
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                return ret;
 
@@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
                /* Ratelimit ourselves to prevent oom from malicious clients */
                ret = i915_gem_wait_for_idle(i915,
                                             I915_WAIT_LOCKED |
-                                            I915_WAIT_INTERRUPTIBLE);
+                                            I915_WAIT_INTERRUPTIBLE,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (ret)
                        goto err_unreserve;
 
index 5fbe15f4effdbd7888a3c4b89c34fe90ee20e792..ab25902420333c0bf938dc178674ec9659c603e4 100644 (file)
@@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
                }
        }
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                return err;
 
index 43995fc3534d77e46ecc1b2874d502525d279924..c4aac6141e04d0a217302122ebff9d8668de9ed8 100644 (file)
@@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
        t->func = func;
        t->name = name;
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err) {
                pr_err("%s(%s): failed to idle before, with err=%d!",
                       func, name, err);
index 0d06f559243f9e7c462fe557a0fdd7ce11f88240..09ab037ce8038f6cd8cd97138babd0882f34d471 100644 (file)
@@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
        }
 
        wedge_on_timeout(&w, i915, HZ)
-               i915_gem_wait_for_idle(i915, flags);
+               i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
 
        return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
 }