drm/i915: Beware temporary wedging when determining -EIO
author Chris Wilson <chris@chris-wilson.co.uk>
Wed, 20 Feb 2019 14:56:37 +0000 (14:56 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Wed, 20 Feb 2019 16:31:08 +0000 (16:31 +0000)
At a few points in our uABI, we check to see if the driver is wedged and,
if so, report -EIO back to the user. However, as we now perform the check
and the reset asynchronously (where once before they were both serialised
by the struct_mutex), we may instead see the temporary wedging used to
cancel inflight rendering to avoid a deadlock during reset (caused either
by us timing out in our reset handler, i915_wedge_on_timeout, or with
malice aforethought in intel_reset_prepare for a stuck modeset). If we
suspect this is the case, that is, we see a wedged driver *and* a reset in
progress, we wait until the reset is resolved before reporting the wedged
status.

v2: might_sleep() (Mika)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk
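
For reference, here is a minimal sketch (not part of the patch) of how a
uABI entry point consumes the new tristate helper. The return values follow
the i915_terminally_wedged() implementation added to i915_reset.c below;
example_uabi_check() and do_work() are hypothetical names for illustration:

	/*
	 * i915_terminally_wedged() reports:
	 *   0       - not wedged, safe to proceed
	 *   -EIO    - terminally wedged with no reset in progress (or still
	 *             wedged once the reset resolved)
	 *   -EAGAIN - reset in progress while struct_mutex is held, so we
	 *             must not sleep waiting for it to complete
	 *   -EINTR  - signal received while waiting for the reset
	 */
	static int example_uabi_check(struct drm_i915_private *i915)
	{
		int ret;

		ret = i915_terminally_wedged(i915); /* may sleep */
		if (ret)
			return ret;

		return do_work(i915); /* hypothetical follow-on work */
	}
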
19 files changed:
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_reset.c
drivers/gpu/drm/i915/i915_reset.h
drivers/gpu/drm/i915/intel_engine_cs.c
drivers/gpu/drm/i915/intel_hangcheck.c
drivers/gpu/drm/i915/selftests/huge_pages.c
drivers/gpu/drm/i915/selftests/i915_active.c
drivers/gpu/drm/i915/selftests/i915_gem_coherency.c
drivers/gpu/drm/i915/selftests/i915_gem_context.c
drivers/gpu/drm/i915/selftests/i915_gem_evict.c
drivers/gpu/drm/i915/selftests/i915_gem_object.c
drivers/gpu/drm/i915/selftests/i915_request.c
drivers/gpu/drm/i915/selftests/igt_flush_test.c
drivers/gpu/drm/i915/selftests/intel_hangcheck.c
drivers/gpu/drm/i915/selftests/intel_lrc.c
drivers/gpu/drm/i915/selftests/intel_workarounds.c

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2aeea977283fa67c85afdc2e1b5cdbef3f9b3ba0..37175414ce892a50d49846326efd5e29c836ff3b 100644
@@ -3833,11 +3833,18 @@ static const struct file_operations i915_cur_wm_latency_fops = {
 static int
 i915_wedged_get(void *data, u64 *val)
 {
-       struct drm_i915_private *dev_priv = data;
-
-       *val = i915_terminally_wedged(&dev_priv->gpu_error);
+       int ret = i915_terminally_wedged(data);
 
-       return 0;
+       switch (ret) {
+       case -EIO:
+               *val = 1;
+               return 0;
+       case 0:
+               *val = 0;
+               return 0;
+       default:
+               return ret;
+       }
 }
 
 static int
@@ -3918,7 +3925,7 @@ i915_drop_caches_set(void *data, u64 val)
                mutex_unlock(&i915->drm.struct_mutex);
        }
 
-       if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
+       if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
                i915_handle_error(i915, ALL_ENGINES, 0, NULL);
 
        fs_reclaim_acquire(GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 58c9058da4b4c08d0a1f8ab2e353b0290b7532ee..2a78fb3e6eee5b2dfa4b49790a772c173b6199aa 100644
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool __i915_wedged(struct i915_gpu_error *error)
 {
        return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
+static inline bool i915_reset_failed(struct drm_i915_private *i915)
+{
+       return __i915_wedged(&i915->gpu_error);
+}
+
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
        return READ_ONCE(error->reset_count);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b421bc7a2e261c4c802d1eeeac5c642d6a913a9f..cc88e7974422c9e396779f03d905a4545efea331 100644
@@ -1928,7 +1928,7 @@ err:
                 * fail). But any other -EIO isn't ours (e.g. swap in failure)
                 * and so needs to be reported.
                 */
-               if (!i915_terminally_wedged(&dev_priv->gpu_error))
+               if (!i915_terminally_wedged(dev_priv))
                        return VM_FAULT_SIGBUS;
                /* else: fall through */
        case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                return;
 
        GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
        long ret;
 
        /* ABI: return -EIO if already wedged */
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
-               return -EIO;
+       ret = i915_terminally_wedged(dev_priv);
+       if (ret)
+               return ret;
 
        spin_lock(&file_priv->mm.lock);
        list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
@@ -4460,7 +4461,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
         * back to defaults, recovering from whatever wedged state we left it
         * in and so worth trying to use the device once more.
         */
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                i915_gem_unset_wedged(i915);
 
        /*
@@ -4504,7 +4505,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
         * state. Fortunately, the kernel_context is disposable and we do
         * not rely on its state.
         */
-       if (!i915_terminally_wedged(&i915->gpu_error)) {
+       if (!i915_reset_failed(i915)) {
                ret = i915_gem_switch_to_kernel_context(i915);
                if (ret)
                        goto err_unlock;
@@ -4625,8 +4626,9 @@ out_unlock:
        return;
 
 err_wedged:
-       if (!i915_terminally_wedged(&i915->gpu_error)) {
-               DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
+       if (!i915_reset_failed(i915)) {
+               dev_err(i915->drm.dev,
+                       "Failed to re-initialize GPU, declaring it wedged!\n");
                i915_gem_set_wedged(i915);
        }
        goto out_unlock;
@@ -4731,10 +4733,9 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
        init_unused_rings(dev_priv);
 
        BUG_ON(!dev_priv->kernel_context);
-       if (i915_terminally_wedged(&dev_priv->gpu_error)) {
-               ret = -EIO;
+       ret = i915_terminally_wedged(dev_priv);
+       if (ret)
                goto out;
-       }
 
        ret = i915_ppgtt_init_hw(dev_priv);
        if (ret) {
@@ -5107,7 +5108,7 @@ err_uc_misc:
                 * wedged. But we only want to do this where the GPU is angry,
                 * for all other failure, such as an allocation failure, bail.
                 */
-               if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
+               if (!i915_reset_failed(dev_priv)) {
                        i915_load_error(dev_priv,
                                        "Failed to initialize GPU, declaring it wedged!\n");
                        i915_gem_set_wedged(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a61e3a4fc9dc0d9bc3b3251bf6d8b65daaf63a96..124b3e279c880675d14b04b7b5719f133493e873 100644
@@ -559,8 +559,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
         * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
         * EIO if the GPU is already wedged.
         */
-       if (i915_terminally_wedged(&i915->gpu_error))
-               return ERR_PTR(-EIO);
+       ret = i915_terminally_wedged(i915);
+       if (ret)
+               return ERR_PTR(ret);
 
        /*
         * Pinning the contexts may generate requests in order to acquire
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c234feb5fdf55009fb65ce6229bb8f02a37151d5..19ad56ba22a7d63118985a88abdbfbc7a542e45e 100644
@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,
 
 finish:
        reset_finish(i915);
-       if (!i915_terminally_wedged(error))
+       if (!__i915_wedged(error))
                reset_restart(i915);
        return;
 
@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
         * Try engine reset when available. We fall back to full reset if
         * single reset fails.
         */
-       if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
+       if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
                for_each_engine_masked(engine, i915, engine_mask, tmp) {
                        BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
                        if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,31 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
        srcu_read_unlock(&error->reset_backoff_srcu, tag);
 }
 
+int i915_terminally_wedged(struct drm_i915_private *i915)
+{
+       struct i915_gpu_error *error = &i915->gpu_error;
+
+       might_sleep();
+
+       if (!__i915_wedged(error))
+               return 0;
+
+       /* Reset still in progress? Maybe we will recover? */
+       if (!test_bit(I915_RESET_BACKOFF, &error->flags))
+               return -EIO;
+
+       /* XXX intel_reset_finish() still takes struct_mutex!!! */
+       if (mutex_is_locked(&i915->drm.struct_mutex))
+               return -EAGAIN;
+
+       if (wait_event_interruptible(error->reset_queue,
+                                    !test_bit(I915_RESET_BACKOFF,
+                                              &error->flags)))
+               return -EINTR;
+
+       return __i915_wedged(error) ? -EIO : 0;
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
        int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index 893c5d1c2eb81a9861c11402f89dfd6b3f9e0a84..16f2389f656f39a32c7581a444bb13b370a214cb 100644
@@ -36,6 +36,8 @@ bool i915_reset_flush(struct drm_i915_private *i915);
 int __must_check i915_reset_trylock(struct drm_i915_private *i915);
 void i915_reset_unlock(struct drm_i915_private *i915, int tag);
 
+int i915_terminally_wedged(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 2547e2e51db8d090187f91d3300ac57ed6c3f9b5..81b80f8fd9ea8030491947348220f0f7079d8c9c 100644
@@ -1007,10 +1007,8 @@ static bool ring_is_idle(struct intel_engine_cs *engine)
  */
 bool intel_engine_is_idle(struct intel_engine_cs *engine)
 {
-       struct drm_i915_private *dev_priv = engine->i915;
-
        /* More white lies, if wedged, hw state is inconsistent */
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
+       if (i915_reset_failed(engine->i915))
                return true;
 
        /* Any inflight/incomplete requests? */
@@ -1054,7 +1052,7 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
         * If the driver is wedged, HW state may be very inconsistent and
         * report that it is still busy, even though we have stopped using it.
         */
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
+       if (i915_reset_failed(dev_priv))
                return true;
 
        for_each_engine(engine, dev_priv, id) {
@@ -1496,7 +1494,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
                va_end(ap);
        }
 
-       if (i915_terminally_wedged(&engine->i915->gpu_error))
+       if (i915_reset_failed(engine->i915))
                drm_printf(m, "*** WEDGED ***\n");
 
        drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index a219c796e56d9c1f718e5e4d434ede8225a4823c..9be033b6f4d2220dd18400e64320cc2d51d5f383 100644
@@ -263,7 +263,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        if (!READ_ONCE(dev_priv->gt.awake))
                return;
 
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
+       if (i915_terminally_wedged(dev_priv))
                return;
 
        /* As enabling the GPU requires fairly extensive mmio access,
diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c
index a9a2fa35876fe5537db2e4693334097aae4f7def..40607ba7dda6264fc334122b1ee6d7d5da2530a8 100644
@@ -1764,7 +1764,7 @@ int i915_gem_huge_page_live_selftests(struct drm_i915_private *dev_priv)
                return 0;
        }
 
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
+       if (i915_terminally_wedged(dev_priv))
                return 0;
 
        file = mock_file(dev_priv);
diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
index 337b1f98b9230fe2caa6866927debd8fd7ecb98e..27d8f853111be92fa340a2404224f5dba48cf57c 100644
@@ -150,7 +150,7 @@ int i915_active_live_selftests(struct drm_i915_private *i915)
                SUBTEST(live_active_retire),
        };
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return 0;
 
        return i915_subtests(tests, i915);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_coherency.c b/drivers/gpu/drm/i915/selftests/i915_gem_coherency.c
index fd89a5a33c1a0b1719c8bbbeb354c55ec498484f..9c16aff8702875d469ec7c0a0a1276daf6ee3d25 100644
@@ -248,12 +248,12 @@ static bool always_valid(struct drm_i915_private *i915)
 
 static bool needs_fence_registers(struct drm_i915_private *i915)
 {
-       return !i915_terminally_wedged(&i915->gpu_error);
+       return !i915_terminally_wedged(i915);
 }
 
 static bool needs_mi_store_dword(struct drm_i915_private *i915)
 {
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return false;
 
        return intel_engine_can_store_dword(i915->engine[RCS]);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index b7b97c57ad057526bec4a90f5165abc26356cd76..3a238b9628b3be8085da769a353062a713f9bfde 100644
@@ -1632,7 +1632,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
                SUBTEST(igt_vm_isolation),
        };
 
-       if (i915_terminally_wedged(&dev_priv->gpu_error))
+       if (i915_terminally_wedged(dev_priv))
                return 0;
 
        return i915_subtests(tests, dev_priv);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 32dce7176f6381dc2a0429691dccc2eafc7fe360..b270eab1cad167253fe12f6f025aebca40f057a1 100644
@@ -547,7 +547,7 @@ int i915_gem_evict_live_selftests(struct drm_i915_private *i915)
                SUBTEST(igt_evict_contexts),
        };
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return 0;
 
        return i915_subtests(tests, i915);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_object.c b/drivers/gpu/drm/i915/selftests/i915_gem_object.c
index 395ae878e0f7c3529ede2201b3b388a0a46dc785..784982aed625963d0718cb6b3a09f96a559293f5 100644
@@ -583,7 +583,7 @@ static int igt_mmap_offset_exhaustion(void *arg)
        for (loop = 0; loop < 3; loop++) {
                intel_wakeref_t wakeref;
 
-               if (i915_terminally_wedged(&i915->gpu_error))
+               if (i915_terminally_wedged(i915))
                        break;
 
                obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 6733dc5b6b4c84e19f89ca0ea24be967dc69a625..03cc8ab6a620f520700cba2506c3ccdf7a3b0542 100644
@@ -1246,7 +1246,7 @@ int i915_request_live_selftests(struct drm_i915_private *i915)
                SUBTEST(live_breadcrumbs_smoketest),
        };
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return 0;
 
        return i915_subtests(tests, i915);
diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
index af66e3d4e23a421521bbbfc27093d84dd2a4d8ea..e0d3122fd35a0e1e6125e41644079da97285a484 100644
@@ -29,5 +29,5 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
                i915_gem_set_wedged(i915);
        }
 
-       return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+       return i915_terminally_wedged(i915);
 }
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index c32bc31192ae3f1e9d690cbfd7cfb6bd3c236b25..a77e4bf1ab555a9b405f71321aaf10e8c280fabe 100644
@@ -342,7 +342,7 @@ static int igt_hang_sanitycheck(void *arg)
                        timeout = i915_request_wait(rq,
                                                    I915_WAIT_LOCKED,
                                                    MAX_SCHEDULE_TIMEOUT);
-               if (i915_terminally_wedged(&i915->gpu_error))
+               if (i915_reset_failed(i915))
                        timeout = -EIO;
 
                i915_request_put(rq);
@@ -383,7 +383,7 @@ static int igt_global_reset(void *arg)
 
        igt_global_reset_unlock(i915);
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                err = -EIO;
 
        return err;
@@ -401,13 +401,13 @@ static int igt_wedged_reset(void *arg)
 
        i915_gem_set_wedged(i915);
 
-       GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
+       GEM_BUG_ON(!i915_reset_failed(i915));
        i915_reset(i915, ALL_ENGINES, NULL);
 
        intel_runtime_pm_put(i915, wakeref);
        igt_global_reset_unlock(i915);
 
-       return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+       return i915_reset_failed(i915) ? -EIO : 0;
 }
 
 static bool wait_for_idle(struct intel_engine_cs *engine)
@@ -529,7 +529,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
                        break;
        }
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                err = -EIO;
 
        if (active) {
@@ -843,7 +843,7 @@ unwind:
                        break;
        }
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                err = -EIO;
 
        if (flags & TEST_ACTIVE) {
@@ -969,7 +969,7 @@ unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        igt_global_reset_unlock(i915);
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                return -EIO;
 
        return err;
@@ -1166,7 +1166,7 @@ fini:
 unlock:
        mutex_unlock(&i915->drm.struct_mutex);
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                return -EIO;
 
        return err;
@@ -1372,7 +1372,7 @@ unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        igt_global_reset_unlock(i915);
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                return -EIO;
 
        return err;
@@ -1552,7 +1552,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
                        i915_request_wait(rq,
                                          I915_WAIT_LOCKED,
                                          MAX_SCHEDULE_TIMEOUT);
-               if (i915_terminally_wedged(&i915->gpu_error))
+               if (i915_reset_failed(i915))
                        err = -EIO;
        }
 
@@ -1591,7 +1591,7 @@ static int igt_atomic_reset(void *arg)
 
        /* Flush any requests before we get started and check basics */
        force_reset(i915);
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_reset_failed(i915))
                goto unlock;
 
        if (intel_has_gpu_reset(i915)) {
@@ -1665,7 +1665,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
        if (!intel_has_gpu_reset(i915))
                return 0;
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return -EIO; /* we're long past hope of a successful reset */
 
        wakeref = intel_runtime_pm_get(i915);
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 58144e024751fced7c19649b2569f72ab350c310..0f7a5bf696468383d96031bddfb9a5cc20d75580 100644
@@ -895,7 +895,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
        if (!HAS_EXECLISTS(i915))
                return 0;
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return 0;
 
        return i915_subtests(tests, i915);
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index fb479a2c04fb5adbe106e65e77de7cd5be68b732..e6ffc8ac22dc659567fd49afd8bd08b1f4c49707 100644
@@ -181,7 +181,7 @@ static int check_whitelist(struct i915_gem_context *ctx,
        err = 0;
        igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
                err = i915_gem_object_set_to_cpu_domain(results, false);
-       if (i915_terminally_wedged(&ctx->i915->gpu_error))
+       if (i915_terminally_wedged(ctx->i915))
                err = -EIO;
        if (err)
                goto out_put;
@@ -510,7 +510,7 @@ int intel_workarounds_live_selftests(struct drm_i915_private *i915)
        };
        int err;
 
-       if (i915_terminally_wedged(&i915->gpu_error))
+       if (i915_terminally_wedged(i915))
                return 0;
 
        mutex_lock(&i915->drm.struct_mutex);