drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture
authorChris Wilson <chris@chris-wilson.co.uk>
Fri, 2 Nov 2018 16:12:12 +0000 (16:12 +0000)
committerJoonas Lahtinen <joonas.lahtinen@linux.intel.com>
Tue, 20 Nov 2018 08:09:27 +0000 (10:09 +0200)
Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk
(cherry picked from commit fb6f0b64e455b207a636346588e65bf9598d30eb)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h

index 47c302543799007bee421d85039be8b016aed6e7..07999fe09ad231a037bc73379b5fc27aaf484b94 100644 (file)
@@ -3413,6 +3413,11 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
                ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
                if (ggtt->vm.clear_range != nop_clear_range)
                        ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
+
+               /* Prevent recursively calling stop_machine() and deadlocks. */
+               dev_info(dev_priv->drm.dev,
+                        "Disabling error capture for VT-d workaround\n");
+               i915_disable_error_state(dev_priv, -ENODEV);
        }
 
        ggtt->invalidate = gen6_ggtt_invalidate;
index 8762d17b66591e2afc8fc9647bef2784145f2c19..3eb33e000d6f00f3ae4b3fdbc8f37f38e97b4d83 100644 (file)
@@ -648,6 +648,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
                return 0;
        }
 
+       if (IS_ERR(error))
+               return PTR_ERR(error);
+
        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: " UTS_RELEASE "\n");
@@ -1859,6 +1862,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
        error = i915_capture_gpu_state(i915);
        if (!error) {
                DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
+               i915_disable_error_state(i915, -ENOMEM);
                return;
        }
 
@@ -1914,5 +1918,14 @@ void i915_reset_error_state(struct drm_i915_private *i915)
        i915->gpu_error.first_error = NULL;
        spin_unlock_irq(&i915->gpu_error.lock);
 
-       i915_gpu_state_put(error);
+       if (!IS_ERR(error))
+               i915_gpu_state_put(error);
+}
+
+void i915_disable_error_state(struct drm_i915_private *i915, int err)
+{
+       spin_lock_irq(&i915->gpu_error.lock);
+       if (!i915->gpu_error.first_error)
+               i915->gpu_error.first_error = ERR_PTR(err);
+       spin_unlock_irq(&i915->gpu_error.lock);
 }
index 8710fb18ed746cface7e9a7b2d6d6ac7cd06b2b4..3ec89a504de52331ade6a9452a844527d84ec515 100644 (file)
@@ -343,6 +343,7 @@ static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
 
 struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
 void i915_reset_error_state(struct drm_i915_private *i915);
+void i915_disable_error_state(struct drm_i915_private *i915, int err);
 
 #else
 
@@ -355,13 +356,18 @@ static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
 static inline struct i915_gpu_state *
 i915_first_error_state(struct drm_i915_private *i915)
 {
-       return NULL;
+       return ERR_PTR(-ENODEV);
 }
 
 static inline void i915_reset_error_state(struct drm_i915_private *i915)
 {
 }
 
+static inline void i915_disable_error_state(struct drm_i915_private *i915,
+                                           int err)
+{
+}
+
 #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
 
 #endif /* _I915_GPU_ERROR_H_ */