drm/i915: create a race-free reset detection

author Daniel Vetter <daniel.vetter@ffwll.ch>

Thu, 6 Dec 2012 08:01:42 +0000 (09:01 +0100)

committer Daniel Vetter <daniel.vetter@ffwll.ch>

Mon, 21 Jan 2013 18:53:54 +0000 (19:53 +0100)
author Daniel Vetter <daniel.vetter@ffwll.ch>
Thu, 6 Dec 2012 08:01:42 +0000 (09:01 +0100)
committer Daniel Vetter <daniel.vetter@ffwll.ch>
Mon, 21 Jan 2013 18:53:54 +0000 (19:53 +0100)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h

index c84743bb693761d59a02f0937186868eb18ad03e..56ece50910a87a544f0c5be05ce29e50a58524ef 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -775,9 +775,16 @@ struct i915_gpu_error {
         unsigned long last_reset;
  
         /**
-        * State variable controlling the reset flow
+        * State variable and reset counter controlling the reset flow
          *
-        * Upper bits are for the reset counter.
+        * Upper bits are for the reset counter.  This counter is used by the
+        * wait_seqno code to notice race-free that a reset event happened and
+        * that it needs to restart the entire ioctl (since most likely the
+        * seqno it waited for won't ever signal anytime soon).
+        *
+        * This is important for lock-free wait paths, where no contended lock
+        * naturally enforces the correct ordering between the bail-out of the
+        * waiter and the gpu reset work code.
          *
          * Lowest bit controls the reset state machine: Set means a reset is in
          * progress. This state will (presuming we don't have any bugs) decay
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c

index 5bb370fdc99cd8a4d9cd0d0a774c53b940f97c63..5226ebcac33afa53b1daea5a1e9ee791de2cfb18 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -976,13 +976,22 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
   * __wait_seqno - wait until execution of seqno has finished
   * @ring: the ring expected to report seqno
   * @seqno: duh!
+ * @reset_counter: reset sequence associated with the given seqno
   * @interruptible: do an interruptible wait (normally yes)
   * @timeout: in - how long to wait (NULL forever); out - how much time remaining
   *
+ * Note: It is of utmost importance that the passed in seqno and reset_counter
+ * values have been read by the caller in an smp safe manner. Where read-side
+ * locks are involved, it is sufficient to read the reset_counter before
+ * unlocking the lock that protects the seqno. For lockless tricks, the
+ * reset_counter _must_ be read before the seqno, and an appropriate smp_rmb
+ * must be inserted.
+ *
   * Returns 0 if the seqno was found within the alloted time. Else returns the
   * errno with remaining time filled in timeout argument.
   */
  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
+                       unsigned reset_counter,
                         bool interruptible, struct timespec *timeout)
  {
         drm_i915_private_t *dev_priv = ring->dev->dev_private;
@@ -1012,7 +1021,8 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
  
  #define EXIT_COND \
         (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-       i915_reset_in_progress(&dev_priv->gpu_error))
+        i915_reset_in_progress(&dev_priv->gpu_error) || \
+        reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
         do {
                 if (interruptible)
                         end = wait_event_interruptible_timeout(ring->irq_queue,
@@ -1022,6 +1032,13 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
                         end = wait_event_timeout(ring->irq_queue, EXIT_COND,
                                                  timeout_jiffies);
  
+               /* We need to check whether any gpu reset happened in between
+                * the caller grabbing the seqno and now ... */
+               if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
+                       end = -EAGAIN;
+
+               /* ... but upgrade the -EAGAIN to an -EIO if the gpu is truly
+                * gone. */
                 ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
                 if (ret)
                         end = ret;
@@ -1076,7 +1093,9 @@ i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
         if (ret)
                 return ret;
  
-       return __wait_seqno(ring, seqno, interruptible, NULL);
+       return __wait_seqno(ring, seqno,
+                           atomic_read(&dev_priv->gpu_error.reset_counter),
+                           interruptible, NULL);
  }
  
  /**
@@ -1123,6 +1142,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
         struct drm_device *dev = obj->base.dev;
         struct drm_i915_private *dev_priv = dev->dev_private;
         struct intel_ring_buffer *ring = obj->ring;
+       unsigned reset_counter;
         u32 seqno;
         int ret;
  
@@ -1141,8 +1161,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
         if (ret)
                 return ret;
  
+       reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
         mutex_unlock(&dev->struct_mutex);
-       ret = __wait_seqno(ring, seqno, true, NULL);
+       ret = __wait_seqno(ring, seqno, reset_counter, true, NULL);
         mutex_lock(&dev->struct_mutex);
  
         i915_gem_retire_requests_ring(ring);
@@ -2297,10 +2318,12 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
  int
  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
  {
+       drm_i915_private_t *dev_priv = dev->dev_private;
         struct drm_i915_gem_wait *args = data;
         struct drm_i915_gem_object *obj;
         struct intel_ring_buffer *ring = NULL;
         struct timespec timeout_stack, *timeout = NULL;
+       unsigned reset_counter;
         u32 seqno = 0;
         int ret = 0;
  
@@ -2341,9 +2364,10 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
         }
  
         drm_gem_object_unreference(&obj->base);
+       reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
         mutex_unlock(&dev->struct_mutex);
  
-       ret = __wait_seqno(ring, seqno, true, timeout);
+       ret = __wait_seqno(ring, seqno, reset_counter, true, timeout);
         if (timeout) {
                 WARN_ON(!timespec_valid(timeout));
                 args->timeout_ns = timespec_to_ns(timeout);
@@ -3394,6 +3418,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
         unsigned long recent_enough = jiffies - msecs_to_jiffies(20);
         struct drm_i915_gem_request *request;
         struct intel_ring_buffer *ring = NULL;
+       unsigned reset_counter;
         u32 seqno = 0;
         int ret;
  
@@ -3413,12 +3438,13 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
                 ring = request->ring;
                 seqno = request->seqno;
         }
+       reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
         spin_unlock(&file_priv->mm.lock);
  
         if (seqno == 0)
                 return 0;
  
-       ret = __wait_seqno(ring, seqno, true, NULL);
+       ret = __wait_seqno(ring, seqno, reset_counter, true, NULL);
         if (ret == 0)
                 queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
  
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c

index 4562c5406ef83dac1789a01e3101570a8c017f30..f833f2c155f815c31fd9729cc5fe7a256f207b97 100644 (file)
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -867,9 +867,11 @@ static void i915_error_work_func(struct work_struct *work)
         drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
                                                     gpu_error);
         struct drm_device *dev = dev_priv->dev;
+       struct intel_ring_buffer *ring;
         char *error_event[] = { "ERROR=1", NULL };
         char *reset_event[] = { "RESET=1", NULL };
         char *reset_done_event[] = { "ERROR=0", NULL };
+       int i, ret;
  
         kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
  
@@ -877,13 +879,31 @@ static void i915_error_work_func(struct work_struct *work)
                 DRM_DEBUG_DRIVER("resetting chip\n");
                 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
  
-               if (!i915_reset(dev)) {
-                       atomic_set(&error->reset_counter, 0);
-                       kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
+               ret = i915_reset(dev);
+
+               if (ret == 0) {
+                       /*
+                        * After all the gem state is reset, increment the reset
+                        * counter and wake up everyone waiting for the reset to
+                        * complete.
+                        *
+                        * Since unlock operations are a one-sided barrier only,
+                        * we need to insert a barrier here to order any seqno
+                        * updates before
+                        * the counter increment.
+                        */
+                       smp_mb__before_atomic_inc();
+                       atomic_inc(&dev_priv->gpu_error.reset_counter);
+
+                       kobject_uevent_env(&dev->primary->kdev.kobj,
+                                          KOBJ_CHANGE, reset_done_event);
                 } else {
                         atomic_set(&error->reset_counter, I915_WEDGED);
                 }
  
+               for_each_ring(ring, dev_priv, i)
+                       wake_up_all(&ring->irq_queue);
+
                 wake_up_all(&dev_priv->gpu_error.reset_queue);
         }
  }
@@ -1488,8 +1508,8 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
         i915_report_and_clear_eir(dev);
  
         if (wedged) {
-               atomic_set(&dev_priv->gpu_error.reset_counter,
-                          I915_RESET_IN_PROGRESS_FLAG);
+               atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
+                               &dev_priv->gpu_error.reset_counter);
  
                 /*
                  * Wakeup waiting processes so that the reset work item
author	Daniel Vetter <daniel.vetter@ffwll.ch>
	Thu, 6 Dec 2012 08:01:42 +0000 (09:01 +0100)
committer	Daniel Vetter <daniel.vetter@ffwll.ch>
	Mon, 21 Jan 2013 18:53:54 +0000 (19:53 +0100)
drivers/gpu/drm/i915/i915_drv.h		patch \| blob \| history
drivers/gpu/drm/i915/i915_gem.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_irq.c		patch \| blob \| history