drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

author Chris Wilson <chris@chris-wilson.co.uk>

Fri, 1 Mar 2019 17:09:00 +0000 (17:09 +0000)

committer Chris Wilson <chris@chris-wilson.co.uk>

Fri, 1 Mar 2019 17:45:07 +0000 (17:45 +0000)
author Chris Wilson <chris@chris-wilson.co.uk>
Fri, 1 Mar 2019 17:09:00 +0000 (17:09 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Fri, 1 Mar 2019 17:45:07 +0000 (17:45 +0000)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c

index c6354f6cdbdb8b46247140ae38a12afc93d577eb..c08abdef5eb62c7e3aca272aea1f714d7bc33a39 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -351,7 +351,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
                 value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
                 break;
         case I915_PARAM_HAS_SEMAPHORES:
-               value = 0;
+               value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
                 break;
         case I915_PARAM_HAS_SECURE_BATCHES:
                 value = capable(CAP_SYS_ADMIN);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c

index d354967d6ae8b5d4e0b9a0525390cb0de942ae6d..59e30b8c4ee91450d2a5dddcd9c9a332739c52ed 100644 (file)
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@
   *
   */
  
-#include <linux/prefetch.h>
  #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
  #include <linux/sched.h>
  #include <linux/sched/clock.h>
  #include <linux/sched/signal.h>
@@ -32,9 +33,16 @@
  #include "i915_active.h"
  #include "i915_reset.h"
  
+struct execute_cb {
+       struct list_head link;
+       struct irq_work work;
+       struct i915_sw_fence *fence;
+};
+
  static struct i915_global_request {
         struct kmem_cache *slab_requests;
         struct kmem_cache *slab_dependencies;
+       struct kmem_cache *slab_execute_cbs;
  } global;
  
  static const char *i915_fence_get_driver_name(struct dma_fence *fence)
@@ -325,6 +333,69 @@ void i915_request_retire_upto(struct i915_request *rq)
         } while (tmp != rq);
  }
  
+static void irq_execute_cb(struct irq_work *wrk)
+{
+       struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+       i915_sw_fence_complete(cb->fence);
+       kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+       struct execute_cb *cb;
+
+       lockdep_assert_held(&rq->lock);
+
+       if (list_empty(&rq->execute_cb))
+               return;
+
+       list_for_each_entry(cb, &rq->execute_cb, link)
+               irq_work_queue(&cb->work);
+
+       /*
+        * XXX Rollback on __i915_request_unsubmit()
+        *
+        * In the future, perhaps when we have an active time-slicing scheduler,
+        * it will be interesting to unsubmit parallel execution and remove
+        * busywaits from the GPU until their master is restarted. This is
+        * quite hairy, we have to carefully rollback the fence and do a
+        * preempt-to-idle cycle on the target engine, all the while the
+        * master execute_cb may refire.
+        */
+       INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+                            struct i915_request *signal,
+                            gfp_t gfp)
+{
+       struct execute_cb *cb;
+
+       if (i915_request_is_active(signal))
+               return 0;
+
+       cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+       if (!cb)
+               return -ENOMEM;
+
+       cb->fence = &rq->submit;
+       i915_sw_fence_await(cb->fence);
+       init_irq_work(&cb->work, irq_execute_cb);
+
+       spin_lock_irq(&signal->lock);
+       if (i915_request_is_active(signal)) {
+               i915_sw_fence_complete(cb->fence);
+               kmem_cache_free(global.slab_execute_cbs, cb);
+       } else {
+               list_add_tail(&cb->link, &signal->execute_cb);
+       }
+       spin_unlock_irq(&signal->lock);
+
+       return 0;
+}
+
  static void move_to_timeline(struct i915_request *request,
                              struct i915_timeline *timeline)
  {
@@ -361,6 +432,8 @@ void __i915_request_submit(struct i915_request *request)
             !i915_request_enable_breadcrumb(request))
                 intel_engine_queue_breadcrumbs(engine);
  
+       __notify_execute_cb(request);
+
         spin_unlock(&request->lock);
  
         engine->emit_fini_breadcrumb(request,
@@ -608,6 +681,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
         }
  
         INIT_LIST_HEAD(&rq->active_list);
+       INIT_LIST_HEAD(&rq->execute_cb);
  
         tl = ce->ring->timeline;
         ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -696,6 +770,52 @@ err_unreserve:
         return ERR_PTR(ret);
  }
  
+static int
+emit_semaphore_wait(struct i915_request *to,
+                   struct i915_request *from,
+                   gfp_t gfp)
+{
+       u32 hwsp_offset;
+       u32 *cs;
+       int err;
+
+       GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+       GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
+
+       /* We need to pin the signaler's HWSP until we are finished reading. */
+       err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
+       if (err)
+               return err;
+
+       /* Only submit our spinner after the signaler is running! */
+       err = i915_request_await_execution(to, from, gfp);
+       if (err)
+               return err;
+
+       cs = intel_ring_begin(to, 4);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
+
+       /*
+        * Using greater-than-or-equal here means we have to worry
+        * about seqno wraparound. To side step that issue, we swap
+        * the timeline HWSP upon wrapping, so that everyone listening
+        * for the old (pre-wrap) values do not see the much smaller
+        * (post-wrap) values than they were expecting (and so wait
+        * forever).
+        */
+       *cs++ = MI_SEMAPHORE_WAIT |
+               MI_SEMAPHORE_GLOBAL_GTT |
+               MI_SEMAPHORE_POLL |
+               MI_SEMAPHORE_SAD_GTE_SDD;
+       *cs++ = from->fence.seqno;
+       *cs++ = hwsp_offset;
+       *cs++ = 0;
+
+       intel_ring_advance(to, cs);
+       return 0;
+}
+
  static int
  i915_request_await_request(struct i915_request *to, struct i915_request *from)
  {
@@ -717,6 +837,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
                 ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
                                                        &from->submit,
                                                        I915_FENCE_GFP);
+       } else if (intel_engine_has_semaphores(to->engine) &&
+                  to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+               ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
         } else {
                 ret = i915_sw_fence_await_dma_fence(&to->submit,
                                                     &from->fence, 0,
@@ -1208,14 +1331,23 @@ int __init i915_global_request_init(void)
         if (!global.slab_requests)
                 return -ENOMEM;
  
+       global.slab_execute_cbs = KMEM_CACHE(execute_cb,
+                                            SLAB_HWCACHE_ALIGN |
+                                            SLAB_RECLAIM_ACCOUNT |
+                                            SLAB_TYPESAFE_BY_RCU);
+       if (!global.slab_execute_cbs)
+               goto err_requests;
+
         global.slab_dependencies = KMEM_CACHE(i915_dependency,
                                               SLAB_HWCACHE_ALIGN |
                                               SLAB_RECLAIM_ACCOUNT);
         if (!global.slab_dependencies)
-               goto err_requests;
+               goto err_execute_cbs;
  
         return 0;
  
+err_execute_cbs:
+       kmem_cache_destroy(global.slab_execute_cbs);
  err_requests:
         kmem_cache_destroy(global.slab_requests);
         return -ENOMEM;
@@ -1224,11 +1356,13 @@ err_requests:
  void i915_global_request_shrink(void)
  {
         kmem_cache_shrink(global.slab_dependencies);
+       kmem_cache_shrink(global.slab_execute_cbs);
         kmem_cache_shrink(global.slab_requests);
  }
  
  void i915_global_request_exit(void)
  {
         kmem_cache_destroy(global.slab_dependencies);
+       kmem_cache_destroy(global.slab_execute_cbs);
         kmem_cache_destroy(global.slab_requests);
  }
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h

index 09eaad06d2c6e0e9776b638b9e1543452fedbb65..4dd1dea1d1afe2d2e5c6bd8ae176a4ade03bb27a 100644 (file)
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -129,6 +129,7 @@ struct i915_request {
          */
         struct i915_sw_fence submit;
         wait_queue_entry_t submitq;
+       struct list_head execute_cb;
  
         /*
          * A list of everyone we wait upon, and everyone who waits upon us.
@@ -343,10 +344,27 @@ static inline bool __i915_request_has_started(const struct i915_request *rq)
   * i915_request_started - check if the request has begun being executed
   * @rq: the request
   *
- * Returns true if the request has been submitted to hardware, and the hardware
- * has advanced passed the end of the previous request and so should be either
- * currently processing the request (though it may be preempted and so
- * not necessarily the next request to complete) or have completed the request.
+ * If the timeline is not using initial breadcrumbs, a request is
+ * considered started if the previous request on its timeline (i.e.
+ * context) has been signaled.
+ *
+ * If the timeline is using semaphores, it will also be emitting an
+ * "initial breadcrumb" after the semaphores are complete and just before
+ * it began executing the user payload. A request can therefore be active
+ * on the HW and not yet started as it is still busywaiting on its
+ * dependencies (via HW semaphores).
+ *
+ * If the request has started, its dependencies will have been signaled
+ * (either by fences or by semaphores) and it will have begun processing
+ * the user payload.
+ *
+ * However, even if a request has started, it may have been preempted and
+ * so no longer active, or it may have already completed.
+ *
+ * See also i915_request_is_active().
+ *
+ * Returns true if the request has begun executing the user payload, or
+ * has completed:
   */
  static inline bool i915_request_started(const struct i915_request *rq)
  {
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c

index 7c58b049ecb50a47bf06d0322d6077f6e06e955d..8d1400d378d7a7746ec08426d683b52e5ea066f2 100644 (file)
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
         __i915_sw_fence_notify(fence, FENCE_FREE);
  }
  
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
  {
         debug_fence_assert(fence);
  
@@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
         __i915_sw_fence_complete(fence, NULL);
  }
  
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
  {
         debug_fence_assert(fence);
         WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h

index 0e055ea0179f321a4aafc8b95a667f8bbc8543f6..6dec9e1d11029888d11d3e250aa9644c7339430a 100644 (file)
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
                                     unsigned long timeout,
                                     gfp_t gfp);
  
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
  static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
  {
         return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c

index df8f88142f1dfb6d2b52f40f086dc5bf452fc457..4fc7e2ac6278127ae776a05038184fc91d4dd909 100644 (file)
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -616,6 +616,7 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
         } map[] = {
  #define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) }
                 MAP(PREEMPTION, PREEMPTION),
+               MAP(SEMAPHORES, SEMAPHORES),
  #undef MAP
         };
         struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h

index b96a31bc10809c44ecb17414174991758603dc7b..a34ece53a7712a6efe3e1b1db268238a09ece7f4 100644 (file)
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -105,8 +105,13 @@
  #define MI_SEMAPHORE_SIGNAL    MI_INSTR(0x1b, 0) /* GEN8+ */
  #define   MI_SEMAPHORE_TARGET(engine)  ((engine)<<15)
  #define MI_SEMAPHORE_WAIT      MI_INSTR(0x1c, 2) /* GEN8+ */
-#define   MI_SEMAPHORE_POLL            (1<<15)
-#define   MI_SEMAPHORE_SAD_GTE_SDD     (1<<12)
+#define   MI_SEMAPHORE_POLL            (1 << 15)
+#define   MI_SEMAPHORE_SAD_GT_SDD      (0 << 12)
+#define   MI_SEMAPHORE_SAD_GTE_SDD     (1 << 12)
+#define   MI_SEMAPHORE_SAD_LT_SDD      (2 << 12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD     (3 << 12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD      (4 << 12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD     (5 << 12)
  #define MI_STORE_DWORD_IMM     MI_INSTR(0x20, 1)
  #define MI_STORE_DWORD_IMM_GEN4        MI_INSTR(0x20, 2)
  #define   MI_MEM_VIRTUAL       (1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c

index 3fd0c45a29208f64a90ba1cd9302cab2c44171cd..6c9479acb4331b1ba72a171b03f51bd6cf4f1330 100644 (file)
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2330,6 +2330,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
         engine->park = NULL;
         engine->unpark = NULL;
  
+       engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
         if (engine->i915->preempt_context)
                 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h

index b8ec7e40a59bfc7e08714a6f6836da118131ecbe..2e7264119ec49d47f9bc49f03b1a5c391020a9e1 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -499,6 +499,7 @@ struct intel_engine_cs {
  #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
  #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
  #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
         unsigned int flags;
  
         /*
@@ -576,6 +577,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
         return engine->flags & I915_ENGINE_HAS_PREEMPTION;
  }
  
+static inline bool
+intel_engine_has_semaphores(const struct intel_engine_cs *engine)
+{
+       return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
+}
+
  void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
  
  static inline bool __execlists_need_preempt(int prio, int last)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h

index 8304a7f1ec3f98da958f8e9cc6cd8bba58035f30..b10eea3f6d24f2386f1396a988ad8c3988b1f9b7 100644 (file)
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -479,6 +479,7 @@ typedef struct drm_i915_irq_wait {
  #define   I915_SCHEDULER_CAP_ENABLED   (1ul << 0)
  #define   I915_SCHEDULER_CAP_PRIORITY  (1ul << 1)
  #define   I915_SCHEDULER_CAP_PREEMPTION        (1ul << 2)
+#define   I915_SCHEDULER_CAP_SEMAPHORES        (1ul << 3)
  
  #define I915_PARAM_HUC_STATUS           42
author	Chris Wilson <chris@chris-wilson.co.uk>
	Fri, 1 Mar 2019 17:09:00 +0000 (17:09 +0000)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Fri, 1 Mar 2019 17:45:07 +0000 (17:45 +0000)
drivers/gpu/drm/i915/i915_drv.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_request.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_request.h		patch \| blob \| history
drivers/gpu/drm/i915/i915_sw_fence.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_sw_fence.h		patch \| blob \| history
drivers/gpu/drm/i915/intel_engine_cs.c		patch \| blob \| history
drivers/gpu/drm/i915/intel_gpu_commands.h		patch \| blob \| history
drivers/gpu/drm/i915/intel_lrc.c		patch \| blob \| history
drivers/gpu/drm/i915/intel_ringbuffer.h		patch \| blob \| history
include/uapi/drm/i915_drm.h		patch \| blob \| history