drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
author Chris Wilson <chris@chris-wilson.co.uk>
Fri, 1 Mar 2019 17:09:00 +0000 (17:09 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Fri, 1 Mar 2019 17:45:07 +0000 (17:45 +0000)
Having introduced per-context seqno, we now have a means to identify
progress across the system without fear of rollback, as befell the
global_seqno. That is, we can program a MI_SEMAPHORE_WAIT operation in
advance of submission, safe in the knowledge that our target seqno and
address are stable.
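
In short, the wait that emit_semaphore_wait() below programs is a single
four-dword MI_SEMAPHORE_WAIT that polls the signaler's per-context HWSP
slot until it holds a value greater than or equal to the target seqno.
A sketch of the emitted dwords, lifted from the patch:

    *cs++ = MI_SEMAPHORE_WAIT |
            MI_SEMAPHORE_GLOBAL_GTT |
            MI_SEMAPHORE_POLL |
            MI_SEMAPHORE_SAD_GTE_SDD;
    *cs++ = from->fence.seqno;  /* semaphore data: target per-context seqno */
    *cs++ = hwsp_offset;        /* GGTT address of the signaler's HWSP slot */
    *cs++ = 0;                  /* high bits of the GGTT address */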

However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that the busy-spin will complete quickly. To achieve this we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for default and
higher priority requests (so that idle-priority tasks never themselves
hog the GPU waiting for others).
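
Concretely (see i915_request_await_request() and
i915_request_await_execution() in the patch), the semaphore path is only
taken for a cross-engine wait when the waiting engine advertises
semaphore support and the waiter is of at least normal priority; the
waiter's submit fence is additionally held back until the signaler has
itself been submitted to HW. In outline:

    if (intel_engine_has_semaphores(to->engine) &&
        to->gem_context->sched.priority >= I915_PRIORITY_NORMAL)
            /* busy-spin on the signaler's HWSP, gated on its execution */
            ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
    else
            /* fall back to a software wait on the signaler's fence */
            ret = i915_sw_fence_await_dma_fence(&to->submit,
                                                &from->fence, 0,
                                                I915_FENCE_GFP);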

As might reasonably be expected, HW semaphores excel in inter-engine
synchronisation microbenchmarks (where the 3x reduced latency / increased
throughput more than offsets the power cost of spinning on a second ring)
and show a significant improvement (up to ~10%; most workloads see no
change) for single clients that utilize multiple engines (typically media
players and transcoders), without regressing multiple clients that can
saturate the system or changing the power envelope dramatically.

v3: Drop the older NEQ branch, now we pin the signaler's HWSP anyway.
v4: Tell the world and include it as part of scheduler caps.
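
As a hypothetical userspace probe (not part of this patch), the new
capability can be detected through the existing getparam interface, via
I915_PARAM_HAS_SEMAPHORES below (or equivalently by testing the
I915_SCHEDULER_CAP_SEMAPHORES bit returned for I915_PARAM_HAS_SCHEDULER):

    #include <stdbool.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    /* Hypothetical check: does the kernel expose HW semaphores for
     * inter-engine synchronisation on this device?
     */
    static bool has_hw_semaphores(int drm_fd)
    {
            int value = 0;
            struct drm_i915_getparam gp = {
                    .param = I915_PARAM_HAS_SEMAPHORES,
                    .value = &value,
            };

            return ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && value;
    }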

Testcase: igt/gem_exec_whisper
Testcase: igt/benchmarks/gem_wsim
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190301170901.8340-3-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_request.h
drivers/gpu/drm/i915/i915_sw_fence.c
drivers/gpu/drm/i915/i915_sw_fence.h
drivers/gpu/drm/i915/intel_engine_cs.c
drivers/gpu/drm/i915/intel_gpu_commands.h
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_ringbuffer.h
include/uapi/drm/i915_drm.h

index c6354f6cdbdb8b46247140ae38a12afc93d577eb..c08abdef5eb62c7e3aca272aea1f714d7bc33a39 100644 (file)
@@ -351,7 +351,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
                value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
                break;
        case I915_PARAM_HAS_SEMAPHORES:
-               value = 0;
+               value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
                break;
        case I915_PARAM_HAS_SECURE_BATCHES:
                value = capable(CAP_SYS_ADMIN);
index d354967d6ae8b5d4e0b9a0525390cb0de942ae6d..59e30b8c4ee91450d2a5dddcd9c9a332739c52ed 100644 (file)
@@ -22,8 +22,9 @@
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
 #include "i915_active.h"
 #include "i915_reset.h"
 
+struct execute_cb {
+       struct list_head link;
+       struct irq_work work;
+       struct i915_sw_fence *fence;
+};
+
 static struct i915_global_request {
        struct kmem_cache *slab_requests;
        struct kmem_cache *slab_dependencies;
+       struct kmem_cache *slab_execute_cbs;
 } global;
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
@@ -325,6 +333,69 @@ void i915_request_retire_upto(struct i915_request *rq)
        } while (tmp != rq);
 }
 
+static void irq_execute_cb(struct irq_work *wrk)
+{
+       struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+       i915_sw_fence_complete(cb->fence);
+       kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+       struct execute_cb *cb;
+
+       lockdep_assert_held(&rq->lock);
+
+       if (list_empty(&rq->execute_cb))
+               return;
+
+       list_for_each_entry(cb, &rq->execute_cb, link)
+               irq_work_queue(&cb->work);
+
+       /*
+        * XXX Rollback on __i915_request_unsubmit()
+        *
+        * In the future, perhaps when we have an active time-slicing scheduler,
+        * it will be interesting to unsubmit parallel execution and remove
+        * busywaits from the GPU until their master is restarted. This is
+        * quite hairy, we have to carefully rollback the fence and do a
+        * preempt-to-idle cycle on the target engine, all the while the
+        * master execute_cb may refire.
+        */
+       INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+                            struct i915_request *signal,
+                            gfp_t gfp)
+{
+       struct execute_cb *cb;
+
+       if (i915_request_is_active(signal))
+               return 0;
+
+       cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+       if (!cb)
+               return -ENOMEM;
+
+       cb->fence = &rq->submit;
+       i915_sw_fence_await(cb->fence);
+       init_irq_work(&cb->work, irq_execute_cb);
+
+       spin_lock_irq(&signal->lock);
+       if (i915_request_is_active(signal)) {
+               i915_sw_fence_complete(cb->fence);
+               kmem_cache_free(global.slab_execute_cbs, cb);
+       } else {
+               list_add_tail(&cb->link, &signal->execute_cb);
+       }
+       spin_unlock_irq(&signal->lock);
+
+       return 0;
+}
+
 static void move_to_timeline(struct i915_request *request,
                             struct i915_timeline *timeline)
 {
@@ -361,6 +432,8 @@ void __i915_request_submit(struct i915_request *request)
            !i915_request_enable_breadcrumb(request))
                intel_engine_queue_breadcrumbs(engine);
 
+       __notify_execute_cb(request);
+
        spin_unlock(&request->lock);
 
        engine->emit_fini_breadcrumb(request,
@@ -608,6 +681,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
        }
 
        INIT_LIST_HEAD(&rq->active_list);
+       INIT_LIST_HEAD(&rq->execute_cb);
 
        tl = ce->ring->timeline;
        ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -696,6 +770,52 @@ err_unreserve:
        return ERR_PTR(ret);
 }
 
+static int
+emit_semaphore_wait(struct i915_request *to,
+                   struct i915_request *from,
+                   gfp_t gfp)
+{
+       u32 hwsp_offset;
+       u32 *cs;
+       int err;
+
+       GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+       GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
+
+       /* We need to pin the signaler's HWSP until we are finished reading. */
+       err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
+       if (err)
+               return err;
+
+       /* Only submit our spinner after the signaler is running! */
+       err = i915_request_await_execution(to, from, gfp);
+       if (err)
+               return err;
+
+       cs = intel_ring_begin(to, 4);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
+
+       /*
+        * Using greater-than-or-equal here means we have to worry
+        * about seqno wraparound. To sidestep that issue, we swap
+        * the timeline HWSP upon wrapping, so that no one listening
+        * for the old (pre-wrap) values sees the much smaller
+        * (post-wrap) values they were not expecting (and so waits
+        * forever).
+        */
+       *cs++ = MI_SEMAPHORE_WAIT |
+               MI_SEMAPHORE_GLOBAL_GTT |
+               MI_SEMAPHORE_POLL |
+               MI_SEMAPHORE_SAD_GTE_SDD;
+       *cs++ = from->fence.seqno;
+       *cs++ = hwsp_offset;
+       *cs++ = 0;
+
+       intel_ring_advance(to, cs);
+       return 0;
+}
+
 static int
 i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
@@ -717,6 +837,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
                ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
                                                       &from->submit,
                                                       I915_FENCE_GFP);
+       } else if (intel_engine_has_semaphores(to->engine) &&
+                  to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+               ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
        } else {
                ret = i915_sw_fence_await_dma_fence(&to->submit,
                                                    &from->fence, 0,
@@ -1208,14 +1331,23 @@ int __init i915_global_request_init(void)
        if (!global.slab_requests)
                return -ENOMEM;
 
+       global.slab_execute_cbs = KMEM_CACHE(execute_cb,
+                                            SLAB_HWCACHE_ALIGN |
+                                            SLAB_RECLAIM_ACCOUNT |
+                                            SLAB_TYPESAFE_BY_RCU);
+       if (!global.slab_execute_cbs)
+               goto err_requests;
+
        global.slab_dependencies = KMEM_CACHE(i915_dependency,
                                              SLAB_HWCACHE_ALIGN |
                                              SLAB_RECLAIM_ACCOUNT);
        if (!global.slab_dependencies)
-               goto err_requests;
+               goto err_execute_cbs;
 
        return 0;
 
+err_execute_cbs:
+       kmem_cache_destroy(global.slab_execute_cbs);
 err_requests:
        kmem_cache_destroy(global.slab_requests);
        return -ENOMEM;
@@ -1224,11 +1356,13 @@ err_requests:
 void i915_global_request_shrink(void)
 {
        kmem_cache_shrink(global.slab_dependencies);
+       kmem_cache_shrink(global.slab_execute_cbs);
        kmem_cache_shrink(global.slab_requests);
 }
 
 void i915_global_request_exit(void)
 {
        kmem_cache_destroy(global.slab_dependencies);
+       kmem_cache_destroy(global.slab_execute_cbs);
        kmem_cache_destroy(global.slab_requests);
 }
index 09eaad06d2c6e0e9776b638b9e1543452fedbb65..4dd1dea1d1afe2d2e5c6bd8ae176a4ade03bb27a 100644 (file)
@@ -129,6 +129,7 @@ struct i915_request {
         */
        struct i915_sw_fence submit;
        wait_queue_entry_t submitq;
+       struct list_head execute_cb;
 
        /*
         * A list of everyone we wait upon, and everyone who waits upon us.
@@ -343,10 +344,27 @@ static inline bool __i915_request_has_started(const struct i915_request *rq)
  * i915_request_started - check if the request has begun being executed
  * @rq: the request
  *
- * Returns true if the request has been submitted to hardware, and the hardware
- * has advanced passed the end of the previous request and so should be either
- * currently processing the request (though it may be preempted and so
- * not necessarily the next request to complete) or have completed the request.
+ * If the timeline is not using initial breadcrumbs, a request is
+ * considered started if the previous request on its timeline (i.e.
+ * context) has been signaled.
+ *
+ * If the timeline is using semaphores, it will also be emitting an
+ * "initial breadcrumb" after the semaphores are complete and just before
+ * it began executing the user payload. A request can therefore be active
+ * on the HW and not yet started as it is still busywaiting on its
+ * dependencies (via HW semaphores).
+ *
+ * If the request has started, its dependencies will have been signaled
+ * (either by fences or by semaphores) and it will have begun processing
+ * the user payload.
+ *
+ * However, even if a request has started, it may have been preempted and
+ * so no longer active, or it may have already completed.
+ *
+ * See also i915_request_is_active().
+ *
+ * Returns true if the request has begun executing the user payload, or
+ * has completed.
  */
 static inline bool i915_request_started(const struct i915_request *rq)
 {
index 7c58b049ecb50a47bf06d0322d6077f6e06e955d..8d1400d378d7a7746ec08426d683b52e5ea066f2 100644 (file)
@@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
        __i915_sw_fence_notify(fence, FENCE_FREE);
 }
 
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
 {
        debug_fence_assert(fence);
 
@@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
        __i915_sw_fence_complete(fence, NULL);
 }
 
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
 {
        debug_fence_assert(fence);
        WARN_ON(atomic_inc_return(&fence->pending) <= 1);
index 0e055ea0179f321a4aafc8b95a667f8bbc8543f6..6dec9e1d11029888d11d3e250aa9644c7339430a 100644 (file)
@@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
                                    unsigned long timeout,
                                    gfp_t gfp);
 
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
 {
        return atomic_read(&fence->pending) <= 0;
index df8f88142f1dfb6d2b52f40f086dc5bf452fc457..4fc7e2ac6278127ae776a05038184fc91d4dd909 100644 (file)
@@ -616,6 +616,7 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
        } map[] = {
 #define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) }
                MAP(PREEMPTION, PREEMPTION),
+               MAP(SEMAPHORES, SEMAPHORES),
 #undef MAP
        };
        struct intel_engine_cs *engine;
index b96a31bc10809c44ecb17414174991758603dc7b..a34ece53a7712a6efe3e1b1db268238a09ece7f4 100644 (file)
 #define MI_SEMAPHORE_SIGNAL    MI_INSTR(0x1b, 0) /* GEN8+ */
 #define   MI_SEMAPHORE_TARGET(engine)  ((engine)<<15)
 #define MI_SEMAPHORE_WAIT      MI_INSTR(0x1c, 2) /* GEN8+ */
-#define   MI_SEMAPHORE_POLL            (1<<15)
-#define   MI_SEMAPHORE_SAD_GTE_SDD     (1<<12)
+#define   MI_SEMAPHORE_POLL            (1 << 15)
+#define   MI_SEMAPHORE_SAD_GT_SDD      (0 << 12)
+#define   MI_SEMAPHORE_SAD_GTE_SDD     (1 << 12)
+#define   MI_SEMAPHORE_SAD_LT_SDD      (2 << 12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD     (3 << 12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD      (4 << 12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD     (5 << 12)
 #define MI_STORE_DWORD_IMM     MI_INSTR(0x20, 1)
 #define MI_STORE_DWORD_IMM_GEN4        MI_INSTR(0x20, 2)
 #define   MI_MEM_VIRTUAL       (1 << 22) /* 945,g33,965 */
index 3fd0c45a29208f64a90ba1cd9302cab2c44171cd..6c9479acb4331b1ba72a171b03f51bd6cf4f1330 100644 (file)
@@ -2330,6 +2330,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
        engine->park = NULL;
        engine->unpark = NULL;
 
+       engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
        engine->flags |= I915_ENGINE_SUPPORTS_STATS;
        if (engine->i915->preempt_context)
                engine->flags |= I915_ENGINE_HAS_PREEMPTION;
index b8ec7e40a59bfc7e08714a6f6836da118131ecbe..2e7264119ec49d47f9bc49f03b1a5c391020a9e1 100644 (file)
@@ -499,6 +499,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
        unsigned int flags;
 
        /*
@@ -576,6 +577,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
        return engine->flags & I915_ENGINE_HAS_PREEMPTION;
 }
 
+static inline bool
+intel_engine_has_semaphores(const struct intel_engine_cs *engine)
+{
+       return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
+}
+
 void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
 
 static inline bool __execlists_need_preempt(int prio, int last)
index 8304a7f1ec3f98da958f8e9cc6cd8bba58035f30..b10eea3f6d24f2386f1396a988ad8c3988b1f9b7 100644 (file)
@@ -479,6 +479,7 @@ typedef struct drm_i915_irq_wait {
 #define   I915_SCHEDULER_CAP_ENABLED   (1ul << 0)
 #define   I915_SCHEDULER_CAP_PRIORITY  (1ul << 1)
 #define   I915_SCHEDULER_CAP_PREEMPTION        (1ul << 2)
+#define   I915_SCHEDULER_CAP_SEMAPHORES        (1ul << 3)
 
 #define I915_PARAM_HUC_STATUS           42