drm/i915: Defer breadcrumb emission
authorChris Wilson <chris@chris-wilson.co.uk>
Fri, 28 Oct 2016 12:58:52 +0000 (13:58 +0100)
committerChris Wilson <chris@chris-wilson.co.uk>
Fri, 28 Oct 2016 19:53:54 +0000 (20:53 +0100)
Move the actual emission of the breadcrumb for closing the request from
i915_add_request() to the submit callback. (It can be moved later when
required.) This allows us to defer the allocation of the global_seqno
from request construction to actual submission, allowing us to emit the
requests out of order (wrt to the order of their construction, they
still will only be executed one all of their dependencies are resolved
including that all earlier requests on their timeline have been
submitted.) We have to specialise how we then emit the request in order
to write into the preallocated space, rather than at the tail of the
ringbuffer (which will have been advanced by the addition of new
requests).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-29-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_gem_request.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index be9e23b32e4a3b1fd91abf8ed98715c6d8ac0d13..06daa4d203a7a2e64afeccbd9ec25eb513dd8b0f 100644 (file)
@@ -318,17 +318,16 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
                container_of(fence, typeof(*request), submit);
        struct intel_engine_cs *engine = request->engine;
 
+       if (state != FENCE_COMPLETE)
+               return NOTIFY_DONE;
+
        /* Will be called from irq-context when using foreign DMA fences */
 
-       switch (state) {
-       case FENCE_COMPLETE:
-               engine->timeline->last_submitted_seqno = request->fence.seqno;
-               engine->submit_request(request);
-               break;
+       engine->timeline->last_submitted_seqno = request->fence.seqno;
 
-       case FENCE_FREE:
-               break;
-       }
+       engine->emit_breadcrumb(request,
+                               request->ring->vaddr + request->postfix);
+       engine->submit_request(request);
 
        return NOTIFY_DONE;
 }
@@ -648,9 +647,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
        struct intel_ring *ring = request->ring;
        struct intel_timeline *timeline = request->timeline;
        struct drm_i915_gem_request *prev;
-       u32 request_start;
-       u32 reserved_tail;
-       int ret;
+       int err;
 
        lockdep_assert_held(&request->i915->drm.struct_mutex);
        trace_i915_gem_request_add(request);
@@ -660,8 +657,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
         * should already have been reserved in the ring buffer. Let the ring
         * know that it is time to use that space up.
         */
-       request_start = ring->tail;
-       reserved_tail = request->reserved_space;
        request->reserved_space = 0;
 
        /*
@@ -672,10 +667,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
         * what.
         */
        if (flush_caches) {
-               ret = engine->emit_flush(request, EMIT_FLUSH);
+               err = engine->emit_flush(request, EMIT_FLUSH);
 
                /* Not allowed to fail! */
-               WARN(ret, "engine->emit_flush() failed: %d!\n", ret);
+               WARN(err, "engine->emit_flush() failed: %d!\n", err);
        }
 
        /* Record the position of the start of the breadcrumb so that
@@ -683,20 +678,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
         * GPU processing the request, we never over-estimate the
         * position of the ring's HEAD.
         */
+       err = intel_ring_begin(request, engine->emit_breadcrumb_sz);
+       GEM_BUG_ON(err);
        request->postfix = ring->tail;
-
-       /* Not allowed to fail! */
-       ret = engine->emit_breadcrumb(request);
-       WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret);
-
-       /* Sanity check that the reserved size was large enough. */
-       ret = ring->tail - request_start;
-       if (ret < 0)
-               ret += ring->size;
-       WARN_ONCE(ret > reserved_tail,
-                 "Not enough space reserved (%d bytes) "
-                 "for adding the request (%d bytes)\n",
-                 reserved_tail, ret);
+       ring->tail += engine->emit_breadcrumb_sz * sizeof(u32);
 
        /* Seal the request and mark it as pending execution. Note that
         * we may inspect this state, without holding any locks, during
index 8229baebb2b3e95ccb5ad32e76dd99bc3be31f66..fa3012c342cc47ea7ebc82d1caaa00684e91aca6 100644 (file)
@@ -365,7 +365,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
        struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
        u32 *reg_state = ce->lrc_reg_state;
 
-       reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
+       reg_state[CTX_RING_TAIL+1] = rq->tail;
 
        /* True 32b PPGTT with dynamic page allocation: update PDP
         * registers and point the unallocated PDPs to scratch page.
@@ -599,6 +599,15 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 
        spin_lock_irqsave(&engine->execlist_lock, flags);
 
+       /* We keep the previous context alive until we retire the following
+        * request. This ensures that any the context object is still pinned
+        * for any residual writes the HW makes into it on the context switch
+        * into the next object following the breadcrumb. Otherwise, we may
+        * retire the context too early.
+        */
+       request->previous_context = engine->last_context;
+       engine->last_context = request->ctx;
+
        list_add_tail(&request->execlist_link, &engine->execlist_queue);
        if (execlists_elsp_idle(engine))
                tasklet_hi_schedule(&engine->irq_tasklet);
@@ -671,46 +680,6 @@ err_unpin:
        return ret;
 }
 
-/*
- * intel_logical_ring_advance() - advance the tail and prepare for submission
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
-{
-       struct intel_ring *ring = request->ring;
-       struct intel_engine_cs *engine = request->engine;
-
-       intel_ring_advance(ring);
-       request->tail = ring->tail;
-
-       /*
-        * Here we add two extra NOOPs as padding to avoid
-        * lite restore of a context with HEAD==TAIL.
-        *
-        * Caller must reserve WA_TAIL_DWORDS for us!
-        */
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-       request->wa_tail = ring->tail;
-
-       /* We keep the previous context alive until we retire the following
-        * request. This ensures that any the context object is still pinned
-        * for any residual writes the HW makes into it on the context switch
-        * into the next object following the breadcrumb. Otherwise, we may
-        * retire the context too early.
-        */
-       request->previous_context = engine->last_context;
-       engine->last_context = request->ctx;
-       return 0;
-}
-
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
                                struct intel_engine_cs *engine)
 {
@@ -1566,41 +1535,35 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
  * used as a workaround for not being allowed to do lite
  * restore with HEAD==TAIL (WaIdleLiteRestore).
  */
-
-static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request)
+static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out)
 {
-       struct intel_ring *ring = request->ring;
-       int ret;
-
-       ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-       if (ret)
-               return ret;
+       *out++ = MI_NOOP;
+       *out++ = MI_NOOP;
+       request->wa_tail = intel_ring_offset(request->ring, out);
+}
 
+static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request,
+                                u32 *out)
+{
        /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-       intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-       intel_ring_emit(ring,
-                       intel_hws_seqno_address(request->engine) |
-                       MI_FLUSH_DW_USE_GTT);
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, request->global_seqno);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       return intel_logical_ring_advance(request);
+       *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+       *out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
+       *out++ = 0;
+       *out++ = request->global_seqno;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
+       request->tail = intel_ring_offset(request->ring, out);
+
+       gen8_emit_wa_tail(request, out);
 }
 
 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
-static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request)
+static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
+                                       u32 *out)
 {
-       struct intel_ring *ring = request->ring;
-       int ret;
-
-       ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-       if (ret)
-               return ret;
-
        /* We're using qword write, seqno should be aligned to 8 bytes. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
 
@@ -1608,19 +1571,20 @@ static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request)
         * need a prior CS_STALL, which is emitted by the flush
         * following the batch.
         */
-       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-       intel_ring_emit(ring,
-                       (PIPE_CONTROL_GLOBAL_GTT_IVB |
-                        PIPE_CONTROL_CS_STALL |
-                        PIPE_CONTROL_QW_WRITE));
-       intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, request->global_seqno);
+       *out++ = GFX_OP_PIPE_CONTROL(6);
+       *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+                 PIPE_CONTROL_CS_STALL |
+                 PIPE_CONTROL_QW_WRITE);
+       *out++ = intel_hws_seqno_address(request->engine);
+       *out++ = 0;
+       *out++ = request->global_seqno;
        /* We're thrashing one dword of HWS. */
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       return intel_logical_ring_advance(request);
+       *out++ = 0;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
+       request->tail = intel_ring_offset(request->ring, out);
+
+       gen8_emit_wa_tail(request, out);
 }
 
 static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
index ae9cf6bb4def476c0d27c5985d7f6a8119304fc8..16244775b9d104b321f952cc6c4a0adaad21057f 100644 (file)
@@ -1213,90 +1213,62 @@ static void render_ring_cleanup(struct intel_engine_cs *engine)
        i915_vma_unpin_and_release(&dev_priv->semaphore);
 }
 
-static int gen8_rcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *waiter;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, (num_rings-1) * 8);
-       if (ret)
-               return ret;
 
        for_each_engine(waiter, dev_priv, id) {
                u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-               intel_ring_emit(ring,
-                               PIPE_CONTROL_GLOBAL_GTT_IVB |
-                               PIPE_CONTROL_QW_WRITE |
-                               PIPE_CONTROL_CS_STALL);
-               intel_ring_emit(ring, lower_32_bits(gtt_offset));
-               intel_ring_emit(ring, upper_32_bits(gtt_offset));
-               intel_ring_emit(ring, req->global_seqno);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring,
-                               MI_SEMAPHORE_SIGNAL |
-                               MI_SEMAPHORE_TARGET(waiter->hw_id));
-               intel_ring_emit(ring, 0);
+               *out++ = GFX_OP_PIPE_CONTROL(6);
+               *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+                         PIPE_CONTROL_QW_WRITE |
+                         PIPE_CONTROL_CS_STALL);
+               *out++ = lower_32_bits(gtt_offset);
+               *out++ = upper_32_bits(gtt_offset);
+               *out++ = req->global_seqno;
+               *out++ = 0;
+               *out++ = (MI_SEMAPHORE_SIGNAL |
+                         MI_SEMAPHORE_TARGET(waiter->hw_id));
+               *out++ = 0;
        }
-       intel_ring_advance(ring);
 
-       return 0;
+       return out;
 }
 
-static int gen8_xcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *waiter;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, (num_rings-1) * 6);
-       if (ret)
-               return ret;
 
        for_each_engine(waiter, dev_priv, id) {
                u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               intel_ring_emit(ring,
-                               (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-               intel_ring_emit(ring,
-                               lower_32_bits(gtt_offset) |
-                               MI_FLUSH_DW_USE_GTT);
-               intel_ring_emit(ring, upper_32_bits(gtt_offset));
-               intel_ring_emit(ring, req->global_seqno);
-               intel_ring_emit(ring,
-                               MI_SEMAPHORE_SIGNAL |
-                               MI_SEMAPHORE_TARGET(waiter->hw_id));
-               intel_ring_emit(ring, 0);
+               *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+               *out++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+               *out++ = upper_32_bits(gtt_offset);
+               *out++ = req->global_seqno;
+               *out++ = (MI_SEMAPHORE_SIGNAL |
+                         MI_SEMAPHORE_TARGET(waiter->hw_id));
+               *out++ = 0;
        }
-       intel_ring_advance(ring);
 
-       return 0;
+       return out;
 }
 
-static int gen6_signal(struct drm_i915_gem_request *req)
+static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
-       if (ret)
-               return ret;
+       int num_rings = 0;
 
        for_each_engine(engine, dev_priv, id) {
                i915_reg_t mbox_reg;
@@ -1306,46 +1278,34 @@ static int gen6_signal(struct drm_i915_gem_request *req)
 
                mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
                if (i915_mmio_reg_valid(mbox_reg)) {
-                       intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-                       intel_ring_emit_reg(ring, mbox_reg);
-                       intel_ring_emit(ring, req->global_seqno);
+                       *out++ = MI_LOAD_REGISTER_IMM(1);
+                       *out++ = i915_mmio_reg_offset(mbox_reg);
+                       *out++ = req->global_seqno;
+                       num_rings++;
                }
        }
+       if (num_rings & 1)
+               *out++ = MI_NOOP;
 
-       /* If num_dwords was rounded, make sure the tail pointer is correct */
-       if (num_rings % 2 == 0)
-               intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-
-       return 0;
+       return out;
 }
 
 static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
        struct drm_i915_private *dev_priv = request->i915;
 
-       I915_WRITE_TAIL(request->engine,
-                       intel_ring_offset(request->ring, request->tail));
+       I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                u32 *out)
 {
-       struct intel_ring *ring = req->ring;
-       int ret;
-
-       ret = intel_ring_begin(req, 4);
-       if (ret)
-               return ret;
-
-       intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-       intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-       intel_ring_emit(ring, req->global_seqno);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_advance(ring);
-
-       req->tail = ring->tail;
+       *out++ = MI_STORE_DWORD_INDEX;
+       *out++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+       *out++ = req->global_seqno;
+       *out++ = MI_USER_INTERRUPT;
 
-       return 0;
+       req->tail = intel_ring_offset(req->ring, out);
 }
 
 static const int i9xx_emit_breadcrumb_sz = 4;
@@ -1358,49 +1318,34 @@ static const int i9xx_emit_breadcrumb_sz = 4;
  * Update the mailbox registers in the *other* rings with the current seqno.
  * This acts like a signal in the canonical semaphore.
  */
-static int gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                     u32 *out)
 {
-       int ret;
-
-       ret = req->engine->semaphore.signal(req);
-       if (ret)
-               return ret;
-
-       return i9xx_emit_breadcrumb(req);
+       return i9xx_emit_breadcrumb(req,
+                                   req->engine->semaphore.signal(req, out));
 }
 
-static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                       u32 *out)
 {
        struct intel_engine_cs *engine = req->engine;
-       struct intel_ring *ring = req->ring;
-       int ret;
 
-       if (engine->semaphore.signal) {
-               ret = engine->semaphore.signal(req);
-               if (ret)
-                       return ret;
-       }
-
-       ret = intel_ring_begin(req, 8);
-       if (ret)
-               return ret;
+       if (engine->semaphore.signal)
+               out = engine->semaphore.signal(req, out);
 
-       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-       intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
+       *out++ = GFX_OP_PIPE_CONTROL(6);
+       *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
                               PIPE_CONTROL_CS_STALL |
-                              PIPE_CONTROL_QW_WRITE));
-       intel_ring_emit(ring, intel_hws_seqno_address(engine));
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, req->global_seqno);
+                              PIPE_CONTROL_QW_WRITE);
+       *out++ = intel_hws_seqno_address(engine);
+       *out++ = 0;
+       *out++ = req->global_seqno;
        /* We're thrashing one dword of HWS. */
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-
-       req->tail = ring->tail;
+       *out++ = 0;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
 
-       return 0;
+       req->tail = intel_ring_offset(req->ring, out);
 }
 
 static const int gen8_render_emit_breadcrumb_sz = 8;
index 7b7aaafac0dac18cf1ca9926c200255bec817e94..9d228bee3511ed8347c5fb6ee7c7bd940b4ac0b3 100644 (file)
@@ -255,7 +255,8 @@ struct intel_engine_cs {
 #define I915_DISPATCH_SECURE BIT(0)
 #define I915_DISPATCH_PINNED BIT(1)
 #define I915_DISPATCH_RS     BIT(2)
-       int             (*emit_breadcrumb)(struct drm_i915_gem_request *req);
+       void            (*emit_breadcrumb)(struct drm_i915_gem_request *req,
+                                          u32 *out);
        int             emit_breadcrumb_sz;
 
        /* Pass the request to the hardware queue (e.g. directly into
@@ -331,7 +332,7 @@ struct intel_engine_cs {
                /* AKA wait() */
                int     (*sync_to)(struct drm_i915_gem_request *req,
                                   struct drm_i915_gem_request *signal);
-               int     (*signal)(struct drm_i915_gem_request *req);
+               u32     *(*signal)(struct drm_i915_gem_request *req, u32 *out);
        } semaphore;
 
        /* Execlists */
@@ -487,10 +488,11 @@ static inline void intel_ring_advance(struct intel_ring *ring)
         */
 }
 
-static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
+static inline u32 intel_ring_offset(struct intel_ring *ring, void *addr)
 {
        /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */
-       return value & (ring->size - 1);
+       u32 offset = addr - ring->vaddr;
+       return offset & (ring->size - 1);
 }
 
 int __intel_ring_space(int head, int tail, int size);