drm/i915: Bump ready tasks ahead of busywaits
author Chris Wilson <chris@chris-wilson.co.uk>
Tue, 9 Apr 2019 15:29:22 +0000 (16:29 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Thu, 11 Apr 2019 06:14:27 +0000 (07:14 +0100)
Consider two tasks that are running in parallel on a pair of engines
(vcs0, vcs1), but then must complete on a shared engine (rcs0). To
maximise throughput, we want to run the first ready task on rcs0 (i.e.
the first task that completes on either of vcs0 or vcs1). When using
semaphores, however, we will instead queue onto rcs in submission order.

To resolve this incorrect ordering, we want to re-evaluate the priority
queue when each of the requests is ready. Normally this happens because
we only insert into the priority queue requests that are ready, but with
semaphores we are inserting ahead of their readiness and to compensate
we penalize those tasks with reduced priority (so that tasks that do not
need to busywait should naturally be run first). However, given a series
of tasks that each use semaphores, the queue degrades into submission
fifo rather than readiness fifo, and so to counter this we give a small
boost to semaphore users as the tasks they depend upon are completed (and
so we no longer require any busywait prior to running the user task, as it
is then ready itself).

v2: Fixup irqsave for schedule_lock (Tvrtko)

Testcase: igt/gem_exec_schedule/semaphore-codependency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Dmitry Ermilov <dmitry.ermilov@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190409152922.23894-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_request.h
drivers/gpu/drm/i915/i915_scheduler.c

index 96a9e8bcd80567150c9b1d7bd98a5952ce4f0e04..a7d87cfaabcbe0ab4b5d85929ca5a85c3f47abe9 100644 (file)
@@ -552,6 +552,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
        return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+       struct i915_request *request =
+               container_of(fence, typeof(*request), semaphore);
+
+       switch (state) {
+       case FENCE_COMPLETE:
+               /*
+                * We only check a small portion of our dependencies
+                * and so cannot guarantee that there remains no
+                * semaphore chain across all. Instead of opting
+                * for the full NOSEMAPHORE boost, we go for the
+                * smaller (but still preempting) boost of
+                * NEWCLIENT. This will be enough to boost over
+                * a busywaiting request (as that cannot be
+                * NEWCLIENT) without accidentally boosting
+                * a busywait over real work elsewhere.
+                */
+               i915_schedule_bump_priority(request, I915_PRIORITY_NEWCLIENT);
+               break;
+
+       case FENCE_FREE:
+               i915_request_put(request);
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
 static void ring_retire_requests(struct intel_ring *ring)
 {
        struct i915_request *rq, *rn;
@@ -702,6 +732,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 
        /* We bump the ref for the fence chain */
        i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
+       i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);
 
        i915_sched_node_init(&rq->sched);
 
@@ -784,6 +815,12 @@ emit_semaphore_wait(struct i915_request *to,
                                                     &from->fence, 0,
                                                     I915_FENCE_GFP);
 
+       err = i915_sw_fence_await_dma_fence(&to->semaphore,
+                                           &from->fence, 0,
+                                           I915_FENCE_GFP);
+       if (err < 0)
+               return err;
+
        /* We need to pin the signaler's HWSP until we are finished reading. */
        err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
        if (err)
@@ -1114,6 +1151,7 @@ void i915_request_add(struct i915_request *request)
         * run at the earliest possible convenience.
         */
        local_bh_disable();
+       i915_sw_fence_commit(&request->semaphore);
        rcu_read_lock(); /* RCU serialisation for set-wedged protection */
        if (engine->schedule) {
                struct i915_sched_attr attr = request->gem_context->sched;
@@ -1320,7 +1358,9 @@ long i915_request_wait(struct i915_request *rq,
        if (flags & I915_WAIT_PRIORITY) {
                if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
                        gen6_rps_boost(rq);
+               local_bh_disable(); /* suspend tasklets for reprioritisation */
                i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
+               local_bh_enable(); /* kick tasklets en masse */
        }
 
        wait.tsk = current;
index 875be6f714125914d247483d699a3122e8df0a85..a982664618c2c977d89875dcc1dc4dd575e31106 100644 (file)
@@ -143,6 +143,7 @@ struct i915_request {
                struct i915_sw_dma_fence_cb dmaq;
        };
        struct list_head execute_cb;
+       struct i915_sw_fence semaphore;
 
        /*
         * A list of everyone we wait upon, and everyone who waits upon us.
index 77dbf7d74e12843f5b6ef9a1bbd8dfe06208bb9d..39bc4f54e2720e28aea474d3e7ead7e48c6afe43 100644 (file)
@@ -64,7 +64,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
 {
        bool ret = false;
 
-       spin_lock(&schedule_lock);
+       spin_lock_irq(&schedule_lock);
 
        if (!node_signaled(signal)) {
                INIT_LIST_HEAD(&dep->dfs_link);
@@ -81,7 +81,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
                ret = true;
        }
 
-       spin_unlock(&schedule_lock);
+       spin_unlock_irq(&schedule_lock);
 
        return ret;
 }
@@ -108,7 +108,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
 
        GEM_BUG_ON(!list_empty(&node->link));
 
-       spin_lock(&schedule_lock);
+       spin_lock_irq(&schedule_lock);
 
        /*
         * Everyone we depended upon (the fences we wait to be signaled)
@@ -135,7 +135,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
                        i915_dependency_free(dep);
        }
 
-       spin_unlock(&schedule_lock);
+       spin_unlock_irq(&schedule_lock);
 }
 
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
@@ -356,7 +356,7 @@ static void __i915_schedule(struct i915_request *rq,
 
        memset(&cache, 0, sizeof(cache));
        engine = rq->engine;
-       spin_lock_irq(&engine->timeline.lock);
+       spin_lock(&engine->timeline.lock);
 
        /* Fifo and depth-first replacement ensure our deps execute before us */
        list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
@@ -407,32 +407,33 @@ static void __i915_schedule(struct i915_request *rq,
                tasklet_hi_schedule(&engine->execlists.tasklet);
        }
 
-       spin_unlock_irq(&engine->timeline.lock);
+       spin_unlock(&engine->timeline.lock);
 }
 
 void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
 {
-       spin_lock(&schedule_lock);
+       spin_lock_irq(&schedule_lock);
        __i915_schedule(rq, attr);
-       spin_unlock(&schedule_lock);
+       spin_unlock_irq(&schedule_lock);
 }
 
 void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
 {
        struct i915_sched_attr attr;
+       unsigned long flags;
 
        GEM_BUG_ON(bump & ~I915_PRIORITY_MASK);
 
        if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID)
                return;
 
-       spin_lock_bh(&schedule_lock);
+       spin_lock_irqsave(&schedule_lock, flags);
 
        attr = rq->sched.attr;
        attr.priority |= bump;
        __i915_schedule(rq, &attr);
 
-       spin_unlock_bh(&schedule_lock);
+       spin_unlock_irqrestore(&schedule_lock, flags);
 }
 
 void __i915_priolist_free(struct i915_priolist *p)