From 253a774bb08b549488047c3fb4afc0faf6f194ae Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 18 Oct 2019 08:20:27 +0100
Subject: [PATCH] drm/i915/execlists: Don't merely skip submission if maybe
 timeslicing

Normally, we try and skip submission if ELSP[1] is filled. However, we
may desire to enable timeslicing due to the queue priority, even if
ELSP[1] itself does not require timeslicing. That is the queue is equal
priority to ELSP[0] and higher priority then ELSP[1]. Previously, we
would wait until the context switch to preempt the current ELSP[1], but
with timeslicing, we want to preempt ELSP[0] and replace it with the
queue.

In writing the test case, it become quickly apparent that we were also
suppressing the tasklet during promotion and so failing to notice when
the queue started requiring timeslicing.

Fixes: 2229adc81380 ("drm/i915/execlist: Trim immediate timeslice expiry")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191018072027.31948-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/gt/intel_lrc.c    |  22 +++-
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 168 ++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_scheduler.c  |  17 ++-
 drivers/gpu/drm/i915/i915_scheduler.h  |  18 ---
 4 files changed, 196 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e9fe9f79cedd..d0088d020220 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -352,10 +352,15 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
 	 * However, the priority hint is a mere hint that we may need to
 	 * preempt. If that hint is stale or we may be trying to preempt
 	 * ourselves, ignore the request.
+	 *
+	 * More naturally we would write
+	 *      prio >= max(0, last);
+	 * except that we wish to prevent triggering preemption at the same
+	 * priority level: the task that is running should remain running
+	 * to preserve FIFO ordering of dependencies.
 	 */
-	last_prio = effective_prio(rq);
-	if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
-					 last_prio))
+	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
+	if (engine->execlists.queue_priority_hint <= last_prio)
 		return false;
 
 	/*
@@ -1509,8 +1514,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			 * submission.
 			 */
 			if (!list_is_last(&last->sched.link,
-					  &engine->active.requests))
+					  &engine->active.requests)) {
+				/*
+				 * Even if ELSP[1] is occupied and not worthy
+				 * of timeslices, our queue might be.
+				 */
+				if (!execlists->timer.expires &&
+				    need_timeslice(engine, last))
+					mod_timer(&execlists->timer,
+						  jiffies + 1);
 				return;
+			}
 
 			/*
 			 * WaIdleLiteRestore:bdw,skl
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 007d6203fe0e..5dc679781a08 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -325,7 +325,13 @@ semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx)
 	if (IS_ERR(rq))
 		goto out_ctx;
 
-	err = emit_semaphore_chain(rq, vma, idx);
+	err = 0;
+	if (rq->engine->emit_init_breadcrumb)
+		err = rq->engine->emit_init_breadcrumb(rq);
+	if (err == 0)
+		err = emit_semaphore_chain(rq, vma, idx);
+	if (err == 0)
+		i915_request_get(rq);
 	i915_request_add(rq);
 	if (err)
 		rq = ERR_PTR(err);
@@ -338,10 +344,10 @@ out_ctx:
 static int
 release_queue(struct intel_engine_cs *engine,
 	      struct i915_vma *vma,
-	      int idx)
+	      int idx, int prio)
 {
 	struct i915_sched_attr attr = {
-		.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
+		.priority = prio,
 	};
 	struct i915_request *rq;
 	u32 *cs;
@@ -362,9 +368,15 @@ release_queue(struct intel_engine_cs *engine,
 	*cs++ = 1;
 
 	intel_ring_advance(rq, cs);
+
+	i915_request_get(rq);
 	i915_request_add(rq);
 
+	local_bh_disable();
 	engine->schedule(rq, &attr);
+	local_bh_enable(); /* kick tasklet */
+
+	i915_request_put(rq);
 
 	return 0;
 }
@@ -383,7 +395,6 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
-	i915_request_get(head);
 	for_each_engine(engine, outer->gt, id) {
 		for (i = 0; i < count; i++) {
 			struct i915_request *rq;
@@ -393,10 +404,12 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
 				err = PTR_ERR(rq);
 				goto out;
 			}
+
+			i915_request_put(rq);
 		}
 	}
 
-	err = release_queue(outer, vma, n);
+	err = release_queue(outer, vma, n, INT_MAX);
 	if (err)
 		goto out;
 
@@ -482,6 +495,150 @@ err_obj:
 	return err;
 }
 
+static struct i915_request *nop_request(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq))
+		return rq;
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+
+	return rq;
+}
+
+static void wait_for_submit(struct intel_engine_cs *engine,
+			    struct i915_request *rq)
+{
+	do {
+		cond_resched();
+		intel_engine_flush_submission(engine);
+	} while (!i915_request_is_active(rq));
+}
+
+static int live_timeslice_queue(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct drm_i915_gem_object *obj;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	struct i915_vma *vma;
+	void *vaddr;
+	int err = 0;
+
+	/*
+	 * Make sure that even if ELSP[0] and ELSP[1] are filled with
+	 * timeslicing between them disabled, we *do* enable timeslicing
+	 * if the queue demands it. (Normally, we do not submit if
+	 * ELSP[1] is already occupied, so must rely on timeslicing to
+	 * eject ELSP[0] in favour of the queue.)
+	 */
+
+	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto err_obj;
+	}
+
+	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC);
+	if (IS_ERR(vaddr)) {
+		err = PTR_ERR(vaddr);
+		goto err_obj;
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		goto err_map;
+
+	for_each_engine(engine, gt, id) {
+		struct i915_sched_attr attr = {
+			.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
+		};
+		struct i915_request *rq, *nop;
+
+		if (!intel_engine_has_preemption(engine))
+			continue;
+
+		memset(vaddr, 0, PAGE_SIZE);
+
+		/* ELSP[0]: semaphore wait */
+		rq = semaphore_queue(engine, vma, 0);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_pin;
+		}
+		engine->schedule(rq, &attr);
+		wait_for_submit(engine, rq);
+
+		/* ELSP[1]: nop request */
+		nop = nop_request(engine);
+		if (IS_ERR(nop)) {
+			err = PTR_ERR(nop);
+			i915_request_put(rq);
+			goto err_pin;
+		}
+		wait_for_submit(engine, nop);
+		i915_request_put(nop);
+
+		GEM_BUG_ON(i915_request_completed(rq));
+		GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
+
+		/* Queue: semaphore signal, matching priority as semaphore */
+		err = release_queue(engine, vma, 1, effective_prio(rq));
+		if (err) {
+			i915_request_put(rq);
+			goto err_pin;
+		}
+
+		intel_engine_flush_submission(engine);
+		if (!READ_ONCE(engine->execlists.timer.expires) &&
+		    !i915_request_completed(rq)) {
+			struct drm_printer p =
+				drm_info_printer(gt->i915->drm.dev);
+
+			GEM_TRACE_ERR("%s: Failed to enable timeslicing!\n",
+				      engine->name);
+			intel_engine_dump(engine, &p,
+					  "%s\n", engine->name);
+			GEM_TRACE_DUMP();
+
+			memset(vaddr, 0xff, PAGE_SIZE);
+			err = -EINVAL;
+		}
+
+		/* Timeslice every jiffie, so within 2 we should signal */
+		if (i915_request_wait(rq, 0, 3) < 0) {
+			struct drm_printer p =
+				drm_info_printer(gt->i915->drm.dev);
+
+			pr_err("%s: Failed to timeslice into queue\n",
+			       engine->name);
+			intel_engine_dump(engine, &p,
+					  "%s\n", engine->name);
+
+			memset(vaddr, 0xff, PAGE_SIZE);
+			err = -EIO;
+		}
+		i915_request_put(rq);
+		if (err)
+			break;
+	}
+
+err_pin:
+	i915_vma_unpin(vma);
+err_map:
+	i915_gem_object_unpin_map(obj);
+err_obj:
+	i915_gem_object_put(obj);
+	return err;
+}
+
 static int live_busywait_preempt(void *arg)
 {
 	struct intel_gt *gt = arg;
@@ -2437,6 +2594,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_unlite_switch),
 		SUBTEST(live_unlite_preempt),
 		SUBTEST(live_timeslice_preempt),
+		SUBTEST(live_timeslice_queue),
 		SUBTEST(live_busywait_preempt),
 		SUBTEST(live_preempt),
 		SUBTEST(live_late_preempt),
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 7b84ebca2901..0ca40f6bf08c 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -177,9 +177,22 @@ static inline int rq_prio(const struct i915_request *rq)
 	return rq->sched.attr.priority | __NO_PREEMPTION;
 }
 
+static inline bool need_preempt(int prio, int active)
+{
+	/*
+	 * Allow preemption of low -> normal -> high, but we do
+	 * not allow low priority tasks to preempt other low priority
+	 * tasks under the impression that latency for low priority
+	 * tasks does not matter (as much as background throughput),
+	 * so kiss.
+	 */
+	return prio >= max(I915_PRIORITY_NORMAL, active);
+}
+
 static void kick_submission(struct intel_engine_cs *engine, int prio)
 {
-	const struct i915_request *inflight = *engine->execlists.active;
+	const struct i915_request *inflight =
+		execlists_active(&engine->execlists);
 
 	/*
 	 * If we are already the currently executing context, don't
@@ -188,7 +201,7 @@ static void kick_submission(struct intel_engine_cs *engine, int prio)
 	 * tasklet, i.e. we have not change the priority queue
 	 * sufficiently to oust the running context.
 	 */
-	if (!inflight || !i915_scheduler_need_preempt(prio, rq_prio(inflight)))
+	if (!inflight || !need_preempt(prio, rq_prio(inflight)))
 		return;
 
 	tasklet_hi_schedule(&engine->execlists.tasklet);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 7eefccff39bf..07d243acf553 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -52,22 +52,4 @@ static inline void i915_priolist_free(struct i915_priolist *p)
 		__i915_priolist_free(p);
 }
 
-static inline bool i915_scheduler_need_preempt(int prio, int active)
-{
-	/*
-	 * Allow preemption of low -> normal -> high, but we do
-	 * not allow low priority tasks to preempt other low priority
-	 * tasks under the impression that latency for low priority
-	 * tasks does not matter (as much as background throughput),
-	 * so kiss.
-	 *
-	 * More naturally we would write
-	 *	prio >= max(0, last);
-	 * except that we wish to prevent triggering preemption at the same
-	 * priority level: the task that is running should remain running
-	 * to preserve FIFO ordering of dependencies.
-	 */
-	return prio > max(I915_PRIORITY_NORMAL - 1, active);
-}
-
 #endif /* _I915_SCHEDULER_H_ */
-- 
2.30.2