drm/sched: Add boolean to mark if sched is ready to work v5
author Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Thu, 18 Oct 2018 16:32:46 +0000 (12:32 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 5 Nov 2018 19:21:22 +0000 (14:21 -0500)
Problem:
A particular scheduler may become unusable (e.g. its underlying HW
fails) after some event such as a GPU reset. If that scheduler is
later chosen by the get-free-sched policy, command submission will
fail.

Fix:
Add a driver-specific callback to report the scheduler's status, so
that a run queue with a bad scheduler can be avoided in favor of a
working one, or none at all, in which case job init will fail.
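For illustration, a driver would typically clear the flag once its
hardware becomes unusable, e.g. after a failed GPU reset. A minimal
hypothetical sketch, not part of this patch (foo_device, foo_hw_reinit
and the ring layout are made-up names):

	/* Hypothetical recovery path: if the HW cannot be revived,
	 * mark the ring's scheduler as not ready so that rq selection
	 * skips it and drm_sched_job_init() fails with -ENOENT.
	 */
	static void foo_handle_gpu_reset(struct foo_device *fdev)
	{
		if (foo_hw_reinit(fdev))	/* reset failed */
			fdev->ring.sched.ready = false;
	}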

v2: Switch from driver callback to flag in scheduler.

v3: rebase

v4: Remove the ready parameter from drm_sched_init; set the flag
unconditionally to true once init is done.

v5: Fix the v3d change that was missed in v4 (Alex).

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/scheduler/sched_entity.c
drivers/gpu/drm/scheduler/sched_main.c
include/drm/gpu_scheduler.h

index 3e22a54a99c25b52c62ece421596695faf3d3489..ba54c30a466e05e2e31ea8bf938c24cb07b1a180 100644 (file)
@@ -130,7 +130,14 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
        int i;
 
        for (i = 0; i < entity->num_rq_list; ++i) {
-               num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
+               struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
+
+               if (!sched->ready) {
+                       DRM_WARN("sched %s is not ready, skipping\n", sched->name);
+                       continue;
+               }
+
+               num_jobs = atomic_read(&sched->num_jobs);
                if (num_jobs < min_jobs) {
                        min_jobs = num_jobs;
                        rq = entity->rq_list[i];
index 63b997d9c5620398181510eae50959249f1e9ab0..6b2fd49334f7c3952b81376b4533970a0128597a 100644 (file)
@@ -420,6 +420,9 @@ int drm_sched_job_init(struct drm_sched_job *job,
        struct drm_gpu_scheduler *sched;
 
        drm_sched_entity_select_rq(entity);
+       if (!entity->rq)
+               return -ENOENT;
+
        sched = entity->rq->sched;
 
        job->sched = sched;
@@ -633,6 +636,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
                return PTR_ERR(sched->thread);
        }
 
+       sched->ready = true;
        return 0;
 }
 EXPORT_SYMBOL(drm_sched_init);
@@ -648,5 +652,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 {
        if (sched->thread)
                kthread_stop(sched->thread);
+
+       sched->ready = false;
 }
 EXPORT_SYMBOL(drm_sched_fini);
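With this change drm_sched_job_init() can return -ENOENT when every
scheduler in the entity's rq list has been marked not ready, so
submission paths must check its return value. A hypothetical caller
sketch (foo_job and foo_submit are made-up names; the push_job call
uses the two-argument signature from this point in time):

	int foo_submit(struct foo_job *fjob, struct drm_sched_entity *entity)
	{
		int r;

		/* Fails with -ENOENT if no ready scheduler is left
		 * in the entity's rq list.
		 */
		r = drm_sched_job_init(&fjob->base, entity, fjob);
		if (r)
			return r;

		drm_sched_entity_push_job(&fjob->base, entity);
		return 0;
	}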
index 0684dcd99c0feb26735e28fc06c65ad6ab6400d6..4ae192a21c3fe9a9e85871edff8ed185788e271d 100644 (file)
@@ -264,6 +264,7 @@ struct drm_sched_backend_ops {
  * @hang_limit: once the hangs by a job crosses this limit then it is marked
  *              guilty and it will be considered for scheduling further.
  * @num_jobs: the number of jobs in queue in the scheduler
+ * @ready: marks if the underlying HW is ready to work
  *
  * One scheduler is implemented for each hardware ring.
  */
@@ -283,12 +284,14 @@ struct drm_gpu_scheduler {
        spinlock_t                      job_list_lock;
        int                             hang_limit;
        atomic_t                        num_jobs;
+       bool                            ready;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
                   const struct drm_sched_backend_ops *ops,
                   uint32_t hw_submission, unsigned hang_limit, long timeout,
                   const char *name);
+
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
                       struct drm_sched_entity *entity,