From 73ea648d921ed5c58895c8592321456fe12b697f Mon Sep 17 00:00:00 2001 From: Shaoyun Liu Date: Wed, 11 Jul 2018 22:32:58 -0400 Subject: [PATCH] drm/amdkfd: Implement hang detection in KFD and call amdgpu MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The reset will be performed in a new hw_exception work thread to handle HWS hang without blocking the thread that detected the hang. Signed-off-by: Shaoyun Liu Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Oded Gabbay --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 21 ++++++++++++++++++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 ++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 44fc2038770e..6b59eab39fbe 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, static void deallocate_sdma_queue(struct device_queue_manager *dqm, unsigned int sdma_queue_id); +static void kfd_process_hw_exception(struct work_struct *work); + static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { @@ -1010,6 +1012,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->active_runlist = false; dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); + return 0; } @@ -1042,6 +1046,8 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); dqm_lock(dqm); + /* clear hang status when driver try to start the hw scheduler */ + dqm->is_hws_hang = false; execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); dqm_unlock(dqm); @@ -1255,6 +1261,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { int retval = 0; + if (dqm->is_hws_hang) + return -EIO; if (!dqm->active_runlist) return retval; @@ -1293,9 +1301,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; + if (dqm->is_hws_hang) + return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param); if (retval) { pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); + dqm->is_hws_hang = true; + schedule_work(&dqm->hw_exception_work); return retval; } @@ -1543,7 +1555,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, } retval = execute_queues_cpsch(dqm, filter, 0); - if (retval || qpd->reset_wavefronts) { + if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; @@ -1701,6 +1713,13 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm, return ret; } +static void kfd_process_hw_exception(struct work_struct *work) +{ + struct device_queue_manager *dqm = container_of(work, + struct device_queue_manager, hw_exception_work); + dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd); +} + #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 0a23ddac345e..70179a6826f4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -193,6 +193,10 @@ struct device_queue_manager { struct kfd_mem_obj *fence_mem; bool active_runlist; int sched_policy; + + /* hw exception */ + bool is_hws_hang; + struct work_struct hw_exception_work; }; void device_queue_manager_init_cik( -- 2.30.2