drm/amdkfd: Implement GPU reset handlers in KFD
author	Shaoyun Liu <Shaoyun.Liu@amd.com>
	Thu, 12 Jul 2018 02:32:56 +0000 (22:32 -0400)
committer	Oded Gabbay <oded.gabbay@gmail.com>
	Thu, 12 Jul 2018 02:32:56 +0000 (22:32 -0400)
On GPU reset, lock KFD so that no new GPU work can be created, and evict
the existing queues. Notify user mode of the reset by signaling
hw_exception events.
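
For context, a rough sketch of how a GPU reset path is expected to bracket
the hardware reset with the two new hooks. Everything below other than
kgd2kfd_pre_reset()/kgd2kfd_post_reset() is hypothetical; the actual amdgpu
call sites are wired up in a separate patch.

  /* Illustrative sketch only -- not part of this patch. */
  static int example_gpu_recover(struct kfd_dev *kfd)
  {
          int r;

          /* Lock KFD, evict its queues and signal hw_exception events */
          r = kgd2kfd_pre_reset(kfd);
          if (r)
                  return r;

          /* Hypothetical hardware reset step */
          r = example_asic_reset();
          if (r)
                  return r;

          /* Unlock KFD and restart its queue manager; existing user
           * processes stay evicted until they terminate (see the FIXME
           * in kfd_device.c below).
           */
          return kgd2kfd_post_reset(kfd);
  }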

Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_events.h
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 7e717716b90e8daf8edb7811b2fd545a84a73232..b5338bff8cef48b9c82a65e3561dd37aa5c6fd76 100644 (file)
@@ -122,6 +122,9 @@ static int kfd_open(struct inode *inode, struct file *filep)
        if (IS_ERR(process))
                return PTR_ERR(process);
 
+       if (kfd_is_locked())
+               return -EAGAIN;
+
        dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
                process->pasid, process->is_32bit_user_mode);
 
index a8226d8b86cc05790bfd7c19fa4d1ed161dc0ac4..9f63ac366284b30ee022f29761f75869528386a3 100644 (file)
 #include "kfd_iommu.h"
 
 #define MQD_SIZE_ALIGNED 768
-static atomic_t kfd_device_suspended = ATOMIC_INIT(0);
+
+/*
+ * kfd_locked is used to lock the kfd driver during suspend or reset.
+ * Once locked, the kfd driver will stop any further GPU execution and
+ * process creation (open) will return -EAGAIN.
+ */
+static atomic_t kfd_locked = ATOMIC_INIT(0);
 
 #ifdef KFD_SUPPORT_IOMMU_V2
 static const struct kfd_device_info kaveri_device_info = {
@@ -516,21 +522,52 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 
 int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 {
+       if (!kfd->init_complete)
+               return 0;
+       kgd2kfd_suspend(kfd);
+
+       /* Hold dqm->lock to prevent further execution */
+       dqm_lock(kfd->dqm);
+
+       kfd_signal_reset_event(kfd);
        return 0;
 }
 
+/*
+ * FIXME: KFD won't be able to resume existing processes for now.
+ * All existing processes are kept in an evicted state, and we wait
+ * for them to terminate.
+ */
+
 int kgd2kfd_post_reset(struct kfd_dev *kfd)
 {
+       int ret, count;
+
+       if (!kfd->init_complete)
+               return 0;
+
+       dqm_unlock(kfd->dqm);
+
+       ret = kfd_resume(kfd);
+       if (ret)
+               return ret;
+       count = atomic_dec_return(&kfd_locked);
+       WARN_ONCE(count != 0, "KFD reset ref. error");
        return 0;
 }
 
+bool kfd_is_locked(void)
+{
+       return (atomic_read(&kfd_locked) > 0);
+}
+
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
        if (!kfd->init_complete)
                return;
 
        /* For first KFD device suspend all the KFD processes */
-       if (atomic_inc_return(&kfd_device_suspended) == 1)
+       if (atomic_inc_return(&kfd_locked) == 1)
                kfd_suspend_all_processes();
 
        kfd->dqm->ops.stop(kfd->dqm);
@@ -549,7 +586,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
        if (ret)
                return ret;
 
-       count = atomic_dec_return(&kfd_device_suspended);
+       count = atomic_dec_return(&kfd_locked);
        WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
        if (count == 0)
                ret = kfd_resume_all_processes();
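
The kfd_locked comment above states the contract: while the count is non-zero,
no new GPU work may be created. A minimal sketch of an entry point honouring
that contract, mirroring the kfd_open() change earlier in this patch (the
function name is illustrative):

  /* Illustrative only: any path that would create new GPU work should
   * bail out while KFD is locked for suspend or reset.
   */
  static int example_kfd_entry_point(void)
  {
          if (kfd_is_locked())
                  return -EAGAIN; /* suspended or mid-reset */

          /* ... create the process/queue/event as usual ... */
          return 0;
  }
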
index b58a0e665ebc3d05f10f324cdd75324febe82196..820133cdef83efdd8ec8570613da262cd50d0bfb 100644 (file)
@@ -1000,3 +1000,30 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
        mutex_unlock(&p->event_mutex);
        kfd_unref_process(p);
 }
+
+void kfd_signal_reset_event(struct kfd_dev *dev)
+{
+       struct kfd_hsa_hw_exception_data hw_exception_data;
+       struct kfd_process *p;
+       struct kfd_event *ev;
+       unsigned int temp;
+       uint32_t id, idx;
+
+       /* Whole-GPU reset caused by a GPU hang; memory contents are lost */
+       memset(&hw_exception_data, 0, sizeof(hw_exception_data));
+       hw_exception_data.gpu_id = dev->id;
+       hw_exception_data.memory_lost = 1;
+
+       idx = srcu_read_lock(&kfd_processes_srcu);
+       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+               mutex_lock(&p->event_mutex);
+               id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+               idr_for_each_entry_continue(&p->event_idr, ev, id)
+                       if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
+                               ev->hw_exception_data = hw_exception_data;
+                               set_event(ev);
+                       }
+               mutex_unlock(&p->event_mutex);
+       }
+       srcu_read_unlock(&kfd_processes_srcu, idx);
+}
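
For completeness, a hedged userspace sketch (not part of this patch) of how a
process could observe the reset through the hw_exception event signaled above.
It uses the existing KFD event uAPI from <linux/kfd_ioctl.h>; error handling is
abbreviated and the helper name is illustrative.

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/kfd_ioctl.h>

  /* Block until KFD signals a hw_exception event, i.e. a GPU reset. */
  static int wait_for_gpu_reset(void)
  {
          struct kfd_ioctl_create_event_args create_args;
          struct kfd_ioctl_wait_events_args wait_args;
          struct kfd_event_data event_data;
          int kfd = open("/dev/kfd", O_RDWR);

          if (kfd < 0)
                  return -1;

          /* Create an event of type HW_EXCEPTION */
          memset(&create_args, 0, sizeof(create_args));
          create_args.event_type = KFD_IOC_EVENT_HW_EXCEPTION;
          if (ioctl(kfd, AMDKFD_IOC_CREATE_EVENT, &create_args))
                  goto err;

          /* Wait indefinitely for the kernel to signal it */
          memset(&event_data, 0, sizeof(event_data));
          event_data.event_id = create_args.event_id;
          memset(&wait_args, 0, sizeof(wait_args));
          wait_args.events_ptr = (uint64_t)(uintptr_t)&event_data;
          wait_args.num_events = 1;
          wait_args.wait_for_all = 1;
          wait_args.timeout = 0xFFFFFFFFu;        /* no timeout */
          if (ioctl(kfd, AMDKFD_IOC_WAIT_EVENTS, &wait_args) ||
              wait_args.wait_result != KFD_IOC_WAIT_RESULT_COMPLETE)
                  goto err;

          printf("GPU reset reported via hw_exception event\n");
          close(kfd);
          return 0;

  err:
          close(kfd);
          return -1;
  }
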
index abca5bfebbff16fadd5699fff91bf0a939ba0b4b..c7ac6c73af86eb80c1f166bb96286682675a3c4a 100644 (file)
@@ -66,6 +66,7 @@ struct kfd_event {
        /* type specific data */
        union {
                struct kfd_hsa_memory_exception_data memory_exception_data;
+               struct kfd_hsa_hw_exception_data hw_exception_data;
        };
 };
 
index 4bc8d5af419de52ea9e3213838f0c9f0acc7df08..2e03d6c80aa0d45dae7e2d698721a5c36c19cd8b 100644 (file)
@@ -975,10 +975,14 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
                                struct kfd_vm_fault_info *info);
 
+void kfd_signal_reset_event(struct kfd_dev *dev);
+
 void kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
+bool kfd_is_locked(void);
+
 /* Debugfs */
 #if defined(CONFIG_DEBUG_FS)