From 2640c3facbd6e21e63c95f19588cc24913a263cd Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Wed, 11 Jul 2018 22:32:50 -0400 Subject: [PATCH] drm/amdkfd: Handle VM faults in KFD MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address per per-vmid. amdkfd needs to get the information from amdgpu through the new get_vm_fault_info interface. On GFX9 and later, all the required information is in the IH ring 2. amdkfd unmaps all queues from the faulting process and create new run-list without the guilty process 3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY Signed-off-by: shaoyun liu Signed-off-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Oded Gabbay --- .../gpu/drm/amd/amdkfd/cik_event_interrupt.c | 25 +++++++++++-- drivers/gpu/drm/amd/amdkfd/cik_int.h | 2 + .../drm/amd/amdkfd/kfd_device_queue_manager.c | 17 +++++++++ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 37 +++++++++++++++++++ .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 18 ++++++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++ include/uapi/linux/kfd_ioctl.h | 2 +- 7 files changed, 98 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 49df6c791cfc..cc33870e7edb 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || - ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || + ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT; } static void cik_event_interrupt_wq(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { - unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; uint32_t context_id = ihre->data & 0xfffffff; - - pasid = (ihre->ring_id & 0xffff0000) >> 16; + unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; + unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; if (pasid == 0) return; @@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) kfd_signal_hw_exception_event(pasid); + else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { + struct kfd_vm_fault_info info; + + kfd_process_vm_fault(dev->dqm, pasid); + + memset(&info, 0, sizeof(info)); + dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); + if (!info.page_addr && !info.status) + return; + + if (info.vmid == vmid) + kfd_signal_vm_fault_event(dev, pasid, &info); + else + kfd_signal_vm_fault_event(dev, pasid, NULL); + } } const struct kfd_event_interrupt_class event_interrupt_class_cik = { diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index 109298b9d507..a2079a04a673 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -37,6 +37,8 @@ struct cik_ih_ring_entry { #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_SDMA_TRAP 0xE0 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF +#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 +#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2f81d26db0c..44fc2038770e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } +int kfd_process_vm_fault(struct device_queue_manager *dqm, + unsigned int pasid) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + int ret = 0; + + if (!p) + return -EINVAL; + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + kfd_unref_process(p); + + return ret; +} + #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 3d5a8332e8c0..b58a0e665ebc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid) mutex_unlock(&p->event_mutex); kfd_unref_process(p); } + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info) +{ + struct kfd_event *ev; + uint32_t id; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; + + if (!p) + return; /* Presumably process exited. */ + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = 1; + /* Set failure reason */ + if (info) { + memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; + memory_exception_data.failure.NotPresent = + info->prot_valid ? 1 : 0; + memory_exception_data.failure.NoExecute = + info->prot_exec ? 1 : 0; + memory_exception_data.failure.ReadOnly = + info->prot_write ? 1 : 0; + memory_exception_data.failure.imprecise = 0; + } + mutex_lock(&p->event_mutex); + + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 37029baa3346..d6b64e692760 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, return source_id == SOC15_INTSRC_CP_END_OF_PIPE || source_id == SOC15_INTSRC_SDMA_TRAP || source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || - source_id == SOC15_INTSRC_CP_BAD_OPCODE; + source_id == SOC15_INTSRC_CP_BAD_OPCODE || + client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_UTCL2; } static void event_interrupt_wq_v9(struct kfd_dev *dev, @@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, kfd_signal_hw_exception_event(pasid); else if (client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_UTCL2) { - /* TODO */ + struct kfd_vm_fault_info info = {0}; + uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + + info.vmid = vmid; + info.mc_id = client_id; + info.page_addr = ih_ring_entry[4] | + (uint64_t)(ih_ring_entry[5] & 0xf) << 32; + info.prot_valid = ring_id & 0x08; + info.prot_read = ring_id & 0x10; + info.prot_write = ring_id & 0x20; + + kfd_process_vm_fault(dev->dqm, pasid); + kfd_signal_vm_fault_event(dev, pasid, &info); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 5e3990bb4c4b..91a3368421b1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); +int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); /* Process Queue Manager */ struct process_queue_node { @@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint64_t *event_page_offset, uint32_t *event_slot_index); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info); + void kfd_flush_tlb(struct kfd_process_device *pdd); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index b4f5073dbac2..46a54ab1e728 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -219,7 +219,7 @@ struct kfd_memory_exception_failure { __u32 NotPresent; /* Page not present or supervisor privilege */ __u32 ReadOnly; /* Write access to a read-only page */ __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 pad; + __u32 imprecise; /* Can't determine the exact fault address */ }; /* memory exception data*/ -- 2.30.2