drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling
author Philip Yang <Philip.Yang@amd.com>
Fri, 14 Jun 2019 18:03:36 +0000 (14:03 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 20 Jun 2019 16:33:41 +0000 (11:33 -0500)
Under memory pressure, hmm_range_fault may return the error code -ENOMEM
or -EBUSY. Change pr_info to pr_debug to avoid an unnecessary kernel log
message, because the restore will be retried anyway.

Calling get_user_pages_done after TTM get user pages has failed triggers
a WARN_ONCE call stack dump in the kernel log, so return the error
before that call is reached.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 74e86952553f348742d3b5d72fd99e146c14c830..10abae398e5123687a8bfdf4c0dea2ec77ec2a47 100644 (file)
@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
                ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
                                                   bo->tbo.ttm->pages);
                if (ret) {
-                       bo->tbo.ttm->pages[0] = NULL;
-                       pr_info("%s: Failed to get user pages: %d\n",
+                       pr_debug("%s: Failed to get user pages: %d\n",
                                __func__, ret);
-                       /* Pretend it succeeded. It will fail later
-                        * with a VM fault if the GPU tries to access
-                        * it. Better than hanging indefinitely with
-                        * stalled user mode queues.
-                        */
-               }
-       }
-
-       return 0;
-}
 
-/* Remove invalid userptr BOs from hmm track list
- *
- * Stop HMM track the userptr update
- */
-static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
-{
-       struct kgd_mem *mem, *tmp_mem;
-       struct amdgpu_bo *bo;
+                       /* Return error -EBUSY or -ENOMEM, retry restore */
+                       return ret;
+               }
 
-       list_for_each_entry_safe(mem, tmp_mem,
-                                &process_info->userptr_inval_list,
-                                validate_list.head) {
-               bo = mem->bo;
                amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
        }
+
+       return 0;
 }
 
 /* Validate invalid userptr BOs
@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
                list_move_tail(&mem->validate_list.head,
                               &process_info->userptr_valid_list);
 
-               /* Stop HMM track the userptr update. We dont check the return
-                * value for concurrent CPU page table update because we will
-                * reschedule the restore worker if process_info->evicted_bos
-                * is updated.
-                */
-               amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-
                /* Update mapping. If the BO was not validated
                 * (because we couldn't get user pages), this will
                 * clear the page table entries, which will result in
@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
        }
 
 unlock_out:
-       untrack_invalid_user_pages(process_info);
        mutex_unlock(&process_info->lock);
        mmput(mm);
        put_task_struct(usertask);