From e82fdb16a0650250e27f7241133682342d476ad3 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 14 Jun 2019 14:03:36 -0400 Subject: [PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling Under memory pressure, hmm_range_fault may return the error code -ENOMEM or -EBUSY. Change pr_info to pr_debug to remove the unnecessary kernel log message, because the restore will be retried anyway. Calling get_user_pages_done after TTM get user pages has failed triggers a WARN_ONCE kernel call stack dump in the log, so only call it when getting the user pages succeeded. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 74e86952553f..10abae398e51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, bo->tbo.ttm->pages); if (ret) { - bo->tbo.ttm->pages[0] = NULL; - pr_info("%s: Failed to get user pages: %d\n", + pr_debug("%s: Failed to get user pages: %d\n", __func__, ret); - /* Pretend it succeeded. It will fail later - * with a VM fault if the GPU tries to access - * it. Better than hanging indefinitely with - * stalled user mode queues. 
- */ - } - } - - return 0; -} -/* Remove invalid userptr BOs from hmm track list - * - * Stop HMM track the userptr update - */ -static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info) -{ - struct kgd_mem *mem, *tmp_mem; - struct amdgpu_bo *bo; + /* Return error -EBUSY or -ENOMEM, retry restore */ + return ret; + } - list_for_each_entry_safe(mem, tmp_mem, - &process_info->userptr_inval_list, - validate_list.head) { - bo = mem->bo; amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); } + + return 0; } /* Validate invalid userptr BOs @@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) list_move_tail(&mem->validate_list.head, &process_info->userptr_valid_list); - /* Stop HMM track the userptr update. We dont check the return - * value for concurrent CPU page table update because we will - * reschedule the restore worker if process_info->evicted_bos - * is updated. - */ - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); - /* Update mapping. If the BO was not validated * (because we couldn't get user pages), this will * clear the page table entries, which will result in @@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) } unlock_out: - untrack_invalid_user_pages(process_info); mutex_unlock(&process_info->lock); mmput(mm); put_task_struct(usertask); -- GitLab