提交 e82fdb16 编写于 作者: P Philip Yang 提交者: Alex Deucher

drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling

Under memory pressure, hmm_range_fault may return the error code -ENOMEM
or -EBUSY. Change pr_info to pr_debug to avoid unnecessary kernel log
messages, because the restore will be retried.

Calling get_user_pages_done after TTM get user pages has failed triggers
a WARN_ONCE kernel call stack dump in the log.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 c1d827d6
...@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, ...@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
bo->tbo.ttm->pages); bo->tbo.ttm->pages);
if (ret) { if (ret) {
bo->tbo.ttm->pages[0] = NULL; pr_debug("%s: Failed to get user pages: %d\n",
pr_info("%s: Failed to get user pages: %d\n",
__func__, ret); __func__, ret);
/* Pretend it succeeded. It will fail later
* with a VM fault if the GPU tries to access
* it. Better than hanging indefinitely with
* stalled user mode queues.
*/
}
}
return 0;
}
/* Remove invalid userptr BOs from hmm track list /* Return error -EBUSY or -ENOMEM, retry restore */
* return ret;
* Stop HMM track the userptr update }
*/
static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
{
struct kgd_mem *mem, *tmp_mem;
struct amdgpu_bo *bo;
list_for_each_entry_safe(mem, tmp_mem,
&process_info->userptr_inval_list,
validate_list.head) {
bo = mem->bo;
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
} }
return 0;
} }
/* Validate invalid userptr BOs /* Validate invalid userptr BOs
...@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) ...@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
list_move_tail(&mem->validate_list.head, list_move_tail(&mem->validate_list.head,
&process_info->userptr_valid_list); &process_info->userptr_valid_list);
/* Stop HMM track the userptr update. We dont check the return
* value for concurrent CPU page table update because we will
* reschedule the restore worker if process_info->evicted_bos
* is updated.
*/
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
/* Update mapping. If the BO was not validated /* Update mapping. If the BO was not validated
* (because we couldn't get user pages), this will * (because we couldn't get user pages), this will
* clear the page table entries, which will result in * clear the page table entries, which will result in
...@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) ...@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
} }
unlock_out: unlock_out:
untrack_invalid_user_pages(process_info);
mutex_unlock(&process_info->lock); mutex_unlock(&process_info->lock);
mmput(mm); mmput(mm);
put_task_struct(usertask); put_task_struct(usertask);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册