提交 9e94d22c 编写于 作者: E Evan Quan 提交者: Alex Deucher

drm/amdgpu: optimize the gpu reset for XGMI setup V2

This is basically just some code cosmetic. The current design
for XGMI setup gput reset is to operate on current device(adev)
first and then on other devices from the hive(by another 'for' loop).
But actually we can do some sort to the device list(to put current
device 1st position) and handle all the devices in a single 'for'
loop.

V2: added missing hive->hive_lock protection
Signed-off-by: NEvan Quan <evan.quan@amd.com>
Reviewed-by: NAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: NAlex Deucher <alexander.deucher@amd.com>
上级 52fb44cf
......@@ -4152,16 +4152,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
}
need_full_reset = job_signaled = false;
INIT_LIST_HEAD(&device_list);
amdgpu_ras_set_error_query_ready(adev, false);
dev_info(adev->dev, "GPU %s begin!\n",
(in_ras_intr && !use_baco) ? "jobs stop":"reset");
cancel_delayed_work_sync(&adev->delayed_init_work);
hive = amdgpu_get_xgmi_hive(adev, false);
hive = amdgpu_get_xgmi_hive(adev, true);
/*
* Here we trylock to avoid chain of resets executing from
......@@ -4174,35 +4169,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (hive && !mutex_trylock(&hive->reset_lock)) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
mutex_unlock(&hive->hive_lock);
return 0;
}
/* Start with adev pre asic reset first for soft reset check.*/
if (!amdgpu_device_lock_adev(adev, !hive)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1);
return 0;
}
/* Block kfd: SRIOV would do it separately */
if (!amdgpu_sriov_vf(adev))
amdgpu_amdkfd_pre_reset(adev);
/* Build list of devices to reset */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
if (!hive) {
/*unlock kfd: SRIOV would do it separately */
if (!amdgpu_sriov_vf(adev))
amdgpu_amdkfd_post_reset(adev);
amdgpu_device_unlock_adev(adev);
return -ENODEV;
}
/*
* In case we are in XGMI hive mode device reset is done for all the
* nodes in the hive to retrain all XGMI links and hence the reset
* sequence is executed in loop on all nodes.
* Build list of devices to reset.
* In case we are in XGMI hive mode, resort the device list
* to put adev in the 1st position.
*/
INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
if (!hive)
return -ENODEV;
if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
device_list_handle = &hive->device_list;
} else {
list_add_tail(&adev->gmc.xgmi.head, &device_list);
......@@ -4211,15 +4192,20 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (tmp_adev != adev) {
amdgpu_ras_set_error_query_ready(tmp_adev, false);
amdgpu_device_lock_adev(tmp_adev, false);
if (!amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_pre_reset(tmp_adev);
if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1);
mutex_unlock(&hive->hive_lock);
return 0;
}
amdgpu_ras_set_error_query_ready(tmp_adev, false);
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
if (!amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_pre_reset(tmp_adev);
/*
* Mark these ASICs to be reseted as untracked first
* And add them back after reset completed
......@@ -4265,22 +4251,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
goto skip_hw_reset;
}
/* Guilty job will be freed after this*/
r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
if (r) {
/*TODO Should we stop ?*/
DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
r, adev->ddev->unique);
adev->asic_reset_res = r;
}
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (tmp_adev == adev)
continue;
r = amdgpu_device_pre_asic_reset(tmp_adev,
NULL,
&need_full_reset);
......@@ -4345,8 +4317,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_device_unlock_adev(tmp_adev);
}
if (hive)
if (hive) {
mutex_unlock(&hive->reset_lock);
mutex_unlock(&hive->hive_lock);
}
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册