drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.

This patch handles the GPU recovery failure in sriov environment by retrying the reset if the first reset fails. To determine the condition of retry, a new macro AMDGPU_RETRY_SRIOV_RESET is added which returns true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.A new macro AMDGPU_MAX_RETRY_LIMIT is used to limit the retry to 2. It also handles the return status in Post Asic Reset by updating the return code with asic_reset_res and eventually return the return code in amdgpu_job_timedout(). Signed-off-by: N Surbhi Kakarya <surbhi.kakarya@amd.com> Reviewed-by: N Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: N Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
This patch handles the GPU recovery failure in sriov environment by retrying the reset if the first reset fails. To determine the condition of retry, a new macro AMDGPU_RETRY_SRIOV_RESET is added which returns true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.A new macro AMDGPU_MAX_RETRY_LIMIT is used to limit the retry to 2. It also handles the return status in Post Asic Reset by updating the return code with asic_reset_res and eventually return the return code in amdgpu_job_timedout(). Signed-off-by: N Surbhi Kakarya <surbhi.kakarya@amd.com> Reviewed-by: N Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: N Alex Deucher <alexander.deucher@amd.com>
7258fa31 · Surbhi Kakarya · Alex Deucher · 1ec1944e · 7258fa31 · 7258fa31
隐藏空白更改
内联并排

Showing with 20 addition and 1 deletion

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +15 -0

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +5 -1

未找到文件。
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -88,6 +88,8 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

 #define AMDGPU_RESUME_MS		2000
+#define AMDGPU_MAX_RETRY_LIMIT		2
+#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

 const char *amdgpu_asic_name[] = {
 	"TAHITI",
@@ -4366,7 +4368,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 {
 	int r;
 	struct amdgpu_hive_info *hive = NULL;
+	int retry_limit = 0;

+retry:
 	amdgpu_amdkfd_pre_reset(adev);

 	amdgpu_amdkfd_pre_reset(adev);
@@ -4415,6 +4419,14 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	}
 	amdgpu_virt_release_full_gpu(adev, true);

+	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
+		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
+			retry_limit++;
+			goto retry;
+		} else
+			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+	}
+
 	return r;
 }

@@ -5206,6 +5218,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}

+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;

 		if (r) {

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r;

 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n", r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))