提交 b3c85a0f 编写于 作者: Christian König 提交者: Alex Deucher

drm/amdgpu: fix fundamental suspend/resume issue

Reinitializing the VM manager during suspend/resume is a very very bad
idea since all the VMs are still active and kicking.

This can lead to random VM faults after resume when new processes
get assigned the same client ID.
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
上级 d51aff16
...@@ -672,6 +672,7 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, ...@@ -672,6 +672,7 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
struct amdgpu_vm_id_manager *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vm_id_manager *id_mgr = &adev->vm_manager.id_mgr[vmhub];
struct amdgpu_vm_id *id = &id_mgr->ids[vmid]; struct amdgpu_vm_id *id = &id_mgr->ids[vmid];
atomic64_set(&id->owner, 0);
id->gds_base = 0; id->gds_base = 0;
id->gds_size = 0; id->gds_size = 0;
id->gws_base = 0; id->gws_base = 0;
...@@ -680,6 +681,26 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, ...@@ -680,6 +681,26 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
id->oa_size = 0; id->oa_size = 0;
} }
/**
 * amdgpu_vm_reset_all_ids - reset all VMIDs on all VM hubs
 *
 * @adev: amdgpu device structure
 *
 * Reset every VMID so that the next use forces a full flush. Used on
 * resume: the hardware VMID state was lost across suspend, but the VMs
 * themselves are still alive, so the IDs must be invalidated instead of
 * reinitializing the whole VM manager.
 */
void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev)
{
	unsigned i, j;

	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
		struct amdgpu_vm_id_manager *id_mgr =
			&adev->vm_manager.id_mgr[i];

		/* Start at 1: VMID 0 is skipped — presumably reserved for
		 * system/kernel mappings; confirm against id_mgr setup. */
		for (j = 1; j < id_mgr->num_ids; ++j)
			amdgpu_vm_reset_id(adev, i, j);
	}
}
/** /**
* amdgpu_vm_bo_find - find the bo_va for a specific vm & bo * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo
* *
...@@ -2270,7 +2291,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) ...@@ -2270,7 +2291,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) for (i = 0; i < AMDGPU_MAX_RINGS; ++i)
adev->vm_manager.seqno[i] = 0; adev->vm_manager.seqno[i] = 0;
atomic_set(&adev->vm_manager.vm_pte_next_ring, 0); atomic_set(&adev->vm_manager.vm_pte_next_ring, 0);
atomic64_set(&adev->vm_manager.client_counter, 0); atomic64_set(&adev->vm_manager.client_counter, 0);
spin_lock_init(&adev->vm_manager.prt_lock); spin_lock_init(&adev->vm_manager.prt_lock);
......
...@@ -204,6 +204,7 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, ...@@ -204,6 +204,7 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job); int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
unsigned vmid); unsigned vmid);
void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev);
int amdgpu_vm_update_directories(struct amdgpu_device *adev, int amdgpu_vm_update_directories(struct amdgpu_device *adev,
struct amdgpu_vm *vm); struct amdgpu_vm *vm);
int amdgpu_vm_clear_freed(struct amdgpu_device *adev, int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
......
...@@ -950,10 +950,6 @@ static int gmc_v6_0_suspend(void *handle) ...@@ -950,10 +950,6 @@ static int gmc_v6_0_suspend(void *handle)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (adev->vm_manager.enabled) {
gmc_v6_0_vm_fini(adev);
adev->vm_manager.enabled = false;
}
gmc_v6_0_hw_fini(adev); gmc_v6_0_hw_fini(adev);
return 0; return 0;
...@@ -968,16 +964,9 @@ static int gmc_v6_0_resume(void *handle) ...@@ -968,16 +964,9 @@ static int gmc_v6_0_resume(void *handle)
if (r) if (r)
return r; return r;
if (!adev->vm_manager.enabled) { amdgpu_vm_reset_all_ids(adev);
r = gmc_v6_0_vm_init(adev);
if (r) {
dev_err(adev->dev, "vm manager initialization failed (%d).\n", r);
return r;
}
adev->vm_manager.enabled = true;
}
return r; return 0;
} }
static bool gmc_v6_0_is_idle(void *handle) static bool gmc_v6_0_is_idle(void *handle)
......
...@@ -1117,10 +1117,6 @@ static int gmc_v7_0_suspend(void *handle) ...@@ -1117,10 +1117,6 @@ static int gmc_v7_0_suspend(void *handle)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (adev->vm_manager.enabled) {
gmc_v7_0_vm_fini(adev);
adev->vm_manager.enabled = false;
}
gmc_v7_0_hw_fini(adev); gmc_v7_0_hw_fini(adev);
return 0; return 0;
...@@ -1135,16 +1131,9 @@ static int gmc_v7_0_resume(void *handle) ...@@ -1135,16 +1131,9 @@ static int gmc_v7_0_resume(void *handle)
if (r) if (r)
return r; return r;
if (!adev->vm_manager.enabled) { amdgpu_vm_reset_all_ids(adev);
r = gmc_v7_0_vm_init(adev);
if (r) {
dev_err(adev->dev, "vm manager initialization failed (%d).\n", r);
return r;
}
adev->vm_manager.enabled = true;
}
return r; return 0;
} }
static bool gmc_v7_0_is_idle(void *handle) static bool gmc_v7_0_is_idle(void *handle)
......
...@@ -1209,10 +1209,6 @@ static int gmc_v8_0_suspend(void *handle) ...@@ -1209,10 +1209,6 @@ static int gmc_v8_0_suspend(void *handle)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (adev->vm_manager.enabled) {
gmc_v8_0_vm_fini(adev);
adev->vm_manager.enabled = false;
}
gmc_v8_0_hw_fini(adev); gmc_v8_0_hw_fini(adev);
return 0; return 0;
...@@ -1227,16 +1223,9 @@ static int gmc_v8_0_resume(void *handle) ...@@ -1227,16 +1223,9 @@ static int gmc_v8_0_resume(void *handle)
if (r) if (r)
return r; return r;
if (!adev->vm_manager.enabled) { amdgpu_vm_reset_all_ids(adev);
r = gmc_v8_0_vm_init(adev);
if (r) {
dev_err(adev->dev, "vm manager initialization failed (%d).\n", r);
return r;
}
adev->vm_manager.enabled = true;
}
return r; return 0;
} }
static bool gmc_v8_0_is_idle(void *handle) static bool gmc_v8_0_is_idle(void *handle)
......
...@@ -791,10 +791,6 @@ static int gmc_v9_0_suspend(void *handle) ...@@ -791,10 +791,6 @@ static int gmc_v9_0_suspend(void *handle)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (adev->vm_manager.enabled) {
gmc_v9_0_vm_fini(adev);
adev->vm_manager.enabled = false;
}
gmc_v9_0_hw_fini(adev); gmc_v9_0_hw_fini(adev);
return 0; return 0;
...@@ -809,17 +805,9 @@ static int gmc_v9_0_resume(void *handle) ...@@ -809,17 +805,9 @@ static int gmc_v9_0_resume(void *handle)
if (r) if (r)
return r; return r;
if (!adev->vm_manager.enabled) { amdgpu_vm_reset_all_ids(adev);
r = gmc_v9_0_vm_init(adev);
if (r) {
dev_err(adev->dev,
"vm manager initialization failed (%d).\n", r);
return r;
}
adev->vm_manager.enabled = true;
}
return r; return 0;
} }
static bool gmc_v9_0_is_idle(void *handle) static bool gmc_v9_0_is_idle(void *handle)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册