提交 7876fa4f 编写于 作者: C Christian König 提交者: Alex Deucher

drm/amdgpu: add ring soft recovery v4

Instead of hammering hard on the GPU try a soft recovery first.

v2: reorder code a bit
v3: increase timeout to 10ms, increment GPU reset counter
v4: squash in compile fix (Christian)
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Huang Rui <ray.huang@amd.com>
上级 07e6d3f0
...@@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) ...@@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job); struct amdgpu_job *job = to_amdgpu_job(s_job);
if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
DRM_ERROR("ring %s timeout, but soft recovered\n",
s_job->sched->name);
return;
}
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
ring->fence_drv.sync_seq); ring->fence_drv.sync_seq);
......
...@@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, ...@@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask); amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
} }
/**
 * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
 *
 * @ring: ring to try the recovery on
 * @vmid: VMID we try to get going again
 * @fence: timedout fence, may be NULL
 *
 * Tries to get a ring proceeding again when it is stuck by repeatedly
 * invoking the ring's soft_recovery callback until @fence signals or a
 * 10ms deadline expires.
 *
 * Returns:
 * true if @fence is signaled when the function returns; false if the ring
 * has no soft_recovery callback, if @fence is NULL, or if the deadline
 * expired without the fence signaling.
 */
bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
			       struct dma_fence *fence)
{
	ktime_t deadline;

	/*
	 * Nothing to do without a backend callback or a fence to poll;
	 * dma_fence_is_signaled() must not be handed a NULL fence.
	 */
	if (!ring->funcs->soft_recovery || !fence)
		return false;

	/* Counted as a reset attempt even if the soft recovery fails below. */
	atomic_inc(&ring->adev->gpu_reset_counter);

	/* Start the 10ms budget only once we know we will actually poll. */
	deadline = ktime_add_us(ktime_get(), 10000);
	while (!dma_fence_is_signaled(fence) &&
	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
		ring->funcs->soft_recovery(ring, vmid);

	return dma_fence_is_signaled(fence);
}
/* /*
* Debugfs info * Debugfs info
*/ */
......
...@@ -168,6 +168,8 @@ struct amdgpu_ring_funcs { ...@@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
/* priority functions */ /* priority functions */
void (*set_priority) (struct amdgpu_ring *ring, void (*set_priority) (struct amdgpu_ring *ring,
enum drm_sched_priority priority); enum drm_sched_priority priority);
/* Try to soft recover the ring to make the fence signal */
void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
}; };
struct amdgpu_ring { struct amdgpu_ring {
...@@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring); ...@@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
uint32_t reg0, uint32_t val0, uint32_t reg0, uint32_t val0,
uint32_t reg1, uint32_t val1); uint32_t reg1, uint32_t val1);
bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
struct dma_fence *fence);
static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring) static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
{ {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册