提交 9ae55f03 编写于 作者: A Andrey Grodzovsky 提交者: Alex Deucher

drm/amdgpu: Follow up change to previous drm scheduler change.

Align refcount behaviour for amdgpu_job embedded HW fence with
classic pointer style HW fences by increasing refcount each
time emit is called so amdgpu code doesn't need to make workarounds
using amdgpu_job.job_run_counter to keep the HW fence refcount balanced.

Also, since the previous patch resumed setting s_fence->parent to NULL
in drm_sched_stop, switch to directly checking whether job->hw_fence is
signaled in order to short-circuit the reset if it has already signaled.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Tested-by: Yiqing Yao <yiqing.yao@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 45ecaea7
...@@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev, ...@@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
goto err_ib_sched; goto err_ib_sched;
} }
/* Drop the initial kref_init count (see drm_sched_main as example) */
dma_fence_put(f);
ret = dma_fence_wait(f, false); ret = dma_fence_wait(f, false);
err_ib_sched: err_ib_sched:
......
...@@ -5010,16 +5010,32 @@ static void amdgpu_device_recheck_guilty_jobs( ...@@ -5010,16 +5010,32 @@ static void amdgpu_device_recheck_guilty_jobs(
/* clear job's guilty and depend the folowing step to decide the real one */ /* clear job's guilty and depend the folowing step to decide the real one */
drm_sched_reset_karma(s_job); drm_sched_reset_karma(s_job);
/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
* to make sure fence is balanced */
dma_fence_get(s_job->s_fence->parent);
drm_sched_resubmit_jobs_ext(&ring->sched, 1); drm_sched_resubmit_jobs_ext(&ring->sched, 1);
if (!s_job->s_fence->parent) {
DRM_WARN("Failed to get a HW fence for job!");
continue;
}
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
if (ret == 0) { /* timeout */ if (ret == 0) { /* timeout */
DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
ring->sched.name, s_job->id); ring->sched.name, s_job->id);
amdgpu_fence_driver_isr_toggle(adev, true);
/* Clear this failed job from fence array */
amdgpu_fence_driver_clear_job_fences(ring);
amdgpu_fence_driver_isr_toggle(adev, false);
/* Since the job won't signal and we go for
* another resubmit drop this parent pointer
*/
dma_fence_put(s_job->s_fence->parent);
s_job->s_fence->parent = NULL;
/* set guilty */ /* set guilty */
drm_sched_increase_karma(s_job); drm_sched_increase_karma(s_job);
retry: retry:
...@@ -5048,7 +5064,6 @@ static void amdgpu_device_recheck_guilty_jobs( ...@@ -5048,7 +5064,6 @@ static void amdgpu_device_recheck_guilty_jobs(
/* got the hw fence, signal finished fence */ /* got the hw fence, signal finished fence */
atomic_dec(ring->sched.score); atomic_dec(ring->sched.score);
dma_fence_put(s_job->s_fence->parent);
dma_fence_get(&s_job->s_fence->finished); dma_fence_get(&s_job->s_fence->finished);
dma_fence_signal(&s_job->s_fence->finished); dma_fence_signal(&s_job->s_fence->finished);
dma_fence_put(&s_job->s_fence->finished); dma_fence_put(&s_job->s_fence->finished);
...@@ -5221,8 +5236,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -5221,8 +5236,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* *
* job->base holds a reference to parent fence * job->base holds a reference to parent fence
*/ */
if (job && job->base.s_fence->parent && if (job && (job->hw_fence.ops != NULL) &&
dma_fence_is_signaled(job->base.s_fence->parent)) { dma_fence_is_signaled(&job->hw_fence)) {
job_signaled = true; job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset; goto skip_hw_reset;
......
...@@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd ...@@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
if (job && job->job_run_counter) { if (job && job->job_run_counter) {
/* reinit seq for resubmitted jobs */ /* reinit seq for resubmitted jobs */
fence->seqno = seq; fence->seqno = seq;
/* TO be inline with external fence creation and other drivers */
dma_fence_get(fence);
} else { } else {
if (job) if (job) {
dma_fence_init(fence, &amdgpu_job_fence_ops, dma_fence_init(fence, &amdgpu_job_fence_ops,
&ring->fence_drv.lock, &ring->fence_drv.lock,
adev->fence_context + ring->idx, seq); adev->fence_context + ring->idx, seq);
/* Against remove in amdgpu_job_{free, free_cb} */
dma_fence_get(fence);
}
else else
dma_fence_init(fence, &amdgpu_fence_ops, dma_fence_init(fence, &amdgpu_fence_ops,
&ring->fence_drv.lock, &ring->fence_drv.lock,
......
...@@ -262,10 +262,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) ...@@ -262,10 +262,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
DRM_ERROR("Error scheduling IBs (%d)\n", r); DRM_ERROR("Error scheduling IBs (%d)\n", r);
} }
if (!job->job_run_counter)
dma_fence_get(fence);
else if (finished->error < 0)
dma_fence_put(&job->hw_fence);
job->job_run_counter++; job->job_run_counter++;
amdgpu_job_free_resources(job); amdgpu_job_free_resources(job);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册