diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 620e3002001da7a47109adcaaae216906dfd786c..d5d450e380bdff87a63764f7de948065aa9cc9ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2869,7 +2869,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job) amd_sched_job_kickout(&job->base); /* only do job_reset on the hang ring if @job not NULL */ - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); @@ -2990,7 +2990,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (!ring || !ring->sched.thread) continue; kthread_park(ring->sched.thread); - amd_sched_hw_job_reset(&ring->sched); + amd_sched_hw_job_reset(&ring->sched, NULL); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 764606ce3541a82ad746acc1288e769812aa8df1..1474866d9048dbf9d65f842f017b91035b5ef8a2 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) +static void amd_sched_set_guilty(struct amd_sched_job *s_job) +{ + if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) + if (s_job->s_entity->guilty) + atomic_set(s_job->s_entity->guilty, 1); +} + +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) { struct amd_sched_job *s_job; + struct amd_sched_entity *entity, *tmp; + int i;; spin_lock(&sched->job_list_lock); list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) { @@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched) } } spin_unlock(&sched->job_list_lock); + + if (bad) { + bool found = false; + + for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { + struct amd_sched_rq *rq = &sched->sched_rq[i]; + + spin_lock(&rq->lock); + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { + if (bad->s_fence->scheduled.context == entity->fence_context) { + found = true; + amd_sched_set_guilty(bad); + break; + } + } + spin_unlock(&rq->lock); + if (found) + break; + } + } } void amd_sched_job_kickout(struct amd_sched_job *s_job) diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index a05994c60b34168984c87d6b83a7df85a66eab16..be75172587da579f7dc3c702ed45af0cb1131327 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void *owner); -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched); +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job); void amd_sched_job_recovery(struct amd_gpu_scheduler *sched); bool amd_sched_dependency_optimized(struct dma_fence* fence, struct amd_sched_entity *entity);