drm/amdgpu: move guilty handling into ring resets

Move the guilty handling into the ring reset callbacks.  This
allows each ring reset callback to better handle fence
errors and force completions in line with the reset
behavior for each IP.  It also allows us to remove
the ring is_guilty callback, since that logic now lives
in the reset callbacks.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher
2025-06-05 23:34:48 -04:00
parent 2dee58ca47
commit 43ca5eb94b
3 changed files with 3 additions and 51 deletions

View File

@@ -91,7 +91,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
struct amdgpu_job *job = to_amdgpu_job(s_job);
struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
bool set_error = false;
int idx, r;
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -134,8 +133,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "Ring reset disabled by debug mask\n");
} else if (amdgpu_gpu_recovery && ring->funcs->reset) {
bool is_guilty;
dev_err(adev->dev, "Starting %s ring reset\n",
s_job->sched->name);
@@ -145,24 +142,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
*/
drm_sched_wqueue_stop(&ring->sched);
/* for engine resets, we need to reset the engine,
* but individual queues may be unaffected.
* check here to make sure the accounting is correct.
*/
if (ring->funcs->is_guilty)
is_guilty = ring->funcs->is_guilty(ring);
else
is_guilty = true;
if (is_guilty) {
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
set_error = true;
}
r = amdgpu_ring_reset(ring, job->vmid, NULL);
if (!r) {
if (is_guilty)
atomic_inc(&ring->adev->gpu_reset_counter);
atomic_inc(&ring->adev->gpu_reset_counter);
drm_sched_wqueue_start(&ring->sched);
dev_err(adev->dev, "Ring %s reset succeeded\n",
ring->sched.name);
@@ -173,8 +155,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
}
if (!set_error)
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
struct amdgpu_reset_context reset_context;