drm/amdgpu: completely rework eviction fence handling v2

Well that was broken on multiple levels.

First of all, a lot of checks were placed at incorrect locations, especially the
checks for whether the resume worker should run or not.

Then a bunch of the code was just mid-layering, caused by an incorrect
assignment of who should do what.

And finally, there were comments explaining what happens instead of why.

Just re-write it from scratch, that should at least fix some of the hangs we
are seeing.

Use RCU for the eviction fence pointer in the manager, the spinlock usage was
mostly incorrect as well. Then finally remove all the nonsense checks and
actually add them in the correct locations.

v2: some typo fixes and cleanups suggested by Sunil

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Christian König
2026-01-28 13:58:14 +01:00
committed by Alex Deucher
parent 87327658c8
commit 2cd7284ba5
7 changed files with 113 additions and 209 deletions

View File

@@ -472,17 +472,16 @@ void
amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence_mgr *evf_mgr)
{
struct amdgpu_eviction_fence *ev_fence;
struct dma_fence *ev_fence;
retry:
/* Flush any pending resume work to create ev_fence */
flush_delayed_work(&uq_mgr->resume_work);
mutex_lock(&uq_mgr->userq_mutex);
spin_lock(&evf_mgr->ev_fence_lock);
ev_fence = evf_mgr->ev_fence;
spin_unlock(&evf_mgr->ev_fence_lock);
if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) {
ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
if (dma_fence_is_signaled(ev_fence)) {
dma_fence_put(ev_fence);
mutex_unlock(&uq_mgr->userq_mutex);
/*
* Looks like there was no pending resume work,
@@ -491,6 +490,7 @@ retry:
schedule_delayed_work(&uq_mgr->resume_work, 0);
goto retry;
}
dma_fence_put(ev_fence);
}
int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr,
@@ -1197,7 +1197,7 @@ retry_lock:
dma_fence_wait(bo_va->last_pt_update, false);
dma_fence_wait(vm->last_update, false);
ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec);
if (ret)
drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n");
@@ -1217,11 +1217,13 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
{
struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct dma_fence *ev_fence;
int ret;
flush_delayed_work(&fpriv->evf_mgr.suspend_work);
mutex_lock(&uq_mgr->userq_mutex);
ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr);
if (!dma_fence_is_signaled(ev_fence))
goto unlock;
ret = amdgpu_userq_vm_validate(uq_mgr);
if (ret) {
@@ -1237,6 +1239,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
unlock:
mutex_unlock(&uq_mgr->userq_mutex);
dma_fence_put(ev_fence);
}
static int
@@ -1312,11 +1315,8 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
}
void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, bool schedule_resume)
{
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
int ret;
@@ -1329,10 +1329,7 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
if (ret)
dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
if (!evf_mgr->fd_closing)
if (schedule_resume)
schedule_delayed_work(&uq_mgr->resume_work, 0);
}