mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
drm/amdgpu: Implement user queue reset functionality
This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:
1. Queue detection and reset logic:
- amdgpu_userq_detect_and_reset_queues() identifies failed queues
- Per-IP detect_and_reset callbacks for targeted recovery
- Falls back to full GPU reset when needed
2. Reset infrastructure:
- Adds userq_reset_work workqueue for async reset handling
- Implements pre/post reset handlers for queue state management
- Integrates with existing GPU reset framework
3. Error handling improvements:
- Enhanced state tracking with HUNG state
- Automatic reset triggering on critical failures
- VRAM loss handling during recovery
4. Integration points:
- Added to device init/reset paths
- Called during queue destroy, suspend, and isolation events
- Handles both individual queue and full GPU resets
The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.
v2: add detection and reset calls when preemption/unmap fails.
add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
warn if the adev->userq_mutex is not held.
v4: make sure we have all of the uqm->userq_mutex held.
warn if the uqm->userq_mutex is not held.
v5: Use array for user queue type counters. (Alex)
all of the uqm->userq_mutex locks need to be held when calling detect and reset. (Alex)
v6: fix lockdep warning in amdgpu_userq_fence_driver_process
v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
it should already be 0 since we kzalloc the structure (Alex)
v9: For consistency with kernel queues, we may want something like:
amdgpu_userq_is_reset_type_supported (Alex)
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
825df7ff4b
commit
290f46cf57
@@ -4651,6 +4651,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
|
||||
INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
|
||||
|
||||
adev->gfx.gfx_off_req_count = 1;
|
||||
adev->gfx.gfx_off_residency = 0;
|
||||
@@ -6080,6 +6081,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
|
||||
if (r)
|
||||
goto out;
|
||||
|
||||
r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
|
||||
if (r)
|
||||
goto out;
|
||||
|
||||
drm_client_dev_resume(adev_to_drm(tmp_adev));
|
||||
|
||||
/*
|
||||
@@ -6302,6 +6307,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
|
||||
if (!amdgpu_sriov_vf(adev))
|
||||
cancel_work(&adev->reset_work);
|
||||
#endif
|
||||
cancel_work(&adev->userq_reset_work);
|
||||
|
||||
if (adev->kfd.dev)
|
||||
cancel_work(&adev->kfd.reset_work);
|
||||
@@ -6422,6 +6428,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
|
||||
amdgpu_device_ip_need_full_reset(tmp_adev))
|
||||
amdgpu_ras_suspend(tmp_adev);
|
||||
|
||||
amdgpu_userq_pre_reset(tmp_adev);
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = tmp_adev->rings[i];
|
||||
|
||||
|
||||
Reference in New Issue
Block a user