drm/amdgpu: suspend ras module before gpu reset

During a GPU reset, all GPU-related resources are
inaccessible. To avoid affecting RAS functionality,
suspend the RAS module before the GPU reset and resume
it after the GPU reset is complete.

V2:
  Rename functions to avoid misunderstanding.

V3:
  Move flush_delayed_work to amdgpu_ras_process_pause.
  Move schedule_delayed_work to amdgpu_ras_process_unpause.

V4:
  Rename functions.

V5:
  Move the function to amdgpu_ras.c.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai
2025-10-28 16:18:31 +08:00
committed by Alex Deucher
parent d4432f16d3
commit d95ca7f515
10 changed files with 148 additions and 2 deletions

View File

@@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
type = amdgpu_ras_get_fatal_error_event(adev);
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head) {
amdgpu_ras_query_err_status(remote_adev);
amdgpu_ras_log_on_err_counter(remote_adev, type);
if (amdgpu_uniras_enabled(remote_adev)) {
amdgpu_ras_mgr_update_ras_ecc(remote_adev);
} else {
amdgpu_ras_query_err_status(remote_adev);
amdgpu_ras_log_on_err_counter(remote_adev, type);
}
}
}
@@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr
return ret;
}
/**
 * amdgpu_ras_pre_reset - invoke the RAS manager pre-reset hook per device
 * @adev: not referenced by this function
 * @device_list: list head whose entries are chained through reset_list
 *
 * Walks every amdgpu_device on @device_list and calls
 * amdgpu_ras_mgr_pre_reset() on each one for which
 * amdgpu_uniras_enabled() returns true. Devices failing that check are
 * skipped without any other action.
 */
void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
			  struct list_head *device_list)
{
	/* Loop cursor; assigned by the list iterator before first use. */
	struct amdgpu_device *reset_dev;

	list_for_each_entry(reset_dev, device_list, reset_list) {
		if (amdgpu_uniras_enabled(reset_dev))
			amdgpu_ras_mgr_pre_reset(reset_dev);
	}
}
/**
 * amdgpu_ras_post_reset - invoke the RAS manager post-reset hook per device
 * @adev: not referenced by this function
 * @device_list: list head whose entries are chained through reset_list
 *
 * Walks every amdgpu_device on @device_list and calls
 * amdgpu_ras_mgr_post_reset() on each one for which
 * amdgpu_uniras_enabled() returns true. Devices failing that check are
 * skipped without any other action.
 */
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
			   struct list_head *device_list)
{
	/* Loop cursor; assigned by the list iterator before first use. */
	struct amdgpu_device *reset_dev;

	list_for_each_entry(reset_dev, device_list, reset_list) {
		if (amdgpu_uniras_enabled(reset_dev))
			amdgpu_ras_mgr_post_reset(reset_dev);
	}
}