drm/amdgpu: refine amdgpu ras event id core code

v1:
- use unified event id to manage ras events
- add a new function amdgpu_ras_query_error_status_with_event() to accept
  event type as parameter.

v2:
add a warn log to show the location of function failure
when calling amdgpu_ras_mark_event(). (Tao Zhou)

v3:
change RAS_EVENT_TYPE_ISR to RAS_EVENT_TYPE_FATAL.

v4:
rename amdgpu_ras_get_recovery_event() to
amdgpu_ras_get_fatal_error_event().

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang
2024-06-25 14:23:42 +08:00
committed by Alex Deucher
parent e33697141b
commit 75ac6a2506
4 changed files with 105 additions and 27 deletions

View File

@@ -274,7 +274,7 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
struct ras_query_context *qctx)
{
u64 event_id = qctx->event_id;
u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
@@ -543,7 +543,7 @@ static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
amdgpu_mca_bank_set_init(&mca_set);
qctx.event_id = 0ULL;
qctx.evid.event_id = RAS_EVENT_INVALID_ID;
ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
if (ret)
goto err_free_mca_set;