drm/amdgpu: refine amdgpu ras event id core code

v1:
- use unified event id to manage ras events
- add a new function amdgpu_ras_query_error_status_with_event() to accept
  event type as parameter.

v2:
add a warn log to show the location of function failure
when calling amdgpu_ras_mark_event(). (Tao Zhou)

v3:
change RAS_EVENT_TYPE_ISR to RAS_EVENT_TYPE_FATAL.

v4:
rename amdgpu_ras_get_recovery_event() to
amdgpu_ras_get_fatal_error_event().

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang
2024-06-25 14:23:42 +08:00
committed by Alex Deucher
parent e33697141b
commit 75ac6a2506
4 changed files with 105 additions and 27 deletions

View File

@@ -68,9 +68,15 @@ struct amdgpu_iv_entry;
/* The high three bits indicates socketid */
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
#define RAS_EVENT_INVALID_ID (BIT_ULL(63))
#define RAS_EVENT_ID_IS_VALID(x) (!((x) & BIT_ULL(63)))
#define RAS_EVENT_LOG(adev, id, fmt, ...) \
amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__)
#define amdgpu_ras_mark_ras_event(adev, type) \
(amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -427,18 +433,23 @@ struct umc_ecc_info {
};
enum ras_event_type {
RAS_EVENT_TYPE_INVALID = -1,
RAS_EVENT_TYPE_ISR = 0,
RAS_EVENT_TYPE_INVALID = 0,
RAS_EVENT_TYPE_FATAL,
RAS_EVENT_TYPE_COUNT,
};
struct ras_event_manager {
atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
atomic64_t seqno;
u64 last_seqno[RAS_EVENT_TYPE_COUNT];
};
struct ras_event_id {
enum ras_event_type type;
u64 event_id;
};
struct ras_query_context {
enum ras_event_type type;
u64 event_id;
struct ras_event_id evid;
};
typedef int (*pasid_notify)(struct amdgpu_device *adev,
@@ -947,8 +958,9 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
const void *caller);
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);