drm/amdgpu: add MCA smu cache support

v1:
Because a valid CE MCA bank in the SMU is cleared once it has been read,
this patch adds an MCA bank cache at the driver level to ensure that the bank is not lost.

v2:
Refine the amdgpu_mca_init/fini/reset() function names.

v3:
Add mca_cache.lock support.
Only add CE banks to the MCA bank cache.

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang
2024-04-18 15:46:00 +08:00
committed by Alex Deucher
parent 8fb20d9551
commit 76ad30f51a
3 changed files with 116 additions and 7 deletions

View File

@@ -77,11 +77,22 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
};
/*
 * struct mca_bank_set - a list head paired with an entry counter.
 *
 * NOTE(review): presumably the elements linked on @list are
 * struct mca_bank_node objects and @nr_entries tracks how many are
 * linked; the code that maintains both is not visible here - confirm.
 */
struct mca_bank_set {
/* Entry counter for @list (signed int). */
int nr_entries;
/* Head of the linked list of entries. */
struct list_head list;
};
/*
 * struct mca_bank_cache - a bank set bundled with a mutex.
 *
 * NOTE(review): presumably @lock serialises every access to @mca_set;
 * the locking rules live in the implementation file, which is not
 * visible here - confirm before relying on this.
 */
struct mca_bank_cache {
/* The cached banks. */
struct mca_bank_set mca_set;
/* Mutex stored alongside @mca_set. */
struct mutex lock;
};
/*
 * struct amdgpu_mca - per-device MCA state: three amdgpu_mca_ras
 * instances (mp0, mp1, mpio), the SMU MCA callback table and an array
 * of bank caches.
 */
struct amdgpu_mca {
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
/* SMU MCA callback table; read-only through this pointer. */
const struct amdgpu_mca_smu_funcs *mca_funcs;
/*
 * One cache per index below AMDGPU_MCA_ERROR_TYPE_DE.
 * NOTE(review): assumes the array is indexed by enum
 * amdgpu_mca_error_type and that every cached type has a value smaller
 * than AMDGPU_MCA_ERROR_TYPE_DE; the enum is not visible here - confirm.
 */
struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE];
};
enum mca_reg_idx {
@@ -113,11 +124,6 @@ struct mca_bank_node {
struct list_head node;
};
/*
 * struct mca_bank_set - a list head paired with an entry counter.
 *
 * NOTE(review): this text is identical to a struct mca_bank_set
 * definition that appears earlier in this view; presumably this copy is
 * the removed side of the diff hunk (the definition was moved up so
 * that it can be embedded by value) - confirm against the final header.
 */
struct mca_bank_set {
/* Entry counter for @list (signed int). */
int nr_entries;
/* Head of the linked list of entries. */
struct list_head list;
};
struct amdgpu_mca_smu_funcs {
int max_ue_count;
int max_ce_count;
@@ -149,6 +155,9 @@ int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
/*
 * Public MCA entry points. Only the declarations are visible here; the
 * notes below are hedged where they go beyond the signatures.
 */
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
/* Takes a device and a read-only SMU MCA callback table. */
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
/*
 * Device-level MCA init / fini / reset.
 * NOTE(review): presumably these set up, tear down and empty the MCA
 * bank caches held in struct amdgpu_mca; init and reset return an int
 * (presumably 0 or a negative errno) - confirm against the
 * implementation.
 */
int amdgpu_mca_init(struct amdgpu_device *adev);
void amdgpu_mca_fini(struct amdgpu_device *adev);
int amdgpu_mca_reset(struct amdgpu_device *adev);
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
/*
 * Takes a RAS block, an MCA error type and an output pointer @total.
 * NOTE(review): presumably writes the error count for that block/type
 * to *@total - confirm.
 */
int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *total);