drm/amdgpu: refine smu v13.0.6 mca dump driver

refine smu mca driver to support query ras error from pmfw path.
- correct gfx smu bank hwid (from mp5 to smu bank)
- retire unused callback function in amdgpu_mca_smu_funcs{}
- add new mca_bank_set{} structure to collect mca bank
- move enum mca_reg_idx into amdgpu_mca.h header
- add mca status register field decode macro

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang
2023-11-02 14:17:04 +08:00
committed by Alex Deucher
parent 0b1695710a
commit 07c1db7036
3 changed files with 341 additions and 204 deletions

View File

@@ -143,6 +143,46 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set)
{
if (!mca_set)
return;
memset(mca_set, 0, sizeof(*mca_set));
INIT_LIST_HEAD(&mca_set->list);
}
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry)
{
struct mca_bank_node *node;
if (!entry)
return -EINVAL;
node = kvzalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return -ENOMEM;
memcpy(&node->entry, entry, sizeof(*entry));
INIT_LIST_HEAD(&node->node);
list_add_tail(&node->node, &mca_set->list);
mca_set->nr_entries++;
return 0;
}
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
{
struct mca_bank_node *node, *tmp;
list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
list_del(&node->node);
kvfree(node);
}
}
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
{
struct amdgpu_mca *mca = &adev->mca;
@@ -160,6 +200,58 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
return -EOPNOTSUPP;
}
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
{
int i;
for (i = 0; i < ARRAY_SIZE(entry->regs); i++)
dev_dbg(adev->dev, "mca entry[%02d].regs[%02d]=0x%016llx\n", idx, i, entry->regs[i]);
}
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
{
struct amdgpu_smuio_mcm_config_info mcm_info;
struct mca_bank_set mca_set;
struct mca_bank_node *node;
struct mca_bank_entry *entry;
uint32_t count;
int ret, i = 0;
amdgpu_mca_bank_set_init(&mca_set);
ret = amdgpu_mca_smu_get_mca_set(adev, blk, type, &mca_set);
if (ret)
goto out_mca_release;
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret)
goto out_mca_release;
if (!count)
continue;
mcm_info.socket_id = entry->info.socket_id;
mcm_info.die_id = entry->info.aid;
if (type == AMDGPU_MCA_ERROR_TYPE_UE)
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
else
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
}
out_mca_release:
amdgpu_mca_bank_set_release(&mca_set);
return ret;
}
int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
@@ -173,17 +265,77 @@ int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_m
return -EOPNOTSUPP;
}
int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count)
int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *total)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!count)
struct mca_bank_set mca_set;
struct mca_bank_node *node;
struct mca_bank_entry *entry;
uint32_t count;
int ret;
if (!total)
return -EINVAL;
if (mca_funcs && mca_funcs->mca_get_error_count)
return mca_funcs->mca_get_error_count(adev, blk, type, count);
if (!mca_funcs)
return -EOPNOTSUPP;
return -EOPNOTSUPP;
if (!mca_funcs->mca_get_ras_mca_set || !mca_funcs->mca_get_valid_mca_count)
return -EOPNOTSUPP;
amdgpu_mca_bank_set_init(&mca_set);
ret = mca_funcs->mca_get_ras_mca_set(adev, blk, type, &mca_set);
if (ret)
goto err_mca_set_release;
*total = 0;
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
count = 0;
ret = mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret)
goto err_mca_set_release;
*total += count;
}
err_mca_set_release:
amdgpu_mca_bank_set_release(&mca_set);
return ret;
}
int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!count || !entry)
return -EINVAL;
if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count)
return -EOPNOTSUPP;
return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
}
int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!mca_set)
return -EINVAL;
if (!mca_funcs || !mca_funcs->mca_get_ras_mca_set)
return -EOPNOTSUPP;
WARN_ON(!list_empty(&mca_set->list));
return mca_funcs->mca_get_ras_mca_set(adev, blk, type, mca_set);
}
int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,