drm/amdgpu: break driver init process when it's bad GPU(v5)

When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
    bad gpu tag and refine error message.

v3: Refine function argument name.

v4: Fix missing check of returning value of i2c
    initialization error case.

v5: Use dev_err to print PCI information in dmesg instead
    of DRM_ERROR.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Guchun Chen
2020-07-23 15:42:19 +08:00
committed by Alex Deucher
parent 1d6a9d122d
commit b82e65a935
4 changed files with 36 additions and 7 deletions

View File

@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
}
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit)
{
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
.buf = buff,
};
*exceed_err_limit = false;
/* Verify i2c adapter is initialized */
if (!adev->pm.smu_i2c.algo)
return -ENOENT;
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->num_recs);
} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
(amdgpu_bad_page_threshold != 0)) {
*exceed_err_limit = true;
dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
"disabling the GPU.\n");
} else {
DRM_INFO("Creating new EEPROM table");