drm/amdgpu: support ras critical address check

Record RAS critical VRAM regions (currently the firmware reserved memory) when recovery init completes, free the records at fini, and add amdgpu_ras_check_critical_address() to test whether a given address falls inside any recorded region.
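For reference, a minimal sketch of the ras_critical_region record the new code assumes; the fields and their meaning are inferred from how the functions below use them, and the actual definition (presumably in amdgpu_ras.h, the second changed file not shown in this diff) may differ in order or layout:

struct ras_critical_region {
	struct list_head node;	/* link in con->critical_region_head */
	struct amdgpu_bo *bo;	/* buffer object backing the region */
	uint64_t start;		/* VRAM block start address */
	uint64_t size;		/* VRAM block size in bytes */
};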

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

@@ -143,6 +143,10 @@ static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
@@ -3728,6 +3732,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	amdgpu_ras_critical_region_init(adev);

	return 0;
}
/* recovery end */
@@ -4157,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
	con->init_task_pid = task_pid_nr(current);
	get_task_comm(con->init_task_comm, current);

	mutex_init(&con->critical_region_lock);
	INIT_LIST_HEAD(&con->critical_region_head);

	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 adev->ras_hw_enabled, adev->ras_enabled);
@@ -4436,6 +4445,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
	if (!adev->ras_enabled || !con)
		return 0;

	amdgpu_ras_critical_region_fini(adev);
	mutex_destroy(&con->critical_region_lock);

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
@@ -5380,3 +5392,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
	return con->is_rma;
}

int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
				   struct amdgpu_bo *bo)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_vram_mgr_resource *vres;
	struct ras_critical_region *region;
	struct drm_buddy_block *block;
	int ret = 0;

	if (!bo || !bo->tbo.resource)
		return -EINVAL;

	vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);

	mutex_lock(&con->critical_region_lock);

	/* Check if the bo has already been recorded */
	list_for_each_entry(region, &con->critical_region_head, node)
		if (region->bo == bo)
			goto out;

	/* Record the new critical amdgpu bo, one region per VRAM block */
	list_for_each_entry(block, &vres->blocks, link) {
		region = kzalloc(sizeof(*region), GFP_KERNEL);
		if (!region) {
			ret = -ENOMEM;
			goto out;
		}
		region->bo = bo;
		region->start = amdgpu_vram_mgr_block_start(block);
		region->size = amdgpu_vram_mgr_block_size(block);
		list_add_tail(&region->node, &con->critical_region_head);
	}

out:
	mutex_unlock(&con->critical_region_lock);

	return ret;
}

static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
{
	amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory);
}

static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *region, *tmp;

	mutex_lock(&con->critical_region_lock);
	list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
		list_del(&region->node);
		kfree(region);
	}
	mutex_unlock(&con->critical_region_lock);
}

bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *region;
	bool ret = false;

	mutex_lock(&con->critical_region_lock);

	list_for_each_entry(region, &con->critical_region_head, node) {
		if ((region->start <= addr) &&
		    (addr < (region->start + region->size))) {
			ret = true;
			break;
		}
	}

	mutex_unlock(&con->critical_region_lock);

	return ret;
}
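
A hedged usage sketch, not part of this patch: a caller on the error-handling path could consult the new helper before retiring a bad page, so that errors landing in a critical region (such as the firmware reserved memory recorded by amdgpu_ras_critical_region_init()) can be treated specially. example_handle_bad_page() and the warn-and-return policy here are hypothetical.

static void example_handle_bad_page(struct amdgpu_device *adev, uint64_t addr)
{
	/* addr hits a region recorded via amdgpu_ras_add_critical_region() */
	if (amdgpu_ras_check_critical_address(adev, addr)) {
		dev_warn(adev->dev,
			 "RAS: bad page 0x%llx falls in a critical region\n",
			 (unsigned long long)addr);
		return;
	}

	/* ... normal bad-page retirement would continue here ... */
}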