mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
drm/amdgpu: support ras critical address check
Support ras critical address check.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
d45c5e6845
commit
f348691897
@@ -143,6 +143,10 @@ static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
uint64_t addr);
|
||||
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
uint64_t addr);
|
||||
|
||||
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
|
||||
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
|
||||
|
||||
#ifdef CONFIG_X86_MCE_AMD
|
||||
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
|
||||
struct mce_notifier_adev_list {
|
||||
@@ -3728,6 +3732,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
||||
kfree(data);
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
amdgpu_ras_critical_region_init(adev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
/* recovery end */
|
||||
@@ -4157,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
||||
con->init_task_pid = task_pid_nr(current);
|
||||
get_task_comm(con->init_task_comm, current);
|
||||
|
||||
mutex_init(&con->critical_region_lock);
|
||||
INIT_LIST_HEAD(&con->critical_region_head);
|
||||
|
||||
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
|
||||
"hardware ability[%x] ras_mask[%x]\n",
|
||||
adev->ras_hw_enabled, adev->ras_enabled);
|
||||
@@ -4436,6 +4445,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
|
||||
if (!adev->ras_enabled || !con)
|
||||
return 0;
|
||||
|
||||
amdgpu_ras_critical_region_fini(adev);
|
||||
mutex_destroy(&con->critical_region_lock);
|
||||
|
||||
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
|
||||
if (ras_node->ras_obj) {
|
||||
obj = ras_node->ras_obj;
|
||||
@@ -5380,3 +5392,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
|
||||
|
||||
return con->is_rma;
|
||||
}
|
||||
|
||||
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
|
||||
struct amdgpu_bo *bo)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_vram_mgr_resource *vres;
|
||||
struct ras_critical_region *region;
|
||||
struct drm_buddy_block *block;
|
||||
int ret = 0;
|
||||
|
||||
if (!bo || !bo->tbo.resource)
|
||||
return -EINVAL;
|
||||
|
||||
vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
|
||||
|
||||
mutex_lock(&con->critical_region_lock);
|
||||
|
||||
/* Check if the bo had been recorded */
|
||||
list_for_each_entry(region, &con->critical_region_head, node)
|
||||
if (region->bo == bo)
|
||||
goto out;
|
||||
|
||||
/* Record new critical amdgpu bo */
|
||||
list_for_each_entry(block, &vres->blocks, link) {
|
||||
region = kzalloc(sizeof(*region), GFP_KERNEL);
|
||||
if (!region) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
region->bo = bo;
|
||||
region->start = amdgpu_vram_mgr_block_start(block);
|
||||
region->size = amdgpu_vram_mgr_block_size(block);
|
||||
list_add_tail(®ion->node, &con->critical_region_head);
|
||||
}
|
||||
|
||||
out:
|
||||
mutex_unlock(&con->critical_region_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory);
|
||||
}
|
||||
|
||||
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_critical_region *region, *tmp;
|
||||
|
||||
mutex_lock(&con->critical_region_lock);
|
||||
list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
|
||||
list_del(®ion->node);
|
||||
kfree(region);
|
||||
}
|
||||
mutex_unlock(&con->critical_region_lock);
|
||||
}
|
||||
|
||||
/**
 * amdgpu_ras_check_critical_address - test an address against critical regions
 * @adev: amdgpu device whose RAS context holds the critical region list
 * @addr: address to look up, in the same address space as the recorded
 *        region start values
 *
 * Walks con->critical_region_head under con->critical_region_lock and
 * checks whether @addr lies in [start, start + size) of any entry.
 *
 * Return: true if @addr falls inside a recorded region, false otherwise
 * (including when no RAS context exists).
 */
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *region;
	bool ret = false;

	/* Without a RAS context there is no list, hence nothing critical. */
	if (!con)
		return false;

	mutex_lock(&con->critical_region_lock);
	list_for_each_entry(region, &con->critical_region_head, node) {
		/*
		 * Compare the offset into the region against its size rather
		 * than computing start + size, which could wrap in uint64_t.
		 */
		if ((region->start <= addr) &&
		    ((addr - region->start) < region->size)) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&con->critical_region_lock);

	return ret;
}
|
||||
|
||||
Reference in New Issue
Block a user