mirror of
https://github.com/torvalds/linux.git
synced 2026-04-25 01:52:32 -04:00
drm/amdgpu: fix failure to reserve bad pages
In amdgpu_ras_reset_gpu, because bad pages may not have been freed yet, reserving them has a high probability of failing. Change to reserving bad pages when VRAM is freed. v2: 1. avoid allocating the drm_mm node outside of amdgpu_vram_mgr.c; 2. move bad page reserving into amdgpu_ras_add_bad_pages: if the vram mgr fails to reserve a bad page, the page is put on the pending list, otherwise it is put on the processed list; 3. remove amdgpu_ras_release_bad_pages, because the retired pages' info has been moved into amdgpu_vram_mgr. v3: 1. format code style; 2. rename amdgpu_vram_reserve_scope as amdgpu_vram_reservation; 3. rename scope_pending as reservations_pending; 4. rename scope_processed as reserved_pages; 5. change to iterate over all the pending ones and try to insert them with drm_mm_reserve_node(). v4: 1. rename amdgpu_vram_mgr_reserve_scope as amdgpu_vram_mgr_reserve_range; 2. remove unused include "amdgpu_ras.h"; 3. rename amdgpu_vram_mgr_check_and_reserve as amdgpu_vram_mgr_do_reserve; 4. refine amdgpu_vram_mgr_reserve_range to call amdgpu_vram_mgr_do_reserve. Reviewed-by: Christian König <christian.koenig@amd.com> Reviewed-by: Hawking Zhang <hawking.zhang@amd.com> Signed-off-by: Dennis Li <Dennis.Li@amd.com> Signed-off-by: Wenhui Sheng <Wenhui.Sheng@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
|
||||
|
||||
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
|
||||
|
||||
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
uint64_t addr);
|
||||
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
uint64_t addr);
|
||||
|
||||
@@ -1551,10 +1553,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
|
||||
.size = AMDGPU_GPU_PAGE_SIZE,
|
||||
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
};
|
||||
|
||||
if (data->last_reserved <= i)
|
||||
ret = amdgpu_vram_mgr_query_page_status(
|
||||
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
|
||||
data->bps[i].retired_page);
|
||||
if (ret == -EBUSY)
|
||||
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
|
||||
else if (data->bps_bo[i] == NULL)
|
||||
else if (ret == -ENOENT)
|
||||
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
|
||||
}
|
||||
|
||||
@@ -1606,12 +1610,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
|
||||
unsigned int new_space = old_space + pages;
|
||||
unsigned int align_space = ALIGN(new_space, 512);
|
||||
void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
|
||||
struct amdgpu_bo **bps_bo =
|
||||
kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
|
||||
|
||||
if (!bps || !bps_bo) {
|
||||
if (!bps) {
|
||||
kfree(bps);
|
||||
kfree(bps_bo);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -1620,14 +1621,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
|
||||
data->count * sizeof(*data->bps));
|
||||
kfree(data->bps);
|
||||
}
|
||||
if (data->bps_bo) {
|
||||
memcpy(bps_bo, data->bps_bo,
|
||||
data->count * sizeof(*data->bps_bo));
|
||||
kfree(data->bps_bo);
|
||||
}
|
||||
|
||||
data->bps = bps;
|
||||
data->bps_bo = bps_bo;
|
||||
data->space_left += align_space - old_space;
|
||||
return 0;
|
||||
}
|
||||
@@ -1639,6 +1634,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
int ret = 0;
|
||||
uint32_t i;
|
||||
|
||||
if (!con || !con->eh_data || !bps || pages <= 0)
|
||||
return 0;
|
||||
@@ -1648,16 +1644,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
if (data->space_left <= pages)
|
||||
if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
|
||||
for (i = 0; i < pages; i++) {
|
||||
if (amdgpu_ras_check_bad_page_unlock(con,
|
||||
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
|
||||
continue;
|
||||
|
||||
if (!data->space_left &&
|
||||
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
|
||||
data->count += pages;
|
||||
data->space_left -= pages;
|
||||
amdgpu_vram_mgr_reserve_range(
|
||||
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
|
||||
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
|
||||
AMDGPU_GPU_PAGE_SIZE);
|
||||
|
||||
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
@@ -1730,6 +1736,20 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
uint64_t addr)
|
||||
{
|
||||
struct ras_err_handler_data *data = con->eh_data;
|
||||
int i;
|
||||
|
||||
addr >>= AMDGPU_GPU_PAGE_SHIFT;
|
||||
for (i = 0; i < data->count; i++)
|
||||
if (addr == data->bps[i].retired_page)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* check if an address belongs to bad page
|
||||
*
|
||||
@@ -1739,26 +1759,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
uint64_t addr)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
int i;
|
||||
bool ret = false;
|
||||
|
||||
if (!con || !con->eh_data)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
data = con->eh_data;
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
addr >>= AMDGPU_GPU_PAGE_SHIFT;
|
||||
for (i = 0; i < data->count; i++)
|
||||
if (addr == data->bps[i].retired_page) {
|
||||
ret = true;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
ret = amdgpu_ras_check_bad_page_unlock(con, addr);
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
return ret;
|
||||
}
|
||||
@@ -1804,77 +1811,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
|
||||
}
|
||||
}
|
||||
|
||||
/* called in gpu recovery/init */
|
||||
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
uint64_t bp;
|
||||
struct amdgpu_bo *bo = NULL;
|
||||
int i, ret = 0;
|
||||
|
||||
/* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
|
||||
if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
data = con->eh_data;
|
||||
if (!data)
|
||||
goto out;
|
||||
/* reserve vram at driver post stage. */
|
||||
for (i = data->last_reserved; i < data->count; i++) {
|
||||
bp = data->bps[i].retired_page;
|
||||
|
||||
/* There are two cases of reserve error should be ignored:
|
||||
* 1) a ras bad page has been allocated (used by someone);
|
||||
* 2) a ras bad page has been reserved (duplicate error injection
|
||||
* for one page);
|
||||
*/
|
||||
if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
|
||||
AMDGPU_GPU_PAGE_SIZE,
|
||||
AMDGPU_GEM_DOMAIN_VRAM,
|
||||
&bo, NULL))
|
||||
dev_warn(adev->dev, "RAS WARN: reserve vram for "
|
||||
"retired page %llx fail\n", bp);
|
||||
|
||||
data->bps_bo[i] = bo;
|
||||
data->last_reserved = i + 1;
|
||||
bo = NULL;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* called when driver unload */
|
||||
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
struct amdgpu_bo *bo;
|
||||
int i;
|
||||
|
||||
if (!con || !con->eh_data)
|
||||
return 0;
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
data = con->eh_data;
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
for (i = data->last_reserved - 1; i >= 0; i--) {
|
||||
bo = data->bps_bo[i];
|
||||
|
||||
amdgpu_bo_free_kernel(&bo, NULL, NULL);
|
||||
|
||||
data->bps_bo[i] = bo;
|
||||
data->last_reserved = i;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
@@ -1914,18 +1850,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
ret = amdgpu_ras_load_bad_pages(adev);
|
||||
if (ret)
|
||||
goto free;
|
||||
ret = amdgpu_ras_reserve_bad_pages(adev);
|
||||
if (ret)
|
||||
goto release;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
release:
|
||||
amdgpu_ras_release_bad_pages(adev);
|
||||
free:
|
||||
kfree((*data)->bps);
|
||||
kfree((*data)->bps_bo);
|
||||
kfree(*data);
|
||||
con->eh_data = NULL;
|
||||
out:
|
||||
@@ -1953,12 +1883,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
|
||||
cancel_work_sync(&con->recovery_work);
|
||||
amdgpu_ras_release_bad_pages(adev);
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
con->eh_data = NULL;
|
||||
kfree(data->bps);
|
||||
kfree(data->bps_bo);
|
||||
kfree(data);
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user