mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
Merge tag 'amd-drm-next-6.18-2025-08-29' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.18-2025-08-29: amdgpu: - Replay fixes - RAS updates - VCN SRAM load fixes - EDID read fixes - eDP ALPM support - AUX fixes - Documentation updates - Rework how PTE flags are generated - DCE6 fixes - VCN devcoredump cleanup - MMHUB client id fixes - SR-IOV fixes - VRR fixes - VCN 5.0.1 RAS support - Backlight fixes - UserQ fixes - Misc code cleanups - SMU 13.0.12 updates - Expanded PCIe DPC support - Expanded VCN reset support - SMU 13.0.x Updates - VPE per queue reset support - Cursor rotation fix - DSC fixes - GC 12 MES TLB invalidation update - Cursor fixes - Non-DC TMDS clock validation fix amdkfd: - debugfs fixes - Misc code cleanups - Page migration fixes - Partition fixes - SVM fixes radeon: - Misc code cleanups Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://lore.kernel.org/r/20250829190848.1921648-1-alexander.deucher@amd.com
This commit is contained in:
@@ -122,12 +122,15 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
|
||||
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
|
||||
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
|
||||
|
||||
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
|
||||
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10
|
||||
|
||||
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
|
||||
|
||||
#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
|
||||
|
||||
#define BYPASS_ALLOCATED_ADDRESS 0x0
|
||||
#define BYPASS_INITIALIZATION_ADDRESS 0x1
|
||||
|
||||
enum amdgpu_ras_retire_page_reservation {
|
||||
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
AMDGPU_RAS_RETIRE_PAGE_PENDING,
|
||||
@@ -136,10 +139,14 @@ enum amdgpu_ras_retire_page_reservation {
|
||||
|
||||
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
|
||||
|
||||
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
uint64_t addr);
|
||||
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
uint64_t addr);
|
||||
|
||||
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
|
||||
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
|
||||
|
||||
#ifdef CONFIG_X86_MCE_AMD
|
||||
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
|
||||
struct mce_notifier_adev_list {
|
||||
@@ -169,18 +176,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
|
||||
struct eeprom_table_record err_rec;
|
||||
int ret;
|
||||
|
||||
if ((address >= adev->gmc.mc_vram_size) ||
|
||||
(address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
|
||||
ret = amdgpu_ras_check_bad_page(adev, address);
|
||||
if (ret == -EINVAL) {
|
||||
dev_warn(adev->dev,
|
||||
"RAS WARN: input address 0x%llx is invalid.\n",
|
||||
address);
|
||||
"RAS WARN: input address 0x%llx is invalid.\n",
|
||||
address);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (amdgpu_ras_check_bad_page(adev, address)) {
|
||||
} else if (ret == 1) {
|
||||
dev_warn(adev->dev,
|
||||
"RAS WARN: 0x%llx has already been marked as bad page!\n",
|
||||
address);
|
||||
"RAS WARN: 0x%llx has already been marked as bad page!\n",
|
||||
address);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -207,6 +212,49 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * amdgpu_check_address_validity - check whether an address may be used for
 * a RAS operation requested through the debugfs control interface
 * @adev: amdgpu device
 * @address: address to check
 * @flags: BYPASS_ALLOCATED_ADDRESS or BYPASS_INITIALIZATION_ADDRESS
 *
 * The address is expanded into a row of page frame numbers and the VRAM
 * manager is asked for the owner of the block backing each page.
 *
 * Return:
 * 0 when the UMC IP version is older than 12.0.0 (no check is performed) or
 * when no page in the row is rejected;
 * -EFAULT when @address is beyond the VRAM size or the UMC inject limit;
 * -EPERM when the row lookup yields no pages;
 * -EACCES when a page is rejected by the ownership rule selected by @flags.
 */
static int amdgpu_check_address_validity(struct amdgpu_device *adev,
			uint64_t address, uint64_t flags)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_vram_block_info blk_info;
	uint64_t page_pfns[32] = {0};
	int idx, nr_pages;

	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
		return 0;

	if (address >= adev->gmc.mc_vram_size ||
	    address >= RAS_UMC_INJECT_ADDR_LIMIT)
		return -EFAULT;

	nr_pages = amdgpu_umc_lookup_bad_pages_in_a_row(adev, address,
				page_pfns, ARRAY_SIZE(page_pfns));
	if (nr_pages <= 0)
		return -EPERM;

	for (idx = 0; idx < nr_pages; idx++) {
		memset(&blk_info, 0, sizeof(blk_info));

		/* Pages the VRAM manager cannot describe are not checked. */
		if (amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
				page_pfns[idx] << AMDGPU_GPU_PAGE_SHIFT, &blk_info))
			continue;

		switch (flags) {
		case BYPASS_ALLOCATED_ADDRESS:
			/*
			 * The address under test was allocated by the calling
			 * process itself, so only blocks owned by some other
			 * task (different pid or different comm) are rejected.
			 */
			if (blk_info.task.pid != task_pid_nr(current) ||
			    strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN))
				return -EACCES;
			break;
		case BYPASS_INITIALIZATION_ADDRESS:
			/*
			 * Reject blocks owned by the task recorded at RAS
			 * init time (same pid and same comm).
			 */
			if (blk_info.task.pid == con->init_task_pid &&
			    !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
				return -EACCES;
			break;
		default:
			break;
		}
	}

	return 0;
}
|
||||
|
||||
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
|
||||
size_t size, loff_t *pos)
|
||||
{
|
||||
@@ -297,6 +345,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
|
||||
op = 2;
|
||||
else if (strstr(str, "retire_page") != NULL)
|
||||
op = 3;
|
||||
else if (strstr(str, "check_address") != NULL)
|
||||
op = 4;
|
||||
else if (str[0] && str[1] && str[2] && str[3])
|
||||
/* ascii string, but commands are not matched. */
|
||||
return -EINVAL;
|
||||
@@ -310,6 +360,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
|
||||
data->op = op;
|
||||
data->inject.address = address;
|
||||
|
||||
return 0;
|
||||
} else if (op == 4) {
|
||||
if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 &&
|
||||
sscanf(str, "%*s %llu %llu", &address, &value) != 2)
|
||||
return -EINVAL;
|
||||
|
||||
data->op = op;
|
||||
data->inject.address = address;
|
||||
data->inject.value = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -500,6 +559,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
|
||||
return size;
|
||||
else
|
||||
return ret;
|
||||
} else if (data.op == 4) {
|
||||
ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value);
|
||||
return ret ? ret : size;
|
||||
}
|
||||
|
||||
if (!amdgpu_ras_is_supported(adev, data.head.block))
|
||||
@@ -513,22 +575,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
|
||||
ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
|
||||
break;
|
||||
case 2:
|
||||
if ((data.inject.address >= adev->gmc.mc_vram_size &&
|
||||
adev->gmc.mc_vram_size) ||
|
||||
(data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
|
||||
dev_warn(adev->dev, "RAS WARN: input address "
|
||||
"0x%llx is invalid.",
|
||||
data.inject.address);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
/* umc ce/ue error injection for a bad page is not allowed */
|
||||
if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
|
||||
amdgpu_ras_check_bad_page(adev, data.inject.address)) {
|
||||
dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
|
||||
"already been marked as bad!\n",
|
||||
data.inject.address);
|
||||
if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
|
||||
ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
|
||||
if (ret == -EINVAL) {
|
||||
dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
|
||||
data.inject.address);
|
||||
break;
|
||||
} else if (ret == 1) {
|
||||
dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
|
||||
data.inject.address);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2566,18 +2622,26 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
|
||||
goto out;
|
||||
}
|
||||
|
||||
*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
|
||||
*bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL);
|
||||
if (!*bps) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (; i < data->count; i++) {
|
||||
if (!data->bps[i].ts)
|
||||
continue;
|
||||
|
||||
(*bps)[i] = (struct ras_badpage){
|
||||
.bp = data->bps[i].retired_page,
|
||||
.size = AMDGPU_GPU_PAGE_SIZE,
|
||||
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
};
|
||||
|
||||
if (amdgpu_ras_check_critical_address(adev,
|
||||
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
|
||||
continue;
|
||||
|
||||
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
|
||||
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
|
||||
if (status == -EBUSY)
|
||||
@@ -2586,7 +2650,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
|
||||
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
|
||||
}
|
||||
|
||||
*count = data->count;
|
||||
*count = con->bad_page_num;
|
||||
out:
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
return ret;
|
||||
@@ -2722,7 +2786,7 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
|
||||
unsigned int old_space = data->count + data->space_left;
|
||||
unsigned int new_space = old_space + pages;
|
||||
unsigned int align_space = ALIGN(new_space, 512);
|
||||
void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
|
||||
void *bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL);
|
||||
|
||||
if (!bps) {
|
||||
return -ENOMEM;
|
||||
@@ -2814,8 +2878,11 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
|
||||
|
||||
for (j = 0; j < count; j++) {
|
||||
if (amdgpu_ras_check_bad_page_unlock(con,
|
||||
bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
|
||||
bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!data->space_left &&
|
||||
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
|
||||
@@ -2828,6 +2895,7 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
|
||||
sizeof(struct eeprom_table_record));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
con->bad_page_num++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -2974,7 +3042,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
|
||||
&bps[i], &err_data, nps);
|
||||
if (ret)
|
||||
control->ras_num_bad_pages -= adev->umc.retire_unit;
|
||||
con->bad_page_num -= adev->umc.retire_unit;
|
||||
i += (adev->umc.retire_unit - 1);
|
||||
} else {
|
||||
break;
|
||||
@@ -2988,8 +3056,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
ret = __amdgpu_ras_convert_rec_from_rom(adev,
|
||||
&bps[i], &err_data, nps);
|
||||
if (ret)
|
||||
control->ras_num_bad_pages -= adev->umc.retire_unit;
|
||||
con->bad_page_num -= adev->umc.retire_unit;
|
||||
}
|
||||
|
||||
con->eh_data->count_saved = con->eh_data->count;
|
||||
} else {
|
||||
ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
|
||||
}
|
||||
@@ -3012,7 +3082,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
struct amdgpu_ras_eeprom_control *control;
|
||||
int save_count, unit_num, bad_page_num, i;
|
||||
int save_count, unit_num, i;
|
||||
|
||||
if (!con || !con->eh_data) {
|
||||
if (new_cnt)
|
||||
@@ -3033,27 +3103,26 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
|
||||
mutex_lock(&con->recovery_lock);
|
||||
control = &con->eeprom_control;
|
||||
data = con->eh_data;
|
||||
bad_page_num = control->ras_num_bad_pages;
|
||||
save_count = data->count - bad_page_num;
|
||||
unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
|
||||
save_count = con->bad_page_num - control->ras_num_bad_pages;
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
unit_num = save_count / adev->umc.retire_unit;
|
||||
if (new_cnt)
|
||||
*new_cnt = unit_num;
|
||||
|
||||
/* only new entries are saved */
|
||||
if (save_count > 0) {
|
||||
if (unit_num > 0) {
|
||||
/*old asics only save pa to eeprom like before*/
|
||||
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
|
||||
if (amdgpu_ras_eeprom_append(control,
|
||||
&data->bps[bad_page_num], save_count)) {
|
||||
&data->bps[data->count_saved], unit_num)) {
|
||||
dev_err(adev->dev, "Failed to save EEPROM table data!");
|
||||
return -EIO;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < unit_num; i++) {
|
||||
if (amdgpu_ras_eeprom_append(control,
|
||||
&data->bps[bad_page_num +
|
||||
&data->bps[data->count_saved +
|
||||
i * adev->umc.retire_unit], 1)) {
|
||||
dev_err(adev->dev, "Failed to save EEPROM table data!");
|
||||
return -EIO;
|
||||
@@ -3062,6 +3131,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
|
||||
data->count_saved = data->count;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -3116,17 +3186,17 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
|
||||
}
|
||||
}
|
||||
|
||||
ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = amdgpu_ras_eeprom_check(control);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* HW not usable */
|
||||
if (amdgpu_ras_is_rma(adev)) {
|
||||
if (amdgpu_ras_is_rma(adev))
|
||||
ret = -EHWPOISON;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
|
||||
}
|
||||
|
||||
out:
|
||||
@@ -3134,18 +3204,24 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
uint64_t addr)
|
||||
{
|
||||
struct ras_err_handler_data *data = con->eh_data;
|
||||
struct amdgpu_device *adev = con->adev;
|
||||
int i;
|
||||
|
||||
if ((addr >= adev->gmc.mc_vram_size &&
|
||||
adev->gmc.mc_vram_size) ||
|
||||
(addr >= RAS_UMC_INJECT_ADDR_LIMIT))
|
||||
return -EINVAL;
|
||||
|
||||
addr >>= AMDGPU_GPU_PAGE_SHIFT;
|
||||
for (i = 0; i < data->count; i++)
|
||||
if (addr == data->bps[i].retired_page)
|
||||
return true;
|
||||
return 1;
|
||||
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3153,11 +3229,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
|
||||
*
|
||||
* Note: this check is only for umc block
|
||||
*/
|
||||
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||
uint64_t addr)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
bool ret = false;
|
||||
int ret = 0;
|
||||
|
||||
if (!con || !con->eh_data)
|
||||
return ret;
|
||||
@@ -3241,7 +3317,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
|
||||
|
||||
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
|
||||
ecc_log->de_queried_count = 0;
|
||||
ecc_log->prev_de_queried_count = 0;
|
||||
ecc_log->consumption_q_count = 0;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
|
||||
@@ -3261,7 +3337,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
|
||||
|
||||
mutex_destroy(&ecc_log->lock);
|
||||
ecc_log->de_queried_count = 0;
|
||||
ecc_log->prev_de_queried_count = 0;
|
||||
ecc_log->consumption_q_count = 0;
|
||||
}
|
||||
|
||||
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
|
||||
@@ -3287,7 +3363,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
|
||||
page_retirement_dwork.work);
|
||||
struct amdgpu_device *adev = con->adev;
|
||||
struct ras_err_data err_data;
|
||||
unsigned long err_cnt;
|
||||
|
||||
/* If gpu reset is ongoing, delay retiring the bad pages */
|
||||
if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
|
||||
@@ -3299,13 +3374,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
|
||||
amdgpu_ras_error_data_init(&err_data);
|
||||
|
||||
amdgpu_umc_handle_bad_pages(adev, &err_data);
|
||||
err_cnt = err_data.err_addr_cnt;
|
||||
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
|
||||
if (err_cnt && amdgpu_ras_is_rma(adev))
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
|
||||
amdgpu_ras_schedule_retirement_dwork(con,
|
||||
AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
|
||||
}
|
||||
@@ -3316,49 +3387,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
|
||||
int ret = 0;
|
||||
struct ras_ecc_log_info *ecc_log;
|
||||
struct ras_query_if info;
|
||||
uint32_t timeout = 0;
|
||||
u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
uint64_t de_queried_count;
|
||||
uint32_t new_detect_count, total_detect_count;
|
||||
uint32_t need_query_count = poison_creation_count;
|
||||
u64 de_queried_count;
|
||||
u64 consumption_q_count;
|
||||
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
|
||||
|
||||
memset(&info, 0, sizeof(info));
|
||||
info.head.block = AMDGPU_RAS_BLOCK__UMC;
|
||||
|
||||
ecc_log = &ras->umc_ecc_log;
|
||||
total_detect_count = 0;
|
||||
ecc_log->de_queried_count = 0;
|
||||
ecc_log->consumption_q_count = 0;
|
||||
|
||||
do {
|
||||
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
de_queried_count = ecc_log->de_queried_count;
|
||||
if (de_queried_count > ecc_log->prev_de_queried_count) {
|
||||
new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
|
||||
ecc_log->prev_de_queried_count = de_queried_count;
|
||||
timeout = 0;
|
||||
} else {
|
||||
new_detect_count = 0;
|
||||
}
|
||||
consumption_q_count = ecc_log->consumption_q_count;
|
||||
|
||||
if (new_detect_count) {
|
||||
total_detect_count += new_detect_count;
|
||||
} else {
|
||||
if (!timeout && need_query_count)
|
||||
timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
|
||||
if (de_queried_count && consumption_q_count)
|
||||
break;
|
||||
|
||||
if (timeout) {
|
||||
if (!--timeout)
|
||||
break;
|
||||
msleep(1);
|
||||
}
|
||||
}
|
||||
} while (total_detect_count < need_query_count);
|
||||
msleep(100);
|
||||
} while (--timeout);
|
||||
|
||||
if (total_detect_count)
|
||||
if (de_queried_count)
|
||||
schedule_delayed_work(&ras->page_retirement_dwork, 0);
|
||||
|
||||
if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -3394,6 +3455,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
reset_flags |= msg.reset;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to ensure poison creation handler is completed first
|
||||
* to set rma if bad page exceed threshold.
|
||||
*/
|
||||
flush_delayed_work(&con->page_retirement_dwork);
|
||||
|
||||
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
|
||||
if (reset_flags && !amdgpu_ras_is_rma(adev)) {
|
||||
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
|
||||
@@ -3403,8 +3470,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
else
|
||||
reset = reset_flags;
|
||||
|
||||
flush_delayed_work(&con->page_retirement_dwork);
|
||||
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
|
||||
@@ -3434,6 +3499,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
if (kthread_should_stop())
|
||||
break;
|
||||
|
||||
mutex_lock(&con->poison_lock);
|
||||
gpu_reset = 0;
|
||||
|
||||
do {
|
||||
@@ -3446,7 +3512,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
atomic_sub(poison_creation_count, &con->poison_creation_count);
|
||||
atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
|
||||
}
|
||||
} while (atomic_read(&con->poison_creation_count));
|
||||
} while (atomic_read(&con->poison_creation_count) &&
|
||||
!atomic_read(&con->poison_consumption_count));
|
||||
|
||||
if (ret != -EIO) {
|
||||
msg_count = kfifo_len(&con->poison_fifo);
|
||||
@@ -3463,6 +3530,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
|
||||
/* Clear poison creation request */
|
||||
atomic_set(&con->poison_creation_count, 0);
|
||||
atomic_set(&con->poison_consumption_count, 0);
|
||||
|
||||
/* Clear poison fifo */
|
||||
amdgpu_ras_clear_poison_fifo(adev);
|
||||
@@ -3487,9 +3555,12 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
atomic_sub(msg_count, &con->page_retirement_req_cnt);
|
||||
}
|
||||
|
||||
atomic_set(&con->poison_consumption_count, 0);
|
||||
|
||||
/* Wake up work to save bad pages to eeprom */
|
||||
schedule_delayed_work(&con->page_retirement_dwork, 0);
|
||||
}
|
||||
mutex_unlock(&con->poison_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -3570,8 +3641,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
|
||||
}
|
||||
|
||||
mutex_init(&con->recovery_lock);
|
||||
mutex_init(&con->poison_lock);
|
||||
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
|
||||
atomic_set(&con->in_recovery, 0);
|
||||
atomic_set(&con->rma_in_recovery, 0);
|
||||
con->eeprom_control.bad_channel_bitmap = 0;
|
||||
|
||||
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
|
||||
@@ -3589,6 +3662,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
|
||||
init_waitqueue_head(&con->page_retirement_wq);
|
||||
atomic_set(&con->page_retirement_req_cnt, 0);
|
||||
atomic_set(&con->poison_creation_count, 0);
|
||||
atomic_set(&con->poison_consumption_count, 0);
|
||||
con->page_retirement_thread =
|
||||
kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
|
||||
if (IS_ERR(con->page_retirement_thread)) {
|
||||
@@ -3661,6 +3735,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
||||
kfree(data);
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
amdgpu_ras_critical_region_init(adev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
/* recovery end */
|
||||
@@ -4087,6 +4163,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
||||
goto release_con;
|
||||
}
|
||||
|
||||
con->init_task_pid = task_pid_nr(current);
|
||||
get_task_comm(con->init_task_comm, current);
|
||||
|
||||
mutex_init(&con->critical_region_lock);
|
||||
INIT_LIST_HEAD(&con->critical_region_head);
|
||||
|
||||
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
|
||||
"hardware ability[%x] ras_mask[%x]\n",
|
||||
adev->ras_hw_enabled, adev->ras_enabled);
|
||||
@@ -4366,6 +4448,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
|
||||
if (!adev->ras_enabled || !con)
|
||||
return 0;
|
||||
|
||||
amdgpu_ras_critical_region_fini(adev);
|
||||
mutex_destroy(&con->critical_region_lock);
|
||||
|
||||
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
|
||||
if (ras_node->ras_obj) {
|
||||
obj = ras_node->ras_obj;
|
||||
@@ -5274,6 +5359,9 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
|
||||
uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
|
||||
int ret = 0;
|
||||
|
||||
if (amdgpu_ras_check_critical_address(adev, start))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&con->page_rsv_lock);
|
||||
ret = amdgpu_vram_mgr_query_page_status(mgr, start);
|
||||
if (ret == -ENOENT)
|
||||
@@ -5310,3 +5398,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
|
||||
|
||||
return con->is_rma;
|
||||
}
|
||||
|
||||
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
|
||||
struct amdgpu_bo *bo)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_vram_mgr_resource *vres;
|
||||
struct ras_critical_region *region;
|
||||
struct drm_buddy_block *block;
|
||||
int ret = 0;
|
||||
|
||||
if (!bo || !bo->tbo.resource)
|
||||
return -EINVAL;
|
||||
|
||||
vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
|
||||
|
||||
mutex_lock(&con->critical_region_lock);
|
||||
|
||||
/* Check if the bo had been recorded */
|
||||
list_for_each_entry(region, &con->critical_region_head, node)
|
||||
if (region->bo == bo)
|
||||
goto out;
|
||||
|
||||
/* Record new critical amdgpu bo */
|
||||
list_for_each_entry(block, &vres->blocks, link) {
|
||||
region = kzalloc(sizeof(*region), GFP_KERNEL);
|
||||
if (!region) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
region->bo = bo;
|
||||
region->start = amdgpu_vram_mgr_block_start(block);
|
||||
region->size = amdgpu_vram_mgr_block_size(block);
|
||||
list_add_tail(®ion->node, &con->critical_region_head);
|
||||
}
|
||||
|
||||
out:
|
||||
mutex_unlock(&con->critical_region_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory);
|
||||
}
|
||||
|
||||
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_critical_region *region, *tmp;
|
||||
|
||||
mutex_lock(&con->critical_region_lock);
|
||||
list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
|
||||
list_del(®ion->node);
|
||||
kfree(region);
|
||||
}
|
||||
mutex_unlock(&con->critical_region_lock);
|
||||
}
|
||||
|
||||
/**
 * amdgpu_ras_check_critical_address - test whether an address lies inside a
 * recorded RAS critical region
 * @adev: amdgpu device
 * @addr: address to test
 *
 * Walks the critical region list under critical_region_lock.
 *
 * Return: true when @addr falls in [start, start + size) of any recorded
 * region, false otherwise.
 */
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *cur;
	bool hit = false;

	mutex_lock(&con->critical_region_lock);
	list_for_each_entry(cur, &con->critical_region_head, node) {
		/* Below this region's first byte. */
		if (addr < cur->start)
			continue;
		/* At or past this region's end. */
		if (addr >= (cur->start + cur->size))
			continue;

		hit = true;
		break;
	}
	mutex_unlock(&con->critical_region_lock);

	return hit;
}
|
||||
|
||||
Reference in New Issue
Block a user