Merge tag 'amd-drm-next-6.18-2025-08-29' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.18-2025-08-29:

amdgpu:
- Replay fixes
- RAS updates
- VCN SRAM load fixes
- EDID read fixes
- eDP ALPM support
- AUX fixes
- Documentation updates
- Rework how PTE flags are generated
- DCE6 fixes
- VCN devcoredump cleanup
- MMHUB client id fixes
- SR-IOV fixes
- VRR fixes
- VCN 5.0.1 RAS support
- Backlight fixes
- UserQ fixes
- Misc code cleanups
- SMU 13.0.12 updates
- Expanded PCIe DPC support
- Expanded VCN reset support
- SMU 13.0.x updates
- VPE per queue reset support
- Cursor rotation fix
- DSC fixes
- GC 12 MES TLB invalidation update
- Cursor fixes
- Non-DC TMDS clock validation fix

amdkfd:
- debugfs fixes
- Misc code cleanups
- Page migration fixes
- Partition fixes
- SVM fixes

radeon:
- Misc code cleanups

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20250829190848.1921648-1-alexander.deucher@amd.com
279 changed files with 4889 additions and 2438 deletions

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

@@ -122,12 +122,15 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10
 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
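+/* flag values accepted by the "check_address" debugfs command */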
+#define BYPASS_ALLOCATED_ADDRESS 0x0
+#define BYPASS_INITIALIZATION_ADDRESS 0x1
 
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -136,10 +139,14 @@ enum amdgpu_ras_retire_page_reservation {
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
 
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
 struct mce_notifier_adev_list {
@@ -169,18 +176,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 	struct eeprom_table_record err_rec;
 	int ret;
 
-	if ((address >= adev->gmc.mc_vram_size) ||
-	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+	ret = amdgpu_ras_check_bad_page(adev, address);
+	if (ret == -EINVAL) {
 		dev_warn(adev->dev,
-			 "RAS WARN: input address 0x%llx is invalid.\n",
-			 address);
+			"RAS WARN: input address 0x%llx is invalid.\n",
+			address);
 		return -EINVAL;
-	}
-
-	if (amdgpu_ras_check_bad_page(adev, address)) {
+	} else if (ret == 1) {
 		dev_warn(adev->dev,
-			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
-			 address);
+			"RAS WARN: 0x%llx has already been marked as bad page!\n",
+			address);
 		return 0;
 	}
@@ -207,6 +212,49 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 	return 0;
 }
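+
+/*
+ * Validity check backing the "check_address" debugfs command: returns 0 when
+ * @address passes, or a negative errno (-EFAULT when the address is out of
+ * range, -EPERM when no retired pages can be looked up for it, -EACCES when
+ * the BYPASS_* ownership rules in @flags reject it).
+ */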
+static int amdgpu_check_address_validity(struct amdgpu_device *adev,
+			uint64_t address, uint64_t flags)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_block_info blk_info;
+	uint64_t page_pfns[32] = {0};
+	int i, ret, count;
+
+	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
+		return 0;
+
+	if ((address >= adev->gmc.mc_vram_size) ||
+	    (address >= RAS_UMC_INJECT_ADDR_LIMIT))
+		return -EFAULT;
+
+	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
+			address, page_pfns, ARRAY_SIZE(page_pfns));
+	if (count <= 0)
+		return -EPERM;
+
+	for (i = 0; i < count; i++) {
+		memset(&blk_info, 0, sizeof(blk_info));
+		ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
+				page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info);
+		if (!ret) {
+			/* The input address that needs to be checked is allocated by
+			 * current calling process, so it is necessary to exclude
+			 * the calling process.
+			 */
+			if ((flags == BYPASS_ALLOCATED_ADDRESS) &&
+			    ((blk_info.task.pid != task_pid_nr(current)) ||
+			     strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN)))
+				return -EACCES;
+			else if ((flags == BYPASS_INITIALIZATION_ADDRESS) &&
+				 (blk_info.task.pid == con->init_task_pid) &&
+				 !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
+				return -EACCES;
+		}
+	}
+
+	return 0;
+}
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 			size_t size, loff_t *pos)
 {
@@ -297,6 +345,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 2;
 	else if (strstr(str, "retire_page") != NULL)
 		op = 3;
+	else if (strstr(str, "check_address") != NULL)
+		op = 4;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -310,6 +360,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		data->op = op;
 		data->inject.address = address;
 
 		return 0;
+	} else if (op == 4) {
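+		/* "check_address <address> <flags>", hex or decimal */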
+		if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 &&
+		    sscanf(str, "%*s %llu %llu", &address, &value) != 2)
+			return -EINVAL;
+
+		data->op = op;
+		data->inject.address = address;
+		data->inject.value = value;
+
+		return 0;
 	}
@@ -500,6 +559,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 			return size;
 		else
 			return ret;
+	} else if (data.op == 4) {
+		ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value);
+		return ret ? ret : size;
 	}
 
 	if (!amdgpu_ras_is_supported(adev, data.head.block))
@@ -513,22 +575,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 			ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
 			break;
 		case 2:
-			if ((data.inject.address >= adev->gmc.mc_vram_size &&
-			    adev->gmc.mc_vram_size) ||
-			    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
-				dev_warn(adev->dev, "RAS WARN: input address "
-						"0x%llx is invalid.",
-						data.inject.address);
-				ret = -EINVAL;
-				break;
-			}
-
 			/* umc ce/ue error injection for a bad page is not allowed */
-			if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
-			    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
-				dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
-						"already been marked as bad!\n",
-						data.inject.address);
+			if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
+				ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
+			if (ret == -EINVAL) {
+				dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
+					data.inject.address);
+				break;
+			} else if (ret == 1) {
+				dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
+					data.inject.address);
 				break;
 			}
@@ -2566,18 +2622,26 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 		goto out;
 	}
 
-	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
+	*bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL);
 	if (!*bps) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	for (; i < data->count; i++) {
+		if (!data->bps[i].ts)
+			continue;
+
 		(*bps)[i] = (struct ras_badpage){
 			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
+
+		if (amdgpu_ras_check_critical_address(adev,
+			data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+			continue;
+
 		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
 				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
 		if (status == -EBUSY)
@@ -2586,7 +2650,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
 	}
 
-	*count = data->count;
+	*count = con->bad_page_num;
 out:
 	mutex_unlock(&con->recovery_lock);
 	return ret;
@@ -2722,7 +2786,7 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
 	unsigned int align_space = ALIGN(new_space, 512);
-	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	void *bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL);
 
 	if (!bps) {
 		return -ENOMEM;
@@ -2814,8 +2878,11 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
 	for (j = 0; j < count; j++) {
 		if (amdgpu_ras_check_bad_page_unlock(con,
-			bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+			bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
+			data->count++;
+			data->space_left--;
 			continue;
+		}
 
 		if (!data->space_left &&
 		    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
@@ -2828,6 +2895,7 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
 				sizeof(struct eeprom_table_record));
 		data->count++;
 		data->space_left--;
+		con->bad_page_num++;
 	}
 
 	return 0;
@@ -2974,7 +3042,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
 				&bps[i], &err_data, nps);
 			if (ret)
-				control->ras_num_bad_pages -= adev->umc.retire_unit;
+				con->bad_page_num -= adev->umc.retire_unit;
 			i += (adev->umc.retire_unit - 1);
 		} else {
 			break;
@@ -2988,8 +3056,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			ret = __amdgpu_ras_convert_rec_from_rom(adev,
 				&bps[i], &err_data, nps);
 			if (ret)
-				control->ras_num_bad_pages -= adev->umc.retire_unit;
+				con->bad_page_num -= adev->umc.retire_unit;
 		}
+
+		con->eh_data->count_saved = con->eh_data->count;
 	} else {
 		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
 	}
@@ -3012,7 +3082,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
 	struct amdgpu_ras_eeprom_control *control;
-	int save_count, unit_num, bad_page_num, i;
+	int save_count, unit_num, i;
 
 	if (!con || !con->eh_data) {
 		if (new_cnt)
@@ -3033,27 +3103,26 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 	mutex_lock(&con->recovery_lock);
 	control = &con->eeprom_control;
 	data = con->eh_data;
-	bad_page_num = control->ras_num_bad_pages;
-	save_count = data->count - bad_page_num;
-	unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
+	save_count = con->bad_page_num - control->ras_num_bad_pages;
 	mutex_unlock(&con->recovery_lock);
+	unit_num = save_count / adev->umc.retire_unit;
+
 	if (new_cnt)
 		*new_cnt = unit_num;
 
 	/* only new entries are saved */
-	if (save_count > 0) {
+	if (unit_num > 0) {
 		/*old asics only save pa to eeprom like before*/
 		if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
 			if (amdgpu_ras_eeprom_append(control,
-					&data->bps[bad_page_num], save_count)) {
+					&data->bps[data->count_saved], unit_num)) {
 				dev_err(adev->dev, "Failed to save EEPROM table data!");
 				return -EIO;
 			}
 		} else {
 			for (i = 0; i < unit_num; i++) {
 				if (amdgpu_ras_eeprom_append(control,
-						&data->bps[bad_page_num +
+						&data->bps[data->count_saved +
 						i * adev->umc.retire_unit], 1)) {
 					dev_err(adev->dev, "Failed to save EEPROM table data!");
 					return -EIO;
@@ -3062,6 +3131,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 		}
 
 		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
+		data->count_saved = data->count;
 	}
 
 	return 0;
@@ -3116,17 +3186,17 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 		}
 	}
 
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
+	if (ret)
+		goto out;
+
 	ret = amdgpu_ras_eeprom_check(control);
 	if (ret)
 		goto out;
 
 	/* HW not usable */
-	if (amdgpu_ras_is_rma(adev)) {
+	if (amdgpu_ras_is_rma(adev))
 		ret = -EHWPOISON;
-		goto out;
-	}
-
-	ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
 	}
 
 out:
@@ -3134,18 +3204,24 @@ out:
 	return ret;
 }
 
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr)
 {
 	struct ras_err_handler_data *data = con->eh_data;
+	struct amdgpu_device *adev = con->adev;
 	int i;
 
+	if ((addr >= adev->gmc.mc_vram_size &&
+	     adev->gmc.mc_vram_size) ||
+	    (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
+		return -EINVAL;
+
 	addr >>= AMDGPU_GPU_PAGE_SHIFT;
 	for (i = 0; i < data->count; i++)
 		if (addr == data->bps[i].retired_page)
-			return true;
+			return 1;
 
-	return false;
+	return 0;
 }
 
 /*
@@ -3153,11 +3229,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
  * check if an address belongs to bad page
  *
  * Note: this check is only for umc block
  */
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	bool ret = false;
+	int ret = 0;
 
 	if (!con || !con->eh_data)
 		return ret;
@@ -3241,7 +3317,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
 	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
 	ecc_log->de_queried_count = 0;
-	ecc_log->prev_de_queried_count = 0;
+	ecc_log->consumption_q_count = 0;
 }
 
 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3261,7 +3337,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 	mutex_destroy(&ecc_log->lock);
 	ecc_log->de_queried_count = 0;
-	ecc_log->prev_de_queried_count = 0;
+	ecc_log->consumption_q_count = 0;
 }
 
 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
@@ -3287,7 +3363,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 			page_retirement_dwork.work);
 	struct amdgpu_device *adev = con->adev;
 	struct ras_err_data err_data;
-	unsigned long err_cnt;
 
 	/* If gpu reset is ongoing, delay retiring the bad pages */
 	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
@@ -3299,13 +3374,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 	amdgpu_ras_error_data_init(&err_data);
 
 	amdgpu_umc_handle_bad_pages(adev, &err_data);
-	err_cnt = err_data.err_addr_cnt;
 
 	amdgpu_ras_error_data_fini(&err_data);
 
-	if (err_cnt && amdgpu_ras_is_rma(adev))
-		amdgpu_ras_reset_gpu(adev);
-
 	amdgpu_ras_schedule_retirement_dwork(con,
 			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
 }
@@ -3316,49 +3387,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 	int ret = 0;
 	struct ras_ecc_log_info *ecc_log;
 	struct ras_query_if info;
-	uint32_t timeout = 0;
+	u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-	uint64_t de_queried_count;
-	uint32_t new_detect_count, total_detect_count;
-	uint32_t need_query_count = poison_creation_count;
+	u64 de_queried_count;
+	u64 consumption_q_count;
 	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
 
 	memset(&info, 0, sizeof(info));
 	info.head.block = AMDGPU_RAS_BLOCK__UMC;
 
 	ecc_log = &ras->umc_ecc_log;
-	total_detect_count = 0;
+	ecc_log->de_queried_count = 0;
+	ecc_log->consumption_q_count = 0;
+
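+	/* poll up to MAX_UMC_POISON_POLLING_TIME_ASYNC rounds, sleeping 100 ms
+	 * between queries, until both creation and consumption counts are seen
+	 */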
 	do {
 		ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
 		if (ret)
 			return ret;
 
 		de_queried_count = ecc_log->de_queried_count;
-		if (de_queried_count > ecc_log->prev_de_queried_count) {
-			new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-			ecc_log->prev_de_queried_count = de_queried_count;
-			timeout = 0;
-		} else {
-			new_detect_count = 0;
-		}
+		consumption_q_count = ecc_log->consumption_q_count;
 
-		if (new_detect_count) {
-			total_detect_count += new_detect_count;
-		} else {
-			if (!timeout && need_query_count)
-				timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+		if (de_queried_count && consumption_q_count)
+			break;
 
-			if (timeout) {
-				if (!--timeout)
-					break;
-				msleep(1);
-			}
-		}
-	} while (total_detect_count < need_query_count);
+		msleep(100);
+	} while (--timeout);
 
-	if (total_detect_count)
+	if (de_queried_count)
 		schedule_delayed_work(&ras->page_retirement_dwork, 0);
 
+	if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
+		amdgpu_ras_reset_gpu(adev);
+
 	return 0;
 }
@@ -3394,6 +3455,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 		reset_flags |= msg.reset;
 	}
 
+	/*
+	 * Try to ensure poison creation handler is completed first
+	 * to set rma if bad page exceed threshold.
+	 */
+	flush_delayed_work(&con->page_retirement_dwork);
+
 	/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
 	if (reset_flags && !amdgpu_ras_is_rma(adev)) {
 		if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
@@ -3403,8 +3470,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 		else
 			reset = reset_flags;
 
-		flush_delayed_work(&con->page_retirement_dwork);
-
 		con->gpu_reset_flags |= reset;
 		amdgpu_ras_reset_gpu(adev);
@@ -3434,6 +3499,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		if (kthread_should_stop())
 			break;
 
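+		/* hold poison_lock for this round of poison handling */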
+		mutex_lock(&con->poison_lock);
 		gpu_reset = 0;
 
 		do {
@@ -3446,7 +3512,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 				atomic_sub(poison_creation_count, &con->poison_creation_count);
 				atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
 			}
-		} while (atomic_read(&con->poison_creation_count));
+		} while (atomic_read(&con->poison_creation_count) &&
+			 !atomic_read(&con->poison_consumption_count));
 
 		if (ret != -EIO) {
 			msg_count = kfifo_len(&con->poison_fifo);
@@ -3463,6 +3530,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 			/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
 			/* Clear poison creation request */
 			atomic_set(&con->poison_creation_count, 0);
+			atomic_set(&con->poison_consumption_count, 0);
 
 			/* Clear poison fifo */
 			amdgpu_ras_clear_poison_fifo(adev);
@@ -3487,9 +3555,12 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 			atomic_sub(msg_count, &con->page_retirement_req_cnt);
 		}
 
+		atomic_set(&con->poison_consumption_count, 0);
+
 		/* Wake up work to save bad pages to eeprom */
 		schedule_delayed_work(&con->page_retirement_dwork, 0);
+
+		mutex_unlock(&con->poison_lock);
 	}
 
 	return 0;
@@ -3570,8 +3641,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
 	}
 
 	mutex_init(&con->recovery_lock);
+	mutex_init(&con->poison_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
 	atomic_set(&con->in_recovery, 0);
+	atomic_set(&con->rma_in_recovery, 0);
 	con->eeprom_control.bad_channel_bitmap = 0;
 
 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -3589,6 +3662,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
 	init_waitqueue_head(&con->page_retirement_wq);
 	atomic_set(&con->page_retirement_req_cnt, 0);
 	atomic_set(&con->poison_creation_count, 0);
+	atomic_set(&con->poison_consumption_count, 0);
 	con->page_retirement_thread =
 		kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
 	if (IS_ERR(con->page_retirement_thread)) {
@@ -3661,6 +3735,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
+	amdgpu_ras_critical_region_init(adev);
+
 	return 0;
 }
 
 /* recovery end */
@@ -4087,6 +4163,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		goto release_con;
 	}
 
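+	/* record which task initialized RAS; checked by the
+	 * BYPASS_INITIALIZATION_ADDRESS rule in amdgpu_check_address_validity()
+	 */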
+	con->init_task_pid = task_pid_nr(current);
+	get_task_comm(con->init_task_comm, current);
+
+	mutex_init(&con->critical_region_lock);
+	INIT_LIST_HEAD(&con->critical_region_head);
+
 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 		 "hardware ability[%x] ras_mask[%x]\n",
 		 adev->ras_hw_enabled, adev->ras_enabled);
@@ -4366,6 +4448,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 	if (!adev->ras_enabled || !con)
 		return 0;
 
+	amdgpu_ras_critical_region_fini(adev);
+	mutex_destroy(&con->critical_region_lock);
+
 	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
 		if (ras_node->ras_obj) {
 			obj = ras_node->ras_obj;
@@ -5274,6 +5359,9 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
 	uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
 	int ret = 0;
 
+	if (amdgpu_ras_check_critical_address(adev, start))
+		return 0;
+
 	mutex_lock(&con->page_rsv_lock);
 	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
 	if (ret == -ENOENT)
@@ -5310,3 +5398,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
 	return con->is_rma;
 }
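+
+/*
+ * Record the VRAM blocks backing @bo as a critical region so bad-page
+ * retirement never reserves pages inside it (see amdgpu_ras_reserve_page()).
+ */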
+int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
+				   struct amdgpu_bo *bo)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_mgr_resource *vres;
+	struct ras_critical_region *region;
+	struct drm_buddy_block *block;
+	int ret = 0;
+
+	if (!bo || !bo->tbo.resource)
+		return -EINVAL;
+
+	vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
+
+	mutex_lock(&con->critical_region_lock);
+
+	/* Check if the bo had been recorded */
+	list_for_each_entry(region, &con->critical_region_head, node)
+		if (region->bo == bo)
+			goto out;
+
+	/* Record new critical amdgpu bo */
+	list_for_each_entry(block, &vres->blocks, link) {
+		region = kzalloc(sizeof(*region), GFP_KERNEL);
+		if (!region) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		region->bo = bo;
+		region->start = amdgpu_vram_mgr_block_start(block);
+		region->size = amdgpu_vram_mgr_block_size(block);
+		list_add_tail(&region->node, &con->critical_region_head);
+	}
+
+out:
+	mutex_unlock(&con->critical_region_lock);
+
+	return ret;
+}
+
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
+{
+	amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory);
+}
+
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_critical_region *region, *tmp;
+
+	mutex_lock(&con->critical_region_lock);
+	list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
+		list_del(&region->node);
+		kfree(region);
+	}
+	mutex_unlock(&con->critical_region_lock);
+}
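+
+/* Return true if @addr falls inside any recorded critical region. */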
+bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_critical_region *region;
+	bool ret = false;
+
+	mutex_lock(&con->critical_region_lock);
+	list_for_each_entry(region, &con->critical_region_head, node) {
+		if ((region->start <= addr) &&
+		    (addr < (region->start + region->size))) {
+			ret = true;
+			break;
+		}
+	}
+	mutex_unlock(&con->critical_region_lock);
+
+	return ret;
+}