drm/amdgpu: fix system hang issue during GPU reset

when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover,
the atomic adev->in_gpu_reset and hive->in_reset are used to avoid
re-entering GPU recovery.

During GPU reset and resume, it is unsafe that other threads access GPU,
which maybe cause GPU reset failed. Therefore the new rw_semaphore
adev->reset_sem is introduced, which protect GPU from being accessed by
external threads during recovery.

v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
re-enter GPU recovery for the same GPU hang.

v3:
1. change back to use adev->reset_sem to protect kfd callback
functions, because dqm_lock couldn't protect all codes, for example:
free_mqd must be called outside of dqm_lock;

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce atomic hive->in_reset, to avoid
re-enter GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove comment codes in amdgpu_device.c
3. add more detailed comment in commit message
4. define a wrap function amdgpu_in_reset

v5:
1. Fix some style issues.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Suggested-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com>
Suggested-by: Luben Tukov <luben.tuikov@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Dennis Li
2020-07-08 15:07:13 +08:00
committed by Alex Deucher
parent c5079f35c0
commit df9c8d1aa2
39 changed files with 463 additions and 183 deletions

View File

@@ -1935,7 +1935,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].status.hw == true)
break;
if (adev->in_gpu_reset || adev->in_suspend) {
if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
@@ -2106,7 +2106,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
if (!adev->in_gpu_reset)
if (!amdgpu_in_reset(adev))
return false;
/*
@@ -3036,7 +3036,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
init_rwsem(&adev->reset_sem);
atomic_set(&adev->in_gpu_reset, 0);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
@@ -4064,8 +4065,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
DRM_WARN("asic atom init failed!");
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
dev_warn(tmp_adev->dev, "asic atom init failed!");
r = -EAGAIN;
goto out;
}
if (!r) {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@@ -4141,16 +4145,14 @@ end:
return r;
}
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
{
if (trylock) {
if (!mutex_trylock(&adev->lock_reset))
return false;
} else
mutex_lock(&adev->lock_reset);
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
return false;
down_write(&adev->reset_sem);
atomic_inc(&adev->gpu_reset_counter);
adev->in_gpu_reset = true;
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4170,8 +4172,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
atomic_set(&adev->in_gpu_reset, 0);
up_write(&adev->reset_sem);
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4281,12 +4283,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too.
*/
hive = amdgpu_get_xgmi_hive(adev, true);
if (hive && !mutex_trylock(&hive->reset_lock)) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
mutex_unlock(&hive->hive_lock);
return 0;
hive = amdgpu_get_xgmi_hive(adev, false);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
return 0;
}
mutex_lock(&hive->hive_lock);
}
/*
@@ -4308,11 +4312,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
if (!amdgpu_device_lock_adev(tmp_adev)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1);
mutex_unlock(&hive->hive_lock);
return 0;
r = 0;
goto skip_recovery;
}
/*
@@ -4445,8 +4449,9 @@ skip_sched_resume:
amdgpu_device_unlock_adev(tmp_adev);
}
skip_recovery:
if (hive) {
mutex_unlock(&hive->reset_lock);
atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock);
}