drm/amdgpu: Implement user queue reset functionality
This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:
1. Queue detection and reset logic:
- amdgpu_userq_detect_and_reset_queues() identifies failed queues
- Per-IP detect_and_reset callbacks for targeted recovery
- Falls back to full GPU reset when needed
2. Reset infrastructure:
- Adds userq_reset_work workqueue for async reset handling
- Implements pre/post reset handlers for queue state management
- Integrates with existing GPU reset framework
3. Error handling improvements:
- Enhanced state tracking with HUNG state
- Automatic reset triggering on critical failures
- VRAM loss handling during recovery
4. Integration points:
- Added to device init/reset paths
- Called during queue destroy, suspend, and isolation events
- Handles both individual queue and full GPU resets
The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.
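As a rough illustration of the escalation pattern described above (standalone C for illustration only, not kernel code; the function and type names below are placeholders rather than symbols from this patch): per-queue recovery is attempted for each queue type in a fixed order, and a full device reset is scheduled only when a per-queue attempt fails.

/*
 * Illustrative, standalone sketch of the escalation pattern.
 * detect_and_reset() and full_device_reset() are placeholders here,
 * not functions from the patch below.
 */
#include <stdbool.h>
#include <stdio.h>

enum ring_type { RING_COMPUTE, RING_GFX, RING_SDMA };

/* Placeholder for a per-IP detect-and-reset callback; 0 means success. */
static int detect_and_reset(enum ring_type type)
{
	printf("per-queue reset attempted for queue type %d\n", type);
	return 0;
}

/* Placeholder for scheduling a full device reset. */
static void full_device_reset(void)
{
	printf("escalating to full device reset\n");
}

static void recover_user_queues(void)
{
	static const enum ring_type order[] = { RING_COMPUTE, RING_GFX, RING_SDMA };
	bool need_full_reset = false;

	/* Try per-queue recovery for each queue type first. */
	for (unsigned int i = 0; i < sizeof(order) / sizeof(order[0]); i++) {
		if (detect_and_reset(order[i])) {
			need_full_reset = true;
			break;
		}
	}

	/* Fall back to a full device reset only if per-queue recovery failed. */
	if (need_full_reset)
		full_device_reset();
}

int main(void)
{
	recover_user_queues();
	return 0;
}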
v2: add detection and reset calls when preemption/unmapping fails.
    add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
warn if the adev->userq_mutex is not held.
v4: make sure we have all of the uqm->userq_mutex held.
warn if the uqm->userq_mutex is not held.
v5: Use array for user queue type counters. (Alex)
all of the uqm->userq_mutex need to be held when calling detect and reset. (Alex)
v6: fix lockdep warning in amdgpu_userq_fence_driver_process
v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
it should already be 0 since we kzalloc the structure (Alex)
v9: For consistency with kernel queues, we may want something like:
amdgpu_userq_is_reset_type_supported (Alex)
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
committed by Alex Deucher
parent 825df7ff4b
commit 290f46cf57
@@ -25,8 +25,10 @@
#include <drm/drm_auth.h>
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_hmm.h"
@@ -46,6 +48,107 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
	return userq_ip_mask;
}

static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
						 enum amdgpu_ring_type ring_type, int reset_type)
{

	if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
		return false;

	switch (ring_type) {
	case AMDGPU_RING_TYPE_GFX:
		if (adev->gfx.gfx_supported_reset & reset_type)
			return true;
		break;
	case AMDGPU_RING_TYPE_COMPUTE:
		if (adev->gfx.compute_supported_reset & reset_type)
			return true;
		break;
	case AMDGPU_RING_TYPE_SDMA:
		if (adev->sdma.supported_reset & reset_type)
			return true;
		break;
	case AMDGPU_RING_TYPE_VCN_DEC:
	case AMDGPU_RING_TYPE_VCN_ENC:
		if (adev->vcn.supported_reset & reset_type)
			return true;
		break;
	case AMDGPU_RING_TYPE_VCN_JPEG:
		if (adev->jpeg.supported_reset & reset_type)
			return true;
		break;
	default:
		break;
	}
	return false;
}

static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
{
	if (amdgpu_device_should_recover_gpu(adev)) {
		amdgpu_reset_domain_schedule(adev->reset_domain,
					     &adev->userq_reset_work);
		/* Wait for the reset job to complete */
		flush_work(&adev->userq_reset_work);
	}
}

static int
amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
{
	struct amdgpu_device *adev = uq_mgr->adev;
	const int queue_types[] = {
		AMDGPU_RING_TYPE_COMPUTE,
		AMDGPU_RING_TYPE_GFX,
		AMDGPU_RING_TYPE_SDMA
	};
	const int num_queue_types = ARRAY_SIZE(queue_types);
	bool gpu_reset = false;
	int r = 0;
	int i;

	/* Warning if current process mutex is not held */
	WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));

	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
		dev_err(adev->dev, "userq reset disabled by debug mask\n");
		return 0;
	}

	/*
	 * If GPU recovery feature is disabled system-wide,
	 * skip all reset detection logic
	 */
	if (!amdgpu_gpu_recovery)
		return 0;

	/*
	 * Iterate through all queue types to detect and reset problematic queues
	 * Process each queue type in the defined order
	 */
	for (i = 0; i < num_queue_types; i++) {
		int ring_type = queue_types[i];
		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];

		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
			continue;

		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
		    funcs && funcs->detect_and_reset) {
			r = funcs->detect_and_reset(adev, ring_type);
			if (r) {
				gpu_reset = true;
				break;
			}
		}
	}

	if (gpu_reset)
		amdgpu_userq_gpu_reset(adev);

	return r;
}

static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
					   struct amdgpu_bo_va_mapping *va_map, u64 addr)
{
@@ -176,17 +279,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *userq_funcs =
		adev->userq_funcs[queue->queue_type];
	bool found_hung_queue = false;
	int r = 0;

	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
		r = userq_funcs->preempt(uq_mgr, queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			found_hung_queue = true;
		} else {
			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
		}
	}

	if (found_hung_queue)
		amdgpu_userq_detect_and_reset_queues(uq_mgr);

	return r;
}

@@ -218,16 +326,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *userq_funcs =
		adev->userq_funcs[queue->queue_type];
	bool found_hung_queue = false;
	int r = 0;

	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
	    (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
		r = userq_funcs->unmap(uq_mgr, queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			found_hung_queue = true;
		} else {
			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
		}
	}

	if (found_hung_queue)
		amdgpu_userq_detect_and_reset_queues(uq_mgr);

	return r;
}

@@ -244,10 +359,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
		r = userq_funcs->map(uq_mgr, queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			amdgpu_userq_detect_and_reset_queues(uq_mgr);
		} else {
			queue->state = AMDGPU_USERQ_STATE_MAPPED;
		}
	}

	return r;
}

@@ -475,10 +592,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
		amdgpu_bo_unreserve(queue->db_obj.obj);
	}
	amdgpu_bo_unref(&queue->db_obj.obj);

	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
#if defined(CONFIG_DEBUG_FS)
	debugfs_remove_recursive(queue->debugfs_queue);
#endif
	amdgpu_userq_detect_and_reset_queues(uq_mgr);
	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
	/*TODO: It requires a reset for userq hw unmap error*/
	if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -701,6 +819,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
	kfree(queue_name);

	args->out.queue_id = qid;
	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);

unlock:
	mutex_unlock(&uq_mgr->userq_mutex);
@@ -1045,6 +1164,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
	unsigned long queue_id;
	int ret = 0, r;

	amdgpu_userq_detect_and_reset_queues(uq_mgr);
	/* Try to unmap all the queues in this process ctx */
	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -1057,6 +1177,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
	return ret;
}

void amdgpu_userq_reset_work(struct work_struct *work)
{
	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
						  userq_reset_work);
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.src = AMDGPU_RESET_SRC_USERQ;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/

	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -1084,22 +1221,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
		   struct amdgpu_eviction_fence *ev_fence)
{
	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
	struct amdgpu_device *adev = uq_mgr->adev;
	int ret;

	/* Wait for any pending userqueue fence work to finish */
	ret = amdgpu_userq_wait_for_signal(uq_mgr);
	if (ret)
		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");

	ret = amdgpu_userq_evict_all(uq_mgr);
	if (ret)
		dev_err(adev->dev, "Failed to evict userqueue\n");

	/* Signal current eviction fence */
	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -1133,6 +1267,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
	cancel_delayed_work_sync(&userq_mgr->resume_work);

	mutex_lock(&userq_mgr->userq_mutex);
	amdgpu_userq_detect_and_reset_queues(userq_mgr);
	xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
		amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -1159,6 +1294,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
		uqm = queue->userq_mgr;
		cancel_delayed_work_sync(&uqm->resume_work);
		guard(mutex)(&uqm->userq_mutex);
		amdgpu_userq_detect_and_reset_queues(uqm);
		if (adev->in_s0ix)
			r = amdgpu_userq_preempt_helper(uqm, queue);
		else
@@ -1217,6 +1353,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
		    (queue->xcp_id == idx)) {
			amdgpu_userq_detect_and_reset_queues(uqm);
			r = amdgpu_userq_preempt_helper(uqm, queue);
			if (r)
				ret = r;
@@ -1289,3 +1426,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,

	return 0;
}

void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
{
	const struct amdgpu_userq_funcs *userq_funcs;
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm;
	unsigned long queue_id;

	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		cancel_delayed_work_sync(&uqm->resume_work);
		if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
			amdgpu_userq_wait_for_last_fence(uqm, queue);
			userq_funcs = adev->userq_funcs[queue->queue_type];
			userq_funcs->unmap(uqm, queue);
			/* just mark all queues as hung at this point.
			 * if unmap succeeds, we could map again
			 * in amdgpu_userq_post_reset() if vram is not lost
			 */
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			amdgpu_userq_fence_driver_force_completion(queue);
		}
	}
}

int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
{
	/* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
	 * at this point, we should be able to map it again
	 * and continue if vram is not lost.
	 */
	struct amdgpu_userq_mgr *uqm;
	struct amdgpu_usermode_queue *queue;
	const struct amdgpu_userq_funcs *userq_funcs;
	unsigned long queue_id;
	int r = 0;

	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
			userq_funcs = adev->userq_funcs[queue->queue_type];
			/* Re-map queue */
			r = userq_funcs->map(uqm, queue);
			if (r) {
				dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
				continue;
			}
			queue->state = AMDGPU_USERQ_STATE_MAPPED;
		}
	}

	return r;
}
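The hunks above cover amdgpu_userq.c only. Per the description, the new entry points also need device-side wiring (initialization of userq_reset_work, plus calls around the ASIC reset). The sketch below shows one plausible shape of that wiring; it is an assumption for illustration, not part of this diff, and example_device_reset_path() is a hypothetical wrapper rather than a real call site.

/* Sketch only: example_device_reset_path() and its call sites are assumed
 * from the commit description, not taken from the hunks above.
 */
static void example_device_reset_path(struct amdgpu_device *adev, bool vram_lost)
{
	/* During device init (elsewhere):
	 * INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
	 */

	/* Before the ASIC reset: unmap mapped user queues and mark them HUNG. */
	amdgpu_userq_pre_reset(adev);

	/* ... ASIC reset and recovery happen here ... */

	/* After recovery: remap surviving queues unless VRAM contents were lost. */
	if (amdgpu_userq_post_reset(adev, vram_lost))
		dev_warn(adev->dev, "failed to restore one or more user queues\n");
}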