diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c
index a09f54fc4302..189dbe94cf14 100644
--- a/drivers/accel/ivpu/ivpu_debugfs.c
+++ b/drivers/accel/ivpu/ivpu_debugfs.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include
@@ -127,6 +127,14 @@ static int firewall_irq_counter_show(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int engine_reset_counter_show(struct seq_file *s, void *v)
+{
+	struct ivpu_device *vdev = seq_to_ivpu(s);
+
+	seq_printf(s, "%d\n", atomic_read(&vdev->pm->engine_reset_counter));
+	return 0;
+}
+
 static const struct drm_debugfs_info vdev_debugfs_list[] = {
 	{"bo_list", bo_list_show, 0},
 	{"fw_name", fw_name_show, 0},
@@ -137,6 +145,7 @@ static const struct drm_debugfs_info vdev_debugfs_list[] = {
 	{"reset_counter", reset_counter_show, 0},
 	{"reset_pending", reset_pending_show, 0},
 	{"firewall_irq_counter", firewall_irq_counter_show, 0},
+	{"engine_reset_counter", engine_reset_counter_show, 0},
 };
 
 static int dvfs_mode_get(void *data, u64 *dvfs_mode)
@@ -352,8 +361,9 @@ static const struct file_operations ivpu_force_recovery_fops = {
 static int ivpu_reset_engine_fn(void *data, u64 val)
 {
 	struct ivpu_device *vdev = (struct ivpu_device *)data;
+	struct vpu_jsm_msg resp;
 
-	return ivpu_jsm_reset_engine(vdev, (u32)val);
+	return ivpu_jsm_reset_engine(vdev, (u32)val, &resp);
 }
 
 DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n");
diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
index dd3a486df5f1..2801378e3e19 100644
--- a/drivers/accel/ivpu/ivpu_drv.c
+++ b/drivers/accel/ivpu/ivpu_drv.c
@@ -665,6 +665,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
 	vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
 	atomic64_set(&vdev->unique_id_counter, 0);
 	atomic_set(&vdev->job_timeout_counter, 0);
+	atomic_set(&vdev->faults_detected, 0);
 	xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 	xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
 	xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
index 6378e23e0c97..b739738c4566 100644
--- a/drivers/accel/ivpu/ivpu_drv.h
+++ b/drivers/accel/ivpu/ivpu_drv.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #ifndef __IVPU_DRV_H__
@@ -168,6 +168,7 @@ struct ivpu_device {
 	struct xarray submitted_jobs_xa;
 	struct ivpu_ipc_consumer job_done_consumer;
 	atomic_t job_timeout_counter;
+	atomic_t faults_detected;
 
 	atomic64_t unique_id_counter;
 
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c
index f0154dfa6ddc..521931d1f7fc 100644
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2025 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include
@@ -607,6 +607,7 @@ bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_
 	 * status and ensure both are handled in the same way
 	 */
 	job->file_priv->has_mmu_faults = true;
+	atomic_set(&vdev->faults_detected, 1);
 	queue_work(system_percpu_wq, &vdev->context_abort_work);
 	return true;
 }
@@ -1115,6 +1116,51 @@ void ivpu_job_done_consumer_fini(struct ivpu_device *vdev)
 	ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer);
 }
 
+static int reset_engine_and_mark_faulty_contexts(struct ivpu_device *vdev)
+{
+	u32 num_impacted_contexts;
+	struct vpu_jsm_msg resp;
+	int ret;
+	u32 i;
+
+	ret = ivpu_jsm_reset_engine(vdev, 0, &resp);
+	if (ret)
+		return ret;
+
+	/*
+	 * If faults were already detected, ignore contexts reported by the engine reset: the NPU
+	 * may not be stuck and could report a healthy context; faulty ones are already marked
+	 */
+	if (atomic_cmpxchg(&vdev->faults_detected, 1, 0) == 1)
+		return 0;
+
+	num_impacted_contexts = resp.payload.engine_reset_done.num_impacted_contexts;
+
+	ivpu_warn_ratelimited(vdev, "Engine reset performed, impacted contexts: %u\n",
+			      num_impacted_contexts);
+
+	if (!in_range(num_impacted_contexts, 1, VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS - 1)) {
+		ivpu_pm_trigger_recovery(vdev, "Cannot determine guilty contexts");
+		return -EIO;
+	}
+
+	/* No faults detected, NPU likely got stuck. Mark returned contexts as guilty */
+	guard(mutex)(&vdev->context_list_lock);
+
+	for (i = 0; i < num_impacted_contexts; i++) {
+		u32 ssid = resp.payload.engine_reset_done.impacted_contexts[i].host_ssid;
+		struct ivpu_file_priv *file_priv = xa_load(&vdev->context_xa, ssid);
+
+		if (file_priv) {
+			mutex_lock(&file_priv->lock);
+			file_priv->has_mmu_faults = true;
+			mutex_unlock(&file_priv->lock);
+		}
+	}
+
+	return 0;
+}
+
 void ivpu_context_abort_work_fn(struct work_struct *work)
 {
 	struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work);
@@ -1127,7 +1173,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
 		return;
 
 	if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
-		if (ivpu_jsm_reset_engine(vdev, 0))
+		if (reset_engine_and_mark_faulty_contexts(vdev))
 			goto runtime_put;
 
 	mutex_lock(&vdev->context_list_lock);
diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.c b/drivers/accel/ivpu/ivpu_jsm_msg.c
index 0256b2dfefc1..07b1d6f615a9 100644
--- a/drivers/accel/ivpu/ivpu_jsm_msg.c
+++ b/drivers/accel/ivpu/ivpu_jsm_msg.c
@@ -151,10 +151,9 @@ int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat)
 	return ret;
 }
 
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *resp)
 {
 	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_ENGINE_RESET };
-	struct vpu_jsm_msg resp;
 	int ret;
 
 	if (engine != VPU_ENGINE_COMPUTE)
@@ -162,14 +161,17 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
 
 	req.payload.engine_reset.engine_idx = engine;
 
-	ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
+	ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, resp,
 				    VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
 	if (ret) {
 		ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
 		ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+		return ret;
 	}
 
-	return ret;
+	atomic_inc(&vdev->pm->engine_reset_counter);
+
+	return 0;
 }
 
 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id)
@@ -554,6 +556,15 @@ int ivpu_jsm_dct_disable(struct ivpu_device *vdev)
 }
 
 int ivpu_jsm_state_dump(struct ivpu_device *vdev)
+{
+	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
+	struct vpu_jsm_msg resp;
+
+	return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_STATE_DUMP_RSP, &resp,
+					      VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
+}
+
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev)
 {
 	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
 
diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.h b/drivers/accel/ivpu/ivpu_jsm_msg.h
index 9e84d3526a14..a74f5a0b0d93 100644
--- a/drivers/accel/ivpu/ivpu_jsm_msg.h
+++ b/drivers/accel/ivpu/ivpu_jsm_msg.h
@@ -14,7 +14,7 @@ int ivpu_jsm_register_db(struct ivpu_device *vdev, u32 ctx_id, u32 db_id,
 			 u64 jobq_base, u32 jobq_size);
 int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id);
 int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat);
-int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine);
+int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *response);
 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id);
 int ivpu_jsm_dyndbg_control(struct ivpu_device *vdev, char *command, size_t size);
 int ivpu_jsm_trace_get_capability(struct ivpu_device *vdev, u32 *trace_destination_mask,
@@ -44,5 +44,6 @@ int ivpu_jsm_metric_streamer_info(struct ivpu_device *vdev, u64 metric_group_mas
 int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us);
 int ivpu_jsm_dct_disable(struct ivpu_device *vdev);
 int ivpu_jsm_state_dump(struct ivpu_device *vdev);
+int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev);
 
 #endif
diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c
index e1baf6b64935..41efd8985fa6 100644
--- a/drivers/accel/ivpu/ivpu_mmu.c
+++ b/drivers/accel/ivpu/ivpu_mmu.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include
@@ -964,6 +964,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
 		file_priv = xa_load(&vdev->context_xa, ssid);
 		if (file_priv) {
 			if (!READ_ONCE(file_priv->has_mmu_faults)) {
+				atomic_set(&vdev->faults_detected, 1);
 				ivpu_mmu_dump_event(vdev, event);
 				WRITE_ONCE(file_priv->has_mmu_faults, true);
 			}
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index d20144a21e09..83da9b297f37 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2026 Intel Corporation
  */
 
 #include
@@ -166,7 +166,7 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
 	ivpu_pm_reset_begin(vdev);
 
 	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
-		ivpu_jsm_state_dump(vdev);
+		ivpu_jsm_state_dump_no_reply(vdev);
 		ivpu_dev_coredump(vdev);
 		ivpu_suspend(vdev);
 	}
@@ -205,23 +205,25 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 
 	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
 		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
-		goto recovery;
+		goto abort;
 	}
 
 	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
 	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
 		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
			 inference_max_retries);
-		goto recovery;
+		goto abort;
 	}
 
 	vdev->fw->last_heartbeat = heartbeat;
 	ivpu_start_job_timeout_detection(vdev);
 	return;
 
-recovery:
+abort:
 	atomic_set(&vdev->job_timeout_counter, 0);
-	ivpu_pm_trigger_recovery(vdev, "TDR");
+	ivpu_jsm_state_dump(vdev);
+	ivpu_dev_coredump(vdev);
+	queue_work(system_percpu_wq, &vdev->context_abort_work);
 }
 
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
@@ -404,6 +406,7 @@ void ivpu_pm_init(struct ivpu_device *vdev)
 	init_rwsem(&pm->reset_lock);
 	atomic_set(&pm->reset_pending, 0);
 	atomic_set(&pm->reset_counter, 0);
+	atomic_set(&pm->engine_reset_counter, 0);
 
 	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
 	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
diff --git a/drivers/accel/ivpu/ivpu_pm.h b/drivers/accel/ivpu/ivpu_pm.h
index 00f2a01e3df6..2f07bb0b43be 100644
--- a/drivers/accel/ivpu/ivpu_pm.h
+++ b/drivers/accel/ivpu/ivpu_pm.h
@@ -18,6 +18,7 @@ struct ivpu_pm_info {
 	struct rw_semaphore reset_lock;
 	atomic_t reset_counter;
 	atomic_t reset_pending;
+	atomic_t engine_reset_counter;
 
 	u8 dct_active_percent;
 };
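
Note: the new engine_reset_counter debugfs entry makes the reworked timeout path observable from userspace. Below is a minimal sketch of such a check (not part of the patch); it assumes debugfs is mounted at /sys/kernel/debug and that the NPU is accel minor 0, so the accel/0 path is an assumption that may differ per system.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Assumed path: the debugfs mount point and accel minor number vary by system. */
	FILE *f = fopen("/sys/kernel/debug/accel/0/engine_reset_counter", "r");
	int count;

	if (!f) {
		perror("engine_reset_counter");
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%d", &count) != 1) {
		fprintf(stderr, "unexpected counter format\n");
		fclose(f);
		return EXIT_FAILURE;
	}
	fclose(f);
	printf("NPU engine resets since driver load: %d\n", count);
	return EXIT_SUCCESS;
}

Sampling this counter before and after a workload shows whether job timeouts are being absorbed by engine resets and context aborts rather than full device recovery (compare against the existing reset_counter entry).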