mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
drm/amdgpu: fix strsep() corrupting lockup_timeout on multi-GPU (v3)
amdgpu_device_get_job_timeout_settings() passes a pointer directly
to the global amdgpu_lockup_timeout[] buffer into strsep().
strsep() destructively replaces delimiter characters with '\0'
in-place.
On multi-GPU systems, this function is called once per device.
When a multi-value setting like "0,0,0,-1" is used, the first
GPU's call transforms the global buffer into "0\00\00\0-1". The
second GPU then sees only "0" (terminated at the first '\0'),
parses a single value, hits the single-value fallthrough
(index == 1), and applies timeout=0 to all rings — causing
immediate false job timeouts.
Fix this by copying into a stack-local array before calling
strsep(), so the global module parameter buffer remains intact
across calls. The buffer is AMDGPU_MAX_TIMEOUT_PARAM_LENGTH
(256) bytes, which is safe for the stack.
v2: wrap commit message to 72 columns, add Assisted-by tag.
v3: use stack array with strscpy() instead of kstrdup()/kfree()
to avoid unnecessary heap allocation (Christian).
This patch was developed with assistance from Claude (claude-opus-4-6).
Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Ruijing Dong <ruijing.dong@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 94d79f51ef)
Cc: stable@vger.kernel.org
This commit is contained in:
committed by
Alex Deucher
parent
aed3d041ab
commit
2d300ebfc4
@@ -4207,7 +4207,8 @@ fail:
|
||||
|
||||
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
|
||||
{
|
||||
char *input = amdgpu_lockup_timeout;
|
||||
char buf[AMDGPU_MAX_TIMEOUT_PARAM_LENGTH];
|
||||
char *input = buf;
|
||||
char *timeout_setting = NULL;
|
||||
int index = 0;
|
||||
long timeout;
|
||||
@@ -4217,9 +4218,17 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
|
||||
adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
|
||||
adev->video_timeout = msecs_to_jiffies(2000);
|
||||
|
||||
if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
|
||||
if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* strsep() destructively modifies its input by replacing delimiters
|
||||
* with '\0'. Use a stack copy so the global module parameter buffer
|
||||
* remains intact for multi-GPU systems where this function is called
|
||||
* once per device.
|
||||
*/
|
||||
strscpy(buf, amdgpu_lockup_timeout, sizeof(buf));
|
||||
|
||||
while ((timeout_setting = strsep(&input, ",")) &&
|
||||
strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
|
||||
ret = kstrtol(timeout_setting, 0, &timeout);
|
||||
|
||||
Reference in New Issue
Block a user