drm/amdgpu: fix vf error handling

The error handling for virtual functions assumed a single
vf per VM and didn't properly account for bare metal.  Make
the error arrays per device and add locking.

Reviewed-by: Gavin Wan <gavin.wan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher
2017-09-28 09:47:32 -04:00
parent 6f87a89570
commit e23b74aab5
4 changed files with 54 additions and 41 deletions

View File

@@ -25,30 +25,21 @@
#include "amdgpu_vf_error.h"
#include "mxgpu_ai.h"
#define AMDGPU_VF_ERROR_ENTRY_SIZE 16
/* struct error_entry - amdgpu VF error information. */
struct amdgpu_vf_error_buffer {
int read_count;
int write_count;
uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
};
struct amdgpu_vf_error_buffer admgpu_vf_errors;
void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
void amdgpu_vf_error_put(struct amdgpu_device *adev,
uint16_t sub_error_code,
uint16_t error_flags,
uint64_t error_data)
{
int index;
uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
admgpu_vf_errors.code [index] = error_code;
admgpu_vf_errors.flags [index] = error_flags;
admgpu_vf_errors.data [index] = error_data;
admgpu_vf_errors.write_count ++;
mutex_lock(&adev->virt.vf_errors.lock);
index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
adev->virt.vf_errors.code [index] = error_code;
adev->virt.vf_errors.flags [index] = error_flags;
adev->virt.vf_errors.data [index] = error_data;
adev->virt.vf_errors.write_count ++;
mutex_unlock(&adev->virt.vf_errors.lock);
}
@@ -58,7 +49,8 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
u32 data1, data2, data3;
int index;
if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) ||
(!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
return;
}
/*
@@ -68,18 +60,22 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
return;
}
*/
mutex_lock(&adev->virt.vf_errors.lock);
/* The errors are overlay of array, correct read_count as full. */
if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
if (adev->virt.vf_errors.write_count - adev->virt.vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
adev->virt.vf_errors.read_count = adev->virt.vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
}
while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;
while (adev->virt.vf_errors.read_count < adev->virt.vf_errors.write_count) {
index =adev->virt.vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(adev->virt.vf_errors.code[index],
adev->virt.vf_errors.flags[index]);
data2 = adev->virt.vf_errors.data[index] & 0xFFFFFFFF;
data3 = (adev->virt.vf_errors.data[index] >> 32) & 0xFFFFFFFF;
adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
admgpu_vf_errors.read_count ++;
adev->virt.vf_errors.read_count ++;
}
mutex_unlock(&adev->virt.vf_errors.lock);
}