drm/amdgpu: fix userptr HMM range handling v2

The basic problem here is that it's not allowed to page fault while
holding the reservation lock.

So it can happen that multiple processes try to validate an userptr
at the same time.

Work around that by putting the HMM range object into the mutex
protected bo list for now.

v2: make sure range is set to NULL in case of an error

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
CC: stable@vger.kernel.org
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Christian König
2022-11-10 12:31:41 +01:00
committed by Alex Deucher
parent b39df63b16
commit 4458da0bb0
7 changed files with 46 additions and 51 deletions

View File

@@ -986,6 +986,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
struct ttm_operation_ctx ctx = { true, false };
struct hmm_range *range;
int ret = 0;
mutex_lock(&process_info->lock);
@@ -1015,7 +1016,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
return 0;
}
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
if (ret) {
pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
goto unregister_out;
@@ -1033,7 +1034,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
amdgpu_bo_unreserve(bo);
release_out:
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
unregister_out:
if (ret)
amdgpu_mn_unregister(bo);
@@ -2370,6 +2371,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
/* Go through userptr_inval_list and update any invalid user_pages */
list_for_each_entry(mem, &process_info->userptr_inval_list,
validate_list.head) {
struct hmm_range *range;
invalid = atomic_read(&mem->invalid);
if (!invalid)
/* BO hasn't been invalidated since the last
@@ -2380,7 +2383,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
bo = mem->bo;
/* Get updated user pages */
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
&range);
if (ret) {
pr_debug("Failed %d to get user pages\n", ret);
@@ -2399,7 +2403,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
* FIXME: Cannot ignore the return code, must hold
* notifier_lock
*/
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
}
/* Mark the BO as valid unless it was invalidated