drm/amdgpu: add configurable grace period for unmap queues

The HWS schedule allows a grace period for wave completion prior to
preemption for better performance by avoiding CWSR on waves that can
potentially complete quickly. The debugger, on the other hand, will
want to inspect wave status immediately after it actively triggers
preemption (a suspend function to be provided).

To minimize latency between preemption and debugger wave inspection, allow
immediate preemption by setting the grace period to 0.

Note that setting the preepmtion grace period to 0 will result in an
infinite grace period being set due to a CP FW bug so set it to 1 for now.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Jonathan Kim
2023-03-23 17:17:20 -04:00
committed by Alex Deucher
parent 33f3437ae1
commit 7cee6a6824
14 changed files with 295 additions and 20 deletions

View File

@@ -46,10 +46,13 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param);
uint32_t filter_param,
uint32_t grace_period);
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param, bool reset);
uint32_t filter_param,
uint32_t grace_period,
bool reset);
static int map_queues_cpsch(struct device_queue_manager *dqm);
@@ -866,7 +869,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
else if (prev_active)
retval = remove_queue_mes(dqm, q, &pdd->qpd);
@@ -1042,7 +1045,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
out:
dqm_unlock(dqm);
@@ -1182,8 +1186,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
}
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
vm_not_acquired:
@@ -1525,6 +1528,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
init_sdma_bitmaps(dqm);
if (dqm->dev->kfd2kgd->get_iq_wait_times)
dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
&dqm->wait_times);
return 0;
}
@@ -1563,8 +1569,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm->is_hws_hang = false;
dqm->is_resetting = false;
dqm->sched_running = true;
if (!dqm->dev->kfd->shared_resources.enable_mes)
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
dqm_unlock(dqm);
return 0;
@@ -1589,7 +1596,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
if (!dqm->is_hws_hang) {
if (!dqm->dev->kfd->shared_resources.enable_mes)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
else
remove_all_queues_mes(dqm);
}
@@ -1631,7 +1638,8 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
list_add(&kq->list, &qpd->priv_queue_list);
increment_queue_count(dqm, qpd, kq->queue);
qpd->is_debug = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
dqm_unlock(dqm);
return 0;
@@ -1645,7 +1653,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
list_del(&kq->list);
decrement_queue_count(dqm, qpd, kq->queue);
qpd->is_debug = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
/*
* Unconditionally decrement this counter, regardless of the queue's
* type.
@@ -1722,7 +1731,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
else
retval = add_queue_mes(dqm, q, qpd);
if (retval)
@@ -1811,7 +1820,9 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
/* dqm->lock mutex has to be locked before calling this function */
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param, bool reset)
uint32_t filter_param,
uint32_t grace_period,
bool reset)
{
int retval = 0;
struct mqd_manager *mqd_mgr;
@@ -1823,6 +1834,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
if (!dqm->active_runlist)
return retval;
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
if (retval)
return retval;
}
retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
if (retval)
return retval;
@@ -1855,6 +1872,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
return -ETIME;
}
/* We need to reset the grace period value for this device */
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
if (pm_update_grace_period(&dqm->packet_mgr,
USE_DEFAULT_GRACE_PERIOD))
pr_err("Failed to reset grace period\n");
}
pm_release_ib(&dqm->packet_mgr);
dqm->active_runlist = false;
@@ -1870,7 +1894,7 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
dqm_lock(dqm);
retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
pasid, true);
pasid, USE_DEFAULT_GRACE_PERIOD, true);
dqm_unlock(dqm);
return retval;
@@ -1879,13 +1903,14 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
/* dqm->lock mutex has to be locked before calling this function */
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param)
uint32_t filter_param,
uint32_t grace_period)
{
int retval;
if (dqm->is_hws_hang)
return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
if (retval)
return retval;
@@ -1943,7 +1968,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
if (!dqm->dev->kfd->shared_resources.enable_mes) {
decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
} else {
@@ -2228,7 +2254,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
}
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm, filter, 0);
retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
@@ -2589,7 +2615,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
return r;
}
dqm->active_runlist = true;
r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
0, USE_DEFAULT_GRACE_PERIOD);
dqm_unlock(dqm);
return r;