mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
drm/amdgpu: Convert to common fdinfo format v5
Convert fdinfo format to one documented in drm-usage-stats.rst.
It turned out that the existing implementation was actually complete
nonsense. The calculated percentages indeed represented the usage of the
engine, but with varying time slices.
So 10% usage for application A could mean something completely different
than 10% usage for application B.
Completely nuke that and just use the now standardized nanosecond
interface.
v2: drop the documentation change for now, nuke percentage calculation
v3: only account for each hw_ip, move the time_spend to the ctx mgr.
v4: move general ctx changes into separate patch, rework the fdinfo to
ctx_mgr interface so that all usages are calculated at once, drop
some unnecessary and dangerous refcount dance.
v5: add one more comment on how we calculate the time spent
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
08cffb3eb7
commit
af0b541670
@@ -162,17 +162,50 @@ static unsigned int amdgpu_ctx_get_hw_prio(struct amdgpu_ctx *ctx, u32 hw_ip)
|
||||
return hw_prio;
|
||||
}
|
||||
|
||||
/* Calculate the time spend on the hw */
|
||||
static ktime_t amdgpu_ctx_fence_time(struct dma_fence *fence)
|
||||
{
|
||||
struct drm_sched_fence *s_fence;
|
||||
|
||||
if (!fence)
|
||||
return ns_to_ktime(0);
|
||||
|
||||
/* When the fence is not even scheduled it can't have spend time */
|
||||
s_fence = to_drm_sched_fence(fence);
|
||||
if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &s_fence->scheduled.flags))
|
||||
return ns_to_ktime(0);
|
||||
|
||||
/* When it is still running account how much already spend */
|
||||
if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &s_fence->finished.flags))
|
||||
return ktime_sub(ktime_get(), s_fence->scheduled.timestamp);
|
||||
|
||||
return ktime_sub(s_fence->finished.timestamp,
|
||||
s_fence->scheduled.timestamp);
|
||||
}
|
||||
|
||||
static ktime_t amdgpu_ctx_entity_time(struct amdgpu_ctx *ctx,
|
||||
struct amdgpu_ctx_entity *centity)
|
||||
{
|
||||
ktime_t res = ns_to_ktime(0);
|
||||
uint32_t i;
|
||||
|
||||
spin_lock(&ctx->ring_lock);
|
||||
for (i = 0; i < amdgpu_sched_jobs; i++) {
|
||||
res = ktime_add(res, amdgpu_ctx_fence_time(centity->fences[i]));
|
||||
}
|
||||
spin_unlock(&ctx->ring_lock);
|
||||
return res;
|
||||
}
|
||||
|
||||
static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
|
||||
const u32 ring)
|
||||
{
|
||||
struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
|
||||
struct amdgpu_device *adev = ctx->mgr->adev;
|
||||
struct amdgpu_ctx_entity *entity;
|
||||
struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
|
||||
unsigned num_scheds = 0;
|
||||
int32_t ctx_prio;
|
||||
unsigned int hw_prio;
|
||||
enum drm_sched_priority drm_prio;
|
||||
unsigned int hw_prio, num_scheds;
|
||||
int32_t ctx_prio;
|
||||
int r;
|
||||
|
||||
entity = kzalloc(struct_size(entity, fences, amdgpu_sched_jobs),
|
||||
@@ -182,6 +215,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
|
||||
|
||||
ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
|
||||
ctx->init_priority : ctx->override_priority;
|
||||
entity->hw_ip = hw_ip;
|
||||
entity->sequence = 1;
|
||||
hw_prio = amdgpu_ctx_get_hw_prio(ctx, hw_ip);
|
||||
drm_prio = amdgpu_ctx_to_drm_sched_prio(ctx_prio);
|
||||
@@ -220,6 +254,23 @@ error_free_entity:
|
||||
return r;
|
||||
}
|
||||
|
||||
static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_ctx_entity *entity)
|
||||
{
|
||||
ktime_t res = ns_to_ktime(0);
|
||||
int i;
|
||||
|
||||
if (!entity)
|
||||
return res;
|
||||
|
||||
for (i = 0; i < amdgpu_sched_jobs; ++i) {
|
||||
res = ktime_add(res, amdgpu_ctx_fence_time(entity->fences[i]));
|
||||
dma_fence_put(entity->fences[i]);
|
||||
}
|
||||
|
||||
kfree(entity);
|
||||
return res;
|
||||
}
|
||||
|
||||
static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
|
||||
struct drm_file *filp, struct amdgpu_ctx *ctx)
|
||||
{
|
||||
@@ -246,20 +297,6 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_ctx_fini_entity(struct amdgpu_ctx_entity *entity)
|
||||
{
|
||||
|
||||
int i;
|
||||
|
||||
if (!entity)
|
||||
return;
|
||||
|
||||
for (i = 0; i < amdgpu_sched_jobs; ++i)
|
||||
dma_fence_put(entity->fences[i]);
|
||||
|
||||
kfree(entity);
|
||||
}
|
||||
|
||||
static int amdgpu_ctx_get_stable_pstate(struct amdgpu_ctx *ctx,
|
||||
u32 *stable_pstate)
|
||||
{
|
||||
@@ -351,8 +388,10 @@ static void amdgpu_ctx_fini(struct kref *ref)
|
||||
|
||||
for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
|
||||
for (j = 0; j < AMDGPU_MAX_ENTITY_NUM; ++j) {
|
||||
amdgpu_ctx_fini_entity(ctx->entities[i][j]);
|
||||
ctx->entities[i][j] = NULL;
|
||||
ktime_t spend;
|
||||
|
||||
spend = amdgpu_ctx_fini_entity(ctx->entities[i][j]);
|
||||
atomic64_add(ktime_to_ns(spend), &mgr->time_spend[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -689,6 +728,9 @@ uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
|
||||
centity->sequence++;
|
||||
spin_unlock(&ctx->ring_lock);
|
||||
|
||||
atomic64_add(ktime_to_ns(amdgpu_ctx_fence_time(other)),
|
||||
&ctx->mgr->time_spend[centity->hw_ip]);
|
||||
|
||||
dma_fence_put(other);
|
||||
return seq;
|
||||
}
|
||||
@@ -795,9 +837,14 @@ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,
|
||||
void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
|
||||
struct amdgpu_device *adev)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
mgr->adev = adev;
|
||||
mutex_init(&mgr->lock);
|
||||
idr_init(&mgr->ctx_handles);
|
||||
|
||||
for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
|
||||
atomic64_set(&mgr->time_spend[i], 0);
|
||||
}
|
||||
|
||||
long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
|
||||
@@ -873,80 +920,38 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
|
||||
mutex_destroy(&mgr->lock);
|
||||
}
|
||||
|
||||
static void amdgpu_ctx_fence_time(struct amdgpu_ctx *ctx,
|
||||
struct amdgpu_ctx_entity *centity, ktime_t *total, ktime_t *max)
|
||||
void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
|
||||
ktime_t usage[AMDGPU_HW_IP_NUM])
|
||||
{
|
||||
ktime_t now, t1;
|
||||
uint32_t i;
|
||||
|
||||
*total = *max = 0;
|
||||
|
||||
now = ktime_get();
|
||||
for (i = 0; i < amdgpu_sched_jobs; i++) {
|
||||
struct dma_fence *fence;
|
||||
struct drm_sched_fence *s_fence;
|
||||
|
||||
spin_lock(&ctx->ring_lock);
|
||||
fence = dma_fence_get(centity->fences[i]);
|
||||
spin_unlock(&ctx->ring_lock);
|
||||
if (!fence)
|
||||
continue;
|
||||
s_fence = to_drm_sched_fence(fence);
|
||||
if (!dma_fence_is_signaled(&s_fence->scheduled)) {
|
||||
dma_fence_put(fence);
|
||||
continue;
|
||||
}
|
||||
t1 = s_fence->scheduled.timestamp;
|
||||
if (!ktime_before(t1, now)) {
|
||||
dma_fence_put(fence);
|
||||
continue;
|
||||
}
|
||||
if (dma_fence_is_signaled(&s_fence->finished) &&
|
||||
s_fence->finished.timestamp < now)
|
||||
*total += ktime_sub(s_fence->finished.timestamp, t1);
|
||||
else
|
||||
*total += ktime_sub(now, t1);
|
||||
t1 = ktime_sub(now, t1);
|
||||
dma_fence_put(fence);
|
||||
*max = max(t1, *max);
|
||||
}
|
||||
}
|
||||
|
||||
ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
|
||||
uint32_t idx, uint64_t *elapsed)
|
||||
{
|
||||
struct idr *idp;
|
||||
struct amdgpu_ctx *ctx;
|
||||
unsigned int hw_ip, i;
|
||||
uint32_t id;
|
||||
struct amdgpu_ctx_entity *centity;
|
||||
ktime_t total = 0, max = 0;
|
||||
|
||||
if (idx >= AMDGPU_MAX_ENTITY_NUM)
|
||||
return 0;
|
||||
idp = &mgr->ctx_handles;
|
||||
/*
|
||||
* This is a little bit racy because it can be that a ctx or a fence are
|
||||
* destroyed just in the moment we try to account them. But that is ok
|
||||
* since exactly that case is explicitly allowed by the interface.
|
||||
*/
|
||||
mutex_lock(&mgr->lock);
|
||||
idr_for_each_entry(idp, ctx, id) {
|
||||
ktime_t ttotal, tmax;
|
||||
for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
|
||||
uint64_t ns = atomic64_read(&mgr->time_spend[hw_ip]);
|
||||
|
||||
if (!ctx->entities[hwip][idx])
|
||||
continue;
|
||||
|
||||
centity = ctx->entities[hwip][idx];
|
||||
amdgpu_ctx_fence_time(ctx, centity, &ttotal, &tmax);
|
||||
|
||||
/* Harmonic mean approximation diverges for very small
|
||||
* values. If ratio < 0.01% ignore
|
||||
*/
|
||||
if (AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(tmax, ttotal))
|
||||
continue;
|
||||
|
||||
total = ktime_add(total, ttotal);
|
||||
max = ktime_after(tmax, max) ? tmax : max;
|
||||
usage[hw_ip] = ns_to_ktime(ns);
|
||||
}
|
||||
|
||||
mutex_unlock(&mgr->lock);
|
||||
if (elapsed)
|
||||
*elapsed = max;
|
||||
idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
|
||||
for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
|
||||
for (i = 0; i < amdgpu_ctx_num_entities[hw_ip]; ++i) {
|
||||
struct amdgpu_ctx_entity *centity;
|
||||
ktime_t spend;
|
||||
|
||||
return total;
|
||||
centity = ctx->entities[hw_ip][i];
|
||||
if (!centity)
|
||||
continue;
|
||||
spend = amdgpu_ctx_entity_time(ctx, centity);
|
||||
usage[hw_ip] = ktime_add(usage[hw_ip], spend);
|
||||
}
|
||||
}
|
||||
}
|
||||
mutex_unlock(&mgr->lock);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user