diff mbox series

[1/4] drm/scheduler: implement hardware time accounting

Message ID 20240701171447.3823888-1-l.stach@pengutronix.de (mailing list archive)
State New
Headers show
Series [1/4] drm/scheduler: implement hardware time accounting | expand

Commit Message

Lucas Stach July 1, 2024, 5:14 p.m. UTC
From: Christian König <ckoenig.leichtzumerken@gmail.com>

Multiple drivers came up with the requirement to measure how
much runtime each entity accumulated on the HW.

A previous attempt of accounting this had to be reverted because
HW submissions can have a lifetime exceeding that of the entity
originally issuing them.

Amdgpu on the other hand solves this task by keeping track of
all the submissions and calculating how much time they have used
on demand.

Move this approach over into the scheduler to provide an easy to
use interface for all drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
v2:
- rebase to v6.10-rc1
- fix for non-power-of-two number of HW submissions
- add comment explaining the logic behind the fence tracking array
- rename some functions and fix documentation
---
 drivers/gpu/drm/scheduler/sched_entity.c | 82 +++++++++++++++++++++++-
 drivers/gpu/drm/scheduler/sched_fence.c  | 19 ++++++
 include/drm/gpu_scheduler.h              | 31 +++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)


base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0

Comments

Tvrtko Ursulin July 2, 2024, 8:42 a.m. UTC | #1
Hi,

A few questions below.

On 01/07/2024 18:14, Lucas Stach wrote:
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> 
> Multiple drivers came up with the requirement to measure how
> much runtime each entity accumulated on the HW.
> 
> A previous attempt of accounting this had to be reverted because
> HW submissions can have a lifetime exceeding that of the entity
> originally issuing them.
> 
> Amdgpu on the other hand solves this task by keeping track of
> all the submissions and calculating how much time they have used
> on demand.
> 
> Move this approach over into the scheduler to provide an easy to
> use interface for all drivers.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
> ---
> v2:
> - rebase to v6.10-rc1
> - fix for non-power-of-two number of HW submission
> - add comment explaining the logic behind the fence tracking array
> - rename some function and fix documentation
> ---
>   drivers/gpu/drm/scheduler/sched_entity.c | 82 +++++++++++++++++++++++-
>   drivers/gpu/drm/scheduler/sched_fence.c  | 19 ++++++
>   include/drm/gpu_scheduler.h              | 31 +++++++++
>   3 files changed, 131 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 58c8161289fe..d678d0b9b29e 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -62,7 +62,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>   			  unsigned int num_sched_list,
>   			  atomic_t *guilty)
>   {
> -	if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
> +	unsigned int i, num_submissions = 0;
> +
> +	if (!entity || !sched_list)
>   		return -EINVAL;
>   
>   	memset(entity, 0, sizeof(struct drm_sched_entity));
> @@ -98,6 +100,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>   						 (s32) DRM_SCHED_PRIORITY_KERNEL);
>   		}
>   		entity->rq = sched_list[0]->sched_rq[entity->priority];
> +
> +		for (i = 0; i < num_sched_list; ++i) {
> +			num_submissions = max(num_submissions,
> +					      sched_list[i]->credit_limit);
> +		}

Does this work (in concept and naming) for all drivers if introduction 
of credits broke the 1:1 between jobs and hw "ring" capacity?

How big is the array for different drivers?

>   	}
>   
>   	init_completion(&entity->entity_idle);
> @@ -110,11 +117,52 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>   
>   	atomic_set(&entity->fence_seq, 0);
>   	entity->fence_context = dma_fence_context_alloc(2);
> +	spin_lock_init(&entity->accounting_lock);
> +
> +	if (!num_submissions)
> +		return 0;
> +
> +	entity->max_hw_submissions = num_submissions;
> +	entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
> +					 GFP_KERNEL);
> +	if (!entity->hw_submissions)
> +		return -ENOMEM;
>   
>   	return 0;
>   }
>   EXPORT_SYMBOL(drm_sched_entity_init);
>   
> +/**
> + * drm_sched_entity_time_spent - Accumulated HW runtime used by this entity
> + * @entity: scheduler entity to check
> + *
> + * Get the current accumulated HW runtime used by all submissions made through
> + * this entity.
> + */
> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
> +{
> +	ktime_t result;
> +	unsigned int i;
> +
> +	if (!entity->max_hw_submissions)
> +		return ns_to_ktime(0);
> +
> +	spin_lock(&entity->accounting_lock);
> +	result = entity->hw_time_used;
> +	for (i = 0; i < entity->max_hw_submissions; ++i) {
> +		struct drm_sched_fence *fence = entity->hw_submissions[i];
> +
> +		if (!fence)
> +			continue;
> +
> +		result = ktime_add(result, drm_sched_fence_get_runtime(fence));

Does this end up counting from when jobs have been submitted to the hw 
backend and may not be actually executing?

Say if a driver configures a backend N deep and is filled with N jobs, 
while in actuality they are executed sequentially one at a time, the 
time as reported here would over-account by some series such as 
(job[0].finish - job[0].submit) + ... + (job[N].finish - job[N].submit)?

Or in other words if one submits N jobs to a ring serving an 1-wide hw 
backend, will we see "N*100%" utilisation instead of "100%" if sampling 
while first job is still executing, the rest queued behind it?

Regards,

Tvrtko

> +	}
> +	spin_unlock(&entity->accounting_lock);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL(drm_sched_entity_time_spent);
> +
>   /**
>    * drm_sched_entity_modify_sched - Modify sched of an entity
>    * @entity: scheduler entity to init
> @@ -326,6 +374,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>    */
>   void drm_sched_entity_fini(struct drm_sched_entity *entity)
>   {
> +	unsigned int i;
> +
>   	/*
>   	 * If consumption of existing IBs wasn't completed. Forcefully remove
>   	 * them here. Also makes sure that the scheduler won't touch this entity
> @@ -341,6 +391,9 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>   
>   	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>   	RCU_INIT_POINTER(entity->last_scheduled, NULL);
> +	for (i = 0; i < entity->max_hw_submissions; ++i)
> +		dma_fence_put(&entity->hw_submissions[i]->scheduled);
> +	kfree(entity->hw_submissions);
>   }
>   EXPORT_SYMBOL(drm_sched_entity_fini);
>   
> @@ -522,6 +575,33 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>   	 */
>   	sched_job->entity = NULL;
>   
> +	if (entity->max_hw_submissions) {
> +		struct drm_sched_fence *fence = sched_job->s_fence;
> +		unsigned int idx = fence->scheduled.seqno;
> +
> +		dma_fence_get(&fence->scheduled);
> +		idx %= entity->max_hw_submissions;
> +
> +		spin_lock(&entity->accounting_lock);
> +		/*
> +		 * The fence seqno is dense and monotonically increasing. By
> +		 * cycling through a array sized to match the maximum number of
> +		 * submissions queued in the HW we can be sure that once we need
> +		 * to reuse a slot the fence stored in this slot refers to a
> +		 * retired submission and we can safely sum up the accumulated
> +		 * runtime and dispose the fence.
> +		 */
> +		swap(fence, entity->hw_submissions[idx]);
> +		if (fence) {
> +			ktime_t runtime = drm_sched_fence_get_runtime(fence);
> +
> +			entity->hw_time_used = ktime_add(entity->hw_time_used,
> +							 runtime);
> +			dma_fence_put(&fence->scheduled);
> +		}
> +		spin_unlock(&entity->accounting_lock);
> +	}
> +
>   	return sched_job;
>   }
>   
> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
> index 0f35f009b9d3..55981ada1829 100644
> --- a/drivers/gpu/drm/scheduler/sched_fence.c
> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
> @@ -82,6 +82,25 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
>   	dma_fence_signal(&fence->finished);
>   }
>   
> +/**
> + * drm_sched_fence_get_runtime - accumulated runtime on HW
> + * @fence: fence
> + *
> + * Calculate how much runtime this fence has accumulated on the HW.
> + */
> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence)
> +{
> +	/* When the fence is not scheduled, it can't have accumulated runtime */
> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
> +		return ns_to_ktime(0);
> +
> +	/* When it is still running, calculate runtime until now */
> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags))
> +		return ktime_sub(ktime_get(), fence->scheduled.timestamp);
> +
> +	return ktime_sub(fence->finished.timestamp, fence->scheduled.timestamp);
> +}
> +
>   static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>   {
>   	return "drm_sched";
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 5acc64954a88..52bcff324a92 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -238,6 +238,35 @@ struct drm_sched_entity {
>   	 */
>   	struct rb_node			rb_tree_node;
>   
> +	/**
> +	 * @accounting_lock:
> +	 *
> +	 * Protects the array of fences tracking the in-flight HW submissions
> +	 * and the accumulator counter.
> +	 */
> +	spinlock_t			accounting_lock;
> +
> +	/**
> +	 * @hw_time_used:
> +	 *
> +	 * How much HW runtime has been accumulated by retired submissions
> +	 * from this entity.
> +	 */
> +	ktime_t				hw_time_used;
> +
> +	/**
> +	 * @max_hw_submissions:
> +	 *
> +	 * Maximum number of submissions queued in the HW.
> +	 */
> +	unsigned int			max_hw_submissions;
> +
> +	/**
> +	 * @hw_submissions:
> +	 *
> +	 * Scheduler fences of the HW submissions in flight.
> +	 */
> +	struct drm_sched_fence		**hw_submissions;
>   };
>   
>   /**
> @@ -600,6 +629,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>   			  struct drm_gpu_scheduler **sched_list,
>   			  unsigned int num_sched_list,
>   			  atomic_t *guilty);
> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity);
>   long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
>   void drm_sched_entity_fini(struct drm_sched_entity *entity);
>   void drm_sched_entity_destroy(struct drm_sched_entity *entity);
> @@ -620,6 +650,7 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
>   void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
>   			       struct dma_fence *parent);
>   void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence);
>   
>   unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>   void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
> 
> base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
Christian König July 2, 2024, 8:56 a.m. UTC | #2
Am 02.07.24 um 10:42 schrieb Tvrtko Ursulin:
>
> Hi,
>
> I few questions below.
>
> On 01/07/2024 18:14, Lucas Stach wrote:
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>
>> Multiple drivers came up with the requirement to measure how
>> much runtime each entity accumulated on the HW.
>>
>> A previous attempt of accounting this had to be reverted because
>> HW submissions can have a lifetime exceeding that of the entity
>> originally issuing them.
>>
>> Amdgpu on the other hand solves this task by keeping track of
>> all the submissions and calculating how much time they have used
>> on demand.
>>
>> Move this approach over into the scheduler to provide an easy to
>> use interface for all drivers.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
>> ---
>> v2:
>> - rebase to v6.10-rc1
>> - fix for non-power-of-two number of HW submission
>> - add comment explaining the logic behind the fence tracking array
>> - rename some function and fix documentation
>> ---
>>   drivers/gpu/drm/scheduler/sched_entity.c | 82 +++++++++++++++++++++++-
>>   drivers/gpu/drm/scheduler/sched_fence.c  | 19 ++++++
>>   include/drm/gpu_scheduler.h              | 31 +++++++++
>>   3 files changed, 131 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
>> b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 58c8161289fe..d678d0b9b29e 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -62,7 +62,9 @@ int drm_sched_entity_init(struct drm_sched_entity 
>> *entity,
>>                 unsigned int num_sched_list,
>>                 atomic_t *guilty)
>>   {
>> -    if (!(entity && sched_list && (num_sched_list == 0 || 
>> sched_list[0])))
>> +    unsigned int i, num_submissions = 0;
>> +
>> +    if (!entity || !sched_list)
>>           return -EINVAL;
>>         memset(entity, 0, sizeof(struct drm_sched_entity));
>> @@ -98,6 +100,11 @@ int drm_sched_entity_init(struct drm_sched_entity 
>> *entity,
>>                            (s32) DRM_SCHED_PRIORITY_KERNEL);
>>           }
>>           entity->rq = sched_list[0]->sched_rq[entity->priority];
>> +
>> +        for (i = 0; i < num_sched_list; ++i) {
>> +            num_submissions = max(num_submissions,
>> +                          sched_list[i]->credit_limit);
>> +        }
>
> Does this work (in concept and naming) for all drivers if introduction 
> of credits broke the 1:1 between jobs and hw "ring" capacity?
>
> How big is the array for different drivers?

Oh, good point. I don't really know the answer.

>
>>       }
>>         init_completion(&entity->entity_idle);
>> @@ -110,11 +117,52 @@ int drm_sched_entity_init(struct 
>> drm_sched_entity *entity,
>>         atomic_set(&entity->fence_seq, 0);
>>       entity->fence_context = dma_fence_context_alloc(2);
>> +    spin_lock_init(&entity->accounting_lock);
>> +
>> +    if (!num_submissions)
>> +        return 0;
>> +
>> +    entity->max_hw_submissions = num_submissions;
>> +    entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
>> +                     GFP_KERNEL);
>> +    if (!entity->hw_submissions)
>> +        return -ENOMEM;
>>         return 0;
>>   }
>>   EXPORT_SYMBOL(drm_sched_entity_init);
>>   +/**
>> + * drm_sched_entity_time_spent - Accumulated HW runtime used by this 
>> entity
>> + * @entity: scheduler entity to check
>> + *
>> + * Get the current accumulated HW runtime used by all submissions 
>> made through
>> + * this entity.
>> + */
>> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
>> +{
>> +    ktime_t result;
>> +    unsigned int i;
>> +
>> +    if (!entity->max_hw_submissions)
>> +        return ns_to_ktime(0);
>> +
>> +    spin_lock(&entity->accounting_lock);
>> +    result = entity->hw_time_used;
>> +    for (i = 0; i < entity->max_hw_submissions; ++i) {
>> +        struct drm_sched_fence *fence = entity->hw_submissions[i];
>> +
>> +        if (!fence)
>> +            continue;
>> +
>> +        result = ktime_add(result, drm_sched_fence_get_runtime(fence));
>
> Does this end up counting from when jobs have been submitted to the hw 
> backend and may not be actually executing?
>
> Say if a driver configures a backend N deep and is filled with N jobs, 
> while in actuality they are executed sequentially one at a time, the 
> time as reported here would over-account by some series such as 
> (job[0].finish - job[0].submit) + ... + (job[N].finish - job[N].submit)?
>
> Or in other words if one submits N jobs to a ring serving an 1-wide hw 
> backend, will we see "N*100%" utilisation instead of "100%" if 
> sampling while first job is still executing, the rest queued behind it?

Oh, good question. Yeah that can happen I think.

Regards,
Christian.

>
> Regards,
>
> Tvrtko
>
>> +    }
>> +    spin_unlock(&entity->accounting_lock);
>> +
>> +    return result;
>> +}
>> +EXPORT_SYMBOL(drm_sched_entity_time_spent);
>> +
>>   /**
>>    * drm_sched_entity_modify_sched - Modify sched of an entity
>>    * @entity: scheduler entity to init
>> @@ -326,6 +374,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>>    */
>>   void drm_sched_entity_fini(struct drm_sched_entity *entity)
>>   {
>> +    unsigned int i;
>> +
>>       /*
>>        * If consumption of existing IBs wasn't completed. Forcefully 
>> remove
>>        * them here. Also makes sure that the scheduler won't touch 
>> this entity
>> @@ -341,6 +391,9 @@ void drm_sched_entity_fini(struct 
>> drm_sched_entity *entity)
>> dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>>       RCU_INIT_POINTER(entity->last_scheduled, NULL);
>> +    for (i = 0; i < entity->max_hw_submissions; ++i)
>> + dma_fence_put(&entity->hw_submissions[i]->scheduled);
>> +    kfree(entity->hw_submissions);
>>   }
>>   EXPORT_SYMBOL(drm_sched_entity_fini);
>>   @@ -522,6 +575,33 @@ struct drm_sched_job 
>> *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>        */
>>       sched_job->entity = NULL;
>>   +    if (entity->max_hw_submissions) {
>> +        struct drm_sched_fence *fence = sched_job->s_fence;
>> +        unsigned int idx = fence->scheduled.seqno;
>> +
>> +        dma_fence_get(&fence->scheduled);
>> +        idx %= entity->max_hw_submissions;
>> +
>> +        spin_lock(&entity->accounting_lock);
>> +        /*
>> +         * The fence seqno is dense and monotonically increasing. By
>> +         * cycling through a array sized to match the maximum number of
>> +         * submissions queued in the HW we can be sure that once we 
>> need
>> +         * to reuse a slot the fence stored in this slot refers to a
>> +         * retired submission and we can safely sum up the accumulated
>> +         * runtime and dispose the fence.
>> +         */
>> +        swap(fence, entity->hw_submissions[idx]);
>> +        if (fence) {
>> +            ktime_t runtime = drm_sched_fence_get_runtime(fence);
>> +
>> +            entity->hw_time_used = ktime_add(entity->hw_time_used,
>> +                             runtime);
>> +            dma_fence_put(&fence->scheduled);
>> +        }
>> +        spin_unlock(&entity->accounting_lock);
>> +    }
>> +
>>       return sched_job;
>>   }
>>   diff --git a/drivers/gpu/drm/scheduler/sched_fence.c 
>> b/drivers/gpu/drm/scheduler/sched_fence.c
>> index 0f35f009b9d3..55981ada1829 100644
>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>> @@ -82,6 +82,25 @@ void drm_sched_fence_finished(struct 
>> drm_sched_fence *fence, int result)
>>       dma_fence_signal(&fence->finished);
>>   }
>>   +/**
>> + * drm_sched_fence_get_runtime - accumulated runtime on HW
>> + * @fence: fence
>> + *
>> + * Calculate how much runtime this fence has accumulated on the HW.
>> + */
>> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence)
>> +{
>> +    /* When the fence is not scheduled, it can't have accumulated 
>> runtime */
>> +    if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, 
>> &fence->scheduled.flags))
>> +        return ns_to_ktime(0);
>> +
>> +    /* When it is still running, calculate runtime until now */
>> +    if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, 
>> &fence->finished.flags))
>> +        return ktime_sub(ktime_get(), fence->scheduled.timestamp);
>> +
>> +    return ktime_sub(fence->finished.timestamp, 
>> fence->scheduled.timestamp);
>> +}
>> +
>>   static const char *drm_sched_fence_get_driver_name(struct dma_fence 
>> *fence)
>>   {
>>       return "drm_sched";
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 5acc64954a88..52bcff324a92 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -238,6 +238,35 @@ struct drm_sched_entity {
>>        */
>>       struct rb_node            rb_tree_node;
>>   +    /**
>> +     * @accounting_lock:
>> +     *
>> +     * Protects the array of fences tracking the in-flight HW 
>> submissions
>> +     * and the accumulator counter.
>> +     */
>> +    spinlock_t            accounting_lock;
>> +
>> +    /**
>> +     * @hw_time_used:
>> +     *
>> +     * How much HW runtime has been accumulated by retired submissions
>> +     * from this entity.
>> +     */
>> +    ktime_t                hw_time_used;
>> +
>> +    /**
>> +     * @max_hw_submissions:
>> +     *
>> +     * Maximum number of submissions queued in the HW.
>> +     */
>> +    unsigned int            max_hw_submissions;
>> +
>> +    /**
>> +     * @hw_submissions:
>> +     *
>> +     * Scheduler fences of the HW submissions in flight.
>> +     */
>> +    struct drm_sched_fence        **hw_submissions;
>>   };
>>     /**
>> @@ -600,6 +629,7 @@ int drm_sched_entity_init(struct drm_sched_entity 
>> *entity,
>>                 struct drm_gpu_scheduler **sched_list,
>>                 unsigned int num_sched_list,
>>                 atomic_t *guilty);
>> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity);
>>   long drm_sched_entity_flush(struct drm_sched_entity *entity, long 
>> timeout);
>>   void drm_sched_entity_fini(struct drm_sched_entity *entity);
>>   void drm_sched_entity_destroy(struct drm_sched_entity *entity);
>> @@ -620,6 +650,7 @@ void drm_sched_fence_free(struct drm_sched_fence 
>> *fence);
>>   void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
>>                      struct dma_fence *parent);
>>   void drm_sched_fence_finished(struct drm_sched_fence *fence, int 
>> result);
>> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence);
>>     unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler 
>> *sched);
>>   void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>
>> base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
Lucas Stach July 2, 2024, 9:15 a.m. UTC | #3
Hi Tvrtko,

Am Dienstag, dem 02.07.2024 um 09:42 +0100 schrieb Tvrtko Ursulin:
> Hi,
> 
> I few questions below.
> 
> On 01/07/2024 18:14, Lucas Stach wrote:
> > From: Christian König <ckoenig.leichtzumerken@gmail.com>
> > 
> > Multiple drivers came up with the requirement to measure how
> > much runtime each entity accumulated on the HW.
> > 
> > A previous attempt of accounting this had to be reverted because
> > HW submissions can have a lifetime exceeding that of the entity
> > originally issuing them.
> > 
> > Amdgpu on the other hand solves this task by keeping track of
> > all the submissions and calculating how much time they have used
> > on demand.
> > 
> > Move this approach over into the scheduler to provide an easy to
> > use interface for all drivers.
> > 
> > Signed-off-by: Christian König <christian.koenig@amd.com>
> > Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
> > ---
> > v2:
> > - rebase to v6.10-rc1
> > - fix for non-power-of-two number of HW submission
> > - add comment explaining the logic behind the fence tracking array
> > - rename some function and fix documentation
> > ---
> >   drivers/gpu/drm/scheduler/sched_entity.c | 82 +++++++++++++++++++++++-
> >   drivers/gpu/drm/scheduler/sched_fence.c  | 19 ++++++
> >   include/drm/gpu_scheduler.h              | 31 +++++++++
> >   3 files changed, 131 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> > index 58c8161289fe..d678d0b9b29e 100644
> > --- a/drivers/gpu/drm/scheduler/sched_entity.c
> > +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> > @@ -62,7 +62,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> >   			  unsigned int num_sched_list,
> >   			  atomic_t *guilty)
> >   {
> > -	if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
> > +	unsigned int i, num_submissions = 0;
> > +
> > +	if (!entity || !sched_list)
> >   		return -EINVAL;
> >   
> >   	memset(entity, 0, sizeof(struct drm_sched_entity));
> > @@ -98,6 +100,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> >   						 (s32) DRM_SCHED_PRIORITY_KERNEL);
> >   		}
> >   		entity->rq = sched_list[0]->sched_rq[entity->priority];
> > +
> > +		for (i = 0; i < num_sched_list; ++i) {
> > +			num_submissions = max(num_submissions,
> > +					      sched_list[i]->credit_limit);
> > +		}
> 
> Does this work (in concept and naming) for all drivers if introduction 
> of credits broke the 1:1 between jobs and hw "ring" capacity?
> 
Functionally it's still correct, as we simply cycle through the array a
bit slower when a job takes multiple credits. It doesn't violate any
fundamental assumption. However, ...

> How big is the array for different drivers?
> 
... the tracking array is getting really big for drivers which allow
for a large number of credits. Worst cases in order are PVR with 64k
and nouveau with 1024 credits. The memory required to hold the pointers
to the fences, as well as referenced fences is huge.

I guess we need something more clever like a xarray to track the fences
and clean up signalled fences on each submit or something like that.

Regards,
Lucas

> >   	}
> >   
> >   	init_completion(&entity->entity_idle);
> > @@ -110,11 +117,52 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> >   
> >   	atomic_set(&entity->fence_seq, 0);
> >   	entity->fence_context = dma_fence_context_alloc(2);
> > +	spin_lock_init(&entity->accounting_lock);
> > +
> > +	if (!num_submissions)
> > +		return 0;
> > +
> > +	entity->max_hw_submissions = num_submissions;
> > +	entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
> > +					 GFP_KERNEL);
> > +	if (!entity->hw_submissions)
> > +		return -ENOMEM;
> >   
> >   	return 0;
> >   }
> >   EXPORT_SYMBOL(drm_sched_entity_init);
> >   
> > +/**
> > + * drm_sched_entity_time_spent - Accumulated HW runtime used by this entity
> > + * @entity: scheduler entity to check
> > + *
> > + * Get the current accumulated HW runtime used by all submissions made through
> > + * this entity.
> > + */
> > +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
> > +{
> > +	ktime_t result;
> > +	unsigned int i;
> > +
> > +	if (!entity->max_hw_submissions)
> > +		return ns_to_ktime(0);
> > +
> > +	spin_lock(&entity->accounting_lock);
> > +	result = entity->hw_time_used;
> > +	for (i = 0; i < entity->max_hw_submissions; ++i) {
> > +		struct drm_sched_fence *fence = entity->hw_submissions[i];
> > +
> > +		if (!fence)
> > +			continue;
> > +
> > +		result = ktime_add(result, drm_sched_fence_get_runtime(fence));
> 
> Does this end up counting from when jobs have been submitted to the hw 
> backend and may not be actually executing?
> 
> Say if a driver configures a backend N deep and is filled with N jobs, 
> while in actuality they are executed sequentially one at a time, the 
> time as reported here would over-account by some series such as 
> (job[0].finish - job[0].submit) + ... + (job[N].finish - job[N].submit)?
> 
> Or in other words if one submits N jobs to a ring serving an 1-wide hw 
> backend, will we see "N*100%" utilisation instead of "100%" if sampling 
> while first job is still executing, the rest queued behind it?
> 
> Regards,
> 
> Tvrtko
> 
> > +	}
> > +	spin_unlock(&entity->accounting_lock);
> > +
> > +	return result;
> > +}
> > +EXPORT_SYMBOL(drm_sched_entity_time_spent);
> > +
> >   /**
> >    * drm_sched_entity_modify_sched - Modify sched of an entity
> >    * @entity: scheduler entity to init
> > @@ -326,6 +374,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
> >    */
> >   void drm_sched_entity_fini(struct drm_sched_entity *entity)
> >   {
> > +	unsigned int i;
> > +
> >   	/*
> >   	 * If consumption of existing IBs wasn't completed. Forcefully remove
> >   	 * them here. Also makes sure that the scheduler won't touch this entity
> > @@ -341,6 +391,9 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
> >   
> >   	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
> >   	RCU_INIT_POINTER(entity->last_scheduled, NULL);
> > +	for (i = 0; i < entity->max_hw_submissions; ++i)
> > +		dma_fence_put(&entity->hw_submissions[i]->scheduled);
> > +	kfree(entity->hw_submissions);
> >   }
> >   EXPORT_SYMBOL(drm_sched_entity_fini);
> >   
> > @@ -522,6 +575,33 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
> >   	 */
> >   	sched_job->entity = NULL;
> >   
> > +	if (entity->max_hw_submissions) {
> > +		struct drm_sched_fence *fence = sched_job->s_fence;
> > +		unsigned int idx = fence->scheduled.seqno;
> > +
> > +		dma_fence_get(&fence->scheduled);
> > +		idx %= entity->max_hw_submissions;
> > +
> > +		spin_lock(&entity->accounting_lock);
> > +		/*
> > +		 * The fence seqno is dense and monotonically increasing. By
> > +		 * cycling through a array sized to match the maximum number of
> > +		 * submissions queued in the HW we can be sure that once we need
> > +		 * to reuse a slot the fence stored in this slot refers to a
> > +		 * retired submission and we can safely sum up the accumulated
> > +		 * runtime and dispose the fence.
> > +		 */
> > +		swap(fence, entity->hw_submissions[idx]);
> > +		if (fence) {
> > +			ktime_t runtime = drm_sched_fence_get_runtime(fence);
> > +
> > +			entity->hw_time_used = ktime_add(entity->hw_time_used,
> > +							 runtime);
> > +			dma_fence_put(&fence->scheduled);
> > +		}
> > +		spin_unlock(&entity->accounting_lock);
> > +	}
> > +
> >   	return sched_job;
> >   }
> >   
> > diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
> > index 0f35f009b9d3..55981ada1829 100644
> > --- a/drivers/gpu/drm/scheduler/sched_fence.c
> > +++ b/drivers/gpu/drm/scheduler/sched_fence.c
> > @@ -82,6 +82,25 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
> >   	dma_fence_signal(&fence->finished);
> >   }
> >   
> > +/**
> > + * drm_sched_fence_get_runtime - accumulated runtime on HW
> > + * @fence: fence
> > + *
> > + * Calculate how much runtime this fence has accumulated on the HW.
> > + */
> > +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence)
> > +{
> > +	/* When the fence is not scheduled, it can't have accumulated runtime */
> > +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
> > +		return ns_to_ktime(0);
> > +
> > +	/* When it is still running, calculate runtime until now */
> > +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags))
> > +		return ktime_sub(ktime_get(), fence->scheduled.timestamp);
> > +
> > +	return ktime_sub(fence->finished.timestamp, fence->scheduled.timestamp);
> > +}
> > +
> >   static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
> >   {
> >   	return "drm_sched";
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 5acc64954a88..52bcff324a92 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -238,6 +238,35 @@ struct drm_sched_entity {
> >   	 */
> >   	struct rb_node			rb_tree_node;
> >   
> > +	/**
> > +	 * @accounting_lock:
> > +	 *
> > +	 * Protects the array of fences tracking the in-flight HW submissions
> > +	 * and the accumulator counter.
> > +	 */
> > +	spinlock_t			accounting_lock;
> > +
> > +	/**
> > +	 * @hw_time_used:
> > +	 *
> > +	 * How much HW runtime has been accumulated by retired submissions
> > +	 * from this entity.
> > +	 */
> > +	ktime_t				hw_time_used;
> > +
> > +	/**
> > +	 * @max_hw_submissions:
> > +	 *
> > +	 * Maximum number of submissions queued in the HW.
> > +	 */
> > +	unsigned int			max_hw_submissions;
> > +
> > +	/**
> > +	 * @hw_submissions:
> > +	 *
> > +	 * Scheduler fences of the HW submissions in flight.
> > +	 */
> > +	struct drm_sched_fence		**hw_submissions;
> >   };
> >   
> >   /**
> > @@ -600,6 +629,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> >   			  struct drm_gpu_scheduler **sched_list,
> >   			  unsigned int num_sched_list,
> >   			  atomic_t *guilty);
> > +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity);
> >   long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
> >   void drm_sched_entity_fini(struct drm_sched_entity *entity);
> >   void drm_sched_entity_destroy(struct drm_sched_entity *entity);
> > @@ -620,6 +650,7 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
> >   void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
> >   			       struct dma_fence *parent);
> >   void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
> > +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence);
> >   
> >   unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
> >   void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
> > 
> > base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 58c8161289fe..d678d0b9b29e 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -62,7 +62,9 @@  int drm_sched_entity_init(struct drm_sched_entity *entity,
 			  unsigned int num_sched_list,
 			  atomic_t *guilty)
 {
-	if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
+	unsigned int i, num_submissions = 0;
+
+	if (!entity || !sched_list || (num_sched_list && !sched_list[0]))
 		return -EINVAL;
 
 	memset(entity, 0, sizeof(struct drm_sched_entity));
@@ -98,6 +100,11 @@  int drm_sched_entity_init(struct drm_sched_entity *entity,
 						 (s32) DRM_SCHED_PRIORITY_KERNEL);
 		}
 		entity->rq = sched_list[0]->sched_rq[entity->priority];
+
+		for (i = 0; i < num_sched_list; ++i) {
+			num_submissions = max(num_submissions,
+					      sched_list[i]->credit_limit);
+		}
 	}
 
 	init_completion(&entity->entity_idle);
@@ -110,11 +117,52 @@  int drm_sched_entity_init(struct drm_sched_entity *entity,
 
 	atomic_set(&entity->fence_seq, 0);
 	entity->fence_context = dma_fence_context_alloc(2);
+	spin_lock_init(&entity->accounting_lock);
+
+	if (!num_submissions)
+		return 0;
+
+	entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
+					 GFP_KERNEL);
+	if (!entity->hw_submissions)
+		return -ENOMEM;
+	entity->max_hw_submissions = num_submissions; /* alloc succeeded */
 
 	return 0;
 }
 EXPORT_SYMBOL(drm_sched_entity_init);
 
+/**
+ * drm_sched_entity_time_spent - Accumulated HW runtime used by this entity
+ * @entity: scheduler entity to check
+ *
+ * Sum of the runtime of all retired submissions plus the runtime that the
+ * still in-flight submissions of this entity have accumulated so far.
+ */
+ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
+{
+	unsigned int slot;
+	ktime_t total;
+
+	if (!entity->max_hw_submissions)
+		return ns_to_ktime(0);
+
+	spin_lock(&entity->accounting_lock);
+	total = entity->hw_time_used;
+	for (slot = 0; slot < entity->max_hw_submissions; ++slot) {
+		struct drm_sched_fence *f = entity->hw_submissions[slot];
+
+		/* Slots that never saw a submission are still NULL */
+		if (f)
+			total = ktime_add(total,
+					  drm_sched_fence_get_runtime(f));
+	}
+	spin_unlock(&entity->accounting_lock);
+
+	return total;
+}
+EXPORT_SYMBOL(drm_sched_entity_time_spent);
+
 /**
  * drm_sched_entity_modify_sched - Modify sched of an entity
  * @entity: scheduler entity to init
@@ -326,6 +374,8 @@  EXPORT_SYMBOL(drm_sched_entity_flush);
  */
 void drm_sched_entity_fini(struct drm_sched_entity *entity)
 {
+	unsigned int i;
+
 	/*
 	 * If consumption of existing IBs wasn't completed. Forcefully remove
 	 * them here. Also makes sure that the scheduler won't touch this entity
@@ -341,6 +391,15 @@  void drm_sched_entity_fini(struct drm_sched_entity *entity)
 
 	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
 	RCU_INIT_POINTER(entity->last_scheduled, NULL);
+	/*
+	 * Unused slots are still NULL; &NULL->scheduled only works by
+	 * accident of member offset zero, so check the slot explicitly.
+	 */
+	for (i = 0; i < entity->max_hw_submissions; ++i) {
+		if (entity->hw_submissions[i])
+			dma_fence_put(&entity->hw_submissions[i]->scheduled);
+	}
+	kfree(entity->hw_submissions);
 }
 EXPORT_SYMBOL(drm_sched_entity_fini);
 
@@ -522,6 +575,33 @@  struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 	 */
 	sched_job->entity = NULL;
 
+	if (entity->max_hw_submissions) {
+		struct drm_sched_fence *fence = sched_job->s_fence;
+		unsigned int idx = fence->scheduled.seqno;
+
+		dma_fence_get(&fence->scheduled);
+		idx %= entity->max_hw_submissions;
+
+		spin_lock(&entity->accounting_lock);
+		/*
+		 * The fence seqno is dense and monotonically increasing. By
+		 * cycling through an array sized to match the maximum number
+		 * of submissions queued in the HW we can be sure that once we
+		 * need to reuse a slot the fence stored in it refers to a
+		 * retired submission, so we can safely sum up the accumulated
+		 * runtime and dispose of the fence.
+		 */
+		swap(fence, entity->hw_submissions[idx]);
+		if (fence) {
+			ktime_t runtime = drm_sched_fence_get_runtime(fence);
+
+			entity->hw_time_used = ktime_add(entity->hw_time_used,
+							 runtime);
+			dma_fence_put(&fence->scheduled);
+		}
+		spin_unlock(&entity->accounting_lock);
+	}
+
 	return sched_job;
 }
 
 
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
index 0f35f009b9d3..55981ada1829 100644
--- a/drivers/gpu/drm/scheduler/sched_fence.c
+++ b/drivers/gpu/drm/scheduler/sched_fence.c
@@ -82,6 +82,25 @@  void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
 	dma_fence_signal(&fence->finished);
 }
 
+/**
+ * drm_sched_fence_get_runtime - HW runtime consumed by this fence's job
+ * @fence: scheduler fence to query
+ *
+ * Time between the scheduled and finished timestamps, or up to now if running.
+ */
+ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence)
+{
+	/* No runtime is accumulated before the job reached the HW */
+	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
+		return ns_to_ktime(0);
+
+	/* Not finished yet: account everything up to the current time */
+	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags))
+		return ktime_sub(ktime_get(), fence->scheduled.timestamp);
+
+	return ktime_sub(fence->finished.timestamp, fence->scheduled.timestamp);
+}
+
 static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
 {
 	return "drm_sched";
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..52bcff324a92 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -238,6 +238,35 @@  struct drm_sched_entity {
 	 */
 	struct rb_node			rb_tree_node;
 
+	/**
+	 * @accounting_lock:
+	 *
+	 * Protects the array of fences tracking the in-flight HW submissions
+	 * and the accumulator counter.
+	 */
+	spinlock_t			accounting_lock;
+
+	/**
+	 * @hw_time_used:
+	 *
+	 * How much HW runtime has been accumulated by retired submissions
+	 * from this entity.
+	 */
+	ktime_t				hw_time_used;
+
+	/**
+	 * @max_hw_submissions:
+	 *
+	 * Maximum number of submissions queued in the HW.
+	 */
+	unsigned int			max_hw_submissions;
+
+	/**
+	 * @hw_submissions:
+	 *
+	 * Scheduler fences of the HW submissions in flight.
+	 */
+	struct drm_sched_fence		**hw_submissions;
 };
 
 /**
@@ -600,6 +629,7 @@  int drm_sched_entity_init(struct drm_sched_entity *entity,
 			  struct drm_gpu_scheduler **sched_list,
 			  unsigned int num_sched_list,
 			  atomic_t *guilty);
+ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity);
 long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
 void drm_sched_entity_fini(struct drm_sched_entity *entity);
 void drm_sched_entity_destroy(struct drm_sched_entity *entity);
@@ -620,6 +650,7 @@  void drm_sched_fence_free(struct drm_sched_fence *fence);
 void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
 			       struct dma_fence *parent);
 void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
+ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence);
 
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,