[RFC,16/18] drm/sched: Connect with dma-fence deadlines

Message ID 20250108183528.41007-17-tvrtko.ursulin@igalia.com (mailing list archive)
State New
Series Deadline scheduler and other ideas

Commit Message

Tvrtko Ursulin Jan. 8, 2025, 6:35 p.m. UTC
Now that the scheduling policy is deadline based, it feels completely
natural to allow propagating externally set deadlines to the scheduler.

Scheduler deadlines are not a guarantee, but since the dma-fence deadline
facility is already in use by userspace, let's wire it up.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Danilo Krummrich <dakr@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Philipp Stanner <pstanner@redhat.com>
Cc: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/scheduler/sched_entity.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)
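
For reference, the deadline consumed by this patch is recorded by the
scheduler fence's dma_fence_ops->set_deadline callback. A simplified sketch
of that pre-existing mechanism, modeled on drm_sched_fence_set_deadline()
in sched_fence.c (locking and propagation to the hardware fence elided
here):

static void drm_sched_fence_set_deadline(struct dma_fence *f,
					 ktime_t deadline)
{
	struct drm_sched_fence *fence = to_drm_sched_fence(f);

	/* An earlier deadline may already be recorded; keep it. */
	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags) &&
	    ktime_before(fence->deadline, deadline))
		return;

	fence->deadline = deadline;
	set_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags);
}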

Comments

Christian König Jan. 9, 2025, 1:07 p.m. UTC | #1
Am 08.01.25 um 19:35 schrieb Tvrtko Ursulin:
> Now that the scheduling policy is deadline based, it feels completely
> natural to allow propagating externally set deadlines to the scheduler.
>
> Scheduler deadlines are not a guarantee, but since the dma-fence deadline
> facility is already in use by userspace, let's wire it up.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Danilo Krummrich <dakr@redhat.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Philipp Stanner <pstanner@redhat.com>
> Cc: Rob Clark <robdclark@gmail.com>
> ---
>   drivers/gpu/drm/scheduler/sched_entity.c | 24 ++++++++++++++++++++++--
>   1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 98c78d1373d8..db5d34310b18 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -410,7 +410,16 @@ ktime_t
>   drm_sched_entity_get_job_deadline(struct drm_sched_entity *entity,
>   				  struct drm_sched_job *job)
>   {
> -	return __drm_sched_entity_get_job_deadline(entity, job->submit_ts);
> +	struct drm_sched_fence *s_fence = job->s_fence;
> +	struct dma_fence *fence = &s_fence->finished;
> +	ktime_t deadline;
> +
> +	deadline = __drm_sched_entity_get_job_deadline(entity, job->submit_ts);
> +	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) &&
> +	    ktime_before(s_fence->deadline, deadline))
> +		deadline = s_fence->deadline;
> +
> +	return deadline;
>   }
>   
>   /*
> @@ -579,9 +588,12 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
>    */
>   void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>   {
> +	struct drm_sched_fence *s_fence = sched_job->s_fence;
>   	struct drm_sched_entity *entity = sched_job->entity;
> -	bool first;
> +	struct dma_fence *fence = &s_fence->finished;
> +	ktime_t fence_deadline;
>   	ktime_t submit_ts;
> +	bool first;
>   
>   	trace_drm_sched_job(sched_job, entity);
>   	atomic_inc(entity->rq->sched->score);
> @@ -593,6 +605,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>   	 * Make sure to set the submit_ts first, to avoid a race.
>   	 */
>   	sched_job->submit_ts = submit_ts = ktime_get();
> +	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags))
> +		fence_deadline = s_fence->deadline;
> +	else
> +		fence_deadline = KTIME_MAX;
> +

That makes no sense. When the job is pushed the fence is not made public 
yet.

So no deadline can be set on the fence.

>   	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>   
>   	/* first job wakes up scheduler */
> @@ -601,6 +618,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>   
>   		submit_ts = __drm_sched_entity_get_job_deadline(entity,
>   								submit_ts);
> +		if (ktime_before(fence_deadline, submit_ts))
> +			submit_ts = fence_deadline;
> +

Yeah, that won't work at all as far as I can see.

Regards,
Christian.

>   		sched = drm_sched_rq_add_entity(entity->rq, entity, submit_ts);
>   		if (sched)
>   			drm_sched_wakeup(sched);
Tvrtko Ursulin Jan. 9, 2025, 1:41 p.m. UTC | #2
On 09/01/2025 13:07, Christian König wrote:
> Am 08.01.25 um 19:35 schrieb Tvrtko Ursulin:
>> Now that the scheduling policy is deadline based, it feels completely
>> natural to allow propagating externally set deadlines to the scheduler.
>>
>> Scheduler deadlines are not a guarantee, but since the dma-fence deadline
>> facility is already in use by userspace, let's wire it up.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> Cc: Christian König <christian.koenig@amd.com>
>> Cc: Danilo Krummrich <dakr@redhat.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Philipp Stanner <pstanner@redhat.com>
>> Cc: Rob Clark <robdclark@gmail.com>
>> ---
>>   drivers/gpu/drm/scheduler/sched_entity.c | 24 ++++++++++++++++++++++--
>>   1 file changed, 22 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
>> b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 98c78d1373d8..db5d34310b18 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -410,7 +410,16 @@ ktime_t
>>   drm_sched_entity_get_job_deadline(struct drm_sched_entity *entity,
>>                     struct drm_sched_job *job)
>>   {
>> -    return __drm_sched_entity_get_job_deadline(entity, job->submit_ts);
>> +    struct drm_sched_fence *s_fence = job->s_fence;
>> +    struct dma_fence *fence = &s_fence->finished;
>> +    ktime_t deadline;
>> +
>> +    deadline = __drm_sched_entity_get_job_deadline(entity, 
>> job->submit_ts);
>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, 
>> &fence->flags) &&
>> +        ktime_before(s_fence->deadline, deadline))
>> +        deadline = s_fence->deadline;
>> +
>> +    return deadline;
>>   }
>>   /*
>> @@ -579,9 +588,12 @@ void drm_sched_entity_select_rq(struct 
>> drm_sched_entity *entity)
>>    */
>>   void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>   {
>> +    struct drm_sched_fence *s_fence = sched_job->s_fence;
>>       struct drm_sched_entity *entity = sched_job->entity;
>> -    bool first;
>> +    struct dma_fence *fence = &s_fence->finished;
>> +    ktime_t fence_deadline;
>>       ktime_t submit_ts;
>> +    bool first;
>>       trace_drm_sched_job(sched_job, entity);
>>       atomic_inc(entity->rq->sched->score);
>> @@ -593,6 +605,11 @@ void drm_sched_entity_push_job(struct 
>> drm_sched_job *sched_job)
>>        * Make sure to set the submit_ts first, to avoid a race.
>>        */
>>       sched_job->submit_ts = submit_ts = ktime_get();
>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags))
>> +        fence_deadline = s_fence->deadline;
>> +    else
>> +        fence_deadline = KTIME_MAX;
>> +
> 
> That makes no sense. When the job is pushed the fence is not made public 
> yet.
> 
> So no deadline can be set on the fence.

You are correct, the push side of things was a mistake; out of laziness I
did not remove it from the RFC.

>>       first = spsc_queue_push(&entity->job_queue, 
>> &sched_job->queue_node);
>>       /* first job wakes up scheduler */
>> @@ -601,6 +618,9 @@ void drm_sched_entity_push_job(struct 
>> drm_sched_job *sched_job)
>>           submit_ts = __drm_sched_entity_get_job_deadline(entity,
>>                                   submit_ts);
>> +        if (ktime_before(fence_deadline, submit_ts))
>> +            submit_ts = fence_deadline;
>> +
> 
> Yeah, that won't work at all as far as I can see.

It works from the pop side though.

When job N is scheduled, the deadline is taken from N+1 and the tree is
re-balanced. At the point N is scheduled, N+1 can definitely have a real
deadline set.

What does not work is a queue depth of one. There is currently no way to
"bump" the entity in the tree while N is waiting for submission, because
we cannot safely dereference the entity from the job. (I had that in v1
of the series and realized it was unsafe.)
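
A rough sketch of the pop side described above (hypothetical code; only
drm_sched_entity_get_job_deadline() is from this series, the surrounding
helper name is an assumption): once job N is popped, the entity is re-keyed
in the run-queue tree on the effective deadline of the next queued job,
which by that point may carry a userspace-set fence deadline.

	/* In the pop path, after dequeuing job N: */
	struct drm_sched_job *next =
		to_drm_sched_job(spsc_queue_peek(&entity->job_queue));

	if (next) {
		/* Min of queue-based deadline and any fence deadline. */
		ktime_t ts = drm_sched_entity_get_job_deadline(entity, next);

		/* Hypothetical helper: re-balance the rq rbtree. */
		drm_sched_rq_update_deadline_locked(entity->rq, entity, ts);
	}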

I (very) briefly thought about reference counting entities but quickly had
a feeling it would be annoying. So for now this patch only offers a
partial solution.

Regards,

Tvrtko

>>           sched = drm_sched_rq_add_entity(entity->rq, entity, submit_ts);
>>           if (sched)
>>               drm_sched_wakeup(sched);
>
Christian König Jan. 9, 2025, 1:51 p.m. UTC | #3
Am 09.01.25 um 14:41 schrieb Tvrtko Ursulin:
>
> On 09/01/2025 13:07, Christian König wrote:
>> Am 08.01.25 um 19:35 schrieb Tvrtko Ursulin:
>>> Now that the scheduling policy is deadline based, it feels completely
>>> natural to allow propagating externally set deadlines to the scheduler.
>>>
>>> Scheduler deadlines are not a guarantee, but since the dma-fence deadline
>>> facility is already in use by userspace, let's wire it up.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>> Cc: Christian König <christian.koenig@amd.com>
>>> Cc: Danilo Krummrich <dakr@redhat.com>
>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>> Cc: Philipp Stanner <pstanner@redhat.com>
>>> Cc: Rob Clark <robdclark@gmail.com>
>>> ---
>>>   drivers/gpu/drm/scheduler/sched_entity.c | 24 
>>> ++++++++++++++++++++++--
>>>   1 file changed, 22 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
>>> b/drivers/gpu/drm/scheduler/sched_entity.c
>>> index 98c78d1373d8..db5d34310b18 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>> @@ -410,7 +410,16 @@ ktime_t
>>>   drm_sched_entity_get_job_deadline(struct drm_sched_entity *entity,
>>>                     struct drm_sched_job *job)
>>>   {
>>> -    return __drm_sched_entity_get_job_deadline(entity, 
>>> job->submit_ts);
>>> +    struct drm_sched_fence *s_fence = job->s_fence;
>>> +    struct dma_fence *fence = &s_fence->finished;
>>> +    ktime_t deadline;
>>> +
>>> +    deadline = __drm_sched_entity_get_job_deadline(entity, 
>>> job->submit_ts);
>>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, 
>>> &fence->flags) &&
>>> +        ktime_before(s_fence->deadline, deadline))
>>> +        deadline = s_fence->deadline;
>>> +
>>> +    return deadline;
>>>   }
>>>   /*
>>> @@ -579,9 +588,12 @@ void drm_sched_entity_select_rq(struct 
>>> drm_sched_entity *entity)
>>>    */
>>>   void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>>   {
>>> +    struct drm_sched_fence *s_fence = sched_job->s_fence;
>>>       struct drm_sched_entity *entity = sched_job->entity;
>>> -    bool first;
>>> +    struct dma_fence *fence = &s_fence->finished;
>>> +    ktime_t fence_deadline;
>>>       ktime_t submit_ts;
>>> +    bool first;
>>>       trace_drm_sched_job(sched_job, entity);
>>>       atomic_inc(entity->rq->sched->score);
>>> @@ -593,6 +605,11 @@ void drm_sched_entity_push_job(struct 
>>> drm_sched_job *sched_job)
>>>        * Make sure to set the submit_ts first, to avoid a race.
>>>        */
>>>       sched_job->submit_ts = submit_ts = ktime_get();
>>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, 
>>> &fence->flags))
>>> +        fence_deadline = s_fence->deadline;
>>> +    else
>>> +        fence_deadline = KTIME_MAX;
>>> +
>>
>> That makes no sense. When the job is pushed the fence is not made 
>> public yet.
>>
>> So no deadline can be set on the fence.
>
> You are correct, the push side of things was a mistake; out of laziness
> I did not remove it from the RFC.
>
>>>       first = spsc_queue_push(&entity->job_queue, 
>>> &sched_job->queue_node);
>>>       /* first job wakes up scheduler */
>>> @@ -601,6 +618,9 @@ void drm_sched_entity_push_job(struct 
>>> drm_sched_job *sched_job)
>>>           submit_ts = __drm_sched_entity_get_job_deadline(entity,
>>>                                   submit_ts);
>>> +        if (ktime_before(fence_deadline, submit_ts))
>>> +            submit_ts = fence_deadline;
>>> +
>>
>> Yeah, that won't work at all as far as I can see.
>
> It works from the pop side though.

Yeah, but only partially.

>
> When job N is scheduled, the deadline is taken from N+1 and the tree is
> re-balanced. At the point N is scheduled, N+1 can definitely have a
> real deadline set.

The fundamental design problem with the fence deadline approach is that
it sets the deadline only on the last submission instead of the first one.

Unigine Heaven, for example, uses 3 submissions on a typical system.

We would somehow need to propagate a deadline to previous submissions
for this to work halfway correctly.
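
(One conceivable direction, sketched here purely as an illustration and
not as series code: when a deadline lands on a scheduler fence, walk the
job's dependency fences and forward it via dma_fence_set_deadline(). The
helper name is hypothetical; the dependencies xarray and
dma_fence_set_deadline() are existing kernel constructs.)

/* Hypothetical helper: forward a deadline to earlier submissions. */
static void drm_sched_job_propagate_deadline(struct drm_sched_job *job,
					     ktime_t deadline)
{
	struct dma_fence *fence;
	unsigned long index;

	xa_for_each(&job->dependencies, index, fence)
		dma_fence_set_deadline(fence, deadline);
}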

> What does not work is a queue depth of one. There is currently no way to
> "bump" the entity in the tree while N is waiting for submission, because
> we cannot safely dereference the entity from the job. (I had that in v1
> of the series and realized it was unsafe.)
>
> I (very) briefly thought about reference counting entities but quickly
> had a feeling it would be annoying. So for now this patch only offers
> a partial solution.

Nah, please not.

Regards,
Christian.

>
> Regards,
>
> Tvrtko
>
>>>           sched = drm_sched_rq_add_entity(entity->rq, entity, 
>>> submit_ts);
>>>           if (sched)
>>>               drm_sched_wakeup(sched);
>>
Tvrtko Ursulin Jan. 9, 2025, 4:03 p.m. UTC | #4
On 09/01/2025 13:51, Christian König wrote:
> Am 09.01.25 um 14:41 schrieb Tvrtko Ursulin:
>>
>> On 09/01/2025 13:07, Christian König wrote:
>>> Am 08.01.25 um 19:35 schrieb Tvrtko Ursulin:
>>>> Now that the scheduling policy is deadline based, it feels completely
>>>> natural to allow propagating externally set deadlines to the scheduler.
>>>>
>>>> Scheduler deadlines are not a guarantee, but since the dma-fence deadline
>>>> facility is already in use by userspace, let's wire it up.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>>> Cc: Christian König <christian.koenig@amd.com>
>>>> Cc: Danilo Krummrich <dakr@redhat.com>
>>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>>> Cc: Philipp Stanner <pstanner@redhat.com>
>>>> Cc: Rob Clark <robdclark@gmail.com>
>>>> ---
>>>>   drivers/gpu/drm/scheduler/sched_entity.c | 24 
>>>> ++++++++++++++++++++++--
>>>>   1 file changed, 22 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
>>>> b/drivers/gpu/drm/scheduler/sched_entity.c
>>>> index 98c78d1373d8..db5d34310b18 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>>> @@ -410,7 +410,16 @@ ktime_t
>>>>   drm_sched_entity_get_job_deadline(struct drm_sched_entity *entity,
>>>>                     struct drm_sched_job *job)
>>>>   {
>>>> -    return __drm_sched_entity_get_job_deadline(entity, 
>>>> job->submit_ts);
>>>> +    struct drm_sched_fence *s_fence = job->s_fence;
>>>> +    struct dma_fence *fence = &s_fence->finished;
>>>> +    ktime_t deadline;
>>>> +
>>>> +    deadline = __drm_sched_entity_get_job_deadline(entity, 
>>>> job->submit_ts);
>>>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, 
>>>> &fence->flags) &&
>>>> +        ktime_before(s_fence->deadline, deadline))
>>>> +        deadline = s_fence->deadline;
>>>> +
>>>> +    return deadline;
>>>>   }
>>>>   /*
>>>> @@ -579,9 +588,12 @@ void drm_sched_entity_select_rq(struct 
>>>> drm_sched_entity *entity)
>>>>    */
>>>>   void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>>>   {
>>>> +    struct drm_sched_fence *s_fence = sched_job->s_fence;
>>>>       struct drm_sched_entity *entity = sched_job->entity;
>>>> -    bool first;
>>>> +    struct dma_fence *fence = &s_fence->finished;
>>>> +    ktime_t fence_deadline;
>>>>       ktime_t submit_ts;
>>>> +    bool first;
>>>>       trace_drm_sched_job(sched_job, entity);
>>>>       atomic_inc(entity->rq->sched->score);
>>>> @@ -593,6 +605,11 @@ void drm_sched_entity_push_job(struct 
>>>> drm_sched_job *sched_job)
>>>>        * Make sure to set the submit_ts first, to avoid a race.
>>>>        */
>>>>       sched_job->submit_ts = submit_ts = ktime_get();
>>>> +    if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, 
>>>> &fence->flags))
>>>> +        fence_deadline = s_fence->deadline;
>>>> +    else
>>>> +        fence_deadline = KTIME_MAX;
>>>> +
>>>
>>> That makes no sense. When the job is pushed the fence is not made 
>>> public yet.
>>>
>>> So no deadline can be set on the fence.
>>
>> You are correct, the push side of things was a mistake; out of laziness
>> I did not remove it from the RFC.
>>
>>>>       first = spsc_queue_push(&entity->job_queue, 
>>>> &sched_job->queue_node);
>>>>       /* first job wakes up scheduler */
>>>> @@ -601,6 +618,9 @@ void drm_sched_entity_push_job(struct 
>>>> drm_sched_job *sched_job)
>>>>           submit_ts = __drm_sched_entity_get_job_deadline(entity,
>>>>                                   submit_ts);
>>>> +        if (ktime_before(fence_deadline, submit_ts))
>>>> +            submit_ts = fence_deadline;
>>>> +
>>>
>>> Yeah, that won't work at all as far as I can see.
>>
>> It works from the pop side though.
> 
> Yeah, but only partially.
> 
>>
>> When job N is scheduled, the deadline is taken from N+1 and the tree is
>> re-balanced. At the point N is scheduled, N+1 can definitely have a
>> real deadline set.
> 
> The fundamental design problem with the fence deadline approach is that
> it sets the deadline only on the last submission instead of the first one.
>
> Unigine Heaven, for example, uses 3 submissions on a typical system.
>
> We would somehow need to propagate a deadline to previous submissions
> for this to work halfway correctly.

I played with this in one of my branches but was lacking a good test
case. In any case, a clean solution would not be easy with the current
scheduler design.

Regards,

Tvrtko

>> What does not work is a queue depth of one. There is currently no way to
>> "bump" the entity in the tree while N is waiting for submission, because
>> we cannot safely dereference the entity from the job. (I had that in v1
>> of the series and realized it was unsafe.)
>>
>> I (very) briefly thought about reference counting entities but quickly
>> had a feeling it would be annoying. So for now this patch only offers
>> a partial solution.
> 
> Nah, please not.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>>>           sched = drm_sched_rq_add_entity(entity->rq, entity, 
>>>> submit_ts);
>>>>           if (sched)
>>>>               drm_sched_wakeup(sched);
>>>
>

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 98c78d1373d8..db5d34310b18 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -410,7 +410,16 @@  ktime_t
 drm_sched_entity_get_job_deadline(struct drm_sched_entity *entity,
 				  struct drm_sched_job *job)
 {
-	return __drm_sched_entity_get_job_deadline(entity, job->submit_ts);
+	struct drm_sched_fence *s_fence = job->s_fence;
+	struct dma_fence *fence = &s_fence->finished;
+	ktime_t deadline;
+
+	deadline = __drm_sched_entity_get_job_deadline(entity, job->submit_ts);
+	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) &&
+	    ktime_before(s_fence->deadline, deadline))
+		deadline = s_fence->deadline;
+
+	return deadline;
 }
 
 /*
@@ -579,9 +588,12 @@  void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
  */
 void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 {
+	struct drm_sched_fence *s_fence = sched_job->s_fence;
 	struct drm_sched_entity *entity = sched_job->entity;
-	bool first;
+	struct dma_fence *fence = &s_fence->finished;
+	ktime_t fence_deadline;
 	ktime_t submit_ts;
+	bool first;
 
 	trace_drm_sched_job(sched_job, entity);
 	atomic_inc(entity->rq->sched->score);
@@ -593,6 +605,11 @@  void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 	 * Make sure to set the submit_ts first, to avoid a race.
 	 */
 	sched_job->submit_ts = submit_ts = ktime_get();
+	if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags))
+		fence_deadline = s_fence->deadline;
+	else
+		fence_deadline = KTIME_MAX;
+
 	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	/* first job wakes up scheduler */
@@ -601,6 +618,9 @@  void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 
 		submit_ts = __drm_sched_entity_get_job_deadline(entity,
 								submit_ts);
+		if (ktime_before(fence_deadline, submit_ts))
+			submit_ts = fence_deadline;
+
 		sched = drm_sched_rq_add_entity(entity->rq, entity, submit_ts);
 		if (sched)
 			drm_sched_wakeup(sched);
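
For completeness, a sketch of how userspace can attach such a deadline to
a fence, via the sync_file SYNC_IOC_SET_DEADLINE ioctl (assuming a kernel
that exposes it; error handling trimmed). The deadline is an absolute
CLOCK_MONOTONIC timestamp in nanoseconds:

#include <stdint.h>
#include <sys/ioctl.h>
#include <time.h>
#include <linux/sync_file.h>

/* Ask for fence completion within ~one 60 Hz frame from now. */
static int set_frame_deadline(int sync_file_fd)
{
	struct sync_set_deadline arg = {0};
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	arg.deadline_ns = (uint64_t)ts.tv_sec * 1000000000ull +
			  (uint64_t)ts.tv_nsec + 16666667; /* +16.6 ms */

	return ioctl(sync_file_fd, SYNC_IOC_SET_DEADLINE, &arg);
}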