drm/i915: Copy across scheduler behaviour flags across submit fences
diff mbox series

Message ID 20191127111742.3271036-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • drm/i915: Copy across scheduler behaviour flags across submit fences
Related show

Commit Message

Chris Wilson Nov. 27, 2019, 11:17 a.m. UTC
We want the bonded request to have the same scheduler properties as its
master so that it is placed at the same depth in the queue. For example,
consider we have requests A, B and B', where B & B' are a bonded pair to
run in parallel on two engines.

	A -> B
     	     \- B'

B will run after A and so may be scheduled on an idle engine and wait on
A using a semaphore. B' sees B being executed and so enters the queue on
the same engine as A. As B' did not inherit the semaphore-chain from B,
it may have higher precedence than A and so preempts execution. However,
B' then sits on a semaphore waiting for B, who is waiting for A, who is
blocked by B.

Ergo B' needs to inherit the scheduler properties from B (i.e. the
semaphore chain) so that it is scheduled with the same priority as B and
will not be executed ahead of Bs dependencies.

Furthermore, to prevent the priorities changing via the expose fence on
B', we need to couple in the dependencies for PI. This requires us to
relax our sanity-checks that dependencies are strictly in order.

Fixes: ee1136908e9b ("drm/i915/execlists: Virtual engine bonding")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_request.c   | 46 ++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_scheduler.c |  1 -
 2 files changed, 38 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Nov. 27, 2019, 1:46 p.m. UTC | #1
On 27/11/2019 11:17, Chris Wilson wrote:
> We want the bonded request to have the same scheduler properties as its
> master so that it is placed at the same depth in the queue. For example,
> consider we have requests A, B and B', where B & B' are a bonded pair to
> run in parallel on two engines.
> 
> 	A -> B
>       	     \- B'
> 
> B will run after A and so may be scheduled on an idle engine and wait on
> A using a semaphore. B' sees B being executed and so enters the queue on
> the same engine as A. As B' did not inherit the semaphore-chain from B,
> it may have higher precedence than A and so preempts execution. However,
> B' then sits on a semaphore waiting for B, who is waiting for A, who is
> blocked by B.
> 
> Ergo B' needs to inherit the scheduler properties from B (i.e. the
> semaphore chain) so that it is scheduled with the same priority as B and
> will not be executed ahead of Bs dependencies.
> 
> Furthermore, to prevent the priorities changing via the expose fence on
> B', we need to couple in the dependencies for PI. This requires us to
> relax our sanity-checks that dependencies are strictly in order.

Good catch, this needed some deep thinking! And it looks okay, even 
though ideally we would be able to fix it not to signal the submit fence 
until semaphore was completed. But for that I think we would need to 
emit a request while emitting a request, so that the semaphore wait 
would be in its own.

Once this passes BAT I'll pass it on to customer to try.

Regards,

Tvrtko

> Fixes: ee1136908e9b ("drm/i915/execlists: Virtual engine bonding")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c   | 46 ++++++++++++++++++++++-----
>   drivers/gpu/drm/i915/i915_scheduler.c |  1 -
>   2 files changed, 38 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index a558f64186fa..93bea6987e57 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -300,11 +300,11 @@ void i915_request_retire_upto(struct i915_request *rq)
>   }
>   
>   static int
> -__i915_request_await_execution(struct i915_request *rq,
> -			       struct i915_request *signal,
> -			       void (*hook)(struct i915_request *rq,
> -					    struct dma_fence *signal),
> -			       gfp_t gfp)
> +__await_execution(struct i915_request *rq,
> +		  struct i915_request *signal,
> +		  void (*hook)(struct i915_request *rq,
> +			       struct dma_fence *signal),
> +		  gfp_t gfp)
>   {
>   	struct execute_cb *cb;
>   
> @@ -341,6 +341,8 @@ __i915_request_await_execution(struct i915_request *rq,
>   	}
>   	spin_unlock_irq(&signal->lock);
>   
> +	/* Copy across semaphore status as we need the same behaviour */
> +	rq->sched.flags |= signal->sched.flags;
>   	return 0;
>   }
>   
> @@ -843,7 +845,7 @@ emit_semaphore_wait(struct i915_request *to,
>   		goto await_fence;
>   
>   	/* Only submit our spinner after the signaler is running! */
> -	if (__i915_request_await_execution(to, from, NULL, gfp))
> +	if (__await_execution(to, from, NULL, gfp))
>   		goto await_fence;
>   
>   	/* We need to pin the signaler's HWSP until we are finished reading. */
> @@ -993,6 +995,35 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
>   	return 0;
>   }
>   
> +static int
> +__i915_request_await_execution(struct i915_request *rq,
> +			       struct i915_request *signal,
> +			       void (*hook)(struct i915_request *rq,
> +					    struct dma_fence *signal))
> +{
> +	int err;
> +
> +	err = __await_execution(rq, signal, hook, I915_FENCE_GFP);
> +	if (err)
> +		return err;
> +
> +	if (!rq->engine->schedule)
> +		return 0;
> +
> +	/* Squash repeated depenendices to the same timelines */
> +	if (signal->fence.context &&
> +	    intel_timeline_sync_is_later(i915_request_timeline(rq),
> +					 &signal->fence))
> +		return 0;
> +
> +	/* Couple the dependency tree for PI on this exposed rq->fence */
> +	err = i915_sched_node_add_dependency(&rq->sched, &signal->sched);
> +	if (err < 0)
> +		return err;
> +
> +	return 0;
> +}
> +
>   int
>   i915_request_await_execution(struct i915_request *rq,
>   			     struct dma_fence *fence,
> @@ -1026,8 +1057,7 @@ i915_request_await_execution(struct i915_request *rq,
>   		if (dma_fence_is_i915(fence))
>   			ret = __i915_request_await_execution(rq,
>   							     to_request(fence),
> -							     hook,
> -							     I915_FENCE_GFP);
> +							     hook);
>   		else
>   			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
>   							    I915_FENCE_TIMEOUT,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 1937a26d412f..2bc2aa46a1b9 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -484,7 +484,6 @@ void i915_sched_node_fini(struct i915_sched_node *node)
>   	 * so we may be called out-of-order.
>   	 */
>   	list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
> -		GEM_BUG_ON(!node_signaled(dep->signaler));
>   		GEM_BUG_ON(!list_empty(&dep->dfs_link));
>   
>   		list_del(&dep->wait_link);
>
Chris Wilson Nov. 27, 2019, 2:04 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-11-27 13:46:14)
> 
> On 27/11/2019 11:17, Chris Wilson wrote:
> > We want the bonded request to have the same scheduler properties as its
> > master so that it is placed at the same depth in the queue. For example,
> > consider we have requests A, B and B', where B & B' are a bonded pair to
> > run in parallel on two engines.
> > 
> >       A -> B
> >                    \- B'
> > 
> > B will run after A and so may be scheduled on an idle engine and wait on
> > A using a semaphore. B' sees B being executed and so enters the queue on
> > the same engine as A. As B' did not inherit the semaphore-chain from B,
> > it may have higher precedence than A and so preempts execution. However,
> > B' then sits on a semaphore waiting for B, who is waiting for A, who is
> > blocked by B.
> > 
> > Ergo B' needs to inherit the scheduler properties from B (i.e. the
> > semaphore chain) so that it is scheduled with the same priority as B and
> > will not be executed ahead of Bs dependencies.
> > 
> > Furthermore, to prevent the priorities changing via the expose fence on
> > B', we need to couple in the dependencies for PI. This requires us to
> > relax our sanity-checks that dependencies are strictly in order.
> 
> Good catch, this needed some deep thinking! And it looks okay, even 
> though ideally we would be able to fix it not to signal the submit fence 
> until semaphore was completed. But for that I think we would need to 
> emit a request while emitting a request, so that the semaphore wait 
> would be in its own.

At a push we could add an MI_USER_INTERRUPT after the initial breadcrumb
and couple the submit fence into that. That would be virtually
equivalent to emitting a separate request for semaphores. Something to
ponder over.
-Chris
Tvrtko Ursulin Nov. 27, 2019, 2:22 p.m. UTC | #3
On 27/11/2019 14:04, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-27 13:46:14)
>> On 27/11/2019 11:17, Chris Wilson wrote:
>>> We want the bonded request to have the same scheduler properties as its
>>> master so that it is placed at the same depth in the queue. For example,
>>> consider we have requests A, B and B', where B & B' are a bonded pair to
>>> run in parallel on two engines.
>>>
>>>        A -> B
>>>                     \- B'
>>>
>>> B will run after A and so may be scheduled on an idle engine and wait on
>>> A using a semaphore. B' sees B being executed and so enters the queue on
>>> the same engine as A. As B' did not inherit the semaphore-chain from B,
>>> it may have higher precedence than A and so preempts execution. However,
>>> B' then sits on a semaphore waiting for B, who is waiting for A, who is
>>> blocked by B.
>>>
>>> Ergo B' needs to inherit the scheduler properties from B (i.e. the
>>> semaphore chain) so that it is scheduled with the same priority as B and
>>> will not be executed ahead of Bs dependencies.
>>>
>>> Furthermore, to prevent the priorities changing via the expose fence on
>>> B', we need to couple in the dependencies for PI. This requires us to
>>> relax our sanity-checks that dependencies are strictly in order.
>>
>> Good catch, this needed some deep thinking! And it looks okay, even
>> though ideally we would be able to fix it not to signal the submit fence
>> until semaphore was completed. But for that I think we would need to
>> emit a request while emitting a request, so that the semaphore wait
>> would be in its own.
> 
> At a push we could add an MI_USER_INTERRUPT after the initial breadcrumb
> and couple the submit fence into that. That would be virtually
> equivalent to emitting a separate request for semaphores. Something to
> ponder over.

Hm, if not too difficult it would definitely be much preferable since 
relying on controlling preemption decisions feels a bit fragile/hackish.

Simply moving __notify_execute_cb from __i915_request_submit to 
intel_engine_breadcrumbs_irq, under a __i915_request_has_started check, 
could do it?

Regards,

Tvrtko
Chris Wilson Nov. 27, 2019, 2:37 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-11-27 14:22:37)
> 
> On 27/11/2019 14:04, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-27 13:46:14)
> >> On 27/11/2019 11:17, Chris Wilson wrote:
> >>> We want the bonded request to have the same scheduler properties as its
> >>> master so that it is placed at the same depth in the queue. For example,
> >>> consider we have requests A, B and B', where B & B' are a bonded pair to
> >>> run in parallel on two engines.
> >>>
> >>>        A -> B
> >>>                     \- B'
> >>>
> >>> B will run after A and so may be scheduled on an idle engine and wait on
> >>> A using a semaphore. B' sees B being executed and so enters the queue on
> >>> the same engine as A. As B' did not inherit the semaphore-chain from B,
> >>> it may have higher precedence than A and so preempts execution. However,
> >>> B' then sits on a semaphore waiting for B, who is waiting for A, who is
> >>> blocked by B.
> >>>
> >>> Ergo B' needs to inherit the scheduler properties from B (i.e. the
> >>> semaphore chain) so that it is scheduled with the same priority as B and
> >>> will not be executed ahead of Bs dependencies.
> >>>
> >>> Furthermore, to prevent the priorities changing via the expose fence on
> >>> B', we need to couple in the dependencies for PI. This requires us to
> >>> relax our sanity-checks that dependencies are strictly in order.
> >>
> >> Good catch, this needed some deep thinking! And it looks okay, even
> >> though ideally we would be able to fix it not to signal the submit fence
> >> until semaphore was completed. But for that I think we would need to
> >> emit a request while emitting a request, so that the semaphore wait
> >> would be in its own.
> > 
> > At a push we could add an MI_USER_INTERRUPT after the initial breadcrumb
> > and couple the submit fence into that. That would be virtually
> > equivalent to emitting a separate request for semaphores. Something to
> > ponder over.
> 
> Hm, if not too difficult it would definitely be much preferable since 
> relying on controlling preemption decisions feels a bit fragile/hackish.
> 
> Simply moving __notify_execute_cb from __i915_request_submit to 
> intel_engine_breadcrumbs_irq, under a __i915_request_has_started check, 
> could do it?

95% of the way, yes.
-Chris
Chris Wilson Nov. 27, 2019, 2:51 p.m. UTC | #5
Quoting Chris Wilson (2019-11-27 11:17:42)
> We want the bonded request to have the same scheduler properties as its
> master so that it is placed at the same depth in the queue. For example,
> consider we have requests A, B and B', where B & B' are a bonded pair to
> run in parallel on two engines.
> 
>         A -> B
>              \- B'
> 
> B will run after A and so may be scheduled on an idle engine and wait on
> A using a semaphore. B' sees B being executed and so enters the queue on
> the same engine as A. As B' did not inherit the semaphore-chain from B,
> it may have higher precedence than A and so preempts execution. However,
> B' then sits on a semaphore waiting for B, who is waiting for A, who is
> blocked by B.
> 
> Ergo B' needs to inherit the scheduler properties from B (i.e. the
> semaphore chain) so that it is scheduled with the same priority as B and
> will not be executed ahead of Bs dependencies.
> 
> Furthermore, to prevent the priorities changing via the expose fence on
> B', we need to couple in the dependencies for PI. This requires us to
> relax our sanity-checks that dependencies are strictly in order.
> 
> Fixes: ee1136908e9b ("drm/i915/execlists: Virtual engine bonding")
Testcase: igt/gem_exec_balancer/bonded-chain
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
-Chris
Tvrtko Ursulin Nov. 27, 2019, 3:21 p.m. UTC | #6
On 27/11/2019 14:37, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-27 14:22:37)
>>
>> On 27/11/2019 14:04, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-11-27 13:46:14)
>>>> On 27/11/2019 11:17, Chris Wilson wrote:
>>>>> We want the bonded request to have the same scheduler properties as its
>>>>> master so that it is placed at the same depth in the queue. For example,
>>>>> consider we have requests A, B and B', where B & B' are a bonded pair to
>>>>> run in parallel on two engines.
>>>>>
>>>>>         A -> B
>>>>>                      \- B'
>>>>>
>>>>> B will run after A and so may be scheduled on an idle engine and wait on
>>>>> A using a semaphore. B' sees B being executed and so enters the queue on
>>>>> the same engine as A. As B' did not inherit the semaphore-chain from B,
>>>>> it may have higher precedence than A and so preempts execution. However,
>>>>> B' then sits on a semaphore waiting for B, who is waiting for A, who is
>>>>> blocked by B.
>>>>>
>>>>> Ergo B' needs to inherit the scheduler properties from B (i.e. the
>>>>> semaphore chain) so that it is scheduled with the same priority as B and
>>>>> will not be executed ahead of Bs dependencies.
>>>>>
>>>>> Furthermore, to prevent the priorities changing via the expose fence on
>>>>> B', we need to couple in the dependencies for PI. This requires us to
>>>>> relax our sanity-checks that dependencies are strictly in order.
>>>>
>>>> Good catch, this needed some deep thinking! And it looks okay, even
>>>> though ideally we would be able to fix it not to signal the submit fence
>>>> until semaphore was completed. But for that I think we would need to
>>>> emit a request while emitting a request, so that the semaphore wait
>>>> would be in its own.
>>>
>>> At a push we could add an MI_USER_INTERRUPT after the initial breadcrumb
>>> and couple the submit fence into that. That would be virtually
>>> equivalent to emitting a separate request for semaphores. Something to
>>> ponder over.
>>
>> Hm, if not too difficult it would definitely be much preferable since
>> relying on controlling preemption decisions feels a bit fragile/hackish.
>>
>> Simply moving __notify_execute_cb from __i915_request_submit to
>> intel_engine_breadcrumbs_irq, under a __i915_request_has_started check,
>> could do it?
> 
> 95% of the way, yes.

Is the remaining 5% just a new flavour of __i915_request_has_started 
which does away with rcu_read_lock? :)

Regards,

Tvrtko
Chris Wilson Nov. 27, 2019, 3:44 p.m. UTC | #7
Quoting Tvrtko Ursulin (2019-11-27 15:21:30)
> 
> On 27/11/2019 14:37, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-27 14:22:37)
> >>
> >> On 27/11/2019 14:04, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2019-11-27 13:46:14)
> >>>> On 27/11/2019 11:17, Chris Wilson wrote:
> >>>>> We want the bonded request to have the same scheduler properties as its
> >>>>> master so that it is placed at the same depth in the queue. For example,
> >>>>> consider we have requests A, B and B', where B & B' are a bonded pair to
> >>>>> run in parallel on two engines.
> >>>>>
> >>>>>         A -> B
> >>>>>                      \- B'
> >>>>>
> >>>>> B will run after A and so may be scheduled on an idle engine and wait on
> >>>>> A using a semaphore. B' sees B being executed and so enters the queue on
> >>>>> the same engine as A. As B' did not inherit the semaphore-chain from B,
> >>>>> it may have higher precedence than A and so preempts execution. However,
> >>>>> B' then sits on a semaphore waiting for B, who is waiting for A, who is
> >>>>> blocked by B.
> >>>>>
> >>>>> Ergo B' needs to inherit the scheduler properties from B (i.e. the
> >>>>> semaphore chain) so that it is scheduled with the same priority as B and
> >>>>> will not be executed ahead of Bs dependencies.
> >>>>>
> >>>>> Furthermore, to prevent the priorities changing via the expose fence on
> >>>>> B', we need to couple in the dependencies for PI. This requires us to
> >>>>> relax our sanity-checks that dependencies are strictly in order.
> >>>>
> >>>> Good catch, this needed some deep thinking! And it looks okay, even
> >>>> though ideally we would be able to fix it not to signal the submit fence
> >>>> until semaphore was completed. But for that I think we would need to
> >>>> emit a request while emitting a request, so that the semaphore wait
> >>>> would be in its own.
> >>>
> >>> At a push we could add an MI_USER_INTERRUPT after the initial breadcrumb
> >>> and couple the submit fence into that. That would be virtually
> >>> equivalent to emitting a separate request for semaphores. Something to
> >>> ponder over.
> >>
> >> Hm, if not too difficult it would definitely be much preferable since
> >> relying on controlling preemption decisions feels a bit fragile/hackish.
> >>
> >> Simply moving __notify_execute_cb from __i915_request_submit to
> >> intel_engine_breadcrumbs_irq, under a __i915_request_has_started check,
> >> could do it?
> > 
> > 95% of the way, yes.
> 
> Is the remaining 5% just a new flavour of __i915_request_has_started 
> which does away with rcu_read_lock? :)

Adding the MI_USER_INTERRUPT, enabling fence signaling for submit
fences, tidying up, worrying about interrupt latency.

The devilish details.
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a558f64186fa..93bea6987e57 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -300,11 +300,11 @@  void i915_request_retire_upto(struct i915_request *rq)
 }
 
 static int
-__i915_request_await_execution(struct i915_request *rq,
-			       struct i915_request *signal,
-			       void (*hook)(struct i915_request *rq,
-					    struct dma_fence *signal),
-			       gfp_t gfp)
+__await_execution(struct i915_request *rq,
+		  struct i915_request *signal,
+		  void (*hook)(struct i915_request *rq,
+			       struct dma_fence *signal),
+		  gfp_t gfp)
 {
 	struct execute_cb *cb;
 
@@ -341,6 +341,8 @@  __i915_request_await_execution(struct i915_request *rq,
 	}
 	spin_unlock_irq(&signal->lock);
 
+	/* Copy across semaphore status as we need the same behaviour */
+	rq->sched.flags |= signal->sched.flags;
 	return 0;
 }
 
@@ -843,7 +845,7 @@  emit_semaphore_wait(struct i915_request *to,
 		goto await_fence;
 
 	/* Only submit our spinner after the signaler is running! */
-	if (__i915_request_await_execution(to, from, NULL, gfp))
+	if (__await_execution(to, from, NULL, gfp))
 		goto await_fence;
 
 	/* We need to pin the signaler's HWSP until we are finished reading. */
@@ -993,6 +995,35 @@  i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
 	return 0;
 }
 
+static int
+__i915_request_await_execution(struct i915_request *rq,
+			       struct i915_request *signal,
+			       void (*hook)(struct i915_request *rq,
+					    struct dma_fence *signal))
+{
+	int err;
+
+	err = __await_execution(rq, signal, hook, I915_FENCE_GFP);
+	if (err)
+		return err;
+
+	if (!rq->engine->schedule)
+		return 0;
+
+	/* Squash repeated depenendices to the same timelines */
+	if (signal->fence.context &&
+	    intel_timeline_sync_is_later(i915_request_timeline(rq),
+					 &signal->fence))
+		return 0;
+
+	/* Couple the dependency tree for PI on this exposed rq->fence */
+	err = i915_sched_node_add_dependency(&rq->sched, &signal->sched);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
 int
 i915_request_await_execution(struct i915_request *rq,
 			     struct dma_fence *fence,
@@ -1026,8 +1057,7 @@  i915_request_await_execution(struct i915_request *rq,
 		if (dma_fence_is_i915(fence))
 			ret = __i915_request_await_execution(rq,
 							     to_request(fence),
-							     hook,
-							     I915_FENCE_GFP);
+							     hook);
 		else
 			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
 							    I915_FENCE_TIMEOUT,
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 1937a26d412f..2bc2aa46a1b9 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -484,7 +484,6 @@  void i915_sched_node_fini(struct i915_sched_node *node)
 	 * so we may be called out-of-order.
 	 */
 	list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
-		GEM_BUG_ON(!node_signaled(dep->signaler));
 		GEM_BUG_ON(!list_empty(&dep->dfs_link));
 
 		list_del(&dep->wait_link);