
[04/19] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch

Message ID 20191118230254.2615942-5-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [01/19] drm/i915/selftests: Force bonded submission to overlap

Commit Message

Chris Wilson Nov. 18, 2019, 11:02 p.m. UTC
In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
the backend"), I erroneously concluded that we last modify the engine
inside __i915_request_commit() meaning that we could enable concurrent
submission for userspace as we enqueued this request. However, this
falls into a trap with other users of the engine->kernel_context waking
up and submitting their request before the idle-switch is queued, with
the result that the kernel_context is executed out-of-sequence most
likely upsetting the GPU and certainly ourselves when we try to retire
the out-of-sequence requests.

As such we need to hold onto the effective engine->kernel_context mutex
lock (via the engine pm mutex proxy) until we have finished queuing the
request to the engine.

v2: Serialise against concurrent intel_gt_retire_requests()

Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_pm.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)
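
In short, the patch reorders the tail of switch_to_kernel_context(): the
request is queued while we still have exclusive hold of the engine, the
engine-pm wakeref is only published afterwards, and the timeline is entered
last. A condensed sketch of the resulting flow, assembled from the diff
below (the elided lines are mine, not the full function):

static bool switch_to_kernel_context(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	/* ... build the idle-switch request while wakeref.count == 0,
	 * i.e. while we still own the effective kernel_context mutex ... */
	__i915_request_commit(rq);

	/* Queue while no other submitter can touch the engine */
	__i915_request_queue(rq, NULL);

	/* Release our exclusive hold on the engine (the "mutex_unlock") */
	__intel_wakeref_defer_park(&engine->wakeref);

	/* Only now become visible to intel_gt_retire_requests() */
	intel_timeline_enter(ce->timeline);

	/* ... */
}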

Comments

Tvrtko Ursulin Nov. 19, 2019, 2:35 p.m. UTC | #1
On 18/11/2019 23:02, Chris Wilson wrote:
> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
> the backend"), I erroneously concluded that we last modify the engine
> inside __i915_request_commit() meaning that we could enable concurrent
> submission for userspace as we enqueued this request. However, this
> falls into a trap with other users of the engine->kernel_context waking
> up and submitting their request before the idle-switch is queued, with
> the result that the kernel_context is executed out-of-sequence most
> likely upsetting the GPU and certainly ourselves when we try to retire
> the out-of-sequence requests.
> 
> As such we need to hold onto the effective engine->kernel_context mutex
> lock (via the engine pm mutex proxy) until we have finished queuing the
> request to the engine.
> 
> v2: Serialise against concurrent intel_gt_retire_requests()
> 
> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_pm.c | 15 +++++++++------
>   1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 3c0f490ff2c7..722d3eec226e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -75,6 +75,7 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
>   
>   static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   {
> +	struct intel_context *ce = engine->kernel_context;
>   	struct i915_request *rq;
>   	unsigned long flags;
>   	bool result = true;
> @@ -99,15 +100,13 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   	 * retiring the last request, thus all rings should be empty and
>   	 * all timelines idle.
>   	 */
> -	flags = __timeline_mark_lock(engine->kernel_context);
> +	flags = __timeline_mark_lock(ce);
>   
> -	rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
> +	rq = __i915_request_create(ce, GFP_NOWAIT);
>   	if (IS_ERR(rq))
>   		/* Context switch failed, hope for the best! Maybe reset? */
>   		goto out_unlock;
>   
> -	intel_timeline_enter(i915_request_timeline(rq));
> -
>   	/* Check again on the next retirement. */
>   	engine->wakeref_serial = engine->serial + 1;
>   	i915_request_add_active_barriers(rq);
> @@ -116,13 +115,17 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
>   	__i915_request_commit(rq);
>   
> +	__i915_request_queue(rq, NULL);
> +
>   	/* Release our exclusive hold on the engine */
>   	__intel_wakeref_defer_park(&engine->wakeref);
> -	__i915_request_queue(rq, NULL);
> +
> +	/* And finally expose our selfselves to intel_gt_retire_requests() */

ourselves
> +	intel_timeline_enter(ce->timeline);

I haven't really managed to follow this.

What are these other clients which can queue requests and what in this 
block prevents them doing so?

The change seems to be moving the queuing to before 
__intel_wakeref_defer_park and intel_timeline_enter to last. Wakeref 
defer extends the engine lifetime until the submitted rq is retired. But 
how is that considered "unlocking"?

Regards,

Tvrtko

>   
>   	result = false;
>   out_unlock:
> -	__timeline_mark_unlock(engine->kernel_context, flags);
> +	__timeline_mark_unlock(ce, flags);
>   	return result;
>   }
>   
>
Chris Wilson Nov. 19, 2019, 2:50 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-11-19 14:35:04)
> 
> On 18/11/2019 23:02, Chris Wilson wrote:
> > In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
> > the backend"), I erroneously concluded that we last modify the engine
> > inside __i915_request_commit() meaning that we could enable concurrent
> > submission for userspace as we enqueued this request. However, this
> > falls into a trap with other users of the engine->kernel_context waking
> > up and submitting their request before the idle-switch is queued, with
> > the result that the kernel_context is executed out-of-sequence most
> > likely upsetting the GPU and certainly ourselves when we try to retire
> > the out-of-sequence requests.
> > 
> > As such we need to hold onto the effective engine->kernel_context mutex
> > lock (via the engine pm mutex proxy) until we have finished queuing the
> > request to the engine.
> > 
> > v2: Serialise against concurrent intel_gt_retire_requests()
> > 
> > Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_pm.c | 15 +++++++++------
> >   1 file changed, 9 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > index 3c0f490ff2c7..722d3eec226e 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > @@ -75,6 +75,7 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
> >   
> >   static bool switch_to_kernel_context(struct intel_engine_cs *engine)
> >   {
> > +     struct intel_context *ce = engine->kernel_context;
> >       struct i915_request *rq;
> >       unsigned long flags;
> >       bool result = true;
> > @@ -99,15 +100,13 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
> >        * retiring the last request, thus all rings should be empty and
> >        * all timelines idle.
> >        */
> > -     flags = __timeline_mark_lock(engine->kernel_context);
> > +     flags = __timeline_mark_lock(ce);
> >   
> > -     rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
> > +     rq = __i915_request_create(ce, GFP_NOWAIT);
> >       if (IS_ERR(rq))
> >               /* Context switch failed, hope for the best! Maybe reset? */
> >               goto out_unlock;
> >   
> > -     intel_timeline_enter(i915_request_timeline(rq));
> > -
> >       /* Check again on the next retirement. */
> >       engine->wakeref_serial = engine->serial + 1;
> >       i915_request_add_active_barriers(rq);
> > @@ -116,13 +115,17 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
> >       rq->sched.attr.priority = I915_PRIORITY_BARRIER;
> >       __i915_request_commit(rq);
> >   
> > +     __i915_request_queue(rq, NULL);
> > +
> >       /* Release our exclusive hold on the engine */
> >       __intel_wakeref_defer_park(&engine->wakeref);
> > -     __i915_request_queue(rq, NULL);
> > +
> > +     /* And finally expose our selfselves to intel_gt_retire_requests() */
> 
> ourselves
> > +     intel_timeline_enter(ce->timeline);
> 
> I haven't really managed to follow this.
> 
> What are these other clients which can queue requests and what in this 
> block prevents them doing so?

There are 2 other parties and the GPU who have a stake in this.

A new GPU user will be waiting on the engine-pm to start their
engine_unpark. New waiters are predicated on engine->wakeref.count and
so intel_wakeref_defer_park() acts like a mutex_unlock of the
engine->wakeref.
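
A minimal sketch of that "unlock" (assuming, per the atomic_set_release()
mentioned further down, that this is essentially the whole body of
__intel_wakeref_defer_park(); the exact code in the tree may differ):

static inline void __intel_wakeref_defer_park(struct intel_wakeref *wf)
{
	/*
	 * While wf->count == 0 the parking path effectively owns the
	 * engine and every new intel_wakeref_get() serialises on
	 * wf->mutex. Publishing the count with a release store lets new
	 * users take the lockless fast path again -- the "mutex_unlock".
	 */
	atomic_set_release(&wf->count, 1);
}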

The other party is intel_gt_retire_requests(), which is walking the list
of active timelines looking for completions. Meanwhile, as soon as we
call __i915_request_queue(), the GPU may complete our request. Ergo,
if we put ourselves on the timelines.active_list (intel_timeline_enter)
before we raise the wakeref.count, we may see the request completion and
retire it, causing an underflow of the engine->wakeref.
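
To illustrate the v2 change, a hypothetical interleaving if
intel_timeline_enter() were called before the wakeref count is raised
(a sketch of the race described above, not code from the patch):

	switch_to_kernel_context()              intel_gt_retire_requests()
	-----------------------------------     -----------------------------------
	__i915_request_queue(rq, NULL);
	intel_timeline_enter(ce->timeline);
	                                        walks timelines.active_list, sees
	                                        ce->timeline, finds rq already
	                                        completed by the GPU and retires
	                                        it, dropping an engine wakeref
	                                        that has not been raised yet
	__intel_wakeref_defer_park(&engine->wakeref);
	                                        -> engine->wakeref.count underflows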

> The change seems to be moving the queuing to before 
> __intel_wakeref_defer_park and intel_timeline_enter to last. Wakeref 
> defer extends the engine lifetime until the submitted rq is retired. But 
> how is that considered "unlocking"?

static inline int
intel_wakeref_get(struct intel_wakeref *wf)
{
        if (unlikely(!atomic_inc_not_zero(&wf->count)))
                return __intel_wakeref_get_first(wf);

        return 0;
}

As we build the switch_to_kernel_context(), wf->count is 0, and so all
new users will enter the slow path and take the mutex_lock(&wf->mutex).

As soon as we call intel_wakeref_defer_park(), we call
atomic_set_release(&wf->count, 1) and so all new users will take the
fast path and skip the mutex_lock. Hence why I liken it to a
"mutex_unlock".
-Chris
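
For completeness, a rough sketch of the slow path referred to above. The
mutex_lock(&wf->mutex) is from the discussion; the ops->get() hook (which
would end up in the engine unpark) is an assumption about the plumbing, so
treat this as illustrative only:

static int __intel_wakeref_get_first(struct intel_wakeref *wf)
{
	/* Serialise against a concurrent park/unpark transition. */
	mutex_lock(&wf->mutex);
	if (!atomic_read(&wf->count)) {
		/* Assumed hook: wakes the hardware, e.g. engine unpark. */
		int err = wf->ops->get(wf);
		if (err) {
			mutex_unlock(&wf->mutex);
			return err;
		}
	}
	atomic_inc(&wf->count);
	mutex_unlock(&wf->mutex);

	return 0;
}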

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 3c0f490ff2c7..722d3eec226e 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -75,6 +75,7 @@  static inline void __timeline_mark_unlock(struct intel_context *ce,
 
 static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 {
+	struct intel_context *ce = engine->kernel_context;
 	struct i915_request *rq;
 	unsigned long flags;
 	bool result = true;
@@ -99,15 +100,13 @@  static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	 * retiring the last request, thus all rings should be empty and
 	 * all timelines idle.
 	 */
-	flags = __timeline_mark_lock(engine->kernel_context);
+	flags = __timeline_mark_lock(ce);
 
-	rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
+	rq = __i915_request_create(ce, GFP_NOWAIT);
 	if (IS_ERR(rq))
 		/* Context switch failed, hope for the best! Maybe reset? */
 		goto out_unlock;
 
-	intel_timeline_enter(i915_request_timeline(rq));
-
 	/* Check again on the next retirement. */
 	engine->wakeref_serial = engine->serial + 1;
 	i915_request_add_active_barriers(rq);
@@ -116,13 +115,17 @@  static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
 	__i915_request_commit(rq);
 
+	__i915_request_queue(rq, NULL);
+
 	/* Release our exclusive hold on the engine */
 	__intel_wakeref_defer_park(&engine->wakeref);
-	__i915_request_queue(rq, NULL);
+
+	/* And finally expose our selfselves to intel_gt_retire_requests() */
+	intel_timeline_enter(ce->timeline);
 
 	result = false;
 out_unlock:
-	__timeline_mark_unlock(engine->kernel_context, flags);
+	__timeline_mark_unlock(ce, flags);
 	return result;
 }