diff mbox series

drm/i915: Defer application of request banning to submission

Message ID 20190213182737.12695-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series drm/i915: Defer application of request banning to submission | expand

Commit Message

Chris Wilson Feb. 13, 2019, 6:27 p.m. UTC
As we currently do not check on submission whether the context is banned
in a timely manner it is possible for some requests to escape
cancellation after their parent context is banned. By moving the ban
into the request submission under the engine->timeline.lock, we
serialise it with the reset and setting of the context ban.

References: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_request.c |  3 +++
 drivers/gpu/drm/i915/i915_reset.c   | 19 +++++--------------
 2 files changed, 8 insertions(+), 14 deletions(-)

Comments

Mika Kuoppala Feb. 15, 2019, 12:52 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> As we currently do not check on submission whether the context is banned
> in a timely manner it is possible for some requests to escape
> cancellation after their parent context is banned. By moving the ban
> into the request submission under the engine->timeline.lock, we
> serialise it with the reset and setting of the context ban.
>
> References: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_request.c |  3 +++
>  drivers/gpu/drm/i915/i915_reset.c   | 19 +++++--------------
>  2 files changed, 8 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 0acd6baa3c88..5ab4e1c01618 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -366,6 +366,9 @@ void __i915_request_submit(struct i915_request *request)
>  	GEM_BUG_ON(!irqs_disabled());
>  	lockdep_assert_held(&engine->timeline.lock);
>  
> +	if (i915_gem_context_is_banned(request->gem_context))
> +		i915_request_skip(request, -EIO);
> +
>  	GEM_BUG_ON(request->global_seqno);
>  
>  	seqno = next_global_seqno(&engine->timeline);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 12e74decd7a2..7fa97ce19bfd 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -22,24 +22,15 @@ static void engine_skip_context(struct i915_request *rq)
>  {
>  	struct intel_engine_cs *engine = rq->engine;
>  	struct i915_gem_context *hung_ctx = rq->gem_context;
> -	struct i915_timeline *timeline = rq->timeline;
>  
>  	lockdep_assert_held(&engine->timeline.lock);

This was golden.

> -	GEM_BUG_ON(timeline == &engine->timeline);
>  
> -	spin_lock(&timeline->lock);
> -
> -	if (i915_request_is_active(rq)) {
> -		list_for_each_entry_continue(rq,
> -					     &engine->timeline.requests, link)
> -			if (rq->gem_context == hung_ctx)
> -				i915_request_skip(rq, -EIO);
> -	}
> -
> -	list_for_each_entry(rq, &timeline->requests, link)
> -		i915_request_skip(rq, -EIO);
> +	if (!i915_request_is_active(rq))
> +		return;

Only thing that got me actually pondering was that
we don't flush anything after we have modified the ring.

But that is not about this patch

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


>  
> -	spin_unlock(&timeline->lock);
> +	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
> +		if (rq->gem_context == hung_ctx)
> +			i915_request_skip(rq, -EIO);
>  }
>  
>  static void client_mark_guilty(struct drm_i915_file_private *file_priv,
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Feb. 15, 2019, 12:57 p.m. UTC | #2
Quoting Mika Kuoppala (2019-02-15 12:52:06)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > As we currently do not check on submission whether the context is banned
> > in a timely manner it is possible for some requests to escape
> > cancellation after their parent context is banned. By moving the ban
> > into the request submission under the engine->timeline.lock, we
> > serialise it with the reset and setting of the context ban.
> >
> > References: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_request.c |  3 +++
> >  drivers/gpu/drm/i915/i915_reset.c   | 19 +++++--------------
> >  2 files changed, 8 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index 0acd6baa3c88..5ab4e1c01618 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -366,6 +366,9 @@ void __i915_request_submit(struct i915_request *request)
> >       GEM_BUG_ON(!irqs_disabled());
> >       lockdep_assert_held(&engine->timeline.lock);
> >  
> > +     if (i915_gem_context_is_banned(request->gem_context))
> > +             i915_request_skip(request, -EIO);
> > +
> >       GEM_BUG_ON(request->global_seqno);
> >  
> >       seqno = next_global_seqno(&engine->timeline);
> > diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> > index 12e74decd7a2..7fa97ce19bfd 100644
> > --- a/drivers/gpu/drm/i915/i915_reset.c
> > +++ b/drivers/gpu/drm/i915/i915_reset.c
> > @@ -22,24 +22,15 @@ static void engine_skip_context(struct i915_request *rq)
> >  {
> >       struct intel_engine_cs *engine = rq->engine;
> >       struct i915_gem_context *hung_ctx = rq->gem_context;
> > -     struct i915_timeline *timeline = rq->timeline;
> >  
> >       lockdep_assert_held(&engine->timeline.lock);
> 
> This was golden.
> 
> > -     GEM_BUG_ON(timeline == &engine->timeline);
> >  
> > -     spin_lock(&timeline->lock);
> > -
> > -     if (i915_request_is_active(rq)) {
> > -             list_for_each_entry_continue(rq,
> > -                                          &engine->timeline.requests, link)
> > -                     if (rq->gem_context == hung_ctx)
> > -                             i915_request_skip(rq, -EIO);
> > -     }
> > -
> > -     list_for_each_entry(rq, &timeline->requests, link)
> > -             i915_request_skip(rq, -EIO);
> > +     if (!i915_request_is_active(rq))
> > +             return;
> 
> Only thing that got me actually pondering was that
> we don't flush anything after we have modified the ring.

The magic is that we don't need to flush, the requests are all on their
way to HW and we don't care when, until somebody actually cares at a
later date (e.g. i915_request_wait(), or calling unwedge before
restarting). Flushing is hard since the requests have their own webs of
dependencies which must be maintained even if they die.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 0acd6baa3c88..5ab4e1c01618 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -366,6 +366,9 @@  void __i915_request_submit(struct i915_request *request)
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->timeline.lock);
 
+	if (i915_gem_context_is_banned(request->gem_context))
+		i915_request_skip(request, -EIO);
+
 	GEM_BUG_ON(request->global_seqno);
 
 	seqno = next_global_seqno(&engine->timeline);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 12e74decd7a2..7fa97ce19bfd 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -22,24 +22,15 @@  static void engine_skip_context(struct i915_request *rq)
 {
 	struct intel_engine_cs *engine = rq->engine;
 	struct i915_gem_context *hung_ctx = rq->gem_context;
-	struct i915_timeline *timeline = rq->timeline;
 
 	lockdep_assert_held(&engine->timeline.lock);
-	GEM_BUG_ON(timeline == &engine->timeline);
 
-	spin_lock(&timeline->lock);
-
-	if (i915_request_is_active(rq)) {
-		list_for_each_entry_continue(rq,
-					     &engine->timeline.requests, link)
-			if (rq->gem_context == hung_ctx)
-				i915_request_skip(rq, -EIO);
-	}
-
-	list_for_each_entry(rq, &timeline->requests, link)
-		i915_request_skip(rq, -EIO);
+	if (!i915_request_is_active(rq))
+		return;
 
-	spin_unlock(&timeline->lock);
+	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
+		if (rq->gem_context == hung_ctx)
+			i915_request_skip(rq, -EIO);
 }
 
 static void client_mark_guilty(struct drm_i915_file_private *file_priv,