diff mbox

[04/15] drm/i915: Flush the execlist ports if idle

Message ID 20170717091141.23102-4-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson July 17, 2017, 9:11 a.m. UTC
When doing a GPU reset, the CSB register will be trashed and we will
lose any context-switch notifications that happened since the tasklet
was disabled. If we find that all requests on this engine were
completed, we want to make sure that the ELSP tracker is similarly empty
so that we do not feed back in the completed requests upon recovering
from the reset.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

Comments

Mika Kuoppala July 20, 2017, 12:18 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> When doing a GPU reset, the CSB register will be trashed and we will
> lose any context-switch notifications that happened since the tasklet
> was disabled. If we find that all requests on this engine were
> completed, we want to make sure that the ELSP tracker is similarly empty
> so that we do not feed back in the completed requests upon recovering
> from the reset.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 36 ++++++++++++++++++++++++++----------
>  1 file changed, 26 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3c83f2dd6798..ad61d1998fb7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1327,6 +1327,31 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  {
>  	struct execlist_port *port = engine->execlist_port;
>  	struct intel_context *ce;
> +	unsigned int n;
> +
> +	/*
> +	 * Catch up with any missed context-switch interrupts.
> +	 *
> +	 * Ideally we would just read the remaining CSB entries now that we
> +	 * know the gpu is idle. However, the CSB registers are sometimes^W
> +	 * often trashed across a GPU reset! Instead we have to rely on
> +	 * guessing the missed context-switch events by looking at what
> +	 * requests were completed.
> +	 */
> +	if (!request) {
> +		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)

You need to check against null before put in here?
-Mika

> +			i915_gem_request_put(port_request(&port[n]));
> +		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> +		return;
> +	}
> +
> +	if (request->ctx != port_request(port)->ctx) {
> +		i915_gem_request_put(port_request(port));
> +		port[0] = port[1];
> +		memset(&port[1], 0, sizeof(port[1]));
> +	}
> +
> +	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
>  
>  	/* If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
> @@ -1338,7 +1363,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	 * and have to at least restore the RING register in the context
>  	 * image back to the expected values to skip over the guilty request.
>  	 */
> -	if (!request || request->fence.error != -EIO)
> +	if (request->fence.error != -EIO)
>  		return;
>  
>  	/* We want a simple context + ring to execute the breadcrumb update.
> @@ -1360,15 +1385,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->head = request->postfix;
>  	intel_ring_update_space(request->ring);
>  
> -	/* Catch up with any missed context-switch interrupts */
> -	if (request->ctx != port_request(port)->ctx) {
> -		i915_gem_request_put(port_request(port));
> -		port[0] = port[1];
> -		memset(&port[1], 0, sizeof(port[1]));
> -	}
> -
> -	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
> -
>  	/* Reset WaIdleLiteRestore:bdw,skl as well */
>  	request->tail =
>  		intel_ring_wrap(request->ring,
> -- 
> 2.13.2
Chris Wilson July 20, 2017, 12:28 p.m. UTC | #2
Quoting Mika Kuoppala (2017-07-20 13:18:40)
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 3c83f2dd6798..ad61d1998fb7 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -1327,6 +1327,31 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> >  {
> >       struct execlist_port *port = engine->execlist_port;
> >       struct intel_context *ce;
> > +     unsigned int n;
> > +
> > +     /*
> > +      * Catch up with any missed context-switch interrupts.
> > +      *
> > +      * Ideally we would just read the remaining CSB entries now that we
> > +      * know the gpu is idle. However, the CSB registers are sometimes^W
> > +      * often trashed across a GPU reset! Instead we have to rely on
> > +      * guessing the missed context-switch events by looking at what
> > +      * requests were completed.
> > +      */
> > +     if (!request) {
> > +             for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> 
> You need to check against null before put in here?

dma_fence_put and i915_gem_request_put, by extension, are NULL-safe.
-Chris
Mika Kuoppala July 20, 2017, 1:12 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> When doing a GPU reset, the CSB register will be trashed and we will
> lose any context-switch notifications that happened since the tasklet
> was disabled. If we find that all requests on this engine were
> completed, we want to make sure that the ELSP tracker is similarly empty
> so that we do not feed back in the completed requests upon recovering
> from the reset.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 36 ++++++++++++++++++++++++++----------
>  1 file changed, 26 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3c83f2dd6798..ad61d1998fb7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1327,6 +1327,31 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  {
>  	struct execlist_port *port = engine->execlist_port;
>  	struct intel_context *ce;
> +	unsigned int n;
> +
> +	/*
> +	 * Catch up with any missed context-switch interrupts.
> +	 *
> +	 * Ideally we would just read the remaining CSB entries now that we
> +	 * know the gpu is idle. However, the CSB registers are sometimes^W
> +	 * often trashed across a GPU reset! Instead we have to rely on
> +	 * guessing the missed context-switch events by looking at what
> +	 * requests were completed.
> +	 */
> +	if (!request) {
> +		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> +			i915_gem_request_put(port_request(&port[n]));
> +		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> +		return;
> +	}
> +
> +	if (request->ctx != port_request(port)->ctx) {
> +		i915_gem_request_put(port_request(port));
> +		port[0] = port[1];
> +		memset(&port[1], 0, sizeof(port[1]));
> +	}
> +
> +	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
>  
>  	/* If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
> @@ -1338,7 +1363,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	 * and have to at least restore the RING register in the context
>  	 * image back to the expected values to skip over the guilty request.
>  	 */
> -	if (!request || request->fence.error != -EIO)
> +	if (request->fence.error != -EIO)
>  		return;
>  
>  	/* We want a simple context + ring to execute the breadcrumb update.
> @@ -1360,15 +1385,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->head = request->postfix;
>  	intel_ring_update_space(request->ring);
>  
> -	/* Catch up with any missed context-switch interrupts */
> -	if (request->ctx != port_request(port)->ctx) {
> -		i915_gem_request_put(port_request(port));
> -		port[0] = port[1];
> -		memset(&port[1], 0, sizeof(port[1]));
> -	}
> -
> -	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
> -
>  	/* Reset WaIdleLiteRestore:bdw,skl as well */
>  	request->tail =
>  		intel_ring_wrap(request->ring,
> -- 
> 2.13.2
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3c83f2dd6798..ad61d1998fb7 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1327,6 +1327,31 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 {
 	struct execlist_port *port = engine->execlist_port;
 	struct intel_context *ce;
+	unsigned int n;
+
+	/*
+	 * Catch up with any missed context-switch interrupts.
+	 *
+	 * Ideally we would just read the remaining CSB entries now that we
+	 * know the gpu is idle. However, the CSB registers are sometimes^W
+	 * often trashed across a GPU reset! Instead we have to rely on
+	 * guessing the missed context-switch events by looking at what
+	 * requests were completed.
+	 */
+	if (!request) {
+		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
+			i915_gem_request_put(port_request(&port[n]));
+		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		return;
+	}
+
+	if (request->ctx != port_request(port)->ctx) {
+		i915_gem_request_put(port_request(port));
+		port[0] = port[1];
+		memset(&port[1], 0, sizeof(port[1]));
+	}
+
+	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
 
 	/* If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1338,7 +1363,7 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	 * and have to at least restore the RING register in the context
 	 * image back to the expected values to skip over the guilty request.
 	 */
-	if (!request || request->fence.error != -EIO)
+	if (request->fence.error != -EIO)
 		return;
 
 	/* We want a simple context + ring to execute the breadcrumb update.
@@ -1360,15 +1385,6 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->head = request->postfix;
 	intel_ring_update_space(request->ring);
 
-	/* Catch up with any missed context-switch interrupts */
-	if (request->ctx != port_request(port)->ctx) {
-		i915_gem_request_put(port_request(port));
-		port[0] = port[1];
-		memset(&port[1], 0, sizeof(port[1]));
-	}
-
-	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
-
 	/* Reset WaIdleLiteRestore:bdw,skl as well */
 	request->tail =
 		intel_ring_wrap(request->ring,