diff mbox

[5/5] drm/i915: Improve irq handling after gpu resets

Message ID 1400774195-19414-5-git-send-email-daniel.vetter@ffwll.ch (mailing list archive)
State New, archived
Headers show

Commit Message

Daniel Vetter May 22, 2014, 3:56 p.m. UTC
Currently we do a full re-init of all interrupts after a gpu hang.
Which is pretty bad since we don't restore the interrupts we've
enabled at runtime correctly. Even with that addressed it's rather
horribly race.

But on g4x and later we only reset the gt and not the entire gpu.
Which means we only need to reset the GT interrupt bits. Which has the
nice benefit that vblank waits, pipe CRC interrupts and everything
else display related just keeps on working.

The downside is that gt interrupt handling (i.e. ring->get/put_irq) is
still racy. But as long as the gpu hang reliably wakes all waters and
we have a short time where the refcount drops to 0 we'll recover. So
not that bad really.

Testcase: igt/kms_flip/vblank-vs-hang
Testcase: igt/kms_pipe_crc_basic/hang-*
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c | 17 +++++++++--------
 drivers/gpu/drm/i915/i915_drv.h |  1 +
 drivers/gpu/drm/i915/i915_irq.c | 35 +++++++++++++++++++++++++++--------
 3 files changed, 37 insertions(+), 16 deletions(-)

Comments

Ville Syrjälä May 22, 2014, 4:51 p.m. UTC | #1
On Thu, May 22, 2014 at 05:56:35PM +0200, Daniel Vetter wrote:
> Currently we do a full re-init of all interrupts after a gpu hang.
> Which is pretty bad since we don't restore the interrupts we've
> enabled at runtime correctly. Even with that addressed it's rather
> horribly race.
> 
> But on g4x and later we only reset the gt and not the entire gpu.
> Which means we only need to reset the GT interrupt bits. Which has the
> nice benefit that vblank waits, pipe CRC interrupts and everything
> else display related just keeps on working.
> 
> The downside is that gt interrupt handling (i.e. ring->get/put_irq) is
> still racy. But as long as the gpu hang reliably wakes all waters and
> we have a short time where the refcount drops to 0 we'll recover. So
> not that bad really.

A quick test on IVB tells me that GTIMR and GEN6_PMIMR survive the full
gt reset. But the ring IMRs do get clobbered. So could we just skip the
entire irq reset here?

> 
> Testcase: igt/kms_flip/vblank-vs-hang
> Testcase: igt/kms_pipe_crc_basic/hang-*
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_drv.c | 17 +++++++++--------
>  drivers/gpu/drm/i915/i915_drv.h |  1 +
>  drivers/gpu/drm/i915/i915_irq.c | 35 +++++++++++++++++++++++++++--------
>  3 files changed, 37 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c83c83b74bf4..69a75bb7ad83 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -811,17 +811,18 @@ int i915_reset(struct drm_device *dev)
>  		}
>  
>  		/*
> -		 * FIXME: This is horribly race against concurrent pageflip and
> -		 * vblank wait ioctls since they can observe dev->irqs_disabled
> -		 * being false when they shouldn't be able to.
> +		 * FIXME: This races pretty badly against concurrent holders of
> +		 * ring interrupts. This is possible since we've started to drop
> +		 * dev->struct_mutex in select places when waiting for the gpu.
>  		 */
> -		drm_irq_uninstall(dev);
> -		drm_irq_install(dev, dev->pdev->irq);
> +		intel_gt_irq_reinit(dev);
>  
> -		/* rps/rc6 re-init is necessary to restore state lost after the
> -		 * reset and the re-install of drm irq. Skip for ironlake per
> +		/*
> +		 * rps/rc6 re-init is necessary to restore state lost after the
> +		 * reset and the re-install of gt irqs. Skip for ironlake per
>  		 * previous concerns that it doesn't respond well to some forms
> -		 * of re-init after reset. */
> +		 * of re-init after reset.
> +		 */
>  		if (INTEL_INFO(dev)->gen > 5)
>  			intel_reset_gt_powersave(dev);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 459b02ad1ef3..61792c4050e7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2078,6 +2078,7 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
>  							int new_delay);
>  extern void intel_irq_init(struct drm_device *dev);
>  extern void intel_hpd_init(struct drm_device *dev);
> +extern void intel_gt_irq_reinit(struct drm_device *dev);
>  
>  extern void intel_uncore_sanitize(struct drm_device *dev);
>  extern void intel_uncore_early_sanitize(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 3cd659b47dd2..6dba645a81fa 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -3140,6 +3140,14 @@ static void valleyview_irq_preinstall(struct drm_device *dev)
>  	POSTING_READ(VLV_IER);
>  }
>  
> +static void gen8_gt_irq_reset(struct drm_i915_private *dev_priv)
> +{
> +	GEN8_IRQ_RESET_NDX(GT, 0);
> +	GEN8_IRQ_RESET_NDX(GT, 1);
> +	GEN8_IRQ_RESET_NDX(GT, 2);
> +	GEN8_IRQ_RESET_NDX(GT, 3);
> +}
> +
>  static void gen8_irq_reset(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -3148,10 +3156,7 @@ static void gen8_irq_reset(struct drm_device *dev)
>  	I915_WRITE(GEN8_MASTER_IRQ, 0);
>  	POSTING_READ(GEN8_MASTER_IRQ);
>  
> -	GEN8_IRQ_RESET_NDX(GT, 0);
> -	GEN8_IRQ_RESET_NDX(GT, 1);
> -	GEN8_IRQ_RESET_NDX(GT, 2);
> -	GEN8_IRQ_RESET_NDX(GT, 3);
> +	gen8_gt_irq_reset(dev_priv);
>  
>  	for_each_pipe(pipe)
>  		GEN8_IRQ_RESET_NDX(DE_PIPE, pipe);
> @@ -3171,10 +3176,7 @@ static void cherryview_irq_preinstall(struct drm_device *dev)
>  	I915_WRITE(GEN8_MASTER_IRQ, 0);
>  	POSTING_READ(GEN8_MASTER_IRQ);
>  
> -	GEN8_IRQ_RESET_NDX(GT, 0);
> -	GEN8_IRQ_RESET_NDX(GT, 1);
> -	GEN8_IRQ_RESET_NDX(GT, 2);
> -	GEN8_IRQ_RESET_NDX(GT, 3);
> +	gen8_gt_irq_reset(dev_priv);
>  
>  	GEN5_IRQ_RESET(GEN8_PCU_);
>  
> @@ -4436,6 +4438,23 @@ void intel_hpd_init(struct drm_device *dev)
>  	spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
>  }
>  
> +void intel_gt_irq_reinit(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	spin_lock_irq(&dev_priv->irq_lock);
> +	if (INTEL_INFO(dev)->gen >= 8) {
> +		gen8_gt_irq_reset(dev_priv);
> +		gen8_gt_irq_postinstall(dev_priv);
> +	} else if (INTEL_INFO(dev)->gen >= 5) {
> +		gen5_gt_irq_reset(dev);
> +		gen5_gt_irq_postinstall(dev);
> +	} else {
> +		WARN_ON(!IS_G4X(dev));
> +	}
> +	spin_unlock_irq(&dev_priv->irq_lock);
> +}
> +
>  /* Disable interrupts so we can allow runtime PM. */
>  void intel_runtime_pm_disable_interrupts(struct drm_device *dev)
>  {
> -- 
> 1.8.4.rc3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ville Syrjälä May 22, 2014, 5:06 p.m. UTC | #2
On Thu, May 22, 2014 at 07:51:45PM +0300, Ville Syrjälä wrote:
> On Thu, May 22, 2014 at 05:56:35PM +0200, Daniel Vetter wrote:
> > Currently we do a full re-init of all interrupts after a gpu hang.
> > Which is pretty bad since we don't restore the interrupts we've
> > enabled at runtime correctly. Even with that addressed it's rather
> > horribly race.
> > 
> > But on g4x and later we only reset the gt and not the entire gpu.
> > Which means we only need to reset the GT interrupt bits. Which has the
> > nice benefit that vblank waits, pipe CRC interrupts and everything
> > else display related just keeps on working.
> > 
> > The downside is that gt interrupt handling (i.e. ring->get/put_irq) is
> > still racy. But as long as the gpu hang reliably wakes all waters and
> > we have a short time where the refcount drops to 0 we'll recover. So
> > not that bad really.
> 
> A quick test on IVB tells me that GTIMR and GEN6_PMIMR survive the full
> gt reset. But the ring IMRs do get clobbered. So could we just skip the
> entire irq reset here?

Same on ILK. GTIMR survives both render and media resets.
Daniel Vetter May 22, 2014, 8:12 p.m. UTC | #3
On Thu, May 22, 2014 at 07:51:45PM +0300, Ville Syrjälä wrote:
> On Thu, May 22, 2014 at 05:56:35PM +0200, Daniel Vetter wrote:
> > Currently we do a full re-init of all interrupts after a gpu hang.
> > Which is pretty bad since we don't restore the interrupts we've
> > enabled at runtime correctly. Even with that addressed it's rather
> > horribly race.
> > 
> > But on g4x and later we only reset the gt and not the entire gpu.
> > Which means we only need to reset the GT interrupt bits. Which has the
> > nice benefit that vblank waits, pipe CRC interrupts and everything
> > else display related just keeps on working.
> > 
> > The downside is that gt interrupt handling (i.e. ring->get/put_irq) is
> > still racy. But as long as the gpu hang reliably wakes all waters and
> > we have a short time where the refcount drops to 0 we'll recover. So
> > not that bad really.
> 
> A quick test on IVB tells me that GTIMR and GEN6_PMIMR survive the full
> gt reset. But the ring IMRs do get clobbered. So could we just skip the
> entire irq reset here?

Hm ... And things still neatly work afterwards?

I guess the FIXME we need to keep since clobbering the per-ring registers
through the reset is still racy. But we'd only need to restore the bits
correctly.

I'll respin.
-Daniel

> 
> > 
> > Testcase: igt/kms_flip/vblank-vs-hang
> > Testcase: igt/kms_pipe_crc_basic/hang-*
> > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c | 17 +++++++++--------
> >  drivers/gpu/drm/i915/i915_drv.h |  1 +
> >  drivers/gpu/drm/i915/i915_irq.c | 35 +++++++++++++++++++++++++++--------
> >  3 files changed, 37 insertions(+), 16 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index c83c83b74bf4..69a75bb7ad83 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -811,17 +811,18 @@ int i915_reset(struct drm_device *dev)
> >  		}
> >  
> >  		/*
> > -		 * FIXME: This is horribly race against concurrent pageflip and
> > -		 * vblank wait ioctls since they can observe dev->irqs_disabled
> > -		 * being false when they shouldn't be able to.
> > +		 * FIXME: This races pretty badly against concurrent holders of
> > +		 * ring interrupts. This is possible since we've started to drop
> > +		 * dev->struct_mutex in select places when waiting for the gpu.
> >  		 */
> > -		drm_irq_uninstall(dev);
> > -		drm_irq_install(dev, dev->pdev->irq);
> > +		intel_gt_irq_reinit(dev);
> >  
> > -		/* rps/rc6 re-init is necessary to restore state lost after the
> > -		 * reset and the re-install of drm irq. Skip for ironlake per
> > +		/*
> > +		 * rps/rc6 re-init is necessary to restore state lost after the
> > +		 * reset and the re-install of gt irqs. Skip for ironlake per
> >  		 * previous concerns that it doesn't respond well to some forms
> > -		 * of re-init after reset. */
> > +		 * of re-init after reset.
> > +		 */
> >  		if (INTEL_INFO(dev)->gen > 5)
> >  			intel_reset_gt_powersave(dev);
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 459b02ad1ef3..61792c4050e7 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2078,6 +2078,7 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
> >  							int new_delay);
> >  extern void intel_irq_init(struct drm_device *dev);
> >  extern void intel_hpd_init(struct drm_device *dev);
> > +extern void intel_gt_irq_reinit(struct drm_device *dev);
> >  
> >  extern void intel_uncore_sanitize(struct drm_device *dev);
> >  extern void intel_uncore_early_sanitize(struct drm_device *dev);
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index 3cd659b47dd2..6dba645a81fa 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -3140,6 +3140,14 @@ static void valleyview_irq_preinstall(struct drm_device *dev)
> >  	POSTING_READ(VLV_IER);
> >  }
> >  
> > +static void gen8_gt_irq_reset(struct drm_i915_private *dev_priv)
> > +{
> > +	GEN8_IRQ_RESET_NDX(GT, 0);
> > +	GEN8_IRQ_RESET_NDX(GT, 1);
> > +	GEN8_IRQ_RESET_NDX(GT, 2);
> > +	GEN8_IRQ_RESET_NDX(GT, 3);
> > +}
> > +
> >  static void gen8_irq_reset(struct drm_device *dev)
> >  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > @@ -3148,10 +3156,7 @@ static void gen8_irq_reset(struct drm_device *dev)
> >  	I915_WRITE(GEN8_MASTER_IRQ, 0);
> >  	POSTING_READ(GEN8_MASTER_IRQ);
> >  
> > -	GEN8_IRQ_RESET_NDX(GT, 0);
> > -	GEN8_IRQ_RESET_NDX(GT, 1);
> > -	GEN8_IRQ_RESET_NDX(GT, 2);
> > -	GEN8_IRQ_RESET_NDX(GT, 3);
> > +	gen8_gt_irq_reset(dev_priv);
> >  
> >  	for_each_pipe(pipe)
> >  		GEN8_IRQ_RESET_NDX(DE_PIPE, pipe);
> > @@ -3171,10 +3176,7 @@ static void cherryview_irq_preinstall(struct drm_device *dev)
> >  	I915_WRITE(GEN8_MASTER_IRQ, 0);
> >  	POSTING_READ(GEN8_MASTER_IRQ);
> >  
> > -	GEN8_IRQ_RESET_NDX(GT, 0);
> > -	GEN8_IRQ_RESET_NDX(GT, 1);
> > -	GEN8_IRQ_RESET_NDX(GT, 2);
> > -	GEN8_IRQ_RESET_NDX(GT, 3);
> > +	gen8_gt_irq_reset(dev_priv);
> >  
> >  	GEN5_IRQ_RESET(GEN8_PCU_);
> >  
> > @@ -4436,6 +4438,23 @@ void intel_hpd_init(struct drm_device *dev)
> >  	spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
> >  }
> >  
> > +void intel_gt_irq_reinit(struct drm_device *dev)
> > +{
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +
> > +	spin_lock_irq(&dev_priv->irq_lock);
> > +	if (INTEL_INFO(dev)->gen >= 8) {
> > +		gen8_gt_irq_reset(dev_priv);
> > +		gen8_gt_irq_postinstall(dev_priv);
> > +	} else if (INTEL_INFO(dev)->gen >= 5) {
> > +		gen5_gt_irq_reset(dev);
> > +		gen5_gt_irq_postinstall(dev);
> > +	} else {
> > +		WARN_ON(!IS_G4X(dev));
> > +	}
> > +	spin_unlock_irq(&dev_priv->irq_lock);
> > +}
> > +
> >  /* Disable interrupts so we can allow runtime PM. */
> >  void intel_runtime_pm_disable_interrupts(struct drm_device *dev)
> >  {
> > -- 
> > 1.8.4.rc3
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ville Syrjälä
> Intel OTC
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c83c83b74bf4..69a75bb7ad83 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -811,17 +811,18 @@  int i915_reset(struct drm_device *dev)
 		}
 
 		/*
-		 * FIXME: This is horribly race against concurrent pageflip and
-		 * vblank wait ioctls since they can observe dev->irqs_disabled
-		 * being false when they shouldn't be able to.
+		 * FIXME: This races pretty badly against concurrent holders of
+		 * ring interrupts. This is possible since we've started to drop
+		 * dev->struct_mutex in select places when waiting for the gpu.
 		 */
-		drm_irq_uninstall(dev);
-		drm_irq_install(dev, dev->pdev->irq);
+		intel_gt_irq_reinit(dev);
 
-		/* rps/rc6 re-init is necessary to restore state lost after the
-		 * reset and the re-install of drm irq. Skip for ironlake per
+		/*
+		 * rps/rc6 re-init is necessary to restore state lost after the
+		 * reset and the re-install of gt irqs. Skip for ironlake per
 		 * previous concerns that it doesn't respond well to some forms
-		 * of re-init after reset. */
+		 * of re-init after reset.
+		 */
 		if (INTEL_INFO(dev)->gen > 5)
 			intel_reset_gt_powersave(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 459b02ad1ef3..61792c4050e7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2078,6 +2078,7 @@  void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
 							int new_delay);
 extern void intel_irq_init(struct drm_device *dev);
 extern void intel_hpd_init(struct drm_device *dev);
+extern void intel_gt_irq_reinit(struct drm_device *dev);
 
 extern void intel_uncore_sanitize(struct drm_device *dev);
 extern void intel_uncore_early_sanitize(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3cd659b47dd2..6dba645a81fa 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -3140,6 +3140,14 @@  static void valleyview_irq_preinstall(struct drm_device *dev)
 	POSTING_READ(VLV_IER);
 }
 
+static void gen8_gt_irq_reset(struct drm_i915_private *dev_priv)
+{
+	GEN8_IRQ_RESET_NDX(GT, 0);
+	GEN8_IRQ_RESET_NDX(GT, 1);
+	GEN8_IRQ_RESET_NDX(GT, 2);
+	GEN8_IRQ_RESET_NDX(GT, 3);
+}
+
 static void gen8_irq_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3148,10 +3156,7 @@  static void gen8_irq_reset(struct drm_device *dev)
 	I915_WRITE(GEN8_MASTER_IRQ, 0);
 	POSTING_READ(GEN8_MASTER_IRQ);
 
-	GEN8_IRQ_RESET_NDX(GT, 0);
-	GEN8_IRQ_RESET_NDX(GT, 1);
-	GEN8_IRQ_RESET_NDX(GT, 2);
-	GEN8_IRQ_RESET_NDX(GT, 3);
+	gen8_gt_irq_reset(dev_priv);
 
 	for_each_pipe(pipe)
 		GEN8_IRQ_RESET_NDX(DE_PIPE, pipe);
@@ -3171,10 +3176,7 @@  static void cherryview_irq_preinstall(struct drm_device *dev)
 	I915_WRITE(GEN8_MASTER_IRQ, 0);
 	POSTING_READ(GEN8_MASTER_IRQ);
 
-	GEN8_IRQ_RESET_NDX(GT, 0);
-	GEN8_IRQ_RESET_NDX(GT, 1);
-	GEN8_IRQ_RESET_NDX(GT, 2);
-	GEN8_IRQ_RESET_NDX(GT, 3);
+	gen8_gt_irq_reset(dev_priv);
 
 	GEN5_IRQ_RESET(GEN8_PCU_);
 
@@ -4436,6 +4438,23 @@  void intel_hpd_init(struct drm_device *dev)
 	spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
 }
 
+void intel_gt_irq_reinit(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	spin_lock_irq(&dev_priv->irq_lock);
+	if (INTEL_INFO(dev)->gen >= 8) {
+		gen8_gt_irq_reset(dev_priv);
+		gen8_gt_irq_postinstall(dev_priv);
+	} else if (INTEL_INFO(dev)->gen >= 5) {
+		gen5_gt_irq_reset(dev);
+		gen5_gt_irq_postinstall(dev);
+	} else {
+		WARN_ON(!IS_G4X(dev));
+	}
+	spin_unlock_irq(&dev_priv->irq_lock);
+}
+
 /* Disable interrupts so we can allow runtime PM. */
 void intel_runtime_pm_disable_interrupts(struct drm_device *dev)
 {