diff mbox

[02/13] drm/i915: Stop the machine whilst capturing the GPU crash dump

Message ID 1470387964-3363-3-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Aug. 5, 2016, 9:05 a.m. UTC
The error state is purposefully racy as we expect it to be called at any
time and so have avoided any locking whilst capturing the crash dump.
However, with multi-engine GPUs and multiple CPUs, those races can
manifest into OOPSes as we attempt to chase dangling pointers freed on
other CPUs. Under discussion are lots of ways to slow down normal
operation in order to protect the post-mortem error capture, but what it
we take the opposite approach and freeze the machine whilst the error
capture runs (note the GPU may still running, but as long as we don't
process any of the results the driver's bookkeeping will be static).

Note that by of itself, this is not a complete fix. It also depends on
the compiler barriers in list_add/list_del to prevent traversing the
lists into the void.

v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
stop_machine() itself for its wbinvd fallback.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig          |  1 +
 drivers/gpu/drm/i915/i915_drv.h       |  2 ++
 drivers/gpu/drm/i915/i915_gpu_error.c | 48 +++++++++++++++++++++--------------
 3 files changed, 32 insertions(+), 19 deletions(-)

Comments

Daniel Vetter Aug. 5, 2016, 6:50 p.m. UTC | #1
On Fri, Aug 05, 2016 at 10:05:53AM +0100, Chris Wilson wrote:
> The error state is purposefully racy as we expect it to be called at any
> time and so have avoided any locking whilst capturing the crash dump.
> However, with multi-engine GPUs and multiple CPUs, those races can
> manifest into OOPSes as we attempt to chase dangling pointers freed on
> other CPUs. Under discussion are lots of ways to slow down normal
> operation in order to protect the post-mortem error capture, but what it
> we take the opposite approach and freeze the machine whilst the error
> capture runs (note the GPU may still running, but as long as we don't
> process any of the results the driver's bookkeeping will be static).
> 
> Note that by of itself, this is not a complete fix. It also depends on
> the compiler barriers in list_add/list_del to prevent traversing the
> lists into the void.

The other important bit I think are NULL checks. I think the commit
message should mention that too.
> 
> v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
> stop_machine() itself for its wbinvd fallback.

Droppingt he clflush from error capture seems like a pretty big
regression, at least at a glance. Why does this not result in piles of
corrupted error state captures?

I guess since we need to flush on incoherent platforms before gpu access
anyway this is mostly true (as long as userspace doesn't do something
silly), but I think it should be captured in a comment at the place of the
clflush.
-Daniel

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/Kconfig          |  1 +
>  drivers/gpu/drm/i915/i915_drv.h       |  2 ++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 48 +++++++++++++++++++++--------------
>  3 files changed, 32 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 7769e469118f..7badcee88ebf 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -4,6 +4,7 @@ config DRM_I915
>  	depends on X86 && PCI
>  	select INTEL_GTT
>  	select INTERVAL_TREE
> +	select STOP_MACHINE
>  	# we need shmfs for the swappable backing store, and in particular
>  	# the shmem_readpage() which depends upon tmpfs
>  	select SHMEM
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7054baa92061..9a7ffa49cd1f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -481,6 +481,8 @@ struct drm_i915_error_state {
>  	struct kref ref;
>  	struct timeval time;
>  
> +	struct drm_i915_private *i915;
> +
>  	char error_msg[128];
>  	bool simulated;
>  	int iommu;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ced296983caa..b94a59733cf8 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -28,6 +28,7 @@
>   */
>  
>  #include <generated/utsrelease.h>
> +#include <linux/stop_machine.h>
>  #include "i915_drv.h"
>  
>  static const char *engine_str(int engine)
> @@ -684,14 +685,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
>  
>  	dst->page_count = num_pages;
>  	while (num_pages--) {
> -		unsigned long flags;
>  		void *d;
>  
>  		d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
>  		if (d == NULL)
>  			goto unwind;
>  
> -		local_irq_save(flags);
>  		if (use_ggtt) {
>  			void __iomem *s;
>  
> @@ -710,15 +709,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
>  
>  			page = i915_gem_object_get_page(src, i);
>  
> -			drm_clflush_pages(&page, 1);
> -
>  			s = kmap_atomic(page);
>  			memcpy(d, s, PAGE_SIZE);
>  			kunmap_atomic(s);
> -
> -			drm_clflush_pages(&page, 1);
>  		}
> -		local_irq_restore(flags);
>  
>  		dst->pages[i++] = d;
>  		reloc_offset += PAGE_SIZE;
> @@ -1371,6 +1365,32 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
>  	error->suspend_count = dev_priv->suspend_count;
>  }
>  
> +static int capture(void *data)
> +{
> +	struct drm_i915_error_state *error = data;
> +
> +	/* Ensure that what we readback from memory matches what the GPU sees */
> +	wbinvd();
> +
> +	i915_capture_gen_state(error->i915, error);
> +	i915_capture_reg_state(error->i915, error);
> +	i915_gem_record_fences(error->i915, error);
> +	i915_gem_record_rings(error->i915, error);
> +
> +	i915_capture_active_buffers(error->i915, error);
> +	i915_capture_pinned_buffers(error->i915, error);
> +
> +	do_gettimeofday(&error->time);
> +
> +	error->overlay = intel_overlay_capture_error_state(error->i915);
> +	error->display = intel_display_capture_error_state(error->i915);
> +
> +	/* And make sure we don't leave trash in the CPU cache */
> +	wbinvd();
> +
> +	return 0;
> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1399,19 +1419,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
>  	}
>  
>  	kref_init(&error->ref);
> +	error->i915 = dev_priv;
>  
> -	i915_capture_gen_state(dev_priv, error);
> -	i915_capture_reg_state(dev_priv, error);
> -	i915_gem_record_fences(dev_priv, error);
> -	i915_gem_record_rings(dev_priv, error);
> -
> -	i915_capture_active_buffers(dev_priv, error);
> -	i915_capture_pinned_buffers(dev_priv, error);
> -
> -	do_gettimeofday(&error->time);
> -
> -	error->overlay = intel_overlay_capture_error_state(dev_priv);
> -	error->display = intel_display_capture_error_state(dev_priv);
> +	stop_machine(capture, error, NULL);
>  
>  	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
>  	DRM_INFO("%s\n", error->error_msg);
> -- 
> 2.8.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Aug. 5, 2016, 7:01 p.m. UTC | #2
On Fri, Aug 05, 2016 at 08:50:11PM +0200, Daniel Vetter wrote:
> On Fri, Aug 05, 2016 at 10:05:53AM +0100, Chris Wilson wrote:
> > The error state is purposefully racy as we expect it to be called at any
> > time and so have avoided any locking whilst capturing the crash dump.
> > However, with multi-engine GPUs and multiple CPUs, those races can
> > manifest into OOPSes as we attempt to chase dangling pointers freed on
> > other CPUs. Under discussion are lots of ways to slow down normal
> > operation in order to protect the post-mortem error capture, but what it
> > we take the opposite approach and freeze the machine whilst the error
> > capture runs (note the GPU may still running, but as long as we don't
> > process any of the results the driver's bookkeeping will be static).
> > 
> > Note that by of itself, this is not a complete fix. It also depends on
> > the compiler barriers in list_add/list_del to prevent traversing the
> > lists into the void.
> 
> The other important bit I think are NULL checks. I think the commit
> message should mention that too.

Not so convinced here. My idea for GPU error capture is that we
basically grab the bad request and then probe from there. The only list
(and pointer chasing) I want to walk in a dangerous manner is the request
list. Everything we need to report should be derivable from that
request, and as the request is reachable from the list we know it is
intact. I guess we could err on !request->fence.refcount and run away.

> > v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
> > stop_machine() itself for its wbinvd fallback.
> 
> Droppingt he clflush from error capture seems like a pretty big
> regression, at least at a glance. Why does this not result in piles of
> corrupted error state captures?

We replace the clflush with an ever bigger complete cache flush.

However...

> I guess since we need to flush on incoherent platforms before gpu access
> anyway this is mostly true (as long as userspace doesn't do something
> silly), but I think it should be captured in a comment at the place of the
> clflush.

Indeed. All the data we read has been flushed out of the cpu caches.
If not, then right there is a reason for a hang.  It's moot as I
propose just using the GTT for accessing every page since it is the only
universal method we have - and it has been my preference for GPU capture
because it gives us coherent data as the GPU would see it. We only
stopped doing so more recently when the batches were not accessible from
the GTT.
-Chris
Daniel Vetter Aug. 5, 2016, 7:57 p.m. UTC | #3
On Fri, Aug 05, 2016 at 08:01:58PM +0100, Chris Wilson wrote:
> On Fri, Aug 05, 2016 at 08:50:11PM +0200, Daniel Vetter wrote:
> > On Fri, Aug 05, 2016 at 10:05:53AM +0100, Chris Wilson wrote:
> > > The error state is purposefully racy as we expect it to be called at any
> > > time and so have avoided any locking whilst capturing the crash dump.
> > > However, with multi-engine GPUs and multiple CPUs, those races can
> > > manifest into OOPSes as we attempt to chase dangling pointers freed on
> > > other CPUs. Under discussion are lots of ways to slow down normal
> > > operation in order to protect the post-mortem error capture, but what it
> > > we take the opposite approach and freeze the machine whilst the error
> > > capture runs (note the GPU may still running, but as long as we don't
> > > process any of the results the driver's bookkeeping will be static).
> > > 
> > > Note that by of itself, this is not a complete fix. It also depends on
> > > the compiler barriers in list_add/list_del to prevent traversing the
> > > lists into the void.
> > 
> > The other important bit I think are NULL checks. I think the commit
> > message should mention that too.
> 
> Not so convinced here. My idea for GPU error capture is that we
> basically grab the bad request and then probe from there. The only list
> (and pointer chasing) I want to walk in a dangerous manner is the request
> list. Everything we need to report should be derivable from that
> request, and as the request is reachable from the list we know it is
> intact. I guess we could err on !request->fence.refcount and run away.

Yeah, this story is more convincing, would be good to add it.
-Daniel

> 
> > > v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
> > > stop_machine() itself for its wbinvd fallback.
> > 
> > Droppingt he clflush from error capture seems like a pretty big
> > regression, at least at a glance. Why does this not result in piles of
> > corrupted error state captures?
> 
> We replace the clflush with an ever bigger complete cache flush.
> 
> However...
> 
> > I guess since we need to flush on incoherent platforms before gpu access
> > anyway this is mostly true (as long as userspace doesn't do something
> > silly), but I think it should be captured in a comment at the place of the
> > clflush.
> 
> Indeed. All the data we read has been flushed out of the cpu caches.
> If not, then right there is a reason for a hang.  It's moot as I
> propose just using the GTT for accessing every page since it is the only
> universal method we have - and it has been my preference for GPU capture
> because it gives us coherent data as the GPU would see it. We only
> stopped doing so more recently when the batches were not accessible from
> the GTT.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..7badcee88ebf 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -4,6 +4,7 @@  config DRM_I915
 	depends on X86 && PCI
 	select INTEL_GTT
 	select INTERVAL_TREE
+	select STOP_MACHINE
 	# we need shmfs for the swappable backing store, and in particular
 	# the shmem_readpage() which depends upon tmpfs
 	select SHMEM
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7054baa92061..9a7ffa49cd1f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -481,6 +481,8 @@  struct drm_i915_error_state {
 	struct kref ref;
 	struct timeval time;
 
+	struct drm_i915_private *i915;
+
 	char error_msg[128];
 	bool simulated;
 	int iommu;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ced296983caa..b94a59733cf8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -28,6 +28,7 @@ 
  */
 
 #include <generated/utsrelease.h>
+#include <linux/stop_machine.h>
 #include "i915_drv.h"
 
 static const char *engine_str(int engine)
@@ -684,14 +685,12 @@  i915_error_object_create(struct drm_i915_private *dev_priv,
 
 	dst->page_count = num_pages;
 	while (num_pages--) {
-		unsigned long flags;
 		void *d;
 
 		d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
 		if (d == NULL)
 			goto unwind;
 
-		local_irq_save(flags);
 		if (use_ggtt) {
 			void __iomem *s;
 
@@ -710,15 +709,10 @@  i915_error_object_create(struct drm_i915_private *dev_priv,
 
 			page = i915_gem_object_get_page(src, i);
 
-			drm_clflush_pages(&page, 1);
-
 			s = kmap_atomic(page);
 			memcpy(d, s, PAGE_SIZE);
 			kunmap_atomic(s);
-
-			drm_clflush_pages(&page, 1);
 		}
-		local_irq_restore(flags);
 
 		dst->pages[i++] = d;
 		reloc_offset += PAGE_SIZE;
@@ -1371,6 +1365,32 @@  static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
 	error->suspend_count = dev_priv->suspend_count;
 }
 
+static int capture(void *data)
+{
+	struct drm_i915_error_state *error = data;
+
+	/* Ensure that what we readback from memory matches what the GPU sees */
+	wbinvd();
+
+	i915_capture_gen_state(error->i915, error);
+	i915_capture_reg_state(error->i915, error);
+	i915_gem_record_fences(error->i915, error);
+	i915_gem_record_rings(error->i915, error);
+
+	i915_capture_active_buffers(error->i915, error);
+	i915_capture_pinned_buffers(error->i915, error);
+
+	do_gettimeofday(&error->time);
+
+	error->overlay = intel_overlay_capture_error_state(error->i915);
+	error->display = intel_display_capture_error_state(error->i915);
+
+	/* And make sure we don't leave trash in the CPU cache */
+	wbinvd();
+
+	return 0;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1399,19 +1419,9 @@  void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	}
 
 	kref_init(&error->ref);
+	error->i915 = dev_priv;
 
-	i915_capture_gen_state(dev_priv, error);
-	i915_capture_reg_state(dev_priv, error);
-	i915_gem_record_fences(dev_priv, error);
-	i915_gem_record_rings(dev_priv, error);
-
-	i915_capture_active_buffers(dev_priv, error);
-	i915_capture_pinned_buffers(dev_priv, error);
-
-	do_gettimeofday(&error->time);
-
-	error->overlay = intel_overlay_capture_error_state(dev_priv);
-	error->display = intel_display_capture_error_state(dev_priv);
+	stop_machine(capture, error, NULL);
 
 	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
 	DRM_INFO("%s\n", error->error_msg);