Message ID | 20161012090522.367-3-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Oct 12, 2016 at 10:05:19AM +0100, Chris Wilson wrote: > The error state is purposefully racy as we expect it to be called at any > time and so have avoided any locking whilst capturing the crash dump. > However, with multi-engine GPUs and multiple CPUs, those races can > manifest into OOPSes as we attempt to chase dangling pointers freed on > other CPUs. Under discussion are lots of ways to slow down normal > operation in order to protect the post-mortem error capture, but what it > we take the opposite approach and freeze the machine whilst the error > capture runs (note the GPU may still running, but as long as we don't > process any of the results the driver's bookkeeping will be static). > > Note that by of itself, this is not a complete fix. It also depends on > the compiler barriers in list_add/list_del to prevent traversing the > lists into the void. We also depend that we only require state from > carefully controlled sources - i.e. all the state we require for > post-mortem debugging should be reachable from the request itself so > that we only have to worry about retrieving the request carefully. Once > we have the request, we know that all pointers from it are intact. > > v2: Avoid drm_clflush_pages() inside stop_machine() as it may use > stop_machine() itself for its wbinvd fallback. Hm, won't this hurt us real bad on any atom with ppgtt? Maybe a big check gen check with a scary comment about why we can't call drm_clflush_pages on old machines? Iirc gen3+ should all be able to flush without stop_machine. With that addressed you can upgrade to Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>, but I think Mika should ack this too. -Daniel > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> > --- > drivers/gpu/drm/i915/Kconfig | 1 + > drivers/gpu/drm/i915/i915_drv.h | 2 ++ > drivers/gpu/drm/i915/i915_gpu_error.c | 46 +++++++++++++++++++++-------------- > 3 files changed, 31 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig > index 8844b99bd760..3eff42e4a441 100644 > --- a/drivers/gpu/drm/i915/Kconfig > +++ b/drivers/gpu/drm/i915/Kconfig > @@ -4,6 +4,7 @@ config DRM_I915 > depends on X86 && PCI > select INTEL_GTT > select INTERVAL_TREE > + select STOP_MACHINE > # we need shmfs for the swappable backing store, and in particular > # the shmem_readpage() which depends upon tmpfs > select SHMEM > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 380590b30bbf..4199e8aa436a 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -746,6 +746,8 @@ struct drm_i915_error_state { > struct kref ref; > struct timeval time; > > + struct drm_i915_private *i915; > + > char error_msg[128]; > bool simulated; > int iommu; > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index c88c0d192a60..159d6d7e0cee 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -28,6 +28,7 @@ > */ > > #include <generated/utsrelease.h> > +#include <linux/stop_machine.h> > #include "i915_drv.h" > > static const char *engine_str(int engine) > @@ -744,14 +745,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv, > > dst->page_count = num_pages; > while (num_pages--) { > - unsigned long flags; > void *d; > > d = kmalloc(PAGE_SIZE, GFP_ATOMIC); > if (d == NULL) > goto unwind; > > - local_irq_save(flags); > if (use_ggtt) { > void __iomem *s; > > @@ -770,15 +769,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv, > > page = i915_gem_object_get_page(src, i); > > - drm_clflush_pages(&page, 1); > - > s = kmap_atomic(page); > memcpy(d, s, PAGE_SIZE); > kunmap_atomic(s); > - > - drm_clflush_pages(&page, 1); > } > - local_irq_restore(flags); > > dst->pages[i++] = d; > reloc_offset += PAGE_SIZE; > @@ -1447,6 +1441,31 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv, > sizeof(error->device_info)); > } > > +static int capture(void *data) > +{ > + struct drm_i915_error_state *error = data; > + > + /* Ensure that what we readback from memory matches what the GPU sees */ > + wbinvd(); > + > + i915_capture_gen_state(error->i915, error); > + i915_capture_reg_state(error->i915, error); > + i915_gem_record_fences(error->i915, error); > + i915_gem_record_rings(error->i915, error); > + i915_capture_active_buffers(error->i915, error); > + i915_capture_pinned_buffers(error->i915, error); > + > + do_gettimeofday(&error->time); > + > + error->overlay = intel_overlay_capture_error_state(error->i915); > + error->display = intel_display_capture_error_state(error->i915); > + > + /* And make sure we don't leave trash in the CPU cache */ > + wbinvd(); > + > + return 0; > +} > + > /** > * i915_capture_error_state - capture an error record for later analysis > * @dev: drm device > @@ -1478,18 +1497,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, > } > > kref_init(&error->ref); > + error->i915 = dev_priv; > > - i915_capture_gen_state(dev_priv, error); > - i915_capture_reg_state(dev_priv, error); > - i915_gem_record_fences(dev_priv, error); > - i915_gem_record_rings(dev_priv, error); > - i915_capture_active_buffers(dev_priv, error); > - i915_capture_pinned_buffers(dev_priv, error); > - > - do_gettimeofday(&error->time); > - > - error->overlay = intel_overlay_capture_error_state(dev_priv); > - error->display = intel_display_capture_error_state(dev_priv); > + stop_machine(capture, error, NULL); > > i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); > DRM_INFO("%s\n", error->error_msg); > -- > 2.9.3 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Thu, Oct 13, 2016 at 04:57:39PM +0200, Daniel Vetter wrote: > On Wed, Oct 12, 2016 at 10:05:19AM +0100, Chris Wilson wrote: > > The error state is purposefully racy as we expect it to be called at any > > time and so have avoided any locking whilst capturing the crash dump. > > However, with multi-engine GPUs and multiple CPUs, those races can > > manifest into OOPSes as we attempt to chase dangling pointers freed on > > other CPUs. Under discussion are lots of ways to slow down normal > > operation in order to protect the post-mortem error capture, but what it > > we take the opposite approach and freeze the machine whilst the error > > capture runs (note the GPU may still running, but as long as we don't > > process any of the results the driver's bookkeeping will be static). > > > > Note that by of itself, this is not a complete fix. It also depends on > > the compiler barriers in list_add/list_del to prevent traversing the > > lists into the void. We also depend that we only require state from > > carefully controlled sources - i.e. all the state we require for > > post-mortem debugging should be reachable from the request itself so > > that we only have to worry about retrieving the request carefully. Once > > we have the request, we know that all pointers from it are intact. > > > > v2: Avoid drm_clflush_pages() inside stop_machine() as it may use > > stop_machine() itself for its wbinvd fallback. > > Hm, won't this hurt us real bad on any atom with ppgtt? Maybe a big check > gen check with a scary comment about why we can't call drm_clflush_pages > on old machines? Iirc gen3+ should all be able to flush without > stop_machine. :) Patch 2 switched to using coherent reads through the GTT for all. Everyone is now equal (and the nice part about that was that it uncovered the WC bug from kernel 4.0!) -Chris
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 8844b99bd760..3eff42e4a441 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -4,6 +4,7 @@ config DRM_I915 depends on X86 && PCI select INTEL_GTT select INTERVAL_TREE + select STOP_MACHINE # we need shmfs for the swappable backing store, and in particular # the shmem_readpage() which depends upon tmpfs select SHMEM diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 380590b30bbf..4199e8aa436a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -746,6 +746,8 @@ struct drm_i915_error_state { struct kref ref; struct timeval time; + struct drm_i915_private *i915; + char error_msg[128]; bool simulated; int iommu; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index c88c0d192a60..159d6d7e0cee 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -28,6 +28,7 @@ */ #include <generated/utsrelease.h> +#include <linux/stop_machine.h> #include "i915_drv.h" static const char *engine_str(int engine) @@ -744,14 +745,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv, dst->page_count = num_pages; while (num_pages--) { - unsigned long flags; void *d; d = kmalloc(PAGE_SIZE, GFP_ATOMIC); if (d == NULL) goto unwind; - local_irq_save(flags); if (use_ggtt) { void __iomem *s; @@ -770,15 +769,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv, page = i915_gem_object_get_page(src, i); - drm_clflush_pages(&page, 1); - s = kmap_atomic(page); memcpy(d, s, PAGE_SIZE); kunmap_atomic(s); - - drm_clflush_pages(&page, 1); } - local_irq_restore(flags); dst->pages[i++] = d; reloc_offset += PAGE_SIZE; @@ -1447,6 +1441,31 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv, sizeof(error->device_info)); } +static int capture(void *data) +{ + struct drm_i915_error_state *error = data; + + /* Ensure that what we readback from memory matches what the GPU sees */ + wbinvd(); + + i915_capture_gen_state(error->i915, error); + i915_capture_reg_state(error->i915, error); + i915_gem_record_fences(error->i915, error); + i915_gem_record_rings(error->i915, error); + i915_capture_active_buffers(error->i915, error); + i915_capture_pinned_buffers(error->i915, error); + + do_gettimeofday(&error->time); + + error->overlay = intel_overlay_capture_error_state(error->i915); + error->display = intel_display_capture_error_state(error->i915); + + /* And make sure we don't leave trash in the CPU cache */ + wbinvd(); + + return 0; +} + /** * i915_capture_error_state - capture an error record for later analysis * @dev: drm device @@ -1478,18 +1497,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, } kref_init(&error->ref); + error->i915 = dev_priv; - i915_capture_gen_state(dev_priv, error); - i915_capture_reg_state(dev_priv, error); - i915_gem_record_fences(dev_priv, error); - i915_gem_record_rings(dev_priv, error); - i915_capture_active_buffers(dev_priv, error); - i915_capture_pinned_buffers(dev_priv, error); - - do_gettimeofday(&error->time); - - error->overlay = intel_overlay_capture_error_state(dev_priv); - error->display = intel_display_capture_error_state(dev_priv); + stop_machine(capture, error, NULL); i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); DRM_INFO("%s\n", error->error_msg);