diff mbox

[18/48] drm/i915: Better reset handling for contexts

Message ID 1386367941-7131-18-git-send-email-benjamin.widawsky@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ben Widawsky Dec. 6, 2013, 10:11 p.m. UTC
From: Ben Widawsky <ben@bwidawsk.net>

This patch adds the following change for contexts on reset:
Sets last context to default - this will prevent the context switch
happening after a reset. That switch is not possible because the
rings are hung during reset and context switch requires reset. This
behavior will need to be reworked in the future, but this is what we
want for now.

In the future, we'll also want to reset the guilty context to
uninitialized. We should wait for ARB_Robustness related code to land
for that.

This is somewhat for paranoia.  Because we really don't know what the
GPU was doing when it hung, or the state it was in (mid context write,
for example), later restoring the context is a bad idea. By setting the
flag to not initialized, the next load of that context will not restore
the state, and thus the subsequent switch away from the context will
overwrite the old data.

NOTE: This code needs a fixup when we actually have multiple VMs. The
issue that can occur is inactive objects in a VM will need to be
destroyed before the last context unref. This can now happen via the
fake switch introduced in this patch (and in other ways in the future)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem.c         |  2 ++
 drivers/gpu/drm/i915/i915_gem_context.c | 43 +++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+)

Comments

Daniel Vetter Dec. 18, 2013, 2:21 p.m. UTC | #1
On Fri, Dec 06, 2013 at 02:11:03PM -0800, Ben Widawsky wrote:
> From: Ben Widawsky <ben@bwidawsk.net>
> 
> This patch adds to changes for contexts on reset:
> Sets last context to default - this will prevent the context switch
> happening after a reset. That switch is not possible because the
> rings are hung during reset and context switch requires reset. This
> behavior will need to be reworked in the future, but this is what we
> want for now.
> 
> In the future, we'll also want to reset the guilty context to
> uninitialized. We should wait for ARB_Robustness related code to land
> for that.
> 
> This is somewhat for paranoia.  Because we really don't know what the
> GPU was doing when it hung, or the state it was in (mid context write,
> for example), later restoring the context is a bad idea. By setting the
> flag to not initialized, the next load of that context will not restore
> the state, and thus on the subsequent switch away from the context will
> overwrite the old data.
> 
> NOTE: This code needs a fixup when we actually have multiple VMs. The
> issue that can occur is inactive objects in a VM will need to be
> destroyed before the last context unref. This can now happen via the
> fake switch introduced in this patch (and it other ways in the future)
> 
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>

Have we actually seen a gpu hang due to context restore after reset? Afaik
most of this stuff just loads a bit of register state, so as long as we
don't run any commands I don't see many issues.

The real fix here is to not run any batches any more, which some of the
arb robustness stuff from Mika will eventually do. So if this doesn't fix
any real-world issues (testcases pls) I'll revert it again ...
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem.c         |  2 ++
>  drivers/gpu/drm/i915/i915_gem_context.c | 43 +++++++++++++++++++++++++++++++++
>  3 files changed, 46 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 1d4651c..37e8208050 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2222,6 +2222,7 @@ i915_gem_obj_ggtt_pin(struct drm_i915_gem_object *obj,
>  /* i915_gem_context.c */
>  int __must_check i915_gem_context_init(struct drm_device *dev);
>  void i915_gem_context_fini(struct drm_device *dev);
> +void i915_gem_context_reset(struct drm_device *dev);
>  int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
>  void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
>  int i915_switch_context(struct intel_ring_buffer *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 169f673..030d9ce 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2412,6 +2412,8 @@ void i915_gem_reset(struct drm_device *dev)
>  
>  	i915_gem_cleanup_ringbuffer(dev);
>  
> +	i915_gem_context_reset(dev);
> +
>  	i915_gem_restore_fences(dev);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 0c2ff5a..688e093 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -255,6 +255,49 @@ err_destroy:
>  	return ret;
>  }
>  
> +void i915_gem_context_reset(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_ring_buffer *ring;
> +	int i;
> +
> +	if (!HAS_HW_CONTEXTS(dev))
> +		return;
> +
> +	/* Prevent the hardware from restoring the last context (which hung) on
> +	 * the next switch */
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		struct i915_hw_context *dctx;
> +		if (!(INTEL_INFO(dev)->ring_mask & (1<<i)))
> +			continue;
> +
> +		/* Do a fake switch to the default context */
> +		ring = &dev_priv->ring[i];
> +		dctx = ring->default_context;
> +		if (WARN_ON(!dctx))
> +			continue;
> +
> +		if (!ring->last_context)
> +			continue;
> +
> +		if (ring->last_context == dctx)
> +			continue;
> +
> +		if (i == RCS) {
> +			WARN_ON(i915_gem_obj_ggtt_pin(dctx->obj,
> +						      get_context_alignment(dev),
> +						      false, false));
> +			/* Fake a finish/inactive */
> +			dctx->obj->base.write_domain = 0;
> +			dctx->obj->active = 0;
> +		}
> +
> +		i915_gem_context_unreference(ring->last_context);
> +		i915_gem_context_reference(dctx);
> +		ring->last_context = dctx;
> +	}
> +}
> +
>  int i915_gem_context_init(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -- 
> 1.8.4.2
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d4651c..37e8208050 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2222,6 +2222,7 @@  i915_gem_obj_ggtt_pin(struct drm_i915_gem_object *obj,
 /* i915_gem_context.c */
 int __must_check i915_gem_context_init(struct drm_device *dev);
 void i915_gem_context_fini(struct drm_device *dev);
+void i915_gem_context_reset(struct drm_device *dev);
 int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct intel_ring_buffer *ring,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 169f673..030d9ce 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2412,6 +2412,8 @@  void i915_gem_reset(struct drm_device *dev)
 
 	i915_gem_cleanup_ringbuffer(dev);
 
+	i915_gem_context_reset(dev);
+
 	i915_gem_restore_fences(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 0c2ff5a..688e093 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -255,6 +255,49 @@  err_destroy:
 	return ret;
 }
 
+void i915_gem_context_reset(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring;
+	int i;
+
+	if (!HAS_HW_CONTEXTS(dev)) /* nothing to do without HW contexts */
+		return;
+
+	/* After a reset, make each ring's last context its default context so
+	 * the next switch does not restore the last context (which hung) */
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		struct i915_hw_context *dctx;
+		if (!(INTEL_INFO(dev)->ring_mask & (1<<i)))
+			continue;
+
+		/* Do a fake switch to this ring's default context */
+		ring = &dev_priv->ring[i];
+		dctx = ring->default_context;
+		if (WARN_ON(!dctx))
+			continue;
+
+		if (!ring->last_context) /* no last context recorded */
+			continue;
+
+		if (ring->last_context == dctx) /* already the default */
+			continue;
+
+		if (i == RCS) {
+			WARN_ON(i915_gem_obj_ggtt_pin(dctx->obj,
+						      get_context_alignment(dev),
+						      false, false));
+			/* Fake a finish/inactive on the context object */
+			dctx->obj->base.write_domain = 0;
+			dctx->obj->active = 0;
+		}
+
+		i915_gem_context_unreference(ring->last_context);
+		i915_gem_context_reference(dctx);
+		ring->last_context = dctx;
+	}
+}
+
 int i915_gem_context_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;