diff mbox

[v2] drm/i915: Rework GPU reset sequence to match driver load & thaw

Message ID 1407228427-27797-1-git-send-email-alistair.mcaulay@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

alistair.mcaulay@intel.com Aug. 5, 2014, 8:47 a.m. UTC
From: "McAulay, Alistair" <alistair.mcaulay@intel.com>

This patch is to address Daniels concerns over different code during reset:

http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html

"The reason for aiming as hard as possible to use the exact same code for
driver load, gpu reset and runtime pm/system resume is that we've simply
seen too many bugs due to slight variations and unintended omissions."

Tested using igt drv_hangman.

V2: Cleaner way of preventing check_wedge returning -EAGAIN

Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  6 +++
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++
 drivers/gpu/drm/i915/i915_gem.c         |  6 +--
 drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
 drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
 drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
 6 files changed, 23 insertions(+), 104 deletions(-)

Comments

alistair.mcaulay@intel.com Aug. 6, 2014, 12:58 p.m. UTC | #1
Hi Daniel,

I think this new patch fixes your issues with the previous one, can you please let me know.

Thanks,
Alistair.

> -----Original Message-----
> From: Mcaulay, Alistair
> Sent: Tuesday, August 05, 2014 9:47 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: Mcaulay, Alistair
> Subject: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver
> load & thaw
> 
> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> 
> This patch is to address Daniels concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> 
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 23 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
> 
> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> 
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> +	bool reload_in_reset;
>  };
> 
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
> 
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
> 
>  	return 0;
> @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> 
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
> 
> -void i915_gem_context_reset(struct drm_device *dev) -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which
> hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> -
> get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -
>  int i915_gem_context_init(struct drm_device *dev)  {
>  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
> *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
> 
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> 
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
> 
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t addr,
> 
>  /* Broadwell Page Directory Pointer Descriptors */  static int
> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
> 
>  	BUG_ON(entry >= 4);
> 
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> *ring, unsigned entry,  }
> 
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
> 
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> *ppgtt,
> 
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt
> *ppgtt)  }
> 
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> 
> -	if (!synchronous)
> -		return 0;
> 
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> 
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> 
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);  };
> 
> --
> 2.0.0
Mika Kuoppala Aug. 6, 2014, 4:24 p.m. UTC | #2
Hi,

alistair.mcaulay@intel.com writes:

> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
>
> This patch is to address Daniels concerns over different code during reset:
>
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
>
> Tested using igt drv_hangman.
>
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
>
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 23 insertions(+), 104 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
>  
> +		/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>  
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
> +	bool reload_in_reset;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ef047bc..14e1770 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
>  
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
>  
>  	return 0;
> @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>  
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
>  
> -void i915_gem_context_reset(struct drm_device *dev)
> -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> -						      get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -

I am with Daniel on this one. I don't understand how can we throw
everything in here away.

We need to force hw to switch to a working context, after reset,
so that our internal state tracking matches.

Further, if we aim to more unification I think we should make it
so that the initial render state will get run, also after reset.

If we cleanup the last context for each ring set default context
carefully, i915_gem_context_enable() will then switch to default contexts
and reinit them using the initial render state. Something like this:

void i915_gem_context_reset(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	int i;

	for (i = 0; i < I915_NUM_RINGS; i++) {
		struct intel_engine_cs *ring = &dev_priv->ring[i];
		struct intel_context *lctx = ring->last_context;
		struct intel_context *dctx = ring->default_context;

		if (lctx) {
			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
				i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);

			i915_gem_context_unreference(lctx);
			ring->last_context = NULL;
		}

		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
			dctx->legacy_hw_ctx.rcs_state->active = 0;
			dctx->legacy_hw_ctx.initialized = false;
		}
	}
}

The state would be closer of what we get after module reload.

-Mika

>  int i915_gem_context_init(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
>  
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
>  
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
>  
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  /* Broadwell Page Directory Pointer Descriptors */
>  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
>  
>  	BUG_ON(entry >= 4);
>  
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>  }
>  
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
>  
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
>  }
>  
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> -	if (!synchronous)
> -		return 0;
>  
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> @@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>  
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> -- 
> 2.0.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
alistair.mcaulay@intel.com Aug. 15, 2014, 1:33 p.m. UTC | #3
Hi Mika / Daniel,

below is the basic code path of a reset which has been changed by my patch:

i915_reset()
{
	....
	i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. 
	.....
	i915_gem_init_hw()
		.....
		i915_gem_context_enable() -> This used to return during reset. Now it doesn't
		.....
			for each ring, i915_switch_context(default)
				do_switch();
		.....

	.....
}

" I am with Daniel on this one. I don't understand how can we throw everything in here away."
Did you maybe mean Ben?
Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.

" We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

Alistair.

> -----Original Message-----
> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
> Sent: Wednesday, August 06, 2014 5:25 PM
> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
> match driver load & thaw
> 
> 
> Hi,
> 
> alistair.mcaulay@intel.com writes:
> 
> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> >
> > This patch is to address Daniels concerns over different code during reset:
> >
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> >
> > "The reason for aiming as hard as possible to use the exact same code
> > for driver load, gpu reset and runtime pm/system resume is that we've
> > simply seen too many bugs due to slight variations and unintended
> omissions."
> >
> > Tested using igt drv_hangman.
> >
> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
> >
> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  6 files changed, 23 insertions(+), 104 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >  			!dev_priv->ums.mm_suspended) {
> >  		dev_priv->ums.mm_suspended = 0;
> >
> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> > +		dev_priv->gpu_error.reload_in_reset = true;
> > +
> >  		ret = i915_gem_init_hw(dev);
> > +
> > +		dev_priv->gpu_error.reload_in_reset = false;
> > +
> >  		mutex_unlock(&dev->struct_mutex);
> >  		if (ret) {
> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 991b663..116daff 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> >
> >  	/* For missed irq/seqno simulation. */
> >  	unsigned int test_irq_rings;
> > +
> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> > +	bool reload_in_reset;
> >  };
> >
> >  enum modeset_restore {
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
> >  		if (i915_terminally_wedged(error))
> >  			return -EIO;
> >
> > -		return -EAGAIN;
> > +		/* Check if GPU Reset is in progress */
> > +		if (!error->reload_in_reset)
> > +			return -EAGAIN;
> >  	}
> >
> >  	return 0;
> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
> >  	for_each_ring(ring, dev_priv, i)
> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> >
> > -	i915_gem_context_reset(dev);
> > -
> >  	i915_gem_restore_fences(dev);
> >  }
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > b/drivers/gpu/drm/i915/i915_gem_context.c
> > index de72a28..d96219f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -372,42 +372,6 @@ err_destroy:
> >  	return ERR_PTR(ret);
> >  }
> >
> > -void i915_gem_context_reset(struct drm_device *dev) -{
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > -	int i;
> > -
> > -	/* Prevent the hardware from restoring the last context (which
> hung) on
> > -	 * the next switch */
> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > -		struct intel_context *dctx = ring->default_context;
> > -		struct intel_context *lctx = ring->last_context;
> > -
> > -		/* Do a fake switch to the default context */
> > -		if (lctx == dctx)
> > -			continue;
> > -
> > -		if (!lctx)
> > -			continue;
> > -
> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> > -
> get_context_alignment(dev), 0));
> > -			/* Fake a finish/inactive */
> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > -		}
> > -
> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> > -
> > -		i915_gem_context_unreference(lctx);
> > -		i915_gem_context_reference(dctx);
> > -		ring->last_context = dctx;
> > -	}
> > -}
> > -
> 
> I am with Daniel on this one. I don't understand how can we throw
> everything in here away.
> 
> We need to force hw to switch to a working context, after reset, so that our
> internal state tracking matches.
> 
> Further, if we aim to more unification I think we should make it so that the
> initial render state will get run, also after reset.
> 
> If we cleanup the last context for each ring set default context carefully,
> i915_gem_context_enable() will then switch to default contexts and reinit
> them using the initial render state. Something like this:
> 
> void i915_gem_context_reset(struct drm_device *dev) {
> 	struct drm_i915_private *dev_priv = dev->dev_private;
> 	int i;
> 
> 	for (i = 0; i < I915_NUM_RINGS; i++) {
> 		struct intel_engine_cs *ring = &dev_priv->ring[i];
> 		struct intel_context *lctx = ring->last_context;
> 		struct intel_context *dctx = ring->default_context;
> 
> 		if (lctx) {
> 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> 				i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> 
> 			i915_gem_context_unreference(lctx);
> 			ring->last_context = NULL;
> 		}
> 
> 		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> 			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> 			dctx->legacy_hw_ctx.rcs_state->active = 0;
> 			dctx->legacy_hw_ctx.initialized = false;
> 		}
> 	}
> }
> 
> The state would be closer of what we get after module reload.
> 
> -Mika
> 
> >  int i915_gem_context_init(struct drm_device *dev)  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
> *dev_priv)
> >  		ppgtt->enable(ppgtt);
> >  	}
> >
> > -	/* FIXME: We should make this work, even in reset */
> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > -		return 0;
> > -
> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >
> >  	for_each_ring(ring, dev_priv, i) {
> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >  	from = ring->last_context;
> >
> >  	if (USES_FULL_PPGTT(ring->dev)) {
> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto unpin_out;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 5188936..450c8a9 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t
> > addr,
> >
> >  /* Broadwell Page Directory Pointer Descriptors */  static int
> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > -			   uint64_t val, bool synchronous)
> > +			   uint64_t val)
> >  {
> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  	int ret;
> >
> >  	BUG_ON(entry >= 4);
> >
> > -	if (synchronous) {
> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > -		return 0;
> > -	}
> > -
> >  	ret = intel_ring_begin(ring, 6);
> >  	if (ret)
> >  		return ret;
> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> > *ring, unsigned entry,  }
> >
> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	int i, ret;
> >
> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,
> >
> >  	for (i = used_pd - 1; i >= 0; i--) {
> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > +		ret = gen8_write_pdp(ring, i, addr);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> > i915_hw_ppgtt *ppgtt)  }
> >
> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous)
> > +			 struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	struct drm_device *dev = ppgtt->base.dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >
> > -	if (!synchronous)
> > -		return 0;
> >
> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7
> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto err_out;
> >  	}
> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >
> >  	for_each_ring(ring, dev_priv, i) {
> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index 8d6f7c1..bf1e4fc 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >
> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous);
> > +			 struct intel_engine_cs *ring);
> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);
> > };
> >
> > --
> > 2.0.0
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniel Vetter Aug. 15, 2014, 3:41 p.m. UTC | #4
On Fri, Aug 15, 2014 at 3:33 PM, Mcaulay, Alistair
<alistair.mcaulay@intel.com> wrote:
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
>         ....
>         i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed.
>         .....
>         i915_gem_init_hw()
>                 .....
>                 i915_gem_context_enable() -> This used to return during reset. Now it doesn't
>                 .....
>                         for each ring, i915_switch_context(default)
>                                 do_switch();
>                 .....
>
>         .....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.

I'm happy with the underlying fix, but I didn't check all the details
and instead signed up Mika for that. Helps me scale and also makes
sure that more people understand tricky parts.

> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

I'm half in a plane already, so I'll leave it to you and Mika to
figure this out. Maybe the only thing missing is a bit more
explanation in the commit message why exactly we can remove all that
code. On a quick glance both Mika's concern and your reply make sense.
-Daniel
Mika Kuoppala Aug. 15, 2014, 5:03 p.m. UTC | #5
"Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:

> Hi Mika / Daniel,
>
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
> 	....
> 	i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. 
> 	.....
> 	i915_gem_init_hw()
> 		.....
> 		i915_gem_context_enable() -> This used to return during reset. Now it doesn't
> 		.....
> 			for each ring, i915_switch_context(default)
> 				do_switch();
> 		.....
>
> 	.....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.
>
> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

Our internal state tracking will be ok after i915_gem_context_enable()
has been called. All rings have been set to the default context.

But what happens with this sequence:

- render ring was running in default context 
- reset happens
- we call i915_gem_context_enable 
- do_switch will get called 
- it figure out that last context is the same as we are switching to
  (from == to) and it bails out
- we never wrote anything to ring, so we have pre reset context running.
  MI_SET_CONTEXT was never run.

Even if reset would not clear the CCID, I think it is safest to
force a MI_SET_CONTEXT here.

Further, if the default context was mangled before the reset,
we should reinitialize it to a known state by running
i915_gem_render_state_init() for it. But this can be
considered as a possible future work.

-Mika

> Alistair.
>
>> -----Original Message-----
>> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
>> Sent: Wednesday, August 06, 2014 5:25 PM
>> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
>> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
>> match driver load & thaw
>> 
>> 
>> Hi,
>> 
>> alistair.mcaulay@intel.com writes:
>> 
>> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
>> >
>> > This patch is to address Daniels concerns over different code during reset:
>> >
>> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>> >
>> > "The reason for aiming as hard as possible to use the exact same code
>> > for driver load, gpu reset and runtime pm/system resume is that we've
>> > simply seen too many bugs due to slight variations and unintended
>> omissions."
>> >
>> > Tested using igt drv_hangman.
>> >
>> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
>> >
>> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>> >  6 files changed, 23 insertions(+), 104 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
>> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.c
>> > +++ b/drivers/gpu/drm/i915/i915_drv.c
>> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>> >  			!dev_priv->ums.mm_suspended) {
>> >  		dev_priv->ums.mm_suspended = 0;
>> >
>> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
>> during gpu reset */
>> > +		dev_priv->gpu_error.reload_in_reset = true;
>> > +
>> >  		ret = i915_gem_init_hw(dev);
>> > +
>> > +		dev_priv->gpu_error.reload_in_reset = false;
>> > +
>> >  		mutex_unlock(&dev->struct_mutex);
>> >  		if (ret) {
>> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
>> --git
>> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index 991b663..116daff 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>> >
>> >  	/* For missed irq/seqno simulation. */
>> >  	unsigned int test_irq_rings;
>> > +
>> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
>> gpu reset   */
>> > +	bool reload_in_reset;
>> >  };
>> >
>> >  enum modeset_restore {
>> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem.c
>> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
>> *error,
>> >  		if (i915_terminally_wedged(error))
>> >  			return -EIO;
>> >
>> > -		return -EAGAIN;
>> > +		/* Check if GPU Reset is in progress */
>> > +		if (!error->reload_in_reset)
>> > +			return -EAGAIN;
>> >  	}
>> >
>> >  	return 0;
>> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>> >  	for_each_ring(ring, dev_priv, i)
>> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>> >
>> > -	i915_gem_context_reset(dev);
>> > -
>> >  	i915_gem_restore_fences(dev);
>> >  }
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> > b/drivers/gpu/drm/i915/i915_gem_context.c
>> > index de72a28..d96219f 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> > @@ -372,42 +372,6 @@ err_destroy:
>> >  	return ERR_PTR(ret);
>> >  }
>> >
>> > -void i915_gem_context_reset(struct drm_device *dev) -{
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> > -	int i;
>> > -
>> > -	/* Prevent the hardware from restoring the last context (which
>> hung) on
>> > -	 * the next switch */
>> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
>> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> > -		struct intel_context *dctx = ring->default_context;
>> > -		struct intel_context *lctx = ring->last_context;
>> > -
>> > -		/* Do a fake switch to the default context */
>> > -		if (lctx == dctx)
>> > -			continue;
>> > -
>> > -		if (!lctx)
>> > -			continue;
>> > -
>> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
>> >legacy_hw_ctx.rcs_state,
>> > -
>> get_context_alignment(dev), 0));
>> > -			/* Fake a finish/inactive */
>> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> > -		}
>> > -
>> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> > -			i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> > -
>> > -		i915_gem_context_unreference(lctx);
>> > -		i915_gem_context_reference(dctx);
>> > -		ring->last_context = dctx;
>> > -	}
>> > -}
>> > -
>> 
>> I am with Daniel on this one. I don't understand how can we throw
>> everything in here away.
>> 
>> We need to force hw to switch to a working context, after reset, so that our
>> internal state tracking matches.
>> 
>> Further, if we aim to more unification I think we should make it so that the
>> initial render state will get run, also after reset.
>> 
>> If we cleanup the last context for each ring set default context carefully,
>> i915_gem_context_enable() will then switch to default contexts and reinit
>> them using the initial render state. Something like this:
>> 
>> void i915_gem_context_reset(struct drm_device *dev) {
>> 	struct drm_i915_private *dev_priv = dev->dev_private;
>> 	int i;
>> 
>> 	for (i = 0; i < I915_NUM_RINGS; i++) {
>> 		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> 		struct intel_context *lctx = ring->last_context;
>> 		struct intel_context *dctx = ring->default_context;
>> 
>> 		if (lctx) {
>> 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> 				i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> 
>> 			i915_gem_context_unreference(lctx);
>> 			ring->last_context = NULL;
>> 		}
>> 
>> 		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> 			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> 			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> 			dctx->legacy_hw_ctx.initialized = false;
>> 		}
>> 	}
>> }
>> 
>> The state would be closer of what we get after module reload.
>> 
>> -Mika
>> 
>> >  int i915_gem_context_init(struct drm_device *dev)  {
>> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
>> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
>> *dev_priv)
>> >  		ppgtt->enable(ppgtt);
>> >  	}
>> >
>> > -	/* FIXME: We should make this work, even in reset */
>> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
>> > -		return 0;
>> > -
>> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>> >  	from = ring->last_context;
>> >
>> >  	if (USES_FULL_PPGTT(ring->dev)) {
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto unpin_out;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > index 5188936..450c8a9 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
>> iris_pte_encode(dma_addr_t
>> > addr,
>> >
>> >  /* Broadwell Page Directory Pointer Descriptors */  static int
>> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>> > -			   uint64_t val, bool synchronous)
>> > +			   uint64_t val)
>> >  {
>> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> >  	int ret;
>> >
>> >  	BUG_ON(entry >= 4);
>> >
>> > -	if (synchronous) {
>> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
>> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
>> > -		return 0;
>> > -	}
>> > -
>> >  	ret = intel_ring_begin(ring, 6);
>> >  	if (ret)
>> >  		return ret;
>> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
>> > *ring, unsigned entry,  }
>> >
>> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	int i, ret;
>> >
>> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,
>> >
>> >  	for (i = used_pd - 1; i >= 0; i--) {
>> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
>> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
>> > +		ret = gen8_write_pdp(ring, i, addr);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
>> > i915_hw_ppgtt *ppgtt)  }
>> >
>> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous)
>> > +			 struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	struct drm_device *dev = ppgtt->base.dev;
>> >  	struct drm_i915_private *dev_priv = dev->dev_private;
>> >
>> > -	if (!synchronous)
>> > -		return 0;
>> >
>> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
>> 852,7
>> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto err_out;
>> >  	}
>> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  	I915_WRITE(GFX_MODE,
>> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > index 8d6f7c1..bf1e4fc 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>> >
>> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous);
>> > +			 struct intel_engine_cs *ring);
>> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
>> *m);
>> > };
>> >
>> > --
>> > 2.0.0
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx@lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5e4fefd..3bfafe6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -806,7 +806,13 @@  int i915_reset(struct drm_device *dev)
 			!dev_priv->ums.mm_suspended) {
 		dev_priv->ums.mm_suspended = 0;
 
+		/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
+		dev_priv->gpu_error.reload_in_reset = true;
+
 		ret = i915_gem_init_hw(dev);
+
+		dev_priv->gpu_error.reload_in_reset = false;
+
 		mutex_unlock(&dev->struct_mutex);
 		if (ret) {
 			DRM_ERROR("Failed hw init on reset %d\n", ret);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 991b663..116daff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1217,6 +1217,9 @@  struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
+
+	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
+	bool reload_in_reset;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ef047bc..14e1770 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1085,7 +1085,9 @@  i915_gem_check_wedge(struct i915_gpu_error *error,
 		if (i915_terminally_wedged(error))
 			return -EIO;
 
-		return -EAGAIN;
+		/* Check if GPU Reset is in progress */
+		if (!error->reload_in_reset)
+			return -EAGAIN;
 	}
 
 	return 0;
@@ -2590,8 +2592,6 @@  void i915_gem_reset(struct drm_device *dev)
 	for_each_ring(ring, dev_priv, i)
 		i915_gem_reset_ring_cleanup(dev_priv, ring);
 
-	i915_gem_context_reset(dev);
-
 	i915_gem_restore_fences(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index de72a28..d96219f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -372,42 +372,6 @@  err_destroy:
 	return ERR_PTR(ret);
 }
 
-void i915_gem_context_reset(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	/* Prevent the hardware from restoring the last context (which hung) on
-	 * the next switch */
-	for (i = 0; i < I915_NUM_RINGS; i++) {
-		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_context *dctx = ring->default_context;
-		struct intel_context *lctx = ring->last_context;
-
-		/* Do a fake switch to the default context */
-		if (lctx == dctx)
-			continue;
-
-		if (!lctx)
-			continue;
-
-		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
-			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
-						      get_context_alignment(dev), 0));
-			/* Fake a finish/inactive */
-			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
-			dctx->legacy_hw_ctx.rcs_state->active = 0;
-		}
-
-		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
-			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
-
-		i915_gem_context_unreference(lctx);
-		i915_gem_context_reference(dctx);
-		ring->last_context = dctx;
-	}
-}
-
 int i915_gem_context_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -498,10 +462,6 @@  int i915_gem_context_enable(struct drm_i915_private *dev_priv)
 		ppgtt->enable(ppgtt);
 	}
 
-	/* FIXME: We should make this work, even in reset */
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
-		return 0;
-
 	BUG_ON(!dev_priv->ring[RCS].default_context);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -645,7 +605,7 @@  static int do_switch(struct intel_engine_cs *ring,
 	from = ring->last_context;
 
 	if (USES_FULL_PPGTT(ring->dev)) {
-		ret = ppgtt->switch_mm(ppgtt, ring, false);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto unpin_out;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 5188936..450c8a9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -216,19 +216,12 @@  static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
 
 /* Broadwell Page Directory Pointer Descriptors */
 static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
-			   uint64_t val, bool synchronous)
+			   uint64_t val)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret;
 
 	BUG_ON(entry >= 4);
 
-	if (synchronous) {
-		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
-		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
-		return 0;
-	}
-
 	ret = intel_ring_begin(ring, 6);
 	if (ret)
 		return ret;
@@ -245,8 +238,7 @@  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
 }
 
 static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	int i, ret;
 
@@ -255,7 +247,7 @@  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 
 	for (i = used_pd - 1; i >= 0; i--) {
 		dma_addr_t addr = ppgtt->pd_dma_addr[i];
-		ret = gen8_write_pdp(ring, i, addr, synchronous);
+		ret = gen8_write_pdp(ring, i, addr);
 		if (ret)
 			return ret;
 	}
@@ -724,29 +716,10 @@  static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 }
 
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous)
+			 struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -768,29 +741,10 @@  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -819,14 +773,11 @@  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ppgtt->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	if (!synchronous)
-		return 0;
 
 	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
 	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
@@ -852,7 +803,7 @@  static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto err_out;
 	}
@@ -897,7 +848,7 @@  static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
@@ -926,7 +877,7 @@  static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 
 	for_each_ring(ring, dev_priv, i) {
-		int ret = ppgtt->switch_mm(ppgtt, ring, true);
+		int ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 8d6f7c1..bf1e4fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -262,8 +262,7 @@  struct i915_hw_ppgtt {
 
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous);
+			 struct intel_engine_cs *ring);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };