
drm/i915/vlv: Added/removed Render specific Hw Workarounds for VLV

Message ID 1389268309-13555-1-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived

Commit Message

akash.goel@intel.com Jan. 9, 2014, 11:51 a.m. UTC
From: akashgoe <akash.goel@intel.com>

The following changes lead to stable behavior, especially
when running 3D apps and benchmarks.

Added 4 new rendering specific Workarounds
1. WaTlbInvalidateStoreDataBefore :-
     Before pipecontrol with TLB invalidate set,
     need 2 store data commands
2. WaReadAfterWriteHazard :-
     Send 8 store dword commands after flush
     for read after write hazard
     (HSD Gen6 bug_de 3047871)
3. WaVSThreadDispatchOverride
     Performance optimization - Hw will
     decide which half slice the thread
     will dispatch. May not really be needed
     for VLV, as it is single slice
4. WaDisable_RenderCache_OperationalFlush
     Operational flush cannot be enabled on
     BWG A0 [Errata BWT006]

Removed 3 workarounds as not needed for VLV+(B0 onwards)
1. WaDisableRHWOOptimizationForRenderHang
2. WaDisableL3CacheAging
3. WaDisableDopClockGating

Modified the implementation of 1 workaround
1. WaDisableL3Bank2xClockGate
    Disabling L3 clock gating- MMIO 940c[25] = 1

Modified the programming of 2 registers in render ring init function
1. GFX_MODE_GEN7 (Enabling TLB invalidate)
2. MI_MODE (Enabling MI Flush)

Change-Id: Ib60f8dc7d2213552e498d588e1bba7eaa1a921ed
Signed-off-by: akashgoe <akash.goel@intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h         |  3 ++
 drivers/gpu/drm/i915/intel_pm.c         | 32 +++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 58 ++++++++++++++++++++++++++++++---
 3 files changed, 77 insertions(+), 16 deletions(-)

Comments

Jani Nikula Jan. 9, 2014, 12:26 p.m. UTC | #1
On Thu, 09 Jan 2014, akash.goel@intel.com wrote:
> From: akashgoe <akash.goel@intel.com>
>
> The following changes leads to stable behavior, especially
> when playing 3D Apps, benchmarks.
>
> Added 4 new rendering specific Workarounds
> 1. WaTlbInvalidateStoreDataBefore :-
>      Before pipecontrol with TLB invalidate set,
>      need 2 store data commands
> 2. WaReadAfterWriteHazard :-
>      Send 8 store dword commands after flush
>      for read after write hazard
>      (HSD Gen6 bug_de 3047871)
> 3. WaVSThreadDispatchOverride
>      Performance optimization - Hw will
>      decide which half slice the thread
>      will dispatch, May not be really needed
>      for VLV, as its single slice
> 4. WaDisable_RenderCache_OperationalFlush
>      Operational flush cannot be enabled on
>      BWG A0 [Errata BWT006]
>
> Removed 3 workarounds as not needed for VLV+(B0 onwards)
> 1. WaDisableRHWOOptimizationForRenderHang
> 2. WaDisableL3CacheAging
> 3. WaDisableDopClockGating
>
> Modified the implementation of 1 workaround
> 1. WaDisableL3Bank2xClockGate
>     Disabling L3 clock gating- MMIO 940c[25] = 1
>
> Modified the programming of 2 registers in render ring init function
> 1. GFX_MODE_GEN7 (Enabling TLB invalidate)
> 2. MI_MODE (Enabling MI Flush)


Frankly, I don't know much about these particular workarounds, but for
bisecting any regressions it would probably be a good idea to split the
patch per workaround touched.

BR,
Jani.

>
> Change-Id: Ib60f8dc7d2213552e498d588e1bba7eaa1a921ed
> Signed-off-by: akashgoe <akash.goel@intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_reg.h         |  3 ++
>  drivers/gpu/drm/i915/intel_pm.c         | 32 +++++++++++-------
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 58 ++++++++++++++++++++++++++++++---
>  3 files changed, 77 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index a699efd..d829754 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -934,6 +934,9 @@
>  #define   ECO_GATING_CX_ONLY	(1<<3)
>  #define   ECO_FLIP_DONE		(1<<0)
>  
> +#define GEN7_CACHE_MODE_0	0x07000 /* IVB+ only */
> +#define GEN7_RC_OP_FLUSH_ENABLE (1<<0)
> +
>  #define CACHE_MODE_1		0x7004 /* IVB+ */
>  #define   PIXEL_SUBSPAN_COLLECT_OPT_DISABLE (1<<6)
>  
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 469170c..e4d220c 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -4947,22 +4947,18 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
>  		   _MASKED_BIT_ENABLE(GEN7_MAX_PS_THREAD_DEP |
>  				      GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
>  
> -	/* Apply the WaDisableRHWOOptimizationForRenderHang:vlv workaround. */
> -	I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
> -		   GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
> -
> -	/* WaApplyL3ControlAndL3ChickenMode:vlv */
> -	I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS);
>  	I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
>  
> +	/* WaDisable_RenderCache_OperationalFlush
> +	 * Clear bit 0, so we do a AND with the mask
> +	 * to keep other bits the same */
> +	I915_WRITE(GEN7_CACHE_MODE_0,  (I915_READ(GEN7_CACHE_MODE_0) |
> +			  _MASKED_BIT_DISABLE(GEN7_RC_OP_FLUSH_ENABLE)));
> +
>  	/* WaForceL3Serialization:vlv */
>  	I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
>  		   ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
>  
> -	/* WaDisableDopClockGating:vlv */
> -	I915_WRITE(GEN7_ROW_CHICKEN2,
> -		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> -
>  	/* This is required by WaCatErrorRejectionIssue:vlv */
>  	I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
>  		   I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
> @@ -4991,10 +4987,24 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
>  		   GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
>  		   GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
>  
> -	I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
> +	/* WaDisableL3Bank2xClockGate
> +	 * Disabling L3 clock gating- MMIO 940c[25] = 1
> +	 * Set bit 25, to disable L3_BANK_2x_CLK_GATING */
> +	I915_WRITE(GEN7_UCGCTL4,
> +		I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
>  
>  	I915_WRITE(MI_ARB_VLV, MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE);
>  
> +	/* WaVSThreadDispatchOverride
> +	 * Hw will decide which half slice the thread will dispatch.
> +	 * May not be needed for VLV, as its a single slice */
> +	I915_WRITE(GEN7_CACHE_MODE_0,
> +		I915_READ(GEN7_FF_THREAD_MODE) &
> +		(~GEN7_FF_VS_SCHED_LOAD_BALANCE));
> +
> +	/* WaDisable4x2SubspanOptimization,
> +	 * Disable combining of two 2x2 subspans into a 4x2 subspan
> +	 * Set chicken bit to disable subspan optimization */
>  	I915_WRITE(CACHE_MODE_1,
>  		   _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 442c9a6..c9350a1 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -563,7 +563,9 @@ static int init_render_ring(struct intel_ring_buffer *ring)
>  	int ret = init_ring_common(ring);
>  
>  	if (INTEL_INFO(dev)->gen > 3)
> -		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
> +		if (!IS_VALLEYVIEW(dev))
> +			I915_WRITE(MI_MODE,
> +				_MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
>  
>  	/* We need to disable the AsyncFlip performance optimisations in order
>  	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
> @@ -579,10 +581,17 @@ static int init_render_ring(struct intel_ring_buffer *ring)
>  		I915_WRITE(GFX_MODE,
>  			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_ALWAYS));
>  
> -	if (IS_GEN7(dev))
> -		I915_WRITE(GFX_MODE_GEN7,
> -			   _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
> -			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +	if (IS_GEN7(dev)) {
> +		if (IS_VALLEYVIEW(dev)) {
> +			I915_WRITE(GFX_MODE_GEN7,
> +				_MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +			I915_WRITE(MI_MODE, I915_READ(MI_MODE) |
> +				_MASKED_BIT_ENABLE(MI_FLUSH_ENABLE));
> +		} else
> +			I915_WRITE(GFX_MODE_GEN7,
> +				_MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
> +				_MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +	}
>  
>  	if (INTEL_INFO(dev)->gen >= 5) {
>  		ret = init_pipe_control(ring);
> @@ -2157,6 +2166,7 @@ int
>  intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
>  {
>  	int ret;
> +	int i;
>  
>  	if (!ring->gpu_caches_dirty)
>  		return 0;
> @@ -2165,6 +2175,26 @@ intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
>  	if (ret)
>  		return ret;
>  
> +	/* WaReadAfterWriteHazard*/
> +	/* Send a number of Store Data commands here to finish
> +	   flushing hardware pipeline.This is needed in the case
> +	   where the next workload tries reading from the same
> +	   surface that this batch writes to. Without these StoreDWs,
> +	   not all of the data will actually be flushd to the surface
> +	   by the time the next batch starts reading it, possibly
> +	   causing a small amount of corruption.*/
> +	ret = intel_ring_begin(ring, 4 * 12);
> +	if (ret)
> +		return ret;
> +	for (i = 0; i < 12; i++) {
> +		intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
> +		intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
> +						MI_STORE_DWORD_INDEX_SHIFT);
> +		intel_ring_emit(ring, 0);
> +		intel_ring_emit(ring, MI_NOOP);
> +	}
> +	intel_ring_advance(ring);
> +
>  	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
>  
>  	ring->gpu_caches_dirty = false;
> @@ -2176,6 +2206,24 @@ intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
>  {
>  	uint32_t flush_domains;
>  	int ret;
> +	int i;
> +
> +	/* WaTlbInvalidateStoreDataBefore*/
> +	/* Before pipecontrol with TLB invalidate set, need 2 store
> +	   data commands (such as MI_STORE_DATA_IMM or MI_STORE_DATA_INDEX)
> +	   Without this, hardware cannot guarantee the command after the
> +	   PIPE_CONTROL with TLB inv will not use the old TLB values.*/
> +	ret = intel_ring_begin(ring, 4 * 2);
> +	if (ret)
> +		return ret;
> +	for (i = 0; i < 2; i++) {
> +		intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
> +		intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
> +						MI_STORE_DWORD_INDEX_SHIFT);
> +		intel_ring_emit(ring, 0);
> +		intel_ring_emit(ring, MI_NOOP);
> +	}
> +	intel_ring_advance(ring);
>  
>  	flush_domains = 0;
>  	if (ring->gpu_caches_dirty)
> -- 
> 1.8.5.2
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ville Syrjälä Jan. 9, 2014, 4:05 p.m. UTC | #2
On Thu, Jan 09, 2014 at 05:21:49PM +0530, akash.goel@intel.com wrote:
> From: akashgoe <akash.goel@intel.com>
> 
> The following changes leads to stable behavior, especially
> when playing 3D Apps, benchmarks.
> 
> Added 4 new rendering specific Workarounds
> 1. WaTlbInvalidateStoreDataBefore :-
>      Before pipecontrol with TLB invalidate set,
>      need 2 store data commands
> 2. WaReadAfterWriteHazard :-
>      Send 8 store dword commands after flush
>      for read after write hazard
>      (HSD Gen6 bug_de 3047871)
> 3. WaVSThreadDispatchOverride
>      Performance optimization - Hw will
>      decide which half slice the thread
>      will dispatch, May not be really needed
>      for VLV, as its single slice
> 4. WaDisable_RenderCache_OperationalFlush
>      Operational flush cannot be enabled on
>      BWG A0 [Errata BWT006]
> 
> Removed 3 workarounds as not needed for VLV+(B0 onwards)
> 1. WaDisableRHWOOptimizationForRenderHang
> 2. WaDisableL3CacheAging
> 3. WaDisableDopClockGating
> 
> Modified the implementation of 1 workaround
> 1. WaDisableL3Bank2xClockGate
>     Disabling L3 clock gating- MMIO 940c[25] = 1
> 
> Modified the programming of 2 registers in render ring init function
> 1. GFX_MODE_GEN7 (Enabling TLB invalidate)
> 2. MI_MODE (Enabling MI Flush)

I posted a bunch of workaround stuff a long time ago. It may have some
overlaps with your stuff, and maybe there was something you overlooked.
Maybe you could have a look if there's something useful there:
http://lists.freedesktop.org/archives/intel-gfx/2013-July/029685.html
akash.goel@intel.com Jan. 19, 2014, 3:48 p.m. UTC | #3
>> Frankly, I don't know much about these particular workarounds, but for bisecting any regressions it would probably be a good idea to split the patch per workaround touched.
Sorry for the late response on this; we have actually done sufficient testing for this patch. Moreover, these workarounds are applicable to VLV only and will not affect any other platforms.
So it can probably be pushed as is, without splitting.

Best Regards
Akash
Jani Nikula Jan. 20, 2014, 8:19 a.m. UTC | #4
On Sun, 19 Jan 2014, "Goel, Akash" <akash.goel@intel.com> wrote:
>>> Frankly, I don't know much about these particular workarounds, but
>>> for bisecting any regressions it would probably be a good idea to
>>> split the patch per workaround touched.

> Sorry for late response on this, actually we have done sufficient
> testing for this patch.

I'm happy to hear that. However, no reasonable amount of testing will
cover the astonishing variety of conditions this code will run under out
there in the real world, and if something blows up, we might never be
able to reproduce it ourselves.

> Moreover these workarounds are applicable to VLV only & will not
> affect any other platforms.

Really? WaReadAfterWriteHazard in intel_ring_flush_all_caches() and
WaTlbInvalidateStoreDataBefore in intel_ring_invalidate_all_caches()?

Btw our convention is to append the affected platforms to the w/a names
in the code; please look around for examples.
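For example, the GEN7_UCGCTL4 write in this patch would then look
something like this (illustrative only, reusing the register and bit
names from your diff):

	/* WaDisableL3Bank2xClockGate:vlv */
	I915_WRITE(GEN7_UCGCTL4,
		   I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE);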

> So probably it can be pushed as it is without splitting. 

My opinion still stands. I think all workarounds are fragile by
definition, and should be separate patches.

Quoting Daniel, "if a regression would bisect to this, and the bisect is
the only useful piece of evidence, would I stand a chance to understand
it?"


BR,
Jani.
akash.goel@intel.com Jan. 20, 2014, 8:21 a.m. UTC | #5
>> Really? WaReadAfterWriteHazard in intel_ring_flush_all_caches() and WaTlbInvalidateStoreDataBefore in intel_ring_invalidate_all_caches()?
Sorry for this lapse; we have already identified this and will move these changes inside the VLV platform check.
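Roughly, the plan is to keep the emit sequence from the patch but guard
it with the platform check; a minimal sketch only (assuming the final
version keeps the same store-DW loop, and using ring->dev the way the
other functions in intel_ringbuffer.c do):

	if (IS_VALLEYVIEW(ring->dev)) {
		/* WaTlbInvalidateStoreDataBefore:vlv */
		ret = intel_ring_begin(ring, 4 * 2);
		if (ret)
			return ret;
		for (i = 0; i < 2; i++) {
			intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
			intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
						MI_STORE_DWORD_INDEX_SHIFT);
			intel_ring_emit(ring, 0);
			intel_ring_emit(ring, MI_NOOP);
		}
		intel_ring_advance(ring);
	}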

>> My opinion still stands. I think all workarounds are fragile by definition, and should be separate patches.
Understood your concern; we will split the work into multiple patches.

Best Regards
Akash

Patch

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a699efd..d829754 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -934,6 +934,9 @@ 
 #define   ECO_GATING_CX_ONLY	(1<<3)
 #define   ECO_FLIP_DONE		(1<<0)
 
+#define GEN7_CACHE_MODE_0	0x07000 /* IVB+ only */
+#define GEN7_RC_OP_FLUSH_ENABLE (1<<0)
+
 #define CACHE_MODE_1		0x7004 /* IVB+ */
 #define   PIXEL_SUBSPAN_COLLECT_OPT_DISABLE (1<<6)
 
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 469170c..e4d220c 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -4947,22 +4947,18 @@  static void valleyview_init_clock_gating(struct drm_device *dev)
 		   _MASKED_BIT_ENABLE(GEN7_MAX_PS_THREAD_DEP |
 				      GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
 
-	/* Apply the WaDisableRHWOOptimizationForRenderHang:vlv workaround. */
-	I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
-		   GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
-
-	/* WaApplyL3ControlAndL3ChickenMode:vlv */
-	I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS);
 	I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
 
+	/* WaDisable_RenderCache_OperationalFlush
+	 * Clear bit 0, so we do an AND with the mask
+	 * to keep other bits the same */
+	I915_WRITE(GEN7_CACHE_MODE_0,  (I915_READ(GEN7_CACHE_MODE_0) |
+			  _MASKED_BIT_DISABLE(GEN7_RC_OP_FLUSH_ENABLE)));
+
 	/* WaForceL3Serialization:vlv */
 	I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
 		   ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
-	/* WaDisableDopClockGating:vlv */
-	I915_WRITE(GEN7_ROW_CHICKEN2,
-		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
-
 	/* This is required by WaCatErrorRejectionIssue:vlv */
 	I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
 		   I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
@@ -4991,10 +4987,24 @@  static void valleyview_init_clock_gating(struct drm_device *dev)
 		   GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
 
-	I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
+	/* WaDisableL3Bank2xClockGate
+	 * Disabling L3 clock gating- MMIO 940c[25] = 1
+	 * Set bit 25, to disable L3_BANK_2x_CLK_GATING */
+	I915_WRITE(GEN7_UCGCTL4,
+		I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
 
 	I915_WRITE(MI_ARB_VLV, MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE);
 
+	/* WaVSThreadDispatchOverride
+	 * Hw will decide which half slice the thread will dispatch.
+	 * May not be needed for VLV, as it is a single slice */
+	I915_WRITE(GEN7_CACHE_MODE_0,
+		I915_READ(GEN7_FF_THREAD_MODE) &
+		(~GEN7_FF_VS_SCHED_LOAD_BALANCE));
+
+	/* WaDisable4x2SubspanOptimization,
+	 * Disable combining of two 2x2 subspans into a 4x2 subspan
+	 * Set chicken bit to disable subspan optimization */
 	I915_WRITE(CACHE_MODE_1,
 		   _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 442c9a6..c9350a1 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -563,7 +563,9 @@  static int init_render_ring(struct intel_ring_buffer *ring)
 	int ret = init_ring_common(ring);
 
 	if (INTEL_INFO(dev)->gen > 3)
-		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
+		if (!IS_VALLEYVIEW(dev))
+			I915_WRITE(MI_MODE,
+				_MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
 
 	/* We need to disable the AsyncFlip performance optimisations in order
 	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
@@ -579,10 +581,17 @@  static int init_render_ring(struct intel_ring_buffer *ring)
 		I915_WRITE(GFX_MODE,
 			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_ALWAYS));
 
-	if (IS_GEN7(dev))
-		I915_WRITE(GFX_MODE_GEN7,
-			   _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
-			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+	if (IS_GEN7(dev)) {
+		if (IS_VALLEYVIEW(dev)) {
+			I915_WRITE(GFX_MODE_GEN7,
+				_MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+			I915_WRITE(MI_MODE, I915_READ(MI_MODE) |
+				_MASKED_BIT_ENABLE(MI_FLUSH_ENABLE));
+		} else
+			I915_WRITE(GFX_MODE_GEN7,
+				_MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
+				_MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+	}
 
 	if (INTEL_INFO(dev)->gen >= 5) {
 		ret = init_pipe_control(ring);
@@ -2157,6 +2166,7 @@  int
 intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
 {
 	int ret;
+	int i;
 
 	if (!ring->gpu_caches_dirty)
 		return 0;
@@ -2165,6 +2175,26 @@  intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
 	if (ret)
 		return ret;
 
+	/* WaReadAfterWriteHazard*/
+	/* Send a number of Store Data commands here to finish
+	   flushing the hardware pipeline. This is needed in the case
+	   where the next workload tries reading from the same
+	   surface that this batch writes to. Without these StoreDWs,
+	   not all of the data will actually be flushed to the surface
+	   by the time the next batch starts reading it, possibly
+	   causing a small amount of corruption.*/
+	ret = intel_ring_begin(ring, 4 * 12);
+	if (ret)
+		return ret;
+	for (i = 0; i < 12; i++) {
+		intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+		intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
+						MI_STORE_DWORD_INDEX_SHIFT);
+		intel_ring_emit(ring, 0);
+		intel_ring_emit(ring, MI_NOOP);
+	}
+	intel_ring_advance(ring);
+
 	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
 
 	ring->gpu_caches_dirty = false;
@@ -2176,6 +2206,24 @@  intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
 {
 	uint32_t flush_domains;
 	int ret;
+	int i;
+
+	/* WaTlbInvalidateStoreDataBefore*/
+	/* Before pipecontrol with TLB invalidate set, need 2 store
+	   data commands (such as MI_STORE_DATA_IMM or MI_STORE_DATA_INDEX)
+	   Without this, hardware cannot guarantee the command after the
+	   PIPE_CONTROL with TLB inv will not use the old TLB values.*/
+	ret = intel_ring_begin(ring, 4 * 2);
+	if (ret)
+		return ret;
+	for (i = 0; i < 2; i++) {
+		intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+		intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
+						MI_STORE_DWORD_INDEX_SHIFT);
+		intel_ring_emit(ring, 0);
+		intel_ring_emit(ring, MI_NOOP);
+	}
+	intel_ring_advance(ring);
 
 	flush_domains = 0;
 	if (ring->gpu_caches_dirty)