[v6,6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround

Message ID 1434735435-14728-7-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived

Commit Message

arun.siluvery@linux.intel.com June 19, 2015, 5:37 p.m. UTC
In per-context w/a batch buffer, add
WaRsRestoreWithPerCtxtBb

This WA performs writes to the scratch page, so the page must be valid; this
check is performed before initializing the batch with this WA.
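
A minimal sketch of what that validity check might look like, assuming it lives
in the gen8 WA batch init path and uses the existing scratch fields on struct
intel_engine_cs (the exact placement and error handling here are assumptions,
not the hunk from this series):

	/* Sketch: bail out early if the scratch page was never allocated,
	 * since the per-context WA batch below writes to it. */
	if (ring->scratch.obj == NULL) {
		DRM_ERROR("scratch page not set up in %s, skipping WA batch\n",
			  ring->name);
		return -EINVAL;
	}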

v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and
MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions
so as not to break any future users of the existing definitions (Michel)

v3: The length in the current definitions of the LRM and LRR instructions was
specified as 0, which seems to be the common convention for instructions whose
length varies between platforms. This has not been an issue so far because they
are only used by the command parser; now that this patch uses them, update them
with the correct length and move them out of the command parser placeholder to
the appropriate place. Also remove unnecessary padding and follow the WA
programming sequence exactly as described in the spec, which is essential for
this WA (Dave).
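
For reference, the length mentioned above is the DWord-length field of the MI
instruction header packed by MI_INSTR(); the definition below is the existing
one from i915_reg.h, and the expansion note is only illustrative:

#define MI_INSTR(opcode, flags)	(((opcode) << 23) | (flags))

/*
 * MI_LOAD_REGISTER_MEM_GEN8 = MI_INSTR(0x29, 2): opcode 0x29 in bits 31:23
 * plus a DWord length of 2, i.e. a 4-DWord instruction on GEN8 (header,
 * register offset and a 64-bit address split over two DWords), whereas the
 * pre-GEN8 form with a 32-bit address is 3 DWords and is defined with
 * length 1.
 */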

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)

Comments

arun.siluvery@linux.intel.com June 22, 2015, 11:30 a.m. UTC | #1
On 19/06/2015 18:37, Arun Siluvery wrote:
> In Per context w/a batch buffer,
> WaRsRestoreWithPerCtxtBb
>
> This WA performs writes to scratch page so it must be valid, this check
> is performed before initializing the batch with this WA.
>
> v2: This patches modifies definitions of MI_LOAD_REGISTER_MEM and
> MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
> so as to not break any future users of existing definitions (Michel)
>
> v3: Length defined in current definitions of LRM, LRR instructions was specified
> as 0. It seems it is common convention for instructions whose length vary between
> platforms. This is not an issue so far because they are not used anywhere except
> command parser; now that we use in this patch update them with correct length
> and also move them out of command parser placeholder to appropriate place.
> remove unnecessary padding and follow the WA programming sequence exactly
> as mentioned in spec which is essential for this WA (Dave).
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
>   drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 81 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 7637e64..208620d 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -347,6 +347,31 @@
>   #define   MI_INVALIDATE_BSD		(1<<7)
>   #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>   #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
> +#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
> +#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
> +#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
> +#define   MI_ATOMIC_CS_STALL		(1<<17)
> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
> +
>   #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>   #define   MI_BATCH_NON_SECURE		(1)
>   /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
> @@ -451,8 +476,6 @@
>   #define MI_CLFLUSH              MI_INSTR(0x27, 0)
>   #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
>   #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
> -#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
> -#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
>   #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>   #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>   #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
>   #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>   #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>
> +#define GEN8_RS_PREEMPT_STATUS		0x215C
> +
>   /* Fuse readout registers for GT */
>   #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>   #define   CHV_FGT_DISABLE_SS0		(1 << 10)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 664455c..28198c4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>   			       uint32_t *const batch,
>   			       uint32_t *offset)
>   {
> +	uint32_t scratch_addr;
>   	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>
> +	/* Actual scratch location is at 128 bytes offset */
> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +

Daniel, could you please remove this line when applying this patch?
Sorry for the additional work.

 > +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;

regards
Arun

>   	/* WaDisableCtxRestoreArbitration:bdw,chv */
>   	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
>
> +	/*
> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
> +	 *
> +	 * This is only applicable for Gen8.
> +	 */
> +
> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
> +	wa_ctx_emit(batch, INSTPM);
> +	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
> +
> +	wa_ctx_emit(batch, (MI_ATOMIC(5) |
> +			    MI_ATOMIC_MEMORY_TYPE_GGTT |
> +			    MI_ATOMIC_INLINE_DATA |
> +			    MI_ATOMIC_CS_STALL |
> +			    MI_ATOMIC_RETURN_DATA_CTL |
> +			    MI_ATOMIC_MOVE));
> +	wa_ctx_emit(batch, scratch_addr);
> +	wa_ctx_emit(batch, 0);
> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +
> +	/*
> +	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
> +	 * in the same cacheline. To satisfy this case even if more WA are
> +	 * added in future, pad current cacheline and start remaining sequence
> +	 * in new cacheline.
> +	 */
> +	while (index % CACHELINE_DWORDS)
> +		wa_ctx_emit(batch, MI_NOOP);
> +
> +	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
> +			    MI_LRM_USE_GLOBAL_GTT |
> +			    MI_LRM_ASYNC_MODE_ENABLE));
> +	wa_ctx_emit(batch, INSTPM);
> +	wa_ctx_emit(batch, scratch_addr);
> +	wa_ctx_emit(batch, 0);
> +
> +	/*
> +	 * BSpec says there should not be any commands programmed
> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
> +	 * do not add any new commands
> +	 */
> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
> +
>   	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>
>   	return wa_ctx_end(wa_ctx, *offset = index, 1);
>
Ville Syrjälä June 22, 2015, 4:21 p.m. UTC | #2
On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote:
> In Per context w/a batch buffer,
> WaRsRestoreWithPerCtxtBb
> 
> This WA performs writes to scratch page so it must be valid, this check
> is performed before initializing the batch with this WA.
> 
> v2: This patches modifies definitions of MI_LOAD_REGISTER_MEM and
> MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
> so as to not break any future users of existing definitions (Michel)
> 
> v3: Length defined in current definitions of LRM, LRR instructions was specified
> as 0. It seems it is common convention for instructions whose length vary between
> platforms. This is not an issue so far because they are not used anywhere except
> command parser; now that we use in this patch update them with correct length
> and also move them out of command parser placeholder to appropriate place.
> remove unnecessary padding and follow the WA programming sequence exactly
> as mentioned in spec which is essential for this WA (Dave).
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 7637e64..208620d 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -347,6 +347,31 @@
>  #define   MI_INVALIDATE_BSD		(1<<7)
>  #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>  #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
> +#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
> +#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
> +#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
> +#define   MI_ATOMIC_CS_STALL		(1<<17)
> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
> +
>  #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>  #define   MI_BATCH_NON_SECURE		(1)
>  /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
> @@ -451,8 +476,6 @@
>  #define MI_CLFLUSH              MI_INSTR(0x27, 0)
>  #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
>  #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
> -#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
> -#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
>  #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>  #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>  #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
>  #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>  #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>  
> +#define GEN8_RS_PREEMPT_STATUS		0x215C
> +
>  /* Fuse readout registers for GT */
>  #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>  #define   CHV_FGT_DISABLE_SS0		(1 << 10)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 664455c..28198c4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>  			       uint32_t *const batch,
>  			       uint32_t *offset)
>  {
> +	uint32_t scratch_addr;
>  	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>  
> +	/* Actual scratch location is at 128 bytes offset */
> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +
>  	/* WaDisableCtxRestoreArbitration:bdw,chv */
>  	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
>  
> +	/*
> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
> +	 *
> +	 * This is only applicable for Gen8.
> +	 */
> +
> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */

This w/a doesn't seem to be needed for CHV. Also BDW seems to have
gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly
means we shouldn't need this w/a on BDW either.
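
For comparison, a rough sketch of what relying on that chicken bit instead of
the per-context batch sequence might look like; the register offset and bit
name below are assumptions taken only from this review comment
(FF_SLICE_CS_CHICKEN3 bit 5), not from an existing i915_reg.h define:

/* Hypothetical defines -- offset and bit name for illustration only. */
#define FF_SLICE_CS_CHICKEN3			0x20e8
#define   GEN8_RS_CTX_RESTORE_FIX_ENABLE	(1<<5)

	/* In bdw_init_workarounds()-style code, assuming this behaves like
	 * the other masked CS chicken registers: */
	WA_SET_BIT_MASKED(FF_SLICE_CS_CHICKEN3, GEN8_RS_CTX_RESTORE_FIX_ENABLE);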

> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
> +	wa_ctx_emit(batch, INSTPM);
> +	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
> +
> +	wa_ctx_emit(batch, (MI_ATOMIC(5) |
> +			    MI_ATOMIC_MEMORY_TYPE_GGTT |
> +			    MI_ATOMIC_INLINE_DATA |
> +			    MI_ATOMIC_CS_STALL |
> +			    MI_ATOMIC_RETURN_DATA_CTL |
> +			    MI_ATOMIC_MOVE));
> +	wa_ctx_emit(batch, scratch_addr);
> +	wa_ctx_emit(batch, 0);
> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +
> +	/*
> +	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
> +	 * in the same cacheline. To satisfy this case even if more WA are
> +	 * added in future, pad current cacheline and start remaining sequence
> +	 * in new cacheline.
> +	 */
> +	while (index % CACHELINE_DWORDS)
> +		wa_ctx_emit(batch, MI_NOOP);
> +
> +	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
> +			    MI_LRM_USE_GLOBAL_GTT |
> +			    MI_LRM_ASYNC_MODE_ENABLE));
> +	wa_ctx_emit(batch, INSTPM);
> +	wa_ctx_emit(batch, scratch_addr);
> +	wa_ctx_emit(batch, 0);
> +
> +	/*
> +	 * BSpec says there should not be any commands programmed
> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
> +	 * do not add any new commands
> +	 */
> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
> +
>  	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>  
>  	return wa_ctx_end(wa_ctx, *offset = index, 1);
> -- 
> 2.3.0
> 
arun.siluvery@linux.intel.com June 22, 2015, 4:59 p.m. UTC | #3
On 22/06/2015 17:21, Ville Syrjälä wrote:
> On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote:
>> In Per context w/a batch buffer,
>> WaRsRestoreWithPerCtxtBb
>>
>> This WA performs writes to scratch page so it must be valid, this check
>> is performed before initializing the batch with this WA.
>>
>> v2: This patches modifies definitions of MI_LOAD_REGISTER_MEM and
>> MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
>> so as to not break any future users of existing definitions (Michel)
>>
>> v3: Length defined in current definitions of LRM, LRR instructions was specified
>> as 0. It seems it is common convention for instructions whose length vary between
>> platforms. This is not an issue so far because they are not used anywhere except
>> command parser; now that we use in this patch update them with correct length
>> and also move them out of command parser placeholder to appropriate place.
>> remove unnecessary padding and follow the WA programming sequence exactly
>> as mentioned in spec which is essential for this WA (Dave).
>>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dave Gordon <david.s.gordon@intel.com>
>> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
>>   drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 81 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index 7637e64..208620d 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -347,6 +347,31 @@
>>   #define   MI_INVALIDATE_BSD		(1<<7)
>>   #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>>   #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
>> +#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
>> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
>> +#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
>> +#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
>> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
>> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
>> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
>> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
>> +#define   MI_ATOMIC_CS_STALL		(1<<17)
>> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
>> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
>> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
>> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
>> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
>> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
>> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
>> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
>> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
>> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
>> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
>> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
>> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
>> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
>> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
>> +
>>   #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>>   #define   MI_BATCH_NON_SECURE		(1)
>>   /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
>> @@ -451,8 +476,6 @@
>>   #define MI_CLFLUSH              MI_INSTR(0x27, 0)
>>   #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
>>   #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
>> -#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
>> -#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
>>   #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>>   #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>>   #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
>> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
>>   #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>>   #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>>
>> +#define GEN8_RS_PREEMPT_STATUS		0x215C
>> +
>>   /* Fuse readout registers for GT */
>>   #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>>   #define   CHV_FGT_DISABLE_SS0		(1 << 10)
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 664455c..28198c4 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>>   			       uint32_t *const batch,
>>   			       uint32_t *offset)
>>   {
>> +	uint32_t scratch_addr;
>>   	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>>
>> +	/* Actual scratch location is at 128 bytes offset */
>> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
>> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
>> +
>>   	/* WaDisableCtxRestoreArbitration:bdw,chv */
>>   	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
>>
>> +	/*
>> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
>> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
>> +	 *
>> +	 * This is only applicable for Gen8.
>> +	 */
>> +
>> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
>
> This w/a doesn't seem to be needed for CHV. Also BDW seems to have
> gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly
> means we shouldn't need this w/a on BDW either.
>
Looks like this chicken bit applies this WA; if it works as expected then we
can ignore this patch. I will try to get confirmation on this.

regards
Arun

>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
>> +	wa_ctx_emit(batch, INSTPM);
>> +	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
>> +
>> +	wa_ctx_emit(batch, (MI_ATOMIC(5) |
>> +			    MI_ATOMIC_MEMORY_TYPE_GGTT |
>> +			    MI_ATOMIC_INLINE_DATA |
>> +			    MI_ATOMIC_CS_STALL |
>> +			    MI_ATOMIC_RETURN_DATA_CTL |
>> +			    MI_ATOMIC_MOVE));
>> +	wa_ctx_emit(batch, scratch_addr);
>> +	wa_ctx_emit(batch, 0);
>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>> +
>> +	/*
>> +	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
>> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
>> +	 * in the same cacheline. To satisfy this case even if more WA are
>> +	 * added in future, pad current cacheline and start remaining sequence
>> +	 * in new cacheline.
>> +	 */
>> +	while (index % CACHELINE_DWORDS)
>> +		wa_ctx_emit(batch, MI_NOOP);
>> +
>> +	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
>> +			    MI_LRM_USE_GLOBAL_GTT |
>> +			    MI_LRM_ASYNC_MODE_ENABLE));
>> +	wa_ctx_emit(batch, INSTPM);
>> +	wa_ctx_emit(batch, scratch_addr);
>> +	wa_ctx_emit(batch, 0);
>> +
>> +	/*
>> +	 * BSpec says there should not be any commands programmed
>> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
>> +	 * do not add any new commands
>> +	 */
>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>> +
>>   	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>>
>>   	return wa_ctx_end(wa_ctx, *offset = index, 1);
>> --
>> 2.3.0
>>
>
arun.siluvery@linux.intel.com June 23, 2015, 2:48 p.m. UTC | #4
On 22/06/2015 17:59, Siluvery, Arun wrote:
> On 22/06/2015 17:21, Ville Syrjälä wrote:
>> On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote:
>>> In Per context w/a batch buffer,
>>> WaRsRestoreWithPerCtxtBb
>>>
>>> This WA performs writes to scratch page so it must be valid, this check
>>> is performed before initializing the batch with this WA.
>>>
>>> v2: This patches modifies definitions of MI_LOAD_REGISTER_MEM and
>>> MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
>>> so as to not break any future users of existing definitions (Michel)
>>>
>>> v3: Length defined in current definitions of LRM, LRR instructions was specified
>>> as 0. It seems it is common convention for instructions whose length vary between
>>> platforms. This is not an issue so far because they are not used anywhere except
>>> command parser; now that we use in this patch update them with correct length
>>> and also move them out of command parser placeholder to appropriate place.
>>> remove unnecessary padding and follow the WA programming sequence exactly
>>> as mentioned in spec which is essential for this WA (Dave).
>>>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Dave Gordon <david.s.gordon@intel.com>
>>> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
>>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
>>>    drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
>>>    2 files changed, 81 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>>> index 7637e64..208620d 100644
>>> --- a/drivers/gpu/drm/i915/i915_reg.h
>>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>>> @@ -347,6 +347,31 @@
>>>    #define   MI_INVALIDATE_BSD		(1<<7)
>>>    #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>>>    #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
>>> +#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
>>> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
>>> +#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
>>> +#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
>>> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
>>> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
>>> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
>>> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
>>> +#define   MI_ATOMIC_CS_STALL		(1<<17)
>>> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
>>> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
>>> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
>>> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
>>> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
>>> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
>>> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
>>> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
>>> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
>>> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
>>> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
>>> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
>>> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
>>> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
>>> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
>>> +
>>>    #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>>>    #define   MI_BATCH_NON_SECURE		(1)
>>>    /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
>>> @@ -451,8 +476,6 @@
>>>    #define MI_CLFLUSH              MI_INSTR(0x27, 0)
>>>    #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
>>>    #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
>>> -#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
>>> -#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
>>>    #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>>>    #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>>>    #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
>>> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
>>>    #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>>>    #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>>>
>>> +#define GEN8_RS_PREEMPT_STATUS		0x215C
>>> +
>>>    /* Fuse readout registers for GT */
>>>    #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>>>    #define   CHV_FGT_DISABLE_SS0		(1 << 10)
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 664455c..28198c4 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>>>    			       uint32_t *const batch,
>>>    			       uint32_t *offset)
>>>    {
>>> +	uint32_t scratch_addr;
>>>    	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>>>
>>> +	/* Actual scratch location is at 128 bytes offset */
>>> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
>>> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
>>> +
>>>    	/* WaDisableCtxRestoreArbitration:bdw,chv */
>>>    	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
>>>
>>> +	/*
>>> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
>>> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
>>> +	 *
>>> +	 * This is only applicable for Gen8.
>>> +	 */
>>> +
>>> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
>>
>> This w/a doesn't seem to be needed for CHV. Also BDW seems to have
>> gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly
>> means we shouldn't need this w/a on BDW either.
>>
> looks like this chicken bit is applying this WA, if this is working as
> expected then we can ignore this patch, I will try to get some
> confirmation on this.

I got confirmation from the HW team that the chicken bit is enough; this patch
can be ignored.

regards
Arun

>
> regards
> Arun
>
>>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
>>> +	wa_ctx_emit(batch, INSTPM);
>>> +	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
>>> +
>>> +	wa_ctx_emit(batch, (MI_ATOMIC(5) |
>>> +			    MI_ATOMIC_MEMORY_TYPE_GGTT |
>>> +			    MI_ATOMIC_INLINE_DATA |
>>> +			    MI_ATOMIC_CS_STALL |
>>> +			    MI_ATOMIC_RETURN_DATA_CTL |
>>> +			    MI_ATOMIC_MOVE));
>>> +	wa_ctx_emit(batch, scratch_addr);
>>> +	wa_ctx_emit(batch, 0);
>>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>>> +
>>> +	/*
>>> +	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
>>> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
>>> +	 * in the same cacheline. To satisfy this case even if more WA are
>>> +	 * added in future, pad current cacheline and start remaining sequence
>>> +	 * in new cacheline.
>>> +	 */
>>> +	while (index % CACHELINE_DWORDS)
>>> +		wa_ctx_emit(batch, MI_NOOP);
>>> +
>>> +	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
>>> +			    MI_LRM_USE_GLOBAL_GTT |
>>> +			    MI_LRM_ASYNC_MODE_ENABLE));
>>> +	wa_ctx_emit(batch, INSTPM);
>>> +	wa_ctx_emit(batch, scratch_addr);
>>> +	wa_ctx_emit(batch, 0);
>>> +
>>> +	/*
>>> +	 * BSpec says there should not be any commands programmed
>>> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
>>> +	 * do not add any new commands
>>> +	 */
>>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
>>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>>> +
>>>    	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>>>
>>>    	return wa_ctx_end(wa_ctx, *offset = index, 1);
>>> --
>>> 2.3.0
>>>
>>
>
>

Patch

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 7637e64..208620d 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -347,6 +347,31 @@ 
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
 #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
+#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
+#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
+#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
+#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
+#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
+#define   MI_ATOMIC_INLINE_DATA		(1<<18)
+#define   MI_ATOMIC_CS_STALL		(1<<17)
+#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
+#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
+#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
+#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
+#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
+#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
+#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
+#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
+#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
+#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
+#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
+#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
+#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
+#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
+#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
+
 #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
 #define   MI_BATCH_NON_SECURE		(1)
 /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
@@ -451,8 +476,6 @@ 
 #define MI_CLFLUSH              MI_INSTR(0x27, 0)
 #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
 #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
-#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
-#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
 #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
 #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
 #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
@@ -1799,6 +1822,8 @@  enum skl_disp_power_wells {
 #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
 #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
 
+#define GEN8_RS_PREEMPT_STATUS		0x215C
+
 /* Fuse readout registers for GT */
 #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
 #define   CHV_FGT_DISABLE_SS0		(1 << 10)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 664455c..28198c4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1215,11 +1215,65 @@  static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
 			       uint32_t *const batch,
 			       uint32_t *offset)
 {
+	uint32_t scratch_addr;
 	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
 
+	/* Actual scratch location is at 128 bytes offset */
+	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
+	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
+
 	/* WaDisableCtxRestoreArbitration:bdw,chv */
 	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
 
+	/*
+	 * As per Bspec, to workaround a known HW issue, SW must perform the
+	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
+	 *
+	 * This is only applicable for Gen8.
+	 */
+
+	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
+	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
+	wa_ctx_emit(batch, INSTPM);
+	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
+
+	wa_ctx_emit(batch, (MI_ATOMIC(5) |
+			    MI_ATOMIC_MEMORY_TYPE_GGTT |
+			    MI_ATOMIC_INLINE_DATA |
+			    MI_ATOMIC_CS_STALL |
+			    MI_ATOMIC_RETURN_DATA_CTL |
+			    MI_ATOMIC_MOVE));
+	wa_ctx_emit(batch, scratch_addr);
+	wa_ctx_emit(batch, 0);
+	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
+	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
+
+	/*
+	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
+	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
+	 * in the same cacheline. To satisfy this case even if more WA are
+	 * added in future, pad current cacheline and start remaining sequence
+	 * in new cacheline.
+	 */
+	while (index % CACHELINE_DWORDS)
+		wa_ctx_emit(batch, MI_NOOP);
+
+	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
+			    MI_LRM_USE_GLOBAL_GTT |
+			    MI_LRM_ASYNC_MODE_ENABLE));
+	wa_ctx_emit(batch, INSTPM);
+	wa_ctx_emit(batch, scratch_addr);
+	wa_ctx_emit(batch, 0);
+
+	/*
+	 * BSpec says there should not be any commands programmed
+	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
+	 * do not add any new commands
+	 */
+	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
+	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
+	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
+
 	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
 
 	return wa_ctx_end(wa_ctx, *offset = index, 1);