diff mbox series

[v3,06/10] drm/msm/A6xx: Use posamble to reset counters on preemption

Message ID 20240905-preemption-a750-t-v3-6-fd947699f7bc@gmail.com (mailing list archive)
State Superseded
Headers show
Series Preemption support for A7XX | expand

Commit Message

Antonino Maniscalco Sept. 5, 2024, 2:51 p.m. UTC
Use the postamble to reset perf counters when switching between rings,
except when sysprof is enabled, analogously to how they are reset
between submissions when switching pagetables.

Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
 drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
 4 files changed, 61 insertions(+), 3 deletions(-)

Comments

Akhil P Oommen Sept. 6, 2024, 8:08 p.m. UTC | #1
On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
> Use the postamble to reset perf counters when switching between rings,
> except when sysprof is enabled, analogously to how they are reset
> between submissions when switching pagetables.
> 
> Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
>  drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
>  4 files changed, 61 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index ed0b138a2d66..710ec3ce2923 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>  static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>  		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
>  {
> -	u64 preempt_offset_priv_secure;
> +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
> +	u64 preempt_offset_priv_secure, preempt_postamble;
>  
>  	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
>  
> @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>  	/* seems OK to set to 0 to disable it */
>  	OUT_RING(ring, 0);
>  	OUT_RING(ring, 0);
> +
> +	/* if not profiling set postamble to clear perfcounters, else clear it */
> +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
> +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
> +
> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> +		OUT_RING(ring, lower_32_bits(preempt_postamble));
> +		OUT_RING(ring, upper_32_bits(preempt_postamble));
> +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
> +					a6xx_gpu->preempt_postamble_len) |
> +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> +	} else {

Why do we need this else part?

> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> +		OUT_RING(ring, 0);
> +		OUT_RING(ring, 0);
> +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> +	}
>  }
>  
>  static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> index da10060e38dc..b009732c08c5 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> @@ -71,6 +71,11 @@ struct a6xx_gpu {
>  	bool uses_gmem;
>  	bool skip_save_restore;
>  
> +	struct drm_gem_object *preempt_postamble_bo;
> +	void *preempt_postamble_ptr;
> +	uint64_t preempt_postamble_iova;
> +	uint64_t preempt_postamble_len;
> +
>  	struct a6xx_gmu gmu;
>  
>  	struct drm_gem_object *shadow_bo;
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> index 1caff76aca6e..ec44f44d925f 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
>  	return 0;
>  }
>  
> +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
> +{
> +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
> +	u32 count = 0;
> +
> +	postamble[count++] = PKT7(CP_REG_RMW, 3);
> +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
> +	postamble[count++] = 0;
> +	postamble[count++] = 1;
> +
> +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
> +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
> +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
> +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
> +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
> +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
> +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
> +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);

Isn't it better to just replace this with NOP packets when sysprof is
enabled, just before triggering preemption? It will help to have an
immediate effect.

-Akhil

> +
> +	a6xx_gpu->preempt_postamble_len = count;
> +}
> +
>  void a6xx_preempt_fini(struct msm_gpu *gpu)
>  {
>  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
>  	a6xx_gpu->uses_gmem = 1;
>  	a6xx_gpu->skip_save_restore = 1;
>  
> +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
> +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
> +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
> +			&a6xx_gpu->preempt_postamble_iova);
> +
> +	preempt_prepare_postamble(a6xx_gpu);
> +
> +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
> +		goto fail;
> +
>  	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
>  
>  	return;
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> index 6b1888280a83..87098567483b 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
>  	OUT_RING(ring, PKT4(regindx, cnt));
>  }
>  
> +#define PKT7(opcode, cnt) \
> +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
> +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
> +
>  static inline void
>  OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
>  {
>  	adreno_wait_ring(ring, cnt + 1);
> -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
> -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
> +	OUT_RING(ring, PKT7(opcode, cnt));
>  }
>  
>  struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
> 
> -- 
> 2.46.0
>
Antonino Maniscalco Sept. 9, 2024, 3:07 p.m. UTC | #2
On 9/6/24 10:08 PM, Akhil P Oommen wrote:
> On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
>> Use the postamble to reset perf counters when switching between rings,
>> except when sysprof is enabled, analogously to how they are reset
>> between submissions when switching pagetables.
>>
>> Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
>> ---
>>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
>>   drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
>>   drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
>>   4 files changed, 61 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> index ed0b138a2d66..710ec3ce2923 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>   static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>   		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
>>   {
>> -	u64 preempt_offset_priv_secure;
>> +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
>> +	u64 preempt_offset_priv_secure, preempt_postamble;
>>   
>>   	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
>>   
>> @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>   	/* seems OK to set to 0 to disable it */
>>   	OUT_RING(ring, 0);
>>   	OUT_RING(ring, 0);
>> +
>> +	/* if not profiling set postamble to clear perfcounters, else clear it */
>> +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
>> +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
>> +
>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>> +		OUT_RING(ring, lower_32_bits(preempt_postamble));
>> +		OUT_RING(ring, upper_32_bits(preempt_postamble));
>> +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
>> +					a6xx_gpu->preempt_postamble_len) |
>> +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>> +	} else {
> 
> Why do we need this else part?

Wouldn't the postmable remain set if we don't explicitly set it to 0?

> 
>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>> +		OUT_RING(ring, 0);
>> +		OUT_RING(ring, 0);
>> +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>> +	}
>>   }
>>   
>>   static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> index da10060e38dc..b009732c08c5 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> @@ -71,6 +71,11 @@ struct a6xx_gpu {
>>   	bool uses_gmem;
>>   	bool skip_save_restore;
>>   
>> +	struct drm_gem_object *preempt_postamble_bo;
>> +	void *preempt_postamble_ptr;
>> +	uint64_t preempt_postamble_iova;
>> +	uint64_t preempt_postamble_len;
>> +
>>   	struct a6xx_gmu gmu;
>>   
>>   	struct drm_gem_object *shadow_bo;
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>> index 1caff76aca6e..ec44f44d925f 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>> @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
>>   	return 0;
>>   }
>>   
>> +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
>> +{
>> +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
>> +	u32 count = 0;
>> +
>> +	postamble[count++] = PKT7(CP_REG_RMW, 3);
>> +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
>> +	postamble[count++] = 0;
>> +	postamble[count++] = 1;
>> +
>> +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
>> +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
>> +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
>> +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
>> +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
>> +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
>> +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
>> +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
> 
> Isn't it better to just replace this with NOP packets when sysprof is
> enabled, just before triggering preemption? It will help to have an
> immediate effect.
> 
> -Akhil
> 

Mmm, this being a postamble I wonder whether we have the guarantee that 
it finishes execution before the IRQ is called so updating it doesn't 
race with the CP executing it.

>> +
>> +	a6xx_gpu->preempt_postamble_len = count;
>> +}
>> +
>>   void a6xx_preempt_fini(struct msm_gpu *gpu)
>>   {
>>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
>>   	a6xx_gpu->uses_gmem = 1;
>>   	a6xx_gpu->skip_save_restore = 1;
>>   
>> +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
>> +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
>> +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
>> +			&a6xx_gpu->preempt_postamble_iova);
>> +
>> +	preempt_prepare_postamble(a6xx_gpu);
>> +
>> +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
>> +		goto fail;
>> +
>>   	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
>>   
>>   	return;
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> index 6b1888280a83..87098567483b 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
>>   	OUT_RING(ring, PKT4(regindx, cnt));
>>   }
>>   
>> +#define PKT7(opcode, cnt) \
>> +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
>> +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
>> +
>>   static inline void
>>   OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
>>   {
>>   	adreno_wait_ring(ring, cnt + 1);
>> -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
>> -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
>> +	OUT_RING(ring, PKT7(opcode, cnt));
>>   }
>>   
>>   struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
>>
>> -- 
>> 2.46.0
>>

Best regards,
Akhil P Oommen Sept. 10, 2024, 9:34 p.m. UTC | #3
On Mon, Sep 09, 2024 at 05:07:42PM +0200, Antonino Maniscalco wrote:
> On 9/6/24 10:08 PM, Akhil P Oommen wrote:
> > On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
> > > Use the postamble to reset perf counters when switching between rings,
> > > except when sysprof is enabled, analogously to how they are reset
> > > between submissions when switching pagetables.
> > > 
> > > Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
> > > ---
> > >   drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
> > >   drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
> > >   drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
> > >   drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
> > >   4 files changed, 61 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > index ed0b138a2d66..710ec3ce2923 100644
> > > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> > >   static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
> > >   		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
> > >   {
> > > -	u64 preempt_offset_priv_secure;
> > > +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
> > > +	u64 preempt_offset_priv_secure, preempt_postamble;
> > >   	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
> > > @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
> > >   	/* seems OK to set to 0 to disable it */
> > >   	OUT_RING(ring, 0);
> > >   	OUT_RING(ring, 0);
> > > +
> > > +	/* if not profiling set postamble to clear perfcounters, else clear it */
> > > +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {

Setting len = 0 is enough to skip processing postamble packets. So how
about a simpler:

len = a6xx_gpu->preempt_postamble_len;
if (sysprof)
	len = 0;

OUT_PKT7(ring, CP_SET_AMBLE, 3);
OUT_RING(ring, lower_32_bits(preempt_postamble));
OUT_RING(ring, upper_32_bits(preempt_postamble));
OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(len) |
		CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));

> > > +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
> > > +
> > > +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> > > +		OUT_RING(ring, lower_32_bits(preempt_postamble));
> > > +		OUT_RING(ring, upper_32_bits(preempt_postamble));
> > > +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
> > > +					a6xx_gpu->preempt_postamble_len) |
> > > +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> > > +	} else {
> > 
> > Why do we need this else part?
> 
> Wouldn't the postmable remain set if we don't explicitly set it to 0?

Aah, that is a genuine concern. I am not sure! Lets keep it.

> 
> > 
> > > +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> > > +		OUT_RING(ring, 0);
> > > +		OUT_RING(ring, 0);
> > > +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> > > +	}
> > >   }
> > >   static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > index da10060e38dc..b009732c08c5 100644
> > > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > @@ -71,6 +71,11 @@ struct a6xx_gpu {
> > >   	bool uses_gmem;
> > >   	bool skip_save_restore;
> > > +	struct drm_gem_object *preempt_postamble_bo;
> > > +	void *preempt_postamble_ptr;
> > > +	uint64_t preempt_postamble_iova;
> > > +	uint64_t preempt_postamble_len;
> > > +
> > >   	struct a6xx_gmu gmu;
> > >   	struct drm_gem_object *shadow_bo;
> > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > index 1caff76aca6e..ec44f44d925f 100644
> > > --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
> > >   	return 0;
> > >   }
> > > +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
> > > +{
> > > +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
> > > +	u32 count = 0;
> > > +
> > > +	postamble[count++] = PKT7(CP_REG_RMW, 3);
> > > +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
> > > +	postamble[count++] = 0;
> > > +	postamble[count++] = 1;
> > > +
> > > +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
> > > +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
> > > +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
> > 
> > Isn't it better to just replace this with NOP packets when sysprof is
> > enabled, just before triggering preemption? It will help to have an
> > immediate effect.
> > 
> > -Akhil
> > 
> 
> Mmm, this being a postamble I wonder whether we have the guarantee that it
> finishes execution before the IRQ is called so updating it doesn't race with
> the CP executing it.

Yes, it will be complete. But on a second thought now, this suggestion from me
looks like an overkill.

-Akhil.

> 
> > > +
> > > +	a6xx_gpu->preempt_postamble_len = count;
> > > +}
> > > +
> > >   void a6xx_preempt_fini(struct msm_gpu *gpu)
> > >   {
> > >   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> > > @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
> > >   	a6xx_gpu->uses_gmem = 1;
> > >   	a6xx_gpu->skip_save_restore = 1;
> > > +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
> > > +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
> > > +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
> > > +			&a6xx_gpu->preempt_postamble_iova);
> > > +
> > > +	preempt_prepare_postamble(a6xx_gpu);
> > > +
> > > +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
> > > +		goto fail;
> > > +
> > >   	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
> > >   	return;
> > > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > index 6b1888280a83..87098567483b 100644
> > > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
> > >   	OUT_RING(ring, PKT4(regindx, cnt));
> > >   }
> > > +#define PKT7(opcode, cnt) \
> > > +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
> > > +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
> > > +
> > >   static inline void
> > >   OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
> > >   {
> > >   	adreno_wait_ring(ring, cnt + 1);
> > > -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
> > > -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
> > > +	OUT_RING(ring, PKT7(opcode, cnt));
> > >   }
> > >   struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
> > > 
> > > -- 
> > > 2.46.0
> > > 
> 
> Best regards,
> -- 
> Antonino Maniscalco <antomani103@gmail.com>
>
Antonino Maniscalco Sept. 10, 2024, 10:35 p.m. UTC | #4
On 9/10/24 11:34 PM, Akhil P Oommen wrote:
> On Mon, Sep 09, 2024 at 05:07:42PM +0200, Antonino Maniscalco wrote:
>> On 9/6/24 10:08 PM, Akhil P Oommen wrote:
>>> On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
>>>> Use the postamble to reset perf counters when switching between rings,
>>>> except when sysprof is enabled, analogously to how they are reset
>>>> between submissions when switching pagetables.
>>>>
>>>> Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
>>>> ---
>>>>    drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
>>>>    drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
>>>>    drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
>>>>    drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
>>>>    4 files changed, 61 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>> index ed0b138a2d66..710ec3ce2923 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>> @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>>>    static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>>>    		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
>>>>    {
>>>> -	u64 preempt_offset_priv_secure;
>>>> +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
>>>> +	u64 preempt_offset_priv_secure, preempt_postamble;
>>>>    	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
>>>> @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>>>    	/* seems OK to set to 0 to disable it */
>>>>    	OUT_RING(ring, 0);
>>>>    	OUT_RING(ring, 0);
>>>> +
>>>> +	/* if not profiling set postamble to clear perfcounters, else clear it */
>>>> +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
> 
> Setting len = 0 is enough to skip processing postamble packets. So how
> about a simpler:
> 
> len = a6xx_gpu->preempt_postamble_len;
> if (sysprof)
> 	len = 0;
> 
> OUT_PKT7(ring, CP_SET_AMBLE, 3);
> OUT_RING(ring, lower_32_bits(preempt_postamble));
> OUT_RING(ring, upper_32_bits(preempt_postamble));
> OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(len) |
> 		CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> 
>>>> +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
>>>> +
>>>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>>>> +		OUT_RING(ring, lower_32_bits(preempt_postamble));
>>>> +		OUT_RING(ring, upper_32_bits(preempt_postamble));
>>>> +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
>>>> +					a6xx_gpu->preempt_postamble_len) |
>>>> +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>>>> +	} else {
>>>
>>> Why do we need this else part?
>>
>> Wouldn't the postmable remain set if we don't explicitly set it to 0?
> 
> Aah, that is a genuine concern. I am not sure! Lets keep it.
> 
>>
>>>
>>>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>>>> +		OUT_RING(ring, 0);
>>>> +		OUT_RING(ring, 0);
>>>> +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>>>> +	}
>>>>    }
>>>>    static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>> index da10060e38dc..b009732c08c5 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>> @@ -71,6 +71,11 @@ struct a6xx_gpu {
>>>>    	bool uses_gmem;
>>>>    	bool skip_save_restore;
>>>> +	struct drm_gem_object *preempt_postamble_bo;
>>>> +	void *preempt_postamble_ptr;
>>>> +	uint64_t preempt_postamble_iova;
>>>> +	uint64_t preempt_postamble_len;
>>>> +
>>>>    	struct a6xx_gmu gmu;
>>>>    	struct drm_gem_object *shadow_bo;
>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>> index 1caff76aca6e..ec44f44d925f 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>> @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
>>>>    	return 0;
>>>>    }
>>>> +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
>>>> +{
>>>> +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
>>>> +	u32 count = 0;
>>>> +
>>>> +	postamble[count++] = PKT7(CP_REG_RMW, 3);
>>>> +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
>>>> +	postamble[count++] = 0;
>>>> +	postamble[count++] = 1;
>>>> +
>>>> +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
>>>> +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
>>>> +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
>>>
>>> Isn't it better to just replace this with NOP packets when sysprof is
>>> enabled, just before triggering preemption? It will help to have an
>>> immediate effect.
>>>
>>> -Akhil
>>>
>>
>> Mmm, this being a postamble I wonder whether we have the guarantee that it
>> finishes execution before the IRQ is called so updating it doesn't race with
>> the CP executing it.
> 
> Yes, it will be complete. But on a second thought now, this suggestion from me
> looks like an overkill.

Thanks for confirming! I have actually already implemented something 
similar to what you proposed 
https://gitlab.com/pac85/inux/-/commit/8b8ab1d89b0f611cfdbac4c3edba4192be91a7f9 
so we can chose between the two. Let me know your prefence.

> 
> -Akhil.
> 
>>
>>>> +
>>>> +	a6xx_gpu->preempt_postamble_len = count;
>>>> +}
>>>> +
>>>>    void a6xx_preempt_fini(struct msm_gpu *gpu)
>>>>    {
>>>>    	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>>> @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
>>>>    	a6xx_gpu->uses_gmem = 1;
>>>>    	a6xx_gpu->skip_save_restore = 1;
>>>> +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
>>>> +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
>>>> +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
>>>> +			&a6xx_gpu->preempt_postamble_iova);
>>>> +
>>>> +	preempt_prepare_postamble(a6xx_gpu);
>>>> +
>>>> +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
>>>> +		goto fail;
>>>> +
>>>>    	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
>>>>    	return;
>>>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>> index 6b1888280a83..87098567483b 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>> @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
>>>>    	OUT_RING(ring, PKT4(regindx, cnt));
>>>>    }
>>>> +#define PKT7(opcode, cnt) \
>>>> +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
>>>> +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
>>>> +
>>>>    static inline void
>>>>    OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
>>>>    {
>>>>    	adreno_wait_ring(ring, cnt + 1);
>>>> -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
>>>> -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
>>>> +	OUT_RING(ring, PKT7(opcode, cnt));
>>>>    }
>>>>    struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
>>>>
>>>> -- 
>>>> 2.46.0
>>>>
>>
>> Best regards,
>> -- 
>> Antonino Maniscalco <antomani103@gmail.com>
>>

Best regards,
Akhil P Oommen Sept. 12, 2024, 7:12 a.m. UTC | #5
On Wed, Sep 11, 2024 at 12:35:08AM +0200, Antonino Maniscalco wrote:
> On 9/10/24 11:34 PM, Akhil P Oommen wrote:
> > On Mon, Sep 09, 2024 at 05:07:42PM +0200, Antonino Maniscalco wrote:
> > > On 9/6/24 10:08 PM, Akhil P Oommen wrote:
> > > > On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
> > > > > Use the postamble to reset perf counters when switching between rings,
> > > > > except when sysprof is enabled, analogously to how they are reset
> > > > > between submissions when switching pagetables.
> > > > > 
> > > > > Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
> > > > > ---
> > > > >    drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
> > > > >    drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
> > > > >    drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
> > > > >    drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
> > > > >    4 files changed, 61 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > > > index ed0b138a2d66..710ec3ce2923 100644
> > > > > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> > > > > @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> > > > >    static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
> > > > >    		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
> > > > >    {
> > > > > -	u64 preempt_offset_priv_secure;
> > > > > +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
> > > > > +	u64 preempt_offset_priv_secure, preempt_postamble;
> > > > >    	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
> > > > > @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
> > > > >    	/* seems OK to set to 0 to disable it */
> > > > >    	OUT_RING(ring, 0);
> > > > >    	OUT_RING(ring, 0);
> > > > > +
> > > > > +	/* if not profiling set postamble to clear perfcounters, else clear it */
> > > > > +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
> > 
> > Setting len = 0 is enough to skip processing postamble packets. So how
> > about a simpler:
> > 
> > len = a6xx_gpu->preempt_postamble_len;
> > if (sysprof)
> > 	len = 0;
> > 
> > OUT_PKT7(ring, CP_SET_AMBLE, 3);
> > OUT_RING(ring, lower_32_bits(preempt_postamble));
> > OUT_RING(ring, upper_32_bits(preempt_postamble));
> > OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(len) |
> > 		CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> > 
> > > > > +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
> > > > > +
> > > > > +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> > > > > +		OUT_RING(ring, lower_32_bits(preempt_postamble));
> > > > > +		OUT_RING(ring, upper_32_bits(preempt_postamble));
> > > > > +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
> > > > > +					a6xx_gpu->preempt_postamble_len) |
> > > > > +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> > > > > +	} else {
> > > > 
> > > > Why do we need this else part?
> > > 
> > > Wouldn't the postmable remain set if we don't explicitly set it to 0?
> > 
> > Aah, that is a genuine concern. I am not sure! Lets keep it.
> > 
> > > 
> > > > 
> > > > > +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
> > > > > +		OUT_RING(ring, 0);
> > > > > +		OUT_RING(ring, 0);
> > > > > +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
> > > > > +	}
> > > > >    }
> > > > >    static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> > > > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > > > index da10060e38dc..b009732c08c5 100644
> > > > > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> > > > > @@ -71,6 +71,11 @@ struct a6xx_gpu {
> > > > >    	bool uses_gmem;
> > > > >    	bool skip_save_restore;
> > > > > +	struct drm_gem_object *preempt_postamble_bo;
> > > > > +	void *preempt_postamble_ptr;
> > > > > +	uint64_t preempt_postamble_iova;
> > > > > +	uint64_t preempt_postamble_len;
> > > > > +
> > > > >    	struct a6xx_gmu gmu;
> > > > >    	struct drm_gem_object *shadow_bo;
> > > > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > > > index 1caff76aca6e..ec44f44d925f 100644
> > > > > --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
> > > > > @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
> > > > >    	return 0;
> > > > >    }
> > > > > +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
> > > > > +{
> > > > > +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
> > > > > +	u32 count = 0;
> > > > > +
> > > > > +	postamble[count++] = PKT7(CP_REG_RMW, 3);
> > > > > +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
> > > > > +	postamble[count++] = 0;
> > > > > +	postamble[count++] = 1;
> > > > > +
> > > > > +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
> > > > > +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
> > > > > +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
> > > > 
> > > > Isn't it better to just replace this with NOP packets when sysprof is
> > > > enabled, just before triggering preemption? It will help to have an
> > > > immediate effect.
> > > > 
> > > > -Akhil
> > > > 
> > > 
> > > Mmm, this being a postamble I wonder whether we have the guarantee that it
> > > finishes execution before the IRQ is called so updating it doesn't race with
> > > the CP executing it.
> > 
> > Yes, it will be complete. But on a second thought now, this suggestion from me
> > looks like an overkill.
> 
> Thanks for confirming! I have actually already implemented something similar
> to what you proposed https://gitlab.com/pac85/inux/-/commit/8b8ab1d89b0f611cfdbac4c3edba4192be91a7f9
> so we can chose between the two. Let me know your prefence.

That looks fine. Can we try to simplify that patch further? We can lean
towards readability instead of saving few writes. I don't think there
will be frequent sysprof toggles.

-Akhil

> 
> > 
> > -Akhil.
> > 
> > > 
> > > > > +
> > > > > +	a6xx_gpu->preempt_postamble_len = count;
> > > > > +}
> > > > > +
> > > > >    void a6xx_preempt_fini(struct msm_gpu *gpu)
> > > > >    {
> > > > >    	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> > > > > @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
> > > > >    	a6xx_gpu->uses_gmem = 1;
> > > > >    	a6xx_gpu->skip_save_restore = 1;
> > > > > +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
> > > > > +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
> > > > > +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
> > > > > +			&a6xx_gpu->preempt_postamble_iova);
> > > > > +
> > > > > +	preempt_prepare_postamble(a6xx_gpu);
> > > > > +
> > > > > +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
> > > > > +		goto fail;
> > > > > +
> > > > >    	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
> > > > >    	return;
> > > > > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > > > index 6b1888280a83..87098567483b 100644
> > > > > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > > > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> > > > > @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
> > > > >    	OUT_RING(ring, PKT4(regindx, cnt));
> > > > >    }
> > > > > +#define PKT7(opcode, cnt) \
> > > > > +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
> > > > > +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
> > > > > +
> > > > >    static inline void
> > > > >    OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
> > > > >    {
> > > > >    	adreno_wait_ring(ring, cnt + 1);
> > > > > -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
> > > > > -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
> > > > > +	OUT_RING(ring, PKT7(opcode, cnt));
> > > > >    }
> > > > >    struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
> > > > > 
> > > > > -- 
> > > > > 2.46.0
> > > > > 
> > > 
> > > Best regards,
> > > -- 
> > > Antonino Maniscalco <antomani103@gmail.com>
> > > 
> 
> Best regards,
> -- 
> Antonino Maniscalco <antomani103@gmail.com>
>
Antonino Maniscalco Sept. 12, 2024, 11:15 a.m. UTC | #6
On 9/12/24 9:12 AM, Akhil P Oommen wrote:
> On Wed, Sep 11, 2024 at 12:35:08AM +0200, Antonino Maniscalco wrote:
>> On 9/10/24 11:34 PM, Akhil P Oommen wrote:
>>> On Mon, Sep 09, 2024 at 05:07:42PM +0200, Antonino Maniscalco wrote:
>>>> On 9/6/24 10:08 PM, Akhil P Oommen wrote:
>>>>> On Thu, Sep 05, 2024 at 04:51:24PM +0200, Antonino Maniscalco wrote:
>>>>>> Use the postamble to reset perf counters when switching between rings,
>>>>>> except when sysprof is enabled, analogously to how they are reset
>>>>>> between submissions when switching pagetables.
>>>>>>
>>>>>> Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/msm/adreno/a6xx_gpu.c     | 20 ++++++++++++++++++-
>>>>>>     drivers/gpu/drm/msm/adreno/a6xx_gpu.h     |  5 +++++
>>>>>>     drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 32 +++++++++++++++++++++++++++++++
>>>>>>     drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  7 +++++--
>>>>>>     4 files changed, 61 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>>>> index ed0b138a2d66..710ec3ce2923 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>>>>> @@ -366,7 +366,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>>>>>     static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>>>>>     		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
>>>>>>     {
>>>>>> -	u64 preempt_offset_priv_secure;
>>>>>> +	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
>>>>>> +	u64 preempt_offset_priv_secure, preempt_postamble;
>>>>>>     	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
>>>>>> @@ -398,6 +399,23 @@ static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
>>>>>>     	/* seems OK to set to 0 to disable it */
>>>>>>     	OUT_RING(ring, 0);
>>>>>>     	OUT_RING(ring, 0);
>>>>>> +
>>>>>> +	/* if not profiling set postamble to clear perfcounters, else clear it */
>>>>>> +	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
>>>
>>> Setting len = 0 is enough to skip processing postamble packets. So how
>>> about a simpler:
>>>
>>> len = a6xx_gpu->preempt_postamble_len;
>>> if (sysprof)
>>> 	len = 0;
>>>
>>> OUT_PKT7(ring, CP_SET_AMBLE, 3);
>>> OUT_RING(ring, lower_32_bits(preempt_postamble));
>>> OUT_RING(ring, upper_32_bits(preempt_postamble));
>>> OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(len) |
>>> 		CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>>>
>>>>>> +		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
>>>>>> +
>>>>>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>>>>>> +		OUT_RING(ring, lower_32_bits(preempt_postamble));
>>>>>> +		OUT_RING(ring, upper_32_bits(preempt_postamble));
>>>>>> +		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
>>>>>> +					a6xx_gpu->preempt_postamble_len) |
>>>>>> +				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>>>>>> +	} else {
>>>>>
>>>>> Why do we need this else part?
>>>>
>>>> Wouldn't the postmable remain set if we don't explicitly set it to 0?
>>>
>>> Aah, that is a genuine concern. I am not sure! Lets keep it.
>>>
>>>>
>>>>>
>>>>>> +		OUT_PKT7(ring, CP_SET_AMBLE, 3);
>>>>>> +		OUT_RING(ring, 0);
>>>>>> +		OUT_RING(ring, 0);
>>>>>> +		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
>>>>>> +	}
>>>>>>     }
>>>>>>     static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>>>> index da10060e38dc..b009732c08c5 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>>>>> @@ -71,6 +71,11 @@ struct a6xx_gpu {
>>>>>>     	bool uses_gmem;
>>>>>>     	bool skip_save_restore;
>>>>>> +	struct drm_gem_object *preempt_postamble_bo;
>>>>>> +	void *preempt_postamble_ptr;
>>>>>> +	uint64_t preempt_postamble_iova;
>>>>>> +	uint64_t preempt_postamble_len;
>>>>>> +
>>>>>>     	struct a6xx_gmu gmu;
>>>>>>     	struct drm_gem_object *shadow_bo;
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>>>> index 1caff76aca6e..ec44f44d925f 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>>>>> @@ -346,6 +346,28 @@ static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
>>>>>>     	return 0;
>>>>>>     }
>>>>>> +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
>>>>>> +{
>>>>>> +	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
>>>>>> +	u32 count = 0;
>>>>>> +
>>>>>> +	postamble[count++] = PKT7(CP_REG_RMW, 3);
>>>>>> +	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
>>>>>> +	postamble[count++] = 0;
>>>>>> +	postamble[count++] = 1;
>>>>>> +
>>>>>> +	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
>>>>>> +				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
>>>>>> +	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
>>>>>
>>>>> Isn't it better to just replace this with NOP packets when sysprof is
>>>>> enabled, just before triggering preemption? It will help to have an
>>>>> immediate effect.
>>>>>
>>>>> -Akhil
>>>>>
>>>>
>>>> Mmm, this being a postamble I wonder whether we have the guarantee that it
>>>> finishes execution before the IRQ is called so updating it doesn't race with
>>>> the CP executing it.
>>>
>>> Yes, it will be complete. But on a second thought now, this suggestion from me
>>> looks like an overkill.
>>
>> Thanks for confirming! I have actually already implemented something similar
>> to what you proposed https://gitlab.com/pac85/inux/-/commit/8b8ab1d89b0f611cfdbac4c3edba4192be91a7f9
>> so we can chose between the two. Let me know your prefence.
> 
> That looks fine. Can we try to simplify that patch further? We can lean
> towards readability instead of saving few writes. I don't think there
> will be frequent sysprof toggles.
> 

Sure yeah, I removed the patch argument on preempt_prepare_postamble so 
when we enable the postamble we just re-emit the entire IB.

> -Akhil
> 
>>
>>>
>>> -Akhil.
>>>
>>>>
>>>>>> +
>>>>>> +	a6xx_gpu->preempt_postamble_len = count;
>>>>>> +}
>>>>>> +
>>>>>>     void a6xx_preempt_fini(struct msm_gpu *gpu)
>>>>>>     {
>>>>>>     	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>>>>> @@ -376,6 +398,16 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
>>>>>>     	a6xx_gpu->uses_gmem = 1;
>>>>>>     	a6xx_gpu->skip_save_restore = 1;
>>>>>> +	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
>>>>>> +			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
>>>>>> +			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
>>>>>> +			&a6xx_gpu->preempt_postamble_iova);
>>>>>> +
>>>>>> +	preempt_prepare_postamble(a6xx_gpu);
>>>>>> +
>>>>>> +	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
>>>>>> +		goto fail;
>>>>>> +
>>>>>>     	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
>>>>>>     	return;
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>>>> index 6b1888280a83..87098567483b 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>>>>>> @@ -610,12 +610,15 @@ OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
>>>>>>     	OUT_RING(ring, PKT4(regindx, cnt));
>>>>>>     }
>>>>>> +#define PKT7(opcode, cnt) \
>>>>>> +	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
>>>>>> +		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
>>>>>> +
>>>>>>     static inline void
>>>>>>     OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
>>>>>>     {
>>>>>>     	adreno_wait_ring(ring, cnt + 1);
>>>>>> -	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
>>>>>> -		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
>>>>>> +	OUT_RING(ring, PKT7(opcode, cnt));
>>>>>>     }
>>>>>>     struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);
>>>>>>
>>>>>> -- 
>>>>>> 2.46.0
>>>>>>
>>>>
>>>> Best regards,
>>>> -- 
>>>> Antonino Maniscalco <antomani103@gmail.com>
>>>>
>>
>> Best regards,
>> -- 
>> Antonino Maniscalco <antomani103@gmail.com>
>>

Best regards,
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index ed0b138a2d66..710ec3ce2923 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -366,7 +366,8 @@  static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
 		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
 {
-	u64 preempt_offset_priv_secure;
+	bool sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
+	u64 preempt_offset_priv_secure, preempt_postamble;
 
 	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 15);
 
@@ -398,6 +399,23 @@  static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
 	/* seems OK to set to 0 to disable it */
 	OUT_RING(ring, 0);
 	OUT_RING(ring, 0);
+
+	/* if not profiling set postamble to clear perfcounters, else clear it */
+	if (!sysprof && a6xx_gpu->preempt_postamble_len) {
+		preempt_postamble = a6xx_gpu->preempt_postamble_iova;
+
+		OUT_PKT7(ring, CP_SET_AMBLE, 3);
+		OUT_RING(ring, lower_32_bits(preempt_postamble));
+		OUT_RING(ring, upper_32_bits(preempt_postamble));
+		OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(
+					a6xx_gpu->preempt_postamble_len) |
+				CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
+	} else {
+		OUT_PKT7(ring, CP_SET_AMBLE, 3);
+		OUT_RING(ring, 0);
+		OUT_RING(ring, 0);
+		OUT_RING(ring, CP_SET_AMBLE_2_TYPE(KMD_AMBLE_TYPE));
+	}
 }
 
 static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index da10060e38dc..b009732c08c5 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -71,6 +71,11 @@  struct a6xx_gpu {
 	bool uses_gmem;
 	bool skip_save_restore;
 
+	struct drm_gem_object *preempt_postamble_bo;
+	void *preempt_postamble_ptr;
+	uint64_t preempt_postamble_iova;
+	uint64_t preempt_postamble_len;
+
 	struct a6xx_gmu gmu;
 
 	struct drm_gem_object *shadow_bo;
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
index 1caff76aca6e..ec44f44d925f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
@@ -346,6 +346,28 @@  static int preempt_init_ring(struct a6xx_gpu *a6xx_gpu,
 	return 0;
 }
 
+static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
+{
+	u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
+	u32 count = 0;
+
+	postamble[count++] = PKT7(CP_REG_RMW, 3);
+	postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
+	postamble[count++] = 0;
+	postamble[count++] = 1;
+
+	postamble[count++] = PKT7(CP_WAIT_REG_MEM, 6);
+	postamble[count++] = CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ);
+	postamble[count++] = CP_WAIT_REG_MEM_1_POLL_ADDR_LO(
+				REG_A6XX_RBBM_PERFCTR_SRAM_INIT_STATUS);
+	postamble[count++] = CP_WAIT_REG_MEM_2_POLL_ADDR_HI(0);
+	postamble[count++] = CP_WAIT_REG_MEM_3_REF(0x1);
+	postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
+	postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
+
+	a6xx_gpu->preempt_postamble_len = count;
+}
+
 void a6xx_preempt_fini(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -376,6 +398,16 @@  void a6xx_preempt_init(struct msm_gpu *gpu)
 	a6xx_gpu->uses_gmem = 1;
 	a6xx_gpu->skip_save_restore = 1;
 
+	a6xx_gpu->preempt_postamble_ptr  = msm_gem_kernel_new(gpu->dev,
+			PAGE_SIZE, MSM_BO_WC | MSM_BO_MAP_PRIV,
+			gpu->aspace, &a6xx_gpu->preempt_postamble_bo,
+			&a6xx_gpu->preempt_postamble_iova);
+
+	preempt_prepare_postamble(a6xx_gpu);
+
+	if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
+		goto fail;
+
 	timer_setup(&a6xx_gpu->preempt_timer, a6xx_preempt_timer, 0);
 
 	return;
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index 6b1888280a83..87098567483b 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -610,12 +610,15 @@  OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
 	OUT_RING(ring, PKT4(regindx, cnt));
 }
 
+#define PKT7(opcode, cnt) \
+	(CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | \
+		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23))
+
 static inline void
 OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
 {
 	adreno_wait_ring(ring, cnt + 1);
-	OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
-		((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
+	OUT_RING(ring, PKT7(opcode, cnt));
 }
 
 struct msm_gpu *a2xx_gpu_init(struct drm_device *dev);