diff mbox series

[3/3] drm/msm: Use Hardware counters for perf profiling

Message ID 1539781441-13076-4-git-send-email-smasetty@codeaurora.org (mailing list archive)
State New, archived
Headers show
Series drm/msm: Revive GPU workload profiling | expand

Commit Message

Sharat Masetty Oct. 17, 2018, 1:04 p.m. UTC
This patch attempts to make use of the hardware counters for GPU busy %
estimation when possible and skip using the software counters as it also
accounts for software side delays. This should help give more accurate
representation of the GPU workload.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
 drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
 drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
 3 files changed, 34 insertions(+), 11 deletions(-)

Comments

Jordan Crouse Oct. 17, 2018, 2:35 p.m. UTC | #1
On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
> This patch attempts to make use of the hardware counters for GPU busy %
> estimation when possible and skip using the software counters as it also
> accounts for software side delays. This should help give more accurate
> representation of the GPU workload.

I would leave this more to Rob as this particular file makes more sense for
freedreno than it does for us but I think in general if you want to do this
then we should do use the hardware for all targets and get rid of the
mix of the old and the new.

> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
>  drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
>  drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
>  3 files changed, 34 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index e9b5426..a896541 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
>  	uint32_t elapsed;
>  	unsigned long flags;
>  
> +	if (gpu->funcs->gpu_busy)
> +		return;

Like here - there isn't any reason to not have a gpu_busy for each target
so then this code could mostly go away.

>  	spin_lock_irqsave(&gpu->perf_lock, flags);
>  	if (!gpu->perfcntr_active)
>  		goto out;
> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
>  	/* we could dynamically enable/disable perfcntr registers too.. */
>  	gpu->last_sample.active = msm_gpu_active(gpu);
>  	gpu->last_sample.time = ktime_get();
> +	gpu->last_sample.busy_cycles = 0;
>  	gpu->activetime = gpu->totaltime = 0;
>  	gpu->perfcntr_active = true;
>  	update_hw_cntrs(gpu, 0, NULL);
> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
>  	pm_runtime_put_sync(&gpu->pdev->dev);
>  }
>  
> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime)
> +{
> +	ktime_t time;
> +
> +	*activetime = gpu->funcs->gpu_busy(gpu,
> +			&gpu->last_sample.busy_cycles);
> +
> +	time = ktime_get();
> +	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
> +	gpu->last_sample.time = time;
> +}

This can all be done inline in the sample function below.

>  /* returns -errno or # of cntrs sampled */
> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)

This might be an good change (though if our activetime or totaltime ever
goes over 32 bits we've got ourselves a problem) - but it should be in a
separate patch.

>  {
>  	unsigned long flags;
>  	int ret;
> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>  		goto out;
>  	}
>  
> +	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> +
> +	if (gpu->funcs->gpu_busy) {
> +		msm_gpu_hw_sample(gpu, activetime, totaltime);
> +		goto out;
> +	}
> +
>  	*activetime = gpu->activetime;
>  	*totaltime = gpu->totaltime;
>  
>  	gpu->activetime = gpu->totaltime = 0;
>  
> -	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> -
>  out:
>  	spin_unlock_irqrestore(&gpu->perf_lock, flags);
>  
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 0ff23ca..7dc775f 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -90,6 +90,7 @@ struct msm_gpu {
>  	struct {
>  		bool active;
>  		ktime_t time;
> +		u64 busy_cycles;
>  	} last_sample;
>  	uint32_t totaltime, activetime;    /* sw counters */
>  	uint32_t last_cntrs[5];            /* hw counters */
> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
>  
>  void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
>  void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>  
>  void msm_gpu_retire(struct msm_gpu *gpu);
>  void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
> index 5ab21bd..318f7dd 100644
> --- a/drivers/gpu/drm/msm/msm_perf.c
> +++ b/drivers/gpu/drm/msm/msm_perf.c
> @@ -17,7 +17,7 @@
>  
>  /* For profiling, userspace can:
>   *
> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
>   *
>   * This will enable performance counters/profiling to track the busy time
>   * and any gpu specific performance counters that are supported.
> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
>  		}
>  	} else {
>  		/* Sample line: */
> -		uint32_t activetime = 0, totaltime = 0;
> +		uint64_t activetime = 0, totaltime = 0;

All the changes to msm_perf.c shuld also be in a separate patch that moves
the sample size to u64.

>  		uint32_t cntrs[5];
> -		uint32_t val;
> +		uint64_t val;
>  		int ret;
>  
>  		/* sleep until next sample time: */
> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
>  			return ret;
>  
>  		val = totaltime ? 1000 * activetime / totaltime : 0;
> -		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
> +		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
>  		ptr += n;
>  		rem -= n;
>  
>  		for (i = 0; i < ret; i++) {
>  			/* cycle counters (I think).. convert to MHz.. */
>  			val = cntrs[i] / 10000;
> -			n = snprintf(ptr, rem, "\t%5d.%02d",
> +			n = snprintf(ptr, rem, "\t%5llu.%02llu",
>  					val / 100, val % 100);
>  			ptr += n;
>  			rem -= n;
> -- 
> 1.9.1
>
Sharat Masetty Oct. 26, 2018, 1:46 p.m. UTC | #2
Added Rob to this thread.

On 10/17/2018 8:05 PM, Jordan Crouse wrote:
> On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
>> This patch attempts to make use of the hardware counters for GPU busy %
>> estimation when possible and skip using the software counters as it also
>> accounts for software side delays. This should help give more accurate
>> representation of the GPU workload.
> 
> I would leave this more to Rob as this particular file makes more sense for
> freedreno than it does for us but I think in general if you want to do this
> then we should do use the hardware for all targets and get rid of the
> mix of the old and the new.
Thanks for the comments Jordan. Yes, I prefer the same too, but the only 
limiting factor for me is that I don't have a way to test changes for 
targets such as a3xx and a4xx, and I also do not know if there is anyone 
actually using this currently.

Hi Rob,
Can you please share your comments on this? Is it okay to remove 
software counters functionality?

Sharat
> 
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
>>   drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
>>   drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
>>   3 files changed, 34 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index e9b5426..a896541 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
>>   	uint32_t elapsed;
>>   	unsigned long flags;
>>   
>> +	if (gpu->funcs->gpu_busy)
>> +		return;
> 
> Like here - there isn't any reason to not have a gpu_busy for each target
> so then this code could mostly go away.
> 
>>   	spin_lock_irqsave(&gpu->perf_lock, flags);
>>   	if (!gpu->perfcntr_active)
>>   		goto out;
>> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
>>   	/* we could dynamically enable/disable perfcntr registers too.. */
>>   	gpu->last_sample.active = msm_gpu_active(gpu);
>>   	gpu->last_sample.time = ktime_get();
>> +	gpu->last_sample.busy_cycles = 0;
>>   	gpu->activetime = gpu->totaltime = 0;
>>   	gpu->perfcntr_active = true;
>>   	update_hw_cntrs(gpu, 0, NULL);
>> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
>>   	pm_runtime_put_sync(&gpu->pdev->dev);
>>   }
>>   
>> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime)
>> +{
>> +	ktime_t time;
>> +
>> +	*activetime = gpu->funcs->gpu_busy(gpu,
>> +			&gpu->last_sample.busy_cycles);
>> +
>> +	time = ktime_get();
>> +	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
>> +	gpu->last_sample.time = time;
>> +}
> 
> This can all be done inline in the sample function below.
> 
>>   /* returns -errno or # of cntrs sampled */
>> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
>> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> 
> This might be an good change (though if our activetime or totaltime ever
> goes over 32 bits we've got ourselves a problem) - but it should be in a
> separate patch.
> 
>>   {
>>   	unsigned long flags;
>>   	int ret;
>> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>>   		goto out;
>>   	}
>>   
>> +	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
>> +
>> +	if (gpu->funcs->gpu_busy) {
>> +		msm_gpu_hw_sample(gpu, activetime, totaltime);
>> +		goto out;
>> +	}
>> +
>>   	*activetime = gpu->activetime;
>>   	*totaltime = gpu->totaltime;
>>   
>>   	gpu->activetime = gpu->totaltime = 0;
>>   
>> -	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
>> -
>>   out:
>>   	spin_unlock_irqrestore(&gpu->perf_lock, flags);
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index 0ff23ca..7dc775f 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -90,6 +90,7 @@ struct msm_gpu {
>>   	struct {
>>   		bool active;
>>   		ktime_t time;
>> +		u64 busy_cycles;
>>   	} last_sample;
>>   	uint32_t totaltime, activetime;    /* sw counters */
>>   	uint32_t last_cntrs[5];            /* hw counters */
>> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
>>   
>>   void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
>>   void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
>> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>>   
>>   void msm_gpu_retire(struct msm_gpu *gpu);
>>   void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
>> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
>> index 5ab21bd..318f7dd 100644
>> --- a/drivers/gpu/drm/msm/msm_perf.c
>> +++ b/drivers/gpu/drm/msm/msm_perf.c
>> @@ -17,7 +17,7 @@
>>   
>>   /* For profiling, userspace can:
>>    *
>> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
>> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
>>    *
>>    * This will enable performance counters/profiling to track the busy time
>>    * and any gpu specific performance counters that are supported.
>> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
>>   		}
>>   	} else {
>>   		/* Sample line: */
>> -		uint32_t activetime = 0, totaltime = 0;
>> +		uint64_t activetime = 0, totaltime = 0;
> 
> All the changes to msm_perf.c shuld also be in a separate patch that moves
> the sample size to u64.
> 
>>   		uint32_t cntrs[5];
>> -		uint32_t val;
>> +		uint64_t val;
>>   		int ret;
>>   
>>   		/* sleep until next sample time: */
>> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
>>   			return ret;
>>   
>>   		val = totaltime ? 1000 * activetime / totaltime : 0;
>> -		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
>> +		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
>>   		ptr += n;
>>   		rem -= n;
>>   
>>   		for (i = 0; i < ret; i++) {
>>   			/* cycle counters (I think).. convert to MHz.. */
>>   			val = cntrs[i] / 10000;
>> -			n = snprintf(ptr, rem, "\t%5d.%02d",
>> +			n = snprintf(ptr, rem, "\t%5llu.%02llu",
>>   					val / 100, val % 100);
>>   			ptr += n;
>>   			rem -= n;
>> -- 
>> 1.9.1
>>
>
Rob Clark Nov. 28, 2018, 8:09 p.m. UTC | #3
On Fri, Oct 26, 2018 at 9:46 AM Sharat Masetty <smasetty@codeaurora.org> wrote:
>
> Added Rob to this thread.
>
> On 10/17/2018 8:05 PM, Jordan Crouse wrote:
> > On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
> >> This patch attempts to make use of the hardware counters for GPU busy %
> >> estimation when possible and skip using the software counters as it also
> >> accounts for software side delays. This should help give more accurate
> >> representation of the GPU workload.
> >
> > I would leave this more to Rob as this particular file makes more sense for
> > freedreno than it does for us but I think in general if you want to do this
> > then we should do use the hardware for all targets and get rid of the
> > mix of the old and the new.
> Thanks for the comments Jordan. Yes, I prefer the same too, but the only
> limiting factor for me is that I don't have a way to test changes for
> targets such as a3xx and a4xx, and I also do not know if there is anyone
> actually using this currently.
>
> Hi Rob,
> Can you please share your comments on this? Is it okay to remove
> software counters functionality?

In principle yes..  although one side-comment is that there are
patches floating around for a2xx, which is from long enough ago that I
don't remember what the perf-ctr situation is there.

I guess if we can be reasonably confident to implement it w/ hw
counters for all the generations, then I don't see a big need to keep
the sw counter functionality.

I should, in theory, be able to test a3xx/a4xx/a5xx.. a4xx might be a
bit harder to get a recent upstream kernel running on

BR,
-R

>
> Sharat
> >
> >> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> >> ---
> >>   drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
> >>   drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
> >>   drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
> >>   3 files changed, 34 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> >> index e9b5426..a896541 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.c
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> >> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
> >>      uint32_t elapsed;
> >>      unsigned long flags;
> >>
> >> +    if (gpu->funcs->gpu_busy)
> >> +            return;
> >
> > Like here - there isn't any reason to not have a gpu_busy for each target
> > so then this code could mostly go away.
> >
> >>      spin_lock_irqsave(&gpu->perf_lock, flags);
> >>      if (!gpu->perfcntr_active)
> >>              goto out;
> >> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
> >>      /* we could dynamically enable/disable perfcntr registers too.. */
> >>      gpu->last_sample.active = msm_gpu_active(gpu);
> >>      gpu->last_sample.time = ktime_get();
> >> +    gpu->last_sample.busy_cycles = 0;
> >>      gpu->activetime = gpu->totaltime = 0;
> >>      gpu->perfcntr_active = true;
> >>      update_hw_cntrs(gpu, 0, NULL);
> >> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
> >>      pm_runtime_put_sync(&gpu->pdev->dev);
> >>   }
> >>
> >> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime)
> >> +{
> >> +    ktime_t time;
> >> +
> >> +    *activetime = gpu->funcs->gpu_busy(gpu,
> >> +                    &gpu->last_sample.busy_cycles);
> >> +
> >> +    time = ktime_get();
> >> +    *totaltime = ktime_us_delta(time, gpu->last_sample.time);
> >> +    gpu->last_sample.time = time;
> >> +}
> >
> > This can all be done inline in the sample function below.
> >
> >>   /* returns -errno or # of cntrs sampled */
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >
> > This might be an good change (though if our activetime or totaltime ever
> > goes over 32 bits we've got ourselves a problem) - but it should be in a
> > separate patch.
> >
> >>   {
> >>      unsigned long flags;
> >>      int ret;
> >> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >>              goto out;
> >>      }
> >>
> >> +    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> +
> >> +    if (gpu->funcs->gpu_busy) {
> >> +            msm_gpu_hw_sample(gpu, activetime, totaltime);
> >> +            goto out;
> >> +    }
> >> +
> >>      *activetime = gpu->activetime;
> >>      *totaltime = gpu->totaltime;
> >>
> >>      gpu->activetime = gpu->totaltime = 0;
> >>
> >> -    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> -
> >>   out:
> >>      spin_unlock_irqrestore(&gpu->perf_lock, flags);
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> >> index 0ff23ca..7dc775f 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.h
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> >> @@ -90,6 +90,7 @@ struct msm_gpu {
> >>      struct {
> >>              bool active;
> >>              ktime_t time;
> >> +            u64 busy_cycles;
> >>      } last_sample;
> >>      uint32_t totaltime, activetime;    /* sw counters */
> >>      uint32_t last_cntrs[5];            /* hw counters */
> >> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
> >>
> >>   void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
> >>   void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >>
> >>   void msm_gpu_retire(struct msm_gpu *gpu);
> >>   void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
> >> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
> >> index 5ab21bd..318f7dd 100644
> >> --- a/drivers/gpu/drm/msm/msm_perf.c
> >> +++ b/drivers/gpu/drm/msm/msm_perf.c
> >> @@ -17,7 +17,7 @@
> >>
> >>   /* For profiling, userspace can:
> >>    *
> >> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
> >> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
> >>    *
> >>    * This will enable performance counters/profiling to track the busy time
> >>    * and any gpu specific performance counters that are supported.
> >> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
> >>              }
> >>      } else {
> >>              /* Sample line: */
> >> -            uint32_t activetime = 0, totaltime = 0;
> >> +            uint64_t activetime = 0, totaltime = 0;
> >
> > All the changes to msm_perf.c shuld also be in a separate patch that moves
> > the sample size to u64.
> >
> >>              uint32_t cntrs[5];
> >> -            uint32_t val;
> >> +            uint64_t val;
> >>              int ret;
> >>
> >>              /* sleep until next sample time: */
> >> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
> >>                      return ret;
> >>
> >>              val = totaltime ? 1000 * activetime / totaltime : 0;
> >> -            n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
> >> +            n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
> >>              ptr += n;
> >>              rem -= n;
> >>
> >>              for (i = 0; i < ret; i++) {
> >>                      /* cycle counters (I think).. convert to MHz.. */
> >>                      val = cntrs[i] / 10000;
> >> -                    n = snprintf(ptr, rem, "\t%5d.%02d",
> >> +                    n = snprintf(ptr, rem, "\t%5llu.%02llu",
> >>                                      val / 100, val % 100);
> >>                      ptr += n;
> >>                      rem -= n;
> >> --
> >> 1.9.1
> >>
> >
>
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> Linux Foundation Collaborative Project
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index e9b5426..a896541 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -592,6 +592,9 @@  static void update_sw_cntrs(struct msm_gpu *gpu)
 	uint32_t elapsed;
 	unsigned long flags;
 
+	if (gpu->funcs->gpu_busy)
+		return;
+
 	spin_lock_irqsave(&gpu->perf_lock, flags);
 	if (!gpu->perfcntr_active)
 		goto out;
@@ -620,6 +623,7 @@  void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
 	/* we could dynamically enable/disable perfcntr registers too.. */
 	gpu->last_sample.active = msm_gpu_active(gpu);
 	gpu->last_sample.time = ktime_get();
+	gpu->last_sample.busy_cycles = 0;
 	gpu->activetime = gpu->totaltime = 0;
 	gpu->perfcntr_active = true;
 	update_hw_cntrs(gpu, 0, NULL);
@@ -632,9 +636,22 @@  void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
 	pm_runtime_put_sync(&gpu->pdev->dev);
 }
 
+static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime)
+{
+	ktime_t time;
+
+	*activetime = gpu->funcs->gpu_busy(gpu,
+			&gpu->last_sample.busy_cycles);
+
+	time = ktime_get();
+	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
+	gpu->last_sample.time = time;
+}
+
 /* returns -errno or # of cntrs sampled */
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
 {
 	unsigned long flags;
 	int ret;
@@ -646,13 +663,18 @@  int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
 		goto out;
 	}
 
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+
+	if (gpu->funcs->gpu_busy) {
+		msm_gpu_hw_sample(gpu, activetime, totaltime);
+		goto out;
+	}
+
 	*activetime = gpu->activetime;
 	*totaltime = gpu->totaltime;
 
 	gpu->activetime = gpu->totaltime = 0;
 
-	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
-
 out:
 	spin_unlock_irqrestore(&gpu->perf_lock, flags);
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 0ff23ca..7dc775f 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -90,6 +90,7 @@  struct msm_gpu {
 	struct {
 		bool active;
 		ktime_t time;
+		u64 busy_cycles;
 	} last_sample;
 	uint32_t totaltime, activetime;    /* sw counters */
 	uint32_t last_cntrs[5];            /* hw counters */
@@ -275,8 +276,8 @@  static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
 
 void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
 void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
 
 void msm_gpu_retire(struct msm_gpu *gpu);
 void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
index 5ab21bd..318f7dd 100644
--- a/drivers/gpu/drm/msm/msm_perf.c
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -17,7 +17,7 @@ 
 
 /* For profiling, userspace can:
  *
- *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *   tail -f /sys/kernel/debug/dri/<minor>/perf
  *
  * This will enable performance counters/profiling to track the busy time
  * and any gpu specific performance counters that are supported.
@@ -85,9 +85,9 @@  static int refill_buf(struct msm_perf_state *perf)
 		}
 	} else {
 		/* Sample line: */
-		uint32_t activetime = 0, totaltime = 0;
+		uint64_t activetime = 0, totaltime = 0;
 		uint32_t cntrs[5];
-		uint32_t val;
+		uint64_t val;
 		int ret;
 
 		/* sleep until next sample time: */
@@ -101,14 +101,14 @@  static int refill_buf(struct msm_perf_state *perf)
 			return ret;
 
 		val = totaltime ? 1000 * activetime / totaltime : 0;
-		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
 		ptr += n;
 		rem -= n;
 
 		for (i = 0; i < ret; i++) {
 			/* cycle counters (I think).. convert to MHz.. */
 			val = cntrs[i] / 10000;
-			n = snprintf(ptr, rem, "\t%5d.%02d",
+			n = snprintf(ptr, rem, "\t%5llu.%02llu",
 					val / 100, val % 100);
 			ptr += n;
 			rem -= n;