diff mbox series

[11/12] drm/v3d: Do not use intermediate storage when copying performance query results

Message ID 20240709163425.58276-12-tursulin@igalia.com (mailing list archive)
State New, archived
Headers show
Series v3d: Perfmon cleanup | expand

Commit Message

Tvrtko Ursulin July 9, 2024, 4:34 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>

Removing the intermediate buffer removes the last use of the
V3D_MAX_COUNTERS define, which will enable further driver cleanup.

While at it pull the 32 vs 64 bit copying decision outside the loop in
order to reduce the number of conditional instructions.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
 drivers/gpu/drm/v3d/v3d_sched.c | 60 ++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 23 deletions(-)

Comments

Iago Toral July 11, 2024, 12:31 p.m. UTC | #1
El mar, 09-07-2024 a las 17:34 +0100, Tvrtko Ursulin escribió:
> From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> 
> Removing the intermediate buffer removes the last use of the
> V3D_MAX_COUNTERS define, which will enable further driver cleanup.
> 
> While at it pull the 32 vs 64 bit copying decision outside the loop
> in
> order to reduce the number of conditional instructions.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
>  drivers/gpu/drm/v3d/v3d_sched.c | 60 ++++++++++++++++++++-----------
> --
>  1 file changed, 37 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
> b/drivers/gpu/drm/v3d/v3d_sched.c
> index fc8730264386..77f795e38fad 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -421,18 +421,23 @@ v3d_reset_timestamp_queries(struct v3d_cpu_job
> *job)
>  	v3d_put_bo_vaddr(bo);
>  }
>  
> +static void write_to_buffer_32(u32 *dst, unsigned int idx, u32
> value)
> +{
> +	dst[idx] = value;
> +}
> +
> +static void write_to_buffer_64(u64 *dst, unsigned int idx, u64
> value)
> +{
> +	dst[idx] = value;
> +}
> +
>  static void
> -write_to_buffer(void *dst, u32 idx, bool do_64bit, u64 value)
> +write_to_buffer(void *dst, unsigned int idx, bool do_64bit, u64
> value)
>  {
> -	if (do_64bit) {
> -		u64 *dst64 = (u64 *)dst;
> -
> -		dst64[idx] = value;
> -	} else {
> -		u32 *dst32 = (u32 *)dst;
> -
> -		dst32[idx] = (u32)value;
> -	}
> +	if (do_64bit)
> +		write_to_buffer_64(dst, idx, value);
> +	else
> +		write_to_buffer_32(dst, idx, value);
>  }
>  
>  static void
> @@ -505,18 +510,23 @@ v3d_reset_performance_queries(struct
> v3d_cpu_job *job)
>  }
>  
>  static void
> -v3d_write_performance_query_result(struct v3d_cpu_job *job, void
> *data, u32 query)
> +v3d_write_performance_query_result(struct v3d_cpu_job *job, void
> *data,
> +				   unsigned int query)
>  {
> -	struct v3d_performance_query_info *performance_query = &job-
> >performance_query;
> -	struct v3d_copy_query_results_info *copy = &job->copy;
> +	struct v3d_performance_query_info *performance_query =
> +						&job-
> >performance_query;
>  	struct v3d_file_priv *v3d_priv = job->base.file-
> >driver_priv;
>  	struct v3d_dev *v3d = job->base.v3d;
> -	struct v3d_perfmon *perfmon;
> -	u64 counter_values[V3D_MAX_COUNTERS];
> +	unsigned int i, j, offset;
>  
> -	for (int i = 0; i < performance_query->nperfmons; i++) {
> -		perfmon = v3d_perfmon_find(v3d_priv,
> -					   performance_query-
> >queries[query].kperfmon_ids[i]);
> +	for (i = 0, offset = 0;
> +	     i < performance_query->nperfmons;
> +	     i++, offset += DRM_V3D_MAX_PERF_COUNTERS) {
> +		struct v3d_performance_query *q =
> +				&performance_query->queries[query];

Looks like we could move this before the loop.

Otherwise this patch is:
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>


> +		struct v3d_perfmon *perfmon;
> +
> +		perfmon = v3d_perfmon_find(v3d_priv, q-
> >kperfmon_ids[i]);
>  		if (!perfmon) {
>  			DRM_DEBUG("Failed to find perfmon.");
>  			continue;
> @@ -524,14 +534,18 @@ v3d_write_performance_query_result(struct
> v3d_cpu_job *job, void *data, u32 quer
>  
>  		v3d_perfmon_stop(v3d, perfmon, true);
>  
> -		memcpy(&counter_values[i *
> DRM_V3D_MAX_PERF_COUNTERS], perfmon->values,
> -		       perfmon->ncounters * sizeof(u64));
> +		if (job->copy.do_64bit) {
> +			for (j = 0; j < perfmon->ncounters; j++)
> +				write_to_buffer_64(data, offset + j,
> +						   perfmon-
> >values[j]);
> +		} else {
> +			for (j = 0; j < perfmon->ncounters; j++)
> +				write_to_buffer_32(data, offset + j,
> +						   perfmon-
> >values[j]);
> +		}
>  
>  		v3d_perfmon_put(perfmon);
>  	}
> -
> -	for (int i = 0; i < performance_query->ncounters; i++)
> -		write_to_buffer(data, i, copy->do_64bit,
> counter_values[i]);
>  }
>  
>  static void
Tvrtko Ursulin July 11, 2024, 12:55 p.m. UTC | #2
On 11/07/2024 13:31, Iago Toral wrote:
> El mar, 09-07-2024 a las 17:34 +0100, Tvrtko Ursulin escribió:
>> From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>
>> Removing the intermediate buffer removes the last use of the
>> V3D_MAX_COUNTERS define, which will enable further driver cleanup.
>>
>> While at it pull the 32 vs 64 bit copying decision outside the loop
>> in
>> order to reduce the number of conditional instructions.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>>   drivers/gpu/drm/v3d/v3d_sched.c | 60 ++++++++++++++++++++-----------
>> --
>>   1 file changed, 37 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>> b/drivers/gpu/drm/v3d/v3d_sched.c
>> index fc8730264386..77f795e38fad 100644
>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>> @@ -421,18 +421,23 @@ v3d_reset_timestamp_queries(struct v3d_cpu_job
>> *job)
>>   	v3d_put_bo_vaddr(bo);
>>   }
>>   
>> +static void write_to_buffer_32(u32 *dst, unsigned int idx, u32
>> value)
>> +{
>> +	dst[idx] = value;
>> +}
>> +
>> +static void write_to_buffer_64(u64 *dst, unsigned int idx, u64
>> value)
>> +{
>> +	dst[idx] = value;
>> +}
>> +
>>   static void
>> -write_to_buffer(void *dst, u32 idx, bool do_64bit, u64 value)
>> +write_to_buffer(void *dst, unsigned int idx, bool do_64bit, u64
>> value)
>>   {
>> -	if (do_64bit) {
>> -		u64 *dst64 = (u64 *)dst;
>> -
>> -		dst64[idx] = value;
>> -	} else {
>> -		u32 *dst32 = (u32 *)dst;
>> -
>> -		dst32[idx] = (u32)value;
>> -	}
>> +	if (do_64bit)
>> +		write_to_buffer_64(dst, idx, value);
>> +	else
>> +		write_to_buffer_32(dst, idx, value);
>>   }
>>   
>>   static void
>> @@ -505,18 +510,23 @@ v3d_reset_performance_queries(struct
>> v3d_cpu_job *job)
>>   }
>>   
>>   static void
>> -v3d_write_performance_query_result(struct v3d_cpu_job *job, void
>> *data, u32 query)
>> +v3d_write_performance_query_result(struct v3d_cpu_job *job, void
>> *data,
>> +				   unsigned int query)
>>   {
>> -	struct v3d_performance_query_info *performance_query = &job-
>>> performance_query;
>> -	struct v3d_copy_query_results_info *copy = &job->copy;
>> +	struct v3d_performance_query_info *performance_query =
>> +						&job-
>>> performance_query;
>>   	struct v3d_file_priv *v3d_priv = job->base.file-
>>> driver_priv;
>>   	struct v3d_dev *v3d = job->base.v3d;
>> -	struct v3d_perfmon *perfmon;
>> -	u64 counter_values[V3D_MAX_COUNTERS];
>> +	unsigned int i, j, offset;
>>   
>> -	for (int i = 0; i < performance_query->nperfmons; i++) {
>> -		perfmon = v3d_perfmon_find(v3d_priv,
>> -					   performance_query-
>>> queries[query].kperfmon_ids[i]);
>> +	for (i = 0, offset = 0;
>> +	     i < performance_query->nperfmons;
>> +	     i++, offset += DRM_V3D_MAX_PERF_COUNTERS) {
>> +		struct v3d_performance_query *q =
>> +				&performance_query->queries[query];
> 
> Looks like we could move this before the loop.

Indeed! I will change it and re-send, either for v4 of the series, or 
single update if there will not be any other changes required.

> Otherwise this patch is:
> Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>

Thanks!

Regards,

Tvrtko

>> +		struct v3d_perfmon *perfmon;
>> +
>> +		perfmon = v3d_perfmon_find(v3d_priv, q-
>>> kperfmon_ids[i]);
>>   		if (!perfmon) {
>>   			DRM_DEBUG("Failed to find perfmon.");
>>   			continue;
>> @@ -524,14 +534,18 @@ v3d_write_performance_query_result(struct
>> v3d_cpu_job *job, void *data, u32 quer
>>   
>>   		v3d_perfmon_stop(v3d, perfmon, true);
>>   
>> -		memcpy(&counter_values[i *
>> DRM_V3D_MAX_PERF_COUNTERS], perfmon->values,
>> -		       perfmon->ncounters * sizeof(u64));
>> +		if (job->copy.do_64bit) {
>> +			for (j = 0; j < perfmon->ncounters; j++)
>> +				write_to_buffer_64(data, offset + j,
>> +						   perfmon-
>>> values[j]);
>> +		} else {
>> +			for (j = 0; j < perfmon->ncounters; j++)
>> +				write_to_buffer_32(data, offset + j,
>> +						   perfmon-
>>> values[j]);
>> +		}
>>   
>>   		v3d_perfmon_put(perfmon);
>>   	}
>> -
>> -	for (int i = 0; i < performance_query->ncounters; i++)
>> -		write_to_buffer(data, i, copy->do_64bit,
>> counter_values[i]);
>>   }
>>   
>>   static void
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index fc8730264386..77f795e38fad 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -421,18 +421,23 @@  v3d_reset_timestamp_queries(struct v3d_cpu_job *job)
 	v3d_put_bo_vaddr(bo);
 }
 
+static void write_to_buffer_32(u32 *dst, unsigned int idx, u32 value)
+{
+	dst[idx] = value;
+}
+
+static void write_to_buffer_64(u64 *dst, unsigned int idx, u64 value)
+{
+	dst[idx] = value;
+}
+
 static void
-write_to_buffer(void *dst, u32 idx, bool do_64bit, u64 value)
+write_to_buffer(void *dst, unsigned int idx, bool do_64bit, u64 value)
 {
-	if (do_64bit) {
-		u64 *dst64 = (u64 *)dst;
-
-		dst64[idx] = value;
-	} else {
-		u32 *dst32 = (u32 *)dst;
-
-		dst32[idx] = (u32)value;
-	}
+	if (do_64bit)
+		write_to_buffer_64(dst, idx, value);
+	else
+		write_to_buffer_32(dst, idx, value);
 }
 
 static void
@@ -505,18 +510,23 @@  v3d_reset_performance_queries(struct v3d_cpu_job *job)
 }
 
 static void
-v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data, u32 query)
+v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data,
+				   unsigned int query)
 {
-	struct v3d_performance_query_info *performance_query = &job->performance_query;
-	struct v3d_copy_query_results_info *copy = &job->copy;
+	struct v3d_performance_query_info *performance_query =
+						&job->performance_query;
 	struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
 	struct v3d_dev *v3d = job->base.v3d;
-	struct v3d_perfmon *perfmon;
-	u64 counter_values[V3D_MAX_COUNTERS];
+	unsigned int i, j, offset;
 
-	for (int i = 0; i < performance_query->nperfmons; i++) {
-		perfmon = v3d_perfmon_find(v3d_priv,
-					   performance_query->queries[query].kperfmon_ids[i]);
+	for (i = 0, offset = 0;
+	     i < performance_query->nperfmons;
+	     i++, offset += DRM_V3D_MAX_PERF_COUNTERS) {
+		struct v3d_performance_query *q =
+				&performance_query->queries[query];
+		struct v3d_perfmon *perfmon;
+
+		perfmon = v3d_perfmon_find(v3d_priv, q->kperfmon_ids[i]);
 		if (!perfmon) {
 			DRM_DEBUG("Failed to find perfmon.");
 			continue;
@@ -524,14 +534,18 @@  v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data, u32 quer
 
 		v3d_perfmon_stop(v3d, perfmon, true);
 
-		memcpy(&counter_values[i * DRM_V3D_MAX_PERF_COUNTERS], perfmon->values,
-		       perfmon->ncounters * sizeof(u64));
+		if (job->copy.do_64bit) {
+			for (j = 0; j < perfmon->ncounters; j++)
+				write_to_buffer_64(data, offset + j,
+						   perfmon->values[j]);
+		} else {
+			for (j = 0; j < perfmon->ncounters; j++)
+				write_to_buffer_32(data, offset + j,
+						   perfmon->values[j]);
+		}
 
 		v3d_perfmon_put(perfmon);
 	}
-
-	for (int i = 0; i < performance_query->ncounters; i++)
-		write_to_buffer(data, i, copy->do_64bit, counter_values[i]);
 }
 
 static void