diff mbox series

[6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout

Message ID 20250318155424.78552-7-tvrtko.ursulin@igalia.com (mailing list archive)
State New, archived
Headers show
Series A few drm_syncobj optimisations | expand

Commit Message

Tvrtko Ursulin March 18, 2025, 3:54 p.m. UTC
Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
sycobjs is relatively hot, but the 96% of the calls are for a single
object. (~4% for two points, and never more than three points. While
a more trivial workload like vkmark under Plasma is even more skewed
to single point waits.)

Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
allocated stack array for those cases.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
 drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

Comments

Maíra Canal March 24, 2025, 11 p.m. UTC | #1
Hi Tvrtko,

On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
> sycobjs is relatively hot, but the 96% of the calls are for a single
> object. (~4% for two points, and never more than three points. While
> a more trivial workload like vkmark under Plasma is even more skewed
> to single point waits.)
> 
> Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
> allocated stack array for those cases.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
>   drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
>   1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index b4563c696056..94932b89298f 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1035,6 +1035,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>   						  uint32_t *idx,
>   						  ktime_t *deadline)
>   {
> +	struct syncobj_wait_entry stack_entries[4];

Can't you initialize as

struct syncobj_wait_entry stack_entries[4] = {0};

?

>   	struct syncobj_wait_entry *entries;
>   	uint32_t signaled_count, i;
>   	struct dma_fence *fence;
> @@ -1049,9 +1050,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>   	    !access_ok(user_points, count * sizeof(*user_points)))
>   		return -EFAULT;
>   
> -	entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
> -	if (!entries)
> -		return -ENOMEM;
> +	if (count > ARRAY_SIZE(stack_entries)) {
> +		entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
> +		if (!entries)
> +			return -ENOMEM;
> +	} else {
> +		memset(stack_entries, 0, sizeof(stack_entries));

Then, you can avoid this step.

> +		entries = stack_entries;
> +	}
>   
>   	/* Walk the list of sync objects and initialize entries.  We do
>   	 * this up-front so that we can properly return -EINVAL if there is
> @@ -1174,7 +1180,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>   						  &entries[i].fence_cb);
>   		dma_fence_put(entries[i].fence);
>   	}
> -	kfree(entries);
> +
> +	if (entries != stack_entries)

You can initialize `entries = NULL` and avoid this step.

Anyway,

Reviewed-by: Maíra Canal <mcanal@igalia.com>

Best Regards,
- Maíra

> +		kfree(entries);
>   
>   	return timeout;
>   }
Tvrtko Ursulin March 25, 2025, 9:33 a.m. UTC | #2
On 24/03/2025 23:00, Maíra Canal wrote:
> Hi Tvrtko,
> 
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
>> sycobjs is relatively hot, but the 96% of the calls are for a single
>> object. (~4% for two points, and never more than three points. While
>> a more trivial workload like vkmark under Plasma is even more skewed
>> to single point waits.)
>>
>> Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
>> allocated stack array for those cases.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>>   drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
>>   1 file changed, 12 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ 
>> drm_syncobj.c
>> index b4563c696056..94932b89298f 100644
>> --- a/drivers/gpu/drm/drm_syncobj.c
>> +++ b/drivers/gpu/drm/drm_syncobj.c
>> @@ -1035,6 +1035,7 @@ static signed long 
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>                             uint32_t *idx,
>>                             ktime_t *deadline)
>>   {
>> +    struct syncobj_wait_entry stack_entries[4];
> 
> Can't you initialize as
> 
> struct syncobj_wait_entry stack_entries[4] = {0};
> 
> ?

Could do but I preferred to only do it when it is used.

> 
>>       struct syncobj_wait_entry *entries;
>>       uint32_t signaled_count, i;
>>       struct dma_fence *fence;
>> @@ -1049,9 +1050,14 @@ static signed long 
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>           !access_ok(user_points, count * sizeof(*user_points)))
>>           return -EFAULT;
>> -    entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>> -    if (!entries)
>> -        return -ENOMEM;
>> +    if (count > ARRAY_SIZE(stack_entries)) {
>> +        entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>> +        if (!entries)
>> +            return -ENOMEM;
>> +    } else {
>> +        memset(stack_entries, 0, sizeof(stack_entries));
> 
> Then, you can avoid this step.
> 
>> +        entries = stack_entries;
>> +    }
>>       /* Walk the list of sync objects and initialize entries.  We do
>>        * this up-front so that we can properly return -EINVAL if there is
>> @@ -1174,7 +1180,9 @@ static signed long 
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>                             &entries[i].fence_cb);
>>           dma_fence_put(entries[i].fence);
>>       }
>> -    kfree(entries);
>> +
>> +    if (entries != stack_entries)
> 
> You can initialize `entries = NULL` and avoid this step.

Hmm where? You mean like:

if (entries == stack_entries)
	entries = NULL;
kfree(entries);

Or something different?

Regards,

Tvrtko

> 
> Anyway,
> 
> Reviewed-by: Maíra Canal <mcanal@igalia.com>
> 
> Best Regards,
> - Maíra
> 
>> +        kfree(entries);
>>       return timeout;
>>   }
>
Maíra Canal March 25, 2025, 7:56 p.m. UTC | #3
Hi Tvrtko,

On 25/03/25 06:33, Tvrtko Ursulin wrote:
> 
> On 24/03/2025 23:00, Maíra Canal wrote:
>> Hi Tvrtko,
>>
>> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>>> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
>>> sycobjs is relatively hot, but the 96% of the calls are for a single
>>> object. (~4% for two points, and never more than three points. While
>>> a more trivial workload like vkmark under Plasma is even more skewed
>>> to single point waits.)
>>>
>>> Therefore lets add a fast path to bypass the kcalloc/kfree and use a 
>>> pre-
>>> allocated stack array for those cases.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>> ---
>>>   drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
>>>   1 file changed, 12 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ 
>>> drm_syncobj.c
>>> index b4563c696056..94932b89298f 100644
>>> --- a/drivers/gpu/drm/drm_syncobj.c
>>> +++ b/drivers/gpu/drm/drm_syncobj.c
>>> @@ -1035,6 +1035,7 @@ static signed long 
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>>                             uint32_t *idx,
>>>                             ktime_t *deadline)
>>>   {
>>> +    struct syncobj_wait_entry stack_entries[4];
>>
>> Can't you initialize as
>>
>> struct syncobj_wait_entry stack_entries[4] = {0};
>>
>> ?
> 
> Could do but I preferred to only do it when it is used.

Np.

> 
>>
>>>       struct syncobj_wait_entry *entries;
 >>>       uint32_t signaled_count, i;>>>       struct dma_fence *fence;
>>> @@ -1049,9 +1050,14 @@ static signed long 
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>>           !access_ok(user_points, count * sizeof(*user_points)))
>>>           return -EFAULT;
>>> -    entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>>> -    if (!entries)
>>> -        return -ENOMEM;
>>> +    if (count > ARRAY_SIZE(stack_entries)) {
>>> +        entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>>> +        if (!entries)
>>> +            return -ENOMEM;
>>> +    } else {
>>> +        memset(stack_entries, 0, sizeof(stack_entries));
>>
>> Then, you can avoid this step.
>>
>>> +        entries = stack_entries;
>>> +    }
>>>       /* Walk the list of sync objects and initialize entries.  We do
>>>        * this up-front so that we can properly return -EINVAL if 
>>> there is
>>> @@ -1174,7 +1180,9 @@ static signed long 
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>>                             &entries[i].fence_cb);
>>>           dma_fence_put(entries[i].fence);
>>>       }
>>> -    kfree(entries);
>>> +
>>> +    if (entries != stack_entries)
>>
>> You can initialize `entries = NULL` and avoid this step.
> 
> Hmm where? You mean like:
> 
> if (entries == stack_entries)
>      entries = NULL;
> kfree(entries);
> 
> Or something different?

Nvm, what I had thought initially was wrong. Sorry for the noise! Again,
feel free to you add my R-b.

Best Regards,
- Maíra

> 
> Regards,
> 
> Tvrtko
> 
>>
>> Anyway,
>>
>> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>>
>> Best Regards,
>> - Maíra
>>
>>> +        kfree(entries);
>>>       return timeout;
>>>   }
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index b4563c696056..94932b89298f 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1035,6 +1035,7 @@  static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 						  uint32_t *idx,
 						  ktime_t *deadline)
 {
+	struct syncobj_wait_entry stack_entries[4];
 	struct syncobj_wait_entry *entries;
 	uint32_t signaled_count, i;
 	struct dma_fence *fence;
@@ -1049,9 +1050,14 @@  static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 	    !access_ok(user_points, count * sizeof(*user_points)))
 		return -EFAULT;
 
-	entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
-	if (!entries)
-		return -ENOMEM;
+	if (count > ARRAY_SIZE(stack_entries)) {
+		entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
+		if (!entries)
+			return -ENOMEM;
+	} else {
+		memset(stack_entries, 0, sizeof(stack_entries));
+		entries = stack_entries;
+	}
 
 	/* Walk the list of sync objects and initialize entries.  We do
 	 * this up-front so that we can properly return -EINVAL if there is
@@ -1174,7 +1180,9 @@  static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 						  &entries[i].fence_cb);
 		dma_fence_put(entries[i].fence);
 	}
-	kfree(entries);
+
+	if (entries != stack_entries)
+		kfree(entries);
 
 	return timeout;
 }