diff mbox series

[v2,7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find

Message ID 20250327084215.26662-8-tvrtko.ursulin@igalia.com (mailing list archive)
State New
Headers show
Series A few drm_syncobj optimisations | expand

Commit Message

Tvrtko Ursulin March 27, 2025, 8:42 a.m. UTC
Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
is relatively hot, but that 97% of the calls are for a single object (~3%
are for two points, and there are never more than three points). A more
trivial workload like vkmark under Plasma is even more skewed towards
single point lookups.

Therefore let's add a fast path which bypasses the kmalloc_array/kfree and
uses a pre-allocated stack array for those cases.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Reviewed-by: Maíra Canal <mcanal@igalia.com>
---
v2:
 * Added comments describing how the fast path arrays were sized.
 * Make container freeing criteria clearer by using a boolean.
---
 drivers/gpu/drm/drm_syncobj.c | 71 ++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 18 deletions(-)

Comments

Maíra Canal March 27, 2025, 1:45 p.m. UTC | #1
Hi Tvrtko,

On 27/03/25 05:42, Tvrtko Ursulin wrote:
> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
> is relatively hot, but the 97% of the calls are for a single object. (~3%
> for two points, and never more than three points. While a more trivial
> workload like vkmark under Plasma is even more skewed to single point
> lookups.)
> 
> Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a
> pre-allocated stack array for those cases.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Reviewed-by: Maíra Canal <mcanal@igalia.com>
> ---
> v2:
>   * Added comments describing how the fast path arrays were sized.
>   * Make container freeing criteria clearer by using a boolean.
> ---
>   drivers/gpu/drm/drm_syncobj.c | 71 ++++++++++++++++++++++++++---------
>   1 file changed, 53 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index b906d6acb4ef..d5b99bfea9a5 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -236,6 +236,14 @@ static void
>   syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
>   			   struct syncobj_eventfd_entry *entry);
>   
> +/*
> + * Empirically vast majority of ioctls pass in a single syncobj (96%) and never
> + * more than three points. Therefore implement a fast path with a small stack
> + * array to avoid going into the allocator sometimes several times per
> + * userspace rendered frame.
> + */
> +#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4
> +
>   /**
>    * drm_syncobj_find - lookup and reference a sync object.
>    * @file_private: drm file private pointer
> @@ -1035,12 +1043,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>   						  uint32_t *idx,
>   						  ktime_t *deadline)
>   {
> -	/*
> -	 * Empirically vast majority of calls here works with just a single
> -	 * point (96%) and never more than three points. Therefore a small stack
> -	 * array can cheaply avoid multiple per frame allocations.
> -	 */
> -	struct syncobj_wait_entry stack_entries[4];
> +	struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES];

Could you introduce this change in 6/7 to avoid changing the lines you
introduced earlier?

Best Regards,
- Maíra

>   	struct syncobj_wait_entry *entries;
>   	uint32_t signaled_count, i;
>   	struct dma_fence *fence;
> @@ -1228,6 +1231,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>   static int drm_syncobj_array_find(struct drm_file *file_private,
>   				  u32 __user *handles,
>   				  uint32_t count,
> +				  struct drm_syncobj **stack_syncobjs,
> +				  u32 stack_count,
>   				  struct drm_syncobj ***syncobjs_out)
>   {
>   	struct drm_syncobj **syncobjs;
> @@ -1237,9 +1242,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>   	if (!access_ok(handles, count * sizeof(*handles)))
>   		return -EFAULT;
>   
> -	syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> -	if (!syncobjs)
> -		return -ENOMEM;
> +	if (count > stack_count) {
> +		syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> +		if (!syncobjs)
> +			return -ENOMEM;
> +	} else {
> +		syncobjs = stack_syncobjs;
> +	}
>   
>   	for (i = 0; i < count; i++) {
>   		u32 handle;
> @@ -1261,25 +1270,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>   err_put_syncobjs:
>   	while (i-- > 0)
>   		drm_syncobj_put(syncobjs[i]);
> -	kfree(syncobjs);
> +
> +	if (syncobjs != stack_syncobjs)
> +		kfree(syncobjs);
>   
>   	return ret;
>   }
>   
>   static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
> -				   uint32_t count)
> +				   uint32_t count,
> +				   bool free_container)
>   {
>   	uint32_t i;
>   
>   	for (i = 0; i < count; i++)
>   		drm_syncobj_put(syncobjs[i]);
> -	kfree(syncobjs);
> +
> +	if (free_container)
> +		kfree(syncobjs);
>   }
>   
>   int
>   drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>   		       struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_wait *args = data;
>   	ktime_t deadline, *pdeadline = NULL;
>   	u32 count = args->count_handles;
> @@ -1305,6 +1320,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     count,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1323,7 +1340,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>   						 &first,
>   						 pdeadline);
>   
> -	drm_syncobj_array_free(syncobjs, count);
> +	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>   
>   	if (timeout < 0)
>   		return timeout;
> @@ -1337,6 +1354,7 @@ int
>   drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>   				struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_timeline_wait *args = data;
>   	ktime_t deadline, *pdeadline = NULL;
>   	u32 count = args->count_handles;
> @@ -1363,6 +1381,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     count,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1381,7 +1401,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>   						 &first,
>   						 pdeadline);
>   
> -	drm_syncobj_array_free(syncobjs, count);
> +	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>   
>   	if (timeout < 0)
>   		return timeout;
> @@ -1498,6 +1518,7 @@ int
>   drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>   			struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_array *args = data;
>   	struct drm_syncobj **syncobjs;
>   	uint32_t i;
> @@ -1515,6 +1536,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     args->count_handles,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1522,7 +1545,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>   	for (i = 0; i < args->count_handles; i++)
>   		drm_syncobj_replace_fence(syncobjs[i], NULL);
>   
> -	drm_syncobj_array_free(syncobjs, args->count_handles);
> +	drm_syncobj_array_free(syncobjs, args->count_handles,
> +			       syncobjs != stack_syncobjs);
>   
>   	return 0;
>   }
> @@ -1531,6 +1555,7 @@ int
>   drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>   			 struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_array *args = data;
>   	struct drm_syncobj **syncobjs;
>   	uint32_t i;
> @@ -1548,6 +1573,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     args->count_handles,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1558,7 +1585,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>   			break;
>   	}
>   
> -	drm_syncobj_array_free(syncobjs, args->count_handles);
> +	drm_syncobj_array_free(syncobjs, args->count_handles,
> +			       syncobjs != stack_syncobjs);
>   
>   	return ret;
>   }
> @@ -1567,6 +1595,7 @@ int
>   drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>   				  struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_timeline_array *args = data;
>   	uint64_t __user *points = u64_to_user_ptr(args->points);
>   	uint32_t i, j, count = args->count_handles;
> @@ -1589,6 +1618,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     count,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1625,7 +1656,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>   err_chains:
>   	kfree(chains);
>   out:
> -	drm_syncobj_array_free(syncobjs, count);
> +	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>   
>   	return ret;
>   }
> @@ -1633,6 +1664,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>   int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>   			    struct drm_file *file_private)
>   {
> +	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>   	struct drm_syncobj_timeline_array *args = data;
>   	struct drm_syncobj **syncobjs;
>   	uint64_t __user *points = u64_to_user_ptr(args->points);
> @@ -1654,6 +1686,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>   	ret = drm_syncobj_array_find(file_private,
>   				     u64_to_user_ptr(args->handles),
>   				     args->count_handles,
> +				     stack_syncobjs,
> +				     ARRAY_SIZE(stack_syncobjs),
>   				     &syncobjs);
>   	if (ret < 0)
>   		return ret;
> @@ -1697,7 +1731,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>   			break;
>   		}
>   	}
> -	drm_syncobj_array_free(syncobjs, args->count_handles);
> +	drm_syncobj_array_free(syncobjs, args->count_handles,
> +			       syncobjs != stack_syncobjs);
>   
>   	return ret;
>   }
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index b906d6acb4ef..d5b99bfea9a5 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -236,6 +236,14 @@  static void
 syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
 			   struct syncobj_eventfd_entry *entry);
 
+/*
+ * Empirically the vast majority of ioctls pass in a single syncobj (96%) and
+ * never more than three points. Therefore implement a fast path with a small
+ * stack array to avoid going into the allocator, sometimes several times per
+ * userspace rendered frame.
+ */
+#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4
+
 /**
  * drm_syncobj_find - lookup and reference a sync object.
  * @file_private: drm file private pointer
@@ -1035,12 +1043,7 @@  static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 						  uint32_t *idx,
 						  ktime_t *deadline)
 {
-	/*
-	 * Empirically vast majority of calls here works with just a single
-	 * point (96%) and never more than three points. Therefore a small stack
-	 * array can cheaply avoid multiple per frame allocations.
-	 */
-	struct syncobj_wait_entry stack_entries[4];
+	struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct syncobj_wait_entry *entries;
 	uint32_t signaled_count, i;
 	struct dma_fence *fence;
@@ -1228,6 +1231,8 @@  EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
 static int drm_syncobj_array_find(struct drm_file *file_private,
 				  u32 __user *handles,
 				  uint32_t count,
+				  struct drm_syncobj **stack_syncobjs,
+				  u32 stack_count,
 				  struct drm_syncobj ***syncobjs_out)
 {
 	struct drm_syncobj **syncobjs;
@@ -1237,9 +1242,13 @@  static int drm_syncobj_array_find(struct drm_file *file_private,
 	if (!access_ok(handles, count * sizeof(*handles)))
 		return -EFAULT;
 
-	syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
-	if (!syncobjs)
-		return -ENOMEM;
+	if (count > stack_count) {
+		syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
+		if (!syncobjs)
+			return -ENOMEM;
+	} else {
+		syncobjs = stack_syncobjs;
+	}
 
 	for (i = 0; i < count; i++) {
 		u32 handle;
@@ -1261,25 +1270,31 @@  static int drm_syncobj_array_find(struct drm_file *file_private,
 err_put_syncobjs:
 	while (i-- > 0)
 		drm_syncobj_put(syncobjs[i]);
-	kfree(syncobjs);
+
+	if (syncobjs != stack_syncobjs)
+		kfree(syncobjs);
 
 	return ret;
 }
 
 static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
-				   uint32_t count)
+				   uint32_t count,
+				   bool free_container)
 {
 	uint32_t i;
 
 	for (i = 0; i < count; i++)
 		drm_syncobj_put(syncobjs[i]);
-	kfree(syncobjs);
+
+	if (free_container)
+		kfree(syncobjs);
 }
 
 int
 drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_wait *args = data;
 	ktime_t deadline, *pdeadline = NULL;
 	u32 count = args->count_handles;
@@ -1305,6 +1320,8 @@  drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     count,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1323,7 +1340,7 @@  drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
 						 &first,
 						 pdeadline);
 
-	drm_syncobj_array_free(syncobjs, count);
+	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
 
 	if (timeout < 0)
 		return timeout;
@@ -1337,6 +1354,7 @@  int
 drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_timeline_wait *args = data;
 	ktime_t deadline, *pdeadline = NULL;
 	u32 count = args->count_handles;
@@ -1363,6 +1381,8 @@  drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     count,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1381,7 +1401,7 @@  drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
 						 &first,
 						 pdeadline);
 
-	drm_syncobj_array_free(syncobjs, count);
+	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
 
 	if (timeout < 0)
 		return timeout;
@@ -1498,6 +1518,7 @@  int
 drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_array *args = data;
 	struct drm_syncobj **syncobjs;
 	uint32_t i;
@@ -1515,6 +1536,8 @@  drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     args->count_handles,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1522,7 +1545,8 @@  drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
 	for (i = 0; i < args->count_handles; i++)
 		drm_syncobj_replace_fence(syncobjs[i], NULL);
 
-	drm_syncobj_array_free(syncobjs, args->count_handles);
+	drm_syncobj_array_free(syncobjs, args->count_handles,
+			       syncobjs != stack_syncobjs);
 
 	return 0;
 }
@@ -1531,6 +1555,7 @@  int
 drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_array *args = data;
 	struct drm_syncobj **syncobjs;
 	uint32_t i;
@@ -1548,6 +1573,8 @@  drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     args->count_handles,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1558,7 +1585,8 @@  drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
 			break;
 	}
 
-	drm_syncobj_array_free(syncobjs, args->count_handles);
+	drm_syncobj_array_free(syncobjs, args->count_handles,
+			       syncobjs != stack_syncobjs);
 
 	return ret;
 }
@@ -1567,6 +1595,7 @@  int
 drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_timeline_array *args = data;
 	uint64_t __user *points = u64_to_user_ptr(args->points);
 	uint32_t i, j, count = args->count_handles;
@@ -1589,6 +1618,8 @@  drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     count,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1625,7 +1656,7 @@  drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
 err_chains:
 	kfree(chains);
 out:
-	drm_syncobj_array_free(syncobjs, count);
+	drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
 
 	return ret;
 }
@@ -1633,6 +1664,7 @@  drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
 int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file_private)
 {
+	struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
 	struct drm_syncobj_timeline_array *args = data;
 	struct drm_syncobj **syncobjs;
 	uint64_t __user *points = u64_to_user_ptr(args->points);
@@ -1654,6 +1686,8 @@  int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
 	ret = drm_syncobj_array_find(file_private,
 				     u64_to_user_ptr(args->handles),
 				     args->count_handles,
+				     stack_syncobjs,
+				     ARRAY_SIZE(stack_syncobjs),
 				     &syncobjs);
 	if (ret < 0)
 		return ret;
@@ -1697,7 +1731,8 @@  int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
 			break;
 		}
 	}
-	drm_syncobj_array_free(syncobjs, args->count_handles);
+	drm_syncobj_array_free(syncobjs, args->count_handles,
+			       syncobjs != stack_syncobjs);
 
 	return ret;
 }