Message ID | 20250327084215.26662-8-tvrtko.ursulin@igalia.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | A few drm_syncobj optimisations | expand |
Hi Tvrtko, On 27/03/25 05:42, Tvrtko Ursulin wrote: > Running the Cyberpunk 2077 benchmark we can observe that the lookup helper > is relatively hot, but 97% of the calls are for a single object. (~3% > for two points, and never more than three points. While a more trivial > workload like vkmark under Plasma is even more skewed to single point > lookups.) > > Therefore let's add a fast path to bypass the kmalloc_array/kfree and use a > pre-allocated stack array for those cases. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > Reviewed-by: Maíra Canal <mcanal@igalia.com> > --- > v2: > * Added comments describing how the fast path arrays were sized. > * Make container freeing criteria clearer by using a boolean. > --- > drivers/gpu/drm/drm_syncobj.c | 71 ++++++++++++++++++++++++++--------- > 1 file changed, 53 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c > index b906d6acb4ef..d5b99bfea9a5 100644 > --- a/drivers/gpu/drm/drm_syncobj.c > +++ b/drivers/gpu/drm/drm_syncobj.c > @@ -236,6 +236,14 @@ static void > syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, > struct syncobj_eventfd_entry *entry); > > +/* > + * Empirically vast majority of ioctls pass in a single syncobj (96%) and never > + * more than three points. Therefore implement a fast path with a small stack > + * array to avoid going into the allocator sometimes several times per > + * userspace rendered frame. > + */ > +#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4 > + > /** > * drm_syncobj_find - lookup and reference a sync object. > * @file_private: drm file private pointer > @@ -1035,12 +1043,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, > uint32_t *idx, > ktime_t *deadline) > { > - /* > - * Empirically vast majority of calls here works with just a single > - * point (96%) and never more than three points. 
Therefore a small stack > - * array can cheaply avoid multiple per frame allocations. > - */ > - struct syncobj_wait_entry stack_entries[4]; > + struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; Could you introduce this change in 6/7 to avoid changing the lines you introduced earlier? Best Regards, - Maíra > struct syncobj_wait_entry *entries; > uint32_t signaled_count, i; > struct dma_fence *fence; > @@ -1228,6 +1231,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); > static int drm_syncobj_array_find(struct drm_file *file_private, > u32 __user *handles, > uint32_t count, > + struct drm_syncobj **stack_syncobjs, > + u32 stack_count, > struct drm_syncobj ***syncobjs_out) > { > struct drm_syncobj **syncobjs; > @@ -1237,9 +1242,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private, > if (!access_ok(handles, count * sizeof(*handles))) > return -EFAULT; > > - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); > - if (!syncobjs) > - return -ENOMEM; > + if (count > stack_count) { > + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); > + if (!syncobjs) > + return -ENOMEM; > + } else { > + syncobjs = stack_syncobjs; > + } > > for (i = 0; i < count; i++) { > u32 handle; > @@ -1261,25 +1270,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private, > err_put_syncobjs: > while (i-- > 0) > drm_syncobj_put(syncobjs[i]); > - kfree(syncobjs); > + > + if (syncobjs != stack_syncobjs) > + kfree(syncobjs); > > return ret; > } > > static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, > - uint32_t count) > + uint32_t count, > + bool free_container) > { > uint32_t i; > > for (i = 0; i < count; i++) > drm_syncobj_put(syncobjs[i]); > - kfree(syncobjs); > + > + if (free_container) > + kfree(syncobjs); > } > > int > drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_wait 
*args = data; > ktime_t deadline, *pdeadline = NULL; > u32 count = args->count_handles; > @@ -1305,6 +1320,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1323,7 +1340,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > &first, > pdeadline); > > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); > > if (timeout < 0) > return timeout; > @@ -1337,6 +1354,7 @@ int > drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_timeline_wait *args = data; > ktime_t deadline, *pdeadline = NULL; > u32 count = args->count_handles; > @@ -1363,6 +1381,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1381,7 +1401,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > &first, > pdeadline); > > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); > > if (timeout < 0) > return timeout; > @@ -1498,6 +1518,7 @@ int > drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_array *args = data; > struct drm_syncobj **syncobjs; > uint32_t i; > @@ -1515,6 +1536,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + stack_syncobjs, > + 
ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1522,7 +1545,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > for (i = 0; i < args->count_handles; i++) > drm_syncobj_replace_fence(syncobjs[i], NULL); > > - drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, > + syncobjs != stack_syncobjs); > > return 0; > } > @@ -1531,6 +1555,7 @@ int > drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_array *args = data; > struct drm_syncobj **syncobjs; > uint32_t i; > @@ -1548,6 +1573,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1558,7 +1585,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > break; > } > > - drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, > + syncobjs != stack_syncobjs); > > return ret; > } > @@ -1567,6 +1595,7 @@ int > drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_timeline_array *args = data; > uint64_t __user *points = u64_to_user_ptr(args->points); > uint32_t i, j, count = args->count_handles; > @@ -1589,6 +1618,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1625,7 +1656,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > 
err_chains: > kfree(chains); > out: > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); > > return ret; > } > @@ -1633,6 +1664,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; > struct drm_syncobj_timeline_array *args = data; > struct drm_syncobj **syncobjs; > uint64_t __user *points = u64_to_user_ptr(args->points); > @@ -1654,6 +1686,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1697,7 +1731,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > break; > } > } > - drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, > + syncobjs != stack_syncobjs); > > return ret; > }
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index b906d6acb4ef..d5b99bfea9a5 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -236,6 +236,14 @@ static void syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, struct syncobj_eventfd_entry *entry); +/* + * Empirically vast majority of ioctls pass in a single syncobj (96%) and never + * more than three points. Therefore implement a fast path with a small stack + * array to avoid going into the allocator sometimes several times per + * userspace rendered frame. + */ +#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4 + /** * drm_syncobj_find - lookup and reference a sync object. * @file_private: drm file private pointer @@ -1035,12 +1043,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, uint32_t *idx, ktime_t *deadline) { - /* - * Empirically vast majority of calls here works with just a single - * point (96%) and never more than three points. Therefore a small stack - * array can cheaply avoid multiple per frame allocations. 
- */ - struct syncobj_wait_entry stack_entries[4]; + struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct syncobj_wait_entry *entries; uint32_t signaled_count, i; struct dma_fence *fence; @@ -1228,6 +1231,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); static int drm_syncobj_array_find(struct drm_file *file_private, u32 __user *handles, uint32_t count, + struct drm_syncobj **stack_syncobjs, + u32 stack_count, struct drm_syncobj ***syncobjs_out) { struct drm_syncobj **syncobjs; @@ -1237,9 +1242,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private, if (!access_ok(handles, count * sizeof(*handles))) return -EFAULT; - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); - if (!syncobjs) - return -ENOMEM; + if (count > stack_count) { + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); + if (!syncobjs) + return -ENOMEM; + } else { + syncobjs = stack_syncobjs; + } for (i = 0; i < count; i++) { u32 handle; @@ -1261,25 +1270,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private, err_put_syncobjs: while (i-- > 0) drm_syncobj_put(syncobjs[i]); - kfree(syncobjs); + + if (syncobjs != stack_syncobjs) + kfree(syncobjs); return ret; } static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, - uint32_t count) + uint32_t count, + bool free_container) { uint32_t i; for (i = 0; i < count; i++) drm_syncobj_put(syncobjs[i]); - kfree(syncobjs); + + if (free_container) + kfree(syncobjs); } int drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_wait *args = data; ktime_t deadline, *pdeadline = NULL; u32 count = args->count_handles; @@ -1305,6 +1320,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return 
ret; @@ -1323,7 +1340,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, &first, pdeadline); - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); if (timeout < 0) return timeout; @@ -1337,6 +1354,7 @@ int drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_timeline_wait *args = data; ktime_t deadline, *pdeadline = NULL; u32 count = args->count_handles; @@ -1363,6 +1381,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1381,7 +1401,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, &first, pdeadline); - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); if (timeout < 0) return timeout; @@ -1498,6 +1518,7 @@ int drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_array *args = data; struct drm_syncobj **syncobjs; uint32_t i; @@ -1515,6 +1536,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1522,7 +1545,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, for (i = 0; i < args->count_handles; i++) drm_syncobj_replace_fence(syncobjs[i], NULL); - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, + syncobjs != stack_syncobjs); return 0; } @@ -1531,6 +1555,7 @@ int drm_syncobj_signal_ioctl(struct 
drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_array *args = data; struct drm_syncobj **syncobjs; uint32_t i; @@ -1548,6 +1573,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1558,7 +1585,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, break; } - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, + syncobjs != stack_syncobjs); return ret; } @@ -1567,6 +1595,7 @@ int drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_timeline_array *args = data; uint64_t __user *points = u64_to_user_ptr(args->points); uint32_t i, j, count = args->count_handles; @@ -1589,6 +1618,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1625,7 +1656,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, err_chains: kfree(chains); out: - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); return ret; } @@ -1633,6 +1664,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; struct drm_syncobj_timeline_array *args = data; struct drm_syncobj **syncobjs; uint64_t __user *points = u64_to_user_ptr(args->points); @@ -1654,6 +1686,8 
@@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1697,7 +1731,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, break; } } - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, + syncobjs != stack_syncobjs); return ret; }