Message ID | 20250318155424.78552-8-tvrtko.ursulin@igalia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | A few drm_syncobj optimisations | expand |
Hi Tvrtko, Some nits inline, mostly personal comments. In any case, Reviewed-by: Maíra Canal <mcanal@igalia.com> On 18/03/25 12:54, Tvrtko Ursulin wrote: > Running the Cyberpunk 2077 benchmark we can observe that the lookup helper > is relatively hot, but the 97% of the calls are for a single object. (~3% > for two points, and never more than three points. While a more trivial > workload like vkmark under Plasma is even more skewed to single point > lookups.) > > Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a > pre-allocated stack array for those cases. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > --- > drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++-------- > 1 file changed, 41 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c > index 94932b89298f..233bdef53c87 100644 > --- a/drivers/gpu/drm/drm_syncobj.c > +++ b/drivers/gpu/drm/drm_syncobj.c > @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); > static int drm_syncobj_array_find(struct drm_file *file_private, > u32 __user *handles, > uint32_t count, > + struct drm_syncobj **stack_syncobjs, > + u32 stack_count, > struct drm_syncobj ***syncobjs_out) > { > struct drm_syncobj **syncobjs; > @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private, > if (!access_ok(handles, count * sizeof(*handles))) > return -EFAULT; > > - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); > - if (!syncobjs) > - return -ENOMEM; > + if (count > stack_count) { I believe it's worth adding a comment mentioning that using the stack syncobj is a fast-path that covers most cases. 
> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); > + if (!syncobjs) > + return -ENOMEM; > + } else { > + syncobjs = stack_syncobjs; > + } > > for (i = 0; i < count; i++) { > u64 handle; > @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private, > drm_syncobj_put(syncobjs[i]); > i--; > } > - kfree(syncobjs); > + > + if (syncobjs != stack_syncobjs) Again, I have a slight preference to make `syncobjs = NULL` and avoid this if condition. But it's just a personal preference. > + kfree(syncobjs); > > return ret; > } > > static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, > - uint32_t count) > + uint32_t count, > + struct drm_syncobj **stack_syncobjs) IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more intuitive. > { > uint32_t i; > > for (i = 0; i < count; i++) > drm_syncobj_put(syncobjs[i]); > - kfree(syncobjs); > + > + if (syncobjs != stack_syncobjs) > + kfree(syncobjs); > } > > int > drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > + struct drm_syncobj *stack_syncobjs[4]; > struct drm_syncobj_wait *args = data; > ktime_t deadline, *pdeadline = NULL; > u32 count = args->count_handles; > @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, > &first, > pdeadline); > > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); > > if (timeout < 0) > return timeout; > @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > struct drm_syncobj_timeline_wait *args = data; > + struct drm_syncobj *stack_syncobjs[4]; Zero initialize it? 
Best Regards, - Maíra > ktime_t deadline, *pdeadline = NULL; > u32 count = args->count_handles; > struct drm_syncobj **syncobjs; > @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, > &first, > pdeadline); > > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); > > if (timeout < 0) > return timeout; > @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > struct drm_syncobj_array *args = data; > + struct drm_syncobj *stack_syncobjs[4]; > struct drm_syncobj **syncobjs; > uint32_t i; > int ret; > @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, > for (i = 0; i < args->count_handles; i++) > drm_syncobj_replace_fence(syncobjs[i], NULL); > > - drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); > > return 0; > } > @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > struct drm_syncobj_array *args = data; > + struct drm_syncobj *stack_syncobjs[4]; > struct drm_syncobj **syncobjs; > uint32_t i; > int ret; > @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + 
stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, > break; > } > > - drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); > > return ret; > } > @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > struct drm_syncobj_timeline_array *args = data; > uint64_t __user *points = u64_to_user_ptr(args->points); > uint32_t i, j, count = args->count_handles; > + struct drm_syncobj *stack_syncobjs[4]; > struct drm_syncobj **syncobjs; > struct dma_fence_chain **chains; > int ret; > @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > count, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, > err_chains: > kfree(chains); > out: > - drm_syncobj_array_free(syncobjs, count); > + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); > > return ret; > } > @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_private) > { > struct drm_syncobj_timeline_array *args = data; > + struct drm_syncobj *stack_syncobjs[4]; > struct drm_syncobj **syncobjs; > uint64_t __user *points = u64_to_user_ptr(args->points); > uint32_t i; > @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > ret = drm_syncobj_array_find(file_private, > u64_to_user_ptr(args->handles), > args->count_handles, > + stack_syncobjs, > + ARRAY_SIZE(stack_syncobjs), > &syncobjs); > if (ret < 0) > return ret; > @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, > break; > } > } > - 
drm_syncobj_array_free(syncobjs, args->count_handles); > + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); > > return ret; > }
On 24/03/2025 23:06, Maíra Canal wrote: > Hi Tvrtko, > > Some nits inline, mostly personal comments. In any case, > > Reviewed-by: Maíra Canal <mcanal@igalia.com> > > > On 18/03/25 12:54, Tvrtko Ursulin wrote: >> Running the Cyberpunk 2077 benchmark we can observe that the lookup >> helper >> is relatively hot, but the 97% of the calls are for a single object. (~3% >> for two points, and never more than three points. While a more trivial >> workload like vkmark under Plasma is even more skewed to single point >> lookups.) >> >> Therefore lets add a fast path to bypass the kmalloc_array/kfree and >> use a >> pre-allocated stack array for those cases. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> >> --- >> drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++-------- >> 1 file changed, 41 insertions(+), 12 deletions(-) >> >> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ >> drm_syncobj.c >> index 94932b89298f..233bdef53c87 100644 >> --- a/drivers/gpu/drm/drm_syncobj.c >> +++ b/drivers/gpu/drm/drm_syncobj.c >> @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); >> static int drm_syncobj_array_find(struct drm_file *file_private, >> u32 __user *handles, >> uint32_t count, >> + struct drm_syncobj **stack_syncobjs, >> + u32 stack_count, >> struct drm_syncobj ***syncobjs_out) >> { >> struct drm_syncobj **syncobjs; >> @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct >> drm_file *file_private, >> if (!access_ok(handles, count * sizeof(*handles))) >> return -EFAULT; >> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >> - if (!syncobjs) >> - return -ENOMEM; >> + if (count > stack_count) { > > I believe it's worth adding a comment mentioning that using the stack > syncobj is a fast-path that covers most cases. Yep. But it didn't feel like here is the place so I added comments to where callers size the arrays. That however means there are two duplicated comments. Okay with you? 
>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >> + if (!syncobjs) >> + return -ENOMEM; >> + } else { >> + syncobjs = stack_syncobjs; >> + } >> for (i = 0; i < count; i++) { >> u64 handle; >> @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct >> drm_file *file_private, >> drm_syncobj_put(syncobjs[i]); >> i--; >> } >> - kfree(syncobjs); >> + >> + if (syncobjs != stack_syncobjs) > > Again, I have a slight preference to make `syncobjs = NULL` and avoid > this if condition. But it's just a personal preference. Pending clarifications from the other patch. > >> + kfree(syncobjs); >> return ret; >> } >> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, >> - uint32_t count) >> + uint32_t count, >> + struct drm_syncobj **stack_syncobjs) > > IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more > intuitive. But count is not directly related to the size of the stack array in this function. I could make it a boolean perhaps like this: static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, uint32_t count, bool free_array) And then in the callers: drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); Would that be clearer? 
> >> { >> uint32_t i; >> for (i = 0; i < count; i++) >> drm_syncobj_put(syncobjs[i]); >> - kfree(syncobjs); >> + >> + if (syncobjs != stack_syncobjs) >> + kfree(syncobjs); >> } >> int >> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, >> struct drm_file *file_private) >> { >> + struct drm_syncobj *stack_syncobjs[4]; >> struct drm_syncobj_wait *args = data; >> ktime_t deadline, *pdeadline = NULL; >> u32 count = args->count_handles; >> @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, >> void *data, >> ret = drm_syncobj_array_find(file_private, >> u64_to_user_ptr(args->handles), >> count, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, >> void *data, >> &first, >> pdeadline); >> - drm_syncobj_array_free(syncobjs, count); >> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >> if (timeout < 0) >> return timeout; >> @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct >> drm_device *dev, void *data, >> struct drm_file *file_private) >> { >> struct drm_syncobj_timeline_wait *args = data; >> + struct drm_syncobj *stack_syncobjs[4]; > > Zero initialize it? Do you see it is required? 
Regards, Tvrtko >> ktime_t deadline, *pdeadline = NULL; >> u32 count = args->count_handles; >> struct drm_syncobj **syncobjs; >> @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct >> drm_device *dev, void *data, >> ret = drm_syncobj_array_find(file_private, >> u64_to_user_ptr(args->handles), >> count, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct >> drm_device *dev, void *data, >> &first, >> pdeadline); >> - drm_syncobj_array_free(syncobjs, count); >> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >> if (timeout < 0) >> return timeout; >> @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >> void *data, >> struct drm_file *file_private) >> { >> struct drm_syncobj_array *args = data; >> + struct drm_syncobj *stack_syncobjs[4]; >> struct drm_syncobj **syncobjs; >> uint32_t i; >> int ret; >> @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >> void *data, >> ret = drm_syncobj_array_find(file_private, >> u64_to_user_ptr(args->handles), >> args->count_handles, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >> void *data, >> for (i = 0; i < args->count_handles; i++) >> drm_syncobj_replace_fence(syncobjs[i], NULL); >> - drm_syncobj_array_free(syncobjs, args->count_handles); >> + drm_syncobj_array_free(syncobjs, args->count_handles, >> stack_syncobjs); >> return 0; >> } >> @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, >> void *data, >> struct drm_file *file_private) >> { >> struct drm_syncobj_array *args = data; >> + struct drm_syncobj *stack_syncobjs[4]; >> struct drm_syncobj **syncobjs; >> uint32_t i; >> int ret; >> @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, >> void *data, >> ret = drm_syncobj_array_find(file_private, 
>> u64_to_user_ptr(args->handles), >> args->count_handles, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, >> void *data, >> break; >> } >> - drm_syncobj_array_free(syncobjs, args->count_handles); >> + drm_syncobj_array_free(syncobjs, args->count_handles, >> stack_syncobjs); >> return ret; >> } >> @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct >> drm_device *dev, void *data, >> struct drm_syncobj_timeline_array *args = data; >> uint64_t __user *points = u64_to_user_ptr(args->points); >> uint32_t i, j, count = args->count_handles; >> + struct drm_syncobj *stack_syncobjs[4]; >> struct drm_syncobj **syncobjs; >> struct dma_fence_chain **chains; >> int ret; >> @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct >> drm_device *dev, void *data, >> ret = drm_syncobj_array_find(file_private, >> u64_to_user_ptr(args->handles), >> count, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct >> drm_device *dev, void *data, >> err_chains: >> kfree(chains); >> out: >> - drm_syncobj_array_free(syncobjs, count); >> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >> return ret; >> } >> @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device >> *dev, void *data, >> struct drm_file *file_private) >> { >> struct drm_syncobj_timeline_array *args = data; >> + struct drm_syncobj *stack_syncobjs[4]; >> struct drm_syncobj **syncobjs; >> uint64_t __user *points = u64_to_user_ptr(args->points); >> uint32_t i; >> @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device >> *dev, void *data, >> ret = drm_syncobj_array_find(file_private, >> u64_to_user_ptr(args->handles), >> args->count_handles, >> + stack_syncobjs, >> + ARRAY_SIZE(stack_syncobjs), >> &syncobjs); >> if (ret < 0) >> return ret; >> @@ 
-1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device >> *dev, void *data, >> break; >> } >> } >> - drm_syncobj_array_free(syncobjs, args->count_handles); >> + drm_syncobj_array_free(syncobjs, args->count_handles, >> stack_syncobjs); >> return ret; >> } >
Hi Tvrtko, On 25/03/25 06:54, Tvrtko Ursulin wrote: > > On 24/03/2025 23:06, Maíra Canal wrote: >> Hi Tvrtko, >> >> Some nits inline, mostly personal comments. In any case, >> >> Reviewed-by: Maíra Canal <mcanal@igalia.com> >> >> >> On 18/03/25 12:54, Tvrtko Ursulin wrote: >>> Running the Cyberpunk 2077 benchmark we can observe that the lookup >>> helper >>> is relatively hot, but the 97% of the calls are for a single object. >>> (~3% >>> for two points, and never more than three points. While a more trivial >>> workload like vkmark under Plasma is even more skewed to single point >>> lookups.) >>> >>> Therefore lets add a fast path to bypass the kmalloc_array/kfree and >>> use a >>> pre-allocated stack array for those cases. >>> >>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> >>> --- >>> drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++-------- >>> 1 file changed, 41 insertions(+), 12 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ >>> drm_syncobj.c >>> index 94932b89298f..233bdef53c87 100644 >>> --- a/drivers/gpu/drm/drm_syncobj.c >>> +++ b/drivers/gpu/drm/drm_syncobj.c >>> @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); >>> static int drm_syncobj_array_find(struct drm_file *file_private, >>> u32 __user *handles, >>> uint32_t count, >>> + struct drm_syncobj **stack_syncobjs, >>> + u32 stack_count, >>> struct drm_syncobj ***syncobjs_out) >>> { >>> struct drm_syncobj **syncobjs; >>> @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct >>> drm_file *file_private, >>> if (!access_ok(handles, count * sizeof(*handles))) >>> return -EFAULT; >>> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >>> - if (!syncobjs) >>> - return -ENOMEM; >>> + if (count > stack_count) { >> >> I believe it's worth adding a comment mentioning that using the stack >> syncobj is a fast-path that covers most cases. > > Yep. 
But it didn't feel like here is the place so I added comments to > where callers size the arrays. That however means there are two > duplicated comments. Okay with you? Sure. > >>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >>> + if (!syncobjs) >>> + return -ENOMEM; >>> + } else { >>> + syncobjs = stack_syncobjs; >>> + } >>> for (i = 0; i < count; i++) { >>> u64 handle; >>> @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct >>> drm_file *file_private, >>> drm_syncobj_put(syncobjs[i]); >>> i--; >>> } >>> - kfree(syncobjs); >>> + >>> + if (syncobjs != stack_syncobjs) >> >> Again, I have a slight preference to make `syncobjs = NULL` and avoid >> this if condition. But it's just a personal preference. > > Pending clarifications from the other patch. Nvm, it wasn't a good idea. > >> >>> + kfree(syncobjs); >>> return ret; >>> } >>> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, >>> - uint32_t count) >>> + uint32_t count, >>> + struct drm_syncobj **stack_syncobjs) >> >> IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more >> intuitive. > > But count is not directly related to the size of the stack array in this > function. I could make it a boolean perhaps like this:> > static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, > uint32_t count, > bool free_array) > > And then in the callers: > > drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); > > Would that be clearer? Yeah, it does. 
> >> >>> { >>> uint32_t i; >>> for (i = 0; i < count; i++) >>> drm_syncobj_put(syncobjs[i]); >>> - kfree(syncobjs); >>> + >>> + if (syncobjs != stack_syncobjs) >>> + kfree(syncobjs); >>> } >>> int >>> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[4]; >>> struct drm_syncobj_wait *args = data; >>> ktime_t deadline, *pdeadline = NULL; >>> u32 count = args->count_handles; >>> @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, >>> void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, >>> void *data, >>> &first, >>> pdeadline); >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >>> if (timeout < 0) >>> return timeout; >>> @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct >>> drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> struct drm_syncobj_timeline_wait *args = data; >>> + struct drm_syncobj *stack_syncobjs[4]; >> >> Zero initialize it? > > Do you see it is required? Not required, I was just suggesting to double-check if it wasn't needed indeed. 
Best Regards, - Maíra > > Regards, > > Tvrtko > >>> ktime_t deadline, *pdeadline = NULL; >>> u32 count = args->count_handles; >>> struct drm_syncobj **syncobjs; >>> @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct >>> drm_device *dev, void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct >>> drm_device *dev, void *data, >>> &first, >>> pdeadline); >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >>> if (timeout < 0) >>> return timeout; >>> @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >>> void *data, >>> struct drm_file *file_private) >>> { >>> struct drm_syncobj_array *args = data; >>> + struct drm_syncobj *stack_syncobjs[4]; >>> struct drm_syncobj **syncobjs; >>> uint32_t i; >>> int ret; >>> @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >>> void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, >>> void *data, >>> for (i = 0; i < args->count_handles; i++) >>> drm_syncobj_replace_fence(syncobjs[i], NULL); >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> stack_syncobjs); >>> return 0; >>> } >>> @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device >>> *dev, void *data, >>> struct drm_file *file_private) >>> { >>> struct drm_syncobj_array *args = data; >>> + struct drm_syncobj *stack_syncobjs[4]; >>> struct drm_syncobj **syncobjs; >>> uint32_t i; >>> int ret; >>> @@ -1545,6 +1566,8 @@ 
drm_syncobj_signal_ioctl(struct drm_device >>> *dev, void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device >>> *dev, void *data, >>> break; >>> } >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> stack_syncobjs); >>> return ret; >>> } >>> @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct >>> drm_device *dev, void *data, >>> struct drm_syncobj_timeline_array *args = data; >>> uint64_t __user *points = u64_to_user_ptr(args->points); >>> uint32_t i, j, count = args->count_handles; >>> + struct drm_syncobj *stack_syncobjs[4]; >>> struct drm_syncobj **syncobjs; >>> struct dma_fence_chain **chains; >>> int ret; >>> @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct >>> drm_device *dev, void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct >>> drm_device *dev, void *data, >>> err_chains: >>> kfree(chains); >>> out: >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); >>> return ret; >>> } >>> @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device >>> *dev, void *data, >>> struct drm_file *file_private) >>> { >>> struct drm_syncobj_timeline_array *args = data; >>> + struct drm_syncobj *stack_syncobjs[4]; >>> struct drm_syncobj **syncobjs; >>> uint64_t __user *points = u64_to_user_ptr(args->points); >>> uint32_t i; >>> @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device >>> *dev, void *data, >>> ret = 
drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device >>> *dev, void *data, >>> break; >>> } >>> } >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> stack_syncobjs); >>> return ret; >>> } >> >
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 94932b89298f..233bdef53c87 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); static int drm_syncobj_array_find(struct drm_file *file_private, u32 __user *handles, uint32_t count, + struct drm_syncobj **stack_syncobjs, + u32 stack_count, struct drm_syncobj ***syncobjs_out) { struct drm_syncobj **syncobjs; @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private, if (!access_ok(handles, count * sizeof(*handles))) return -EFAULT; - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); - if (!syncobjs) - return -ENOMEM; + if (count > stack_count) { + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); + if (!syncobjs) + return -ENOMEM; + } else { + syncobjs = stack_syncobjs; + } for (i = 0; i < count; i++) { u64 handle; @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private, drm_syncobj_put(syncobjs[i]); i--; } - kfree(syncobjs); + + if (syncobjs != stack_syncobjs) + kfree(syncobjs); return ret; } static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, - uint32_t count) + uint32_t count, + struct drm_syncobj **stack_syncobjs) { uint32_t i; for (i = 0; i < count; i++) drm_syncobj_put(syncobjs[i]); - kfree(syncobjs); + + if (syncobjs != stack_syncobjs) + kfree(syncobjs); } int drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { + struct drm_syncobj *stack_syncobjs[4]; struct drm_syncobj_wait *args = data; ktime_t deadline, *pdeadline = NULL; u32 count = args->count_handles; @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1321,7 +1335,7 @@ 
drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, &first, pdeadline); - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); if (timeout < 0) return timeout; @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_timeline_wait *args = data; + struct drm_syncobj *stack_syncobjs[4]; ktime_t deadline, *pdeadline = NULL; u32 count = args->count_handles; struct drm_syncobj **syncobjs; @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, &first, pdeadline); - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); if (timeout < 0) return timeout; @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_array *args = data; + struct drm_syncobj *stack_syncobjs[4]; struct drm_syncobj **syncobjs; uint32_t i; int ret; @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, for (i = 0; i < args->count_handles; i++) drm_syncobj_replace_fence(syncobjs[i], NULL); - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); return 0; } @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_array *args = 
data; + struct drm_syncobj *stack_syncobjs[4]; struct drm_syncobj **syncobjs; uint32_t i; int ret; @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, break; } - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); return ret; } @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, struct drm_syncobj_timeline_array *args = data; uint64_t __user *points = u64_to_user_ptr(args->points); uint32_t i, j, count = args->count_handles; + struct drm_syncobj *stack_syncobjs[4]; struct drm_syncobj **syncobjs; struct dma_fence_chain **chains; int ret; @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), count, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), &syncobjs); if (ret < 0) return ret; @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, err_chains: kfree(chains); out: - drm_syncobj_array_free(syncobjs, count); + drm_syncobj_array_free(syncobjs, count, stack_syncobjs); return ret; } @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_timeline_array *args = data; + struct drm_syncobj *stack_syncobjs[4]; struct drm_syncobj **syncobjs; uint64_t __user *points = u64_to_user_ptr(args->points); uint32_t i; @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, + stack_syncobjs, + ARRAY_SIZE(stack_syncobjs), 
&syncobjs); if (ret < 0) return ret; @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, break; } } - drm_syncobj_array_free(syncobjs, args->count_handles); + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs); return ret; }
Running the Cyberpunk 2077 benchmark we can observe that the lookup helper is relatively hot, but 97% of the calls are for a single object (~3% are for two points, and never more than three points), while a more trivial workload like vkmark under Plasma is even more skewed to single-point lookups. Therefore let's add a fast path to bypass the kmalloc_array/kfree and use a pre-allocated stack array for those cases. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> --- drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 12 deletions(-)