Message ID | 20250318155424.78552-7-tvrtko.ursulin@igalia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | A few drm_syncobj optimisations | expand |
Hi Tvrtko, On 18/03/25 12:54, Tvrtko Ursulin wrote: > Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM > sycobjs is relatively hot, but the 96% of the calls are for a single > object. (~4% for two points, and never more than three points. While > a more trivial workload like vkmark under Plasma is even more skewed > to single point waits.) > > Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre- > allocated stack array for those cases. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > --- > drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++---- > 1 file changed, 12 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c > index b4563c696056..94932b89298f 100644 > --- a/drivers/gpu/drm/drm_syncobj.c > +++ b/drivers/gpu/drm/drm_syncobj.c > @@ -1035,6 +1035,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, > uint32_t *idx, > ktime_t *deadline) > { > + struct syncobj_wait_entry stack_entries[4]; Can't you initialize as struct syncobj_wait_entry stack_entries[4] = {0}; ? > struct syncobj_wait_entry *entries; > uint32_t signaled_count, i; > struct dma_fence *fence; > @@ -1049,9 +1050,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, > !access_ok(user_points, count * sizeof(*user_points))) > return -EFAULT; > > - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); > - if (!entries) > - return -ENOMEM; > + if (count > ARRAY_SIZE(stack_entries)) { > + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); > + if (!entries) > + return -ENOMEM; > + } else { > + memset(stack_entries, 0, sizeof(stack_entries)); Then, you can avoid this step. > + entries = stack_entries; > + } > > /* Walk the list of sync objects and initialize entries. We do > * this up-front so that we can properly return -EINVAL if there is > @@ -1174,7 +1180,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, > &entries[i].fence_cb); > dma_fence_put(entries[i].fence); > } > - kfree(entries); > + > + if (entries != stack_entries) You can initialize `entries = NULL` and avoid this step. Anyway, Reviewed-by: Maíra Canal <mcanal@igalia.com> Best Regards, - Maíra > + kfree(entries); > > return timeout; > }
On 24/03/2025 23:00, Maíra Canal wrote: > Hi Tvrtko, > > On 18/03/25 12:54, Tvrtko Ursulin wrote: >> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM >> sycobjs is relatively hot, but the 96% of the calls are for a single >> object. (~4% for two points, and never more than three points. While >> a more trivial workload like vkmark under Plasma is even more skewed >> to single point waits.) >> >> Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre- >> allocated stack array for those cases. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> >> --- >> drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++---- >> 1 file changed, 12 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ >> drm_syncobj.c >> index b4563c696056..94932b89298f 100644 >> --- a/drivers/gpu/drm/drm_syncobj.c >> +++ b/drivers/gpu/drm/drm_syncobj.c >> @@ -1035,6 +1035,7 @@ static signed long >> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >> uint32_t *idx, >> ktime_t *deadline) >> { >> + struct syncobj_wait_entry stack_entries[4]; > > Can't you initialize as > > struct syncobj_wait_entry stack_entries[4] = {0}; > > ? Could do but I preferred to only do it when it is used. > >> struct syncobj_wait_entry *entries; >> uint32_t signaled_count, i; >> struct dma_fence *fence; >> @@ -1049,9 +1050,14 @@ static signed long >> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >> !access_ok(user_points, count * sizeof(*user_points))) >> return -EFAULT; >> - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); >> - if (!entries) >> - return -ENOMEM; >> + if (count > ARRAY_SIZE(stack_entries)) { >> + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); >> + if (!entries) >> + return -ENOMEM; >> + } else { >> + memset(stack_entries, 0, sizeof(stack_entries)); > > Then, you can avoid this step. > >> + entries = stack_entries; >> + } >> /* Walk the list of sync objects and initialize entries. We do >> * this up-front so that we can properly return -EINVAL if there is >> @@ -1174,7 +1180,9 @@ static signed long >> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >> &entries[i].fence_cb); >> dma_fence_put(entries[i].fence); >> } >> - kfree(entries); >> + >> + if (entries != stack_entries) > > You can initialize `entries = NULL` and avoid this step. Hmm where? You mean like: if (entries == stack_entries) entries = NULL; kfree(entries); Or something different? Regards, Tvrtko > > Anyway, > > Reviewed-by: Maíra Canal <mcanal@igalia.com> > > Best Regards, > - Maíra > >> + kfree(entries); >> return timeout; >> } >
Hi Tvrtko, On 25/03/25 06:33, Tvrtko Ursulin wrote: > > On 24/03/2025 23:00, Maíra Canal wrote: >> Hi Tvrtko, >> >> On 18/03/25 12:54, Tvrtko Ursulin wrote: >>> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM >>> sycobjs is relatively hot, but the 96% of the calls are for a single >>> object. (~4% for two points, and never more than three points. While >>> a more trivial workload like vkmark under Plasma is even more skewed >>> to single point waits.) >>> >>> Therefore lets add a fast path to bypass the kcalloc/kfree and use a >>> pre- >>> allocated stack array for those cases. >>> >>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> >>> --- >>> drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++---- >>> 1 file changed, 12 insertions(+), 4 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/ >>> drm_syncobj.c >>> index b4563c696056..94932b89298f 100644 >>> --- a/drivers/gpu/drm/drm_syncobj.c >>> +++ b/drivers/gpu/drm/drm_syncobj.c >>> @@ -1035,6 +1035,7 @@ static signed long >>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >>> uint32_t *idx, >>> ktime_t *deadline) >>> { >>> + struct syncobj_wait_entry stack_entries[4]; >> >> Can't you initialize as >> >> struct syncobj_wait_entry stack_entries[4] = {0}; >> >> ? > > Could do but I preferred to only do it when it is used. Np. > >> >>> struct syncobj_wait_entry *entries; >>> uint32_t signaled_count, i;>>> struct dma_fence *fence; >>> @@ -1049,9 +1050,14 @@ static signed long >>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >>> !access_ok(user_points, count * sizeof(*user_points))) >>> return -EFAULT; >>> - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); >>> - if (!entries) >>> - return -ENOMEM; >>> + if (count > ARRAY_SIZE(stack_entries)) { >>> + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); >>> + if (!entries) >>> + return -ENOMEM; >>> + } else { >>> + memset(stack_entries, 0, sizeof(stack_entries)); >> >> Then, you can avoid this step. >> >>> + entries = stack_entries; >>> + } >>> /* Walk the list of sync objects and initialize entries. We do >>> * this up-front so that we can properly return -EINVAL if >>> there is >>> @@ -1174,7 +1180,9 @@ static signed long >>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, >>> &entries[i].fence_cb); >>> dma_fence_put(entries[i].fence); >>> } >>> - kfree(entries); >>> + >>> + if (entries != stack_entries) >> >> You can initialize `entries = NULL` and avoid this step. > > Hmm where? You mean like: > > if (entries == stack_entries) > entries = NULL; > kfree(entries); > > Or something different? Nvm, what I had thought initially was wrong. Sorry for the noise! Again, feel free to you add my R-b. Best Regards, - Maíra > > Regards, > > Tvrtko > >> >> Anyway, >> >> Reviewed-by: Maíra Canal <mcanal@igalia.com> >> >> Best Regards, >> - Maíra >> >>> + kfree(entries); >>> return timeout; >>> } >> >
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index b4563c696056..94932b89298f 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -1035,6 +1035,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, uint32_t *idx, ktime_t *deadline) { + struct syncobj_wait_entry stack_entries[4]; struct syncobj_wait_entry *entries; uint32_t signaled_count, i; struct dma_fence *fence; @@ -1049,9 +1050,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, !access_ok(user_points, count * sizeof(*user_points))) return -EFAULT; - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); - if (!entries) - return -ENOMEM; + if (count > ARRAY_SIZE(stack_entries)) { + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + } else { + memset(stack_entries, 0, sizeof(stack_entries)); + entries = stack_entries; + } /* Walk the list of sync objects and initialize entries. We do * this up-front so that we can properly return -EINVAL if there is @@ -1174,7 +1180,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, &entries[i].fence_cb); dma_fence_put(entries[i].fence); } - kfree(entries); + + if (entries != stack_entries) + kfree(entries); return timeout; }
Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM sycobjs is relatively hot, but the 96% of the calls are for a single object. (~4% for two points, and never more than three points. While a more trivial workload like vkmark under Plasma is even more skewed to single point waits.) Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre- allocated stack array for those cases. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> --- drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-)