diff mbox series

dma-buf: Eliminate all duplicate fences in dma_fence_unwrap_merge

Message ID 20241018054608.6478-1-friedrich.vock@gmx.de (mailing list archive)
State New
Headers show
Series dma-buf: Eliminate all duplicate fences in dma_fence_unwrap_merge | expand

Commit Message

Friedrich Vock Oct. 18, 2024, 5:46 a.m. UTC
When dma_fence_unwrap_merge is called on fence chains where the fences
aren't ordered by context, the merging logic breaks down and we end up
inserting fences twice. Doing this repeatedly leads to the number of
fences going up exponentially, and in some gaming workloads we'll end up
running out of memory to store the resulting array altogether, leading
to a warning such as:

vkd3d_queue: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0
CPU: 2 PID: 5287 Comm: vkd3d_queue Tainted: G S                 6.10.7-200.fsync.fc40.x86_64 #1
Hardware name: Dell Inc. G5 5505/0NCW8W, BIOS 1.11.0 03/22/2022
Call Trace:
 <TASK>
 dump_stack_lvl+0x5d/0x80
 warn_alloc+0x164/0x190
 ? srso_return_thunk+0x5/0x5f
 ? __alloc_pages_direct_compact+0x1d9/0x220
 __alloc_pages_slowpath.constprop.2+0xd14/0xd80
 __alloc_pages_noprof+0x32b/0x350
 ? dma_fence_array_create+0x48/0x110
 __kmalloc_large_node+0x6f/0x130
 __kmalloc_noprof+0x2dd/0x4a0
 ? dma_fence_array_create+0x48/0x110
 dma_fence_array_create+0x48/0x110
 __dma_fence_unwrap_merge+0x481/0x5b0
 sync_file_merge.constprop.0+0xf8/0x180
 sync_file_ioctl+0x476/0x590
 ? srso_return_thunk+0x5/0x5f
 ? __seccomp_filter+0xe8/0x5a0
 __x64_sys_ioctl+0x97/0xd0
 do_syscall_64+0x82/0x160
 ? srso_return_thunk+0x5/0x5f
 ? drm_syncobj_destroy_ioctl+0x8b/0xb0
 ? srso_return_thunk+0x5/0x5f
 ? srso_return_thunk+0x5/0x5f
 ? __check_object_size+0x58/0x230
 ? srso_return_thunk+0x5/0x5f
 ? srso_return_thunk+0x5/0x5f
 ? drm_ioctl+0x2ba/0x530
 ? __pfx_drm_syncobj_destroy_ioctl+0x10/0x10
 ? srso_return_thunk+0x5/0x5f
 ? ktime_get_mono_fast_ns+0x3b/0xd0
 ? srso_return_thunk+0x5/0x5f
 ? amdgpu_drm_ioctl+0x71/0x90 [amdgpu]
 ? srso_return_thunk+0x5/0x5f
 ? syscall_exit_to_user_mode+0x72/0x200
 ? srso_return_thunk+0x5/0x5f
 ? do_syscall_64+0x8e/0x160
 ? syscall_exit_to_user_mode+0x72/0x200
 ? srso_return_thunk+0x5/0x5f
 ? do_syscall_64+0x8e/0x160
 ? srso_return_thunk+0x5/0x5f
 ? syscall_exit_to_user_mode+0x72/0x200
 ? srso_return_thunk+0x5/0x5f
 ? do_syscall_64+0x8e/0x160
 ? do_syscall_64+0x8e/0x160
 ? srso_return_thunk+0x5/0x5f
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

It's a bit unfortunate that we end up with quadratic complexity w.r.t.
the number of merged fences in all cases, but I'd argue in practice
there shouldn't be more than a handful of in-flight fences to merge.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3617
Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/dma-buf/dma-fence-unwrap.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

--
2.47.0

Comments

Christian König Oct. 18, 2024, 8:56 a.m. UTC | #1
Am 18.10.24 um 07:46 schrieb Friedrich Vock:
> When dma_fence_unwrap_merge is called on fence chains where the fences
> aren't ordered by context, the merging logic breaks down and we end up
> inserting fences twice. Doing this repeatedly leads to the number of
> fences going up exponentially, and in some gaming workloads we'll end up
> running out of memory to store the resulting array altogether, leading
> to a warning such as:

Ah! I was searching for that one for quite a while now.

I own you a beer should you ever be near Cologne.

Please also see my patch on the mailing list to use kvzalloc() to 
mitigate this.

>
> vkd3d_queue: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0
> CPU: 2 PID: 5287 Comm: vkd3d_queue Tainted: G S                 6.10.7-200.fsync.fc40.x86_64 #1
> Hardware name: Dell Inc. G5 5505/0NCW8W, BIOS 1.11.0 03/22/2022
> Call Trace:
>   <TASK>
>   dump_stack_lvl+0x5d/0x80
>   warn_alloc+0x164/0x190
>   ? srso_return_thunk+0x5/0x5f
>   ? __alloc_pages_direct_compact+0x1d9/0x220
>   __alloc_pages_slowpath.constprop.2+0xd14/0xd80
>   __alloc_pages_noprof+0x32b/0x350
>   ? dma_fence_array_create+0x48/0x110
>   __kmalloc_large_node+0x6f/0x130
>   __kmalloc_noprof+0x2dd/0x4a0
>   ? dma_fence_array_create+0x48/0x110
>   dma_fence_array_create+0x48/0x110
>   __dma_fence_unwrap_merge+0x481/0x5b0
>   sync_file_merge.constprop.0+0xf8/0x180
>   sync_file_ioctl+0x476/0x590
>   ? srso_return_thunk+0x5/0x5f
>   ? __seccomp_filter+0xe8/0x5a0
>   __x64_sys_ioctl+0x97/0xd0
>   do_syscall_64+0x82/0x160
>   ? srso_return_thunk+0x5/0x5f
>   ? drm_syncobj_destroy_ioctl+0x8b/0xb0
>   ? srso_return_thunk+0x5/0x5f
>   ? srso_return_thunk+0x5/0x5f
>   ? __check_object_size+0x58/0x230
>   ? srso_return_thunk+0x5/0x5f
>   ? srso_return_thunk+0x5/0x5f
>   ? drm_ioctl+0x2ba/0x530
>   ? __pfx_drm_syncobj_destroy_ioctl+0x10/0x10
>   ? srso_return_thunk+0x5/0x5f
>   ? ktime_get_mono_fast_ns+0x3b/0xd0
>   ? srso_return_thunk+0x5/0x5f
>   ? amdgpu_drm_ioctl+0x71/0x90 [amdgpu]
>   ? srso_return_thunk+0x5/0x5f
>   ? syscall_exit_to_user_mode+0x72/0x200
>   ? srso_return_thunk+0x5/0x5f
>   ? do_syscall_64+0x8e/0x160
>   ? syscall_exit_to_user_mode+0x72/0x200
>   ? srso_return_thunk+0x5/0x5f
>   ? do_syscall_64+0x8e/0x160
>   ? srso_return_thunk+0x5/0x5f
>   ? syscall_exit_to_user_mode+0x72/0x200
>   ? srso_return_thunk+0x5/0x5f
>   ? do_syscall_64+0x8e/0x160
>   ? do_syscall_64+0x8e/0x160
>   ? srso_return_thunk+0x5/0x5f
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> It's a bit unfortunate that we end up with quadratic complexity w.r.t.
> the number of merged fences in all cases, but I'd argue in practice
> there shouldn't be more than a handful of in-flight fences to merge.
>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3617
> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
> ---
>   drivers/dma-buf/dma-fence-unwrap.c | 13 +++++++++++--
>   1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
> index 628af51c81af..46277cef0bc6 100644
> --- a/drivers/dma-buf/dma-fence-unwrap.c
> +++ b/drivers/dma-buf/dma-fence-unwrap.c
> @@ -68,7 +68,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
>   	struct dma_fence *tmp, **array;
>   	ktime_t timestamp;
>   	unsigned int i;
> -	size_t count;
> +	size_t count, j;
>
>   	count = 0;
>   	timestamp = ns_to_ktime(0);
> @@ -127,6 +127,10 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
>   			 * function is used multiple times. So attempt to order
>   			 * the fences by context as we pass over them and merge
>   			 * fences with the same context.
> +			 *
> +			 * We will remove any remaining duplicate fences down
> +			 * below, but doing this here saves us from having to
> +			 * iterate over the array to detect the duplicate.
>   			 */
>   			if (!tmp || tmp->context > next->context) {
>   				tmp = next;
> @@ -145,7 +149,12 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
>   		}
>
>   		if (tmp) {
> -			array[count++] = dma_fence_get(tmp);
> +			for (j = 0; j < count; ++j) {
> +				if (array[count] == tmp)
> +					break;
> +			}
> +			if (j == count)
> +				array[count++] = dma_fence_get(tmp);

That is clearly not the right solution. Since comparing the context 
should have already removed all duplicates.

Going to double check the code.

Thanks,
Christian.

>   			fences[sel] = dma_fence_unwrap_next(&iter[sel]);
>   		}
>   	} while (tmp);
> --
> 2.47.0
>
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index 628af51c81af..46277cef0bc6 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -68,7 +68,7 @@  struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 	struct dma_fence *tmp, **array;
 	ktime_t timestamp;
 	unsigned int i;
-	size_t count;
+	size_t count, j;

 	count = 0;
 	timestamp = ns_to_ktime(0);
@@ -127,6 +127,10 @@  struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 			 * function is used multiple times. So attempt to order
 			 * the fences by context as we pass over them and merge
 			 * fences with the same context.
+			 *
+			 * We will remove any remaining duplicate fences down
+			 * below, but doing this here saves us from having to
+			 * iterate over the array to detect the duplicate.
 			 */
 			if (!tmp || tmp->context > next->context) {
 				tmp = next;
@@ -145,7 +149,12 @@  struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 		}

 		if (tmp) {
-			array[count++] = dma_fence_get(tmp);
+			for (j = 0; j < count; ++j) {
+				if (array[count] == tmp)
+					break;
+			}
+			if (j == count)
+				array[count++] = dma_fence_get(tmp);
 			fences[sel] = dma_fence_unwrap_next(&iter[sel]);
 		}
 	} while (tmp);