[v3,bpf-next,2/6] bpf: udp: Make sure iter->batch always contains a full bucket snapshot

Message ID 20250416233622.1212256-3-jordan@jrife.io (mailing list archive)
State New
Delegated to: BPF
Series bpf: udp: Exactly-once socket iteration

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 5 maintainers not CCed: dsahern@kernel.org edumazet@google.com kuba@kernel.org pabeni@redhat.com horms@kernel.org
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 14 this patch: 15
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 102 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Jordan Rife April 16, 2025, 11:36 p.m. UTC
Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may only capture part of a
bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.
2. Retry bpf_iter_udp_realloc_batch() up to two times if we fail to
   capture the full bucket. On the third attempt, hold onto the bucket
   lock (hslot2->lock) through bpf_iter_udp_realloc_batch() with
   GFP_ATOMIC to guarantee that the bucket size doesn't change before
   our next attempt. Try with GFP_USER first to improve the chances
   that memory allocation succeeds; only use GFP_ATOMIC as a last
   resort.
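
Condensed from the patch below (the bucket walk and some cleanup are
elided), the resulting retry flow in bpf_iter_udp_batch() looks roughly
like this:

	int resizes = MAX_REALLOC_ATTEMPTS;	/* 3 */
	spinlock_t *lock = NULL;
	struct sock *sk;
	int err = 0;

again:
	/* ... walk the current bucket under hslot2->lock, filling
	 * iter->batch; once taken, `lock` stays held across retries ...
	 */

	/* All done: no batch made. */
	if (!iter->end_sk)
		goto done;

	sk = iter->batch[0];
	if (iter->end_sk == batch_sks)	/* whole bucket captured */
		goto done;

	resizes--;
	if (resizes) {
		/* Attempts 1 and 2: drop the lock so the allocation may
		 * sleep (GFP_USER).
		 */
		spin_unlock_bh(lock);
		lock = NULL;
	}
	err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
					 resizes ? GFP_USER : GFP_ATOMIC);
	if (err) {
		sk = ERR_PTR(err);	/* 1. propagate -ENOMEM */
		goto done;
	}
	goto again;			/* 2. retry; on the final pass the
					 * bucket lock is still held, so the
					 * bucket cannot grow
					 */
done:
	if (lock)
		spin_unlock_bh(lock);
	return sk;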

Testing all scenarios directly is a bit difficult, but I did some manual
testing to exercise the code paths where GFP_ATOMIC is used and where
ERR_PTR(err) is returned to make sure there are no deadlocks. I
used the realloc test case included later in this series to trigger a
scenario where a realloc happens inside bpf_iter_udp_realloc_batch and
made a small code tweak to force the first two realloc attempts to
allocate a too-small buffer, thus requiring another attempt until the
GFP_ATOMIC case is hit. Some printks showed three reallocs with the
tests passing:

Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_USER)
Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_USER)
Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_ATOMIC)

With this setup, I also forced bpf_iter_udp_realloc_batch to return
-ENOMEM on one of the retries to ensure that iteration ends and that the
read() in userspace fails, forced the hlist_empty condition to be true
on the GFP_ATOMIC pass to test the first WARN_ON_ONCE condition code
path, and walked back iter->end_sk on the GFP_ATOMIC pass to test the
second WARN_ON_ONCE condition code path. In each case, locks were
released and the loop terminated.

[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/

Signed-off-by: Jordan Rife <jordan@jrife.io>
Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
---
 net/ipv4/udp.c | 57 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 13 deletions(-)

Comments

Kuniyuki Iwashima April 17, 2025, 10:41 p.m. UTC | #1
From: Jordan Rife <jordan@jrife.io>
Date: Wed, 16 Apr 2025 16:36:17 -0700
> Require that iter->batch always contains a full bucket snapshot. This
> invariant is important to avoid skipping or repeating sockets during
> iteration when combined with the next few patches. Before, there were
> two cases where a call to bpf_iter_udp_batch may only capture part of a
> bucket:
> 
> 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
> 2. When more sockets are added to the bucket while calling
>    bpf_iter_udp_realloc_batch(), making the updated batch size
>    insufficient [2].
> 
> In cases where the batch size only covers part of a bucket, it is
> possible to forget which sockets were already visited, especially if we
> have to process a bucket in more than two batches. This forces us to
> choose between repeating or skipping sockets, so don't allow this:
> 
> 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
>    fails instead of continuing with a partial batch.
> 2. Retry bpf_iter_udp_realloc_batch() up to two times if we fail to
>    capture the full bucket. On the third attempt, hold onto the bucket
>    lock (hslot2->lock) through bpf_iter_udp_realloc_batch() with
>    GFP_ATOMIC to guarantee that the bucket size doesn't change before
>    our next attempt. Try with GFP_USER first to improve the chances
>    that memory allocation succeeds; only use GFP_ATOMIC as a last
>    resort.

kvmalloc() tries the kmalloc path, 1. slab and 2. page allocator, and
if both of them fail, it then tries 3. vmalloc().  But vmalloc() does not
support GFP_ATOMIC; __kvmalloc_node_noprof() returns early at the
!gfpflags_allow_blocking() check.

So, falling back to GFP_ATOMIC is most unlikely to work as the last resort.

GFP_ATOMIC first and falling back to GFP_USER a few more times, or not
using GFP_ATOMIC at all, makes more sense to me.
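
A paraphrased sketch of that fallback order, modelled on mm/util.c (not
the verbatim kernel source; flag adjustment is simplified):

	void *kvmalloc_sketch(size_t size, gfp_t flags)
	{
		void *ret;

		/* 1. slab and 2. page allocator, via the kmalloc path */
		ret = kmalloc(size, flags | __GFP_NOWARN);
		if (ret || size <= PAGE_SIZE)
			return ret;

		/* 3. vmalloc() may sleep, so a non-blocking request
		 * (e.g. GFP_ATOMIC) bails out here with NULL instead
		 * of falling back.
		 */
		if (!gfpflags_allow_blocking(flags))
			return NULL;

		return vmalloc(size);
	}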
Martin KaFai Lau April 17, 2025, 11:12 p.m. UTC | #2
On 4/17/25 3:41 PM, Kuniyuki Iwashima wrote:
> From: Jordan Rife <jordan@jrife.io>
> Date: Wed, 16 Apr 2025 16:36:17 -0700
>> Require that iter->batch always contains a full bucket snapshot. This
>> invariant is important to avoid skipping or repeating sockets during
>> iteration when combined with the next few patches. Before, there were
>> two cases where a call to bpf_iter_udp_batch may only capture part of a
>> bucket:
>>
>> 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
>> 2. When more sockets are added to the bucket while calling
>>     bpf_iter_udp_realloc_batch(), making the updated batch size
>>     insufficient [2].
>>
>> In cases where the batch size only covers part of a bucket, it is
>> possible to forget which sockets were already visited, especially if we
>> have to process a bucket in more than two batches. This forces us to
>> choose between repeating or skipping sockets, so don't allow this:
>>
>> 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
>>     fails instead of continuing with a partial batch.
>> 2. Retry bpf_iter_udp_realloc_batch() up to two times if we fail to
>>     capture the full bucket. On the third attempt, hold onto the bucket
>>     lock (hslot2->lock) through bpf_iter_udp_realloc_batch() with
>>     GFP_ATOMIC to guarantee that the bucket size doesn't change before
>>     our next attempt. Try with GFP_USER first to improve the chances
>>     that memory allocation succeeds; only use GFP_ATOMIC as a last
>>     resort.
> 
> kvmalloc() tries the kmalloc path, 1. slab and 2. page allocator, and
> if both of them fail, it then tries 3. vmalloc().  But vmalloc() does not
> support GFP_ATOMIC; __kvmalloc_node_noprof() returns early at the
> !gfpflags_allow_blocking() check.
> 
> So, falling back to GFP_ATOMIC is most unlikely to work as the last resort.
> 
> GFP_ATOMIC first and falling back to GFP_USER a few more times, or not
> using GFP_ATOMIC at all, makes more sense to me.

If I read it correctly, the last retry with GFP_ATOMIC is not because the
earlier GFP_USER allocation failed but because the bucket size has changed
so much that it is making one final attempt to grab the whole bucket. That
requires holding the bucket lock across the allocation to ensure the size
stays the same, which in turn means GFP_ATOMIC must be used.
Kuniyuki Iwashima April 17, 2025, 11:32 p.m. UTC | #3
From: Martin KaFai Lau <martin.lau@linux.dev>
Date: Thu, 17 Apr 2025 16:12:57 -0700
> On 4/17/25 3:41 PM, Kuniyuki Iwashima wrote:
> > From: Jordan Rife <jordan@jrife.io>
> > Date: Wed, 16 Apr 2025 16:36:17 -0700
> >> Require that iter->batch always contains a full bucket snapshot. This
> >> invariant is important to avoid skipping or repeating sockets during
> >> iteration when combined with the next few patches. Before, there were
> >> two cases where a call to bpf_iter_udp_batch may only capture part of a
> >> bucket:
> >>
> >> 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
> >> 2. When more sockets are added to the bucket while calling
> >>     bpf_iter_udp_realloc_batch(), making the updated batch size
> >>     insufficient [2].
> >>
> >> In cases where the batch size only covers part of a bucket, it is
> >> possible to forget which sockets were already visited, especially if we
> >> have to process a bucket in more than two batches. This forces us to
> >> choose between repeating or skipping sockets, so don't allow this:
> >>
> >> 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
> >>     fails instead of continuing with a partial batch.
> >> 2. Retry bpf_iter_udp_realloc_batch() up to two times if we fail to
> >>     capture the full bucket. On the third attempt, hold onto the bucket
> >>     lock (hslot2->lock) through bpf_iter_udp_realloc_batch() with
> >>     GFP_ATOMIC to guarantee that the bucket size doesn't change before
> >>     our next attempt. Try with GFP_USER first to improve the chances
> >>     that memory allocation succeeds; only use GFP_ATOMIC as a last
> >>     resort.
> > 
> > kvmalloc() tries the kmalloc path, 1. slab and 2. page allocator, and
> > if both of them fail, it then tries 3. vmalloc().  But vmalloc() does not
> > support GFP_ATOMIC; __kvmalloc_node_noprof() returns early at the
> > !gfpflags_allow_blocking() check.
> > 
> > So, falling back to GFP_ATOMIC is most unlikely to work as the last resort.
> > 
> > GFP_ATOMIC first and falling back to GFP_USER a few more times, or not
> > using GFP_ATOMIC at all, makes more sense to me.
> 
> If I read it correctly, the last retry with GFP_ATOMIC is not because the
> earlier GFP_USER allocation failed but because the bucket size has changed
> so much that it is making one final attempt to grab the whole bucket. That
> requires holding the bucket lock across the allocation to ensure the size
> stays the same, which in turn means GFP_ATOMIC must be used.

Ah, exactly; when allocation fails, it always returns an error.

Sorry, I should've read the code first.
Kuniyuki Iwashima April 17, 2025, 11:45 p.m. UTC | #4
From: Jordan Rife <jordan@jrife.io>
Date: Wed, 16 Apr 2025 16:36:17 -0700
> @@ -3454,15 +3460,26 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>  				batch_sks++;
>  			}
>  		}
> -		spin_unlock_bh(&hslot2->lock);
>  
>  		if (iter->end_sk)
>  			break;
> +next_bucket:
> +		/* Somehow the bucket was emptied or all matching sockets were
> +		 * removed while we held onto its lock. This should not happen.
> +		 */
> +		if (WARN_ON_ONCE(!resizes))
> +			/* Best effort; reset the resize budget and move on. */
> +			resizes = MAX_REALLOC_ATTEMPTS;
> +		if (lock)
> +			spin_unlock_bh(lock);
> +		lock = NULL;
>  	}
>  
>  	/* All done: no batch made. */
>  	if (!iter->end_sk)
> -		return NULL;
> +		goto done;

If we jump here when no UDP socket exists, an uninitialised sk is returned.
Maybe move this condition down below the sk initialisation.


> +
> +	sk = iter->batch[0];
>  
>  	if (iter->end_sk == batch_sks) {
>  		/* Batching is done for the current bucket; return the first
> @@ -3471,16 +3488,30 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>  		iter->st_bucket_done = true;
>  		goto done;
>  	}
> -	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
> -						    GFP_USER)) {
> -		resized = true;
> -		/* After allocating a larger batch, retry one more time to grab
> -		 * the whole bucket.
> -		 */
> -		goto again;
> +
> +	/* Somehow the batch size still wasn't big enough even though we held
> +	 * a lock on the bucket. This should not happen.
> +	 */
> +	if (WARN_ON_ONCE(!resizes))
> +		goto done;
> +
> +	resizes--;
> +	if (resizes) {
> +		spin_unlock_bh(lock);
> +		lock = NULL;
> +	}
> +	err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
> +					 resizes ? GFP_USER : GFP_ATOMIC);
> +	if (err) {
> +		sk = ERR_PTR(err);
> +		goto done;
>  	}
> +
> +	goto again;
>  done:
> -	return iter->batch[0];
> +	if (lock)
> +		spin_unlock_bh(lock);
> +	return sk;
>  }
>
Jordan Rife April 17, 2025, 11:51 p.m. UTC | #5
> > If I read it correctly, the last retry with GFP_ATOMIC is not because the
> > earlier GFP_USER allocation failed but because the bucket size has changed
> > so much that it is making one final attempt to grab the whole bucket. That
> > requires holding the bucket lock across the allocation to ensure the size
> > stays the same, which in turn means GFP_ATOMIC must be used.
> 
> Ah, exactly; when allocation fails, it always returns an error.
> 
> Sorry, I should've read the code first.

I was about to type out a response, but Martin beat me to it :). Yep,
GFP_ATOMIC is a necessary side-effect of holding onto the lock to make
sure the bucket doesn't grow anymore. It's a last resort to make sure
the batch size is big enough to grab a full bucket snapshot, not a last
resort to allocate memory.

-Jordan
Jordan Rife April 17, 2025, 11:57 p.m. UTC | #6
> > @@ -3454,15 +3460,26 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >  				batch_sks++;
> >  			}
> >  		}
> > -		spin_unlock_bh(&hslot2->lock);
> >  
> >  		if (iter->end_sk)
> >  			break;
> > +next_bucket:
> > +		/* Somehow the bucket was emptied or all matching sockets were
> > +		 * removed while we held onto its lock. This should not happen.
> > +		 */
> > +		if (WARN_ON_ONCE(!resizes))
> > +			/* Best effort; reset the resize budget and move on. */
> > +			resizes = MAX_REALLOC_ATTEMPTS;
> > +		if (lock)
> > +			spin_unlock_bh(lock);
> > +		lock = NULL;
> >  	}
> >  
> >  	/* All done: no batch made. */
> >  	if (!iter->end_sk)
> > -		return NULL;
> > +		goto done;
> 
> If we jump here when no UDP socket exists, an uninitialised sk is returned.
> Maybe move this condition down below the sk initialisation.

In this case, we'd want to return NULL just like it did before, since
there's no socket in the batch. Do you want me to make this more
explicit by setting sk = NULL here?

-Jordan
Kuniyuki Iwashima April 18, 2025, 12:15 a.m. UTC | #7
From: Jordan Rife <jordan@jrife.io>
Date: Thu, 17 Apr 2025 16:57:11 -0700
> > > @@ -3454,15 +3460,26 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> > >  				batch_sks++;
> > >  			}
> > >  		}
> > > -		spin_unlock_bh(&hslot2->lock);
> > >  
> > >  		if (iter->end_sk)
> > >  			break;
> > > +next_bucket:
> > > +		/* Somehow the bucket was emptied or all matching sockets were
> > > +		 * removed while we held onto its lock. This should not happen.
> > > +		 */
> > > +		if (WARN_ON_ONCE(!resizes))
> > > +			/* Best effort; reset the resize budget and move on. */
> > > +			resizes = MAX_REALLOC_ATTEMPTS;
> > > +		if (lock)
> > > +			spin_unlock_bh(lock);
> > > +		lock = NULL;
> > >  	}
> > >  
> > >  	/* All done: no batch made. */
> > >  	if (!iter->end_sk)
> > > -		return NULL;
> > > +		goto done;
> > 
> > If we jump here when no UDP socket exists, an uninitialised sk is returned.
> > Maybe move this condition down below the sk initialisation.
> 
> In this case, we'd want to return NULL just like it did before, since
> there's no socket in the batch. Do you want me to make this more
> explicit by setting sk = NULL here?

Sounds good to me
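
For reference, a minimal sketch of the agreed tweak (hypothetical
follow-up, not part of the posted v3 diff below):

	/* All done: no batch made. */
	if (!iter->end_sk) {
		sk = NULL;	/* nothing batched: explicitly return NULL
				 * rather than an uninitialised sk
				 */
		goto done;
	}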

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ac31dec339a..4802d3fa37ed 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3377,6 +3377,7 @@  int udp4_seq_show(struct seq_file *seq, void *v)
 }
 
 #ifdef CONFIG_BPF_SYSCALL
+#define MAX_REALLOC_ATTEMPTS 3
 struct bpf_iter__udp {
 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
 	__bpf_md_ptr(struct udp_sock *, udp_sk);
@@ -3401,11 +3402,13 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 	struct bpf_udp_iter_state *iter = seq->private;
 	struct udp_iter_state *state = &iter->state;
 	struct net *net = seq_file_net(seq);
+	int resizes = MAX_REALLOC_ATTEMPTS;
 	int resume_bucket, resume_offset;
 	struct udp_table *udptable;
 	unsigned int batch_sks = 0;
-	bool resized = false;
+	spinlock_t *lock = NULL;
 	struct sock *sk;
+	int err = 0;
 
 	resume_bucket = state->bucket;
 	resume_offset = iter->offset;
@@ -3433,10 +3436,13 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
 
 		if (hlist_empty(&hslot2->head))
-			continue;
+			goto next_bucket;
 
 		iter->offset = 0;
-		spin_lock_bh(&hslot2->lock);
+		if (!lock) {
+			lock = &hslot2->lock;
+			spin_lock_bh(lock);
+		}
 		udp_portaddr_for_each_entry(sk, &hslot2->head) {
 			if (seq_sk_match(seq, sk)) {
 				/* Resume from the last iterated socket at the
@@ -3454,15 +3460,26 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 				batch_sks++;
 			}
 		}
-		spin_unlock_bh(&hslot2->lock);
 
 		if (iter->end_sk)
 			break;
+next_bucket:
+		/* Somehow the bucket was emptied or all matching sockets were
+		 * removed while we held onto its lock. This should not happen.
+		 */
+		if (WARN_ON_ONCE(!resizes))
+			/* Best effort; reset the resize budget and move on. */
+			resizes = MAX_REALLOC_ATTEMPTS;
+		if (lock)
+			spin_unlock_bh(lock);
+		lock = NULL;
 	}
 
 	/* All done: no batch made. */
 	if (!iter->end_sk)
-		return NULL;
+		goto done;
+
+	sk = iter->batch[0];
 
 	if (iter->end_sk == batch_sks) {
 		/* Batching is done for the current bucket; return the first
@@ -3471,16 +3488,30 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 		iter->st_bucket_done = true;
 		goto done;
 	}
-	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
-						    GFP_USER)) {
-		resized = true;
-		/* After allocating a larger batch, retry one more time to grab
-		 * the whole bucket.
-		 */
-		goto again;
+
+	/* Somehow the batch size still wasn't big enough even though we held
+	 * a lock on the bucket. This should not happen.
+	 */
+	if (WARN_ON_ONCE(!resizes))
+		goto done;
+
+	resizes--;
+	if (resizes) {
+		spin_unlock_bh(lock);
+		lock = NULL;
+	}
+	err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
+					 resizes ? GFP_USER : GFP_ATOMIC);
+	if (err) {
+		sk = ERR_PTR(err);
+		goto done;
 	}
+
+	goto again;
 done:
-	return iter->batch[0];
+	if (lock)
+		spin_unlock_bh(lock);
+	return sk;
 }
 
 static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)