
[v3,bpf,1/2] bpf: skip non exist keys in generic_map_lookup_batch

Message ID 85618439eea75930630685c467ccefeac0942e2b.1739171594.git.yan@cloudflare.com (mailing list archive)
State Accepted
Commit 5644c6b50ffee0a56c1e01430a8c88e34decb120
Delegated to: BPF
Series bpf: skip non exist keys in generic_map_lookup_batch

Checks

Context Check Description
bpf/vmtest-bpf-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 14 of 14 maintainers
netdev/build_clang success Errors and warnings before: 4 this patch: 4
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 47 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-4 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-3 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-6 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-1 success Logs for aarch64-gcc / build-release

Commit Message

Yan Zhai Feb. 10, 2025, 7:22 a.m. UTC
The generic_map_lookup_batch currently retries bpf_map_copy_value several
times when it fails with ENOENT, and returns EINTR once the retries are
exhausted. The next batch would then start from the same location,
presuming the failure is a transient issue. This is incorrect if a map can
actually have "holes", i.e. "get_next_key" can return a key that does not
point to a valid value. At least the array-of-maps type may legitimately
contain such holes. Once these holes show up, generic batch lookup cannot
make any further progress and always fails with EINTR.
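
As a rough illustration (not part of the patch itself), a userspace loop
like the one below, using libbpf's bpf_map_lookup_batch(), runs into
exactly this on a sparse array-of-maps. "outer_fd" and the batch size are
placeholders, and the exact return/errno convention depends on the libbpf
version:

#include <errno.h>
#include <bpf/bpf.h>

/* Sketch only: walk an outer ARRAY_OF_MAPS (key and value size are both 4)
 * in batches of 32 entries.
 */
static int dump_outer_map(int outer_fd)
{
	__u32 keys[32], vals[32], out_batch = 0, count;
	void *in_batch = NULL;
	int err;

	do {
		count = 32;
		err = bpf_map_lookup_batch(outer_fd, in_batch, &out_batch,
					   keys, vals, &count, NULL);
		/*
		 * On kernels without this fix, an empty slot makes the call
		 * fail with EINTR and the returned cursor does not advance
		 * past it, so retrying here cannot make progress.
		 */
		if (err && errno != ENOENT)
			return err;
		/* consume "count" key/value pairs here */
		in_batch = &out_batch;
	} while (!err);		/* ENOENT signals the end of the map */

	return 0;
}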

Rather than retrying, make generic_map_lookup_batch skip to the next key
when it finds a non-existing element. This simple solution comes at the
price that transient errors may not be recovered from, and the iteration
might cycle back to the first key under parallel deletion. For example,
Hou Tao <houtao@huaweicloud.com> pointed out the following scenario:

For an LPM trie map:
(1) ->map_get_next_key(map, prev_key, key) returns a valid key

(2) bpf_map_copy_value() returns -ENOENT
This means the key must have been deleted concurrently.

(3) goto next_key
It swaps prev_key and key

(4) ->map_get_next_key(map, prev_key, key) again
prev_key now points to a non-existing key; for the LPM trie this is
treated just like the prev_key=NULL case, so the returned key will be a
duplicate.

With the retry logic, the iteration can continue from the key following
the deleted one. But if we directly skip to the next key, the iteration
loop would restart from the first key for the lpm_trie type.

However, the retry logic cannot recover from all races either. For
example, if the current key is deleted after instead of before
bpf_map_copy_value, or if prev_key also gets deleted, then the loop will
still restart from the first key for lpm_trie anyway. For the generic
lookup it is better to stay simple, i.e. just skip to the next key. To
guarantee that the output keys are not duplicated, it is better to
implement map-type-specific batch operations, which can properly lock the
trie and synchronize with concurrent mutators.
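
Until such map-type-specific batch ops exist, a caller of the generic path
that needs unique keys can deduplicate its own output. A rough caller-side
sketch, assuming the collected keys[] buffer has already been sorted (e.g.
with qsort() and a memcmp-based comparator); "keys", "n" and "key_size"
are placeholder variables:

#include <stddef.h>
#include <string.h>

/* Compact a sorted array of fixed-size keys in place and return how many
 * unique keys remain.
 */
static size_t dedup_sorted_keys(void *keys, size_t n, size_t key_size)
{
	char *base = keys;
	size_t i, uniq = 0;

	if (!n)
		return 0;
	for (i = 1; i < n; i++) {
		if (memcmp(base + uniq * key_size,
			   base + i * key_size, key_size)) {
			uniq++;
			if (uniq != i)
				memcpy(base + uniq * key_size,
				       base + i * key_size, key_size);
		}
	}
	return uniq + 1;
}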

Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op")
Closes: https://lore.kernel.org/bpf/Z6JXtA1M5jAZx8xD@debian.debian/
Signed-off-by: Yan Zhai <yan@cloudflare.com>
Acked-by: Hou Tao <houtao1@huawei.com>
---
v2->v3: deleted a now-unused macro
v1->v2: incorporated more useful information into the commit message.
---
 kernel/bpf/syscall.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

Comments

Jiri Olsa Feb. 10, 2025, 9:19 a.m. UTC | #1
On Sun, Feb 09, 2025 at 11:22:35PM -0800, Yan Zhai wrote:
> The generic_map_lookup_batch currently returns EINTR if it fails with
> ENOENT and retries several times on bpf_map_copy_value. The next batch
> would start from the same location, presuming it's a transient issue.
> This is incorrect if a map can actually have "holes", i.e.
> "get_next_key" can return a key that does not point to a valid value. At
> least the array of maps type may contain such holes legitly. Right now
> these holes show up, generic batch lookup cannot proceed any more. It
> will always fail with EINTR errors.
> 
> Rather, do not retry in generic_map_lookup_batch. If it finds a non
> existing element, skip to the next key. This simple solution comes with
> a price that transient errors may not be recovered, and the iteration
> might cycle back to the first key under parallel deletion. For example,

probably a stupid question, but why not keep the retry logic and, when it
fails, jump to the next key instead of returning EINTR?

jirka


> Hou Tao <houtao@huaweicloud.com> pointed out a following scenario:
> 
> For LPM trie map:
> (1) ->map_get_next_key(map, prev_key, key) returns a valid key
> 
> (2) bpf_map_copy_value() return -ENOMENT
> It means the key must be deleted concurrently.
> 
> (3) goto next_key
> It swaps the prev_key and key
> 
> (4) ->map_get_next_key(map, prev_key, key) again
> prev_key points to a non-existing key, for LPM trie it will treat just
> like prev_key=NULL case, the returned key will be duplicated.
> 
> With the retry logic, the iteration can continue to the key next to the
> deleted one. But if we directly skip to the next key, the iteration loop
> would restart from the first key for the lpm_trie type.
> 
> However, not all races may be recovered. For example, if current key is
> deleted after instead of before bpf_map_copy_value, or if the prev_key
> also gets deleted, then the loop will still restart from the first key
> for lpm_tire anyway. For generic lookup it might be better to stay
> simple, i.e. just skip to the next key. To guarantee that the output
> keys are not duplicated, it is better to implement map type specific
> batch operations, which can properly lock the trie and synchronize with
> concurrent mutators.
> 
> Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op")
> Closes: https://lore.kernel.org/bpf/Z6JXtA1M5jAZx8xD@debian.debian/
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> Acked-by: Hou Tao <houtao1@huawei.com>
> ---
> v2->v3: deleted a used macro
> v1->v2: incorporate more useful information inside commit message.
> ---
>  kernel/bpf/syscall.c | 18 +++++-------------
>  1 file changed, 5 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c420edbfb7c8..e5f1c7fd0ba7 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1968,8 +1968,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
>  	return err;
>  }
>  
> -#define MAP_LOOKUP_RETRIES 3
> -
>  int generic_map_lookup_batch(struct bpf_map *map,
>  				    const union bpf_attr *attr,
>  				    union bpf_attr __user *uattr)
> @@ -1979,8 +1977,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
>  	void __user *values = u64_to_user_ptr(attr->batch.values);
>  	void __user *keys = u64_to_user_ptr(attr->batch.keys);
>  	void *buf, *buf_prevkey, *prev_key, *key, *value;
> -	int err, retry = MAP_LOOKUP_RETRIES;
>  	u32 value_size, cp, max_count;
> +	int err;
>  
>  	if (attr->batch.elem_flags & ~BPF_F_LOCK)
>  		return -EINVAL;
> @@ -2026,14 +2024,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
>  		err = bpf_map_copy_value(map, key, value,
>  					 attr->batch.elem_flags);
>  
> -		if (err == -ENOENT) {
> -			if (retry) {
> -				retry--;
> -				continue;
> -			}
> -			err = -EINTR;
> -			break;
> -		}
> +		if (err == -ENOENT)
> +			goto next_key;
>  
>  		if (err)
>  			goto free_buf;
> @@ -2048,12 +2040,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
>  			goto free_buf;
>  		}
>  
> +		cp++;
> +next_key:
>  		if (!prev_key)
>  			prev_key = buf_prevkey;
>  
>  		swap(prev_key, key);
> -		retry = MAP_LOOKUP_RETRIES;
> -		cp++;
>  		cond_resched();
>  	}
>  
> -- 
> 2.39.5
> 
>
Brian Vazquez Feb. 10, 2025, 2:47 p.m. UTC | #2
On Mon, Feb 10, 2025 at 4:19 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Sun, Feb 09, 2025 at 11:22:35PM -0800, Yan Zhai wrote:
> > The generic_map_lookup_batch currently returns EINTR if it fails with
> > ENOENT and retries several times on bpf_map_copy_value. The next batch
> > would start from the same location, presuming it's a transient issue.
> > This is incorrect if a map can actually have "holes", i.e.
> > "get_next_key" can return a key that does not point to a valid value. At
> > least the array of maps type may contain such holes legitly. Right now
> > these holes show up, generic batch lookup cannot proceed any more. It
> > will always fail with EINTR errors.
> >
> > Rather, do not retry in generic_map_lookup_batch. If it finds a non
> > existing element, skip to the next key. This simple solution comes with
> > a price that transient errors may not be recovered, and the iteration
> > might cycle back to the first key under parallel deletion. For example,
>
> probably stupid question, but why not keep the retry logic and when
> it fails then instead of returning EINTR just jump to the next key
>
> jirka

+1, keeping the retry logic but moving to the next key on error sounds
like a sensible approach.


>
>
> > Hou Tao <houtao@huaweicloud.com> pointed out a following scenario:
> >
> > For LPM trie map:
> > (1) ->map_get_next_key(map, prev_key, key) returns a valid key
> >
> > (2) bpf_map_copy_value() return -ENOMENT
> > It means the key must be deleted concurrently.
> >
> > (3) goto next_key
> > It swaps the prev_key and key
> >
> > (4) ->map_get_next_key(map, prev_key, key) again
> > prev_key points to a non-existing key, for LPM trie it will treat just
> > like prev_key=NULL case, the returned key will be duplicated.
> >
> > With the retry logic, the iteration can continue to the key next to the
> > deleted one. But if we directly skip to the next key, the iteration loop
> > would restart from the first key for the lpm_trie type.
> >
> > However, not all races may be recovered. For example, if current key is
> > deleted after instead of before bpf_map_copy_value, or if the prev_key
> > also gets deleted, then the loop will still restart from the first key
> > for lpm_tire anyway. For generic lookup it might be better to stay
> > simple, i.e. just skip to the next key. To guarantee that the output
> > keys are not duplicated, it is better to implement map type specific
> > batch operations, which can properly lock the trie and synchronize with
> > concurrent mutators.
> >
> > Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op")
> > Closes: https://lore.kernel.org/bpf/Z6JXtA1M5jAZx8xD@debian.debian/
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > Acked-by: Hou Tao <houtao1@huawei.com>
> > ---
> > v2->v3: deleted a used macro
> > v1->v2: incorporate more useful information inside commit message.
> > ---
> >  kernel/bpf/syscall.c | 18 +++++-------------
> >  1 file changed, 5 insertions(+), 13 deletions(-)
> >
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index c420edbfb7c8..e5f1c7fd0ba7 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -1968,8 +1968,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
> >       return err;
> >  }
> >
> > -#define MAP_LOOKUP_RETRIES 3
> > -
> >  int generic_map_lookup_batch(struct bpf_map *map,
> >                                   const union bpf_attr *attr,
> >                                   union bpf_attr __user *uattr)
> > @@ -1979,8 +1977,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
> >       void __user *values = u64_to_user_ptr(attr->batch.values);
> >       void __user *keys = u64_to_user_ptr(attr->batch.keys);
> >       void *buf, *buf_prevkey, *prev_key, *key, *value;
> > -     int err, retry = MAP_LOOKUP_RETRIES;
> >       u32 value_size, cp, max_count;
> > +     int err;
> >
> >       if (attr->batch.elem_flags & ~BPF_F_LOCK)
> >               return -EINVAL;
> > @@ -2026,14 +2024,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
> >               err = bpf_map_copy_value(map, key, value,
> >                                        attr->batch.elem_flags);
> >
> > -             if (err == -ENOENT) {
> > -                     if (retry) {
> > -                             retry--;
> > -                             continue;
> > -                     }
> > -                     err = -EINTR;
> > -                     break;
> > -             }
> > +             if (err == -ENOENT)
> > +                     goto next_key;
> >
> >               if (err)
> >                       goto free_buf;
> > @@ -2048,12 +2040,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
> >                       goto free_buf;
> >               }
> >
> > +             cp++;
> > +next_key:
> >               if (!prev_key)
> >                       prev_key = buf_prevkey;
> >
> >               swap(prev_key, key);
> > -             retry = MAP_LOOKUP_RETRIES;
> > -             cp++;
> >               cond_resched();
> >       }
> >
> > --
> > 2.39.5
> >
> >
Yan Zhai Feb. 10, 2025, 4:21 p.m. UTC | #3
Hi Brian, Jiri

thanks for the comments.

On Mon, Feb 10, 2025 at 8:47 AM Brian Vazquez <brianvv@google.com> wrote:
>
> On Mon, Feb 10, 2025 at 4:19 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Sun, Feb 09, 2025 at 11:22:35PM -0800, Yan Zhai wrote:
> > > The generic_map_lookup_batch currently returns EINTR if it fails with
> > > ENOENT and retries several times on bpf_map_copy_value. The next batch
> > > would start from the same location, presuming it's a transient issue.
> > > This is incorrect if a map can actually have "holes", i.e.
> > > "get_next_key" can return a key that does not point to a valid value. At
> > > least the array of maps type may contain such holes legitly. Right now
> > > these holes show up, generic batch lookup cannot proceed any more. It
> > > will always fail with EINTR errors.
> > >
> > > Rather, do not retry in generic_map_lookup_batch. If it finds a non
> > > existing element, skip to the next key. This simple solution comes with
> > > a price that transient errors may not be recovered, and the iteration
> > > might cycle back to the first key under parallel deletion. For example,
> >
> > probably stupid question, but why not keep the retry logic and when
> > it fails then instead of returning EINTR just jump to the next key
> >
> > jirka
>
> +1, keeping the retry logic but moving to the next key on error sounds
> like a sensible approach.
>
I made the trade-off since retrying would consistently fail for the array
of maps, so it merely wastes cycles. It is already pretty slow to read
these maps from userspace today (we read them for accounting/monitoring
purposes), so it is nice to save a few cycles, especially for sparse maps.
E.g. we use inner maps to store protocol-specific actions in an array of
maps with 256 slots, but usually only a few common protocols like
TCP/UDP/ICMP are populated, leaving mostly "holes". On the other hand, I
personally feel it is really "fragile" if users rely heavily on this logic
to survive concurrent lookup and deletion. Would it make more sense to
provide concurrency guarantees with map-specific ops, like the hash map
does?

best
Yan
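
For reference, the sparse layout described above can be reproduced with a
short libbpf snippet; the map names here are made up and the inner map is
only a template, so this is illustrative rather than the actual production
setup:

#include <bpf/bpf.h>
#include <netinet/in.h>		/* IPPROTO_* */

/* Build a 256-slot outer array-of-maps keyed by IP protocol number and
 * populate only a few slots, leaving the rest as "holes".
 */
static int build_sparse_outer_map(void)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts);
	int inner_fd, outer_fd;
	__u32 key;

	/* Template inner map; per-protocol maps would normally differ. */
	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "inner_tmpl",
				  sizeof(__u32), sizeof(__u64), 16, NULL);
	if (inner_fd < 0)
		return inner_fd;

	opts.inner_map_fd = inner_fd;
	outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "proto_actions",
				  sizeof(__u32), sizeof(__u32), 256, &opts);
	if (outer_fd < 0)
		return outer_fd;

	/* Only TCP/UDP/ICMP are populated; the other ~250 slots stay empty. */
	key = IPPROTO_TCP;
	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
	key = IPPROTO_UDP;
	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
	key = IPPROTO_ICMP;
	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);

	return outer_fd;
}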
Jiri Olsa Feb. 12, 2025, 5:04 p.m. UTC | #4
On Mon, Feb 10, 2025 at 10:21:38AM -0600, Yan Zhai wrote:
> Hi Brian, Jiri
> 
> thanks for the comments.
> 
> On Mon, Feb 10, 2025 at 8:47 AM Brian Vazquez <brianvv@google.com> wrote:
> >
> > On Mon, Feb 10, 2025 at 4:19 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >
> > > On Sun, Feb 09, 2025 at 11:22:35PM -0800, Yan Zhai wrote:
> > > > The generic_map_lookup_batch currently returns EINTR if it fails with
> > > > ENOENT and retries several times on bpf_map_copy_value. The next batch
> > > > would start from the same location, presuming it's a transient issue.
> > > > This is incorrect if a map can actually have "holes", i.e.
> > > > "get_next_key" can return a key that does not point to a valid value. At
> > > > least the array of maps type may contain such holes legitly. Right now
> > > > these holes show up, generic batch lookup cannot proceed any more. It
> > > > will always fail with EINTR errors.
> > > >
> > > > Rather, do not retry in generic_map_lookup_batch. If it finds a non
> > > > existing element, skip to the next key. This simple solution comes with
> > > > a price that transient errors may not be recovered, and the iteration
> > > > might cycle back to the first key under parallel deletion. For example,
> > >
> > > probably stupid question, but why not keep the retry logic and when
> > > it fails then instead of returning EINTR just jump to the next key
> > >
> > > jirka
> >
> > +1, keeping the retry logic but moving to the next key on error sounds
> > like a sensible approach.
> >
> I made the trade off since retry would consistently fail for the array
> of maps, so it is merely wasting cycles to ever do so. It is already
> pretty slow to read these maps today from userspace (for us we read
> them for accounting/monitoring purposes), so it is nice to save a few
> cycles especially for sparse maps. E.g. We use inner maps to store
> protocol specific actions in an array of maps with 256 slots, but
> usually only a few common protocols like TCP/UDP/ICMP are populated,
> leaving most "holes". On the other hand, I personally feel it is
> really "fragile" if users rely heavily on this logic to survive
> concurrent lookup and deletion. Would it make more sense to provide
> concurrency guarantee with map specific ops like hash map?

Brian, any details on the EINTR path? Is it just there to survive
concurrent batch-lookup and delete?

If that's an important use case, I guess a map-specific function would be
possible, because the generic one is broken for maps with holes as you described

thanks,
jirka

Patch

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c420edbfb7c8..e5f1c7fd0ba7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1968,8 +1968,6 @@  int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	return err;
 }
 
-#define MAP_LOOKUP_RETRIES 3
-
 int generic_map_lookup_batch(struct bpf_map *map,
 				    const union bpf_attr *attr,
 				    union bpf_attr __user *uattr)
@@ -1979,8 +1977,8 @@  int generic_map_lookup_batch(struct bpf_map *map,
 	void __user *values = u64_to_user_ptr(attr->batch.values);
 	void __user *keys = u64_to_user_ptr(attr->batch.keys);
 	void *buf, *buf_prevkey, *prev_key, *key, *value;
-	int err, retry = MAP_LOOKUP_RETRIES;
 	u32 value_size, cp, max_count;
+	int err;
 
 	if (attr->batch.elem_flags & ~BPF_F_LOCK)
 		return -EINVAL;
@@ -2026,14 +2024,8 @@  int generic_map_lookup_batch(struct bpf_map *map,
 		err = bpf_map_copy_value(map, key, value,
 					 attr->batch.elem_flags);
 
-		if (err == -ENOENT) {
-			if (retry) {
-				retry--;
-				continue;
-			}
-			err = -EINTR;
-			break;
-		}
+		if (err == -ENOENT)
+			goto next_key;
 
 		if (err)
 			goto free_buf;
@@ -2048,12 +2040,12 @@  int generic_map_lookup_batch(struct bpf_map *map,
 			goto free_buf;
 		}
 
+		cp++;
+next_key:
 		if (!prev_key)
 			prev_key = buf_prevkey;
 
 		swap(prev_key, key);
-		retry = MAP_LOOKUP_RETRIES;
-		cp++;
 		cond_resched();
 	}