
[v4,bpf-next,5/6] selftests/bpf: test map percpu stats

Message ID 20230705160139.19967-6-aspsk@isovalent.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: add percpu stats for bpf_map

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-31 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 3 maintainers not CCed: mykolal@fb.com shuah@kernel.org linux-kselftest@vger.kernel.org
netdev/build_clang fail Errors and warnings before: 18 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Comparison to NULL could be written "!skel" CHECK: spaces preferred around that '*' (ctx:VxV) CHECK: spaces preferred around that '-' (ctx:VxV) WARNING: Block comments use a trailing */ on a separate line WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on s390x with gcc

Commit Message

Anton Protopopov July 5, 2023, 4:01 p.m. UTC
Add a new map test, map_percpu_stats.c, which checks the correctness of a
map's percpu element counters.  For supported maps the test upserts a number
of elements, checks the correctness of the counters, then deletes all the
elements and checks again that the sum of the counters drops to zero.

The following map types are tested:

    * BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC
    * BPF_MAP_TYPE_PERCPU_HASH, BPF_F_NO_PREALLOC
    * BPF_MAP_TYPE_HASH,
    * BPF_MAP_TYPE_PERCPU_HASH,
    * BPF_MAP_TYPE_LRU_HASH
    * BPF_MAP_TYPE_LRU_PERCPU_HASH
    * BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU
    * BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU
    * BPF_MAP_TYPE_HASH_OF_MAPS

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 .../bpf/map_tests/map_percpu_stats.c          | 450 ++++++++++++++++++
 .../selftests/bpf/progs/map_percpu_stats.c    |  24 +
 2 files changed, 474 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_percpu_stats.c
 create mode 100644 tools/testing/selftests/bpf/progs/map_percpu_stats.c

Comments

Hou Tao July 6, 2023, 10:49 a.m. UTC | #1
Hi,

On 7/6/2023 12:01 AM, Anton Protopopov wrote:
> Add a new map test, map_percpu_stats.c, which is checking the correctness of
> map's percpu elements counters.  For supported maps the test upserts a number
> of elements, checks the correctness of the counters, then deletes all the
> elements and checks again that the counters sum drops down to zero.
>
> The following map types are tested:
>
>     * BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC
>     * BPF_MAP_TYPE_PERCPU_HASH, BPF_F_NO_PREALLOC
>     * BPF_MAP_TYPE_HASH,
>     * BPF_MAP_TYPE_PERCPU_HASH,
>     * BPF_MAP_TYPE_LRU_HASH
>     * BPF_MAP_TYPE_LRU_PERCPU_HASH
>     * BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU
>     * BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU
>     * BPF_MAP_TYPE_HASH_OF_MAPS
>
> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>

Acked-by: Hou Tao <houtao1@huawei.com>

With two nits below.
> +
> +static const char *map_type_to_s(__u32 type)
> +{
> +	switch (type) {
> +	case BPF_MAP_TYPE_HASH:
> +		return "HASH";
> +	case BPF_MAP_TYPE_PERCPU_HASH:
> +		return "PERCPU_HASH";
> +	case BPF_MAP_TYPE_LRU_HASH:
> +		return "LRU_HASH";
> +	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
> +		return "LRU_PERCPU_HASH";
> +	default:
> +		return "<define-me>";
> +	}
Missing BPF_MAP_TYPE_HASH_OF_MAPS ?
> +}
> +
> +static __u32 map_count_elements(__u32 type, int map_fd)
> +{
> +	__u32 key = -1;
> +	int n = 0;
> +
> +	while (!bpf_map_get_next_key(map_fd, &key, &key))
> +		n++;
> +	return n;
> +}
> +
> +#define BATCH	true
> +
> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> +{
> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> +	void *in_batch = NULL, *out_batch;
> +	__u32 save_count = count;
> +	int ret;
> +
> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> +					      &in_batch, &out_batch,
> +					      keys, values, &count,
> +					      NULL);
> +
> +	/*
> +	 * Despite what uapi header says, lookup_and_delete_batch will return
> +	 * -ENOENT in case we successfully have deleted all elements, so check
> +	 * this separately
> +	 */

It seems to be a bug in __htab_map_lookup_and_delete_batch(). I could
post a patch to fix it if you don't plan to do that yourself.
> +	CHECK(ret < 0 && (errno != ENOENT || !count), "bpf_map_lookup_and_delete_batch",
> +		       "error: %s\n", strerror(errno));
> +
> +	CHECK(count != save_count,
> +			"bpf_map_lookup_and_delete_batch",
> +			"deleted not all elements: removed=%u expected=%u\n",
> +			count, save_count);
> +}
> +
SNIP
> +static __u32 get_cur_elements(int map_id)
> +{
> +	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
> +	union bpf_iter_link_info linfo;
> +	struct map_percpu_stats *skel;
> +	struct bpf_link *link;
> +	__u32 n_elements;
> +	int iter_fd;
> +	int ret;
> +
> +	opts.link_info = &linfo;
> +	opts.link_info_len = sizeof(linfo);
> +
> +	skel = map_percpu_stats__open();
> +	CHECK(skel == NULL, "map_percpu_stats__open", "error: %s", strerror(errno));
> +
> +	skel->bss->target_id = map_id;
> +
> +	ret = map_percpu_stats__load(skel);
> +	CHECK(ret != 0, "map_percpu_stats__load", "error: %s", strerror(errno));
> +
> +	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, &opts);

Instead of passing an uninitialized opts, I think using NULL will be fine
here, because there are no options for the bpf map iterator now.
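
For illustration, the suggested simplification would be roughly the following
sketch (bpf_program__attach_iter() accepts a NULL opts pointer, so the opts
and linfo declarations above could then be dropped):

	/* no bpf_iter_attach_opts needed: the bpf_map iterator takes no options */
	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
	CHECK(!link, "bpf_program__attach_iter", "error: %s\n", strerror(errno));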
Anton Protopopov July 6, 2023, 11:54 a.m. UTC | #2
On Thu, Jul 06, 2023 at 06:49:02PM +0800, Hou Tao wrote:
> Hi,
> 
> On 7/6/2023 12:01 AM, Anton Protopopov wrote:
> > Add a new map test, map_percpu_stats.c, which is checking the correctness of
> > map's percpu elements counters.  For supported maps the test upserts a number
> > of elements, checks the correctness of the counters, then deletes all the
> > elements and checks again that the counters sum drops down to zero.
> >
> > The following map types are tested:
> >
> >     * BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC
> >     * BPF_MAP_TYPE_PERCPU_HASH, BPF_F_NO_PREALLOC
> >     * BPF_MAP_TYPE_HASH,
> >     * BPF_MAP_TYPE_PERCPU_HASH,
> >     * BPF_MAP_TYPE_LRU_HASH
> >     * BPF_MAP_TYPE_LRU_PERCPU_HASH
> >     * BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU
> >     * BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU
> >     * BPF_MAP_TYPE_HASH_OF_MAPS
> >
> > Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> 
> Acked-by: Hou Tao <houtao1@huawei.com>
> 
> With two nits below.

Thanks, fixed both for v5.

> > +
> > +static const char *map_type_to_s(__u32 type)
> > +{
> > +	switch (type) {
> > +	case BPF_MAP_TYPE_HASH:
> > +		return "HASH";
> > +	case BPF_MAP_TYPE_PERCPU_HASH:
> > +		return "PERCPU_HASH";
> > +	case BPF_MAP_TYPE_LRU_HASH:
> > +		return "LRU_HASH";
> > +	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
> > +		return "LRU_PERCPU_HASH";
> > +	default:
> > +		return "<define-me>";
> > +	}
> Missing BPF_MAP_TYPE_HASH_OF_MAPS ?
> > +}
> > +
> > +static __u32 map_count_elements(__u32 type, int map_fd)
> > +{
> > +	__u32 key = -1;
> > +	int n = 0;
> > +
> > +	while (!bpf_map_get_next_key(map_fd, &key, &key))
> > +		n++;
> > +	return n;
> > +}
> > +
> > +#define BATCH	true
> > +
> > +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> > +{
> > +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> > +	void *in_batch = NULL, *out_batch;
> > +	__u32 save_count = count;
> > +	int ret;
> > +
> > +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> > +					      &in_batch, &out_batch,
> > +					      keys, values, &count,
> > +					      NULL);
> > +
> > +	/*
> > +	 * Despite what uapi header says, lookup_and_delete_batch will return
> > +	 * -ENOENT in case we successfully have deleted all elements, so check
> > +	 * this separately
> > +	 */
> 
> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
> post a patch to fix it if you don't plan to do that by yourself.

This should be as simple as

@@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
        total += bucket_cnt;
        batch++;
        if (batch >= htab->n_buckets) {
-               ret = -ENOENT;
+               if (!total)
+                       ret = -ENOENT;
                goto after_loop;
        }
        goto again;

However, this might already be relied upon by some apps to check that they've read
all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
map_tests/htab_map_batch_ops.c. Another example I know of is from BCC tools:
https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58

Can we update the comments in include/uapi/linux/bpf.h?

> > +	CHECK(ret < 0 && (errno != ENOENT || !count), "bpf_map_lookup_and_delete_batch",
> > +		       "error: %s\n", strerror(errno));
> > +
> > +	CHECK(count != save_count,
> > +			"bpf_map_lookup_and_delete_batch",
> > +			"deleted not all elements: removed=%u expected=%u\n",
> > +			count, save_count);
> > +}
> > +
> SNIP
> > +static __u32 get_cur_elements(int map_id)
> > +{
> > +	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
> > +	union bpf_iter_link_info linfo;
> > +	struct map_percpu_stats *skel;
> > +	struct bpf_link *link;
> > +	__u32 n_elements;
> > +	int iter_fd;
> > +	int ret;
> > +
> > +	opts.link_info = &linfo;
> > +	opts.link_info_len = sizeof(linfo);
> > +
> > +	skel = map_percpu_stats__open();
> > +	CHECK(skel == NULL, "map_percpu_stats__open", "error: %s", strerror(errno));
> > +
> > +	skel->bss->target_id = map_id;
> > +
> > +	ret = map_percpu_stats__load(skel);
> > +	CHECK(ret != 0, "map_percpu_stats__load", "error: %s", strerror(errno));
> > +
> > +	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, &opts);
> 
> Instead of passing a uninitialized opts, I think using NULL will be fine
> here because there is no option for bpf map iterator now.
>
Hou Tao July 6, 2023, 12:21 p.m. UTC | #3
Hi,

On 7/6/2023 7:54 PM, Anton Protopopov wrote:
> On Thu, Jul 06, 2023 at 06:49:02PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 7/6/2023 12:01 AM, Anton Protopopov wrote:
>>> Add a new map test, map_percpu_stats.c, which is checking the correctness of
>>> map's percpu elements counters.  For supported maps the test upserts a number
>>> of elements, checks the correctness of the counters, then deletes all the
>>> elements and checks again that the counters sum drops down to zero.
>>>
>>> The following map types are tested:
>>>
>>>     * BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC
>>>     * BPF_MAP_TYPE_PERCPU_HASH, BPF_F_NO_PREALLOC
>>>     * BPF_MAP_TYPE_HASH,
>>>     * BPF_MAP_TYPE_PERCPU_HASH,
>>>     * BPF_MAP_TYPE_LRU_HASH
>>>     * BPF_MAP_TYPE_LRU_PERCPU_HASH
>>>     * BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU
>>>     * BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU
>>>     * BPF_MAP_TYPE_HASH_OF_MAPS
>>>
>>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
>> Acked-by: Hou Tao <houtao1@huawei.com>
>>
>> With two nits below.
> Thanks, fixed both for v5.
Great.
SNIP
> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> +{
> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> +	void *in_batch = NULL, *out_batch;
> +	__u32 save_count = count;
> +	int ret;
> +
> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> +					      &in_batch, &out_batch,
> +					      keys, values, &count,
> +					      NULL);
> +
> +	/*
> +	 * Despite what uapi header says, lookup_and_delete_batch will return
> +	 * -ENOENT in case we successfully have deleted all elements, so check
> +	 * this separately
> +	 */
>> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
>> post a patch to fix it if you don't plan to do that by yourself.
> This should be as simple as
>
> @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>         total += bucket_cnt;
>         batch++;
>         if (batch >= htab->n_buckets) {
> -               ret = -ENOENT;
> +               if (!total)
> +                       ret = -ENOENT;
>                 goto after_loop;
>         }
>         goto again;

No. I think changing it to "if (max_count > total) ret = -ENOENT;" would
be more appropriate, because it means the requested count couldn't be
fulfilled, and it is also consistent with the comments in
include/uapi/linux/bpf.h.
>
> However, this might be already utilized by some apps to check that they've read
> all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
> map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
> https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
I think these use cases will be fine, because when the last element has
been successfully iterated and returned, the out_batch is also updated,
so if the batch op is called again, -ENOENT will be returned.
>
> Can we update comments in include/uapi/linux/bpf.h?
I think the comments are correct.
>
>
> .
Anton Protopopov July 6, 2023, 12:57 p.m. UTC | #4
On Thu, Jul 06, 2023 at 08:21:17PM +0800, Hou Tao wrote:
> Hi,
> 
> On 7/6/2023 7:54 PM, Anton Protopopov wrote:
> > On Thu, Jul 06, 2023 at 06:49:02PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 7/6/2023 12:01 AM, Anton Protopopov wrote:
> >>> Add a new map test, map_percpu_stats.c, which is checking the correctness of
> >>> map's percpu elements counters.  For supported maps the test upserts a number
> >>> of elements, checks the correctness of the counters, then deletes all the
> >>> elements and checks again that the counters sum drops down to zero.
> >>>
> >>> The following map types are tested:
> >>>
> >>>     * BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC
> >>>     * BPF_MAP_TYPE_PERCPU_HASH, BPF_F_NO_PREALLOC
> >>>     * BPF_MAP_TYPE_HASH,
> >>>     * BPF_MAP_TYPE_PERCPU_HASH,
> >>>     * BPF_MAP_TYPE_LRU_HASH
> >>>     * BPF_MAP_TYPE_LRU_PERCPU_HASH
> >>>     * BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU
> >>>     * BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU
> >>>     * BPF_MAP_TYPE_HASH_OF_MAPS
> >>>
> >>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> >> Acked-by: Hou Tao <houtao1@huawei.com>
> >>
> >> With two nits below.
> > Thanks, fixed both for v5.
> Great.
> SNIP
> > +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> > +{
> > +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> > +	void *in_batch = NULL, *out_batch;
> > +	__u32 save_count = count;
> > +	int ret;
> > +
> > +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> > +					      &in_batch, &out_batch,
> > +					      keys, values, &count,
> > +					      NULL);
> > +
> > +	/*
> > +	 * Despite what uapi header says, lookup_and_delete_batch will return
> > +	 * -ENOENT in case we successfully have deleted all elements, so check
> > +	 * this separately
> > +	 */
> >> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
> >> post a patch to fix it if you don't plan to do that by yourself.
> > This should be as simple as
> >
> > @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >         total += bucket_cnt;
> >         batch++;
> >         if (batch >= htab->n_buckets) {
> > -               ret = -ENOENT;
> > +               if (!total)
> > +                       ret = -ENOENT;
> >                 goto after_loop;
> >         }
> >         goto again;
> 
> No. I think changing it to "if (max_count > total) ret = -ENOENT;" will
> be more appropriate, because it means the requested count couldn't been
> fulfilled and it is also consistent with the comments in 
> include/uapi/linux/bpf.h

Say, I have a map of size N and I don't know how many entries there are.
Then I will do

    count=N
    lookup_and_delete(&count)

In this case we will walk through the whole map, reach the 'batch >=
htab->n_buckets' check, and set the count to the number of elements we read.

(Conversely, if there's no space to read a whole bucket, we check this
earlier in the function and return -ENOSPC.)

> > However, this might be already utilized by some apps to check that they've read
> > all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
> > map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
> > https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
> I think these use cases will be fine. Because when the last element has
> been successfully iterated and returned, the out_batch is also updated,
> so if the batch op is called again, -ENOENT will be returned.
> >
> > Can we update comments in include/uapi/linux/bpf.h?
> I think the comments are correct.

Currently we return -ENOENT as an indicator that (a) 'in_batch' is out of
bounds or (b) we reached the end of the map. So technically, this is an
optimization: if we read elements in a loop by passing 'in_batch'/'out_batch',
then even if we returned 0 in case (b), the next syscall would return -ENOENT,
because the new 'in_batch' would point out of bounds.

This also makes sense for a map which is empty: we reached the end of the map,
didn't find any elements, so we return -ENOENT (in contrast to saying
"all is ok, we read 0 elements").

So from my point of view -ENOENT makes sense. However, the comments say "Returns
zero on success", which doesn't look true to me, as I think that reading the
whole map in one syscall is a success :)
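
For reference, here is a minimal sketch of the pattern under discussion:
draining a hash map in one call and treating -ENOENT as "reached the end of
the map" rather than as a hard error (map_fd, keys and values are assumed to
be prepared by the caller, and N is the map's max_entries):

	__u32 count = N;
	void *in_batch = NULL, *out_batch;
	int ret;

	ret = bpf_map_lookup_and_delete_batch(map_fd, &in_batch, &out_batch,
					      keys, values, &count, NULL);
	if (ret < 0 && errno != ENOENT)
		return -1;	/* a real error, e.g. -ENOSPC or -EFAULT */
	/* count now holds the number of elements actually read and deleted;
	 * errno == ENOENT here only means that the whole map was traversed */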
Hou Tao July 7, 2023, 1:41 a.m. UTC | #5
Hi,

On 7/6/2023 8:57 PM, Anton Protopopov wrote:
> On Thu, Jul 06, 2023 at 08:21:17PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 7/6/2023 7:54 PM, Anton Protopopov wrote:
>>
SNIP
>>> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
>>> +{
>>> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
>>> +	void *in_batch = NULL, *out_batch;
>>> +	__u32 save_count = count;
>>> +	int ret;
>>> +
>>> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
>>> +					      &in_batch, &out_batch,
>>> +					      keys, values, &count,
>>> +					      NULL);
>>> +
>>> +	/*
>>> +	 * Despite what uapi header says, lookup_and_delete_batch will return
>>> +	 * -ENOENT in case we successfully have deleted all elements, so check
>>> +	 * this separately
>>> +	 */
>>>> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
>>>> post a patch to fix it if you don't plan to do that by yourself.
>>> This should be as simple as
>>>
>>> @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>         total += bucket_cnt;
>>>         batch++;
>>>         if (batch >= htab->n_buckets) {
>>> -               ret = -ENOENT;
>>> +               if (!total)
>>> +                       ret = -ENOENT;
>>>                 goto after_loop;
>>>         }
>>>         goto again;
>> No. I think changing it to "if (max_count > total) ret = -ENOENT;" will
>> be more appropriate, because it means the requested count couldn't been
>> fulfilled and it is also consistent with the comments in 
>> include/uapi/linux/bpf.h
> Say, I have a map of size N and I don't know how many entries there are.
> Then I will do
>
>     count=N
>     lookup_and_delete(&count)
>
> In this case we will walk through the whole map, reach the 'batch >=
> htab->n_buckets', and set the count to the number of elements we read.
>
> (If, in opposite, there's no space to read a whole bucket, then we check this
> above and return -ENOSPC.)
>
>>> However, this might be already utilized by some apps to check that they've read
>>> all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
>>> map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
>>> https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
>> I think these use cases will be fine. Because when the last element has
>> been successfully iterated and returned, the out_batch is also updated,
>> so if the batch op is called again, -ENOENT will be returned.
>>> Can we update comments in include/uapi/linux/bpf.h?
>> I think the comments are correct.
> Currently we return -ENOENT as an indicator that (a) 'in_batch' is out of
> bounds (b) we reached the end of map. So technically, this is an optimization,
> as if we read elements in a loop by passing 'in_batch', 'out_batch', even if we
> return 0 in case (b), the next syscall would return -ENOENT, because the new
> 'in_batch' would point to out of bounds.
>
> This also makes sense for a map which is empty: we reached the end of map,
> didn't find any elements, so we're returning -ENOENT (in contrast with saying
> "all is ok, we read 0 elements").
>
> So from my point of view -ENOENT makes sense. However, comments say "Returns
> zero on success" which doesn't look true to me as I think that reading the
> whole map in one syscall is a success :)

I get your point. The current implementation of BPF_MAP_LOOKUP_BATCH
does the following two things:
1) returns 0 when the whole map has not been iterated but there is no
space left for the current bucket.
2) doesn't return 0 when the whole map has been iterated successfully
(and the requested count is fulfilled).

For 1) I prefer to update the comments in the uapi header. If instead we fix
the implementation, we may break existing users which need to check
-ENOSPC to continue the batch op.
For 2) I don't have a preference: updating either the comments or the
implementation is fine with me.

WDYT?
Anton Protopopov July 7, 2023, 7:28 a.m. UTC | #6
On Fri, Jul 07, 2023 at 09:41:03AM +0800, Hou Tao wrote:
> Hi,
> 
> On 7/6/2023 8:57 PM, Anton Protopopov wrote:
> > On Thu, Jul 06, 2023 at 08:21:17PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 7/6/2023 7:54 PM, Anton Protopopov wrote:
> >>
> SNIP
> >>> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> >>> +{
> >>> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> >>> +	void *in_batch = NULL, *out_batch;
> >>> +	__u32 save_count = count;
> >>> +	int ret;
> >>> +
> >>> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> >>> +					      &in_batch, &out_batch,
> >>> +					      keys, values, &count,
> >>> +					      NULL);
> >>> +
> >>> +	/*
> >>> +	 * Despite what uapi header says, lookup_and_delete_batch will return
> >>> +	 * -ENOENT in case we successfully have deleted all elements, so check
> >>> +	 * this separately
> >>> +	 */
> >>>> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
> >>>> post a patch to fix it if you don't plan to do that by yourself.
> >>> This should be as simple as
> >>>
> >>> @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>         total += bucket_cnt;
> >>>         batch++;
> >>>         if (batch >= htab->n_buckets) {
> >>> -               ret = -ENOENT;
> >>> +               if (!total)
> >>> +                       ret = -ENOENT;
> >>>                 goto after_loop;
> >>>         }
> >>>         goto again;
> >> No. I think changing it to "if (max_count > total) ret = -ENOENT;" will
> >> be more appropriate, because it means the requested count couldn't been
> >> fulfilled and it is also consistent with the comments in 
> >> include/uapi/linux/bpf.h
> > Say, I have a map of size N and I don't know how many entries there are.
> > Then I will do
> >
> >     count=N
> >     lookup_and_delete(&count)
> >
> > In this case we will walk through the whole map, reach the 'batch >=
> > htab->n_buckets', and set the count to the number of elements we read.
> >
> > (If, in opposite, there's no space to read a whole bucket, then we check this
> > above and return -ENOSPC.)
> >
> >>> However, this might be already utilized by some apps to check that they've read
> >>> all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
> >>> map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
> >>> https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
> >> I think these use cases will be fine. Because when the last element has
> >> been successfully iterated and returned, the out_batch is also updated,
> >> so if the batch op is called again, -ENOENT will be returned.
> >>> Can we update comments in include/uapi/linux/bpf.h?
> >> I think the comments are correct.
> > Currently we return -ENOENT as an indicator that (a) 'in_batch' is out of
> > bounds (b) we reached the end of map. So technically, this is an optimization,
> > as if we read elements in a loop by passing 'in_batch', 'out_batch', even if we
> > return 0 in case (b), the next syscall would return -ENOENT, because the new
> > 'in_batch' would point to out of bounds.
> >
> > This also makes sense for a map which is empty: we reached the end of map,
> > didn't find any elements, so we're returning -ENOENT (in contrast with saying
> > "all is ok, we read 0 elements").
> >
> > So from my point of view -ENOENT makes sense. However, comments say "Returns
> > zero on success" which doesn't look true to me as I think that reading the
> > whole map in one syscall is a success :)
> 
> I get your point. The current implementation of BPF_MAP_LOOKUP_BATCH
> does the following two things:
> 1) returns 0 when the whole map has not been iterated but there is no
> space for current bucket.

The algorithm works per bucket. For a bucket number X it checks if there is
enough space in the output buffer to store all of the bucket's elements. If
there is, it moves on to the next bucket. If not, it checks whether any
elements were already written [from previous buckets]. If none were, it
returns -ENOSPC, meaning "you've asked to copy at most N elements, but I can
only copy M > N, not less, so please provide a bigger buffer."
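
Roughly, the decision being described looks like this (a simplified sketch of
the logic in __htab_map_lookup_and_delete_batch(), not the verbatim kernel
code):

	if (bucket_cnt > max_count - total) {
		/* the whole bucket does not fit into what is left of the buffer */
		if (total == 0)
			ret = -ENOSPC;	/* nothing copied yet: buffer is too small */
		/* otherwise return what has been copied so far */
		goto after_loop;
	}
	/* copy the bucket, then: total += bucket_cnt; batch++; */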

> 2) doesn't return 0 when the whole map has been iterated successfully
> (and the requested count is fulfilled)
>
> For 1) I prefer to update the comments in uapi. If instead we fix the
> implementation, we may break the existed users which need to check
> ENOSPC to continue the batch op.
> For 2) I don't have a preference. Both updating the comments and
> implementation are fine to me.
> 
> WDYT ?
> 

I think that (1) is perfectly fine: -ENOSPC is returned only when we can't copy
any elements, which is an error.

(2) requires updating the docs. The API is similar to get_next_key, and the docs
can be updated in the same way. By updating the docs we're not changing any uapi,
right?
Hou Tao July 7, 2023, 9:40 a.m. UTC | #7
Hi,

On 7/7/2023 3:28 PM, Anton Protopopov wrote:
> On Fri, Jul 07, 2023 at 09:41:03AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 7/6/2023 8:57 PM, Anton Protopopov wrote:
>>> On Thu, Jul 06, 2023 at 08:21:17PM +0800, Hou Tao wrote:
>>>> Hi,
>>>>
>>>> On 7/6/2023 7:54 PM, Anton Protopopov wrote:
>>>>
>> SNIP
>>>>> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
>>>>> +{
>>>>> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
>>>>> +	void *in_batch = NULL, *out_batch;
>>>>> +	__u32 save_count = count;
>>>>> +	int ret;
>>>>> +
>>>>> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
>>>>> +					      &in_batch, &out_batch,
>>>>> +					      keys, values, &count,
>>>>> +					      NULL);
>>>>> +
>>>>> +	/*
>>>>> +	 * Despite what uapi header says, lookup_and_delete_batch will return
>>>>> +	 * -ENOENT in case we successfully have deleted all elements, so check
>>>>> +	 * this separately
>>>>> +	 */
>>>>>> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
>>>>>> post a patch to fix it if you don't plan to do that by yourself.
>>>>> This should be as simple as
>>>>>
>>>>> @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>>>         total += bucket_cnt;
>>>>>         batch++;
>>>>>         if (batch >= htab->n_buckets) {
>>>>> -               ret = -ENOENT;
>>>>> +               if (!total)
>>>>> +                       ret = -ENOENT;
>>>>>                 goto after_loop;
>>>>>         }
>>>>>         goto again;
>>>> No. I think changing it to "if (max_count > total) ret = -ENOENT;" will
>>>> be more appropriate, because it means the requested count couldn't been
>>>> fulfilled and it is also consistent with the comments in 
>>>> include/uapi/linux/bpf.h
>>> Say, I have a map of size N and I don't know how many entries there are.
>>> Then I will do
>>>
>>>     count=N
>>>     lookup_and_delete(&count)
>>>
>>> In this case we will walk through the whole map, reach the 'batch >=
>>> htab->n_buckets', and set the count to the number of elements we read.
>>>
>>> (If, in opposite, there's no space to read a whole bucket, then we check this
>>> above and return -ENOSPC.)
>>>
>>>>> However, this might be already utilized by some apps to check that they've read
>>>>> all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
>>>>> map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
>>>>> https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
>>>> I think these use cases will be fine. Because when the last element has
>>>> been successfully iterated and returned, the out_batch is also updated,
>>>> so if the batch op is called again, -ENOENT will be returned.
>>>>> Can we update comments in include/uapi/linux/bpf.h?
>>>> I think the comments are correct.
>>> Currently we return -ENOENT as an indicator that (a) 'in_batch' is out of
>>> bounds (b) we reached the end of map. So technically, this is an optimization,
>>> as if we read elements in a loop by passing 'in_batch', 'out_batch', even if we
>>> return 0 in case (b), the next syscall would return -ENOENT, because the new
>>> 'in_batch' would point to out of bounds.
>>>
>>> This also makes sense for a map which is empty: we reached the end of map,
>>> didn't find any elements, so we're returning -ENOENT (in contrast with saying
>>> "all is ok, we read 0 elements").
>>>
>>> So from my point of view -ENOENT makes sense. However, comments say "Returns
>>> zero on success" which doesn't look true to me as I think that reading the
>>> whole map in one syscall is a success :)
>> I get your point. The current implementation of BPF_MAP_LOOKUP_BATCH
>> does the following two things:
>> 1) returns 0 when the whole map has not been iterated but there is no
>> space for current bucket.
> The algorithm works per bucket. For a bucket number X it checks if there is
> enough space in the output buffer to store all bucket elements. If there is,
> ok, go to the next bucket. If not, then it checks if any elements were written
> already [from previous buckets]. If not, then it returns -ENOSPC, meaning,
> "you've asked to copy at most N elements, but I can only copy M > N, not less,
> please provide a bigger buffer."
Yes.
>
>> 2) doesn't return 0 when the whole map has been iterated successfully
>> (and the requested count is fulfilled)
>>
>> For 1) I prefer to update the comments in uapi. If instead we fix the
>> implementation, we may break the existed users which need to check
>> ENOSPC to continue the batch op.
>> For 2) I don't have a preference. Both updating the comments and
>> implementation are fine to me.
>>
>> WDYT ?
>>
> I think that (1) is perfectly fine, -ENOSPC is returned only when we can't copy
> elements, which is an error.

Maybe I misinterpreted the comments in bpf.h. Going by the comment
"On success, *count* elements from the map are copied into the user
buffer", I think the count here means the value of count that is used
as input, not as output.
>
> The (2) requires updating docs. The API is similar to get_next_key, and docs
> can be updated in the same way. By updating docs we're not changing any uapi,
> right?
I think it is fine.
Anton Protopopov July 7, 2023, 11:06 a.m. UTC | #8
On Fri, Jul 07, 2023 at 05:40:28PM +0800, Hou Tao wrote:
> Hi,
> 
> On 7/7/2023 3:28 PM, Anton Protopopov wrote:
> > On Fri, Jul 07, 2023 at 09:41:03AM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 7/6/2023 8:57 PM, Anton Protopopov wrote:
> >>> On Thu, Jul 06, 2023 at 08:21:17PM +0800, Hou Tao wrote:
> >>>> Hi,
> >>>>
> >>>> On 7/6/2023 7:54 PM, Anton Protopopov wrote:
> >>>>
> >> SNIP
> >>>>> +static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
> >>>>> +{
> >>>>> +	static __u8 values[(8 << 10) * MAX_ENTRIES];
> >>>>> +	void *in_batch = NULL, *out_batch;
> >>>>> +	__u32 save_count = count;
> >>>>> +	int ret;
> >>>>> +
> >>>>> +	ret = bpf_map_lookup_and_delete_batch(map_fd,
> >>>>> +					      &in_batch, &out_batch,
> >>>>> +					      keys, values, &count,
> >>>>> +					      NULL);
> >>>>> +
> >>>>> +	/*
> >>>>> +	 * Despite what uapi header says, lookup_and_delete_batch will return
> >>>>> +	 * -ENOENT in case we successfully have deleted all elements, so check
> >>>>> +	 * this separately
> >>>>> +	 */
> >>>>>> It seems it is a bug in __htab_map_lookup_and_delete_batch(). I could
> >>>>>> post a patch to fix it if you don't plan to do that by yourself.
> >>>>> This should be as simple as
> >>>>>
> >>>>> @@ -1876,7 +1876,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>>>         total += bucket_cnt;
> >>>>>         batch++;
> >>>>>         if (batch >= htab->n_buckets) {
> >>>>> -               ret = -ENOENT;
> >>>>> +               if (!total)
> >>>>> +                       ret = -ENOENT;
> >>>>>                 goto after_loop;
> >>>>>         }
> >>>>>         goto again;
> >>>> No. I think changing it to "if (max_count > total) ret = -ENOENT;" will
> >>>> be more appropriate, because it means the requested count couldn't been
> >>>> fulfilled and it is also consistent with the comments in 
> >>>> include/uapi/linux/bpf.h
> >>> Say, I have a map of size N and I don't know how many entries there are.
> >>> Then I will do
> >>>
> >>>     count=N
> >>>     lookup_and_delete(&count)
> >>>
> >>> In this case we will walk through the whole map, reach the 'batch >=
> >>> htab->n_buckets', and set the count to the number of elements we read.
> >>>
> >>> (If, in opposite, there's no space to read a whole bucket, then we check this
> >>> above and return -ENOSPC.)
> >>>
> >>>>> However, this might be already utilized by some apps to check that they've read
> >>>>> all entries. Two local examples are map_tests/map_in_map_batch_ops.c and
> >>>>> map_tests/htab_map_batch_ops.c. Another example I know is from BCC tools:
> >>>>> https://github.com/iovisor/bcc/blob/master/libbpf-tools/map_helpers.c#L58
> >>>> I think these use cases will be fine. Because when the last element has
> >>>> been successfully iterated and returned, the out_batch is also updated,
> >>>> so if the batch op is called again, -ENOENT will be returned.
> >>>>> Can we update comments in include/uapi/linux/bpf.h?
> >>>> I think the comments are correct.
> >>> Currently we return -ENOENT as an indicator that (a) 'in_batch' is out of
> >>> bounds (b) we reached the end of map. So technically, this is an optimization,
> >>> as if we read elements in a loop by passing 'in_batch', 'out_batch', even if we
> >>> return 0 in case (b), the next syscall would return -ENOENT, because the new
> >>> 'in_batch' would point to out of bounds.
> >>>
> >>> This also makes sense for a map which is empty: we reached the end of map,
> >>> didn't find any elements, so we're returning -ENOENT (in contrast with saying
> >>> "all is ok, we read 0 elements").
> >>>
> >>> So from my point of view -ENOENT makes sense. However, comments say "Returns
> >>> zero on success" which doesn't look true to me as I think that reading the
> >>> whole map in one syscall is a success :)
> >> I get your point. The current implementation of BPF_MAP_LOOKUP_BATCH
> >> does the following two things:
> >> 1) returns 0 when the whole map has not been iterated but there is no
> >> space for current bucket.
> > The algorithm works per bucket. For a bucket number X it checks if there is
> > enough space in the output buffer to store all bucket elements. If there is,
> > ok, go to the next bucket. If not, then it checks if any elements were written
> > already [from previous buckets]. If not, then it returns -ENOSPC, meaning,
> > "you've asked to copy at most N elements, but I can only copy M > N, not less,
> > please provide a bigger buffer."
> Yes.
> >
> >> 2) doesn't return 0 when the whole map has been iterated successfully
> >> (and the requested count is fulfilled)
> >>
> >> For 1) I prefer to update the comments in uapi. If instead we fix the
> >> implementation, we may break the existed users which need to check
> >> ENOSPC to continue the batch op.
> >> For 2) I don't have a preference. Both updating the comments and
> >> implementation are fine to me.
> >>
> >> WDYT ?
> >>
> > I think that (1) is perfectly fine, -ENOSPC is returned only when we can't copy
> > elements, which is an error.
> 
> Maybe I misinterpreted the comments in bpf.h. As said in the comment:
> "On success, *count* elements from the map are copied into the user
> buffer",  I think the count here means the value of count which is used
> as input instead of output.

Yes, that may also be updated, as this is actually "up to *count*" (*count* is an
output parameter which is not mentioned in the LOOKUP_BATCH description, only
in LOOKUP_AND_DELETE_BATCH). On the other hand, the LOOKUP_AND_DELETE_BATCH
comment says "delete at least count", which is not accurate either.

> > The (2) requires updating docs. The API is similar to get_next_key, and docs
> > can be updated in the same way. By updating docs we're not changing any uapi,
> > right?
> I think it is fine.

Patch

diff --git a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c
new file mode 100644
index 000000000000..37df564d9604
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c
@@ -0,0 +1,450 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <errno.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <bpf_util.h>
+#include <test_maps.h>
+
+#include "map_percpu_stats.skel.h"
+
+#define MAX_ENTRIES			16384
+#define MAX_ENTRIES_HASH_OF_MAPS	64
+#define N_THREADS			8
+#define MAX_MAP_KEY_SIZE		4
+
+static void map_info(int map_fd, struct bpf_map_info *info)
+{
+	__u32 len = sizeof(*info);
+	int ret;
+
+	memset(info, 0, sizeof(*info));
+
+	ret = bpf_obj_get_info_by_fd(map_fd, info, &len);
+	CHECK(ret < 0, "bpf_obj_get_info_by_fd", "error: %s\n", strerror(errno));
+}
+
+static const char *map_type_to_s(__u32 type)
+{
+	switch (type) {
+	case BPF_MAP_TYPE_HASH:
+		return "HASH";
+	case BPF_MAP_TYPE_PERCPU_HASH:
+		return "PERCPU_HASH";
+	case BPF_MAP_TYPE_LRU_HASH:
+		return "LRU_HASH";
+	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+		return "LRU_PERCPU_HASH";
+	default:
+		return "<define-me>";
+	}
+}
+
+static __u32 map_count_elements(__u32 type, int map_fd)
+{
+	__u32 key = -1;
+	int n = 0;
+
+	while (!bpf_map_get_next_key(map_fd, &key, &key))
+		n++;
+	return n;
+}
+
+#define BATCH	true
+
+static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
+{
+	static __u8 values[(8 << 10) * MAX_ENTRIES];
+	void *in_batch = NULL, *out_batch;
+	__u32 save_count = count;
+	int ret;
+
+	ret = bpf_map_lookup_and_delete_batch(map_fd,
+					      &in_batch, &out_batch,
+					      keys, values, &count,
+					      NULL);
+
+	/*
+	 * Despite what uapi header says, lookup_and_delete_batch will return
+	 * -ENOENT in case we successfully have deleted all elements, so check
+	 * this separately
+	 */
+	CHECK(ret < 0 && (errno != ENOENT || !count), "bpf_map_lookup_and_delete_batch",
+		       "error: %s\n", strerror(errno));
+
+	CHECK(count != save_count,
+			"bpf_map_lookup_and_delete_batch",
+			"deleted not all elements: removed=%u expected=%u\n",
+			count, save_count);
+}
+
+static void delete_all_elements(__u32 type, int map_fd, bool batch)
+{
+	static __u8 val[8 << 10]; /* enough for 1024 CPUs */
+	__u32 key = -1;
+	void *keys;
+	__u32 i, n;
+	int ret;
+
+	keys = calloc(MAX_MAP_KEY_SIZE, MAX_ENTRIES);
+	CHECK(!keys, "calloc", "error: %s\n", strerror(errno));
+
+	for (n = 0; !bpf_map_get_next_key(map_fd, &key, &key); n++)
+		memcpy(keys + n*MAX_MAP_KEY_SIZE, &key, MAX_MAP_KEY_SIZE);
+
+	if (batch) {
+		/* Can't mix delete_batch and delete_and_lookup_batch because
+		 * they have different semantics in relation to the keys
+	 * argument. However, delete_batch utilizes map_delete_elem,
+	 * so we actually test it in the non-batch scenario */
+		delete_and_lookup_batch(map_fd, keys, n);
+	} else {
+		/* Intentionally mix delete and lookup_and_delete so we can test both */
+		for (i = 0; i < n; i++) {
+			void *keyp = keys + i*MAX_MAP_KEY_SIZE;
+
+			if (i % 2 || type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+				ret = bpf_map_delete_elem(map_fd, keyp);
+				CHECK(ret < 0, "bpf_map_delete_elem",
+					       "error: key %u: %s\n", i, strerror(errno));
+			} else {
+				ret = bpf_map_lookup_and_delete_elem(map_fd, keyp, val);
+				CHECK(ret < 0, "bpf_map_lookup_and_delete_elem",
+					       "error: key %u: %s\n", i, strerror(errno));
+			}
+		}
+	}
+
+	free(keys);
+}
+
+static bool is_lru(__u32 map_type)
+{
+	return map_type == BPF_MAP_TYPE_LRU_HASH ||
+	       map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+struct upsert_opts {
+	__u32 map_type;
+	int map_fd;
+	__u32 n;
+};
+
+static int create_small_hash(void)
+{
+	int map_fd;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "small", 4, 4, 4, NULL);
+	CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n",
+			strerror(errno), "small");
+
+	return map_fd;
+}
+
+static void *patch_map_thread(void *arg)
+{
+	struct upsert_opts *opts = arg;
+	int val;
+	int ret;
+	int i;
+
+	for (i = 0; i < opts->n; i++) {
+		if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+			val = create_small_hash();
+		else
+			val = rand();
+		ret = bpf_map_update_elem(opts->map_fd, &i, &val, 0);
+		CHECK(ret < 0, "bpf_map_update_elem", "key=%d error: %s\n", i, strerror(errno));
+
+		if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+			close(val);
+	}
+	return NULL;
+}
+
+static void upsert_elements(struct upsert_opts *opts)
+{
+	pthread_t threads[N_THREADS];
+	int ret;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(threads); i++) {
+		ret = pthread_create(&i[threads], NULL, patch_map_thread, opts);
+		CHECK(ret != 0, "pthread_create", "error: %s\n", strerror(ret));
+	}
+
+	for (i = 0; i < ARRAY_SIZE(threads); i++) {
+		ret = pthread_join(i[threads], NULL);
+		CHECK(ret != 0, "pthread_join", "error: %s\n", strerror(ret));
+	}
+}
+
+static __u32 read_cur_elements(int iter_fd)
+{
+	char buf[64];
+	ssize_t n;
+	__u32 ret;
+
+	n = read(iter_fd, buf, sizeof(buf)-1);
+	CHECK(n <= 0, "read", "error: %s\n", strerror(errno));
+	buf[n] = '\0';
+
+	errno = 0;
+	ret = (__u32)strtol(buf, NULL, 10);
+	CHECK(errno != 0, "strtol", "error: %s\n", strerror(errno));
+
+	return ret;
+}
+
+static __u32 get_cur_elements(int map_id)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	struct map_percpu_stats *skel;
+	struct bpf_link *link;
+	__u32 n_elements;
+	int iter_fd;
+	int ret;
+
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	skel = map_percpu_stats__open();
+	CHECK(skel == NULL, "map_percpu_stats__open", "error: %s", strerror(errno));
+
+	skel->bss->target_id = map_id;
+
+	ret = map_percpu_stats__load(skel);
+	CHECK(ret != 0, "map_percpu_stats__load", "error: %s", strerror(errno));
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, &opts);
+	CHECK(!link, "bpf_program__attach_iter", "error: %s\n", strerror(errno));
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	CHECK(iter_fd < 0, "bpf_iter_create", "error: %s\n", strerror(errno));
+
+	n_elements = read_cur_elements(iter_fd);
+
+	close(iter_fd);
+	bpf_link__destroy(link);
+	map_percpu_stats__destroy(skel);
+
+	return n_elements;
+}
+
+static void check_expected_number_elements(__u32 n_inserted, int map_fd,
+					   struct bpf_map_info *info)
+{
+	__u32 n_real;
+	__u32 n_iter;
+
+	/* Count the current number of elements in the map by iterating through
+	 * all the map keys via bpf_get_next_key
+	 */
+	n_real = map_count_elements(info->type, map_fd);
+
+	/* The "real" number of elements should be the same as the inserted
+	 * number of elements in all cases except LRU maps, where some elements
+	 * may have been evicted
+	 */
+	if (n_inserted == 0 || !is_lru(info->type))
+		CHECK(n_inserted != n_real, "map_count_elements",
+		      "n_real(%u) != n_inserted(%u)\n", n_real, n_inserted);
+
+	/* Count the current number of elements in the map using an iterator */
+	n_iter = get_cur_elements(info->id);
+
+	/* Both counts should be the same, as all updates are over */
+	CHECK(n_iter != n_real, "get_cur_elements",
+	      "n_iter=%u, expected %u (map_type=%s,map_flags=%08x)\n",
+	      n_iter, n_real, map_type_to_s(info->type), info->map_flags);
+}
+
+static void __test(int map_fd)
+{
+	struct upsert_opts opts = {
+		.map_fd = map_fd,
+	};
+	struct bpf_map_info info;
+
+	map_info(map_fd, &info);
+	opts.map_type = info.type;
+	opts.n = info.max_entries;
+
+	/* Reduce the number of elements we are updating such that we don't
+	 * bump into -E2BIG from non-preallocated hash maps, but still will
+	 * have some evictions for LRU maps  */
+	if (opts.map_type != BPF_MAP_TYPE_HASH_OF_MAPS)
+		opts.n -= 512;
+	else
+		opts.n /= 2;
+
+	/*
+	 * Upsert keys [0, n) under some competition: with random values from
+	 * N_THREADS threads. Check values, then delete all elements and check
+	 * values again.
+	 */
+	upsert_elements(&opts);
+	check_expected_number_elements(opts.n, map_fd, &info);
+	delete_all_elements(info.type, map_fd, !BATCH);
+	check_expected_number_elements(0, map_fd, &info);
+
+	/* Now do the same, but using batch delete operations */
+	upsert_elements(&opts);
+	check_expected_number_elements(opts.n, map_fd, &info);
+	delete_all_elements(info.type, map_fd, BATCH);
+	check_expected_number_elements(0, map_fd, &info);
+
+	close(map_fd);
+}
+
+static int map_create_opts(__u32 type, const char *name,
+			   struct bpf_map_create_opts *map_opts,
+			   __u32 key_size, __u32 val_size)
+{
+	int max_entries;
+	int map_fd;
+
+	if (type == BPF_MAP_TYPE_HASH_OF_MAPS)
+		max_entries = MAX_ENTRIES_HASH_OF_MAPS;
+	else
+		max_entries = MAX_ENTRIES;
+
+	map_fd = bpf_map_create(type, name, key_size, val_size, max_entries, map_opts);
+	CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n",
+			strerror(errno), name);
+
+	return map_fd;
+}
+
+static int map_create(__u32 type, const char *name, struct bpf_map_create_opts *map_opts)
+{
+	return map_create_opts(type, name, map_opts, sizeof(int), sizeof(int));
+}
+
+static int create_hash(void)
+{
+	struct bpf_map_create_opts map_opts = {
+		.sz = sizeof(map_opts),
+		.map_flags = BPF_F_NO_PREALLOC,
+	};
+
+	return map_create(BPF_MAP_TYPE_HASH, "hash", &map_opts);
+}
+
+static int create_percpu_hash(void)
+{
+	struct bpf_map_create_opts map_opts = {
+		.sz = sizeof(map_opts),
+		.map_flags = BPF_F_NO_PREALLOC,
+	};
+
+	return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash", &map_opts);
+}
+
+static int create_hash_prealloc(void)
+{
+	return map_create(BPF_MAP_TYPE_HASH, "hash", NULL);
+}
+
+static int create_percpu_hash_prealloc(void)
+{
+	return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash_prealloc", NULL);
+}
+
+static int create_lru_hash(__u32 type, __u32 map_flags)
+{
+	struct bpf_map_create_opts map_opts = {
+		.sz = sizeof(map_opts),
+		.map_flags = map_flags,
+	};
+
+	return map_create(type, "lru_hash", &map_opts);
+}
+
+static int create_hash_of_maps(void)
+{
+	struct bpf_map_create_opts map_opts = {
+		.sz = sizeof(map_opts),
+		.map_flags = BPF_F_NO_PREALLOC,
+		.inner_map_fd = create_small_hash(),
+	};
+	int ret;
+
+	ret = map_create_opts(BPF_MAP_TYPE_HASH_OF_MAPS, "hash_of_maps",
+			      &map_opts, sizeof(int), sizeof(int));
+	close(map_opts.inner_map_fd);
+	return ret;
+}
+
+static void map_percpu_stats_hash(void)
+{
+	__test(create_hash());
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_percpu_hash(void)
+{
+	__test(create_percpu_hash());
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_hash_prealloc(void)
+{
+	__test(create_hash_prealloc());
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_percpu_hash_prealloc(void)
+{
+	__test(create_percpu_hash_prealloc());
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_lru_hash(void)
+{
+	__test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, 0));
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_lru_hash_no_common(void)
+{
+	__test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU));
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_percpu_lru_hash(void)
+{
+	__test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0));
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_percpu_lru_hash_no_common(void)
+{
+	__test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU));
+	printf("test_%s:PASS\n", __func__);
+}
+
+static void map_percpu_stats_hash_of_maps(void)
+{
+	__test(create_hash_of_maps());
+	printf("test_%s:PASS\n", __func__);
+}
+
+void test_map_percpu_stats(void)
+{
+	map_percpu_stats_hash();
+	map_percpu_stats_percpu_hash();
+	map_percpu_stats_hash_prealloc();
+	map_percpu_stats_percpu_hash_prealloc();
+	map_percpu_stats_lru_hash();
+	map_percpu_stats_lru_hash_no_common();
+	map_percpu_stats_percpu_lru_hash();
+	map_percpu_stats_percpu_lru_hash_no_common();
+	map_percpu_stats_hash_of_maps();
+}
diff --git a/tools/testing/selftests/bpf/progs/map_percpu_stats.c b/tools/testing/selftests/bpf/progs/map_percpu_stats.c
new file mode 100644
index 000000000000..10b2325c1720
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/map_percpu_stats.c
@@ -0,0 +1,24 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+__u32 target_id;
+
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct bpf_map *map = ctx->map;
+
+	if (map && map->id == target_id)
+		BPF_SEQ_PRINTF(seq, "%lld", bpf_map_sum_elem_count(map));
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";