diff mbox series

[bpf-next,v3,3/3] selftests/bpf: Add benchmark for bounded/unbounded string kfuncs

Message ID ecb1300906ac106648a1bbfdd33895fb12275761.1741874348.git.vmalik@redhat.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Add kfuncs for read-only string operations | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 1 maintainers not CCed: linux-kselftest@vger.kernel.org
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning CHECK: Comparison to NULL could be written "bpf_strchr" CHECK: Comparison to NULL could be written "bpf_strnchr" CHECK: Comparison to NULL could be written "bpf_strnstr" CHECK: Comparison to NULL could be written "bpf_strstr" WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: externs should be avoided in .c files WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Viktor Malik March 24, 2025, 12:03 p.m. UTC
Add a new benchmark using the existing bench infrastructure which
compares performance of bounded and unbounded string kfuncs added in the
previous commits.

Running on x86_64 and arm64, the most significant difference is in the
strlen/strnlen and strstr/strnstr comparisons on arm64:

    strlen/strnlen
    ==============
    strlen-1             0.453 ± 0.002M/s (drops 0.000 ± 0.000M/s)
    strnlen-1            0.470 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strlen-8             0.459 ± 0.011M/s (drops 0.000 ± 0.000M/s)
    strnlen-8            0.451 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strlen-64            0.439 ± 0.007M/s (drops 0.000 ± 0.000M/s)
    strnlen-64           0.455 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strlen-512           0.359 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strnlen-512          0.441 ± 0.007M/s (drops 0.000 ± 0.000M/s)
    strlen-2048          0.232 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strnlen-2048         0.403 ± 0.005M/s (drops 0.000 ± 0.000M/s)
    strlen-4095          0.151 ± 0.001M/s (drops 0.000 ± 0.000M/s)
    strnlen-4095         0.362 ± 0.005M/s (drops 0.000 ± 0.000M/s)

    strstr/strnstr
    ==============
    strstr-8             0.452 ± 0.005M/s (drops 0.000 ± 0.000M/s)
    strnstr-8            0.442 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strstr-64            0.390 ± 0.004M/s (drops 0.000 ± 0.000M/s)
    strnstr-64           0.400 ± 0.004M/s (drops 0.000 ± 0.000M/s)
    strstr-512           0.228 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strnstr-512          0.256 ± 0.002M/s (drops 0.000 ± 0.000M/s)
    strstr-2048          0.095 ± 0.001M/s (drops 0.000 ± 0.000M/s)
    strnstr-2048         0.113 ± 0.001M/s (drops 0.000 ± 0.000M/s)
    strstr-4095          0.052 ± 0.001M/s (drops 0.000 ± 0.000M/s)
    strnstr-4095         0.064 ± 0.001M/s (drops 0.000 ± 0.000M/s)

For strings longer than 64B, the bounded variants are notably faster,
having as much as 140% performance gain over the unbounded variants
(strnlen for strings of length 4095). The reason is that arm64 has an
optimized implementation of strnlen in assembly which is also used
inside strnstr.

On x86_64, which doesn't have any optimized string operations, there is
still an observable difference in strlen/strnlen and strstr/strnstr,
albeit much smaller than for arm64:

    strlen/strnlen
    ==============
    strlen-1             7.021 ± 0.036M/s (drops 0.000 ± 0.000M/s)
    strnlen-1            7.000 ± 0.038M/s (drops 0.000 ± 0.000M/s)
    strlen-8             6.837 ± 0.011M/s (drops 0.000 ± 0.000M/s)
    strnlen-8            6.832 ± 0.064M/s (drops 0.000 ± 0.000M/s)
    strlen-64            5.638 ± 0.026M/s (drops 0.000 ± 0.000M/s)
    strnlen-64           6.010 ± 0.034M/s (drops 0.000 ± 0.000M/s)
    strlen-512           3.322 ± 0.011M/s (drops 0.000 ± 0.000M/s)
    strnlen-512          3.449 ± 0.014M/s (drops 0.000 ± 0.000M/s)
    strlen-2048          1.390 ± 0.007M/s (drops 0.000 ± 0.000M/s)
    strnlen-2048         1.429 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strlen-4095          0.786 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strnlen-4095         0.803 ± 0.002M/s (drops 0.000 ± 0.000M/s)

    strstr/strnstr
    ==============
    strstr-8             6.031 ± 0.012M/s (drops 0.000 ± 0.000M/s)
    strnstr-8            6.322 ± 0.048M/s (drops 0.000 ± 0.000M/s)
    strstr-64            3.221 ± 0.054M/s (drops 0.000 ± 0.000M/s)
    strnstr-64           3.059 ± 0.025M/s (drops 0.000 ± 0.000M/s)
    strstr-512           0.734 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strnstr-512          0.849 ± 0.004M/s (drops 0.000 ± 0.000M/s)
    strstr-2048          0.220 ± 0.004M/s (drops 0.000 ± 0.000M/s)
    strnstr-2048         0.246 ± 0.002M/s (drops 0.000 ± 0.000M/s)
    strstr-4095          0.104 ± 0.000M/s (drops 0.000 ± 0.000M/s)
    strnstr-4095         0.122 ± 0.000M/s (drops 0.000 ± 0.000M/s)

The performance gain of the bounded variants on strings over 64B is
3%-6% for strlen/strnlen and 12%-18% for strstr/strnstr. The likely
explanation is that the unbounded variants use __get_kernel_nofault
instead of a plain dereference, which introduces some small overhead.
This manifests mainly in the above functions as they iterate over
multiple strings (i.e. use __get_kernel_nofault more).

For the rest of the functions in the benchmark (strchr/strnchr and
strchrnul/strnchrnul), the performance difference is negligible or
within the bounds of a statistical error, with the exception of
strchr/strnchr on arm64:

    strchr/strnchr
    ==============
    strchr-1             0.475 ± 0.010M/s (drops 0.000 ± 0.000M/s)
    strnchr-1            0.469 ± 0.008M/s (drops 0.000 ± 0.000M/s)
    strchr-8             0.448 ± 0.011M/s (drops 0.000 ± 0.000M/s)
    strnchr-8            0.472 ± 0.006M/s (drops 0.000 ± 0.000M/s)
    strchr-64            0.432 ± 0.010M/s (drops 0.000 ± 0.000M/s)
    strnchr-64           0.445 ± 0.008M/s (drops 0.000 ± 0.000M/s)
    strchr-512           0.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strnchr-512          0.330 ± 0.005M/s (drops 0.000 ± 0.000M/s)
    strchr-2048          0.156 ± 0.002M/s (drops 0.000 ± 0.000M/s)
    strnchr-2048         0.186 ± 0.003M/s (drops 0.000 ± 0.000M/s)
    strchr-4095          0.094 ± 0.001M/s (drops 0.000 ± 0.000M/s)
    strnchr-4095         0.115 ± 0.004M/s (drops 0.000 ± 0.000M/s)

Here, I'm not sure what the reason for the performance benefit is,
possibly a combination of compiler optimizations and
__get_kernel_nofault overhead.

Signed-off-by: Viktor Malik <vmalik@redhat.com>
---
 tools/testing/selftests/bpf/Makefile          |   2 +
 tools/testing/selftests/bpf/bench.c           |  21 ++
 .../bpf/benchs/bench_string_kfuncs.c          | 259 ++++++++++++++++++
 .../bpf/benchs/run_bench_string_kfuncs.sh     |  34 +++
 .../selftests/bpf/progs/string_kfuncs_bench.c |  88 ++++++
 5 files changed, 404 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_string_kfuncs.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_string_kfuncs.sh
 create mode 100644 tools/testing/selftests/bpf/progs/string_kfuncs_bench.c
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index ca41d47d4ba6..d04f7e78c8ab 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -813,6 +813,7 @@  $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.ske
 $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
 $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h
+$(OUTPUT)/bench_string_kfuncs.o: $(OUTPUT)/string_kfuncs_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -833,6 +834,7 @@  $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_local_storage_create.o \
 		 $(OUTPUT)/bench_htab_mem.o \
 		 $(OUTPUT)/bench_bpf_crypto.o \
+		 $(OUTPUT)/bench_string_kfuncs.o \
 		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 1bd403a5ef7b..5aa7f63436f6 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -283,6 +283,7 @@  extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
 extern struct argp bench_trigger_batch_argp;
 extern struct argp bench_crypto_argp;
+extern struct argp bench_string_kfuncs_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -297,6 +298,7 @@  static const struct argp_child bench_parsers[] = {
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
 	{ &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 },
 	{ &bench_crypto_argp, 0, "bpf crypto benchmark", 0 },
+	{ &bench_string_kfuncs_argp, 0, "string kfuncs benchmark", 0 },
 	{},
 };
 
@@ -550,6 +552,16 @@  extern const struct bench bench_htab_mem;
 extern const struct bench bench_crypto_encrypt;
 extern const struct bench bench_crypto_decrypt;
 
+/* string kfunc benchmarks */
+extern const struct bench bench_string_kfuncs_strlen;
+extern const struct bench bench_string_kfuncs_strnlen;
+extern const struct bench bench_string_kfuncs_strchr;
+extern const struct bench bench_string_kfuncs_strnchr;
+extern const struct bench bench_string_kfuncs_strchrnul;
+extern const struct bench bench_string_kfuncs_strnchrnul;
+extern const struct bench bench_string_kfuncs_strstr;
+extern const struct bench bench_string_kfuncs_strnstr;
+
 static const struct bench *benchs[] = {
 	&bench_count_global,
 	&bench_count_local,
@@ -609,6 +621,15 @@  static const struct bench *benchs[] = {
 	&bench_htab_mem,
 	&bench_crypto_encrypt,
 	&bench_crypto_decrypt,
+	/* string kfuncs */
+	&bench_string_kfuncs_strlen,
+	&bench_string_kfuncs_strnlen,
+	&bench_string_kfuncs_strchr,
+	&bench_string_kfuncs_strnchr,
+	&bench_string_kfuncs_strchrnul,
+	&bench_string_kfuncs_strnchrnul,
+	&bench_string_kfuncs_strstr,
+	&bench_string_kfuncs_strnstr,
 };
 
 static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_string_kfuncs.c b/tools/testing/selftests/bpf/benchs/bench_string_kfuncs.c
new file mode 100644
index 000000000000..a2e11af092ce
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_string_kfuncs.c
@@ -0,0 +1,259 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025. Red Hat, Inc. */
+#include <argp.h>
+#include "bench.h"
+#include "string_kfuncs_bench.skel.h"
+
+static struct string_kfuncs_ctx {
+	struct string_kfuncs_bench *skel;
+} ctx;
+
+static struct string_kfuncs_args {
+	u32 str_len;
+} args = {
+	.str_len = 32,
+};
+
+enum {
+	ARG_STR_LEN = 5000,
+};
+
+static const struct argp_option opts[] = {
+	{ "str-len", ARG_STR_LEN, "STR_LEN", 0, "Set the length of string(s)" },
+	{},
+};
+
+static error_t string_kfuncs_parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_STR_LEN:
+		args.str_len = strtoul(arg, NULL, 10);
+		if (!args.str_len ||
+		    args.str_len >= sizeof(ctx.skel->bss->str)) {
+			fprintf(stderr, "Invalid str len (limit %zu)\n",
+				sizeof(ctx.skel->bss->str) - 1);
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_string_kfuncs_argp = {
+	.options = opts,
+	.parser = string_kfuncs_parse_arg,
+};
+
+static void string_kfuncs_validate(void)
+{
+	if (env.consumer_cnt != 0) {
+		fprintf(stderr, "string_kfuncs benchmark doesn't support consumer!\n");
+		exit(1);
+	}
+}
+
+static void string_kfuncs_setup(void)
+{
+	int err;
+	char *str;
+	size_t i, sz, quarter;
+
+	sz = sizeof(ctx.skel->bss->str);
+	if (!sz) {
+		fprintf(stderr, "invalid string size (%zu)\n", sz);
+		exit(1);
+	}
+
+	setup_libbpf();
+
+	ctx.skel = string_kfuncs_bench__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	/* Fill str with random digits 1-9 */
+	srandom(time(NULL));
+	str = ctx.skel->bss->str;
+	for (i = 0; i < args.str_len - 1; i++)
+		str[i] = '1' + random() % 9;
+
+	/* For strchr and variants - set the last character to '0' */
+	str[args.str_len - 1] = '0';
+	str[args.str_len] = '\0';
+
+	/* For strstr and variants - copy the last quarter of str to substr */
+	quarter = args.str_len / 4;
+	memcpy(ctx.skel->bss->substr, str + args.str_len - quarter, quarter + 1);
+
+	ctx.skel->rodata->str_len = args.str_len;
+
+	err = string_kfuncs_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		string_kfuncs_bench__destroy(ctx.skel);
+		exit(1);
+	}
+}
+
+static void string_kfuncs_attach_prog(struct bpf_program *prog)
+{
+	struct bpf_link *link;
+
+	link = bpf_program__attach(prog);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+}
+
+static void string_kfuncs_strlen_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strlen_bench);
+}
+
+static void string_kfuncs_strnlen_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strnlen_bench);
+}
+
+static void string_kfuncs_strchr_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strchr_bench);
+}
+
+static void string_kfuncs_strnchr_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strnchr_bench);
+}
+
+static void string_kfuncs_strchrnul_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strchrnul_bench);
+}
+
+static void string_kfuncs_strnchrnul_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strnchrnul_bench);
+}
+
+static void string_kfuncs_strstr_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strstr_bench);
+}
+
+static void string_kfuncs_strnstr_setup(void)
+{
+	string_kfuncs_setup();
+	string_kfuncs_attach_prog(ctx.skel->progs.strnstr_bench);
+}
+
+static void *string_kfuncs_producer(void *ctx)
+{
+	while (true)
+		(void)syscall(__NR_getpgid);
+	return NULL;
+}
+
+static void string_kfuncs_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+const struct bench bench_string_kfuncs_strlen = {
+	.name = "string-kfuncs-strlen",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strlen_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strnlen = {
+	.name = "string-kfuncs-strnlen",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strnlen_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strchr = {
+	.name = "string-kfuncs-strchr",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strchr_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strnchr = {
+	.name = "string-kfuncs-strnchr",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strnchr_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strchrnul = {
+	.name = "string-kfuncs-strchrnul",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strchrnul_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strnchrnul = {
+	.name = "string-kfuncs-strnchrnul",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strnchrnul_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strstr = {
+	.name = "string-kfuncs-strstr",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strstr_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_string_kfuncs_strnstr = {
+	.name = "string-kfuncs-strnstr",
+	.argp = &bench_string_kfuncs_argp,
+	.validate = string_kfuncs_validate,
+	.setup = string_kfuncs_strnstr_setup,
+	.producer_thread = string_kfuncs_producer,
+	.measure = string_kfuncs_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_string_kfuncs.sh b/tools/testing/selftests/bpf/benchs/run_bench_string_kfuncs.sh
new file mode 100755
index 000000000000..5e635681cd85
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_string_kfuncs.sh
@@ -0,0 +1,34 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+header "strlen/strnlen"
+for s in 1 8 64 512 2048 4095; do
+	for b in strlen strnlen; do
+		summarize ${b}-${s} "$($RUN_BENCH --str-len=$s string-kfuncs-${b})"
+	done
+done
+
+header "strchr/strnchr"
+for s in 1 8 64 512 2048 4095; do
+	for b in strchr strnchr; do
+		summarize ${b}-${s} "$($RUN_BENCH --str-len=$s string-kfuncs-${b})"
+	done
+done
+
+header "strchrnul/strnchrnul"
+for s in 1 8 64 512 2048 4095; do
+	for b in strchrnul strnchrnul; do
+		summarize ${b}-${s} "$($RUN_BENCH --str-len=$s string-kfuncs-${b})"
+	done
+done
+
+header "strstr/strnstr"
+for s in 8 64 512 2048 4095; do
+	for b in strstr strnstr; do
+		summarize ${b}-${s} "$($RUN_BENCH --str-len=$s string-kfuncs-${b})"
+	done
+done
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_bench.c b/tools/testing/selftests/bpf/progs/string_kfuncs_bench.c
new file mode 100644
index 000000000000..e227c54a5b92
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_bench.c
@@ -0,0 +1,88 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025. Red Hat, Inc. */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define STR_SZ 4096
+
+size_t bpf_strlen(const char *s) __ksym;
+size_t bpf_strnlen(void *s, u32 s__sz) __ksym;
+char *bpf_strchr(const char *s, int c) __ksym;
+char *bpf_strnchr(void *s, u32 s__sz, int c) __ksym;
+char *bpf_strchrnul(const char *s, int c) __ksym;
+char *bpf_strnchrnul(void *s, u32 s__sz, int c) __ksym;
+char *bpf_strstr(const char *s1, const char *s2) __ksym;
+char *bpf_strnstr(void *s1, u32 s1__sz, void *s2, u32 s2__sz) __ksym;
+
+/* Will be updated by benchmark before program loading */
+const volatile unsigned int str_len = 1;
+long hits = 0;
+char str[STR_SZ];
+char substr[STR_SZ];
+
+char _license[] SEC("license") = "GPL";
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strlen_bench(void *ctx)
+{
+	if (bpf_strlen(str) > 0)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strnlen_bench(void *ctx)
+{
+	if (bpf_strnlen(str, str_len + 1) > 0)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strchr_bench(void *ctx)
+{
+	if (bpf_strchr(str, '0') != NULL)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strnchr_bench(void *ctx)
+{
+	if (bpf_strnchr(str, str_len + 1, '0') != NULL)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strchrnul_bench(void *ctx)
+{
+	if (*bpf_strchrnul(str, '0') != '\0')
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strnchrnul_bench(void *ctx)
+{
+	if (*bpf_strnchrnul(str, str_len + 1, '0') != '\0')
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strstr_bench(void *ctx)
+{
+	if (bpf_strstr(str, substr) != NULL)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strnstr_bench(void *ctx)
+{
+	if (bpf_strnstr(str, str_len + 1, substr, str_len / 4 + 1) != NULL)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}