
[bpf-next] bpf/lpm_trie: inline longest_prefix_match for fastpath

Message ID 171025648415.2098287.4441181253947701605.stgit@firesoul (mailing list archive)
State Superseded
Delegated to: BPF
Series [bpf-next] bpf/lpm_trie: inline longest_prefix_match for fastpath

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 942 this patch: 943
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 11 maintainers not CCed: martin.lau@linux.dev daniel@iogearbox.net kpsingh@kernel.org haoluo@google.com song@kernel.org sdf@google.com yonghong.song@linux.dev jolsa@kernel.org john.fastabend@gmail.com andrii@kernel.org eddyz87@gmail.com
netdev/build_clang fail Errors and warnings before: 957 this patch: 958
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 958 this patch: 959
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 34 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 0 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18

Commit Message

Jesper Dangaard Brouer March 12, 2024, 3:17 p.m. UTC
The BPF map type LPM (Longest Prefix Match) is used heavily
in production by multiple products that have BPF components.
Perf data shows trie_lookup_elem() and longest_prefix_match()
among the top entries in the kernel's perf top.

For every level in the LPM trie, trie_lookup_elem() calls out
to longest_prefix_match().  The compiler is free to inline this
call, but chooses not to, because other slowpath callers
(that can be invoked via syscall) exist, such as trie_update_elem(),
trie_delete_elem() and trie_get_next_key().
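
For context, here is a condensed sketch of the lookup walk (paraphrased
from kernel/bpf/lpm_trie.c with RCU annotations and error handling
trimmed, so not the verbatim upstream code). One longest_prefix_match()
call is made for every node visited on the way down:

 /* Walk the trie from the root; one longest_prefix_match() call per
  * level -- this is the hot call site that the patch inlines.
  */
 for (node = rcu_dereference(trie->root); node;) {
         size_t matchlen;

         matchlen = longest_prefix_match(trie, node, key);
         if (matchlen == trie->max_prefixlen) {
                 found = node;           /* exact match */
                 break;
         }
         if (matchlen < node->prefixlen)
                 break;                  /* prefix mismatch; return last candidate */
         if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
                 found = node;           /* real (non-intermediate) node */

         /* Descend along the next bit of the key */
         node = rcu_dereference(node->child[extract_bit(key->data,
                                                        node->prefixlen)]);
 }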

 bcc/tools/funccount -Ti 1 'trie_lookup_elem|longest_prefix_match.isra.0'
 FUNC                                    COUNT
 trie_lookup_elem                       664945
 longest_prefix_match.isra.0           8101507

Observation on a single random metal host shows a factor of ~12 between
the two call counts (8,101,507 / 664,945 ≈ 12.2), i.e. an average of
12 levels of the trie being searched per lookup.

This patch force inlines longest_prefix_match(), but only for
the lookup fastpath, to balance object instruction size.
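
In plain (non-diff) form, the shape of the change is as follows; this is
the same split that the full diff below applies, with the comments here
added for illustration:

 /* The matching logic moves into an __always_inline helper ... */
 static __always_inline
 size_t __longest_prefix_match(const struct lpm_trie *trie,
                               const struct lpm_trie_node *node,
                               const struct bpf_lpm_trie_key_u8 *key)
 {
         /* ... unchanged matching logic ... */
 }

 /* ... while the old symbol remains as a thin out-of-line wrapper for the
  * syscall-side callers (trie_update_elem(), trie_delete_elem(),
  * trie_get_next_key()), so only the lookup fastpath pays the size cost
  * of inlining.
  */
 static size_t longest_prefix_match(const struct lpm_trie *trie,
                                    const struct lpm_trie_node *node,
                                    const struct bpf_lpm_trie_key_u8 *key)
 {
         return __longest_prefix_match(trie, node, key);
 }

trie_lookup_elem() is then switched to call __longest_prefix_match()
directly.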

 $ bloat-o-meter kernel/bpf/lpm_trie.o.orig-noinline kernel/bpf/lpm_trie.o
 add/remove: 1/1 grow/shrink: 1/0 up/down: 335/-4 (331)
 Function                                     old     new   delta
 trie_lookup_elem                             179     510    +331
 __BTF_ID__struct__lpm_trie__706741             -       4      +4
 __BTF_ID__struct__lpm_trie__706733             4       -      -4
 Total: Before=3056, After=3387, chg +10.83%

Details: Due to the AMD mitigation for SRSO (Speculative Return Stack
Overflow), these function calls have additional overhead. On newer kernels
this shows up under srso_safe_ret() + srso_return_thunk(), and on older
kernels (6.1) under __x86_return_thunk(). Thus, for production workloads
the biggest gain comes from avoiding this mitigation overhead.

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
I know net-next is closed:
 https://netdev.bots.linux.dev/net-next.html

Hoping the BPF maintainers can queue this patch anyhow.
If not, feel free to drop it and I will try to remember to resubmit.

 kernel/bpf/lpm_trie.c |   16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

Comments

Daniel Borkmann March 15, 2024, 3:03 p.m. UTC | #1
On 3/12/24 4:17 PM, Jesper Dangaard Brouer wrote:
> The BPF map type LPM (Longest Prefix Match) is used heavily
> in production by multiple products that have BPF components.
> Perf data shows trie_lookup_elem() and longest_prefix_match()
> being part of kernels perf top.

You mention these are heavy hitters in prod ...

> For every level in the LPM tree trie_lookup_elem() calls out
> to longest_prefix_match().  The compiler is free to inline this
> call, but chooses not to inline, because other slowpath callers
> (that can be invoked via syscall) exists like trie_update_elem(),
> trie_delete_elem() or trie_get_next_key().
> 
>   bcc/tools/funccount -Ti 1 'trie_lookup_elem|longest_prefix_match.isra.0'
>   FUNC                                    COUNT
>   trie_lookup_elem                       664945
>   longest_prefix_match.isra.0           8101507
> 
> Observation on a single random metal shows a factor 12 between
> the two functions. Given an average of 12 levels in the trie being
> searched.
> 
> This patch force inlining longest_prefix_match(), but only for
> the lookup fastpath to balance object instruction size.
> 
>   $ bloat-o-meter kernel/bpf/lpm_trie.o.orig-noinline kernel/bpf/lpm_trie.o
>   add/remove: 1/1 grow/shrink: 1/0 up/down: 335/-4 (331)
>   Function                                     old     new   delta
>   trie_lookup_elem                             179     510    +331
>   __BTF_ID__struct__lpm_trie__706741             -       4      +4
>   __BTF_ID__struct__lpm_trie__706733             4       -      -4
>   Total: Before=3056, After=3387, chg +10.83%

... and here you quote bloat-o-meter instead. But do you also see an
observable perf gain in prod after this change? (No objection from my
side, but it might be good to mention here... given that if not, then
why make the change?)

> Details: Due to AMD mitigation for SRSO (Speculative Return Stack Overflow)
> these function calls have additional overhead. On newer kernels this shows
> up under srso_safe_ret() + srso_return_thunk(), and on older kernels (6.1)
> under __x86_return_thunk(). Thus, for production workloads the biggest gain
> comes from avoiding this mitigation overhead.
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Jesper Dangaard Brouer March 15, 2024, 5:08 p.m. UTC | #2
On 15/03/2024 16.03, Daniel Borkmann wrote:
> On 3/12/24 4:17 PM, Jesper Dangaard Brouer wrote:
>> The BPF map type LPM (Longest Prefix Match) is used heavily
>> in production by multiple products that have BPF components.
>> Perf data shows trie_lookup_elem() and longest_prefix_match()
>> being part of kernels perf top.
> 
> You mention these are heavy hitters in prod ...
> 
>> For every level in the LPM tree trie_lookup_elem() calls out
>> to longest_prefix_match().  The compiler is free to inline this
>> call, but chooses not to inline, because other slowpath callers
>> (that can be invoked via syscall) exists like trie_update_elem(),
>> trie_delete_elem() or trie_get_next_key().
>>
>>   bcc/tools/funccount -Ti 1 
>> 'trie_lookup_elem|longest_prefix_match.isra.0'
>>   FUNC                                    COUNT
>>   trie_lookup_elem                       664945
>>   longest_prefix_match.isra.0           8101507
>>
>> Observation on a single random metal shows a factor 12 between
>> the two functions. Given an average of 12 levels in the trie being
>> searched.
>>
>> This patch force inlining longest_prefix_match(), but only for
>> the lookup fastpath to balance object instruction size.
>>
>>   $ bloat-o-meter kernel/bpf/lpm_trie.o.orig-noinline 
>> kernel/bpf/lpm_trie.o
>>   add/remove: 1/1 grow/shrink: 1/0 up/down: 335/-4 (331)
>>   Function                                     old     new   delta
>>   trie_lookup_elem                             179     510    +331
>>   __BTF_ID__struct__lpm_trie__706741             -       4      +4
>>   __BTF_ID__struct__lpm_trie__706733             4       -      -4
>>   Total: Before=3056, After=3387, chg +10.83%
> 
> ... and here you quote bloat-o-meter instead. But do you also see an
> observable perf gain in prod after this change? (No objection from my
> side but might be good to mention here.. given if not then why do the
> change?)
> 

I'm still waiting for more production servers to reboot into patched
kernels.  I do have some "low-level" numbers from previous-generation
AMD servers, running kernel 6.1, which should be less affected by
SRSO (than our 6.6 kernels). Waiting for the newer generation to get
kernel updates; 6.6 in particular will be interesting.

From production measurements, the latency overhead of trie_lookup_elem:
  - avg 1220 nsecs for the patched kernel
  - avg 1329 nsecs for the non-patched kernel
  - around 8% improvement, or 109 nanoseconds
  - given approx 12 calls "saved", this is ~9 ns per function call
  - for reference, on Intel I measured a function call to cost 1.3 ns
  - this extra overhead is caused by __x86_return_thunk().

I also see a slight improvement in the graphs, but given how much
production varies, I don't want to draw conclusions yet.


>> Details: Due to AMD mitigation for SRSO (Speculative Return Stack 
>> Overflow)
>> these function calls have additional overhead. On newer kernels this 
>> shows
>> up under srso_safe_ret() + srso_return_thunk(), and on older kernels 
>> (6.1)
>> under __x86_return_thunk(). Thus, for production workloads the biggest 
>> gain
>> comes from avoiding this mitigation overhead.
>>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Jesper Dangaard Brouer March 18, 2024, 12:12 p.m. UTC | #3
On 15/03/2024 18.08, Jesper Dangaard Brouer wrote:
> 
> 
> On 15/03/2024 16.03, Daniel Borkmann wrote:
>> On 3/12/24 4:17 PM, Jesper Dangaard Brouer wrote:
>>> The BPF map type LPM (Longest Prefix Match) is used heavily
>>> in production by multiple products that have BPF components.
>>> Perf data shows trie_lookup_elem() and longest_prefix_match()
>>> being part of kernels perf top.
>>
>> You mention these are heavy hitters in prod ...
>>
>>> For every level in the LPM tree trie_lookup_elem() calls out
>>> to longest_prefix_match().  The compiler is free to inline this
>>> call, but chooses not to inline, because other slowpath callers
>>> (that can be invoked via syscall) exists like trie_update_elem(),
>>> trie_delete_elem() or trie_get_next_key().
>>>
>>>   bcc/tools/funccount -Ti 1 
>>> 'trie_lookup_elem|longest_prefix_match.isra.0'
>>>   FUNC                                    COUNT
>>>   trie_lookup_elem                       664945
>>>   longest_prefix_match.isra.0           8101507
>>>
>>> Observation on a single random metal shows a factor 12 between
>>> the two functions. Given an average of 12 levels in the trie being
>>> searched.
>>>
>>> This patch force inlining longest_prefix_match(), but only for
>>> the lookup fastpath to balance object instruction size.
>>>
>>>   $ bloat-o-meter kernel/bpf/lpm_trie.o.orig-noinline 
>>> kernel/bpf/lpm_trie.o
>>>   add/remove: 1/1 grow/shrink: 1/0 up/down: 335/-4 (331)
>>>   Function                                     old     new   delta
>>>   trie_lookup_elem                             179     510    +331
>>>   __BTF_ID__struct__lpm_trie__706741             -       4      +4
>>>   __BTF_ID__struct__lpm_trie__706733             4       -      -4
>>>   Total: Before=3056, After=3387, chg +10.83%
>>
>> ... and here you quote bloat-o-meter instead. But do you also see an
>> observable perf gain in prod after this change? (No objection from my
>> side but might be good to mention here.. given if not then why do the
>> change?)
>>
> 
> I'm still waiting for more production servers to reboot into patched
> kernels.  I do have some "low-level" numbers from previous generation
> AMD servers, running kernel 6.1, which should be less affected by the
> SRSO (than our 6.6 kernels). Waiting for newer generation to get kernel
> updates, and especially 6.6 will be interesting.

There was no larger performance benefit on 6.6; it is basically the same.

On newer-generation (11G) hardware, the latency overhead of trie_lookup_elem:
  - avg 1181 nsecs for the patched kernel
  - avg 1269 nsecs for the non-patched kernel
  - around 7% improvement, or 88 ns

> 
>  From production measurements the latency overhead of trie_lookup_elem:
>   - avg 1220 nsecs for patched kernel
>   - avg 1329 nsecs for non patched kernel
>   - around 8% improvement or 109 nanosec
>   - given approx 12 calls "saved" this is 9 ns per function call
>   - for reference on Intel I measured func call to cost 1.3ns
>   - this extra overhead is caused by __x86_return_thunk().
> 

> I also see slight improvement in the graphs, but given how much
> production varies I don't want to draw conclusions yet.
> 

Still inconclusive, due to variations in traffic distribution because
the load-balancing isn't perfect.

> 
>>> Details: Due to AMD mitigation for SRSO (Speculative Return Stack Overflow)
>>> these function calls have additional overhead. On newer kernels this shows
>>> up under srso_safe_ret() + srso_return_thunk(), and on older kernels (6.1)
>>> under __x86_return_thunk(). Thus, for production workloads the biggest gain
>>> comes from avoiding this mitigation overhead.
>>>
>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>

I'm going to send a V2, because kernel doc processing found an issue:
  - https://netdev.bots.linux.dev/static/nipa/834681/13590144/kdoc/stderr

I'll try to incorporate the production improvements we are seeing, but
it feels wrong to write about kernel 6.1 and 6.6 improvements when I
(currently) cannot deploy a bpf-next kernel in production (though I do
test the latest kernels in my local testlab).

--Jesper

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 050fe1ebf0f7..7a6f39425e14 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -162,9 +162,10 @@  static inline int extract_bit(const u8 *data, size_t index)
  *
  * Determine the longest prefix of @node that matches the bits in @key.
  */
-static size_t longest_prefix_match(const struct lpm_trie *trie,
-				   const struct lpm_trie_node *node,
-				   const struct bpf_lpm_trie_key_u8 *key)
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+			      const struct lpm_trie_node *node,
+			      const struct bpf_lpm_trie_key_u8 *key)
 {
 	u32 limit = min(node->prefixlen, key->prefixlen);
 	u32 prefixlen = 0, i = 0;
@@ -224,6 +225,13 @@  static size_t longest_prefix_match(const struct lpm_trie *trie,
 	return prefixlen;
 }
 
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key_u8 *key)
+{
+	return __longest_prefix_match(trie, node, key);
+}
+
 /* Called from syscall or from eBPF program */
 static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 {
@@ -245,7 +253,7 @@  static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 		 * If it's the maximum possible prefix for this trie, we have
 		 * an exact match and can return it directly.
 		 */
-		matchlen = longest_prefix_match(trie, node, key);
+		matchlen = __longest_prefix_match(trie, node, key);
 		if (matchlen == trie->max_prefixlen) {
 			found = node;
 			break;