diff mbox series

[ipsec-next,v2,2/3] xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF

Message ID 20221129132018.985887-3-eyal.birger@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series xfrm: interface: Add unstable helpers for XFRM metadata | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 294 this patch: 294
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 13 this patch: 13
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 295 this patch: 295
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: please, no spaces at the start of a line
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

Eyal Birger Nov. 29, 2022, 1:20 p.m. UTC
This change adds xfrm metadata helpers using the unstable kfunc call
interface for the TC-BPF hooks. This allows steering traffic towards
different IPsec connections based on logic implemented in bpf programs.

This object is built based on the availabilty of BTF debug info.

The metadata percpu dsts used on TX take ownership of the original skb
dsts so that they may be used as part of the xfrm transmittion logic -
e.g.  for MTU calculations.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>

---

v2: changed added following points raised by Martin KaFai Lau:
  - make sure dst is refcounted prior to caching
  - free dst_orig regardless of CONFIG_DST_CACHE
  - call xfrm interface bpf cleanup in case of kfunc registration errors
---
 include/net/dst_metadata.h     |   1 +
 include/net/xfrm.h             |  20 +++++++
 net/core/dst.c                 |   8 ++-
 net/xfrm/Makefile              |   6 ++
 net/xfrm/xfrm_interface_bpf.c  | 100 +++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_interface_core.c |  15 +++++
 6 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100644 net/xfrm/xfrm_interface_bpf.c

Comments

Martin KaFai Lau Nov. 30, 2022, 6:14 p.m. UTC | #1
On 11/29/22 5:20 AM, Eyal Birger wrote:
> diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> new file mode 100644
> index 000000000000..757e15857dbf
> --- /dev/null
> +++ b/net/xfrm/xfrm_interface_bpf.c
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Unstable XFRM Helpers for TC-BPF hook
> + *
> + * These are called from SCHED_CLS BPF programs. Note that it is
> + * allowed to break compatibility for these functions since the interface they
> + * are exposed through to BPF programs is explicitly unstable.
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +
> +#include <net/dst_metadata.h>
> +#include <net/xfrm.h>
> +
> +struct bpf_xfrm_info {
No need to introduce a bpf variant of the "struct xfrm_md_info" (more on this 
later).

> +	u32 if_id;
> +	int link;
> +};
> +
> +static struct metadata_dst __percpu *xfrm_md_dst;
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in xfrm_interface BTF");
> +
> +__used noinline
> +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)

This kfunc is not needed.  It only reads the skb->_skb_refdst.  The new kfunc 
bpf_rdonly_cast() can be used.  Take a look at the bpf_rdonly_cast() usages in 
the selftests/bpf/progs/type_cast.c.  It was in bpf-next only but should also be 
in net-next now.

> +{
> +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> +	struct xfrm_md_info *info;
> +
> +	memset(to, 0, sizeof(*to));
> +
> +	info = skb_xfrm_md_info(skb);
> +	if (!info)
> +		return -EINVAL;
> +
> +	to->if_id = info->if_id;
> +	to->link = info->link;
> +	return 0;
> +}
> +
> +__used noinline
> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> +			  const struct bpf_xfrm_info *from)

Directly use "const struct xfrm_md_info *from" instead.  This kfunc can check 
from->dst_orig != NULL and return -EINVAL.  It will then have a consistent API 
with the bpf_rdonly_cast() mentioned above.

> +{
> +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> +	struct metadata_dst *md_dst;
> +	struct xfrm_md_info *info;
> +
> +	if (unlikely(skb_metadata_dst(skb)))
> +		return -EINVAL;
> +
> +	md_dst = this_cpu_ptr(xfrm_md_dst);
> +
> +	info = &md_dst->u.xfrm_info;
> +	memset(info, 0, sizeof(*info));

Unnecessary memset here.  Everything should have been initialized below. 
bpf_skb_set_tunnel_key() needs memset but not here.

> +
> +	info->if_id = from->if_id;
> +	info->link = from->link;
> +	skb_dst_force(skb);
> +	info->dst_orig = skb_dst(skb);
> +
> +	dst_hold((struct dst_entry *)md_dst);
> +	skb_dst_set(skb, (struct dst_entry *)md_dst);
> +	return 0;
> +}
> +
> +__diag_pop()
> +
> +BTF_SET8_START(xfrm_ifc_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
> +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
> +BTF_SET8_END(xfrm_ifc_kfunc_set)
> +
> +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &xfrm_ifc_kfunc_set,
> +};
> +
> +int __init register_xfrm_interface_bpf(void)
> +{
> +	int err;
> +
> +	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
> +						GFP_KERNEL);

May be DEFINE_PER_CPU() instead?

> +	if (!xfrm_md_dst)
> +		return -ENOMEM;
> +	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
> +					&xfrm_interface_kfunc_set);
> +	if (err < 0) {
> +		cleanup_xfrm_interface_bpf();
> +		return err;
> +	}
> +	return 0;
> +}
> +
> +void __exit cleanup_xfrm_interface_bpf(void)
> +{
> +	metadata_dst_free_percpu(xfrm_md_dst);
> +}
Eyal Birger Dec. 1, 2022, 5:55 a.m. UTC | #2
Hi Martin,

On Wed, Nov 30, 2022 at 8:15 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/29/22 5:20 AM, Eyal Birger wrote:
> > diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> > new file mode 100644
> > index 000000000000..757e15857dbf
> > --- /dev/null
> > +++ b/net/xfrm/xfrm_interface_bpf.c
> > @@ -0,0 +1,100 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Unstable XFRM Helpers for TC-BPF hook
> > + *
> > + * These are called from SCHED_CLS BPF programs. Note that it is
> > + * allowed to break compatibility for these functions since the interface they
> > + * are exposed through to BPF programs is explicitly unstable.
> > + */
> > +
> > +#include <linux/bpf.h>
> > +#include <linux/btf_ids.h>
> > +
> > +#include <net/dst_metadata.h>
> > +#include <net/xfrm.h>
> > +
> > +struct bpf_xfrm_info {
> No need to introduce a bpf variant of the "struct xfrm_md_info" (more on this
> later).
>
> > +     u32 if_id;
> > +     int link;
> > +};
> > +
> > +static struct metadata_dst __percpu *xfrm_md_dst;
> > +__diag_push();
> > +__diag_ignore_all("-Wmissing-prototypes",
> > +               "Global functions as their definitions will be in xfrm_interface BTF");
> > +
> > +__used noinline
> > +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
>
> This kfunc is not needed.  It only reads the skb->_skb_refdst.  The new kfunc
> bpf_rdonly_cast() can be used.  Take a look at the bpf_rdonly_cast() usages in
> the selftests/bpf/progs/type_cast.c.  It was in bpf-next only but should also be
> in net-next now.

I'm somewhat concerned with this approach.
Indeed it would remove the kfunc, and the API is declared "unstable", but
still the implementation as dst isn't relevant to the user and would make
the programs less readable.

Also note that the helper can be also used as it is to get the xfrm info at
egress from an lwt route (which stores the xfrm_info in the dst lwstate).

>
> > +{
> > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > +     struct xfrm_md_info *info;
> > +
> > +     memset(to, 0, sizeof(*to));
> > +
> > +     info = skb_xfrm_md_info(skb);
> > +     if (!info)
> > +             return -EINVAL;
> > +
> > +     to->if_id = info->if_id;
> > +     to->link = info->link;
> > +     return 0;
> > +}
> > +
> > +__used noinline
> > +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> > +                       const struct bpf_xfrm_info *from)
>
> Directly use "const struct xfrm_md_info *from" instead.  This kfunc can check
> from->dst_orig != NULL and return -EINVAL.  It will then have a consistent API
> with the bpf_rdonly_cast() mentioned above.

See above.

>
> > +{
> > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > +     struct metadata_dst *md_dst;
> > +     struct xfrm_md_info *info;
> > +
> > +     if (unlikely(skb_metadata_dst(skb)))
> > +             return -EINVAL;
> > +
> > +     md_dst = this_cpu_ptr(xfrm_md_dst);
> > +
> > +     info = &md_dst->u.xfrm_info;
> > +     memset(info, 0, sizeof(*info));
>
> Unnecessary memset here.  Everything should have been initialized below.
> bpf_skb_set_tunnel_key() needs memset but not here.

Ok.

>
> > +
> > +     info->if_id = from->if_id;
> > +     info->link = from->link;
> > +     skb_dst_force(skb);
> > +     info->dst_orig = skb_dst(skb);
> > +
> > +     dst_hold((struct dst_entry *)md_dst);
> > +     skb_dst_set(skb, (struct dst_entry *)md_dst);
> > +     return 0;
> > +}
> > +
> > +__diag_pop()
> > +
> > +BTF_SET8_START(xfrm_ifc_kfunc_set)
> > +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
> > +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
> > +BTF_SET8_END(xfrm_ifc_kfunc_set)
> > +
> > +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
> > +     .owner = THIS_MODULE,
> > +     .set   = &xfrm_ifc_kfunc_set,
> > +};
> > +
> > +int __init register_xfrm_interface_bpf(void)
> > +{
> > +     int err;
> > +
> > +     xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
> > +                                             GFP_KERNEL);
>
> May be DEFINE_PER_CPU() instead?

Are you suggesting duplicating the dst init/cleanup logic here?
Personally given that this happens once at module load, I think it's best to
use the existing infrastructure, but am willing to add this here if you think
it's better.

Thanks for the review!
Eyal.
Eyal Birger Dec. 1, 2022, 1:30 p.m. UTC | #3
Hi Martin,

On Thu, Dec 1, 2022 at 7:55 AM Eyal Birger <eyal.birger@gmail.com> wrote:
>
> Hi Martin,
>
> On Wed, Nov 30, 2022 at 8:15 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 11/29/22 5:20 AM, Eyal Birger wrote:
> > > diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> > > new file mode 100644
> > > index 000000000000..757e15857dbf
> > > --- /dev/null
> > > +++ b/net/xfrm/xfrm_interface_bpf.c
> > > @@ -0,0 +1,100 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Unstable XFRM Helpers for TC-BPF hook
> > > + *
> > > + * These are called from SCHED_CLS BPF programs. Note that it is
> > > + * allowed to break compatibility for these functions since the interface they
> > > + * are exposed through to BPF programs is explicitly unstable.
> > > + */
> > > +
> > > +#include <linux/bpf.h>
> > > +#include <linux/btf_ids.h>
> > > +
> > > +#include <net/dst_metadata.h>
> > > +#include <net/xfrm.h>
> > > +
> > > +struct bpf_xfrm_info {
> > No need to introduce a bpf variant of the "struct xfrm_md_info" (more on this
> > later).
> >
> > > +     u32 if_id;
> > > +     int link;
> > > +};
> > > +
> > > +static struct metadata_dst __percpu *xfrm_md_dst;
> > > +__diag_push();
> > > +__diag_ignore_all("-Wmissing-prototypes",
> > > +               "Global functions as their definitions will be in xfrm_interface BTF");
> > > +
> > > +__used noinline
> > > +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
> >
> > This kfunc is not needed.  It only reads the skb->_skb_refdst.  The new kfunc
> > bpf_rdonly_cast() can be used.  Take a look at the bpf_rdonly_cast() usages in
> > the selftests/bpf/progs/type_cast.c.  It was in bpf-next only but should also be
> > in net-next now.
>
> I'm somewhat concerned with this approach.
> Indeed it would remove the kfunc, and the API is declared "unstable", but
> still the implementation as dst isn't relevant to the user and would make
> the programs less readable.
>
> Also note that the helper can be also used as it is to get the xfrm info at
> egress from an lwt route (which stores the xfrm_info in the dst lwstate).
>
> >
> > > +{
> > > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > > +     struct xfrm_md_info *info;
> > > +
> > > +     memset(to, 0, sizeof(*to));
> > > +
> > > +     info = skb_xfrm_md_info(skb);
> > > +     if (!info)
> > > +             return -EINVAL;
> > > +
> > > +     to->if_id = info->if_id;
> > > +     to->link = info->link;
> > > +     return 0;
> > > +}
> > > +
> > > +__used noinline
> > > +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> > > +                       const struct bpf_xfrm_info *from)
> >
> > Directly use "const struct xfrm_md_info *from" instead.  This kfunc can check
> > from->dst_orig != NULL and return -EINVAL.  It will then have a consistent API
> > with the bpf_rdonly_cast() mentioned above.
>
> See above.

Also, when trying this approach with bpf_set_xfrm_info() accepting
"const struct xfrm_md_info *from" I fail to load the program:

libbpf: prog 'set_xfrm_info': BPF program load failed: Invalid argument
libbpf: prog 'set_xfrm_info': -- BEGIN PROG LOAD LOG --
0: R1=ctx(off=0,imm=0) R10=fp0
; int set_xfrm_info(struct __sk_buff *skb)
0: (bf) r6 = r1                       ; R1=ctx(off=0,imm=0)
R6_w=ctx(off=0,imm=0)
1: (b7) r1 = 0                        ; R1_w=0
; struct xfrm_md_info info = {};
2: (7b) *(u64 *)(r10 -8) = r1         ; R1_w=0 R10=fp0 fp-8_w=00000000
3: (7b) *(u64 *)(r10 -16) = r1        ; R1_w=0 R10=fp0 fp-16_w=00000000
4: (b4) w1 = 0                        ; R1_w=0
; __u32 index = 0;
5: (63) *(u32 *)(r10 -20) = r1        ; R1_w=0 R10=fp0 fp-24=0000????
6: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
;
7: (07) r2 += -20                     ; R2_w=fp-20
; if_id = bpf_map_lookup_elem(&dst_if_id_map, &index);
8: (18) r1 = 0xffff888006751c00       ; R1_w=map_ptr(off=0,ks=4,vs=4,imm=0)
10: (85) call bpf_map_lookup_elem#1   ;
R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
11: (bf) r1 = r0                      ;
R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
R1_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
12: (b4) w0 = 2                       ; R0_w=2
; if (!if_id)
13: (15) if r1 == 0x0 goto pc+10      ; R1_w=map_value(off=0,ks=4,vs=4,imm=0)
14: (bf) r2 = r10                     ; R2_w=fp0 R10=fp0
;
15: (07) r2 += -16                    ; R2_w=fp-16
; info.if_id = *if_id;
16: (61) r1 = *(u32 *)(r1 +0)         ;
R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
; info.if_id = *if_id;
17: (63) *(u32 *)(r2 +0) = r1         ;
R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R2_w=fp-16
fp-16_w=
; ret = bpf_skb_set_xfrm_info(skb, &info);
18: (bf) r1 = r6                      ; R1_w=ctx(off=0,imm=0)
R6_w=ctx(off=0,imm=0)
19: (85) call bpf_skb_set_xfrm_info#81442
arg#1 pointer type STRUCT xfrm_md_info must point to scalar, or struct
with scalar

Is there some registration I need to do for this struct?
Martin KaFai Lau Dec. 1, 2022, 8:18 p.m. UTC | #4
On 12/1/22 5:30 AM, Eyal Birger wrote:
> Hi Martin,
> 
> On Thu, Dec 1, 2022 at 7:55 AM Eyal Birger <eyal.birger@gmail.com> wrote:
>>
>> Hi Martin,
>>
>> On Wed, Nov 30, 2022 at 8:15 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>
>>> On 11/29/22 5:20 AM, Eyal Birger wrote:
>>>> diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
>>>> new file mode 100644
>>>> index 000000000000..757e15857dbf
>>>> --- /dev/null
>>>> +++ b/net/xfrm/xfrm_interface_bpf.c
>>>> @@ -0,0 +1,100 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/* Unstable XFRM Helpers for TC-BPF hook
>>>> + *
>>>> + * These are called from SCHED_CLS BPF programs. Note that it is
>>>> + * allowed to break compatibility for these functions since the interface they
>>>> + * are exposed through to BPF programs is explicitly unstable.
>>>> + */
>>>> +
>>>> +#include <linux/bpf.h>
>>>> +#include <linux/btf_ids.h>
>>>> +
>>>> +#include <net/dst_metadata.h>
>>>> +#include <net/xfrm.h>
>>>> +
>>>> +struct bpf_xfrm_info {
>>> No need to introduce a bpf variant of the "struct xfrm_md_info" (more on this
>>> later).
>>>
>>>> +     u32 if_id;
>>>> +     int link;
>>>> +};
>>>> +
>>>> +static struct metadata_dst __percpu *xfrm_md_dst;
>>>> +__diag_push();
>>>> +__diag_ignore_all("-Wmissing-prototypes",
>>>> +               "Global functions as their definitions will be in xfrm_interface BTF");
>>>> +
>>>> +__used noinline
>>>> +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
>>>
>>> This kfunc is not needed.  It only reads the skb->_skb_refdst.  The new kfunc
>>> bpf_rdonly_cast() can be used.  Take a look at the bpf_rdonly_cast() usages in
>>> the selftests/bpf/progs/type_cast.c.  It was in bpf-next only but should also be
>>> in net-next now.
>>
>> I'm somewhat concerned with this approach.
>> Indeed it would remove the kfunc, and the API is declared "unstable", but
>> still the implementation as dst isn't relevant to the user and would make
>> the programs less readable.
>>
>> Also note that the helper can be also used as it is to get the xfrm info at
>> egress from an lwt route (which stores the xfrm_info in the dst lwstate).

Right, the whole skb_xfrm_md_info() can be implemented in bpf prog itself, like 
checking lwtstate.

If adding a kfunc, how about directly expose skb_xfrm_md_info() itself as a 
kfunc to bpf prog and directly return a "struct xfrm_md_info *" instead.  Then 
there is no need to copy if_id/link...etc.  The bpf prog has no need to 
initialize the "to" also.  Something like this:

__used noinline
const struct xfrm_md_info *bpf_skb_xfrm_md_info(const struct __sk_buff *skb) { ... }

BTF_ID_FLAGS(func, bpf_skb_xfrm_md_info, KF_RET_NULL)

>>
>>>
>>>> +{
>>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
>>>> +     struct xfrm_md_info *info;
>>>> +
>>>> +     memset(to, 0, sizeof(*to));
>>>> +
>>>> +     info = skb_xfrm_md_info(skb);
>>>> +     if (!info)
>>>> +             return -EINVAL;
>>>> +
>>>> +     to->if_id = info->if_id;
>>>> +     to->link = info->link;
>>>> +     return 0;
>>>> +}
>>>> +
>>>> +__used noinline
>>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
>>>> +                       const struct bpf_xfrm_info *from)
>>>
>>> Directly use "const struct xfrm_md_info *from" instead.  This kfunc can check
>>> from->dst_orig != NULL and return -EINVAL.  It will then have a consistent API
>>> with the bpf_rdonly_cast() mentioned above.
>>
>> See above.
> 
> Also, when trying this approach with bpf_set_xfrm_info() accepting
> "const struct xfrm_md_info *from" I fail to load the program:
> 
> libbpf: prog 'set_xfrm_info': BPF program load failed: Invalid argument
> libbpf: prog 'set_xfrm_info': -- BEGIN PROG LOAD LOG --
> 0: R1=ctx(off=0,imm=0) R10=fp0
> ; int set_xfrm_info(struct __sk_buff *skb)
> 0: (bf) r6 = r1                       ; R1=ctx(off=0,imm=0)
> R6_w=ctx(off=0,imm=0)
> 1: (b7) r1 = 0                        ; R1_w=0
> ; struct xfrm_md_info info = {};
> 2: (7b) *(u64 *)(r10 -8) = r1         ; R1_w=0 R10=fp0 fp-8_w=00000000
> 3: (7b) *(u64 *)(r10 -16) = r1        ; R1_w=0 R10=fp0 fp-16_w=00000000
> 4: (b4) w1 = 0                        ; R1_w=0
> ; __u32 index = 0;
> 5: (63) *(u32 *)(r10 -20) = r1        ; R1_w=0 R10=fp0 fp-24=0000????
> 6: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
> ;
> 7: (07) r2 += -20                     ; R2_w=fp-20
> ; if_id = bpf_map_lookup_elem(&dst_if_id_map, &index);
> 8: (18) r1 = 0xffff888006751c00       ; R1_w=map_ptr(off=0,ks=4,vs=4,imm=0)
> 10: (85) call bpf_map_lookup_elem#1   ;
> R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> 11: (bf) r1 = r0                      ;
> R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> R1_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> 12: (b4) w0 = 2                       ; R0_w=2
> ; if (!if_id)
> 13: (15) if r1 == 0x0 goto pc+10      ; R1_w=map_value(off=0,ks=4,vs=4,imm=0)
> 14: (bf) r2 = r10                     ; R2_w=fp0 R10=fp0
> ;
> 15: (07) r2 += -16                    ; R2_w=fp-16
> ; info.if_id = *if_id;
> 16: (61) r1 = *(u32 *)(r1 +0)         ;
> R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
> ; info.if_id = *if_id;
> 17: (63) *(u32 *)(r2 +0) = r1         ;
> R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R2_w=fp-16
> fp-16_w=
> ; ret = bpf_skb_set_xfrm_info(skb, &info);
> 18: (bf) r1 = r6                      ; R1_w=ctx(off=0,imm=0)
> R6_w=ctx(off=0,imm=0)
> 19: (85) call bpf_skb_set_xfrm_info#81442
> arg#1 pointer type STRUCT xfrm_md_info must point to scalar, or struct
> with scalar
> 
> Is there some registration I need to do for this struct?

Ah, thanks for trying!
hmm... it will need a change to the verifier.  likely tag the param with 
something like "const struct xfrm_md_info *from__nonscalar_ok".

The reason of my earlier suggestion was to avoid the need to duplicate future 
changes in xfrm_md_info to bpf_xfrm_info and more importantly avoid creating 
another __sk_buff vs sk_buff like usage confusion.

For now, lets stay with bpf_xfrm_info.  This can be changed later.
Martin KaFai Lau Dec. 1, 2022, 8:21 p.m. UTC | #5
On 11/30/22 9:55 PM, Eyal Birger wrote:
>>> +
>>> +     info->if_id = from->if_id;
>>> +     info->link = from->link;
>>> +     skb_dst_force(skb);
>>> +     info->dst_orig = skb_dst(skb);
>>> +
>>> +     dst_hold((struct dst_entry *)md_dst);
>>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
>>> +     return 0;
>>> +}
>>> +
>>> +__diag_pop()
>>> +
>>> +BTF_SET8_START(xfrm_ifc_kfunc_set)
>>> +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
>>> +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
>>> +BTF_SET8_END(xfrm_ifc_kfunc_set)
>>> +
>>> +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
>>> +     .owner = THIS_MODULE,
>>> +     .set   = &xfrm_ifc_kfunc_set,
>>> +};
>>> +
>>> +int __init register_xfrm_interface_bpf(void)
>>> +{
>>> +     int err;
>>> +
>>> +     xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
>>> +                                             GFP_KERNEL);
>>
>> May be DEFINE_PER_CPU() instead?
> 
> Are you suggesting duplicating the dst init/cleanup logic here?
> Personally given that this happens once at module load, I think it's best to
> use the existing infrastructure, but am willing to add this here if you think
> it's better.

Agree, staying with the current patch is better.  I somehow thought 
metadata_dst_alloc_percpu() was newly added in this patch also.
Eyal Birger Dec. 1, 2022, 8:47 p.m. UTC | #6
On Thu, Dec 1, 2022 at 10:18 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/1/22 5:30 AM, Eyal Birger wrote:
> > Hi Martin,
> >
> > On Thu, Dec 1, 2022 at 7:55 AM Eyal Birger <eyal.birger@gmail.com> wrote:
> >>
> >> Hi Martin,
> >>
> >> On Wed, Nov 30, 2022 at 8:15 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>
> >>> On 11/29/22 5:20 AM, Eyal Birger wrote:
> >>>> diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> >>>> new file mode 100644
> >>>> index 000000000000..757e15857dbf
> >>>> --- /dev/null
> >>>> +++ b/net/xfrm/xfrm_interface_bpf.c
> >>>> @@ -0,0 +1,100 @@
> >>>> +// SPDX-License-Identifier: GPL-2.0-only
> >>>> +/* Unstable XFRM Helpers for TC-BPF hook
> >>>> + *
> >>>> + * These are called from SCHED_CLS BPF programs. Note that it is
> >>>> + * allowed to break compatibility for these functions since the interface they
> >>>> + * are exposed through to BPF programs is explicitly unstable.
> >>>> + */
> >>>> +
> >>>> +#include <linux/bpf.h>
> >>>> +#include <linux/btf_ids.h>
> >>>> +
> >>>> +#include <net/dst_metadata.h>
> >>>> +#include <net/xfrm.h>
> >>>> +
> >>>> +struct bpf_xfrm_info {
> >>> No need to introduce a bpf variant of the "struct xfrm_md_info" (more on this
> >>> later).
> >>>
> >>>> +     u32 if_id;
> >>>> +     int link;
> >>>> +};
> >>>> +
> >>>> +static struct metadata_dst __percpu *xfrm_md_dst;
> >>>> +__diag_push();
> >>>> +__diag_ignore_all("-Wmissing-prototypes",
> >>>> +               "Global functions as their definitions will be in xfrm_interface BTF");
> >>>> +
> >>>> +__used noinline
> >>>> +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
> >>>
> >>> This kfunc is not needed.  It only reads the skb->_skb_refdst.  The new kfunc
> >>> bpf_rdonly_cast() can be used.  Take a look at the bpf_rdonly_cast() usages in
> >>> the selftests/bpf/progs/type_cast.c.  It was in bpf-next only but should also be
> >>> in net-next now.
> >>
> >> I'm somewhat concerned with this approach.
> >> Indeed it would remove the kfunc, and the API is declared "unstable", but
> >> still the implementation as dst isn't relevant to the user and would make
> >> the programs less readable.
> >>
> >> Also note that the helper can be also used as it is to get the xfrm info at
> >> egress from an lwt route (which stores the xfrm_info in the dst lwstate).
>
> Right, the whole skb_xfrm_md_info() can be implemented in bpf prog itself, like
> checking lwtstate.
>
> If adding a kfunc, how about directly expose skb_xfrm_md_info() itself as a
> kfunc to bpf prog and directly return a "struct xfrm_md_info *" instead.  Then
> there is no need to copy if_id/link...etc.  The bpf prog has no need to
> initialize the "to" also.  Something like this:
>
> __used noinline
> const struct xfrm_md_info *bpf_skb_xfrm_md_info(const struct __sk_buff *skb) { ... }
>
> BTF_ID_FLAGS(func, bpf_skb_xfrm_md_info, KF_RET_NULL)
>
> >>
> >>>
> >>>> +{
> >>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> >>>> +     struct xfrm_md_info *info;
> >>>> +
> >>>> +     memset(to, 0, sizeof(*to));
> >>>> +
> >>>> +     info = skb_xfrm_md_info(skb);
> >>>> +     if (!info)
> >>>> +             return -EINVAL;
> >>>> +
> >>>> +     to->if_id = info->if_id;
> >>>> +     to->link = info->link;
> >>>> +     return 0;
> >>>> +}
> >>>> +
> >>>> +__used noinline
> >>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> >>>> +                       const struct bpf_xfrm_info *from)
> >>>
> >>> Directly use "const struct xfrm_md_info *from" instead.  This kfunc can check
> >>> from->dst_orig != NULL and return -EINVAL.  It will then have a consistent API
> >>> with the bpf_rdonly_cast() mentioned above.
> >>
> >> See above.
> >
> > Also, when trying this approach with bpf_set_xfrm_info() accepting
> > "const struct xfrm_md_info *from" I fail to load the program:
> >
> > libbpf: prog 'set_xfrm_info': BPF program load failed: Invalid argument
> > libbpf: prog 'set_xfrm_info': -- BEGIN PROG LOAD LOG --
> > 0: R1=ctx(off=0,imm=0) R10=fp0
> > ; int set_xfrm_info(struct __sk_buff *skb)
> > 0: (bf) r6 = r1                       ; R1=ctx(off=0,imm=0)
> > R6_w=ctx(off=0,imm=0)
> > 1: (b7) r1 = 0                        ; R1_w=0
> > ; struct xfrm_md_info info = {};
> > 2: (7b) *(u64 *)(r10 -8) = r1         ; R1_w=0 R10=fp0 fp-8_w=00000000
> > 3: (7b) *(u64 *)(r10 -16) = r1        ; R1_w=0 R10=fp0 fp-16_w=00000000
> > 4: (b4) w1 = 0                        ; R1_w=0
> > ; __u32 index = 0;
> > 5: (63) *(u32 *)(r10 -20) = r1        ; R1_w=0 R10=fp0 fp-24=0000????
> > 6: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
> > ;
> > 7: (07) r2 += -20                     ; R2_w=fp-20
> > ; if_id = bpf_map_lookup_elem(&dst_if_id_map, &index);
> > 8: (18) r1 = 0xffff888006751c00       ; R1_w=map_ptr(off=0,ks=4,vs=4,imm=0)
> > 10: (85) call bpf_map_lookup_elem#1   ;
> > R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> > 11: (bf) r1 = r0                      ;
> > R0_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> > R1_w=map_value_or_null(id=1,off=0,ks=4,vs=4,imm=0)
> > 12: (b4) w0 = 2                       ; R0_w=2
> > ; if (!if_id)
> > 13: (15) if r1 == 0x0 goto pc+10      ; R1_w=map_value(off=0,ks=4,vs=4,imm=0)
> > 14: (bf) r2 = r10                     ; R2_w=fp0 R10=fp0
> > ;
> > 15: (07) r2 += -16                    ; R2_w=fp-16
> > ; info.if_id = *if_id;
> > 16: (61) r1 = *(u32 *)(r1 +0)         ;
> > R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
> > ; info.if_id = *if_id;
> > 17: (63) *(u32 *)(r2 +0) = r1         ;
> > R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R2_w=fp-16
> > fp-16_w=
> > ; ret = bpf_skb_set_xfrm_info(skb, &info);
> > 18: (bf) r1 = r6                      ; R1_w=ctx(off=0,imm=0)
> > R6_w=ctx(off=0,imm=0)
> > 19: (85) call bpf_skb_set_xfrm_info#81442
> > arg#1 pointer type STRUCT xfrm_md_info must point to scalar, or struct
> > with scalar
> >
> > Is there some registration I need to do for this struct?
>
> Ah, thanks for trying!
> hmm... it will need a change to the verifier.  likely tag the param with
> something like "const struct xfrm_md_info *from__nonscalar_ok".
>
> The reason of my earlier suggestion was to avoid the need to duplicate future
> changes in xfrm_md_info to bpf_xfrm_info and more importantly avoid creating
> another __sk_buff vs sk_buff like usage confusion.
>
> For now, lets stay with bpf_xfrm_info.  This can be changed later.
Ok.

I suggest that in that case we use struct bpf_xfrm_info for both the getter
and setter for now, as it would be more confusing to use different structs.

Btw, I did try using vmlinux.h and importing struct xfrm_md_info, however,
it seems vmlinux.h and linux/pkt_cls.h don't play well together, and the
latter is needed for the TC_ACT_* macro definitions. So at least for the
time being it's another reason to define a single struct in the test program.

Eyal.
diff mbox series

Patch

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index a454cf4327fe..1b7fae4c6b24 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -26,6 +26,7 @@  struct macsec_info {
 struct xfrm_md_info {
 	u32 if_id;
 	int link;
+	struct dst_entry *dst_orig;
 };
 
 struct metadata_dst {
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e0cc6791c001..5e5fea3087b6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2086,4 +2086,24 @@  static inline bool xfrm6_local_dontfrag(const struct sock *sk)
 	return false;
 }
 #endif
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_xfrm_interface_bpf(void);
+extern void cleanup_xfrm_interface_bpf(void);
+
+#else
+
+static inline int register_xfrm_interface_bpf(void)
+{
+	return 0;
+}
+
+static inline void cleanup_xfrm_interface_bpf(void)
+{
+}
+
+#endif
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/core/dst.c b/net/core/dst.c
index bc9c9be4e080..bb14a0392388 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -316,6 +316,8 @@  void metadata_dst_free(struct metadata_dst *md_dst)
 	if (md_dst->type == METADATA_IP_TUNNEL)
 		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
 #endif
+	if (md_dst->type == METADATA_XFRM)
+		dst_release(md_dst->u.xfrm_info.dst_orig);
 	kfree(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free);
@@ -340,16 +342,18 @@  EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
 
 void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
 {
-#ifdef CONFIG_DST_CACHE
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
 
+#ifdef CONFIG_DST_CACHE
 		if (one_md_dst->type == METADATA_IP_TUNNEL)
 			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
-	}
 #endif
+		if (one_md_dst->type == METADATA_XFRM)
+			dst_release(one_md_dst->u.xfrm_info.dst_orig);
+	}
 	free_percpu(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 08a2870fdd36..cd47f88921f5 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -5,6 +5,12 @@ 
 
 xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o
 
+ifeq ($(CONFIG_XFRM_INTERFACE),m)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o
+else ifeq ($(CONFIG_XFRM_INTERFACE),y)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o
+endif
+
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
 		      xfrm_input.o xfrm_output.o \
 		      xfrm_sysctl.o xfrm_replay.o xfrm_device.o
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
new file mode 100644
index 000000000000..757e15857dbf
--- /dev/null
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -0,0 +1,100 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/xfrm.h>
+
+struct bpf_xfrm_info {
+	u32 if_id;
+	int link;
+};
+
+static struct metadata_dst __percpu *xfrm_md_dst;
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in xfrm_interface BTF");
+
+__used noinline
+int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct xfrm_md_info *info;
+
+	memset(to, 0, sizeof(*to));
+
+	info = skb_xfrm_md_info(skb);
+	if (!info)
+		return -EINVAL;
+
+	to->if_id = info->if_id;
+	to->link = info->link;
+	return 0;
+}
+
+__used noinline
+int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
+			  const struct bpf_xfrm_info *from)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct metadata_dst *md_dst;
+	struct xfrm_md_info *info;
+
+	if (unlikely(skb_metadata_dst(skb)))
+		return -EINVAL;
+
+	md_dst = this_cpu_ptr(xfrm_md_dst);
+
+	info = &md_dst->u.xfrm_info;
+	memset(info, 0, sizeof(*info));
+
+	info->if_id = from->if_id;
+	info->link = from->link;
+	skb_dst_force(skb);
+	info->dst_orig = skb_dst(skb);
+
+	dst_hold((struct dst_entry *)md_dst);
+	skb_dst_set(skb, (struct dst_entry *)md_dst);
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(xfrm_ifc_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
+BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
+BTF_SET8_END(xfrm_ifc_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &xfrm_ifc_kfunc_set,
+};
+
+int __init register_xfrm_interface_bpf(void)
+{
+	int err;
+
+	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
+						GFP_KERNEL);
+	if (!xfrm_md_dst)
+		return -ENOMEM;
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					&xfrm_interface_kfunc_set);
+	if (err < 0) {
+		cleanup_xfrm_interface_bpf();
+		return err;
+	}
+	return 0;
+}
+
+void __exit cleanup_xfrm_interface_bpf(void)
+{
+	metadata_dst_free_percpu(xfrm_md_dst);
+}
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 5a67b120c4db..1e1e8e965939 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -396,6 +396,14 @@  xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 
 		if_id = md_info->if_id;
 		fl->flowi_oif = md_info->link;
+		if (md_info->dst_orig) {
+			struct dst_entry *tmp_dst = dst;
+
+			dst = md_info->dst_orig;
+			skb_dst_set(skb, dst);
+			md_info->dst_orig = NULL;
+			dst_release(tmp_dst);
+		}
 	} else {
 		if_id = xi->p.if_id;
 	}
@@ -1162,12 +1170,18 @@  static int __init xfrmi_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_xfrm_interface_bpf();
+	if (err < 0)
+		goto kfunc_failed;
+
 	lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
 
 	xfrm_if_register_cb(&xfrm_if_cb);
 
 	return err;
 
+kfunc_failed:
+	rtnl_link_unregister(&xfrmi_link_ops);
 rtnl_link_failed:
 	xfrmi6_fini();
 xfrmi6_failed:
@@ -1183,6 +1197,7 @@  static void __exit xfrmi_fini(void)
 {
 	xfrm_if_unregister_cb();
 	lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
+	cleanup_xfrm_interface_bpf();
 	rtnl_link_unregister(&xfrmi_link_ops);
 	xfrmi4_fini();
 	xfrmi6_fini();