[bpf-next,v4,2/4] xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF

Message ID 20221202095920.1659332-3-eyal.birger@gmail.com (mailing list archive)
State New
Series xfrm: interface: Add unstable helpers for XFRM metadata

Commit Message

Eyal Birger Dec. 2, 2022, 9:59 a.m. UTC
This change adds xfrm metadata helpers using the unstable kfunc call
interface for the TC-BPF hooks. This allows steering traffic towards
different IPsec connections based on logic implemented in bpf programs.
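
For illustration, a minimal sketch of a SCHED_CLS program calling these
kfuncs; the program name, section name and if_id value are made up, and the
kfunc declarations are local since they are resolved against kernel BTF when
the program is loaded:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

struct bpf_xfrm_info {
	__u32 if_id;
	int link;
};

/* Unstable kfuncs exposed by this series (these declarations are an
 * assumption of how a program would reference them, not part of this patch).
 */
extern int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
				 const struct bpf_xfrm_info *from) __ksym;
extern int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx,
				 struct bpf_xfrm_info *to) __ksym;

SEC("tc")
int steer_to_ipsec(struct __sk_buff *skb)
{
	/* Hypothetical if_id; real steering logic would pick it per flow. */
	struct bpf_xfrm_info info = { .if_id = 42, .link = 0 };

	if (bpf_skb_set_xfrm_info(skb, &info) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";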

This object is only built when BTF debug info is available.

The metadata percpu dsts used on TX take ownership of the original skb
dsts so that they may be used as part of the xfrm transmission logic -
e.g.  for MTU calculations.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>

---

v4: changes requested by Martin KaFai Lau:
  - add kfunc documentation
  - remove redundant memset
  - minor coding style changes

v3:
  - remove redundant memset() as suggested by Martin KaFai Lau
  - remove __exit annotation from cleanup() function as it's used from
    an __init function

v2: changes added following points raised by Martin KaFai Lau:
  - make sure dst is refcounted prior to caching
  - free dst_orig regardless of CONFIG_DST_CACHE
  - call xfrm interface bpf cleanup in case of kfunc registration errors
---
 include/net/dst_metadata.h     |   1 +
 include/net/xfrm.h             |  20 ++++++
 net/core/dst.c                 |   8 ++-
 net/xfrm/Makefile              |   6 ++
 net/xfrm/xfrm_interface_bpf.c  | 123 +++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_interface_core.c |  15 ++++
 6 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 net/xfrm/xfrm_interface_bpf.c

Comments

Martin KaFai Lau Dec. 2, 2022, 7:08 p.m. UTC | #1
On 12/2/22 1:59 AM, Eyal Birger wrote:
> +__used noinline
> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> +			  const struct bpf_xfrm_info *from)
> +{
> +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> +	struct metadata_dst *md_dst;
> +	struct xfrm_md_info *info;
> +
> +	if (unlikely(skb_metadata_dst(skb)))
> +		return -EINVAL;
> +
> +	md_dst = this_cpu_ptr(xfrm_md_dst);
> +
> +	info = &md_dst->u.xfrm_info;
> +
> +	info->if_id = from->if_id;
> +	info->link = from->link;
> +	skb_dst_force(skb);
> +	info->dst_orig = skb_dst(skb);
> +
> +	dst_hold((struct dst_entry *)md_dst);
> +	skb_dst_set(skb, (struct dst_entry *)md_dst);


I may have missed something obvious and this just came to my mind,

What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the 
md_dst?

[ ... ]

> +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &xfrm_ifc_kfunc_set,
> +};
> +
> +int __init register_xfrm_interface_bpf(void)
> +{
> +	int err;
> +
> +	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
> +						GFP_KERNEL);
> +	if (!xfrm_md_dst)
> +		return -ENOMEM;
> +	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
> +					&xfrm_interface_kfunc_set);
> +	if (err < 0) {
> +		metadata_dst_free_percpu(xfrm_md_dst);
> +		return err;
> +	}
> +	return 0;
> +}
> +
> +void cleanup_xfrm_interface_bpf(void)
> +{
> +	metadata_dst_free_percpu(xfrm_md_dst);
> +}
Eyal Birger Dec. 2, 2022, 7:42 p.m. UTC | #2
Hi Martin,

On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/2/22 1:59 AM, Eyal Birger wrote:
> > +__used noinline
> > +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> > +                       const struct bpf_xfrm_info *from)
> > +{
> > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > +     struct metadata_dst *md_dst;
> > +     struct xfrm_md_info *info;
> > +
> > +     if (unlikely(skb_metadata_dst(skb)))
> > +             return -EINVAL;
> > +
> > +     md_dst = this_cpu_ptr(xfrm_md_dst);
> > +
> > +     info = &md_dst->u.xfrm_info;
> > +
> > +     info->if_id = from->if_id;
> > +     info->link = from->link;
> > +     skb_dst_force(skb);
> > +     info->dst_orig = skb_dst(skb);
> > +
> > +     dst_hold((struct dst_entry *)md_dst);
> > +     skb_dst_set(skb, (struct dst_entry *)md_dst);
>
>
> I may be missed something obvious and this just came to my mind,
>
> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
> md_dst?
>
Oh I think you're right. I missed this.

In order to keep this implementation I suppose it means that the module would
not be allowed to be removed while this kfunc is in use, but this could be seen
as annoying from a configuration user-experience standpoint.

Alternatively the metadata dsts can be separately allocated from the kfunc,
which is probably the simplest approach to maintain, so I'll work on that
approach.

Thanks for noticing this!
Eyal.
Martin KaFai Lau Dec. 2, 2022, 8:27 p.m. UTC | #3
On 12/2/22 11:42 AM, Eyal Birger wrote:
> Hi Martin,
> 
> On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 12/2/22 1:59 AM, Eyal Birger wrote:
>>> +__used noinline
>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
>>> +                       const struct bpf_xfrm_info *from)
>>> +{
>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
>>> +     struct metadata_dst *md_dst;
>>> +     struct xfrm_md_info *info;
>>> +
>>> +     if (unlikely(skb_metadata_dst(skb)))
>>> +             return -EINVAL;
>>> +
>>> +     md_dst = this_cpu_ptr(xfrm_md_dst);
>>> +
>>> +     info = &md_dst->u.xfrm_info;
>>> +
>>> +     info->if_id = from->if_id;
>>> +     info->link = from->link;
>>> +     skb_dst_force(skb);
>>> +     info->dst_orig = skb_dst(skb);
>>> +
>>> +     dst_hold((struct dst_entry *)md_dst);
>>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
>>
>>
>> I may be missed something obvious and this just came to my mind,
>>
>> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
>> md_dst?
>>
> Oh I think you're right. I missed this.
> 
> In order to keep this implementation I suppose it means that the module would
> not be allowed to be removed upon use of this kfunc. but this could be seen as
> annoying from the configuration user experience.
> 
> Alternatively the metadata dsts can be separately allocated from the kfunc,
> which is probably the simplest approach to maintain, so I'll work on that
> approach.

If it means dst_alloc on every skb, it will not be cheap.

Another option is to call metadata_dst_alloc_percpu() once during the very first
bpf_skb_set_xfrm_info() call, and then the xfrm_md_dst memory will never be freed.
It is a tradeoff but likely the correct one. You can take a look at
bpf_get_skb_set_tunnel_proto().
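
For illustration, a rough sketch of that allocate-once-and-never-free idea
applied to xfrm_md_dst; the helper name and the GFP_ATOMIC choice are
assumptions (the kfunc runs from TC context), not code from this series:

static struct metadata_dst __percpu *xfrm_md_dst;

/* Allocate the per-cpu metadata dst on first use and never free it, so an
 * in-flight skb can keep pointing at it regardless of module lifetime.
 */
static struct metadata_dst *xfrm_md_dst_get(void)
{
	struct metadata_dst __percpu *tmp;

	if (likely(xfrm_md_dst))
		return this_cpu_ptr(xfrm_md_dst);

	tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC);
	if (!tmp)
		return NULL;

	/* Publish once; a concurrent loser frees its copy (roughly the idiom
	 * bpf_get_skb_set_tunnel_proto() uses in net/core/filter.c).
	 */
	if (cmpxchg(&xfrm_md_dst, NULL, tmp))
		metadata_dst_free_percpu(tmp);

	return this_cpu_ptr(xfrm_md_dst);
}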
Eyal Birger Dec. 2, 2022, 8:49 p.m. UTC | #4
On Fri, Dec 2, 2022 at 10:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/2/22 11:42 AM, Eyal Birger wrote:
> > Hi Martin,
> >
> > On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 12/2/22 1:59 AM, Eyal Birger wrote:
> >>> +__used noinline
> >>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> >>> +                       const struct bpf_xfrm_info *from)
> >>> +{
> >>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> >>> +     struct metadata_dst *md_dst;
> >>> +     struct xfrm_md_info *info;
> >>> +
> >>> +     if (unlikely(skb_metadata_dst(skb)))
> >>> +             return -EINVAL;
> >>> +
> >>> +     md_dst = this_cpu_ptr(xfrm_md_dst);
> >>> +
> >>> +     info = &md_dst->u.xfrm_info;
> >>> +
> >>> +     info->if_id = from->if_id;
> >>> +     info->link = from->link;
> >>> +     skb_dst_force(skb);
> >>> +     info->dst_orig = skb_dst(skb);
> >>> +
> >>> +     dst_hold((struct dst_entry *)md_dst);
> >>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
> >>
> >>
> >> I may be missed something obvious and this just came to my mind,
> >>
> >> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
> >> md_dst?
> >>
> > Oh I think you're right. I missed this.
> >
> > In order to keep this implementation I suppose it means that the module would
> > not be allowed to be removed upon use of this kfunc. but this could be seen as
> > annoying from the configuration user experience.
> >
> > Alternatively the metadata dsts can be separately allocated from the kfunc,
> > which is probably the simplest approach to maintain, so I'll work on that
> > approach.
>
> If it means dst_alloc on every skb, it will not be cheap.
>
> Another option is to metadata_dst_alloc_percpu() once during the very first
> bpf_skb_set_xfrm_info() call and the xfrm_md_dst memory will never be freed.  It
> is a tradeoff but likely the correct one.  You can take a look at
> bpf_get_skb_set_tunnel_proto().
>

Yes, I originally wrote this as a helper similar to the tunnel key
helper, which uses bpf_get_skb_set_tunnel_proto(), and when converting
to kfuncs I kept the percpu implementation.

However, the set tunnel key code is never unloaded, whereas taking this
approach here would mean that this memory would leak on each module reload,
if I understand correctly.

Eyal.
Martin KaFai Lau Dec. 2, 2022, 9:27 p.m. UTC | #5
On 12/2/22 12:49 PM, Eyal Birger wrote:
> On Fri, Dec 2, 2022 at 10:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 12/2/22 11:42 AM, Eyal Birger wrote:
>>> Hi Martin,
>>>
>>> On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>
>>>> On 12/2/22 1:59 AM, Eyal Birger wrote:
>>>>> +__used noinline
>>>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
>>>>> +                       const struct bpf_xfrm_info *from)
>>>>> +{
>>>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
>>>>> +     struct metadata_dst *md_dst;
>>>>> +     struct xfrm_md_info *info;
>>>>> +
>>>>> +     if (unlikely(skb_metadata_dst(skb)))
>>>>> +             return -EINVAL;
>>>>> +
>>>>> +     md_dst = this_cpu_ptr(xfrm_md_dst);
>>>>> +
>>>>> +     info = &md_dst->u.xfrm_info;
>>>>> +
>>>>> +     info->if_id = from->if_id;
>>>>> +     info->link = from->link;
>>>>> +     skb_dst_force(skb);
>>>>> +     info->dst_orig = skb_dst(skb);
>>>>> +
>>>>> +     dst_hold((struct dst_entry *)md_dst);
>>>>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
>>>>
>>>>
>>>> I may be missed something obvious and this just came to my mind,
>>>>
>>>> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
>>>> md_dst?
>>>>
>>> Oh I think you're right. I missed this.
>>>
>>> In order to keep this implementation I suppose it means that the module would
>>> not be allowed to be removed upon use of this kfunc. but this could be seen as
>>> annoying from the configuration user experience.
>>>
>>> Alternatively the metadata dsts can be separately allocated from the kfunc,
>>> which is probably the simplest approach to maintain, so I'll work on that
>>> approach.
>>
>> If it means dst_alloc on every skb, it will not be cheap.
>>
>> Another option is to metadata_dst_alloc_percpu() once during the very first
>> bpf_skb_set_xfrm_info() call and the xfrm_md_dst memory will never be freed.  It
>> is a tradeoff but likely the correct one.  You can take a look at
>> bpf_get_skb_set_tunnel_proto().
>>
> 
> Yes, I originally wrote this as a helper similar to the tunnel key
> helper which uses bpf_get_skb_set_tunnel_proto(), and when converting
> to kfuncs I kept the
> percpu implementation.
> 
> However, the set tunnel key code is never unloaded. Whereas taking this
> approach here would mean that this memory would leak on each module reload
> iiuc.

'struct metadata_dst __percpu *xfrm_md_dst' cannot be in the xfrm module. 
filter.c could be an option.
Eyal Birger Dec. 3, 2022, 3:55 a.m. UTC | #6
Hi Martin,

On Fri, Dec 2, 2022 at 11:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 12/2/22 12:49 PM, Eyal Birger wrote:
> > On Fri, Dec 2, 2022 at 10:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 12/2/22 11:42 AM, Eyal Birger wrote:
> >>> Hi Martin,
> >>>
> >>> On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 12/2/22 1:59 AM, Eyal Birger wrote:
> >>>>> +__used noinline
> >>>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> >>>>> +                       const struct bpf_xfrm_info *from)
> >>>>> +{
> >>>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> >>>>> +     struct metadata_dst *md_dst;
> >>>>> +     struct xfrm_md_info *info;
> >>>>> +
> >>>>> +     if (unlikely(skb_metadata_dst(skb)))
> >>>>> +             return -EINVAL;
> >>>>> +
> >>>>> +     md_dst = this_cpu_ptr(xfrm_md_dst);
> >>>>> +
> >>>>> +     info = &md_dst->u.xfrm_info;
> >>>>> +
> >>>>> +     info->if_id = from->if_id;
> >>>>> +     info->link = from->link;
> >>>>> +     skb_dst_force(skb);
> >>>>> +     info->dst_orig = skb_dst(skb);
> >>>>> +
> >>>>> +     dst_hold((struct dst_entry *)md_dst);
> >>>>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
> >>>>
> >>>>
> >>>> I may be missed something obvious and this just came to my mind,
> >>>>
> >>>> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
> >>>> md_dst?
> >>>>
> >>> Oh I think you're right. I missed this.
> >>>
> >>> In order to keep this implementation I suppose it means that the module would
> >>> not be allowed to be removed upon use of this kfunc. but this could be seen as
> >>> annoying from the configuration user experience.
> >>>
> >>> Alternatively the metadata dsts can be separately allocated from the kfunc,
> >>> which is probably the simplest approach to maintain, so I'll work on that
> >>> approach.
> >>
> >> If it means dst_alloc on every skb, it will not be cheap.
> >>
> >> Another option is to metadata_dst_alloc_percpu() once during the very first
> >> bpf_skb_set_xfrm_info() call and the xfrm_md_dst memory will never be freed.  It
> >> is a tradeoff but likely the correct one.  You can take a look at
> >> bpf_get_skb_set_tunnel_proto().
> >>
> >
> > Yes, I originally wrote this as a helper similar to the tunnel key
> > helper which uses bpf_get_skb_set_tunnel_proto(), and when converting
> > to kfuncs I kept the
> > percpu implementation.
> >
> > However, the set tunnel key code is never unloaded. Whereas taking this
> > approach here would mean that this memory would leak on each module reload
> > iiuc.
>
> 'struct metadata_dst __percpu *xfrm_md_dst' cannot be in the xfrm module.
> filter.c could be an option.

Looking at it some more, won't the module reference taken by the kfunc btf
guarantee that the module can't be unloaded while the kfunc is used by a
loaded program?

I tried this using a synthetic test attaching the program to a dummy interface
and the module couldn't be unloaded while the program was loaded.

In such case, is it possible for the memory to be freed while there are in-use
percpu metadata dsts?

Eyal.
Eyal Birger Dec. 3, 2022, 7:35 a.m. UTC | #7
On Sat, Dec 3, 2022 at 5:55 AM Eyal Birger <eyal.birger@gmail.com> wrote:
>
> Hi Martin,
>
> On Fri, Dec 2, 2022 at 11:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 12/2/22 12:49 PM, Eyal Birger wrote:
> > > On Fri, Dec 2, 2022 at 10:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>
> > >> On 12/2/22 11:42 AM, Eyal Birger wrote:
> > >>> Hi Martin,
> > >>>
> > >>> On Fri, Dec 2, 2022 at 9:08 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>>>
> > >>>> On 12/2/22 1:59 AM, Eyal Birger wrote:
> > >>>>> +__used noinline
> > >>>>> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> > >>>>> +                       const struct bpf_xfrm_info *from)
> > >>>>> +{
> > >>>>> +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > >>>>> +     struct metadata_dst *md_dst;
> > >>>>> +     struct xfrm_md_info *info;
> > >>>>> +
> > >>>>> +     if (unlikely(skb_metadata_dst(skb)))
> > >>>>> +             return -EINVAL;
> > >>>>> +
> > >>>>> +     md_dst = this_cpu_ptr(xfrm_md_dst);
> > >>>>> +
> > >>>>> +     info = &md_dst->u.xfrm_info;
> > >>>>> +
> > >>>>> +     info->if_id = from->if_id;
> > >>>>> +     info->link = from->link;
> > >>>>> +     skb_dst_force(skb);
> > >>>>> +     info->dst_orig = skb_dst(skb);
> > >>>>> +
> > >>>>> +     dst_hold((struct dst_entry *)md_dst);
> > >>>>> +     skb_dst_set(skb, (struct dst_entry *)md_dst);
> > >>>>
> > >>>>
> > >>>> I may be missed something obvious and this just came to my mind,
> > >>>>
> > >>>> What stops cleanup_xfrm_interface_bpf() being run while skb is still holding the
> > >>>> md_dst?
> > >>>>
> > >>> Oh I think you're right. I missed this.
> > >>>
> > >>> In order to keep this implementation I suppose it means that the module would
> > >>> not be allowed to be removed upon use of this kfunc. but this could be seen as
> > >>> annoying from the configuration user experience.
> > >>>
> > >>> Alternatively the metadata dsts can be separately allocated from the kfunc,
> > >>> which is probably the simplest approach to maintain, so I'll work on that
> > >>> approach.
> > >>
> > >> If it means dst_alloc on every skb, it will not be cheap.
> > >>
> > >> Another option is to metadata_dst_alloc_percpu() once during the very first
> > >> bpf_skb_set_xfrm_info() call and the xfrm_md_dst memory will never be freed.  It
> > >> is a tradeoff but likely the correct one.  You can take a look at
> > >> bpf_get_skb_set_tunnel_proto().
> > >>
> > >
> > > Yes, I originally wrote this as a helper similar to the tunnel key
> > > helper which uses bpf_get_skb_set_tunnel_proto(), and when converting
> > > to kfuncs I kept the
> > > percpu implementation.
> > >
> > > However, the set tunnel key code is never unloaded. Whereas taking this
> > > approach here would mean that this memory would leak on each module reload
> > > iiuc.
> >
> > 'struct metadata_dst __percpu *xfrm_md_dst' cannot be in the xfrm module.
> > filter.c could be an option.
>
> Looking at it some more, won't the module reference taken by the kfunc btf
> guarantee that the module can't be unloaded while the kfunc is used by a
> loaded program?
>
> I tried this using a synthetic test attaching the program to a dummy interface
> and the module couldn't be unloaded while the program was loaded.
>
> In such case, is it possible for the memory to be freed while there are in-use
> percpu metadata dsts?

Decided to err on the side of caution and avoid releasing the percpu
dsts. It seems unlikely that the module could be unloaded while there are
in-flight skbs pointing to the percpu memory, but it's safer not to rely on
this, and the cost is rather minimal, so I agree this is the correct tradeoff.

Eyal.

Patch

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index a454cf4327fe..1b7fae4c6b24 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -26,6 +26,7 @@  struct macsec_info {
 struct xfrm_md_info {
 	u32 if_id;
 	int link;
+	struct dst_entry *dst_orig;
 };
 
 struct metadata_dst {
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e0cc6791c001..5e5fea3087b6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2086,4 +2086,24 @@  static inline bool xfrm6_local_dontfrag(const struct sock *sk)
 	return false;
 }
 #endif
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_xfrm_interface_bpf(void);
+extern void cleanup_xfrm_interface_bpf(void);
+
+#else
+
+static inline int register_xfrm_interface_bpf(void)
+{
+	return 0;
+}
+
+static inline void cleanup_xfrm_interface_bpf(void)
+{
+}
+
+#endif
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/core/dst.c b/net/core/dst.c
index bc9c9be4e080..bb14a0392388 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -316,6 +316,8 @@  void metadata_dst_free(struct metadata_dst *md_dst)
 	if (md_dst->type == METADATA_IP_TUNNEL)
 		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
 #endif
+	if (md_dst->type == METADATA_XFRM)
+		dst_release(md_dst->u.xfrm_info.dst_orig);
 	kfree(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free);
@@ -340,16 +342,18 @@  EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
 
 void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
 {
-#ifdef CONFIG_DST_CACHE
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
 
+#ifdef CONFIG_DST_CACHE
 		if (one_md_dst->type == METADATA_IP_TUNNEL)
 			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
-	}
 #endif
+		if (one_md_dst->type == METADATA_XFRM)
+			dst_release(one_md_dst->u.xfrm_info.dst_orig);
+	}
 	free_percpu(md_dst);
 }
 EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 08a2870fdd36..cd47f88921f5 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -5,6 +5,12 @@ 
 
 xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o
 
+ifeq ($(CONFIG_XFRM_INTERFACE),m)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o
+else ifeq ($(CONFIG_XFRM_INTERFACE),y)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o
+endif
+
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
 		      xfrm_input.o xfrm_output.o \
 		      xfrm_sysctl.o xfrm_replay.o xfrm_device.o
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
new file mode 100644
index 000000000000..de281847c5f1
--- /dev/null
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -0,0 +1,123 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_info - XFRM metadata information
+ *
+ * Members:
+ * @if_id	- XFRM if_id:
+ *		    Transmit: if_id to be used in policy and state lookups
+ *		    Receive: if_id of the state matched for the incoming packet
+ * @link	- Underlying device ifindex:
+ *		    Transmit: used as the underlying device in VRF routing
+ *		    Receive: the device on which the packet had been received
+ */
+struct bpf_xfrm_info {
+	u32 if_id;
+	int link;
+};
+
+static struct metadata_dst __percpu *xfrm_md_dst;
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in xfrm_interface BTF");
+
+/* bpf_skb_get_xfrm_info - Get XFRM metadata
+ *
+ * Parameters:
+ * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
+ *		    Cannot be NULL
+ * @to		- Pointer to memory to which the metadata will be copied
+ *		    Cannot be NULL
+ */
+__used noinline
+int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct xfrm_md_info *info;
+
+	info = skb_xfrm_md_info(skb);
+	if (!info)
+		return -EINVAL;
+
+	to->if_id = info->if_id;
+	to->link = info->link;
+	return 0;
+}
+
+/* bpf_skb_set_xfrm_info - Set XFRM metadata
+ *
+ * Parameters:
+ * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
+ *		    Cannot be NULL
+ * @from	- Pointer to memory from which the metadata will be copied
+ *		    Cannot be NULL
+ */
+__used noinline
+int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
+			  const struct bpf_xfrm_info *from)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct metadata_dst *md_dst;
+	struct xfrm_md_info *info;
+
+	if (unlikely(skb_metadata_dst(skb)))
+		return -EINVAL;
+
+	md_dst = this_cpu_ptr(xfrm_md_dst);
+
+	info = &md_dst->u.xfrm_info;
+
+	info->if_id = from->if_id;
+	info->link = from->link;
+	skb_dst_force(skb);
+	info->dst_orig = skb_dst(skb);
+
+	dst_hold((struct dst_entry *)md_dst);
+	skb_dst_set(skb, (struct dst_entry *)md_dst);
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(xfrm_ifc_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
+BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
+BTF_SET8_END(xfrm_ifc_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &xfrm_ifc_kfunc_set,
+};
+
+int __init register_xfrm_interface_bpf(void)
+{
+	int err;
+
+	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
+						GFP_KERNEL);
+	if (!xfrm_md_dst)
+		return -ENOMEM;
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					&xfrm_interface_kfunc_set);
+	if (err < 0) {
+		metadata_dst_free_percpu(xfrm_md_dst);
+		return err;
+	}
+	return 0;
+}
+
+void cleanup_xfrm_interface_bpf(void)
+{
+	metadata_dst_free_percpu(xfrm_md_dst);
+}
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 5a67b120c4db..1e1e8e965939 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -396,6 +396,14 @@  xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 
 		if_id = md_info->if_id;
 		fl->flowi_oif = md_info->link;
+		if (md_info->dst_orig) {
+			struct dst_entry *tmp_dst = dst;
+
+			dst = md_info->dst_orig;
+			skb_dst_set(skb, dst);
+			md_info->dst_orig = NULL;
+			dst_release(tmp_dst);
+		}
 	} else {
 		if_id = xi->p.if_id;
 	}
@@ -1162,12 +1170,18 @@  static int __init xfrmi_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_xfrm_interface_bpf();
+	if (err < 0)
+		goto kfunc_failed;
+
 	lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
 
 	xfrm_if_register_cb(&xfrm_if_cb);
 
 	return err;
 
+kfunc_failed:
+	rtnl_link_unregister(&xfrmi_link_ops);
 rtnl_link_failed:
 	xfrmi6_fini();
 xfrmi6_failed:
@@ -1183,6 +1197,7 @@  static void __exit xfrmi_fini(void)
 {
 	xfrm_if_unregister_cb();
 	lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
+	cleanup_xfrm_interface_bpf();
 	rtnl_link_unregister(&xfrmi_link_ops);
 	xfrmi4_fini();
 	xfrmi6_fini();