diff mbox series

[bpf,v2,1/3] bpf_redirect_neigh: Support supplying the nexthop as a helper parameter

Message ID 160319106221.15822.2629789706666194966.stgit@toke.dk (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller | expand

Commit Message

Toke Høiland-Jørgensen Oct. 20, 2020, 10:51 a.m. UTC
From: Toke Høiland-Jørgensen <toke@redhat.com>

Based on the discussion in [0], update the bpf_redirect_neigh() helper to
accept an optional parameter specifying the nexthop information. This makes
it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without
incurring a duplicate FIB lookup - since the FIB lookup helper will return
the nexthop information even if no neighbour is present, this can simply be
passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns
BPF_FIB_LKUP_RET_NO_NEIGH.

[0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/filter.h         |    9 ++
 include/uapi/linux/bpf.h       |   24 +++++-
 net/core/filter.c              |  163 +++++++++++++++++++++++++---------------
 scripts/bpf_helpers_doc.py     |    1 
 tools/include/uapi/linux/bpf.h |   24 +++++-
 5 files changed, 153 insertions(+), 68 deletions(-)

Comments

Daniel Borkmann Oct. 20, 2020, 3:08 p.m. UTC | #1
On 10/20/20 12:51 PM, Toke Høiland-Jørgensen wrote:
> From: Toke Høiland-Jørgensen <toke@redhat.com>
[...]
>   BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
> @@ -2455,8 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb)
>   		return -EAGAIN;
>   	}
>   	return flags & BPF_F_NEIGH ?
> -	       __bpf_redirect_neigh(skb, dev) :
> -	       __bpf_redirect(skb, dev, flags);
> +		__bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) :
> +		__bpf_redirect(skb, dev, flags);
>   out_drop:
>   	kfree_skb(skb);
>   	return -EINVAL;
> @@ -2504,16 +2536,25 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
>   	.arg2_type      = ARG_ANYTHING,
>   };
>   
> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
> +	   int, plen, u64, flags)
>   {
>   	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>   
> -	if (unlikely(flags))
> +	if (unlikely((plen && plen < sizeof(*params)) || flags))
> +		return TC_ACT_SHOT;
> +
> +	if (unlikely(plen && (params->unused[0] || params->unused[1] ||
> +			      params->unused[2])))

small nit: maybe fold this into the prior check that already tests non-zero plen

if (unlikely((plen && (plen < sizeof(*params) ||
                        (params->unused[0] | params->unused[1] |
                         params->unused[2]))) || flags))
         return TC_ACT_SHOT;

>   		return TC_ACT_SHOT;
>   
> -	ri->flags = BPF_F_NEIGH;
> +	ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
>   	ri->tgt_index = ifindex;
>   
> +	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
> +	if (plen)
> +		memcpy(&ri->nh, params, sizeof(ri->nh));
> +
>   	return TC_ACT_REDIRECT;
>   }
>
Jakub Kicinski Oct. 20, 2020, 4:30 p.m. UTC | #2
On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 20fc24c9779a..ba9de7188cd0 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -607,12 +607,21 @@ struct bpf_skb_data_end {
>  	void *data_end;
>  };
>  
> +struct bpf_nh_params {
> +	u8 nh_family;
> +	union {
> +		__u32 ipv4_nh;
> +		struct in6_addr ipv6_nh;
> +	};
> +};

> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup {
>  	__u8	dmac[6];     /* ETH_ALEN */
>  };
>  
> +struct bpf_redir_neigh {
> +	/* network family for lookup (AF_INET, AF_INET6) */
> +	__u8 nh_family;
> +	 /* avoid hole in struct - must be set to 0 */
> +	__u8 unused[3];
> +	/* network address of nexthop; skips fib lookup to find gateway */
> +	union {
> +		__be32		ipv4_nh;
> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
> +	};
> +};

Isn't this backward? The hole could be named in the internal structure.
This is a bit of a gray area, but if you name this hole in uAPI and
programs start referring to it you will never be able to reuse it.
So you may as well not require it to be zeroed..
Jakub Kicinski Oct. 20, 2020, 4:34 p.m. UTC | #3
On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
> +struct bpf_nh_params {
> +	u8 nh_family;
> +	union {
> +		__u32 ipv4_nh;
> +		struct in6_addr ipv6_nh;
> +	};
> +};

Folks, not directly related to this set, but there's a SRv6 patch going
around which adds ifindex, otherwise nh can't be link local.

I wonder if we want to consider this use case from the start (or the
close approximation of start in this case ;)).
Toke Høiland-Jørgensen Oct. 20, 2020, 6:03 p.m. UTC | #4
Jakub Kicinski <kuba@kernel.org> writes:

> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
>> +struct bpf_nh_params {
>> +	u8 nh_family;
>> +	union {
>> +		__u32 ipv4_nh;
>> +		struct in6_addr ipv6_nh;
>> +	};
>> +};
>
> Folks, not directly related to this set, but there's a SRv6 patch going
> around which adds ifindex, otherwise nh can't be link local.
>
> I wonder if we want to consider this use case from the start (or the
> close approximation of start in this case ;)).

The ifindex is there, it's just in the function call signature instead
of the struct... Or did you mean something different?

-Toke
Toke Høiland-Jørgensen Oct. 20, 2020, 6:08 p.m. UTC | #5
Jakub Kicinski <kuba@kernel.org> writes:

> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 20fc24c9779a..ba9de7188cd0 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -607,12 +607,21 @@ struct bpf_skb_data_end {
>>  	void *data_end;
>>  };
>>  
>> +struct bpf_nh_params {
>> +	u8 nh_family;
>> +	union {
>> +		__u32 ipv4_nh;
>> +		struct in6_addr ipv6_nh;
>> +	};
>> +};
>
>> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup {
>>  	__u8	dmac[6];     /* ETH_ALEN */
>>  };
>>  
>> +struct bpf_redir_neigh {
>> +	/* network family for lookup (AF_INET, AF_INET6) */
>> +	__u8 nh_family;
>> +	 /* avoid hole in struct - must be set to 0 */
>> +	__u8 unused[3];
>> +	/* network address of nexthop; skips fib lookup to find gateway */
>> +	union {
>> +		__be32		ipv4_nh;
>> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
>> +	};
>> +};
>
> Isn't this backward? The hole could be named in the internal structure.
> This is a bit of a gray area, but if you name this hole in uAPI and
> programs start referring to it you will never be able to reuse it.
> So you may as well not require it to be zeroed..

Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part
of the memory from being unitialised anyway? I seem to recall having run
into verifier complaints when I didn't initialise struct on the stack...

-Toke
Toke Høiland-Jørgensen Oct. 20, 2020, 6:08 p.m. UTC | #6
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 10/20/20 12:51 PM, Toke Høiland-Jørgensen wrote:
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
> [...]
>>   BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
>> @@ -2455,8 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb)
>>   		return -EAGAIN;
>>   	}
>>   	return flags & BPF_F_NEIGH ?
>> -	       __bpf_redirect_neigh(skb, dev) :
>> -	       __bpf_redirect(skb, dev, flags);
>> +		__bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) :
>> +		__bpf_redirect(skb, dev, flags);
>>   out_drop:
>>   	kfree_skb(skb);
>>   	return -EINVAL;
>> @@ -2504,16 +2536,25 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
>>   	.arg2_type      = ARG_ANYTHING,
>>   };
>>   
>> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
>> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
>> +	   int, plen, u64, flags)
>>   {
>>   	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>>   
>> -	if (unlikely(flags))
>> +	if (unlikely((plen && plen < sizeof(*params)) || flags))
>> +		return TC_ACT_SHOT;
>> +
>> +	if (unlikely(plen && (params->unused[0] || params->unused[1] ||
>> +			      params->unused[2])))
>
> small nit: maybe fold this into the prior check that already tests non-zero plen
>
> if (unlikely((plen && (plen < sizeof(*params) ||
>                         (params->unused[0] | params->unused[1] |
>                          params->unused[2]))) || flags))
>          return TC_ACT_SHOT;

Well that was my first thought as well, but I thought it was uglier.
Isn't the compiler smart enough to make those two equivalent?

Anyway, given Jakub's comment, I guess this is moot anyway, as we should
just get rid of the member, no?

-Toke
David Ahern Oct. 20, 2020, 6:12 p.m. UTC | #7
On 10/20/20 10:30 AM, Jakub Kicinski wrote:
> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 20fc24c9779a..ba9de7188cd0 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -607,12 +607,21 @@ struct bpf_skb_data_end {
>>  	void *data_end;
>>  };
>>  
>> +struct bpf_nh_params {
>> +	u8 nh_family;
>> +	union {
>> +		__u32 ipv4_nh;
>> +		struct in6_addr ipv6_nh;
>> +	};
>> +};
> 
>> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup {
>>  	__u8	dmac[6];     /* ETH_ALEN */
>>  };
>>  
>> +struct bpf_redir_neigh {
>> +	/* network family for lookup (AF_INET, AF_INET6) */
>> +	__u8 nh_family;
>> +	 /* avoid hole in struct - must be set to 0 */
>> +	__u8 unused[3];
>> +	/* network address of nexthop; skips fib lookup to find gateway */
>> +	union {
>> +		__be32		ipv4_nh;
>> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
>> +	};
>> +};
> 
> Isn't this backward? The hole could be named in the internal structure.
> This is a bit of a gray area, but if you name this hole in uAPI and
> programs start referring to it you will never be able to reuse it.
> So you may as well not require it to be zeroed..
> 

for uapi naming the holes, stating they are unused and requiring a 0
value allows them to be used later if an api change needs to.
David Ahern Oct. 20, 2020, 6:14 p.m. UTC | #8
On 10/20/20 12:03 PM, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <kuba@kernel.org> writes:
> 
>> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:
>>> +struct bpf_nh_params {
>>> +	u8 nh_family;
>>> +	union {
>>> +		__u32 ipv4_nh;
>>> +		struct in6_addr ipv6_nh;
>>> +	};
>>> +};
>>
>> Folks, not directly related to this set, but there's a SRv6 patch going
>> around which adds ifindex, otherwise nh can't be link local.
>>
>> I wonder if we want to consider this use case from the start (or the
>> close approximation of start in this case ;)).
> 
> The ifindex is there, it's just in the function call signature instead
> of the struct... Or did you mean something different?
> 

ifindex as the first argument qualifies the device for the address.
Jakub Kicinski Oct. 20, 2020, 6:50 p.m. UTC | #9
On Tue, 20 Oct 2020 12:14:16 -0600 David Ahern wrote:
> On 10/20/20 12:03 PM, Toke Høiland-Jørgensen wrote:
> > Jakub Kicinski <kuba@kernel.org> writes:
> >> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:  
> >>> +struct bpf_nh_params {
> >>> +	u8 nh_family;
> >>> +	union {
> >>> +		__u32 ipv4_nh;
> >>> +		struct in6_addr ipv6_nh;
> >>> +	};
> >>> +};  
> >>
> >> Folks, not directly related to this set, but there's a SRv6 patch going
> >> around which adds ifindex, otherwise nh can't be link local.
> >>
> >> I wonder if we want to consider this use case from the start (or the
> >> close approximation of start in this case ;)).  
> > 
> > The ifindex is there, it's just in the function call signature instead
> > of the struct... Or did you mean something different?
> 
> ifindex as the first argument qualifies the device for the address.

Ah, I should have read closer. Seeing there is a plen I assumed all
args would naturally be in the structure, but I'm guessing the case
where params are NULL will be quite common. Don't mind me.
Jakub Kicinski Oct. 20, 2020, 6:56 p.m. UTC | #10
On Tue, 20 Oct 2020 12:12:32 -0600 David Ahern wrote:
> On 10/20/20 10:30 AM, Jakub Kicinski wrote:
> > On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote:  
> >> diff --git a/include/linux/filter.h b/include/linux/filter.h
> >> index 20fc24c9779a..ba9de7188cd0 100644
> >> --- a/include/linux/filter.h
> >> +++ b/include/linux/filter.h
> >> @@ -607,12 +607,21 @@ struct bpf_skb_data_end {
> >>  	void *data_end;
> >>  };
> >>  
> >> +struct bpf_nh_params {
> >> +	u8 nh_family;
> >> +	union {
> >> +		__u32 ipv4_nh;
> >> +		struct in6_addr ipv6_nh;
> >> +	};
> >> +};  
> >   
> >> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup {
> >>  	__u8	dmac[6];     /* ETH_ALEN */
> >>  };
> >>  
> >> +struct bpf_redir_neigh {
> >> +	/* network family for lookup (AF_INET, AF_INET6) */
> >> +	__u8 nh_family;
> >> +	 /* avoid hole in struct - must be set to 0 */
> >> +	__u8 unused[3];
> >> +	/* network address of nexthop; skips fib lookup to find gateway */
> >> +	union {
> >> +		__be32		ipv4_nh;
> >> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
> >> +	};
> >> +};  
> > 
> > Isn't this backward? The hole could be named in the internal structure.
> > This is a bit of a gray area, but if you name this hole in uAPI and
> > programs start referring to it you will never be able to reuse it.
> > So you may as well not require it to be zeroed..
> 
> for uapi naming the holes, stating they are unused and requiring a 0
> value allows them to be used later if an api change needs to.

I'm not sure what you're saying, if the field is referenced it can't be
removed. But we could use a union, so I guess it's not a deal breaker.
Jakub Kicinski Oct. 20, 2020, 7:01 p.m. UTC | #11
On Tue, 20 Oct 2020 20:08:18 +0200 Toke Høiland-Jørgensen wrote:
> > Isn't this backward? The hole could be named in the internal structure.
> > This is a bit of a gray area, but if you name this hole in uAPI and
> > programs start referring to it you will never be able to reuse it.
> > So you may as well not require it to be zeroed..  
> 
> Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part
> of the memory from being unitialised anyway? I seem to recall having run
> into verifier complaints when I didn't initialise struct on the stack...

Good point, in which case we have a convenient way to zero the hole
after nh_family but no convenient way to zero the empty address space
for IPv4 :) (even though that one only needs to be zeroed for the
verifier)
Daniel Borkmann Oct. 20, 2020, 7:47 p.m. UTC | #12
On 10/20/20 9:01 PM, Jakub Kicinski wrote:
> On Tue, 20 Oct 2020 20:08:18 +0200 Toke Høiland-Jørgensen wrote:
>>> Isn't this backward? The hole could be named in the internal structure.
>>> This is a bit of a gray area, but if you name this hole in uAPI and
>>> programs start referring to it you will never be able to reuse it.
>>> So you may as well not require it to be zeroed..
>>
>> Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part
>> of the memory from being unitialised anyway? I seem to recall having run
>> into verifier complaints when I didn't initialise struct on the stack...
> 
> Good point, in which case we have a convenient way to zero the hole
> after nh_family but no convenient way to zero the empty address space
> for IPv4 :) (even though that one only needs to be zeroed for the
> verifier)

Technically, it's uninitialized, so zero or any other garbage from BPF stack's
previous use of the program. We could use couple of __u8 :8 after nh_family to
have an unnamed placeholder (like in __bpf_md_ptr()), or we might as well just
switch to __u32 nh_family and avoid the hole that way (also gets rid of the extra
check) ... given we have the liberty to extend later anyway if ever needed.
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 20fc24c9779a..ba9de7188cd0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -607,12 +607,21 @@  struct bpf_skb_data_end {
 	void *data_end;
 };
 
+struct bpf_nh_params {
+	u8 nh_family;
+	union {
+		__u32 ipv4_nh;
+		struct in6_addr ipv6_nh;
+	};
+};
+
 struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
 	void *tgt_value;
 	struct bpf_map *map;
 	u32 kern_flags;
+	struct bpf_nh_params nh;
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bf5a99d803e4..9668cde9d684 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3677,15 +3677,19 @@  union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
- * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*
  * 		and fill in L2 addresses from neighboring subsystem. This helper
  * 		is somewhat similar to **bpf_redirect**\ (), except that it
  * 		populates L2 addresses as well, meaning, internally, the helper
- * 		performs a FIB lookup based on the skb's networking header to
- * 		get the address of the next hop and then relies on the neighbor
- * 		lookup for the L2 address of the nexthop.
+ * 		relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ * 		The helper will perform a FIB lookup based on the skb's
+ * 		networking header to get the address of the next hop, unless
+ * 		this is supplied by the caller in the *params* argument. The
+ * 		*plen* argument indicates the len of *params* and should be set
+ * 		to 0 if *params* is NULL.
  *
  * 		The *flags* argument is reserved and must be 0. The helper is
  * 		currently only supported for tc BPF program types, and enabled
@@ -4906,6 +4910,18 @@  struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+struct bpf_redir_neigh {
+	/* network family for lookup (AF_INET, AF_INET6) */
+	__u8 nh_family;
+	 /* avoid hole in struct - must be set to 0 */
+	__u8 unused[3];
+	/* network address of nexthop; skips fib lookup to find gateway */
+	union {
+		__be32		ipv4_nh;
+		__u32		ipv6_nh[4];  /* in6_addr; network order */
+	};
+};
+
 enum bpf_task_fd_type {
 	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
 	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
diff --git a/net/core/filter.c b/net/core/filter.c
index c5e2a1c5fd8d..fa09b4f141ae 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2165,12 +2165,12 @@  static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+			    struct net_device *dev, struct bpf_nh_params *nh)
 {
-	struct dst_entry *dst = skb_dst(skb);
-	struct net_device *dev = dst->dev;
 	u32 hh_len = LL_RESERVED_SPACE(dev);
 	const struct in6_addr *nexthop;
+	struct dst_entry *dst = NULL;
 	struct neighbour *neigh;
 
 	if (dev_xmit_recursion()) {
@@ -2196,8 +2196,13 @@  static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
-			      &ipv6_hdr(skb)->daddr);
+	if (!nh) {
+		dst = skb_dst(skb);
+		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+				      &ipv6_hdr(skb)->daddr);
+	} else {
+		nexthop = &nh->ipv6_nh;
+	}
 	neigh = ip_neigh_gw6(dev, nexthop);
 	if (likely(!IS_ERR(neigh))) {
 		int ret;
@@ -2210,36 +2215,43 @@  static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
 		return ret;
 	}
 	rcu_read_unlock_bh();
-	IP6_INC_STATS(dev_net(dst->dev),
-		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	if (dst)
+		IP6_INC_STATS(dev_net(dst->dev),
+			      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 out_drop:
 	kfree_skb(skb);
 	return -ENETDOWN;
 }
 
-static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+				   struct bpf_nh_params *nh)
 {
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	struct net *net = dev_net(dev);
 	int err, ret = NET_XMIT_DROP;
-	struct dst_entry *dst;
-	struct flowi6 fl6 = {
-		.flowi6_flags	= FLOWI_FLAG_ANYSRC,
-		.flowi6_mark	= skb->mark,
-		.flowlabel	= ip6_flowinfo(ip6h),
-		.flowi6_oif	= dev->ifindex,
-		.flowi6_proto	= ip6h->nexthdr,
-		.daddr		= ip6h->daddr,
-		.saddr		= ip6h->saddr,
-	};
 
-	dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
-	if (IS_ERR(dst))
-		goto out_drop;
+	if (!nh) {
+		struct dst_entry *dst;
+		struct flowi6 fl6 = {
+			.flowi6_flags = FLOWI_FLAG_ANYSRC,
+			.flowi6_mark  = skb->mark,
+			.flowlabel    = ip6_flowinfo(ip6h),
+			.flowi6_oif   = dev->ifindex,
+			.flowi6_proto = ip6h->nexthdr,
+			.daddr	      = ip6h->daddr,
+			.saddr	      = ip6h->saddr,
+		};
+
+		dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+		if (IS_ERR(dst))
+			goto out_drop;
 
-	skb_dst_set(skb, dst);
+		skb_dst_set(skb, dst);
+	} else if (nh->nh_family != AF_INET6) {
+		goto out_drop;
+	}
 
-	err = bpf_out_neigh_v6(net, skb);
+	err = bpf_out_neigh_v6(net, skb, dev, nh);
 	if (unlikely(net_xmit_eval(err)))
 		dev->stats.tx_errors++;
 	else
@@ -2252,7 +2264,8 @@  static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 #else
-static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+				   struct bpf_nh_params *nh)
 {
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
@@ -2260,11 +2273,9 @@  static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
 #endif /* CONFIG_IPV6 */
 
 #if IS_ENABLED(CONFIG_INET)
-static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+			    struct net_device *dev, struct bpf_nh_params *nh)
 {
-	struct dst_entry *dst = skb_dst(skb);
-	struct rtable *rt = container_of(dst, struct rtable, dst);
-	struct net_device *dev = dst->dev;
 	u32 hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
 	bool is_v6gw = false;
@@ -2292,7 +2303,21 @@  static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+	if (!nh) {
+		struct dst_entry *dst = skb_dst(skb);
+		struct rtable *rt = container_of(dst, struct rtable, dst);
+
+		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+	} else if (nh->nh_family == AF_INET6) {
+		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+		is_v6gw = true;
+	} else if (nh->nh_family == AF_INET) {
+		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+	} else {
+		rcu_read_unlock_bh();
+		goto out_drop;
+	}
+
 	if (likely(!IS_ERR(neigh))) {
 		int ret;
 
@@ -2309,33 +2334,37 @@  static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
 	return -ENETDOWN;
 }
 
-static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+				   struct bpf_nh_params *nh)
 {
 	const struct iphdr *ip4h = ip_hdr(skb);
 	struct net *net = dev_net(dev);
 	int err, ret = NET_XMIT_DROP;
-	struct rtable *rt;
-	struct flowi4 fl4 = {
-		.flowi4_flags	= FLOWI_FLAG_ANYSRC,
-		.flowi4_mark	= skb->mark,
-		.flowi4_tos	= RT_TOS(ip4h->tos),
-		.flowi4_oif	= dev->ifindex,
-		.flowi4_proto	= ip4h->protocol,
-		.daddr		= ip4h->daddr,
-		.saddr		= ip4h->saddr,
-	};
 
-	rt = ip_route_output_flow(net, &fl4, NULL);
-	if (IS_ERR(rt))
-		goto out_drop;
-	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
-		ip_rt_put(rt);
-		goto out_drop;
-	}
+	if (!nh) {
+		struct flowi4 fl4 = {
+			.flowi4_flags = FLOWI_FLAG_ANYSRC,
+			.flowi4_mark  = skb->mark,
+			.flowi4_tos   = RT_TOS(ip4h->tos),
+			.flowi4_oif   = dev->ifindex,
+			.flowi4_proto = ip4h->protocol,
+			.daddr	      = ip4h->daddr,
+			.saddr	      = ip4h->saddr,
+		};
+		struct rtable *rt;
+
+		rt = ip_route_output_flow(net, &fl4, NULL);
+		if (IS_ERR(rt))
+			goto out_drop;
+		if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+			ip_rt_put(rt);
+			goto out_drop;
+		}
 
-	skb_dst_set(skb, &rt->dst);
+		skb_dst_set(skb, &rt->dst);
+	}
 
-	err = bpf_out_neigh_v4(net, skb);
+	err = bpf_out_neigh_v4(net, skb, dev, nh);
 	if (unlikely(net_xmit_eval(err)))
 		dev->stats.tx_errors++;
 	else
@@ -2348,14 +2377,16 @@  static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 #else
-static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+				   struct bpf_nh_params *nh)
 {
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
 }
 #endif /* CONFIG_INET */
 
-static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+				struct bpf_nh_params *nh)
 {
 	struct ethhdr *ethh = eth_hdr(skb);
 
@@ -2370,9 +2401,9 @@  static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
 	skb_reset_network_header(skb);
 
 	if (skb->protocol == htons(ETH_P_IP))
-		return __bpf_redirect_neigh_v4(skb, dev);
+		return __bpf_redirect_neigh_v4(skb, dev, nh);
 	else if (skb->protocol == htons(ETH_P_IPV6))
-		return __bpf_redirect_neigh_v6(skb, dev);
+		return __bpf_redirect_neigh_v6(skb, dev, nh);
 out:
 	kfree_skb(skb);
 	return -ENOTSUPP;
@@ -2382,7 +2413,8 @@  static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
 enum {
 	BPF_F_NEIGH	= (1ULL << 1),
 	BPF_F_PEER	= (1ULL << 2),
-#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER)
+	BPF_F_NEXTHOP	= (1ULL << 3),
+#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
 };
 
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2455,8 +2487,8 @@  int skb_do_redirect(struct sk_buff *skb)
 		return -EAGAIN;
 	}
 	return flags & BPF_F_NEIGH ?
-	       __bpf_redirect_neigh(skb, dev) :
-	       __bpf_redirect(skb, dev, flags);
+		__bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) :
+		__bpf_redirect(skb, dev, flags);
 out_drop:
 	kfree_skb(skb);
 	return -EINVAL;
@@ -2504,16 +2536,25 @@  static const struct bpf_func_proto bpf_redirect_peer_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+	   int, plen, u64, flags)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-	if (unlikely(flags))
+	if (unlikely((plen && plen < sizeof(*params)) || flags))
+		return TC_ACT_SHOT;
+
+	if (unlikely(plen && (params->unused[0] || params->unused[1] ||
+			      params->unused[2])))
 		return TC_ACT_SHOT;
 
-	ri->flags = BPF_F_NEIGH;
+	ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
 	ri->tgt_index = ifindex;
 
+	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+	if (plen)
+		memcpy(&ri->nh, params, sizeof(ri->nh));
+
 	return TC_ACT_REDIRECT;
 }
 
@@ -2522,7 +2563,9 @@  static const struct bpf_func_proto bpf_redirect_neigh_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_ANYTHING,
-	.arg2_type	= ARG_ANYTHING,
+	.arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
+	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
 };
 
 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 7d86fdd190be..6769caae142f 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -453,6 +453,7 @@  class PrinterHelpers(Printer):
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
+            'struct bpf_redir_neigh',
             'struct bpf_sk_lookup',
             'struct bpf_sock',
             'struct bpf_sock_addr',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bf5a99d803e4..9668cde9d684 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3677,15 +3677,19 @@  union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
- * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*
  * 		and fill in L2 addresses from neighboring subsystem. This helper
  * 		is somewhat similar to **bpf_redirect**\ (), except that it
  * 		populates L2 addresses as well, meaning, internally, the helper
- * 		performs a FIB lookup based on the skb's networking header to
- * 		get the address of the next hop and then relies on the neighbor
- * 		lookup for the L2 address of the nexthop.
+ * 		relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ * 		The helper will perform a FIB lookup based on the skb's
+ * 		networking header to get the address of the next hop, unless
+ * 		this is supplied by the caller in the *params* argument. The
+ * 		*plen* argument indicates the len of *params* and should be set
+ * 		to 0 if *params* is NULL.
  *
  * 		The *flags* argument is reserved and must be 0. The helper is
  * 		currently only supported for tc BPF program types, and enabled
@@ -4906,6 +4910,18 @@  struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+struct bpf_redir_neigh {
+	/* network family for lookup (AF_INET, AF_INET6) */
+	__u8 nh_family;
+	 /* avoid hole in struct - must be set to 0 */
+	__u8 unused[3];
+	/* network address of nexthop; skips fib lookup to find gateway */
+	union {
+		__be32		ipv4_nh;
+		__u32		ipv6_nh[4];  /* in6_addr; network order */
+	};
+};
+
 enum bpf_task_fd_type {
 	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
 	BPF_FD_TYPE_TRACEPOINT,		/* tp name */