diff mbox series

[net-next,v6] vxlan: try to send a packet normally if local bypass fails

Message ID 20230405050102.15612-1-vladimir@nikishkin.pw (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v6] vxlan: try to send a packet normally if local bypass fails | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4246 this patch: 4246
netdev/cc_maintainers warning 2 maintainers not CCed: shuah@kernel.org linux-kselftest@vger.kernel.org
netdev/build_clang success Errors and warnings before: 948 this patch: 948
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4453 this patch: 4453
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Vladimir Nikishkin April 5, 2023, 5:01 a.m. UTC
In vxlan_core, if an fdb entry is pointing to a local
address with some port, the system tries to get the packet to
deliver the packet to the vxlan directly, bypassing the network
stack.

This patch makes it still try canonical delivery, if there is no
linux kernel vxlan listening on this port. This will be useful
for the cases when there is some userspace daemon expecting
vxlan packets for post-processing, or some other implementation
of vxlan.

Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw>
---
 drivers/net/vxlan/vxlan_core.c                |  29 +++--
 include/net/vxlan.h                           |   4 +-
 include/uapi/linux/if_link.h                  |   1 +
 tools/include/uapi/linux/if_link.h            |   2 +
 tools/testing/selftests/net/Makefile          |   1 +
 tools/testing/selftests/net/rtnetlink.sh      |   3 +
 .../selftests/net/test_vxlan_nolocalbypass.sh | 102 ++++++++++++++++++
 7 files changed, 135 insertions(+), 7 deletions(-)
 create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh

Comments

Paolo Abeni April 6, 2023, 10:16 a.m. UTC | #1
On Wed, 2023-04-05 at 13:01 +0800, Vladimir Nikishkin wrote:
> In vxlan_core, if an fdb entry is pointing to a local
> address with some port, the system tries to get the packet to
> deliver the packet to the vxlan directly, bypassing the network

The above sentence sounds confusing to me. Possibly:

... the system tries to deliver the packet to ...

?

> stack.
> 
> This patch makes it still try canonical delivery, if there is no
> linux kernel vxlan listening on this port. This will be useful
> for the cases when there is some userspace daemon expecting
> vxlan packets for post-processing, or some other implementation
> of vxlan.
> 
> Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw>
> ---
>  drivers/net/vxlan/vxlan_core.c                |  29 +++--
>  include/net/vxlan.h                           |   4 +-
>  include/uapi/linux/if_link.h                  |   1 +
>  tools/include/uapi/linux/if_link.h            |   2 +
>  tools/testing/selftests/net/Makefile          |   1 +
>  tools/testing/selftests/net/rtnetlink.sh      |   3 +
>  .../selftests/net/test_vxlan_nolocalbypass.sh | 102 ++++++++++++++++++
>  7 files changed, 135 insertions(+), 7 deletions(-)
>  create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
> 
> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
> index 561fe1b314f5..f9dfb179af58 100644
> --- a/drivers/net/vxlan/vxlan_core.c
> +++ b/drivers/net/vxlan/vxlan_core.c
> @@ -2341,7 +2341,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
>  				 union vxlan_addr *daddr,
>  				 __be16 dst_port, int dst_ifindex, __be32 vni,
>  				 struct dst_entry *dst,
> -				 u32 rt_flags)
> +				 u32 rt_flags, bool localbypass)

No need to add an additional argument, you can use vxlan->cfg.flags
instead

>  {
>  #if IS_ENABLED(CONFIG_IPV6)
>  	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
> @@ -2355,11 +2355,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
>  	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
>  		struct vxlan_dev *dst_vxlan;
>  
> -		dst_release(dst);
>  		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
>  					   daddr->sa.sa_family, dst_port,
>  					   vxlan->cfg.flags);
>  		if (!dst_vxlan) {
> +			if (!localbypass)
> +				return 0;

A space here would make the code more clear

> +			dst_release(dst);
>  			dev->stats.tx_errors++;
>  			vxlan_vnifilter_count(vxlan, vni, NULL,
>  					      VXLAN_VNI_STATS_TX_ERRORS, 0);
> @@ -2367,6 +2369,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
>  
>  			return -ENOENT;
>  		}
> +		dst_release(dst);
>  		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
>  		return 1;
>  	}
> @@ -2494,9 +2497,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  
>  		if (!info) {
>  			/* Bypass encapsulation if the destination is local */
> +			bool localbypass = flags & VXLAN_F_LOCALBYPASS;

The new variable is not needed (see above). Anyway you need to add an
empty line between variable declarations and code - btw strange that
checkpatch did not complain.

>  			err = encap_bypass_if_local(skb, dev, vxlan, dst,
>  						    dst_port, ifindex, vni,
> -						    &rt->dst, rt->rt_flags);
> +						    &rt->dst, rt->rt_flags,
> +						    localbypass);
>  			if (err)
>  				goto out_unlock;
>  
> @@ -2568,10 +2573,10 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  
>  		if (!info) {
>  			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
> -
> +			bool localbypass = flags & VXLAN_F_LOCALBYPASS;
>  			err = encap_bypass_if_local(skb, dev, vxlan, dst,
>  						    dst_port, ifindex, vni,
> -						    ndst, rt6i_flags);
> +						    ndst, rt6i_flags, localbypass);
>  			if (err)
>  				goto out_unlock;
>  		}
> @@ -3202,6 +3207,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>  	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
>  	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
>  	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_LOCALBYPASS]	= { .type = NLA_U8 },
>  };
>  
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
> @@ -4011,6 +4017,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
>  			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
>  	}
>  
> +	if (data[IFLA_VXLAN_LOCALBYPASS]) {
> +		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
> +				    VXLAN_F_LOCALBYPASS, changelink,
> +				    false, extack);
> +		if (err)
> +			return err;
> +	}
> +
>  	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
>  		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
>  				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
> @@ -4232,6 +4246,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
>  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
>  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
>  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
> +		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */
>  		0;
>  }
>  
> @@ -4308,7 +4323,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>  	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
>  		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
>  	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
> -		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
> +		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS,
> +		       !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)))
>  		goto nla_put_failure;
>  
>  	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 20bd7d893e10..0be91ca78d3a 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -328,6 +328,7 @@ struct vxlan_dev {
>  #define VXLAN_F_TTL_INHERIT		0x10000
>  #define VXLAN_F_VNIFILTER               0x20000
>  #define VXLAN_F_MDB			0x40000
> +#define VXLAN_F_LOCALBYPASS		0x80000
>  
>  /* Flags that are used in the receive path. These flags must match in
>   * order for a socket to be shareable
> @@ -348,7 +349,8 @@ struct vxlan_dev {
>  					 VXLAN_F_UDP_ZERO_CSUM6_TX |	\
>  					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
>  					 VXLAN_F_COLLECT_METADATA  |	\
> -					 VXLAN_F_VNIFILTER)
> +					 VXLAN_F_VNIFILTER         |    \
> +					 VXLAN_F_LOCALBYPASS)
>  
>  struct net_device *vxlan_dev_create(struct net *net, const char *name,
>  				    u8 name_assign_type, struct vxlan_config *conf);
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index 8d679688efe0..0fc56be5e19f 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -827,6 +827,7 @@ enum {
>  	IFLA_VXLAN_TTL_INHERIT,
>  	IFLA_VXLAN_DF,
>  	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
> +	IFLA_VXLAN_LOCALBYPASS,
>  	__IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
> diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
> index 39e659c83cfd..1253bd0aa90e 100644
> --- a/tools/include/uapi/linux/if_link.h
> +++ b/tools/include/uapi/linux/if_link.h
> @@ -748,6 +748,8 @@ enum {
>  	IFLA_VXLAN_GPE,
>  	IFLA_VXLAN_TTL_INHERIT,
>  	IFLA_VXLAN_DF,
> +	IFLA_VXLAN_VNIFILTER,
> +	IFLA_VXLAN_LOCALBYPASS,
>  	__IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
> index 1de34ec99290..7a9cfd0c92db 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -83,6 +83,7 @@ TEST_GEN_FILES += nat6to4.o
>  TEST_GEN_FILES += ip_local_port_range
>  TEST_GEN_FILES += bind_wildcard
>  TEST_PROGS += test_vxlan_mdb.sh
> +TEST_PROGS += test_vxlan_nolocalbypass.sh
>  
>  TEST_FILES := settings
>  
> diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
> index 383ac6fc037d..09a5ef4bd42b 100755
> --- a/tools/testing/selftests/net/rtnetlink.sh
> +++ b/tools/testing/selftests/net/rtnetlink.sh
> @@ -505,6 +505,9 @@ kci_test_encap_vxlan()
>  	ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null
>  	check_fail $?
>  
> +	ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null
> +	check_fail $?

This will fail until a new version of 'ip' is deployed, right? I would
wait before adding this test case (or will move it in the
test_vxlan_nolocalbypass.sh, after the relevant check)

> +
>  	ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null
>  	check_fail $?
>  
> diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
> new file mode 100755
> index 000000000000..efa37af2da7b
> --- /dev/null
> +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
> @@ -0,0 +1,102 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# This file is testing that the [no]localbypass option for a vxlan device is
> +# working. With the nolocalbypass option, packets to a local destination, which
> +# have no corresponding vxlan in the kernel, will be delivered to userspace, for
> +# any userspace process to process. In this test tcpdump plays the role of such a
> +# process. This is what the test 1 is checking.
> +# The test 2 checks that without the nolocalbypass (which is equivalent to the
> +# localbypass option), the packets do not reach userspace.
> +
> +EXIT_FAIL=1
> +ksft_skip=4
> +EXIT_SUCCESS=0
> +
> +if [ "$(id -u)" -ne 0 ];then
> +        echo "SKIP: Need root privileges"
> +        exit $ksft_skip;
> +fi
> +
> +if [ ! -x "$(command -v ip)" ]; then
> +        echo "SKIP: Could not run test without ip tool"
> +        exit $ksft_skip
> +fi
> +
> +if [ ! -x "$(command -v bridge)" ]; then
> +        echo "SKIP: Could not run test without bridge tool"
> +        exit $ksft_skip
> +fi

No need to check for the above tool presence. You can assume they are
present, as most net self-tests unconditionally relay on them.

> +
> +if [ ! -x "$(command -v tcpdump)" ]; then
> +        echo "SKIP: Could not run test without tcpdump tool"
> +        exit $ksft_skip
> +fi

I personally would opt for adding nft rules and checking the counters,
but no strong opinion on that.

> +
> +if [ ! -x "$(command -v grep)" ]; then
> +        echo "SKIP: Could not run test without grep tool"
> +        exit $ksft_skip
> +fi

Some self-tests check for 'grep' presence but most simply assume such
command is available. I would opt for the latter.

> +
> +ip link help vxlan 2>&1 | grep -q "localbypass"
> +if [ $? -ne 0 ]; then
> +   echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support"
> +   exit $ksft_skip
> +fi
> +
> +

Additional empty line not needed

> +packetfile=/tmp/packets-"$(uuidgen)"
> +
> +# test 1: packets going to userspace
> +rm "$packetfile"
> +ip link del dev testvxlan0
> +ip link add testvxlan0 type vxlan  \
> +  id 100 \
> +  dstport 4789 \
> +  srcport 4789 4790 \
> +  nolearning noproxy \
> +  nolocalbypass
> +ip link set up dev testvxlan0
> +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792
> +ip address add 172.16.100.1/24 dev testvxlan0
> +tcpdump -i lo 'udp and port 4792' > "$packetfile" &
> +tcpdump_pid=$!
> +timeout 5 ping -c 5 172.16.100.2
> +kill "$tcpdump_pid"
> +ip link del dev testvxlan0
> +
> +if grep -q "VXLAN" "$packetfile" ; then
> +  echo 'Positive test passed'
> +else
> +  echo 'Positive test failed'
> +  exit $EXIT_FAIL
> +fi
> +rm "$packetfile"

It's better if you additionally remove the generate file(s) in a
cleanup function set as trap for exit:

cleanup()
{
	rm -f ${packetfile}
}

trap cleanup EXIT


Cheers,

Paolo
Ido Schimmel April 8, 2023, 5:03 p.m. UTC | #2
I agree with Paolo's comments. Some more below.

On Wed, Apr 05, 2023 at 01:01:02PM +0800, Vladimir Nikishkin wrote:
> In vxlan_core, if an fdb entry is pointing to a local
> address with some port, the system tries to get the packet to
> deliver the packet to the vxlan directly, bypassing the network
> stack.
> 
> This patch makes it still try canonical delivery, if there is no
> linux kernel vxlan listening on this port. This will be useful
> for the cases when there is some userspace daemon expecting
> vxlan packets for post-processing, or some other implementation
> of vxlan.

Use "imperative mood" as described here:
https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes

Something like this:

"
If a packet needs to be encapsulated towards a local destination IP and
a VXLAN device that matches the destination port and VNI exists, then
the packet will be injected into the Rx path as if it was received by
the target VXLAN device without undergoing encapsulation. If such a
device does not exist, the packet will be dropped.

There are scenarios where we do not want to drop such packets and
instead want to let them be encapsulated and locally received by a user
space program that post-processes these VXLAN packets.

To that end, add a new VXLAN device attribute that controls whether such
packets are dropped or not. When set ("localbypass") these packets are
dropped and when unset ("nolocalbypass") the packets are encapsulated
and locally delivered to the listening user space application. Default
to "localbypass" to maintain existing behavior.
"

Assuming you agree with this description, I think it reveals a bug. The
current implementation does not actually default to "localbypass":

 # ip link add name vxlan0 up type vxlan id 1000 local 192.0.2.1 dstport 4789
 # ip -d -j -p link show dev vxlan0 | jq '.[]["linkinfo"]["info_data"]["localbypass"]'
 false

Fixed by this hunk:

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index f9dfb179af58..9c8135fd7be5 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -4023,6 +4023,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
                                    false, extack);
                if (err)
                        return err;
+       } else if (!changelink) {
+               /* default to local bypass on a new device */
+               conf->flags |= VXLAN_F_LOCALBYPASS;
        }
 
        if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {

> 
> Signed-off-by: Vladimir Nikishkin <vladimir@nikishkin.pw>

Please add the selftest in a separate commit with its own description.

[...]

> @@ -3202,6 +3207,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>  	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
>  	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
>  	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_LOCALBYPASS]	= { .type = NLA_U8 },

I suggest NLA_POLICY_MAX(NLA_U8, 1) to make sure values other than 0 and
1 are rejected in the unlikely case that we will want to utilize them in
the future.

Also, it's a good time to enable strict validation for new VXLAN
attributes:

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 9c8135fd7be5..c37face0d021 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -3177,6 +3177,7 @@ static void vxlan_raw_setup(struct net_device *dev)
 }
 
 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+       [IFLA_VXLAN_UNSPEC]     = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS },
        [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
        [IFLA_VXLAN_GROUP]      = { .len = sizeof_field(struct iphdr, daddr) },
        [IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },

>  };
>  
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
> @@ -4011,6 +4017,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
>  			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
>  	}
>  
> +	if (data[IFLA_VXLAN_LOCALBYPASS]) {
> +		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
> +				    VXLAN_F_LOCALBYPASS, changelink,
> +				    false, extack);

What is the idea behind forbidding changing this attribute? It's not
fundamental to the VXLAN device like "id" / "external" / "vnifilter". I
suggest enabling it unless you have a good reason not to. It is useful
for the selftest (see below).

> +		if (err)
> +			return err;
> +	}
> +
>  	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
>  		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
>  				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,

[...]

> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index 8d679688efe0..0fc56be5e19f 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -827,6 +827,7 @@ enum {
>  	IFLA_VXLAN_TTL_INHERIT,
>  	IFLA_VXLAN_DF,
>  	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
> +	IFLA_VXLAN_LOCALBYPASS,
>  	__IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
> diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
> index 39e659c83cfd..1253bd0aa90e 100644
> --- a/tools/include/uapi/linux/if_link.h
> +++ b/tools/include/uapi/linux/if_link.h
> @@ -748,6 +748,8 @@ enum {
>  	IFLA_VXLAN_GPE,
>  	IFLA_VXLAN_TTL_INHERIT,
>  	IFLA_VXLAN_DF,
> +	IFLA_VXLAN_VNIFILTER,
> +	IFLA_VXLAN_LOCALBYPASS,

I wasn't aware of this file (looks like others weren't as well). Not
sure you actually need to touch it (git history shows that it is synced
by those who need it), but if you do, then make sure you also copy the
comment next to the vnifilter attribute.

>  	__IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
> index 1de34ec99290..7a9cfd0c92db 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -83,6 +83,7 @@ TEST_GEN_FILES += nat6to4.o
>  TEST_GEN_FILES += ip_local_port_range
>  TEST_GEN_FILES += bind_wildcard
>  TEST_PROGS += test_vxlan_mdb.sh
> +TEST_PROGS += test_vxlan_nolocalbypass.sh
>  
>  TEST_FILES := settings
>  
> diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
> index 383ac6fc037d..09a5ef4bd42b 100755
> --- a/tools/testing/selftests/net/rtnetlink.sh
> +++ b/tools/testing/selftests/net/rtnetlink.sh
> @@ -505,6 +505,9 @@ kci_test_encap_vxlan()
>  	ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null
>  	check_fail $?
>  
> +	ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null
> +	check_fail $?
> +

This is going to fail if you are going to enable change link support
like I suggested above. I suggest removing this test and instead testing
changing this attribute in the dedicated test.

>  	ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null
>  	check_fail $?
>  
> diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
> new file mode 100755
> index 000000000000..efa37af2da7b
> --- /dev/null
> +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
> @@ -0,0 +1,102 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# This file is testing that the [no]localbypass option for a vxlan device is
> +# working. With the nolocalbypass option, packets to a local destination, which
> +# have no corresponding vxlan in the kernel, will be delivered to userspace, for
> +# any userspace process to process. In this test tcpdump plays the role of such a
> +# process. This is what the test 1 is checking.
> +# The test 2 checks that without the nolocalbypass (which is equivalent to the
> +# localbypass option), the packets do not reach userspace.
> +
> +EXIT_FAIL=1
> +ksft_skip=4
> +EXIT_SUCCESS=0
> +
> +if [ "$(id -u)" -ne 0 ];then
> +        echo "SKIP: Need root privileges"
> +        exit $ksft_skip;
> +fi
> +
> +if [ ! -x "$(command -v ip)" ]; then
> +        echo "SKIP: Could not run test without ip tool"
> +        exit $ksft_skip
> +fi
> +
> +if [ ! -x "$(command -v bridge)" ]; then
> +        echo "SKIP: Could not run test without bridge tool"
> +        exit $ksft_skip
> +fi
> +
> +if [ ! -x "$(command -v tcpdump)" ]; then
> +        echo "SKIP: Could not run test without tcpdump tool"
> +        exit $ksft_skip
> +fi
> +
> +if [ ! -x "$(command -v grep)" ]; then
> +        echo "SKIP: Could not run test without grep tool"
> +        exit $ksft_skip
> +fi
> +
> +ip link help vxlan 2>&1 | grep -q "localbypass"
> +if [ $? -ne 0 ]; then
> +   echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support"

s/bridge/ip/

> +   exit $ksft_skip
> +fi
> +
> +
> +packetfile=/tmp/packets-"$(uuidgen)"
> +
> +# test 1: packets going to userspace
> +rm "$packetfile"
> +ip link del dev testvxlan0

Try using namespaces (like in test_vxlan_mdb.sh) to avoid interfering
with the existing user environment. Also, like test_vxlan_mdb.sh, please
use the general framework of log_test() and run_cmd(). I ran your test
and although it passed, it emitted a lot of error messages (unlike other
tests):

# ./test_vxlan_nolocalbypass.sh 
rm: cannot remove '/tmp/packets-d614f8e4-549e-461e-9500-65bfea4d827e': No such file or directory
Cannot find device "testvxlan0"
PING 172.16.100.2 (172.16.100.2) 56(84) bytes of data.
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lo, link-type EN10MB (Ethernet), snapshot length 262144 bytes
From 172.16.100.1 icmp_seq=1 Destination Host Unreachable
From 172.16.100.1 icmp_seq=2 Destination Host Unreachable
From 172.16.100.1 icmp_seq=3 Destination Host Unreachable
From 172.16.100.1 icmp_seq=4 Destination Host Unreachable
8 packets captured
16 packets received by filter
0 packets dropped by kernel
Positive test passed
Cannot find device "testvxlan0"
PING 172.16.100.2 (172.16.100.2) 56(84) bytes of data.
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lo, link-type EN10MB (Ethernet), snapshot length 262144 bytes
From 172.16.100.1 icmp_seq=1 Destination Host Unreachable
From 172.16.100.1 icmp_seq=2 Destination Host Unreachable
From 172.16.100.1 icmp_seq=3 Destination Host Unreachable
0 packets captured
0 packets received by filter
0 packets dropped by kernel
Negative test passed

> +ip link add testvxlan0 type vxlan  \
> +  id 100 \
> +  dstport 4789 \
> +  srcport 4789 4790 \
> +  nolearning noproxy \
> +  nolocalbypass
> +ip link set up dev testvxlan0
> +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792
> +ip address add 172.16.100.1/24 dev testvxlan0
> +tcpdump -i lo 'udp and port 4792' > "$packetfile" &
> +tcpdump_pid=$!
> +timeout 5 ping -c 5 172.16.100.2
> +kill "$tcpdump_pid"
> +ip link del dev testvxlan0

Instead of creating and deleting a VXLAN device. I suggest the
following:

1. Create a VXLAN device without setting the new option.

2. Test that by default packets do not reach the listening user space
application.

3. Set "nolocalbypass". This is possible if you enable change link
support like I suggested above.

4. Test that packets do reach the listening user space application.

5. Set "localbypass".

6. Test that packets do not reach the listening user space application.

Regarding the "listening user space application", you can try to use
socat. See commit 8826218215de ("selftests: fib_tests: Add test cases
for interaction with mangling") for reference.

I think it's better than using tcpdump on the loopback device because
you can actually test that packets are delivered to user space.

Thanks

> +
> +if grep -q "VXLAN" "$packetfile" ; then
> +  echo 'Positive test passed'
> +else
> +  echo 'Positive test failed'
> +  exit $EXIT_FAIL
> +fi
> +rm "$packetfile"
> +
> +# test 2: old behaviour preserved
> +ip link del dev testvxlan0
> +ip link add testvxlan0 type vxlan  \
> +  id 100 \
> +  dstport 4789 \
> +  srcport 4789 4790 \
> +  nolearning noproxy \
> +  localbypass
> +ip link set up dev testvxlan0
> +bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792
> +ip address add 172.16.100.1/24 dev testvxlan0
> +tcpdump -i lo 'udp and port 4792' > "$packetfile" &
> +tcpdump_pid=$!
> +timeout 5 ping -c 5 172.16.100.2
> +kill "$tcpdump_pid"
> +ip link del dev testvxlan0
> +
> +if grep -q "VXLAN" "$packetfile" ; then
> +  echo 'Negative test failed'
> +  exit $EXIT_FAIL
> +else
> +  echo 'Negative test passed'
> +fi
> +rm "$packetfile"
> +
> +exit $EXIT_SUCCESS
> +
> -- 
> 2.35.7
> 
> --
> Fastmail.
>
diff mbox series

Patch

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 561fe1b314f5..f9dfb179af58 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2341,7 +2341,7 @@  static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 				 union vxlan_addr *daddr,
 				 __be16 dst_port, int dst_ifindex, __be32 vni,
 				 struct dst_entry *dst,
-				 u32 rt_flags)
+				 u32 rt_flags, bool localbypass)
 {
 #if IS_ENABLED(CONFIG_IPV6)
 	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
@@ -2355,11 +2355,13 @@  static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
 		struct vxlan_dev *dst_vxlan;
 
-		dst_release(dst);
 		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
 					   daddr->sa.sa_family, dst_port,
 					   vxlan->cfg.flags);
 		if (!dst_vxlan) {
+			if (!localbypass)
+				return 0;
+			dst_release(dst);
 			dev->stats.tx_errors++;
 			vxlan_vnifilter_count(vxlan, vni, NULL,
 					      VXLAN_VNI_STATS_TX_ERRORS, 0);
@@ -2367,6 +2369,7 @@  static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
 
 			return -ENOENT;
 		}
+		dst_release(dst);
 		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
 		return 1;
 	}
@@ -2494,9 +2497,11 @@  void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		if (!info) {
 			/* Bypass encapsulation if the destination is local */
+			bool localbypass = flags & VXLAN_F_LOCALBYPASS;
 			err = encap_bypass_if_local(skb, dev, vxlan, dst,
 						    dst_port, ifindex, vni,
-						    &rt->dst, rt->rt_flags);
+						    &rt->dst, rt->rt_flags,
+						    localbypass);
 			if (err)
 				goto out_unlock;
 
@@ -2568,10 +2573,10 @@  void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		if (!info) {
 			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
-
+			bool localbypass = flags & VXLAN_F_LOCALBYPASS;
 			err = encap_bypass_if_local(skb, dev, vxlan, dst,
 						    dst_port, ifindex, vni,
-						    ndst, rt6i_flags);
+						    ndst, rt6i_flags, localbypass);
 			if (err)
 				goto out_unlock;
 		}
@@ -3202,6 +3207,7 @@  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LOCALBYPASS]	= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4011,6 +4017,14 @@  static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
 	}
 
+	if (data[IFLA_VXLAN_LOCALBYPASS]) {
+		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
+				    VXLAN_F_LOCALBYPASS, changelink,
+				    false, extack);
+		if (err)
+			return err;
+	}
+
 	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
 		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
 				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
@@ -4232,6 +4246,7 @@  static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
+		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */
 		0;
 }
 
@@ -4308,7 +4323,9 @@  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
 		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
-		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
+		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS,
+		       !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)))
 		goto nla_put_failure;
 
 	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 20bd7d893e10..0be91ca78d3a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -328,6 +328,7 @@  struct vxlan_dev {
 #define VXLAN_F_TTL_INHERIT		0x10000
 #define VXLAN_F_VNIFILTER               0x20000
 #define VXLAN_F_MDB			0x40000
+#define VXLAN_F_LOCALBYPASS		0x80000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -348,7 +349,8 @@  struct vxlan_dev {
 					 VXLAN_F_UDP_ZERO_CSUM6_TX |	\
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_COLLECT_METADATA  |	\
-					 VXLAN_F_VNIFILTER)
+					 VXLAN_F_VNIFILTER         |    \
+					 VXLAN_F_LOCALBYPASS)
 
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d679688efe0..0fc56be5e19f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -827,6 +827,7 @@  enum {
 	IFLA_VXLAN_TTL_INHERIT,
 	IFLA_VXLAN_DF,
 	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
+	IFLA_VXLAN_LOCALBYPASS,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 39e659c83cfd..1253bd0aa90e 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -748,6 +748,8 @@  enum {
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
 	IFLA_VXLAN_DF,
+	IFLA_VXLAN_VNIFILTER,
+	IFLA_VXLAN_LOCALBYPASS,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 1de34ec99290..7a9cfd0c92db 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -83,6 +83,7 @@  TEST_GEN_FILES += nat6to4.o
 TEST_GEN_FILES += ip_local_port_range
 TEST_GEN_FILES += bind_wildcard
 TEST_PROGS += test_vxlan_mdb.sh
+TEST_PROGS += test_vxlan_nolocalbypass.sh
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 383ac6fc037d..09a5ef4bd42b 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -505,6 +505,9 @@  kci_test_encap_vxlan()
 	ip -netns "$testns" link set dev "$vxlan" type vxlan udpcsum 2>/dev/null
 	check_fail $?
 
+	ip -netns "$testns" link set dev "$vxlan" type vxlan nolocalbypass 2>/dev/null
+	check_fail $?
+
 	ip -netns "$testns" link set dev "$vxlan" type vxlan udp6zerocsumtx 2>/dev/null
 	check_fail $?
 
diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
new file mode 100755
index 000000000000..efa37af2da7b
--- /dev/null
+++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
@@ -0,0 +1,102 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This file is testing that the [no]localbypass option for a vxlan device is
+# working. With the nolocalbypass option, packets to a local destination, which
+# have no corresponding vxlan in the kernel, will be delivered to userspace, for
+# any userspace process to process. In this test tcpdump plays the role of such a
+# process. This is what the test 1 is checking.
+# The test 2 checks that without the nolocalbypass (which is equivalent to the
+# localbypass option), the packets do not reach userspace.
+
+EXIT_FAIL=1
+ksft_skip=4
+EXIT_SUCCESS=0
+
+if [ "$(id -u)" -ne 0 ];then
+        echo "SKIP: Need root privileges"
+        exit $ksft_skip;
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+        echo "SKIP: Could not run test without ip tool"
+        exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v bridge)" ]; then
+        echo "SKIP: Could not run test without bridge tool"
+        exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v tcpdump)" ]; then
+        echo "SKIP: Could not run test without tcpdump tool"
+        exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v grep)" ]; then
+        echo "SKIP: Could not run test without grep tool"
+        exit $ksft_skip
+fi
+
+ip link help vxlan 2>&1 | grep -q "localbypass"
+if [ $? -ne 0 ]; then
+   echo "SKIP: iproute2 bridge too old, missing VXLAN nolocalbypass support"
+   exit $ksft_skip
+fi
+
+
+packetfile=/tmp/packets-"$(uuidgen)"
+
+# test 1: packets going to userspace
+rm "$packetfile"
+ip link del dev testvxlan0
+ip link add testvxlan0 type vxlan  \
+  id 100 \
+  dstport 4789 \
+  srcport 4789 4790 \
+  nolearning noproxy \
+  nolocalbypass
+ip link set up dev testvxlan0
+bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792
+ip address add 172.16.100.1/24 dev testvxlan0
+tcpdump -i lo 'udp and port 4792' > "$packetfile" &
+tcpdump_pid=$!
+timeout 5 ping -c 5 172.16.100.2
+kill "$tcpdump_pid"
+ip link del dev testvxlan0
+
+if grep -q "VXLAN" "$packetfile" ; then
+  echo 'Positive test passed'
+else
+  echo 'Positive test failed'
+  exit $EXIT_FAIL
+fi
+rm "$packetfile"
+
+# test 2: old behaviour preserved
+ip link del dev testvxlan0
+ip link add testvxlan0 type vxlan  \
+  id 100 \
+  dstport 4789 \
+  srcport 4789 4790 \
+  nolearning noproxy \
+  localbypass
+ip link set up dev testvxlan0
+bridge fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792
+ip address add 172.16.100.1/24 dev testvxlan0
+tcpdump -i lo 'udp and port 4792' > "$packetfile" &
+tcpdump_pid=$!
+timeout 5 ping -c 5 172.16.100.2
+kill "$tcpdump_pid"
+ip link del dev testvxlan0
+
+if grep -q "VXLAN" "$packetfile" ; then
+  echo 'Negative test failed'
+  exit $EXIT_FAIL
+else
+  echo 'Negative test passed'
+fi
+rm "$packetfile"
+
+exit $EXIT_SUCCESS
+