diff mbox series

[RFC,net-next,v2,02/10] ipv4: Add a sysctl to control multipath hash fields

Message ID 20210509151615.200608-3-idosch@idosch.org (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Add support for custom multipath hash | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 9 maintainers not CCed: dsahern@kernel.org yoshfuji@linux-ipv6.org andreas.a.roeseler@gmail.com linux-doc@vger.kernel.org edumazet@google.com corbet@lwn.net fw@strlen.de amcohen@nvidia.com weiwan@google.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 5577 this patch: 5577
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 5642 this patch: 5642
netdev/header_inline success Link

Commit Message

Ido Schimmel May 9, 2021, 3:16 p.m. UTC
From: Ido Schimmel <idosch@nvidia.com>

A subsequent patch will add a new multipath hash policy where the packet
fields used for multipath hash calculation are determined by user space.
This patch adds a sysctl that allows user space to set these fields.

The packet fields are represented using a bitmask and are common between
IPv4 and IPv6 to allow user space to use the same numbering across both
protocols. For example, to hash based on standard 5-tuple:

 # sysctl -w net.ipv4.fib_multipath_hash_fields=0x0037
 net.ipv4.fib_multipath_hash_fields = 0x0037

The kernel rejects unknown fields, for example:

 # sysctl -w net.ipv4.fib_multipath_hash_fields=0x1000
 sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument

More fields can be added in the future, if needed.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
---
 Documentation/networking/ip-sysctl.rst | 27 ++++++++++++++++
 include/net/ip_fib.h                   | 43 ++++++++++++++++++++++++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/fib_frontend.c                |  6 ++++
 net/ipv4/sysctl_net_ipv4.c             | 11 +++++++
 5 files changed, 88 insertions(+)

Comments

David Ahern May 11, 2021, 3:10 p.m. UTC | #1
On 5/9/21 9:16 AM, Ido Schimmel wrote:
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index c2ecc9894fd0..15982f830abc 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER
>  	- 1 - Layer 4
>  	- 2 - Layer 3 or inner Layer 3 if present
>  
> +fib_multipath_hash_fields - UNSIGNED INTEGER
> +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
> +	fields used for multipath hash calculation are determined by this
> +	sysctl.
> +
> +	This value is a bitmask which enables various fields for multipath hash
> +	calculation.
> +
> +	Possible fields are:
> +
> +	====== ============================
> +	0x0001 Source IP address
> +	0x0002 Destination IP address
> +	0x0004 IP protocol
> +	0x0008 Unused

Document that this bit is flowlabel for IPv6 and ignored for ipv4.
David Ahern May 11, 2021, 3:49 p.m. UTC | #2
On 5/9/21 9:16 AM, Ido Schimmel wrote:
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index a62934b9f15a..da627c4d633a 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -19,6 +19,7 @@
>  #include <net/snmp.h>
>  #include <net/icmp.h>
>  #include <net/ip.h>
> +#include <net/ip_fib.h>
>  #include <net/route.h>
>  #include <net/tcp.h>
>  #include <net/udp.h>
> @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 };
>  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
>  static u32 u32_max_div_HZ = UINT_MAX / HZ;
>  static int one_day_secs = 24 * 3600;
> +static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
> +	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
>  
>  /* obsolete */
>  static int sysctl_tcp_low_latency __read_mostly;
> @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = {
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= &two,
>  	},
> +	{
> +		.procname	= "fib_multipath_hash_fields",
> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
> +		.maxlen		= sizeof(u32),
> +		.mode		= 0644,
> +		.proc_handler	= proc_douintvec_minmax,
> +		.extra2		= &fib_multipath_hash_fields_all_mask,

no .extra1 means 0 is allowed which effectively disables hashing and
multipath selection; only the first leg will be used. Is that intended?



> +	},
>  #endif
>  	{
>  		.procname	= "ip_unprivileged_port_start",
>
Ido Schimmel May 11, 2021, 7:58 p.m. UTC | #3
On Tue, May 11, 2021 at 09:10:46AM -0600, David Ahern wrote:
> On 5/9/21 9:16 AM, Ido Schimmel wrote:
> > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> > index c2ecc9894fd0..15982f830abc 100644
> > --- a/Documentation/networking/ip-sysctl.rst
> > +++ b/Documentation/networking/ip-sysctl.rst
> > @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER
> >  	- 1 - Layer 4
> >  	- 2 - Layer 3 or inner Layer 3 if present
> >  
> > +fib_multipath_hash_fields - UNSIGNED INTEGER
> > +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
> > +	fields used for multipath hash calculation are determined by this
> > +	sysctl.
> > +
> > +	This value is a bitmask which enables various fields for multipath hash
> > +	calculation.
> > +
> > +	Possible fields are:
> > +
> > +	====== ============================
> > +	0x0001 Source IP address
> > +	0x0002 Destination IP address
> > +	0x0004 IP protocol
> > +	0x0008 Unused
> 
> Document that this bit is flowlabel for IPv6 and ignored for ipv4.

OK
Ido Schimmel May 11, 2021, 8:05 p.m. UTC | #4
On Tue, May 11, 2021 at 09:49:30AM -0600, David Ahern wrote:
> On 5/9/21 9:16 AM, Ido Schimmel wrote:
> > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> > index a62934b9f15a..da627c4d633a 100644
> > --- a/net/ipv4/sysctl_net_ipv4.c
> > +++ b/net/ipv4/sysctl_net_ipv4.c
> > @@ -19,6 +19,7 @@
> >  #include <net/snmp.h>
> >  #include <net/icmp.h>
> >  #include <net/ip.h>
> > +#include <net/ip_fib.h>
> >  #include <net/route.h>
> >  #include <net/tcp.h>
> >  #include <net/udp.h>
> > @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 };
> >  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
> >  static u32 u32_max_div_HZ = UINT_MAX / HZ;
> >  static int one_day_secs = 24 * 3600;
> > +static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
> > +	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
> >  
> >  /* obsolete */
> >  static int sysctl_tcp_low_latency __read_mostly;
> > @@ -1052,6 +1055,14 @@ static struct ctl_table ipv4_net_table[] = {
> >  		.extra1		= SYSCTL_ZERO,
> >  		.extra2		= &two,
> >  	},
> > +	{
> > +		.procname	= "fib_multipath_hash_fields",
> > +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
> > +		.maxlen		= sizeof(u32),
> > +		.mode		= 0644,
> > +		.proc_handler	= proc_douintvec_minmax,
> > +		.extra2		= &fib_multipath_hash_fields_all_mask,
> 
> no .extra1 means 0 is allowed which effectively disables hashing and
> multipath selection; only the first leg will be used. Is that intended?

I didn't see any reason to forbid it, but I don't see any reason to use
it with '0' either. With this patch:

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 81343037de06..4fa77f182dcb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1078,6 +1078,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(u32),
 		.mode		= 0644,
 		.proc_handler	= proc_fib_multipath_hash_fields,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &fib_multipath_hash_fields_all_mask,
 	},
 #endif

We get:

# sysctl -w net.ipv4.fib_multipath_hash_fields=0
sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument

I assume you want to see this change in the next version (and for IPv6)?

> 
> 
> 
> > +	},
> >  #endif
> >  	{
> >  		.procname	= "ip_unprivileged_port_start",
> > 
>
David Ahern May 17, 2021, 2:47 p.m. UTC | #5
On 5/11/21 2:05 PM, Ido Schimmel wrote:
> 
> # sysctl -w net.ipv4.fib_multipath_hash_fields=0
> sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument
> 
> I assume you want to see this change in the next version (and for IPv6)?
> 

Yes, I do not know of any reason to allow the hash policy to disable
multipath routes.
diff mbox series

Patch

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index c2ecc9894fd0..15982f830abc 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -100,6 +100,33 @@  fib_multipath_hash_policy - INTEGER
 	- 1 - Layer 4
 	- 2 - Layer 3 or inner Layer 3 if present
 
+fib_multipath_hash_fields - UNSIGNED INTEGER
+	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
+	fields used for multipath hash calculation are determined by this
+	sysctl.
+
+	This value is a bitmask which enables various fields for multipath hash
+	calculation.
+
+	Possible fields are:
+
+	====== ============================
+	0x0001 Source IP address
+	0x0002 Destination IP address
+	0x0004 IP protocol
+	0x0008 Unused
+	0x0010 Source port
+	0x0020 Destination port
+	0x0040 Inner source IP address
+	0x0080 Inner destination IP address
+	0x0100 Inner IP protocol
+	0x0200 Inner Flow Label
+	0x0400 Inner source port
+	0x0800 Inner destination port
+	====== ============================
+
+	Default: 0x0007 (source IP, destination IP and IP protocol)
+
 fib_sync_mem - UNSIGNED INTEGER
 	Amount of dirty memory from fib entries that can be backlogged before
 	synchronize_rcu is forced.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a914f33f3ed5..3ab2563b1a23 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -466,6 +466,49 @@  int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);
 
+/* Fields used for sysctl_fib_multipath_hash_fields.
+ * Common to IPv4 and IPv6.
+ *
+ * Add new fields at the end. This is user API.
+ */
+#define FIB_MULTIPATH_HASH_FIELD_SRC_IP			BIT(0)
+#define FIB_MULTIPATH_HASH_FIELD_DST_IP			BIT(1)
+#define FIB_MULTIPATH_HASH_FIELD_IP_PROTO		BIT(2)
+#define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL		BIT(3)
+#define FIB_MULTIPATH_HASH_FIELD_SRC_PORT		BIT(4)
+#define FIB_MULTIPATH_HASH_FIELD_DST_PORT		BIT(5)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP		BIT(6)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP		BIT(7)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO		BIT(8)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL	BIT(9)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT		BIT(10)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT		BIT(11)
+
+#define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO |		\
+	 FIB_MULTIPATH_HASH_FIELD_FLOWLABEL |		\
+	 FIB_MULTIPATH_HASH_FIELD_SRC_PORT |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_INNER_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_ALL_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_OUTER_MASK |		\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_MASK)
+
+#define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK		\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |		\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f6af8d96d3c6..746c80cd4257 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -210,6 +210,7 @@  struct netns_ipv4 {
 #endif
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+	u32 sysctl_fib_multipath_hash_fields;
 	u8 sysctl_fib_multipath_use_neigh;
 	u8 sysctl_fib_multipath_hash_policy;
 #endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 84bb707bd88d..129213b7d834 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1516,6 +1516,12 @@  static int __net_init ip_fib_net_init(struct net *net)
 	if (err)
 		return err;
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	/* Default to 3-tuple */
+	net->ipv4.sysctl_fib_multipath_hash_fields =
+		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+#endif
+
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a62934b9f15a..da627c4d633a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -19,6 +19,7 @@ 
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
+#include <net/ip_fib.h>
 #include <net/route.h>
 #include <net/tcp.h>
 #include <net/udp.h>
@@ -48,6 +49,8 @@  static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 static u32 u32_max_div_HZ = UINT_MAX / HZ;
 static int one_day_secs = 24 * 3600;
+static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
+	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1052,6 +1055,14 @@  static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+	{
+		.procname	= "fib_multipath_hash_fields",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra2		= &fib_multipath_hash_fields_all_mask,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",