diff mbox series

[net-next,06/16] net: bridge: Add a tracepoint for MDB overflows

Message ID ed2e2e305dd49423745b62c0152a0b85bc84a767.1674752051.git.petrm@nvidia.com (mailing list archive)
State Superseded
Headers show
Series None | expand

Commit Message

Petr Machata Jan. 26, 2023, 5:01 p.m. UTC
The following patch will add two more maximum MDB allowances to the global
one, mcast_hash_max, that exists today. In all these cases, attempts to add
MDB entries above the configured maximums through netlink, fail noisily and
obviously. Such visibility is missing when adding entries through the
control plane traffic, by IGMP or MLD packets.

To improve visibility in those cases, add a trace point that reports the
violation, including the relevant netdevice (be it a slave or the bridge
itself), and the MDB entry parameters:

	# perf record -e bridge:br_mdb_full &
	# [...]
	# perf script | cut -d: -f4-
	 dev v2 af 2 src 192.0.2.1/:: grp 239.1.1.1/::/00:00:00:00:00:00 vid 0
	 dev v2 af 10 src 0.0.0.0/2001:db8:1::1 grp 0.0.0.0/ff0e::1/00:00:00:00:00:00 vid 0
	 dev v2 af 2 src 192.0.2.1/:: grp 239.1.1.1/::/00:00:00:00:00:00 vid 10
	 dev v2 af 10 src 0.0.0.0/2001:db8:1::1 grp 0.0.0.0/ff0e::1/00:00:00:00:00:00 vid 10

CC: Steven Rostedt <rostedt@goodmis.org>
CC: linux-trace-kernel@vger.kernel.org
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
---
 include/trace/events/bridge.h | 67 +++++++++++++++++++++++++++++++++++
 net/core/net-traces.c         |  1 +
 2 files changed, 68 insertions(+)

Comments

Steven Rostedt Jan. 26, 2023, 5:53 p.m. UTC | #1
On Thu, 26 Jan 2023 18:01:14 +0100
Petr Machata <petrm@nvidia.com> wrote:

> The following patch will add two more maximum MDB allowances to the global
> one, mcast_hash_max, that exists today. In all these cases, attempts to add
> MDB entries above the configured maximums through netlink, fail noisily and
> obviously. Such visibility is missing when adding entries through the
> control plane traffic, by IGMP or MLD packets.
> 
> To improve visibility in those cases, add a trace point that reports the
> violation, including the relevant netdevice (be it a slave or the bridge
> itself), and the MDB entry parameters:
> 
> 	# perf record -e bridge:br_mdb_full &
> 	# [...]
> 	# perf script | cut -d: -f4-
> 	 dev v2 af 2 src 192.0.2.1/:: grp 239.1.1.1/::/00:00:00:00:00:00 vid 0
> 	 dev v2 af 10 src 0.0.0.0/2001:db8:1::1 grp 0.0.0.0/ff0e::1/00:00:00:00:00:00 vid 0
> 	 dev v2 af 2 src 192.0.2.1/:: grp 239.1.1.1/::/00:00:00:00:00:00 vid 10
> 	 dev v2 af 10 src 0.0.0.0/2001:db8:1::1 grp 0.0.0.0/ff0e::1/00:00:00:00:00:00 vid 10
> 
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: linux-trace-kernel@vger.kernel.org
> Signed-off-by: Petr Machata <petrm@nvidia.com>
> Reviewed-by: Ido Schimmel <idosch@nvidia.com>
> ---
>  include/trace/events/bridge.h | 67 +++++++++++++++++++++++++++++++++++
>  net/core/net-traces.c         |  1 +
>  2 files changed, 68 insertions(+)
> 
> diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
> index 6b200059c2c5..00d5e2dcb3ad 100644
> --- a/include/trace/events/bridge.h
> +++ b/include/trace/events/bridge.h
> @@ -122,6 +122,73 @@ TRACE_EVENT(br_fdb_update,
>  		  __entry->flags)
>  );
>  
> +TRACE_EVENT(br_mdb_full,
> +
> +	TP_PROTO(const struct net_device *dev,
> +		 const struct br_ip *group),
> +
> +	TP_ARGS(dev, group),
> +
> +	TP_STRUCT__entry(
> +		__string(dev, dev->name)
> +		__field(int, af)
> +		__field(u16, vid)
> +		__array(__u8, src4, 4)
> +		__array(__u8, src6, 16)
> +		__array(__u8, grp4, 4)
> +		__array(__u8, grp6, 16)
> +		__array(__u8, grpmac, ETH_ALEN) /* For af == 0. */

Instead of wasting ring buffer space, why not just have:

		__array(__u8, src, 16)
		__array(__u8, grp, 16)


> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev, dev->name);
> +		__entry->vid = group->vid;
> +
> +		if (!group->proto) {
> +			__entry->af = 0;
> +
> +			memset(__entry->src4, 0, sizeof(__entry->src4));
> +			memset(__entry->src6, 0, sizeof(__entry->src6));
> +			memset(__entry->grp4, 0, sizeof(__entry->grp4));
> +			memset(__entry->grp6, 0, sizeof(__entry->grp6));
> +			memcpy(__entry->grpmac, group->dst.mac_addr, ETH_ALEN);
> +		} else if (group->proto == htons(ETH_P_IP)) {
> +			__be32 *p32;
> +
> +			__entry->af = AF_INET;
> +
> +			p32 = (__be32 *) __entry->src4;
> +			*p32 = group->src.ip4;
> +
> +			p32 = (__be32 *) __entry->grp4;
> +			*p32 = group->dst.ip4;

			struct in6_addr *in6;

			in6 = (struct in6_addr *)__entry->src;
			ipv6_addr_set_v4mapped(group->src.ip4, in6);

			in6 = (struct in6_addr *)__entry->grp;
			ipv6_addr_set_v4mapped(group->grp.ip4, in6);

> +
> +			memset(__entry->src6, 0, sizeof(__entry->src6));
> +			memset(__entry->grp6, 0, sizeof(__entry->grp6));
> +			memset(__entry->grpmac, 0, ETH_ALEN);
> +#if IS_ENABLED(CONFIG_IPV6)
> +		} else {
> +			struct in6_addr *in6;
> +
> +			__entry->af = AF_INET6;
> +
> +			in6 = (struct in6_addr *)__entry->src6;
> +			*in6 = group->src.ip6;
> +
> +			in6 = (struct in6_addr *)__entry->grp6;
> +			*in6 = group->dst.ip6;
> +
> +			memset(__entry->src4, 0, sizeof(__entry->src4));
> +			memset(__entry->grp4, 0, sizeof(__entry->grp4));
> +			memset(__entry->grpmac, 0, ETH_ALEN);
> +#endif
> +		}
> +	),
> +
> +	TP_printk("dev %s af %u src %pI4/%pI6c grp %pI4/%pI6c/%pM vid %u",
> +		  __get_str(dev), __entry->af, __entry->src4, __entry->src6,
> +		  __entry->grp4, __entry->grp6, __entry->grpmac, __entry->vid)

And just have: 

	TP_printk("dev %s af %u src %pI6c grp %pI6c/%pM vid %u",
		  __get_str(dev), __entry->af, __entry->src, __entry->grp,
		  __entry->grpmac, __entry->vid)

As the %pI6c should detect that it's a ipv4 address and show that.

-- Steve


> +);
>  
>  #endif /* _TRACE_BRIDGE_H */
>  
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index ee7006bbe49b..805b7385dd8d 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -41,6 +41,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full);
>  #endif
>  
>  #if IS_ENABLED(CONFIG_PAGE_POOL)
Petr Machata Jan. 27, 2023, 2:29 p.m. UTC | #2
Steven Rostedt <rostedt@goodmis.org> writes:

>> diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
>> index 6b200059c2c5..00d5e2dcb3ad 100644
>> --- a/include/trace/events/bridge.h
>> +++ b/include/trace/events/bridge.h
>> @@ -122,6 +122,73 @@ TRACE_EVENT(br_fdb_update,
>>  		  __entry->flags)
>>  );
>>  
>> +TRACE_EVENT(br_mdb_full,
>> +
>> +	TP_PROTO(const struct net_device *dev,
>> +		 const struct br_ip *group),
>> +
>> +	TP_ARGS(dev, group),
>> +
>> +	TP_STRUCT__entry(
>> +		__string(dev, dev->name)
>> +		__field(int, af)
>> +		__field(u16, vid)
>> +		__array(__u8, src4, 4)
>> +		__array(__u8, src6, 16)
>> +		__array(__u8, grp4, 4)
>> +		__array(__u8, grp6, 16)
>> +		__array(__u8, grpmac, ETH_ALEN) /* For af == 0. */
>
> Instead of wasting ring buffer space, why not just have:
>
> 		__array(__u8, src, 16)
> 		__array(__u8, grp, 16)
>
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__assign_str(dev, dev->name);
>> +		__entry->vid = group->vid;
>> +
>> +		if (!group->proto) {
>> +			__entry->af = 0;
>> +
>> +			memset(__entry->src4, 0, sizeof(__entry->src4));
>> +			memset(__entry->src6, 0, sizeof(__entry->src6));
>> +			memset(__entry->grp4, 0, sizeof(__entry->grp4));
>> +			memset(__entry->grp6, 0, sizeof(__entry->grp6));
>> +			memcpy(__entry->grpmac, group->dst.mac_addr, ETH_ALEN);
>> +		} else if (group->proto == htons(ETH_P_IP)) {
>> +			__be32 *p32;
>> +
>> +			__entry->af = AF_INET;
>> +
>> +			p32 = (__be32 *) __entry->src4;
>> +			*p32 = group->src.ip4;
>> +
>> +			p32 = (__be32 *) __entry->grp4;
>> +			*p32 = group->dst.ip4;
>
> 			struct in6_addr *in6;
>
> 			in6 = (struct in6_addr *)__entry->src;
> 			ipv6_addr_set_v4mapped(group->src.ip4, in6);
>
> 			in6 = (struct in6_addr *)__entry->grp;
> 			ipv6_addr_set_v4mapped(group->grp.ip4, in6);
>
>> +
>> +			memset(__entry->src6, 0, sizeof(__entry->src6));
>> +			memset(__entry->grp6, 0, sizeof(__entry->grp6));
>> +			memset(__entry->grpmac, 0, ETH_ALEN);
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +		} else {
>> +			struct in6_addr *in6;
>> +
>> +			__entry->af = AF_INET6;
>> +
>> +			in6 = (struct in6_addr *)__entry->src6;
>> +			*in6 = group->src.ip6;
>> +
>> +			in6 = (struct in6_addr *)__entry->grp6;
>> +			*in6 = group->dst.ip6;
>> +
>> +			memset(__entry->src4, 0, sizeof(__entry->src4));
>> +			memset(__entry->grp4, 0, sizeof(__entry->grp4));
>> +			memset(__entry->grpmac, 0, ETH_ALEN);
>> +#endif
>> +		}
>> +	),
>> +
>> +	TP_printk("dev %s af %u src %pI4/%pI6c grp %pI4/%pI6c/%pM vid %u",
>> +		  __get_str(dev), __entry->af, __entry->src4, __entry->src6,
>> +		  __entry->grp4, __entry->grp6, __entry->grpmac, __entry->vid)
>
> And just have: 
>
> 	TP_printk("dev %s af %u src %pI6c grp %pI6c/%pM vid %u",
> 		  __get_str(dev), __entry->af, __entry->src, __entry->grp,
> 		  __entry->grpmac, __entry->vid)
>
> As the %pI6c should detect that it's a ipv4 address and show that.

So the reason I split the fields was that %pI4, %pI6c, %pM do not seem
to work with buffers of wrong size.

But I can consolidate 4/6 by changing the address to IPv6 like you
propose. I'll do this for v2. Thanks!
Petr Machata Jan. 30, 2023, 3:50 p.m. UTC | #3
Steven Rostedt <rostedt@goodmis.org> writes:

> On Thu, 26 Jan 2023 18:01:14 +0100
> Petr Machata <petrm@nvidia.com> wrote:
>
>> +	TP_printk("dev %s af %u src %pI4/%pI6c grp %pI4/%pI6c/%pM vid %u",
>> +		  __get_str(dev), __entry->af, __entry->src4, __entry->src6,
>> +		  __entry->grp4, __entry->grp6, __entry->grpmac, __entry->vid)
>
> And just have: 
>
> 	TP_printk("dev %s af %u src %pI6c grp %pI6c/%pM vid %u",
> 		  __get_str(dev), __entry->af, __entry->src, __entry->grp,
> 		  __entry->grpmac, __entry->vid)
>
> As the %pI6c should detect that it's a ipv4 address and show that.

This means the IP addresses will always be IPv6, even for an IPv4 MDB
entries. One can still figure out the true protocol from the address
family field, but it might not be obvious. Plus the IPv4-mapped IPv6
addresses are not really formatted as IPv4, though yeah, IPv4 notation
is embedded in that.

All the information is still there, but... scrambled? Not sure the
reduction in clarity is worth the 8 bytes that we save. The tracepoint
is unlikely to trigger often.

What say you?
Steven Rostedt Jan. 30, 2023, 11:23 p.m. UTC | #4
On Mon, 30 Jan 2023 16:50:32 +0100
Petr Machata <petrm@nvidia.com> wrote:

> Steven Rostedt <rostedt@goodmis.org> writes:
> 
> > On Thu, 26 Jan 2023 18:01:14 +0100
> > Petr Machata <petrm@nvidia.com> wrote:
> >  
> >> +	TP_printk("dev %s af %u src %pI4/%pI6c grp %pI4/%pI6c/%pM vid %u",
> >> +		  __get_str(dev), __entry->af, __entry->src4, __entry->src6,
> >> +		  __entry->grp4, __entry->grp6, __entry->grpmac, __entry->vid)  
> >
> > And just have: 
> >
> > 	TP_printk("dev %s af %u src %pI6c grp %pI6c/%pM vid %u",
> > 		  __get_str(dev), __entry->af, __entry->src, __entry->grp,
> > 		  __entry->grpmac, __entry->vid)
> >
> > As the %pI6c should detect that it's a ipv4 address and show that.  
> 
> This means the IP addresses will always be IPv6, even for an IPv4 MDB
> entries. One can still figure out the true protocol from the address
> family field, but it might not be obvious. Plus the IPv4-mapped IPv6
> addresses are not really formatted as IPv4, though yeah, IPv4 notation
> is embedded in that.
> 
> All the information is still there, but... scrambled? Not sure the
> reduction in clarity is worth the 8 bytes that we save. The tracepoint
> is unlikely to trigger often.

8 bytes per event, and yes, ring buffer real estate is expensive.

And if you use trace-cmd or perf, we can always add a plugin to
libtraceevent that can format this much nicer based on the information that
is there.

-- Steve
diff mbox series

Patch

diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
index 6b200059c2c5..00d5e2dcb3ad 100644
--- a/include/trace/events/bridge.h
+++ b/include/trace/events/bridge.h
@@ -122,6 +122,73 @@  TRACE_EVENT(br_fdb_update,
 		  __entry->flags)
 );
 
+TRACE_EVENT(br_mdb_full,
+
+	TP_PROTO(const struct net_device *dev,
+		 const struct br_ip *group),
+
+	TP_ARGS(dev, group),
+
+	TP_STRUCT__entry(
+		__string(dev, dev->name)
+		__field(int, af)
+		__field(u16, vid)
+		__array(__u8, src4, 4)
+		__array(__u8, src6, 16)
+		__array(__u8, grp4, 4)
+		__array(__u8, grp6, 16)
+		__array(__u8, grpmac, ETH_ALEN) /* For af == 0. */
+	),
+
+	TP_fast_assign(
+		__assign_str(dev, dev->name);
+		__entry->vid = group->vid;
+
+		if (!group->proto) {
+			__entry->af = 0;
+
+			memset(__entry->src4, 0, sizeof(__entry->src4));
+			memset(__entry->src6, 0, sizeof(__entry->src6));
+			memset(__entry->grp4, 0, sizeof(__entry->grp4));
+			memset(__entry->grp6, 0, sizeof(__entry->grp6));
+			memcpy(__entry->grpmac, group->dst.mac_addr, ETH_ALEN);
+		} else if (group->proto == htons(ETH_P_IP)) {
+			__be32 *p32;
+
+			__entry->af = AF_INET;
+
+			p32 = (__be32 *) __entry->src4;
+			*p32 = group->src.ip4;
+
+			p32 = (__be32 *) __entry->grp4;
+			*p32 = group->dst.ip4;
+
+			memset(__entry->src6, 0, sizeof(__entry->src6));
+			memset(__entry->grp6, 0, sizeof(__entry->grp6));
+			memset(__entry->grpmac, 0, ETH_ALEN);
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			struct in6_addr *in6;
+
+			__entry->af = AF_INET6;
+
+			in6 = (struct in6_addr *)__entry->src6;
+			*in6 = group->src.ip6;
+
+			in6 = (struct in6_addr *)__entry->grp6;
+			*in6 = group->dst.ip6;
+
+			memset(__entry->src4, 0, sizeof(__entry->src4));
+			memset(__entry->grp4, 0, sizeof(__entry->grp4));
+			memset(__entry->grpmac, 0, ETH_ALEN);
+#endif
+		}
+	),
+
+	TP_printk("dev %s af %u src %pI4/%pI6c grp %pI4/%pI6c/%pM vid %u",
+		  __get_str(dev), __entry->af, __entry->src4, __entry->src6,
+		  __entry->grp4, __entry->grp6, __entry->grpmac, __entry->vid)
+);
 
 #endif /* _TRACE_BRIDGE_H */
 
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index ee7006bbe49b..805b7385dd8d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -41,6 +41,7 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
 EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full);
 #endif
 
 #if IS_ENABLED(CONFIG_PAGE_POOL)