diff mbox series

[RFC,v2,net-next,09/16] net: dsa: replay port and local fdb entries when joining the bridge

Message ID 20210318231829.3892920-10-olteanv@gmail.com (mailing list archive)
State New, archived
Headers show
Series Better support for sandwiched LAGs with bridge and DSA | expand

Commit Message

Vladimir Oltean March 18, 2021, 11:18 p.m. UTC
From: Vladimir Oltean <vladimir.oltean@nxp.com>

When a DSA port joins a LAG that already had an FDB entry pointing to it:

ip link set bond0 master br0
bridge fdb add dev bond0 00:01:02:03:04:05 master static
ip link set swp0 master bond0

the DSA port will have no idea that this FDB entry is there, because it
missed the switchdev event emitted at its creation.

Ido Schimmel pointed this out during a discussion about challenges with
switchdev offloading of stacked interfaces between the physical port and
the bridge, and recommended to just catch that condition and deny the
CHANGEUPPER event:
https://lore.kernel.org/netdev/20210210105949.GB287766@shredder.lan/

But in fact, we might need to deal with the hard thing anyway, which is
to replay all FDB addresses relevant to this port, because it isn't just
static FDB entries, but also local addresses (ones that are not
forwarded but terminated by the bridge). There, we can't just say 'oh
yeah, there was an upper already so I'm not joining that'.

So, similar to the logic for replaying MDB entries, add a function that
must be called by individual switchdev drivers and replays local FDB
entries as well as ones pointing towards a bridge port. This time, we
use the atomic switchdev notifier block, since that's what FDB entries
expect for some reason.

Reported-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 include/linux/if_bridge.h |  9 +++++++
 include/net/switchdev.h   |  1 +
 net/bridge/br_fdb.c       | 52 +++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa_priv.h        |  1 +
 net/dsa/port.c            |  4 +++
 net/dsa/slave.c           |  2 +-
 6 files changed, 68 insertions(+), 1 deletion(-)

Comments

Tobias Waldekranz March 22, 2021, 3:44 p.m. UTC | #1
On Fri, Mar 19, 2021 at 01:18, Vladimir Oltean <olteanv@gmail.com> wrote:
> From: Vladimir Oltean <vladimir.oltean@nxp.com>
>
> When a DSA port joins a LAG that already had an FDB entry pointing to it:
>
> ip link set bond0 master br0
> bridge fdb add dev bond0 00:01:02:03:04:05 master static
> ip link set swp0 master bond0
>
> the DSA port will have no idea that this FDB entry is there, because it
> missed the switchdev event emitted at its creation.
>
> Ido Schimmel pointed this out during a discussion about challenges with
> switchdev offloading of stacked interfaces between the physical port and
> the bridge, and recommended to just catch that condition and deny the
> CHANGEUPPER event:
> https://lore.kernel.org/netdev/20210210105949.GB287766@shredder.lan/
>
> But in fact, we might need to deal with the hard thing anyway, which is
> to replay all FDB addresses relevant to this port, because it isn't just
> static FDB entries, but also local addresses (ones that are not
> forwarded but terminated by the bridge). There, we can't just say 'oh
> yeah, there was an upper already so I'm not joining that'.
>
> So, similar to the logic for replaying MDB entries, add a function that
> must be called by individual switchdev drivers and replays local FDB
> entries as well as ones pointing towards a bridge port. This time, we
> use the atomic switchdev notifier block, since that's what FDB entries
> expect for some reason.
>
> Reported-by: Ido Schimmel <idosch@idosch.org>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  include/linux/if_bridge.h |  9 +++++++
>  include/net/switchdev.h   |  1 +
>  net/bridge/br_fdb.c       | 52 +++++++++++++++++++++++++++++++++++++++
>  net/dsa/dsa_priv.h        |  1 +
>  net/dsa/port.c            |  4 +++
>  net/dsa/slave.c           |  2 +-
>  6 files changed, 68 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
> index 4c25dafb013d..89596134e88f 100644
> --- a/include/linux/if_bridge.h
> +++ b/include/linux/if_bridge.h
> @@ -147,6 +147,8 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
>  bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
>  u8 br_port_get_stp_state(const struct net_device *dev);
>  clock_t br_get_ageing_time(struct net_device *br_dev);
> +int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
> +		  struct notifier_block *nb);
>  #else
>  static inline struct net_device *
>  br_fdb_find_port(const struct net_device *br_dev,
> @@ -175,6 +177,13 @@ static inline clock_t br_get_ageing_time(struct net_device *br_dev)
>  {
>  	return 0;
>  }
> +
> +static inline int br_fdb_replay(struct net_device *br_dev,
> +				struct net_device *dev,
> +				struct notifier_block *nb)
> +{
> +	return -EINVAL;
> +}
>  #endif
>  
>  #endif
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index b7fc7d0f54e2..7688ec572757 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -205,6 +205,7 @@ struct switchdev_notifier_info {
>  
>  struct switchdev_notifier_fdb_info {
>  	struct switchdev_notifier_info info; /* must be first */
> +	struct list_head list;
>  	const unsigned char *addr;
>  	u16 vid;
>  	u8 added_by_user:1,
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index b7490237f3fc..49125cc196ac 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -726,6 +726,58 @@ static inline size_t fdb_nlmsg_size(void)
>  		+ nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */
>  }
>  
> +static int br_fdb_replay_one(struct notifier_block *nb,
> +			     struct net_bridge_fdb_entry *fdb,
> +			     struct net_device *dev)
> +{
> +	struct switchdev_notifier_fdb_info item;
> +	int err;
> +
> +	item.addr = fdb->key.addr.addr;
> +	item.vid = fdb->key.vlan_id;
> +	item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
> +	item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
> +	item.info.dev = dev;
> +
> +	err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
> +	return notifier_to_errno(err);
> +}
> +
> +int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
> +		  struct notifier_block *nb)
> +{
> +	struct net_bridge_fdb_entry *fdb;
> +	struct net_bridge *br;
> +	int err = 0;
> +
> +	if (!netif_is_bridge_master(br_dev))
> +		return -EINVAL;
> +
> +	if (!netif_is_bridge_port(dev))
> +		return -EINVAL;
> +
> +	br = netdev_priv(br_dev);
> +
> +	rcu_read_lock();
> +
> +	hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
> +		struct net_device *dst_dev;
> +
> +		dst_dev = fdb->dst ? fdb->dst->dev : br->dev;
> +		if (dst_dev != br_dev && dst_dev != dev)
> +			continue;
> +

I do not know if it is a problem or not, more of an observation: This is
not guaranteed to be an exact replay of the events that the bridge port
(i.e. bond0 or whatever) has received since, in fdb_insert, we exit
early when adding local entries if that address is already in the
database.

Do we have to guard against this somehow? Or maybe we should consider
the current behavior a bug and make sure to always send the event in the
first place?

> +		err = br_fdb_replay_one(nb, fdb, dst_dev);
> +		if (err)
> +			break;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return err;
> +}
> +EXPORT_SYMBOL(br_fdb_replay);
> +
>  static void fdb_notify(struct net_bridge *br,
>  		       const struct net_bridge_fdb_entry *fdb, int type,
>  		       bool swdev_notify)
> diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
> index b14c43cb88bb..92282de54230 100644
> --- a/net/dsa/dsa_priv.h
> +++ b/net/dsa/dsa_priv.h
> @@ -262,6 +262,7 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst,
>  
>  /* slave.c */
>  extern const struct dsa_device_ops notag_netdev_ops;
> +extern struct notifier_block dsa_slave_switchdev_notifier;
>  extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
>  
>  void dsa_slave_mii_bus_init(struct dsa_switch *ds);
> diff --git a/net/dsa/port.c b/net/dsa/port.c
> index 6670612f96c6..9850051071f2 100644
> --- a/net/dsa/port.c
> +++ b/net/dsa/port.c
> @@ -205,6 +205,10 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp,
>  	if (err && err != -EOPNOTSUPP)
>  		return err;
>  
> +	err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier);
> +	if (err && err != -EOPNOTSUPP)
> +		return err;
> +
>  	return 0;
>  }
>  
> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
> index b974d8f84a2e..c51e52418a62 100644
> --- a/net/dsa/slave.c
> +++ b/net/dsa/slave.c
> @@ -2392,7 +2392,7 @@ static struct notifier_block dsa_slave_nb __read_mostly = {
>  	.notifier_call  = dsa_slave_netdevice_event,
>  };
>  
> -static struct notifier_block dsa_slave_switchdev_notifier = {
> +struct notifier_block dsa_slave_switchdev_notifier = {
>  	.notifier_call = dsa_slave_switchdev_event,
>  };
>  
> -- 
> 2.25.1
Vladimir Oltean March 22, 2021, 4:19 p.m. UTC | #2
On Mon, Mar 22, 2021 at 04:44:41PM +0100, Tobias Waldekranz wrote:
> I do not know if it is a problem or not, more of an observation: This is
> not guaranteed to be an exact replay of the events that the bridge port
> (i.e. bond0 or whatever) has received since, in fdb_insert, we exit
> early when adding local entries if that address is already in the
> database.
> 
> Do we have to guard against this somehow? Or maybe we should consider
> the current behavior a bug and make sure to always send the event in the
> first place?

I don't really understand what you're saying.
fdb_insert has:

	fdb = br_fdb_find(br, addr, vid);
	if (fdb) {
		/* it is okay to have multiple ports with same
		 * address, just use the first one.
		 */
		if (test_bit(BR_FDB_LOCAL, &fdb->flags))
			return 0;
		br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
		       source ? source->dev->name : br->dev->name, addr, vid);
		fdb_delete(br, fdb, true);
	}

	fdb = fdb_create(br, source, addr, vid,
			 BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));

Basically, if the {addr, vid} pair already exists in the fdb, and it
points to a local entry, fdb_create is bypassed.

Whereas my br_fdb_replay() function iterates over br->fdb_list, which is
exactly where fdb_create() also lays its eggs. That is to say, unless
I'm missing something, that duplicate local FDB entries that skipped the
fdb_create() call in fdb_insert() because they were for already-existing
local FDB entries will also be skipped by br_fdb_replay(), because it
iterates over a br->fdb_list which contains unique local addresses.
Where am I wrong?
Tobias Waldekranz March 22, 2021, 5:07 p.m. UTC | #3
On Mon, Mar 22, 2021 at 18:19, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Mon, Mar 22, 2021 at 04:44:41PM +0100, Tobias Waldekranz wrote:
>> I do not know if it is a problem or not, more of an observation: This is
>> not guaranteed to be an exact replay of the events that the bridge port
>> (i.e. bond0 or whatever) has received since, in fdb_insert, we exit
>> early when adding local entries if that address is already in the
>> database.
>> 
>> Do we have to guard against this somehow? Or maybe we should consider
>> the current behavior a bug and make sure to always send the event in the
>> first place?
>
> I don't really understand what you're saying.
> fdb_insert has:
>
> 	fdb = br_fdb_find(br, addr, vid);
> 	if (fdb) {
> 		/* it is okay to have multiple ports with same
> 		 * address, just use the first one.
> 		 */
> 		if (test_bit(BR_FDB_LOCAL, &fdb->flags))
> 			return 0;
> 		br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
> 		       source ? source->dev->name : br->dev->name, addr, vid);
> 		fdb_delete(br, fdb, true);
> 	}
>
> 	fdb = fdb_create(br, source, addr, vid,
> 			 BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));
>
> Basically, if the {addr, vid} pair already exists in the fdb, and it
> points to a local entry, fdb_create is bypassed.
>
> Whereas my br_fdb_replay() function iterates over br->fdb_list, which is
> exactly where fdb_create() also lays its eggs. That is to say, unless
> I'm missing something, that duplicate local FDB entries that skipped the
> fdb_create() call in fdb_insert() because they were for already-existing
> local FDB entries will also be skipped by br_fdb_replay(), because it
> iterates over a br->fdb_list which contains unique local addresses.
> Where am I wrong?

No you are right. I was thinking back to my attempt of offloading local
addresses and I distinctly remembered that local addresses could be
added without a notification being sent.

But that is not what is happening. It is just already inserted on
another port. So the notification would reach DSA, or not, depending on
ordering the of events. But there will be no discrepancy between that
and the replay.
Vladimir Oltean March 22, 2021, 5:13 p.m. UTC | #4
On Mon, Mar 22, 2021 at 06:07:51PM +0100, Tobias Waldekranz wrote:
> On Mon, Mar 22, 2021 at 18:19, Vladimir Oltean <olteanv@gmail.com> wrote:
> > On Mon, Mar 22, 2021 at 04:44:41PM +0100, Tobias Waldekranz wrote:
> >> I do not know if it is a problem or not, more of an observation: This is
> >> not guaranteed to be an exact replay of the events that the bridge port
> >> (i.e. bond0 or whatever) has received since, in fdb_insert, we exit
> >> early when adding local entries if that address is already in the
> >> database.
> >> 
> >> Do we have to guard against this somehow? Or maybe we should consider
> >> the current behavior a bug and make sure to always send the event in the
> >> first place?
> >
> > I don't really understand what you're saying.
> > fdb_insert has:
> >
> > 	fdb = br_fdb_find(br, addr, vid);
> > 	if (fdb) {
> > 		/* it is okay to have multiple ports with same
> > 		 * address, just use the first one.
> > 		 */
> > 		if (test_bit(BR_FDB_LOCAL, &fdb->flags))
> > 			return 0;
> > 		br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
> > 		       source ? source->dev->name : br->dev->name, addr, vid);
> > 		fdb_delete(br, fdb, true);
> > 	}
> >
> > 	fdb = fdb_create(br, source, addr, vid,
> > 			 BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));
> >
> > Basically, if the {addr, vid} pair already exists in the fdb, and it
> > points to a local entry, fdb_create is bypassed.
> >
> > Whereas my br_fdb_replay() function iterates over br->fdb_list, which is
> > exactly where fdb_create() also lays its eggs. That is to say, unless
> > I'm missing something, that duplicate local FDB entries that skipped the
> > fdb_create() call in fdb_insert() because they were for already-existing
> > local FDB entries will also be skipped by br_fdb_replay(), because it
> > iterates over a br->fdb_list which contains unique local addresses.
> > Where am I wrong?
> 
> No you are right. I was thinking back to my attempt of offloading local
> addresses and I distinctly remembered that local addresses could be
> added without a notification being sent.
> 
> But that is not what is happening. It is just already inserted on
> another port. So the notification would reach DSA, or not, depending on
> ordering the of events. But there will be no discrepancy between that
> and the replay.

I'm not saying that the bridge isn't broken, because it is, but for
different reasons, as explained here:
https://patchwork.kernel.org/project/netdevbpf/patch/20210224114350.2791260-9-olteanv@gmail.com/

What I can do is I can make br_switchdev_fdb_notify() skip fdb entries
with the BR_FDB_LOCAL bit set, and target that patch against "net", with
a Fixes: tag of 6b26b51b1d13 ("net: bridge: Add support for notifying
devices about FDB add/del").
Then I can also skip the entries with BR_FDB_LOCAL from br_fdb_replay.
Then, when I return to the "RX filtering for DSA" series, I can add the
"is_local" bit to switchdev FDB objects, and make all drivers reject
"is_local" entries (which is what the linked patch does) unless more
specific treatment is applied to those (trap to CPU).
Nikolay?
diff mbox series

Patch

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 4c25dafb013d..89596134e88f 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -147,6 +147,8 @@  void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
 clock_t br_get_ageing_time(struct net_device *br_dev);
+int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -175,6 +177,13 @@  static inline clock_t br_get_ageing_time(struct net_device *br_dev)
 {
 	return 0;
 }
+
+static inline int br_fdb_replay(struct net_device *br_dev,
+				struct net_device *dev,
+				struct notifier_block *nb)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index b7fc7d0f54e2..7688ec572757 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -205,6 +205,7 @@  struct switchdev_notifier_info {
 
 struct switchdev_notifier_fdb_info {
 	struct switchdev_notifier_info info; /* must be first */
+	struct list_head list;
 	const unsigned char *addr;
 	u16 vid;
 	u8 added_by_user:1,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b7490237f3fc..49125cc196ac 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -726,6 +726,58 @@  static inline size_t fdb_nlmsg_size(void)
 		+ nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */
 }
 
+static int br_fdb_replay_one(struct notifier_block *nb,
+			     struct net_bridge_fdb_entry *fdb,
+			     struct net_device *dev)
+{
+	struct switchdev_notifier_fdb_info item;
+	int err;
+
+	item.addr = fdb->key.addr.addr;
+	item.vid = fdb->key.vlan_id;
+	item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+	item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
+	item.info.dev = dev;
+
+	err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
+	return notifier_to_errno(err);
+}
+
+int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb)
+{
+	struct net_bridge_fdb_entry *fdb;
+	struct net_bridge *br;
+	int err = 0;
+
+	if (!netif_is_bridge_master(br_dev))
+		return -EINVAL;
+
+	if (!netif_is_bridge_port(dev))
+		return -EINVAL;
+
+	br = netdev_priv(br_dev);
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
+		struct net_device *dst_dev;
+
+		dst_dev = fdb->dst ? fdb->dst->dev : br->dev;
+		if (dst_dev != br_dev && dst_dev != dev)
+			continue;
+
+		err = br_fdb_replay_one(nb, fdb, dst_dev);
+		if (err)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(br_fdb_replay);
+
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *fdb, int type,
 		       bool swdev_notify)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index b14c43cb88bb..92282de54230 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -262,6 +262,7 @@  static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst,
 
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
+extern struct notifier_block dsa_slave_switchdev_notifier;
 extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
 
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 6670612f96c6..9850051071f2 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -205,6 +205,10 @@  static int dsa_port_switchdev_sync(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
+	err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
 	return 0;
 }
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index b974d8f84a2e..c51e52418a62 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2392,7 +2392,7 @@  static struct notifier_block dsa_slave_nb __read_mostly = {
 	.notifier_call  = dsa_slave_netdevice_event,
 };
 
-static struct notifier_block dsa_slave_switchdev_notifier = {
+struct notifier_block dsa_slave_switchdev_notifier = {
 	.notifier_call = dsa_slave_switchdev_event,
 };