diff mbox series

[v2,net-next,5/6] net: dsa: listen for SWITCHDEV_{FDB,DEL}_ADD_TO_DEVICE on foreign bridge neighbors

Message ID 20201213024018.772586-6-vladimir.oltean@nxp.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Offload software learnt bridge addresses to DSA | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 92 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Vladimir Oltean Dec. 13, 2020, 2:40 a.m. UTC
Some DSA switches (and not only) cannot learn source MAC addresses from
packets injected from the CPU. They only perform hardware address
learning from inbound traffic.

This can be problematic when we have a bridge spanning some DSA switch
ports and some non-DSA ports (which we'll call "foreign interfaces" from
DSA's perspective).

There are 2 classes of problems created by the lack of learning on
CPU-injected traffic:
- excessive flooding, due to the fact that DSA treats those addresses as
  unknown
- the risk of stale routes, which can lead to temporary packet loss

To illustrate the second class, consider the following situation, which
is common in production equipment (wireless access points, where there
is a WLAN interface and an Ethernet switch, and these form a single
bridging domain).

 AP 1:
 +------------------------------------------------------------------------+
 |                                          br0                           |
 +------------------------------------------------------------------------+
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
 +------------+ +------------+ +------------+ +------------+ +------------+
       |                                                       ^        ^
       |                                                       |        |
       |                                                       |        |
       |                                                    Client A  Client B
       |
       |
       |
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
 +------------+ +------------+ +------------+ +------------+ +------------+
 +------------------------------------------------------------------------+
 |                                          br0                           |
 +------------------------------------------------------------------------+
 AP 2

- br0 of AP 1 will know that Clients A and B are reachable via wlan0
- the hardware fdb of a DSA switch driver today is not kept in sync with
  the software entries on other bridge ports, so it will not know that
  clients A and B are reachable via the CPU port UNLESS the hardware
  switch itself performs SA learning from traffic injected from the CPU.
  Nonetheless, a substantial number of switches don't.
- the hardware fdb of the DSA switch on AP 2 may autonomously learn that
  Client A and B are reachable through swp0. Therefore, the software br0
  of AP 2 also may or may not learn this. In the example we're
  illustrating, some Ethernet traffic has been going on, and br0 from AP
  2 has indeed learnt that it can reach Client B through swp0.

One of the wireless clients, say Client B, disconnects from AP 1 and
roams to AP 2. The topology now looks like this:

 AP 1:
 +------------------------------------------------------------------------+
 |                                          br0                           |
 +------------------------------------------------------------------------+
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
 +------------+ +------------+ +------------+ +------------+ +------------+
       |                                                            ^
       |                                                            |
       |                                                         Client A
       |
       |
       |                                                         Client B
       |                                                            |
       |                                                            v
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
 +------------+ +------------+ +------------+ +------------+ +------------+
 +------------------------------------------------------------------------+
 |                                          br0                           |
 +------------------------------------------------------------------------+
 AP 2

- br0 of AP 1 still knows that Client A is reachable via wlan0 (no change)
- br0 of AP 1 will (possibly) know that Client B has left wlan0. There
  are cases where it might never find out though. Either way, DSA today
  does not process that notification in any way.
- the hardware FDB of the DSA switch on AP 1 may learn autonomously that
  Client B can be reached via swp0, if it receives any packet with
  Client 1's source MAC address over Ethernet.
- the hardware FDB of the DSA switch on AP 2 still thinks that Client B
  can be reached via swp0. It does not know that it has roamed to wlan0,
  because it doesn't perform SA learning from the CPU port.

Now Client A contacts Client B.
AP 1 routes the packet fine towards swp0 and delivers it on the Ethernet
segment.
AP 2 sees a frame on swp0 and its fdb says that the destination is swp0.
Hairpinning is disabled => drop.

This problem comes from the fact that these switches have a 'blind spot'
for addresses coming from software bridging. The generic solution is not
to assume that hardware learning can be enabled somehow, but to listen
to more bridge learning events. It turns out that the bridge driver does
learn in software from all inbound frames, in __br_handle_local_finish.
A proper SWITCHDEV_FDB_ADD_TO_DEVICE notification is emitted for the
addresses serviced by the bridge on 'foreign' interfaces. The software
bridge also does the right thing on migration, by notifying that the old
entry is deleted, so that does not need to be special-cased in DSA. When
it is deleted, we just need to delete our static FDB entry towards the
CPU too, and wait.

The problem is that DSA currently only cares about SWITCHDEV_FDB_ADD_TO_DEVICE
events received on its own interfaces, such as static FDB entries.

Luckily we can change that, and DSA can listen to all switchdev FDB
add/del events in the system and figure out if those events were emitted
by a bridge that spans at least one of DSA's own ports. In case that is
true, DSA will also offload that address towards its own CPU port, in
the eventuality that there might be bridge clients attached to the DSA
switch who want to talk to the station connected to the foreign
interface.

In terms of implementation, we need to keep the fdb_info->added_by_user
check for the case where the switchdev event was targeted directly at a
DSA switch port. But we don't need to look at that flag for snooped
events. So the check is currently too late, we need to move it earlier.
This also simplifies the code a bit, since we avoid uselessly allocating
and freeing switchdev_work.

We could probably do some improvements in the future. For example,
multi-bridge support is rudimentary at the moment. If there are two
bridges spanning a DSA switch's ports, and both of them need to service
the same MAC address, then what will happen is that the migration of one
of those stations will trigger the deletion of the FDB entry from the
CPU port while it is still used by other bridge. That could be improved
with reference counting but is left for another time.

This behavior needs to be enabled at driver level by setting
ds->learning_broken_on_cpu_port = true. This is because we don't want to
inflict a potential performance penalty (accesses through MDIO/I2C/SPI
are expensive) to hardware that really doesn't need it because address
learning on the CPU port works there.

Reported-by: DENG Qingfang <dqfext@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
Changes in v2:
Made the behavior conditional.

 include/net/dsa.h |  5 ++++
 net/dsa/slave.c   | 62 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 57 insertions(+), 10 deletions(-)

Comments

Florian Fainelli Dec. 13, 2020, 3:48 a.m. UTC | #1
On 12/12/2020 6:40 PM, Vladimir Oltean wrote:
> Some DSA switches (and not only) cannot learn source MAC addresses from
> packets injected from the CPU. They only perform hardware address
> learning from inbound traffic.
> 
> This can be problematic when we have a bridge spanning some DSA switch
> ports and some non-DSA ports (which we'll call "foreign interfaces" from
> DSA's perspective).
> 
> There are 2 classes of problems created by the lack of learning on
> CPU-injected traffic:
> - excessive flooding, due to the fact that DSA treats those addresses as
>   unknown
> - the risk of stale routes, which can lead to temporary packet loss
> 
> To illustrate the second class, consider the following situation, which
> is common in production equipment (wireless access points, where there
> is a WLAN interface and an Ethernet switch, and these form a single
> bridging domain).
> 
>  AP 1:
>  +------------------------------------------------------------------------+
>  |                                          br0                           |
>  +------------------------------------------------------------------------+
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
>  +------------+ +------------+ +------------+ +------------+ +------------+
>        |                                                       ^        ^
>        |                                                       |        |
>        |                                                       |        |
>        |                                                    Client A  Client B
>        |
>        |
>        |
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  +------------------------------------------------------------------------+
>  |                                          br0                           |
>  +------------------------------------------------------------------------+
>  AP 2
> 
> - br0 of AP 1 will know that Clients A and B are reachable via wlan0
> - the hardware fdb of a DSA switch driver today is not kept in sync with
>   the software entries on other bridge ports, so it will not know that
>   clients A and B are reachable via the CPU port UNLESS the hardware
>   switch itself performs SA learning from traffic injected from the CPU.
>   Nonetheless, a substantial number of switches don't.
> - the hardware fdb of the DSA switch on AP 2 may autonomously learn that
>   Client A and B are reachable through swp0. Therefore, the software br0
>   of AP 2 also may or may not learn this. In the example we're
>   illustrating, some Ethernet traffic has been going on, and br0 from AP
>   2 has indeed learnt that it can reach Client B through swp0.
> 
> One of the wireless clients, say Client B, disconnects from AP 1 and
> roams to AP 2. The topology now looks like this:
> 
>  AP 1:
>  +------------------------------------------------------------------------+
>  |                                          br0                           |
>  +------------------------------------------------------------------------+
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
>  +------------+ +------------+ +------------+ +------------+ +------------+
>        |                                                            ^
>        |                                                            |
>        |                                                         Client A
>        |
>        |
>        |                                                         Client B
>        |                                                            |
>        |                                                            v
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  |    swp0    | |    swp1    | |    swp2    | |    swp3    | |    wlan0   |
>  +------------+ +------------+ +------------+ +------------+ +------------+
>  +------------------------------------------------------------------------+
>  |                                          br0                           |
>  +------------------------------------------------------------------------+
>  AP 2
> 
> - br0 of AP 1 still knows that Client A is reachable via wlan0 (no change)
> - br0 of AP 1 will (possibly) know that Client B has left wlan0. There
>   are cases where it might never find out though. Either way, DSA today
>   does not process that notification in any way.
> - the hardware FDB of the DSA switch on AP 1 may learn autonomously that
>   Client B can be reached via swp0, if it receives any packet with
>   Client 1's source MAC address over Ethernet.
> - the hardware FDB of the DSA switch on AP 2 still thinks that Client B
>   can be reached via swp0. It does not know that it has roamed to wlan0,
>   because it doesn't perform SA learning from the CPU port.
> 
> Now Client A contacts Client B.
> AP 1 routes the packet fine towards swp0 and delivers it on the Ethernet
> segment.
> AP 2 sees a frame on swp0 and its fdb says that the destination is swp0.
> Hairpinning is disabled => drop.
> 
> This problem comes from the fact that these switches have a 'blind spot'
> for addresses coming from software bridging. The generic solution is not
> to assume that hardware learning can be enabled somehow, but to listen
> to more bridge learning events. It turns out that the bridge driver does
> learn in software from all inbound frames, in __br_handle_local_finish.
> A proper SWITCHDEV_FDB_ADD_TO_DEVICE notification is emitted for the
> addresses serviced by the bridge on 'foreign' interfaces. The software
> bridge also does the right thing on migration, by notifying that the old
> entry is deleted, so that does not need to be special-cased in DSA. When
> it is deleted, we just need to delete our static FDB entry towards the
> CPU too, and wait.
> 
> The problem is that DSA currently only cares about SWITCHDEV_FDB_ADD_TO_DEVICE
> events received on its own interfaces, such as static FDB entries.
> 
> Luckily we can change that, and DSA can listen to all switchdev FDB
> add/del events in the system and figure out if those events were emitted
> by a bridge that spans at least one of DSA's own ports. In case that is
> true, DSA will also offload that address towards its own CPU port, in
> the eventuality that there might be bridge clients attached to the DSA
> switch who want to talk to the station connected to the foreign
> interface.
> 
> In terms of implementation, we need to keep the fdb_info->added_by_user
> check for the case where the switchdev event was targeted directly at a
> DSA switch port. But we don't need to look at that flag for snooped
> events. So the check is currently too late, we need to move it earlier.
> This also simplifies the code a bit, since we avoid uselessly allocating
> and freeing switchdev_work.
> 
> We could probably do some improvements in the future. For example,
> multi-bridge support is rudimentary at the moment. If there are two
> bridges spanning a DSA switch's ports, and both of them need to service
> the same MAC address, then what will happen is that the migration of one
> of those stations will trigger the deletion of the FDB entry from the
> CPU port while it is still used by other bridge. That could be improved
> with reference counting but is left for another time.
> 
> This behavior needs to be enabled at driver level by setting
> ds->learning_broken_on_cpu_port = true. This is because we don't want to
> inflict a potential performance penalty (accesses through MDIO/I2C/SPI
> are expensive) to hardware that really doesn't need it because address
> learning on the CPU port works there.
> 
> Reported-by: DENG Qingfang <dqfext@gmail.com>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

The implementation is much simpler than I though it would be, nice! Just
in case you need to spin a v2, I would probably name the flag
"learning_on_cpu_port_challenged", or preferably
"no_learning_on_cpu_port", the term "broken" is a bit subjective IMHO
(although honestly, why not learn from the CPU port though...)
diff mbox series

Patch

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e60d2610f20..d02851854c76 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -319,6 +319,11 @@  struct dsa_switch {
 	 */
 	bool			untag_bridge_pvid;
 
+	/* Let DSA manage the FDB entries towards the CPU, based on the
+	 * software bridge database.
+	 */
+	bool			learning_broken_on_cpu_port;
+
 	/* In case vlan_filtering_is_global is set, the VLAN awareness state
 	 * should be retrieved from here and not from the per-port settings.
 	 */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 53d9d2ea9369..d1a3d717f5af 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2106,6 +2106,28 @@  static void dsa_slave_switchdev_event_work(struct work_struct *work)
 		dev_put(dp->slave);
 }
 
+static int dsa_lower_dev_walk(struct net_device *lower_dev,
+			      struct netdev_nested_priv *priv)
+{
+	if (dsa_slave_dev_check(lower_dev)) {
+		priv->data = (void *)netdev_priv(lower_dev);
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev)
+{
+	struct netdev_nested_priv priv = {
+		.data = NULL,
+	};
+
+	netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv);
+
+	return (struct dsa_slave_priv *)priv.data;
+}
+
 /* Called under rcu_read_lock() */
 static int dsa_slave_switchdev_event(struct notifier_block *unused,
 				     unsigned long event, void *ptr)
@@ -2124,10 +2146,37 @@  static int dsa_slave_switchdev_event(struct notifier_block *unused,
 		return notifier_from_errno(err);
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (!dsa_slave_dev_check(dev))
-			return NOTIFY_DONE;
+		fdb_info = ptr;
 
-		dp = dsa_slave_to_port(dev);
+		if (dsa_slave_dev_check(dev)) {
+			if (!fdb_info->added_by_user)
+				return NOTIFY_OK;
+
+			dp = dsa_slave_to_port(dev);
+		} else {
+			/* Snoop addresses learnt on foreign interfaces
+			 * bridged with us, for switches that don't
+			 * automatically learn SA from CPU-injected traffic
+			 */
+			struct net_device *br_dev;
+			struct dsa_slave_priv *p;
+
+			br_dev = netdev_master_upper_dev_get_rcu(dev);
+			if (!br_dev)
+				return NOTIFY_DONE;
+
+			if (!netif_is_bridge_master(br_dev))
+				return NOTIFY_DONE;
+
+			p = dsa_slave_dev_lower_find(br_dev);
+			if (!p)
+				return NOTIFY_DONE;
+
+			dp = p->dp->cpu_dp;
+
+			if (!dp->ds->learning_broken_on_cpu_port)
+				return NOTIFY_DONE;
+		}
 
 		if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del)
 			return NOTIFY_DONE;
@@ -2142,13 +2191,6 @@  static int dsa_slave_switchdev_event(struct notifier_block *unused,
 		switchdev_work->port = dp->index;
 		switchdev_work->event = event;
 
-		fdb_info = ptr;
-
-		if (!fdb_info->added_by_user) {
-			kfree(switchdev_work);
-			return NOTIFY_OK;
-		}
-
 		ether_addr_copy(switchdev_work->addr,
 				fdb_info->addr);
 		switchdev_work->vid = fdb_info->vid;