diff mbox series

[v2,net-next] net: dsa: reference count the host mdb addresses

Message ID 20201212190352.214642-1-vladimir.oltean@nxp.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [v2,net-next] net: dsa: reference count the host mdb addresses | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 138 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Vladimir Oltean Dec. 12, 2020, 7:03 p.m. UTC
Currently any DSA switch that is strict when implementing the mdb
operations prints these benign errors after the addresses expire, with
at least 2 ports bridged:

[  286.013814] mscc_felix 0000:00:00.5 swp3: failed (err=-2) to del object (id=3)

The reason has to do with this piece of code:

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		br_mdb_switchdev_host_port(dev, lower_dev, mp, type);

called from:

br_multicast_group_expired
-> br_multicast_host_leave
   -> br_mdb_notify
      -> br_mdb_switchdev_host

Basically, that code is correct. It tells each switchdev port that the
host can leave that multicast group. But in the case of DSA, all user
ports are connected to the host through the same pipe. So, because DSA
translates a host MDB to a normal MDB on the CPU port, this means that
when all user ports leave a multicast group, DSA tries to remove it N
times from the CPU port.

We should be reference-counting these addresses. Otherwise, the first
port on which the MDB expires will cause an entry removal from the CPU
port, which will break the host MDB for the remaining ports.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
Changes in v2:
- Re-targeted against net-next, since this is not breaking any use case
  that I know of.
- Re-did the refcounting logic. The problem is that the MDB addition is
  two-phase, but the deletion is one-phase. So refcounting on addition
  needs to be done only on one phase - the commit one. Before, we had a
  problem there, and for host MDB additions where an entry already existed
  on the CPU port, we would call the prepare phase but never commit.
  This would break drivers that allocate memory on prepare, and then
  expect the commit phase to actually apply. So we're not doing this any
  longer. Both prepare and commit phases are now stubbed out for additions
  of host MDB entries that are already present on the CPU port.
- Renamed dsa_host_mdb_find into dsa_host_addr_find, and we're now
  passing it the host_mdb list rather than struct dsa_switch *ds. This
  is a generic function and we might be able to reuse it in the future
  for host FDB entries (such as slave net_device MAC addresses).
- Left the allocation as GFP_KERNEL, since that is fine - the switchdev
  notifier runs as deferred, therefore in process context.

 include/net/dsa.h |  9 +++++
 net/dsa/dsa2.c    |  2 ++
 net/dsa/slave.c   | 91 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 93 insertions(+), 9 deletions(-)

Comments

Andrew Lunn Dec. 12, 2020, 7:28 p.m. UTC | #1
> +/* DSA can directly translate this to a normal MDB add, but on the CPU port.
> + * But because multiple user ports can join the same multicast group and the
> + * bridge will emit a notification for each port, we need to add/delete the
> + * entry towards the host only once, so we reference count it.
> + */
> +static int dsa_host_mdb_add(struct dsa_port *dp,
> +			    const struct switchdev_obj_port_mdb *mdb,
> +			    struct switchdev_trans *trans)
> +{
> +	struct dsa_port *cpu_dp = dp->cpu_dp;
> +	struct dsa_switch *ds = dp->ds;
> +	struct dsa_host_addr *a;
> +	int err;
> +
> +	a = dsa_host_addr_find(&ds->host_mdb, mdb);
> +	if (a) {
> +		/* Only the commit phase is refcounted */
> +		if (switchdev_trans_ph_commit(trans))
> +			refcount_inc(&a->refcount);
> +		return 0;
> +	}
> +
> +	err = dsa_port_mdb_add(cpu_dp, mdb, trans);
> +	if (err)
> +		return err;
> +
> +	/* Only the commit phase is refcounted, so don't save this just yet */
> +	if (switchdev_trans_ph_prepare(trans))
> +		return 0;
> +
> +	a = kzalloc(sizeof(*a), GFP_KERNEL);
> +	if (!a)
> +		return -ENOMEM;

Hi Vladimir

This allocation should be done in the prepare phase, since it can
fail. You should only return catastrophic errors during the commit
phase.

So i think you can allocate it in the prepare phase, but leave the
reference count as 0. Then increment it in the commit phase.

And then make the dsa_host_mdb_del() decrement the specific refcount,
and then do a generic garbage collect for all entries with refcount of
0, to cleanup any left over failures.

   Andrew
diff mbox series

Patch

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e60d2610f20..e639db28e238 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -253,6 +253,13 @@  struct dsa_link {
 	struct list_head list;
 };
 
+struct dsa_host_addr {
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+	refcount_t refcount;
+	struct list_head list;
+};
+
 struct dsa_switch {
 	bool setup;
 
@@ -335,6 +342,8 @@  struct dsa_switch {
 	 */
 	bool			mtu_enforcement_ingress;
 
+	struct list_head	host_mdb;
+
 	size_t num_ports;
 };
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 183003e45762..52b3ef34a2cb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -413,6 +413,8 @@  static int dsa_switch_setup(struct dsa_switch *ds)
 	if (ds->setup)
 		return 0;
 
+	INIT_LIST_HEAD(&ds->host_mdb);
+
 	/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
 	 * driver and before ops->setup() has run, since the switch drivers and
 	 * the slave MDIO bus driver rely on these values for probing PHY
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 4a0498bf6c65..ccf397f02129 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -376,6 +376,86 @@  static int dsa_slave_vlan_add(struct net_device *dev,
 	return 0;
 }
 
+static struct dsa_host_addr *
+dsa_host_addr_find(struct list_head *addr_list,
+		   const struct switchdev_obj_port_mdb *mdb)
+{
+	struct dsa_host_addr *a;
+
+	list_for_each_entry(a, addr_list, list)
+		if (ether_addr_equal(a->addr, mdb->addr) && a->vid == mdb->vid)
+			return a;
+
+	return NULL;
+}
+
+/* DSA can directly translate this to a normal MDB add, but on the CPU port.
+ * But because multiple user ports can join the same multicast group and the
+ * bridge will emit a notification for each port, we need to add/delete the
+ * entry towards the host only once, so we reference count it.
+ */
+static int dsa_host_mdb_add(struct dsa_port *dp,
+			    const struct switchdev_obj_port_mdb *mdb,
+			    struct switchdev_trans *trans)
+{
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_host_addr *a;
+	int err;
+
+	a = dsa_host_addr_find(&ds->host_mdb, mdb);
+	if (a) {
+		/* Only the commit phase is refcounted */
+		if (switchdev_trans_ph_commit(trans))
+			refcount_inc(&a->refcount);
+		return 0;
+	}
+
+	err = dsa_port_mdb_add(cpu_dp, mdb, trans);
+	if (err)
+		return err;
+
+	/* Only the commit phase is refcounted, so don't save this just yet */
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (!a)
+		return -ENOMEM;
+
+	ether_addr_copy(a->addr, mdb->addr);
+	a->vid = mdb->vid;
+	refcount_set(&a->refcount, 1);
+	list_add_tail(&a->list, &ds->host_mdb);
+
+	return 0;
+}
+
+static int dsa_host_mdb_del(struct dsa_port *dp,
+			    const struct switchdev_obj_port_mdb *mdb)
+{
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_host_addr *a;
+	int err;
+
+	a = dsa_host_addr_find(&ds->host_mdb, mdb);
+	if (!a)
+		return -ENOENT;
+
+	if (!refcount_dec_and_test(&a->refcount))
+		return 0;
+
+	err = dsa_port_mdb_del(cpu_dp, mdb);
+	if (err)
+		return err;
+
+	list_del(&a->list);
+	kfree(a);
+
+	return 0;
+}
+
 static int dsa_slave_port_obj_add(struct net_device *dev,
 				  const struct switchdev_obj *obj,
 				  struct switchdev_trans *trans,
@@ -396,11 +476,7 @@  static int dsa_slave_port_obj_add(struct net_device *dev,
 		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
 		break;
 	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		/* DSA can directly translate this to a normal MDB add,
-		 * but on the CPU port.
-		 */
-		err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj),
-				       trans);
+		err = dsa_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		err = dsa_slave_vlan_add(dev, obj, trans);
@@ -455,10 +531,7 @@  static int dsa_slave_port_obj_del(struct net_device *dev,
 		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
 	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		/* DSA can directly translate this to a normal MDB add,
-		 * but on the CPU port.
-		 */
-		err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		err = dsa_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		err = dsa_slave_vlan_del(dev, obj);