@@ -895,6 +895,7 @@ static int dpaa2_switch_port_fdb_dump(struct sk_buff *skb, struct netlink_callba
struct net_device *net_dev,
struct net_device *filter_dev, int *idx)
{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
struct ethsw_port_priv *port_priv = netdev_priv(net_dev);
struct ethsw_dump_ctx dump = {
.dev = net_dev,
@@ -904,6 +905,9 @@ static int dpaa2_switch_port_fdb_dump(struct sk_buff *skb, struct netlink_callba
};
int err;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
err = dpaa2_switch_fdb_iterate(port_priv, dpaa2_switch_fdb_entry_dump, &dump);
*idx = dump.idx;
@@ -687,6 +687,7 @@ static int ocelot_port_fdb_dump(struct sk_buff *skb,
struct net_device *dev,
struct net_device *filter_dev, int *idx)
{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
struct ocelot_port_private *priv = netdev_priv(dev);
struct ocelot *ocelot = priv->port.ocelot;
struct ocelot_dump_ctx dump = {
@@ -698,6 +699,9 @@ static int ocelot_port_fdb_dump(struct sk_buff *skb,
int port = priv->chip_port;
int ret;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
ret = ocelot_fdb_dump(ocelot, port, ocelot_port_fdb_do_dump, &dump);
*idx = dump.idx;
@@ -1376,6 +1376,9 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
unsigned int h;
int err = 0;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
for (h = 0; h < FDB_HASH_SIZE; ++h) {
struct vxlan_fdb *f;
@@ -110,6 +110,12 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);
WARN_ONCE(!rtnl_is_locked(), \
"RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__)
+enum rtnl_fdb_dump_state {
+ RTNL_FDB_DUMP_PREPARE,
+ RTNL_FDB_DUMP_COMMIT,
+ RTNL_FDB_DUMP_FINISH,
+};
+
struct rtnl_fdb_dump_ctx {
/* Last bucket in the dev_index_head hash list that was checked.
* Used by rtnl_fdb_dump to resume in case the procedure is
@@ -126,6 +132,7 @@ struct rtnl_fdb_dump_ctx {
* the dump procedure is interrupted.
*/
int fidx;
+ enum rtnl_fdb_dump_state state;
};
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
@@ -826,6 +826,9 @@ int br_fdb_dump(struct sk_buff *skb,
struct net_bridge_fdb_entry *f;
int err = 0;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
if (!(dev->priv_flags & IFF_EBRIDGE))
return err;
@@ -4225,8 +4225,12 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct net_device *filter_dev,
int *idx)
{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
int err;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
if (dev->type != ARPHRD_ETHER)
return -EINVAL;
@@ -4330,30 +4334,40 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
return 0;
}
-static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static void rtnl_fdb_dump_prepare_finish(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct hlist_head *head;
struct net_device *dev;
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
+ int h, fidx = 0;
+
+ for (h = 0; h < NETDEV_HASHENTRIES; h++) {
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (!dev->netdev_ops->ndo_fdb_dump)
+ continue;
+
+ dev->netdev_ops->ndo_fdb_dump(skb, cb, dev,
+ NULL, &fidx);
+ }
+ }
+}
+
+static int rtnl_fdb_dump_commit(struct sk_buff *skb, struct netlink_callback *cb,
+ int br_idx, int brport_idx)
+{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
const struct net_device_ops *cops = NULL;
+ const struct net_device_ops *ops = NULL;
struct net *net = sock_net(skb->sk);
+ struct net_device *br_dev = NULL;
struct hlist_head *head;
- int brport_idx = 0;
- int br_idx = 0;
- int h, s_h;
+ struct net_device *dev;
int idx = 0, s_idx;
- int err = 0;
int fidx = 0;
-
- if (cb->strict_check)
- err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
- cb->extack);
- else
- err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
- cb->extack);
- if (err < 0)
- return err;
+ int err = 0;
+ int h, s_h;
if (br_idx) {
br_dev = __dev_get_by_index(net, br_idx);
@@ -4431,6 +4445,49 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
+ int brport_idx = 0;
+ int br_idx = 0;
+ int err;
+
+ if (cb->strict_check)
+ err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ else
+ err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ if (err < 0)
+ return err;
+
+ /* user did not specify a bridge or a bridge port */
+ if (!brport_idx && !br_idx) {
+ switch (ctx->state) {
+ case RTNL_FDB_DUMP_PREPARE:
+ rtnl_fdb_dump_prepare_finish(skb, cb);
+ ctx->state = RTNL_FDB_DUMP_COMMIT;
+ fallthrough;
+ case RTNL_FDB_DUMP_COMMIT:
+ err = rtnl_fdb_dump_commit(skb, cb, br_idx, brport_idx);
+ if (err)
+ return err;
+ ctx->state = RTNL_FDB_DUMP_FINISH;
+ fallthrough;
+ case RTNL_FDB_DUMP_FINISH:
+ rtnl_fdb_dump_prepare_finish(skb, cb);
+ break;
+ }
+ } else {
+ ctx->state = RTNL_FDB_DUMP_COMMIT;
+ err = rtnl_fdb_dump_commit(skb, cb, br_idx, brport_idx);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
struct nlattr **tb, u8 *ndm_flags,
int *br_idx, int *brport_idx, u8 **addr,
@@ -238,6 +238,7 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev, struct net_device *filter_dev,
int *idx)
{
+ struct rtnl_fdb_dump_ctx *ctx = (struct rtnl_fdb_dump_ctx *)cb->ctx;
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_slave_dump_ctx dump = {
.dev = dev,
@@ -247,6 +248,9 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
};
int err;
+ if (ctx->state != RTNL_FDB_DUMP_COMMIT)
+ return 0;
+
err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump);
*idx = dump.idx;
Some drivers which offload the bridge do not have the ability to synchronize their hardware FDB with the bridge's software FDB, but they perform autonomous address learning nonetheless. These drivers implement .ndo_fdb_dump and report their hardware learned entries as 'self' to netlink. The FDB dump procedure for these drivers is wasteful, since many times, the FDB is shared across all ports of a bridge (or even globally shared across all ports of a switch in some cases). So since an FDB dump means to walk the entire FDB, and .ndo_fdb_dump is per netdev, we end up walking an FDB shared by 10 netdevices 10 times, once for each port. If on top of that, the access to the FDB is also slow (which is actually not all that uncommon, since this is one of the reasons these drivers do not bother to synchronize their hardware FDB in the first place), it means that an FDB dump is a very inefficient and slow operation - it may take minutes or more. This change keeps the .ndo_fdb_dump function prototype as is, but: - introduces a "prepare" and a "finish" phase. The phase that exists in the code base right now is retroactively named "commit" phase. - if the rtnl_fdb_dump request is specific to a single port, nothing changes. We jump straight to the commit phase of that specific port. - if the rtnl_fdb_dump request is imprecise (no brport_idx or br_idx specified), that is when there is an opportunity for improvement. rtnl_fdb_dump first enters the "prepare" phase, where it notifies _all_ netdev drivers that have the .ndo_fdb_dump method implemented. It only enters the "commit" phase once all netdevs were prepared. The "commit" phase may be interrupted by lack of space in the netlink skb. No problem, when user space comes back with a new buffer we return to the commit phase, just like in the code that exists now. After the commit phase ends for all netdevs, rtnl_fdb_dump proceeds to call the "finish" phase for all drivers. In the envisioned use case, a multi-port [ switch ] driver will dump its shared FDB in the "prepare" phase: for .ndo_fdb_dump(dev), it checks what is the FDB corresponding to "dev", and if the FDB has been already dumped, do nothing, otherwise dump it and just save the FDB entries collected (in a list, array, whatever), no matter which port they correspond to. Then, in the "commit" phase, the FDB entries collected above are filtered by the "dev" in .ndo_fdb_dump(dev). Only those are reported inside the netlink skb. Then, in the "finish" phase, any allocated memory can be freed. All drivers are modified to ignore any other phase except the "commit" phase, to preserve existing functionality. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> --- .../ethernet/freescale/dpaa2/dpaa2-switch.c | 4 + drivers/net/ethernet/mscc/ocelot_net.c | 4 + drivers/net/vxlan.c | 3 + include/linux/rtnetlink.h | 7 ++ net/bridge/br_fdb.c | 3 + net/core/rtnetlink.c | 91 +++++++++++++++---- net/dsa/slave.c | 4 + 7 files changed, 99 insertions(+), 17 deletions(-)