diff mbox series

[v3,net-next,2/4] net: dsa: Link aggregation support

Message ID 20201202091356.24075-3-tobias@waldekranz.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: dsa: Link aggregation support | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 20 this patch: 24
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 20 this patch: 24
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Tobias Waldekranz Dec. 2, 2020, 9:13 a.m. UTC
Monitor the following events and notify the driver when:

- A DSA port joins/leaves a LAG.
- A LAG, made up of DSA ports, joins/leaves a bridge.
- A DSA port in a LAG is enabled/disabled (enabled meaning
  "distributing" in 802.3ad LACP terms).

Each LAG interface to which a DSA port is attached is represented by a
`struct dsa_lag` which is globally reachable from the switch tree and
from each associated port.

When a LAG joins a bridge, the DSA subsystem will treat that as each
individual port joining the bridge. The driver may look at the port's
LAG pointer to see if it is associated with any LAG, if that is
required. This is analogous to how switchdev events are replicated out
to all lower devices when reaching e.g. a LAG.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
---
 include/net/dsa.h  |  97 +++++++++++++++++++++++++++++++++
 net/dsa/dsa2.c     |  51 ++++++++++++++++++
 net/dsa/dsa_priv.h |  31 +++++++++++
 net/dsa/port.c     | 132 +++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/slave.c    |  83 +++++++++++++++++++++++++---
 net/dsa/switch.c   |  49 +++++++++++++++++
 6 files changed, 437 insertions(+), 6 deletions(-)

Comments

Vladimir Oltean Dec. 2, 2020, 10:07 a.m. UTC | #1
On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
> Monitor the following events and notify the driver when:
> 
> - A DSA port joins/leaves a LAG.
> - A LAG, made up of DSA ports, joins/leaves a bridge.
> - A DSA port in a LAG is enabled/disabled (enabled meaning
>   "distributing" in 802.3ad LACP terms).
> 
> Each LAG interface to which a DSA port is attached is represented by a
> `struct dsa_lag` which is globally reachable from the switch tree and
> from each associated port.
> 
> When a LAG joins a bridge, the DSA subsystem will treat that as each
> individual port joining the bridge. The driver may look at the port's
> LAG pointer to see if it is associated with any LAG, if that is
> required. This is analogue to how switchdev events are replicated out
> to all lower devices when reaching e.g. a LAG.
> 
> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
> ---
>  include/net/dsa.h  |  97 +++++++++++++++++++++++++++++++++
>  net/dsa/dsa2.c     |  51 ++++++++++++++++++
>  net/dsa/dsa_priv.h |  31 +++++++++++
>  net/dsa/port.c     | 132 +++++++++++++++++++++++++++++++++++++++++++++
>  net/dsa/slave.c    |  83 +++++++++++++++++++++++++---
>  net/dsa/switch.c   |  49 +++++++++++++++++
>  6 files changed, 437 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/dsa.h b/include/net/dsa.h
> index 4e60d2610f20..aaa350b78c55 100644
> --- a/include/net/dsa.h
> +++ b/include/net/dsa.h
> @@ -7,6 +7,7 @@
>  #ifndef __LINUX_NET_DSA_H
>  #define __LINUX_NET_DSA_H
>  
> +#include <linux/bitmap.h>
>  #include <linux/if.h>
>  #include <linux/if_ether.h>
>  #include <linux/list.h>
> @@ -71,6 +72,7 @@ enum dsa_tag_protocol {
>  
>  struct packet_type;
>  struct dsa_switch;
> +struct dsa_lag;
>  
>  struct dsa_device_ops {
>  	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
> @@ -149,6 +151,13 @@ struct dsa_switch_tree {
>  
>  	/* List of DSA links composing the routing table */
>  	struct list_head rtable;
> +
> +	/* Link aggregates */
> +	struct {
> +		struct dsa_lag *pool;
> +		unsigned long *busy;
> +		unsigned int num;

Can we get rid of the busy array and just look at the refcounts?
Can we also get rid of the "num" variable?

> +	} lags;
>  };
Tobias Waldekranz Dec. 2, 2020, 10:51 a.m. UTC | #2
On Wed, Dec 02, 2020 at 12:07, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
>> +
>> +	/* Link aggregates */
>> +	struct {
>> +		struct dsa_lag *pool;
>> +		unsigned long *busy;
>> +		unsigned int num;
>
> Can we get rid of the busy array and just look at the refcounts?

We can, but it is typically 4/8B that makes iterating over used LAGs,
finding the first free LAG etc. much easier.

> Can we also get rid of the "num" variable?

It can be computed but it requires traversing the entire dst port list
every time, as you can see in dsa_tree_setup_lags. We need to hoist the
lowest supported num up from the ds level to the dst.
Jakub Kicinski Dec. 2, 2020, 6:58 p.m. UTC | #3
On Wed,  2 Dec 2020 10:13:54 +0100 Tobias Waldekranz wrote:
> Monitor the following events and notify the driver when:
> 
> - A DSA port joins/leaves a LAG.
> - A LAG, made up of DSA ports, joins/leaves a bridge.
> - A DSA port in a LAG is enabled/disabled (enabled meaning
>   "distributing" in 802.3ad LACP terms).
> 
> Each LAG interface to which a DSA port is attached is represented by a
> `struct dsa_lag` which is globally reachable from the switch tree and
> from each associated port.
> 
> When a LAG joins a bridge, the DSA subsystem will treat that as each
> individual port joining the bridge. The driver may look at the port's
> LAG pointer to see if it is associated with any LAG, if that is
> required. This is analogue to how switchdev events are replicated out
> to all lower devices when reaching e.g. a LAG.
> 
> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

Any idea where this is coming from?

net/dsa/slave.c: note: in included file:
net/dsa/dsa_priv.h:194:31: error: incompatible types in comparison expression (different address spaces):
net/dsa/dsa_priv.h:194:31:    struct net_device [noderef] __rcu *
net/dsa/dsa_priv.h:194:31:    struct net_device *
Tobias Waldekranz Dec. 2, 2020, 9:29 p.m. UTC | #4
On Wed, Dec 02, 2020 at 10:58, Jakub Kicinski <kuba@kernel.org> wrote:
> On Wed,  2 Dec 2020 10:13:54 +0100 Tobias Waldekranz wrote:
>> Monitor the following events and notify the driver when:
>> 
>> - A DSA port joins/leaves a LAG.
>> - A LAG, made up of DSA ports, joins/leaves a bridge.
>> - A DSA port in a LAG is enabled/disabled (enabled meaning
>>   "distributing" in 802.3ad LACP terms).
>> 
>> Each LAG interface to which a DSA port is attached is represented by a
>> `struct dsa_lag` which is globally reachable from the switch tree and
>> from each associated port.
>> 
>> When a LAG joins a bridge, the DSA subsystem will treat that as each
>> individual port joining the bridge. The driver may look at the port's
>> LAG pointer to see if it is associated with any LAG, if that is
>> required. This is analogue to how switchdev events are replicated out
>> to all lower devices when reaching e.g. a LAG.
>> 
>> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
>
> Any idea where this is coming from?
>
> net/dsa/slave.c: note: in included file:
> net/dsa/dsa_priv.h:194:31: error: incompatible types in comparison expression (different address spaces):
> net/dsa/dsa_priv.h:194:31:    struct net_device [noderef] __rcu *
> net/dsa/dsa_priv.h:194:31:    struct net_device *

Sorry about this. I missed an rtnl_dereference in the refactor between
v2->v3. I have now integrated sparse into my workflow so at least this
should not happen again.
Vladimir Oltean Dec. 2, 2020, 9:32 p.m. UTC | #5
On Wed, Dec 02, 2020 at 10:29:38PM +0100, Tobias Waldekranz wrote:
> Sorry about this. I missed an rtnl_dereference in the refactor between
> v2->v3. I have now integrated sparse into my workflow so at least this
> should not happen again.

Please do not resend right away. Give me one more day or so with the
current series to really digest it.
Vladimir Oltean Dec. 3, 2020, 4:24 p.m. UTC | #6
On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
> +static inline bool dsa_lag_offloading(struct dsa_switch_tree *dst)
> +{
> +	return dst->lags.num > 0;
> +}

You assume that the DSA switch, when it sets a non-zero number of LAGs,
can offload any type of LAG TX type, when in fact the switch might be
able to offload just NETDEV_LAG_TX_TYPE_HASH.

I like the fact that we revert to a software-based implementation for
features the hardware can't offload. So rejecting other TX types is out
of the question. However we still have to prevent hardware bridging.

Should we add an array of supported TX types that the switch port can
offload, and that should be checked by DSA in dsa_lag_offloading?
Vladimir Oltean Dec. 3, 2020, 8:48 p.m. UTC | #7
On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
> +static int dsa_slave_check_lag_upper(struct net_device *dev)
> +{
> +	struct dsa_port *dp = dsa_slave_to_port(dev);
> +	struct dsa_switch_tree *dst = dp->ds->dst;
> +
> +	if (!dsa_lag_offloading(dst))
> +		return NOTIFY_DONE;
> +
> +	if (dsa_lag_by_dev(dst, dev))
> +		return NOTIFY_OK;
> +
> +	if (!dsa_lag_available(dst))
> +		return notifier_from_errno(-EBUSY);

If for any reason there are no LAGs available in hardware, I think this
should still return NOTIFY_OK and we should not reject it, just not
offload it.

Which is to say that I basically don't understand the point of the
PRECHANGEUPPER checks.

> +
> +	return NOTIFY_OK;
> +}
Tobias Waldekranz Dec. 3, 2020, 8:53 p.m. UTC | #8
On Thu, Dec 03, 2020 at 18:24, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
>> +static inline bool dsa_lag_offloading(struct dsa_switch_tree *dst)
>> +{
>> +	return dst->lags.num > 0;
>> +}
>
> You assume that the DSA switch, when it sets a non-zero number of LAGs,
> can offload any type of LAG TX type, when in fact the switch might be
> able to offload just NETDEV_LAG_TX_TYPE_HASH.

Right you are, I had this on my TODO but I must have lost track of
it. Good catch!

> I like the fact that we revert to a software-based implementation for
> features the hardware can't offload. So rejecting other TX types is out
> of the question.

Well if we really want to be precise, we must also ensure that the exact
hash type is supported by the hardware. mv88e6xxx only supports
NETDEV_LAG_HASH_L2 for example. There is a needle to thread here I
think. Story time ('tis the season after all):

    A user, Linus, has just installed OpenWRT on his gateway. Finally,
    he can unlock the full potential of his 802.11 AP by setting up a
    LAG to it.

    He carefully studies teamd.conf(5) and rightfully comes to the
    conclusion that he should set up the tx_hash to include the full
    monty of available keys. Teamd gets nothing but praise from the
    kernel when applying the configuration.

    And yet, Linus is not happy - the throughput between his NAS and his
    smart TV is now lower than before. It is enough for Linus to start
    working on his OS. It won't be big and professional like Linux of
    course, but it will at least get this bit right.

One could argue that if Linus had received an error instead, adapted his
teamd config and tried again, he would be a happier user and we might
not have to compete with his OS.

I am not sure which way is the correct one, but I do not think that it
is necessarily _always_ correct to silently fall back to a non-offloaded
mode.

> However we still have to prevent hardware bridging.

The idea behind checking for dsa_lag_offloading in
dsa_slave_lag_changeupper was exactly this. If the LAG itself is not
offloaded, we should never call dsa_port_bridge_join on the lowers.

> Should we add an array of supported TX types that the switch port can
> offload, and that should be checked by DSA in dsa_lag_offloading?

That would work. We could also create a new DSA op that we would call
for each chip from PRECHANGEUPPER to verify that it is supported. One
advantage with this approach is that we can just pass along the `struct
netdev_lag_upper_info` so drivers always have access to all information,
in the event that new fields are added to it for example.
Andrew Lunn Dec. 3, 2020, 9:09 p.m. UTC | #9
> One could argue that if Linus had received an error instead, adapted his
> teamd config and tried again, he would be a happier user and we might
> not have to compete with his OS.
> 
> I am not sure which way is the correct one, but I do not think that it
> necessarily _always_ correct to silently fallback to a non-offloaded
> mode.

This is an argument Mellanox makes, where falling back to software
would be a bad idea given the huge bandwidth of their hardware
accelerator, and the slow bandwidth of their CPU.

If the switch has no hardware support for LAG at all, then falling
back to software is reasonable. It is less clear when there is some
support in the switch. If we do reject it, we should try to make use
of extack to give the user a useful error messages: Not supported, try
configuration XYZ. But i guess that needs some plumbing, getting
extack available in the place we make the decision.

	Andrew
Tobias Waldekranz Dec. 3, 2020, 9:35 p.m. UTC | #10
On Thu, Dec 03, 2020 at 22:09, Andrew Lunn <andrew@lunn.ch> wrote:
>> One could argue that if Linus had received an error instead, adapted his
>> teamd config and tried again, he would be a happier user and we might
>> not have to compete with his OS.
>> 
>> I am not sure which way is the correct one, but I do not think that it
>> necessarily _always_ correct to silently fallback to a non-offloaded
>> mode.
>
> This is an argument Mellanox makes, where falling back to software
> would be a bad idea given the huge bandwidth of their hardware
> accelerator, and the slow bandwidth of their CPU.

Yeah when you have 100G interfaces the choice becomes easier :)

> If the switch has no hardware support for LAG at all, then falling
> back to software is reasonable. It is less clear when there is some
> support in the switch. If we do reject it, we should try to make use
> of extack to give the user a useful error messages: Not supported, try
> configuration XYZ. But i guess that needs some plumbing, getting
> extack available in the place we make the decision.

I do not think we need to add anything. Looking at mlxsw, the extack is
available in the `struct netdev_notifier_changeupper_info`.

I am leaning towards the behavior you just described:

- If there is no offloading available, accept anything and let the
  software take care of it. The user wants a LAG and that is the best we
  can do.

- If offloading is available, reject anything that can not be
  offloaded. My guess is that _any_ hardware offloaded setup will almost
  always yield a better solution for the user than a software fallback.
Vladimir Oltean Dec. 3, 2020, 9:57 p.m. UTC | #11
On Thu, Dec 03, 2020 at 09:53:08PM +0100, Tobias Waldekranz wrote:
> > I like the fact that we revert to a software-based implementation for
> > features the hardware can't offload. So rejecting other TX types is out
> > of the question.
>
> Well if we really want to be precise, we must also ensure that the exact
> hash type is supported by the hardware. mv88e6xxx only supports
> NETDEV_LAG_HASH_L2 for example. There is a needle to thread here I
> think. Story time ('tis the season after all):

Fine, I'll go along with the story-telling mood, even if that'll make my
message less than concise or informative.

>
>     A user, Linus, has just installed OpenWRT on his gateway. Finally,
>     he can unlock the full potential of his 802.11 AP by setting up a
>     LAG to it.
>
>     He carefully studies teamd.conf(5) and rightfully comes to the
>     conclusion that he should set up the tx_hash to include the full
>     monty of available keys. Teamd gets nothing but praise from the
>     kernel when applying the configuration.
>
>     And yet, Linus is not happy - the throughput between his NAS and his
>     smart TV is now lower than before. It is enough for Linus to start
>     working on his OS. It won't be big and professional like Linux of
>     course, but it will at least get this bit right.
>
> One could argue that if Linus had received an error instead, adapted his
> teamd config and tried again, he would be a happier user and we might
> not have to compete with his OS.
>
> I am not sure which way is the correct one, but I do not think that it
> necessarily _always_ correct to silently fallback to a non-offloaded
> mode.

Of course, neither is fully correct. There is always more to improve on
the communication side of things. But DSA, which stands for "Not Just A
Switch", promises you, above all, a port multiplexer with the ability to
make full use of the network stack. Maybe I'm still under the influence
of a recent customer meeting I had, but there is a large base of use
cases there, where people just don't seem to have enough ports, and they
can just add a $3 switch to their system, and voila, problem solved.
Everything works absolutely the same as before. The newly added switch
ports are completely integrated with the kernel's IP routing database.
They aren't even switch ports, they're just ports.

And even there, on the software fallback front, we don't do enough,
currently. I've had other customers who said that they even _prefer_
to do bridging in software, in order to get a chance to run their
netfilter/ebtables based firewall on their traffic. Of course, DSA
currently has no idea when netfilter rules are installed, so I just told
them to hack the driver and remove the bridging offload, which they
happily did.

I suspect you're looking at this from the wrong angle. I did, too, for
the longest time, because I was focused on squeezing the most out of the
hardware I had. And "I feel cheated because the operating system sits
between me and the performance I want from my hardware" is an argument
I've heard all too often. But not everybody needs even gigabits of
traffic, or absurdly low latency. Making a product that works acceptably
and at a low cost is better than hand-picking only hardware that can
accelerate everything and spending $$$$ on it, for a performance
improvement that no user will notice. Doing link aggregation in software
is fine. Doing bridging in software is fine. Sure, the CPU can't compete
past a number of KPPS, but maybe it doesn't even have to.

Again, this doesn't mean that we should not report, somehow, what is
offloaded and what isn't, and more importantly, the reason why
offloading the set of required actions is not supported. And I do
realize that I wrote a long rant about something that has nothing to do
with the topic at hand, which is: should we deny bonding interfaces that
use balance-rr, active-backup, broadcast, balance-tlb, balance-alb on
top of a DSA interface? Hell no, you wouldn't expect an intel e1000 card
to error out when you would set up a bonding interface that isn't xor or
802.3ad, would you? But you wouldn't go ahead and set up bridging
offload either, therefore running into the problem I raised above with
netfilter rules. You would just set out to do what the user asked for,
in the way that you can, and let them decide if the performance is what
they need or not.

> > Should we add an array of supported TX types that the switch port can
> > offload, and that should be checked by DSA in dsa_lag_offloading?
>
> That would work. We could also create a new DSA op that we would call
> for each chip from PRECHANGEUPPER to verify that it is supported. One
> advantage with this approach is that we can just pass along the `struct
> netdev_lag_upper_info` so drivers always have access to all information,
> in the event that new fields are added to it for example.

Hmm, not super pleased to have yet another function pointer, but I don't
have any other idea, so I guess that works just fine.
I would even go out on a limb and say hardcode the TX_TYPE_HASH in DSA
for now. I would be completely surprised to see hardware that can
offload anything else in the near future.
Tobias Waldekranz Dec. 3, 2020, 11:12 p.m. UTC | #12
On Thu, Dec 03, 2020 at 23:57, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Thu, Dec 03, 2020 at 09:53:08PM +0100, Tobias Waldekranz wrote:
>> I am not sure which way is the correct one, but I do not think that it
>> necessarily _always_ correct to silently fallback to a non-offloaded
>> mode.
>
> Of course, neither is fully correct. There is always more to improve on
> the communication side of things. But DSA, which stands for "Not Just A
> Switch", promises you, above all, a port multiplexer with the ability to
> make full use of the network stack. Maybe I'm still under the influence
> of a recent customer meeting I had, but there is a large base of use
> cases there, where people just don't seem to have enough ports, and they
> can just add a $3 switch to their system, and voila, problem solved.
> Everything works absolutely the same as before. The newly added switch
> ports are completely integrated with the kernel's IP routing database.
> They aren't even switch ports, they're just ports.
>
> And even there, on the software fallback front, we don't do enough,
> currently. I've had other customers who said that they even _prefer_
> to do bridging in software, in order to get a chance to run their
> netfilter/ebtables based firewall on their traffic. Of course, DSA
> currently has no idea when netfilter rules are installed, so I just told
> them to hack the driver and remove the bridging offload, which they
> happily did.

Totally agree with your customer, we should be able to disable all
offloading for an individual port and run it in a plain "NIC mode". In
fact, that might be what opens up a third option for how LAG offloading
should be handled.

> I suspect you're looking at this from the wrong angle. I did, too, for
> the longest time, because I was focused on squeezing the most out of the
> hardware I had. And "I feel cheated because the operating system sits
> between me and the performance I want from my hardware" is an argument
> I've heard all too often. But not everybody needs even gigabits of
> traffic, or absurdly low latency. Making a product that works acceptably
> and at a low cost is better than hand-picking only hardware that can
> accelerate everything and spending $$$$ on it, for a performance
> improvement that no user will notice. Doing link aggregation in software
> is fine. Doing bridging in software is fine. Sure, the CPU can't compete
> past a number of KPPS, but maybe it doesn't even have to.

I really get the need for being able to apply some CPU-duct-tape to
solve that final corner case in a network. Or to use it as a cheap port
expander.

> Again, this doesn't mean that we should not report, somehow, what is
> offloaded and what isn't, and more importantly, the reason why
> offloading the set of required actions is not supported. And I do
> realize that I wrote a long rant about something that has nothing to do
> with the topic at hand, which is: should we deny bonding interfaces that
> use balance-rr, active-backup, broadcast, balance-tlb, balance-alb on
> top of a DSA interface? Hell no, you wouldn't expect an intel e1000 card
> to error out when you would set up a bonding interface that isn't xor or
> 802.3ad, would you? But you wouldn't go ahead and set up bridging
> offload either, therefore running into the problem I raised above with
> netfilter rules. You would just set out to do what the user asked for,
> in the way that you can, and let them decide if the performance is what
> they need or not.

You make a lot of good points. I think it might be better to force the
user to be explicit about their choice though. Imagine something like
this:

- We add NETIF_F_SWITCHDEV_OFFLOAD, which is set on switchdev ports by
  default. This flag is only allowed to be toggled when the port has no
  uppers - we do not want to deal with a port in a LAG in a bridge all
  of a sudden changing mode.

- If it is set, we only allow uppers/tc-rules/etc that we can
  offload. If the user tries to configure something outside of that, we
  can suggest disabling offloading in the error we emit.

- If it is not set, we just sit back and let the kernel do its thing.

This would work well both for exotic LAG modes and for advanced
netfilter(ebtables)/tc setups I think. Example session:

$ ip link add dev bond0 type bond mode balance-rr
$ ip link set dev swp0 master bond0
Error: swp0: balance-rr not supported when using switchdev offloading
$ ethtool -K swp0 switchdev off
$ ip link set dev swp0 master bond0
$ echo $?
0

>> > Should we add an array of supported TX types that the switch port can
>> > offload, and that should be checked by DSA in dsa_lag_offloading?
>>
>> That would work. We could also create a new DSA op that we would call
>> for each chip from PRECHANGEUPPER to verify that it is supported. One
>> advantage with this approach is that we can just pass along the `struct
>> netdev_lag_upper_info` so drivers always have access to all information,
>> in the event that new fields are added to it for example.
>
> Hmm, not super pleased to have yet another function pointer, but I don't
> have any other idea, so I guess that works just fine.

Yeah I do not like it either, maybe it is just the least bad thing.

> I would even go out on a limb and say hardcode the TX_TYPE_HASH in DSA
> for now. I would be completely surprised to see hardware that can
> offload anything else in the near future.

If you tilt your head a little, I think active backup is really just a
trivial case of a hashed LAG wherein only a single member is ever
active. I.e. all buckets are always allocated to one port (effectively
negating the hashing). The active member is controlled by software, so I
think we should be able to support that.

mv88e6xxx could also theoretically be made to support broadcast. You can
enable any given bucket on multiple ports, but that seems silly.
Vladimir Oltean Dec. 4, 2020, 12:35 a.m. UTC | #13
On Thu, Dec 03, 2020 at 10:35:12PM +0100, Tobias Waldekranz wrote:
> - If offloading is available, reject anything that can not be
>   offloaded. My guess is that _any_ hardware offloaded setup will almost
>   always yield a better solution for the user than a software fallback.

For the case where the switch ports are not bridged, just under a bond,
do you expect the hardware offloading to make any sort of difference?
If you don't, then why would we deny software-applied bonding policies?
Vladimir Oltean Dec. 4, 2020, 12:56 a.m. UTC | #14
On Fri, Dec 04, 2020 at 12:12:32AM +0100, Tobias Waldekranz wrote:
> You make a lot of good points. I think it might be better to force the
> user to be explicit about their choice though. Imagine something like
> this:
>
> - We add NETIF_F_SWITCHDEV_OFFLOAD, which is set on switchdev ports by
>   default. This flag is only allowed to be toggled when the port has no
>   uppers - we do not want to deal with a port in a LAG in a bridge all
>   of a sudden changing mode.
>
> - If it is set, we only allow uppers/tc-rules/etc that we can
>   offload. If the user tries to configure something outside of that, we
>   can suggest disabling offloading in the error we emit.
>
> - If it is not set, we just sit back and let the kernel do its thing.
>
> This would work well both for exotic LAG modes and for advanced
> netfilter(ebtables)/tc setups I think. Example session:
>
> $ ip link add dev bond0 type bond mode balance-rr
> $ ip link set dev swp0 master bond0
> Error: swp0: balance-rr not supported when using switchdev offloading
> $ ethtool -K swp0 switchdev off
> $ ip link set dev swp0 master bond0
> $ echo $?
> 0

And you want the default to be what, on or off? I believe on?
I'd say the default should be off. The idea being that you could have
"write once, run everywhere" types of scripts. You can only get that
behavior with "off", otherwise you'd get random errors on some equipment
and it wouldn't be portable. And "ethtool -K swp0 switchdev off" is a
bit of a strange incantation to add to every script just to avoid
errors.. But if the default switchdev mode is off, then what's the
point in even having the knob, your poor Linus will still be confused
and frustrated, and it won't help him any bit if he can flip the switch
- it's too late, he already knows what the problem is by the time he
finds the switch.

> > I would even go out on a limb and say hardcode the TX_TYPE_HASH in DSA
> > for now. I would be completely surprised to see hardware that can
> > offload anything else in the near future.
>
> If you tilt your head a little, I think active backup is really just a
> trivial case of a hashed LAG wherein only a single member is ever
> active. I.e. all buckets are always allocated to one port (effectivly
> negating the hashing). The active member is controlled by software, so I
> think we should be able to support that.

Yup, my head is tilted and I see it now. If I understand this mode
(never used it), then any hardware switch that can offload bridging can
also offload the active-backup LAG.

> mv88e6xxx could also theoretically be made to support broadcast. You can
> enable any given bucket on multiple ports, but that seems silly.

Yeah, the broadcast bonding mode looks like an oddball. It sounds to me
almost like HSR/PRP/FRER but without the sequence numbering, which is a
surefire way to make a mess out of everything. I have no idea how it is
used (how duplicate elimination is achieved).
Andrew Lunn Dec. 4, 2020, 1:33 a.m. UTC | #15
> Of course, neither is fully correct. There is always more to improve on
> the communication side of things.

I wonder if switchdev needs to gain an enumeration API? A way to ask
the underlying driver, what can you offload? The user can then get an
idea what is likely to be offloaded, and what not. If that API is fine
grain enough, it can list the different LAG algorithms supported.

      Andrew
Andrew Lunn Dec. 4, 2020, 2:20 a.m. UTC | #16
> +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
> +{
> +	struct dsa_port *dp;
> +	unsigned int num;
> +
> +	list_for_each_entry(dp, &dst->ports, list)
> +		num = dp->ds->num_lags;
> +
> +	list_for_each_entry(dp, &dst->ports, list)
> +		num = min(num, dp->ds->num_lags);

Do you really need to loop over the list twice? Cannot num be
initialised to UINT_MAX and then just do the second loop.

> +static inline bool dsa_port_can_offload(struct dsa_port *dp,
> +					struct net_device *dev)

That name is a bit generic. We have a number of different offloads.
The mv88E6060 cannot offload anything!

> +{
> +	/* Switchdev offloading can be configured on: */
> +
> +	if (dev == dp->slave)
> +		/* DSA ports directly connected to a bridge. */
> +		return true;
> +
> +	if (dp->lag && dev == rtnl_dereference(dp->lag->dev))
> +		/* DSA ports connected to a bridge via a LAG */
> +		return true;
> +
> +	return false;
> +}

> +static void dsa_lag_put(struct dsa_switch_tree *dst, struct dsa_lag *lag)
> +{
> +	if (!refcount_dec_and_test(&lag->refcount))
> +		return;
> +
> +	clear_bit(lag->id, dst->lags.busy);
> +	WRITE_ONCE(lag->dev, NULL);
> +	memset(lag, 0, sizeof(*lag));
> +}

I don't know what the locking is here, but wouldn't it be safer to
clear the bit last, after the memset and WRITE_ONCE.

    Andrew
Florian Fainelli Dec. 4, 2020, 4:04 a.m. UTC | #17
On 12/2/2020 1:13 AM, Tobias Waldekranz wrote:
> Monitor the following events and notify the driver when:
> 
> - A DSA port joins/leaves a LAG.
> - A LAG, made up of DSA ports, joins/leaves a bridge.
> - A DSA port in a LAG is enabled/disabled (enabled meaning
>   "distributing" in 802.3ad LACP terms).
> 
> Each LAG interface to which a DSA port is attached is represented by a
> `struct dsa_lag` which is globally reachable from the switch tree and
> from each associated port.
> 
> When a LAG joins a bridge, the DSA subsystem will treat that as each
> individual port joining the bridge. The driver may look at the port's
> LAG pointer to see if it is associated with any LAG, if that is
> required. This is analogue to how switchdev events are replicated out
> to all lower devices when reaching e.g. a LAG.
> 
> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>

Vladimir and Andrew have already spotted what I was going to comment on,
just a few suggestions below:
[snip]

> +struct dsa_lag {
> +	struct net_device *dev;
> +	int id;

unsigned int?

> +
> +	struct list_head ports;
> +
> +	/* For multichip systems, we must ensure that each hash bucket
> +	 * is only enabled on a single egress port throughout the
> +	 * whole tree, lest we send duplicates. Therefore we must
> +	 * maintain a global list of active tx ports, so that each
> +	 * switch can figure out which buckets to enable on which
> +	 * ports.
> +	 */
> +	struct list_head tx_ports;
> +	int num_tx;

unsigned int?

Which if you change the type would require you to also change the types
of some iterators you used.
Florian Fainelli Dec. 4, 2020, 4:18 a.m. UTC | #18
On 12/3/2020 5:33 PM, Andrew Lunn wrote:
>> Of course, neither is fully correct. There is always more to improve on
>> the communication side of things.
> 
> I wonder if switchdev needs to gain an enumeration API? A way to ask
> the underlying driver, what can you offload? The user can then get an
> idea what is likely to be offloaded, and what not. If that API is fine
> grain enough, it can list the different LAG algorithms supported.

For stack offloads we can probably easily agree on what constitutes a
vendor neutral offload and a name for that enumeration. For other
features this is going to become an unmaintainable list of features and
then we are no better than we started 6 years ago with submitting
OpenWrt's swconfig and each switch driver advertising its features and
configuration API via netlink.

NETIF_F_SWITCHDEV_OFFLOAD would not be fine grained enough, this needs
to be a per action selection, just like when offloading the bridge, or
tc, you need to be able to hint the driver whether the offload is being
requested by the user.

For now, I would just go with implicitly falling back to doing the LAG
in software if the requested mode is not supported and leveraging extack
to indicate that was the case.
Tobias Waldekranz Dec. 7, 2020, 9:19 p.m. UTC | #19
On Fri, Dec 04, 2020 at 03:20, Andrew Lunn <andrew@lunn.ch> wrote:
>> +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
>> +{
>> +	struct dsa_port *dp;
>> +	unsigned int num;
>> +
>> +	list_for_each_entry(dp, &dst->ports, list)
>> +		num = dp->ds->num_lags;
>> +
>> +	list_for_each_entry(dp, &dst->ports, list)
>> +		num = min(num, dp->ds->num_lags);
>
> Do you really need to loop over the list twice? Cannot num be
> initialised to UINT_MAX and then just do the second loop.

I was mostly paranoid about the case where, for some reason, the list of
ports was empty due to an invalid DT or something. But I now see that
since num is not initialized, that would not have helped.

So, is my paranoia valid, i.e. fix is `unsigned int num = 0`? Or can
that never happen, i.e. fix is to initialize to UINT_MAX and remove
first loop?

>> +static inline bool dsa_port_can_offload(struct dsa_port *dp,
>> +					struct net_device *dev)
>
> That name is a bit generic. We have a number of different offloads.
> The mv88E6060 cannot offload anything!

The name is intentionally generic as it answers the question "can this
dp offload requests for this netdev?"

>> +{
>> +	/* Switchdev offloading can be configured on: */
>> +
>> +	if (dev == dp->slave)
>> +		/* DSA ports directly connected to a bridge. */
>> +		return true;

This condition is the normal case of a bridged port, i.e. no LAG
involved.

>> +	if (dp->lag && dev == rtnl_dereference(dp->lag->dev))
>> +		/* DSA ports connected to a bridge via a LAG */
>> +		return true;

And then the indirect case of a bridged port under a LAG.

I am happy to take requests for a better name though.

>> +	return false;
>> +}
>
>> +static void dsa_lag_put(struct dsa_switch_tree *dst, struct dsa_lag *lag)
>> +{
>> +	if (!refcount_dec_and_test(&lag->refcount))
>> +		return;
>> +
>> +	clear_bit(lag->id, dst->lags.busy);
>> +	WRITE_ONCE(lag->dev, NULL);
>> +	memset(lag, 0, sizeof(*lag));
>> +}
>
> I don't know what the locking is here, but wouldn't it be safer to
> clear the bit last, after the memset and WRITE_ONCE.

All writers of dst->lags.busy are serialized with respect to dsa_lag_put
(on rtnl_lock), and concurrent readers (dsa_lag_dev_by_id) start by
checking busy before reading lag->dev. To my understanding, WRITE_ONCE
would insert the proper fence to make sure busy was cleared before
clearing dev?

>     Andrew
Tobias Waldekranz Dec. 7, 2020, 9:49 p.m. UTC | #20
On Fri, Dec 04, 2020 at 02:56, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Fri, Dec 04, 2020 at 12:12:32AM +0100, Tobias Waldekranz wrote:
>> You make a lot of good points. I think it might be better to force the
>> user to be explicit about their choice though. Imagine something like
>> this:
>>
>> - We add NETIF_F_SWITCHDEV_OFFLOAD, which is set on switchdev ports by
>>   default. This flag is only allowed to be toggled when the port has no
>>   uppers - we do not want to deal with a port in a LAG in a bridge all
>>   of a sudden changing mode.
>>
>> - If it is set, we only allow uppers/tc-rules/etc that we can
>>   offload. If the user tries to configure something outside of that, we
>>   can suggest disabling offloading in the error we emit.
>>
>> - If it is not set, we just sit back and let the kernel do its thing.
>>
>> This would work well both for exotic LAG modes and for advanced
>> netfilter(ebtables)/tc setups I think. Example session:
>>
>> $ ip link add dev bond0 type bond mode balance-rr
>> $ ip link set dev swp0 master bond0
>> Error: swp0: balance-rr not supported when using switchdev offloading
>> $ ethtool -K swp0 switchdev off
>> $ ip link set dev swp0 master bond0
>> $ echo $?
>> 0
>
> And you want the default to be what, on or off? I believe on?
> I'd say the default should be off. The idea being that you could have
> "write once, run everywhere" types of scripts. You can only get that
> behavior with "off", otherwise you'd get random errors on some equipment
> and it wouldn't be portable. And "ethtool -K swp0 switchdev off" is a
> bit of a strange incantation to add to every script just to avoid
> errors.. But if the default switchdev mode is off, then what's the
> point in even having the knob, your poor Linus will still be confused
> and frustrated, and it won't help him any bit if he can flip the switch
> - it's too late, he already knows what the problem is by the time he
> finds the switch.

Yeah I can not argue with that. OK, I surrender, software fallback it
is.

>> > I would even go out on a limb and say hardcode the TX_TYPE_HASH in DSA
>> > for now. I would be completely surprised to see hardware that can
>> > offload anything else in the near future.
>>
>> If you tilt your head a little, I think active backup is really just a
>> trivial case of a hashed LAG wherein only a single member is ever
>> active. I.e. all buckets are always allocated to one port (effectivly
>> negating the hashing). The active member is controlled by software, so I
>> think we should be able to support that.
>
> Yup, my head is tilted and I see it now. If I understand this mode
> (never used it), then any hardware switch that can offload bridging can
> also offload the active-backup LAG.

Neither have I, but I guess you still need an actual LAG to associate
neighbors with, instead of the physical port? Maybe ocelot is different,
but on mv88e6xxx you otherwise get either (1) packet loss when the
active member changes or (2) duplicated packets when more than one
member is active.

>> mv88e6xxx could also theoretically be made to support broadcast. You can
>> enable any given bucket on multiple ports, but that seems silly.
>
> Yeah, the broadcast bonding mode looks like an oddball. It sounds to me
> almost like HSR/PRP/FRER but without the sequence numbering, which is a
> surefire way to make a mess out of everything. I have no idea how it is
> used (how duplicate elimination is achieved).

That is the way I interpret it as well. I suppose the dedup is done on
some higher layer, or it is some kind of redundant rx-only recorder or
something.
Tobias Waldekranz Dec. 7, 2020, 9:56 p.m. UTC | #21
On Thu, Dec 03, 2020 at 20:18, Florian Fainelli <f.fainelli@gmail.com> wrote:
> On 12/3/2020 5:33 PM, Andrew Lunn wrote:
>>> Of course, neither is fully correct. There is always more to improve on
>>> the communication side of things.
>> 
>> I wonder if switchdev needs to gain an enumeration API? A way to ask
>> the underlying driver, what can you offload? The user can then get an
>> idea what is likely to be offloaded, and what not. If that API is fine
>> grain enough, it can list the different LAG algorithms supported.
>
> For stack offloads we can probably easily agree on what constitutes a
> vendor neutral offload and a name for that enumeration. For other
> features this is going to become an unmaintainable list of features and
> then we are no better than we started 6 years ago with submitting
> OpenWrt's swconfig and each switch driver advertising its features and
> configuration API via netlink.
>
> NETIF_F_SWITCHDEV_OFFLOAD would not be fine grained enough, this needs
> to be a per action selection, just like when offloading the bridge, or
> tc, you need to be able to hint the driver whether the offload is being
> requested by the user.

That makes sense. So you are talking about adding something akin to tc's
skip_hw/skip_sw to `ip link`?

> For now, I would just go with implicitly falling back to doing the LAG
> in software if the requested mode is not supported and leveraging extack
> to indicate that was the case.

Ahh, you can use extack for successful operations? I did not know that,
I think that strikes a good balance.
Andrew Lunn Dec. 7, 2020, 11:26 p.m. UTC | #22
On Mon, Dec 07, 2020 at 10:19:47PM +0100, Tobias Waldekranz wrote:
> On Fri, Dec 04, 2020 at 03:20, Andrew Lunn <andrew@lunn.ch> wrote:
> >> +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
> >> +{
> >> +	struct dsa_port *dp;
> >> +	unsigned int num;
> >> +
> >> +	list_for_each_entry(dp, &dst->ports, list)
> >> +		num = dp->ds->num_lags;
> >> +
> >> +	list_for_each_entry(dp, &dst->ports, list)
> >> +		num = min(num, dp->ds->num_lags);
> >
> > Do you really need to loop over the list twice? Cannot num be
> > initialised to UINT_MAX and then just do the second loop.
> 
> I was mostly paranoid about the case where, for some reason, the list of
> ports was empty due to an invalid DT or something. But I now see that
> since num is not initialized, that would not have helped.
> 
> So, is my paranoia valid, i.e. fix is `unsigned int num = 0`? Or can
> that never happen, i.e. fix is to initialize to UINT_MAX and remove
> first loop?

I would probably initialize to UINT_MAX, and do a WARN_ON(num ==
UINT_MAX) afterwards. I don't think the DT parsing code prevents a
setup with no ports, so it could happen.

> >> +static inline bool dsa_port_can_offload(struct dsa_port *dp,
> >> +					struct net_device *dev)
> >
> > That name is a bit generic. We have a number of different offloads.
> > The mv88E6060 cannot offload anything!
> 
> The name is intentionally generic as it answers the question "can this
> dp offload requests for this netdev?"

I think it is a bit more specific. Mirroring is an offload, but is not
taken into account here, and does mirroring setup call this to see if
mirroring can be offloaded? The hardware has rate control traffic
shaping which we might sometime add support for via TC. That again is
an offload.

> >> +{
> >> +	/* Switchdev offloading can be configured on: */
> >> +
> >> +	if (dev == dp->slave)
> >> +		/* DSA ports directly connected to a bridge. */
> >> +		return true;
> 
> This condition is the normal case of a bridged port, i.e. no LAG
> involved.
> 
> >> +	if (dp->lag && dev == rtnl_dereference(dp->lag->dev))
> >> +		/* DSA ports connected to a bridge via a LAG */
> >> +		return true;
> 
> And then the indirect case of a bridged port under a LAG.
> 
> I am happy to take requests for a better name though.

There probably needs to be lag in the name.

> 
> >> +	return false;
> >> +}
> >
> >> +static void dsa_lag_put(struct dsa_switch_tree *dst, struct dsa_lag *lag)
> >> +{
> >> +	if (!refcount_dec_and_test(&lag->refcount))
> >> +		return;
> >> +
> >> +	clear_bit(lag->id, dst->lags.busy);
> >> +	WRITE_ONCE(lag->dev, NULL);
> >> +	memset(lag, 0, sizeof(*lag));
> >> +}
> >
> > I don't know what the locking is here, but wouldn't it be safer to
> > clear the bit last, after the memset and WRITE_ONCE.
> 
> All writers of dst->lags.busy are serialized with respect to dsa_lag_put
> (on rtnl_lock), and concurrent readers (dsa_lag_dev_by_id) start by
> checking busy before reading lag->dev. To my understanding, WRITE_ONCE
> would insert the proper fence to make sure busy was cleared before
> clearing dev?

I was thinking about the lag being freed and then immediately
reused. So it sounds like the RTNL allows the WRITE_ONCE and memset to
happen before the next user comes along. So this is O.K.  But maybe
you can document your locking design?

       Andrew
Vladimir Oltean Dec. 8, 2020, 11:23 a.m. UTC | #23
Hi Tobias,

On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
> Monitor the following events and notify the driver when:
>
> - A DSA port joins/leaves a LAG.
> - A LAG, made up of DSA ports, joins/leaves a bridge.
> - A DSA port in a LAG is enabled/disabled (enabled meaning
>   "distributing" in 802.3ad LACP terms).
>
> Each LAG interface to which a DSA port is attached is represented by a
> `struct dsa_lag` which is globally reachable from the switch tree and
> from each associated port.
>
> When a LAG joins a bridge, the DSA subsystem will treat that as each
> individual port joining the bridge. The driver may look at the port's
> LAG pointer to see if it is associated with any LAG, if that is
> required. This is analogue to how switchdev events are replicated out
> to all lower devices when reaching e.g. a LAG.
>
> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
> ---
>
> +struct dsa_lag {
> +	struct net_device *dev;
> +	int id;
> +
> +	struct list_head ports;
> +
> +	/* For multichip systems, we must ensure that each hash bucket
> +	 * is only enabled on a single egress port throughout the
> +	 * whole tree, lest we send duplicates. Therefore we must
> +	 * maintain a global list of active tx ports, so that each
> +	 * switch can figure out which buckets to enable on which
> +	 * ports.
> +	 */
> +	struct list_head tx_ports;
> +	int num_tx;
> +
> +	refcount_t refcount;
> +};

Sorry it took so long. I wanted to understand:
(a) where the challenges are for drivers to uniformly support software
    bridging when they already have code for bridge offloading. I found
    the following issues:
    - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
      which kind of prevents software bridging. I'm not sure what the
      fix for these should be.
    - Source address is a big problem, but this time not in the sense
      that it traditionally has been. Specifically, due to address
      learning being enabled, the hardware FDB will set destinations to
      take the autonomous fast path. But surprise, the autonomous fast
      path is blocked, because as far as the switch is concerned, the
      ports are standalone and not offloading the bridge. We have drivers
      that don't disable address learning when they operate in standalone
      mode, which is something they definitely should do.
    There is nothing actionable for you in this patch set to resolve this.
    I just wanted to get an idea.
(b) Whether struct dsa_lag really brings us any significant benefit. I
    found that it doesn't. It's a lot of code added to the DSA core, that
    should not really belong in the middle layer. I need to go back and
    quote your motivation in the RFC:

| All LAG configuration is cached in `struct dsa_lag`s. I realize that
| the standard M.O. of DSA is to read back information from hardware
| when required. With LAGs this becomes very tricky though. For example,
| the change of a link state on one switch will require re-balancing of
| LAG hash buckets on another one, which in turn depends on the total
| number of active links in the LAG. Do you agree that this is
| motivated?

    After reimplementing bonding offload in ocelot, I have found
    struct dsa_lag to not provide any benefit. All the information a
    driver needs is already provided through the
    struct net_device *lag_dev argument given to lag_join and lag_leave,
    and through the struct netdev_lag_lower_state_info *info given to
    lag_change. I will send an RFC to you and the list shortly to prove
    that this information is absolutely sufficient for the driver to do
    decent internal bookkeeping, and that DSA should not really care
    beyond that.

    There are two points to be made:
    - Recently we have seen people with non-DSA (pure switchdev) hardware
      being compelled to write DSA drivers, because they noticed that a
      large part of the middle layer had already been written, and it
      presents an API with a lot of syntactic sugar. Maybe there is a
      larger issue here in that the switchdev offloading APIs are fairly
      bulky and repetitive, but that does not mean that we should be
      encouraging the attitude "come to DSA, we have cookies".
      https://lwn.net/ml/linux-kernel/20201125232459.378-1-lukma@denx.de/
    - Remember that the only reason why the DSA framework and the
      syntactic sugar exists is that we are presenting the hardware a
      unified view for the ports which have a struct net_device registered,
      and the ports which don't (DSA links and CPU ports). The argument
      really needs to be broken down into two:
      - For cross-chip DSA links, I can see why it was convenient for
        you to have the dsa_lag_by_dev(ds->dst, lag_dev) helper. But
        just as we currently have a struct net_device *bridge_dev in
        struct dsa_port, so we could have a struct net_device *bond,
        without the extra fat of struct dsa_lag, and reference counting,
        active ports, etc etc, would become simpler (actually inexistent
        as far as the DSA layer is concerned). Two ports are in the same
        bond if they have the same struct net_device *bond, just as they
        are bridged if they have the same struct net_device *bridge_dev.
      - For CPU ports, this raises an important question, which is
        whether LAG on switches with multiple CPU ports is ever going to
        be a thing. And if it is, how it is even going to be configured
        from the user's perspective. Because on a multi-CPU port system,
        you should actually see it as two bonding interfaces back to back.
        First, there's the bonding interface that spans the DSA masters.
        That needs no hardware offloading. Then there's the bonding
        interface that is the mirror image of that, and spans the CPU
        ports. I think this is a bit up in the air now. Because with
        your struct dsa_lag or without, we still have no bonding device
        associated with it, so things like the balancing policy are not
        really defined.

I would like you to reiterate some of the reasons why you prefer having
struct dsa_lag.
Tobias Waldekranz Dec. 8, 2020, 3:33 p.m. UTC | #24
On Tue, Dec 08, 2020 at 13:23, Vladimir Oltean <olteanv@gmail.com> wrote:
> Hi Tobias,
>
> On Wed, Dec 02, 2020 at 10:13:54AM +0100, Tobias Waldekranz wrote:
>> Monitor the following events and notify the driver when:
>>
>> - A DSA port joins/leaves a LAG.
>> - A LAG, made up of DSA ports, joins/leaves a bridge.
>> - A DSA port in a LAG is enabled/disabled (enabled meaning
>>   "distributing" in 802.3ad LACP terms).
>>
>> Each LAG interface to which a DSA port is attached is represented by a
>> `struct dsa_lag` which is globally reachable from the switch tree and
>> from each associated port.
>>
>> When a LAG joins a bridge, the DSA subsystem will treat that as each
>> individual port joining the bridge. The driver may look at the port's
>> LAG pointer to see if it is associated with any LAG, if that is
>> required. This is analogue to how switchdev events are replicated out
>> to all lower devices when reaching e.g. a LAG.
>>
>> Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
>> ---
>>
>> +struct dsa_lag {
>> +	struct net_device *dev;
>> +	int id;
>> +
>> +	struct list_head ports;
>> +
>> +	/* For multichip systems, we must ensure that each hash bucket
>> +	 * is only enabled on a single egress port throughout the
>> +	 * whole tree, lest we send duplicates. Therefore we must
>> +	 * maintain a global list of active tx ports, so that each
>> +	 * switch can figure out which buckets to enable on which
>> +	 * ports.
>> +	 */
>> +	struct list_head tx_ports;
>> +	int num_tx;
>> +
>> +	refcount_t refcount;
>> +};
>
> Sorry it took so long. I wanted to understand:
> (a) where are the challenged for drivers to uniformly support software
>     bridging when they already have code for bridge offloading. I found
>     the following issues:
>     - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
>       which kind of prevents software bridging. I'm not sure what the
>       fix for these should be.

At least on mv88e6xxx you would not be able to determine this simply
from looking at the tag. Both in standalone mode and bridged mode, you
would receive FORWARDs with the same source. You could look at
dp->bridge_dev to figure it out though.

>     - Source address is a big problem, but this time not in the sense
>       that it traditionally has been. Specifically, due to address
>       learning being enabled, the hardware FDB will set destinations to
>       take the autonomous fast path. But surprise, the autonomous fast
>       path is blocked, because as far as the switch is concerned, the
>       ports are standalone and not offloading the bridge. We have drivers
>       that don't disable address learning when they operate in standalone
>       mode, which is something they definitely should do.

Some hardware can function with it on (e.g. mv88e6xxx can associate an
FDB per port), but there is no reason to do it, so yes it should be
disabled.

>     There is nothing actionable for you in this patch set to resolve this.
>     I just wanted to get an idea.
> (b) Whether struct dsa_lag really brings us any significant benefit. I
>     found that it doesn't. It's a lot of code added to the DSA core, that
>     should not really belong in the middle layer. I need to go back and
>     quote your motivation in the RFC:
>
> | All LAG configuration is cached in `struct dsa_lag`s. I realize that
> | the standard M.O. of DSA is to read back information from hardware
> | when required. With LAGs this becomes very tricky though. For example,
> | the change of a link state on one switch will require re-balancing of
> | LAG hash buckets on another one, which in turn depends on the total
> | number of active links in the LAG. Do you agree that this is
> | motivated?
>
>     After reimplementing bonding offload in ocelot, I have found
>     struct dsa_lag to not provide any benefit. All the information a
>     driver needs is already provided through the
>     struct net_device *lag_dev argument given to lag_join and lag_leave,
>     and through the struct netdev_lag_lower_state_info *info given to
>     lag_change. I will send an RFC to you and the list shortly to prove
>     that this information is absolutely sufficient for the driver to do
>     decent internal bookkeeping, and that DSA should not really care
>     beyond that.

Do you have a multi-chip setup? If not then I understand that `struct
dsa_lag` does not give you anything extra. In a multi-chip scenario
things become harder. Example:

.-----.   .-----.
| sw0 +---+ sw1 |
'-+-+-'3 3'--+--'
  1 2        1

Let's say that sw0p1, sw0p2 and sw1p1 are in a LAG. This system can hash
flows into 8 buckets. So with all ports active you would need an
allocation like this:

sw0p1: 0,1,2
sw0p2: 3,4,5
sw1p1: 6,7

For some reason, the system determines that sw0p2 is now inactive and
the LAG should be rebalanced over the two remaining active links:

sw0p1: 0,1,2,3
sw0p2: -
sw1p1: 4,5,6,7

In order for sw0 and sw1 to agree on the assignment they need access to
a shared view of the LAG at the tree level, both about the set of active
ports and their ordering. This is `struct dsa_lag`'s main raison d'être.

The same goes for when a port joins/leaves a LAG. For example, if sw1p1
was to leave the LAG, we want to make sure that we do not needlessly
flood LAG traffic over the backplane (sw0p3<->sw1p3). If you want to
solve this at the ds level without `struct dsa_lag`, you need a refcount
per backplane port in order to figure out if the leaving port was the
last one behind that backplane port.


>     There are two points to be made:
>     - Recently we have seen people with non-DSA (pure switchdev) hardware
>       being compelled to write DSA drivers, because they noticed that a
>       large part of the middle layer had already been written, and it
>       presents an API with a lot of syntactic sugar. Maybe there is a
>       larger issue here in that the switchdev offloading APIs are fairly
>       bulky and repetitive, but that does not mean that we should be
>       encouraging the attitude "come to DSA, we have cookies".
>       https://lwn.net/ml/linux-kernel/20201125232459.378-1-lukma@denx.de/

I think you are right, but having written a (non-DSA) switchdev driver
myself, I understand where that desire comes from. I have found myself
looking/copying stuff from mlxsw and dsa that could have been provided
as a shared switchdev library. Things like demuxing all of the different
events to figure out when uppers change, etc.

>     - Remember that the only reason why the DSA framework and the
>       syntactic sugar exists is that we are presenting the hardware a
>       unified view for the ports which have a struct net_device registered,
>       and the ports which don't (DSA links and CPU ports). The argument
>       really needs to be broken down into two:
>       - For cross-chip DSA links, I can see why it was convenient for
>         you to have the dsa_lag_by_dev(ds->dst, lag_dev) helper. But
>         just as we currently have a struct net_device *bridge_dev in
>         struct dsa_port, so we could have a struct net_device *bond,
>         without the extra fat of struct dsa_lag, and reference counting,
>         active ports, etc etc, would become simpler (actually inexistent
>         as far as the DSA layer is concerned). Two ports are in the same
>         bond if they have the same struct net_device *bond, just as they
>         are bridged if they have the same struct net_device *bridge_dev.
>       - For CPU ports, this raises an important question, which is
>         whether LAG on switches with multiple CPU ports is ever going to
>         be a thing. And if it is, how it is even going to be configured
>         from the user's perspective. Because on a multi-CPU port system,
>         you should actually see it as two bonding interfaces back to back.
>         First, there's the bonding interface that spans the DSA masters.
>         That needs no hardware offloading. Then there's the bonding
>         interface that is the mirror image of that, and spans the CPU
>         ports. I think this is a bit up in the air now. Because with

Aside. On our devices we use the term cpu0, cpu1 etc. to refer to a
switch port that is connected to a CPU. The CPU side of those
connections are chan0, chan1 ("channel"). I am not saying we have to
adopt those, but some unambiguous terms would be great in these
conversations.

>         your struct dsa_lag or without, we still have no bonding device
>         associated with it, so things like the balancing policy are not
>         really defined.
>

I have a different take on that. I do not think you need to create a
user-visible LAG at all in that case. You just setup the hardware to
treat the two CPU ports as a static LAG based on the information from
the DT. Then you attach the same rx handler to both. On tx you hash and
loadbalance on flows that allow it (FORWARDs on mv88e6xxx) and use the
primary CPU port for control traffic (FROM_CPU).

The CPU port is completely defined by the DT today, so I do not see why
we could not add balancing policy to that if it is ever required.

> I would like you to reiterate some of the reasons why you prefer having
> struct dsa_lag.

I hope I did that already. But I will add that if there was a dst->priv
for the drivers to use as they see fit, I guess the bookkeeping could be
moved into the mv88e6xxx driver instead if you feel it pollutes the DSA
layer. Maybe you can not assume that all chips in a tree use a
compatible driver though?

Are there any other drivers that support multi-chip that might want to
use the same thing?
Vladimir Oltean Dec. 8, 2020, 4:37 p.m. UTC | #25
On Tue, Dec 08, 2020 at 04:33:19PM +0100, Tobias Waldekranz wrote:
> On Tue, Dec 08, 2020 at 13:23, Vladimir Oltean <olteanv@gmail.com> wrote:
> > Sorry it took so long. I wanted to understand:
> > (a) where are the challenged for drivers to uniformly support software
> >     bridging when they already have code for bridge offloading. I found
> >     the following issues:
> >     - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
> >       which kind of prevents software bridging. I'm not sure what the
> >       fix for these should be.
>
> At least on mv88e6xxx you would not be able to determine this simply
> from looking at the tag. Both in standalone mode and bridged mode, you
> would receive FORWARDs with the same source. You could look at
> dp->bridge_dev to figure it out though.

Yes, but that raises the question whether it should be DSA that fixes it
up globally for everyone, like in dsa_switch_rcv:

	if (!dp->bridge_dev)
		skb->offload_fwd_mark = 0;

with a nice comment above that everyone can refer to,
or make each and every tagger do this. I'm leaning towards the latter.

> >     - Source address is a big problem, but this time not in the sense
> >       that it traditionally has been. Specifically, due to address
> >       learning being enabled, the hardware FDB will set destinations to
> >       take the autonomous fast path. But surprise, the autonomous fast
> >       path is blocked, because as far as the switch is concerned, the
> >       ports are standalone and not offloading the bridge. We have drivers
> >       that don't disable address learning when they operate in standalone
> >       mode, which is something they definitely should do.
>
> Some hardware can function with it on (e.g. mv88e6xxx can associate an
> FDB per port), but there is no reason to do it, so yes it should be
> disabled.

So how does mv88e6xxx handle the address learning issue currently?

> >     There is nothing actionable for you in this patch set to resolve this.
> >     I just wanted to get an idea.
> > (b) Whether struct dsa_lag really brings us any significant benefit. I
> >     found that it doesn't. It's a lot of code added to the DSA core, that
> >     should not really belong in the middle layer. I need to go back and
> >     quote your motivation in the RFC:
> >
> > | All LAG configuration is cached in `struct dsa_lag`s. I realize that
> > | the standard M.O. of DSA is to read back information from hardware
> > | when required. With LAGs this becomes very tricky though. For example,
> > | the change of a link state on one switch will require re-balancing of
> > | LAG hash buckets on another one, which in turn depends on the total
> > | number of active links in the LAG. Do you agree that this is
> > | motivated?
> >
> >     After reimplementing bonding offload in ocelot, I have found
> >     struct dsa_lag to not provide any benefit. All the information a
> >     driver needs is already provided through the
> >     struct net_device *lag_dev argument given to lag_join and lag_leave,
> >     and through the struct netdev_lag_lower_state_info *info given to
> >     lag_change. I will send an RFC to you and the list shortly to prove
> >     that this information is absolutely sufficient for the driver to do
> >     decent internal bookkeeping, and that DSA should not really care
> >     beyond that.
>
> Do you have a multi-chip setup? If not then I understand that `struct
> dsa_lag` does not give you anything extra. In a multi-chip scenario
> things become harder. Example:
>
> .-----.   .-----.
> | sw0 +---+ sw1 |
> '-+-+-'3 3'--+--'
>   1 2        1
>
> Let's say that sw0p1, sw0p2 and sw1p1 are in a LAG. This system can hash
> flows into 8 buckets. So with all ports active you would need an
> allocation like this:
>
> sw0p1: 0,1,2
> sw0p2: 3,4,5
> sw1p1: 6,7
>
> For some reason, the system determines that sw0p2 is now inactive and
> the LAG should be rebalanced over the two remaining active links:
>
> sw0p1: 0,1,2,3
> sw0p2: -
> sw1p1: 4,5,6,7
>
> In order for sw0 and sw1 to agree on the assignment they need access to
> a shared view of the LAG at the tree level, both about the set of active
> ports and their ordering. This is `struct dsa_lag`s main raison d'être.

Yup, you could do that just like I did with ocelot, aka keep in
struct dsa_port:
	struct net_device *bond;
	bool lag_tx_active;

This is enough to replace your usage of:

	list_for_each_entry(dp, &lag->ports, lag_list) {
		...
	}

with:

	list_for_each_entry(dp, &dst->ports, list) {
		if (dp->bond != lag_dev)
			continue;

		...
	}

and:

	list_for_each_entry(dp, &lag->tx_ports, lag_tx_list) {
		...
	}

with:

	list_for_each_entry(dp, &dst->ports, list) {
		if (dp->bond != lag_dev || !dp->lag_tx_active)
			continue;

		...
	}

The amount of iteration that you would do would be about the same. Just
this:

	struct dsa_lag *lag = dsa_lag_by_dev(ds->dst, lag_dev);

would need to be replaced with something more precise, depending on what
you need the struct dsa_lag pointer for. You use it in crosschip_lag_join
and in crosschip_lag_leave to call mv88e6xxx_lag_sync_map, where you
again iterate over the ports in the DSA switch tree. But if you passed
just the struct net_device *lag_dev directly, you could keep the same
iteration and do away with the reference-counted struct dsa_lag.

> The same goes for when a port joins/leaves a LAG. For example, if sw1p1
> was to leave the LAG, we want to make sure that we do not needlessly
> flood LAG traffic over the backplane (sw0p3<->sw1p3). If you want to
> solve this at the ds level without `struct dsa_lag`, you need a refcount
> per backplane port in order to figure out if the leaving port was the
> last one behind that backplane port.

Humm, why?
Nothing would change. Just as you start with a map of 0 in
mv88e6xxx_lag_sync_map, and use dsa_towards_port for every dp that you
find in the switch tree, I am saying keep that iteration, but don't keep
those extra lists for the bonded ports and the active bonded ports. Just
use as a search key the LAG net device itself, and keep an extra bool
per dp. I think it's really simpler this way, with a lot less overhead
in terms of data structures, lists and whatnot.

> >     - Remember that the only reason why the DSA framework and the
> >       syntactic sugar exists is that we are presenting the hardware a
> >       unified view for the ports which have a struct net_device registered,
> >       and the ports which don't (DSA links and CPU ports). The argument
> >       really needs to be broken down into two:
> >       - For cross-chip DSA links, I can see why it was convenient for
> >         you to have the dsa_lag_by_dev(ds->dst, lag_dev) helper. But
> >         just as we currently have a struct net_device *bridge_dev in
> >         struct dsa_port, so we could have a struct net_device *bond,
> >         without the extra fat of struct dsa_lag, and reference counting,
> >         active ports, etc etc, would become simpler (actually inexistent
> >         as far as the DSA layer is concerned). Two ports are in the same
> >         bond if they have the same struct net_device *bond, just as they
> >         are bridged if they have the same struct net_device *bridge_dev.
> >       - For CPU ports, this raises an important question, which is
> >         whether LAG on switches with multiple CPU ports is ever going to
> >         be a thing. And if it is, how it is even going to be configured
> >         from the user's perspective. Because on a multi-CPU port system,
> >         you should actually see it as two bonding interfaces back to back.
> >         First, there's the bonding interface that spans the DSA masters.
> >         That needs no hardware offloading. Then there's the bonding
> >         interface that is the mirror image of that, and spans the CPU
> >         ports. I think this is a bit up in the air now. Because with
>
> Aside. On our devices we use the term cpu0, cpu1 etc. to refer to a
> switch port that is connected to a CPU. The CPU side of those
> connections are chan0, chan1 ("channel"). I am not saying we have to
> adopt those, but some unambiguous terms would be great in these
> conversations.

You have me confused. chan0, chan1 are DSA master interfaces? Can we
keep calling them DSA master interfaces, or do you find that confusing?

> >         your struct dsa_lag or without, we still have no bonding device
> >         associated with it, so things like the balancing policy are not
> >         really defined.
> >
>
> I have a different take on that. I do not think you need to create a
> user-visible LAG at all in that case. You just setup the hardware to
> treat the two CPU ports as a static LAG based on the information from
> the DT. Then you attach the same rx handler to both. On tx you hash and
> loadbalance on flows that allow it (FORWARDs on mv88e6xxx) and use the
> primary CPU port for control traffic (FROM_CPU).
>
> The CPU port is completely defined by the DT today, so I do not see why
> we could not add balancing policy to that if it is ever required.

Hashing policy for a bonding interface, defined in DT? Yummm.

> > I would like you to reiterate some of the reasons why you prefer having
> > struct dsa_lag.
>
> I hope I did that already. But I will add that if there was a dst->priv
> for the drivers to use as they see fit, I guess the bookkeeping could be
> moved into the mv88e6xxx driver instead if you feel it pollutes the DSA
> layer. Maybe you can not assume that all chips in a tree use a
> compatible driver though?

I don't think the DSA switch tree is private to anyone.

> Are there any other drivers that support multi-chip that might want to
> use the same thing?

Nope.
Andrew Lunn Dec. 8, 2020, 5:26 p.m. UTC | #26
>     There are two points to be made:
>     - Recently we have seen people with non-DSA (pure switchdev) hardware
>       being compelled to write DSA drivers, because they noticed that a
>       large part of the middle layer had already been written, and it
>       presents an API with a lot of syntactic sugar. Maybe there is a
>       larger issue here in that the switchdev offloading APIs are fairly
>       bulky and repetitive, but that does not mean that we should be
>       encouraging the attitude "come to DSA, we have cookies".
>       https://lwn.net/ml/linux-kernel/20201125232459.378-1-lukma@denx.de/

We often see developers stumbling around in the dark, not knowing the
subsystems and how best to solve a problem. So i would not read too
much into that particular email discussion. It was just another
example of we the maintainers, trying to get an understanding of the
hardware and help point a developer in the right direction. We don't
consider DSA the solution for all switch problems.

We do however have a growing number of pure switchdev drivers now, so
it might be time to take a look and see what is common, and pull some
code out of the drivers and into a library. This is a common pattern
you see all over the kernel. One driver often leads the way with a new
subsystem, but it is not until you have a few different drivers using
the subsystem do you have a real feel for what is common and can be
pulled out of the drivers and into a framework.

       Andrew
Tobias Waldekranz Dec. 9, 2020, 8:37 a.m. UTC | #27
On Tue, Dec 08, 2020 at 18:37, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Tue, Dec 08, 2020 at 04:33:19PM +0100, Tobias Waldekranz wrote:
>> On Tue, Dec 08, 2020 at 13:23, Vladimir Oltean <olteanv@gmail.com> wrote:
>> > Sorry it took so long. I wanted to understand:
>> > (a) where are the challenges for drivers to uniformly support software
>> >     bridging when they already have code for bridge offloading. I found
>> >     the following issues:
>> >     - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
>> >       which kind of prevents software bridging. I'm not sure what the
>> >       fix for these should be.
>>
>> At least on mv88e6xxx you would not be able to determine this simply
>> from looking at the tag. Both in standalone mode and bridged mode, you
>> would receive FORWARDs with the same source. You could look at
>> dp->bridge_dev to figure it out though.
>
> Yes, but that raises the question whether it should be DSA that fixes it
> up globally for everyone, like in dsa_switch_rcv:
>
> 	if (!dp->bridge_dev)
> 		skb->offload_fwd_mark = 0;
>
> with a nice comment above that everyone can refer to,
> or make each and every tagger do this. I'm leaning towards the latter.

I agree, I think each tagger should have to deal with this complexity
explicitly rather than be "saved" by the framework.

>> >     - Source address is a big problem, but this time not in the sense
>> >       that it traditionally has been. Specifically, due to address
>> >       learning being enabled, the hardware FDB will set destinations to
>> >       take the autonomous fast path. But surprise, the autonomous fast
>> >       path is blocked, because as far as the switch is concerned, the
>> >       ports are standalone and not offloading the bridge. We have drivers
>> >       that don't disable address learning when they operate in standalone
>> >       mode, which is something they definitely should do.
>>
>> Some hardware can function with it on (e.g. mv88e6xxx can associate an
>> FDB per port), but there is no reason to do it, so yes it should be
>> disabled.
>
> So how does mv88e6xxx handle the address learning issue currently?

Learning is enabled, but addresses are learned in a separate DB. So they
do not interfere with offloaded traffic. Since the FDB is not consulted
when sending frames from the CPU, it works. But we are needlessly using
up precious rows in the FDB.

>> >     There is nothing actionable for you in this patch set to resolve this.
>> >     I just wanted to get an idea.
>> > (b) Whether struct dsa_lag really brings us any significant benefit. I
>> >     found that it doesn't. It's a lot of code added to the DSA core, that
>> >     should not really belong in the middle layer. I need to go back and
>> >     quote your motivation in the RFC:
>> >
>> > | All LAG configuration is cached in `struct dsa_lag`s. I realize that
>> > | the standard M.O. of DSA is to read back information from hardware
>> > | when required. With LAGs this becomes very tricky though. For example,
>> > | the change of a link state on one switch will require re-balancing of
>> > | LAG hash buckets on another one, which in turn depends on the total
>> > | number of active links in the LAG. Do you agree that this is
>> > | motivated?
>> >
>> >     After reimplementing bonding offload in ocelot, I have found
>> >     struct dsa_lag to not provide any benefit. All the information a
>> >     driver needs is already provided through the
>> >     struct net_device *lag_dev argument given to lag_join and lag_leave,
>> >     and through the struct netdev_lag_lower_state_info *info given to
>> >     lag_change. I will send an RFC to you and the list shortly to prove
>> >     that this information is absolutely sufficient for the driver to do
>> >     decent internal bookkeeping, and that DSA should not really care
>> >     beyond that.
>>
>> Do you have a multi-chip setup? If not then I understand that `struct
>> dsa_lag` does not give you anything extra. In a multi-chip scenario
>> things become harder. Example:
>>
>> .-----.   .-----.
>> | sw0 +---+ sw1 |
>> '-+-+-'3 3'--+--'
>>   1 2        1
>>
>> Let's say that sw0p1, sw0p2 and sw1p1 are in a LAG. This system can hash
>> flows into 8 buckets. So with all ports active you would need an
>> allocation like this:
>>
>> sw0p1: 0,1,2
>> sw0p2: 3,4,5
>> sw1p1: 6,7
>>
>> For some reason, the system determines that sw0p2 is now inactive and
>> the LAG should be rebalanced over the two remaining active links:
>>
>> sw0p1: 0,1,2,3
>> sw0p2: -
>> sw1p1: 4,5,6,7
>>
>> In order for sw0 and sw1 to agree on the assignment they need access to
>> a shared view of the LAG at the tree level, both about the set of active
>> ports and their ordering. This is `struct dsa_lag`s main raison d'être.
>
> Yup, you could do that just like I did with ocelot, aka keep in
> struct dsa_port:
> 	struct net_device *bond;
> 	bool lag_tx_active;
>
> This is enough to replace your usage of:
>
> 	list_for_each_entry(dp, &lag->ports, lag_list) {
> 		...
> 	}
>
> with:
>
> 	list_for_each_entry(dp, &dst->ports, list) {
> 		if (dp->bond != lag_dev)
> 			continue;
>
> 		...
> 	}
>
> and:
>
> 	list_for_each_entry(dp, &lag->tx_ports, lag_tx_list) {
> 		...
> 	}
>
> with:
>
> 	list_for_each_entry(dp, &dst->ports, list) {
> 		if (dp->bond != lag_dev || !dp->lag_tx_active)
> 			continue;
>
> 		...
> 	}
>
> The amount of iteration that you would do would be about the same. Just
> this:
>
> 	struct dsa_lag *lag = dsa_lag_by_dev(ds->dst, lag_dev);
>
> would need to be replaced with something more precise, depending on what
> you need the struct dsa_lag pointer for. You use it in crosschip_lag_join
> and in crosschip_lag_leave to call mv88e6xxx_lag_sync_map, where you
> again iterate over the ports in the DSA switch tree. But if you passed
> just the struct net_device *lag_dev directly, you could keep the same
> iteration and do away with the reference-counted struct dsa_lag.
>
>> The same goes for when a port joins/leaves a LAG. For example, if sw1p1
>> was to leave the LAG, we want to make sure that we do not needlessly
>> flood LAG traffic over the backplane (sw0p3<->sw1p3). If you want to
>> solve this at the ds level without `struct dsa_lag`, you need a refcount
>> per backplane port in order to figure out if the leaving port was the
>> last one behind that backplane port.
>
> Humm, why?
> Nothing would change. Just as you start with a map of 0 in
> mv88e6xxx_lag_sync_map, and use dsa_towards_port for every dp that you
> find in the switch tree, I am saying keep that iteration, but don't keep
> those extra lists for the bonded ports and the active bonded ports. Just
> use as a search key the LAG net device itself, and keep an extra bool
> per dp. I think it's really simpler this way, with a lot less overhead
> in terms of data structures, lists and whatnot.

I will remove `struct dsa_lag` in v4.

>> >     - Remember that the only reason why the DSA framework and the
>> >       syntactic sugar exists is that we are presenting the hardware a
>> >       unified view for the ports which have a struct net_device registered,
>> >       and the ports which don't (DSA links and CPU ports). The argument
>> >       really needs to be broken down into two:
>> >       - For cross-chip DSA links, I can see why it was convenient for
>> >         you to have the dsa_lag_by_dev(ds->dst, lag_dev) helper. But
>> >         just as we currently have a struct net_device *bridge_dev in
>> >         struct dsa_port, so we could have a struct net_device *bond,
>> >         without the extra fat of struct dsa_lag, and reference counting,
>> >         active ports, etc etc, would become simpler (actually inexistent
>> >         as far as the DSA layer is concerned). Two ports are in the same
>> >         bond if they have the same struct net_device *bond, just as they
>> >         are bridged if they have the same struct net_device *bridge_dev.
>> >       - For CPU ports, this raises an important question, which is
>> >         whether LAG on switches with multiple CPU ports is ever going to
>> >         be a thing. And if it is, how it is even going to be configured
>> >         from the user's perspective. Because on a multi-CPU port system,
>> >         you should actually see it as two bonding interfaces back to back.
>> >         First, there's the bonding interface that spans the DSA masters.
>> >         That needs no hardware offloading. Then there's the bonding
>> >         interface that is the mirror image of that, and spans the CPU
>> >         ports. I think this is a bit up in the air now. Because with
>>
>> Aside. On our devices we use the term cpu0, cpu1 etc. to refer to a
>> switch port that is connected to a CPU. The CPU side of those
>> connections are chan0, chan1 ("channel"). I am not saying we have to
>> adopt those, but some unambiguous terms would be great in these
>> conversations.
>
> You have me confused. chan0, chan1 are DSA master interfaces? Can we
> keep calling them DSA master interfaces, or do you find that confusing?

Not confusing, just a mouthful. It was just an idea, DSA master
interface works.

>> >         your struct dsa_lag or without, we still have no bonding device
>> >         associated with it, so things like the balancing policy are not
>> >         really defined.
>> >
>>
>> I have a different take on that. I do not think you need to create a
>> user-visible LAG at all in that case. You just setup the hardware to
>> treat the two CPU ports as a static LAG based on the information from
>> the DT. Then you attach the same rx handler to both. On tx you hash and
>> loadbalance on flows that allow it (FORWARDs on mv88e6xxx) and use the
>> primary CPU port for control traffic (FROM_CPU).
>>
>> The CPU port is completely defined by the DT today, so I do not see why
>> we could not add balancing policy to that if it is ever required.
>
> Hashing policy for a bonding interface, defined in DT? Yummm.

Notice the "if it is ever required" part, I can not see why it ever
would be. The driver will know how to get the best loadbalancing out of
the hardware.

>> > I would like you to reiterate some of the reasons why you prefer having
>> > struct dsa_lag.
>>
>> I hope I did that already. But I will add that if there was a dst->priv
>> for the drivers to use as they see fit, I guess the bookkeeping could be
>> moved into the mv88e6xxx driver instead if you feel it pollutes the DSA
>> layer. Maybe you can not assume that all chips in a tree use a
>> compatible driver though?
>
> I don't think the DSA switch tree is private to anyone.

Well I need somewhere to store the association from LAG netdev to LAG
ID. These IDs are shared by all chips in the tree. It could be
replicated on each ds of course, but that does not feel quite right.

>> Are there any other drivers that support multi-chip that might want to
>> use the same thing?
>
> Nope.
Tobias Waldekranz Dec. 9, 2020, 8:57 a.m. UTC | #28
On Tue, Dec 08, 2020 at 00:26, Andrew Lunn <andrew@lunn.ch> wrote:
> On Mon, Dec 07, 2020 at 10:19:47PM +0100, Tobias Waldekranz wrote:
>> On Fri, Dec 04, 2020 at 03:20, Andrew Lunn <andrew@lunn.ch> wrote:
>> >> +static inline bool dsa_port_can_offload(struct dsa_port *dp,
>> >> +					struct net_device *dev)
>> >
>> > That name is a bit generic. We have a number of different offloads.
>> > The mv88E6060 cannot offload anything!
>> 
>> The name is intentionally generic as it answers the question "can this
>> dp offload requests for this netdev?"
>
> I think it is a bit more specific. Mirroring is an offload, but is not
> taken into account here, and does mirroring setup call this to see if
> mirroring can be offloaded? The hardware has rate control traffic
> shaping which we might sometime add support for via TC. That again is
> an offload.

I see what you are saying.

>> >> +{
>> >> +	/* Switchdev offloading can be configured on: */
>> >> +
>> >> +	if (dev == dp->slave)
>> >> +		/* DSA ports directly connected to a bridge. */
>> >> +		return true;
>> 
>> This condition is the normal case of a bridged port, i.e. no LAG
>> involved.
>> 
>> >> +	if (dp->lag && dev == rtnl_dereference(dp->lag->dev))
>> >> +		/* DSA ports connected to a bridge via a LAG */
>> >> +		return true;
>> 
>> And then the indirect case of a bridged port under a LAG.
>> 
>> I am happy to take requests for a better name though.
>
> There probably needs to be lag in the name.

I disagree. A LAG is one type of netdev that a DSA port can offload. The
other one is the DSA port's own netdev, i.e. what we have had since time
immemorial.

dsa_port_offloads_netdev(dp, dev)?
Vladimir Oltean Dec. 9, 2020, 10:53 a.m. UTC | #29
On Wed, Dec 09, 2020 at 09:37:39AM +0100, Tobias Waldekranz wrote:
> I will remove `struct dsa_lag` in v4.

Ok, thanks.
It would be nice if you could also make .port_lag_leave return an int code.
And I think that .port_lag_change passes more arguments than needed to
the driver.

> > I don't think the DSA switch tree is private to anyone.
>
> Well I need somewhere to store the association from LAG netdev to LAG
> ID. These IDs are shared by all chips in the tree. It could be
> replicated on each ds of course, but that does not feel quite right.

The LAG ID does not have significance beyond the mv88e6xxx driver, does
it? And even there, it's just a number. You could recalculate all IDs
dynamically upon every join/leave, and they would be consistent by
virtue of the fact that you use a formula which ensures consistency
without storing the LAG ID anywhere. Like, say, the LAG ID is to be
determined by the first struct dsa_port in the DSA switch tree that has
dp->bond == lag_dev. The ID itself can be equal to (dp->ds->index *
MAX_NUM_PORTS + dp->index). All switches will agree on what is the first
dp in dst, since they iterate in the same way, and any LAG join/leave
will notify all of them. It has to be like this anyway.
Tobias Waldekranz Dec. 9, 2020, 2:11 p.m. UTC | #30
On Wed, Dec 09, 2020 at 12:53, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, Dec 09, 2020 at 09:37:39AM +0100, Tobias Waldekranz wrote:
>> I will remove `struct dsa_lag` in v4.
>
> Ok, thanks.
> It would be nice if you could also make .port_lag_leave return an int code.

Sure.

> And I think that .port_lag_change passes more arguments than needed to
> the driver.

You mean the `struct netdev_lag_lower_state_info`? Fine by me, it was
mostly to avoid hiding state from the driver if anyone needed it.

>> > I don't think the DSA switch tree is private to anyone.
>>
>> Well I need somewhere to store the association from LAG netdev to LAG
>> ID. These IDs are shared by all chips in the tree. It could be
>> replicated on each ds of course, but that does not feel quite right.
>
> The LAG ID does not have significance beyond the mv88e6xxx driver, does
> it? And even there, it's just a number. You could recalculate all IDs
> dynamically upon every join/leave, and they would be consistent by
> virtue of the fact that you use a formula which ensures consistency
> without storing the LAG ID anywhere. Like, say, the LAG ID is to be
> determined by the first struct dsa_port in the DSA switch tree that has
> dp->bond == lag_dev. The ID itself can be equal to (dp->ds->index *
> MAX_NUM_PORTS + dp->index). All switches will agree on what is the first
> dp in dst, since they iterate in the same way, and any LAG join/leave
> will notify all of them. It has to be like this anyway.

This will not work for mv88e6xxx. The ID is not just an internal number
used by the driver. If that was the case we could just as well use the
LAG netdev pointer for this purpose. This ID is configured in hardware,
and it is shared between blocks in the switch, we can not just
dynamically change them. Neither can we use your formula since this is a
4-bit field.

Another issue is how we are going to handle this in the tagger now,
since we can no longer call dsa_lag_dev_by_id. I.e. with `struct
dsa_lag` we could resolve the LAG ID (which is the only source
information we have in the tag) to the corresponding netdev. This
information is now only available in mv88e6xxx driver. I am not sure how
I am supposed to conjure it up. Ideas?
Andrew Lunn Dec. 9, 2020, 2:23 p.m. UTC | #31
On Wed, Dec 09, 2020 at 12:53:26PM +0200, Vladimir Oltean wrote:
> On Wed, Dec 09, 2020 at 09:37:39AM +0100, Tobias Waldekranz wrote:
> > I will remove `struct dsa_lag` in v4.
> 
> Ok, thanks.
> It would be nice if you could also make .port_lag_leave return an int code.

I've not looked at the code, but how can it fail, other than something
probably fatal with communication with the hardware. And what should
the rest of the stack do? Recreate what is just destroyed? And does
that really work?

     Andrew
Andrew Lunn Dec. 9, 2020, 2:27 p.m. UTC | #32
> I disagree. A LAG is one type of netdev that a DSA port can offload. The
> other one is the DSA port's own netdev, i.e. what we have had since time
> immemorial.
> 
> dsa_port_offloads_netdev(dp, dev)?

That is better. But a comment explaining what the function does might
be useful.

   Andrew
Tobias Waldekranz Dec. 9, 2020, 3:21 p.m. UTC | #33
On Wed, Dec 09, 2020 at 15:27, Andrew Lunn <andrew@lunn.ch> wrote:
>> I disagree. A LAG is one type of netdev that a DSA port can offload. The
>> other one is the DSA port's own netdev, i.e. what we have had since time
>> immemorial.
>> 
>> dsa_port_offloads_netdev(dp, dev)?
>
> That is better.

...but there is an even better one?

> But a comment explaining what the function does might
> be useful.

This is the function body:

	/* Switchdev offloading can be configured on: */

	if (dev == dp->slave)
		/* DSA ports directly connected to a bridge. */
		return true;

	if (dp->lag && dev == dp->lag->dev)
		/* DSA ports connected to a bridge via a LAG */
		return true;

	return false;

What more is there to explain? Is it the style?  Do you prefer initial
block comments over explaining the individual statements? Is the language
not up to standard?

I am sorry for the tone, I am aware of it. It is just that I really want
to contributem but I am starting to feel like a tty-over-email-proxy to
my emacs session - and an extremely unreliable one at that.
Vladimir Oltean Dec. 9, 2020, 4:04 p.m. UTC | #34
On Wed, Dec 09, 2020 at 03:11:41PM +0100, Tobias Waldekranz wrote:
> On Wed, Dec 09, 2020 at 12:53, Vladimir Oltean <olteanv@gmail.com> wrote:
> > And I think that .port_lag_change passes more arguments than needed to
> > the driver.
> 
> You mean the `struct netdev_lag_lower_state_info`? Fine by me, it was
> mostly to avoid hiding state from the driver if anyone needed it.

There are two approaches really.
Either you extract the useful information from it, which you already do,
and in that case you don't need to provide the structure from the netdev
notifier to the driver, or you let the driver pick it up. Either way is
fine with me, but having both seems redundant.

> >> > I don't think the DSA switch tree is private to anyone.
> >>
> >> Well I need somewhere to store the association from LAG netdev to LAG
> >> ID. These IDs are shared by all chips in the tree. It could be
> >> replicated on each ds of course, but that does not feel quite right.
> >
> > The LAG ID does not have significance beyond the mv88e6xxx driver, does
> > it? And even there, it's just a number. You could recalculate all IDs
> > dynamically upon every join/leave, and they would be consistent by
> > virtue of the fact that you use a formula which ensures consistency
> > without storing the LAG ID anywhere. Like, say, the LAG ID is to be
> > determined by the first struct dsa_port in the DSA switch tree that has
> > dp->bond == lag_dev. The ID itself can be equal to (dp->ds->index *
> > MAX_NUM_PORTS + dp->index). All switches will agree on what is the first
> > dp in dst, since they iterate in the same way, and any LAG join/leave
> > will notify all of them. It has to be like this anyway.
> 
> This will not work for mv88e6xxx. The ID is not just an internal number
> used by the driver. If that was the case we could just as well use the
> LAG netdev pointer for this purpose. This ID is configured in hardware,
> and it is shared between blocks in the switch, we can not just
> dynamically change them. Neither can we use your formula since this is a
> 4-bit field.
> 
> Another issue is how we are going to handle this in the tagger now,
> since we can no longer call dsa_lag_dev_by_id. I.e. with `struct
> dsa_lag` we could resolve the LAG ID (which is the only source
> information we have in the tag) to the corresponding netdev. This
> information is now only available in mv88e6xxx driver. I am not sure how
> I am supposed to conjure it up. Ideas?

Yeah, ok, I get your point. I was going to say that:
(a) accessing the bonding interface in the fast path seems to be a
    mv88e6xxx problem
(b) the LAG IDs that you are making DSA fabricate are not necessarily
    the ones that other drivers will use, due to various other
    constraints (e.g. ocelot)
so basically my point was that I think you are adding a lot of infra in
core DSA that only mv88e6xxx will use.

My practical proposal would have been to keep a 16-entry bonding pointer
array private in mv88e6xxx, which you could retrieve via:

	struct dsa_port *cpu_dp = netdev->dsa_ptr;
	struct dsa_switch *ds = cpu_dp->ds;
	struct mv88e6xxx_chip *chip = ds->priv;

	skb->dev = chip->lags[source_port];

This is just how I would do it. It would not be the first tagger that
accesses driver private data prior to knowing the net device. It would,
however, mean that we know for sure that the mv88e6xxx device will
always be top-of-tree in a cascaded setup. And this is in conflict with
what I initially said, that the dst is not private to anyone.

I think that there is a low practical risk that the assumption will not
hold true basically forever. But I also see why you might like your
approach more. Maybe Vivien, Andrew, Florian could also chime in and we
can see if struct dsa_lag "bothers" anybody else except me (bothers in
the sense that it's an unnecessary complication to hold in DSA). We
could, of course, also take the middle ground, which would be to keep
the 16-entry array of bonding net_device pointers in DSA, and you could
still call your dsa_lag_dev_by_id() and pretend it's generic, and that
would just look up that table. Even with this middle ground, we are
getting rid of the port lists and of the reference counting, which is
still a welcome simplification in my book.
Tobias Waldekranz Dec. 9, 2020, 10:01 p.m. UTC | #35
On Wed, Dec 09, 2020 at 18:04, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, Dec 09, 2020 at 03:11:41PM +0100, Tobias Waldekranz wrote:
>> On Wed, Dec 09, 2020 at 12:53, Vladimir Oltean <olteanv@gmail.com> wrote:
>> > And I think that .port_lag_change passes more arguments than needed to
>> > the driver.
>> 
>> You mean the `struct netdev_lag_lower_state_info`? Fine by me, it was
>> mostly to avoid hiding state from the driver if anyone needed it.
>
> There are two approaches really.
> Either you extract the useful information from it, which you already do,
> and in that case you don't need to provide the structure from the netdev
> notifier to the driver, or you let the driver pick it up. Either way is
> fine with me, but having both seems redundant.

Consider it cut.

>> >> > I don't think the DSA switch tree is private to anyone.
>> >>
>> >> Well I need somewhere to store the association from LAG netdev to LAG
>> >> ID. These IDs are shared by all chips in the tree. It could be
>> >> replicated on each ds of course, but that does not feel quite right.
>> >
>> > The LAG ID does not have significance beyond the mv88e6xxx driver, does
>> > it? And even there, it's just a number. You could recalculate all IDs
>> > dynamically upon every join/leave, and they would be consistent by
>> > virtue of the fact that you use a formula which ensures consistency
>> > without storing the LAG ID anywhere. Like, say, the LAG ID is to be
>> > determined by the first struct dsa_port in the DSA switch tree that has
>> > dp->bond == lag_dev. The ID itself can be equal to (dp->ds->index *
>> > MAX_NUM_PORTS + dp->index). All switches will agree on what is the first
>> > dp in dst, since they iterate in the same way, and any LAG join/leave
>> > will notify all of them. It has to be like this anyway.
>> 
>> This will not work for mv88e6xxx. The ID is not just an internal number
>> used by the driver. If that was the case we could just as well use the
>> LAG netdev pointer for this purpose. This ID is configured in hardware,
>> and it is shared between blocks in the switch, we can not just
>> dynamically change them. Neither can we use your formula since this is a
>> 4-bit field.
>> 
>> Another issue is how we are going to handle this in the tagger now,
>> since we can no longer call dsa_lag_dev_by_id. I.e. with `struct
>> dsa_lag` we could resolve the LAG ID (which is the only source
>> information we have in the tag) to the corresponding netdev. This
>> information is now only available in mv88e6xxx driver. I am not sure how
>> I am supposed to conjure it up. Ideas?
>
> Yeah, ok, I get your point. I was going to say that:
> (a) accessing the bonding interface in the fast path seems to be a
>     mv88e6xxx problem
> (b) the LAG IDs that you are making DSA fabricate are not necessarily
>     the ones that other drivers will use, due to various other
>     constraints (e.g. ocelot)

It is not the Fibonacci sequence or anything, it is an integer in the
range 0..num_lags-1. I realize that some hardware probably allocate IDs
from some shared (and thus possibly non-contiguous) pool. Maybe ocelot
works like that. But it seems reasonable to think that at least some
other drivers could make use of a linear range.

> so basically my point was that I think you are adding a lot of infra in
> core DSA that only mv88e6xxx will use.

Message received.

> My practical proposal would have been to keep a 16-entry bonding pointer
> array private in mv88e6xxx, which you could retrieve via:
>
> 	struct dsa_port *cpu_dp = netdev->dsa_ptr;
> 	struct dsa_switch *ds = cpu_dp->ds;
> 	struct mv88e6xxx_chip *chip = ds->priv;
>
> 	skb->dev = chip->lags[source_port];
>
> This is just how I would do it. It would not be the first tagger that
> accesses driver private data prior to knowing the net device. It would,
> however, mean that we know for sure that the mv88e6xxx device will
> always be top-of-tree in a cascaded setup. And this is in conflict with
> what I initially said, that the dst is not private to anyone.

Indeed. I am trying to keep up.

> I think that there is a low practical risk that the assumption will not
> hold true basically forever. But I also see why you might like your
> approach more. Maybe Vivien, Andrew, Florian could also chime in and we
> can see if struct dsa_lag "bothers" anybody else except me (bothers in
> the sense that it's an unnecessary complication to hold in DSA). We
> could, of course, also take the middle ground, which would be to keep
> the 16-entry array of bonding net_device pointers in DSA, and you could
> still call your dsa_lag_dev_by_id() and pretend it's generic, and that
> would just look up that table. Even with this middle ground, we are
> getting rid of the port lists and of the reference counting, which is
> still a welcome simplification in my book.

Yeah I agree that we can trim it down to just the array. Going beyond
that point, i.e. doing something like how sja1105 works, is more painful
but possible if Andrew can live with it.
Vladimir Oltean Dec. 9, 2020, 10:21 p.m. UTC | #36
On Wed, Dec 09, 2020 at 11:01:25PM +0100, Tobias Waldekranz wrote:
> It is not the Fibonacci sequence or anything, it is an integer in the
> range 0..num_lags-1. I realize that some hardware probably allocate IDs
> from some shared (and thus possibly non-contiguous) pool. Maybe ocelot
> works like that. But it seems reasonable to think that at least some
> other drivers could make use of a linear range.

In the ocelot RFC patches that I've sent to the list yesterday, you
could see that the ports within the same bond must have the same logical
port ID (as opposed to regular mode, when each port has a logical ID
equal to its physical ID, i.e. swp0 -> 0, swp1 -> 1, etc). We can't use
the contiguous LAG ID assignment that you do in DSA, because maybe we
have swp1 and swp2 in a bond, and the LAG ID you give that bond is 0.
But if we assign logical port ID 0 to physical ports 1 and 2, then we
end up also bonding with swp0... So what is done in ocelot is that the
LAG ID is derived from the index of the first port that is part of the
bond, and the logical port IDs are all assigned to that value. It's
really simple when you think about it. It would have probably been the
same for Marvell too if it weren't for that cross-chip thing.

If I were to take a look at Florian's b53-bond branch, I do see that
Broadcom switches also expect a contiguous range of LAG IDs:
https://github.com/ffainelli/linux/tree/b53-bond

So ok, maybe ocelot is in the minority. Not an issue. If you add that
lookup table in the DSA layer, then you could get your linear "LAG ID"
by searching through it using the struct net_device *bond as key.
Drivers which don't need this linear array will just not use it.

> > I think that there is a low practical risk that the assumption will not
> > hold true basically forever. But I also see why you might like your
> > approach more. Maybe Vivien, Andrew, Florian could also chime in and we
> > can see if struct dsa_lag "bothers" anybody else except me (bothers in
> > the sense that it's an unnecessary complication to hold in DSA). We
> > could, of course, also take the middle ground, which would be to keep
> > the 16-entry array of bonding net_device pointers in DSA, and you could
> > still call your dsa_lag_dev_by_id() and pretend it's generic, and that
> > would just look up that table. Even with this middle ground, we are
> > getting rid of the port lists and of the reference counting, which is
> > still a welcome simplification in my book.
>
> Yeah I agree that we can trim it down to just the array. Going beyond
> that point, i.e. doing something like how sja1105 works, is more painful
> but possible if Andrew can live with it.

I did not get the reference to sja1105 here. That does not support
bonding offload, but does work properly with software bridging thanks to
your patches.
Andrew Lunn Dec. 9, 2020, 10:59 p.m. UTC | #37
> so basically my point was that I think you are adding a lot of infra
> in core DSA that only mv88e6xxx will use.

That is true already, since mv88e6xxx is currently the only driver
which supports D in DSA. And it has been Marvell devices which
initially inspired the DSA framework, and has pushed it forward. But I
someday expect another vendor's switches will get added which also have
a D in DSA concept, and hopefully we can reuse the code. And is
Marvell's implementation of LAGs truly unique? With only two drivers
with WIP code, it is too early to say that.

The important thing for me is that we don't make the API for other
drivers more complex because of D in DSA, or the maybe odd way Marvell
implements LAGs.

One of the long learnt lessons in kernel development. Put complexity
in the core and make drivers simple. Because hopefully you have the
best developers working on the core and they can deal with the
complexity, and typically you have average developers working on
drivers, and they are sometimes stretched getting drivers correct.

Given your work on ocelot driver, does this core code make the ocelot
implementation more difficult? Or just the same?

       Andrew
Andrew Lunn Dec. 9, 2020, 11:03 p.m. UTC | #38
On Wed, Dec 09, 2020 at 04:21:31PM +0100, Tobias Waldekranz wrote:
> On Wed, Dec 09, 2020 at 15:27, Andrew Lunn <andrew@lunn.ch> wrote:
> >> I disagree. A LAG is one type of netdev that a DSA port can offload. The
> >> other one is the DSA port's own netdev, i.e. what we have had since time
> >> immemorial.
> >> 
> >> dsa_port_offloads_netdev(dp, dev)?
> >
> > That is better.
> 
> ...but there is an even better one?

None that comes to mind. 

> 
> > But a comment explaining what the function does might
> > be useful.
> 
> This is the function body:
> 
> 	/* Switchdev offloading can be configured on: */
> 
> 	if (dev == dp->slave)
> 		/* DSA ports directly connected to a bridge. */
> 		return true;
> 
> 	if (dp->lag && dev == dp->lag->dev)
> 		/* DSA ports connected to a bridge via a LAG */
> 		return true;
> 
> 	return false;
> 
> What more is there to explain?

OK, you are right. It is well commented as is. Just change the name
and we are good.

    Andrew
Vladimir Oltean Dec. 9, 2020, 11:17 p.m. UTC | #39
On Wed, Dec 09, 2020 at 03:23:40PM +0100, Andrew Lunn wrote:
> On Wed, Dec 09, 2020 at 12:53:26PM +0200, Vladimir Oltean wrote:
> > On Wed, Dec 09, 2020 at 09:37:39AM +0100, Tobias Waldekranz wrote:
> > > I will remove `struct dsa_lag` in v4.
> >
> > Ok, thanks.
> > It would be nice if you could also make .port_lag_leave return an int code.
>
> I've not looked at the code, but how can it fail, other than something
> probably fatal with communication with the hardware. And what should
> the rest of the stack do? Recreate what was just destroyed? And does
> that really work?

What can fail is memory allocation, which other drivers might need to do
when they rebalance the remaining ports in the LAG.
Vladimir Oltean Dec. 10, 2020, 1:05 a.m. UTC | #40
On Wed, Dec 09, 2020 at 11:59:50PM +0100, Andrew Lunn wrote:
> > so basically my point was that I think you are adding a lot of infra
> > in core DSA that only mv88e6xxx will use.
>
> That is true already, since mv88e6xxx is currently the only driver
> which supports D in DSA. And it has been Marvell devices which
> initially inspired the DSA framework, and has pushed it forward.

Well actually, if you remember...
https://patchwork.ozlabs.org/project/netdev/list/?series=175986&state=*
This is pretty "D in DSA" too, even if the three switches are in three
disjoint trees due to tagger incompatibility.

Actually I have this novelty idea in the back of my head, where, once I
send the tag_8021q patches for ocelot, I make my ocelot+3x sja1105 board
use a single DSA switch tree, just to see how it works. That would be
the first instance of "D in DSA" with different drivers, just tagger
compatibility.

> But I someday expect another vendor's switches will get added which
> also have a D in DSA concept, and hopefully we can reuse the code. And
> is Marvell's implementation of LAGs truly unique? With only two drivers
> with WIP code, it is too early to say that.

Exactly, it is too early to say what is truly reusable and what isn't,
so I would incline towards trying to push as much out of DSA as possible.
When we'll see 3-4 LAG offload implementations, we can decide to share
more. But the other way around, pushing from the DSA core into drivers,
later down the road? That sounds more difficult to me. We aren't done
with the configure_vlans_while_not_filtering transition yet.

> The important thing for me is that we don't make the API for other
> drivers more complex because of D in DSA, or the maybe odd way Marvell
> implements LAGs.

Well, at the end of the day, I don't believe we're here to define APIs,
at least not in DSA. The idea of monitoring netdev event notifiers in
order to offload certain operations is nothing new under the sun now. I
would expect that a driver writer does not see radically different APIs
between working on plain switchdev and on DSA.

> One of the long learnt lessons in kernel development. Put complexity
> in the core and make drivers simple. Because hopefully you have the
> best developers working on the core and they can deal with the
> complexity, and typically you have average developers working on
> drivers, and they are sometimes stretched getting drivers correct.

If anything, I would see "the core" here as being some sort of layer
that both switchdev and DSA could reuse. The thicker this layer would
be, the slimmer DSA and switchdev drivers would become. I am getting
tired of duplicating DSA code in the mscc/ocelot switchdev driver before
I could pass a one-liner function pointer from the felix DSA driver into
the common ocelot switch library. The "good developers/core code" vs
"average developers/hardware-specific code" dichotomy doesn't really
hold water when you take a look past the fence and see that what DSA
maintainers call core code, an "average" developer for a plain switchdev
driver calls just another Tuesday. But Tobias has nothing to do with
that and this is not the reason why I would like him to simplify.

> Given your work on ocelot driver, does this core code make the ocelot
> implementation more difficult? Or just the same?

It doesn't bother ocelot, and it doesn't help it either. I posted the
ocelot RFC series to prove that a driver can offload LAG without even
touching struct dsa_lag. And I don't even think I'm being unreasonable,
let me show you.

> struct dsa_lag {
> 	struct net_device *dev;
	^
	|
	This one we need. No doubt about it. If you don't have the ball
	you can't play.
> 
> 	int id;
	^
	|
	This one maybe or maybe not, since the driver might have a
	different understanding of a LAG ID than DSA had. Either way,
	this is low impact.
> 
> 	struct list_head ports;
	^
	|
	This is interesting. A struct dsa_port has a pointer to a
	struct dsa_lag. And a struct dsa_lag has a list of struct dsa_ports,
	on which the initial port can find itself. The mesh is too connected.
	And it doesn't provide useful information. You want to find the
	struct dsa_ports in the same LAG as you? You already can, without
	this list.
> 
> 	/* For multichip systems, we must ensure that each hash bucket
> 	 * is only enabled on a single egress port throughout the
> 	 * whole tree, lest we send duplicates. Therefore we must
> 	 * maintain a global list of active tx ports, so that each
> 	 * switch can figure out which buckets to enable on which
> 	 * ports.
> 	 */
> 	struct list_head tx_ports;
	^
	|
	First of all, my problem with this is the misplaced comment.
	When I read it, I expect it to explain why there's a list for
	ports and a separate one for tx_ports. But it doesn't do that.
	Then I need to read it diagonally to understand why multichip
	is even mentioned, since you would need to rebalance the LAGs
	only among the ports which are active regardless of multichip or
	not. The reason why multichip is mentioned can be found 4 lines
	later: "global" list.
	But as Tobias and I discussed earlier too, the port list in the
	DSA switch tree is already global. So it's not really a good
	justification. And adding a boolean "is this port active for LAG
	or not" is a lot simpler than Yet Another List. It doesn't even
	consume more memory. It consumes less, in fact, because it
	balances out with the pointers that struct dsa_port also needs
	to keep for these linked lists.
> 
> 	int num_tx;
	^
	|
	Why, just why? (To make implementation easier, yes, I was told.
	But again: why?)
> 
> 	refcount_t refcount;
	^
	|
	This is the glue that keeps all the DSA metadata associated with
	an upper bonding interface from falling apart. Of course it needs
	to be reference-counted. My only argument is: we can _easily_ do
	away with the ports and the tx_ports lists, and not really lose
	anything. All that remains is struct net_device *dev, and int
	id. If we only had the struct net_device *dev, we wouldn't need
	the reference count. You can have 10 pointers to the same
	structure, and you don't need to deal with its memory management,
	because it's not your structure. You only need to deal with
	memory management as soon as you keep metadata associated with
	it, like the LAG ID in this case. That's why I want to push back
	and see if we can't avoid computing and storing it in DSA too.
	Tobias made a good argument that he does need the int id in the
	RX data path. That should be the only place. And we settled at
	the compromise that he could keep an extra array, indexed by LAG
	ID, which returns the struct net_device *dev. So basically we'll
	have two references to a bonding interface now:
	- By port: dp->lag_dev (previously dp->lag->dev)
	- By ID: dst->lags[lag_id] (previously dst->lags.pool[id].dev)
	So basically the pool remains, but struct dsa_lag sublimates
	into a simple struct net_device *lag_dev pointer.
> };

That's it. If you're ok with having a convoluted structure that you
don't really need, fine. I mean, I could ignore it.
Tobias Waldekranz Dec. 10, 2020, 10:18 a.m. UTC | #41
On Thu, Dec 10, 2020 at 00:21, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Wed, Dec 09, 2020 at 11:01:25PM +0100, Tobias Waldekranz wrote:
>> It is not the Fibonacci sequence or anything, it is an integer in the
>> range 0..num_lags-1. I realize that some hardware probably allocate IDs
>> from some shared (and thus possibly non-contiguous) pool. Maybe ocelot
>> works like that. But it seems reasonable to think that at least some
>> other drivers could make use of a linear range.
>
> In the ocelot RFC patches that I've sent to the list yesterday, you
> could see that the ports within the same bond must have the same logical
> port ID (as opposed to regular mode, when each port has a logical ID
> equal to its physical ID, i.e. swp0 -> 0, swp1 -> 1, etc). We can't use
> the contiguous LAG ID assignment that you do in DSA, because maybe we
> have swp1 and swp2 in a bond, and the LAG ID you give that bond is 0.
> But if we assign logical port ID 0 to physical ports 1 and 2, then we
> end up also bonding with swp0... So what is done in ocelot is that the
> LAG ID is derived from the index of the first port that is part of the
> bond, and the logical port IDs are all assigned to that value. It's
> really simple when you think about it. It would have probably been the
> same for Marvell too if it weren't for that cross-chip thing.
>
> If I were to take a look at Florian's b53-bond branch, I do see that
> Broadcom switches also expect a contiguous range of LAG IDs:
> https://github.com/ffainelli/linux/tree/b53-bond
>
> So ok, maybe ocelot is in the minority. Not an issue. If you add that
> lookup table in the DSA layer, then you could get your linear "LAG ID"
> by searching through it using the struct net_device *bond as key.
> Drivers which don't need this linear array will just not use it.

Great, I can work with that.

>> > I think that there is a low practical risk that the assumption will not
>> > hold true basically forever. But I also see why you might like your
>> > approach more. Maybe Vivien, Andrew, Florian could also chime in and we
>> > can see if struct dsa_lag "bothers" anybody else except me (bothers in
>> > the sense that it's an unnecessary complication to hold in DSA). We
>> > could, of course, also take the middle ground, which would be to keep
>> > the 16-entry array of bonding net_device pointers in DSA, and you could
>> > still call your dsa_lag_dev_by_id() and pretend it's generic, and that
>> > would just look up that table. Even with this middle ground, we are
>> > getting rid of the port lists and of the reference counting, which is
>> > still a welcome simplification in my book.
>>
>> Yeah I agree that we can trim it down to just the array. Going beyond
>> that point, i.e. doing something like how sja1105 works, is more painful
>> but possible if Andrew can live with it.
>
> I did not get the reference to sja1105 here. That does not support
> bonding offload, but does work properly with software bridging thanks to
> your patches.

Yeah sorry, I should have explained that better.

I meant in the sense that it shares information between the tagger and
the driver (struct sja1105_tagger_data).
Tobias Waldekranz Dec. 11, 2020, 8:50 p.m. UTC | #42
On Tue, Dec 08, 2020 at 13:23, Vladimir Oltean <olteanv@gmail.com> wrote:
> Sorry it took so long. I wanted to understand:
> (a) where are the challenges for drivers to uniformly support software
>     bridging when they already have code for bridge offloading. I found
>     the following issues:
>     - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
>       which kind of prevents software bridging. I'm not sure what the
>       fix for these should be.

I took a closer look at the software fallback mode for LAGs and I've
found three issues that prevent this from working in a bridged setup,
two of which are easy to fix. This is the setup (team0 is _not_
offloaded):

(A)  br0
     /
  team0
   / \
swp0 swp1


1. DSA tries to offload port attributes for standalone ports. So in this
   setup, if vlan filtering is enabled on br0, we will enable it in
   hardware which on mv88e6xxx causes swp0/1 to drop all packets on
   ingress due to a VTU violation. This is a very easy fix, I will
   include it in v4.

2. The issue Vladimir mentioned above. This is also a straight forward
   fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
   never set for ports in standalone mode.

   I am not sure if I should solve it like that or if we should just
   clear the mark in dsa_switch_rcv if the dp does not have a
   bridge_dev. I know both Vladimir and I were leaning towards each
   tagger solving it internally. But looking at the code, I get the
   feeling that all taggers will end up copying the same block of code
   anyway. What do you think?

With these two patches in place, setup (A) works as expected. But if you
extend it to (team0 still not offloaded):

(B)   br0
     /   \
  team0   \
   / \     \
swp0 swp1  swp2

You instantly run into:

3. Only traffic which does _not_ have offload_fwd_mark set is allowed to
   pass from swp2 to team0. This is because the bridge uses
   dev_get_port_parent_id to figure out which ports belong to the same
   switch. This will recurse down through all lowers and find swp0/1
   which will answer with the same ID as swp2.

   In the case where team0 is offloaded, this is exactly what we want,
   but in a setup like (B) they do not have the same "logical" parent in
   the sense that br0 is led to believe. I.e. the hardware will never
   forward packets between swp0/1 and swp2.

   I do not see an obvious solution to this. Refusing to divulge the
   parent just because you are a part of a software LAG seems fraught
   with danger as there are other users of those APIs. Adding yet
   another ndo would theoretically be possible, but not
   desirable. Ideas?

As for this series, my intention is to make sure that (A) works as
intended, leaving (B) for another day. Does that seem reasonable?

NOTE: In the offloaded case, (B) will of course also be supported.
Vladimir Oltean Dec. 12, 2020, 2:26 p.m. UTC | #43
On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
> On Tue, Dec 08, 2020 at 13:23, Vladimir Oltean <olteanv@gmail.com> wrote:
> > Sorry it took so long. I wanted to understand:
> > (a) where are the challenges for drivers to uniformly support software
> >     bridging when they already have code for bridge offloading. I found
> >     the following issues:
> >     - We have taggers that unconditionally set skb->offload_fwd_mark = 1,
> >       which kind of prevents software bridging. I'm not sure what the
> >       fix for these should be.
>
> I took a closer look at the software fallback mode for LAGs and I've
> found three issues that prevent this from working in a bridged setup,
> two of which are easy to fix. This is the setup (team0 is _not_
> offloaded):
>
> (A)  br0
>      /
>   team0
>    / \
> swp0 swp1
>
>
> 1. DSA tries to offload port attributes for standalone ports. So in this
>    setup, if vlan filtering is enabled on br0, we will enable it in
>    hardware which on mv88e6xxx causes swp0/1 to drop all packets on
>    ingress due to a VTU violation. This is a very easy fix, I will
>    include it in v4.

Makes sense, I did not enable VLAN filtering when I tried it.

> 2. The issue Vladimir mentioned above. This is also a straight forward
>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
>    never set for ports in standalone mode.
>
>    I am not sure if I should solve it like that or if we should just
>    clear the mark in dsa_switch_rcv if the dp does not have a
>    bridge_dev. I know both Vladimir and I were leaning towards each
>    tagger solving it internally. But looking at the code, I get the
>    feeling that all taggers will end up copying the same block of code
>    anyway. What do you think?

I am not sure what constitutes a good separation between DSA and taggers
here. We have many taggers that just set skb->offload_fwd_mark = 1. We
could have this as an opportunity to even let DSA take the decision
altogether. What do you say if we stop setting skb->offload_fwd_mark
from taggers, just add this:

+#define DSA_SKB_TRAPPED	BIT(0)
+
 struct dsa_skb_cb {
 	struct sk_buff *clone;
+	unsigned long flags;
 };

and basically just reverse the logic. Make taggers just assign this flag
for packets which are known to have reached software via data or control
traps. Don't make the taggers set skb->offload_fwd_mark = 1 if they
don't need to. Let DSA take that decision upon a more complex thought
process, which looks at DSA_SKB_CB(skb)->flags & DSA_SKB_TRAPPED too,
among other things.

> With these two patches in place, setup (A) works as expected. But if you
> extend it to (team0 still not offloaded):
>
> (B)   br0
>      /   \
>   team0   \
>    / \     \
> swp0 swp1  swp2
>
> You instantly run into:
>
> 3. Only traffic which does _not_ have offload_fwd_mark set is allowed to
>    pass from swp2 to team0. This is because the bridge uses
>    dev_get_port_parent_id to figure out which ports belong to the same
>    switch. This will recurse down through all lowers and find swp0/1
>    which will answer with the same ID as swp2.
>
>    In the case where team0 is offloaded, this is exactly what we want,
>    but in a setup like (B) they do not have the same "logical" parent in
>    the sense that br0 is led to believe. I.e. the hardware will never
>    forward packets between swp0/1 and swp2.
>
>    I do not see an obvious solution to this. Refusing to divulge the
>    parent just because you are a part of a software LAG seems fraught
>    with danger as there are other users of those APIs. Adding yet
>    another ndo would theoretically be possible, but not
>    desirable. Ideas?
>
> As for this series, my intention is to make sure that (A) works as
> intended, leaving (B) for another day. Does that seem reasonable?
>
> NOTE: In the offloaded case, (B) will of course also be supported.

Yeah, ok, one can already tell that the way I've tested this setup was
by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
postpone this a bit.

For what it's worth, in the giant "RX filtering for DSA switches" fiasco
https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
we seemed to reach the conclusion that it would be ok to add a new NDO
answering the question "can this interface do forwarding in hardware
towards this other interface". We can probably start with the question
being asked for L2 forwarding only.

Maybe you are just the kick I need to come back to that series and
simplify it.
Tobias Waldekranz Dec. 13, 2020, 9:18 p.m. UTC | #44
On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
> On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
>> 2. The issue Vladimir mentioned above. This is also a straight forward
>>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
>>    never set for ports in standalone mode.
>>
>>    I am not sure if I should solve it like that or if we should just
>>    clear the mark in dsa_switch_rcv if the dp does not have a
>>    bridge_dev. I know both Vladimir and I were leaning towards each
>>    tagger solving it internally. But looking at the code, I get the
>>    feeling that all taggers will end up copying the same block of code
>>    anyway. What do you think?
>
> I am not sure what constitutes a good separation between DSA and taggers
> here. We have many taggers that just set skb->offload_fwd_mark = 1. We
> could have this as an opportunity to even let DSA take the decision
> altogether. What do you say if we stop setting skb->offload_fwd_mark
> from taggers, just add this:
>
> +#define DSA_SKB_TRAPPED	BIT(0)
> +
>  struct dsa_skb_cb {
>  	struct sk_buff *clone;
> +	unsigned long flags;
>  };
>
> and basically just reverse the logic. Make taggers just assign this flag
> for packets which are known to have reached software via data or control
> traps. Don't make the taggers set skb->offload_fwd_mark = 1 if they
> don't need to. Let DSA take that decision upon a more complex thought
> process, which looks at DSA_SKB_CB(skb)->flags & DSA_SKB_TRAPPED too,
> among other things.

What would the benefit of this over using the OFM directly? Would the
flag not carry the exact same bit of information, albeit inverted? Is it
about not giving the taggers any illusions about having the final say on
the OFM value?

>> As for this series, my intention is to make sure that (A) works as
>> intended, leaving (B) for another day. Does that seem reasonable?
>>
>> NOTE: In the offloaded case, (B) will of course also be supported.
>
> Yeah, ok, one can already tell that the way I've tested this setup was
> by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
> postpone this a bit.
>
> For what it's worth, in the giant "RX filtering for DSA switches" fiasco
> https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
> we seemed to reach the conclusion that it would be ok to add a new NDO
> answering the question "can this interface do forwarding in hardware
> towards this other interface". We can probably start with the question
> being asked for L2 forwarding only.

Very interesting, though I did not completely understand the VXLAN
scenario laid out in that thread. I understand that OFM can not be 0,
because you might have successfully forwarded to some destinations. But
setting it to 1 does not smell right either. OFM=1 means "this has
already been forwarded according to your current configuration" which is
not completely true in this case. This is something in the middle, more
like skb->offload_fwd_mark = its_complicated;

Anyway, so we are essentially talking about replacing the question "do
you share a parent with this netdev?" with "do you share the same
hardware bridging domain as this netdev?" when choosing the port's OFM
in a bridge, correct? If so, great, that would also solve the software
LAG case. This would also get us one step closer to selectively
disabling bridge offloading on a switchdev port.
Vladimir Oltean Dec. 14, 2020, 12:12 a.m. UTC | #45
On Sun, Dec 13, 2020 at 10:18:27PM +0100, Tobias Waldekranz wrote:
> On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
> > On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
> >> 2. The issue Vladimir mentioned above. This is also a straight forward
> >>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
> >>    never set for ports in standalone mode.
> >>
> >>    I am not sure if I should solve it like that or if we should just
> >>    clear the mark in dsa_switch_rcv if the dp does not have a
> >>    bridge_dev. I know both Vladimir and I were leaning towards each
> >>    tagger solving it internally. But looking at the code, I get the
> >>    feeling that all taggers will end up copying the same block of code
> >>    anyway. What do you think?
> >> As for this series, my intention is to make sure that (A) works as
> >> intended, leaving (B) for another day. Does that seem reasonable?
> >>
> >> NOTE: In the offloaded case, (B) will of course also be supported.
> >
> > Yeah, ok, one can already tell that the way I've tested this setup was
> > by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
> > postpone this a bit.
> >
> > For what it's worth, in the giant "RX filtering for DSA switches" fiasco
> > https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
> > we seemed to reach the conclusion that it would be ok to add a new NDO
> > answering the question "can this interface do forwarding in hardware
> > towards this other interface". We can probably start with the question
> > being asked for L2 forwarding only.
>
> Very interesting, though I did not completely understand the VXLAN
> scenario laid out in that thread. I understand that OFM can not be 0,
> because you might have successfully forwarded to some destinations. But
> setting it to 1 does not smell right either. OFM=1 means "this has
> already been forwarded according to your current configuration" which is
> not completely true in this case. This is something in the middle, more
> like skb->offload_fwd_mark = its_complicated;

Very pertinent question. Given your observation that nbp_switchdev_mark_set()
calls dev_get_port_parent_id() with recurse=true, this means that a vxlan
upper should have the same parent ID as the real interface. At least the
theory coincides with the little practice I applied to my setup where
felix does not support vxlan offload:

I printed the p->offload_fwd_mark assigned by nbp_switchdev_mark_set:
ip link add br0 type bridge
ip link set swp1 master br0
[   15.887217] mscc_felix 0000:00:00.5 swp1: offload_fwd_mark 1
ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
ip link set vxlan10 master br0
[  102.734390] vxlan10: offload_fwd_mark 1

So a clearer explanation needs to be found for how Ido's exception
traffic due to missing neighbor in the vxlan underlay gets re-forwarded
by the software bridge to the software vxlan interface. It cannot be due
to a mismatch of bridge port offload_fwd_mark values unless there is
some different logic applied for Mellanox hardware that I am not seeing.
So after all, it must be due to skb->offload_fwd_mark being unset?

To be honest, I almost expect that the Mellanox switches are "all or
nothing" in terms of forwarding. So if the vxlan interface (which is
only one of the bridge ports) could not deliver the packet, it would
seem cleaner to me that none of the other interfaces deliver the packet
either. Then the driver picks up this exception packet on the original
ingress interface, and the software bridge + software vxlan do the job.
And this means that skb->offload_fwd_mark = it_isnt_complicated.

But this is clearly at odds with what Ido said, that "swp0 and vxlan0 do
not have the same parent ID", and which was the center of his entire
argument. It's my fault really, I should have checked. Let's hope that
Ido can explain again.

> Anyway, so we are essentially talking about replacing the question "do
> you share a parent with this netdev?" with "do you share the same
> hardware bridging domain as this netdev?" when choosing the port's OFM
> in a bridge, correct? If so, great, that would also solve the software
> LAG case. This would also get us one step closer to selectively
> disabling bridge offloading on a switchdev port.

Well, I cannot answer this until I fully understand the other issue
above - basically how is it that Mellanox switches do software
forwarding for exception traffic today.

Ido, for background, here's the relevant portion of the thread. We're
talking about software fallback for a bridge-over-bonding-over-DSA
scenario:
https://lore.kernel.org/netdev/87a6uk5apb.fsf@waldekranz.com/
Tobias Waldekranz Dec. 14, 2020, 9:41 a.m. UTC | #46
On Sun, Dec 13, 2020 at 22:18, Tobias Waldekranz <tobias@waldekranz.com> wrote:
> On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
>> On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
>>> 2. The issue Vladimir mentioned above. This is also a straight forward
>>>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
>>>    never set for ports in standalone mode.
>>>
>>>    I am not sure if I should solve it like that or if we should just
>>>    clear the mark in dsa_switch_rcv if the dp does not have a
>>>    bridge_dev. I know both Vladimir and I were leaning towards each
>>>    tagger solving it internally. But looking at the code, I get the
>>>    feeling that all taggers will end up copying the same block of code
>>>    anyway. What do you think?
>>
>> I am not sure what constitutes a good separation between DSA and taggers
>> here. We have many taggers that just set skb->offload_fwd_mark = 1. We
>> could have this as an opportunity to even let DSA take the decision
>> altogether. What do you say if we stop setting skb->offload_fwd_mark
>> from taggers, just add this:
>>
>> +#define DSA_SKB_TRAPPED	BIT(0)
>> +
>>  struct dsa_skb_cb {
>>  	struct sk_buff *clone;
>> +	unsigned long flags;
>>  };
>>
>> and basically just reverse the logic. Make taggers just assign this flag
>> for packets which are known to have reached software via data or control
>> traps. Don't make the taggers set skb->offload_fwd_mark = 1 if they
>> don't need to. Let DSA take that decision upon a more complex thought
>> process, which looks at DSA_SKB_CB(skb)->flags & DSA_SKB_TRAPPED too,
>> among other things.
>
> What would the benefit of this over using the OFM directly? Would the
> flag not carry the exact same bit of information, albeit inverted? Is it
> about not giving the taggers any illusions about having the final say on
> the OFM value?

On second thought, does this even matter if we solve the issue with
properly separating the different L2 domains? I.e. in this setup:

      br0
     /   \
  team0   \
   / \     \
swp0 swp1 swp2

If team0 is not offloaded, and our new fancy ndo were to relay that to
the bridge, then team0 and swp2 would no longer share OFM. In that case
traffic will flow between them independently of OFM, just like it would
between ports from two different switchdevs.

With that in mind, I will leave this patch out of v4 as case (A) works
without it, and including it does not solve (B). I suppose there could
be other reasons to accurately convey the OFM in these cases, but I
think we can revisit that once everything else is in place.

>>> As for this series, my intention is to make sure that (A) works as
>>> intended, leaving (B) for another day. Does that seem reasonable?
>>>
>>> NOTE: In the offloaded case, (B) will of course also be supported.
>>
>> Yeah, ok, one can already tell that the way I've tested this setup was
>> by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
>> postpone this a bit.
>>
>> For what it's worth, in the giant "RX filtering for DSA switches" fiasco
>> https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
>> we seemed to reach the conclusion that it would be ok to add a new NDO
>> answering the question "can this interface do forwarding in hardware
>> towards this other interface". We can probably start with the question
>> being asked for L2 forwarding only.
>
> Very interesting, though I did not completely understand the VXLAN
> scenario laid out in that thread. I understand that OFM can not be 0,
> because you might have successfully forwarded to some destinations. But
> setting it to 1 does not smell right either. OFM=1 means "this has
> already been forwarded according to your current configuration" which is
> not completely true in this case. This is something in the middle, more
> like skb->offload_fwd_mark = its_complicated;
>
> Anyway, so we are essentially talking about replacing the question "do
> you share a parent with this netdev?" with "do you share the same
> hardware bridging domain as this netdev?" when choosing the port's OFM
> in a bridge, correct? If so, great, that would also solve the software
> LAG case. This would also get us one step closer to selectively
> disabling bridge offloading on a switchdev port.
Ido Schimmel Dec. 14, 2020, 11:42 a.m. UTC | #47
On Mon, Dec 14, 2020 at 02:12:31AM +0200, Vladimir Oltean wrote:
> On Sun, Dec 13, 2020 at 10:18:27PM +0100, Tobias Waldekranz wrote:
> > On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
> > > On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
> > >> 2. The issue Vladimir mentioned above. This is also a straight forward
> > >>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
> > >>    never set for ports in standalone mode.
> > >>
> > >>    I am not sure if I should solve it like that or if we should just
> > >>    clear the mark in dsa_switch_rcv if the dp does not have a
> > >>    bridge_dev. I know both Vladimir and I were leaning towards each
> > >>    tagger solving it internally. But looking at the code, I get the
> > >>    feeling that all taggers will end up copying the same block of code
> > >>    anyway. What do you think?
> > >> As for this series, my intention is to make sure that (A) works as
> > >> intended, leaving (B) for another day. Does that seem reasonable?
> > >>
> > >> NOTE: In the offloaded case, (B) will of course also be supported.
> > >
> > > Yeah, ok, one can already tell that the way I've tested this setup was
> > > by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
> > > postpone this a bit.
> > >
> > > For what it's worth, in the giant "RX filtering for DSA switches" fiasco
> > > https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
> > > we seemed to reach the conclusion that it would be ok to add a new NDO
> > > answering the question "can this interface do forwarding in hardware
> > > towards this other interface". We can probably start with the question
> > > being asked for L2 forwarding only.
> >
> > Very interesting, though I did not completely understand the VXLAN
> > scenario laid out in that thread. I understand that OFM can not be 0,
> > because you might have successfully forwarded to some destinations. But
> > setting it to 1 does not smell right either. OFM=1 means "this has
> > already been forwarded according to your current configuration" which is
> > not completely true in this case. This is something in the middle, more
> > like skb->offload_fwd_mark = its_complicated;
> 
> Very pertinent question. Given your observation that nbp_switchdev_mark_set()
> calls dev_get_port_parent_id() with recurse=true, this means that a vxlan
> upper should have the same parent ID as the real interface. At least the
> theory coincides with the little practice I applied to my setup where
> felix does not support vxlan offload:
> 
> I printed the p->offload_fwd_mark assigned by nbp_switchdev_mark_set:
> ip link add br0 type bridge
> ip link set swp1 master br0
> [   15.887217] mscc_felix 0000:00:00.5 swp1: offload_fwd_mark 1
> ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
> ip link set vxlan10 master br0
> [  102.734390] vxlan10: offload_fwd_mark 1
> 
> So a clearer explanation needs to be found for how Ido's exception
> traffic due to missing neighbor in the vxlan underlay gets re-forwarded
> by the software bridge to the software vxlan interface. It cannot be due
> to a mismatch of bridge port offload_fwd_mark values unless there is
> some different logic applied for Mellanox hardware that I am not seeing.
> So after all, it must be due to skb->offload_fwd_mark being unset?
> 
> To be honest, I almost expect that the Mellanox switches are "all or
> nothing" in terms of forwarding. So if the vxlan interface (which is
> only one of the bridge ports) could not deliver the packet, it would
> seem cleaner to me that none of the other interfaces deliver the packet
> either. Then the driver picks up this exception packet on the original
> ingress interface, and the software bridge + software vxlan do the job.
> And this means that skb->offload_fwd_mark = it_isnt_complicated.
> 
> But this is clearly at odds with what Ido said, that "swp0 and vxlan0 do
> not have the same parent ID", and which was the center of his entire
> argument. It's my fault really, I should have checked. Let's hope that
> Ido can explain again.

Problem is here:

ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0

We don't configure VXLAN with a bound device. In fact, we forbid it:
https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve_vxlan.c#L46
https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh#L182

Even if we were to support a bound device, it is unlikely to be a switch
port, but some dummy interface that we would enslave to a VRF in which
we would like the underlay lookup to be performed. We use this with GRE
tunnels:
https://github.com/Mellanox/mlxsw/wiki/L3-Tunneling#general-gre-configuration

Currently, underlay lookup always happens in the default VRF.

VXLAN recently got support for this as well. See this series:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79dfab43a976b76713c40222987c48e32510ebc1

> 
> > Anyway, so we are essentially talking about replacing the question "do
> > you share a parent with this netdev?" with "do you share the same
> > hardware bridging domain as this netdev?" when choosing the port's OFM
> > in a bridge, correct? If so, great, that would also solve the software
> > LAG case. This would also get us one step closer to selectively
> > disabling bridge offloading on a switchdev port.
> 
> Well, I cannot answer this until I fully understand the other issue
> above - basically how is it that Mellanox switches do software
> forwarding for exception traffic today.
> 
> Ido, for background, here's the relevant portion of the thread. We're
> talking about software fallback for a bridge-over-bonding-over-DSA
> scenario:
> https://lore.kernel.org/netdev/87a6uk5apb.fsf@waldekranz.com/
Tobias Waldekranz Dec. 16, 2020, 3:15 p.m. UTC | #48
On Mon, Dec 14, 2020 at 13:42, Ido Schimmel <idosch@idosch.org> wrote:
> On Mon, Dec 14, 2020 at 02:12:31AM +0200, Vladimir Oltean wrote:
>> On Sun, Dec 13, 2020 at 10:18:27PM +0100, Tobias Waldekranz wrote:
>> > On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
>> > > On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
>> > >> 2. The issue Vladimir mentioned above. This is also a straight forward
>> > >>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
>> > >>    never set for ports in standalone mode.
>> > >>
>> > >>    I am not sure if I should solve it like that or if we should just
>> > >>    clear the mark in dsa_switch_rcv if the dp does not have a
>> > >>    bridge_dev. I know both Vladimir and I were leaning towards each
>> > >>    tagger solving it internally. But looking at the code, I get the
>> > >>    feeling that all taggers will end up copying the same block of code
>> > >>    anyway. What do you think?
>> > >> As for this series, my intention is to make sure that (A) works as
>> > >> intended, leaving (B) for another day. Does that seem reasonable?
>> > >>
>> > >> NOTE: In the offloaded case, (B) will of course also be supported.
>> > >
>> > > Yeah, ok, one can already tell that the way I've tested this setup was
>> > > by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
>> > > postpone this a bit.
>> > >
>> > > For what it's worth, in the giant "RX filtering for DSA switches" fiasco
>> > > https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
>> > > we seemed to reach the conclusion that it would be ok to add a new NDO
>> > > answering the question "can this interface do forwarding in hardware
>> > > towards this other interface". We can probably start with the question
>> > > being asked for L2 forwarding only.
>> >
>> > Very interesting, though I did not completely understand the VXLAN
>> > scenario laid out in that thread. I understand that OFM can not be 0,
>> > because you might have successfully forwarded to some destinations. But
>> > setting it to 1 does not smell right either. OFM=1 means "this has
>> > already been forwarded according to your current configuration" which is
>> > not completely true in this case. This is something in the middle, more
>> > like skb->offload_fwd_mark = its_complicated;
>> 
>> Very pertinent question. Given your observation that nbp_switchdev_mark_set()
>> calls dev_get_port_parent_id() with recurse=true, this means that a vxlan
>> upper should have the same parent ID as the real interface. At least the
>> theory coincides with the little practice I applied to my setup where
>> felix does not support vxlan offload:
>> 
>> I printed the p->offload_fwd_mark assigned by nbp_switchdev_mark_set:
>> ip link add br0 type bridge
>> ip link set swp1 master br0
>> [   15.887217] mscc_felix 0000:00:00.5 swp1: offload_fwd_mark 1
>> ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
>> ip link set vxlan10 master br0
>> [  102.734390] vxlan10: offload_fwd_mark 1
>> 
>> So a clearer explanation needs to be found for how Ido's exception
>> traffic due to missing neighbor in the vxlan underlay gets re-forwarded
>> by the software bridge to the software vxlan interface. It cannot be due
>> to a mismatch of bridge port offload_fwd_mark values unless there is
>> some different logic applied for Mellanox hardware that I am not seeing.
>> So after all, it must be due to skb->offload_fwd_mark being unset?
>> 
>> To be honest, I almost expect that the Mellanox switches are "all or
>> nothing" in terms of forwarding. So if the vxlan interface (which is
>> only one of the bridge ports) could not deliver the packet, it would
>> seem cleaner to me that none of the other interfaces deliver the packet
>> either. Then the driver picks up this exception packet on the original
>> ingress interface, and the software bridge + software vxlan do the job.
>> And this means that skb->offload_fwd_mark = it_isnt_complicated.
>> 
>> But this is clearly at odds with what Ido said, that "swp0 and vxlan0 do
>> not have the same parent ID", and which was the center of his entire
>> argument. It's my fault really, I should have checked. Let's hope that
>> Ido can explain again.
>
> Problem is here:
>
> ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
>
> We don't configure VXLAN with a bound device. In fact, we forbid it:
> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve_vxlan.c#L46
> https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh#L182
>
> Even if we were to support a bound device, it is unlikely to be a switch
> port, but some dummy interface that we would enslave to a VRF in which
> we would like the underlay lookup to be performed. We use this with GRE
> tunnels:
> https://github.com/Mellanox/mlxsw/wiki/L3-Tunneling#general-gre-configuration
>
> Currently, underlay lookup always happens in the default VRF.
>
> VXLAN recently got support for this as well. See this series:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79dfab43a976b76713c40222987c48e32510ebc1

How do you handle multiple VXLAN interfaces?  I.e. in this setup:

         br0
   .--' .' '. '----.
  /    /     \      \
swp0 swp1  vxlan0 vxlan1

Say that both VXLANs are offloaded, the nexthop of vxlan0 is in the
hardware ARP cache, but vxlan1's is not.

If a broadcast is received on swp0, hardware will forward to swp1 and
vxlan0, then trap the original frame to the CPU with offload_fwd_mark=1.

What prevents duplicates from being sent out through vxlan0 in that
case?

>> 
>> > Anyway, so we are essentially talking about replacing the question "do
>> > you share a parent with this netdev?" with "do you share the same
>> > hardware bridging domain as this netdev?" when choosing the port's OFM
>> > in a bridge, correct? If so, great, that would also solve the software
>> > LAG case. This would also get us one step closer to selectively
>> > disabling bridge offloading on a switchdev port.
>> 
>> Well, I cannot answer this until I fully understand the other issue
>> above - basically how is it that Mellanox switches do software
>> forwarding for exception traffic today.
>> 
>> Ido, for background, here's the relevant portion of the thread. We're
>> talking about software fallback for a bridge-over-bonding-over-DSA
>> scenario:
>> https://lore.kernel.org/netdev/87a6uk5apb.fsf@waldekranz.com/
Ido Schimmel Dec. 16, 2020, 6:48 p.m. UTC | #49
On Wed, Dec 16, 2020 at 04:15:03PM +0100, Tobias Waldekranz wrote:
> On Mon, Dec 14, 2020 at 13:42, Ido Schimmel <idosch@idosch.org> wrote:
> > On Mon, Dec 14, 2020 at 02:12:31AM +0200, Vladimir Oltean wrote:
> >> On Sun, Dec 13, 2020 at 10:18:27PM +0100, Tobias Waldekranz wrote:
> >> > On Sat, Dec 12, 2020 at 16:26, Vladimir Oltean <olteanv@gmail.com> wrote:
> >> > > On Fri, Dec 11, 2020 at 09:50:24PM +0100, Tobias Waldekranz wrote:
> >> > >> 2. The issue Vladimir mentioned above. This is also a straight forward
> >> > >>    fix, I have patch for tag_dsa, making sure that offload_fwd_mark is
> >> > >>    never set for ports in standalone mode.
> >> > >>
> >> > >>    I am not sure if I should solve it like that or if we should just
> >> > >>    clear the mark in dsa_switch_rcv if the dp does not have a
> >> > >>    bridge_dev. I know both Vladimir and I were leaning towards each
> >> > >>    tagger solving it internally. But looking at the code, I get the
> >> > >>    feeling that all taggers will end up copying the same block of code
> >> > >>    anyway. What do you think?
> >> > >> As for this series, my intention is to make sure that (A) works as
> >> > >> intended, leaving (B) for another day. Does that seem reasonable?
> >> > >>
> >> > >> NOTE: In the offloaded case, (B) will of course also be supported.
> >> > >
> >> > > Yeah, ok, one can already tell that the way I've tested this setup was
> >> > > by commenting out skb->offload_fwd_mark = 1 altogether. It seems ok to
> >> > > postpone this a bit.
> >> > >
> >> > > For what it's worth, in the giant "RX filtering for DSA switches" fiasco
> >> > > https://patchwork.ozlabs.org/project/netdev/patch/20200521211036.668624-11-olteanv@gmail.com/
> >> > > we seemed to reach the conclusion that it would be ok to add a new NDO
> >> > > answering the question "can this interface do forwarding in hardware
> >> > > towards this other interface". We can probably start with the question
> >> > > being asked for L2 forwarding only.
> >> >
> >> > Very interesting, though I did not completely understand the VXLAN
> >> > scenario laid out in that thread. I understand that OFM can not be 0,
> >> > because you might have successfully forwarded to some destinations. But
> >> > setting it to 1 does not smell right either. OFM=1 means "this has
> >> > already been forwarded according to your current configuration" which is
> >> > not completely true in this case. This is something in the middle, more
> >> > like skb->offload_fwd_mark = its_complicated;
> >> 
> >> Very pertinent question. Given your observation that nbp_switchdev_mark_set()
> >> calls dev_get_port_parent_id() with recurse=true, this means that a vxlan
> >> upper should have the same parent ID as the real interface. At least the
> >> theory coincides with the little practice I applied to my setup where
> >> felix does not support vxlan offload:
> >> 
> >> I printed the p->offload_fwd_mark assigned by nbp_switchdev_mark_set:
> >> ip link add br0 type bridge
> >> ip link set swp1 master br0
> >> [   15.887217] mscc_felix 0000:00:00.5 swp1: offload_fwd_mark 1
> >> ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
> >> ip link set vxlan10 master br0
> >> [  102.734390] vxlan10: offload_fwd_mark 1
> >> 
> >> So a clearer explanation needs to be found for how Ido's exception
> >> traffic due to missing neighbor in the vxlan underlay gets re-forwarded
> >> by the software bridge to the software vxlan interface. It cannot be due
> >> to a mismatch of bridge port offload_fwd_mark values unless there is
> >> some different logic applied for Mellanox hardware that I am not seeing.
> >> So after all, it must be due to skb->offload_fwd_mark being unset?
> >> 
> >> To be honest, I almost expect that the Mellanox switches are "all or
> >> nothing" in terms of forwarding. So if the vxlan interface (which is
> >> only one of the bridge ports) could not deliver the packet, it would
> >> seem cleaner to me that none of the other interfaces deliver the packet
> >> either. Then the driver picks up this exception packet on the original
> >> ingress interface, and the software bridge + software vxlan do the job.
> >> And this means that skb->offload_fwd_mark = it_isnt_complicated.
> >> 
> >> But this is clearly at odds with what Ido said, that "swp0 and vxlan0 do
> >> not have the same parent ID", and which was the center of his entire
> >> argument. It's my fault really, I should have checked. Let's hope that
> >> Ido can explain again.
> >
> > Problem is here:
> >
> > ip link add vxlan10 type vxlan id 10 group 224.10.10.10 dstport 4789 ttl 10 dev swp0
> >
> > We don't configure VXLAN with a bound device. In fact, we forbid it:
> > https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve_vxlan.c#L46
> > https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh#L182
> >
> > Even if we were to support a bound device, it is unlikely to be a switch
> > port, but some dummy interface that we would enslave to a VRF in which
> > we would like the underlay lookup to be performed. We use this with GRE
> > tunnels:
> > https://github.com/Mellanox/mlxsw/wiki/L3-Tunneling#general-gre-configuration
> >
> > Currently, underlay lookup always happens in the default VRF.
> >
> > VXLAN recently got support for this as well. See this series:
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79dfab43a976b76713c40222987c48e32510ebc1
> 
> How do you handle multiple VXLAN interfaces?  I.e. in this setup:
> 
>          br0
>    .--' .' '. '----.
>   /    /     \      \
> swp0 swp1  vxlan0 vxlan1
> 
> Say that both VXLANs are offloaded, the nexthop of vxlan0 is in the
> hardware ARP cache, but vxlan1's is not.
> 
> If a broadcast is received on swp0, hardware will forward to swp1 and
> vxlan0, then trap the original frame to the CPU with offload_fwd_mark=1.
> 
> What prevents duplicates from being sent out through vxlan0 in that
> case?

We don't support multiple VXLAN devices in the same VLAN, but let's look
at a simpler topology:

         br0
   .--' .' '.
  /    /     \
swp0 swp1  vxlan0

VXLAN supports something called Head End Replication (HER) [1]. You can
flood a packet to multiple remote VTEPs. In Linux, this is configured
using the all-zero MAC address in the VXLAN's FDB [2]. It is possible
that after flooding a packet to 100 VTEPs, at the 101st VTEP, the packet
hit an exception in the underlay and was trapped to the CPU for
resolution via swp0. It won't be flooded to swp1 since it's marked, but
what prevents it from being flooded to all the other VTEPs again?
Nothing. Per-VTEP flood indication is not encoded in the skb and I'm not
aware of any hardware that can provide it.

[1] https://en.wikipedia.org/wiki/Broadcast,_unknown-unicast_and_multicast_traffic#BUM_handling_in_VxLAN
[2] https://vincent.bernat.ch/en/blog/2017-vxlan-linux#unicast-with-static-flooding
diff mbox series

Patch

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e60d2610f20..aaa350b78c55 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -7,6 +7,7 @@ 
 #ifndef __LINUX_NET_DSA_H
 #define __LINUX_NET_DSA_H
 
+#include <linux/bitmap.h>
 #include <linux/if.h>
 #include <linux/if_ether.h>
 #include <linux/list.h>
@@ -71,6 +72,7 @@  enum dsa_tag_protocol {
 
 struct packet_type;
 struct dsa_switch;
+struct dsa_lag;
 
 struct dsa_device_ops {
 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
@@ -149,6 +151,13 @@  struct dsa_switch_tree {
 
 	/* List of DSA links composing the routing table */
 	struct list_head rtable;
+
+	/* Link aggregates */
+	struct {
+		struct dsa_lag *pool;
+		unsigned long *busy;
+		unsigned int num;
+	} lags;
 };
 
 /* TC matchall action types */
@@ -180,6 +189,69 @@  struct dsa_mall_tc_entry {
 	};
 };
 
+/* State tracked for each offloaded LAG (bond/team) in a switch tree.
+ * One instance lives in the tree's lags.pool and is shared by all DSA
+ * ports that are lowers of the same LAG netdev.
+ */
+struct dsa_lag {
+	/* The LAG netdev that this entry mirrors. */
+	struct net_device *dev;
+	/* Index of this entry in the tree's lags.pool/lags.busy. */
+	int id;
+
+	/* DSA ports attached to this LAG. */
+	struct list_head ports;
+
+	/* For multichip systems, we must ensure that each hash bucket
+	 * is only enabled on a single egress port throughout the
+	 * whole tree, lest we send duplicates. Therefore we must
+	 * maintain a global list of active tx ports, so that each
+	 * switch can figure out which buckets to enable on which
+	 * ports.
+	 */
+	struct list_head tx_ports;
+	/* Number of entries on tx_ports ("distributing" in LACP terms). */
+	int num_tx;
+
+	/* Usage count; NOTE(review): presumably the entry is returned
+	 * to the pool when this drops to zero -- confirm in patch 3/4.
+	 */
+	refcount_t refcount;
+};
+
+#define dsa_lag_foreach(_id, _dst) \
+	for_each_set_bit(_id, (_dst)->lags.busy, (_dst)->lags.num)
+
+/* True when this tree has a LAG pool, i.e. every switch in it
+ * advertised a non-zero num_lags and offloading can be attempted.
+ */
+static inline bool dsa_lag_offloading(struct dsa_switch_tree *dst)
+{
+	return !!dst->lags.num;
+}
+
+/* True when at least one slot in the LAG pool is still unused. */
+static inline bool dsa_lag_available(struct dsa_switch_tree *dst)
+{
+	unsigned int free_id;
+
+	free_id = find_first_zero_bit(dst->lags.busy, dst->lags.num);
+
+	return free_id < dst->lags.num;
+}
+
+/* Look up a LAG by pool index. Returns NULL if the slot is unused. */
+static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, int id)
+{
+	return test_bit(id, dst->lags.busy) ? &dst->lags.pool[id] : NULL;
+}
+
+/* Return the netdev of the LAG at @id, or NULL if the slot is unused. */
+static inline struct net_device *dsa_lag_dev_by_id(struct dsa_switch_tree *dst,
+						   int id)
+{
+	struct dsa_lag *lag = dsa_lag_by_id(dst, id);
+
+	if (!lag)
+		return NULL;
+
+	/* dev may be updated concurrently by the join/leave path. */
+	return READ_ONCE(lag->dev);
+}
+
+/* Find the LAG entry mirroring @dev, or NULL if @dev is not an
+ * offloaded LAG in this tree.
+ */
+static inline struct dsa_lag *dsa_lag_by_dev(struct dsa_switch_tree *dst,
+					     struct net_device *dev)
+{
+	struct dsa_lag *lag;
+	int id;
+
+	dsa_lag_foreach(id, dst) {
+		lag = dsa_lag_by_id(dst, id);
+
+		/* Match dsa_lag_dev_by_id(): lag->dev may be updated
+		 * concurrently, so read it tear-free here as well.
+		 */
+		if (READ_ONCE(lag->dev) == dev)
+			return lag;
+	}
+
+	return NULL;
+}
 
 struct dsa_port {
 	/* A CPU port is physically connected to a master device.
@@ -220,6 +292,9 @@  struct dsa_port {
 	bool			devlink_port_setup;
 	struct phylink		*pl;
 	struct phylink_config	pl_config;
+	struct dsa_lag		*lag;
+	struct list_head	lag_list;
+	struct list_head	lag_tx_list;
 
 	struct list_head list;
 
@@ -335,6 +410,11 @@  struct dsa_switch {
 	 */
 	bool			mtu_enforcement_ingress;
 
+	/* The maximum number of LAGs that can be configured. A value of zero
+	 * is used to indicate that LAG offloading is not supported.
+	 */
+	unsigned int		num_lags;
+
 	size_t num_ports;
 };
 
@@ -624,6 +704,13 @@  struct dsa_switch_ops {
 	void	(*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index,
 					  int sw_index, int port,
 					  struct net_device *br);
+	int	(*crosschip_lag_change)(struct dsa_switch *ds, int sw_index,
+					int port, struct net_device *lag_dev,
+					struct netdev_lag_lower_state_info *info);
+	int	(*crosschip_lag_join)(struct dsa_switch *ds, int sw_index,
+				      int port, struct net_device *lag_dev);
+	void	(*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index,
+				       int port, struct net_device *lag_dev);
 
 	/*
 	 * PTP functionality
@@ -655,6 +742,16 @@  struct dsa_switch_ops {
 	int	(*port_change_mtu)(struct dsa_switch *ds, int port,
 				   int new_mtu);
 	int	(*port_max_mtu)(struct dsa_switch *ds, int port);
+
+	/*
+	 * LAG integration
+	 */
+	int	(*port_lag_change)(struct dsa_switch *ds, int port,
+				   struct netdev_lag_lower_state_info *info);
+	int	(*port_lag_join)(struct dsa_switch *ds, int port,
+				 struct net_device *lag_dev);
+	void	(*port_lag_leave)(struct dsa_switch *ds, int port,
+				  struct net_device *lag_dev);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 183003e45762..786277a21955 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -578,6 +578,47 @@  static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
 			dsa_master_teardown(dp->master);
 }
 
+/* Size the tree-wide LAG pool to the smallest num_lags advertised by
+ * any switch in the tree. Returns 0 on success (including the case
+ * where offloading is unsupported) or -ENOMEM.
+ */
+static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
+{
+	struct dsa_port *dp;
+	unsigned int num = UINT_MAX;
+
+	/* A single fold is enough; seeding from UINT_MAX also avoids
+	 * reading an uninitialized value if the port list is empty.
+	 */
+	list_for_each_entry(dp, &dst->ports, list)
+		num = min(num, dp->ds->num_lags);
+
+	/* Any switch with num_lags == 0, or an empty tree, disables
+	 * LAG offloading for the whole tree.
+	 */
+	if (num == 0 || num == UINT_MAX)
+		return 0;
+
+	dst->lags.pool = kcalloc(num, sizeof(*dst->lags.pool), GFP_KERNEL);
+	if (!dst->lags.pool)
+		goto err;
+
+	dst->lags.busy = bitmap_zalloc(num, GFP_KERNEL);
+	if (!dst->lags.busy)
+		goto err_free_pool;
+
+	dst->lags.num = num;
+	return 0;
+
+err_free_pool:
+	kfree(dst->lags.pool);
+	dst->lags.pool = NULL;
+err:
+	return -ENOMEM;
+}
+
+/* Release the tree-wide LAG pool. Safe to call when LAG offloading
+ * was never set up (lags.num == 0).
+ */
+static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
+{
+	if (dst->lags.num == 0)
+		return;
+
+	/* busy comes from bitmap_zalloc(), so pair it with
+	 * bitmap_free() rather than a bare kfree().
+	 */
+	bitmap_free(dst->lags.busy);
+	dst->lags.busy = NULL;
+	kfree(dst->lags.pool);
+	dst->lags.pool = NULL;
+	dst->lags.num = 0;
+}
+
 static int dsa_tree_setup(struct dsa_switch_tree *dst)
 {
 	bool complete;
@@ -605,12 +646,18 @@  static int dsa_tree_setup(struct dsa_switch_tree *dst)
 	if (err)
 		goto teardown_switches;
 
+	err = dsa_tree_setup_lags(dst);
+	if (err)
+		goto teardown_master;
+
 	dst->setup = true;
 
 	pr_info("DSA: tree %d setup\n", dst->index);
 
 	return 0;
 
+teardown_master:
+	dsa_tree_teardown_master(dst);
 teardown_switches:
 	dsa_tree_teardown_switches(dst);
 teardown_default_cpu:
@@ -626,6 +673,8 @@  static void dsa_tree_teardown(struct dsa_switch_tree *dst)
 	if (!dst->setup)
 		return;
 
+	dsa_tree_teardown_lags(dst);
+
 	dsa_tree_teardown_master(dst);
 
 	dsa_tree_teardown_switches(dst);
@@ -659,6 +708,8 @@  static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
 	dp->index = index;
 
 	INIT_LIST_HEAD(&dp->list);
+	INIT_LIST_HEAD(&dp->lag_list);
+	INIT_LIST_HEAD(&dp->lag_tx_list);
 	list_add_tail(&dp->list, &dst->ports);
 
 	return dp;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7c96aae9062c..77e07a0cff29 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -20,6 +20,9 @@  enum {
 	DSA_NOTIFIER_BRIDGE_LEAVE,
 	DSA_NOTIFIER_FDB_ADD,
 	DSA_NOTIFIER_FDB_DEL,
+	DSA_NOTIFIER_LAG_CHANGE,
+	DSA_NOTIFIER_LAG_JOIN,
+	DSA_NOTIFIER_LAG_LEAVE,
 	DSA_NOTIFIER_MDB_ADD,
 	DSA_NOTIFIER_MDB_DEL,
 	DSA_NOTIFIER_VLAN_ADD,
@@ -57,6 +60,14 @@  struct dsa_notifier_mdb_info {
 	int port;
 };
 
+/* DSA_NOTIFIER_LAG_* */
+struct dsa_notifier_lag_info {
+	struct netdev_lag_lower_state_info *info; /* only set for LAG_CHANGE */
+	struct net_device *lag; /* the LAG netdev this event refers to */
+	int sw_index; /* index, in the tree, of the originating switch */
+	int port; /* port of that switch which joined/left/changed */
+};
+
 /* DSA_NOTIFIER_VLAN_* */
 struct dsa_notifier_vlan_info {
 	const struct switchdev_obj_port_vlan *vlan;
@@ -135,6 +146,10 @@  void dsa_port_disable_rt(struct dsa_port *dp);
 void dsa_port_disable(struct dsa_port *dp);
 int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br);
 void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
+int dsa_port_lag_change(struct dsa_port *dp,
+			struct netdev_lag_lower_state_info *linfo);
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev);
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
 int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 			    struct switchdev_trans *trans);
 bool dsa_port_skip_vlan_configuration(struct dsa_port *dp);
@@ -167,6 +182,22 @@  int dsa_port_link_register_of(struct dsa_port *dp);
 void dsa_port_link_unregister_of(struct dsa_port *dp);
 extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
 
+static inline bool dsa_port_can_offload(struct dsa_port *dp,
+					struct net_device *dev)
+{
+	/* Switchdev offloading can be configured either directly on
+	 * the port's own slave netdev, or on the LAG device that the
+	 * port is currently a member of.
+	 */
+	if (dev == dp->slave)
+		return true;
+
+	if (!dp->lag)
+		return false;
+
+	return dev == rtnl_dereference(dp->lag->dev);
+}
+
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 73569c9af3cc..5a06890db918 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -193,6 +193,138 @@  void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
 	dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
 }
 
+static struct dsa_lag *dsa_lag_get(struct dsa_switch_tree *dst,
+				   struct net_device *dev)
+{
+	struct dsa_lag *lag;
+	int id;
+	/* Reuse the existing entry if @dev is already tracked. */
+	lag = dsa_lag_by_dev(dst, dev);
+	if (lag) {
+		refcount_inc(&lag->refcount);
+		return lag;
+	}
+	/* Otherwise, claim an unused ID from the tree-wide pool. */
+	id = find_first_zero_bit(dst->lags.busy, dst->lags.num);
+	if (id >= dst->lags.num) {
+		WARN(1, "No LAGs available");
+		return NULL;
+	}
+	/* Initialize the new entry and mark its ID as busy. */
+	lag = &dst->lags.pool[id];
+	lag->dev = dev;
+	lag->id = id;
+	INIT_LIST_HEAD(&lag->ports);
+	INIT_LIST_HEAD(&lag->tx_ports);
+	refcount_set(&lag->refcount, 1);
+	set_bit(id, dst->lags.busy);
+	return lag;
+}
+
+static void dsa_lag_put(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+	if (!refcount_dec_and_test(&lag->refcount))
+		return;
+	/* Last user gone: release the entry back to the pool. */
+	clear_bit(lag->id, dst->lags.busy);
+	WRITE_ONCE(lag->dev, NULL);	/* NOTE(review): presumably for lockless readers of lag->dev -- confirm */
+	memset(lag, 0, sizeof(*lag));
+}
+
+int dsa_port_lag_change(struct dsa_port *dp,
+			struct netdev_lag_lower_state_info *linfo)
+{
+	struct dsa_notifier_lag_info info = {
+		.sw_index = dp->ds->index,
+		.port = dp->index,
+		.info = linfo,
+	};
+	bool old, new;
+	/* Ports that are not part of any LAG have nothing to update. */
+	if (!dp->lag)
+		return 0;
+
+	info.lag = dp->lag->dev;
+
+	/* If this port is on the tx list, it is already enabled. */
+	old = !list_empty(&dp->lag_tx_list);
+
+	/* On statically configured aggregates (e.g. loadbalance
+	 * without LACP) ports will always be tx_enabled, even if the
+	 * link is down. Thus we require both link_up and tx_enabled
+	 * in order to include it in the tx set.
+	 */
+	new = linfo->link_up && linfo->tx_enabled;
+
+	if (new == old)
+		return 0;
+	/* Keep num_tx and the tx list consistent with the new state. */
+	if (new) {
+		dp->lag->num_tx++;
+		list_add_tail(&dp->lag_tx_list, &dp->lag->tx_ports);
+	} else {
+		list_del_init(&dp->lag_tx_list);
+		dp->lag->num_tx--;
+	}
+
+	return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info);
+}
+
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev)
+{
+	struct dsa_notifier_lag_info info = {
+		.sw_index = dp->ds->index,
+		.port = dp->index,
+		.lag = lag_dev,
+	};
+	struct dsa_lag *lag;
+	int err;
+	/* Reserve a tree-wide LAG entry (or reuse an existing one). */
+	lag = dsa_lag_get(dp->ds->dst, lag_dev);
+	if (!lag)
+		return -ENODEV;
+
+	dp->lag = lag;
+	list_add_tail(&dp->lag_list, &lag->ports);
+	/* Notify the drivers; unwind the association on failure. */
+	err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info);
+	if (err) {
+		dp->lag = NULL;
+		list_del_init(&dp->lag_list);
+		dsa_lag_put(dp->ds->dst, lag);
+	}
+
+	return err;
+}
+
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev)
+{
+	struct dsa_notifier_lag_info info = {
+		.sw_index = dp->ds->index,
+		.port = dp->index,
+		.lag = lag_dev,
+	};
+	int err;
+
+	/* Nothing to do if the port never completed a LAG join. */
+	if (!dp->lag)
+		return;
+
+	/* Port might have been part of a bridged LAG; detach first. */
+	if (dp->bridge_dev)
+		dsa_port_bridge_leave(dp, dp->bridge_dev);
+
+	list_del_init(&dp->lag_list);
+	list_del_init(&dp->lag_tx_list);
+
+	err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+	if (err)
+		pr_err("DSA: failed to notify DSA_NOTIFIER_LAG_LEAVE: %d\n", err);
+
+	dsa_lag_put(dp->ds->dst, dp->lag);
+	dp->lag = NULL;
+}
+
 /* Must be called under rcu_read_lock() */
 static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
 					      bool vlan_filtering)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7efc753e4d9d..6d7878cc7f3d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -334,7 +334,7 @@  static int dsa_slave_vlan_add(struct net_device *dev,
 	struct switchdev_obj_port_vlan vlan;
 	int vid, err;
 
-	if (obj->orig_dev != dev)
+	if (!dsa_port_can_offload(dp, obj->orig_dev))
 		return -EOPNOTSUPP;
 
 	if (dsa_port_skip_vlan_configuration(dp))
@@ -391,7 +391,7 @@  static int dsa_slave_port_obj_add(struct net_device *dev,
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (obj->orig_dev != dev)
+		if (!dsa_port_can_offload(dp, obj->orig_dev))
 			return -EOPNOTSUPP;
 		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
 		break;
@@ -421,7 +421,7 @@  static int dsa_slave_vlan_del(struct net_device *dev,
 	struct switchdev_obj_port_vlan *vlan;
 	int vid, err;
 
-	if (obj->orig_dev != dev)
+	if (!dsa_port_can_offload(dp, obj->orig_dev))
 		return -EOPNOTSUPP;
 
 	if (dsa_port_skip_vlan_configuration(dp))
@@ -450,7 +450,7 @@  static int dsa_slave_port_obj_del(struct net_device *dev,
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (obj->orig_dev != dev)
+		if (!dsa_port_can_offload(dp, obj->orig_dev))
 			return -EOPNOTSUPP;
 		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
@@ -1941,6 +1941,40 @@  static int dsa_slave_changeupper(struct net_device *dev,
 			dsa_port_bridge_leave(dp, info->upper_dev);
 			err = NOTIFY_OK;
 		}
+	} else if (netif_is_lag_master(info->upper_dev) &&
+		   dsa_lag_offloading(dp->ds->dst)) {
+		if (info->linking) {
+			err = dsa_port_lag_join(dp, info->upper_dev);
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_lag_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	}
+
+	return err;
+}
+
+static int
+dsa_slave_lag_changeupper(struct net_device *dev,
+			  struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+	struct dsa_port *dp;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		if (!dsa_slave_dev_check(lower))
+			continue;
+
+		dp = dsa_slave_to_port(lower);
+		if (!dsa_lag_offloading(dp->ds->dst))
+			break;
+
+		err = dsa_slave_changeupper(lower, info);
+		if (notifier_to_errno(err))
+			break;
 	}
 
 	return err;
@@ -2009,6 +2043,23 @@  dsa_slave_check_8021q_upper(struct net_device *dev,
 	return NOTIFY_DONE;
 }
 
+static int dsa_slave_check_lag_upper(struct net_device *dev)
+{
+	struct dsa_switch_tree *dst = dsa_slave_to_port(dev)->ds->dst;
+
+	/* No offload support: take no position on this upper. */
+	if (!dsa_lag_offloading(dst))
+		return NOTIFY_DONE;
+
+	/* Joining an already-tracked LAG never consumes a new entry. */
+	if (dsa_lag_by_dev(dst, dev))
+		return NOTIFY_OK;
+
+	if (dsa_lag_available(dst))
+		return NOTIFY_OK;
+	return notifier_from_errno(-EBUSY);
+}
+
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
 {
@@ -2035,13 +2086,33 @@  static int dsa_slave_netdevice_event(struct notifier_block *nb,
 
 		if (is_vlan_dev(info->upper_dev))
 			return dsa_slave_check_8021q_upper(dev, ptr);
+
+		if (netif_is_lag_master(info->upper_dev))
+			return dsa_slave_check_lag_upper(dev);
+
 		break;
 	}
 	case NETDEV_CHANGEUPPER:
+		if (dsa_slave_dev_check(dev))
+			return dsa_slave_changeupper(dev, ptr);
+
+		if (netif_is_lag_master(dev))
+			return dsa_slave_lag_changeupper(dev, ptr);
+
+		break;
+	case NETDEV_CHANGELOWERSTATE: {
+		struct netdev_notifier_changelowerstate_info *info = ptr;
+		struct dsa_port *dp;
+		int err;
+
 		if (!dsa_slave_dev_check(dev))
-			return NOTIFY_DONE;
+			break;
+
+		dp = dsa_slave_to_port(dev);
 
-		return dsa_slave_changeupper(dev, ptr);
+		err = dsa_port_lag_change(dp, info->lower_state_info);
+		return notifier_from_errno(err);
+	}
 	}
 
 	return NOTIFY_DONE;
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 3fb362b6874e..3e518df7cd1f 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -178,6 +178,46 @@  static int dsa_switch_fdb_del(struct dsa_switch *ds,
 	return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
 }
 
+static int dsa_switch_lag_change(struct dsa_switch *ds,
+				 struct dsa_notifier_lag_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_lag_change)
+		return ds->ops->port_lag_change(ds, info->port, info->info);
+	/* The port lives on another switch in the tree (cross-chip). */
+	if (ds->index != info->sw_index && ds->ops->crosschip_lag_change)
+		return ds->ops->crosschip_lag_change(ds, info->sw_index,
+						     info->port, info->lag,
+						     info->info);
+
+	return 0;	/* no matching handler: nothing to do */
+}
+
+static int dsa_switch_lag_join(struct dsa_switch *ds,
+			       struct dsa_notifier_lag_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_lag_join)
+		return ds->ops->port_lag_join(ds, info->port, info->lag);
+	/* The port lives on another switch in the tree (cross-chip). */
+	if (ds->index != info->sw_index && ds->ops->crosschip_lag_join)
+		return ds->ops->crosschip_lag_join(ds, info->sw_index,
+						   info->port, info->lag);
+
+	return 0;	/* no matching handler: nothing to do */
+}
+
+static int dsa_switch_lag_leave(struct dsa_switch *ds,
+				struct dsa_notifier_lag_info *info)
+{
+	if (ds->index == info->sw_index && ds->ops->port_lag_leave)
+		ds->ops->port_lag_leave(ds, info->port, info->lag);
+	/* Leave callbacks return void, so failures cannot propagate. */
+	if (ds->index != info->sw_index && ds->ops->crosschip_lag_leave)
+		ds->ops->crosschip_lag_leave(ds, info->sw_index,
+					     info->port, info->lag);
+
+	return 0;
+}
+
 static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port,
 				 struct dsa_notifier_mdb_info *info)
 {
@@ -325,6 +365,15 @@  static int dsa_switch_event(struct notifier_block *nb,
 	case DSA_NOTIFIER_FDB_DEL:
 		err = dsa_switch_fdb_del(ds, info);
 		break;
+	case DSA_NOTIFIER_LAG_CHANGE:
+		err = dsa_switch_lag_change(ds, info);
+		break;
+	case DSA_NOTIFIER_LAG_JOIN:
+		err = dsa_switch_lag_join(ds, info);
+		break;
+	case DSA_NOTIFIER_LAG_LEAVE:
+		err = dsa_switch_lag_leave(ds, info);
+		break;
 	case DSA_NOTIFIER_MDB_ADD:
 		err = dsa_switch_mdb_add(ds, info);
 		break;