diff mbox series

[net-next,5/7] net: marvell: prestera: add LAG support

Message ID 20210203165458.28717-6-vadym.kochan@plvision.eu (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Marvell Prestera Switchdev misc updates | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 2 maintainers not CCed: tchornyi@marvell.com vkochan@marvell.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Vadym Kochan Feb. 3, 2021, 4:54 p.m. UTC
From: Serhiy Boiko <serhiy.boiko@plvision.eu>

The following features are supported:

    - LAG basic operations
        - create/delete LAG
        - add/remove a member to LAG
        - enable/disable member in LAG
    - LAG Bridge support
    - LAG VLAN support
    - LAG FDB support

Limitations:

    - Only HASH lag tx type is supported
    - The Hash parameters are not configurable. They are applied
      during the LAG creation stage.
    - Enslaving a port to the LAG device that already has an
      upper device is not supported.

Co-developed-by: Andrii Savka <andrii.savka@plvision.eu>
Signed-off-by: Andrii Savka <andrii.savka@plvision.eu>
Signed-off-by: Serhiy Boiko <serhiy.boiko@plvision.eu>
Signed-off-by: Vadym Kochan <vadym.kochan@plvision.eu>
---
 .../net/ethernet/marvell/prestera/prestera.h  |  30 ++-
 .../ethernet/marvell/prestera/prestera_hw.c   | 180 ++++++++++++-
 .../ethernet/marvell/prestera/prestera_hw.h   |  14 +
 .../ethernet/marvell/prestera/prestera_main.c | 247 +++++++++++++++++-
 .../marvell/prestera/prestera_switchdev.c     | 109 ++++++--
 .../marvell/prestera/prestera_switchdev.h     |   4 +-
 6 files changed, 538 insertions(+), 46 deletions(-)

Comments

Jakub Kicinski Feb. 5, 2021, 5:16 a.m. UTC | #1
On Wed,  3 Feb 2021 18:54:56 +0200 Vadym Kochan wrote:
> From: Serhiy Boiko <serhiy.boiko@plvision.eu>
> 
> The following features are supported:
> 
>     - LAG basic operations
>         - create/delete LAG
>         - add/remove a member to LAG
>         - enable/disable member in LAG
>     - LAG Bridge support
>     - LAG VLAN support
>     - LAG FDB support
> 
> Limitations:
> 
>     - Only HASH lag tx type is supported
>     - The Hash parameters are not configurable. They are applied
>       during the LAG creation stage.
>     - Enslaving a port to the LAG device that already has an
>       upper device is not supported.

Tobias, Vladimir, you worked on LAG support recently, would you mind
taking a look at this one?
Vladimir Oltean Feb. 5, 2021, 3:24 p.m. UTC | #2
On Wed, Feb 03, 2021 at 06:54:56PM +0200, Vadym Kochan wrote:
> +static struct prestera_lag *prestera_lag_by_dev(struct prestera_switch *sw,
> +						struct net_device *dev)
> +{
> +	struct prestera_lag *lag;
> +	u16 id;
> +
> +	for (id = 0; id < sw->lag_max; id++) {
> +		lag = &sw->lags[id];
> +		if (lag->dev == dev)
> +			return lag;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct prestera_lag *prestera_lag_create(struct prestera_switch *sw,
> +						struct net_device *lag_dev)
> +{
> +	struct prestera_lag *lag;

You should initialize with NULL.

> +	u16 id;
> +
> +	for (id = 0; id < sw->lag_max; id++) {
> +		lag = &sw->lags[id];
> +		if (!lag->dev)
> +			break;
> +	}
> +	if (lag) {
> +		INIT_LIST_HEAD(&lag->members);
> +		lag->dev = lag_dev;
> +	}
> +
> +	return lag;
> +}
> +
> +static void prestera_lag_destroy(struct prestera_switch *sw,
> +				 struct prestera_lag *lag)
> +{
> +	WARN_ON(!list_empty(&lag->members));
> +	lag->member_count = 0;
> +	lag->dev = NULL;
> +}
> +
> +static int prestera_lag_port_add(struct prestera_port *port,
> +				 struct net_device *lag_dev)
> +{
> +	struct prestera_switch *sw = port->sw;
> +	struct prestera_lag *lag;
> +	int err;
> +
> +	lag = prestera_lag_by_dev(sw, lag_dev);
> +	if (!lag) {
> +		lag = prestera_lag_create(sw, lag_dev);
> +		if (!lag)
> +			return -ENOMEM;

I think ENOMEM is reserved for dynamic memory allocation. I think
-ENOSPC may be a better error code (here and everywhere else).
Maybe you would also like to propagate the netlink extack from the
changeupper event and say what went wrong?

> +	}
> +
> +	if (lag->member_count >= sw->lag_member_max)
> +		return -ENOMEM;
> +
> +	err = prestera_hw_lag_member_add(port, lag->lag_id);
> +	if (err) {
> +		if (!lag->member_count)
> +			prestera_lag_destroy(sw, lag);
> +		return err;
> +	}
> +
> +	list_add(&port->lag_member, &lag->members);
> +	lag->member_count++;
> +	port->lag = lag;
> +
> +	return 0;
> +}
> +
> +static int prestera_lag_port_del(struct prestera_port *port)
> +{
> +	struct prestera_switch *sw = port->sw;
> +	struct prestera_lag *lag = port->lag;
> +	int err;
> +
> +	if (!lag || !lag->member_count)
> +		return -EINVAL;
> +
> +	err = prestera_hw_lag_member_del(port, lag->lag_id);
> +	if (err)
> +		return err;
> +
> +	list_del(&port->lag_member);
> +	lag->member_count--;
> +	port->lag = NULL;
> +
> +	if (netif_is_bridge_port(lag->dev)) {
> +		struct netdev_notifier_changeupper_info br_info;
> +
> +		br_info.upper_dev = netdev_master_upper_dev_get(lag->dev);
> +		br_info.linking = false;
> +
> +		prestera_bridge_port_event(lag->dev, port->dev,
> +					   NETDEV_CHANGEUPPER, &br_info);
> +	}

I think it might be more intuitive to call prestera_port_bridge_leave
directly rather than simulate a notifier call.

> +
> +	if (!lag->member_count)
> +		prestera_lag_destroy(sw, lag);
> +
> +	return 0;
> +}
> +
> +bool prestera_port_is_lag_member(const struct prestera_port *port)
> +{
> +	return !!port->lag;
> +}
> +
> +u16 prestera_port_lag_id(const struct prestera_port *port)
> +{
> +	return port->lag->lag_id;
> +}
> +
> +static int prestera_lag_init(struct prestera_switch *sw)
> +{
> +	u16 id;
> +
> +	sw->lags = kcalloc(sw->lag_max, sizeof(*sw->lags), GFP_KERNEL);
> +	if (!sw->lags)
> +		return -ENOMEM;
> +
> +	for (id = 0; id < sw->lag_max; id++)
> +		sw->lags[id].lag_id = id;
> +
> +	return 0;
> +}
> +
> +static void prestera_lag_fini(struct prestera_switch *sw)
> +{
> +	u8 idx;
> +
> +	for (idx = 0; idx < sw->lag_max; idx++)
> +		WARN_ON(sw->lags[idx].member_count);
> +
> +	kfree(sw->lags);
> +}
> +
>  bool prestera_netdev_check(const struct net_device *dev)
>  {
>  	return dev->netdev_ops == &prestera_netdev_ops;
> @@ -507,19 +654,54 @@ struct prestera_port *prestera_port_dev_lower_find(struct net_device *dev)
>  	return port;
>  }
>  
> -static int prestera_netdev_port_event(struct net_device *dev,
> +static int prestera_netdev_port_lower_event(struct net_device *dev,
> +					    unsigned long event, void *ptr)
> +{
> +	struct netdev_notifier_changelowerstate_info *info = ptr;
> +	struct netdev_lag_lower_state_info *lower_state_info;
> +	struct prestera_port *port = netdev_priv(dev);
> +	bool enabled;
> +
> +	if (!netif_is_lag_port(dev))
> +		return 0;
> +	if (!prestera_port_is_lag_member(port))
> +		return 0;
> +
> +	lower_state_info = info->lower_state_info;
> +	enabled = lower_state_info->tx_enabled;

You also need to check lower_state_info->link_up; otherwise the ports
won't get rebalanced for bonding interfaces with "mode balance-xor
miimon 1" and such. There is also a comment in net/dsa/port.c with more
details.

> +
> +	return prestera_hw_lag_member_enable(port, port->lag->lag_id, enabled);
> +}
> +
> +static bool prestera_lag_master_check(struct net_device *lag_dev,
> +				      struct netdev_lag_upper_info *info,
> +				      struct netlink_ext_ack *ext_ack)
> +{
> +	if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
> +		NL_SET_ERR_MSG_MOD(ext_ack, "Unsupported LAG Tx type");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static int prestera_netdev_port_event(struct net_device *lower,
> +				      struct net_device *dev,
>  				      unsigned long event, void *ptr)
>  {
>  	struct netdev_notifier_changeupper_info *info = ptr;
> +	struct prestera_port *port = netdev_priv(dev);
>  	struct netlink_ext_ack *extack;
>  	struct net_device *upper;
> +	int err;
>  
>  	extack = netdev_notifier_info_to_extack(&info->info);
>  	upper = info->upper_dev;
>  
>  	switch (event) {
>  	case NETDEV_PRECHANGEUPPER:
> -		if (!netif_is_bridge_master(upper)) {
> +		if (!netif_is_bridge_master(upper) &&
> +		    !netif_is_lag_master(upper)) {

No 8021q uppers allowed on Marvell Prestera switch ports?

>  			NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
>  			return -EINVAL;
>  		}
> @@ -531,12 +713,60 @@ static int prestera_netdev_port_event(struct net_device *dev,
>  			NL_SET_ERR_MSG_MOD(extack, "Upper device is already enslaved");
>  			return -EINVAL;
>  		}
> +
> +		if (netif_is_lag_master(upper) &&
> +		    !prestera_lag_master_check(upper, info->upper_info, extack))
> +			return -EINVAL;

-EOPNOTSUPP maybe?
In DSA we had a discussion and agreed to do software fallback for
bonding modes that can't be offloaded, and just print an extack and
return 0. What is your take on that?

> +		if (netif_is_lag_master(upper) && vlan_uses_dev(dev)) {
> +			NL_SET_ERR_MSG_MOD(extack,
> +					   "Master device is a LAG master and port has a VLAN");
> +			return -EINVAL;
> +		}
> +		if (netif_is_lag_port(dev) && is_vlan_dev(upper) &&
> +		    !netif_is_lag_master(vlan_dev_real_dev(upper))) {
> +			NL_SET_ERR_MSG_MOD(extack,
> +					   "Can not put a VLAN on a LAG port");
> +			return -EINVAL;
> +		}
>  		break;
>  
>  	case NETDEV_CHANGEUPPER:
>  		if (netif_is_bridge_master(upper))
> -			return prestera_bridge_port_event(dev, event, ptr);
> +			return prestera_bridge_port_event(lower, dev, event,
> +							  ptr);
> +
> +		if (netif_is_lag_master(upper)) {
> +			if (info->linking) {
> +				err = prestera_lag_port_add(port, upper);
> +				if (err)
> +					return err;
> +			} else {
> +				prestera_lag_port_del(port);
> +			}
> +		}
>  		break;
> +
> +	case NETDEV_CHANGELOWERSTATE:
> +		return prestera_netdev_port_lower_event(dev, event, ptr);
> +	}
> +
> +	return 0;
> +}
> +
> +static int prestera_netdevice_lag_event(struct net_device *lag_dev,
> +					unsigned long event, void *ptr)
> +{
> +	struct net_device *dev;
> +	struct list_head *iter;
> +	int err;
> +
> +	netdev_for_each_lower_dev(lag_dev, dev, iter) {
> +		if (prestera_netdev_check(dev)) {
> +			err = prestera_netdev_port_event(lag_dev, dev, event,
> +							 ptr);
> +			if (err)
> +				return err;
> +		}
>  	}
>  
>  	return 0;
> @@ -549,7 +779,9 @@ static int prestera_netdev_event_handler(struct notifier_block *nb,
>  	int err = 0;
>  
>  	if (prestera_netdev_check(dev))
> -		err = prestera_netdev_port_event(dev, event, ptr);
> +		err = prestera_netdev_port_event(dev, dev, event, ptr);
> +	else if (netif_is_lag_master(dev))
> +		err = prestera_netdevice_lag_event(dev, event, ptr);
>  
>  	return notifier_from_errno(err);
>  }
> @@ -603,6 +835,10 @@ static int prestera_switch_init(struct prestera_switch *sw)
>  	if (err)
>  		goto err_dl_register;
>  
> +	err = prestera_lag_init(sw);
> +	if (err)
> +		goto err_lag_init;
> +
>  	err = prestera_create_ports(sw);
>  	if (err)
>  		goto err_ports_create;
> @@ -610,6 +846,8 @@ static int prestera_switch_init(struct prestera_switch *sw)
>  	return 0;
>  
>  err_ports_create:
> +	prestera_lag_fini(sw);
> +err_lag_init:
>  	prestera_devlink_unregister(sw);
>  err_dl_register:
>  	prestera_event_handlers_unregister(sw);
> @@ -627,6 +865,7 @@ static int prestera_switch_init(struct prestera_switch *sw)
>  static void prestera_switch_fini(struct prestera_switch *sw)
>  {
>  	prestera_destroy_ports(sw);
> +	prestera_lag_fini(sw);
>  	prestera_devlink_unregister(sw);
>  	prestera_event_handlers_unregister(sw);
>  	prestera_rxtx_switch_fini(sw);
> diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
> index 7736d5f498c9..3750c66a550b 100644
> --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
> +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
> @@ -180,6 +180,45 @@ prestera_port_vlan_create(struct prestera_port *port, u16 vid, bool untagged)
>  	return ERR_PTR(err);
>  }
>  
> +static int prestera_fdb_add(struct prestera_port *port,
> +			    const unsigned char *mac, u16 vid, bool dynamic)
> +{
> +	if (prestera_port_is_lag_member(port))
> +		return prestera_hw_lag_fdb_add(port->sw, prestera_port_lag_id(port),
> +					      mac, vid, dynamic);
> +	else
> +		return prestera_hw_fdb_add(port, mac, vid, dynamic);
> +}

I think checkpatch tells you that "else" after "return" is not really
necessary.

> +
> +static int prestera_fdb_del(struct prestera_port *port,
> +			    const unsigned char *mac, u16 vid)
> +{
> +	if (prestera_port_is_lag_member(port))
> +		return prestera_hw_lag_fdb_del(port->sw, prestera_port_lag_id(port),
> +					      mac, vid);
> +	else
> +		return prestera_hw_fdb_del(port, mac, vid);
> +}
> +
> +static int prestera_fdb_flush_port_vlan(struct prestera_port *port, u16 vid,
> +					u32 mode)
> +{
> +	if (prestera_port_is_lag_member(port))
> +		return prestera_hw_fdb_flush_lag_vlan(port->sw, prestera_port_lag_id(port),
> +						      vid, mode);
> +	else
> +		return prestera_hw_fdb_flush_port_vlan(port, vid, mode);
> +}
> +
> +static int prestera_fdb_flush_port(struct prestera_port *port, u32 mode)
> +{
> +	if (prestera_port_is_lag_member(port))
> +		return prestera_hw_fdb_flush_lag(port->sw, prestera_port_lag_id(port),
> +						 mode);
> +	else
> +		return prestera_hw_fdb_flush_port(port, mode);
> +}
> +
>  static void
>  prestera_port_vlan_bridge_leave(struct prestera_port_vlan *port_vlan)
>  {
> @@ -199,11 +238,11 @@ prestera_port_vlan_bridge_leave(struct prestera_port_vlan *port_vlan)
>  	last_port = port_count == 1;
>  
>  	if (last_vlan)
> -		prestera_hw_fdb_flush_port(port, fdb_flush_mode);
> +		prestera_fdb_flush_port(port, fdb_flush_mode);
>  	else if (last_port)
>  		prestera_hw_fdb_flush_vlan(port->sw, vid, fdb_flush_mode);
>  	else
> -		prestera_hw_fdb_flush_port_vlan(port, vid, fdb_flush_mode);
> +		prestera_fdb_flush_port_vlan(port, vid, fdb_flush_mode);
>  
>  	list_del(&port_vlan->br_vlan_head);
>  	prestera_bridge_vlan_put(br_vlan);
> @@ -394,9 +433,9 @@ prestera_bridge_port_add(struct prestera_bridge *bridge, struct net_device *dev)
>  }
>  
>  static int
> -prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port)
> +prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port,
> +			     struct prestera_port *port)
>  {
> -	struct prestera_port *port = netdev_priv(br_port->dev);
>  	struct prestera_bridge *bridge = br_port->bridge;
>  	int err;
>  
> @@ -423,6 +462,7 @@ prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port)
>  }
>  
>  static int prestera_port_bridge_join(struct prestera_port *port,
> +				     struct net_device *lower,
>  				     struct net_device *upper)
>  {
>  	struct prestera_switchdev *swdev = port->sw->swdev;
> @@ -437,7 +477,7 @@ static int prestera_port_bridge_join(struct prestera_port *port,
>  			return PTR_ERR(bridge);
>  	}
>  
> -	br_port = prestera_bridge_port_add(bridge, port->dev);
> +	br_port = prestera_bridge_port_add(bridge, lower);
>  	if (IS_ERR(br_port)) {
>  		err = PTR_ERR(br_port);
>  		goto err_brport_create;
> @@ -446,7 +486,7 @@ static int prestera_port_bridge_join(struct prestera_port *port,
>  	if (bridge->vlan_enabled)
>  		return 0;
>  
> -	err = prestera_bridge_1d_port_join(br_port);
> +	err = prestera_bridge_1d_port_join(br_port, port);
>  	if (err)
>  		goto err_port_join;
>  
> @@ -459,19 +499,17 @@ static int prestera_port_bridge_join(struct prestera_port *port,
>  	return err;
>  }
>  
> -static void prestera_bridge_1q_port_leave(struct prestera_bridge_port *br_port)
> +static void prestera_bridge_1q_port_leave(struct prestera_bridge_port *br_port,
> +					  struct prestera_port *port)
>  {
> -	struct prestera_port *port = netdev_priv(br_port->dev);
> -
> -	prestera_hw_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
> +	prestera_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
>  	prestera_port_pvid_set(port, PRESTERA_DEFAULT_VID);
>  }
>  
> -static void prestera_bridge_1d_port_leave(struct prestera_bridge_port *br_port)
> +static void prestera_bridge_1d_port_leave(struct prestera_bridge_port *br_port,
> +					  struct prestera_port *port)
>  {
> -	struct prestera_port *port = netdev_priv(br_port->dev);
> -
> -	prestera_hw_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
> +	prestera_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
>  	prestera_hw_bridge_port_delete(port, br_port->bridge->bridge_id);
>  }
>  
> @@ -506,6 +544,7 @@ static int prestera_port_vid_stp_set(struct prestera_port *port, u16 vid,
>  }
>  
>  static void prestera_port_bridge_leave(struct prestera_port *port,
> +				       struct net_device *lower,
>  				       struct net_device *upper)
>  {
>  	struct prestera_switchdev *swdev = port->sw->swdev;
> @@ -516,16 +555,16 @@ static void prestera_port_bridge_leave(struct prestera_port *port,
>  	if (!bridge)
>  		return;
>  
> -	br_port = __prestera_bridge_port_by_dev(bridge, port->dev);
> +	br_port = __prestera_bridge_port_by_dev(bridge, lower);
>  	if (!br_port)
>  		return;
>  
>  	bridge = br_port->bridge;
>  
>  	if (bridge->vlan_enabled)
> -		prestera_bridge_1q_port_leave(br_port);
> +		prestera_bridge_1q_port_leave(br_port, port);
>  	else
> -		prestera_bridge_1d_port_leave(br_port);
> +		prestera_bridge_1d_port_leave(br_port, port);
>  
>  	prestera_hw_port_learning_set(port, false);
>  	prestera_hw_port_flood_set(port, false);
> @@ -533,8 +572,8 @@ static void prestera_port_bridge_leave(struct prestera_port *port,
>  	prestera_bridge_port_put(br_port);
>  }
>  
> -int prestera_bridge_port_event(struct net_device *dev, unsigned long event,
> -			       void *ptr)
> +int prestera_bridge_port_event(struct net_device *lower, struct net_device *dev,
> +			       unsigned long event, void *ptr)

It's odd that you have a net_device lower and a net_device dev.
You're only using "dev" to retrieve the struct prestera_port, so can't
you just pass that as a parameter? It would also help avoid possible
future mix-ups between lower (which can be a LAG or a port, and is
associated with a struct prestera_bridge_port) and dev (which is only a
port, and is associated with a struct prestera_port).
Tobias Waldekranz Feb. 8, 2021, 7:54 p.m. UTC | #3
On Thu, Feb 04, 2021 at 21:16, Jakub Kicinski <kuba@kernel.org> wrote:
> On Wed,  3 Feb 2021 18:54:56 +0200 Vadym Kochan wrote:
>> From: Serhiy Boiko <serhiy.boiko@plvision.eu>
>> 
>> The following features are supported:
>> 
>>     - LAG basic operations
>>         - create/delete LAG
>>         - add/remove a member to LAG
>>         - enable/disable member in LAG
>>     - LAG Bridge support
>>     - LAG VLAN support
>>     - LAG FDB support
>> 
>> Limitations:
>> 
>>     - Only HASH lag tx type is supported
>>     - The Hash parameters are not configurable. They are applied
>>       during the LAG creation stage.
>>     - Enslaving a port to the LAG device that already has an
>>       upper device is not supported.
>
> Tobias, Vladimir, you worked on LAG support recently, would you mind
> taking a look at this one?

Hi Jakub,

I took a quick look at it, and what I found left me very puzzled. I hope
you do not mind me asking a generic question about the policy around
switchdev drivers. If someone published a driver using something similar
to the following configuration flow:

iproute2  daemon(SDK)
   |        ^    |
   :        :    : user/kernel boundary
   v        |    |
netlink     |    |
   |        |    |
   v        |    |
 driver     |    |
   |        |    |
   '--------'    |
                 : kernel/hardware boundary
                 v
                ASIC

My guess is that they would be (rightly IMO) told something along the
lines of "we do not accept drivers that are just shims for proprietary
SDKs".

But it seems like if that same someone has enough area to spare in their
ASIC to embed a CPU, it is perfectly fine to run that same SDK on it,
call it "firmware", and then push a shim driver into the kernel tree.

iproute2
   |
   :               user/kernel boundary
   v
netlink
   |
   v
 driver
   |
   |
   :               kernel/hardware boundary
   '-------------.
                 v
             daemon(SDK)
                 |
                 v
                ASIC

What have we, the community, gained by this? In the old world, the
vendor usually at least had to ship me the SDK in source form. Having
seen the inside of some of those sausage factories, they are not the
kinds of code bases that I want at the bottom of my stack; even less so
in binary form where I am entirely at the vendor's mercy for bugfixes.

We are talking about a pure Ethernet fabric here, so there is no fig
leaf of "regulatory requirements" to hide behind, in contrast to WiFi
for example.

Is it the opinion of the netdev community that it is OK for vendors to
use this model?
Jakub Kicinski Feb. 8, 2021, 9:05 p.m. UTC | #4
On Mon, 08 Feb 2021 20:54:29 +0100 Tobias Waldekranz wrote:
> On Thu, Feb 04, 2021 at 21:16, Jakub Kicinski <kuba@kernel.org> wrote:
> > On Wed,  3 Feb 2021 18:54:56 +0200 Vadym Kochan wrote:  
> >> From: Serhiy Boiko <serhiy.boiko@plvision.eu>
> >> 
> >> The following features are supported:
> >> 
> >>     - LAG basic operations
> >>         - create/delete LAG
> >>         - add/remove a member to LAG
> >>         - enable/disable member in LAG
> >>     - LAG Bridge support
> >>     - LAG VLAN support
> >>     - LAG FDB support
> >> 
> >> Limitations:
> >> 
> >>     - Only HASH lag tx type is supported
> >>     - The Hash parameters are not configurable. They are applied
> >>       during the LAG creation stage.
> >>     - Enslaving a port to the LAG device that already has an
> >>       upper device is not supported.  
> >
> > Tobias, Vladimir, you worked on LAG support recently, would you mind
> > taking a look at this one?  
> 
> I took a quick look at it, and what I found left me very puzzled. I hope
> you do not mind me asking a generic question about the policy around
> switchdev drivers. If someone published a driver using something similar
> to the following configuration flow:
> 
> iproute2  daemon(SDK)
>    |        ^    |
>    :        :    : user/kernel boundary
>    v        |    |
> netlink     |    |
>    |        |    |
>    v        |    |
>  driver     |    |
>    |        |    |
>    '--------'    |
>                  : kernel/hardware boundary
>                  v
>                 ASIC
> 
> My guess is that they would be (rightly IMO) told something along the
> lines of "we do not accept drivers that are just shims for proprietary
> SDKs".
> 
> But it seems like if that same someone has enough area to spare in their
> ASIC to embed a CPU, it is perfectly fine to run that same SDK on it,
> call it "firmware", and then push a shim driver into the kernel tree.
> 
> iproute2
>    |
>    :               user/kernel boundary
>    v
> netlink
>    |
>    v
>  driver
>    |
>    |
>    :               kernel/hardware boundary
>    '-------------.
>                  v
>              daemon(SDK)
>                  |
>                  v
>                 ASIC
> 
> What have we, the community, gained by this? In the old world, the
> vendor usually at least had to ship me the SDK in source form. Having
> seen the inside of some of those sausage factories, they are not the
> kinds of code bases that I want at the bottom of my stack; even less so
> in binary form where I am entirely at the vendor's mercy for bugfixes.
> 
> We are talking about a pure Ethernet fabric here, so there is no fig
> leaf of "regulatory requirements" to hide behind, in contrast to WiFi
> for example.
> 
> Is it the opinion of the netdev community that it is OK for vendors to
> use this model?

I ask myself that question pretty much every day. Sadly I have no clear
answer.

Silicon is cheap, you can embed a reasonable ARM or Risc-V core in the
chip for the area and power draw comparable to one high speed serdes
lane.

The drivers landing in the kernel are increasingly meaningless. My day
job is working for a hyperscaler. Even though we have one of the most
capable kernel teams on the planet, most of the issues with HW we face
result in "something is wrong with the FW, let's call the vendor".

And even when I say "drivers landing" it is an overstatement.
If you look at high speed anything these days the drivers cover
multiple generations of hardware, seems like ~5 years ago most
NIC vendors reached sufficient FW saturation to cover up differences
between HW generations.

At the same time some FW is necessary. Certain chip functions are
best driven by a micro-controller running a tight control loop.
The complexity of FW is a spectrum, from basic to Qualcomm. 
The problem is there is no way for us to know what FW is hiding
by just looking at the driver.

Where do we draw the line? 

Personally I'd really like to see us pushing back stronger.
Andrew Lunn Feb. 8, 2021, 10:30 p.m. UTC | #5
> > I took a quick look at it, and what I found left me very puzzled. I hope
> > you do not mind me asking a generic question about the policy around
> > switchdev drivers. If someone published a driver using something similar
> > to the following configuration flow:
> > 
> > iproute2  daemon(SDK)
> >    |        ^    |
> >    :        :    : user/kernel boundary
> >    v        |    |
> > netlink     |    |
> >    |        |    |
> >    v        |    |
> >  driver     |    |
> >    |        |    |
> >    '--------'    |
> >                  : kernel/hardware boundary
> >                  v
> >                 ASIC
> > 
> > My guess is that they would be (rightly IMO) told something along the
> > lines of "we do not accept drivers that are just shims for proprietary
> > SDKs".
> > 
> > But it seems like if that same someone has enough area to spare in their
> > ASIC to embed a CPU, it is perfectly fine to run that same SDK on it,
> > call it "firmware", and then push a shim driver into the kernel tree.
> > 
> > iproute2
> >    |
> >    :               user/kernel boundary
> >    v
> > netlink
> >    |
> >    v
> >  driver
> >    |
> >    |
> >    :               kernel/hardware boundary
> >    '-------------.
> >                  v
> >              daemon(SDK)
> >                  |
> >                  v
> >                 ASIC
> > 
> > What have we, the community, gained by this? In the old world, the
> > vendor usually at least had to ship me the SDK in source form. Having
> > seen the inside of some of those sausage factories, they are not the
> > kinds of code bases that I want at the bottom of my stack; even less so
> > in binary form where I am entirely at the vendor's mercy for bugfixes.
> > 
> > We are talking about a pure Ethernet fabric here, so there is no fig
> > leaf of "regulatory requirements" to hide behind, in contrast to WiFi
> > for example.
> > 
> > Is it the opinion of the netdev community that it is OK for vendors to
> > use this model?

What I find interesting is the comparison between Microchip Sparx5 and
Marvell Prestera. They offer similar capabilities. Both have a CPU on
them. As you say, Marvell is pushing their SDK into this CPU as a black
box. Microchip decided to open everything: no firmware, the kernel
driver is directly accessing the hardware, the datasheet is available,
and Microchip engineers are here on the list.

I really hope that Sparx5 takes off, and displaces Prestera. In terms
of being able to solve issues, we the community can work with
Sparx5. Prestera is too much a black box.

	Andrew
Tobias Waldekranz Feb. 9, 2021, 11:56 a.m. UTC | #6
On Mon, Feb 08, 2021 at 13:05, Jakub Kicinski <kuba@kernel.org> wrote:
> On Mon, 08 Feb 2021 20:54:29 +0100 Tobias Waldekranz wrote:
>> On Thu, Feb 04, 2021 at 21:16, Jakub Kicinski <kuba@kernel.org> wrote:
>> > On Wed,  3 Feb 2021 18:54:56 +0200 Vadym Kochan wrote:  
>> >> From: Serhiy Boiko <serhiy.boiko@plvision.eu>
>> >> 
>> >> The following features are supported:
>> >> 
>> >>     - LAG basic operations
>> >>         - create/delete LAG
>> >>         - add/remove a member to LAG
>> >>         - enable/disable member in LAG
>> >>     - LAG Bridge support
>> >>     - LAG VLAN support
>> >>     - LAG FDB support
>> >> 
>> >> Limitations:
>> >> 
>> >>     - Only HASH lag tx type is supported
>> >>     - The Hash parameters are not configurable. They are applied
>> >>       during the LAG creation stage.
>> >>     - Enslaving a port to the LAG device that already has an
>> >>       upper device is not supported.  
>> >
>> > Tobias, Vladimir, you worked on LAG support recently, would you mind
>> > taking a look at this one?  
>> 
>> I took a quick look at it, and what I found left me very puzzled. I hope
>> you do not mind me asking a generic question about the policy around
>> switchdev drivers. If someone published a driver using something similar
>> to the following configuration flow:
>> 
>> iproute2  daemon(SDK)
>>    |        ^    |
>>    :        :    : user/kernel boundary
>>    v        |    |
>> netlink     |    |
>>    |        |    |
>>    v        |    |
>>  driver     |    |
>>    |        |    |
>>    '--------'    |
>>                  : kernel/hardware boundary
>>                  v
>>                 ASIC
>> 
>> My guess is that they would be (rightly IMO) told something along the
>> lines of "we do not accept drivers that are just shims for proprietary
>> SDKs".
>> 
>> But it seems like if that same someone has enough area to spare in their
>> ASIC to embed a CPU, it is perfectly fine to run that same SDK on it,
>> call it "firmware", and then push a shim driver into the kernel tree.
>> 
>> iproute2
>>    |
>>    :               user/kernel boundary
>>    v
>> netlink
>>    |
>>    v
>>  driver
>>    |
>>    |
>>    :               kernel/hardware boundary
>>    '-------------.
>>                  v
>>              daemon(SDK)
>>                  |
>>                  v
>>                 ASIC
>> 
>> What have we, the community, gained by this? In the old world, the
>> vendor usually at least had to ship me the SDK in source form. Having
>> seen the inside of some of those sausage factories, they are not the
>> kinds of code bases that I want at the bottom of my stack; even less so
>> in binary form where I am entirely at the vendor's mercy for bugfixes.
>> 
>> We are talking about a pure Ethernet fabric here, so there is no fig
>> leaf of "regulatory requirements" to hide behind, in contrast to WiFi
>> for example.
>> 
>> Is it the opinion of the netdev community that it is OK for vendors to
>> use this model?
>
> I ask myself that question pretty much every day. Sadly I have no clear
> answer.

Thank you for your candid answer, really appreciate it. I do not envy
you one bit, making those decisions must be extremely hard.

> Silicon is cheap, you can embed a reasonable ARM or Risc-V core in the
> chip for the area and power draw comparable to one high speed serdes
> lane.
>
> The drivers landing in the kernel are increasingly meaningless. My day
> job is working for a hyperscaler. Even though we have one of the most
> capable kernel teams on the planet most of issues with HW we face
> result in "something is wrong with the FW, let's call the vendor".

Right, and being a hyperscaler probably at least gets you some attention
when you call your vendor. My day job is working for a nanoscaler, so my
experience is that we must be prepared to solve all issues in-house; if
we get any help from the vendor that is just a bonus.

> And even when I say "drivers landing" it is an overstatement.
> If you look at high speed anything these days the drivers cover
> multiple generations of hardware, seems like ~5 years ago most
> NIC vendors reached sufficient FW saturation to cover up differences
> between HW generations.
>
> At the same time some FW is necessary. Certain chip functions, are 
> best driven by a micro-controller running a tight control loop. 

I agree. But I still do not understand why vendors cling to the source
of these like it was their wallet. That is the beauty of selling
silicon; you can fully leverage OSS and still have a very straight
forward business model.

> The complexity of FW is a spectrum, from basic to Qualcomm. 
> The problem is there is no way for us to know what FW is hiding
> by just looking at the driver.
>
> Where do we draw the line? 

Yeah it is a very hard problem. In this particular case though, the
vendor explicitly said that what they have done is compiled their
existing SDK to run on the ASIC:

https://lore.kernel.org/netdev/BN6PR18MB1587EB225C6B80BF35A44EBFBA5A0@BN6PR18MB1587.namprd18.prod.outlook.com

So there is no reason that it could not be done as a proper driver.

> Personally I'd really like to see us pushing back stronger.

Hear, hear!
Tobias Waldekranz Feb. 9, 2021, 12:37 p.m. UTC | #7
On Mon, Feb 08, 2021 at 23:30, Andrew Lunn <andrew@lunn.ch> wrote:
>> > I took a quick look at it, and what I found left me very puzzled. I hope
>> > you do not mind me asking a generic question about the policy around
>> > switchdev drivers. If someone published a driver using something similar
>> > to the following configuration flow:
>> > 
>> > iproute2  daemon(SDK)
>> >    |        ^    |
>> >    :        :    : user/kernel boundary
>> >    v        |    |
>> > netlink     |    |
>> >    |        |    |
>> >    v        |    |
>> >  driver     |    |
>> >    |        |    |
>> >    '--------'    |
>> >                  : kernel/hardware boundary
>> >                  v
>> >                 ASIC
>> > 
>> > My guess is that they would be (rightly IMO) told something along the
>> > lines of "we do not accept drivers that are just shims for proprietary
>> > SDKs".
>> > 
>> > But it seems like if that same someone has enough area to spare in their
>> > ASIC to embed a CPU, it is perfectly fine to run that same SDK on it,
>> > call it "firmware", and then push a shim driver into the kernel tree.
>> > 
>> > iproute2
>> >    |
>> >    :               user/kernel boundary
>> >    v
>> > netlink
>> >    |
>> >    v
>> >  driver
>> >    |
>> >    |
>> >    :               kernel/hardware boundary
>> >    '-------------.
>> >                  v
>> >              daemon(SDK)
>> >                  |
>> >                  v
>> >                 ASIC
>> > 
>> > What have we, the community, gained by this? In the old world, the
>> > vendor usually at least had to ship me the SDK in source form. Having
>> > seen the inside of some of those sausage factories, they are not the
>> > kinds of code bases that I want at the bottom of my stack; even less so
>> > in binary form where I am entirely at the vendor's mercy for bugfixes.
>> > 
>> > We are talking about a pure Ethernet fabric here, so there is no fig
>> > leaf of "regulatory requirements" to hide behind, in contrast to WiFi
>> > for example.
>> > 
>> > Is it the opinion of the netdev community that it is OK for vendors to
>> > use this model?
>
> What i find interesting is the comparison between Microchip Sparx5 and
> Marvell Prestera. They offer similar capabilities. Both have a CPU on
> them. As you say Marvell is pushing their SDK into this CPU, black
> box. Microchip decided to open everything, no firmware, the kernel
> driver is directly accessing the hardware, the datasheet is available,
> and microchip engineers are here on the list.

Indeed, it is a very stark difference in approach. Perhaps a silly
example, but it speaks to their developer focus, just the fact that they
have an online register reference on GitHub[1] amazed me. What a breath
of fresh air! ...and speaks to the general state of things, I guess :)

Unsurprisingly the team behind it are also really great to work with!

> I really hope that Sparx5 takes off, and displaces Prestera. In terms

We are certainly keeping our eyes on it!

> of being able to solve issues, we the community can work with
> Sparx5. Prestera is too much a black box.

I would only add that I still, perhaps naively, hope Marvell will
eventually see the benefits of having a truly open driver.

> 	Andrew

[1]: https://microchip-ung.github.io/sparx-5_reginfo/reginfo_sparx-5.html
Andrew Lunn Feb. 9, 2021, 1:58 p.m. UTC | #8
> At the same time some FW is necessary. Certain chip functions, are 
> best driven by a micro-controller running a tight control loop. 

For a smart NIC, i could agree. But a switch? The data path is in
hardware. The driver is all about configuring this hardware, and then
it is idle. Polls the PHYs once a second, maybe gather statistics,
allows the network stack to perform STP, but otherwise it does
nothing.

So for me, i don't see that being a valid argument for this driver.

By putting their SDK inside the CPU on the switch, and adding an RPC
interface, Marvell can quickly get some sort of support working in the
Linux ecosystem. But this solution has all the problems of a binary
blob in userspace.

I doubt there is going to be any community engagement with this
driver. Marvell is going to have to add all the features. If a user
wants a feature which is not currently supported, they have little
chance of being able to add it themselves. There is no documentation
of the RPC interface. So even if the firmware has support for more
than what the Linux driver implements, only Marvell knows about it.

Products based around this driver are going to find it hard to
differentiate on switch features. The switch can do what Marvell
allows you to do. All differentiation is going to be limited to above
that, the user interface.

For some market segments, that might be enough. You don't see
community based patches adding new features to the Mellanox/nvidia
hardware. But when you look at the DSA drivers, a lot of the features
there are from the community. There is probably space for both.

Looking into my crystal ball, Marvell will probably have the base
features of their switch implemented before Microchip does, simply
because they are reusing code hidden away in the CPU. But then
development will stagnate. Microchip will take a bit longer to get the
base features implemented. But then because of the openness, users
will start using the hardware in different ways, and implement
features which are important to them. And contribute bug fixes. The
driver will keep gaining new features and mature, and in the end, the
device built from it will be a lot more diverse and interesting.

What i'm not sure is how we as a community push back. Marvell's whole
strategy is black box. I doubt we can make them open up the firmware.
Do we want to throw out the driver from the kernel? I don't think it
is that bad. We can point out the problems with Marvell's model. We
can put in review effort for Microchip, make their driver better. And
we can encourage the 3rd and 4th vendors in the enterprise switch
space to follow Microchip's lead.

      Andrew
Jakub Kicinski Feb. 9, 2021, 5:35 p.m. UTC | #9
On Tue, 9 Feb 2021 14:58:26 +0100 Andrew Lunn wrote:
> > At the same time some FW is necessary. Certain chip functions, are 
> > best driven by a micro-controller running a tight control loop.   
> 
> For a smart NIC, i could agree. But a switch? The data path is in
> hardware. The driver is all about configuring this hardware, and then
> it is idle. Polls the PHYs once a second, maybe gather statistics,
> allows the network stack to perform STP, but otherwise it does
> nothing.
> 
> So for me, i don't see that being a valid argument for this driver.
> 
> By putting their SDK inside the CPU on the switch, and adding an RPC
> interface, Marvell can quickly get some sort of support working in the
> Linux ecosystem. But this solution has all the problems of a binary
> blob in userspace.
> 
> I doubt there is going to be any community engagement with this
> driver. Marvell is going to have to add all the features. If a user
> wants a feature which is not currently supported, they have little
> chance of being able to add it themselves. There is no documentation
> of the RPC interface. So even if the firmware has support for more
> than what the Linux driver implements, only Marvell knows about it.
> 
> Products based around this driver are going to find it hard to
> differentiate on switch features. The switch can do what Marvell
> allows you to do. All differentiation is going to be limited to above
> that, the user interface.
> 
> For some market segments, that might be enough. You don't see
> community based patches adding new features to the Mellanox/nvidia
> hardware. But when you look at the DSA drivers, a lot of the features
> there are from the community. There is probably space for both.
> 
> Looking into my crystal ball, Marvell will probably have the base
> features of their switch implemented before Microchip does, simply
> because they are reusing code hidden away in the CPU. But then
> development will stagnate. Microchip will take a bit longer to get the
> base features implemented. But then because of the openness, users
> will start using the hardware in different ways, and implement
> features which are important to them. And contribute bug fixes. The
> driver will keep gaining new features and mature, and in the end, the
> device built from it will be a lot more diverse and interesting.
> 
> What i'm not sure is how we as a community push back. Marvells whole
> strategy is black box. I doubt we can make them open up the firmware.
> Do we want to throw out the driver from the kernel? I don't think it
> is that bad. We can point out the problems with Marvell's model. We
> can put in review effort for Microchip, make their driver better. And
> we can encourage the 3rd and 4th vendors in the enterprise switch
> space to follow Microchips lead.

Sounds like we have 3 people who don't like FW-heavy designs dominating
the kernel - this conversation can only go one way. 

Marvell, Plvision anything to share? AFAIU the values of Linux kernel
are open source, healthy community, empowering users. With the SDK on
the embedded CPU your driver does not seem to tick any of these boxes.
Jakub Kicinski Feb. 9, 2021, 5:48 p.m. UTC | #10
On Tue, 09 Feb 2021 12:56:55 +0100 Tobias Waldekranz wrote:
> > I ask myself that question pretty much every day. Sadly I have no clear
> > answer.  
> 
> Thank you for your candid answer, really appreciate it. I do not envy
> you one bit, making those decisions must be extremely hard.
> 
> > Silicon is cheap, you can embed a reasonable ARM or Risc-V core in the
> > chip for the area and power draw comparable to one high speed serdes
> > lane.
> >
> > The drivers landing in the kernel are increasingly meaningless. My day
> > job is working for a hyperscaler. Even though we have one of the most
> > capable kernel teams on the planet most of issues with HW we face
> > result in "something is wrong with the FW, let's call the vendor".  
> 
> Right, and being a hyperscaler probably at least gets you some attention
> when you call your vendor. My day job is working for a nanoscaler, so my
> experience is that we must be prepared to solve all issues in-house; if
> we get any help from the vendor that is just a bonus.
> 
> > And even when I say "drivers landing" it is an overstatement.
> > If you look at high speed anything these days the drivers cover
> > multiple generations of hardware, seems like ~5 years ago most
> > NIC vendors reached sufficient FW saturation to cover up differences
> > between HW generations.
> >
> > At the same time some FW is necessary. Certain chip functions, are 
> > best driven by a micro-controller running a tight control loop.   
> 
> I agree. But I still do not understand why vendors cling to the source
> of these like it was their wallet. That is the beauty of selling
> silicon; you can fully leverage OSS and still have a very straight
> forward business model.

Vendors want to be able to "add value", lock users in and sell support.
Users adding features themselves hurts their bottom line. Take a look
at income breakdown for publicly traded companies. There were also
rumors recently about certain huge silicon vendor revoking the SDK
license from a NOS company after that company got bought.

Business people make rational choices, trust me. It's on us to make
rational choices in the interest of the community (incl. our users).

> > The complexity of FW is a spectrum, from basic to Qualcomm. 
> > The problem is there is no way for us to know what FW is hiding
> > by just looking at the driver.
> >
> > Where do we draw the line?   
> 
> Yeah it is a very hard problem. In this particular case though, the
> vendor explicitly said that what they have done is compiled their
> existing SDK to run on the ASIC:
> 
> https://lore.kernel.org/netdev/BN6PR18MB1587EB225C6B80BF35A44EBFBA5A0@BN6PR18MB1587.namprd18.prod.outlook.com
> 
> So there is no reason that it could not be done as a proper driver.

I guess you meant "no _technical_ reason" ;)

> > Personally I'd really like to see us pushing back stronger.  
> 
> Hear, hear!
Mickey Rachamim Feb. 9, 2021, 8:31 p.m. UTC | #11
Hi Andrew, Jakub, Tobias,

On Tuesday, February 9, 2021 7:35 PM Jakub Kicinski wrote:
> Sounds like we have 3 people who don't like FW-heavy designs dominating the kernel - this conversation can only go one way. 
> Marvell, Plvision anything to share? AFAIU the values of Linux kernel are open source, healthy community, empowering users. With the SDK on the embedded CPU your driver does not seem to tick any of these boxes.

I'll try to share Marvell's insight and plans regarding our Prestera drivers;
 
We do understand the importance and the vision behind the open-source community - while being committed to quality, functionality and the developers/end-users.

We started working on the Prestera driver in Q2 2019. It took us more than a year to get the first approved driver into 5.10, and we just started.
Right at the beginning - we implemented PP function into the Kernel driver like the SDMA operation (This is the RX/TX DMA engine). 
Yet, the FW itself - is an SW package that supports many Marvell Prestera Switching families of devices - this is a significant SW package that will take many working years to adapt to the Kernel environment.
We do plan to port more and more PP functions as Kernel drivers along the way.
 
We also are working with the community to extend Kernel functionality with a new feature beneficial to all Kernel users (e.g. Devlink changes) and we will continue to do it.
By extending the Prestera driver to in-kernel implementation with more PP features - we will simplify the FW logic and enable cost-effective solutions to the market/developers.

Regards,
Mickey.
Tobias Waldekranz Feb. 9, 2021, 9:34 p.m. UTC | #12
On Tue, Feb 09, 2021 at 20:31, Mickey Rachamim <mickeyr@marvell.com> wrote:
> Hi Andrew, Jakub, Tobias,
>
> On Tuesday, February 9, 2021 7:35 PM Jakub Kicinski wrote:
>> Sounds like we have 3 people who don't like FW-heavy designs dominating the kernel - this conversation can only go one way. 
>> Marvell, Plvision anything to share? AFAIU the values of Linux kernel are open source, healthy community, empowering users. With the SDK on the embedded CPU your driver does not seem to tick any of these boxes.
>
> I'll try to share Marvell's insight and plans regarding our Prestera drivers;
>  
> We do understand the importance and the vision behind the open-source community - while being committed to quality, functionality and the developers/end-users.
>
> We started working on the Prestera driver in Q2 2019. it took us more than a year to get the first approved driver into 5.10, and we just started.
> Right at the beginning - we implemented PP function into the Kernel driver like the SDMA operation (This is the RX/TX DMA engine). 
> Yet, the FW itself - is an SW package that supports many Marvell Prestera Switching families of devices - this is a significant SW package that will take many working years to adapt to the Kernel environment.
> We do plan to port more and more PP functions as Kernel drivers along the way.

This is very encouraging to hear. I understand that it is a massive
undertaking.

> We also are working with the community to extend Kernel functionality with a new feature beneficial to all Kernel users (e.g. Devlink changes) and we will continue to do it.
> By extending the Prestera driver to in-kernel implementation with more PP features - we will simplify the FW logic and enables cost-effective solutions to the market/developers.

Until that day arrives, are there any chances of Marvell opening up CPSS
in the same way DSDT was re-licensed some years back?

Being able to clone github.com/Marvell-switching/prestera-firmware (or
whatever) and build the firmware from source would go a long way to
alleviate my fears at least.

In such a world, I at least have a chance of debugging any issue all the
way to the bottom of the stack. It would also make it possible for the
community to help out with the porting effort.

> Regards,
> Mickey.
Andrew Lunn Feb. 10, 2021, 12:28 a.m. UTC | #13
> Right at the beginning - we implemented PP function into the Kernel
> driver like the SDMA operation (This is the RX/TX DMA engine).

> We do plan to port more and more PP functions as Kernel drivers
> along the way.

It will be interesting to see how well you manage to handle the 'split
brain' problem.

DMA packets to/from the host is pretty isolated from the rest of the
driver. Look at DSA, it has completely separate drivers. But when you
start having parts of the control plane in the driver poking switch
registers, and parts of the control plane in the SDK poking registers,
you have an interesting synchronisation problem.

I guess stats would be a good place to start. Throw away the current
code making an RPC into the SDK, and just directly get the values from
the registers. No real synchronisation problems there. In fact, most
of the ethtool get API calls should be reasonably easy to do via
direct hardware access, rather than using the SDK RPC. Getting values
like that should be easy to synchronise.

     Andrew
Mickey Rachamim Feb. 10, 2021, 10:41 a.m. UTC | #14
> Until that day arrives, are there any chances of Marvell opening up CPSS in the same way DSDT was re-licensed some years back?
The CPSS code is available to everyone on Marvell Extranet (Requires simple registration process)
Anyway, as the transition process will progress - it will be less required.

> Being able to clone github.com/Marvell-switching/prestera-firmware (or
> whatever) and build the firmware from source would go a long way to alleviate my fears at least.
I understand your concerns but at this stage - we are also concerned about others that might build unreliable FW images.
I also agree that at some point we should ensure most of the concerns are being addressed.

Mickey.
Mickey Rachamim Feb. 10, 2021, 10:42 a.m. UTC | #15
> It will be interesting to see how well you manage to handle the 'split brain' problem.
Right 
Jakub Kicinski Feb. 10, 2021, 7:25 p.m. UTC | #16
On Tue, 9 Feb 2021 20:31:32 +0000 Mickey Rachamim wrote:
> On Tuesday, February 9, 2021 7:35 PM Jakub Kicinski wrote:
> > Sounds like we have 3 people who don't like FW-heavy designs dominating the kernel - this conversation can only go one way. 
> > Marvell, Plvision anything to share? AFAIU the values of Linux kernel are open source, healthy community, empowering users. With the SDK on the embedded CPU your driver does not seem to tick any of these boxes.  
> 
> I'll try to share Marvell's insight and plans regarding our Prestera drivers;
>  
> We do understand the importance and the vision behind the open-source
> community - while being committed to quality, functionality and the
> developers/end-users.
> 
> We started working on the Prestera driver in Q2 2019. it took us more
> than a year to get the first approved driver into 5.10, and we just
> started. Right at the beginning - we implemented PP function into the
> Kernel driver like the SDMA operation (This is the RX/TX DMA engine).
> Yet, the FW itself - is an SW package that supports many Marvell
> Prestera Switching families of devices - this is a significant SW
> package that will take many working years to adapt to the Kernel
> environment. We do plan to port more and more PP functions as Kernel
> drivers along the way. 

Okay, so it sounds like there is no technical reason for you to keep
the SDK. My guess is also that you have a large customer who is
expecting you to provide upstream integration, hence the contractors
and taking the easiest way out.

> We also are working with the community to extend Kernel functionality
> with a new feature beneficial to all Kernel users (e.g. Devlink
> changes) and we will continue to do it.

Ah, devlink, every vendor's favorite interface. I keep my fingers
crossed that you're not just talking about exposing a bunch of
implementation-specific params, traps etc.

> By extending the Prestera driver to in-kernel implementation with
> more PP features - we will simplify the FW logic and enables
> cost-effective solutions to the market/developers.
Taras Chornyi Feb. 10, 2021, 8:52 p.m. UTC | #17
On 09.02.2021 19:35, Jakub Kicinski wrote:

> 
> Sounds like we have 3 people who don't like FW-heavy designs dominating
> the kernel - this conversation can only go one way. 
> 
> Marvell, Plvision anything to share? AFAIU the values of Linux kernel
> are open source, healthy community, empowering users. With the SDK on
> the embedded CPU your driver does not seem to tick any of these boxes.
> 

I agree that FW-less solution has many advantages that enable the community
to engage in its development actively. We have continuous discussions with
Marvell and as Mickey stated, more PP modules will be managed from in-kernel 
code and not from FW allowing kernel developers to extend/improve it.
Tobias Waldekranz Feb. 10, 2021, 9:44 p.m. UTC | #18
On Wed, Feb 10, 2021 at 10:41, Mickey Rachamim <mickeyr@marvell.com> wrote:
>> Until that day arrives, are there any chances of Marvell opening up CPSS in the same way DSDT was re-licensed some years back?
> The CPSS code is available to everyone on Marvell Extranet (Requires simple registration process)

Right, but "available" is not the same as "open" unfortunately. Being
able to study the source is better than nothing, but it is a far cry
from having the ability to modify it and, most importantly, publish
those changes.

So, to restate my question more precisely: Can we expect that Marvell
will provide CPSS under a license that is compatible with the Linux
kernel?

If that is not possible, will Marvell at least commit to allow the
publishing of drivers developed from functional specifications and other
chip documentation?

> Anyway, as the transition process will progress - it will be less required.

Yes, but it makes it hard for smaller players to get on the ride early.

>> Being able to clone github.com/Marvell-switching/prestera-firmware (or
>> whatever) and build the firmware from source would go a long way to alleviate my fears at least.
> I understand your concerns but at this stage - we also concerned about others that might build not reliable FW images.

Totally fair. That problem should be solvable by some kind of taint
concept though. Presumably you have this problem already with the
existing SDK model? Customers can build things on top of CPSS that are
broken in a million ways.

> I also agree that at some point we should ensure most of the concerns are being addressed.
>
> Mickey.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index 55aa4bf8a27c..ad0f33a7e517 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -60,10 +60,19 @@  struct prestera_port_caps {
 	u8 transceiver;
 };
 
+struct prestera_lag {
+	struct net_device *dev;
+	struct list_head members;
+	u16 member_count;
+	u16 lag_id;
+};
+
 struct prestera_port {
 	struct net_device *dev;
 	struct prestera_switch *sw;
 	struct devlink_port dl_port;
+	struct list_head lag_member;
+	struct prestera_lag *lag;
 	u32 id;
 	u32 hw_id;
 	u32 dev_id;
@@ -127,6 +136,12 @@  struct prestera_port_event {
 	} data;
 };
 
+enum prestera_fdb_entry_type {
+	PRESTERA_FDB_ENTRY_TYPE_REG_PORT,
+	PRESTERA_FDB_ENTRY_TYPE_LAG,
+	PRESTERA_FDB_ENTRY_TYPE_MAX
+};
+
 enum prestera_fdb_event_id {
 	PRESTERA_FDB_EVENT_UNSPEC,
 	PRESTERA_FDB_EVENT_LEARNED,
@@ -134,7 +149,11 @@  enum prestera_fdb_event_id {
 };
 
 struct prestera_fdb_event {
-	u32 port_id;
+	enum prestera_fdb_entry_type type;
+	union {
+		u32 port_id;
+		u16 lag_id;
+	} dest;
 	u32 vid;
 	union {
 		u8 mac[ETH_ALEN];
@@ -165,6 +184,9 @@  struct prestera_switch {
 	u32 mtu_min;
 	u32 mtu_max;
 	u8 id;
+	struct prestera_lag *lags;
+	u8 lag_member_max;
+	u8 lag_max;
 };
 
 struct prestera_rxtx_params {
@@ -203,4 +225,10 @@  int prestera_port_pvid_set(struct prestera_port *port, u16 vid);
 
 bool prestera_netdev_check(const struct net_device *dev);
 
+bool prestera_port_is_lag_member(const struct prestera_port *port);
+
+struct prestera_lag *prestera_lag_by_id(struct prestera_switch *sw, u16 id);
+
+u16 prestera_port_lag_id(const struct prestera_port *port);
+
 #endif /* _PRESTERA_H_ */
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
index 0424718d5998..8afb45f66862 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
@@ -39,6 +39,11 @@  enum prestera_cmd_type_t {
 	PRESTERA_CMD_TYPE_RXTX_INIT = 0x800,
 	PRESTERA_CMD_TYPE_RXTX_PORT_INIT = 0x801,
 
+	PRESTERA_CMD_TYPE_LAG_MEMBER_ADD = 0x900,
+	PRESTERA_CMD_TYPE_LAG_MEMBER_DELETE = 0x901,
+	PRESTERA_CMD_TYPE_LAG_MEMBER_ENABLE = 0x902,
+	PRESTERA_CMD_TYPE_LAG_MEMBER_DISABLE = 0x903,
+
 	PRESTERA_CMD_TYPE_STP_PORT_SET = 0x1000,
 
 	PRESTERA_CMD_TYPE_ACK = 0x10000,
@@ -127,6 +132,12 @@  enum {
 	PRESTERA_FC_SYMM_ASYMM,
 };
 
+enum {
+	PRESTERA_HW_FDB_ENTRY_TYPE_REG_PORT = 0,
+	PRESTERA_HW_FDB_ENTRY_TYPE_LAG = 1,
+	PRESTERA_HW_FDB_ENTRY_TYPE_MAX = 2,
+};
+
 struct prestera_fw_event_handler {
 	struct list_head list;
 	struct rcu_head rcu;
@@ -168,6 +179,8 @@  struct prestera_msg_switch_init_resp {
 	u32 port_count;
 	u32 mtu_max;
 	u8  switch_id;
+	u8  lag_max;
+	u8  lag_member_max;
 };
 
 struct prestera_msg_port_autoneg_param {
@@ -249,8 +262,13 @@  struct prestera_msg_vlan_req {
 struct prestera_msg_fdb_req {
 	struct prestera_msg_cmd cmd;
 	u8 dest_type;
-	u32 port;
-	u32 dev;
+	union {
+		struct {
+			u32 port;
+			u32 dev;
+		};
+		u16 lag_id;
+	} dest;
 	u8  mac[ETH_ALEN];
 	u16 vid;
 	u8  dynamic;
@@ -293,6 +311,13 @@  struct prestera_msg_rxtx_port_req {
 	u32 dev;
 };
 
+struct prestera_msg_lag_req {
+	struct prestera_msg_cmd cmd;
+	u32 port;
+	u32 dev;
+	u16 lag_id;
+};
+
 struct prestera_msg_event {
 	u16 type;
 	u16 id;
@@ -315,7 +340,10 @@  union prestera_msg_event_fdb_param {
 struct prestera_msg_event_fdb {
 	struct prestera_msg_event id;
 	u8 dest_type;
-	u32 port_id;
+	union {
+		u32 port_id;
+		u16 lag_id;
+	} dest;
 	u32 vid;
 	union prestera_msg_event_fdb_param param;
 };
@@ -386,7 +414,19 @@  static int prestera_fw_parse_fdb_evt(void *msg, struct prestera_event *evt)
 {
 	struct prestera_msg_event_fdb *hw_evt = msg;
 
-	evt->fdb_evt.port_id = hw_evt->port_id;
+	switch (hw_evt->dest_type) {
+	case PRESTERA_HW_FDB_ENTRY_TYPE_REG_PORT:
+		evt->fdb_evt.type = PRESTERA_FDB_ENTRY_TYPE_REG_PORT;
+		evt->fdb_evt.dest.port_id = hw_evt->dest.port_id;
+		break;
+	case PRESTERA_HW_FDB_ENTRY_TYPE_LAG:
+		evt->fdb_evt.type = PRESTERA_FDB_ENTRY_TYPE_LAG;
+		evt->fdb_evt.dest.lag_id = hw_evt->dest.lag_id;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	evt->fdb_evt.vid = hw_evt->vid;
 
 	ether_addr_copy(evt->fdb_evt.data.mac, hw_evt->param.mac);
@@ -531,6 +571,8 @@  int prestera_hw_switch_init(struct prestera_switch *sw)
 	sw->mtu_min = PRESTERA_MIN_MTU;
 	sw->mtu_max = resp.mtu_max;
 	sw->id = resp.switch_id;
+	sw->lag_member_max = resp.lag_member_max;
+	sw->lag_max = resp.lag_max;
 
 	return 0;
 }
@@ -1067,8 +1109,10 @@  int prestera_hw_fdb_add(struct prestera_port *port, const unsigned char *mac,
 			u16 vid, bool dynamic)
 {
 	struct prestera_msg_fdb_req req = {
-		.port = port->hw_id,
-		.dev = port->dev_id,
+		.dest = {
+			.dev = port->dev_id,
+			.port = port->hw_id,
+		},
 		.vid = vid,
 		.dynamic = dynamic,
 	};
@@ -1083,8 +1127,10 @@  int prestera_hw_fdb_del(struct prestera_port *port, const unsigned char *mac,
 			u16 vid)
 {
 	struct prestera_msg_fdb_req req = {
-		.port = port->hw_id,
-		.dev = port->dev_id,
+		.dest = {
+			.dev = port->dev_id,
+			.port = port->hw_id,
+		},
 		.vid = vid,
 	};
 
@@ -1094,11 +1140,48 @@  int prestera_hw_fdb_del(struct prestera_port *port, const unsigned char *mac,
 			    &req.cmd, sizeof(req));
 }
 
+int prestera_hw_lag_fdb_add(struct prestera_switch *sw, u16 lag_id,
+			    const unsigned char *mac, u16 vid, bool dynamic)
+{
+	struct prestera_msg_fdb_req req = {
+		.dest_type = PRESTERA_HW_FDB_ENTRY_TYPE_LAG,
+		.dest = {
+			.lag_id = lag_id,
+		},
+		.vid = vid,
+		.dynamic = dynamic,
+	};
+
+	ether_addr_copy(req.mac, mac);
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_FDB_ADD,
+			    &req.cmd, sizeof(req));
+}
+
+int prestera_hw_lag_fdb_del(struct prestera_switch *sw, u16 lag_id,
+			    const unsigned char *mac, u16 vid)
+{
+	struct prestera_msg_fdb_req req = {
+		.dest_type = PRESTERA_HW_FDB_ENTRY_TYPE_LAG,
+		.dest = {
+			.lag_id = lag_id,
+		},
+		.vid = vid,
+	};
+
+	ether_addr_copy(req.mac, mac);
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_FDB_DELETE,
+			    &req.cmd, sizeof(req));
+}
+
 int prestera_hw_fdb_flush_port(struct prestera_port *port, u32 mode)
 {
 	struct prestera_msg_fdb_req req = {
-		.port = port->hw_id,
-		.dev = port->dev_id,
+		.dest = {
+			.dev = port->dev_id,
+			.port = port->hw_id,
+		},
 		.flush_mode = mode,
 	};
 
@@ -1121,8 +1204,10 @@  int prestera_hw_fdb_flush_port_vlan(struct prestera_port *port, u16 vid,
 				    u32 mode)
 {
 	struct prestera_msg_fdb_req req = {
-		.port = port->hw_id,
-		.dev = port->dev_id,
+		.dest = {
+			.dev = port->dev_id,
+			.port = port->hw_id,
+		},
 		.vid = vid,
 		.flush_mode = mode,
 	};
@@ -1131,6 +1216,37 @@  int prestera_hw_fdb_flush_port_vlan(struct prestera_port *port, u16 vid,
 			    &req.cmd, sizeof(req));
 }
 
+int prestera_hw_fdb_flush_lag(struct prestera_switch *sw, u16 lag_id,
+			      u32 mode)
+{
+	struct prestera_msg_fdb_req req = {
+		.dest_type = PRESTERA_HW_FDB_ENTRY_TYPE_LAG,
+		.dest = {
+			.lag_id = lag_id,
+		},
+		.flush_mode = mode,
+	};
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_FDB_FLUSH_PORT,
+			    &req.cmd, sizeof(req));
+}
+
+int prestera_hw_fdb_flush_lag_vlan(struct prestera_switch *sw,
+				   u16 lag_id, u16 vid, u32 mode)
+{
+	struct prestera_msg_fdb_req req = {
+		.dest_type = PRESTERA_HW_FDB_ENTRY_TYPE_LAG,
+		.dest = {
+			.lag_id = lag_id,
+		},
+		.vid = vid,
+		.flush_mode = mode,
+	};
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_FDB_FLUSH_PORT_VLAN,
+			    &req.cmd, sizeof(req));
+}
+
 int prestera_hw_bridge_create(struct prestera_switch *sw, u16 *bridge_id)
 {
 	struct prestera_msg_bridge_resp resp;
@@ -1212,6 +1328,46 @@  int prestera_hw_rxtx_port_init(struct prestera_port *port)
 			    &req.cmd, sizeof(req));
 }
 
+int prestera_hw_lag_member_add(struct prestera_port *port, u16 lag_id)
+{
+	struct prestera_msg_lag_req req = {
+		.port = port->hw_id,
+		.dev = port->dev_id,
+		.lag_id = lag_id,
+	};
+
+	return prestera_cmd(port->sw, PRESTERA_CMD_TYPE_LAG_MEMBER_ADD,
+			    &req.cmd, sizeof(req));
+}
+
+int prestera_hw_lag_member_del(struct prestera_port *port, u16 lag_id)
+{
+	struct prestera_msg_lag_req req = {
+		.port = port->hw_id,
+		.dev = port->dev_id,
+		.lag_id = lag_id,
+	};
+
+	return prestera_cmd(port->sw, PRESTERA_CMD_TYPE_LAG_MEMBER_DELETE,
+			    &req.cmd, sizeof(req));
+}
+
+int prestera_hw_lag_member_enable(struct prestera_port *port, u16 lag_id,
+				  bool enable)
+{
+	struct prestera_msg_lag_req req = {
+		.port = port->hw_id,
+		.dev = port->dev_id,
+		.lag_id = lag_id,
+	};
+	u32 cmd;
+
+	cmd = enable ? PRESTERA_CMD_TYPE_LAG_MEMBER_ENABLE :
+			PRESTERA_CMD_TYPE_LAG_MEMBER_DISABLE;
+
+	return prestera_cmd(port->sw, cmd, &req.cmd, sizeof(req));
+}
+
 int prestera_hw_event_handler_register(struct prestera_switch *sw,
 				       enum prestera_event_type type,
 				       prestera_event_cb_t fn,
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
index b2b5ac95b4e3..68ce41595349 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_hw.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
@@ -179,4 +179,18 @@  int prestera_hw_rxtx_init(struct prestera_switch *sw,
 			  struct prestera_rxtx_params *params);
 int prestera_hw_rxtx_port_init(struct prestera_port *port);
 
+/* LAG API */
+int prestera_hw_lag_member_add(struct prestera_port *port, u16 lag_id);
+int prestera_hw_lag_member_del(struct prestera_port *port, u16 lag_id);
+int prestera_hw_lag_member_enable(struct prestera_port *port, u16 lag_id,
+				  bool enable);
+int prestera_hw_lag_fdb_add(struct prestera_switch *sw, u16 lag_id,
+			    const unsigned char *mac, u16 vid, bool dynamic);
+int prestera_hw_lag_fdb_del(struct prestera_switch *sw, u16 lag_id,
+			    const unsigned char *mac, u16 vid);
+int prestera_hw_fdb_flush_lag(struct prestera_switch *sw, u16 lag_id,
+			      u32 mode);
+int prestera_hw_fdb_flush_lag_vlan(struct prestera_switch *sw,
+				   u16 lag_id, u16 vid, u32 mode);
+
 #endif /* _PRESTERA_HW_H_ */
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index 53c7628a3938..39465e65d09b 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -8,6 +8,7 @@ 
 #include <linux/netdev_features.h>
 #include <linux/of.h>
 #include <linux/of_net.h>
+#include <linux/if_vlan.h>
 
 #include "prestera.h"
 #include "prestera_hw.h"
@@ -281,6 +282,7 @@  static int prestera_port_create(struct prestera_switch *sw, u32 id)
 
 	INIT_LIST_HEAD(&port->vlans_list);
 	port->pvid = PRESTERA_DEFAULT_VID;
+	port->lag = NULL;
 	port->dev = dev;
 	port->id = id;
 	port->sw = sw;
@@ -474,6 +476,151 @@  static int prestera_switch_set_base_mac_addr(struct prestera_switch *sw)
 	return prestera_hw_switch_mac_set(sw, sw->base_mac);
 }
 
+struct prestera_lag *prestera_lag_by_id(struct prestera_switch *sw, u16 id)
+{
+	return id < sw->lag_max ? &sw->lags[id] : NULL;
+}
+
+static struct prestera_lag *prestera_lag_by_dev(struct prestera_switch *sw,
+						struct net_device *dev)
+{
+	struct prestera_lag *lag;
+	u16 id;
+
+	for (id = 0; id < sw->lag_max; id++) {
+		lag = &sw->lags[id];
+		if (lag->dev == dev)
+			return lag;
+	}
+
+	return NULL;
+}
+
+/* Claim the first free LAG slot for @lag_dev; NULL if none is free. */
+static struct prestera_lag *prestera_lag_create(struct prestera_switch *sw,
+						struct net_device *lag_dev)
+{
+	struct prestera_lag *lag;
+	u16 id;
+
+	for (id = 0; id < sw->lag_max; id++) {
+		lag = &sw->lags[id];
+		if (lag->dev)
+			continue;
+		INIT_LIST_HEAD(&lag->members);
+		lag->dev = lag_dev;
+		return lag;
+	}
+
+	return NULL;
+}
+
+static void prestera_lag_destroy(struct prestera_switch *sw,
+				 struct prestera_lag *lag)
+{
+	WARN_ON(!list_empty(&lag->members));
+	lag->member_count = 0;
+	lag->dev = NULL;
+}
+
+/* Add @port to @lag_dev's LAG, claiming a free LAG slot on first use. */
+static int prestera_lag_port_add(struct prestera_port *port,
+				 struct net_device *lag_dev)
+{
+	struct prestera_switch *sw = port->sw;
+	struct prestera_lag *lag;
+	int err = -ENOMEM;	/* returned when the LAG is already full */
+
+	lag = prestera_lag_by_dev(sw, lag_dev);
+	if (!lag) {
+		lag = prestera_lag_create(sw, lag_dev);
+		if (!lag)
+			return -ENOMEM;
+	}
+
+	if (lag->member_count < sw->lag_member_max)
+		err = prestera_hw_lag_member_add(port, lag->lag_id);
+	if (err) {
+		/* never leave a LAG without members bound to lag_dev */
+		if (!lag->member_count)
+			prestera_lag_destroy(sw, lag);
+		return err;
+	}
+
+	list_add(&port->lag_member, &lag->members);
+	lag->member_count++;
+	port->lag = lag;
+
+	return 0;
+}
+
+/* Remove @port from its LAG; the LAG slot is released with its last member. */
+static int prestera_lag_port_del(struct prestera_port *port)
+{
+	struct prestera_lag *lag = port->lag;
+	int err;
+
+	if (!lag || !lag->member_count)
+		return -EINVAL;
+
+	err = prestera_hw_lag_member_del(port, lag->lag_id);
+	if (err)
+		return err;
+
+	list_del(&port->lag_member);
+	lag->member_count--;
+	port->lag = NULL;
+
+	if (netif_is_bridge_port(lag->dev)) {
+		struct netdev_notifier_changeupper_info br_info = {
+			.upper_dev = netdev_master_upper_dev_get(lag->dev),
+			.linking = false,
+		};
+
+		prestera_bridge_port_event(lag->dev, port->dev,
+					   NETDEV_CHANGEUPPER, &br_info);
+	}
+
+	if (!lag->member_count)
+		prestera_lag_destroy(port->sw, lag);
+
+	return 0;
+}
+
+bool prestera_port_is_lag_member(const struct prestera_port *port)
+{
+	return !!port->lag;
+}
+
+u16 prestera_port_lag_id(const struct prestera_port *port)
+{
+	return port->lag->lag_id;
+}
+
+static int prestera_lag_init(struct prestera_switch *sw)
+{
+	u16 id;
+
+	sw->lags = kcalloc(sw->lag_max, sizeof(*sw->lags), GFP_KERNEL);
+	if (!sw->lags)
+		return -ENOMEM;
+
+	for (id = 0; id < sw->lag_max; id++)
+		sw->lags[id].lag_id = id;
+
+	return 0;
+}
+
+static void prestera_lag_fini(struct prestera_switch *sw)
+{
+	u16 id;	/* u16 like prestera_lag_init(): a u8 wraps if lag_max > 255 */
+
+	for (id = 0; id < sw->lag_max; id++)
+		WARN_ON(sw->lags[id].member_count);
+
+	kfree(sw->lags);
+}
+
 bool prestera_netdev_check(const struct net_device *dev)
 {
 	return dev->netdev_ops == &prestera_netdev_ops;
@@ -507,19 +654,54 @@  struct prestera_port *prestera_port_dev_lower_find(struct net_device *dev)
 	return port;
 }
 
-static int prestera_netdev_port_event(struct net_device *dev,
+static int prestera_netdev_port_lower_event(struct net_device *dev,
+					    unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changelowerstate_info *info = ptr;
+	struct netdev_lag_lower_state_info *lower_state_info;
+	struct prestera_port *port = netdev_priv(dev);
+	bool enabled;
+
+	if (!netif_is_lag_port(dev))
+		return 0;
+	if (!prestera_port_is_lag_member(port))
+		return 0;
+
+	lower_state_info = info->lower_state_info;
+	enabled = lower_state_info->tx_enabled;
+
+	return prestera_hw_lag_member_enable(port, port->lag->lag_id, enabled);
+}
+
+static bool prestera_lag_master_check(struct net_device *lag_dev,
+				      struct netdev_lag_upper_info *info,
+				      struct netlink_ext_ack *ext_ack)
+{
+	if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
+		NL_SET_ERR_MSG_MOD(ext_ack, "Unsupported LAG Tx type");
+		return false;
+	}
+
+	return true;
+}
+
+static int prestera_netdev_port_event(struct net_device *lower,
+				      struct net_device *dev,
 				      unsigned long event, void *ptr)
 {
 	struct netdev_notifier_changeupper_info *info = ptr;
+	struct prestera_port *port = netdev_priv(dev);
 	struct netlink_ext_ack *extack;
 	struct net_device *upper;
+	int err;
 
 	extack = netdev_notifier_info_to_extack(&info->info);
 	upper = info->upper_dev;
 
 	switch (event) {
 	case NETDEV_PRECHANGEUPPER:
-		if (!netif_is_bridge_master(upper)) {
+		if (!netif_is_bridge_master(upper) &&
+		    !netif_is_lag_master(upper)) {
 			NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
 			return -EINVAL;
 		}
@@ -531,12 +713,60 @@  static int prestera_netdev_port_event(struct net_device *dev,
 			NL_SET_ERR_MSG_MOD(extack, "Upper device is already enslaved");
 			return -EINVAL;
 		}
+
+		if (netif_is_lag_master(upper) &&
+		    !prestera_lag_master_check(upper, info->upper_info, extack))
+			return -EINVAL;
+		if (netif_is_lag_master(upper) && vlan_uses_dev(dev)) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Master device is a LAG master and port has a VLAN");
+			return -EINVAL;
+		}
+		if (netif_is_lag_port(dev) && is_vlan_dev(upper) &&
+		    !netif_is_lag_master(vlan_dev_real_dev(upper))) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Can not put a VLAN on a LAG port");
+			return -EINVAL;
+		}
 		break;
 
 	case NETDEV_CHANGEUPPER:
 		if (netif_is_bridge_master(upper))
-			return prestera_bridge_port_event(dev, event, ptr);
+			return prestera_bridge_port_event(lower, dev, event,
+							  ptr);
+
+		if (netif_is_lag_master(upper)) {
+			if (info->linking) {
+				err = prestera_lag_port_add(port, upper);
+				if (err)
+					return err;
+			} else {
+				prestera_lag_port_del(port);
+			}
+		}
 		break;
+
+	case NETDEV_CHANGELOWERSTATE:
+		return prestera_netdev_port_lower_event(dev, event, ptr);
+	}
+
+	return 0;
+}
+
+static int prestera_netdevice_lag_event(struct net_device *lag_dev,
+					unsigned long event, void *ptr)
+{
+	struct net_device *dev;
+	struct list_head *iter;
+	int err;
+
+	netdev_for_each_lower_dev(lag_dev, dev, iter) {
+		if (prestera_netdev_check(dev)) {
+			err = prestera_netdev_port_event(lag_dev, dev, event,
+							 ptr);
+			if (err)
+				return err;
+		}
 	}
 
 	return 0;
@@ -549,7 +779,9 @@  static int prestera_netdev_event_handler(struct notifier_block *nb,
 	int err = 0;
 
 	if (prestera_netdev_check(dev))
-		err = prestera_netdev_port_event(dev, event, ptr);
+		err = prestera_netdev_port_event(dev, dev, event, ptr);
+	else if (netif_is_lag_master(dev))
+		err = prestera_netdevice_lag_event(dev, event, ptr);
 
 	return notifier_from_errno(err);
 }
@@ -603,6 +835,10 @@  static int prestera_switch_init(struct prestera_switch *sw)
 	if (err)
 		goto err_dl_register;
 
+	err = prestera_lag_init(sw);
+	if (err)
+		goto err_lag_init;
+
 	err = prestera_create_ports(sw);
 	if (err)
 		goto err_ports_create;
@@ -610,6 +846,8 @@  static int prestera_switch_init(struct prestera_switch *sw)
 	return 0;
 
 err_ports_create:
+	prestera_lag_fini(sw);
+err_lag_init:
 	prestera_devlink_unregister(sw);
 err_dl_register:
 	prestera_event_handlers_unregister(sw);
@@ -627,6 +865,7 @@  static int prestera_switch_init(struct prestera_switch *sw)
 static void prestera_switch_fini(struct prestera_switch *sw)
 {
 	prestera_destroy_ports(sw);
+	prestera_lag_fini(sw);
 	prestera_devlink_unregister(sw);
 	prestera_event_handlers_unregister(sw);
 	prestera_rxtx_switch_fini(sw);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
index 7736d5f498c9..3750c66a550b 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c
@@ -180,6 +180,45 @@  prestera_port_vlan_create(struct prestera_port *port, u16 vid, bool untagged)
 	return ERR_PTR(err);
 }
 
+/* Add an FDB entry, addressing the LAG when @port is a LAG member. */
+static int prestera_fdb_add(struct prestera_port *port,
+			    const unsigned char *mac, u16 vid, bool dynamic)
+{
+	if (!prestera_port_is_lag_member(port))
+		return prestera_hw_fdb_add(port, mac, vid, dynamic);
+	return prestera_hw_lag_fdb_add(port->sw, prestera_port_lag_id(port),
+				       mac, vid, dynamic);
+}
+
+/* Delete an FDB entry, addressing the LAG when @port is a LAG member. */
+static int prestera_fdb_del(struct prestera_port *port,
+			    const unsigned char *mac, u16 vid)
+{
+	if (!prestera_port_is_lag_member(port))
+		return prestera_hw_fdb_del(port, mac, vid);
+	return prestera_hw_lag_fdb_del(port->sw, prestera_port_lag_id(port),
+				       mac, vid);
+}
+
+static int prestera_fdb_flush_port_vlan(struct prestera_port *port, u16 vid,
+					u32 mode)
+{
+	if (!prestera_port_is_lag_member(port))
+		return prestera_hw_fdb_flush_port_vlan(port, vid, mode);
+	return prestera_hw_fdb_flush_lag_vlan(port->sw,
+					      prestera_port_lag_id(port),
+					      vid, mode);
+}
+
+static int prestera_fdb_flush_port(struct prestera_port *port, u32 mode)
+{
+	if (!prestera_port_is_lag_member(port))
+		return prestera_hw_fdb_flush_port(port, mode);
+	/* @port is a LAG member: flush by LAG id instead of by port */
+	return prestera_hw_fdb_flush_lag(port->sw, prestera_port_lag_id(port),
+					 mode);
+}
+
 static void
 prestera_port_vlan_bridge_leave(struct prestera_port_vlan *port_vlan)
 {
@@ -199,11 +238,11 @@  prestera_port_vlan_bridge_leave(struct prestera_port_vlan *port_vlan)
 	last_port = port_count == 1;
 
 	if (last_vlan)
-		prestera_hw_fdb_flush_port(port, fdb_flush_mode);
+		prestera_fdb_flush_port(port, fdb_flush_mode);
 	else if (last_port)
 		prestera_hw_fdb_flush_vlan(port->sw, vid, fdb_flush_mode);
 	else
-		prestera_hw_fdb_flush_port_vlan(port, vid, fdb_flush_mode);
+		prestera_fdb_flush_port_vlan(port, vid, fdb_flush_mode);
 
 	list_del(&port_vlan->br_vlan_head);
 	prestera_bridge_vlan_put(br_vlan);
@@ -394,9 +433,9 @@  prestera_bridge_port_add(struct prestera_bridge *bridge, struct net_device *dev)
 }
 
 static int
-prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port)
+prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port,
+			     struct prestera_port *port)
 {
-	struct prestera_port *port = netdev_priv(br_port->dev);
 	struct prestera_bridge *bridge = br_port->bridge;
 	int err;
 
@@ -423,6 +462,7 @@  prestera_bridge_1d_port_join(struct prestera_bridge_port *br_port)
 }
 
 static int prestera_port_bridge_join(struct prestera_port *port,
+				     struct net_device *lower,
 				     struct net_device *upper)
 {
 	struct prestera_switchdev *swdev = port->sw->swdev;
@@ -437,7 +477,7 @@  static int prestera_port_bridge_join(struct prestera_port *port,
 			return PTR_ERR(bridge);
 	}
 
-	br_port = prestera_bridge_port_add(bridge, port->dev);
+	br_port = prestera_bridge_port_add(bridge, lower);
 	if (IS_ERR(br_port)) {
 		err = PTR_ERR(br_port);
 		goto err_brport_create;
@@ -446,7 +486,7 @@  static int prestera_port_bridge_join(struct prestera_port *port,
 	if (bridge->vlan_enabled)
 		return 0;
 
-	err = prestera_bridge_1d_port_join(br_port);
+	err = prestera_bridge_1d_port_join(br_port, port);
 	if (err)
 		goto err_port_join;
 
@@ -459,19 +499,17 @@  static int prestera_port_bridge_join(struct prestera_port *port,
 	return err;
 }
 
-static void prestera_bridge_1q_port_leave(struct prestera_bridge_port *br_port)
+static void prestera_bridge_1q_port_leave(struct prestera_bridge_port *br_port,
+					  struct prestera_port *port)
 {
-	struct prestera_port *port = netdev_priv(br_port->dev);
-
-	prestera_hw_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
+	prestera_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
 	prestera_port_pvid_set(port, PRESTERA_DEFAULT_VID);
 }
 
-static void prestera_bridge_1d_port_leave(struct prestera_bridge_port *br_port)
+static void prestera_bridge_1d_port_leave(struct prestera_bridge_port *br_port,
+					  struct prestera_port *port)
 {
-	struct prestera_port *port = netdev_priv(br_port->dev);
-
-	prestera_hw_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
+	prestera_fdb_flush_port(port, PRESTERA_FDB_FLUSH_MODE_ALL);
 	prestera_hw_bridge_port_delete(port, br_port->bridge->bridge_id);
 }
 
@@ -506,6 +544,7 @@  static int prestera_port_vid_stp_set(struct prestera_port *port, u16 vid,
 }
 
 static void prestera_port_bridge_leave(struct prestera_port *port,
+				       struct net_device *lower,
 				       struct net_device *upper)
 {
 	struct prestera_switchdev *swdev = port->sw->swdev;
@@ -516,16 +555,16 @@  static void prestera_port_bridge_leave(struct prestera_port *port,
 	if (!bridge)
 		return;
 
-	br_port = __prestera_bridge_port_by_dev(bridge, port->dev);
+	br_port = __prestera_bridge_port_by_dev(bridge, lower);
 	if (!br_port)
 		return;
 
 	bridge = br_port->bridge;
 
 	if (bridge->vlan_enabled)
-		prestera_bridge_1q_port_leave(br_port);
+		prestera_bridge_1q_port_leave(br_port, port);
 	else
-		prestera_bridge_1d_port_leave(br_port);
+		prestera_bridge_1d_port_leave(br_port, port);
 
 	prestera_hw_port_learning_set(port, false);
 	prestera_hw_port_flood_set(port, false);
@@ -533,8 +572,8 @@  static void prestera_port_bridge_leave(struct prestera_port *port,
 	prestera_bridge_port_put(br_port);
 }
 
-int prestera_bridge_port_event(struct net_device *dev, unsigned long event,
-			       void *ptr)
+int prestera_bridge_port_event(struct net_device *lower, struct net_device *dev,
+			       unsigned long event, void *ptr)
 {
 	struct netdev_notifier_changeupper_info *info = ptr;
 	struct prestera_port *port;
@@ -547,11 +586,11 @@  int prestera_bridge_port_event(struct net_device *dev, unsigned long event,
 	switch (event) {
 	case NETDEV_CHANGEUPPER:
 		if (info->linking) {
-			err = prestera_port_bridge_join(port, upper);
+			err = prestera_port_bridge_join(port, lower, upper);
 			if (err)
 				return err;
 		} else {
-			prestera_port_bridge_leave(port, upper);
+			prestera_port_bridge_leave(port, lower, upper);
 		}
 		break;
 	}
@@ -745,9 +784,9 @@  static int prestera_port_fdb_set(struct prestera_port *port,
 		vid = bridge->bridge_id;
 
 	if (adding)
-		err = prestera_hw_fdb_add(port, fdb_info->addr, vid, false);
+		err = prestera_fdb_add(port, fdb_info->addr, vid, false);
 	else
-		err = prestera_hw_fdb_del(port, fdb_info->addr, vid);
+		err = prestera_fdb_del(port, fdb_info->addr, vid);
 
 	return err;
 }
@@ -1088,10 +1127,26 @@  static void prestera_fdb_event(struct prestera_switch *sw,
 			       struct prestera_event *evt, void *arg)
 {
 	struct switchdev_notifier_fdb_info info;
+	struct net_device *dev = NULL;
 	struct prestera_port *port;
+	struct prestera_lag *lag;
 
-	port = prestera_find_port(sw, evt->fdb_evt.port_id);
-	if (!port)
+	switch (evt->fdb_evt.type) {
+	case PRESTERA_FDB_ENTRY_TYPE_REG_PORT:
+		port = prestera_find_port(sw, evt->fdb_evt.dest.port_id);
+		if (port)
+			dev = port->dev;
+		break;
+	case PRESTERA_FDB_ENTRY_TYPE_LAG:
+		lag = prestera_lag_by_id(sw, evt->fdb_evt.dest.lag_id);
+		if (lag)
+			dev = lag->dev;
+		break;
+	default:
+		return;
+	}
+
+	if (!dev)
 		return;
 
 	info.addr = evt->fdb_evt.data.mac;
@@ -1103,11 +1158,11 @@  static void prestera_fdb_event(struct prestera_switch *sw,
 	switch (evt->id) {
 	case PRESTERA_FDB_EVENT_LEARNED:
 		call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_BRIDGE,
-					 port->dev, &info.info, NULL);
+					 dev, &info.info, NULL);
 		break;
 	case PRESTERA_FDB_EVENT_AGED:
 		call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE,
-					 port->dev, &info.info, NULL);
+					 dev, &info.info, NULL);
 		break;
 	}
 
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
index 606e21d2355b..70e9ed87e24a 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.h
@@ -7,7 +7,7 @@ 
 int prestera_switchdev_init(struct prestera_switch *sw);
 void prestera_switchdev_fini(struct prestera_switch *sw);
 
-int prestera_bridge_port_event(struct net_device *dev, unsigned long event,
-			       void *ptr);
+int prestera_bridge_port_event(struct net_device *lower, struct net_device *dev,
+			       unsigned long event, void *ptr);
 
 #endif /* _PRESTERA_SWITCHDEV_H_ */