diff mbox series

[net-next,3/6] net: dsa: add support for retrieving the interface mode

Message ID E1oCNl3-006e3n-PT@rmk-PC.armlinux.org.uk (mailing list archive)
State New, archived
Headers show
Series net: dsa: always use phylink | expand

Commit Message

Russell King (Oracle) July 15, 2022, 4:01 p.m. UTC
DSA port bindings allow for an optional phy interface mode. When an
interface mode is not specified, DSA uses the NA interface mode type.

However, phylink needs to know the parameters of the link, and this
will become especially important when using phylink for ports that
are devoid of all properties except the required "reg" property, so
that phylink can select the maximum supported link settings. Without
knowing the interface mode, phylink can't truely know the maximum
link speed.

Update the prototype for the phylink_get_caps method to allow drivers
to report this information back to DSA, and update all DSA
implementations function declarations to cater for this change. No
code is added to the implementations.

Reviewed-by: Marek Behún <kabel@kernel.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 drivers/net/dsa/b53/b53_common.c       |  3 ++-
 drivers/net/dsa/bcm_sf2.c              |  3 ++-
 drivers/net/dsa/hirschmann/hellcreek.c |  3 ++-
 drivers/net/dsa/lantiq_gswip.c         |  6 ++++--
 drivers/net/dsa/microchip/ksz_common.c |  3 ++-
 drivers/net/dsa/mt7530.c               |  3 ++-
 drivers/net/dsa/mv88e6xxx/chip.c       |  3 ++-
 drivers/net/dsa/ocelot/felix.c         |  3 ++-
 drivers/net/dsa/qca/ar9331.c           |  3 ++-
 drivers/net/dsa/qca8k.c                |  3 ++-
 drivers/net/dsa/realtek/rtl8365mb.c    |  3 ++-
 drivers/net/dsa/sja1105/sja1105_main.c |  3 ++-
 drivers/net/dsa/xrs700x/xrs700x.c      |  3 ++-
 include/net/dsa.h                      |  3 ++-
 net/dsa/port.c                         | 23 +++++++++++++++++------
 15 files changed, 47 insertions(+), 21 deletions(-)

Comments

Vladimir Oltean July 15, 2022, 5:24 p.m. UTC | #1
On Fri, Jul 15, 2022 at 05:01:37PM +0100, Russell King (Oracle) wrote:
> DSA port bindings allow for an optional phy interface mode. When an
> interface mode is not specified, DSA uses the NA interface mode type.
> 
> However, phylink needs to know the parameters of the link, and this
> will become especially important when using phylink for ports that
> are devoid of all properties except the required "reg" property, so
> that phylink can select the maximum supported link settings. Without
> knowing the interface mode, phylink can't truely know the maximum
> link speed.
> 
> Update the prototype for the phylink_get_caps method to allow drivers
> to report this information back to DSA, and update all DSA
> implementations function declarations to cater for this change. No
> code is added to the implementations.
> 
> Reviewed-by: Marek Behún <kabel@kernel.org>
> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> ---
(...)
> diff --git a/include/net/dsa.h b/include/net/dsa.h
> index b902b31bebce..7c6870d2c607 100644
> --- a/include/net/dsa.h
> +++ b/include/net/dsa.h
> @@ -852,7 +852,8 @@ struct dsa_switch_ops {
>  	 * PHYLINK integration
>  	 */
>  	void	(*phylink_get_caps)(struct dsa_switch *ds, int port,
> -				    struct phylink_config *config);
> +				    struct phylink_config *config,
> +				    phy_interface_t *default_interface);

I would prefer having a dedicated void (*port_max_speed_interface),
because the post-phylink DSA drivers (which are not few) will generally
not need to concern themselves with implementing this, and I don't want
driver writers to think they need to populate every parameter they see
in phylink_get_caps. So the new function needs to be documented
appropriately (specify who needs and who does not need to implement it,
on which ports it will be called, etc).

In addition, if we have a dedicated ds->ops->port_max_speed_interface(),
we can do a better job of avoiding breakage with this patch set, since
if DSA cannot find a valid phylink fwnode, AND there is no
port_max_speed_interface() callback for this driver, DSA can still
preserve the current logic of not putting the port down, and not
registering it with phylink. That can be accompanied by a dev_warn() to
state that the CPU/DSA port isn't registered with phylink, please
implement port_max_speed_interface() to address that.
Russell King (Oracle) July 15, 2022, 9:31 p.m. UTC | #2
On Fri, Jul 15, 2022 at 08:24:44PM +0300, Vladimir Oltean wrote:
> On Fri, Jul 15, 2022 at 05:01:37PM +0100, Russell King (Oracle) wrote:
> > DSA port bindings allow for an optional phy interface mode. When an
> > interface mode is not specified, DSA uses the NA interface mode type.
> > 
> > However, phylink needs to know the parameters of the link, and this
> > will become especially important when using phylink for ports that
> > are devoid of all properties except the required "reg" property, so
> > that phylink can select the maximum supported link settings. Without
> > knowing the interface mode, phylink can't truely know the maximum
> > link speed.
> > 
> > Update the prototype for the phylink_get_caps method to allow drivers
> > to report this information back to DSA, and update all DSA
> > implementations function declarations to cater for this change. No
> > code is added to the implementations.
> > 
> > Reviewed-by: Marek Behún <kabel@kernel.org>
> > Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> > ---
> (...)
> > diff --git a/include/net/dsa.h b/include/net/dsa.h
> > index b902b31bebce..7c6870d2c607 100644
> > --- a/include/net/dsa.h
> > +++ b/include/net/dsa.h
> > @@ -852,7 +852,8 @@ struct dsa_switch_ops {
> >  	 * PHYLINK integration
> >  	 */
> >  	void	(*phylink_get_caps)(struct dsa_switch *ds, int port,
> > -				    struct phylink_config *config);
> > +				    struct phylink_config *config,
> > +				    phy_interface_t *default_interface);
> 
> I would prefer having a dedicated void (*port_max_speed_interface),
> because the post-phylink DSA drivers (which are not few) will generally
> not need to concern themselves with implementing this, and I don't want
> driver writers to think they need to populate every parameter they see
> in phylink_get_caps. So the new function needs to be documented
> appropriately (specify who needs and who does not need to implement it,
> on which ports it will be called, etc).
> 
> In addition, if we have a dedicated ds->ops->port_max_speed_interface(),
> we can do a better job of avoiding breakage with this patch set, since
> if DSA cannot find a valid phylink fwnode, AND there is no
> port_max_speed_interface() callback for this driver, DSA can still
> preserve the current logic of not putting the port down, and not
> registering it with phylink. That can be accompanied by a dev_warn() to
> state that the CPU/DSA port isn't registered with phylink, please
> implement port_max_speed_interface() to address that.

To continue my previous email...

This is a great illustration why posting RFC series is a waste of time.
This patch was posted as RFC on:

24th June
29th June
5th July
13th July

Only when it's been posted today has there been a concern raised about
the approach. So, what's the use of asking for comments if comments only
come when patches are posted for merging. None what so ever. So, we've
lost the last three weeks because I decided to "be kind" and post RFC.
Total waste of effort.

Now, on your point... the series posted on the 24th June was using
the mv88e6xxx port_max_speed_interface() but discussion off the mailing
list:

20:19 < rmk> kabel: hmm, is mv88e6393x_port_max_speed_mode() correct?
20:20 < rmk> it seems to be suggesting to use PHY_INTERFACE_MODE_10GBASER for
             port 9
09:50 < kabel> rmk: yes, 10gbase-r is correct for 6393x. But we need to add
               exception for 6191x, as is done in chip.c function
               mv88e6393x_phylink_get_caps()
09:51 < kabel> rmk: on 6191x only port 10 supports >1g speeds
11:51 < rmk> kabel: moving it into the get_caps function makes it easier to set
             the default_interfaces for 6193x
14:20 < kabel> rmk: yes, get_caps doing it would be better

The problem is this - we call get_caps(), and we have to read registers
to work out what the port supports. If we have a separate callback, then
we need to re-read those registers to get the same information to report
what the default interface should be.

Since almost all of the Marvell implementations the values for both the
list of supported interfaces and the default interface both require
reading a register and translating it to a phy_interface_t, and then
setting the support mask, it seems logical to combine these two
functioalities into one function.
Vladimir Oltean July 15, 2022, 10:23 p.m. UTC | #3
On Fri, Jul 15, 2022 at 10:31:17PM +0100, Russell King (Oracle) wrote:
> On Fri, Jul 15, 2022 at 08:24:44PM +0300, Vladimir Oltean wrote:
> > On Fri, Jul 15, 2022 at 05:01:37PM +0100, Russell King (Oracle) wrote:
> > > DSA port bindings allow for an optional phy interface mode. When an
> > > interface mode is not specified, DSA uses the NA interface mode type.
> > > 
> > > However, phylink needs to know the parameters of the link, and this
> > > will become especially important when using phylink for ports that
> > > are devoid of all properties except the required "reg" property, so
> > > that phylink can select the maximum supported link settings. Without
> > > knowing the interface mode, phylink can't truely know the maximum
> > > link speed.
> > > 
> > > Update the prototype for the phylink_get_caps method to allow drivers
> > > to report this information back to DSA, and update all DSA
> > > implementations function declarations to cater for this change. No
> > > code is added to the implementations.
> > > 
> > > Reviewed-by: Marek Behún <kabel@kernel.org>
> > > Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> > > ---
> > (...)
> > > diff --git a/include/net/dsa.h b/include/net/dsa.h
> > > index b902b31bebce..7c6870d2c607 100644
> > > --- a/include/net/dsa.h
> > > +++ b/include/net/dsa.h
> > > @@ -852,7 +852,8 @@ struct dsa_switch_ops {
> > >  	 * PHYLINK integration
> > >  	 */
> > >  	void	(*phylink_get_caps)(struct dsa_switch *ds, int port,
> > > -				    struct phylink_config *config);
> > > +				    struct phylink_config *config,
> > > +				    phy_interface_t *default_interface);
> > 
> > I would prefer having a dedicated void (*port_max_speed_interface),
> > because the post-phylink DSA drivers (which are not few) will generally
> > not need to concern themselves with implementing this, and I don't want
> > driver writers to think they need to populate every parameter they see
> > in phylink_get_caps. So the new function needs to be documented
> > appropriately (specify who needs and who does not need to implement it,
> > on which ports it will be called, etc).
> > 
> > In addition, if we have a dedicated ds->ops->port_max_speed_interface(),
> > we can do a better job of avoiding breakage with this patch set, since
> > if DSA cannot find a valid phylink fwnode, AND there is no
> > port_max_speed_interface() callback for this driver, DSA can still
> > preserve the current logic of not putting the port down, and not
> > registering it with phylink. That can be accompanied by a dev_warn() to
> > state that the CPU/DSA port isn't registered with phylink, please
> > implement port_max_speed_interface() to address that.
> 
> To continue my previous email...
> 
> This is a great illustration why posting RFC series is a waste of time.
> This patch was posted as RFC on:
> 
> 24th June
> 29th June
> 5th July
> 13th July
> 
> Only when it's been posted today has there been a concern raised about
> the approach. So, what's the use of asking for comments if comments only
> come when patches are posted for merging. None what so ever. So, we've
> lost the last three weeks because I decided to "be kind" and post RFC.
> Total waste of effort.

Sorry, but I don't exactly have a reason to respond to this series earlier
than others more directly affected, even less so when it's an RFC.
My feedback is strictly from the point of view of the "other" drivers
who don't care about context-specific interpretations of the CPU port
OF node. For them it doesn't make sense to have "default_interface" an
argument of phylink_get_caps.

Also about the total waste of effort (or at least time), it's not at all
obvious to me that if I had provided more feedback earlier, this series
would have been done with even one day earlier, considering you've stated
at least twice that you're waiting for a reply from Andrew, which didn't come.

> 
> Now, on your point... the series posted on the 24th June was using
> the mv88e6xxx port_max_speed_interface() but discussion off the mailing
> list:
> 
> 20:19 < rmk> kabel: hmm, is mv88e6393x_port_max_speed_mode() correct?
> 20:20 < rmk> it seems to be suggesting to use PHY_INTERFACE_MODE_10GBASER for
>              port 9
> 09:50 < kabel> rmk: yes, 10gbase-r is correct for 6393x. But we need to add
>                exception for 6191x, as is done in chip.c function
>                mv88e6393x_phylink_get_caps()
> 09:51 < kabel> rmk: on 6191x only port 10 supports >1g speeds
> 11:51 < rmk> kabel: moving it into the get_caps function makes it easier to set
>              the default_interfaces for 6193x
> 14:20 < kabel> rmk: yes, get_caps doing it would be better
> 
> The problem is this - we call get_caps(), and we have to read registers
> to work out what the port supports. If we have a separate callback, then
> we need to re-read those registers to get the same information to report
> what the default interface should be.
> 
> Since almost all of the Marvell implementations the values for both the
> list of supported interfaces and the default interface both require
> reading a register and translating it to a phy_interface_t, and then
> setting the support mask, it seems logical to combine these two
> functioalities into one function.

In essence that doesn't mean much; DSA isn't Marvell only, but I'll give
it to you: if only the Marvell driver (and Broadcom later, I expect) is
going to add support for the context-specific interpretation of CPU port
OF nodes, then we may consider tailoring the implementation to their
hardware register layout details. In any case, my concern can be
addressed even if you insist on keeping the default interface as an
argument of phylink_get_caps. There just needs to be a lot more
documentation explaining who needs to populate that argument and why.

Also, perhaps more importantly, a real effort needs to be put to prevent
breakage for drivers that work without a phylink instance registered for
the CPU port, and also don't report the default interface. Practically
that just means not deleting the current logic, but making it one of 3
options.

fwnode is valid from phylink's perspective?
       /                             \
 yes  /                               \ no
     /                                 \
register with phylink         can we determine the link parameters to create
                                  a fixed-link software node?
                                       /                \                     \
                                 yes  /                  \  no                |
                                     /                    \                   | this is missing
                                    /                      \                  |
             create the software node and       don't put the port down,      |
             register with phylink              don't register with phylink   /
Russell King (Oracle) July 15, 2022, 10:57 p.m. UTC | #4
On Sat, Jul 16, 2022 at 01:23:48AM +0300, Vladimir Oltean wrote:
> On Fri, Jul 15, 2022 at 10:31:17PM +0100, Russell King (Oracle) wrote:
> > On Fri, Jul 15, 2022 at 08:24:44PM +0300, Vladimir Oltean wrote:
> > > On Fri, Jul 15, 2022 at 05:01:37PM +0100, Russell King (Oracle) wrote:
> > > > DSA port bindings allow for an optional phy interface mode. When an
> > > > interface mode is not specified, DSA uses the NA interface mode type.
> > > > 
> > > > However, phylink needs to know the parameters of the link, and this
> > > > will become especially important when using phylink for ports that
> > > > are devoid of all properties except the required "reg" property, so
> > > > that phylink can select the maximum supported link settings. Without
> > > > knowing the interface mode, phylink can't truely know the maximum
> > > > link speed.
> > > > 
> > > > Update the prototype for the phylink_get_caps method to allow drivers
> > > > to report this information back to DSA, and update all DSA
> > > > implementations function declarations to cater for this change. No
> > > > code is added to the implementations.
> > > > 
> > > > Reviewed-by: Marek Behún <kabel@kernel.org>
> > > > Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> > > > ---
> > > (...)
> > > > diff --git a/include/net/dsa.h b/include/net/dsa.h
> > > > index b902b31bebce..7c6870d2c607 100644
> > > > --- a/include/net/dsa.h
> > > > +++ b/include/net/dsa.h
> > > > @@ -852,7 +852,8 @@ struct dsa_switch_ops {
> > > >  	 * PHYLINK integration
> > > >  	 */
> > > >  	void	(*phylink_get_caps)(struct dsa_switch *ds, int port,
> > > > -				    struct phylink_config *config);
> > > > +				    struct phylink_config *config,
> > > > +				    phy_interface_t *default_interface);
> > > 
> > > I would prefer having a dedicated void (*port_max_speed_interface),
> > > because the post-phylink DSA drivers (which are not few) will generally
> > > not need to concern themselves with implementing this, and I don't want
> > > driver writers to think they need to populate every parameter they see
> > > in phylink_get_caps. So the new function needs to be documented
> > > appropriately (specify who needs and who does not need to implement it,
> > > on which ports it will be called, etc).
> > > 
> > > In addition, if we have a dedicated ds->ops->port_max_speed_interface(),
> > > we can do a better job of avoiding breakage with this patch set, since
> > > if DSA cannot find a valid phylink fwnode, AND there is no
> > > port_max_speed_interface() callback for this driver, DSA can still
> > > preserve the current logic of not putting the port down, and not
> > > registering it with phylink. That can be accompanied by a dev_warn() to
> > > state that the CPU/DSA port isn't registered with phylink, please
> > > implement port_max_speed_interface() to address that.
> > 
> > To continue my previous email...
> > 
> > This is a great illustration why posting RFC series is a waste of time.
> > This patch was posted as RFC on:
> > 
> > 24th June
> > 29th June
> > 5th July
> > 13th July
> > 
> > Only when it's been posted today has there been a concern raised about
> > the approach. So, what's the use of asking for comments if comments only
> > come when patches are posted for merging. None what so ever. So, we've
> > lost the last three weeks because I decided to "be kind" and post RFC.
> > Total waste of effort.
> 
> Sorry, but I don't exactly have a reason to respond to this series earlier
> than others more directly affected, even less so when it's an RFC.
> My feedback is strictly from the point of view of the "other" drivers
> who don't care about context-specific interpretations of the CPU port
> OF node. For them it doesn't make sense to have "default_interface" an
> argument of phylink_get_caps.
> 
> Also about the total waste of effort (or at least time), it's not at all
> obvious to me that if I had provided more feedback earlier, this series
> would have been done with even one day earlier, considering you've stated
> at least twice that you're waiting for a reply from Andrew, which didn't come.

I've given up waiting, basically. I think anyone reasonable will also
have decided the same thing. As I've said, Jakub seems to have given up
waiting for people to review my RFC patches to - this is what Jakub
said to me last week on my RFC series:

| IIUC the folks we expect to pay attention/test have already done so,
| all we can do now is apply and deal with the reports. 5.21 is more
| likely to be LTS than 5.20, right? So no point delaying.

And I really am at the point of agreeing with this - people have had
more than sufficient time to comment, and if they haven't by now, either
they don't care or they're just being difficult and intentionally trying
to stall development effort.

> > Now, on your point... the series posted on the 24th June was using
> > the mv88e6xxx port_max_speed_interface() but discussion off the mailing
> > list:
> > 
> > 20:19 < rmk> kabel: hmm, is mv88e6393x_port_max_speed_mode() correct?
> > 20:20 < rmk> it seems to be suggesting to use PHY_INTERFACE_MODE_10GBASER for
> >              port 9
> > 09:50 < kabel> rmk: yes, 10gbase-r is correct for 6393x. But we need to add
> >                exception for 6191x, as is done in chip.c function
> >                mv88e6393x_phylink_get_caps()
> > 09:51 < kabel> rmk: on 6191x only port 10 supports >1g speeds
> > 11:51 < rmk> kabel: moving it into the get_caps function makes it easier to set
> >              the default_interfaces for 6193x
> > 14:20 < kabel> rmk: yes, get_caps doing it would be better
> > 
> > The problem is this - we call get_caps(), and we have to read registers
> > to work out what the port supports. If we have a separate callback, then
> > we need to re-read those registers to get the same information to report
> > what the default interface should be.
> > 
> > Since almost all of the Marvell implementations the values for both the
> > list of supported interfaces and the default interface both require
> > reading a register and translating it to a phy_interface_t, and then
> > setting the support mask, it seems logical to combine these two
> > functioalities into one function.
> 
> In essence that doesn't mean much; DSA isn't Marvell only, but I'll give
> it to you: if only the Marvell driver (and Broadcom later, I expect) is
> going to add support for the context-specific interpretation of CPU port
> OF nodes, then we may consider tailoring the implementation to their
> hardware register layout details. In any case, my concern can be
> addressed even if you insist on keeping the default interface as an
> argument of phylink_get_caps. There just needs to be a lot more
> documentation explaining who needs to populate that argument and why.

I don't get the point you're making here.

> Also, perhaps more importantly, a real effort needs to be put to prevent
> breakage for drivers that work without a phylink instance registered for
> the CPU port, and also don't report the default interface. Practically
> that just means not deleting the current logic, but making it one of 3
> options.
> 
> fwnode is valid from phylink's perspective?
>        /                             \
>  yes  /                               \ no
>      /                                 \
> register with phylink         can we determine the link parameters to create
>                                   a fixed-link software node?
>                                        /                \                     \
>                                  yes  /                  \  no                |
>                                      /                    \                   | this is missing
>                                     /                      \                  |
>              create the software node and       don't put the port down,      |
>              register with phylink              don't register with phylink   /

This is exactly what we have today, and is exactly what I'm trying to
get rid of, so we have _consistency_ in the implementation, to prevent
fuckups like I've created by converting many DSA drivers to use
phylink_pcs. Any DSA driver that used a PCS for the DSA or CPu port and
has been converted to phylink_pcs support has been broken in the last
few kernel cycles. I'm trying to address that breakage before
converting the Marvell DSA driver - which is the driver that highlighted
the problem.

We need to move away from the current model in DSA where we only use
stuff in random situations.

Well, at this point, I'm just going to give up with this kernel cycle.
It seems impossible to get this sorted. It seems impossible to move
forward with the conversion of Marvell DSA to phylink_pcs. In fact,
I might just give up with the idea of further phylink development
because it's just too fucking difficult, and getting feedback is just
impossible.
Vladimir Oltean July 16, 2022, 10:57 a.m. UTC | #5
On Fri, Jul 15, 2022 at 11:57:20PM +0100, Russell King (Oracle) wrote:
> > > The problem is this - we call get_caps(), and we have to read registers
> > > to work out what the port supports. If we have a separate callback, then
> > > we need to re-read those registers to get the same information to report
> > > what the default interface should be.
> > > 
> > > Since almost all of the Marvell implementations the values for both the
> > > list of supported interfaces and the default interface both require
> > > reading a register and translating it to a phy_interface_t, and then
> > > setting the support mask, it seems logical to combine these two
> > > functioalities into one function.
> > 
> > In essence that doesn't mean much; DSA isn't Marvell only, but I'll give
> > it to you: if only the Marvell driver (and Broadcom later, I expect) is
> > going to add support for the context-specific interpretation of CPU port
> > OF nodes, then we may consider tailoring the implementation to their
> > hardware register layout details. In any case, my concern can be
> > addressed even if you insist on keeping the default interface as an
> > argument of phylink_get_caps. There just needs to be a lot more
> > documentation explaining who needs to populate that argument and why.
> 
> I don't get the point you're making here.

The point I'm making is that I dislike where this is going. The addition
of "default_interface" to phylink_get_caps is confusing because it lacks
proper qualifiers.

The concrete reasons why it's confusing are:

(a) there is no comment which specifies on which kinds of ports (DSA and CPU)
    the default_interface will be used. This might result in useless effort
    from driver authors to report a default_interface for a port where it
    will never be asked for.

(b) there is no comment which specifies that only the drivers which have
    DT blobs with missing phylink bindings on CPU and DSA ports should
    fill this out. I wouldn't want to see a new driver use this facility,
    I just don't see a reason for it. I'd rather see a comment that the
    recommendation for new drivers is to validate their bindings and not
    rely on context-specific interpretations of empty DT nodes.

(c) especially with the dsa_port_find_max_caps() heuristic in place, I
    can't say I'm clear at all on who should populate "default_interface"
    and who could safely rely on the heuristic if they populate
    supported_interfaces. It's simply put unclear what is the expectation
    from driver authors.

For (b) I was thinking that making it a separate function would make it
clearer that it isn't for everyone. Doing just that wouldn't solve everything,
so I've also said that adding more documentation to this function
prototype would go a long way.

Some dsa_switch_ops already have inline comments in include/net/dsa.h,
see get_tag_protocol, change_tag_protocol, port_change_mtu. Also, there
is the the "PHY devices and link management" chapter in Documentation/networking/dsa/dsa.rst.
We have places to document what the DSA framework expects drivers to do.
I was expecting that wherever default_interface gets reported, we could
see some answers and explanations to the questions above.

> > Also, perhaps more importantly, a real effort needs to be put to prevent
> > breakage for drivers that work without a phylink instance registered for
> > the CPU port, and also don't report the default interface. Practically
> > that just means not deleting the current logic, but making it one of 3
> > options.
> > 
> > fwnode is valid from phylink's perspective?
> >        /                             \
> >  yes  /                               \ no
> >      /                                 \
> > register with phylink         can we determine the link parameters to create
> >                                   a fixed-link software node?
> >                                        /                \                     \
> >                                  yes  /                  \  no                |
> >                                      /                    \                   | this is missing
> >                                     /                      \                  |
> >              create the software node and       don't put the port down,      |
> >              register with phylink              don't register with phylink   /
> 
> This is exactly what we have today,

Wait a minute, how come this is exactly what we have "today"?

In tree we have this:

fwnode is valid from phylink's perspective?
       /                             \
 yes  /                               \  no
     /                                 \
register with phylink                   \
                             don't put the port down,
                             don't register with phylink


In your patch set we have this:


fwnode is valid from phylink's perspective?
       /                             \
 yes  /                               \ no
     /                                 \
register with phylink         can we determine the link parameters to create
                                  a fixed-link software node?
                                       /                \
                                 yes  /                  \  no
                                     /                    \
                                    /                      \
             create the software node and            fail to create the port
             register with phylink

> and is exactly what I'm trying to get rid of, so we have _consistency_
> in the implementation, to prevent fuckups like I've created by
> converting many DSA drivers to use phylink_pcs. Any DSA driver that
> used a PCS for the DSA or CPu port and has been converted to
> phylink_pcs support has been broken in the last few kernel cycles. I'm
> trying to address that breakage before converting the Marvell DSA
> driver - which is the driver that highlighted the problem.

You are essentially saying that it's of no use to keep in DSA the
fallback logic of not registering with phylink, because the phylink_pcs
conversions have broken the defaulting functionality already in all
other drivers.

I may have missed something, but this is new information to me.
Specifically, before you've said that it is *this* patch set which would
risk introducing breakage (by forcing a link down + a phylink creation).
https://lore.kernel.org/netdev/YsCqFM8qM1h1MKu%2F@shell.armlinux.org.uk/
What you're saying now directly contradicts that.

Do you have concrete evidence that there is actually any regression of
this kind introduced by prior phylink_pcs conversions? Because if there
is, I retract the proposal to keep the fallback logic.

> We need to move away from the current model in DSA where we only use
> stuff in random situations.
> 
> Well, at this point, I'm just going to give up with this kernel cycle.
> It seems impossible to get this sorted. It seems impossible to move
> forward with the conversion of Marvell DSA to phylink_pcs. In fact,
> I might just give up with the idea of further phylink development
> because it's just too fucking difficult, and getting feedback is just
> impossible.
> 
> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Russell King (Oracle) July 16, 2022, 11:13 a.m. UTC | #6
On Sat, Jul 16, 2022 at 01:57:11PM +0300, Vladimir Oltean wrote:
> On Fri, Jul 15, 2022 at 11:57:20PM +0100, Russell King (Oracle) wrote:
> > > > The problem is this - we call get_caps(), and we have to read registers
> > > > to work out what the port supports. If we have a separate callback, then
> > > > we need to re-read those registers to get the same information to report
> > > > what the default interface should be.
> > > > 
> > > > Since almost all of the Marvell implementations the values for both the
> > > > list of supported interfaces and the default interface both require
> > > > reading a register and translating it to a phy_interface_t, and then
> > > > setting the support mask, it seems logical to combine these two
> > > > functioalities into one function.
> > > 
> > > In essence that doesn't mean much; DSA isn't Marvell only, but I'll give
> > > it to you: if only the Marvell driver (and Broadcom later, I expect) is
> > > going to add support for the context-specific interpretation of CPU port
> > > OF nodes, then we may consider tailoring the implementation to their
> > > hardware register layout details. In any case, my concern can be
> > > addressed even if you insist on keeping the default interface as an
> > > argument of phylink_get_caps. There just needs to be a lot more
> > > documentation explaining who needs to populate that argument and why.
> > 
> > I don't get the point you're making here.
> 
> The point I'm making is that I dislike where this is going. The addition
> of "default_interface" to phylink_get_caps is confusing because it lacks
> proper qualifiers.
> 
> The concrete reasons why it's confusing are:
> 
> (a) there is no comment which specifies on which kinds of ports (DSA and CPU)
>     the default_interface will be used. This might result in useless effort
>     from driver authors to report a default_interface for a port where it
>     will never be asked for.
> 
> (b) there is no comment which specifies that only the drivers which have
>     DT blobs with missing phylink bindings on CPU and DSA ports should
>     fill this out. I wouldn't want to see a new driver use this facility,
>     I just don't see a reason for it. I'd rather see a comment that the
>     recommendation for new drivers is to validate their bindings and not
>     rely on context-specific interpretations of empty DT nodes.
> 
> (c) especially with the dsa_port_find_max_caps() heuristic in place, I
>     can't say I'm clear at all on who should populate "default_interface"
>     and who could safely rely on the heuristic if they populate
>     supported_interfaces. It's simply put unclear what is the expectation
>     from driver authors.
> 
> For (b) I was thinking that making it a separate function would make it
> clearer that it isn't for everyone. Doing just that wouldn't solve everything,
> so I've also said that adding more documentation to this function
> prototype would go a long way.
> 
> Some dsa_switch_ops already have inline comments in include/net/dsa.h,
> see get_tag_protocol, change_tag_protocol, port_change_mtu. Also, there
> is the the "PHY devices and link management" chapter in Documentation/networking/dsa/dsa.rst.
> We have places to document what the DSA framework expects drivers to do.
> I was expecting that wherever default_interface gets reported, we could
> see some answers and explanations to the questions above.

Thanks for clarifying.

> > > Also, perhaps more importantly, a real effort needs to be put to prevent
> > > breakage for drivers that work without a phylink instance registered for
> > > the CPU port, and also don't report the default interface. Practically
> > > that just means not deleting the current logic, but making it one of 3
> > > options.
> > > 
> > > fwnode is valid from phylink's perspective?
> > >        /                             \
> > >  yes  /                               \ no
> > >      /                                 \
> > > register with phylink         can we determine the link parameters to create
> > >                                   a fixed-link software node?
> > >                                        /                \                     \
> > >                                  yes  /                  \  no                |
> > >                                      /                    \                   | this is missing
> > >                                     /                      \                  |
> > >              create the software node and       don't put the port down,      |
> > >              register with phylink              don't register with phylink   /
> > 
> > This is exactly what we have today,
> 
> Wait a minute, how come this is exactly what we have "today"?
> 
> In tree we have this:
> 
> fwnode is valid from phylink's perspective?
>        /                             \
>  yes  /                               \  no
>      /                                 \
> register with phylink                   \
>                              don't put the port down,
>                              don't register with phylink
> 
> 
> In your patch set we have this:
> 
> 
> fwnode is valid from phylink's perspective?
>        /                             \
>  yes  /                               \ no
>      /                                 \
> register with phylink         can we determine the link parameters to create
>                                   a fixed-link software node?
>                                        /                \
>                                  yes  /                  \  no
>                                      /                    \
>                                     /                      \
>              create the software node and            fail to create the port
>              register with phylink
> 
> > and is exactly what I'm trying to get rid of, so we have _consistency_
> > in the implementation, to prevent fuckups like I've created by
> > converting many DSA drivers to use phylink_pcs. Any DSA driver that
> > used a PCS for the DSA or CPu port and has been converted to
> > phylink_pcs support has been broken in the last few kernel cycles. I'm
> > trying to address that breakage before converting the Marvell DSA
> > driver - which is the driver that highlighted the problem.
> 
> You are essentially saying that it's of no use to keep in DSA the
> fallback logic of not registering with phylink, because the phylink_pcs
> conversions have broken the defaulting functionality already in all
> other drivers.

Correct, and I don't want these exceptions precisely because it creates
stupid mistakes like the one I've highlighted. If we have one way of
doing something, then development becomes much easier. When we have
multiple different ways, mistakes happen, stuff breaks.

> I may have missed something, but this is new information to me.
> Specifically, before you've said that it is *this* patch set which would
> risk introducing breakage (by forcing a link down + a phylink creation).
> https://lore.kernel.org/netdev/YsCqFM8qM1h1MKu%2F@shell.armlinux.org.uk/
> What you're saying now directly contradicts that.

No, it is not contradictory at all.

There is previous breakage caused by converting DSA drivers to
phylink_pcs - because the PCS code will *not* be called where a driver
makes use of the "default-to-fastest-speed" mechanism, so is likely
broken. Some drivers work around this by doing things manually.

This series *also* risks breakage, because it means phylink gets used
in more situations which could confuse drivers - such as those which
have manually brought stuff up and are not expecting phylink to also
do it. Or those which do not fill in the mac_capabilities but do
default to this "fastest-speed" mechanism. Or some other unforseen
scenario.

So there's known breakage - which we know because we've tripped over
the issue with mv88e6xxx pcs conversion which isn't in net-next yet,
but illustrates the issue, and there's unknown breakage through
applying this patch and not having had test feedback from all the
DSA driver authors.

There are no contradiction there what so ever.

> Do you have concrete evidence that there is actually any regression of
> this kind introduced by prior phylink_pcs conversions? Because if there
> is, I retract the proposal to keep the fallback logic.

Since we have no idea which DSA drivers make use of this "default-to-
fastest-speed", and Andrew who has been promoting this doesn't seem to
know which drivers make use of this, I do not, but we know that the
breakage does occur if someone does make use of this with one of the
converted DSA drivers through the behaviour of mv88e6xxx. Just because
no one has reported it yet does not mean it doesn't exist. Not everyone
updates their kernels each time Linus releases a final kernel version,
especially in the embedded world. Sometimes it can take until a LTS
release until people move forward, which we think will be 5.21.
Vladimir Oltean July 16, 2022, 12:36 p.m. UTC | #7
On Sat, Jul 16, 2022 at 12:13:55PM +0100, Russell King (Oracle) wrote:
> > > and is exactly what I'm trying to get rid of, so we have _consistency_
> > > in the implementation, to prevent fuckups like I've created by
> > > converting many DSA drivers to use phylink_pcs. Any DSA driver that
> > > used a PCS for the DSA or CPu port and has been converted to
> > > phylink_pcs support has been broken in the last few kernel cycles. I'm
> > > trying to address that breakage before converting the Marvell DSA
> > > driver - which is the driver that highlighted the problem.
> > 
> > You are essentially saying that it's of no use to keep in DSA the
> > fallback logic of not registering with phylink, because the phylink_pcs
> > conversions have broken the defaulting functionality already in all
> > other drivers.
> 
> Correct, and I don't want these exceptions precisely because it creates
> stupid mistakes like the one I've highlighted. If we have one way of
> doing something, then development becomes much easier. When we have
> multiple different ways, mistakes happen, stuff breaks.

Agree that when we have multiple different ways, mistakes happen and
stuff breaks. This is also why I want to avoid the proliferation of the
default_interface reporting when most of the drivers were just fine
without it. It needs to be opt-in, and checking for the presence of a
dedicated function is an easy way to check for that opt-in.

> > I may have missed something, but this is new information to me.
> > Specifically, before you've said that it is *this* patch set which would
> > risk introducing breakage (by forcing a link down + a phylink creation).
> > https://lore.kernel.org/netdev/YsCqFM8qM1h1MKu%2F@shell.armlinux.org.uk/
> > What you're saying now directly contradicts that.
> 
> No, it is not contradictory at all.
> 
> There is previous breakage caused by converting DSA drivers to
> phylink_pcs - because the PCS code will *not* be called where a driver
> makes use of the "default-to-fastest-speed" mechanism, so is likely
> broken. Some drivers work around this by doing things manually.

Marvell conversion to phylink_pcs doesn't count as "previous" breakage
if it's not in net-next.

> This series *also* risks breakage, because it means phylink gets used
> in more situations which could confuse drivers - such as those which
> have manually brought stuff up and are not expecting phylink to also
> do it. Or those which do not fill in the mac_capabilities but do
> default to this "fastest-speed" mechanism. Or some other unforseen
> scenario.

I don't exactly understand the mechanics through which a phylink_pcs
conversion would break a driver that used defaulting behavior,
but I can only imagine it involves moving code that didn't depend on
phylink, to phylink callbacks. It's hard to say whether that happened
for any of the phylink_pcs conversions in net-next.

But drivers could also have their CPU port working simply because those
are internal to an SoC and don't need any software configuration to pass
traffic. In their case, there is no breakage caused by the phylink_pcs
conversion, but breakage caused by sudden registration of phylink is
plausible, if phylink doesn't get the link parameters right.

And that breakage is preventable. Gradually more drivers could be
converted to create a fixed-link software node by printing a warning
that they should, and still keep the logic to avoid phylink registration
and putting the respective port down. Driver authors might not be very
responsive to RFC patch sets, but they do look at new warnings in dmesg
and try to see what they're about.

What I'm missing is the proof that the phylink_pcs conversion has broken
those kinds of switches, and that it's therefore pointless to keep the
existing logic for them. Sorry, but you didn't provide it.

> So there's known breakage - which we know because we've tripped over
> the issue with mv88e6xxx pcs conversion which isn't in net-next yet,
> but illustrates the issue, and there's unknown breakage through
> applying this patch and not having had test feedback from all the
> DSA driver authors.
> 
> There are no contradiction there what so ever.
> 
> > Do you have concrete evidence that there is actually any regression of
> > this kind introduced by prior phylink_pcs conversions? Because if there
> > is, I retract the proposal to keep the fallback logic.
> 
> Since we have no idea which DSA drivers make use of this "default-to-
> fastest-speed", and Andrew who has been promoting this doesn't seem to
> know which drivers make use of this, I do not, but we know that the
> breakage does occur if someone does make use of this with one of the
> converted DSA drivers through the behaviour of mv88e6xxx. Just because
> no one has reported it yet does not mean it doesn't exist. Not everyone
> updates their kernels each time Linus releases a final kernel version,
> especially in the embedded world. Sometimes it can take until a LTS
> release until people move forward, which we think will be 5.21.
Russell King (Oracle) July 18, 2022, 8:48 a.m. UTC | #8
On Sat, Jul 16, 2022 at 03:36:08PM +0300, Vladimir Oltean wrote:
> On Sat, Jul 16, 2022 at 12:13:55PM +0100, Russell King (Oracle) wrote:
> > > > and is exactly what I'm trying to get rid of, so we have _consistency_
> > > > in the implementation, to prevent fuckups like I've created by
> > > > converting many DSA drivers to use phylink_pcs. Any DSA driver that
> > > > used a PCS for the DSA or CPu port and has been converted to
> > > > phylink_pcs support has been broken in the last few kernel cycles. I'm
> > > > trying to address that breakage before converting the Marvell DSA
> > > > driver - which is the driver that highlighted the problem.
> > > 
> > > You are essentially saying that it's of no use to keep in DSA the
> > > fallback logic of not registering with phylink, because the phylink_pcs
> > > conversions have broken the defaulting functionality already in all
> > > other drivers.
> > 
> > Correct, and I don't want these exceptions precisely because it creates
> > stupid mistakes like the one I've highlighted. If we have one way of
> > doing something, then development becomes much easier. When we have
> > multiple different ways, mistakes happen, stuff breaks.
> 
> Agree that when we have multiple different ways, mistakes happen and
> stuff breaks. This is also why I want to avoid the proliferation of the
> default_interface reporting when most of the drivers were just fine
> without it. It needs to be opt-in, and checking for the presence of a
> dedicated function is an easy way to check for that opt-in.
> 
> > > I may have missed something, but this is new information to me.
> > > Specifically, before you've said that it is *this* patch set which would
> > > risk introducing breakage (by forcing a link down + a phylink creation).
> > > https://lore.kernel.org/netdev/YsCqFM8qM1h1MKu%2F@shell.armlinux.org.uk/
> > > What you're saying now directly contradicts that.
> > 
> > No, it is not contradictory at all.
> > 
> > There is previous breakage caused by converting DSA drivers to
> > phylink_pcs - because the PCS code will *not* be called where a driver
> > makes use of the "default-to-fastest-speed" mechanism, so is likely
> > broken. Some drivers work around this by doing things manually.
> 
> Marvell conversion to phylink_pcs doesn't count as "previous" breakage
> if it's not in net-next.

Thank you for stating the obvious. I never claimed it was.

> > This series *also* risks breakage, because it means phylink gets used
> > in more situations which could confuse drivers - such as those which
> > have manually brought stuff up and are not expecting phylink to also
> > do it. Or those which do not fill in the mac_capabilities but do
> > default to this "fastest-speed" mechanism. Or some other unforseen
> > scenario.
> 
> I don't exactly understand the mechanics through which a phylink_pcs
> conversion would break a driver that used defaulting behavior,
> but I can only imagine it involves moving code that didn't depend on
> phylink, to phylink callbacks. It's hard to say whether that happened
> for any of the phylink_pcs conversions in net-next.

Situation before phylink_pcs conversion:
- code paths that configure the PCS are integrated with the rest of the
  driver and would be called where appropriate.

Situation after phylink_pcs conversion:
- code paths that configure the PCS are no longer integrated with the
  rest of the driver and are now dependent on phylink being used for
  them to be called.

If there is no problem with doing this conversion, then why has the
proposed conversion of mv88e6xxx caused breakage when it was working
fine previously - it's because of exactly the above.

> But drivers could also have their CPU port working simply because those
> are internal to an SoC and don't need any software configuration to pass
> traffic. In their case, there is no breakage caused by the phylink_pcs
> conversion, but breakage caused by sudden registration of phylink is
> plausible, if phylink doesn't get the link parameters right.
> 
> And that breakage is preventable. Gradually more drivers could be
> converted to create a fixed-link software node by printing a warning
> that they should, and still keep the logic to avoid phylink registration
> and putting the respective port down. Driver authors might not be very
> responsive to RFC patch sets, but they do look at new warnings in dmesg
> and try to see what they're about.

Are you going to do that conversion then? Good luck trying to find all
the drivers, sending out series of patches, trying to get people to test
the changes.

> What I'm missing is the proof that the phylink_pcs conversion has broken
> those kinds of switches, and that it's therefore pointless to keep the
> existing logic for them. Sorry, but you didn't provide it.

I don't have evidence that existing drivers have broken because I don't
have the hardware to test. I only have Marvell DSA switch hardware and
that is it.

Everything else has to be based on theory because no one bothers to
respond to my patches, so for 99% of the DSA drivers I'm working in the
dark - and that makes development an utter shitpile of poo in hell.

As I've said many times, we have NO CLUE which DSA drivers make use of
this defaulting behaviour - the only one I'm aware of that does is
mv88e6xxx. It all depends on the firmware description, driver behaviour
and hardware behaviour. There is not enough information in the kernel 
to be able to derive this.

If there was a reported regression, then I would be looking to get this
into the NET tree not the NET-NEXT tree.
Vladimir Oltean July 20, 2022, 10:44 p.m. UTC | #9
On Mon, Jul 18, 2022 at 09:48:51AM +0100, Russell King (Oracle) wrote:
> > But drivers could also have their CPU port working simply because those
> > are internal to an SoC and don't need any software configuration to pass
> > traffic. In their case, there is no breakage caused by the phylink_pcs
> > conversion, but breakage caused by sudden registration of phylink is
> > plausible, if phylink doesn't get the link parameters right.
> > 
> > And that breakage is preventable. Gradually more drivers could be
> > converted to create a fixed-link software node by printing a warning
> > that they should, and still keep the logic to avoid phylink registration
> > and putting the respective port down. Driver authors might not be very
> > responsive to RFC patch sets, but they do look at new warnings in dmesg
> > and try to see what they're about.
> 
> Are you going to do that conversion then? Good luck trying to find all
> the drivers, sending out series of patches, trying to get people to test
> the changes.

Not sure if that's a rhetorical question and if it is, what it's trying
to prove. Of course I'm not going to do any conversion, that's literally
my whole point, I'd rather err on the side of letting sleeping dogs cry
than force-converting everyone at once.

If there is a breakage report and the driver maintainer won't respond,
then yeah, maybe I'll consider looking at that particular issue and
converting if it helps, but that's kind of why I'm here.

Otherwise, we may end up pushing phylink to drivers which have some
wacky link speeds on the CPU port (like 2000, see b53_force_port_config),
which the software node auto-creation logic absolutely won't get right.
I know Florian said that "we won't see a regression since we do not use
the NATP accelerator which would be the reason to run the port at
2Gbits/sec", but frankly I'm not entirely sure what that even means or
what Florian counts as "regression". Lower overall termination throughput
counts or not? Still not worth risking if this isn't the only instance
of a non-standard speed.

> > What I'm missing is the proof that the phylink_pcs conversion has broken
> > those kinds of switches, and that it's therefore pointless to keep the
> > existing logic for them. Sorry, but you didn't provide it.
> 
> I don't have evidence that existing drivers have broken because I don't
> have the hardware to test. I only have Marvell DSA switch hardware and
> that is it.
> 
> Everything else has to be based on theory because no one bothers to
> respond to my patches, so for 99% of the DSA drivers I'm working in the
> dark - and that makes development an utter shitpile of poo in hell.
> 
> As I've said many times, we have NO CLUE which DSA drivers make use of
> this defaulting behaviour - the only one I'm aware of that does is
> mv88e6xxx. It all depends on the firmware description, driver behaviour
> and hardware behaviour. There is not enough information in the kernel 
> to be able to derive this.
> 
> If there was a reported regression, then I would be looking to get this
> into the NET tree not the NET-NEXT tree.

I think there are authors who weren't even aware they were opting into
this interesting DSA feature when they wrote their driver, but didn't
have the inspiration at the time to validate strict DT bindings either.

We have no proof that they don't have DT blobs using the defaulting
feature somewhere out there, but we don't have any proof that they do,
either. The whole problem could become more manageable if we would let
maintainers say to a user "hey, for my driver this feature was never
intended to work, so sorry if by accident it did, it was a marginal and
undocumented condition and now it's broken, so please use something that
was documented".

The ocelot driver is in this exact state, in fact. I really wish there
was a ready-made helper for validating phylink's OF node; I mentioned this
already, it needs to cater for all of fixed-link/phy-handle/managed/sfp.
The more drivers would be calling this (I think the vast majority of
post-phylink drivers would do it), the more we'd minimize the spectrum
of unforeseen breakage. In turn this would make the issue more tractable.
Vladimir Oltean July 21, 2022, 1:46 p.m. UTC | #10
On Thu, Jul 21, 2022 at 01:44:47AM +0300, Vladimir Oltean wrote:
> I really wish there was a ready-made helper for validating phylink's
> OF node; I mentioned this already, it needs to cater for all of
> fixed-link/phy-handle/managed/sfp.

While I was going to expand on this point and state that DSA doesn't
currently instantiate phylink for this OF node:

			port@9 {
				reg = <0x9>;
				label = "cpu";
				ethernet = <&eth1>;
				phy-mode = "2500base-x";
				managed = "in-band-status";
			};

I was proven wrong. Today I learned that of_phy_is_fixed_link() returns
true if the "managed" property exists and its value differs from "auto".
So in the above case, of_phy_is_fixed_link() returns true, hmmm.



On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
the switch, a "marvell,mv88e6190"-compatible (can't determine going just
by that what it actually is) has this:

			port@a {
				reg = <10>;
				label = "cpu";
				ethernet = <&cp0_eth0>;
			};

To illustrate how odd the situation is, I am able to follow the phandle
to the CPU port and find a comment that it's a 88E6393X, and that the
CPU port uses managed = "in-band-status":

&cp0_eth0 {
	/* This port is connected to 88E6393X switch */
	status = "okay";
	phy-mode = "10gbase-r";
	managed = "in-band-status";
	phys = <&cp0_comphy4 0>;
};

Open question: is it sane to even do what we're trying here, to create a
fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
involved drivers were to behave correctly and not have bugs that cancel
out other bugs, the above device tree shouldn't work. The host port
would expect a clause 37 base page exchange to take place, the switch
wouldn't send any in-band information, and the SERDES lane would never
transition to data mode. To fix the above, we'd really need to chase the
"ethernet" phandle and attempt to mimic what the DSA master did. This is
indeed logic that never existed before, and I don't particularly feel
like adding it. How far do we want to go? It seems like never-ending
insanity the more I look at it.
Andrew Lunn July 21, 2022, 2:46 p.m. UTC | #11
> On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
> the switch, a "marvell,mv88e6190"-compatible (can't determine going just
> by that what it actually is) has this:
> 
> 			port@a {
> 				reg = <10>;
> 				label = "cpu";
> 				ethernet = <&cp0_eth0>;
> 			};

Both CPU and DSA ports default to their maximum speed, if nothing else
is specified. If this is a 6393X, port 10 can do 10Gbps, and that is
how the port will be configured by the driver. It is undefined how it
actually implement this maximum speed, if there are multiple choices,
if in band is enabled or not etc. This is historical, the first
mv88e6xxx devices had a mixture of Fast and 1G ethernet interfaces,
and it was simply to choosing between MII and GMII. The platform data
at that time had no way to express link information, and this simple
default mechanism was enough to get boards of the time working.

> To illustrate how odd the situation is, I am able to follow the phandle
> to the CPU port and find a comment that it's a 88E6393X, and that the
> CPU port uses managed = "in-band-status":
> 
> &cp0_eth0 {
> 	/* This port is connected to 88E6393X switch */
> 	status = "okay";
> 	phy-mode = "10gbase-r";
> 	managed = "in-band-status";
> 	phys = <&cp0_comphy4 0>;
> };
> 
> Open question: is it sane to even do what we're trying here, to create a
> fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
> when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
> involved drivers were to behave correctly

Define 'correctly', given that some of these drivers and devices have
been around much longer than device tree, etc.

     Andrew
Russell King (Oracle) July 21, 2022, 2:54 p.m. UTC | #12
On Thu, Jul 21, 2022 at 04:46:18PM +0300, Vladimir Oltean wrote:
> On Thu, Jul 21, 2022 at 01:44:47AM +0300, Vladimir Oltean wrote:
> > I really wish there was a ready-made helper for validating phylink's
> > OF node; I mentioned this already, it needs to cater for all of
> > fixed-link/phy-handle/managed/sfp.
> 
> While I was going to expand on this point and state that DSA doesn't
> currently instantiate phylink for this OF node:
> 
> 			port@9 {
> 				reg = <0x9>;
> 				label = "cpu";
> 				ethernet = <&eth1>;
> 				phy-mode = "2500base-x";
> 				managed = "in-band-status";
> 			};
> 
> I was proven wrong. Today I learned that of_phy_is_fixed_link() returns
> true if the "managed" property exists and its value differs from "auto".
> So in the above case, of_phy_is_fixed_link() returns true, hmmm.

Yes, which is why I said on July 7th:

"So I also don't see a problem - sja1105 rejects DTs that fail to
describe a port using at least one of a phy-handle, a fixed-link, or
a managed in-band link, and I don't think it needs to do further
validation, certainly not for the phy describing properties that
the kernel has chosen to deprecate for new implementations."

I had assumed you knew of_phy_is_fixed_link() returns true in this
case. Do you now see that sja1105's validation is close enough
(except for the legacy phy phandle properties which we don't care
about), and thus do we finally have agreement on this point?

> On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
> the switch, a "marvell,mv88e6190"-compatible (can't determine going just
> by that what it actually is) has this:
> 
> 			port@a {
> 				reg = <10>;
> 				label = "cpu";
> 				ethernet = <&cp0_eth0>;
> 			};

Port 10 on 88E6393X supports 10GBASE-R, and maybe one day someone will
get around to implementing USXGMII. This description relies upon this
defaulting behaviour - as Andrew has described, this has been entirely
normal behaviour with mv88e6xxx.

> To illustrate how odd the situation is, I am able to follow the phandle
> to the CPU port and find a comment that it's a 88E6393X, and that the
> CPU port uses managed = "in-band-status":
> 
> &cp0_eth0 {
> 	/* This port is connected to 88E6393X switch */
> 	status = "okay";
> 	phy-mode = "10gbase-r";
> 	managed = "in-band-status";
> 	phys = <&cp0_comphy4 0>;
> };

10GBASE-R has no in-band signalling per-se, so the only effect this has
on the phylink instance on the CPU side is to read the status from the
PCS as it does for any other in-band mode. In the case of 10GBASE-R, the
only retrievable parameter is the link up/down status. This is no
different from a 10GBASE-R based fibre link in that regard.

A fixed link on the other hand would not read status from the PCS but
would assume that the link is always up.

> Open question: is it sane to even do what we're trying here, to create a
> fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
> when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
> involved drivers were to behave correctly and not have bugs that cancel
> out other bugs, the above device tree shouldn't work. The host port
> would expect a clause 37 base page exchange to take place, the switch
> wouldn't send any in-band information, and the SERDES lane would never
> transition to data mode. To fix the above, we'd really need to chase the
> "ethernet" phandle and attempt to mimic what the DSA master did. This is
> indeed logic that never existed before, and I don't particularly feel
> like adding it. How far do we want to go? It seems like never-ending
> insanity the more I look at it.

10GBASE-R doesn't support clause 37 AN. 10GBASE-KR does support
inband AN, but it's a different clause and different format.
Vladimir Oltean July 21, 2022, 3:15 p.m. UTC | #13
On Thu, Jul 21, 2022 at 03:54:15PM +0100, Russell King (Oracle) wrote:
> Yes, which is why I said on July 7th:
> 
> "So I also don't see a problem - sja1105 rejects DTs that fail to
> describe a port using at least one of a phy-handle, a fixed-link, or
> a managed in-band link, and I don't think it needs to do further
> validation, certainly not for the phy describing properties that
> the kernel has chosen to deprecate for new implementations."
> 
> I had assumed you knew of_phy_is_fixed_link() returns true in this
> case. Do you now see that sja1105's validation is close enough
> (except for the legacy phy phandle properties which we don't care
> about),

This is why your comment struck me as odd for mentioning managed in-band.

> and thus do we finally have agreement on this point?

Yes we do.

> > On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
> > the switch, a "marvell,mv88e6190"-compatible (can't determine going just
> > by that what it actually is) has this:
> > 
> > 			port@a {
> > 				reg = <10>;
> > 				label = "cpu";
> > 				ethernet = <&cp0_eth0>;
> > 			};
> 
> Port 10 on 88E6393X supports 10GBASE-R, and maybe one day someone will
> get around to implementing USXGMII. This description relies upon this
> defaulting behaviour - as Andrew has described, this has been entirely
> normal behaviour with mv88e6xxx.
> 
> > To illustrate how odd the situation is, I am able to follow the phandle
> > to the CPU port and find a comment that it's a 88E6393X, and that the
> > CPU port uses managed = "in-band-status":
> > 
> > &cp0_eth0 {
> > 	/* This port is connected to 88E6393X switch */
> > 	status = "okay";
> > 	phy-mode = "10gbase-r";
> > 	managed = "in-band-status";
> > 	phys = <&cp0_comphy4 0>;
> > };
> 
> 10GBASE-R has no in-band signalling per-se, so the only effect this has
> on the phylink instance on the CPU side is to read the status from the
> PCS as it does for any other in-band mode. In the case of 10GBASE-R, the
> only retrievable parameter is the link up/down status. This is no
> different from a 10GBASE-R based fibre link in that regard.

Is there any formal definition for what managed = "in-band-status"
actually means? Is it context-specific depending on phy-mode?
In the case of SGMII, would it also mean that clause 37 exchange would
also take place (and its absence would mean it wouldn't), or does it
mean just that, that the driver should read the status from the PCS?

> A fixed link on the other hand would not read status from the PCS but
> would assume that the link is always up.
> 
> > Open question: is it sane to even do what we're trying here, to create a
> > fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
> > when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
> > involved drivers were to behave correctly and not have bugs that cancel
> > out other bugs, the above device tree shouldn't work. The host port
> > would expect a clause 37 base page exchange to take place, the switch
> > wouldn't send any in-band information, and the SERDES lane would never
> > transition to data mode. To fix the above, we'd really need to chase the
> > "ethernet" phandle and attempt to mimic what the DSA master did. This is
> > indeed logic that never existed before, and I don't particularly feel
> > like adding it. How far do we want to go? It seems like never-ending
> > insanity the more I look at it.
> 
> 10GBASE-R doesn't support clause 37 AN. 10GBASE-KR does support
> inband AN, but it's a different clause and different format.

I thought it wouldn't, but then I was led to believe, after seeing it
here, that just the hardware I'm working with doesn't. How about
2500base-x in Marvell, is there any base page exchange, or is this still
only about retrieving link status from the PCS?
Marek Behún July 21, 2022, 5:21 p.m. UTC | #14
On Thu, 21 Jul 2022 18:15:33 +0300
Vladimir Oltean <olteanv@gmail.com> wrote:

> On Thu, Jul 21, 2022 at 03:54:15PM +0100, Russell King (Oracle) wrote:
> > Yes, which is why I said on July 7th:
> > 
> > "So I also don't see a problem - sja1105 rejects DTs that fail to
> > describe a port using at least one of a phy-handle, a fixed-link, or
> > a managed in-band link, and I don't think it needs to do further
> > validation, certainly not for the phy describing properties that
> > the kernel has chosen to deprecate for new implementations."
> > 
> > I had assumed you knew of_phy_is_fixed_link() returns true in this
> > case. Do you now see that sja1105's validation is close enough
> > (except for the legacy phy phandle properties which we don't care
> > about),  
> 
> This is why your comment struck me as odd for mentioning managed in-band.
> 
> > and thus do we finally have agreement on this point?  
> 
> Yes we do.
> 
> > > On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
> > > the switch, a "marvell,mv88e6190"-compatible (can't determine going just
> > > by that what it actually is) has this:
> > > 
> > > 			port@a {
> > > 				reg = <10>;
> > > 				label = "cpu";
> > > 				ethernet = <&cp0_eth0>;
> > > 			};  
> > 
> > Port 10 on 88E6393X supports 10GBASE-R, and maybe one day someone will
> > get around to implementing USXGMII. This description relies upon this
> > defaulting behaviour - as Andrew has described, this has been entirely
> > normal behaviour with mv88e6xxx.
> >   
> > > To illustrate how odd the situation is, I am able to follow the phandle
> > > to the CPU port and find a comment that it's a 88E6393X, and that the
> > > CPU port uses managed = "in-band-status":
> > > 
> > > &cp0_eth0 {
> > > 	/* This port is connected to 88E6393X switch */
> > > 	status = "okay";
> > > 	phy-mode = "10gbase-r";
> > > 	managed = "in-band-status";
> > > 	phys = <&cp0_comphy4 0>;
> > > };  
> > 
> > 10GBASE-R has no in-band signalling per-se, so the only effect this has
> > on the phylink instance on the CPU side is to read the status from the
> > PCS as it does for any other in-band mode. In the case of 10GBASE-R, the
> > only retrievable parameter is the link up/down status. This is no
> > different from a 10GBASE-R based fibre link in that regard.  
> 
> Is there any formal definition for what managed = "in-band-status"
> actually means? Is it context-specific depending on phy-mode?
> In the case of SGMII, would it also mean that clause 37 exchange would
> also take place (and its absence would mean it wouldn't), or does it
> mean just that, that the driver should read the status from the PCS?
> 
> > A fixed link on the other hand would not read status from the PCS but
> > would assume that the link is always up.
> >   
> > > Open question: is it sane to even do what we're trying here, to create a
> > > fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
> > > when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
> > > involved drivers were to behave correctly and not have bugs that cancel
> > > out other bugs, the above device tree shouldn't work. The host port
> > > would expect a clause 37 base page exchange to take place, the switch
> > > wouldn't send any in-band information, and the SERDES lane would never
> > > transition to data mode. To fix the above, we'd really need to chase the
> > > "ethernet" phandle and attempt to mimic what the DSA master did. This is
> > > indeed logic that never existed before, and I don't particularly feel
> > > like adding it. How far do we want to go? It seems like never-ending
> > > insanity the more I look at it.  
> > 
> > 10GBASE-R doesn't support clause 37 AN. 10GBASE-KR does support
> > inband AN, but it's a different clause and different format.  
> 
> I thought it wouldn't, but then I was led to believe, after seeing it
> here, that just the hardware I'm working with doesn't. How about
> 2500base-x in Marvell, is there any base page exchange, or is this still
> only about retrieving link status from the PCS?

Marvell documentation says that 2500base-x does not implement inband
AN.

But when it was first implemented, for some reason it was thought that
2500base-x is just 1000base-x at 2.5x speed, and 1000base-x does
support inband AN. Also it worked during tests for both switches and
SOC NICs, so it was enabled.

At the time 2500base-x was not standardized. Now 2500base-x is
stanradrized, and the standard says that 2500base-x does not support
clause 37 AN. I guess this is because where it is used, it is intended
to work with clause 73 AN somehow.

And then came 6373X switch, which didn't support clause 37 inband AN in
2500base-x mode (the AN reigster returned 0xffff or something when
2500base-x CMODE was set). Maybe 6373X finally supports clause 73 AN
(I don't know, but I don't think so) and that is the reason they now
forbid clause 37 AN in HW in 2500base-x.

But the problem is that by this time there is software out there then
expects 2500base-x to have clause 37 AN enabled. Indeed a passive SFP
cable did not work between MOX' SFP port and CN9130-CRB's SFP port
when used with Peridot (6190), if C37 AN was disabled on 6393x and left
enabled on Peridot.

I managed to work out how to enable C37 AN on 6393x:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=163000dbc772c1eae9bdfe7c8fe30155db1efd74

So currently we try to enable C37 AN in 2500base-x mode, although
the standard says that it shouldn't be there, and it shouldn't be there
presumably because they want it to work with C73 AN.

I don't know how to solve this issue. Maybe declare a new PHY interface
mode constant, 2500base-x-no-c37-an ?

:)

Marek
Russell King (Oracle) July 21, 2022, 6:15 p.m. UTC | #15
On Thu, Jul 21, 2022 at 07:21:45PM +0200, Marek Behún wrote:
> On Thu, 21 Jul 2022 18:15:33 +0300
> Vladimir Oltean <olteanv@gmail.com> wrote:
> 
> > On Thu, Jul 21, 2022 at 03:54:15PM +0100, Russell King (Oracle) wrote:
> > > Yes, which is why I said on July 7th:
> > > 
> > > "So I also don't see a problem - sja1105 rejects DTs that fail to
> > > describe a port using at least one of a phy-handle, a fixed-link, or
> > > a managed in-band link, and I don't think it needs to do further
> > > validation, certainly not for the phy describing properties that
> > > the kernel has chosen to deprecate for new implementations."
> > > 
> > > I had assumed you knew of_phy_is_fixed_link() returns true in this
> > > case. Do you now see that sja1105's validation is close enough
> > > (except for the legacy phy phandle properties which we don't care
> > > about),  
> > 
> > This is why your comment struck me as odd for mentioning managed in-band.
> > 
> > > and thus do we finally have agreement on this point?  
> > 
> > Yes we do.
> > 
> > > > On the other hand I found arm64/boot/dts/marvell/cn9130-crb.dtsi, where
> > > > the switch, a "marvell,mv88e6190"-compatible (can't determine going just
> > > > by that what it actually is) has this:
> > > > 
> > > > 			port@a {
> > > > 				reg = <10>;
> > > > 				label = "cpu";
> > > > 				ethernet = <&cp0_eth0>;
> > > > 			};  
> > > 
> > > Port 10 on 88E6393X supports 10GBASE-R, and maybe one day someone will
> > > get around to implementing USXGMII. This description relies upon this
> > > defaulting behaviour - as Andrew has described, this has been entirely
> > > normal behaviour with mv88e6xxx.
> > >   
> > > > To illustrate how odd the situation is, I am able to follow the phandle
> > > > to the CPU port and find a comment that it's a 88E6393X, and that the
> > > > CPU port uses managed = "in-band-status":
> > > > 
> > > > &cp0_eth0 {
> > > > 	/* This port is connected to 88E6393X switch */
> > > > 	status = "okay";
> > > > 	phy-mode = "10gbase-r";
> > > > 	managed = "in-band-status";
> > > > 	phys = <&cp0_comphy4 0>;
> > > > };  
> > > 
> > > 10GBASE-R has no in-band signalling per-se, so the only effect this has
> > > on the phylink instance on the CPU side is to read the status from the
> > > PCS as it does for any other in-band mode. In the case of 10GBASE-R, the
> > > only retrievable parameter is the link up/down status. This is no
> > > different from a 10GBASE-R based fibre link in that regard.  
> > 
> > Is there any formal definition for what managed = "in-band-status"
> > actually means? Is it context-specific depending on phy-mode?
> > In the case of SGMII, would it also mean that clause 37 exchange would
> > also take place (and its absence would mean it wouldn't), or does it
> > mean just that, that the driver should read the status from the PCS?
> > 
> > > A fixed link on the other hand would not read status from the PCS but
> > > would assume that the link is always up.
> > >   
> > > > Open question: is it sane to even do what we're trying here, to create a
> > > > fixed-link for port@a (which makes the phylink instance use MLO_AN_FIXED)
> > > > when &cp0_eth0 uses MLO_AN_INBAND? My simple mind thinks that if all
> > > > involved drivers were to behave correctly and not have bugs that cancel
> > > > out other bugs, the above device tree shouldn't work. The host port
> > > > would expect a clause 37 base page exchange to take place, the switch
> > > > wouldn't send any in-band information, and the SERDES lane would never
> > > > transition to data mode. To fix the above, we'd really need to chase the
> > > > "ethernet" phandle and attempt to mimic what the DSA master did. This is
> > > > indeed logic that never existed before, and I don't particularly feel
> > > > like adding it. How far do we want to go? It seems like never-ending
> > > > insanity the more I look at it.  
> > > 
> > > 10GBASE-R doesn't support clause 37 AN. 10GBASE-KR does support
> > > inband AN, but it's a different clause and different format.  
> > 
> > I thought it wouldn't, but then I was led to believe, after seeing it
> > here, that just the hardware I'm working with doesn't. How about
> > 2500base-x in Marvell, is there any base page exchange, or is this still
> > only about retrieving link status from the PCS?
> 
> Marvell documentation says that 2500base-x does not implement inband
> AN.
> 
> But when it was first implemented, for some reason it was thought that
> 2500base-x is just 1000base-x at 2.5x speed, and 1000base-x does
> support inband AN. Also it worked during tests for both switches and
> SOC NICs, so it was enabled.

It comes from Marvell NETA and PP2 documentation, which clearly states
that when a port is operating in 1000base-X mode, autonegotiation must
be enabled. It then implements 2500base-X by up-clocking the Serdes by
2.5x.

Therefore, to get 2500base-X using 1000base-X mode on these devices, it
follows that autonegotiation must be enabled. Since phylink's origins
are for these devices, and 2500base-X was not standardised at the time,
and there was no documentation online to describe what 2500base-X
actually was, a decision had to be made, and the logical thing to do
at that point was to support AN, especially as one can use it to
negotiate pause modes.

Note that NETA and PP2 have never supported half-duplex on 1G or 2.5G
speeds, so the clause 37 negotiation has only ever dealt with pause
modes, and again, it seemed logical and sensible at the time to allow
pause modes to still be negotiated at 2500base-X speed. Especially as
one can use FibreChannel SFPs to link two machines together using
2500base-X in the same way that you can link them together using
1000base-X.

Had manufacturers been more open with their documentation, and had they
used a common term, maybe this could have been different, but in an
information vacuum, decisions have to be made, even if they turn out to
be wrong later on - but we have to live with those consequences.

As I've stated before, I know I'm not alone in this - there are SFPs
that support 2500base-X (particularly GPON SFPs) that appear to expect
caluse 37 AN on 2500base-X, since the Armada 3700 uDPU product is
designed to work with them and it was a requirement to have a working
2.5G connection between the NETA interfaces and the GPON SFPs. And as
I say, NETA wants AN enabled.
Vladimir Oltean July 21, 2022, 6:22 p.m. UTC | #16
On Thu, Jul 21, 2022 at 07:21:45PM +0200, Marek Behún wrote:
> Marvell documentation says that 2500base-x does not implement inband
> AN.

Does Marvell documentation actually call it 2500base-x when it says it
doesn't support in-band autoneg?

> But when it was first implemented, for some reason it was thought that
> 2500base-x is just 1000base-x at 2.5x speed, and 1000base-x does
> support inband AN. Also it worked during tests for both switches and
> SOC NICs, so it was enabled.
> 
> At the time 2500base-x was not standardized. Now 2500base-x is
> stanradrized, and the standard says that 2500base-x does not support
> clause 37 AN. I guess this is because where it is used, it is intended
> to work with clause 73 AN somehow.

When you say 2500base-x is standardized, do you mean there is a document
somewhere which I could use to read more about this?

> And then came 6373X switch, which didn't support clause 37 inband AN in
> 2500base-x mode (the AN reigster returned 0xffff or something when
> 2500base-x CMODE was set). Maybe 6373X finally supports clause 73 AN
> (I don't know, but I don't think so) and that is the reason they now
> forbid clause 37 AN in HW in 2500base-x.
> 
> But the problem is that by this time there is software out there then
> expects 2500base-x to have clause 37 AN enabled. Indeed a passive SFP
> cable did not work between MOX' SFP port and CN9130-CRB's SFP port
> when used with Peridot (6190), if C37 AN was disabled on 6393x and left
> enabled on Peridot.
> 
> I managed to work out how to enable C37 AN on 6393x:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=163000dbc772c1eae9bdfe7c8fe30155db1efd74
> 
> So currently we try to enable C37 AN in 2500base-x mode, although
> the standard says that it shouldn't be there, and it shouldn't be there
> presumably because they want it to work with C73 AN.
> 
> I don't know how to solve this issue. Maybe declare a new PHY interface
> mode constant, 2500base-x-no-c37-an ?

So this is essentially what I'm asking, and you didn't necessarily fully
answer. I take it that there exist Marvell switches which enable in-band
autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
has nothing to do with that decision. Right?

Is this by design of the 'managed' property, or is it an interpretation
quirk of Marvell drivers? Some other drivers enable in-band autoneg only
when manage = "in-band-status", and no one really said anything about
that during review, so I came to believe that this is the expectation.
I'm confused now, I was hoping Russell could clarify.
Russell King (Oracle) July 21, 2022, 9:14 p.m. UTC | #17
On Thu, Jul 21, 2022 at 09:22:16PM +0300, Vladimir Oltean wrote:
> On Thu, Jul 21, 2022 at 07:21:45PM +0200, Marek Behún wrote:
> > And then came 6373X switch, which didn't support clause 37 inband AN in
> > 2500base-x mode (the AN reigster returned 0xffff or something when
> > 2500base-x CMODE was set). Maybe 6373X finally supports clause 73 AN
> > (I don't know, but I don't think so) and that is the reason they now
> > forbid clause 37 AN in HW in 2500base-x.
> > 
> > But the problem is that by this time there is software out there then
> > expects 2500base-x to have clause 37 AN enabled. Indeed a passive SFP
> > cable did not work between MOX' SFP port and CN9130-CRB's SFP port
> > when used with Peridot (6190), if C37 AN was disabled on 6393x and left
> > enabled on Peridot.
> > 
> > I managed to work out how to enable C37 AN on 6393x:
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=163000dbc772c1eae9bdfe7c8fe30155db1efd74
> > 
> > So currently we try to enable C37 AN in 2500base-x mode, although
> > the standard says that it shouldn't be there, and it shouldn't be there
> > presumably because they want it to work with C73 AN.
> > 
> > I don't know how to solve this issue. Maybe declare a new PHY interface
> > mode constant, 2500base-x-no-c37-an ?
> 
> So this is essentially what I'm asking, and you didn't necessarily fully
> answer. I take it that there exist Marvell switches which enable in-band
> autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> has nothing to do with that decision. Right?

I think we're getting a little too het up over this.

We have 1000base-X where, when we're not using in-band-status, we don't
use autoneg (some drivers that weren't caught in review annoyingly do
still use autoneg, but they shouldn't). We ignore the ethtool autoneg
bit.

We also have 1000base-X where we're using in-band-status, and we then
respect the ethtool autoneg bit.

So, wouldn't it be logical if 2500base-X were implemented the same way,
and on setups where 2500base-X does not support clause 37 AN, we
clear the ethtool autoneg bit? If we have 2500base-X being used as the
media link, surely this is the right behaviour?

(This has implications for the rate adaption case, since the 2500base-X
link is not the media, and therefore the state of the autoneg bit
shouldn't apply to the 2500base-X link.)
Vladimir Oltean July 21, 2022, 9:36 p.m. UTC | #18
On Thu, Jul 21, 2022 at 10:14:00PM +0100, Russell King (Oracle) wrote:
> > > So currently we try to enable C37 AN in 2500base-x mode, although
> > > the standard says that it shouldn't be there, and it shouldn't be there
> > > presumably because they want it to work with C73 AN.
> > > 
> > > I don't know how to solve this issue. Maybe declare a new PHY interface
> > > mode constant, 2500base-x-no-c37-an ?
> > 
> > So this is essentially what I'm asking, and you didn't necessarily fully
> > answer. I take it that there exist Marvell switches which enable in-band
> > autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> > has nothing to do with that decision. Right?
> 
> I think we're getting a little too het up over this.

No, I think it's relevant to this patch set.

> We have 1000base-X where, when we're not using in-band-status, we don't
> use autoneg (some drivers that weren't caught in review annoyingly do
> still use autoneg, but they shouldn't). We ignore the ethtool autoneg
> bit.
> 
> We also have 1000base-X where we're using in-band-status, and we then
> respect the ethtool autoneg bit.
> 
> So, wouldn't it be logical if 2500base-X were implemented the same way,
> and on setups where 2500base-X does not support clause 37 AN, we
> clear the ethtool autoneg bit? If we have 2500base-X being used as the
> media link, surely this is the right behaviour?

The ethtool autoneg bit is only relevant when the PCS is the last thing
before the medium. But if the SERDES protocol connects the MAC to the PHY,
or the MAC to another MAC (such as the case here, CPU or DSA ports),
there won't be any ethtool bit to take into consideration, and that's
where my question is. Is there any expected correlation between enabling
in-band autoneg and the presence or absence of managed = "in-band-status"?

> (This has implications for the rate adaption case, since the 2500base-X
> link is not the media, and therefore the state of the autoneg bit
> shouldn't apply to the 2500base-X link.)

This is closer to what I was interested in knowing, but still not that.
Russell King (Oracle) July 22, 2022, 8:28 a.m. UTC | #19
On Fri, Jul 22, 2022 at 12:36:45AM +0300, Vladimir Oltean wrote:
> On Thu, Jul 21, 2022 at 10:14:00PM +0100, Russell King (Oracle) wrote:
> > > > So currently we try to enable C37 AN in 2500base-x mode, although
> > > > the standard says that it shouldn't be there, and it shouldn't be there
> > > > presumably because they want it to work with C73 AN.
> > > > 
> > > > I don't know how to solve this issue. Maybe declare a new PHY interface
> > > > mode constant, 2500base-x-no-c37-an ?
> > > 
> > > So this is essentially what I'm asking, and you didn't necessarily fully
> > > answer. I take it that there exist Marvell switches which enable in-band
> > > autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> > > has nothing to do with that decision. Right?
> > 
> > I think we're getting a little too het up over this.
> 
> No, I think it's relevant to this patch set.
> 
> > We have 1000base-X where, when we're not using in-band-status, we don't
> > use autoneg (some drivers that weren't caught in review annoyingly do
> > still use autoneg, but they shouldn't). We ignore the ethtool autoneg
> > bit.
> > 
> > We also have 1000base-X where we're using in-band-status, and we then
> > respect the ethtool autoneg bit.
> > 
> > So, wouldn't it be logical if 2500base-X were implemented the same way,
> > and on setups where 2500base-X does not support clause 37 AN, we
> > clear the ethtool autoneg bit? If we have 2500base-X being used as the
> > media link, surely this is the right behaviour?
> 
> The ethtool autoneg bit is only relevant when the PCS is the last thing
> before the medium. But if the SERDES protocol connects the MAC to the PHY,
> or the MAC to another MAC (such as the case here, CPU or DSA ports),
> there won't be any ethtool bit to take into consideration, and that's
> where my question is. Is there any expected correlation between enabling
> in-band autoneg and the presence or absence of managed = "in-band-status"?

This topic is something I was looking at back in November 2021, trying
to work out what the most sensible way of indicating to a PCS whether
it should enable in-band or not:

http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e4ea7d035e7e04e87dfd86702f59952e0cecc18d
http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e454bf101fa457dd5c2cea0b1aaab7ba33048089
http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e2c57490f205ae7c0e11fcf756675937f933be5e

The intention there was to move the decision about whether a PCS should
enable autoneg out of the PCS and into phylink, but doing that one comes
immediately on the problem of (e.g.) Marvell NETA/PP2 vs Lynx having
different interpretations for 2500base-X. There are also a number of
drivers that do not follow MLO_AN_INBAND-means-use-inband or not for
things such as SGMII or 1000base-X.

This means we have no standard interpretation amongst phylink users
about when in-band signalling should be enabled or disabled, which
means moving that decision into phylink today isn't possible.

The only thing we could do is provide the PCS with an additional bit
of information so it can make the decision - something like a boolean
"pcs_connects_to_medium" flag, and keep the decision making in the
PCS-specific code - sadly keeping the variability between different
PCS implementations.
Vladimir Oltean July 22, 2022, 10:52 a.m. UTC | #20
On Fri, Jul 22, 2022 at 09:28:08AM +0100, Russell King (Oracle) wrote:
> On Fri, Jul 22, 2022 at 12:36:45AM +0300, Vladimir Oltean wrote:
> > On Thu, Jul 21, 2022 at 10:14:00PM +0100, Russell King (Oracle) wrote:
> > > > > So currently we try to enable C37 AN in 2500base-x mode, although
> > > > > the standard says that it shouldn't be there, and it shouldn't be there
> > > > > presumably because they want it to work with C73 AN.
> > > > > 
> > > > > I don't know how to solve this issue. Maybe declare a new PHY interface
> > > > > mode constant, 2500base-x-no-c37-an ?
> > > > 
> > > > So this is essentially what I'm asking, and you didn't necessarily fully
> > > > answer. I take it that there exist Marvell switches which enable in-band
> > > > autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> > > > has nothing to do with that decision. Right?
> > > 
> > > I think we're getting a little too het up over this.
> > 
> > No, I think it's relevant to this patch set.
> > 
> > > We have 1000base-X where, when we're not using in-band-status, we don't
> > > use autoneg (some drivers that weren't caught in review annoyingly do
> > > still use autoneg, but they shouldn't). We ignore the ethtool autoneg
> > > bit.
> > > 
> > > We also have 1000base-X where we're using in-band-status, and we then
> > > respect the ethtool autoneg bit.
> > > 
> > > So, wouldn't it be logical if 2500base-X were implemented the same way,
> > > and on setups where 2500base-X does not support clause 37 AN, we
> > > clear the ethtool autoneg bit? If we have 2500base-X being used as the
> > > media link, surely this is the right behaviour?
> > 
> > The ethtool autoneg bit is only relevant when the PCS is the last thing
> > before the medium. But if the SERDES protocol connects the MAC to the PHY,
> > or the MAC to another MAC (such as the case here, CPU or DSA ports),
> > there won't be any ethtool bit to take into consideration, and that's
> > where my question is. Is there any expected correlation between enabling
> > in-band autoneg and the presence or absence of managed = "in-band-status"?
> 
> This topic is something I was looking at back in November 2021, trying
> to work out what the most sensible way of indicating to a PCS whether
> it should enable in-band or not:
> 
> http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e4ea7d035e7e04e87dfd86702f59952e0cecc18d
> http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e454bf101fa457dd5c2cea0b1aaab7ba33048089
> http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e2c57490f205ae7c0e11fcf756675937f933be5e
> 
> The intention there was to move the decision about whether a PCS should
> enable autoneg out of the PCS and into phylink, but doing that one comes
> immediately on the problem of (e.g.) Marvell NETA/PP2 vs Lynx having
> different interpretations for 2500base-X. There are also a number of
> drivers that do not follow MLO_AN_INBAND-means-use-inband or not for
> things such as SGMII or 1000base-X.
> 
> This means we have no standard interpretation amongst phylink users
> about when in-band signalling should be enabled or disabled, which
> means moving that decision into phylink today isn't possible.
> 
> The only thing we could do is provide the PCS with an additional bit
> of information so it can make the decision - something like a boolean
> "pcs_connects_to_medium" flag, and keep the decision making in the
> PCS-specific code - sadly keeping the variability between different
> PCS implementations.

The way I understand what you're saying is that there is no guarantee
that the DSA master and CPU port will agree whether to use in-band
autoneg or not here (and implicitly, there is no guarantee that this
link will work):

	&eth0 {
		phy-mode = "2500base-x";
		managed = "in-band-status";
	};

	&switch_cpu_port {
		ethernet = <&eth0>;
		phy-mode = "25000base-x";
		managed = "in-band-status";
	};

similarly, there is a good chance that the DT description below might
result in a functional link:

	&eth0 {
		phy-mode = "2500base-x";
		managed = "in-band-status";
	};

	&switch_cpu_port {
		ethernet = <&eth0>;
		phy-mode = "25000base-x";

		fixed-link {
			speed = <2500>;
			full-duplex;
		};
	};

There is no expectation from either DT description to use in-band
autoneg or not.

The fact that of_phy_is_fixed_link() was made by Stas Sergeev to say
that a 'managed' link with the value != 'auto' is fixed prompted me to
study exactly what those changes were about.

Was the managed = "in-band-status" property introduced by Stas Sergeev
in this commit:

commit 4cba5c2103657d43d0886e4cff8004d95a3d0def
Author: Stas Sergeev <stsp@list.ru>
Date:   Mon Jul 20 17:49:57 2015 -0700

    of_mdio: add new DT property 'managed' to specify the PHY management type

    Currently the PHY management type is selected by the MAC driver arbitrary.
    The decision is based on the presence of the "fixed-link" node and on a
    will of the driver's authors.
    This caused a regression recently, when mvneta driver suddenly started
    to use the in-band status for auto-negotiation on fixed links.
    It appears the auto-negotiation may not work when expected by the MAC driver.
    Sebastien Rannou explains:
    << Yes, I confirm that my HW does not generate an in-band status. AFAIK, it's
    a PHY that aggregates 4xSGMIIs to 1xQSGMII ; the MAC side of the PHY (with
    inband status) is connected to the switch through QSGMII, and in this context
    we are on the media side of the PHY. >>
    https://lkml.org/lkml/2015/7/10/206

    This patch introduces the new string property 'managed' that allows
    the user to set the management type explicitly.
    The supported values are:
    "auto" - default. Uses either MDIO or nothing, depending on the presence
    of the fixed-link node
    "in-band-status" - use in-band status

    Signed-off-by: Stas Sergeev <stsp@users.sourceforge.net>

    CC: Rob Herring <robh+dt@kernel.org>
    CC: Pawel Moll <pawel.moll@arm.com>
    CC: Mark Rutland <mark.rutland@arm.com>
    CC: Ian Campbell <ijc+devicetree@hellion.org.uk>
    CC: Kumar Gala <galak@codeaurora.org>
    CC: Florian Fainelli <f.fainelli@gmail.com>
    CC: Grant Likely <grant.likely@linaro.org>
    CC: devicetree@vger.kernel.org
    CC: linux-kernel@vger.kernel.org
    CC: netdev@vger.kernel.org
    Signed-off-by: David S. Miller <davem@davemloft.net>

not added specifically to mean whether the MAC should use in-band
autoneg or not? See commit f8af8e6eb950 ("mvneta: use inband status only
when explicitly enabled") for the mvneta change, after which the code
became:

mvneta_probe:

	err = of_property_read_string(dn, "managed", &managed);
	pp->use_inband_status = (err == 0 &&
				 strcmp(managed, "in-band-status") == 0);

mvneta_defaults_set:

	if (pp->use_inband_status) {
		val = mvreg_read(pp, MVNETA_GMAC_AUTONEG_CONFIG);
		val &= ~(MVNETA_GMAC_FORCE_LINK_PASS |
			 MVNETA_GMAC_FORCE_LINK_DOWN |
			 MVNETA_GMAC_AN_FLOW_CTRL_EN);
		val |= MVNETA_GMAC_INBAND_AN_ENABLE |
		       MVNETA_GMAC_AN_SPEED_EN |
		       MVNETA_GMAC_AN_DUPLEX_EN;
		mvreg_write(pp, MVNETA_GMAC_AUTONEG_CONFIG, val);
	} else {
		val = mvreg_read(pp, MVNETA_GMAC_AUTONEG_CONFIG);
		val &= ~(MVNETA_GMAC_INBAND_AN_ENABLE |
		       MVNETA_GMAC_AN_SPEED_EN |
		       MVNETA_GMAC_AN_DUPLEX_EN);
		mvreg_write(pp, MVNETA_GMAC_AUTONEG_CONFIG, val);
	}

mvneta_port_power_up:

	if (pp->use_inband_status)
		ctrl |= MVNETA_GMAC2_INBAND_AN_ENABLE;

This is why I am asking whether there is any formal definition of what
managed = "in-band-status" means. You've said it means about retrieving
link status from the PCS. What are you basing upon when you are saying that?
Russell King (Oracle) July 22, 2022, 11:44 a.m. UTC | #21
On Fri, Jul 22, 2022 at 01:52:38PM +0300, Vladimir Oltean wrote:
> On Fri, Jul 22, 2022 at 09:28:08AM +0100, Russell King (Oracle) wrote:
> > On Fri, Jul 22, 2022 at 12:36:45AM +0300, Vladimir Oltean wrote:
> > > On Thu, Jul 21, 2022 at 10:14:00PM +0100, Russell King (Oracle) wrote:
> > > > > > So currently we try to enable C37 AN in 2500base-x mode, although
> > > > > > the standard says that it shouldn't be there, and it shouldn't be there
> > > > > > presumably because they want it to work with C73 AN.
> > > > > > 
> > > > > > I don't know how to solve this issue. Maybe declare a new PHY interface
> > > > > > mode constant, 2500base-x-no-c37-an ?
> > > > > 
> > > > > So this is essentially what I'm asking, and you didn't necessarily fully
> > > > > answer. I take it that there exist Marvell switches which enable in-band
> > > > > autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> > > > > has nothing to do with that decision. Right?
> > > > 
> > > > I think we're getting a little too het up over this.
> > > 
> > > No, I think it's relevant to this patch set.
> > > 
> > > > We have 1000base-X where, when we're not using in-band-status, we don't
> > > > use autoneg (some drivers that weren't caught in review annoyingly do
> > > > still use autoneg, but they shouldn't). We ignore the ethtool autoneg
> > > > bit.
> > > > 
> > > > We also have 1000base-X where we're using in-band-status, and we then
> > > > respect the ethtool autoneg bit.
> > > > 
> > > > So, wouldn't it be logical if 2500base-X were implemented the same way,
> > > > and on setups where 2500base-X does not support clause 37 AN, we
> > > > clear the ethtool autoneg bit? If we have 2500base-X being used as the
> > > > media link, surely this is the right behaviour?
> > > 
> > > The ethtool autoneg bit is only relevant when the PCS is the last thing
> > > before the medium. But if the SERDES protocol connects the MAC to the PHY,
> > > or the MAC to another MAC (such as the case here, CPU or DSA ports),
> > > there won't be any ethtool bit to take into consideration, and that's
> > > where my question is. Is there any expected correlation between enabling
> > > in-band autoneg and the presence or absence of managed = "in-band-status"?
> > 
> > This topic is something I was looking at back in November 2021, trying
> > to work out what the most sensible way of indicating to a PCS whether
> > it should enable in-band or not:
> > 
> > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e4ea7d035e7e04e87dfd86702f59952e0cecc18d
> > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e454bf101fa457dd5c2cea0b1aaab7ba33048089
> > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e2c57490f205ae7c0e11fcf756675937f933be5e
> > 
> > The intention there was to move the decision about whether a PCS should
> > enable autoneg out of the PCS and into phylink, but doing that one comes
> > immediately on the problem of (e.g.) Marvell NETA/PP2 vs Lynx having
> > different interpretations for 2500base-X. There are also a number of
> > drivers that do not follow MLO_AN_INBAND-means-use-inband or not for
> > things such as SGMII or 1000base-X.
> > 
> > This means we have no standard interpretation amongst phylink users
> > about when in-band signalling should be enabled or disabled, which
> > means moving that decision into phylink today isn't possible.
> > 
> > The only thing we could do is provide the PCS with an additional bit
> > of information so it can make the decision - something like a boolean
> > "pcs_connects_to_medium" flag, and keep the decision making in the
> > PCS-specific code - sadly keeping the variability between different
> > PCS implementations.
> 
> The way I understand what you're saying is that there is no guarantee
> that the DSA master and CPU port will agree whether to use in-band
> autoneg or not here (and implicitly, there is no guarantee that this
> link will work):
> 
> 	&eth0 {
> 		phy-mode = "2500base-x";
> 		managed = "in-band-status";
> 	};
> 
> 	&switch_cpu_port {
> 		ethernet = <&eth0>;
> 		phy-mode = "25000base-x";

I'll assume that 25000 is a typo.

> 		managed = "in-band-status";
> 	};

Today, there is no guarantee - because it depends on how people have
chosen to implement 2500base-X, and whether the hardware requires the
use of in-band AN or prohibits it. This is what happens when stuff
isn't standardised - one ends up with differing implementations doing
different things, and this has happened not _only_ at hardware level
but also software level as well.

You have to also throw into this that various implementations also have
an "AN bypass" flag, which means if they see what looks like a valid
SERDES data stream, but do not see the AN data, after a certain timeout
they allow the link to come up - and again, whether that is enabled or
not is pot luck today.

> similarly, there is a good chance that the DT description below might
> result in a functional link:
> 
> 	&eth0 {
> 		phy-mode = "2500base-x";
> 		managed = "in-band-status";
> 	};
> 
> 	&switch_cpu_port {
> 		ethernet = <&eth0>;
> 		phy-mode = "25000base-x";
> 
> 		fixed-link {
> 			speed = <2500>;
> 			full-duplex;
> 		};
> 	};
> 
> There is no expectation from either DT description to use in-band
> autoneg or not.
> 
> The fact that of_phy_is_fixed_link() was made by Stas Sergeev to say
> that a 'managed' link with the value != 'auto' is fixed prompted me to
> study exactly what those changes were about.

From what I can see, there is no formal definition of "in-band-status"
beyond what it says on the tin. The description in the DT binding
specification, which is really where this should be formally documented,
is totally lacking.

>     This patch introduces the new string property 'managed' that allows
>     the user to set the management type explicitly.
>     The supported values are:
>     "auto" - default. Uses either MDIO or nothing, depending on the presence
>     of the fixed-link node
>     "in-band-status" - use in-band status

This, and how this is implemented by mvneta, is the best we have to go
on for the meaning of this.

> This is why I am asking whether there is any formal definition of what
> managed = "in-band-status" means. You've said it means about retrieving
> link status from the PCS. What are you basing upon when you are saying that?

Given that this managed property was introduced for mvneta, mvneta's
implementation of it is the best reference we have to work out what
the intentions of it were beyond the commit text.

With in-band mode enabled, mvneta makes use of a fixed-link PHY, and
updates the fixed-link PHY with the status from its GMAC block (which
is the combined PCS+MAC).

So, when in-band mode is specified, the results from SGMII or 1000base-X
negotiation are read from the MAC side of the link, pushed into the
fixed-PHY, which then are reflected back into the driver via the usual
phylib adjust_link().

Have a read through mvneta's code at this commit:

git show 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8:drivers/net/ethernet/marvell/mvneta.c

specifically, mvneta_fixed_link_update() and mvneta_adjust_link().
Note that when operating in in-band mode, there is actually no need
for the configuration of MVNETA_GMAC_AUTONEG_CONFIG to be touched
in any way since the values read from the MVNETA_GMAC_STATUS register
indicate what parameters the MAC is actually using. (The speed,
duplex, and pause bits in AUTONEG_CONFIG are ignored anyway if AN
is enabled.)

I know this is rather wooly, but not everything is defined in black and
white, and we need to do the best we can with the information that is
available.
Russell King (Oracle) July 22, 2022, 12:14 p.m. UTC | #22
On Fri, Jul 22, 2022 at 12:44:17PM +0100, Russell King (Oracle) wrote:
> Given that this managed property was introduced for mvneta, mvneta's
> implementation of it is the best reference we have to work out what
> the intentions of it were beyond the commit text.
> 
> With in-band mode enabled, mvneta makes use of a fixed-link PHY, and
> updates the fixed-link PHY with the status from its GMAC block (which
> is the combined PCS+MAC).
> 
> So, when in-band mode is specified, the results from SGMII or 1000base-X
> negotiation are read from the MAC side of the link, pushed into the
> fixed-PHY, which then are reflected back into the driver via the usual
> phylib adjust_link().

... and I should have said that this is exactly why in-band mode is
treated as a fixed-link, even though it's nothing of the sort. It makes
use of the infrastructure that was present at the time (fixed-phy) to
implement this feature of reading the link status from the PCS/MAC end
of the link.

It may not have been the best design, but it's an evolved design based
on the code that was available and what people thought at the time (and
pre-dates my involvement with mvneta in mainline.)
Vladimir Oltean July 22, 2022, 12:46 p.m. UTC | #23
On Fri, Jul 22, 2022 at 12:44:17PM +0100, Russell King (Oracle) wrote:
> On Fri, Jul 22, 2022 at 01:52:38PM +0300, Vladimir Oltean wrote:
> > On Fri, Jul 22, 2022 at 09:28:08AM +0100, Russell King (Oracle) wrote:
> > > On Fri, Jul 22, 2022 at 12:36:45AM +0300, Vladimir Oltean wrote:
> > > > On Thu, Jul 21, 2022 at 10:14:00PM +0100, Russell King (Oracle) wrote:
> > > > > > > So currently we try to enable C37 AN in 2500base-x mode, although
> > > > > > > the standard says that it shouldn't be there, and it shouldn't be there
> > > > > > > presumably because they want it to work with C73 AN.
> > > > > > > 
> > > > > > > I don't know how to solve this issue. Maybe declare a new PHY interface
> > > > > > > mode constant, 2500base-x-no-c37-an ?
> > > > > > 
> > > > > > So this is essentially what I'm asking, and you didn't necessarily fully
> > > > > > answer. I take it that there exist Marvell switches which enable in-band
> > > > > > autoneg for 2500base-x and switches which don't, and managed = "in-band-status"
> > > > > > has nothing to do with that decision. Right?
> > > > > 
> > > > > I think we're getting a little too het up over this.
> > > > 
> > > > No, I think it's relevant to this patch set.
> > > > 
> > > > > We have 1000base-X where, when we're not using in-band-status, we don't
> > > > > use autoneg (some drivers that weren't caught in review annoyingly do
> > > > > still use autoneg, but they shouldn't). We ignore the ethtool autoneg
> > > > > bit.
> > > > > 
> > > > > We also have 1000base-X where we're using in-band-status, and we then
> > > > > respect the ethtool autoneg bit.
> > > > > 
> > > > > So, wouldn't it be logical if 2500base-X were implemented the same way,
> > > > > and on setups where 2500base-X does not support clause 37 AN, we
> > > > > clear the ethtool autoneg bit? If we have 2500base-X being used as the
> > > > > media link, surely this is the right behaviour?
> > > > 
> > > > The ethtool autoneg bit is only relevant when the PCS is the last thing
> > > > before the medium. But if the SERDES protocol connects the MAC to the PHY,
> > > > or the MAC to another MAC (such as the case here, CPU or DSA ports),
> > > > there won't be any ethtool bit to take into consideration, and that's
> > > > where my question is. Is there any expected correlation between enabling
> > > > in-band autoneg and the presence or absence of managed = "in-band-status"?
> > > 
> > > This topic is something I was looking at back in November 2021, trying
> > > to work out what the most sensible way of indicating to a PCS whether
> > > it should enable in-band or not:
> > > 
> > > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e4ea7d035e7e04e87dfd86702f59952e0cecc18d
> > > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e454bf101fa457dd5c2cea0b1aaab7ba33048089
> > > http://git.armlinux.org.uk/cgit/linux-arm.git/commit/?h=net-queue&id=e2c57490f205ae7c0e11fcf756675937f933be5e
> > > 
> > > The intention there was to move the decision about whether a PCS should
> > > enable autoneg out of the PCS and into phylink, but doing that one comes
> > > immediately on the problem of (e.g.) Marvell NETA/PP2 vs Lynx having
> > > different interpretations for 2500base-X. There are also a number of
> > > drivers that do not follow MLO_AN_INBAND-means-use-inband or not for
> > > things such as SGMII or 1000base-X.
> > > 
> > > This means we have no standard interpretation amongst phylink users
> > > about when in-band signalling should be enabled or disabled, which
> > > means moving that decision into phylink today isn't possible.
> > > 
> > > The only thing we could do is provide the PCS with an additional bit
> > > of information so it can make the decision - something like a boolean
> > > "pcs_connects_to_medium" flag, and keep the decision making in the
> > > PCS-specific code - sadly keeping the variability between different
> > > PCS implementations.
> > 
> > The way I understand what you're saying is that there is no guarantee
> > that the DSA master and CPU port will agree whether to use in-band
> > autoneg or not here (and implicitly, there is no guarantee that this
> > link will work):
> > 
> > 	&eth0 {
> > 		phy-mode = "2500base-x";
> > 		managed = "in-band-status";
> > 	};
> > 
> > 	&switch_cpu_port {
> > 		ethernet = <&eth0>;
> > 		phy-mode = "25000base-x";
> 
> I'll assume that 25000 is a typo.

typo.

> > 		managed = "in-band-status";
> > 	};
> 
> Today, there is no guarantee - because it depends on how people have
> chosen to implement 2500base-X, and whether the hardware requires the
> use of in-band AN or prohibits it. This is what happens when stuff
> isn't standardised - one ends up with differing implementations doing
> different things, and this has happened not _only_ at hardware level
> but also software level as well.
> 
> You have to also throw into this that various implementations also have
> an "AN bypass" flag, which means if they see what looks like a valid
> SERDES data stream, but do not see the AN data, after a certain timeout
> they allow the link to come up - and again, whether that is enabled or
> not is pot luck today.

Interesting. After the timeout expires, does the lane ever transition
back into the encoding required for AN mode, in case there appears at a
later time someone willing to negotiate?

> > similarly, there is a good chance that the DT description below might
> > result in a functional link:
> > 
> > 	&eth0 {
> > 		phy-mode = "2500base-x";
> > 		managed = "in-band-status";
> > 	};
> > 
> > 	&switch_cpu_port {
> > 		ethernet = <&eth0>;
> > 		phy-mode = "25000base-x";
> > 
> > 		fixed-link {
> > 			speed = <2500>;
> > 			full-duplex;
> > 		};
> > 	};
> > 
> > There is no expectation from either DT description to use in-band
> > autoneg or not.
> > 
> > The fact that of_phy_is_fixed_link() was made by Stas Sergeev to say
> > that a 'managed' link with the value != 'auto' is fixed prompted me to
> > study exactly what those changes were about.
> 
> From what I can see, there is no formal definition of "in-band-status"
> beyond what it says on the tin. The description in the DT binding
> specification, which is really where this should be formally documented,
> is totally lacking.
> 
> >     This patch introduces the new string property 'managed' that allows
> >     the user to set the management type explicitly.
> >     The supported values are:
> >     "auto" - default. Uses either MDIO or nothing, depending on the presence
> >     of the fixed-link node
> >     "in-band-status" - use in-band status
> 
> This, and how this is implemented by mvneta, is the best we have to go
> on for the meaning of this.
> 
> > This is why I am asking whether there is any formal definition of what
> > managed = "in-band-status" means. You've said it means about retrieving
> > link status from the PCS. What are you basing upon when you are saying that?
> 
> Given that this managed property was introduced for mvneta, mvneta's
> implementation of it is the best reference we have to work out what
> the intentions of it were beyond the commit text.
> 
> With in-band mode enabled, mvneta makes use of a fixed-link PHY, and
> updates the fixed-link PHY with the status from its GMAC block (which
> is the combined PCS+MAC).
> 
> So, when in-band mode is specified, the results from SGMII or 1000base-X
> negotiation are read from the MAC side of the link, pushed into the
> fixed-PHY, which then are reflected back into the driver via the usual
> phylib adjust_link().
> 
> Have a read through mvneta's code at this commit:
> 
> git show 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8:drivers/net/ethernet/marvell/mvneta.c
> 
> specifically, mvneta_fixed_link_update() and mvneta_adjust_link().
> Note that when operating in in-band mode, there is actually no need
> for the configuration of MVNETA_GMAC_AUTONEG_CONFIG to be touched
> in any way since the values read from the MVNETA_GMAC_STATUS register
> indicate what parameters the MAC is actually using. (The speed,
> duplex, and pause bits in AUTONEG_CONFIG are ignored anyway if AN
> is enabled.)

I view this as just an implementation detail and not as something that
influences what managed = "in-band-status" is supposed to mean.

> I know this is rather wooly, but not everything is defined in black and
> white, and we need to do the best we can with the information that is
> available.

So mvneta at the stage of the commit you've mentioned calls
mvneta_set_autoneg() with the value of pp->use_inband_status. There is
then the exception to be made for the PCS being what's exposed to the
medium, and in that case, ethtool may also override the pp->use_inband_status
variable (which in turn affects the autoneg).

So if we take mvneta at this commit as the reference, what we learn is
that using in-band status essentially depends on using in-band autoneg
in the first place.

What is hard for me to comprehend is how we ever came to conclude that
for SERDES protocols where clause 37 is possible (2500base-x should be
part of this group), managed = "in-band-status" does not imply in-band
autoneg, considering the mvneta precedent.
And why would we essentially redefine its meaning by stating that no,
it is only about the status, not about the autoneg, even though the
status comes from the autoneg for these protocols.

There is a separate discussion to be had about SERDES protocols without
clause 37 AN (10GBase-R), I don't want to go there just yet, I want to
concentrate on what's similar to what mvneta originally supported.
Marek Behún July 22, 2022, 12:59 p.m. UTC | #24
On Thu, 21 Jul 2022 21:22:16 +0300
Vladimir Oltean <olteanv@gmail.com> wrote:

> On Thu, Jul 21, 2022 at 07:21:45PM +0200, Marek Behún wrote:
> > Marvell documentation says that 2500base-x does not implement inband
> > AN.  
> 
> Does Marvell documentation actually call it 2500base-x when it says it
> doesn't support in-band autoneg?

Yes, it does.

> > But when it was first implemented, for some reason it was thought that
> > 2500base-x is just 1000base-x at 2.5x speed, and 1000base-x does
> > support inband AN. Also it worked during tests for both switches and
> > SOC NICs, so it was enabled.
> > 
> > At the time 2500base-x was not standardized. Now 2500base-x is
> > stanradrized, and the standard says that 2500base-x does not support
> > clause 37 AN. I guess this is because where it is used, it is intended
> > to work with clause 73 AN somehow.  
> 
> When you say 2500base-x is standardized, do you mean there is a document
> somewhere which I could use to read more about this?

IEEE Std 802.3cb-2018: Amendment 1: Physical Layer Specifications and
Management Parameters for 2.5 Gb/s and 5 Gb/s Operation over Backplane.

Annex 127A (informative): Compatibility of 2.5GBASE-X PCS/PMA with
1000BASE-X PCS/PMA running 2.5 times faster

  ...
  This annex discusses the restrictions when operating 2.5GBASE-X
  PCS/PMA with a 1000BASE-X PCS/PMA link partner running 2.5 times
  faster. Compatibility of the PMD is outside the scope of this annex.
  In this annex when 1000BASE-X PCS/PMA is referred to, the 2.5 times
  speed up is implied.
  ...
  The 2.5GBASE-X PCS does not support Clause 37 Auto-Negotiation.
  Hence, the 1000BASE-X PCS is expected to have its Clause 37
  Auto-Negotiation functionality disabled so that the /C/ ordered set
  will not be transmitted. If a 2.5GBASE-X PCS receives /C/ ordered
  set, then undefined behavior may occur.
  ...

Marek
Russell King (Oracle) July 22, 2022, 1:16 p.m. UTC | #25
On Fri, Jul 22, 2022 at 03:46:29PM +0300, Vladimir Oltean wrote:
> On Fri, Jul 22, 2022 at 12:44:17PM +0100, Russell King (Oracle) wrote:
> > Today, there is no guarantee - because it depends on how people have
> > chosen to implement 2500base-X, and whether the hardware requires the
> > use of in-band AN or prohibits it. This is what happens when stuff
> > isn't standardised - one ends up with differing implementations doing
> > different things, and this has happened not _only_ at hardware level
> > but also software level as well.
> > 
> > You have to also throw into this that various implementations also have
> > an "AN bypass" flag, which means if they see what looks like a valid
> > SERDES data stream, but do not see the AN data, after a certain timeout
> > they allow the link to come up - and again, whether that is enabled or
> > not is pot luck today.
> 
> Interesting. After the timeout expires, does the lane ever transition
> back into the encoding required for AN mode, in case there appears at a
> later time someone willing to negotiate?

They don't document that it does.

> > > similarly, there is a good chance that the DT description below might
> > > result in a functional link:
> > > 
> > > 	&eth0 {
> > > 		phy-mode = "2500base-x";
> > > 		managed = "in-band-status";
> > > 	};
> > > 
> > > 	&switch_cpu_port {
> > > 		ethernet = <&eth0>;
> > > 		phy-mode = "25000base-x";
> > > 
> > > 		fixed-link {
> > > 			speed = <2500>;
> > > 			full-duplex;
> > > 		};
> > > 	};
> > > 
> > > There is no expectation from either DT description to use in-band
> > > autoneg or not.
> > > 
> > > The fact that of_phy_is_fixed_link() was made by Stas Sergeev to say
> > > that a 'managed' link with the value != 'auto' is fixed prompted me to
> > > study exactly what those changes were about.
> > 
> > From what I can see, there is no formal definition of "in-band-status"
> > beyond what it says on the tin. The description in the DT binding
> > specification, which is really where this should be formally documented,
> > is totally lacking.
> > 
> > >     This patch introduces the new string property 'managed' that allows
> > >     the user to set the management type explicitly.
> > >     The supported values are:
> > >     "auto" - default. Uses either MDIO or nothing, depending on the presence
> > >     of the fixed-link node
> > >     "in-band-status" - use in-band status
> > 
> > This, and how this is implemented by mvneta, is the best we have to go
> > on for the meaning of this.
> > 
> > > This is why I am asking whether there is any formal definition of what
> > > managed = "in-band-status" means. You've said it means about retrieving
> > > link status from the PCS. What are you basing upon when you are saying that?
> > 
> > Given that this managed property was introduced for mvneta, mvneta's
> > implementation of it is the best reference we have to work out what
> > the intentions of it were beyond the commit text.
> > 
> > With in-band mode enabled, mvneta makes use of a fixed-link PHY, and
> > updates the fixed-link PHY with the status from its GMAC block (which
> > is the combined PCS+MAC).
> > 
> > So, when in-band mode is specified, the results from SGMII or 1000base-X
> > negotiation are read from the MAC side of the link, pushed into the
> > fixed-PHY, which then are reflected back into the driver via the usual
> > phylib adjust_link().
> > 
> > Have a read through mvneta's code at this commit:
> > 
> > git show 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8:drivers/net/ethernet/marvell/mvneta.c
> > 
> > specifically, mvneta_fixed_link_update() and mvneta_adjust_link().
> > Note that when operating in in-band mode, there is actually no need
> > for the configuration of MVNETA_GMAC_AUTONEG_CONFIG to be touched
> > in any way since the values read from the MVNETA_GMAC_STATUS register
> > indicate what parameters the MAC is actually using. (The speed,
> > duplex, and pause bits in AUTONEG_CONFIG are ignored anyway if AN
> > is enabled.)
> 
> I view this as just an implementation detail and not as something that
> influences what managed = "in-band-status" is supposed to mean.
> 
> > I know this is rather wooly, but not everything is defined in black and
> > white, and we need to do the best we can with the information that is
> > available.
> 
> So mvneta at the stage of the commit you've mentioned calls
> mvneta_set_autoneg() with the value of pp->use_inband_status. There is
> then the exception to be made for the PCS being what's exposed to the
> medium, and in that case, ethtool may also override the pp->use_inband_status
> variable (which in turn affects the autoneg).
> 
> So if we take mvneta at this commit as the reference, what we learn is
> that using in-band status essentially depends on using in-band autoneg
> in the first place.
> 
> What is hard for me to comprehend is how we ever came to conclude that
> for SERDES protocols where clause 37 is possible (2500base-x should be
> part of this group), managed = "in-band-status" does not imply in-band
> autoneg, considering the mvneta precedent.

That is a recent addition, since the argument was made that when using
a 1000base-X fibre transceiver, using ethtool to disable autoneg is a
reasonable thing to do - and something that was supported with
mvneta_ethtool_set_link_ksettings() as it stands at the point in the
commit above.

> And why would we essentially redefine its meaning by stating that no,
> it is only about the status, not about the autoneg, even though the
> status comes from the autoneg for these protocols.

I'm not sure I understand what you're getting at there.

Going back to the mvneta combined PCS+MAC implementation, we read the
link parameters from the PCS when operating in in-band mode and throw
them at the fixed PHY so that ethtool works, along with all the usual
link up/down state reporting, carrier etc.

If autoneg is disabled, then we effectively operate in fixed-link mode
(use_inband_status becomes false, and we start forcing the link up/down
and also force the speed and duplex parameters by disabling autoneg.)

Note that this version of mvneta does not support 1000base-X mode, only
SGMII is actually supported.

There's a few things that are rather confusing in the driver:

MVNETA_GMAC_INBAND_AN_ENABLE - this controls whether in-band negotiation
is performed or not.
MVNETA_GMAC_AN_SPEED_EN - this controls whether the result of in-band
negotiation for speed is used, or the manually programmed speed in this
register.
MVNETA_GMAC_AN_DUPLEX_EN - same for duplex.
MVNETA_GMAC_AN_FLOW_CTRL_EN - same for pause (only symmetric pause is
supported)

MVNETA_GMAC2_INBAND_AN_ENABLE - misnamed, it selects whether SGMII (set)
or 1000base-X (unset) format for the 16-bit control word is used.

There is another bit in MVNETA_GMAC_CTRL_0 that selects between
1000base-X and SGMII operation mode, and when this bit is set for
1000base-X. This version of the driver doesn't support 1000base-X,
so this bit is never set.
Andrew Lunn July 22, 2022, 1:20 p.m. UTC | #26
> > The way I understand what you're saying is that there is no guarantee
> > that the DSA master and CPU port will agree whether to use in-band
> > autoneg or not here (and implicitly, there is no guarantee that this
> > link will work):
> > 
> > 	&eth0 {
> > 		phy-mode = "2500base-x";
> > 		managed = "in-band-status";
> > 	};
> > 
> > 	&switch_cpu_port {
> > 		ethernet = <&eth0>;
> > 		phy-mode = "25000base-x";
> 
> I'll assume that 25000 is a typo.
> 
> > 		managed = "in-band-status";
> > 	};
> 
> Today, there is no guarantee - because it depends on how people have
> chosen to implement 2500base-X, and whether the hardware requires the
> use of in-band AN or prohibits it.

In practice, a Marvell MAC and a Marvell switch are likely to work,
since Marvell produce and tested both ends. I would expect this to be
true for any vendor. It is only going to be a problem when you have
devices from different vendors. And they have different
interpretations of what 2500Base-X is.

So far, mixed vendor systems tend to be

1) Freescale FEC and Marvell switches, and the FEC it still only 1G RGMII
2) The smaller simpler devices which are 1G and do not yet use a SERDES.

So this might be a future problem we will have, as more devices start
supporting 2.5G and 5G, but hopefully those future devices follow the
standard.

	Andrew
Russell King (Oracle) July 22, 2022, 1:23 p.m. UTC | #27
On Fri, Jul 22, 2022 at 02:59:36PM +0200, Marek Behún wrote:
>   The 2.5GBASE-X PCS does not support Clause 37 Auto-Negotiation.
>   Hence, the 1000BASE-X PCS is expected to have its Clause 37
>   Auto-Negotiation functionality disabled so that the /C/ ordered set
>   will not be transmitted. If a 2.5GBASE-X PCS receives /C/ ordered
>   set, then undefined behavior may occur.
>   ...

The reason that's probably stated is because there hasn't been any
standardisation of it, different implementations behave differently,
so they can't define a standard behaviour without breaking what's
already out there.

With mvneta, the reality is that the 2.5G speed is implemented by
changing the clock configuration in the COMPHY block (serdes) - which
basically clocks everything 2.5x faster. I seem to remember mvpp2 is
the same deal.
Marek Behún July 22, 2022, 2:19 p.m. UTC | #28
On Fri, 22 Jul 2022 14:23:18 +0100
"Russell King (Oracle)" <linux@armlinux.org.uk> wrote:

> On Fri, Jul 22, 2022 at 02:59:36PM +0200, Marek Behún wrote:
> >   The 2.5GBASE-X PCS does not support Clause 37 Auto-Negotiation.
> >   Hence, the 1000BASE-X PCS is expected to have its Clause 37
> >   Auto-Negotiation functionality disabled so that the /C/ ordered set
> >   will not be transmitted. If a 2.5GBASE-X PCS receives /C/ ordered
> >   set, then undefined behavior may occur.
> >   ...  
> 
> The reason that's probably stated is because there hasn't been any
> standardisation of it, different implementations behave differently,
> so they can't define a standard behaviour without breaking what's
> already out there.

I think they are also considering clause 73 AN, which is supported with
1000base-KX (if I am not mistaken). Maybe the intention was to use
clause 73 with >1G speeds, and so they've forbid clause 37 on
2500base-x.
Vladimir Oltean July 22, 2022, 4:56 p.m. UTC | #29
On Fri, Jul 22, 2022 at 02:16:04PM +0100, Russell King (Oracle) wrote:
> > So mvneta at the stage of the commit you've mentioned calls
> > mvneta_set_autoneg() with the value of pp->use_inband_status. There is
> > then the exception to be made for the PCS being what's exposed to the
> > medium, and in that case, ethtool may also override the pp->use_inband_status
> > variable (which in turn affects the autoneg).
> > 
> > So if we take mvneta at this commit as the reference, what we learn is
> > that using in-band status essentially depends on using in-band autoneg
> > in the first place.
> > 
> > What is hard for me to comprehend is how we ever came to conclude that
> > for SERDES protocols where clause 37 is possible (2500base-x should be
> > part of this group), managed = "in-band-status" does not imply in-band
> > autoneg, considering the mvneta precedent.
> 
> That is a recent addition, since the argument was made that when using
> a 1000base-X fibre transceiver, using ethtool to disable autoneg is a
> reasonable thing to do - and something that was supported with
> mvneta_ethtool_set_link_ksettings() as it stands at the point in the
> commit above.

I'm sorry, I don't understand. What is the recent addition, and recent
relative to what? The 2500base-x link mode? Ok, but this is only
tangentially related to the point overall, more below.

> > And why would we essentially redefine its meaning by stating that no,
> > it is only about the status, not about the autoneg, even though the
> > status comes from the autoneg for these protocols.
> 
> I'm not sure I understand what you're getting at there.

Sorry if I haven't made my point clear.

My point is that drivers may have more restrictive interpretations of
managed = "in-band-status", and the current logic of automatically
create a fixed-link for DSA's CPU ports is going to cause problems when
matched up with a DSA master that expects in-band autoneg for whatever
SERDES protocol.

What I'd like to happen as a result is that no DSA driver except Marvell
opts into this by default, and no driver opts into it without its maintainer
understanding the implications. Otherwise we're going to normalize the
expectation that a managed = "in-band-status" DSA master should be able
to interoperate with a fixed-link CPU port, but during this discussion
there was no argument being brought that a strict interpretation of
"in-band-status" as "enable autoneg" is incorrect in any way.

> Going back to the mvneta combined PCS+MAC implementation, we read the
> link parameters from the PCS when operating in in-band mode and throw
> them at the fixed PHY so that ethtool works, along with all the usual
> link up/down state reporting, carrier etc.
> 
> If autoneg is disabled, then we effectively operate in fixed-link mode
> (use_inband_status becomes false, and we start forcing the link up/down
> and also force the speed and duplex parameters by disabling autoneg.)
> 
> Note that this version of mvneta does not support 1000base-X mode, only
> SGMII is actually supported.
> 
> There's a few things that are rather confusing in the driver:
> 
> MVNETA_GMAC_INBAND_AN_ENABLE - this controls whether in-band negotiation
> is performed or not.
> MVNETA_GMAC_AN_SPEED_EN - this controls whether the result of in-band
> negotiation for speed is used, or the manually programmed speed in this
> register.
> MVNETA_GMAC_AN_DUPLEX_EN - same for duplex.
> MVNETA_GMAC_AN_FLOW_CTRL_EN - same for pause (only symmetric pause is
> supported)
> 
> MVNETA_GMAC2_INBAND_AN_ENABLE - misnamed, it selects whether SGMII (set)
> or 1000base-X (unset) format for the 16-bit control word is used.
> 
> There is another bit in MVNETA_GMAC_CTRL_0 that selects between
> 1000base-X and SGMII operation mode, and when this bit is set for
> 1000base-X. This version of the driver doesn't support 1000base-X,
> so this bit is never set.

Thanks for this explanation, if nothing else, it seems to support the
way in which I was interpreting managed = "in-band-status" to mean
"enable in-band autoneg", but to be clear, I wasn't debating something
about the way in which mvneta was doing things. But rather, I was
debating why would *other* drivers do things differently such as to come
to expect that a fixed-link master + an in-band-status CPU port, or the
other way around, may be compatible with each other.

Anyway, before I comment any further on the other points, I have a board
using armada-3720-turris-mox.dts on which I wanted to make a test, but I
don't fully understand the results, could you help me do so?

By default, both the mvneta master and my 6390 top-most switch are
configured for inband/2500base-x. In essence I'm perfectly fine with
that. I don't care whether IEEE standardized inband/2500base-x, as long
as both drivers come to expect to enable or disable inband depending on
the device tree.

I've dumped the variables from mvneta_pcs_get_state() and it appears
that the mvneta is reporting AN complete. This would suggest that there
is indeed in-band autoneg taking place with the 6390 switch.

Then I modified the device tree to disable in-band autoneg (I've checked
mv88e6390_serdes_pcs_config and it behaves how I'd expect, enabling
BMCR_ANENABLE strictly according to phylink_autoneg_inband(mode)).
Essentially what I'm trying is to intentionally break in-band autoneg by
causing a mismatch, to prove that it is indeed taking place.

The results are interesting: state->an_complete is still reported as 1
for eth1 (mvneta).

ip link set eth1 up
[   70.809889] mvneta d0040000.ethernet eth1: configuring for inband/2500base-x link mode
[   70.818086] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
[   70.836081] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
[   70.843748] mv88e6085 d0032004.mdio-mii:10: Link is Down
[   70.859944] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_ADVERTISE 0xa0 adv 0x80 changed 1
[   70.878737] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_BMCR 0x140 bmcr 0x140 phylink_autoneg_inband(mode) 0
[   70.898302] mv88e6085 d0032004.mdio-mii:10: Link is Up - 2.5Gbps/Full - flow control off
[   71.069532] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 1 state->speed 2500 state->pause 0x3 state->link 1 state->duplex 1
[   71.083376] mvneta d0040000.ethernet eth1: Link is Up - 2.5Gbps/Full - flow control rx/tx
[   71.091672] IPv6: ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready

Then I studied MVNETA_GMAC_AUTONEG_CONFIG and I noticed that the bit
you're talking about, MVNETA_GMAC_AN_BYPASS_ENABLE (bit 3) is indeed set
by default (the driver doesn't set it).

I've intentionally masked it off in mvneta_pcs_config() by setting it in
the "mask" variable and nowhere else. Now I get:

ip link set eth1 up
[  434.336679] mvneta d0040000.ethernet eth1: configuring for inband/2500base-x link mode
[  434.342618] mv88e6085 d0032004.mdio-mii:10: Link is Down
[  434.350020] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b44 state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
[  434.350055] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b44 state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
[  434.384794] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_ADVERTISE 0xa0 adv 0x80 changed 1
[  434.403808] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_BMCR 0x140 bmcr 0x140 phylink_autoneg_inband(mode) 0
[  434.423732] mv88e6085 d0032004.mdio-mii:10: Link is Up - 2.5Gbps/Full - flow control off

so state->an_complete now remains zero, and the link is down on the CPU
port, and indeed I can no longer ping the board from the outside world.


So what this is telling me is that mvneta has some built-in resilience
to in-band autoneg mismatches, via MVNETA_GMAC_AN_BYPASS_ENABLE. But that
(a) doesn't make it valid to mix and match a fixed-link with a managed =
    "in-band-status" mode
(b) doesn't mean it's unspecified whether managed = "in-band-status"
    should dictate whether to enable in-band autoneg or not
(c) doesn't mean that other devices/drivers support "AN bypass" to save
    the day and make an invalid DT description appear to work just fine

This further supports my idea that we should make a better attempt of
matching the DSA master's mode with the node we're faking in DSA for
phylink. For Marvell hardware you or Andrew are surely more knowledgeable
to be able to say whether that's needed right now or not. But in the
general case please don't push this to everyone, it just muddies the
waters.
Russell King (Oracle) July 22, 2022, 9:20 p.m. UTC | #30
On Fri, Jul 22, 2022 at 07:56:00PM +0300, Vladimir Oltean wrote:
> On Fri, Jul 22, 2022 at 02:16:04PM +0100, Russell King (Oracle) wrote:
> > > So mvneta at the stage of the commit you've mentioned calls
> > > mvneta_set_autoneg() with the value of pp->use_inband_status. There is
> > > then the exception to be made for the PCS being what's exposed to the
> > > medium, and in that case, ethtool may also override the pp->use_inband_status
> > > variable (which in turn affects the autoneg).
> > > 
> > > So if we take mvneta at this commit as the reference, what we learn is
> > > that using in-band status essentially depends on using in-band autoneg
> > > in the first place.
> > > 
> > > What is hard for me to comprehend is how we ever came to conclude that
> > > for SERDES protocols where clause 37 is possible (2500base-x should be
> > > part of this group), managed = "in-band-status" does not imply in-band
> > > autoneg, considering the mvneta precedent.
> > 
> > That is a recent addition, since the argument was made that when using
> > a 1000base-X fibre transceiver, using ethtool to disable autoneg is a
> > reasonable thing to do - and something that was supported with
> > mvneta_ethtool_set_link_ksettings() as it stands at the point in the
> > commit above.
> 
> I'm sorry, I don't understand. What is the recent addition, and recent
> relative to what? The 2500base-x link mode? Ok, but this is only
> tangentially related to the point overall, more below.

I'm talking about how we handle 1000base-X autoneg - specifically this
commit:

92817dad7dcb net: phylink: Support disabling autonegotiation for PCS

where we can be in 1000base-X with managed = "in-band-status" but we
have autoneg disabled. I thought that is what you were referring to.

As for 2500base-X, I had been raising the issue of AN in the past, for
example (have I said it's really difficult to find old emails even with
google?):

https://lwn.net/ml/netdev/20200618140623.GC1551@shell.armlinux.org.uk/

and eventually I stopped caring about it, as it became pointless to
raise it anymore when we had an established mixture of behaviours. This
is why we have ended up with PCS drivers configuring for no AN for a
firmware description of:

	managed = "in-band-status";
	phy-mode = "2500base-x";

and hence now have unclean semantics for this - some such as mvneta
and mvpp2 will have AN enabled. Others such as pcs-lynx will not.
However, both will request link status from the PCS side and use that
to determine whether the link is up, and use the parameters that the
PCS code returns for the link. Since 2500base-X can only operate at
2.5G, PCS code always reports SPEED_2500, and as half duplex is
virtually never supported above 1G, DUPLEX_FULL.

> > > And why would we essentially redefine its meaning by stating that no,
> > > it is only about the status, not about the autoneg, even though the
> > > status comes from the autoneg for these protocols.
> > 
> > I'm not sure I understand what you're getting at there.
> 
> Sorry if I haven't made my point clear.
> 
> My point is that drivers may have more restrictive interpretations of
> managed = "in-band-status", and the current logic of automatically
> create a fixed-link for DSA's CPU ports is going to cause problems when
> matched up with a DSA master that expects in-band autoneg for whatever
> SERDES protocol.
> 
> What I'd like to happen as a result is that no DSA driver except Marvell
> opts into this by default, and no driver opts into it without its maintainer
> understanding the implications. Otherwise we're going to normalize the
> expectation that a managed = "in-band-status" DSA master should be able
> to interoperate with a fixed-link CPU port, but during this discussion
> there was no argument being brought that a strict interpretation of
> "in-band-status" as "enable autoneg" is incorrect in any way.

I still don't understand your point - because you seem to be conflating
two different things (at least as I understand it.)

We have this:

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
		};

This specifies that the port operates at whatever interface mode and
settings gives the maximum speed. There is no mention of a "managed"
property, and therefore (Andrew, correct me if I'm wrong) in-band
negotiation is not expected to be used.

The configuration of the ethX parameters on the other end of the link
are up to the system integrator to get right, and the actual behaviour
would depend on the ethernet driver. As I've said in previous emails,
there is such a thing as "AN bypass" that can be implemented, and it
can default to be enabled, and drivers can ignore that such a bit even
exists. So, it's possible that even with "managed" set to
"in-band-status" in DT, a link to such a DSA switch will still come up
even though we've requested in DT for AN to be used.

If an ethernet driver is implemented to strictly require in-band AN in
this case, then the link won't come up, and the system integrator would
have to debug the problem.

I think this is actually true on Clearfog - if one specifies the CPU
port as I have above, and requests in-band on the host ethernet, then
the link doesn't come up, because mvneta turns off AN bypass.

> > Going back to the mvneta combined PCS+MAC implementation, we read the
> > link parameters from the PCS when operating in in-band mode and throw
> > them at the fixed PHY so that ethtool works, along with all the usual
> > link up/down state reporting, carrier etc.
> > 
> > If autoneg is disabled, then we effectively operate in fixed-link mode
> > (use_inband_status becomes false, and we start forcing the link up/down
> > and also force the speed and duplex parameters by disabling autoneg.)
> > 
> > Note that this version of mvneta does not support 1000base-X mode, only
> > SGMII is actually supported.
> > 
> > There's a few things that are rather confusing in the driver:
> > 
> > MVNETA_GMAC_INBAND_AN_ENABLE - this controls whether in-band negotiation
> > is performed or not.
> > MVNETA_GMAC_AN_SPEED_EN - this controls whether the result of in-band
> > negotiation for speed is used, or the manually programmed speed in this
> > register.
> > MVNETA_GMAC_AN_DUPLEX_EN - same for duplex.
> > MVNETA_GMAC_AN_FLOW_CTRL_EN - same for pause (only symmetric pause is
> > supported)
> > 
> > MVNETA_GMAC2_INBAND_AN_ENABLE - misnamed, it selects whether SGMII (set)
> > or 1000base-X (unset) format for the 16-bit control word is used.
> > 
> > There is another bit in MVNETA_GMAC_CTRL_0 that selects between
> > 1000base-X and SGMII operation mode, and when this bit is set for
> > 1000base-X. This version of the driver doesn't support 1000base-X,
> > so this bit is never set.
> 
> Thanks for this explanation, if nothing else, it seems to support the
> way in which I was interpreting managed = "in-band-status" to mean
> "enable in-band autoneg", but to be clear, I wasn't debating something
> about the way in which mvneta was doing things. But rather, I was
> debating why would *other* drivers do things differently such as to come
> to expect that a fixed-link master + an in-band-status CPU port, or the
> other way around, may be compatible with each other.

Please note that phylink makes a DT specification including both a
fixed-link descriptor and a managed in-band-status property illegal
because these are two different modes of operating the link, and they
conflict with each other.

The fact that the of_fixed_link_whateveritwas() function (sorry I can't
remember the name) returns true for both indicating that they're both
fixed-link is a historic artifact that has not been changed. As the
fixed-PHY code supporting that way was dropped, I suppose that should
have been cleaned up at some point, but I never got around to it
(remember, development in this space is a very slow process.) There
were always more pressing matters to be dealt with.

Maybe we should now make of_fixed_link_whateveritwas() no longer return
true, and introduce of_managed_in_band() or something like that which
drivers can test that separately. I'm not sure it's worth the driver
churn to make such a change, I'm not sure what the benefit would be.

> Anyway, before I comment any further on the other points, I have a board
> using armada-3720-turris-mox.dts on which I wanted to make a test, but I
> don't fully understand the results, could you help me do so?
> 
> By default, both the mvneta master and my 6390 top-most switch are
> configured for inband/2500base-x. In essence I'm perfectly fine with
> that. I don't care whether IEEE standardized inband/2500base-x, as long
> as both drivers come to expect to enable or disable inband depending on
> the device tree.
> 
> I've dumped the variables from mvneta_pcs_get_state() and it appears
> that the mvneta is reporting AN complete. This would suggest that there
> is indeed in-band autoneg taking place with the 6390 switch.
> 
> Then I modified the device tree to disable in-band autoneg (I've checked
> mv88e6390_serdes_pcs_config and it behaves how I'd expect, enabling
> BMCR_ANENABLE strictly according to phylink_autoneg_inband(mode)).
> Essentially what I'm trying is to intentionally break in-band autoneg by
> causing a mismatch, to prove that it is indeed taking place.
> 
> The results are interesting: state->an_complete is still reported as 1
> for eth1 (mvneta).
> 
> ip link set eth1 up
> [   70.809889] mvneta d0040000.ethernet eth1: configuring for inband/2500base-x link mode
> [   70.818086] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
> [   70.836081] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
> [   70.843748] mv88e6085 d0032004.mdio-mii:10: Link is Down
> [   70.859944] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_ADVERTISE 0xa0 adv 0x80 changed 1
> [   70.878737] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_BMCR 0x140 bmcr 0x140 phylink_autoneg_inband(mode) 0
> [   70.898302] mv88e6085 d0032004.mdio-mii:10: Link is Up - 2.5Gbps/Full - flow control off
> [   71.069532] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b4c state->an_complete 1 state->speed 2500 state->pause 0x3 state->link 1 state->duplex 1
> [   71.083376] mvneta d0040000.ethernet eth1: Link is Up - 2.5Gbps/Full - flow control rx/tx
> [   71.091672] IPv6: ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready
> 
> Then I studied MVNETA_GMAC_AUTONEG_CONFIG and I noticed that the bit
> you're talking about, MVNETA_GMAC_AN_BYPASS_ENABLE (bit 3) is indeed set
> by default (the driver doesn't set it).

Correct - because of history, and changing it could break setups that
have been working since before DT. The driver has never changed the
bypass bit, so playing with that when phylink was introduced risked
regressions.

> I've intentionally masked it off in mvneta_pcs_config() by setting it in
> the "mask" variable and nowhere else. Now I get:
> 
> ip link set eth1 up
> [  434.336679] mvneta d0040000.ethernet eth1: configuring for inband/2500base-x link mode
> [  434.342618] mv88e6085 d0032004.mdio-mii:10: Link is Down
> [  434.350020] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b44 state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
> [  434.350055] mvneta_pcs_get_state: MVNETA_GMAC_AUTONEG_CONFIG 0x9b44 state->an_complete 0 state->speed 2500 state->pause 0x3 state->link 0 state->duplex 1
> [  434.384794] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_ADVERTISE 0xa0 adv 0x80 changed 1
> [  434.403808] mv88e6085 d0032004.mdio-mii:10: mv88e6390_serdes_pcs_config: port 9 MV88E6390_SGMII_BMCR 0x140 bmcr 0x140 phylink_autoneg_inband(mode) 0
> [  434.423732] mv88e6085 d0032004.mdio-mii:10: Link is Up - 2.5Gbps/Full - flow control off
> 
> so state->an_complete now remains zero, and the link is down on the CPU
> port, and indeed I can no longer ping the board from the outside world.

We can see that the DSA switch thinks the link came up, but mvneta
didn't because autoneg never completed. We can see from the
MV88E6390_SGMII_BMCR value that AN is definitely disabled. So we
positively have one end of the link configured for no AN (a fixed
link) and the other side configured for AN.

That is expected with a strict and correct implementation. As you've
spotted, mvneta is not a strict implementation, because it defaults
to allowing bypass mode, which allows the dissimilar configuration
due to history.

However, what you will find is that you will get the same results
with or without these patches applied - because the DSA switch will
default to not using in-band AN on its CPU port, and that isn't
changed when the DSA port is brought up without phylink.

> So what this is telling me is that mvneta has some built-in resilience
> to in-band autoneg mismatches, via MVNETA_GMAC_AN_BYPASS_ENABLE. But that
> (a) doesn't make it valid to mix and match a fixed-link with a managed =
>     "in-band-status" mode
> (b) doesn't mean it's unspecified whether managed = "in-band-status"
>     should dictate whether to enable in-band autoneg or not
> (c) doesn't mean that other devices/drivers support "AN bypass" to save
>     the day and make an invalid DT description appear to work just fine
> 
> This further supports my idea that we should make a better attempt of
> matching the DSA master's mode with the node we're faking in DSA for
> phylink. For Marvell hardware you or Andrew are surely more knowledgeable
> to be able to say whether that's needed right now or not. But in the
> general case please don't push this to everyone, it just muddies the
> waters.

I really don't get this.

For a mv88e6xxx port which supports 1000base-X, with these patches
applied, then these are all effectively equivalent:

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
		};

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
			phy-mode = "1000base-x";
		};

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
			fixed-link {
				speed = <1000>;
				full-duplex;
			};
		};

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
			phy-mode = "1000base-x";
			fixed-link {
				speed = <1000>;
				full-duplex;
			};
		};

and all _should_ lead to inband AN being disabled.

That is my understanding of Andrew's statements on the defaulting
parameters for both the inter-switch and CPU ports. (Maybe Andrew can
clarify whether this is correct or not.)

However, this would not equivalent to any of the above:

		port@N {
			reg = <N>;
			label = "cpu";
			ethernet = <&ethX>;
			managed = "in-band-status";
		};

The reason this is not equivalent is - as you've recently spotted -
of_phy_is_fixed_link() will return true, and therefore phylink gets
passed the above node to work with - and we do not generate a swnode
fixed-link stanza for it. The behaviour in this case is completely
unaffected by these patches.

If a DSA driver defaults to AN enabled on the DSA/CPU ports, and makes
use of the defaulting firmware description, then this will break with
these patches, since we setup a fixed-link specifier that states that
no AN should be used. That's why I've been trying to get these tested,
and that's why there's a risk with them. However, that's got nothing
to do with whether the driver implements filling in this new
"default_interface" field.

We could go delving into the node pointed to by the phandle and retrieve
whatever parameters from there, but that is an entirely new behaviour
and would be a functional change to the behaviour that Andrew has been
promoting - and is itself not free of a risk of regressions caused by
that approach. What if there's an interface converter in the path between
the# CPU ethernet device and the DSA that hasn't needed to be described?
Digging out the phy-mode from the CPU side could be the wrong thing to
do.

Then there's the question whether DSA should have been validating that
the description on both ends of the link are compatible with each other.
The problem with that is just the same as the above - an undescribed
interface converter would make such validation problematical.

So, I don't think we could rely on the description on the other end of
the link to be capable of describing the setup on the DSA port end.
Andrew Lunn July 22, 2022, 9:53 p.m. UTC | #31
> I still don't understand your point - because you seem to be conflating
> two different things (at least as I understand it.)
> 
> We have this:
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 		};
> 
> This specifies that the port operates at whatever interface mode and
> settings gives the maximum speed. There is no mention of a "managed"
> property, and therefore (Andrew, correct me if I'm wrong) in-band
> negotiation is not expected to be used.

I would actually say it is undefined if in-band is expected or
not. Pretty much everything is undefined, expect 'maximum speed'.

If you can choose between SGMII and 1000BaseX, GMII or RGMII, it is
not defined which you should pick. However generally, *MII and a
SERDES are mutually exclusive in hardware, except for the 6352 which
have some ports with both. The switches do have strapping pins which
can configure most ports into specific modes, which is probably want
most boards do, and the "maximum speed" probably does not in fact
adjust the port mode unless really required. It does however configure
the MAC to fixed speed. There is a degree of separation between the
MAC and the internal PHY/PCS. So the MAC could be fixed, and the PCS
is probably using its power up defaults which could be to perform
in-band signalling. And it is very likely the results from that in
band signalling is ignored by the MAC.

So this is all pretty fragile, but that is the way this has all
evolved over time for the mv88e6xxx driver. And so far, boards
continue to work, or at least, we are not going reports they are
broken. That however does not mean it will not all implode sometime in
the future, and we probably should be asking new submissions to always
use a fixed-link and a phy-mode, even if it is not strictly needed.

	Andrew
Andrew Lunn July 22, 2022, 10:35 p.m. UTC | #32
> If a DSA driver defaults to AN enabled on the DSA/CPU ports, and makes
> use of the defaulting firmware description, then this will break with
> these patches, since we setup a fixed-link specifier that states that
> no AN should be used.

There is another way to look at this. AN is only an issue for SERDES
based links. A bit of grepping:

vf610-zii-cfu1.dts:			compatible = "marvell,mv88e6085";
vf610-zii-dev-rev-b.dts:				compatible = "marvell,mv88e6085";
vf610-zii-dev-rev-b.dts:				compatible = "marvell,mv88e6085";
vf610-zii-dev-rev-b.dts:				compatible = "marvell,mv88e6085";
vf610-zii-dev-rev-c.dts:				compatible = "marvell,mv88e6190";
vf610-zii-dev-rev-c.dts:				compatible = "marvell,mv88e6190";
vf610-zii-scu4-aib.dts:				compatible = "marvell,mv88e6190";
vf610-zii-scu4-aib.dts:				compatible = "marvell,mv88e6190";
vf610-zii-scu4-aib.dts:				compatible = "marvell,mv88e6190";
vf610-zii-scu4-aib.dts:				compatible = "marvell,mv88e6190";
vf610-zii-spb4.dts:			compatible = "marvell,mv88e6190";
vf610-zii-ssmb-dtu.dts:			compatible = "marvell,mv88e6190";
vf610-zii-ssmb-dtu.dts:				compatible = "marvell,mv88e6xxx-mdio-external";
vf610-zii-ssmb-spu3.dts:			compatible = "marvell,mv88e6190";

vf610 is a Vybrid, which is fast Ethernet. No SERDES. We cannot break
the CPU port on these...

kirkwood-dir665.dts:		compatible = "marvell,mv88e6085";
kirkwood-l-50.dts:		compatible = "marvell,mv88e6085";
kirkwood-l-50.dts:		compatible = "marvell,mv88e6085";
kirkwood-linksys-viper.dts:		compatible = "marvell,mv88e6085";
kirkwood-mv88f6281gtw-ge.dts:		compatible = "marvell,mv88e6085";
kirkwood-rd88f6281.dtsi:		compatible = "marvell,mv88e6085";

RGMII or GMII. You cannot break the CPU port on these.

orion5x-netgear-wnr854t.dts:		compatible = "marvell,mv88e6085";

Even older than kirkwood, mo chance it uses SERDES.

imx51-zii-rdu1.dts:			compatible = "marvell,mv88e6085";
imx51-zii-scu2-mezz.dts:			compatible = "marvell,mv88e6085";
imx51-zii-scu3-esb.dts
imx6q-bx50v3.dtsi:			compatible = "marvell,mv88e6085"; /* 88e6240*/
imx6qdl-gw5904.dtsi:			compatible = "marvell,mv88e6085";
imx6qdl-zii-rdu2.dtsi:			compatible = "marvell,mv88e6085";
imx7d-zii-rpu2.dts:			compatible = "marvell,mv88e6085";

These all have a FEC, so are either GMII or MII. No SERDES.

What is left for 32bit ARM is:

armada-370-rd.dts:		compatible = "marvell,mv88e6085";
Has a fixed-link for the switch, and nothing for the SoC

armada-381-netgear-gs110emx.dts:		compatible = "marvell,mv88e6190";
Has a fixed-link for the switch and a fixed-link for the SoC, as is RGMII

armada-385-clearfog-gtr-l8.dts:		compatible = "marvell,mv88e6190";
armada-385-clearfog-gtr-s4.dts:		compatible = "marvell,mv88e6085";
These two have nothing for the CPU port, SoC has fixed-link, "2500base-x"

armada-385-linksys.dtsi:		compatible = "marvell,mv88e6085";
Has a fixed link, and Soc also has a fixed link, SGMII.

armada-385-turris-omnia.dts:		compatible = "marvell,mv88e6085";
Has a fixed link, with phy-mode rgmii-id.

armada-388-clearfog.dts:		compatible = "marvell,mv88e6085";
Has a fixed link, SoC also has a fixed link.

armada-xp-linksys-mamba.dts:		compatible = "marvell,mv88e6085";
Has a fixed-link, nothing for the SoC side.

So the majority of boards are:

1) Not SERDES based

or

2) Have a fixed-link.

It is just the two clearfog boards which might have a problem, but
these two also use mvneta, and Russell already pointed out, they are
by default forgiving with inband signalling.

In the arm64 world, we have:

freescale/imx8mq-zii-ultra.dtsi:			compatible = "marvell,mv88e6085";
marvell/cn9130-crb.dtsi:		compatible = "marvell,mv88e6190";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6190";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6085";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6190";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6085";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6190";
marvell/armada-3720-turris-mox.dts:		compatible = "marvell,mv88e6085";
marvell/armada-3720-espressobin.dtsi:		compatible = "marvell,mv88e6085";
marvell/armada-7040-mochabin.dts:		compatible = "marvell,mv88e6085";
marvell/armada-8040-clearfog-gt-8k.dts:		compatible = "marvell,mv88e6085";

So another RGMII FEC, and then Marvell devices which are all pretty
forgiving.

So i would say, the likelihood of the CPU port breaking is pretty low.

DSA ports could also be an issue here.

armada-3720-turris-mox.dts has:

                                phy-mode = "2500base-x";
                                managed = "in-band-status";
for all its DSA ports.

vf610-zii-dev-rev-b.dts has fixed link, some ports are rgmii, some are
1000base-X.

vf610-zii-dev-rev-c.dts does not have fixed link and the ports are
xaui. Does xaui have in-band signalling?

vf610-zii-scu4-aib.dts does not have fixed link and the ports are xgmii and 2500base-x.

So there are more open questions here, but a lot less boards.

   Andrew
Vladimir Oltean July 22, 2022, 10:39 p.m. UTC | #33
On Fri, Jul 22, 2022 at 10:20:05PM +0100, Russell King (Oracle) wrote:
> > > > What is hard for me to comprehend is how we ever came to conclude that
> > > > for SERDES protocols where clause 37 is possible (2500base-x should be
> > > > part of this group), managed = "in-band-status" does not imply in-band
> > > > autoneg, considering the mvneta precedent.
> > > 
> > > That is a recent addition, since the argument was made that when using
> > > a 1000base-X fibre transceiver, using ethtool to disable autoneg is a
> > > reasonable thing to do - and something that was supported with
> > > mvneta_ethtool_set_link_ksettings() as it stands at the point in the
> > > commit above.
> > 
> > I'm sorry, I don't understand. What is the recent addition, and recent
> > relative to what? The 2500base-x link mode? Ok, but this is only
> > tangentially related to the point overall, more below.
> 
> I'm talking about how we handle 1000base-X autoneg - specifically this
> commit:
> 
> 92817dad7dcb net: phylink: Support disabling autonegotiation for PCS
> 
> where we can be in 1000base-X with managed = "in-band-status" but we
> have autoneg disabled. I thought that is what you were referring to.

So the correction you're persistently trying to make is:
managed = "in-band-status" does *not* necessarily imply in-band autoneg
being enabled, because the user can still run "ethtool -s eth0 autoneg off"
?

Yeah, ok, I wasn't trying to build any argument upon ethtool being able
to toggle autoneg. You can disable it, but it should still come up as
enabled. What I was saying was as a (possibly too generic) response to
this:

| | The way I understand what you're saying is that there is no guarantee
| | that the DSA master and CPU port will agree whether to use in-band
| | autoneg or not here (and implicitly, there is no guarantee that this
| | link will work):
| |
| |       &eth0 {
| |               phy-mode = "2500base-x";
| |               managed = "in-band-status";
| |       };
| |
| |       &switch_cpu_port {
| |               ethernet = <&eth0>;
| |               phy-mode = "2500base-x";
| |               managed = "in-band-status";
| |       };
| 
| Today, there is no guarantee - because it depends on how people have
| chosen to implement 2500base-X, and whether the hardware requires the
| use of in-band AN or prohibits it. This is what happens when stuff
| isn't standardised - one ends up with differing implementations doing
| different things, and this has happened not _only_ at hardware level
| but also software level as well.

If there is no guarantee that the above will (at least try) to use in-band
autoneg, it means that there is someone who decided, when he coded up
the driver, that managed = "in-band-status" doesn't imply using in-band
autoneg. That's what I was complaining about: I don't understand how we
got here. In turn, this came from an observation about the inband/10gbase-r
not having any actual in-band autoneg (more about this at the very end).

> As for 2500base-X, I had been raising the issue of AN in the past, for
> example (have I said it's really difficult to find old emails even with
> google?):
> 
> https://lwn.net/ml/netdev/20200618140623.GC1551@shell.armlinux.org.uk/

Why is this old conversation relevant? You said you suspect the hardware
is capable of AN at 2500base-x. It isn't - I had tested that on LS1028A
when I wrote the comment, and the check - all part of the code that was
being moved.

> and eventually I stopped caring about it, as it became pointless to
> raise it anymore when we had an established mixture of behaviours. This
> is why we have ended up with PCS drivers configuring for no AN for a
> firmware description of:
> 
> 	managed = "in-band-status";
> 	phy-mode = "2500base-x";

Sorry, I don't get why?

> and hence now have unclean semantics for this - some such as mvneta
> and mvpp2 will have AN enabled. Others such as pcs-lynx will not.

You mean in general, or with the firmware description you posted above?
Because the Lynx PCS does the best it can (considering it does this from
a function that returns void) to complain that you shouldn't put
MLO_AN_INBAND for 2500base-x.

static void lynx_pcs_link_up_2500basex(struct mdio_device *pcs,
				       unsigned int mode,
				       int speed, int duplex)
{
	u16 if_mode = 0;

	if (mode == MLO_AN_INBAND) {
		dev_err(&pcs->dev, "AN not supported for 2500BaseX\n");
		return;
	}

I noticed just earlier today that I made a blunder while upstreaming some
riser cards for some LS1028A-QDS development boards, and I did just that
(left 2500base-x with in-band-status). But the system just errors out.
I need to boot a board and fix that up. They're just NXP development
systems so not a big issue. Otherwise I'm not aware of what you're
talking about.

> However, both will request link status from the PCS side and use that
> to determine whether the link is up, and use the parameters that the
> PCS code returns for the link. Since 2500base-X can only operate at
> 2.5G, PCS code always reports SPEED_2500, and as half duplex is
> virtually never supported above 1G, DUPLEX_FULL.

If you're saying this just because Lynx implements pcs_get_state for
2500base-x, it's extremely likely that this simply originates from
vsc9959_pcs_link_state_2500basex(), which was deleted in ocelot in
commit 588d05504d2d ("net: dsa: ocelot: use the Lynx PCS helpers in
Felix and Seville"), and it was always dead code. It wasn't the only
dead code, remember commit b4c2354537b4 ("net: dsa: felix: delete
.phylink_mac_an_restart code").

Since the Lynx PCS prints error messages in inband/2500base-x mode,
and so did Felix/Ocelot before the code became common, I'm pretty sure
no one relies on this mode.

> > > > And why would we essentially redefine its meaning by stating that no,
> > > > it is only about the status, not about the autoneg, even though the
> > > > status comes from the autoneg for these protocols.
> > > 
> > > I'm not sure I understand what you're getting at there.
> > 
> > Sorry if I haven't made my point clear.
> > 
> > My point is that drivers may have more restrictive interpretations of
> > managed = "in-band-status", and the current logic of automatically
> > create a fixed-link for DSA's CPU ports is going to cause problems when
> > matched up with a DSA master that expects in-band autoneg for whatever
> > SERDES protocol.
> > 
> > What I'd like to happen as a result is that no DSA driver except Marvell
> > opts into this by default, and no driver opts into it without its maintainer
> > understanding the implications. Otherwise we're going to normalize the
> > expectation that a managed = "in-band-status" DSA master should be able
> > to interoperate with a fixed-link CPU port, but during this discussion
> > there was no argument being brought that a strict interpretation of
> > "in-band-status" as "enable autoneg" is incorrect in any way.
> 
> I still don't understand your point - because you seem to be conflating
> two different things (at least as I understand it.)
> 
> We have this:
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 		};
> 
> This specifies that the port operates at whatever interface mode and
> settings gives the maximum speed. There is no mention of a "managed"
> property, and therefore (Andrew, correct me if I'm wrong) in-band
> negotiation is not expected to be used.
> 
> The configuration of the ethX parameters on the other end of the link
> are up to the system integrator to get right, and the actual behaviour
> would depend on the ethernet driver. As I've said in previous emails,
> there is such a thing as "AN bypass" that can be implemented,

Not everyone has AN bypass, try to assume that no one except mvneta does.

> and it can default to be enabled, and drivers can ignore that such a
> bit even exists. So, it's possible that even with "managed" set to
> "in-band-status" in DT, a link to such a DSA switch will still come up
> even though we've requested in DT for AN to be used.
> 
> If an ethernet driver is implemented to strictly require in-band AN in
> this case, then the link won't come up, and the system integrator would
> have to debug the problem.
> 
> I think this is actually true on Clearfog - if one specifies the CPU
> port as I have above, and requests in-band on the host ethernet, then
> the link doesn't come up, because mvneta turns off AN bypass.

So what am I conflating in this case?

> > Thanks for this explanation, if nothing else, it seems to support the
> > way in which I was interpreting managed = "in-band-status" to mean
> > "enable in-band autoneg", but to be clear, I wasn't debating something
> > about the way in which mvneta was doing things. But rather, I was
> > debating why would *other* drivers do things differently such as to come
> > to expect that a fixed-link master + an in-band-status CPU port, or the
> > other way around, may be compatible with each other.
> 
> Please note that phylink makes a DT specification including both a
> fixed-link descriptor and a managed in-band-status property illegal
> because these are two different modes of operating the link, and they
> conflict with each other.

Ok, thank you for this information which I already knew, what is the context?

> The fact that the of_fixed_link_whateveritwas() function (sorry I can't
> remember the name) returns true for both indicating that they're both

of_phy_is_fixed_link()

> fixed-link is a historic artifact that has not been changed. As the
> fixed-PHY code supporting that way was dropped, I suppose that should
> have been cleaned up at some point, but I never got around to it
> (remember, development in this space is a very slow process.) There
> were always more pressing matters to be dealt with.
> 
> Maybe we should now make of_fixed_link_whateveritwas() no longer return
> true, and introduce of_managed_in_band() or something like that which
> drivers can test that separately. I'm not sure it's worth the driver
> churn to make such a change, I'm not sure what the benefit would be.

If we were to split of_phy_is_fixed_link() into of_phy_is_fixed_link()
and of_managed_in_band(), we'd effectively need to replace every
instance of "if (of_phy_is_fixed_link(np))" with
"if (of_phy_is_fixed_link(dp) || of_managed_in_band(np))" unless
instructed otherwise by maintainers. And maintainers will think:
"whoa, I had no idea...". Indeed, there are better uses of time.

> > Then I studied MVNETA_GMAC_AUTONEG_CONFIG and I noticed that the bit
> > you're talking about, MVNETA_GMAC_AN_BYPASS_ENABLE (bit 3) is indeed set
> > by default (the driver doesn't set it).
> 
> Correct - because of history, and changing it could break setups that
> have been working since before DT. The driver has never changed the
> bypass bit, so playing with that when phylink was introduced risked
> regressions.

No, as mentioned, it's good that it exists, just please don't come to
assume that everyone has something like it, and use it as a justification
that hey, I can create a fixed-link willy-nilly, and that will disable
in-band AN on the switch side, and then the system integrator will come
to debug, he'll see what happened there and he'll enable the AN bypass
bit in his other driver. That's not what's going to happen.

> > So what this is telling me is that mvneta has some built-in resilience
> > to in-band autoneg mismatches, via MVNETA_GMAC_AN_BYPASS_ENABLE. But that
> > (a) doesn't make it valid to mix and match a fixed-link with a managed =
> >     "in-band-status" mode
> > (b) doesn't mean it's unspecified whether managed = "in-band-status"
> >     should dictate whether to enable in-band autoneg or not
> > (c) doesn't mean that other devices/drivers support "AN bypass" to save
> >     the day and make an invalid DT description appear to work just fine
> > 
> > This further supports my idea that we should make a better attempt of
> > matching the DSA master's mode with the node we're faking in DSA for
> > phylink. For Marvell hardware you or Andrew are surely more knowledgeable
> > to be able to say whether that's needed right now or not. But in the
> > general case please don't push this to everyone, it just muddies the
> > waters.
> 
> I really don't get this.
> 
> For a mv88e6xxx port which supports 1000base-X, with these patches
> applied, then these are all effectively equivalent:
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 		};
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 			phy-mode = "1000base-x";
> 		};
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 			fixed-link {
> 				speed = <1000>;
> 				full-duplex;
> 			};
> 		};
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 			phy-mode = "1000base-x";
> 			fixed-link {
> 				speed = <1000>;
> 				full-duplex;
> 			};
> 		};
> 
> and all _should_ lead to inband AN being disabled.
> 
> That is my understanding of Andrew's statements on the defaulting
> parameters for both the inter-switch and CPU ports. (Maybe Andrew can
> clarify whether this is correct or not.)
> 
> However, this would not equivalent to any of the above:
> 
> 		port@N {
> 			reg = <N>;
> 			label = "cpu";
> 			ethernet = <&ethX>;
> 			managed = "in-band-status";
> 		};
> 
> The reason this is not equivalent is - as you've recently spotted -
> of_phy_is_fixed_link() will return true, and therefore phylink gets
> passed the above node to work with - and we do not generate a swnode
> fixed-link stanza for it. The behaviour in this case is completely
> unaffected by these patches.

Thank you for this explanation, it is correct and I don't disagree with it.

> If a DSA driver defaults to AN enabled on the DSA/CPU ports, and makes
> use of the defaulting firmware description, then this will break with
> these patches, since we setup a fixed-link specifier that states that
> no AN should be used.

Bingo, you hit the nail on the head. I was saying that we don't know
whether in-band should be used or not. We could take as a vague hint the
managed = 'in-band-status' property of the DSA master. Additionally,
if the DSA master is mvneta-like, in-band AN could be broken and no one
might even notice. But chances are it might not be mvneta-like.
Only you or Andrew know the chances of that, and maybe you're willing to
take the gamble of unconditionally creating a fixed-link (no in-band AN)
as you're doing now, when in fact, correctness and all, a managed =
"in-band-status" OF property is what should have been auto-created.
My feedback is - if you're willing to take that gamble, do it just for
mv88e6xxx and the boards that got integrated into, I'm not willing to
opt other drivers into it.

> That's why I've been trying to get these tested, and that's why
> there's a risk with them. However, that's got nothing to do with
> whether the driver implements filling in this new "default_interface"
> field.
> 
> We could go delving into the node pointed to by the phandle and retrieve
> whatever parameters from there, but that is an entirely new behaviour
> and would be a functional change to the behaviour that Andrew has been
> promoting - and is itself not free of a risk of regressions caused by
> that approach. What if there's an interface converter in the path between
> the# CPU ethernet device and the DSA that hasn't needed to be described?
> Digging out the phy-mode from the CPU side could be the wrong thing to
> do.
> 
> Then there's the question whether DSA should have been validating that
> the description on both ends of the link are compatible with each other.
> The problem with that is just the same as the above - an undescribed
> interface converter would make such validation problematical.

Not on both ends, but individual drivers could validate their own
bindings to be perfectly honest with you. I think we should prioritize
on that. If driver maintainers don't know that there are defaulting DT
blobs in the wild, don't give them ideas.

> So, I don't think we could rely on the description on the other end of
> the link to be capable of describing the setup on the DSA port end.

If you go back in our conversation, I wasn't saying you should, either.
It was just a comment about the impossibility of the situation - it's is
a guessing game that shouldn't in any circumstance be generalized and
will never become perfect. I've been saying this a million times.
Move to phylink no more than what you need; assume everything else
works; add validation to what you don't have proof uses defaulting mode;
wait.

Now consider what originally triggered me. cn9130-crb.dtsi has a
defaulting CPU port of a 6393X, where the DSA master uses inband/10gbase-r
mode. What is DSA even supposed to make of that, beats me, if it were to
hypothetically follow the 'ethernet' phandle to determine whether the
master wants in-band AN or doesn't...
Russell King (Oracle) July 23, 2022, 7:12 a.m. UTC | #34
On Sat, Jul 23, 2022 at 01:39:32AM +0300, Vladimir Oltean wrote:
> On Fri, Jul 22, 2022 at 10:20:05PM +0100, Russell King (Oracle) wrote:
> > > > > What is hard for me to comprehend is how we ever came to conclude that
> > > > > for SERDES protocols where clause 37 is possible (2500base-x should be
> > > > > part of this group), managed = "in-band-status" does not imply in-band
> > > > > autoneg, considering the mvneta precedent.
> > > > 
> > > > That is a recent addition, since the argument was made that when using
> > > > a 1000base-X fibre transceiver, using ethtool to disable autoneg is a
> > > > reasonable thing to do - and something that was supported with
> > > > mvneta_ethtool_set_link_ksettings() as it stands at the point in the
> > > > commit above.
> > > 
> > > I'm sorry, I don't understand. What is the recent addition, and recent
> > > relative to what? The 2500base-x link mode? Ok, but this is only
> > > tangentially related to the point overall, more below.
> > 
> > I'm talking about how we handle 1000base-X autoneg - specifically this
> > commit:
> > 
> > 92817dad7dcb net: phylink: Support disabling autonegotiation for PCS
> > 
> > where we can be in 1000base-X with managed = "in-band-status" but we
> > have autoneg disabled. I thought that is what you were referring to.
> 
> So the correction you're persistently trying to make is:
> managed = "in-band-status" does *not* necessarily imply in-band autoneg
> being enabled, because the user can still run "ethtool -s eth0 autoneg off"
> ?

Correct.

> | | The way I understand what you're saying is that there is no guarantee
> | | that the DSA master and CPU port will agree whether to use in-band
> | | autoneg or not here (and implicitly, there is no guarantee that this
> | | link will work):
> | |
> | |       &eth0 {
> | |               phy-mode = "2500base-x";
> | |               managed = "in-band-status";
> | |       };
> | |
> | |       &switch_cpu_port {
> | |               ethernet = <&eth0>;
> | |               phy-mode = "2500base-x";
> | |               managed = "in-band-status";
> | |       };
> | 
> | Today, there is no guarantee - because it depends on how people have
> | chosen to implement 2500base-X, and whether the hardware requires the
> | use of in-band AN or prohibits it. This is what happens when stuff
> | isn't standardised - one ends up with differing implementations doing
> | different things, and this has happened not _only_ at hardware level
> | but also software level as well.
> 
> If there is no guarantee that the above will (at least try) to use in-band
> autoneg, it means that there is someone who decided, when he coded up
> the driver, that managed = "in-band-status" doesn't imply using in-band
> autoneg. That's what I was complaining about: I don't understand how we
> got here. In turn, this came from an observation about the inband/10gbase-r
> not having any actual in-band autoneg (more about this at the very end).

We got here through cases such as the one I pointed out where I tried
to highlight the issue. Maybe something happened in the pcs-lynx case,
it's pretty hard to find the history via google now, because searching
there does not give all the different versions of the patch set.

Maybe it was some other PCS, I don't know. Same problem, searching
google is very patchy at finding the various versions of patchsets
that were submitted.

All I know is that at some point I gave up pointing the issue out.

pcs-lynx today does issue an error. pcs-xpcs doesn't, it just turns
off AN no matter what. I can't find the history via google for that.

> > and eventually I stopped caring about it, as it became pointless to
> > raise it anymore when we had an established mixture of behaviours. This
> > is why we have ended up with PCS drivers configuring for no AN for a
> > firmware description of:
> > 
> > 	managed = "in-band-status";
> > 	phy-mode = "2500base-x";
> 
> Sorry, I don't get why?

Why what? Why I stopped caring about this issue? Or Why we ended up
with drivers configuring the above without AN? I think for the latter
I've explained how we got here. For the former, it's the feeling that
my comments were being ignored or just lead to arguments, so I stopped
bothering.

> > I still don't understand your point - because you seem to be conflating
> > two different things (at least as I understand it.)
> > 
> > We have this:
> > 
> > 		port@N {
> > 			reg = <N>;
> > 			label = "cpu";
> > 			ethernet = <&ethX>;
> > 		};
> > 
> > This specifies that the port operates at whatever interface mode and
> > settings gives the maximum speed. There is no mention of a "managed"
> > property, and therefore (Andrew, correct me if I'm wrong) in-band
> > negotiation is not expected to be used.
> > 
> > The configuration of the ethX parameters on the other end of the link
> > are up to the system integrator to get right, and the actual behaviour
> > would depend on the ethernet driver. As I've said in previous emails,
> > there is such a thing as "AN bypass" that can be implemented,
> 
> Not everyone has AN bypass, try to assume that no one except mvneta does.

I think I said "can be implemented", meaning not everyone does.

> > and it can default to be enabled, and drivers can ignore that such a
> > bit even exists. So, it's possible that even with "managed" set to
> > "in-band-status" in DT, a link to such a DSA switch will still come up
> > even though we've requested in DT for AN to be used.
> > 
> > If an ethernet driver is implemented to strictly require in-band AN in
> > this case, then the link won't come up, and the system integrator would
> > have to debug the problem.
> > 
> > I think this is actually true on Clearfog - if one specifies the CPU
> > port as I have above, and requests in-band on the host ethernet, then
> > the link doesn't come up, because mvneta turns off AN bypass.
> 
> So what am I conflating in this case?

You seem to think that the above DT stanza can end up with in-band AN
enabled.

> > > Thanks for this explanation, if nothing else, it seems to support the
> > > way in which I was interpreting managed = "in-band-status" to mean
> > > "enable in-band autoneg", but to be clear, I wasn't debating something
> > > about the way in which mvneta was doing things. But rather, I was
> > > debating why would *other* drivers do things differently such as to come
> > > to expect that a fixed-link master + an in-band-status CPU port, or the
> > > other way around, may be compatible with each other.
> > 
> > Please note that phylink makes a DT specification including both a
> > fixed-link descriptor and a managed in-band-status property illegal
> > because these are two different modes of operating the link, and they
> > conflict with each other.
> 
> Ok, thank you for this information which I already knew, what is the context?

FFS. You're the one who's writing emails to me that include *both*
"fixed-link" and "in-band-status" together. I'm pointing out that
specifying that in DT for a port together is not permitted.

And here I give up reading this email. Sorry, I'm too frustrated
with this nitpicking, and too frustrated with spending hours writing a
reply only to have it torn apart.
Vladimir Oltean July 23, 2022, 1:44 p.m. UTC | #35
On Sat, Jul 23, 2022 at 08:12:04AM +0100, Russell King (Oracle) wrote:
> > > > Thanks for this explanation, if nothing else, it seems to support the
> > > > way in which I was interpreting managed = "in-band-status" to mean
> > > > "enable in-band autoneg", but to be clear, I wasn't debating something
> > > > about the way in which mvneta was doing things. But rather, I was
> > > > debating why would *other* drivers do things differently such as to come
> > > > to expect that a fixed-link master + an in-band-status CPU port, or the
> > > > other way around, may be compatible with each other.
> > > 
> > > Please note that phylink makes a DT specification including both a
> > > fixed-link descriptor and a managed in-band-status property illegal
> > > because these are two different modes of operating the link, and they
> > > conflict with each other.
> > 
> > Ok, thank you for this information which I already knew, what is the context?
> 
> FFS. You're the one who's writing emails to me that include *both*
> "fixed-link" and "in-band-status" together. I'm pointing out that
> specifying that in DT for a port together is not permitted.
> 
> And here I give up reading this email. Sorry, I'm too frustrated
> with this nitpicking, and too frustrated with spending hours writing a
> reply only to have it torn apart.

This is becoming toxic. You've imagined that when I was talking about
mixing fixed-link and managed = "in-band-status" I was referring to this
kind of DSA port OF node:

	port@N {
		managed = "in-band-status";

		fixed-link {
			speed = <1000>;
			full-duplex;
		};
	};

Now you're thinking I'm retarded because you've politely told me the above
is invalid, and you're wondering why I'm still talking despite of that.

Well guess what, I've never once mentioned this kind of invalid OF node,
I'm not the one who's writing emails to you that include
"both fixed-link and in-band-status together", yet in your mind the fact
that you may have misunderstood isn't even a possibility. What I'm
talking about is TWO OF nodes, one like this:

	master: ethernet@N: {
		managed = "in-band-status";
	};

	switch_cpu_port: port@N: {
		ethernet = <&master>;

		fixed-link {
			speed = <1000>;
			full-duplex;
		};
	};

It is *these* two that need to agree on in-band autoneg, when the "fixed-link"
of switch_cpu_port was created using software nodes, damn it. Andrew
said that it isn't specified whether in-band autoneg is used or not.
It was even repeated for the millionth time in the continuation of my
email, which you should have read instead of frustrating yourself for a
stupid reason again.

If you think I'm making this up and I *was* talking about in-band-status
and fixed-link together in the same node, go ahead and search back where
I've said that, or even implied that. But don't blame me when you'll get
frustrated again that you won't find it. I re-read what I said once
yesterday and once today.

That's where our communication problem is, you're politely trying to
tell your conversation partner that he's an idiot and he JUST DOESN'T
WANT TO UNDERSTAND, DAMN IT.

In any case, it looks like it's time to remove myself from this conversation.
I am going to propose a patch shortly which adds validation in DSA for
the OF nodes of DSA and CPU ports, and opts some drivers out of it.
I'm going to opt into validation as many drivers as reasonably possible.
You'll then have to work with the driver maintainers who opted out of
it. I'm not one of them, so you won't have to work with me. Beyond that,
I just don't care and I had enough. I have other things to do too.
Marek Behún July 23, 2022, 5:26 p.m. UTC | #36
On Sat, 23 Jul 2022 01:39:32 +0300
Vladimir Oltean <olteanv@gmail.com> wrote:

> You mean in general, or with the firmware description you posted above?
> Because the Lynx PCS does the best it can (considering it does this from
> a function that returns void) to complain that you shouldn't put
> MLO_AN_INBAND for 2500base-x.
> 
> static void lynx_pcs_link_up_2500basex(struct mdio_device *pcs,
> 				       unsigned int mode,
> 				       int speed, int duplex)
> {
> 	u16 if_mode = 0;
> 
> 	if (mode == MLO_AN_INBAND) {
> 		dev_err(&pcs->dev, "AN not supported for 2500BaseX\n");
> 		return;
> 	}
> 
> I noticed just earlier today that I made a blunder while upstreaming some
> riser cards for some LS1028A-QDS development boards, and I did just that
> (left 2500base-x with in-band-status). But the system just errors out.
> I need to boot a board and fix that up. They're just NXP development
> systems so not a big issue. Otherwise I'm not aware of what you're
> talking about.
> 
> > However, both will request link status from the PCS side and use that
> > to determine whether the link is up, and use the parameters that the
> > PCS code returns for the link. Since 2500base-X can only operate at
> > 2.5G, PCS code always reports SPEED_2500, and as half duplex is
> > virtually never supported above 1G, DUPLEX_FULL.  
> 
> If you're saying this just because Lynx implements pcs_get_state for
> 2500base-x, it's extremely likely that this simply originates from
> vsc9959_pcs_link_state_2500basex(), which was deleted in ocelot in
> commit 588d05504d2d ("net: dsa: ocelot: use the Lynx PCS helpers in
> Felix and Seville"), and it was always dead code. It wasn't the only
> dead code, remember commit b4c2354537b4 ("net: dsa: felix: delete
> .phylink_mac_an_restart code").
> 
> Since the Lynx PCS prints error messages in inband/2500base-x mode,
> and so did Felix/Ocelot before the code became common, I'm pretty sure
> no one relies on this mode.

Does Lynx PCS support 1000base-x with AN? Because if so, it may be
possible to somehow hack working AN for 2500base-x, as I managed it for
88E6393X in the commit I mentioned (by configuring 1000base-x and then
hacking the PHY speed to 2.5x).

Anyway, I am now looking at the standards, and it seems that all the X
and R have K variant: 1000base-kx, 2500base-kx, 5gbase-kr and
10gbase-kr. These modes have mandatory clause 73 autonegotiation.

So either we need to add these as different modes of the
phy_interface_t type, or we need to differentiate whether clause 37 or
clause 73 AN should be used by another property.

But since 1000base-x supports clause 37 and 1000base-kx clause 73, the
one property that we have, managed="in-band-status" is not enough, if
we keep calling both modes '1000base-x'.

So maybe we really need to add K variants as separate
PHY_INTERFACE_MODED_ constants. That way we can keep assuming clause 37
for 2500base-x, and try to implement it for as much drivers as
possible, by hacking it up...

And I still don't understand this clause 73 AN at all. For example, if
one PHY supports only up to 2.5g speeds, will it complete AN with
another PHY that supports up to 10g speeds, if the second PHY will
(maybe?) try at higher frequency?

Marek
Vladimir Oltean July 24, 2022, 5:39 p.m. UTC | #37
On Sat, Jul 23, 2022 at 07:26:55PM +0200, Marek Behún wrote:
> Does Lynx PCS support 1000base-x with AN?

Yes, that would be the intention.

> Because if so, it may be possible to somehow hack working AN for
> 2500base-x, as I managed it for 88E6393X in the commit I mentioned (by
> configuring 1000base-x and then hacking the PHY speed to 2.5x).

I would need to try and see. For Lynx, to dynamically change from
1000base-x to 2500base-x essentially means to move the SERDES lane from
a PLL that can provide the 1.25 GHz required for 1000base-x to a PLL
that can provide the 3.125 GHz required for 2500base-x. The procedure
itself doesn't involve resetting the PCS, but to be honest with you,
I don't know whether the state of the PCS registers is going to be
preserved across the PLL change. Maybe it isn't, but this is entirely
masked out by the phylink major reconfig process, I don't know.

The alternative to dynamic reconfiguration is to program some bits that
instruct the SoC what to do on power-on reset, and these bits include
the initial SERDES protocols and PLL assignments too. I only tried to
experiment with in-band autoneg in this mode (with the lane being
configured for 2.5G out of reset, rather than dynamically switching it
to 2.5G).

> Anyway, I am now looking at the standards, and it seems that all the X
> and R have K variant: 1000base-kx, 2500base-kx, 5gbase-kr and
> 10gbase-kr. These modes have mandatory clause 73 autonegotiation.

The X in BASE-X stands for 8b/10b coding, the R stands for 64b/66b coding.
Whereas the K stands for bacKplane, i.e. the medium (compare this with
the T in BASE-T, for twisted pair copper cable). Or with 1000BASE-SX and
1000BASE-LX, the S stands for Short wavelength laser and the L for Long
wavelength.

What I'm trying to say, the 'X' in BASE-X doesn't stand for anything
having to do with fiber, I guess 1000BASE-X is just a generic name for
the coding scheme (PCS level) rather than something about the medium
(PMD level). The terminology is pretty much a mess.

> So either we need to add these as different modes of the
> phy_interface_t type, or we need to differentiate whether clause 37 or
> clause 73 AN should be used by another property.
> 
> But since 1000base-x supports clause 37 and 1000base-kx clause 73, the
> one property that we have, managed="in-band-status" is not enough, if
> we keep calling both modes '1000base-x'.
> 
> So maybe we really need to add K variants as separate
> PHY_INTERFACE_MODED_ constants. That way we can keep assuming clause 37
> for 2500base-x, and try to implement it for as much drivers as
> possible, by hacking it up...

Well, for good or bad, 10GBase-KR does have its own phy-mode string,
and Sean Anderson is sending a patch to add 1000base-KX now too.
https://patchwork.kernel.org/project/netdevbpf/patch/20220719235002.1944800-3-sean.anderson@seco.com/
(I still don't understand what that has to do with the topic of his
series, but anyway)

More at the end.

> 
> And I still don't understand this clause 73 AN at all. For example, if
> one PHY supports only up to 2.5g speeds, will it complete AN with
> another PHY that supports up to 10g speeds, if the second PHY will
> (maybe?) try at higher frequency?

Define what you mean by "one PHY supports only up to 2.5G speeds".
My copy of IEEE 802.3-2018 doesn't list in Table 73–4—Technology Ability
Field encoding any signaling mode that is capable of 2.5G, but rather
1000BASE-KX, 10GBASE-KR, 25GBASE-KR and so on. So you'd have to express
your question in terms of bits that are actually advertised through the
Technology Ability field.

Then, clause 73 AN, very much like the clause 28/40 AN of BASE-T (to
which it is most directly comparable) has a priority resolution function,
meaning that if 2 link partners advertise support for multiple
technologies, Table 73–5—Priority Resolution will decide which one of
the commonly advertised technologies gets used.

Side note: contrast this with flow control, which annoyingly was
designed by IEEE to not have a priority resolution, in other words you
don't get a graceful falloff of the resolved pause modes depending on
what you and the link partner advertised, instead you need to
preconfigure both ends if you want to achieve a particular result;
this is IMO as useless as not having AN at all.

There is of course no guarantee that two backplane link partners will
have any technology ability in common, for example one may advertise
only 1000Base-KX and the other only 10GBase-KR. In that case, autoneg
will complete, but the link will simply not come up.

The clause 73 autoneg signaling takes place using a predetermined, low-speed
encoding. The medium transitions to the highest negotiated technology,
and performs clause 74 link training on that medium, only after both
ends agree that clause 73 autoneg has completed. This kind of implies
that they will agree on the frequency being used for the data traffic.

If you're asking whether 2 backplane devices will advertise 10GBase-KR
but one of them supports a data rate of only up to 2.5Gbps over that 10G
link, I think this is vendor-dependent and IEEE doesn't say anything
about it. For example this is where rate adaptation could come into
play, either through flow control, or there could be an extension to
clause 73 similar to what Cisco did with USXGMII, where the lane
operates at 10GBaud but via symbol replication your data rate can
actually be only 2.5Gbps. I'm not aware of real life applications of
rate adaptation over backplane links.


I hinted earlier that clause 73 autoneg is most directly comparable to
BASE-T autoneg (these 2 are even situated at different layers if you
look at the IEEE OSI stack pictures, compared to where clause 37 AN is).
The problem is that the Linux kernel support for new physical technologies
grew organically, and we don't have a structure in place that scales
naturally to all the places in which these technologies may appear in
the stack. For example we have the phy-mode, and this represents the

...

/goes searching for the documentation, I don't want to be making this up/

...

  phy-connection-type:
    description:
      Specifies interface type between the Ethernet device and a physical
      layer (PHY) device.

There you go, pretty vague. What's the Ethernet device, and what's the
PHY device?

For example SGMII connects a MAC to a PHY, but to speak SGMII to reach
to your PHY, you need another PHY that does the parallel GMII to serial
translation for you. So to say that the phy-mode is SGMII, you need to
ignore that the MAC has a PHY too.

10GBase-KR is similar in a way, it can be placed at multiple layers, and
traditionally, where you put it makes a difference to how we describe it
in Linux.

Maybe you have a 10GBase-T PHY chip with a backplane host-side PHY, it
supports clause 73 declaring the 10GBase-KR technology, then it supports
clause 74 link training, the whole shebang. These things exist. How would
you describe this? You'd say the phy-mode is "10gbase-kr", according to
precedent. Would that be the best thing to do, in the spirit of clause 73?
I don't think it would. Essentially what would need to happen as a
consequence of this description is that your PCS would essentially
populate its Technology Ability with a single bit, corresponding to what
you put in phy-mode, because that's how we shoehorned this. Then we'd
say what, that managed = "in-band-status" decides whether to bypass
clause 73 AN or not? I don't think so.

Truth is, a 10G-KR "PCS" (what we mean when we say a PHY integrated into a MAC)
is much more similar to a dedicated 10G-KR PHY, to the point that it's
indistinguishable (what Linux thinks of a phy_device is actually 2 PHYs
back to back, one for the host side and one for the medium side), and it
*needs* to be treated by Linux in the same way regardless of where it's
placed. You *need* to be able to control the backplane PCS' advertisement,
whether to use FEC or not, regardless if it's your medium facing device,
or an in-between device.

The discussion is much, much bigger than this, but in summary, I think
it would be quite short-sighted to expand managed = "in-band-status" for
anything related to clause 73, or for much more than what it means right
now (the problem is, what _does_ it mean and what _doesn't_ it?).

This, plus I think development needs to be driven by someone with real
world needs and a sense for what's practical. I am quite well outside of
the sphere of 10-gig-and-higher networking, I'm just looking from the
peanut gallery, so that won't be me.
Russell King (Oracle) July 25, 2022, 10:11 a.m. UTC | #38
On Sat, Jul 23, 2022 at 04:44:44PM +0300, Vladimir Oltean wrote:
> On Sat, Jul 23, 2022 at 08:12:04AM +0100, Russell King (Oracle) wrote:
> > > > > Thanks for this explanation, if nothing else, it seems to support the
> > > > > way in which I was interpreting managed = "in-band-status" to mean
> > > > > "enable in-band autoneg", but to be clear, I wasn't debating something
> > > > > about the way in which mvneta was doing things. But rather, I was
> > > > > debating why would *other* drivers do things differently such as to come
> > > > > to expect that a fixed-link master + an in-band-status CPU port, or the
> > > > > other way around, may be compatible with each other.
> > > > 
> > > > Please note that phylink makes a DT specification including both a
> > > > fixed-link descriptor and a managed in-band-status property illegal
> > > > because these are two different modes of operating the link, and they
> > > > conflict with each other.
> > > 
> > > Ok, thank you for this information which I already knew, what is the context?
> > 
> > FFS. You're the one who's writing emails to me that include *both*
> > "fixed-link" and "in-band-status" together. I'm pointing out that
> > specifying that in DT for a port together is not permitted.
> > 
> > And here I give up reading this email. Sorry, I'm too frustrated
> > with this nitpicking, and too frustrated with spending hours writing a
> > reply only to have it torn apart.
> 
> This is becoming toxic.

It is toxic, because I'm spending longer and longer replying to each of
your emails, because every time I do, you ask more and more questions
despite my best effort to provide clear answers. This cycle of forever
growing emails that take longer and longer to reply to can not continue.

When I spend three hours trying to compose a reply to your email, and
then get a reply that tears it apart, needing another multi-hour effort
to reply, it has to stop. Sorry, but it has to.

I am not going to spend endless hours composing one reply after another
on the same topic - I will just stop trying to compose a reply to an
email if its turning into another multi-hour effort leaving the rest of
the email unreplied - and in many cases even unread. Sorry, but I don't
have the patience.
diff mbox series

Patch

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 48cf344750ff..fe75b84ab791 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1310,7 +1310,8 @@  void b53_port_event(struct dsa_switch *ds, int port)
 EXPORT_SYMBOL(b53_port_event);
 
 static void b53_phylink_get_caps(struct dsa_switch *ds, int port,
-				 struct phylink_config *config)
+				 struct phylink_config *config,
+				 phy_interface_t *default_interface)
 {
 	struct b53_device *dev = ds->priv;
 
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index be0edfa093d0..18a3847bd82b 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -713,7 +713,8 @@  static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
 }
 
 static void bcm_sf2_sw_get_caps(struct dsa_switch *ds, int port,
-				struct phylink_config *config)
+				struct phylink_config *config,
+				phy_interface_t *default_interface)
 {
 	unsigned long *interfaces = config->supported_interfaces;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c
index 01f90994dedd..6c1ef6997789 100644
--- a/drivers/net/dsa/hirschmann/hellcreek.c
+++ b/drivers/net/dsa/hirschmann/hellcreek.c
@@ -1462,7 +1462,8 @@  static void hellcreek_teardown(struct dsa_switch *ds)
 }
 
 static void hellcreek_phylink_get_caps(struct dsa_switch *ds, int port,
-				       struct phylink_config *config)
+				       struct phylink_config *config,
+				       phy_interface_t *default_interface)
 {
 	struct hellcreek *hellcreek = ds->priv;
 
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index e531b93f3cb2..a43dabfa5453 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -1492,7 +1492,8 @@  static int gswip_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 }
 
 static void gswip_xrx200_phylink_get_caps(struct dsa_switch *ds, int port,
-					  struct phylink_config *config)
+					  struct phylink_config *config,
+					  phy_interface_t *default_interface)
 {
 	switch (port) {
 	case 0:
@@ -1525,7 +1526,8 @@  static void gswip_xrx200_phylink_get_caps(struct dsa_switch *ds, int port,
 }
 
 static void gswip_xrx300_phylink_get_caps(struct dsa_switch *ds, int port,
-					  struct phylink_config *config)
+					  struct phylink_config *config,
+					  phy_interface_t *default_interface)
 {
 	switch (port) {
 	case 0:
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 28d7cb2ce98f..4329e29a1695 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -721,7 +721,8 @@  static int ksz_check_device_id(struct ksz_device *dev)
 }
 
 static void ksz_phylink_get_caps(struct dsa_switch *ds, int port,
-				 struct phylink_config *config)
+				 struct phylink_config *config,
+				 phy_interface_t *default_interface)
 {
 	struct ksz_device *dev = ds->priv;
 
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 835807911be0..dab308e454e3 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -2914,7 +2914,8 @@  mt7531_cpu_port_config(struct dsa_switch *ds, int port)
 }
 
 static void mt753x_phylink_get_caps(struct dsa_switch *ds, int port,
-				    struct phylink_config *config)
+				    struct phylink_config *config,
+				    phy_interface_t *default_interface)
 {
 	struct mt7530_priv *priv = ds->priv;
 
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 37b649501500..f98be98551ef 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -819,7 +819,8 @@  static void mv88e6393x_phylink_get_caps(struct mv88e6xxx_chip *chip, int port,
 }
 
 static void mv88e6xxx_get_caps(struct dsa_switch *ds, int port,
-			       struct phylink_config *config)
+			       struct phylink_config *config,
+			       phy_interface_t *default_interface)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 
diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 859196898a7d..0c1ac902b110 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -937,7 +937,8 @@  static int felix_vlan_del(struct dsa_switch *ds, int port,
 }
 
 static void felix_phylink_get_caps(struct dsa_switch *ds, int port,
-				   struct phylink_config *config)
+				   struct phylink_config *config,
+				   phy_interface_t *default_interface)
 {
 	struct ocelot *ocelot = ds->priv;
 
diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c
index 0796b7cf8cae..19e95dabe5b9 100644
--- a/drivers/net/dsa/qca/ar9331.c
+++ b/drivers/net/dsa/qca/ar9331.c
@@ -501,7 +501,8 @@  static enum dsa_tag_protocol ar9331_sw_get_tag_protocol(struct dsa_switch *ds,
 }
 
 static void ar9331_sw_phylink_get_caps(struct dsa_switch *ds, int port,
-				       struct phylink_config *config)
+				       struct phylink_config *config,
+				       phy_interface_t *default_interface)
 {
 	config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
 		MAC_10 | MAC_100;
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 1cbb05b0323f..beccd8338c81 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -1749,7 +1749,8 @@  qca8k_phylink_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
 }
 
 static void qca8k_phylink_get_caps(struct dsa_switch *ds, int port,
-				   struct phylink_config *config)
+				   struct phylink_config *config,
+				   phy_interface_t *default_interface)
 {
 	switch (port) {
 	case 0: /* 1st CPU port */
diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c
index da31d8b839ac..7bf420c2b083 100644
--- a/drivers/net/dsa/realtek/rtl8365mb.c
+++ b/drivers/net/dsa/realtek/rtl8365mb.c
@@ -1024,7 +1024,8 @@  static int rtl8365mb_ext_config_forcemode(struct realtek_priv *priv, int port,
 }
 
 static void rtl8365mb_phylink_get_caps(struct dsa_switch *ds, int port,
-				       struct phylink_config *config)
+				       struct phylink_config *config,
+				       phy_interface_t *default_interface)
 {
 	const struct rtl8365mb_extint *extint =
 		rtl8365mb_get_port_extint(ds->priv, port);
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index b253e27bcfb4..e15033177643 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1390,7 +1390,8 @@  static void sja1105_mac_link_up(struct dsa_switch *ds, int port,
 }
 
 static void sja1105_phylink_get_caps(struct dsa_switch *ds, int port,
-				     struct phylink_config *config)
+				     struct phylink_config *config,
+				     phy_interface_t *default_interface)
 {
 	struct sja1105_private *priv = ds->priv;
 	struct sja1105_xmii_params_entry *mii;
diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c
index 3887ed33c5fe..214a1dd670c2 100644
--- a/drivers/net/dsa/xrs700x/xrs700x.c
+++ b/drivers/net/dsa/xrs700x/xrs700x.c
@@ -443,7 +443,8 @@  static void xrs700x_teardown(struct dsa_switch *ds)
 }
 
 static void xrs700x_phylink_get_caps(struct dsa_switch *ds, int port,
-				     struct phylink_config *config)
+				     struct phylink_config *config,
+				     phy_interface_t *default_interface)
 {
 	switch (port) {
 	case 0:
diff --git a/include/net/dsa.h b/include/net/dsa.h
index b902b31bebce..7c6870d2c607 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -852,7 +852,8 @@  struct dsa_switch_ops {
 	 * PHYLINK integration
 	 */
 	void	(*phylink_get_caps)(struct dsa_switch *ds, int port,
-				    struct phylink_config *config);
+				    struct phylink_config *config,
+				    phy_interface_t *default_interface);
 	void	(*phylink_validate)(struct dsa_switch *ds, int port,
 				    unsigned long *supported,
 				    struct phylink_link_state *state);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 3738f2d40a0b..35b4e1f8dc05 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1524,13 +1524,9 @@  static const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
 int dsa_port_phylink_create(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
-	phy_interface_t mode;
+	phy_interface_t mode, def_mode;
 	int err;
 
-	err = of_get_phy_mode(dp->dn, &mode);
-	if (err)
-		mode = PHY_INTERFACE_MODE_NA;
-
 	/* Presence of phylink_mac_link_state or phylink_mac_an_restart is
 	 * an indicator of a legacy phylink driver.
 	 */
@@ -1538,8 +1534,23 @@  int dsa_port_phylink_create(struct dsa_port *dp)
 	    ds->ops->phylink_mac_an_restart)
 		dp->pl_config.legacy_pre_march2020 = true;
 
+	def_mode = PHY_INTERFACE_MODE_NA;
 	if (ds->ops->phylink_get_caps)
-		ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config);
+		ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config,
+					  &def_mode);
+
+	err = of_get_phy_mode(dp->dn, &mode);
+	if (err) {
+		/* We must not set the default mode for user ports as a PHY
+		 * overrides the NA mode in phylink. Setting it here would
+		 * prevent the interface mode being updated.
+		 */
+		if (dp->type == DSA_PORT_TYPE_CPU ||
+		    dp->type == DSA_PORT_TYPE_DSA)
+			mode = def_mode;
+		else
+			mode = PHY_INTERFACE_MODE_NA;
+	}
 
 	dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn),
 				mode, &dsa_port_phylink_mac_ops);