diff mbox series

[net-next,RFC,v4,4/5] net: Let the active time stamping layer be selectable.

Message ID 20230406173308.401924-5-kory.maincent@bootlin.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series net: Make MAC/PHY time stamping selectable | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 4245 this patch: 4248
netdev/cc_maintainers warning 9 maintainers not CCed: pabeni@redhat.com f.fainelli@gmail.com corbet@lwn.net linux-doc@vger.kernel.org hkallweit1@gmail.com gal@nvidia.com edumazet@google.com andrew@lunn.ch davem@davemloft.net
netdev/build_clang fail Errors and warnings before: 938 this patch: 942
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 4452 this patch: 4455
netdev/checkpatch fail CHECK: Comparison to NULL could be written "phy_timestamping_whitelist[i]" ERROR: that open brace { should be on the previous line WARNING: braces {} are not necessary for single statement blocks WARNING: break quoted strings at a space character WARNING: else is not generally useful after a break or return WARNING: line length of 82 exceeds 80 columns WARNING: quoted string split across lines
netdev/kdoc fail Errors and warnings before: 0 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

Kory Maincent April 6, 2023, 5:33 p.m. UTC
From: Kory Maincent <kory.maincent@bootlin.com>

Add the ETHTOOL_MSG_TS_SET ethtool netlink socket, and add checks in the
ioctl and time stamping paths to respect the currently selected time
stamping layer.

Add a preferred-timestamp devicetree binding to select the preferred
hardware timestamp layer between PHY and MAC. The choice of using
devicetree binding has been made as the PTP precision and quality depends
of external things, like adjustable clock, or the lack of a temperature
compensated crystal or specific features. Even if the preferred timestamp
is a configuration it is hardly related to the design oh the board.

Introduce a NETDEV_CHANGE_HWTSTAMP netdev notifier to let MAC or DSA know
when a hwtstamp change happen. This is useful to manage packet trap in that
case like done by the lan966x driver.

Change the API to select MAC default time stamping instead of the PHY.
Indeed the PHY is closer to the wire therefore theoretically it have less
delay than the MAC timestamping but the reality is different. Due to lower
time stamping clock frequency, latency in the MDIO bus and no PHC hardware
synchronization between different PHY, the PHY PTP is often less precise
than the MAC. The exception is for PHY designed specially for PTP case but
these board are not very widespread. For not breaking the compatibility I
introduce a whitelist to reference all current PHY that support
time stamping and let them keep the old API behavior.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
---

Notes:
    Changes in v2:
    - Move selected_timestamping_layer introduction in this patch.
    - Replace strmcmp by sysfs_streq.
    - Use the PHY timestamp only if available.
    
    Changes in v3:
    - Add a devicetree binding to select the preferred timestamp
    - Replace the way to select timestamp through ethtool instead of sysfs
    You can test it with the ethtool source on branch feature_ptp of:
    https://github.com/kmaincent/ethtool
    
    Changes in v4:
    - Change the API to select MAC default time stamping instead of the PHY.
    - Add a whitelist to no break the old timestamp PHY default preferences
      for current PHY.
    - Replace the ethtool ioctl by netlink.
    - Add a netdev notifier to allow network device to create trap on PTP
      packets. Not tested as it need to change the lan966x driver that
      implement packet traps. I will do after the hwtstamp management change
      to NDOs.

 Documentation/networking/ethtool-netlink.rst | 13 +++
 drivers/net/phy/phy_device.c                 | 85 ++++++++++++++++++++
 include/linux/netdevice.h                    | 12 +++
 include/uapi/linux/ethtool_netlink.h         |  2 +
 net/core/dev.c                               |  2 +-
 net/core/dev_ioctl.c                         | 56 ++++++++++++-
 net/core/timestamping.c                      |  6 ++
 net/ethtool/common.c                         | 15 +++-
 net/ethtool/netlink.c                        | 10 +++
 net/ethtool/netlink.h                        |  1 +
 net/ethtool/ts.c                             | 68 ++++++++++++++--
 11 files changed, 256 insertions(+), 14 deletions(-)

Comments

Jakub Kicinski April 7, 2023, 1:47 a.m. UTC | #1
On Thu,  6 Apr 2023 19:33:07 +0200 Köry Maincent wrote:
> +/**
> + * A whitelist for PHYs selected as default timesetamping.
> + * Its use is to keep compatibility with old PTP API which is selecting
> + * these PHYs as default timestamping.
> + * The new API is selecting the MAC as default timestamping.
> + */
> +const char * const phy_timestamping_whitelist[] = {

whitelist -> allowlist or some such, inclusive naming
Vladimir Oltean April 29, 2023, 5:58 p.m. UTC | #2
On Thu, Apr 06, 2023 at 07:33:07PM +0200, Köry Maincent wrote:
> +static void of_set_timestamp(struct net_device *netdev, struct phy_device *phydev)
> +{
> +	struct device_node *node = phydev->mdio.dev.of_node;
> +	const struct ethtool_ops *ops = netdev->ethtool_ops;
> +	const char *s;
> +	enum timestamping_layer ts_layer = 0;

netdev likes variable declaration lines sorted longest to shortest

> +	int i;
> +
> +	/* Backward compatibility to old API */
> +	for (i = 0; phy_timestamping_whitelist[i] != NULL; i++)
> +	{

The kernel coding style likes the opening { on the same line as the for

> +		if (!strcmp(phy_timestamping_whitelist[i],
> +			    phydev->drv->name)) {
> +			if (phy_has_hwtstamp(phydev))
> +				ts_layer = SOF_PHY_TIMESTAMPING;
> +			else if (ops->get_ts_info)
> +				ts_layer = SOF_MAC_TIMESTAMPING;
> +			goto out;
> +		}
> +	}
> +
> +	if (ops->get_ts_info)
> +		ts_layer = SOF_MAC_TIMESTAMPING;
> +	else if (phy_has_hwtstamp(phydev))
> +		ts_layer = SOF_PHY_TIMESTAMPING;
> +
> +	if (of_property_read_string(node, "preferred-timestamp", &s))
> +		goto out;
> +
> +	if (!s)
> +		goto out;
> +
> +	if (phy_has_hwtstamp(phydev) && !strcmp(s, "phy"))
> +		ts_layer = SOF_PHY_TIMESTAMPING;
> +
> +	if (ops->get_ts_info && !strcmp(s, "mac"))
> +		ts_layer = SOF_MAC_TIMESTAMPING;
> +
> +out:
> +	netdev->selected_timestamping_layer = ts_layer;
> +}
> +
>  /**
>   * phy_attach_direct - attach a network device to a given PHY device pointer
>   * @dev: network device to attach
> @@ -1481,6 +1560,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>  
>  	phydev->phy_link_change = phy_link_change;
>  	if (dev) {
> +		of_set_timestamp(dev, phydev);
> +
>  		phydev->attached_dev = dev;
>  		dev->phydev = phydev;
>  
> @@ -1811,6 +1892,10 @@ void phy_detach(struct phy_device *phydev)
>  
>  	phy_suspend(phydev);
>  	if (dev) {
> +		if (dev->ethtool_ops->get_ts_info)
> +			dev->selected_timestamping_layer = SOF_MAC_TIMESTAMPING;
> +		else
> +			dev->selected_timestamping_layer = 0;
>  		phydev->attached_dev->phydev = NULL;
>  		phydev->attached_dev = NULL;
>  	}
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index a740be3bb911..3dd6be012daf 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -48,6 +48,7 @@
>  #include <uapi/linux/if_bonding.h>
>  #include <uapi/linux/pkt_cls.h>
>  #include <uapi/linux/netdev.h>
> +#include <uapi/linux/net_tstamp.h>
>  #include <linux/hashtable.h>
>  #include <linux/rbtree.h>
>  #include <net/net_trackers.h>
> @@ -2019,6 +2020,9 @@ enum netdev_ml_priv_type {
>   *
>   *	@threaded:	napi threaded mode is enabled
>   *
> + *	@selected_timestamping_layer:	Tracks whether the MAC or the PHY
> + *					performs packet time stamping.
> + *
>   *	@net_notifier_list:	List of per-net netdev notifier block
>   *				that follow this device when it is moved
>   *				to another network namespace.
> @@ -2388,6 +2392,8 @@ struct net_device {
>  	unsigned		wol_enabled:1;
>  	unsigned		threaded:1;
>  
> +	enum timestamping_layer selected_timestamping_layer;
> +
>  	struct list_head	net_notifier_list;
>  
>  #if IS_ENABLED(CONFIG_MACSEC)
> @@ -2879,6 +2885,7 @@ enum netdev_cmd {
>  	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
>  	NETDEV_XDP_FEAT_CHANGE,
>  	NETDEV_PRE_CHANGE_HWTSTAMP,
> +	NETDEV_CHANGE_HWTSTAMP,

Don't create new netdev notifiers with no users. Also, NETDEV_PRE_CHANGE_HWTSTAMP
has disappered.

>  };
>  const char *netdev_cmd_to_name(enum netdev_cmd cmd);
>  
> @@ -2934,6 +2941,11 @@ struct netdev_notifier_hwtstamp_info {
>  	struct kernel_hwtstamp_config *config;
>  };
>  
> +struct netdev_notifier_hwtstamp_layer {
> +	struct netdev_notifier_info info; /* must be first */
> +	enum timestamping_layer ts_layer;
> +};
> +
>  enum netdev_offload_xstats_type {
>  	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
>  };
> diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
> index 447908393b91..4f03e7cde271 100644
> --- a/include/uapi/linux/ethtool_netlink.h
> +++ b/include/uapi/linux/ethtool_netlink.h
> @@ -41,6 +41,7 @@ enum {
>  	ETHTOOL_MSG_TSINFO_GET,
>  	ETHTOOL_MSG_TSLIST_GET,
>  	ETHTOOL_MSG_TS_GET,
> +	ETHTOOL_MSG_TS_SET,

The way in which the Linux kernel ensures a stable API towards user
space is that programs written against kernel headers from 5 years ago
still work with kernels from today. In your case, you would be breaking
that, because before this patch, ETHTOOL_MSG_CABLE_TEST_ACT was 26, and
after your patch it is 27. So old applications emitting a cable test
netlink message would be interpreted by new kernels as emitting a "set
timestamping layer" netlink message. Of course not only the cable test
is affected, every other netlink message until the end is now shifted by
1. This is why we put new enum values only at the very end, where it
actually says they should go:

	/* add new constants above here */

>  	ETHTOOL_MSG_CABLE_TEST_ACT,
>  	ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
>  	ETHTOOL_MSG_TUNNEL_INFO_GET,
> @@ -96,6 +97,7 @@ enum {
>  	ETHTOOL_MSG_TSINFO_GET_REPLY,
>  	ETHTOOL_MSG_TSLIST_GET_REPLY,
>  	ETHTOOL_MSG_TS_GET_REPLY,
> +	ETHTOOL_MSG_TS_NTF,

Similar issue.

>  	ETHTOOL_MSG_CABLE_TEST_NTF,
>  	ETHTOOL_MSG_CABLE_TEST_TDR_NTF,
>  	ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 695c7c4a816b..daea7221dd25 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -61,9 +55,65 @@ static int ts_fill_reply(struct sk_buff *skb,
>  			     const struct ethnl_reply_data *reply_base)
>  {
>  	struct ts_reply_data *data = TS_REPDATA(reply_base);
> +

gratuitous change

>  	return nla_put_u32(skb, ETHTOOL_A_TS_LAYER, data->ts);
>  }
>  
> +/* TS_SET */
> +const struct nla_policy ethnl_ts_set_policy[] = {
> +	[ETHTOOL_A_TS_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
> +	[ETHTOOL_A_TS_LAYER]	= { .type = NLA_U32 },
> +};

I wanted to explore this topic myself, but I can't seem to find the
time, so here's something a bit hand-wavey.

We should make a distinction between what the kernel exposes as UAPI
(our future selves need to work with that in a backwards-compatible way)
and the internal, unstable kernel implementation.

It would be good if, instead of the ETHTOOL_A_TS_LAYER netlink
attribute, the kernel could expose a more generic ETHTOOL_A_TS_PHC, from
which the ethtool core could deduce if the PHC number belongs to the MAC
or to the PHY. We could still keep something like netdev->selected_timestamping_layer
in code private to the kernel, but from the UAPI perspective, I agree
with Andrew that we should design something that is cleanly extensible
to N PHCs, not just to a vague "layer".
Kory Maincent May 2, 2023, 11:05 a.m. UTC | #3
On Sat, 29 Apr 2023 20:58:07 +0300
Vladimir Oltean <vladimir.oltean@nxp.com> wrote:

Thanks for the review and sorry for the coding style issues, I forgot to run the
checkpatch script.

> >  
> >  #if IS_ENABLED(CONFIG_MACSEC)
> > @@ -2879,6 +2885,7 @@ enum netdev_cmd {
> >  	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
> >  	NETDEV_XDP_FEAT_CHANGE,
> >  	NETDEV_PRE_CHANGE_HWTSTAMP,
> > +	NETDEV_CHANGE_HWTSTAMP,  
> 
> Don't create new netdev notifiers with no users. Also,
> NETDEV_PRE_CHANGE_HWTSTAMP has disappered.

Ok, right you move it on to dsa stub. What do you think of our case, should we
continue with netdev notifier? 

> > diff --git a/include/uapi/linux/ethtool_netlink.h
> > b/include/uapi/linux/ethtool_netlink.h index 447908393b91..4f03e7cde271
> > 100644 --- a/include/uapi/linux/ethtool_netlink.h
> > +++ b/include/uapi/linux/ethtool_netlink.h
> > @@ -41,6 +41,7 @@ enum {
> >  	ETHTOOL_MSG_TSINFO_GET,
> >  	ETHTOOL_MSG_TSLIST_GET,
> >  	ETHTOOL_MSG_TS_GET,
> > +	ETHTOOL_MSG_TS_SET,  
> 
> The way in which the Linux kernel ensures a stable API towards user
> space is that programs written against kernel headers from 5 years ago
> still work with kernels from today. In your case, you would be breaking
> that, because before this patch, ETHTOOL_MSG_CABLE_TEST_ACT was 26, and
> after your patch it is 27. So old applications emitting a cable test
> netlink message would be interpreted by new kernels as emitting a "set
> timestamping layer" netlink message. Of course not only the cable test
> is affected, every other netlink message until the end is now shifted by
> 1. This is why we put new enum values only at the very end, where it
> actually says they should go:

Sorry for that, this indeed didn't cross my head. I will fix it.

> >  
> > +/* TS_SET */
> > +const struct nla_policy ethnl_ts_set_policy[] = {
> > +	[ETHTOOL_A_TS_HEADER]	=
> > NLA_POLICY_NESTED(ethnl_header_policy),
> > +	[ETHTOOL_A_TS_LAYER]	= { .type = NLA_U32 },
> > +};  
> 
> I wanted to explore this topic myself, but I can't seem to find the
> time, so here's something a bit hand-wavey.
> 
> We should make a distinction between what the kernel exposes as UAPI
> (our future selves need to work with that in a backwards-compatible way)
> and the internal, unstable kernel implementation.
> 
> It would be good if, instead of the ETHTOOL_A_TS_LAYER netlink
> attribute, the kernel could expose a more generic ETHTOOL_A_TS_PHC, from
> which the ethtool core could deduce if the PHC number belongs to the MAC
> or to the PHY. We could still keep something like
> netdev->selected_timestamping_layer in code private to the kernel, but from
> the UAPI perspective, I agree with Andrew that we should design something
> that is cleanly extensible to N PHCs, not just to a vague "layer".

Just some thought:
Maybe we could create a PHC_ID 0xXY where X is the layer and Y the PHCs number
in this layer, but what will append if there is two MACs consecutively?
Also in case of several MACs or PHYs in serial
netdev->selected_timestamping_layer is inappropriate. 

Maybe using it like 0xABC where A is the MAC number, B the PHY number and C
the PHC number in the device.
For example if we select the second MAC and its third PHC, PHC_ID will be
0x203. Or if we select the second PHC of the PHY it will be 0x012.
Vladimir Oltean May 11, 2023, 1:48 p.m. UTC | #4
On Tue, May 02, 2023 at 01:05:25PM +0200, Köry Maincent wrote:
> On Sat, 29 Apr 2023 20:58:07 +0300
> Vladimir Oltean <vladimir.oltean@nxp.com> wrote:
> 
> Thanks for the review and sorry for the coding style issues, I forgot to run the
> checkpatch script.
> 
> > >  
> > >  #if IS_ENABLED(CONFIG_MACSEC)
> > > @@ -2879,6 +2885,7 @@ enum netdev_cmd {
> > >  	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
> > >  	NETDEV_XDP_FEAT_CHANGE,
> > >  	NETDEV_PRE_CHANGE_HWTSTAMP,
> > > +	NETDEV_CHANGE_HWTSTAMP,  
> > 
> > Don't create new netdev notifiers with no users. Also,
> > NETDEV_PRE_CHANGE_HWTSTAMP has disappered.
> 
> Ok, right you move it on to dsa stub. What do you think of our case, should we
> continue with netdev notifier? 

I don't know.

AFAIU, the plan forward with this patch set is that, if the active
timestamping layer is the PHY, phy_mii_ioctl() gets called and the MAC
driver does not get notified in any way of that. That is an issue
because if it's a switch, it will want to trap PTP even if it doesn't
timestamp it, and with this proposal it doesn't get a chance to do that.

What is your need for this? Do you have this scenario? If not, just drop
this part from the patch.

Jakub, you said "nope" to netdev notifiers, what would you suggest here
instead? ndo_change_ptp_traps()?

> Just some thought:
> Maybe we could create a PHC_ID 0xXY where X is the layer and Y the PHCs number
> in this layer, but what will append if there is two MACs consecutively?
> Also in case of several MACs or PHYs in serial
> netdev->selected_timestamping_layer is inappropriate. 
> 
> Maybe using it like 0xABC where A is the MAC number, B the PHY number and C
> the PHC number in the device.
> For example if we select the second MAC and its third PHC, PHC_ID will be
> 0x203. Or if we select the second PHC of the PHY it will be 0x012.

I think the kernel has (or can have) enough information to deduce the
timestamping layer from just the PHC number, everything else is redundant
information (also, what's "MAC number"? what is "PHY number"? how can
user space correlate them to anything?).

If you want to design an UAPI where individual PHCs present in the
timestamping list of a netdev can individually have timestamping turned
on or off (for a future where a socket can communicate hardware
timestamps from multiple PHCs to user space in an identifiable way),
then that is fine by me and maybe even a good idea. Just be aware that
the current kernel will have to deny more than a single active
timestamping PHC for now, because of the lack of hwtstamp identification
in the SO_TIMESTAMPING API.
Jakub Kicinski May 11, 2023, 3:36 p.m. UTC | #5
On Thu, 11 May 2023 16:48:07 +0300 Vladimir Oltean wrote:
> > Ok, right you move it on to dsa stub. What do you think of our case, should we
> > continue with netdev notifier?   
> 
> I don't know.
> 
> AFAIU, the plan forward with this patch set is that, if the active
> timestamping layer is the PHY, phy_mii_ioctl() gets called and the MAC
> driver does not get notified in any way of that. That is an issue
> because if it's a switch, it will want to trap PTP even if it doesn't
> timestamp it, and with this proposal it doesn't get a chance to do that.
> 
> What is your need for this? Do you have this scenario? If not, just drop
> this part from the patch.
> 
> Jakub, you said "nope" to netdev notifiers, what would you suggest here
> instead? ndo_change_ptp_traps()?

More importantly "monolithic" drivers have DMA/MAC/PHY all under 
the NDO so assuming that SOF_PHY_TIMESTAMPING implies a phylib PHY
is not going to work.

We need a more complex calling convention for the NDO.
Vladimir Oltean May 11, 2023, 3:56 p.m. UTC | #6
On Thu, May 11, 2023 at 08:36:20AM -0700, Jakub Kicinski wrote:
> On Thu, 11 May 2023 16:48:07 +0300 Vladimir Oltean wrote:
> > > Ok, right you move it on to dsa stub. What do you think of our case, should we
> > > continue with netdev notifier?   
> > 
> > I don't know.
> > 
> > AFAIU, the plan forward with this patch set is that, if the active
> > timestamping layer is the PHY, phy_mii_ioctl() gets called and the MAC
> > driver does not get notified in any way of that. That is an issue
> > because if it's a switch, it will want to trap PTP even if it doesn't
> > timestamp it, and with this proposal it doesn't get a chance to do that.
> > 
> > What is your need for this? Do you have this scenario? If not, just drop
> > this part from the patch.
> > 
> > Jakub, you said "nope" to netdev notifiers, what would you suggest here
> > instead? ndo_change_ptp_traps()?
> 
> More importantly "monolithic" drivers have DMA/MAC/PHY all under 
> the NDO so assuming that SOF_PHY_TIMESTAMPING implies a phylib PHY
> is not going to work.
> 
> We need a more complex calling convention for the NDO.

It's the first time I become aware of the issue of PHY timestamping in
monolithic drivers that don't use phylib, and it's actually a very good
point. I guess that input gives a clearer set of constraints for Köry to
design an API where the selected timestamping layer is maybe passed to
ndo_hwtstamp_set() and MAC drivers are obliged to look at it.

OTOH, a complaint about the current ndo_eth_ioctl() -> phy_mii_ioctl()
code path was that phylib PHY drivers need to have the explicit blessing
from the MAC driver in order to enable timestamping. This patch set
attempts to circumvent that, and you're basically saying that it shouldn't.
Jakub Kicinski May 11, 2023, 4:25 p.m. UTC | #7
On Thu, 11 May 2023 18:56:40 +0300 Vladimir Oltean wrote:
> > More importantly "monolithic" drivers have DMA/MAC/PHY all under 
> > the NDO so assuming that SOF_PHY_TIMESTAMPING implies a phylib PHY
> > is not going to work.
> > 
> > We need a more complex calling convention for the NDO.  
> 
> It's the first time I become aware of the issue of PHY timestamping in
> monolithic drivers that don't use phylib, and it's actually a very good
> point. I guess that input gives a clearer set of constraints for Köry to
> design an API where the selected timestamping layer is maybe passed to
> ndo_hwtstamp_set() and MAC drivers are obliged to look at it.
> 
> OTOH, a complaint about the current ndo_eth_ioctl() -> phy_mii_ioctl()
> code path was that phylib PHY drivers need to have the explicit blessing
> from the MAC driver in order to enable timestamping. This patch set
> attempts to circumvent that, and you're basically saying that it shouldn't.

Yes, we don't want to lose the simplification benefit for the common
cases. I think we should make the "please call me for PHY requests"
an opt in.

Annoyingly the "please call me for PHY/all requests" needs to be
separate from "this MAC driver supports PHY timestamps". Because in
your example the switch driver may not in fact implement PHY stamping,
it just wants to know about the configuration.

So we need a bit somewhere (in ops? in some other struct? in output 
of get_ts?) to let the driver declare that it wants to see all TS
requests. (I've been using bits in ops, IDK if people find that
repulsive or neat :))

Then if bit is not set or NDO returns -EOPNOTSUPP for PHY requests we
still try to call the PHY in the core?

Separately the MAC driver needs to be able to report what stamping 
it supports (DMA, MAC, PHY, as mentioned in reply to patch 2).
Vladimir Oltean May 11, 2023, 8:54 p.m. UTC | #8
On Thu, May 11, 2023 at 09:25:39AM -0700, Jakub Kicinski wrote:
> On Thu, 11 May 2023 18:56:40 +0300 Vladimir Oltean wrote:
> > > More importantly "monolithic" drivers have DMA/MAC/PHY all under 
> > > the NDO so assuming that SOF_PHY_TIMESTAMPING implies a phylib PHY
> > > is not going to work.
> > > 
> > > We need a more complex calling convention for the NDO.  
> > 
> > It's the first time I become aware of the issue of PHY timestamping in
> > monolithic drivers that don't use phylib, and it's actually a very good
> > point. I guess that input gives a clearer set of constraints for Köry to
> > design an API where the selected timestamping layer is maybe passed to
> > ndo_hwtstamp_set() and MAC drivers are obliged to look at it.
> > 
> > OTOH, a complaint about the current ndo_eth_ioctl() -> phy_mii_ioctl()
> > code path was that phylib PHY drivers need to have the explicit blessing
> > from the MAC driver in order to enable timestamping. This patch set
> > attempts to circumvent that, and you're basically saying that it shouldn't.
> 
> Yes, we don't want to lose the simplification benefit for the common
> cases.

While I'm all for simplification in general, let's not take that for
granted and see what it implies, first.

If the new default behavior for the common case is going to be to bypass
the MAC's ndo_hwtstamp_set(), then MAC drivers which didn't explicitly
allow phylib PHY timestamping will now do.

Let's group them into:

(A) MAC drivers where that is perfectly fine

(B) MAC drivers with forwarding offload, which would forward/flood PTP
    frames carrying PHY timestamps

(C) "solipsistic" MAC drivers which assume that any skb with
    SKBTX_HW_TSTAMP is a skb for *me* to timestamp

Going for the simplification would mean making sure that cases (B)
and (C) are well handled, and have a reasonable chance of not getting
reintroduced in the future.

For case (B) it would mean patching all existing switch drivers which
don't allow PHY timestamping to still not allow PHY timestamping, and
fixing those switch drivers which allow PHY timestamping but don't set
up traps (probably those weren't tested in a bridging configuration).

For case (C) it would mean scanning all MAC drivers for bugs akin to the
one fixed in commit c26a2c2ddc01 ("gianfar: Fix TX timestamping with a
stacked DSA driver"). I did that once, but it was years ago and I can't
guarantee what is the current state or that I didn't miss anything.
For example, I missed the minor fact that igc would count skbs
timestamped by a non-igc entity in its TX path as 'tx_hwtstamp_skipped'
packets, visible in ethtool -S:
https://lore.kernel.org/intel-wired-lan/20230504235233.1850428-2-vinicius.gomes@intel.com/

It has to be said that nowadays, Documentation/networking/timestamping.rst
does warn about stacked PHCs, and those who read will find it. Also,
ptp4l nowadays warns if there are multiple TX timestamps received for
the same skb, and that's a major user of this functionality. So I don't
mean to point this case out as a form of discouragement, but it is going
to be a bit of a roll of dice.


The alternative (ditching the simplification) is that someone still
has to code up the glue logic from ndo_hwtstamp_set() -> phy_mii_ioctl(),
and that presumes some minimal level of testing, which we are now
"simplifying" away.

The counter-argument against ditching the simplification is that DSA
is also vulnerable to the bugs from case (C), but as opposed to PHY
timestamping where currently MACs need to voluntarily pass the
phy_mii_ioctl() call themselves, MACs don't get to choose if they want
to act as DSA masters or not. That gives some more confidence that bugs
in this area might not be so common, and leaves just (B) a concern.

To analyze how common is the common case is a simple matter of counting
how many drivers among those with SIOCSHWTSTAMP implementations also
have some form of forwarding offload, OR, as you point out, how many
don't use phylib.

> I think we should make the "please call me for PHY requests" an opt in.
> 
> Annoyingly the "please call me for PHY/all requests" needs to be
> separate from "this MAC driver supports PHY timestamps". Because in
> your example the switch driver may not in fact implement PHY stamping,
> it just wants to know about the configuration.
> 
> So we need a bit somewhere (in ops? in some other struct? in output 
> of get_ts?) to let the driver declare that it wants to see all TS
> requests. (I've been using bits in ops, IDK if people find that
> repulsive or neat :))

It's either repulsive or neat, depending on the context.

Last time you suggested something like this in an ops structure was IIRC
something like whether "MAC Merge is supported". My objection was that
DSA has a common set of ops structures (dsa_slave_ethtool_ops,
dsa_slave_netdev_ops) behind which lie different switch families from
at least 13 vendors. A shared const ops structure is not an appropriate
means to communicate whether 13 switch vendors support a TSN MAC Merge
layer or not.

With declaring interest in all hardware timestamping requests in the
data path of a MAC, be they MAC-level requests or otherwise, it's a bit
different, because all DSA switches have one thing in common, which is
that they're switches, and that is relevant here. So I'm not opposed to
setting a bit in the ethtool ops structure, at least for DSA that could
work just fine.

> Then if bit is not set or NDO returns -EOPNOTSUPP for PHY requests we
> still try to call the PHY in the core?

Well, if there is no interest for special behavior from the MAC driver,
then I guess the memo is "yolo"...

But OTOH, if a macro-driver like DSA declares its interest in receiving
all timestamping requests, but then that particular DSA switch returns
-EOPNOTSUPP in the ndo_hwtstamp_set() handler, it would be silly for the
core to still "force the entry" and call phy_mii_ioctl() anyway - because
we know that's going to be broken.

So with "NDO returns -EOPNOTSUPP", I hope you don't mean *that* NDO
(ndo_hwtstamp_set()) but a previously unnamed one that serves the same
purpose as the capability bit - ndo_hey_are_you_interested_in_this_hwtstamp_request().
In that case, yes - with -EOPNOTSUPP we're back to "yolo".

> Separately the MAC driver needs to be able to report what stamping 
> it supports (DMA, MAC, PHY, as mentioned in reply to patch 2).

I'm a bit unclear on that - responded there.
Russell King (Oracle) May 11, 2023, 9:06 p.m. UTC | #9
On Thu, May 11, 2023 at 09:25:39AM -0700, Jakub Kicinski wrote:
> Yes, we don't want to lose the simplification benefit for the common
> cases. I think we should make the "please call me for PHY requests"
> an opt in.
> 
> Annoyingly the "please call me for PHY/all requests" needs to be
> separate from "this MAC driver supports PHY timestamps". Because in
> your example the switch driver may not in fact implement PHY stamping,
> it just wants to know about the configuration.
> 
> So we need a bit somewhere (in ops? in some other struct? in output 
> of get_ts?) to let the driver declare that it wants to see all TS
> requests. (I've been using bits in ops, IDK if people find that
> repulsive or neat :))

I haven't thought this through fully, but just putting this out there
as a potential suggestion...

Would it help at all if we distilled the entire timestamping interface
into a separate set of ops which are registered independently from the
NDO, and NDO has a call to get the ops for the layer being configured?

That would allow a netdev driver to return the ops appropriate for the
MAC layer or its own PHY layer, or maybe phylib.

In the case of phylib, as there is a raft of drivers that only bind to
their phylib PHY in the NDO open method, we'd need to figure out how
to get the ops for the current mode at that time.

There's probably lots I've missed though...
Jakub Kicinski May 11, 2023, 10:54 p.m. UTC | #10
On Thu, 11 May 2023 22:06:36 +0100 Russell King (Oracle) wrote:
> I haven't thought this through fully, but just putting this out there
> as a potential suggestion...
> 
> Would it help at all if we distilled the entire timestamping interface
> into a separate set of ops which are registered independently from the
> NDO, and NDO has a call to get the ops for the layer being configured?
> 
> That would allow a netdev driver to return the ops appropriate for the
> MAC layer or its own PHY layer, or maybe phylib.
> 
> In the case of phylib, as there is a raft of drivers that only bind to
> their phylib PHY in the NDO open method, we'd need to figure out how
> to get the ops for the current mode at that time.
> 
> There's probably lots I've missed though...

Sounds reasonable, only question is what to pass as the object (first
argument) of these ops. Some new struct?
Jakub Kicinski May 11, 2023, 11:08 p.m. UTC | #11
On Thu, 11 May 2023 23:54:35 +0300 Vladimir Oltean wrote:
> On Thu, May 11, 2023 at 09:25:39AM -0700, Jakub Kicinski wrote:
> > > It's the first time I become aware of the issue of PHY timestamping in
> > > monolithic drivers that don't use phylib, and it's actually a very good
> > > point. I guess that input gives a clearer set of constraints for Köry to
> > > design an API where the selected timestamping layer is maybe passed to
> > > ndo_hwtstamp_set() and MAC drivers are obliged to look at it.
> > > 
> > > OTOH, a complaint about the current ndo_eth_ioctl() -> phy_mii_ioctl()
> > > code path was that phylib PHY drivers need to have the explicit blessing
> > > from the MAC driver in order to enable timestamping. This patch set
> > > attempts to circumvent that, and you're basically saying that it shouldn't.  
> > 
> > Yes, we don't want to lose the simplification benefit for the common
> > cases.  
> 
> While I'm all for simplification in general, let's not take that for
> granted and see what it implies, first.
> 
> If the new default behavior for the common case is going to be to bypass
> the MAC's ndo_hwtstamp_set(), then MAC drivers which didn't explicitly
> allow phylib PHY timestamping will now do.
> 
> Let's group them into:
> 
> (A) MAC drivers where that is perfectly fine
> 
> (B) MAC drivers with forwarding offload, which would forward/flood PTP
>     frames carrying PHY timestamps
> 
> (C) "solipsistic" MAC drivers which assume that any skb with
>     SKBTX_HW_TSTAMP is a skb for *me* to timestamp
> 
> Going for the simplification would mean making sure that cases (B)
> and (C) are well handled, and have a reasonable chance of not getting
> reintroduced in the future.
> 
> For case (B) it would mean patching all existing switch drivers which
> don't allow PHY timestamping to still not allow PHY timestamping, and
> fixing those switch drivers which allow PHY timestamping but don't set
> up traps (probably those weren't tested in a bridging configuration).
> 
> For case (C) it would mean scanning all MAC drivers for bugs akin to the
> one fixed in commit c26a2c2ddc01 ("gianfar: Fix TX timestamping with a
> stacked DSA driver"). I did that once, but it was years ago and I can't
> guarantee what is the current state or that I didn't miss anything.
> For example, I missed the minor fact that igc would count skbs
> timestamped by a non-igc entity in its TX path as 'tx_hwtstamp_skipped'
> packets, visible in ethtool -S:
> https://lore.kernel.org/intel-wired-lan/20230504235233.1850428-2-vinicius.gomes@intel.com/
> 
> It has to be said that nowadays, Documentation/networking/timestamping.rst
> does warn about stacked PHCs, and those who read will find it. Also,
> ptp4l nowadays warns if there are multiple TX timestamps received for
> the same skb, and that's a major user of this functionality. So I don't
> mean to point this case out as a form of discouragement, but it is going
> to be a bit of a roll of dice.

I think it's worth calling out that we may be touching most / all
drivers anyway to transition them from the IOCTL NDO to a normal
timestamp NDO.

> The alternative (ditching the simplification) is that someone still
> has to code up the glue logic from ndo_hwtstamp_set() -> phy_mii_ioctl(),
> and that presumes some minimal level of testing, which we are now
> "simplifying" away.
> 
> The counter-argument against ditching the simplification is that DSA
> is also vulnerable to the bugs from case (C), but as opposed to PHY
> timestamping where currently MACs need to voluntarily pass the
> phy_mii_ioctl() call themselves, MACs don't get to choose if they want
> to act as DSA masters or not. That gives some more confidence that bugs
> in this area might not be so common, and leaves just (B) a concern.
> 
> To analyze how common is the common case is a simple matter of counting
> how many drivers among those with SIOCSHWTSTAMP implementations also
> have some form of forwarding offload, OR, as you point out, how many
> don't use phylib.

"Common" is one way of looking at it, another is trying to get the
default ("I didn't know I have to set extra flags") to do the right
thing or fail.

> > I think we should make the "please call me for PHY requests" an opt in.
> > 
> > Annoyingly the "please call me for PHY/all requests" needs to be
> > separate from "this MAC driver supports PHY timestamps". Because in
> > your example the switch driver may not in fact implement PHY stamping,
> > it just wants to know about the configuration.
> > 
> > So we need a bit somewhere (in ops? in some other struct? in output 
> > of get_ts?) to let the driver declare that it wants to see all TS
> > requests. (I've been using bits in ops, IDK if people find that
> > repulsive or neat :))  
> 
> It's either repulsive or neat, depending on the context.
> 
> Last time you suggested something like this in an ops structure was IIRC
> something like whether "MAC Merge is supported". My objection was that
> DSA has a common set of ops structures (dsa_slave_ethtool_ops,
> dsa_slave_netdev_ops) behind which lie different switch families from
> at least 13 vendors. A shared const ops structure is not an appropriate
> means to communicate whether 13 switch vendors support a TSN MAC Merge
> layer or not.
> 
> With declaring interest in all hardware timestamping requests in the
> data path of a MAC, be they MAC-level requests or otherwise, it's a bit
> different, because all DSA switches have one thing in common, which is
> that they're switches, and that is relevant here. So I'm not opposed to
> setting a bit in the ethtool ops structure, at least for DSA that could
> work just fine.
> 
> > Then if bit is not set or NDO returns -EOPNOTSUPP for PHY requests we
> > still try to call the PHY in the core?  
> 
> Well, if there is no interest for special behavior from the MAC driver,
> then I guess the memo is "yolo"...
> 
> But OTOH, if a macro-driver like DSA declares its interest in receiving
> all timestamping requests, but then that particular DSA switch returns
> -EOPNOTSUPP in the ndo_hwtstamp_set() handler, it would be silly for the
> core to still "force the entry" and call phy_mii_ioctl() anyway - because
> we know that's going to be broken.
> 
> So with "NDO returns -EOPNOTSUPP", I hope you don't mean *that* NDO
> (ndo_hwtstamp_set()) but a previously unnamed one that serves the same
> purpose as the capability bit - ndo_hey_are_you_interested_in_this_hwtstamp_request().
> In that case, yes - with -EOPNOTSUPP we're back to "yolo".

Why can't we treat ndo_hwtstamp_set() == -EOPNOTSUPP as a signal 
to call the PHY? ndo_hwtstamp_set() does not exist, we can give
it whatever semantics we want.

> > Separately the MAC driver needs to be able to report what stamping 
> > it supports (DMA, MAC, PHY, as mentioned in reply to patch 2).  
> 
> I'm a bit unclear on that - responded there.
Vladimir Oltean May 11, 2023, 11:18 p.m. UTC | #12
On Thu, May 11, 2023 at 04:08:45PM -0700, Jakub Kicinski wrote:
> I think it's worth calling out that we may be touching most / all
> drivers anyway to transition them from the IOCTL NDO to a normal
> timestamp NDO.

Right, and figuring out how to deal with PHY timestamping in the new API
is "step 1.5" between creating that new API (Maxim's patch set) and
making the timestamping layer selectable (Köry's patch set).

> > But OTOH, if a macro-driver like DSA declares its interest in receiving
> > all timestamping requests, but then that particular DSA switch returns
> > -EOPNOTSUPP in the ndo_hwtstamp_set() handler, it would be silly for the
> > core to still "force the entry" and call phy_mii_ioctl() anyway - because
> > we know that's going to be broken.
> > 
> > So with "NDO returns -EOPNOTSUPP", I hope you don't mean *that* NDO
> > (ndo_hwtstamp_set()) but a previously unnamed one that serves the same
> > purpose as the capability bit - ndo_hey_are_you_interested_in_this_hwtstamp_request().
> > In that case, yes - with -EOPNOTSUPP we're back to "yolo".
> 
> Why can't we treat ndo_hwtstamp_set() == -EOPNOTSUPP as a signal 
> to call the PHY? ndo_hwtstamp_set() does not exist, we can give
> it whatever semantics we want.

Hmm, because if we do that, bridged DSA switch ports without hardware
timestamping support and without logic to trap PTP to the CPU will just
spew those PTP frames with PHY hardware timestamps everywhere, instead
of just telling the user hey, the configuration isn't supported?
Jakub Kicinski May 11, 2023, 11:35 p.m. UTC | #13
On Fri, 12 May 2023 02:18:03 +0300 Vladimir Oltean wrote:
> > Why can't we treat ndo_hwtstamp_set() == -EOPNOTSUPP as a signal 
> > to call the PHY? ndo_hwtstamp_set() does not exist, we can give
> > it whatever semantics we want.  
> 
> Hmm, because if we do that, bridged DSA switch ports without hardware
> timestamping support and without logic to trap PTP to the CPU will just
> spew those PTP frames with PHY hardware timestamps everywhere, instead
> of just telling the user hey, the configuration isn't supported?

I see, so there is a legit reason to abort. 

We could use one of the high error codes, then, to signal 
the "I didn't care, please carry on to the PHY" condition?
-ENOTSUPP?

I guess we can add a separate "please configure traps for PTP/NTP" 
NDO, if you prefer. Mostly an implementation detail.
Kory Maincent May 15, 2023, 9:04 a.m. UTC | #14
On Thu, 11 May 2023 16:35:47 -0700
Jakub Kicinski <kuba@kernel.org> wrote:

> On Fri, 12 May 2023 02:18:03 +0300 Vladimir Oltean wrote:
> > > Why can't we treat ndo_hwtstamp_set() == -EOPNOTSUPP as a signal 
> > > to call the PHY? ndo_hwtstamp_set() does not exist, we can give
> > > it whatever semantics we want.    
> > 
> > Hmm, because if we do that, bridged DSA switch ports without hardware
> > timestamping support and without logic to trap PTP to the CPU will just
> > spew those PTP frames with PHY hardware timestamps everywhere, instead
> > of just telling the user hey, the configuration isn't supported?  
> 
> I see, so there is a legit reason to abort. 
> 
> We could use one of the high error codes, then, to signal 
> the "I didn't care, please carry on to the PHY" condition?
> -ENOTSUPP?
> 
> I guess we can add a separate "please configure traps for PTP/NTP" 
> NDO, if you prefer. Mostly an implementation detail.

I am not as expert as you on the network stack therefore I am trying to follow
and understand all the remarks. Please correct me if I say something wrong. It
is interesting to understand all the complications that these changes bring.

To summary, what do you think is preferable for this patch series?
- New ops for TS as suggested by Russell.

- Continue on this implementation and check that Vladimir A,B and C cases are
  handled. Which imply, if I understand well, find a good way to deal with PTP
  change trap (bit or new ndo ops), convert most drivers from IOCTL to NDO
  beforehand. 

- Add MAC-DMA TS? It think it is needed as MAC-DMA TS seems already used and
  different from simple MAC TS in term of quality, as described by Jakub.
Jakub Kicinski May 16, 2023, 7:28 p.m. UTC | #15
On Mon, 15 May 2023 11:04:32 +0200 Köry Maincent wrote:
> > I see, so there is a legit reason to abort. 
> > 
> > We could use one of the high error codes, then, to signal 
> > the "I didn't care, please carry on to the PHY" condition?
> > -ENOTSUPP?
> > 
> > I guess we can add a separate "please configure traps for PTP/NTP" 
> > NDO, if you prefer. Mostly an implementation detail.  
> 
> I am not as expert as you on the network stack therefore I am trying to follow
> and understand all the remarks. Please correct me if I say something wrong. It
> is interesting to understand all the complications that these changes bring.
> 
> To summary, what do you think is preferable for this patch series?
> - New ops for TS as suggested by Russell.
> 
> - Continue on this implementation and check that Vladimir A,B and C cases are
>   handled. Which imply, if I understand well, find a good way to deal with PTP
>   change trap (bit or new ndo ops), convert most drivers from IOCTL to NDO
>   beforehand. 
> 
> - Add MAC-DMA TS? It think it is needed as MAC-DMA TS seems already used and
>   different from simple MAC TS in term of quality, as described by Jakub.

These aren't really disjoint alternatives, we need MAC-DMA TS and we
need a way to direct all TS requests to the MAC driver. Whether we do
it via an NDO, flags or new ops is kind of up to you. Question of
aesthetics.

Perhaps to move forward it'd be good to rev the patches, and address
the feedback which is clear?
Jacob Keller May 17, 2023, 10:19 p.m. UTC | #16
On 5/11/2023 9:25 AM, Jakub Kicinski wrote:
> So we need a bit somewhere (in ops? in some other struct? in output 
> of get_ts?) to let the driver declare that it wants to see all TS
> requests. (I've been using bits in ops, IDK if people find that
> repulsive or neat :))

+1 to using bits in ops for opt-in functionality. I like it.
diff mbox series

Patch

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 539425fdaf7c..12996f38b956 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -2028,6 +2028,18 @@  Kernel response contents:
   ``ETHTOOL_A_TS_LAYER``   u32     available hardware timestamping
   =======================  ======  ===================================
 
+TS_SET
+======
+
+Gets transceiver module parameters.
+
+Request contents:
+
+  =============================  ======  ==============================
+  ``ETHTOOL_A_TS_HEADER``        nested  request header
+  ``ETHTOOL_A_TS_LAYER``         u32     set hardware timestamping
+  =============================  ======  ==============================
+
 Request translation
 ===================
 
@@ -2136,4 +2148,5 @@  are netlink only.
   n/a                                 ``ETHTOOL_MSG_MM_SET``
   n/a                                 ``ETHTOOL_MSG_TS_GET``
   n/a                                 ``ETHTOOL_MSG_TSLIST_GET``
+  n/a                                 ``ETHTOOL_MSG_TS_SET``
   =================================== =====================================
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 917ba84105fc..d415c62938cd 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -34,6 +34,8 @@ 
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/unistd.h>
+#include <linux/of.h>
+#include <uapi/linux/net_tstamp.h>
 
 MODULE_DESCRIPTION("PHY library");
 MODULE_AUTHOR("Andy Fleming");
@@ -1408,6 +1410,83 @@  int phy_sfp_probe(struct phy_device *phydev,
 }
 EXPORT_SYMBOL(phy_sfp_probe);
 
+/**
+ * A whitelist for PHYs selected as default timesetamping.
+ * Its use is to keep compatibility with old PTP API which is selecting
+ * these PHYs as default timestamping.
+ * The new API is selecting the MAC as default timestamping.
+ */
+const char * const phy_timestamping_whitelist[] = {
+	"Broadcom BCM5411",
+	"Broadcom BCM5421",
+	"Broadcom BCM54210E",
+	"Broadcom BCM5461",
+	"Broadcom BCM54612E",
+	"Broadcom BCM5464",
+	"Broadcom BCM5481",
+	"Broadcom BCM54810",
+	"Broadcom BCM54811",
+	"Broadcom BCM5482",
+	"Broadcom BCM50610",
+	"Broadcom BCM50610M",
+	"Broadcom BCM57780",
+	"Broadcom BCM5395",
+	"Broadcom BCM53125",
+	"Broadcom BCM53128",
+	"Broadcom BCM89610",
+	"NatSemi DP83640",
+	"Microchip LAN8841 Gigabit PHY"
+	"Microchip INDY Gigabit Quad PHY",
+	"Microsemi GE VSC856X SyncE",
+	"Microsemi GE VSC8575 SyncE",
+	"Microsemi GE VSC8582 SyncE",
+	"Microsemi GE VSC8584 SyncE",
+	"NXP C45 TJA1103",
+	NULL,
+};
+
+static void of_set_timestamp(struct net_device *netdev, struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	const struct ethtool_ops *ops = netdev->ethtool_ops;
+	const char *s;
+	enum timestamping_layer ts_layer = 0;
+	int i;
+
+	/* Backward compatibility to old API */
+	for (i = 0; phy_timestamping_whitelist[i] != NULL; i++)
+	{
+		if (!strcmp(phy_timestamping_whitelist[i],
+			    phydev->drv->name)) {
+			if (phy_has_hwtstamp(phydev))
+				ts_layer = SOF_PHY_TIMESTAMPING;
+			else if (ops->get_ts_info)
+				ts_layer = SOF_MAC_TIMESTAMPING;
+			goto out;
+		}
+	}
+
+	if (ops->get_ts_info)
+		ts_layer = SOF_MAC_TIMESTAMPING;
+	else if (phy_has_hwtstamp(phydev))
+		ts_layer = SOF_PHY_TIMESTAMPING;
+
+	if (of_property_read_string(node, "preferred-timestamp", &s))
+		goto out;
+
+	if (!s)
+		goto out;
+
+	if (phy_has_hwtstamp(phydev) && !strcmp(s, "phy"))
+		ts_layer = SOF_PHY_TIMESTAMPING;
+
+	if (ops->get_ts_info && !strcmp(s, "mac"))
+		ts_layer = SOF_MAC_TIMESTAMPING;
+
+out:
+	netdev->selected_timestamping_layer = ts_layer;
+}
+
 /**
  * phy_attach_direct - attach a network device to a given PHY device pointer
  * @dev: network device to attach
@@ -1481,6 +1560,8 @@  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 	phydev->phy_link_change = phy_link_change;
 	if (dev) {
+		of_set_timestamp(dev, phydev);
+
 		phydev->attached_dev = dev;
 		dev->phydev = phydev;
 
@@ -1811,6 +1892,10 @@  void phy_detach(struct phy_device *phydev)
 
 	phy_suspend(phydev);
 	if (dev) {
+		if (dev->ethtool_ops->get_ts_info)
+			dev->selected_timestamping_layer = SOF_MAC_TIMESTAMPING;
+		else
+			dev->selected_timestamping_layer = 0;
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
 	}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a740be3bb911..3dd6be012daf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -48,6 +48,7 @@ 
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <uapi/linux/netdev.h>
+#include <uapi/linux/net_tstamp.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2019,6 +2020,9 @@  enum netdev_ml_priv_type {
  *
  *	@threaded:	napi threaded mode is enabled
  *
+ *	@selected_timestamping_layer:	Tracks whether the MAC or the PHY
+ *					performs packet time stamping.
+ *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
  *				to another network namespace.
@@ -2388,6 +2392,8 @@  struct net_device {
 	unsigned		wol_enabled:1;
 	unsigned		threaded:1;
 
+	enum timestamping_layer selected_timestamping_layer;
+
 	struct list_head	net_notifier_list;
 
 #if IS_ENABLED(CONFIG_MACSEC)
@@ -2879,6 +2885,7 @@  enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 	NETDEV_XDP_FEAT_CHANGE,
 	NETDEV_PRE_CHANGE_HWTSTAMP,
+	NETDEV_CHANGE_HWTSTAMP,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
@@ -2934,6 +2941,11 @@  struct netdev_notifier_hwtstamp_info {
 	struct kernel_hwtstamp_config *config;
 };
 
+struct netdev_notifier_hwtstamp_layer {
+	struct netdev_notifier_info info; /* must be first */
+	enum timestamping_layer ts_layer;
+};
+
 enum netdev_offload_xstats_type {
 	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
 };
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 447908393b91..4f03e7cde271 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -41,6 +41,7 @@  enum {
 	ETHTOOL_MSG_TSINFO_GET,
 	ETHTOOL_MSG_TSLIST_GET,
 	ETHTOOL_MSG_TS_GET,
+	ETHTOOL_MSG_TS_SET,
 	ETHTOOL_MSG_CABLE_TEST_ACT,
 	ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
 	ETHTOOL_MSG_TUNNEL_INFO_GET,
@@ -96,6 +97,7 @@  enum {
 	ETHTOOL_MSG_TSINFO_GET_REPLY,
 	ETHTOOL_MSG_TSLIST_GET_REPLY,
 	ETHTOOL_MSG_TS_GET_REPLY,
+	ETHTOOL_MSG_TS_NTF,
 	ETHTOOL_MSG_CABLE_TEST_NTF,
 	ETHTOOL_MSG_CABLE_TEST_TDR_NTF,
 	ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ce5985be84b..481f03ef2283 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1612,7 +1612,7 @@  const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
-	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
+	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP) N(CHANGE_HWTSTAMP)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 6d772837eb3f..210ff1fd0574 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -7,6 +7,7 @@ 
 #include <linux/net_tstamp.h>
 #include <linux/wireless.h>
 #include <linux/if_bridge.h>
+#include <linux/phy.h>
 #include <net/dsa.h>
 #include <net/wext.h>
 
@@ -252,9 +253,60 @@  static int dev_eth_ioctl(struct net_device *dev,
 	return ops->ndo_eth_ioctl(dev, ifr, cmd);
 }
 
+static int __dev_eth_ioctl(struct net_device *dev,
+			   struct ifreq *ifr, unsigned int cmd)
+{
+	struct netdev_notifier_hwtstamp_layer hwtstamp_layer = {
+		.info.dev = dev,
+		.ts_layer = dev->selected_timestamping_layer,
+	};
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct netlink_ext_ack extack = {};
+	int err;
+
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	switch (dev->selected_timestamping_layer) {
+	case SOF_MAC_TIMESTAMPING:
+		if (ops->ndo_do_ioctl == phy_do_ioctl) {
+			/* Some drivers set .ndo_do_ioctl to phy_do_ioctl. */
+			return -EOPNOTSUPP;
+		} else {
+			err = dev_eth_ioctl(dev, ifr, cmd);
+			if (err)
+				goto out;
+		}
+		break;
+
+	case SOF_PHY_TIMESTAMPING:
+		if (phy_has_hwtstamp(dev->phydev)) {
+			err = phy_mii_ioctl(dev->phydev, ifr, cmd);
+			if (err)
+				goto out;
+		} else {
+			return -ENODEV;
+		}
+		break;
+	}
+
+	hwtstamp_layer.info.extack = &extack;
+
+	err = call_netdevice_notifiers_info(NETDEV_CHANGE_HWTSTAMP,
+					    &hwtstamp_layer.info);
+	err = notifier_to_errno(err);
+	if (err) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
+	}
+
+out:
+	return err;
+}
+
 static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
-	return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP);
+	return __dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP);
 }
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
@@ -288,7 +340,7 @@  static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 		return err;
 	}
 
-	return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
+	return __dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
 }
 
 static int dev_siocbond(struct net_device *dev,
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 04840697fe79..8bd7b2c21ab6 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -28,6 +28,9 @@  void skb_clone_tx_timestamp(struct sk_buff *skb)
 	if (!skb->sk)
 		return;
 
+	if (skb->dev->selected_timestamping_layer != SOF_PHY_TIMESTAMPING)
+		return;
+
 	type = classify(skb);
 	if (type == PTP_CLASS_NONE)
 		return;
@@ -50,6 +53,9 @@  bool skb_defer_rx_timestamp(struct sk_buff *skb)
 	if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
 		return false;
 
+	if (skb->dev->selected_timestamping_layer != SOF_PHY_TIMESTAMPING)
+		return false;
+
 	if (skb_headroom(skb) < ETH_HLEN)
 		return false;
 
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 695c7c4a816b..daea7221dd25 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -637,10 +637,17 @@  int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 	memset(info, 0, sizeof(*info));
 	info->cmd = ETHTOOL_GET_TS_INFO;
 
-	if (phy_has_tsinfo(phydev))
-		return phy_ts_info(phydev, info);
-	if (ops->get_ts_info)
-		return ops->get_ts_info(dev, info);
+	switch (dev->selected_timestamping_layer) {
+	case SOF_MAC_TIMESTAMPING:
+		if (ops->get_ts_info)
+			return ops->get_ts_info(dev, info);
+		break;
+
+	case SOF_PHY_TIMESTAMPING:
+		if (phy_has_tsinfo(phydev))
+			return phy_ts_info(phydev, info);
+		return -ENODEV;
+	}
 
 	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
 				SOF_TIMESTAMPING_SOFTWARE;
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 8d9e27b13e28..b753b7da51f1 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -294,6 +294,7 @@  ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_FEC_SET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 	[ETHTOOL_MSG_TS_GET]		= &ethnl_ts_request_ops,
+	[ETHTOOL_MSG_TS_SET]		= &ethnl_ts_request_ops,
 	[ETHTOOL_MSG_TSLIST_GET]	= &ethnl_tslist_request_ops,
 	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
 	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
@@ -671,6 +672,7 @@  ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
 	[ETHTOOL_MSG_MODULE_NTF]	= &ethnl_module_request_ops,
 	[ETHTOOL_MSG_PLCA_NTF]		= &ethnl_plca_cfg_request_ops,
 	[ETHTOOL_MSG_MM_NTF]		= &ethnl_mm_request_ops,
+	[ETHTOOL_MSG_TS_NTF]		= &ethnl_ts_request_ops,
 };
 
 /* default notification handler */
@@ -766,6 +768,7 @@  static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 	[ETHTOOL_MSG_MODULE_NTF]	= ethnl_default_notify,
 	[ETHTOOL_MSG_PLCA_NTF]		= ethnl_default_notify,
 	[ETHTOOL_MSG_MM_NTF]		= ethnl_default_notify,
+	[ETHTOOL_MSG_TS_NTF]		= ethnl_default_notify,
 };
 
 void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
@@ -1031,6 +1034,13 @@  static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_ts_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_ts_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_TS_SET,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_default_set_doit,
+		.policy = ethnl_ts_set_policy,
+		.maxattr = ARRAY_SIZE(ethnl_ts_set_policy) - 1,
+	},
 	{
 		.cmd	= ETHTOOL_MSG_CABLE_TEST_ACT,
 		.flags	= GENL_UNS_ADMIN_PERM,
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 49c700777a32..ba38aa0b5506 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -426,6 +426,7 @@  extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
 extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
 extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1];
 extern const struct nla_policy ethnl_ts_get_policy[ETHTOOL_A_TS_HEADER + 1];
+extern const struct nla_policy ethnl_ts_set_policy[ETHTOOL_A_TS_LAYER + 1];
 extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
 extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
 extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
diff --git a/net/ethtool/ts.c b/net/ethtool/ts.c
index a71c47ff0c6b..7a1e27add492 100644
--- a/net/ethtool/ts.c
+++ b/net/ethtool/ts.c
@@ -31,19 +31,13 @@  static int ts_prepare_data(const struct ethnl_req_info *req_base,
 {
 	struct ts_reply_data *data = TS_REPDATA(reply_base);
 	struct net_device *dev = reply_base->dev;
-	const struct ethtool_ops *ops = dev->ethtool_ops;
 	int ret;
 
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
 
-	if (phy_has_tsinfo(dev->phydev))
-		data->ts = SOF_PHY_TIMESTAMPING;
-	else if (ops->get_ts_info)
-		data->ts = SOF_MAC_TIMESTAMPING;
-	else
-		return -EOPNOTSUPP;
+	data->ts = dev->selected_timestamping_layer;
 
 	ethnl_ops_complete(dev);
 
@@ -61,9 +55,65 @@  static int ts_fill_reply(struct sk_buff *skb,
 			     const struct ethnl_reply_data *reply_base)
 {
 	struct ts_reply_data *data = TS_REPDATA(reply_base);
+
 	return nla_put_u32(skb, ETHTOOL_A_TS_LAYER, data->ts);
 }
 
+/* TS_SET */
+const struct nla_policy ethnl_ts_set_policy[] = {
+	[ETHTOOL_A_TS_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_TS_LAYER]	= { .type = NLA_U32 },
+};
+
+static int ethnl_set_ts_validate(struct ethnl_req_info *req_info,
+				 struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+	struct net_device *dev = req_info->dev;
+	struct nlattr **tb = info->attrs;
+
+	if (!tb[ETHTOOL_A_TS_LAYER])
+		return 0;
+
+	if (!phy_has_tsinfo(dev->phydev) && !ops->get_ts_info) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[ETHTOOL_A_TS_LAYER],
+				    "Hardware time stamping is not supported by this device");
+		return -EOPNOTSUPP;
+	}
+
+	return 1;
+}
+
+static int ethnl_set_ts(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct net_device *dev = req_info->dev;
+	struct nlattr **tb = info->attrs;
+	enum timestamping_layer flavor;
+	bool mod = false;
+
+	if (!dev->phydev) {
+		return -EOPNOTSUPP;
+	}
+
+	flavor = dev->selected_timestamping_layer;
+	ethnl_update_u32(&flavor, tb[ETHTOOL_A_TS_LAYER], &mod);
+
+	if (mod) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct ifreq ifr = {0};
+
+		/* Disable time stamping in the current layer. */
+		if (netif_device_present(dev) && ops->ndo_eth_ioctl)
+			ops->ndo_eth_ioctl(dev, &ifr, SIOCSHWTSTAMP);
+
+		dev->selected_timestamping_layer = flavor;
+		return 1;
+	};
+
+	return 0;
+}
+
 const struct ethnl_request_ops ethnl_ts_request_ops = {
 	.request_cmd		= ETHTOOL_MSG_TS_GET,
 	.reply_cmd		= ETHTOOL_MSG_TS_GET_REPLY,
@@ -74,6 +124,10 @@  const struct ethnl_request_ops ethnl_ts_request_ops = {
 	.prepare_data		= ts_prepare_data,
 	.reply_size		= ts_reply_size,
 	.fill_reply		= ts_fill_reply,
+
+	.set_validate		= ethnl_set_ts_validate,
+	.set			= ethnl_set_ts,
+	.set_ntf_cmd		= ETHTOOL_MSG_TS_NTF,
 };
 
 /* TSLIST_GET */