diff mbox series

[net-next,v3,3/5] eth: fbnic: add RX packets timestamping support

Message ID 20241003123933.2589036-4-vadfed@meta.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series eth: fbnic: add timestamping support | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 4 maintainers not CCed: mohsin.bashr@gmail.com kernel-team@meta.com horms@kernel.org edumazet@google.com
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 278 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-10-06--15-00 (tests: 775)

Commit Message

Vadim Fedorenko Oct. 3, 2024, 12:39 p.m. UTC
Add callbacks to support timestamping configuration via ethtool.
Add processing of RX timestamps.

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
 drivers/net/ethernet/meta/fbnic/fbnic_csr.h   |  1 +
 .../net/ethernet/meta/fbnic/fbnic_ethtool.c   | 16 +++-
 .../net/ethernet/meta/fbnic/fbnic_netdev.c    | 80 +++++++++++++++++++
 .../net/ethernet/meta/fbnic/fbnic_netdev.h    |  3 +
 drivers/net/ethernet/meta/fbnic/fbnic_rpc.c   | 31 +++++++
 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c  | 61 ++++++++++++++
 drivers/net/ethernet/meta/fbnic/fbnic_txrx.h  |  1 +
 7 files changed, 192 insertions(+), 1 deletion(-)

Comments

Jacob Keller Oct. 4, 2024, 11:14 p.m. UTC | #1
On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
> Add callbacks to support timestamping configuration via ethtool.
> Add processing of RX timestamps.
> 
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>  
> +/**
> + * fbnic_ts40_to_ns() - convert descriptor timestamp to PHC time
> + * @fbn: netdev priv of the FB NIC
> + * @ts40: timestamp read from a descriptor
> + *
> + * Return: u64 value of PHC time in nanoseconds
> + *
> + * Convert truncated 40 bit device timestamp as read from a descriptor
> + * to the full PHC time in nanoseconds.
> + */
> +static __maybe_unused u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
> +{
> +	unsigned int s;
> +	u64 time_ns;
> +	s64 offset;
> +	u8 ts_top;
> +	u32 high;
> +
> +	do {
> +		s = u64_stats_fetch_begin(&fbn->time_seq);
> +		offset = READ_ONCE(fbn->time_offset);
> +	} while (u64_stats_fetch_retry(&fbn->time_seq, s));
> +
> +	high = READ_ONCE(fbn->time_high);
> +
> +	/* Bits 63..40 from periodic clock reads, 39..0 from ts40 */
> +	time_ns = (u64)(high >> 8) << 40 | ts40;
> +
> +	/* Compare bits 32-39 between periodic reads and ts40,
> +	 * see if HW clock may have wrapped since last read
> +	 */
> +	ts_top = ts40 >> 32;
> +	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
> +		time_ns += 1ULL << 40;
> +
> +	return time_ns + offset;
> +}
> +

This logic doesn't seem to match the logic used by the cyclecounter
code, and it's not clear to me if this is safe against a race between
time_high updating and the packet timestamp arriving.

the timestamp could arrive either before or after the time_high update,
and the logic needs to ensure the appropriate high bits are applied in
both cases.

Again, I think your use case makes sense to just implement with a
timecounter and cyclecounter, since you're not modifying the hardware
cycle counter and are leaving it as free-running.
Jacob Keller Oct. 4, 2024, 11:18 p.m. UTC | #2
On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
> Add callbacks to support timestamping configuration via ethtool.
> Add processing of RX timestamps.
> 
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
> +static int fbnic_hwtstamp_set(struct net_device *netdev,
> +			      struct kernel_hwtstamp_config *config,
> +			      struct netlink_ext_ack *extack)
> +{
> +	struct fbnic_net *fbn = netdev_priv(netdev);
> +	int old_rx_filter;
> +
> +	if (config->source != HWTSTAMP_SOURCE_NETDEV)
> +		return -EOPNOTSUPP;
> +
> +	if (!kernel_hwtstamp_config_changed(config, &fbn->hwtstamp_config))
> +		return 0;
> +
> +	/* Upscale the filters */
> +	switch (config->rx_filter) {
> +	case HWTSTAMP_FILTER_NONE:
> +	case HWTSTAMP_FILTER_ALL:
> +	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_EVENT:
> +		break;
> +	case HWTSTAMP_FILTER_NTP_ALL:
> +		config->rx_filter = HWTSTAMP_FILTER_ALL;
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
> +		break;
> +	default:
> +		return -ERANGE;
> +	}
> +
> +	/* Configure */
> +	old_rx_filter = fbn->hwtstamp_config.rx_filter;
> +	memcpy(&fbn->hwtstamp_config, config, sizeof(*config));
> +
> +	if (old_rx_filter != config->rx_filter && netif_running(fbn->netdev)) {
> +		fbnic_rss_reinit(fbn->fbd, fbn);
> +		fbnic_write_rules(fbn->fbd);
> +	}
> +
> +	/* Save / report back filter configuration
> +	 * Note that our filter configuration is inexact. Instead of
> +	 * filtering for a specific UDP port or L2 Ethertype we are
> +	 * filtering in all UDP or all non-IP packets for timestamping. So
> +	 * if anything other than FILTER_ALL is requested we report
> +	 * FILTER_SOME indicating that we will be timestamping a few
> +	 * additional packets.
> +	 */

Is there any benefit to implementing anything other than
HWTSTAMP_FILTER_ALL?

Those are typically considered legacy with the primary reason being to
support hardware which does not support timestamping all frames.

I suppose if you have measurement that supporting them is valuable (i.e.
because of performance impact on timestamping all frames?) it makes
sense to support. But otherwise I'm not sure its worth the extra complexity.

Upgrading the filtering modes to HWTSTAMP_FILTER_ALL is acceptable and
is done by a few drivers.
Vadim Fedorenko Oct. 7, 2024, 10:26 a.m. UTC | #3
On 05/10/2024 00:18, Jacob Keller wrote:
> 
> 
> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
>> Add callbacks to support timestamping configuration via ethtool.
>> Add processing of RX timestamps.
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
>> +static int fbnic_hwtstamp_set(struct net_device *netdev,
>> +			      struct kernel_hwtstamp_config *config,
>> +			      struct netlink_ext_ack *extack)
>> +{
>> +	struct fbnic_net *fbn = netdev_priv(netdev);
>> +	int old_rx_filter;
>> +
>> +	if (config->source != HWTSTAMP_SOURCE_NETDEV)
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!kernel_hwtstamp_config_changed(config, &fbn->hwtstamp_config))
>> +		return 0;
>> +
>> +	/* Upscale the filters */
>> +	switch (config->rx_filter) {
>> +	case HWTSTAMP_FILTER_NONE:
>> +	case HWTSTAMP_FILTER_ALL:
>> +	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
>> +	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
>> +	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
>> +	case HWTSTAMP_FILTER_PTP_V2_EVENT:
>> +		break;
>> +	case HWTSTAMP_FILTER_NTP_ALL:
>> +		config->rx_filter = HWTSTAMP_FILTER_ALL;
>> +		break;
>> +	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
>> +	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
>> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
>> +		break;
>> +	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
>> +	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
>> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
>> +		break;
>> +	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
>> +	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
>> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
>> +		break;
>> +	case HWTSTAMP_FILTER_PTP_V2_SYNC:
>> +	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
>> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
>> +		break;
>> +	default:
>> +		return -ERANGE;
>> +	}
>> +
>> +	/* Configure */
>> +	old_rx_filter = fbn->hwtstamp_config.rx_filter;
>> +	memcpy(&fbn->hwtstamp_config, config, sizeof(*config));
>> +
>> +	if (old_rx_filter != config->rx_filter && netif_running(fbn->netdev)) {
>> +		fbnic_rss_reinit(fbn->fbd, fbn);
>> +		fbnic_write_rules(fbn->fbd);
>> +	}
>> +
>> +	/* Save / report back filter configuration
>> +	 * Note that our filter configuration is inexact. Instead of
>> +	 * filtering for a specific UDP port or L2 Ethertype we are
>> +	 * filtering in all UDP or all non-IP packets for timestamping. So
>> +	 * if anything other than FILTER_ALL is requested we report
>> +	 * FILTER_SOME indicating that we will be timestamping a few
>> +	 * additional packets.
>> +	 */
> 
> Is there any benefit to implementing anything other than
> HWTSTAMP_FILTER_ALL?
> 
> Those are typically considered legacy with the primary reason being to
> support hardware which does not support timestamping all frames.
> 
> I suppose if you have measurement that supporting them is valuable (i.e.
> because of performance impact on timestamping all frames?) it makes
> sense to support. But otherwise I'm not sure its worth the extra complexity.
> 
> Upgrading the filtering modes to HWTSTAMP_FILTER_ALL is acceptable and
> is done by a few drivers.

Even though the hardware is able to timestamp TX packets at line rate,
we would like to avoid having 2x times more interrupts for the cases
when we don't need all packets to be timestamped. And as it mentioned
in the comment, we don't have very precise HW filters, but we would like
to avoid timestamping TCP packets when TCP is the most used one on the
host.
Jacob Keller Oct. 7, 2024, 11:51 p.m. UTC | #4
On 10/7/2024 3:26 AM, Vadim Fedorenko wrote:
> On 05/10/2024 00:18, Jacob Keller wrote:
>> Is there any benefit to implementing anything other than
>> HWTSTAMP_FILTER_ALL?
>>
>> Those are typically considered legacy with the primary reason being to
>> support hardware which does not support timestamping all frames.
>>
>> I suppose if you have measurement that supporting them is valuable (i.e.
>> because of performance impact on timestamping all frames?) it makes
>> sense to support. But otherwise I'm not sure its worth the extra complexity.
>>
>> Upgrading the filtering modes to HWTSTAMP_FILTER_ALL is acceptable and
>> is done by a few drivers.
> 
> Even though the hardware is able to timestamp TX packets at line rate,
> we would like to avoid having 2x times more interrupts for the cases
> when we don't need all packets to be timestamped. And as it mentioned
> in the comment, we don't have very precise HW filters, but we would like
> to avoid timestamping TCP packets when TCP is the most used one on the
> host.

Tx timestamps don't use the filters in the first place. The filter only
applies to Rx timestamps. You should only initiate a Tx timestamp when
requested, which will generally not be the case for TCP.

Are you saying that Rx timestamps generate interrupts?
Vadim Fedorenko Oct. 8, 2024, 9:58 a.m. UTC | #5
On 08/10/2024 00:51, Jacob Keller wrote:
> 
> 
> On 10/7/2024 3:26 AM, Vadim Fedorenko wrote:
>> On 05/10/2024 00:18, Jacob Keller wrote:
>>> Is there any benefit to implementing anything other than
>>> HWTSTAMP_FILTER_ALL?
>>>
>>> Those are typically considered legacy with the primary reason being to
>>> support hardware which does not support timestamping all frames.
>>>
>>> I suppose if you have measurement that supporting them is valuable (i.e.
>>> because of performance impact on timestamping all frames?) it makes
>>> sense to support. But otherwise I'm not sure its worth the extra complexity.
>>>
>>> Upgrading the filtering modes to HWTSTAMP_FILTER_ALL is acceptable and
>>> is done by a few drivers.
>>
>> Even though the hardware is able to timestamp TX packets at line rate,
>> we would like to avoid having 2x times more interrupts for the cases
>> when we don't need all packets to be timestamped. And as it mentioned
>> in the comment, we don't have very precise HW filters, but we would like
>> to avoid timestamping TCP packets when TCP is the most used one on the
>> host.
> 
> Tx timestamps don't use the filters in the first place. The filter only
> applies to Rx timestamps. You should only initiate a Tx timestamp when
> requested, which will generally not be the case for TCP.
> 
> Are you saying that Rx timestamps generate interrupts?

Sorry for the confusion with TX timestamping.
For RX we will utilize an additional buffer to provide timestamp metadata,
and we will have to process this metadata even if it will not be needed
later in the stack. For 100G links that will add some delays which we
would like to avoid.
Vadim Fedorenko Oct. 8, 2024, 4:47 p.m. UTC | #6
On 05/10/2024 00:14, Jacob Keller wrote:
> 
> 
> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
>> Add callbacks to support timestamping configuration via ethtool.
>> Add processing of RX timestamps.
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>   
>> +/**
>> + * fbnic_ts40_to_ns() - convert descriptor timestamp to PHC time
>> + * @fbn: netdev priv of the FB NIC
>> + * @ts40: timestamp read from a descriptor
>> + *
>> + * Return: u64 value of PHC time in nanoseconds
>> + *
>> + * Convert truncated 40 bit device timestamp as read from a descriptor
>> + * to the full PHC time in nanoseconds.
>> + */
>> +static __maybe_unused u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
>> +{
>> +	unsigned int s;
>> +	u64 time_ns;
>> +	s64 offset;
>> +	u8 ts_top;
>> +	u32 high;
>> +
>> +	do {
>> +		s = u64_stats_fetch_begin(&fbn->time_seq);
>> +		offset = READ_ONCE(fbn->time_offset);
>> +	} while (u64_stats_fetch_retry(&fbn->time_seq, s));
>> +
>> +	high = READ_ONCE(fbn->time_high);
>> +
>> +	/* Bits 63..40 from periodic clock reads, 39..0 from ts40 */
>> +	time_ns = (u64)(high >> 8) << 40 | ts40;
>> +
>> +	/* Compare bits 32-39 between periodic reads and ts40,
>> +	 * see if HW clock may have wrapped since last read
>> +	 */
>> +	ts_top = ts40 >> 32;
>> +	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
>> +		time_ns += 1ULL << 40;
>> +
>> +	return time_ns + offset;
>> +}
>> +
> 
> This logic doesn't seem to match the logic used by the cyclecounter
> code, and Its not clear to me if this safe against a race between
> time_high updating and the packet timestamp arriving.
> 
> the timestamp could arrive either before or after the time_high update,
> and the logic needs to ensure the appropriate high bits are applied in
> both cases.

To avoid this race condition we decided to make sure that incoming
timestamps are always later than the cached high bits. That will make the
logic above correct.

> Again, I think your use case makes sense to just implement with a
> timecounter and cyclecounter, since you're not modifying the hardware
> cycle counter and are leaving it as free-running.

After discussion with Jakub we decided to keep simple logic without
timecounter + cyclecounter, as it's pretty straight forward.
Jacob Keller Oct. 8, 2024, 5:01 p.m. UTC | #7
On 10/8/2024 9:47 AM, Vadim Fedorenko wrote:
> On 05/10/2024 00:14, Jacob Keller wrote:
>>
>>
>> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
>>> Add callbacks to support timestamping configuration via ethtool.
>>> Add processing of RX timestamps.
>>>
>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>   
>>> +/**
>>> + * fbnic_ts40_to_ns() - convert descriptor timestamp to PHC time
>>> + * @fbn: netdev priv of the FB NIC
>>> + * @ts40: timestamp read from a descriptor
>>> + *
>>> + * Return: u64 value of PHC time in nanoseconds
>>> + *
>>> + * Convert truncated 40 bit device timestamp as read from a descriptor
>>> + * to the full PHC time in nanoseconds.
>>> + */
>>> +static __maybe_unused u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
>>> +{
>>> +	unsigned int s;
>>> +	u64 time_ns;
>>> +	s64 offset;
>>> +	u8 ts_top;
>>> +	u32 high;
>>> +
>>> +	do {
>>> +		s = u64_stats_fetch_begin(&fbn->time_seq);
>>> +		offset = READ_ONCE(fbn->time_offset);
>>> +	} while (u64_stats_fetch_retry(&fbn->time_seq, s));
>>> +
>>> +	high = READ_ONCE(fbn->time_high);
>>> +
>>> +	/* Bits 63..40 from periodic clock reads, 39..0 from ts40 */
>>> +	time_ns = (u64)(high >> 8) << 40 | ts40;
>>> +
>>> +	/* Compare bits 32-39 between periodic reads and ts40,
>>> +	 * see if HW clock may have wrapped since last read
>>> +	 */
>>> +	ts_top = ts40 >> 32;
>>> +	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
>>> +		time_ns += 1ULL << 40;
>>> +
>>> +	return time_ns + offset;
>>> +}
>>> +
>>
>> This logic doesn't seem to match the logic used by the cyclecounter
>> code, and Its not clear to me if this safe against a race between
>> time_high updating and the packet timestamp arriving.
>>
>> the timestamp could arrive either before or after the time_high update,
>> and the logic needs to ensure the appropriate high bits are applied in
>> both cases.
> 
> To avoid this race condition we decided to make sure that incoming
> timestamps are always later then cached high bits. That will make the
> logic above correct.
> 

How do you do that? Timestamps come in asynchronously. The value is
captured by hardware. How do you guarantee that it was captured only
after an update to the cached high bits?

I guess if it arrives before the roll-over, you handle that by the range
check to see if the clock wrapped around.

Hmm.

But what happens if an Rx timestamp races with an update to the high
value and comes in just before the 40 bit time would have overflowed,
but the cached time_high value is captured just after it overflowed.

Do you have some mechanism to ensure that this is impossible? i.e.
either ensuring that the conversion uses the old time_high value, or
ensuring that Rx timestamps can't come in during an update?

Otherwise, I think the logic here could accidentally combine a time
value from an Rx timestamp that is just prior to the time_high update
and just prior to a rollover, then it would see a huge gap between the
values and trigger the addition of another 1 << 40, which would cycle it
even farther out of what the real value should have been.

>> Again, I think your use case makes sense to just implement with a
>> timecounter and cyclecounter, since you're not modifying the hardware
>> cycle counter and are leaving it as free-running.
> 
> After discussion with Jakub we decided to keep simple logic without
> timecounter + cyclecounter, as it's pretty straight forward.

Fair enough.
Vadim Fedorenko Oct. 8, 2024, 5:13 p.m. UTC | #8
On 08/10/2024 18:01, Jacob Keller wrote:
> 
> 
> On 10/8/2024 9:47 AM, Vadim Fedorenko wrote:
>> On 05/10/2024 00:14, Jacob Keller wrote:
>>>
>>>
>>> On 10/3/2024 5:39 AM, Vadim Fedorenko wrote:
>>>> Add callbacks to support timestamping configuration via ethtool.
>>>> Add processing of RX timestamps.
>>>>
>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>>    
>>>> +/**
>>>> + * fbnic_ts40_to_ns() - convert descriptor timestamp to PHC time
>>>> + * @fbn: netdev priv of the FB NIC
>>>> + * @ts40: timestamp read from a descriptor
>>>> + *
>>>> + * Return: u64 value of PHC time in nanoseconds
>>>> + *
>>>> + * Convert truncated 40 bit device timestamp as read from a descriptor
>>>> + * to the full PHC time in nanoseconds.
>>>> + */
>>>> +static __maybe_unused u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
>>>> +{
>>>> +	unsigned int s;
>>>> +	u64 time_ns;
>>>> +	s64 offset;
>>>> +	u8 ts_top;
>>>> +	u32 high;
>>>> +
>>>> +	do {
>>>> +		s = u64_stats_fetch_begin(&fbn->time_seq);
>>>> +		offset = READ_ONCE(fbn->time_offset);
>>>> +	} while (u64_stats_fetch_retry(&fbn->time_seq, s));
>>>> +
>>>> +	high = READ_ONCE(fbn->time_high);
>>>> +
>>>> +	/* Bits 63..40 from periodic clock reads, 39..0 from ts40 */
>>>> +	time_ns = (u64)(high >> 8) << 40 | ts40;
>>>> +
>>>> +	/* Compare bits 32-39 between periodic reads and ts40,
>>>> +	 * see if HW clock may have wrapped since last read
>>>> +	 */
>>>> +	ts_top = ts40 >> 32;
>>>> +	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
>>>> +		time_ns += 1ULL << 40;
>>>> +
>>>> +	return time_ns + offset;
>>>> +}
>>>> +
>>>
>>> This logic doesn't seem to match the logic used by the cyclecounter
>>> code, and Its not clear to me if this safe against a race between
>>> time_high updating and the packet timestamp arriving.
>>>
>>> the timestamp could arrive either before or after the time_high update,
>>> and the logic needs to ensure the appropriate high bits are applied in
>>> both cases.
>>
>> To avoid this race condition we decided to make sure that incoming
>> timestamps are always later then cached high bits. That will make the
>> logic above correct.
>>
> 
> How do you do that? Timestamps come in asynchronously. The value is
> captured by hardware. How do you guarantee that it was captured only
> after an update to the cached high bits?
> 
> I guess if it arrives before the roll-over, you handle that by the range
> check to see if the clock wrapped around.
> 
> Hmm.
> 
> But what happens if an Rx timestamp races with an update to the high
> value and comes in just before the 40 bit time would have overflowed,
> but the cached time_high value is captured just after it overflowed.
> 
> Do you have some mechanism to ensure that this is impossible? i.e.
> either ensuring that the conversion uses the old time_high value, or
> ensuring that Rx timestamps can't come in during an update?
> 
> Otherwise, I think the logic here could accidentally combine a time
> value from an Rx timestamp that is just prior to the time_high update
> and just prior to a rollover, then it would see a huge gap between the
> values and trigger the addition of another 1 << 40, which would cycle it
> even farther out of what the real value should have been.

Yes, you are absolutely correct, we have missed the situation when the
logic can add an additional (1 << 40) value on top of the wrongly calculated
higher bits. This can only happen in case of overflow of lower 8 bits of
high cached value. But if we keep high cached value a bit below the real
value, this will never happen. If we subtract 16 from the high value it will
translate into ~1 minute of staleness of the cached value. If for any reason
the packet processing will be delayed by 1 minute, user-space app will
definitely give up on waiting for the packet/timestamp and will ignore
wrong timestamp. In all other cases the logic in fbnic_ts40_to_ns() will
work perfectly fine.

>>> Again, I think your use case makes sense to just implement with a
>>> timecounter and cyclecounter, since you're not modifying the hardware
>>> cycle counter and are leaving it as free-running.
>>
>> After discussion with Jakub we decided to keep simple logic without
>> timecounter + cyclecounter, as it's pretty straight forward.
> 
> Fair enough.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
index 290b924b7749..79cdd231d327 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
@@ -586,6 +586,7 @@  enum {
 };
 
 #define FBNIC_RPC_ACT_TBL0_DMA_HINT		CSR_GENMASK(24, 16)
+#define FBNIC_RPC_ACT_TBL0_TS_ENA		CSR_BIT(28)
 #define FBNIC_RPC_ACT_TBL0_RSS_CTXT_ID		CSR_BIT(30)
 
 #define FBNIC_RPC_ACT_TBL1_DEFAULT	0x0840b		/* 0x2102c */
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
index ffc773014e0f..3afb7227574a 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
@@ -10,8 +10,22 @@  static int
 fbnic_get_ts_info(struct net_device *netdev,
 		  struct kernel_ethtool_ts_info *tsinfo)
 {
+	struct fbnic_net *fbn = netdev_priv(netdev);
+
+	tsinfo->phc_index = ptp_clock_index(fbn->fbd->ptp);
+
 	tsinfo->so_timestamping =
-		SOF_TIMESTAMPING_TX_SOFTWARE;
+		SOF_TIMESTAMPING_TX_SOFTWARE |
+		SOF_TIMESTAMPING_RX_HARDWARE |
+		SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	tsinfo->rx_filters =
+		BIT(HWTSTAMP_FILTER_NONE) |
+		BIT(HWTSTAMP_FILTER_PTP_V1_L4_EVENT) |
+		BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
+		BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
+		BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) |
+		BIT(HWTSTAMP_FILTER_ALL);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index 6e6d8988db54..c08798fad203 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -324,6 +324,84 @@  void fbnic_clear_rx_mode(struct net_device *netdev)
 	__dev_mc_unsync(netdev, NULL);
 }
 
+static int fbnic_hwtstamp_get(struct net_device *netdev,
+			      struct kernel_hwtstamp_config *config)
+{
+	struct fbnic_net *fbn = netdev_priv(netdev);
+
+	*config = fbn->hwtstamp_config;
+
+	return 0;
+}
+
+static int fbnic_hwtstamp_set(struct net_device *netdev,
+			      struct kernel_hwtstamp_config *config,
+			      struct netlink_ext_ack *extack)
+{
+	struct fbnic_net *fbn = netdev_priv(netdev);
+	int old_rx_filter;
+
+	if (config->source != HWTSTAMP_SOURCE_NETDEV)
+		return -EOPNOTSUPP;
+
+	if (!kernel_hwtstamp_config_changed(config, &fbn->hwtstamp_config))
+		return 0;
+
+	/* Upscale the filters */
+	switch (config->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+		break;
+	case HWTSTAMP_FILTER_NTP_ALL:
+		config->rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/* Configure */
+	old_rx_filter = fbn->hwtstamp_config.rx_filter;
+	memcpy(&fbn->hwtstamp_config, config, sizeof(*config));
+
+	if (old_rx_filter != config->rx_filter && netif_running(fbn->netdev)) {
+		fbnic_rss_reinit(fbn->fbd, fbn);
+		fbnic_write_rules(fbn->fbd);
+	}
+
+	/* Save / report back filter configuration
+	 * Note that our filter configuration is inexact. Instead of
+	 * filtering for a specific UDP port or L2 Ethertype we are
+	 * filtering in all UDP or all non-IP packets for timestamping. So
+	 * if anything other than FILTER_ALL is requested we report
+	 * FILTER_SOME indicating that we will be timestamping a few
+	 * additional packets.
+	 */
+	if (config->rx_filter > HWTSTAMP_FILTER_ALL)
+		config->rx_filter = HWTSTAMP_FILTER_SOME;
+
+	return 0;
+}
+
 static void fbnic_get_stats64(struct net_device *dev,
 			      struct rtnl_link_stats64 *stats64)
 {
@@ -401,6 +479,8 @@  static const struct net_device_ops fbnic_netdev_ops = {
 	.ndo_set_mac_address	= fbnic_set_mac,
 	.ndo_set_rx_mode	= fbnic_set_rx_mode,
 	.ndo_get_stats64	= fbnic_get_stats64,
+	.ndo_hwtstamp_get	= fbnic_hwtstamp_get,
+	.ndo_hwtstamp_set	= fbnic_hwtstamp_set,
 };
 
 static void fbnic_get_queue_stats_rx(struct net_device *dev, int idx,
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
index f530e3235634..b8417b300778 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h
@@ -54,6 +54,9 @@  struct fbnic_net {
 	struct fbnic_queue_stats rx_stats;
 	u64 link_down_events;
 
+	/* Time stamping filter config */
+	struct kernel_hwtstamp_config hwtstamp_config;
+
 	struct list_head napis;
 };
 
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
index c8aa29fc052b..337b8b3aef2f 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
@@ -244,6 +244,12 @@  void fbnic_bmc_rpc_init(struct fbnic_dev *fbd)
 	 ((_ip) ? FBNIC_RPC_TCAM_ACT1_IP_VALID : 0) |	\
 	 ((_v6) ? FBNIC_RPC_TCAM_ACT1_IP_IS_V6 : 0))
 
+#define FBNIC_TSTAMP_MASK(_all, _udp, _ether)			\
+	(((_all) ? ((1u << FBNIC_NUM_HASH_OPT) - 1) : 0) |	\
+	 ((_udp) ? (1u << FBNIC_UDP6_HASH_OPT) |		\
+		   (1u << FBNIC_UDP4_HASH_OPT) : 0) |		\
+	 ((_ether) ? (1u << FBNIC_ETHER_HASH_OPT) : 0))
+
 void fbnic_rss_reinit(struct fbnic_dev *fbd, struct fbnic_net *fbn)
 {
 	static const u32 act1_value[FBNIC_NUM_HASH_OPT] = {
@@ -255,6 +261,7 @@  void fbnic_rss_reinit(struct fbnic_dev *fbd, struct fbnic_net *fbn)
 		FBNIC_ACT1_INIT(0, 0, 1, 0),	/* IP4 */
 		0				/* Ether */
 	};
+	u32 tstamp_mask = 0;
 	unsigned int i;
 
 	/* To support scenarios where a BMC is present we must write the
@@ -264,6 +271,28 @@  void fbnic_rss_reinit(struct fbnic_dev *fbd, struct fbnic_net *fbn)
 	BUILD_BUG_ON(FBNIC_RSS_EN_NUM_UNICAST * 2 != FBNIC_RSS_EN_NUM_ENTRIES);
 	BUILD_BUG_ON(ARRAY_SIZE(act1_value) != FBNIC_NUM_HASH_OPT);
 
+	/* Set timestamp mask with 1b per flow type */
+	if (fbn->hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE) {
+		switch (fbn->hwtstamp_config.rx_filter) {
+		case HWTSTAMP_FILTER_ALL:
+			tstamp_mask = FBNIC_TSTAMP_MASK(1, 1, 1);
+			break;
+		case HWTSTAMP_FILTER_PTP_V2_EVENT:
+			tstamp_mask = FBNIC_TSTAMP_MASK(0, 1, 1);
+			break;
+		case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+		case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+			tstamp_mask = FBNIC_TSTAMP_MASK(0, 1, 0);
+			break;
+		case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+			tstamp_mask = FBNIC_TSTAMP_MASK(0, 0, 1);
+			break;
+		default:
+			netdev_warn(fbn->netdev, "Unsupported hwtstamp_rx_filter\n");
+			break;
+		}
+	}
+
 	/* Program RSS hash enable mask for host in action TCAM/table. */
 	for (i = fbnic_bmc_present(fbd) ? 0 : FBNIC_RSS_EN_NUM_UNICAST;
 	     i < FBNIC_RSS_EN_NUM_ENTRIES; i++) {
@@ -287,6 +316,8 @@  void fbnic_rss_reinit(struct fbnic_dev *fbd, struct fbnic_net *fbn)
 
 		if (!dest)
 			dest = FBNIC_RPC_ACT_TBL0_DROP;
+		else if (tstamp_mask & (1u << flow_type))
+			dest |= FBNIC_RPC_ACT_TBL0_TS_ENA;
 
 		if (act1_value[flow_type] & FBNIC_RPC_TCAM_ACT1_L4_VALID)
 			dest |= FIELD_PREP(FBNIC_RPC_ACT_TBL0_DMA_HINT,
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
index 8337d49bad0b..b64c1d4f925e 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
@@ -43,6 +43,44 @@  static void fbnic_ring_wr32(struct fbnic_ring *ring, unsigned int csr, u32 val)
 	writel(val, csr_base + csr);
 }
 
+/**
+ * fbnic_ts40_to_ns() - convert descriptor timestamp to PHC time
+ * @fbn: netdev priv of the FB NIC
+ * @ts40: timestamp read from a descriptor
+ *
+ * Return: u64 value of PHC time in nanoseconds
+ *
+ * Convert truncated 40 bit device timestamp as read from a descriptor
+ * to the full PHC time in nanoseconds.
+ */
+static __maybe_unused u64 fbnic_ts40_to_ns(struct fbnic_net *fbn, u64 ts40)
+{
+	unsigned int s;
+	u64 time_ns;
+	s64 offset;
+	u8 ts_top;
+	u32 high;
+
+	do {
+		s = u64_stats_fetch_begin(&fbn->time_seq);
+		offset = READ_ONCE(fbn->time_offset);
+	} while (u64_stats_fetch_retry(&fbn->time_seq, s));
+
+	high = READ_ONCE(fbn->time_high);
+
+	/* Bits 63..40 from periodic clock reads, 39..0 from ts40 */
+	time_ns = (u64)(high >> 8) << 40 | ts40;
+
+	/* Compare bits 32-39 between periodic reads and ts40,
+	 * see if HW clock may have wrapped since last read
+	 */
+	ts_top = ts40 >> 32;
+	if (ts_top < (u8)high && (u8)high - ts_top > U8_MAX / 2)
+		time_ns += 1ULL << 40;
+
+	return time_ns + offset;
+}
+
 static unsigned int fbnic_desc_unused(struct fbnic_ring *ring)
 {
 	return (ring->head - ring->tail - 1) & ring->size_mask;
@@ -710,6 +748,10 @@  static struct sk_buff *fbnic_build_skb(struct fbnic_napi_vector *nv,
 	/* Set MAC header specific fields */
 	skb->protocol = eth_type_trans(skb, nv->napi.dev);
 
+	/* Add timestamp if present */
+	if (pkt->hwtstamp)
+		skb_hwtstamps(skb)->hwtstamp = pkt->hwtstamp;
+
 	return skb;
 }
 
@@ -720,6 +762,23 @@  static enum pkt_hash_types fbnic_skb_hash_type(u64 rcd)
 						     PKT_HASH_TYPE_L2;
 }
 
+static void fbnic_rx_tstamp(struct fbnic_napi_vector *nv, u64 rcd,
+			    struct fbnic_pkt_buff *pkt)
+{
+	struct fbnic_net *fbn;
+	u64 ns, ts;
+
+	if (!FIELD_GET(FBNIC_RCD_OPT_META_TS, rcd))
+		return;
+
+	fbn = netdev_priv(nv->napi.dev);
+	ts = FIELD_GET(FBNIC_RCD_OPT_META_TS_MASK, rcd);
+	ns = fbnic_ts40_to_ns(fbn, ts);
+
+	/* Add timestamp to shared info */
+	pkt->hwtstamp = ns_to_ktime(ns);
+}
+
 static void fbnic_populate_skb_fields(struct fbnic_napi_vector *nv,
 				      u64 rcd, struct sk_buff *skb,
 				      struct fbnic_q_triad *qt)
@@ -784,6 +843,8 @@  static int fbnic_clean_rcq(struct fbnic_napi_vector *nv,
 			if (FIELD_GET(FBNIC_RCD_OPT_META_TYPE_MASK, rcd))
 				break;
 
+			fbnic_rx_tstamp(nv, rcd, pkt);
+
 			/* We currently ignore the action table index */
 			break;
 		case FBNIC_RCD_TYPE_META:
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
index 2f91f68d11d5..682d875f08c0 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
@@ -47,6 +47,7 @@  struct fbnic_net;
 
 struct fbnic_pkt_buff {
 	struct xdp_buff buff;
+	ktime_t hwtstamp;
 	u32 data_truesize;
 	u16 data_len;
 	u16 nr_frags;