diff mbox series

[RFC,net-next,2/2] net: Reset forwarded skb->tstamp before delivering to user space

Message ID 20211207020108.3691229-1-kafai@fb.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series [RFC,net-next,1/2] net: Use mono clock in RX timestamp | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 5919 this patch: 5365
netdev/cc_maintainers warning 2 maintainers not CCed: alobakin@pm.me jonathan.lemon@gmail.com
netdev/build_clang fail Errors and warnings before: 914 this patch: 672
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 5552 this patch: 4601
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 79 lines checked
netdev/kdoc fail Errors and warnings before: 0 this patch: 1
netdev/source_inline success Was 1 now: 0

Commit Message

Martin KaFai Lau Dec. 7, 2021, 2:01 a.m. UTC
The skb->tstamp may be set by a local sk (as a sender in tcp) which then
forwarded and delivered to another sk (as a receiver).

An example:
    sender-sk => veth@netns =====> veth@host => receiver-sk
                             ^^^
			__dev_forward_skb

The skb->tstamp is marked with a future TX time.  This future
skb->tstamp will confuse the receiver-sk.

This patch marks the skb if the skb->tstamp is forwarded.
Before using the skb->tstamp as a rx timestamp, it needs
to be re-stamped to avoid getting a future time.  It is
done in the RX timestamp reading helper skb_get_ktime().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/skbuff.h | 14 +++++++++-----
 net/core/dev.c         |  4 +++-
 net/core/skbuff.c      |  6 +++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

Comments

Willem de Bruijn Dec. 7, 2021, 2:27 p.m. UTC | #1
On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> The skb->tstamp may be set by a local sk (as a sender in tcp) which then
> forwarded and delivered to another sk (as a receiver).
>
> An example:
>     sender-sk => veth@netns =====> veth@host => receiver-sk
>                              ^^^
>                         __dev_forward_skb
>
> The skb->tstamp is marked with a future TX time.  This future
> skb->tstamp will confuse the receiver-sk.
>
> This patch marks the skb if the skb->tstamp is forwarded.
> Before using the skb->tstamp as a rx timestamp, it needs
> to be re-stamped to avoid getting a future time.  It is
> done in the RX timestamp reading helper skb_get_ktime().
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/linux/skbuff.h | 14 +++++++++-----
>  net/core/dev.c         |  4 +++-
>  net/core/skbuff.c      |  6 +++++-
>  3 files changed, 17 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index b609bdc5398b..bc4ae34c4e22 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -867,6 +867,7 @@ struct sk_buff {
>         __u8                    decrypted:1;
>  #endif
>         __u8                    slow_gro:1;
> +       __u8                    fwd_tstamp:1;
>
>  #ifdef CONFIG_NET_SCHED
>         __u16                   tc_index;       /* traffic control index */
> @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
>  }
>
>  void skb_init(void);
> +void net_timestamp_set(struct sk_buff *skb);
>
> -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
> +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
>  {
> +       if (unlikely(skb->fwd_tstamp))
> +               net_timestamp_set(skb);
>         return ktime_mono_to_real_cond(skb->tstamp);

This changes timestamp behavior for existing applications, probably
worth mentioning in the commit message if nothing else. A timestamp
taking at the time of the recv syscall is not very useful.

If a forwarded timestamp is not a future delivery time (as those are
scrubbed), is it not correct to just deliver the original timestamp?
It probably was taken at some earlier __netif_receive_skb_core.

>  }
>
> -static inline void net_timestamp_set(struct sk_buff *skb)
> +void net_timestamp_set(struct sk_buff *skb)
>  {
>         skb->tstamp = 0;
> +       skb->fwd_tstamp = 0;
>         if (static_branch_unlikely(&netstamp_needed_key))
>                 __net_timestamp(skb);
>  }
> +EXPORT_SYMBOL(net_timestamp_set);
>
>  #define net_timestamp_check(COND, SKB)                         \
>         if (static_branch_unlikely(&netstamp_needed_key)) {     \
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f091c7807a9e..181ddc989ead 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
>  {
>         struct sock *sk = skb->sk;
>
> -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {

There is a slight race here with the socket flipping the feature on/off.

>
>                 skb->tstamp = 0;
> +               skb->fwd_tstamp = 0;
> +       } else if (skb->tstamp) {
> +               skb->fwd_tstamp = 1;
> +       }

SO_TXTIME future delivery times are scrubbed, but TCP future delivery
times are not?

If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
scrub based on that. That is also not open to the above race.
Daniel Borkmann Dec. 7, 2021, 9:48 p.m. UTC | #2
On 12/7/21 3:27 PM, Willem de Bruijn wrote:
> On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>
>> The skb->tstamp may be set by a local sk (as a sender in tcp) which then
>> forwarded and delivered to another sk (as a receiver).
>>
>> An example:
>>      sender-sk => veth@netns =====> veth@host => receiver-sk
>>                               ^^^
>>                          __dev_forward_skb
>>
>> The skb->tstamp is marked with a future TX time.  This future
>> skb->tstamp will confuse the receiver-sk.
>>
>> This patch marks the skb if the skb->tstamp is forwarded.
>> Before using the skb->tstamp as a rx timestamp, it needs
>> to be re-stamped to avoid getting a future time.  It is
>> done in the RX timestamp reading helper skb_get_ktime().
>>
>> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>> ---
>>   include/linux/skbuff.h | 14 +++++++++-----
>>   net/core/dev.c         |  4 +++-
>>   net/core/skbuff.c      |  6 +++++-
>>   3 files changed, 17 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index b609bdc5398b..bc4ae34c4e22 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -867,6 +867,7 @@ struct sk_buff {
>>          __u8                    decrypted:1;
>>   #endif
>>          __u8                    slow_gro:1;
>> +       __u8                    fwd_tstamp:1;
>>
>>   #ifdef CONFIG_NET_SCHED
>>          __u16                   tc_index;       /* traffic control index */
>> @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
>>   }
>>
>>   void skb_init(void);
>> +void net_timestamp_set(struct sk_buff *skb);
>>
>> -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
>> +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
>>   {
>> +       if (unlikely(skb->fwd_tstamp))
>> +               net_timestamp_set(skb);
>>          return ktime_mono_to_real_cond(skb->tstamp);
> 
> This changes timestamp behavior for existing applications, probably
> worth mentioning in the commit message if nothing else. A timestamp
> taking at the time of the recv syscall is not very useful.
> 
> If a forwarded timestamp is not a future delivery time (as those are
> scrubbed), is it not correct to just deliver the original timestamp?
> It probably was taken at some earlier __netif_receive_skb_core.
> 
>>   }
>>
>> -static inline void net_timestamp_set(struct sk_buff *skb)
>> +void net_timestamp_set(struct sk_buff *skb)
>>   {
>>          skb->tstamp = 0;
>> +       skb->fwd_tstamp = 0;
>>          if (static_branch_unlikely(&netstamp_needed_key))
>>                  __net_timestamp(skb);
>>   }
>> +EXPORT_SYMBOL(net_timestamp_set);
>>
>>   #define net_timestamp_check(COND, SKB)                         \
>>          if (static_branch_unlikely(&netstamp_needed_key)) {     \
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f091c7807a9e..181ddc989ead 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
>>   {
>>          struct sock *sk = skb->sk;
>>
>> -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
>> +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> 
> There is a slight race here with the socket flipping the feature on/off.
> 
>>                  skb->tstamp = 0;
>> +               skb->fwd_tstamp = 0;
>> +       } else if (skb->tstamp) {
>> +               skb->fwd_tstamp = 1;
>> +       }
> 
> SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> times are not?
> 
> If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> scrub based on that. That is also not open to the above race.

One other thing I wonder, BPF progs at host-facing veth's tc ingress which
are not aware of skb->tstamp will then see a tstamp from future given we
intentionally bypass the net_timestamp_check() and might get confused (or
would confuse higher-layer application logic)? Not quite sure yet if they
would be the only affected user.

With regards to open question on mono clock and time namespaces (which
cover mono + boottime offsets), looks like it seems not an issue as they
only affect syscall-facing APIs.

Thanks,
Daniel
Daniel Borkmann Dec. 7, 2021, 11:06 p.m. UTC | #3
On 12/7/21 10:48 PM, Daniel Borkmann wrote:
> On 12/7/21 3:27 PM, Willem de Bruijn wrote:
>> On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>
>>> The skb->tstamp may be set by a local sk (as a sender in tcp) which then
>>> forwarded and delivered to another sk (as a receiver).
>>>
>>> An example:
>>>      sender-sk => veth@netns =====> veth@host => receiver-sk
>>>                               ^^^
>>>                          __dev_forward_skb
>>>
>>> The skb->tstamp is marked with a future TX time.  This future
>>> skb->tstamp will confuse the receiver-sk.
>>>
>>> This patch marks the skb if the skb->tstamp is forwarded.
>>> Before using the skb->tstamp as a rx timestamp, it needs
>>> to be re-stamped to avoid getting a future time.  It is
>>> done in the RX timestamp reading helper skb_get_ktime().
>>>
>>> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>>> ---
>>>   include/linux/skbuff.h | 14 +++++++++-----
>>>   net/core/dev.c         |  4 +++-
>>>   net/core/skbuff.c      |  6 +++++-
>>>   3 files changed, 17 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index b609bdc5398b..bc4ae34c4e22 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -867,6 +867,7 @@ struct sk_buff {
>>>          __u8                    decrypted:1;
>>>   #endif
>>>          __u8                    slow_gro:1;
>>> +       __u8                    fwd_tstamp:1;
>>>
>>>   #ifdef CONFIG_NET_SCHED
>>>          __u16                   tc_index;       /* traffic control index */
>>> @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
>>>   }
>>>
>>>   void skb_init(void);
>>> +void net_timestamp_set(struct sk_buff *skb);
>>>
>>> -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
>>> +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
>>>   {
>>> +       if (unlikely(skb->fwd_tstamp))
>>> +               net_timestamp_set(skb);
>>>          return ktime_mono_to_real_cond(skb->tstamp);
>>
>> This changes timestamp behavior for existing applications, probably
>> worth mentioning in the commit message if nothing else. A timestamp
>> taking at the time of the recv syscall is not very useful.
>>
>> If a forwarded timestamp is not a future delivery time (as those are
>> scrubbed), is it not correct to just deliver the original timestamp?
>> It probably was taken at some earlier __netif_receive_skb_core.
>>
>>>   }
>>>
>>> -static inline void net_timestamp_set(struct sk_buff *skb)
>>> +void net_timestamp_set(struct sk_buff *skb)
>>>   {
>>>          skb->tstamp = 0;
>>> +       skb->fwd_tstamp = 0;
>>>          if (static_branch_unlikely(&netstamp_needed_key))
>>>                  __net_timestamp(skb);
>>>   }
>>> +EXPORT_SYMBOL(net_timestamp_set);
>>>
>>>   #define net_timestamp_check(COND, SKB)                         \
>>>          if (static_branch_unlikely(&netstamp_needed_key)) {     \
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f091c7807a9e..181ddc989ead 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
>>>   {
>>>          struct sock *sk = skb->sk;
>>>
>>> -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
>>> +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
>>
>> There is a slight race here with the socket flipping the feature on/off.
>>
>>>                  skb->tstamp = 0;
>>> +               skb->fwd_tstamp = 0;
>>> +       } else if (skb->tstamp) {
>>> +               skb->fwd_tstamp = 1;
>>> +       }
>>
>> SO_TXTIME future delivery times are scrubbed, but TCP future delivery
>> times are not?
>>
>> If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
>> scrub based on that. That is also not open to the above race.
> 
> One other thing I wonder, BPF progs at host-facing veth's tc ingress which
> are not aware of skb->tstamp will then see a tstamp from future given we
> intentionally bypass the net_timestamp_check() and might get confused (or
> would confuse higher-layer application logic)? Not quite sure yet if they
> would be the only affected user.

Untested (& unoptimized wrt netdev cachelines), but worst case maybe something
like this could work ... not generic, but smaller risk wrt timestamp behavior
changes for applications when pushing up the stack (?).

Meaning, the attribute would be set for host-facing veths and the phys dev with
sch_fq. Not generic unfortunately given this would require the coorperation from
BPF side on tc ingress of the host veths, meaning, given the net_timestamp_check()
is skipped, the prog would have to call net_timestamp_set() via BPF helper if it
decides to return with TC_ACT_OK. (So orchestrator would opt-in(/out) to set the
devs it manages to xnet_flush_tstamp to 0 and to update tstamp via helper.. hm)

  include/linux/netdevice.h |  4 +++-
  include/linux/skbuff.h    |  6 +++++-
  net/core/dev.c            |  1 +
  net/core/filter.c         |  9 ++++++---
  net/core/net-sysfs.c      | 18 ++++++++++++++++++
  net/core/skbuff.c         | 15 +++++++++------
  6 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec42495a43a..df9141f92bbf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2172,6 +2172,7 @@ struct net_device {
  	struct timer_list	watchdog_timer;
  	int			watchdog_timeo;

+	u32			xnet_flush_tstamp;
  	u32                     proto_down_reason;

  	struct list_head	todo_list;
@@ -4137,7 +4138,8 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
  		return NET_RX_DROP;
  	}

-	skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
+	__skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)),
+			   READ_ONCE(dev->xnet_flush_tstamp));
  	skb->priority = 0;
  	return 0;
  }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 686a666d073d..09b670bcd7fd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3688,7 +3688,11 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
  		 int len, int hlen);
  void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
  int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
-void skb_scrub_packet(struct sk_buff *skb, bool xnet);
+void __skb_scrub_packet(struct sk_buff *skb, bool xnet, bool tstamp);
+static __always_inline void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+	__skb_scrub_packet(skb, xnet, true);
+}
  bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
  bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
  struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
diff --git a/net/core/dev.c b/net/core/dev.c
index 15ac064b5562..1678032bd5a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10853,6 +10853,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
  	dev->gso_max_segs = GSO_MAX_SEGS;
  	dev->upper_level = 1;
  	dev->lower_level = 1;
+	dev->xnet_flush_tstamp = 1;
  #ifdef CONFIG_LOCKDEP
  	dev->nested_level = 0;
  	INIT_LIST_HEAD(&dev->unlink_list);
diff --git a/net/core/filter.c b/net/core/filter.c
index fe27c91e3758..69366af42141 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2107,7 +2107,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;
+	if (READ_ONCE(dev->xnet_flush_tstamp))
+		skb->tstamp = 0;

  	dev_xmit_recursion_inc();
  	ret = dev_queue_xmit(skb);
@@ -2176,7 +2177,8 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;
+	if (READ_ONCE(dev->xnet_flush_tstamp))
+		skb->tstamp = 0;

  	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
  		skb = skb_expand_head(skb, hh_len);
@@ -2274,7 +2276,8 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;
+	if (READ_ONCE(dev->xnet_flush_tstamp))
+		skb->tstamp = 0;

  	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
  		skb = skb_expand_head(skb, hh_len);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9c01c642cf9e..d8ad9dbbbf55 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -403,6 +403,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
  }
  NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);

+static int change_xnet_flush_tstamp(struct net_device *dev, unsigned long val)
+{
+	WRITE_ONCE(dev->xnet_flush_tstamp, val);
+	return 0;
+}
+
+static ssize_t xnet_flush_tstamp_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	return netdev_store(dev, attr, buf, len, change_xnet_flush_tstamp);
+}
+NETDEVICE_SHOW_RW(xnet_flush_tstamp, fmt_dec);
+
  static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
  {
  	WRITE_ONCE(dev->napi_defer_hard_irqs, val);
@@ -651,6 +668,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
  	&dev_attr_flags.attr,
  	&dev_attr_tx_queue_len.attr,
  	&dev_attr_gro_flush_timeout.attr,
+	&dev_attr_xnet_flush_tstamp.attr,
  	&dev_attr_napi_defer_hard_irqs.attr,
  	&dev_attr_phys_port_id.attr,
  	&dev_attr_phys_port_name.attr,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ba2f38246f07..b0f6b96c7b2a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5440,19 +5440,21 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
  EXPORT_SYMBOL(skb_try_coalesce);

  /**
- * skb_scrub_packet - scrub an skb
+ * __skb_scrub_packet - scrub an skb
   *
   * @skb: buffer to clean
   * @xnet: packet is crossing netns
+ * @tstamp: timestamp needs scrubbing
   *
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * __skb_scrub_packet can be used after encapsulating or decapsulting a packet
   * into/from a tunnel. Some information have to be cleared during these
   * operations.
- * skb_scrub_packet can also be used to clean a skb before injecting it in
+ *
+ * __skb_scrub_packet can also be used to clean a skb before injecting it in
   * another namespace (@xnet == true). We have to clear all information in the
   * skb that could impact namespace isolation.
   */
-void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+void __skb_scrub_packet(struct sk_buff *skb, bool xnet, bool tstamp)
  {
  	skb->pkt_type = PACKET_HOST;
  	skb->skb_iif = 0;
@@ -5472,9 +5474,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)

  	ipvs_reset(skb);
  	skb->mark = 0;
-	skb->tstamp = 0;
+	if (tstamp)
+		skb->tstamp = 0;
  }
-EXPORT_SYMBOL_GPL(skb_scrub_packet);
+EXPORT_SYMBOL_GPL(__skb_scrub_packet);

  /**
   * skb_gso_transport_seglen - Return length of individual segments of a gso packet
Martin KaFai Lau Dec. 8, 2021, 12:07 a.m. UTC | #4
On Tue, Dec 07, 2021 at 09:27:55AM -0500, Willem de Bruijn wrote:
> On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > The skb->tstamp may be set by a local sk (as a sender in tcp) which then
> > forwarded and delivered to another sk (as a receiver).
> >
> > An example:
> >     sender-sk => veth@netns =====> veth@host => receiver-sk
> >                              ^^^
> >                         __dev_forward_skb
> >
> > The skb->tstamp is marked with a future TX time.  This future
> > skb->tstamp will confuse the receiver-sk.
> >
> > This patch marks the skb if the skb->tstamp is forwarded.
> > Before using the skb->tstamp as a rx timestamp, it needs
> > to be re-stamped to avoid getting a future time.  It is
> > done in the RX timestamp reading helper skb_get_ktime().
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  include/linux/skbuff.h | 14 +++++++++-----
> >  net/core/dev.c         |  4 +++-
> >  net/core/skbuff.c      |  6 +++++-
> >  3 files changed, 17 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index b609bdc5398b..bc4ae34c4e22 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -867,6 +867,7 @@ struct sk_buff {
> >         __u8                    decrypted:1;
> >  #endif
> >         __u8                    slow_gro:1;
> > +       __u8                    fwd_tstamp:1;
> >
> >  #ifdef CONFIG_NET_SCHED
> >         __u16                   tc_index;       /* traffic control index */
> > @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
> >  }
> >
> >  void skb_init(void);
> > +void net_timestamp_set(struct sk_buff *skb);
> >
> > -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
> > +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
> >  {
> > +       if (unlikely(skb->fwd_tstamp))
> > +               net_timestamp_set(skb);
> >         return ktime_mono_to_real_cond(skb->tstamp);
> 
> This changes timestamp behavior for existing applications, probably
> worth mentioning in the commit message if nothing else. A timestamp
> taking at the time of the recv syscall is not very useful.
> 
> If a forwarded timestamp is not a future delivery time (as those are
> scrubbed), is it not correct to just deliver the original timestamp?
> It probably was taken at some earlier __netif_receive_skb_core.
Make sense.  I will compare with the current mono clock first before
resetting and also mention this behavior change in the commit message.

Do you think it will be too heavy to always compare with
the current time without testing the skb->fwd_tstamp bit
first?

> 
> >  }
> >
> > -static inline void net_timestamp_set(struct sk_buff *skb)
> > +void net_timestamp_set(struct sk_buff *skb)
> >  {
> >         skb->tstamp = 0;
> > +       skb->fwd_tstamp = 0;
> >         if (static_branch_unlikely(&netstamp_needed_key))
> >                 __net_timestamp(skb);
> >  }
> > +EXPORT_SYMBOL(net_timestamp_set);
> >
> >  #define net_timestamp_check(COND, SKB)                         \
> >         if (static_branch_unlikely(&netstamp_needed_key)) {     \
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index f091c7807a9e..181ddc989ead 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
> >  {
> >         struct sock *sk = skb->sk;
> >
> > -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> > +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> 
> There is a slight race here with the socket flipping the feature on/off.
Right, I think it is an inherited race by relating skb->tstamp with
a bit in sk, like the existing sch_etf.c.
Directly setting a bit in skb when setting the skb->tstamp will help.

> 
> >
> >                 skb->tstamp = 0;
> > +               skb->fwd_tstamp = 0;
> > +       } else if (skb->tstamp) {
> > +               skb->fwd_tstamp = 1;
> > +       }
> 
> SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> times are not?
It is not too much about scrubbing future SO_TXTIME or future TCP
delivery time for the local delivery.

fwd_mono_tstamp may be a better name.  It is about the forwarded tstamp
is in mono.  e.g. the packet from a container-netns can be queued
at the fq@hostns (the case described in patch 1 commit log).
Also, the bpf@ingress@veth@hostns can now expect the skb->tstamp is in
mono time.  BPF side does not have helper returning real clock, so it is
safe to assume that bpf prog is comparing (or setting) skb->tstamp as
mono also.

> If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> scrub based on that. That is also not open to the above race.
It was one of my earlier attempts by adding tstamp_is_tx_mono and
set it in tcp_output.c and then test it before scrubbing.
Other than changing the tcp_output.c (e.g. in __tcp_transmit_skb),
I ended up making another change on the bpf side to also set
this bit when the bpf_prog is updating the __sk_buff->tstamp.  Thus,
in this patch , I ended up setting a bit only in the forward path.

I can go back to retry the tstamp_is_edt/tstamp_is_tx_mono idea and
that can also avoid the race in testing sock_flag(sk, SOCK_TXTIME)
as you suggested.

Thanks for the review !
Willem de Bruijn Dec. 8, 2021, 12:44 a.m. UTC | #5
> > > -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
> > > +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
> > >  {
> > > +       if (unlikely(skb->fwd_tstamp))
> > > +               net_timestamp_set(skb);
> > >         return ktime_mono_to_real_cond(skb->tstamp);
> >
> > This changes timestamp behavior for existing applications, probably
> > worth mentioning in the commit message if nothing else. A timestamp
> > taking at the time of the recv syscall is not very useful.
> >
> > If a forwarded timestamp is not a future delivery time (as those are
> > scrubbed), is it not correct to just deliver the original timestamp?
> > It probably was taken at some earlier __netif_receive_skb_core.
> Make sense.  I will compare with the current mono clock first before
> resetting and also mention this behavior change in the commit message.
>
> Do you think it will be too heavy to always compare with
> the current time without testing the skb->fwd_tstamp bit
> first?

There are other examples of code using ktime_get and variants in the
hot path, such as FQ.

Especially if skb_get_ktime is called in recv() timestamp helpers, it
is perhaps acceptable. If not ideal. If we need an skb bit anyway,
then this is moot.

> >
> > >  }
> > >
> > > -static inline void net_timestamp_set(struct sk_buff *skb)
> > > +void net_timestamp_set(struct sk_buff *skb)
> > >  {
> > >         skb->tstamp = 0;
> > > +       skb->fwd_tstamp = 0;
> > >         if (static_branch_unlikely(&netstamp_needed_key))
> > >                 __net_timestamp(skb);
> > >  }
> > > +EXPORT_SYMBOL(net_timestamp_set);
> > >
> > >  #define net_timestamp_check(COND, SKB)                         \
> > >         if (static_branch_unlikely(&netstamp_needed_key)) {     \
> > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > index f091c7807a9e..181ddc989ead 100644
> > > --- a/net/core/skbuff.c
> > > +++ b/net/core/skbuff.c
> > > @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
> > >  {
> > >         struct sock *sk = skb->sk;
> > >
> > > -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> > > +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> >
> > There is a slight race here with the socket flipping the feature on/off.
> Right, I think it is an inherited race by relating skb->tstamp with
> a bit in sk, like the existing sch_etf.c.
> Directly setting a bit in skb when setting the skb->tstamp will help.
>
> >
> > >
> > >                 skb->tstamp = 0;
> > > +               skb->fwd_tstamp = 0;
> > > +       } else if (skb->tstamp) {
> > > +               skb->fwd_tstamp = 1;
> > > +       }
> >
> > SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> > times are not?
> It is not too much about scrubbing future SO_TXTIME or future TCP
> delivery time for the local delivery.

The purpose of the above is to reset future delivery time whenever it
can be mistaken for a timestamp, right?

This function is called on forwarding, redirection, looping from
egress to ingress with __dev_forward_skb, etc. But then it breaks the
delivery time forwarding over veth that I thought was the purpose of
this patch series. I guess I'm a bit hazy when this is supposed to be
scrubbed exactly.

> fwd_mono_tstamp may be a better name.  It is about the forwarded tstamp
> is in mono.

After your change skb->tstamp is no longer in CLOCK_REALTIME, right?

Somewhat annoyingly, that does not imply that it is always
CLOCK_MONOTONIC. Because while FQ uses that, ETF is programmed with
CLOCK_TAI.

Perhaps skb->delivery_time is the most specific description. And that
is easy to test for in skb_scrub_tstamp.


> e.g. the packet from a container-netns can be queued
> at the fq@hostns (the case described in patch 1 commit log).
> Also, the bpf@ingress@veth@hostns can now expect the skb->tstamp is in
> mono time.  BPF side does not have helper returning real clock, so it is
> safe to assume that bpf prog is comparing (or setting) skb->tstamp as
> mono also.
>
> > If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> > scrub based on that. That is also not open to the above race.
> It was one of my earlier attempts by adding tstamp_is_tx_mono and
> set it in tcp_output.c and then test it before scrubbing.
> Other than changing the tcp_output.c (e.g. in __tcp_transmit_skb),
> I ended up making another change on the bpf side to also set
> this bit when the bpf_prog is updating the __sk_buff->tstamp.  Thus,
> in this patch , I ended up setting a bit only in the forward path.
>
> I can go back to retry the tstamp_is_edt/tstamp_is_tx_mono idea and
> that can also avoid the race in testing sock_flag(sk, SOCK_TXTIME)
> as you suggested.

Sounds great, thanks
Martin KaFai Lau Dec. 8, 2021, 2:45 a.m. UTC | #6
On Tue, Dec 07, 2021 at 07:44:05PM -0500, Willem de Bruijn wrote:
> > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > > index f091c7807a9e..181ddc989ead 100644
> > > > --- a/net/core/skbuff.c
> > > > +++ b/net/core/skbuff.c
> > > > @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
> > > >  {
> > > >         struct sock *sk = skb->sk;
> > > >
> > > > -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> > > > +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> > >
> > > There is a slight race here with the socket flipping the feature on/off.
> > Right, I think it is an inherited race by relating skb->tstamp with
> > a bit in sk, like the existing sch_etf.c.
> > Directly setting a bit in skb when setting the skb->tstamp will help.
> >
> > >
> > > >
> > > >                 skb->tstamp = 0;
> > > > +               skb->fwd_tstamp = 0;
> > > > +       } else if (skb->tstamp) {
> > > > +               skb->fwd_tstamp = 1;
> > > > +       }
> > >
> > > SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> > > times are not?
> > It is not too much about scrubbing future SO_TXTIME or future TCP
> > delivery time for the local delivery.
> 
> The purpose of the above is to reset future delivery time whenever it
> can be mistaken for a timestamp, right?
> 
> This function is called on forwarding, redirection, looping from
> egress to ingress with __dev_forward_skb, etc. But then it breaks the
> delivery time forwarding over veth that I thought was the purpose of
> this patch series. I guess I'm a bit hazy when this is supposed to be
> scrubbed exactly.
> 
> > fwd_mono_tstamp may be a better name.  It is about the forwarded tstamp
> > is in mono.
> 
> After your change skb->tstamp is no longer in CLOCK_REALTIME, right?
Right.  The __net_timestamp() will use CLOCK_MONOTONIC.

> Somewhat annoyingly, that does not imply that it is always
> CLOCK_MONOTONIC. Because while FQ uses that, ETF is programmed with
> CLOCK_TAI.
Yes, it is the annoying part, so this patch keeps the tstamp
scrubbing for SO_TXTIME.

If a sk in veth@netns uses SO_TXTIME setting tstamp to TAI and
it is not scrubbed here, it may get forwarded to the fq@hostns
and then get dropped.

skb_ktime_get() also won't know how to compare with the current
time (mono or tai?) and then reset if needed.
Alternatively, it can always re-stamp (__net_timestamp()) much earlier
in the stack before recvmsg().  e.g. just after the sch_handle_ingress()
when TC_ACT_OK is returned as Daniel also mentioned in another thread.
That will be more limited to the bpf@ingress (and then bpf_redirect) usecase
instead of generally applicable to the ip[6]_forward.  However,
the benefit is a more limited impact scope and potential breakage.

> Perhaps skb->delivery_time is the most specific description. And that
> is easy to test for in skb_scrub_tstamp.
> 
> 
> > e.g. the packet from a container-netns can be queued
> > at the fq@hostns (the case described in patch 1 commit log).
> > Also, the bpf@ingress@veth@hostns can now expect the skb->tstamp is in
> > mono time.  BPF side does not have helper returning real clock, so it is
> > safe to assume that bpf prog is comparing (or setting) skb->tstamp as
> > mono also.
> >
> > > If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> > > scrub based on that. That is also not open to the above race.
> > It was one of my earlier attempts by adding tstamp_is_tx_mono and
> > set it in tcp_output.c and then test it before scrubbing.
> > Other than changing the tcp_output.c (e.g. in __tcp_transmit_skb),
> > I ended up making another change on the bpf side to also set
> > this bit when the bpf_prog is updating the __sk_buff->tstamp.  Thus,
> > in this patch , I ended up setting a bit only in the forward path.
> >
> > I can go back to retry the tstamp_is_edt/tstamp_is_tx_mono idea and
> > that can also avoid the race in testing sock_flag(sk, SOCK_TXTIME)
> > as you suggested.
> 
> Sounds great, thanks
Martin KaFai Lau Dec. 8, 2021, 8:18 a.m. UTC | #7
On Tue, Dec 07, 2021 at 10:48:53PM +0100, Daniel Borkmann wrote:
> On 12/7/21 3:27 PM, Willem de Bruijn wrote:
> > On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > 
> > > The skb->tstamp may be set by a local sk (as a sender in tcp) which then
> > > forwarded and delivered to another sk (as a receiver).
> > > 
> > > An example:
> > >      sender-sk => veth@netns =====> veth@host => receiver-sk
> > >                               ^^^
> > >                          __dev_forward_skb
> > > 
> > > The skb->tstamp is marked with a future TX time.  This future
> > > skb->tstamp will confuse the receiver-sk.
> > > 
> > > This patch marks the skb if the skb->tstamp is forwarded.
> > > Before using the skb->tstamp as a rx timestamp, it needs
> > > to be re-stamped to avoid getting a future time.  It is
> > > done in the RX timestamp reading helper skb_get_ktime().
> > > 
> > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > ---
> > >   include/linux/skbuff.h | 14 +++++++++-----
> > >   net/core/dev.c         |  4 +++-
> > >   net/core/skbuff.c      |  6 +++++-
> > >   3 files changed, 17 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > > index b609bdc5398b..bc4ae34c4e22 100644
> > > --- a/include/linux/skbuff.h
> > > +++ b/include/linux/skbuff.h
> > > @@ -867,6 +867,7 @@ struct sk_buff {
> > >          __u8                    decrypted:1;
> > >   #endif
> > >          __u8                    slow_gro:1;
> > > +       __u8                    fwd_tstamp:1;
> > > 
> > >   #ifdef CONFIG_NET_SCHED
> > >          __u16                   tc_index;       /* traffic control index */
> > > @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
> > >   }
> > > 
> > >   void skb_init(void);
> > > +void net_timestamp_set(struct sk_buff *skb);
> > > 
> > > -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
> > > +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
> > >   {
> > > +       if (unlikely(skb->fwd_tstamp))
> > > +               net_timestamp_set(skb);
> > >          return ktime_mono_to_real_cond(skb->tstamp);
> > 
> > This changes timestamp behavior for existing applications, probably
> > worth mentioning in the commit message if nothing else. A timestamp
> > taking at the time of the recv syscall is not very useful.
> > 
> > If a forwarded timestamp is not a future delivery time (as those are
> > scrubbed), is it not correct to just deliver the original timestamp?
> > It probably was taken at some earlier __netif_receive_skb_core.
> > 
> > >   }
> > > 
> > > -static inline void net_timestamp_set(struct sk_buff *skb)
> > > +void net_timestamp_set(struct sk_buff *skb)
> > >   {
> > >          skb->tstamp = 0;
> > > +       skb->fwd_tstamp = 0;
> > >          if (static_branch_unlikely(&netstamp_needed_key))
> > >                  __net_timestamp(skb);
> > >   }
> > > +EXPORT_SYMBOL(net_timestamp_set);
> > > 
> > >   #define net_timestamp_check(COND, SKB)                         \
> > >          if (static_branch_unlikely(&netstamp_needed_key)) {     \
> > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > index f091c7807a9e..181ddc989ead 100644
> > > --- a/net/core/skbuff.c
> > > +++ b/net/core/skbuff.c
> > > @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
> > >   {
> > >          struct sock *sk = skb->sk;
> > > 
> > > -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> > > +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> > 
> > There is a slight race here with the socket flipping the feature on/off.
> > 
> > >                  skb->tstamp = 0;
> > > +               skb->fwd_tstamp = 0;
> > > +       } else if (skb->tstamp) {
> > > +               skb->fwd_tstamp = 1;
> > > +       }
> > 
> > SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> > times are not?
> > 
> > If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> > scrub based on that. That is also not open to the above race.
> 
> One other thing I wonder, BPF progs at host-facing veth's tc ingress which
> are not aware of skb->tstamp will then see a tstamp from future given we
> intentionally bypass the net_timestamp_check() and might get confused (or
> would confuse higher-layer application logic)? Not quite sure yet if they
> would be the only affected user.
Considering the variety of clock used in skb->tstamp (real/mono, and also
tai in SO_TXTIME),  in general I am not sure if the tc-bpf can assume anything
in the skb->tstamp now.
Also, there is only mono clock bpf_ktime_get helper, the most reasonable usage
now for tc-bpf is to set the EDT which is in mono.  This seems to be the
intention when the __sk_buff->tstamp was added.
For ingress, it is real clock now.  Other than simply printing it out,
it is hard to think of a good way to use the value.  Also, although
it is unlikely, net_timestamp_check() does not always stamp the skb.
Martin KaFai Lau Dec. 8, 2021, 8:30 a.m. UTC | #8
On Wed, Dec 08, 2021 at 12:18:46AM -0800, Martin KaFai Lau wrote:
> On Tue, Dec 07, 2021 at 10:48:53PM +0100, Daniel Borkmann wrote:
> > On 12/7/21 3:27 PM, Willem de Bruijn wrote:
> > > On Mon, Dec 6, 2021 at 9:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > 
> > > > The skb->tstamp may be set by a local sk (as a sender in tcp) which then
> > > > forwarded and delivered to another sk (as a receiver).
> > > > 
> > > > An example:
> > > >      sender-sk => veth@netns =====> veth@host => receiver-sk
> > > >                               ^^^
> > > >                          __dev_forward_skb
> > > > 
> > > > The skb->tstamp is marked with a future TX time.  This future
> > > > skb->tstamp will confuse the receiver-sk.
> > > > 
> > > > This patch marks the skb if the skb->tstamp is forwarded.
> > > > Before using the skb->tstamp as a rx timestamp, it needs
> > > > to be re-stamped to avoid getting a future time.  It is
> > > > done in the RX timestamp reading helper skb_get_ktime().
> > > > 
> > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > ---
> > > >   include/linux/skbuff.h | 14 +++++++++-----
> > > >   net/core/dev.c         |  4 +++-
> > > >   net/core/skbuff.c      |  6 +++++-
> > > >   3 files changed, 17 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > > > index b609bdc5398b..bc4ae34c4e22 100644
> > > > --- a/include/linux/skbuff.h
> > > > +++ b/include/linux/skbuff.h
> > > > @@ -867,6 +867,7 @@ struct sk_buff {
> > > >          __u8                    decrypted:1;
> > > >   #endif
> > > >          __u8                    slow_gro:1;
> > > > +       __u8                    fwd_tstamp:1;
> > > > 
> > > >   #ifdef CONFIG_NET_SCHED
> > > >          __u16                   tc_index;       /* traffic control index */
> > > > @@ -3806,9 +3807,12 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
> > > >   }
> > > > 
> > > >   void skb_init(void);
> > > > +void net_timestamp_set(struct sk_buff *skb);
> > > > 
> > > > -static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
> > > > +static inline ktime_t skb_get_ktime(struct sk_buff *skb)
> > > >   {
> > > > +       if (unlikely(skb->fwd_tstamp))
> > > > +               net_timestamp_set(skb);
> > > >          return ktime_mono_to_real_cond(skb->tstamp);
> > > 
> > > This changes timestamp behavior for existing applications, probably
> > > worth mentioning in the commit message if nothing else. A timestamp
> > > taking at the time of the recv syscall is not very useful.
> > > 
> > > If a forwarded timestamp is not a future delivery time (as those are
> > > scrubbed), is it not correct to just deliver the original timestamp?
> > > It probably was taken at some earlier __netif_receive_skb_core.
> > > 
> > > >   }
> > > > 
> > > > -static inline void net_timestamp_set(struct sk_buff *skb)
> > > > +void net_timestamp_set(struct sk_buff *skb)
> > > >   {
> > > >          skb->tstamp = 0;
> > > > +       skb->fwd_tstamp = 0;
> > > >          if (static_branch_unlikely(&netstamp_needed_key))
> > > >                  __net_timestamp(skb);
> > > >   }
> > > > +EXPORT_SYMBOL(net_timestamp_set);
> > > > 
> > > >   #define net_timestamp_check(COND, SKB)                         \
> > > >          if (static_branch_unlikely(&netstamp_needed_key)) {     \
> > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > > index f091c7807a9e..181ddc989ead 100644
> > > > --- a/net/core/skbuff.c
> > > > +++ b/net/core/skbuff.c
> > > > @@ -5295,8 +5295,12 @@ void skb_scrub_tstamp(struct sk_buff *skb)
> > > >   {
> > > >          struct sock *sk = skb->sk;
> > > > 
> > > > -       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
> > > > +       if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
> > > 
> > > There is a slight race here with the socket flipping the feature on/off.
> > > 
> > > >                  skb->tstamp = 0;
> > > > +               skb->fwd_tstamp = 0;
> > > > +       } else if (skb->tstamp) {
> > > > +               skb->fwd_tstamp = 1;
> > > > +       }
> > > 
> > > SO_TXTIME future delivery times are scrubbed, but TCP future delivery
> > > times are not?
> > > 
> > > If adding a bit, might it be simpler to add a bit tstamp_is_edt, and
> > > scrub based on that. That is also not open to the above race.
> > 
> > One other thing I wonder, BPF progs at host-facing veth's tc ingress which
> > are not aware of skb->tstamp will then see a tstamp from future given we
> > intentionally bypass the net_timestamp_check() and might get confused (or
> > would confuse higher-layer application logic)? Not quite sure yet if they
> > would be the only affected user.
> Considering the variety of clock used in skb->tstamp (real/mono, and also
> tai in SO_TXTIME),  in general I am not sure if the tc-bpf can assume anything
> in the skb->tstamp now.
> Also, there is only mono clock bpf_ktime_get helper, the most reasonable usage
> now for tc-bpf is to set the EDT which is in mono.  This seems to be the
> intention when the __sk_buff->tstamp was added.
> For ingress, it is real clock now.  Other than simply printing it out,
> it is hard to think of a good way to use the value.  Also, although
> it is unlikely, net_timestamp_check() does not always stamp the skb.
For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
may be save the tx tstamp first and then temporarily restamp with __net_timestamp()
Eric Dumazet Dec. 8, 2021, 6:27 p.m. UTC | #9
On Wed, Dec 8, 2021 at 12:30 AM Martin KaFai Lau <kafai@fb.com> wrote:

> For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
> may be save the tx tstamp first and then temporarily restamp with __net_timestamp()

Martin, have you looked at time namespaces (CLONE_NEWTIME) ?

Perhaps we need to have more than one bit to describe time bases.
Daniel Borkmann Dec. 8, 2021, 7:27 p.m. UTC | #10
On 12/8/21 9:30 AM, Martin KaFai Lau wrote:
> On Wed, Dec 08, 2021 at 12:18:46AM -0800, Martin KaFai Lau wrote:
>> On Tue, Dec 07, 2021 at 10:48:53PM +0100, Daniel Borkmann wrote:
[...]
>>> One other thing I wonder, BPF progs at host-facing veth's tc ingress which
>>> are not aware of skb->tstamp will then see a tstamp from future given we
>>> intentionally bypass the net_timestamp_check() and might get confused (or
>>> would confuse higher-layer application logic)? Not quite sure yet if they
>>> would be the only affected user.
>> Considering the variety of clock used in skb->tstamp (real/mono, and also
>> tai in SO_TXTIME),  in general I am not sure if the tc-bpf can assume anything
>> in the skb->tstamp now.

But today that's either only 0 or real via __net_timestamp() if skb->tstamp is
read at bpf@ingress@veth@host, no?

>> Also, there is only mono clock bpf_ktime_get helper, the most reasonable usage
>> now for tc-bpf is to set the EDT which is in mono.  This seems to be the
>> intention when the __sk_buff->tstamp was added.

Yep, fwiw, that's also how we only use it in our code base today.

>> For ingress, it is real clock now.  Other than simply printing it out,
>> it is hard to think of a good way to use the value.  Also, although
>> it is unlikely, net_timestamp_check() does not always stamp the skb.
> For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
> may be save the tx tstamp first and then temporarily restamp with __net_timestamp()
Martin KaFai Lau Dec. 8, 2021, 8:48 p.m. UTC | #11
On Wed, Dec 08, 2021 at 10:27:51AM -0800, Eric Dumazet wrote:
> On Wed, Dec 8, 2021 at 12:30 AM Martin KaFai Lau <kafai@fb.com> wrote:
> 
> > For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
> > may be save the tx tstamp first and then temporarily restamp with __net_timestamp()
> 
> Martin, have you looked at time namespaces (CLONE_NEWTIME) ?
> 
> Perhaps we need to have more than one bit to describe time bases.
My noob understanding is it only affects the time returning
to the user in the syscall.  Could you explain how that
may affect the time in skb->tstamp?
Martin KaFai Lau Dec. 8, 2021, 10:19 p.m. UTC | #12
On Wed, Dec 08, 2021 at 08:27:45PM +0100, Daniel Borkmann wrote:
> On 12/8/21 9:30 AM, Martin KaFai Lau wrote:
> > On Wed, Dec 08, 2021 at 12:18:46AM -0800, Martin KaFai Lau wrote:
> > > On Tue, Dec 07, 2021 at 10:48:53PM +0100, Daniel Borkmann wrote:
> [...]
> > > > One other thing I wonder, BPF progs at host-facing veth's tc ingress which
> > > > are not aware of skb->tstamp will then see a tstamp from future given we
> > > > intentionally bypass the net_timestamp_check() and might get confused (or
> > > > would confuse higher-layer application logic)? Not quite sure yet if they
> > > > would be the only affected user.
> > > Considering the variety of clock used in skb->tstamp (real/mono, and also
> > > tai in SO_TXTIME),  in general I am not sure if the tc-bpf can assume anything
> > > in the skb->tstamp now.
> 
> But today that's either only 0 or real via __net_timestamp() if skb->tstamp is
> read at bpf@ingress@veth@host, no?
I think I was trying to say the CLOCK_REALTIME in __sk_buff->tstamp 
is not practically useful in bpf@ingress other than an increasing number.
No easy way to get the 'now' in CLOCK_REALTIME to compare with
in order to learn if it is future or current time.  Setting
it based on bpf_ktime_get_ns() in MONO is likely broken currently
regardless of the skb is forwarded or delivered locally.

We have a use case that wants to change the forwarded EDT
in bpf@ingress@veth@host before forwarding.  If it needs to
keep __sk_buff->tstamp as the 'now' CLOCK_REALTIME in ingress,
it needs to provide a separate way for the bpf to check/change
the forwarded EDT.

Daniel, do you have suggestion on where to temporarily store
the forwarded EDT so that the bpf@ingress can access?

> 
> > > Also, there is only mono clock bpf_ktime_get helper, the most reasonable usage
> > > now for tc-bpf is to set the EDT which is in mono.  This seems to be the
> > > intention when the __sk_buff->tstamp was added.
> 
> Yep, fwiw, that's also how we only use it in our code base today.
> 
> > > For ingress, it is real clock now.  Other than simply printing it out,
> > > it is hard to think of a good way to use the value.  Also, although
> > > it is unlikely, net_timestamp_check() does not always stamp the skb.
> > For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
> > may be save the tx tstamp first and then temporarily restamp with __net_timestamp()
Daniel Borkmann Dec. 9, 2021, 12:58 p.m. UTC | #13
On 12/8/21 11:19 PM, Martin KaFai Lau wrote:
> On Wed, Dec 08, 2021 at 08:27:45PM +0100, Daniel Borkmann wrote:
>> On 12/8/21 9:30 AM, Martin KaFai Lau wrote:
>>> On Wed, Dec 08, 2021 at 12:18:46AM -0800, Martin KaFai Lau wrote:
>>>> On Tue, Dec 07, 2021 at 10:48:53PM +0100, Daniel Borkmann wrote:
>> [...]
>>>>> One other thing I wonder, BPF progs at host-facing veth's tc ingress which
>>>>> are not aware of skb->tstamp will then see a tstamp from future given we
>>>>> intentionally bypass the net_timestamp_check() and might get confused (or
>>>>> would confuse higher-layer application logic)? Not quite sure yet if they
>>>>> would be the only affected user.
>>>> Considering the variety of clock used in skb->tstamp (real/mono, and also
>>>> tai in SO_TXTIME),  in general I am not sure if the tc-bpf can assume anything
>>>> in the skb->tstamp now.
>>
>> But today that's either only 0 or real via __net_timestamp() if skb->tstamp is
>> read at bpf@ingress@veth@host, no?
> I think I was trying to say the CLOCK_REALTIME in __sk_buff->tstamp
> is not practically useful in bpf@ingress other than an increasing number.
> No easy way to get the 'now' in CLOCK_REALTIME to compare with
> in order to learn if it is future or current time.  Setting
> it based on bpf_ktime_get_ns() in MONO is likely broken currently
> regardless of the skb is forwarded or delivered locally.
> 
> We have a use case that wants to change the forwarded EDT
> in bpf@ingress@veth@host before forwarding.  If it needs to
> keep __sk_buff->tstamp as the 'now' CLOCK_REALTIME in ingress,
> it needs to provide a separate way for the bpf to check/change
> the forwarded EDT.
> 
> Daniel, do you have suggestion on where to temporarily store
> the forwarded EDT so that the bpf@ingress can access?

Hm, was thinking maybe moving skb->skb_mstamp_ns into the shared info as
in skb_hwtstamps(skb)->skb_mstamp_ns could work. In other words, as a union
with hwtstamp to not bloat it further. And TCP stack as well as everything
else (like sch_fq) could switch to it natively (hwtstamp might only be used
on RX or TX completion from driver side if I'm not mistaken).

But then while this would solve the netns transfer, we would run into the
/same/ issue again when implementing a hairpinning LB where we loop from RX
to TX given this would have to be cleared somewhere again if driver populates
hwtstamp, so not really feasible and bloating shared info with a second
tstamp would bump it by one cacheline. :(

A cleaner BUT still non-generic solution compared to the previous diff I could
think of might be the below. So no change in behavior in general, but if the
bpf@ingress@veth@host needs to access the original tstamp, it could do so
via existing mapping we already have in BPF, and then it could transfer it
for all or certain traffic (up to the prog) via BPF code setting ...

   skb->tstamp = skb->hwtstamp

... and do the redirect from there to the phys dev with BPF_F_KEEP_TSTAMP
flag. Minimal intrusive, but unfortunately only accessible for BPF. Maybe use
of skb_hwtstamps(skb)->nststamp could be extended though (?)

Thanks,
Daniel

  include/linux/skbuff.h   | 14 +++++++++++++-
  include/uapi/linux/bpf.h |  1 +
  net/core/filter.c        | 37 ++++++++++++++++++++++---------------
  net/core/skbuff.c        |  2 +-
  4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8cb7e697d47..b7c72fe7e1bb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -418,7 +418,10 @@ static inline bool skb_frag_must_loop(struct page *p)
   * &skb_shared_info. Use skb_hwtstamps() to get a pointer.
   */
  struct skb_shared_hwtstamps {
-	ktime_t	hwtstamp;
+	union {
+		ktime_t	hwtstamp;
+		ktime_t	nststamp;
+	};
  };

  /* Definitions for tx_flags in struct skb_shared_info */
@@ -3711,6 +3714,15 @@ int skb_mpls_dec_ttl(struct sk_buff *skb);
  struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
  			     gfp_t gfp);

+static inline void skb_xfer_tstamp(struct sk_buff *skb)
+{
+	/* initns might still need access to the original
+	 * skb->tstamp from a netns, e.g. for EDT.
+	 */
+	skb_hwtstamps(skb)->nststamp = skb->tstamp;
+	skb->tstamp = 0;
+}
+
  static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
  {
  	return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba5af15e25f5..12d10ab8bff7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5157,6 +5157,7 @@ enum {
  /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
  enum {
  	BPF_F_INGRESS			= (1ULL << 0),
+	BPF_F_KEEP_TSTAMP		= (1ULL << 1),
  };

  /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6102f093d59a..e233b998387c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2097,7 +2097,8 @@ static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
  	return ret;
  }

-static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
+static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb,
+			       bool keep_tstamp)
  {
  	int ret;

@@ -2108,7 +2109,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;
+	if (!keep_tstamp)
+		skb_xfer_tstamp(skb);

  	dev_xmit_recursion_inc();
  	ret = dev_queue_xmit(skb);
@@ -2136,7 +2138,8 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
  	skb_pop_mac_header(skb);
  	skb_reset_mac_len(skb);
  	return flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+	       __bpf_rx_skb_no_mac(dev, skb) :
+	       __bpf_tx_skb(dev, skb, flags & BPF_F_KEEP_TSTAMP);
  }

  static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
@@ -2150,7 +2153,8 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,

  	bpf_push_mac_rcsum(skb);
  	return flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+	       __bpf_rx_skb(dev, skb) :
+	       __bpf_tx_skb(dev, skb, flags & BPF_F_KEEP_TSTAMP);
  }

  static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
@@ -2177,7 +2181,6 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;

  	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
  		skb = skb_expand_head(skb, hh_len);
@@ -2275,7 +2278,6 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
  	}

  	skb->dev = dev;
-	skb->tstamp = 0;

  	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
  		skb = skb_expand_head(skb, hh_len);
@@ -2367,7 +2369,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
  #endif /* CONFIG_INET */

  static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
-				struct bpf_nh_params *nh)
+				struct bpf_nh_params *nh, bool keep_tstamp)
  {
  	struct ethhdr *ethh = eth_hdr(skb);

@@ -2380,6 +2382,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
  	skb_pull(skb, sizeof(*ethh));
  	skb_unset_mac_header(skb);
  	skb_reset_network_header(skb);
+	if (!keep_tstamp)
+		skb_xfer_tstamp(skb);

  	if (skb->protocol == htons(ETH_P_IP))
  		return __bpf_redirect_neigh_v4(skb, dev, nh);
@@ -2392,9 +2396,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,

  /* Internal, non-exposed redirect flags. */
  enum {
-	BPF_F_NEIGH	= (1ULL << 1),
-	BPF_F_PEER	= (1ULL << 2),
-	BPF_F_NEXTHOP	= (1ULL << 3),
+	BPF_F_NEIGH	= (1ULL << 2),
+	BPF_F_PEER	= (1ULL << 3),
+	BPF_F_NEXTHOP	= (1ULL << 4),
  #define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
  };

@@ -2468,8 +2472,9 @@ int skb_do_redirect(struct sk_buff *skb)
  		return -EAGAIN;
  	}
  	return flags & BPF_F_NEIGH ?
-	       __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
-				    &ri->nh : NULL) :
+	       __bpf_redirect_neigh(skb, dev,
+				    flags & BPF_F_NEXTHOP ? &ri->nh : NULL,
+				    flags & BPF_F_KEEP_TSTAMP) :
  	       __bpf_redirect(skb, dev, flags);
  out_drop:
  	kfree_skb(skb);
@@ -2480,7 +2485,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
  {
  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

-	if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
+	if (unlikely(flags & (~(BPF_F_INGRESS | BPF_F_KEEP_TSTAMP) |
+			      BPF_F_REDIRECT_INTERNAL)))
  		return TC_ACT_SHOT;

  	ri->flags = flags;
@@ -2523,10 +2529,11 @@ BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
  {
  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

-	if (unlikely((plen && plen < sizeof(*params)) || flags))
+	if (unlikely((plen && plen < sizeof(*params)) ||
+		     (flags & ~BPF_F_KEEP_TSTAMP)))
  		return TC_ACT_SHOT;

-	ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
+	ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0) | flags;
  	ri->tgt_index = ifindex;

  	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ba2f38246f07..782b152a000c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5472,7 +5472,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)

  	ipvs_reset(skb);
  	skb->mark = 0;
-	skb->tstamp = 0;
+	skb_xfer_tstamp(skb);
  }
  EXPORT_SYMBOL_GPL(skb_scrub_packet);
Martin KaFai Lau Dec. 10, 2021, 1:37 a.m. UTC | #14
On Thu, Dec 09, 2021 at 01:58:52PM +0100, Daniel Borkmann wrote:
> > Daniel, do you have suggestion on where to temporarily store
> > the forwarded EDT so that the bpf@ingress can access?
> 
> Hm, was thinking maybe moving skb->skb_mstamp_ns into the shared info as
> in skb_hwtstamps(skb)->skb_mstamp_ns could work. In other words, as a union
> with hwtstamp to not bloat it further. And TCP stack as well as everything
> else (like sch_fq) could switch to it natively (hwtstamp might only be used
> on RX or TX completion from driver side if I'm not mistaken).
> 
> But then while this would solve the netns transfer, we would run into the
> /same/ issue again when implementing a hairpinning LB where we loop from RX
> to TX given this would have to be cleared somewhere again if driver populates
> hwtstamp, so not really feasible and bloating shared info with a second
> tstamp would bump it by one cacheline. :(
If the edt is set at skb_hwtstamps,
skb->tstamp probably needs to be re-populated for the bpf@tc-egress
but should be minor since there is a skb_at_tc_ingress() test.

It seems fq does not need shinfo now, so that will be an extra cacheline to
bring... hmm

> A cleaner BUT still non-generic solution compared to the previous diff I could
> think of might be the below. So no change in behavior in general, but if the
> bpf@ingress@veth@host needs to access the original tstamp, it could do so
> via existing mapping we already have in BPF, and then it could transfer it
> for all or certain traffic (up to the prog) via BPF code setting ...
> 
>   skb->tstamp = skb->hwtstamp
> 
> ... and do the redirect from there to the phys dev with BPF_F_KEEP_TSTAMP
> flag. Minimal intrusive, but unfortunately only accessible for BPF. Maybe use
> of skb_hwtstamps(skb)->nststamp could be extended though (?)
I like the idea of the possibility in temporarily storing a future mono EDT
in skb_shared_hwtstamps.

It may open up some possibilities.  Not sure how that may look like yet
but I will try to develop on this.

I may have to separate the fwd-edt problem from __sk_buff->tstamp accessibility
@ingress to keep it simple first.
will try to make it generic also before scaling back to a bpf-specific solution.

Thanks for the code and the idea !
Daniel Borkmann Dec. 10, 2021, 10:08 a.m. UTC | #15
On 12/10/21 2:37 AM, Martin KaFai Lau wrote:
> On Thu, Dec 09, 2021 at 01:58:52PM +0100, Daniel Borkmann wrote:
>>> Daniel, do you have suggestion on where to temporarily store
>>> the forwarded EDT so that the bpf@ingress can access?
>>
>> Hm, was thinking maybe moving skb->skb_mstamp_ns into the shared info as
>> in skb_hwtstamps(skb)->skb_mstamp_ns could work. In other words, as a union
>> with hwtstamp to not bloat it further. And TCP stack as well as everything
>> else (like sch_fq) could switch to it natively (hwtstamp might only be used
>> on RX or TX completion from driver side if I'm not mistaken).
>>
>> But then while this would solve the netns transfer, we would run into the
>> /same/ issue again when implementing a hairpinning LB where we loop from RX
>> to TX given this would have to be cleared somewhere again if driver populates
>> hwtstamp, so not really feasible and bloating shared info with a second
>> tstamp would bump it by one cacheline. :(
> If the edt is set at skb_hwtstamps,
> skb->tstamp probably needs to be re-populated for the bpf@tc-egress
> but should be minor since there is a skb_at_tc_ingress() test.
> 
> It seems fq does not need shinfo now, so that will be an extra cacheline to
> bring... hmm

Right. :/ The other thing I was wondering (but haven't looked enough into the
code yet whether feasible or not) ... maybe skb_hwtstamps(skb)->hwtstamp could
be changed to cover both hw & sw ingress tstamp (meaning, if nic doesn't provide
it, then we fall back to the sw one and __net_timestamp() stores it there, too)
whereas skb->tstamp would always concern an egress tstamp. However, it might
result in quite a lot of churn given the wider-spread use, but more importantly,
performance implications are also not quite clear as you mentioned above wrt
extra cache miss.

>> A cleaner BUT still non-generic solution compared to the previous diff I could
>> think of might be the below. So no change in behavior in general, but if the
>> bpf@ingress@veth@host needs to access the original tstamp, it could do so
>> via existing mapping we already have in BPF, and then it could transfer it
>> for all or certain traffic (up to the prog) via BPF code setting ...
>>
>>    skb->tstamp = skb->hwtstamp
>>
>> ... and do the redirect from there to the phys dev with BPF_F_KEEP_TSTAMP
>> flag. Minimal intrusive, but unfortunately only accessible for BPF. Maybe use
>> of skb_hwtstamps(skb)->nststamp could be extended though (?)
> I like the idea of the possibility in temporarily storing a future mono EDT
> in skb_shared_hwtstamps.
> 
> It may open up some possibilities.  Not sure how that may look like yet
> but I will try to develop on this.

Ok! One thing I noticed later in the diff, that for the ingressing direction
aka phys -> host veth -> netns veth, we also do the skb_xfer_tstamp() switch
and might override the one stored from driver with potentially the one from
__net_timestamp(), but maybe for netns'es that's acceptable (perhaps a test
for existing skb->sk owner before skb_xfer_tstamp() could do the trick..).

> I may have to separate the fwd-edt problem from __sk_buff->tstamp accessibility
> @ingress to keep it simple first.
> will try to make it generic also before scaling back to a bpf-specific solution.

Yeah sounds good, if we can solve it generically, even better!

> Thanks for the code and the idea !
Thanks,
Daniel
Eric Dumazet Dec. 10, 2021, 10:19 a.m. UTC | #16
On Wed, Dec 8, 2021 at 12:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Wed, Dec 08, 2021 at 10:27:51AM -0800, Eric Dumazet wrote:
> > On Wed, Dec 8, 2021 at 12:30 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > > For non bpf ingress, hmmm.... yeah, not sure if it is indeed an issue :/
> > > may be save the tx tstamp first and then temporarily restamp with __net_timestamp()
> >
> > Martin, have you looked at time namespaces (CLONE_NEWTIME) ?
> >
> > Perhaps we need to have more than one bit to describe time bases.
> My noob understanding is it only affects the time returning
> to the user in the syscall.  Could you explain how that
> may affect the time in skb->tstamp?

I would think it should affect timestamps (rx/tx network ones),
otherwise user applications would be broken ?
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b609bdc5398b..bc4ae34c4e22 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -867,6 +867,7 @@  struct sk_buff {
 	__u8			decrypted:1;
 #endif
 	__u8			slow_gro:1;
+	__u8			fwd_tstamp:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -3806,9 +3807,12 @@  static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
 }
 
 void skb_init(void);
+void net_timestamp_set(struct sk_buff *skb);
 
-static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
+static inline ktime_t skb_get_ktime(struct sk_buff *skb)
 {
+	if (unlikely(skb->fwd_tstamp))
+		net_timestamp_set(skb);
 	return ktime_mono_to_real_cond(skb->tstamp);
 }
 
@@ -3821,13 +3825,13 @@  static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
  *	This function converts the offset back to a struct timeval and stores
  *	it in stamp.
  */
-static inline void skb_get_timestamp(const struct sk_buff *skb,
+static inline void skb_get_timestamp(struct sk_buff *skb,
 				     struct __kernel_old_timeval *stamp)
 {
 	*stamp = ns_to_kernel_old_timeval(skb_get_ktime(skb));
 }
 
-static inline void skb_get_new_timestamp(const struct sk_buff *skb,
+static inline void skb_get_new_timestamp(struct sk_buff *skb,
 					 struct __kernel_sock_timeval *stamp)
 {
 	struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
@@ -3836,7 +3840,7 @@  static inline void skb_get_new_timestamp(const struct sk_buff *skb,
 	stamp->tv_usec = ts.tv_nsec / 1000;
 }
 
-static inline void skb_get_timestampns(const struct sk_buff *skb,
+static inline void skb_get_timestampns(struct sk_buff *skb,
 				       struct __kernel_old_timespec *stamp)
 {
 	struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
@@ -3845,7 +3849,7 @@  static inline void skb_get_timestampns(const struct sk_buff *skb,
 	stamp->tv_nsec = ts.tv_nsec;
 }
 
-static inline void skb_get_new_timestampns(const struct sk_buff *skb,
+static inline void skb_get_new_timestampns(struct sk_buff *skb,
 					   struct __kernel_timespec *stamp)
 {
 	struct timespec64 ts = ktime_to_timespec64(skb_get_ktime(skb));
diff --git a/net/core/dev.c b/net/core/dev.c
index 4420086f3aeb..96cd31d9a359 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2058,12 +2058,14 @@  void net_disable_timestamp(void)
 }
 EXPORT_SYMBOL(net_disable_timestamp);
 
-static inline void net_timestamp_set(struct sk_buff *skb)
+void net_timestamp_set(struct sk_buff *skb)
 {
 	skb->tstamp = 0;
+	skb->fwd_tstamp = 0;
 	if (static_branch_unlikely(&netstamp_needed_key))
 		__net_timestamp(skb);
 }
+EXPORT_SYMBOL(net_timestamp_set);
 
 #define net_timestamp_check(COND, SKB)				\
 	if (static_branch_unlikely(&netstamp_needed_key)) {	\
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f091c7807a9e..181ddc989ead 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5295,8 +5295,12 @@  void skb_scrub_tstamp(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME))
+	if (sk && sk_fullsock(sk) && sock_flag(sk, SOCK_TXTIME)) {
 		skb->tstamp = 0;
+		skb->fwd_tstamp = 0;
+	} else if (skb->tstamp) {
+		skb->fwd_tstamp = 1;
+	}
 }
 
 /**