diff mbox series

[net-next,v3,02/14] net-timestamp: allow two features to work parallelly

Message ID 20241028110535.82999-3-kerneljasonxing@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net-timestamp: bpf extension to equip applications transparently | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 1 maintainers not CCed: horms@kernel.org
netdev/build_clang success Errors and warnings before: 27 this patch: 27
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2608 this patch: 2608
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 70 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 58 this patch: 59
netdev/source_inline success Was 0 now: 0

Commit Message

Jason Xing Oct. 28, 2024, 11:05 a.m. UTC
From: Jason Xing <kernelxing@tencent.com>

This patch has introduced a separate sk_tsflags_bpf for bpf
extension, which helps us let two feature work nearly at the
same time.

Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
other types, so in __skb_tstamp_tx() we are unable to know which
feature is turned on, unless we check each feature's own socket
flag field.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/sock.h |  1 +
 net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

Comments

Martin KaFai Lau Oct. 29, 2024, 11 p.m. UTC | #1
On 10/28/24 4:05 AM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
> 
> This patch has introduced a separate sk_tsflags_bpf for bpf
> extension, which helps us let two feature work nearly at the
> same time.
> 
> Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> other types, so in __skb_tstamp_tx() we are unable to know which
> feature is turned on, unless we check each feature's own socket
> flag field.
> 
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
>   include/net/sock.h |  1 +
>   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
>   2 files changed, 40 insertions(+)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7464e9f9f47c..5384f1e49f5c 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -445,6 +445,7 @@ struct sock {
>   	u32			sk_reserved_mem;
>   	int			sk_forward_alloc;
>   	u32			sk_tsflags;
> +	u32			sk_tsflags_bpf;
>   	__cacheline_group_end(sock_write_rxtx);
>   
>   	__cacheline_group_begin(sock_write_tx);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 1cf8416f4123..39309f75e105 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
>   }
>   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
>   
> +/* This function is used to test if application SO_TIMESTAMPING feature
> + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> + */
> +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> +{
> +	u32 testflag;
> +
> +	switch (tstype) {
> +	case SCM_TSTAMP_SCHED:
> +		testflag = SOF_TIMESTAMPING_TX_SCHED;
> +		break;
> +	case SCM_TSTAMP_SND:
> +		testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> +		break;
> +	case SCM_TSTAMP_ACK:
> +		testflag = SOF_TIMESTAMPING_TX_ACK;
> +		break;
> +	default:
> +		return false;
> +	}
> +	if (tsflags & testflag)
> +		return true;
> +
> +	return false;
> +}
> +
>   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>   				 const struct sk_buff *ack_skb,
>   				 struct skb_shared_hwtstamps *hwtstamps,
> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>   	u32 tsflags;
>   
>   	tsflags = READ_ONCE(sk->sk_tsflags);
> +	if (!sk_tstamp_tx_flags(sk, tsflags, tstype))

I still don't get this part since v2. How does it work with cmsg only 
SOF_TIMESTAMPING_TX_*?

I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx 
time stamp after this patch.

I am likely missing something
or v2 concluded that this behavior change is acceptable?

> +		return;
> +
>   	if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
>   	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
>   		return;
> @@ -5592,6 +5621,15 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>   	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
>   }
>   
> +static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype)
> +{
> +	u32 tsflags;
> +
> +	tsflags = READ_ONCE(sk->sk_tsflags_bpf);
> +	if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> +		return;
> +}
> +
>   void __skb_tstamp_tx(struct sk_buff *orig_skb,
>   		     const struct sk_buff *ack_skb,
>   		     struct skb_shared_hwtstamps *hwtstamps,
> @@ -5600,6 +5638,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
>   	if (!sk)
>   		return;
>   
> +	skb_tstamp_tx_output_bpf(sk, tstype);
>   	skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
>   }
>   EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
Jason Xing Oct. 30, 2024, 1:23 a.m. UTC | #2
On Wed, Oct 30, 2024 at 7:00 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/28/24 4:05 AM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > This patch has introduced a separate sk_tsflags_bpf for bpf
> > extension, which helps us let two feature work nearly at the
> > same time.
> >
> > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > other types, so in __skb_tstamp_tx() we are unable to know which
> > feature is turned on, unless we check each feature's own socket
> > flag field.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> >   include/net/sock.h |  1 +
> >   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 40 insertions(+)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 7464e9f9f47c..5384f1e49f5c 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -445,6 +445,7 @@ struct sock {
> >       u32                     sk_reserved_mem;
> >       int                     sk_forward_alloc;
> >       u32                     sk_tsflags;
> > +     u32                     sk_tsflags_bpf;
> >       __cacheline_group_end(sock_write_rxtx);
> >
> >       __cacheline_group_begin(sock_write_tx);
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 1cf8416f4123..39309f75e105 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> >   }
> >   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> >
> > +/* This function is used to test if application SO_TIMESTAMPING feature
> > + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> > + */
> > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> > +{
> > +     u32 testflag;
> > +
> > +     switch (tstype) {
> > +     case SCM_TSTAMP_SCHED:
> > +             testflag = SOF_TIMESTAMPING_TX_SCHED;
> > +             break;
> > +     case SCM_TSTAMP_SND:
> > +             testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > +             break;
> > +     case SCM_TSTAMP_ACK:
> > +             testflag = SOF_TIMESTAMPING_TX_ACK;
> > +             break;
> > +     default:
> > +             return false;
> > +     }
> > +     if (tsflags & testflag)
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> >   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >                                const struct sk_buff *ack_skb,
> >                                struct skb_shared_hwtstamps *hwtstamps,
> > @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >       u32 tsflags;
> >
> >       tsflags = READ_ONCE(sk->sk_tsflags);
> > +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>
> I still don't get this part since v2. How does it work with cmsg only
> SOF_TIMESTAMPING_TX_*?
>
> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> time stamp after this patch.
>
> I am likely missing something
> or v2 concluded that this behavior change is acceptable?

Sorry, I submitted this series accidentally removing one important
thing which is similar to what Vadim Fedorenko mentioned in the v1
[1]:
adding another member like sk_flags_bpf to handle the cmsg case.

Willem, would it be acceptable to add another field in struct sock to
help us recognise the case where BPF and cmsg works parallelly?

[1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/

Thanks,
Jason
Willem de Bruijn Oct. 30, 2024, 1:45 a.m. UTC | #3
Jason Xing wrote:
> On Wed, Oct 30, 2024 at 7:00 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/28/24 4:05 AM, Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > This patch has introduced a separate sk_tsflags_bpf for bpf
> > > extension, which helps us let two feature work nearly at the
> > > same time.
> > >
> > > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > > other types, so in __skb_tstamp_tx() we are unable to know which
> > > feature is turned on, unless we check each feature's own socket
> > > flag field.
> > >
> > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > ---
> > >   include/net/sock.h |  1 +
> > >   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> > >   2 files changed, 40 insertions(+)
> > >
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index 7464e9f9f47c..5384f1e49f5c 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -445,6 +445,7 @@ struct sock {
> > >       u32                     sk_reserved_mem;
> > >       int                     sk_forward_alloc;
> > >       u32                     sk_tsflags;
> > > +     u32                     sk_tsflags_bpf;
> > >       __cacheline_group_end(sock_write_rxtx);
> > >
> > >       __cacheline_group_begin(sock_write_tx);
> > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > index 1cf8416f4123..39309f75e105 100644
> > > --- a/net/core/skbuff.c
> > > +++ b/net/core/skbuff.c
> > > @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > >   }
> > >   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> > >
> > > +/* This function is used to test if application SO_TIMESTAMPING feature
> > > + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> > > + */
> > > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> > > +{
> > > +     u32 testflag;
> > > +
> > > +     switch (tstype) {
> > > +     case SCM_TSTAMP_SCHED:
> > > +             testflag = SOF_TIMESTAMPING_TX_SCHED;
> > > +             break;
> > > +     case SCM_TSTAMP_SND:
> > > +             testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > > +             break;
> > > +     case SCM_TSTAMP_ACK:
> > > +             testflag = SOF_TIMESTAMPING_TX_ACK;
> > > +             break;
> > > +     default:
> > > +             return false;
> > > +     }
> > > +     if (tsflags & testflag)
> > > +             return true;
> > > +
> > > +     return false;
> > > +}
> > > +
> > >   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > >                                const struct sk_buff *ack_skb,
> > >                                struct skb_shared_hwtstamps *hwtstamps,
> > > @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > >       u32 tsflags;
> > >
> > >       tsflags = READ_ONCE(sk->sk_tsflags);
> > > +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >
> > I still don't get this part since v2. How does it work with cmsg only
> > SOF_TIMESTAMPING_TX_*?
> >
> > I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > time stamp after this patch.
> >
> > I am likely missing something
> > or v2 concluded that this behavior change is acceptable?
> 
> Sorry, I submitted this series accidentally removing one important
> thing which is similar to what Vadim Fedorenko mentioned in the v1
> [1]:
> adding another member like sk_flags_bpf to handle the cmsg case.
> 
> Willem, would it be acceptable to add another field in struct sock to
> help us recognise the case where BPF and cmsg works parallelly?
> 
> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/

The current timestamp flags don't need a u32. Maybe just reserve a bit
for this purpose?
Jason Xing Oct. 30, 2024, 2:32 a.m. UTC | #4
On Wed, Oct 30, 2024 at 9:45 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Oct 30, 2024 at 7:00 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 10/28/24 4:05 AM, Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > This patch has introduced a separate sk_tsflags_bpf for bpf
> > > > extension, which helps us let two feature work nearly at the
> > > > same time.
> > > >
> > > > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > > > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > > > other types, so in __skb_tstamp_tx() we are unable to know which
> > > > feature is turned on, unless we check each feature's own socket
> > > > flag field.
> > > >
> > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > ---
> > > >   include/net/sock.h |  1 +
> > > >   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> > > >   2 files changed, 40 insertions(+)
> > > >
> > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > index 7464e9f9f47c..5384f1e49f5c 100644
> > > > --- a/include/net/sock.h
> > > > +++ b/include/net/sock.h
> > > > @@ -445,6 +445,7 @@ struct sock {
> > > >       u32                     sk_reserved_mem;
> > > >       int                     sk_forward_alloc;
> > > >       u32                     sk_tsflags;
> > > > +     u32                     sk_tsflags_bpf;
> > > >       __cacheline_group_end(sock_write_rxtx);
> > > >
> > > >       __cacheline_group_begin(sock_write_tx);
> > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > > index 1cf8416f4123..39309f75e105 100644
> > > > --- a/net/core/skbuff.c
> > > > +++ b/net/core/skbuff.c
> > > > @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > > >   }
> > > >   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> > > >
> > > > +/* This function is used to test if application SO_TIMESTAMPING feature
> > > > + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> > > > + */
> > > > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> > > > +{
> > > > +     u32 testflag;
> > > > +
> > > > +     switch (tstype) {
> > > > +     case SCM_TSTAMP_SCHED:
> > > > +             testflag = SOF_TIMESTAMPING_TX_SCHED;
> > > > +             break;
> > > > +     case SCM_TSTAMP_SND:
> > > > +             testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > > > +             break;
> > > > +     case SCM_TSTAMP_ACK:
> > > > +             testflag = SOF_TIMESTAMPING_TX_ACK;
> > > > +             break;
> > > > +     default:
> > > > +             return false;
> > > > +     }
> > > > +     if (tsflags & testflag)
> > > > +             return true;
> > > > +
> > > > +     return false;
> > > > +}
> > > > +
> > > >   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > >                                const struct sk_buff *ack_skb,
> > > >                                struct skb_shared_hwtstamps *hwtstamps,
> > > > @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > >       u32 tsflags;
> > > >
> > > >       tsflags = READ_ONCE(sk->sk_tsflags);
> > > > +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > >
> > > I still don't get this part since v2. How does it work with cmsg only
> > > SOF_TIMESTAMPING_TX_*?
> > >
> > > I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > > time stamp after this patch.
> > >
> > > I am likely missing something
> > > or v2 concluded that this behavior change is acceptable?
> >
> > Sorry, I submitted this series accidentally removing one important
> > thing which is similar to what Vadim Fedorenko mentioned in the v1
> > [1]:
> > adding another member like sk_flags_bpf to handle the cmsg case.
> >
> > Willem, would it be acceptable to add another field in struct sock to
> > help us recognise the case where BPF and cmsg works parallelly?
> >
> > [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
>
> The current timestamp flags don't need a u32. Maybe just reserve a bit
> for this purpose?

Sure. Good suggestion.

But I think only using one bit to reflect whether the sk->sk_tsflags
is used by normal or cmsg features is not enough. The existing
implementation in tcp_sendmsg_locked() doesn't override the
sk->sk_tsflags even the normal and cmsg features enabled parallelly.
It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
that, even if at some point users suddenly remove the cmsg use and
then the prior normal SO_TIMESTAMPING continues to work.

How about this, please see below:
For now, sk->sk_tsflags only uses 17 bits (see the last one
SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
(see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
said, we could reserve the highest four bits for cmsg use for the
moment. Four bits represents four points where we can record the
timestamp in the tx case.

Do you agree on this point?

Thanks,
Jason
Willem de Bruijn Oct. 30, 2024, 2:47 a.m. UTC | #5
Jason Xing wrote:
> On Wed, Oct 30, 2024 at 9:45 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > On Wed, Oct 30, 2024 at 7:00 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > > >
> > > > On 10/28/24 4:05 AM, Jason Xing wrote:
> > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > >
> > > > > This patch has introduced a separate sk_tsflags_bpf for bpf
> > > > > extension, which helps us let two feature work nearly at the
> > > > > same time.
> > > > >
> > > > > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > > > > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > > > > other types, so in __skb_tstamp_tx() we are unable to know which
> > > > > feature is turned on, unless we check each feature's own socket
> > > > > flag field.
> > > > >
> > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > ---
> > > > >   include/net/sock.h |  1 +
> > > > >   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> > > > >   2 files changed, 40 insertions(+)
> > > > >
> > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > index 7464e9f9f47c..5384f1e49f5c 100644
> > > > > --- a/include/net/sock.h
> > > > > +++ b/include/net/sock.h
> > > > > @@ -445,6 +445,7 @@ struct sock {
> > > > >       u32                     sk_reserved_mem;
> > > > >       int                     sk_forward_alloc;
> > > > >       u32                     sk_tsflags;
> > > > > +     u32                     sk_tsflags_bpf;
> > > > >       __cacheline_group_end(sock_write_rxtx);
> > > > >
> > > > >       __cacheline_group_begin(sock_write_tx);
> > > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > > > index 1cf8416f4123..39309f75e105 100644
> > > > > --- a/net/core/skbuff.c
> > > > > +++ b/net/core/skbuff.c
> > > > > @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > > > >   }
> > > > >   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> > > > >
> > > > > +/* This function is used to test if application SO_TIMESTAMPING feature
> > > > > + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> > > > > + */
> > > > > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> > > > > +{
> > > > > +     u32 testflag;
> > > > > +
> > > > > +     switch (tstype) {
> > > > > +     case SCM_TSTAMP_SCHED:
> > > > > +             testflag = SOF_TIMESTAMPING_TX_SCHED;
> > > > > +             break;
> > > > > +     case SCM_TSTAMP_SND:
> > > > > +             testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > > > > +             break;
> > > > > +     case SCM_TSTAMP_ACK:
> > > > > +             testflag = SOF_TIMESTAMPING_TX_ACK;
> > > > > +             break;
> > > > > +     default:
> > > > > +             return false;
> > > > > +     }
> > > > > +     if (tsflags & testflag)
> > > > > +             return true;
> > > > > +
> > > > > +     return false;
> > > > > +}
> > > > > +
> > > > >   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > >                                const struct sk_buff *ack_skb,
> > > > >                                struct skb_shared_hwtstamps *hwtstamps,
> > > > > @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > >       u32 tsflags;
> > > > >
> > > > >       tsflags = READ_ONCE(sk->sk_tsflags);
> > > > > +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > > >
> > > > I still don't get this part since v2. How does it work with cmsg only
> > > > SOF_TIMESTAMPING_TX_*?
> > > >
> > > > I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > > > time stamp after this patch.
> > > >
> > > > I am likely missing something
> > > > or v2 concluded that this behavior change is acceptable?
> > >
> > > Sorry, I submitted this series accidentally removing one important
> > > thing which is similar to what Vadim Fedorenko mentioned in the v1
> > > [1]:
> > > adding another member like sk_flags_bpf to handle the cmsg case.
> > >
> > > Willem, would it be acceptable to add another field in struct sock to
> > > help us recognise the case where BPF and cmsg works parallelly?
> > >
> > > [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> >
> > The current timestamp flags don't need a u32. Maybe just reserve a bit
> > for this purpose?
> 
> Sure. Good suggestion.
> 
> But I think only using one bit to reflect whether the sk->sk_tsflags
> is used by normal or cmsg features is not enough. The existing
> implementation in tcp_sendmsg_locked() doesn't override the
> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> that, even if at some point users suddenly remove the cmsg use and
> then the prior normal SO_TIMESTAMPING continues to work.
> 
> How about this, please see below:
> For now, sk->sk_tsflags only uses 17 bits (see the last one
> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> said, we could reserve the highest four bits for cmsg use for the
> moment. Four bits represents four points where we can record the
> timestamp in the tx case.
> 
> Do you agree on this point?

I don't follow.

I probably miss the entire point.

The goal for sockcm fields is to start with the sk field and
optionally override based on cmsg. This is what sockcm_init does for
tsflags.

This information is for the skb, so these are recording flags.

Why does the new datapath need to know whether features are enabled
through setsockopt or on a per-call basis with a cmsg?

The goal was always to keep the reporting flags per socket, but make
the recording flag per packet, mainly for sampling.
Jason Xing Oct. 30, 2024, 3:04 a.m. UTC | #6
On Wed, Oct 30, 2024 at 10:47 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Oct 30, 2024 at 9:45 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > On Wed, Oct 30, 2024 at 7:00 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > > > >
> > > > > On 10/28/24 4:05 AM, Jason Xing wrote:
> > > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > > >
> > > > > > This patch has introduced a separate sk_tsflags_bpf for bpf
> > > > > > extension, which helps us let two feature work nearly at the
> > > > > > same time.
> > > > > >
> > > > > > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > > > > > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > > > > > other types, so in __skb_tstamp_tx() we are unable to know which
> > > > > > feature is turned on, unless we check each feature's own socket
> > > > > > flag field.
> > > > > >
> > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > > ---
> > > > > >   include/net/sock.h |  1 +
> > > > > >   net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> > > > > >   2 files changed, 40 insertions(+)
> > > > > >
> > > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > > index 7464e9f9f47c..5384f1e49f5c 100644
> > > > > > --- a/include/net/sock.h
> > > > > > +++ b/include/net/sock.h
> > > > > > @@ -445,6 +445,7 @@ struct sock {
> > > > > >       u32                     sk_reserved_mem;
> > > > > >       int                     sk_forward_alloc;
> > > > > >       u32                     sk_tsflags;
> > > > > > +     u32                     sk_tsflags_bpf;
> > > > > >       __cacheline_group_end(sock_write_rxtx);
> > > > > >
> > > > > >       __cacheline_group_begin(sock_write_tx);
> > > > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > > > > index 1cf8416f4123..39309f75e105 100644
> > > > > > --- a/net/core/skbuff.c
> > > > > > +++ b/net/core/skbuff.c
> > > > > > @@ -5539,6 +5539,32 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
> > > > > >   }
> > > > > >   EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
> > > > > >
> > > > > > +/* This function is used to test if application SO_TIMESTAMPING feature
> > > > > > + * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
> > > > > > + */
> > > > > > +static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
> > > > > > +{
> > > > > > +     u32 testflag;
> > > > > > +
> > > > > > +     switch (tstype) {
> > > > > > +     case SCM_TSTAMP_SCHED:
> > > > > > +             testflag = SOF_TIMESTAMPING_TX_SCHED;
> > > > > > +             break;
> > > > > > +     case SCM_TSTAMP_SND:
> > > > > > +             testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
> > > > > > +             break;
> > > > > > +     case SCM_TSTAMP_ACK:
> > > > > > +             testflag = SOF_TIMESTAMPING_TX_ACK;
> > > > > > +             break;
> > > > > > +     default:
> > > > > > +             return false;
> > > > > > +     }
> > > > > > +     if (tsflags & testflag)
> > > > > > +             return true;
> > > > > > +
> > > > > > +     return false;
> > > > > > +}
> > > > > > +
> > > > > >   static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > > >                                const struct sk_buff *ack_skb,
> > > > > >                                struct skb_shared_hwtstamps *hwtstamps,
> > > > > > @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > > >       u32 tsflags;
> > > > > >
> > > > > >       tsflags = READ_ONCE(sk->sk_tsflags);
> > > > > > +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > > > >
> > > > > I still don't get this part since v2. How does it work with cmsg only
> > > > > SOF_TIMESTAMPING_TX_*?
> > > > >
> > > > > I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > > > > time stamp after this patch.
> > > > >
> > > > > I am likely missing something
> > > > > or v2 concluded that this behavior change is acceptable?
> > > >
> > > > Sorry, I submitted this series accidentally removing one important
> > > > thing which is similar to what Vadim Fedorenko mentioned in the v1
> > > > [1]:
> > > > adding another member like sk_flags_bpf to handle the cmsg case.
> > > >
> > > > Willem, would it be acceptable to add another field in struct sock to
> > > > help us recognise the case where BPF and cmsg works parallelly?
> > > >
> > > > [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> > >
> > > The current timestamp flags don't need a u32. Maybe just reserve a bit
> > > for this purpose?
> >
> > Sure. Good suggestion.
> >
> > But I think only using one bit to reflect whether the sk->sk_tsflags
> > is used by normal or cmsg features is not enough. The existing
> > implementation in tcp_sendmsg_locked() doesn't override the
> > sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> > It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> > that, even if at some point users suddenly remove the cmsg use and
> > then the prior normal SO_TIMESTAMPING continues to work.
> >
> > How about this, please see below:
> > For now, sk->sk_tsflags only uses 17 bits (see the last one
> > SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> > (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> > said, we could reserve the highest four bits for cmsg use for the
> > moment. Four bits represents four points where we can record the
> > timestamp in the tx case.
> >
> > Do you agree on this point?
>
> I don't follow.
>
> I probably miss the entire point.
>
> The goal for sockcm fields is to start with the sk field and
> optionally override based on cmsg. This is what sockcm_init does for
> tsflags.
>
> This information is for the skb, so these are recording flags.
>
> Why does the new datapath need to know whether features are enabled
> through setsockopt or on a per-call basis with a cmsg?
>
> The goal was always to keep the reporting flags per socket, but make
> the recording flag per packet, mainly for sampling.

If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
allow each feature to work independently.

How could it work? It relies on sk_tstamp_tx_flags() function in the
current patch: when we are in __skb_tstamp_tx(), we cannot know which
flags in each feature are set without fetching sk->sk_tsflags and
sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
record. To put it in a simple way, we're not sure if the user wants to
see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
if we hit this test statement "skb_shinfo(skb)->tx_flags &
SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
help us.

Thanks,
Jason
Martin KaFai Lau Oct. 30, 2024, 5:37 a.m. UTC | #7
On 10/29/24 8:04 PM, Jason Xing wrote:
>>>>>>>    static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>>>                                 const struct sk_buff *ack_skb,
>>>>>>>                                 struct skb_shared_hwtstamps *hwtstamps,
>>>>>>> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
>>>>>>>        u32 tsflags;
>>>>>>>
>>>>>>>        tsflags = READ_ONCE(sk->sk_tsflags);
>>>>>>> +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
>>>>>>
>>>>>> I still don't get this part since v2. How does it work with cmsg only
>>>>>> SOF_TIMESTAMPING_TX_*?
>>>>>>
>>>>>> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
>>>>>> time stamp after this patch.
>>>>>>
>>>>>> I am likely missing something
>>>>>> or v2 concluded that this behavior change is acceptable?
>>>>>
>>>>> Sorry, I submitted this series accidentally removing one important
>>>>> thing which is similar to what Vadim Fedorenko mentioned in the v1
>>>>> [1]:
>>>>> adding another member like sk_flags_bpf to handle the cmsg case.
>>>>>
>>>>> Willem, would it be acceptable to add another field in struct sock to
>>>>> help us recognise the case where BPF and cmsg works parallelly?
>>>>>
>>>>> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
>>>>
>>>> The current timestamp flags don't need a u32. Maybe just reserve a bit
>>>> for this purpose?
>>>
>>> Sure. Good suggestion.
>>>
>>> But I think only using one bit to reflect whether the sk->sk_tsflags
>>> is used by normal or cmsg features is not enough. The existing
>>> implementation in tcp_sendmsg_locked() doesn't override the
>>> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
>>> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
>>> that, even if at some point users suddenly remove the cmsg use and
>>> then the prior normal SO_TIMESTAMPING continues to work.
>>>
>>> How about this, please see below:
>>> For now, sk->sk_tsflags only uses 17 bits (see the last one
>>> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
>>> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
>>> said, we could reserve the highest four bits for cmsg use for the
>>> moment. Four bits represents four points where we can record the
>>> timestamp in the tx case.
>>>
>>> Do you agree on this point?
>>
>> I don't follow.
>>
>> I probably miss the entire point.
>>
>> The goal for sockcm fields is to start with the sk field and
>> optionally override based on cmsg. This is what sockcm_init does for
>> tsflags.
>>
>> This information is for the skb, so these are recording flags.
>>
>> Why does the new datapath need to know whether features are enabled
>> through setsockopt or on a per-call basis with a cmsg?
>>
>> The goal was always to keep the reporting flags per socket, but make
>> the recording flag per packet, mainly for sampling.
> 
> If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
> allow each feature to work independently.
> 
> How could it work? It relies on sk_tstamp_tx_flags() function in the
> current patch: when we are in __skb_tstamp_tx(), we cannot know which
> flags in each feature are set without fetching sk->sk_tsflags and
> sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
> record. To put it in a simple way, we're not sure if the user wants to
> see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
> if we hit this test statement "skb_shinfo(skb)->tx_flags &
> SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
> help us.

I also don't see how a new bit/integer in a sk can help to tell the per cmsg 
on/off. This cmsg may have tx timestamp on while the next cmsg can have it off.

There is still one bit in skb_shinfo(skb)->tx_flags. How about define a 
SKBTX_BPF for everything. imo, the fine control on 
SOF_TIMESTAMPING_TX_{SCHED,SOFTWARE} is not useful for bpf. Almost all of the 
time the bpf program wants all available time stamps (sched, software, and 
hwtstamp if the NIC has it). Since bpf is in the kernel, it is much cheaper 
because it does not need to do skb_alloc/clone and queue to the error queue.

I think the bpf prog needs to capture a timestamp at the sendmsg() time, so a 
bpf prog needs to be called at sendmsg(). Then it may as well allow the bpf 
prog@sendmsg() to decide if it needs to set the SKBTX_BPF bit in 
skb_shinfo(skb)->tx_flags or not.

TCP_SKB_CB(skb)->txstamp_ack can also work similarly. There is still unused bit 
in "struct tcp_skb_cb", so may be adding TCP_SKB_CB(skb)->bpf_txstamp_ack

Then there is no need to control SOF_TIMESTAMPING_TX_* through bpf_setsockopt(). 
It only needs one bpf specific socket option like bpf_setsockopt(SOL_SOCKET, 
BPF_TX_TIMESTAMPING) to guard if the bpf-prog@sendmsg() needs to be called or 
not. There are already other TCP_BPF_IW,TCP_BPF_SNDCWND_CLAMP,... specific 
socket options.

imo, this is a simpler interface and also gives the bpf prog per packet control 
at the same time.

[ This user space cmsg-only testing has to be in the selftests/bpf to show how 
it can work. ]

> 
> Thanks,
> Jason
Jason Xing Oct. 30, 2024, 6:42 a.m. UTC | #8
On Wed, Oct 30, 2024 at 1:37 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/29/24 8:04 PM, Jason Xing wrote:
> >>>>>>>    static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>>>                                 const struct sk_buff *ack_skb,
> >>>>>>>                                 struct skb_shared_hwtstamps *hwtstamps,
> >>>>>>> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> >>>>>>>        u32 tsflags;
> >>>>>>>
> >>>>>>>        tsflags = READ_ONCE(sk->sk_tsflags);
> >>>>>>> +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> >>>>>>
> >>>>>> I still don't get this part since v2. How does it work with cmsg only
> >>>>>> SOF_TIMESTAMPING_TX_*?
> >>>>>>
> >>>>>> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> >>>>>> time stamp after this patch.
> >>>>>>
> >>>>>> I am likely missing something
> >>>>>> or v2 concluded that this behavior change is acceptable?
> >>>>>
> >>>>> Sorry, I submitted this series accidentally removing one important
> >>>>> thing which is similar to what Vadim Fedorenko mentioned in the v1
> >>>>> [1]:
> >>>>> adding another member like sk_flags_bpf to handle the cmsg case.
> >>>>>
> >>>>> Willem, would it be acceptable to add another field in struct sock to
> >>>>> help us recognise the case where BPF and cmsg works parallelly?
> >>>>>
> >>>>> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> >>>>
> >>>> The current timestamp flags don't need a u32. Maybe just reserve a bit
> >>>> for this purpose?
> >>>
> >>> Sure. Good suggestion.
> >>>
> >>> But I think only using one bit to reflect whether the sk->sk_tsflags
> >>> is used by normal or cmsg features is not enough. The existing
> >>> implementation in tcp_sendmsg_locked() doesn't override the
> >>> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> >>> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> >>> that, even if at some point users suddenly remove the cmsg use and
> >>> then the prior normal SO_TIMESTAMPING continues to work.
> >>>
> >>> How about this, please see below:
> >>> For now, sk->sk_tsflags only uses 17 bits (see the last one
> >>> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> >>> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> >>> said, we could reserve the highest four bits for cmsg use for the
> >>> moment. Four bits represents four points where we can record the
> >>> timestamp in the tx case.
> >>>
> >>> Do you agree on this point?
> >>
> >> I don't follow.
> >>
> >> I probably miss the entire point.
> >>
> >> The goal for sockcm fields is to start with the sk field and
> >> optionally override based on cmsg. This is what sockcm_init does for
> >> tsflags.
> >>
> >> This information is for the skb, so these are recording flags.
> >>
> >> Why does the new datapath need to know whether features are enabled
> >> through setsockopt or on a per-call basis with a cmsg?
> >>
> >> The goal was always to keep the reporting flags per socket, but make
> >> the recording flag per packet, mainly for sampling.
> >
> > If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
> > allow each feature to work independently.
> >
> > How could it work? It relies on sk_tstamp_tx_flags() function in the
> > current patch: when we are in __skb_tstamp_tx(), we cannot know which
> > flags in each feature are set without fetching sk->sk_tsflags and
> > sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
> > record. To put it in a simple way, we're not sure if the user wants to
> > see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
> > if we hit this test statement "skb_shinfo(skb)->tx_flags &
> > SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
> > help us.
>
> I also don't see how a new bit/integer in a sk can help to tell the per cmsg
> on/off. This cmsg may have tx timestamp on while the next cmsg can have it off.

It's not hard to use it because we can clear every socket cmsg tsflags
when we're done the check in tcp_sendmsg_locked() if the cmsg feature
is not enabled. Then we can accurately know which timestamp should we
print in the tx path.

>
> There is still one bit in skb_shinfo(skb)->tx_flags. How about define a
> SKBTX_BPF for everything. imo, the fine control on
> SOF_TIMESTAMPING_TX_{SCHED,SOFTWARE} is not useful for bpf. Almost all of the
> time the bpf program wants all available time stamps (sched, software, and
> hwtstamp if the NIC has it).

Sorry, I really doubt that we can lose the fine control. I still
reckon that providing more options to users is a good way to go,
especially for some latency sensitive applications, enabling one or
two or three tx flags could lead to different performances. For the
users of SO_TIMESTAMPING, they use the feature very differently. Not
all users prefer to record everything.

> Since bpf is in the kernel, it is much cheaper
> because it does not need to do skb_alloc/clone and queue to the error queue.
>
> I think the bpf prog needs to capture a timestamp at the sendmsg() time, so a
> bpf prog needs to be called at sendmsg().

Agreed, I planned to implement this after this series.

> Then it may as well allow the bpf
> prog@sendmsg() to decide if it needs to set the SKBTX_BPF bit in
> skb_shinfo(skb)->tx_flags or not.
>
> TCP_SKB_CB(skb)->txstamp_ack can also work similarly. There is still unused bit
> in "struct tcp_skb_cb", so may be adding TCP_SKB_CB(skb)->bpf_txstamp_ack
>
> Then there is no need to control SOF_TIMESTAMPING_TX_* through bpf_setsockopt().
> It only needs one bpf specific socket option like bpf_setsockopt(SOL_SOCKET,
> BPF_TX_TIMESTAMPING) to guard if the bpf-prog@sendmsg() needs to be called or
> not. There are already other TCP_BPF_IW,TCP_BPF_SNDCWND_CLAMP,... specific
> socket options.
>
> imo, this is a simpler interface and also gives the bpf prog per packet control
> at the same time.

Very interesting idea, but the precondition is that we give up the
fine control...

Thanks,
Jason
Willem de Bruijn Oct. 30, 2024, 5:15 p.m. UTC | #9
Jason Xing wrote:
> On Wed, Oct 30, 2024 at 1:37 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/29/24 8:04 PM, Jason Xing wrote:
> > >>>>>>>    static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > >>>>>>>                                 const struct sk_buff *ack_skb,
> > >>>>>>>                                 struct skb_shared_hwtstamps *hwtstamps,
> > >>>>>>> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > >>>>>>>        u32 tsflags;
> > >>>>>>>
> > >>>>>>>        tsflags = READ_ONCE(sk->sk_tsflags);
> > >>>>>>> +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > >>>>>>
> > >>>>>> I still don't get this part since v2. How does it work with cmsg only
> > >>>>>> SOF_TIMESTAMPING_TX_*?
> > >>>>>>
> > >>>>>> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > >>>>>> time stamp after this patch.
> > >>>>>>
> > >>>>>> I am likely missing something
> > >>>>>> or v2 concluded that this behavior change is acceptable?
> > >>>>>
> > >>>>> Sorry, I submitted this series accidentally removing one important
> > >>>>> thing which is similar to what Vadim Fedorenko mentioned in the v1
> > >>>>> [1]:
> > >>>>> adding another member like sk_flags_bpf to handle the cmsg case.
> > >>>>>
> > >>>>> Willem, would it be acceptable to add another field in struct sock to
> > >>>>> help us recognise the case where BPF and cmsg works parallelly?
> > >>>>>
> > >>>>> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> > >>>>
> > >>>> The current timestamp flags don't need a u32. Maybe just reserve a bit
> > >>>> for this purpose?
> > >>>
> > >>> Sure. Good suggestion.
> > >>>
> > >>> But I think only using one bit to reflect whether the sk->sk_tsflags
> > >>> is used by normal or cmsg features is not enough. The existing
> > >>> implementation in tcp_sendmsg_locked() doesn't override the
> > >>> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> > >>> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> > >>> that, even if at some point users suddenly remove the cmsg use and
> > >>> then the prior normal SO_TIMESTAMPING continues to work.
> > >>>
> > >>> How about this, please see below:
> > >>> For now, sk->sk_tsflags only uses 17 bits (see the last one
> > >>> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> > >>> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> > >>> said, we could reserve the highest four bits for cmsg use for the
> > >>> moment. Four bits represents four points where we can record the
> > >>> timestamp in the tx case.
> > >>>
> > >>> Do you agree on this point?
> > >>
> > >> I don't follow.
> > >>
> > >> I probably miss the entire point.
> > >>
> > >> The goal for sockcm fields is to start with the sk field and
> > >> optionally override based on cmsg. This is what sockcm_init does for
> > >> tsflags.
> > >>
> > >> This information is for the skb, so these are recording flags.
> > >>
> > >> Why does the new datapath need to know whether features are enabled
> > >> through setsockopt or on a per-call basis with a cmsg?
> > >>
> > >> The goal was always to keep the reporting flags per socket, but make
> > >> the recording flag per packet, mainly for sampling.
> > >
> > > If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
> > > allow each feature to work independently.
> > >
> > > How could it work? It relies on sk_tstamp_tx_flags() function in the
> > > current patch: when we are in __skb_tstamp_tx(), we cannot know which
> > > flags in each feature are set without fetching sk->sk_tsflags and
> > > sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
> > > record. To put it in a simple way, we're not sure if the user wants to
> > > see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
> > > if we hit this test statement "skb_shinfo(skb)->tx_flags &
> > > SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
> > > help us.
> >
> > I also don't see how a new bit/integer in a sk can help to tell the per cmsg
> > on/off. This cmsg may have tx timestamp on while the next cmsg can have it off.
> 
> It's not hard to use it because we can clear every socket cmsg tsflags
> when we're done the check in tcp_sendmsg_locked() if the cmsg feature
> is not enabled. Then we can accurately know which timestamp should we
> print in the tx path.
> 
> >
> > There is still one bit in skb_shinfo(skb)->tx_flags. How about define a
> > SKBTX_BPF for everything. imo, the fine control on
> > SOF_TIMESTAMPING_TX_{SCHED,SOFTWARE} is not useful for bpf. Almost all of the
> > time the bpf program wants all available time stamps (sched, software, and
> > hwtstamp if the NIC has it).

I like the approach of just calling BPF on every hook. Assuming that
the call is very cheap, which AFAIK is true.

In that case we don't need complex branching in C to optionally skip
this step, as we do for reporting to userspace.

All the logic and complexity is in the BPF program itself.

We obviously then let go of the goal to model the BPF API close to the
existing SO_TIMESTAMPING API. Though I advocated for keeping them
aligned, I also think we should just tailor it to what makes most
sense in the BPF space.
 
> Sorry, I really doubt that we can lose the fine control. 

Since BPF is called at each reporting point, no control is lost,
actually.

> I still
> reckon that providing more options to users is a good way to go,
> especially for some latency sensitive applications, enabling one or
> two or three tx flags could lead to different performances. For the
> users of SO_TIMESTAMPING, they use the feature very differently. Not
> all users prefer to record everything.
> 
> > Since bpf is in the kernel, it is much cheaper
> > because it does not need to do skb_alloc/clone and queue to the error queue.
> >
> > I think the bpf prog needs to capture a timestamp at the sendmsg() time, so a
> > bpf prog needs to be called at sendmsg().
> 
> Agreed, I planned to implement this after this series.
> 
> > Then it may as well allow the bpf
> > prog@sendmsg() to decide if it needs to set the SKBTX_BPF bit in
> > skb_shinfo(skb)->tx_flags or not.
> >
> > TCP_SKB_CB(skb)->txstamp_ack can also work similarly. There is still unused bit
> > in "struct tcp_skb_cb", so may be adding TCP_SKB_CB(skb)->bpf_txstamp_ack
> >
> > Then there is no need to control SOF_TIMESTAMPING_TX_* through bpf_setsockopt().
> > It only needs one bpf specific socket option like bpf_setsockopt(SOL_SOCKET,
> > BPF_TX_TIMESTAMPING) to guard if the bpf-prog@sendmsg() needs to be called or
> > not. There are already other TCP_BPF_IW,TCP_BPF_SNDCWND_CLAMP,... specific
> > socket options.
> >
> > imo, this is a simpler interface and also gives the bpf prog per packet control
> > at the same time.
> 
> Very interesting idea, but the precondition is that we give up the
> fine control...
> 
> Thanks,
> Jason
Jason Xing Oct. 30, 2024, 11:54 p.m. UTC | #10
On Thu, Oct 31, 2024 at 1:15 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Oct 30, 2024 at 1:37 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 10/29/24 8:04 PM, Jason Xing wrote:
> > > >>>>>>>    static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > >>>>>>>                                 const struct sk_buff *ack_skb,
> > > >>>>>>>                                 struct skb_shared_hwtstamps *hwtstamps,
> > > >>>>>>> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > >>>>>>>        u32 tsflags;
> > > >>>>>>>
> > > >>>>>>>        tsflags = READ_ONCE(sk->sk_tsflags);
> > > >>>>>>> +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > > >>>>>>
> > > >>>>>> I still don't get this part since v2. How does it work with cmsg only
> > > >>>>>> SOF_TIMESTAMPING_TX_*?
> > > >>>>>>
> > > >>>>>> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > > >>>>>> time stamp after this patch.
> > > >>>>>>
> > > >>>>>> I am likely missing something
> > > >>>>>> or v2 concluded that this behavior change is acceptable?
> > > >>>>>
> > > >>>>> Sorry, I submitted this series accidentally removing one important
> > > >>>>> thing which is similar to what Vadim Fedorenko mentioned in the v1
> > > >>>>> [1]:
> > > >>>>> adding another member like sk_flags_bpf to handle the cmsg case.
> > > >>>>>
> > > >>>>> Willem, would it be acceptable to add another field in struct sock to
> > > >>>>> help us recognise the case where BPF and cmsg works parallelly?
> > > >>>>>
> > > >>>>> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> > > >>>>
> > > >>>> The current timestamp flags don't need a u32. Maybe just reserve a bit
> > > >>>> for this purpose?
> > > >>>
> > > >>> Sure. Good suggestion.
> > > >>>
> > > >>> But I think only using one bit to reflect whether the sk->sk_tsflags
> > > >>> is used by normal or cmsg features is not enough. The existing
> > > >>> implementation in tcp_sendmsg_locked() doesn't override the
> > > >>> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> > > >>> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> > > >>> that, even if at some point users suddenly remove the cmsg use and
> > > >>> then the prior normal SO_TIMESTAMPING continues to work.
> > > >>>
> > > >>> How about this, please see below:
> > > >>> For now, sk->sk_tsflags only uses 17 bits (see the last one
> > > >>> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> > > >>> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> > > >>> said, we could reserve the highest four bits for cmsg use for the
> > > >>> moment. Four bits represents four points where we can record the
> > > >>> timestamp in the tx case.
> > > >>>
> > > >>> Do you agree on this point?
> > > >>
> > > >> I don't follow.
> > > >>
> > > >> I probably miss the entire point.
> > > >>
> > > >> The goal for sockcm fields is to start with the sk field and
> > > >> optionally override based on cmsg. This is what sockcm_init does for
> > > >> tsflags.
> > > >>
> > > >> This information is for the skb, so these are recording flags.
> > > >>
> > > >> Why does the new datapath need to know whether features are enabled
> > > >> through setsockopt or on a per-call basis with a cmsg?
> > > >>
> > > >> The goal was always to keep the reporting flags per socket, but make
> > > >> the recording flag per packet, mainly for sampling.
> > > >
> > > > If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
> > > > allow each feature to work independently.
> > > >
> > > > How could it work? It relies on sk_tstamp_tx_flags() function in the
> > > > current patch: when we are in __skb_tstamp_tx(), we cannot know which
> > > > flags in each feature are set without fetching sk->sk_tsflags and
> > > > sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
> > > > record. To put it in a simple way, we're not sure if the user wants to
> > > > see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
> > > > if we hit this test statement "skb_shinfo(skb)->tx_flags &
> > > > SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
> > > > help us.
> > >
> > > I also don't see how a new bit/integer in a sk can help to tell the per cmsg
> > > on/off. This cmsg may have tx timestamp on while the next cmsg can have it off.
> >
> > It's not hard to use it because we can clear every socket cmsg tsflags
> > when we're done the check in tcp_sendmsg_locked() if the cmsg feature
> > is not enabled. Then we can accurately know which timestamp should we
> > print in the tx path.
> >
> > >
> > > There is still one bit in skb_shinfo(skb)->tx_flags. How about define a
> > > SKBTX_BPF for everything. imo, the fine control on
> > > SOF_TIMESTAMPING_TX_{SCHED,SOFTWARE} is not useful for bpf. Almost all of the
> > > time the bpf program wants all available time stamps (sched, software, and
> > > hwtstamp if the NIC has it).
>
> I like the approach of just calling BPF on every hook. Assuming that
> the call is very cheap, which AFAIK is true.
>
> In that case we don't need complex branching in C to optionally skip
> this step, as we do for reporting to userspace.
>
> All the logic and complexity is in the BPF program itself.
>
> We obviously then let go of the goal to model the BPF API close to the
> existing SO_TIMESTAMPING API. Though I advocated for keeping them
> aligned, I also think we should just tailor it to what makes most
> sense in the BPF space.
>
> > Sorry, I really doubt that we can lose the fine control.
>
> Since BPF is called at each reporting point, no control is lost,
> actually.

Sorry, I still don't get it :( If there is something wrong with my
understanding, please correct me.

BPF is only called on every sock_opt point in this case, like
BPF_SOCK_OPS_TCP_CONNECT_CB, not every report point of
SO_TIMESTAMPING. If we add check to test if skb is set SKBTX_BPF in
__skb_tstamp_tx(), then at every point bpf will be called. But it's
different from SO_TIMESTAMPING drived by each bit (SCHED/TX_SOFTWARE)
to control each point. My question is if we would use SKBTX_BPF for
everything, how could we control and know when we hit
SCHED/TX_SOFTWARE/ACK time from the bpf programs' perspective? Only
one bit... It will print everything without the ability to control.

Then if we try the SKBTX_BPF approach, it seems we don't actually
insist on adding a test statement in __skb_tstamp_tx(). Instead, we
could add into more places (by only checking the SKBTX_BPF flag), say,
tcp_write_xmit(), right?

I'm not saying I'm opposed to this idea. Instead I think it's very
useful, just a few questions haunting me...

Thanks,
Jason
Jason Xing Oct. 31, 2024, 12:13 a.m. UTC | #11
On Thu, Oct 31, 2024 at 7:54 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Thu, Oct 31, 2024 at 1:15 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > On Wed, Oct 30, 2024 at 1:37 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > > >
> > > > On 10/29/24 8:04 PM, Jason Xing wrote:
> > > > >>>>>>>    static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > >>>>>>>                                 const struct sk_buff *ack_skb,
> > > > >>>>>>>                                 struct skb_shared_hwtstamps *hwtstamps,
> > > > >>>>>>> @@ -5549,6 +5575,9 @@ static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
> > > > >>>>>>>        u32 tsflags;
> > > > >>>>>>>
> > > > >>>>>>>        tsflags = READ_ONCE(sk->sk_tsflags);
> > > > >>>>>>> +     if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
> > > > >>>>>>
> > > > >>>>>> I still don't get this part since v2. How does it work with cmsg only
> > > > >>>>>> SOF_TIMESTAMPING_TX_*?
> > > > >>>>>>
> > > > >>>>>> I tried with "./txtimestamp -6 -c 1 -C -N -L ::1" and it does not return any tx
> > > > >>>>>> time stamp after this patch.
> > > > >>>>>>
> > > > >>>>>> I am likely missing something
> > > > >>>>>> or v2 concluded that this behavior change is acceptable?
> > > > >>>>>
> > > > >>>>> Sorry, I submitted this series accidentally removing one important
> > > > >>>>> thing which is similar to what Vadim Fedorenko mentioned in the v1
> > > > >>>>> [1]:
> > > > >>>>> adding another member like sk_flags_bpf to handle the cmsg case.
> > > > >>>>>
> > > > >>>>> Willem, would it be acceptable to add another field in struct sock to
> > > > >>>>> help us recognise the case where BPF and cmsg works parallelly?
> > > > >>>>>
> > > > >>>>> [1]: https://lore.kernel.org/all/662873cb-a897-464e-bdb3-edf01363c3b2@linux.dev/
> > > > >>>>
> > > > >>>> The current timestamp flags don't need a u32. Maybe just reserve a bit
> > > > >>>> for this purpose?
> > > > >>>
> > > > >>> Sure. Good suggestion.
> > > > >>>
> > > > >>> But I think only using one bit to reflect whether the sk->sk_tsflags
> > > > >>> is used by normal or cmsg features is not enough. The existing
> > > > >>> implementation in tcp_sendmsg_locked() doesn't override the
> > > > >>> sk->sk_tsflags even the normal and cmsg features enabled parallelly.
> > > > >>> It only overrides sockc.tsflags in tcp_sendmsg_locked(). Based on
> > > > >>> that, even if at some point users suddenly remove the cmsg use and
> > > > >>> then the prior normal SO_TIMESTAMPING continues to work.
> > > > >>>
> > > > >>> How about this, please see below:
> > > > >>> For now, sk->sk_tsflags only uses 17 bits (see the last one
> > > > >>> SOF_TIMESTAMPING_OPT_RX_FILTER). The cmsg feature only uses 4 flags
> > > > >>> (see SOF_TIMESTAMPING_TX_RECORD_MASK in __sock_cmsg_send()). With that
> > > > >>> said, we could reserve the highest four bits for cmsg use for the
> > > > >>> moment. Four bits represents four points where we can record the
> > > > >>> timestamp in the tx case.
> > > > >>>
> > > > >>> Do you agree on this point?
> > > > >>
> > > > >> I don't follow.
> > > > >>
> > > > >> I probably miss the entire point.
> > > > >>
> > > > >> The goal for sockcm fields is to start with the sk field and
> > > > >> optionally override based on cmsg. This is what sockcm_init does for
> > > > >> tsflags.
> > > > >>
> > > > >> This information is for the skb, so these are recording flags.
> > > > >>
> > > > >> Why does the new datapath need to know whether features are enabled
> > > > >> through setsockopt or on a per-call basis with a cmsg?
> > > > >>
> > > > >> The goal was always to keep the reporting flags per socket, but make
> > > > >> the recording flag per packet, mainly for sampling.
> > > > >
> > > > > If a user uses 1) cmsg feature, 2) bpf feature at the same time, we
> > > > > allow each feature to work independently.
> > > > >
> > > > > How could it work? It relies on sk_tstamp_tx_flags() function in the
> > > > > current patch: when we are in __skb_tstamp_tx(), we cannot know which
> > > > > flags in each feature are set without fetching sk->sk_tsflags and
> > > > > sk->sk_tsflags_bpf. Then we are able to know what timestamp we want to
> > > > > record. To put it in a simple way, we're not sure if the user wants to
> > > > > see a SCHED timestamp by using the cmsg feature in __skb_tstamp_tx()
> > > > > if we hit this test statement "skb_shinfo(skb)->tx_flags &
> > > > > SKBTX_SCHED_TSTAMP)". So we need those two socket tsflag fields to
> > > > > help us.
> > > >
> > > > I also don't see how a new bit/integer in a sk can help to tell the per cmsg
> > > > on/off. This cmsg may have tx timestamp on while the next cmsg can have it off.
> > >
> > > It's not hard to use it because we can clear every socket cmsg tsflags
> > > when we're done the check in tcp_sendmsg_locked() if the cmsg feature
> > > is not enabled. Then we can accurately know which timestamp should we
> > > print in the tx path.
> > >
> > > >
> > > > There is still one bit in skb_shinfo(skb)->tx_flags. How about define a
> > > > SKBTX_BPF for everything. imo, the fine control on
> > > > SOF_TIMESTAMPING_TX_{SCHED,SOFTWARE} is not useful for bpf. Almost all of the
> > > > time the bpf program wants all available time stamps (sched, software, and
> > > > hwtstamp if the NIC has it).
> >
> > I like the approach of just calling BPF on every hook. Assuming that
> > the call is very cheap, which AFAIK is true.
> >
> > In that case we don't need complex branching in C to optionally skip
> > this step, as we do for reporting to userspace.
> >
> > All the logic and complexity is in the BPF program itself.
> >
> > We obviously then let go of the goal to model the BPF API close to the
> > existing SO_TIMESTAMPING API. Though I advocated for keeping them
> > aligned, I also think we should just tailor it to what makes most
> > sense in the BPF space.
> >
> > > Sorry, I really doubt that we can lose the fine control.
> >
> > Since BPF is called at each reporting point, no control is lost,
> > actually.
>
> Sorry, I still don't get it :( If there is something wrong with my
> understanding, please correct me.
>
> BPF is only called on every sock_opt point in this case, like
> BPF_SOCK_OPS_TCP_CONNECT_CB, not every report point of
> SO_TIMESTAMPING. If we add check to test if skb is set SKBTX_BPF in
> __skb_tstamp_tx(), then at every point bpf will be called. But it's
> different from SO_TIMESTAMPING drived by each bit (SCHED/TX_SOFTWARE)
> to control each point. My question is if we would use SKBTX_BPF for
> everything, how could we control and know when we hit
> SCHED/TX_SOFTWARE/ACK time from the bpf programs' perspective? Only
> one bit... It will print everything without the ability to control.
>
> Then if we try the SKBTX_BPF approach, it seems we don't actually
> insist on adding a test statement in __skb_tstamp_tx(). Instead, we
> could add into more places (by only checking the SKBTX_BPF flag), say,
> tcp_write_xmit(), right?

I realized that we will have some new sock_opt flags like
TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
not... For each sock_opt point, they will be called without caring if
related flags in skb are set. Well, it's meaningless to add more
control of skb tsflags at each TS_xx_OPT_CB point.

Am I understanding in a correct way? Now, I'm totally fine with this great idea!

Thanks,
Jason
Martin KaFai Lau Oct. 31, 2024, 6:27 a.m. UTC | #12
On 10/30/24 5:13 PM, Jason Xing wrote:
> I realized that we will have some new sock_opt flags like
> TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> not... For each sock_opt point, they will be called without caring if
> related flags in skb are set. Well, it's meaningless to add more
> control of skb tsflags at each TS_xx_OPT_CB point.
> 
> Am I understanding in a correct way? Now, I'm totally fine with this great idea!
Yes, I think so.

The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3: 
SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would 
be quite wasteful to throw it away. ACK can be controlled by the 
TCP_SKB_CB(skb)->bpf_txstamp_ack.

Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING) 
comment. I think it may as well go back to use the "u8 
bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to 
enable/disable the timestamp related callback hook. May be add one 
BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.

For tx, one new hook should be at the sendmsg and should be around 
tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be 
similar to your patch. Add a new kfunc to set shinfo->tx_flags |= SKBTX_BPF 
and/or TCP_SKB_CB(skb)->bpf_txstamp_ack during sendmsg.


For rx, add one BPF_SOCK_OPS_RX_TIMESTAMPING_CB_FLAG. bpf_sock_ops_cb_flags 
needs to move from the tcp_sock to the sock because it will be used by UDP also. 
When enabling or disabling this flag, it needs to take care of the 
net_{enable,disable}_timestamp. The same for the __sk_destruct() also.
Jason Xing Oct. 31, 2024, 7:04 a.m. UTC | #13
On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/30/24 5:13 PM, Jason Xing wrote:
> > I realized that we will have some new sock_opt flags like
> > TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> > not... For each sock_opt point, they will be called without caring if
> > related flags in skb are set. Well, it's meaningless to add more
> > control of skb tsflags at each TS_xx_OPT_CB point.
> >
> > Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> Yes, I think so.
>
> The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> be quite wasteful to throw it away. ACK can be controlled by the
> TCP_SKB_CB(skb)->bpf_txstamp_ack.

Right, let me try this:)

> Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> comment. I think it may as well go back to use the "u8
> bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> enable/disable the timestamp related callback hook. May be add one
> BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.

bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
If that is so, it cannot be suitable for UDP.

I'm thinking of this solution:
1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
in udp_sendmsg()
...

>
> For tx, one new hook should be at the sendmsg and should be around
> tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be

I think there are two points we're supposed to record:
1) the moment tcp/udp_sendmsg() is triggered. It represents the syscall time.
2) another point in tcp_tx_timestamp(). It represents the timestamp of
the last skb in this sendmsg() call.
Users may happen to send a big packet.

> similar to your patch. Add a new kfunc to set shinfo->tx_flags |= SKBTX_BPF
> and/or TCP_SKB_CB(skb)->bpf_txstamp_ack during sendmsg.

Got it.

>
>
> For rx, add one BPF_SOCK_OPS_RX_TIMESTAMPING_CB_FLAG. bpf_sock_ops_cb_flags
> needs to move from the tcp_sock to the sock because it will be used by UDP also.
> When enabling or disabling this flag, it needs to take care of the
> net_{enable,disable}_timestamp. The same for the __sk_destruct() also.
>

I think if the solution I proposed as above is feasible, then we don't
have to move the tcp_sock which brings more extra work :)

Thanks,
Jason
Willem de Bruijn Oct. 31, 2024, 12:30 p.m. UTC | #14
Jason Xing wrote:
> On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/30/24 5:13 PM, Jason Xing wrote:
> > > I realized that we will have some new sock_opt flags like
> > > TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> > > not... For each sock_opt point, they will be called without caring if
> > > related flags in skb are set. Well, it's meaningless to add more
> > > control of skb tsflags at each TS_xx_OPT_CB point.
> > >
> > > Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> > Yes, I think so.
> >
> > The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> > SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> > be quite wasteful to throw it away. ACK can be controlled by the
> > TCP_SKB_CB(skb)->bpf_txstamp_ack.
> 
> Right, let me try this:)
> 
> > Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> > comment. I think it may as well go back to use the "u8
> > bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> > enable/disable the timestamp related callback hook. May be add one
> > BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
> 
> bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
> If that is so, it cannot be suitable for UDP.
> 
> I'm thinking of this solution:
> 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
> include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
> 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
> SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
> in udp_sendmsg()
> ...
> 
> >
> > For tx, one new hook should be at the sendmsg and should be around
> > tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be
> 
> I think there are two points we're supposed to record:
> 1) the moment tcp/udp_sendmsg() is triggered. It represents the syscall time.
> 2) another point in tcp_tx_timestamp(). It represents the timestamp of
> the last skb in this sendmsg() call.
> Users may happen to send a big packet.

Err on the side of fewer measurement points. It's always possible to
add more later, but not possible to remove them (depending on whether
BPF infra is ABI).

Overall great suggestion. Thanks a lot for sharing your BPF expertise
on this, Martin.

On using the raw seqno: this data is accessible to anyone root in
namespace (ns_capable) using packet sockets, so as long as it does not
open to more than that, it is logically equivalent to the current
setting.

With seqno the BPF program has to be careful that the same seqno can
be retransmitted, so for instance seeing an ACK before a (second) SND
must be anticipated. That is true for SO_TIMESTAMPING today too.

For datagrams (UDP as well as RAW and many non IP protocols), an
alternative still needs to be found.
Jason Xing Oct. 31, 2024, 1:50 p.m. UTC | #15
On Thu, Oct 31, 2024 at 8:30 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 10/30/24 5:13 PM, Jason Xing wrote:
> > > > I realized that we will have some new sock_opt flags like
> > > > TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> > > > not... For each sock_opt point, they will be called without caring if
> > > > related flags in skb are set. Well, it's meaningless to add more
> > > > control of skb tsflags at each TS_xx_OPT_CB point.
> > > >
> > > > Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> > > Yes, I think so.
> > >
> > > The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> > > SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> > > be quite wasteful to throw it away. ACK can be controlled by the
> > > TCP_SKB_CB(skb)->bpf_txstamp_ack.
> >
> > Right, let me try this:)
> >
> > > Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> > > comment. I think it may as well go back to use the "u8
> > > bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> > > enable/disable the timestamp related callback hook. May be add one
> > > BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
> >
> > bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
> > If that is so, it cannot be suitable for UDP.
> >
> > I'm thinking of this solution:
> > 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
> > include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
> > 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
> > SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> > 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
> > in udp_sendmsg()
> > ...
> >
> > >
> > > For tx, one new hook should be at the sendmsg and should be around
> > > tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be
> >
> > I think there are two points we're supposed to record:
> > 1) the moment tcp/udp_sendmsg() is triggered. It represents the syscall time.
> > 2) another point in tcp_tx_timestamp(). It represents the timestamp of
> > the last skb in this sendmsg() call.
> > Users may happen to send a big packet.
>
> Err on the side of fewer measurement points. It's always possible to
> add more later, but not possible to remove them (depending on whether
> BPF infra is ABI).
>
> Overall great suggestion. Thanks a lot for sharing your BPF expertise
> on this, Martin.
>
> On using the raw seqno: this data is accessible to anyone root in
> namespace (ns_capable) using packet sockets, so as long as it does not
> open to more than that, it is logically equivalent to the current
> setting.
>
> With seqno the BPF program has to be careful that the same seqno can
> be retransmitted, so for instance seeing an ACK before a (second) SND
> must be anticipated. That is true for SO_TIMESTAMPING today too.
>
> For datagrams (UDP as well as RAW and many non IP protocols), an
> alternative still needs to be found.

It seems that using the tskey for bpf extension is always correct and
easy to use.

Could we provide the tskey to users and then let users decide the
better way to identify the call of sendmsg. We could keep the
traditional use of tskey. If without it, people need to figure out a
good way and may find it difficult to use the bpf extension.

I will keep thinking of alternatives for UDP in the meantime.

Thanks,
Jason
Martin KaFai Lau Oct. 31, 2024, 11:26 p.m. UTC | #16
On 10/31/24 6:50 AM, Jason Xing wrote:
> On Thu, Oct 31, 2024 at 8:30 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>>
>> Jason Xing wrote:
>>> On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>
>>>> On 10/30/24 5:13 PM, Jason Xing wrote:
>>>>> I realized that we will have some new sock_opt flags like
>>>>> TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
>>>>> not... For each sock_opt point, they will be called without caring if
>>>>> related flags in skb are set. Well, it's meaningless to add more
>>>>> control of skb tsflags at each TS_xx_OPT_CB point.
>>>>>
>>>>> Am I understanding in a correct way? Now, I'm totally fine with this great idea!
>>>> Yes, I think so.
>>>>
>>>> The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
>>>> SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
>>>> be quite wasteful to throw it away. ACK can be controlled by the
>>>> TCP_SKB_CB(skb)->bpf_txstamp_ack.
>>>
>>> Right, let me try this:)
>>>
>>>> Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
>>>> comment. I think it may as well go back to use the "u8
>>>> bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
>>>> enable/disable the timestamp related callback hook. May be add one
>>>> BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
>>>
>>> bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
>>> If that is so, it cannot be suitable for UDP.
>>>
>>> I'm thinking of this solution:
>>> 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
>>> include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags

probably not in include/uapi/linux/net_tstamp.h. This flag can only be used by a 
bpf prog (meaning will not be used by user space syscall). More below.

>>> 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
>>> SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
>>> 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
>>> in udp_sendmsg()
>>> ...

Not sure how many churns/audits is needed to ensure the user space cannot 
set/clear the SOF_TIMESTAMPING_OPT_BPF bit in sk->sk_tsflags. Could be not much.

May be it is cleaner to leave the sk->sk_tsflags for user space only and having 
a separate field in "struct sock" to track bpf specific needs. More like your 
current sk_tsflags_bpf approach but I was thinking to reuse the 
bpf_sock_ops_cb_flags instead. e.g. "BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), 
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)" is used to check if it needs to call a bpf 
prog to decide if it needs to add tcp header option. Here we want to test if it 
should call a bpf prog to make a decision on tx timestamp on a skb.

The bpf_sock_ops_cb_flags can be moved from struct tcp_sock to struct sock. It 
is doable from the bpf side.

All that said, but, yes, it will add some TCP specific enum flag (e.g. 
BPF_SOCK_OPS_RTO_CB_FLAG) to the struct sock which will not be used by 
UDP/raw/...etc, so may be keep your current sk_tsflags_bpf approach but rename 
it to sk_bpf_cb_flags in struct "sock" so that it can be reused for other non 
tstamp ops in the future? probably a u8 is enough.

This optname is used by the bpf prog only and not usable by user space syscall. 
If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf 
specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets 
the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting 
the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET, 
SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt() 
alone without calling into sk_{set,get}sockopt. Add a new enum for the optval 
for the sk_bpf_cb_flags:

enum {
	SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
	SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
};


>>>
>>>>
>>>> For tx, one new hook should be at the sendmsg and should be around
>>>> tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be
>>>
>>> I think there are two points we're supposed to record:
>>> 1) the moment tcp/udp_sendmsg() is triggered. It represents the syscall time.
>>> 2) another point in tcp_tx_timestamp(). It represents the timestamp of
>>> the last skb in this sendmsg() call.
>>> Users may happen to send a big packet.

hmm... a big packet and sendmsg is blocked waiting for memory?

>>
>> Err on the side of fewer measurement points. It's always possible to
>> add more later, but not possible to remove them (depending on whether
>> BPF infra is ABI).

I also think it is better to start with tcp_tx_timestamp() alone first to keep 
the patch set simple now. The selftest prog can use a bpf fentry prog to trace 
the tcp_sendmsg_locked(). This can be revisited later if the bpf fentry prog is 
not enough.

>>
>> Overall great suggestion. Thanks a lot for sharing your BPF expertise
>> on this, Martin.

Thanks!

>>
>> On using the raw seqno: this data is accessible to anyone root in
>> namespace (ns_capable) using packet sockets, so as long as it does not
>> open to more than that, it is logically equivalent to the current
>> setting.
>>
>> With seqno the BPF program has to be careful that the same seqno can
>> be retransmitted, so for instance seeing an ACK before a (second) SND
>> must be anticipated. That is true for SO_TIMESTAMPING today too.

Ah. It will be a very useful comment to add to the selftests bpf prog.

>>
>> For datagrams (UDP as well as RAW and many non IP protocols), an
>> alternative still needs to be found.

In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags 
& SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) & 
SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set. If it is 
unlikely, may be we can just disallow bpf prog from directly setting 
skb_shinfo(skb)->tskey for this particular skb.

For all other cases, in __ip[6]_append_data, directly call a bpf prog and also 
pass the kernel decided tskey to the bpf prog.

The kernel passed tskey could be 0 (meaning the user space has not used it). The 
bpf prog can give one for the kernel to use. The bpf prog can store the 
sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct 
sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX 
instead) if it helps.

If the kernel passed tskey is not 0, the bpf prog can just use that one 
(assuming the user space is doing something sane, like the value in 
SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this 
is very unlikely also (?) but the bpf prog can probably detect this and choose 
to ignore this sk.

To solve the above unsupported corner cases, I think we can allow the bpf prog 
to store something in the shinfo->hwtstamps at the tx path. The bpf-only key 
could be one of the things to store there. Change __ip[6]_append_data to handle 
the shinfo->hwtstamps. I think allowing the bpf prog to write to the 
shinfo->hwtsatmps could be considered later when needed.

[ I may be off tomorrow, so reply could be slower. ]

> 
> It seems that using the tskey for bpf extension is always correct and
> easy to use.
> 
> Could we provide the tskey to users and then let users decide the
> better way to identify the call of sendmsg. We could keep the
> traditional use of tskey. If without it, people need to figure out a
> good way and may find it difficult to use the bpf extension.
> 
> I will keep thinking of alternatives for UDP in the meantime.
Jason Xing Nov. 1, 2024, 7:47 a.m. UTC | #17
On Fri, Nov 1, 2024 at 7:26 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/31/24 6:50 AM, Jason Xing wrote:
> > On Thu, Oct 31, 2024 at 8:30 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> >>
> >> Jason Xing wrote:
> >>> On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 10/30/24 5:13 PM, Jason Xing wrote:
> >>>>> I realized that we will have some new sock_opt flags like
> >>>>> TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> >>>>> not... For each sock_opt point, they will be called without caring if
> >>>>> related flags in skb are set. Well, it's meaningless to add more
> >>>>> control of skb tsflags at each TS_xx_OPT_CB point.
> >>>>>
> >>>>> Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> >>>> Yes, I think so.
> >>>>
> >>>> The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> >>>> SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> >>>> be quite wasteful to throw it away. ACK can be controlled by the
> >>>> TCP_SKB_CB(skb)->bpf_txstamp_ack.
> >>>
> >>> Right, let me try this:)
> >>>
> >>>> Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> >>>> comment. I think it may as well go back to use the "u8
> >>>> bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> >>>> enable/disable the timestamp related callback hook. May be add one
> >>>> BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
> >>>
> >>> bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
> >>> If that is so, it cannot be suitable for UDP.
> >>>
> >>> I'm thinking of this solution:
> >>> 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
> >>> include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
>
> probably not in include/uapi/linux/net_tstamp.h. This flag can only be used by a
> bpf prog (meaning will not be used by user space syscall). More below.
>
> >>> 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
> >>> SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> >>> 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
> >>> in udp_sendmsg()
> >>> ...
>
> Not sure how many churns/audits is needed to ensure the user space cannot
> set/clear the SOF_TIMESTAMPING_OPT_BPF bit in sk->sk_tsflags. Could be not much.
>
> May be it is cleaner to leave the sk->sk_tsflags for user space only and having
> a separate field in "struct sock" to track bpf specific needs. More like your
> current sk_tsflags_bpf approach but I was thinking to reuse the
> bpf_sock_ops_cb_flags instead. e.g. "BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)" is used to check if it needs to call a bpf
> prog to decide if it needs to add tcp header option. Here we want to test if it
> should call a bpf prog to make a decision on tx timestamp on a skb.
>
> The bpf_sock_ops_cb_flags can be moved from struct tcp_sock to struct sock. It
> is doable from the bpf side.
>
> All that said, but, yes, it will add some TCP specific enum flag (e.g.
> BPF_SOCK_OPS_RTO_CB_FLAG) to the struct sock which will not be used by
> UDP/raw/...etc, so may be keep your current sk_tsflags_bpf approach but rename
> it to sk_bpf_cb_flags in struct "sock" so that it can be reused for other non
> tstamp ops in the future? probably a u8 is enough.

Thanks so much for the details.

>
> This optname is used by the bpf prog only and not usable by user space syscall.
> If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf
> specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets
> the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting
> the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET,

> SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt()
> alone without calling into sk_{set,get}sockopt. Add a new enum for the optval
> for the sk_bpf_cb_flags:
>
> enum {
>         SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
>         SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
> };

Then it will involve more strange modification in sol_socket_sockopt()
to retrieve the opt value like what I did in V2 (see
https://lore.kernel.org/all/20241012040651.95616-3-kerneljasonxing@gmail.com/).
It's the reason why I did set and get operation in
sk_{set,get}sockopt() in this series to keep align with other flags.
Handling it in sk_{set,get}sockopt() is not a bad idea and easy to
implement, I feel.

Overall the suggestion looks good to me. I can give it a try :)

I'm thinking of another approach to using bpf_sock_ops_cb_flags_set()
instead of bpf_setsockopt() when sockops like
BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB is triggered. I can modify the
bpf_sock_ops_cb_flags_set like this:
diff --git a/net/core/filter.c b/net/core/filter.c
index 58761263176c..001140067c1a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5770,14 +5770,25 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct
bpf_sock_ops_kern *, bpf_sock,
           int, argval)
 {
        struct sock *sk = bpf_sock->sk;
-       int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+       int val = argval;

-       if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
+       if (!IS_ENABLED(CONFIG_INET))
                return -EINVAL;

-       tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+       if (sk_is_tcp(sk)) {
+               val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+               if (!sk_fullsock(sk))
+                       return -EINVAL;
+
+               tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+               val = argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+       } else {
+               sk->bpf_sock_ops_cb_flags = val;
+               val = argval &
(~(SK_BPF_CB_TX_TIEMSTAMPING|SK_BPF_CB_RX_TIEMSTAMPING));
+       }

-       return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+       return val;
 }

The BPF program uses bpf_sock_ops_cb_flags_set(skops,
SK_BPF_CB_FLAGS); to set the flags. Then we can implement a similar
function like BPF_SOCK_OPS_TEST_FLAG() in tcp_tx_timestamp() to check
if we are allowed to set shinfo->tx_flags |= SKBTX_BPF.

One advantage of this approach is that the bpf_sock_ops_cb_flags_set()
could be extended for more than only TCP in the future. Admittedly,
this will involve more work.

Which way would you prefer?

>
>
> >>>
> >>>>
> >>>> For tx, one new hook should be at the sendmsg and should be around
> >>>> tcp_tx_timestamp (?) for tcp. Another hook is __skb_tstamp_tx() which should be
> >>>
> >>> I think there are two points we're supposed to record:
> >>> 1) the moment tcp/udp_sendmsg() is triggered. It represents the syscall time.
> >>> 2) another point in tcp_tx_timestamp(). It represents the timestamp of
> >>> the last skb in this sendmsg() call.
> >>> Users may happen to send a big packet.
>
> hmm... a big packet and sendmsg is blocked waiting for memory?
>
> >>
> >> Err on the side of fewer measurement points. It's always possible to
> >> add more later, but not possible to remove them (depending on whether
> >> BPF infra is ABI).
>
> I also think it is better to start with tcp_tx_timestamp() alone first to keep
> the patch set simple now. The selftest prog can use a bpf fentry prog to trace
> the tcp_sendmsg_locked(). This can be revisited later if the bpf fentry prog is
> not enough.
>
> >>
> >> Overall great suggestion. Thanks a lot for sharing your BPF expertise
> >> on this, Martin.
>
> Thanks!
>
> >>
> >> On using the raw seqno: this data is accessible to anyone root in
> >> namespace (ns_capable) using packet sockets, so as long as it does not
> >> open to more than that, it is logically equivalent to the current
> >> setting.
> >>
> >> With seqno the BPF program has to be careful that the same seqno can
> >> be retransmitted, so for instance seeing an ACK before a (second) SND
> >> must be anticipated. That is true for SO_TIMESTAMPING today too.
>
> Ah. It will be a very useful comment to add to the selftests bpf prog.
>
> >>
> >> For datagrams (UDP as well as RAW and many non IP protocols), an
> >> alternative still needs to be found.
>
> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set. If it is
> unlikely, may be we can just disallow bpf prog from directly setting
> skb_shinfo(skb)->tskey for this particular skb.
>
> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> pass the kernel decided tskey to the bpf prog.

I'm a bit confused here. IIUC, we need to support the tskey like what
we did in this series to handle non TCP cases?

I think I can keep those three patches related to tskey to support
both TCP and non-TCP cases. Then let the bpf program decide to use
tskey.

>
> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> bpf prog can give one for the kernel to use. The bpf prog can store the
> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> instead) if it helps.
>
> If the kernel passed tskey is not 0, the bpf prog can just use that one
> (assuming the user space is doing something sane, like the value in
> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> is very unlikely also (?) but the bpf prog can probably detect this and choose
> to ignore this sk.
>
> To solve the above unsupported corner cases, I think we can allow the bpf prog
> to store something in the shinfo->hwtstamps at the tx path. The bpf-only key
> could be one of the things to store there. Change __ip[6]_append_data to handle
> the shinfo->hwtstamps. I think allowing the bpf prog to write to the
> shinfo->hwtsatmps could be considered later when needed.
>
> [ I may be off tomorrow, so reply could be slower. ]

Thanks for your help!

Thanks,
Jason
Willem de Bruijn Nov. 1, 2024, 1:32 p.m. UTC | #18
Martin KaFai Lau wrote:
> On 10/31/24 6:50 AM, Jason Xing wrote:
> > On Thu, Oct 31, 2024 at 8:30 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> >>
> >> Jason Xing wrote:
> >>> On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 10/30/24 5:13 PM, Jason Xing wrote:
> >>>>> I realized that we will have some new sock_opt flags like
> >>>>> TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> >>>>> not... For each sock_opt point, they will be called without caring if
> >>>>> related flags in skb are set. Well, it's meaningless to add more
> >>>>> control of skb tsflags at each TS_xx_OPT_CB point.
> >>>>>
> >>>>> Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> >>>> Yes, I think so.
> >>>>
> >>>> The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> >>>> SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> >>>> be quite wasteful to throw it away. ACK can be controlled by the
> >>>> TCP_SKB_CB(skb)->bpf_txstamp_ack.
> >>>
> >>> Right, let me try this:)
> >>>
> >>>> Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> >>>> comment. I think it may as well go back to use the "u8
> >>>> bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> >>>> enable/disable the timestamp related callback hook. May be add one
> >>>> BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
> >>>
> >>> bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
> >>> If that is so, it cannot be suitable for UDP.
> >>>
> >>> I'm thinking of this solution:
> >>> 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
> >>> include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
> 
> probably not in include/uapi/linux/net_tstamp.h. This flag can only be used by a 
> bpf prog (meaning will not be used by user space syscall). More below.
> 
> >>> 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
> >>> SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> >>> 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
> >>> in udp_sendmsg()
> >>> ...
> 
> Not sure how many churns/audits is needed to ensure the user space cannot 
> set/clear the SOF_TIMESTAMPING_OPT_BPF bit in sk->sk_tsflags. Could be not much.

Stores are limited to defined bits with the following in
sock_set_timestamping

        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;
 
> May be it is cleaner to leave the sk->sk_tsflags for user space only and having 
> a separate field in "struct sock" to track bpf specific needs. More like your 
> current sk_tsflags_bpf approach but I was thinking to reuse the 
> bpf_sock_ops_cb_flags instead. e.g. "BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), 
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)" is used to check if it needs to call a bpf 
> prog to decide if it needs to add tcp header option. Here we want to test if it 
> should call a bpf prog to make a decision on tx timestamp on a skb.
> 
> The bpf_sock_ops_cb_flags can be moved from struct tcp_sock to struct sock. It 
> is doable from the bpf side.
> 
> All that said, but, yes, it will add some TCP specific enum flag (e.g. 
> BPF_SOCK_OPS_RTO_CB_FLAG) to the struct sock which will not be used by 
> UDP/raw/...etc, so may be keep your current sk_tsflags_bpf approach but rename 
> it to sk_bpf_cb_flags in struct "sock" so that it can be reused for other non 
> tstamp ops in the future? probably a u8 is enough.
> 
> This optname is used by the bpf prog only and not usable by user space syscall. 
> If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf 
> specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets 
> the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting 
> the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET, 
> SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt() 
> alone without calling into sk_{set,get}sockopt. Add a new enum for the optval 
> for the sk_bpf_cb_flags:
> 
> enum {
> 	SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
> 	SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
> };
> 
> >>
> >> On using the raw seqno: this data is accessible to anyone root in
> >> namespace (ns_capable) using packet sockets, so as long as it does not
> >> open to more than that, it is logically equivalent to the current
> >> setting.
> >>
> >> With seqno the BPF program has to be careful that the same seqno can
> >> be retransmitted, so for instance seeing an ACK before a (second) SND
> >> must be anticipated. That is true for SO_TIMESTAMPING today too.
> 
> Ah. It will be a very useful comment to add to the selftests bpf prog.
> 
> >>
> >> For datagrams (UDP as well as RAW and many non IP protocols), an
> >> alternative still needs to be found.
> 
> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags 
> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) & 
> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.

This is not something to rely on. OPT_ID was added relatively recently.
Older applications, or any that just use the most straightforward API,
will not set this.

> If it is 
> unlikely, may be we can just disallow bpf prog from directly setting 
> skb_shinfo(skb)->tskey for this particular skb.
> 
> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also 
> pass the kernel decided tskey to the bpf prog.
> 
> The kernel passed tskey could be 0 (meaning the user space has not used it). The 
> bpf prog can give one for the kernel to use. The bpf prog can store the 
> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct 
> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX 
> instead) if it helps.
> 
> If the kernel passed tskey is not 0, the bpf prog can just use that one 
> (assuming the user space is doing something sane, like the value in 
> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this 
> is very unlikely also (?) but the bpf prog can probably detect this and choose 
> to ignore this sk.

If an applications uses OPT_ID, it is unlikely that they will toggle
the feature on and off on a per-packet basis. So in the common case
the program could use the user-set counter or use its own if userspace
does not enable the feature. In the rare case that an application does
intermittently set an OPT_ID, the numbering would be erratic. This
does mean that an actively malicious application could mess with admin
measurements.

> To solve the above unsupported corner cases, I think we can allow the bpf prog 
> to store something in the shinfo->hwtstamps at the tx path. The bpf-only key 
> could be one of the things to store there. Change __ip[6]_append_data to handle 
> the shinfo->hwtstamps. I think allowing the bpf prog to write to the 
> shinfo->hwtsatmps could be considered later when needed.
> 
> [ I may be off tomorrow, so reply could be slower. ]
> 
> > 
> > It seems that using the tskey for bpf extension is always correct and
> > easy to use.
> > 
> > Could we provide the tskey to users and then let users decide the
> > better way to identify the call of sendmsg. We could keep the
> > traditional use of tskey. If without it, people need to figure out a
> > good way and may find it difficult to use the bpf extension.
> > 
> > I will keep thinking of alternatives for UDP in the meantime.
Jason Xing Nov. 1, 2024, 4:08 p.m. UTC | #19
On Fri, Nov 1, 2024 at 9:32 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Martin KaFai Lau wrote:
> > On 10/31/24 6:50 AM, Jason Xing wrote:
> > > On Thu, Oct 31, 2024 at 8:30 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > >>
> > >> Jason Xing wrote:
> > >>> On Thu, Oct 31, 2024 at 2:27 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>>>
> > >>>> On 10/30/24 5:13 PM, Jason Xing wrote:
> > >>>>> I realized that we will have some new sock_opt flags like
> > >>>>> TS_SCHED_OPT_CB in patch 4, so we can control whether to print or
> > >>>>> not... For each sock_opt point, they will be called without caring if
> > >>>>> related flags in skb are set. Well, it's meaningless to add more
> > >>>>> control of skb tsflags at each TS_xx_OPT_CB point.
> > >>>>>
> > >>>>> Am I understanding in a correct way? Now, I'm totally fine with this great idea!
> > >>>> Yes, I think so.
> > >>>>
> > >>>> The sockops prog can choose to ignore any BPF_SOCK_OPS_TS_*_CB. The are only 3:
> > >>>> SCHED, SND, and ACK. If the hwtstamp is available from a NIC, I think it would
> > >>>> be quite wasteful to throw it away. ACK can be controlled by the
> > >>>> TCP_SKB_CB(skb)->bpf_txstamp_ack.
> > >>>
> > >>> Right, let me try this:)
> > >>>
> > >>>> Going back to my earlier bpf_setsockopt(SOL_SOCKET, BPF_TX_TIMESTAMPING)
> > >>>> comment. I think it may as well go back to use the "u8
> > >>>> bpf_sock_ops_cb_flags;" and use the bpf_sock_ops_cb_flags_set() helper to
> > >>>> enable/disable the timestamp related callback hook. May be add one
> > >>>> BPF_SOCK_OPS_TX_TIMESTAMPING_CB_FLAG.
> > >>>
> > >>> bpf_sock_ops_cb_flags this flag is only used in TCP condition, right?
> > >>> If that is so, it cannot be suitable for UDP.
> > >>>
> > >>> I'm thinking of this solution:
> > >>> 1) adding a new flag in SOF_TIMESTAMPING_OPT_BPF flag (in
> > >>> include/uapi/linux/net_tstamp.h) which can be used by sk->sk_tsflags
> >
> > probably not in include/uapi/linux/net_tstamp.h. This flag can only be used by a
> > bpf prog (meaning will not be used by user space syscall). More below.
> >
> > >>> 2) flags =   SOF_TIMESTAMPING_OPT_BPF;    bpf_setsockopt(skops,
> > >>> SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
> > >>> 3) test if sk->sk_tsflags has this new flag in tcp_tx_timestamp() or
> > >>> in udp_sendmsg()
> > >>> ...
> >
> > Not sure how many churns/audits is needed to ensure the user space cannot
> > set/clear the SOF_TIMESTAMPING_OPT_BPF bit in sk->sk_tsflags. Could be not much.
>
> Stores are limited to defined bits with the following in
> sock_set_timestamping
>
>         if (val & ~SOF_TIMESTAMPING_MASK)
>                 return -EINVAL;
>
> > May be it is cleaner to leave the sk->sk_tsflags for user space only and having
> > a separate field in "struct sock" to track bpf specific needs. More like your
> > current sk_tsflags_bpf approach but I was thinking to reuse the
> > bpf_sock_ops_cb_flags instead. e.g. "BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
> > BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)" is used to check if it needs to call a bpf
> > prog to decide if it needs to add tcp header option. Here we want to test if it
> > should call a bpf prog to make a decision on tx timestamp on a skb.
> >
> > The bpf_sock_ops_cb_flags can be moved from struct tcp_sock to struct sock. It
> > is doable from the bpf side.
> >
> > All that said, but, yes, it will add some TCP specific enum flag (e.g.
> > BPF_SOCK_OPS_RTO_CB_FLAG) to the struct sock which will not be used by
> > UDP/raw/...etc, so may be keep your current sk_tsflags_bpf approach but rename
> > it to sk_bpf_cb_flags in struct "sock" so that it can be reused for other non
> > tstamp ops in the future? probably a u8 is enough.
> >
> > This optname is used by the bpf prog only and not usable by user space syscall.
> > If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf
> > specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets
> > the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting
> > the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET,
> > SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt()
> > alone without calling into sk_{set,get}sockopt. Add a new enum for the optval
> > for the sk_bpf_cb_flags:
> >
> > enum {
> >       SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
> >       SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
> > };
> >
> > >>
> > >> On using the raw seqno: this data is accessible to anyone root in
> > >> namespace (ns_capable) using packet sockets, so as long as it does not
> > >> open to more than that, it is logically equivalent to the current
> > >> setting.
> > >>
> > >> With seqno the BPF program has to be careful that the same seqno can
> > >> be retransmitted, so for instance seeing an ACK before a (second) SND
> > >> must be anticipated. That is true for SO_TIMESTAMPING today too.
> >
> > Ah. It will be a very useful comment to add to the selftests bpf prog.
> >
> > >>
> > >> For datagrams (UDP as well as RAW and many non IP protocols), an
> > >> alternative still needs to be found.
> >
> > In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> > & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> > SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
>
> This is not something to rely on. OPT_ID was added relatively recently.
> Older applications, or any that just use the most straightforward API,
> will not set this.
>
> > If it is
> > unlikely, may be we can just disallow bpf prog from directly setting
> > skb_shinfo(skb)->tskey for this particular skb.
> >
> > For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> > pass the kernel decided tskey to the bpf prog.
> >
> > The kernel passed tskey could be 0 (meaning the user space has not used it). The
> > bpf prog can give one for the kernel to use. The bpf prog can store the
> > sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> > sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> > instead) if it helps.
> >
> > If the kernel passed tskey is not 0, the bpf prog can just use that one
> > (assuming the user space is doing something sane, like the value in
> > SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> > is very unlikely also (?) but the bpf prog can probably detect this and choose
> > to ignore this sk.
>
> If an applications uses OPT_ID, it is unlikely that they will toggle
> the feature on and off on a per-packet basis. So in the common case
> the program could use the user-set counter or use its own if userspace
> does not enable the feature. In the rare case that an application does
> intermittently set an OPT_ID, the numbering would be erratic. This
> does mean that an actively malicious application could mess with admin
> measurements.
>

Sorry, I got lost in this part. What would you recommend I should do
about OPT_ID in the next move? Should I keep those three OPT_ID
patches?

Thanks,
Jason
Willem de Bruijn Nov. 1, 2024, 4:39 p.m. UTC | #20
> > > >>
> > > >> For datagrams (UDP as well as RAW and many non IP protocols), an
> > > >> alternative still needs to be found.
> > >
> > > In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> > > & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> > > SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> >
> > This is not something to rely on. OPT_ID was added relatively recently.
> > Older applications, or any that just use the most straightforward API,
> > will not set this.
> >
> > > If it is
> > > unlikely, may be we can just disallow bpf prog from directly setting
> > > skb_shinfo(skb)->tskey for this particular skb.
> > >
> > > For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> > > pass the kernel decided tskey to the bpf prog.
> > >
> > > The kernel passed tskey could be 0 (meaning the user space has not used it). The
> > > bpf prog can give one for the kernel to use. The bpf prog can store the
> > > sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> > > sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> > > instead) if it helps.
> > >
> > > If the kernel passed tskey is not 0, the bpf prog can just use that one
> > > (assuming the user space is doing something sane, like the value in
> > > SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> > > is very unlikely also (?) but the bpf prog can probably detect this and choose
> > > to ignore this sk.
> >
> > If an applications uses OPT_ID, it is unlikely that they will toggle
> > the feature on and off on a per-packet basis. So in the common case
> > the program could use the user-set counter or use its own if userspace
> > does not enable the feature. In the rare case that an application does
> > intermittently set an OPT_ID, the numbering would be erratic. This
> > does mean that an actively malicious application could mess with admin
> > measurements.
> >
> 
> Sorry, I got lost in this part. What would you recommend I should do
> about OPT_ID in the next move? Should I keep those three OPT_ID
> patches?

I did not offer a suggestion. Just pointed out a constraint.
Simon Horman Nov. 2, 2024, 1:43 p.m. UTC | #21
On Mon, Oct 28, 2024 at 07:05:23PM +0800, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
> 
> This patch has introduced a separate sk_tsflags_bpf for bpf
> extension, which helps us let two feature work nearly at the
> same time.
> 
> Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> other types, so in __skb_tstamp_tx() we are unable to know which
> feature is turned on, unless we check each feature's own socket
> flag field.
> 
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
>  include/net/sock.h |  1 +
>  net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 40 insertions(+)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7464e9f9f47c..5384f1e49f5c 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -445,6 +445,7 @@ struct sock {
>  	u32			sk_reserved_mem;
>  	int			sk_forward_alloc;
>  	u32			sk_tsflags;
> +	u32			sk_tsflags_bpf;

Please add sk_tsflags_bpf to the Kernel doc for this structure.
Likewise for sk_tskey_bpf_offset which is added by a subsequent patch.

>  	__cacheline_group_end(sock_write_rxtx);
>  
>  	__cacheline_group_begin(sock_write_tx);

...
Jason Xing Nov. 3, 2024, 12:42 a.m. UTC | #22
On Sat, Nov 2, 2024 at 9:44 PM Simon Horman <horms@kernel.org> wrote:
>
> On Mon, Oct 28, 2024 at 07:05:23PM +0800, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > This patch has introduced a separate sk_tsflags_bpf for bpf
> > extension, which helps us let two feature work nearly at the
> > same time.
> >
> > Each feature will finally take effect on skb_shinfo(skb)->tx_flags,
> > say, tcp_tx_timestamp() for TCP or skb_setup_tx_timestamp() for
> > other types, so in __skb_tstamp_tx() we are unable to know which
> > feature is turned on, unless we check each feature's own socket
> > flag field.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> >  include/net/sock.h |  1 +
> >  net/core/skbuff.c  | 39 +++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 40 insertions(+)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 7464e9f9f47c..5384f1e49f5c 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -445,6 +445,7 @@ struct sock {
> >       u32                     sk_reserved_mem;
> >       int                     sk_forward_alloc;
> >       u32                     sk_tsflags;
> > +     u32                     sk_tsflags_bpf;
>
> Please add sk_tsflags_bpf to the Kernel doc for this structure.
> Likewise for sk_tskey_bpf_offset which is added by a subsequent patch.

Oh, thanks for reminding me!
Martin KaFai Lau Nov. 5, 2024, 1:50 a.m. UTC | #23
On 11/1/24 12:47 AM, Jason Xing wrote:

>> If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf
>> specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets
>> the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting
>> the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET,
> 
>> SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt()
>> alone without calling into sk_{set,get}sockopt. Add a new enum for the optval
>> for the sk_bpf_cb_flags:
>>
>> enum {
>>          SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
>>          SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
>> };
> 
> Then it will involve more strange modification in sol_socket_sockopt()
> to retrieve the opt value like what I did in V2 (see
> https://lore.kernel.org/all/20241012040651.95616-3-kerneljasonxing@gmail.com/).
> It's the reason why I did set and get operation in
> sk_{set,get}sockopt() in this series to keep align with other flags.
> Handling it in sk_{set,get}sockopt() is not a bad idea and easy to
> implement, I feel.

This will look very different now. It is handling bpf specific
optname and accessing the bpf specific field in sk->sk_bpf_cb_flags.

I really don't see why it needs to spill over to sk_{set,get}sockopt()
to handle sk->sk_bpf_cb_flags.

I have quickly typed out a small part of discussion so far.
It is likely buggy and not compiler tested. Pieces are still missing.
The bpf_tstamp_ack will need a few more changes in the
tcp_{input,output}.c. May be merging with the tstamp_ack to become
2 bits will be cleaner, not sure.

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39f1d16f3628..0b4913315854 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -488,6 +488,7 @@ enum {
  
  	/* generate software time stamp when entering packet scheduling */
  	SKBTX_SCHED_TSTAMP = 1 << 6,
+	SKBTX_BPF = 1 << 7,
  };
  
  #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
diff --git a/include/net/sock.h b/include/net/sock.h
index f29c14448938..4ec27c524f49 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -234,6 +234,20 @@ struct sock_common {
  struct bpf_local_storage;
  struct sk_filter;
  
+enum {
+	SK_BPF_CB_TX_TIMESTAMPING = BIT(0),
+	SK_BPF_CB_RX_TIEMSTAMPING = BIT(1),
+	SK_BPF_CB_MASK		= BIT(2) - 1,
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb);
+#else
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG)
+static inline void bpf_skops_timestamping(struct sock *sk, struct sk_buff *skb) {}
+#endif
+
  /**
    *	struct sock - network layer representation of sockets
    *	@__sk_common: shared layout with inet_timewait_sock
@@ -444,7 +458,10 @@ struct sock {
  	socket_lock_t		sk_lock;
  	u32			sk_reserved_mem;
  	int			sk_forward_alloc;
-	u32			sk_tsflags;
+	u16			sk_tsflags;
+#ifdef CONFIG_BPF_SYSCALL
+	u16			sk_bpf_cb_flags;
+#endif
  	__cacheline_group_end(sock_write_rxtx);
  
  	__cacheline_group_begin(sock_write_tx);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d1948d357dad..224b697bae9d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -961,7 +961,8 @@ struct tcp_skb_cb {
  	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
  			eor:1,		/* Is skb MSG_EOR marked? */
  			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
-			unused:5;
+			bpf_txstamp_ack:1,
+			unused:4;
  	__u32		ack_seq;	/* Sequence number ACK'd	*/
  	union {
  		struct {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..2ff7ff0ebdab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7014,6 +7014,7 @@ enum {
  					 * by the kernel or the
  					 * earlier bpf-progs.
  					 */
+	BPF_SOCK_OPS_TX_TIMESTAMPING_CB,
  };
  
  /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7080,6 +7081,7 @@ enum {
  	TCP_BPF_SYN_IP		= 1006, /* Copy the IP[46] and TCP header */
  	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
  	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+	SK_BPF_CB_FLAGS		= 1009,
  };
  
  enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index e31ee8be2de0..81a36e50047b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5206,6 +5206,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
  	.arg1_type      = ARG_PTR_TO_CTX,
  };
  
+static int sk_bpf_cb_flags(struct sock *sk, int sk_bpf_cb_flags, bool getopt)
+{
+	if (getopt)
+		return -EINVAL;
+
+	if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+		return -EINVAL;
+
+	sk->sk_bpf_cb_flags = sk->sk_bpf_cb_flags;
+
+	return 0;
+}
+
  static int sol_socket_sockopt(struct sock *sk, int optname,
  			      char *optval, int *optlen,
  			      bool getopt)
@@ -5222,6 +5235,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
  	case SO_MAX_PACING_RATE:
  	case SO_BINDTOIFINDEX:
  	case SO_TXREHASH:
+	case SK_BPF_CB_FLAGS:
  		if (*optlen != sizeof(int))
  			return -EINVAL;
  		break;
@@ -5231,6 +5245,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
  		return -EINVAL;
  	}
  
+	if (optname == SK_BPF_CB_FLAGS)
+		return sk_bpf_cb_flags(sk, *(int *)optval, getopt);
+
  	if (getopt) {
  		if (optname == SO_BINDTODEVICE)
  			return -EINVAL;
diff --git a/net/core/sock.c b/net/core/sock.c
index 039be95c40cf..d0406639cee9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -137,6 +137,7 @@
  #include <linux/sock_diag.h>
  
  #include <linux/filter.h>
+#include <linux/bpf-cgroup.h>
  #include <net/sock_reuseport.h>
  #include <net/bpf_sk_storage.h>
  
@@ -946,6 +947,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
  	return 0;
  }
  
+#ifdef CONFIG_BPF_SYSCALL
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb)
+{
+	struct bpf_sock_ops_kern sock_ops;
+
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+	sock_ops.op = BPF_SOCK_OPS_TX_TIMESTAMPING_CB;
+	sock_ops.is_fullsock = 1;
+	sock_ops.sk = sk;
+	sock_ops.skb = skb;
+	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
  void sock_set_keepalive(struct sock *sk)
  {
  	lock_sock(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4f77bd862e95..1e7f2d5fd792 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -491,6 +491,15 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
  		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
  			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
  	}
+
+	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+	    SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING))
+		/* The bpf prog can do:
+		 * shinfo->tx_flags |= SKBTX_BPF,
+		 * tcb->bpf_txstamp_ack = 1,
+		 * shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1 (if tskey not set)
+		 */
+		bpf_skops_tx_timestamping(sk, skb);
  }


> 
> Overall the suggestion looks good to me. I can give it a try :)
> 
> I'm thinking of another approach to using bpf_sock_ops_cb_flags_set()
> instead of bpf_setsockopt() when sockops like
> BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB is triggered. I can modify the
> bpf_sock_ops_cb_flags_set like this:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 58761263176c..001140067c1a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5770,14 +5770,25 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct
> bpf_sock_ops_kern *, bpf_sock,
>             int, argval)
>   {
>          struct sock *sk = bpf_sock->sk;
> -       int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> +       int val = argval;
> 
> -       if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
> +       if (!IS_ENABLED(CONFIG_INET))
>                  return -EINVAL;
> 
> -       tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> +       if (sk_is_tcp(sk)) {
> +               val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> +               if (!sk_fullsock(sk))
> +                       return -EINVAL;
> +
> +               tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> +
> +               val = argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
> +       } else {
> +               sk->bpf_sock_ops_cb_flags = val;

Why separate tcp vs non-tcp case? The tcp_sk(sk)->bpf_sock_ops_cb_flags
is running out of bits anyway for tcp specific callback.

just keep the SK_BPF_CB_{TX,RX}_TIEMSTAMPING in sk->sk_bpf_cb_flags
for all tcp/udp/raw/...

> +               val = argval &
> (~(SK_BPF_CB_TX_TIEMSTAMPING|SK_BPF_CB_RX_TIEMSTAMPING));

imo, we also don't need to return val to tell the caller what
is not supported in the running kernel. The BPF CO-RE can
handle this also, so less reason to keep extending the
bpf_sock_ops_cb_flags_set API for non tcp.

>>>> For datagrams (UDP as well as RAW and many non IP protocols), an
>>>> alternative still needs to be found.
>>
>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set. If it is
>> unlikely, may be we can just disallow bpf prog from directly setting
>> skb_shinfo(skb)->tskey for this particular skb.
>>
>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>> pass the kernel decided tskey to the bpf prog.
> 
> I'm a bit confused here. IIUC, we need to support the tskey like what
> we did in this series to handle non TCP cases?

Like tcp, I don't think it really needs to use the sk->sk_tskey to mark the
ID of a skb for the non tcp cases also. will comment on another thread.
Martin KaFai Lau Nov. 5, 2024, 2:09 a.m. UTC | #24
On 11/1/24 6:32 AM, Willem de Bruijn wrote:
>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> This is not something to rely on. OPT_ID was added relatively recently.
> Older applications, or any that just use the most straightforward API,
> will not set this.

Good point that the OPT_ID per cmsg is very new.

The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
been there for quite some time now. Is it a safe assumption that
most applications doing udp tx timestamping should have
the SOF_TIMESTAMPING_OPT_ID set to be useful?

> 
>> If it is
>> unlikely, may be we can just disallow bpf prog from directly setting
>> skb_shinfo(skb)->tskey for this particular skb.
>>
>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>> pass the kernel decided tskey to the bpf prog.
>>
>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
>> bpf prog can give one for the kernel to use. The bpf prog can store the
>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
>> instead) if it helps.
>>
>> If the kernel passed tskey is not 0, the bpf prog can just use that one
>> (assuming the user space is doing something sane, like the value in
>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
>> is very unlikely also (?) but the bpf prog can probably detect this and choose
>> to ignore this sk.
> If an applications uses OPT_ID, it is unlikely that they will toggle
> the feature on and off on a per-packet basis. So in the common case
> the program could use the user-set counter or use its own if userspace
> does not enable the feature. In the rare case that an application does
> intermittently set an OPT_ID, the numbering would be erratic. This
> does mean that an actively malicious application could mess with admin
> measurements.

All make sense. Given it is reasonable to assume the user space should either 
has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf 
prog can directly provide its own tskey to be used in shinfo->tskey. The bpf 
prog can generate the id itself without using the sk->sk_tskey, e.g. store an 
atomic int in the bpf_sk_storage.
Jason Xing Nov. 5, 2024, 3:13 a.m. UTC | #25
On Tue, Nov 5, 2024 at 9:51 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/1/24 12:47 AM, Jason Xing wrote:
>
> >> If it prefers to stay with bpf_setsockopt (which is fine), it needs a bpf
> >> specific optname like the current TCP_BPF_SOCK_OPS_CB_FLAGS which currently sets
> >> the tp->bpf_sock_ops_cb_flags. May be a new SK_BPF_CB_FLAGS optname for setting
> >> the sk->sk_bpf_cb_flags, like bpf_setsockopt(skops_ctx, SOL_SOCKET,
> >
> >> SK_BPF_CB_FLAGS, &val, sizeof(val)) and handle it in the sol_socket_sockopt()
> >> alone without calling into sk_{set,get}sockopt. Add a new enum for the optval
> >> for the sk_bpf_cb_flags:
> >>
> >> enum {
> >>          SK_BPF_CB_TX_TIMESTAMPING = (1 << 0),
> >>          SK_BPF_CB_RX_TIEMSTAMPING = (1 << 1),
> >> };
> >
> > Then it will involve more strange modification in sol_socket_sockopt()
> > to retrieve the opt value like what I did in V2 (see
> > https://lore.kernel.org/all/20241012040651.95616-3-kerneljasonxing@gmail.com/).
> > It's the reason why I did set and get operation in
> > sk_{set,get}sockopt() in this series to keep align with other flags.
> > Handling it in sk_{set,get}sockopt() is not a bad idea and easy to
> > implement, I feel.
>
> This will look very different now. It is handling bpf specific
> optname and accessing the bpf specific field in sk->sk_bpf_cb_flags.
>
> I really don't see why it needs to spill over to sk_{set,get}sockopt()
> to handle sk->sk_bpf_cb_flags.
>
> I have quickly typed out a small part of discussion so far.
> It is likely buggy and not compiler tested. Pieces are still missing.
> The bpf_tstamp_ack will need a few more changes in the
> tcp_{input,output}.c. May be merging with the tstamp_ack to become
> 2 bits will be cleaner, not sure.
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 39f1d16f3628..0b4913315854 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -488,6 +488,7 @@ enum {
>
>         /* generate software time stamp when entering packet scheduling */
>         SKBTX_SCHED_TSTAMP = 1 << 6,
> +       SKBTX_BPF = 1 << 7,
>   };
>
>   #define SKBTX_ANY_SW_TSTAMP   (SKBTX_SW_TSTAMP    | \
> diff --git a/include/net/sock.h b/include/net/sock.h
> index f29c14448938..4ec27c524f49 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -234,6 +234,20 @@ struct sock_common {
>   struct bpf_local_storage;
>   struct sk_filter;
>
> +enum {
> +       SK_BPF_CB_TX_TIMESTAMPING = BIT(0),
> +       SK_BPF_CB_RX_TIEMSTAMPING = BIT(1),
> +       SK_BPF_CB_MASK          = BIT(2) - 1,
> +};
> +
> +#ifdef CONFIG_BPF_SYSCALL
> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb);
> +#else
> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG)
> +static inline void bpf_skops_timestamping(struct sock *sk, struct sk_buff *skb) {}
> +#endif

Until now, I know that I misunderstood what you meant in the previous
thread. I thought you were suggesting we need to use bpf_setsockopt.
Sorry.

I completely agree with this approach! Thanks!

> +
>   /**
>     *   struct sock - network layer representation of sockets
>     *   @__sk_common: shared layout with inet_timewait_sock
> @@ -444,7 +458,10 @@ struct sock {
>         socket_lock_t           sk_lock;
>         u32                     sk_reserved_mem;
>         int                     sk_forward_alloc;
> -       u32                     sk_tsflags;
> +       u16                     sk_tsflags;
> +#ifdef CONFIG_BPF_SYSCALL
> +       u16                     sk_bpf_cb_flags;

We cannot use u16 for sk_tsflags because the
SOF_TIMESTAMPING_OPT_RX_FILTER uses the 17th bit already. I will
handle it.

> +#endif
>         __cacheline_group_end(sock_write_rxtx);
>
>         __cacheline_group_begin(sock_write_tx);
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index d1948d357dad..224b697bae9d 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -961,7 +961,8 @@ struct tcp_skb_cb {
>         __u8            txstamp_ack:1,  /* Record TX timestamp for ack? */
>                         eor:1,          /* Is skb MSG_EOR marked? */
>                         has_rxtstamp:1, /* SKB has a RX timestamp       */
> -                       unused:5;
> +                       bpf_txstamp_ack:1,
> +                       unused:4;
>         __u32           ack_seq;        /* Sequence number ACK'd        */
>         union {
>                 struct {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f28b6527e815..2ff7ff0ebdab 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7014,6 +7014,7 @@ enum {
>                                          * by the kernel or the
>                                          * earlier bpf-progs.
>                                          */
> +       BPF_SOCK_OPS_TX_TIMESTAMPING_CB,
>   };
>
>   /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> @@ -7080,6 +7081,7 @@ enum {
>         TCP_BPF_SYN_IP          = 1006, /* Copy the IP[46] and TCP header */
>         TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
>         TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
> +       SK_BPF_CB_FLAGS         = 1009,
>   };
>
>   enum {
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e31ee8be2de0..81a36e50047b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5206,6 +5206,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>         .arg1_type      = ARG_PTR_TO_CTX,
>   };
>
> +static int sk_bpf_cb_flags(struct sock *sk, int sk_bpf_cb_flags, bool getopt)
> +{
> +       if (getopt)
> +               return -EINVAL;
> +
> +       if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
> +               return -EINVAL;
> +
> +       sk->sk_bpf_cb_flags = sk->sk_bpf_cb_flags;
> +
> +       return 0;
> +}
> +
>   static int sol_socket_sockopt(struct sock *sk, int optname,
>                               char *optval, int *optlen,
>                               bool getopt)
> @@ -5222,6 +5235,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
>         case SO_MAX_PACING_RATE:
>         case SO_BINDTOIFINDEX:
>         case SO_TXREHASH:
> +       case SK_BPF_CB_FLAGS:
>                 if (*optlen != sizeof(int))
>                         return -EINVAL;
>                 break;
> @@ -5231,6 +5245,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
>                 return -EINVAL;
>         }
>
> +       if (optname == SK_BPF_CB_FLAGS)
> +               return sk_bpf_cb_flags(sk, *(int *)optval, getopt);
> +
>         if (getopt) {
>                 if (optname == SO_BINDTODEVICE)
>                         return -EINVAL;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 039be95c40cf..d0406639cee9 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -137,6 +137,7 @@
>   #include <linux/sock_diag.h>
>
>   #include <linux/filter.h>
> +#include <linux/bpf-cgroup.h>
>   #include <net/sock_reuseport.h>
>   #include <net/bpf_sk_storage.h>
>
> @@ -946,6 +947,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
>         return 0;
>   }
>
> +#ifdef CONFIG_BPF_SYSCALL
> +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb)
> +{
> +       struct bpf_sock_ops_kern sock_ops;
> +
> +       memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
> +       sock_ops.op = BPF_SOCK_OPS_TX_TIMESTAMPING_CB;
> +       sock_ops.is_fullsock = 1;
> +       sock_ops.sk = sk;
> +       sock_ops.skb = skb;
> +       __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
> +}
> +#endif
> +
>   void sock_set_keepalive(struct sock *sk)
>   {
>         lock_sock(sk);
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 4f77bd862e95..1e7f2d5fd792 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -491,6 +491,15 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
>                 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
>                         shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
>         }
> +
> +       if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
> +           SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING))
> +               /* The bpf prog can do:
> +                * shinfo->tx_flags |= SKBTX_BPF,
> +                * tcb->bpf_txstamp_ack = 1,
> +                * shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1 (if tskey not set)
> +                */
> +               bpf_skops_tx_timestamping(sk, skb);
>   }
>
>
> >
> > Overall the suggestion looks good to me. I can give it a try :)
> >
> > I'm thinking of another approach to using bpf_sock_ops_cb_flags_set()
> > instead of bpf_setsockopt() when sockops like
> > BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB is triggered. I can modify the
> > bpf_sock_ops_cb_flags_set like this:
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 58761263176c..001140067c1a 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5770,14 +5770,25 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct
> > bpf_sock_ops_kern *, bpf_sock,
> >             int, argval)
> >   {
> >          struct sock *sk = bpf_sock->sk;
> > -       int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> > +       int val = argval;
> >
> > -       if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
> > +       if (!IS_ENABLED(CONFIG_INET))
> >                  return -EINVAL;
> >
> > -       tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> > +       if (sk_is_tcp(sk)) {
> > +               val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
> > +               if (!sk_fullsock(sk))
> > +                       return -EINVAL;
> > +
> > +               tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
> > +
> > +               val = argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
> > +       } else {
> > +               sk->bpf_sock_ops_cb_flags = val;
>
> Why separate tcp vs non-tcp case? The tcp_sk(sk)->bpf_sock_ops_cb_flags
> is running out of bits anyway for tcp specific callback.
>
> just keep the SK_BPF_CB_{TX,RX}_TIEMSTAMPING in sk->sk_bpf_cb_flags
> for all tcp/udp/raw/...

Agreed!

>
> > +               val = argval &
> > (~(SK_BPF_CB_TX_TIEMSTAMPING|SK_BPF_CB_RX_TIEMSTAMPING));
>
> imo, we also don't need to return val to tell the caller what
> is not supported in the running kernel. The BPF CO-RE can
> handle this also, so less reason to keep extending the
> bpf_sock_ops_cb_flags_set API for non tcp.
>
> >>>> For datagrams (UDP as well as RAW and many non IP protocols), an
> >>>> alternative still needs to be found.
> >>
> >> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set. If it is
> >> unlikely, may be we can just disallow bpf prog from directly setting
> >> skb_shinfo(skb)->tskey for this particular skb.
> >>
> >> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> >> pass the kernel decided tskey to the bpf prog.
> >
> > I'm a bit confused here. IIUC, we need to support the tskey like what
> > we did in this series to handle non TCP cases?
>
> Like tcp, I don't think it really needs to use the sk->sk_tskey to mark the
> ID of a skb for the non tcp cases also. will comment on another thread.

Fine, I will let go of the tskey logic in v4.

Thanks,
Jason
Jason Xing Nov. 5, 2024, 6:22 a.m. UTC | #26
On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> >> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> > This is not something to rely on. OPT_ID was added relatively recently.
> > Older applications, or any that just use the most straightforward API,
> > will not set this.
>
> Good point that the OPT_ID per cmsg is very new.
>
> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> been there for quite some time now. Is it a safe assumption that
> most applications doing udp tx timestamping should have
> the SOF_TIMESTAMPING_OPT_ID set to be useful?
>
> >
> >> If it is
> >> unlikely, may be we can just disallow bpf prog from directly setting
> >> skb_shinfo(skb)->tskey for this particular skb.
> >>
> >> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> >> pass the kernel decided tskey to the bpf prog.
> >>
> >> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> >> bpf prog can give one for the kernel to use. The bpf prog can store the
> >> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> >> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> >> instead) if it helps.
> >>
> >> If the kernel passed tskey is not 0, the bpf prog can just use that one
> >> (assuming the user space is doing something sane, like the value in
> >> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> >> is very unlikely also (?) but the bpf prog can probably detect this and choose
> >> to ignore this sk.
> > If an applications uses OPT_ID, it is unlikely that they will toggle
> > the feature on and off on a per-packet basis. So in the common case
> > the program could use the user-set counter or use its own if userspace
> > does not enable the feature. In the rare case that an application does
> > intermittently set an OPT_ID, the numbering would be erratic. This
> > does mean that an actively malicious application could mess with admin
> > measurements.
>
> All make sense. Given it is reasonable to assume the user space should either
> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> atomic int in the bpf_sk_storage.

I wonder, how can we correlate the key with each skb in the bpf
program for non-TCP type without implementing a bpf extension for
SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
which sendmsg() the skb belongs to for non-TCP cases.

Thanks,
Jason
Willem de Bruijn Nov. 5, 2024, 2:29 p.m. UTC | #27
Martin KaFai Lau wrote:
> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> >> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> > This is not something to rely on. OPT_ID was added relatively recently.
> > Older applications, or any that just use the most straightforward API,
> > will not set this.
> 
> Good point that the OPT_ID per cmsg is very new.
> 
> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> been there for quite some time now. Is it a safe assumption that
> most applications doing udp tx timestamping should have
> the SOF_TIMESTAMPING_OPT_ID set to be useful?

I don't think so.

The very first open source code I happen to look at, github.com/ptpd,
already sets SO_TIMESTAMPING without OPT_ID.
Martin KaFai Lau Nov. 5, 2024, 7:22 p.m. UTC | #28
On 11/4/24 10:22 PM, Jason Xing wrote:
> On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
>>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
>>> This is not something to rely on. OPT_ID was added relatively recently.
>>> Older applications, or any that just use the most straightforward API,
>>> will not set this.
>>
>> Good point that the OPT_ID per cmsg is very new.
>>
>> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
>> been there for quite some time now. Is it a safe assumption that
>> most applications doing udp tx timestamping should have
>> the SOF_TIMESTAMPING_OPT_ID set to be useful?
>>
>>>
>>>> If it is
>>>> unlikely, may be we can just disallow bpf prog from directly setting
>>>> skb_shinfo(skb)->tskey for this particular skb.
>>>>
>>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>>>> pass the kernel decided tskey to the bpf prog.
>>>>
>>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
>>>> bpf prog can give one for the kernel to use. The bpf prog can store the
>>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
>>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
>>>> instead) if it helps.
>>>>
>>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
>>>> (assuming the user space is doing something sane, like the value in
>>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
>>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
>>>> to ignore this sk.
>>> If an applications uses OPT_ID, it is unlikely that they will toggle
>>> the feature on and off on a per-packet basis. So in the common case
>>> the program could use the user-set counter or use its own if userspace
>>> does not enable the feature. In the rare case that an application does
>>> intermittently set an OPT_ID, the numbering would be erratic. This
>>> does mean that an actively malicious application could mess with admin
>>> measurements.
>>
>> All make sense. Given it is reasonable to assume the user space should either
>> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
>> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
>> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
>> atomic int in the bpf_sk_storage.
> 
> I wonder, how can we correlate the key with each skb in the bpf
> program for non-TCP type without implementing a bpf extension for
> SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> which sendmsg() the skb belongs to for non-TCP cases.

SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
If the shinfo->tskey is not set by the user space, the bpf prog can directly set 
the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator 
also. The bpf prog can have its own id generator.

If the user space has already set the shinfo->tskey (either by sk->sk_tskey or 
SCM_TS_OPT_ID), the bpf prog can just use the user space one.

If there is a weird application that flips flops between OPT_ID on/off, the bpf 
prog will get confused which is fine. The bpf prog can detect this and choose to 
ignore measuring this sk/skb. The bpf prog can also choose to be on the very 
safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no 
OPT_ID. The bpf prog can look into the details of the sk and skb to decide what 
makes the most sense for its deployment.

I don't know whether it makes more sense to call the bpf prog to decide the 
shinfo->{tx_flags,tskey} just before the "while (length > 0)" in 
__ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
I admittedly less familiar with this code path than the tcp one.
Jason Xing Nov. 6, 2024, 12:17 a.m. UTC | #29
On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/4/24 10:22 PM, Jason Xing wrote:
> > On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> >>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> >>> This is not something to rely on. OPT_ID was added relatively recently.
> >>> Older applications, or any that just use the most straightforward API,
> >>> will not set this.
> >>
> >> Good point that the OPT_ID per cmsg is very new.
> >>
> >> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> >> been there for quite some time now. Is it a safe assumption that
> >> most applications doing udp tx timestamping should have
> >> the SOF_TIMESTAMPING_OPT_ID set to be useful?
> >>
> >>>
> >>>> If it is
> >>>> unlikely, may be we can just disallow bpf prog from directly setting
> >>>> skb_shinfo(skb)->tskey for this particular skb.
> >>>>
> >>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> >>>> pass the kernel decided tskey to the bpf prog.
> >>>>
> >>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> >>>> bpf prog can give one for the kernel to use. The bpf prog can store the
> >>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> >>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> >>>> instead) if it helps.
> >>>>
> >>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
> >>>> (assuming the user space is doing something sane, like the value in
> >>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> >>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
> >>>> to ignore this sk.
> >>> If an applications uses OPT_ID, it is unlikely that they will toggle
> >>> the feature on and off on a per-packet basis. So in the common case
> >>> the program could use the user-set counter or use its own if userspace
> >>> does not enable the feature. In the rare case that an application does
> >>> intermittently set an OPT_ID, the numbering would be erratic. This
> >>> does mean that an actively malicious application could mess with admin
> >>> measurements.
> >>
> >> All make sense. Given it is reasonable to assume the user space should either
> >> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> >> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> >> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> >> atomic int in the bpf_sk_storage.
> >
> > I wonder, how can we correlate the key with each skb in the bpf
> > program for non-TCP type without implementing a bpf extension for
> > SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> > which sendmsg() the skb belongs to for non-TCP cases.
>
> SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
> If the shinfo->tskey is not set by the user space, the bpf prog can directly set
> the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
> also. The bpf prog can have its own id generator.
>
> If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
> SCM_TS_OPT_ID), the bpf prog can just use the user space one.
>
> If there is a weird application that flips flops between OPT_ID on/off, the bpf
> prog will get confused which is fine. The bpf prog can detect this and choose to
> ignore measuring this sk/skb. The bpf prog can also choose to be on the very
> safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
> OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
> makes the most sense for its deployment.
>
> I don't know whether it makes more sense to call the bpf prog to decide the
> shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
> __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
> I admittedly less familiar with this code path than the tcp one.

Now I feel it could be complicated for a software engineer to consider
how they will handle the key if they don't read the kernel code very
carefully. They are facing different situations. Being user-friendly
lets this feature have more chances to get widely used. As I insisted
before, I still would like to know if it is possible that we can try
to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
exclusive tskey for bpf use? Only exporting one key. It will be really
simple and easy-to-use :)

Thanks,
Jason
Martin KaFai Lau Nov. 6, 2024, 1:09 a.m. UTC | #30
On 11/5/24 4:17 PM, Jason Xing wrote:
> On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/4/24 10:22 PM, Jason Xing wrote:
>>> On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>
>>>> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
>>>>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>>>>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>>>>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
>>>>> This is not something to rely on. OPT_ID was added relatively recently.
>>>>> Older applications, or any that just use the most straightforward API,
>>>>> will not set this.
>>>>
>>>> Good point that the OPT_ID per cmsg is very new.
>>>>
>>>> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
>>>> been there for quite some time now. Is it a safe assumption that
>>>> most applications doing udp tx timestamping should have
>>>> the SOF_TIMESTAMPING_OPT_ID set to be useful?
>>>>
>>>>>
>>>>>> If it is
>>>>>> unlikely, may be we can just disallow bpf prog from directly setting
>>>>>> skb_shinfo(skb)->tskey for this particular skb.
>>>>>>
>>>>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>>>>>> pass the kernel decided tskey to the bpf prog.
>>>>>>
>>>>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
>>>>>> bpf prog can give one for the kernel to use. The bpf prog can store the
>>>>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
>>>>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
>>>>>> instead) if it helps.
>>>>>>
>>>>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
>>>>>> (assuming the user space is doing something sane, like the value in
>>>>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
>>>>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
>>>>>> to ignore this sk.
>>>>> If an applications uses OPT_ID, it is unlikely that they will toggle
>>>>> the feature on and off on a per-packet basis. So in the common case
>>>>> the program could use the user-set counter or use its own if userspace
>>>>> does not enable the feature. In the rare case that an application does
>>>>> intermittently set an OPT_ID, the numbering would be erratic. This
>>>>> does mean that an actively malicious application could mess with admin
>>>>> measurements.
>>>>
>>>> All make sense. Given it is reasonable to assume the user space should either
>>>> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
>>>> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
>>>> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
>>>> atomic int in the bpf_sk_storage.
>>>
>>> I wonder, how can we correlate the key with each skb in the bpf
>>> program for non-TCP type without implementing a bpf extension for
>>> SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
>>> which sendmsg() the skb belongs to for non-TCP cases.
>>
>> SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
>> If the shinfo->tskey is not set by the user space, the bpf prog can directly set
>> the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
>> also. The bpf prog can have its own id generator.
>>
>> If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
>> SCM_TS_OPT_ID), the bpf prog can just use the user space one.
>>
>> If there is a weird application that flips flops between OPT_ID on/off, the bpf
>> prog will get confused which is fine. The bpf prog can detect this and choose to
>> ignore measuring this sk/skb. The bpf prog can also choose to be on the very
>> safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
>> OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
>> makes the most sense for its deployment.
>>
>> I don't know whether it makes more sense to call the bpf prog to decide the
>> shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
>> __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
>> I admittedly less familiar with this code path than the tcp one.
> 
> Now I feel it could be complicated for a software engineer to consider
> how they will handle the key if they don't read the kernel code very
> carefully. They are facing different situations. Being user-friendly
> lets this feature have more chances to get widely used. As I insisted
> before, I still would like to know if it is possible that we can try
> to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
> exclusive tskey for bpf use? Only exporting one key. It will be really
> simple and easy-to-use :)

imo, there is no need for adding sk_tskey_bpf_offset to sk. just allow the bpf 
prog to decide what is the tskey.

There is no usability issue in bpf prog. It is pretty normal for a bpf prog 
author to look at the sk details to make decision.

Abstracting the sk/skb is not helping the bpf prog and not the right direction 
to go. Over time, there has been case over case that the bpf prog wants to know 
more instead of being abstracted away like running in the user space. e.g. The 
"struct bpf_sock" abstraction in the uapi/linux/bpf.h does not scale and we have 
stopped adding more abstraction this way. The btf (and PTR_TO_BTF_ID, 
CO-RE...etc) has been added to allow the bpf prog to learn other details in sk 
and skb.

Instead, design a better bpf kfunc to help the bpf prog to set the bits/tskey in 
the skb. I think this is more important. tcp tskey is easy. just need some care 
on the udp tskey and need to check if the user space has already set one.
A good designed bpf kfunc is all it needs.
Willem de Bruijn Nov. 6, 2024, 1:11 a.m. UTC | #31
Jason Xing wrote:
> On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 11/4/24 10:22 PM, Jason Xing wrote:
> > > On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>
> > >> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> > >>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> > >>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> > >>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> > >>> This is not something to rely on. OPT_ID was added relatively recently.
> > >>> Older applications, or any that just use the most straightforward API,
> > >>> will not set this.
> > >>
> > >> Good point that the OPT_ID per cmsg is very new.
> > >>
> > >> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> > >> been there for quite some time now. Is it a safe assumption that
> > >> most applications doing udp tx timestamping should have
> > >> the SOF_TIMESTAMPING_OPT_ID set to be useful?
> > >>
> > >>>
> > >>>> If it is
> > >>>> unlikely, may be we can just disallow bpf prog from directly setting
> > >>>> skb_shinfo(skb)->tskey for this particular skb.
> > >>>>
> > >>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> > >>>> pass the kernel decided tskey to the bpf prog.
> > >>>>
> > >>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> > >>>> bpf prog can give one for the kernel to use. The bpf prog can store the
> > >>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> > >>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> > >>>> instead) if it helps.
> > >>>>
> > >>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
> > >>>> (assuming the user space is doing something sane, like the value in
> > >>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> > >>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
> > >>>> to ignore this sk.
> > >>> If an applications uses OPT_ID, it is unlikely that they will toggle
> > >>> the feature on and off on a per-packet basis. So in the common case
> > >>> the program could use the user-set counter or use its own if userspace
> > >>> does not enable the feature. In the rare case that an application does
> > >>> intermittently set an OPT_ID, the numbering would be erratic. This
> > >>> does mean that an actively malicious application could mess with admin
> > >>> measurements.
> > >>
> > >> All make sense. Given it is reasonable to assume the user space should either
> > >> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> > >> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> > >> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> > >> atomic int in the bpf_sk_storage.
> > >
> > > I wonder, how can we correlate the key with each skb in the bpf
> > > program for non-TCP type without implementing a bpf extension for
> > > SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> > > which sendmsg() the skb belongs to for non-TCP cases.
> >
> > SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
> > If the shinfo->tskey is not set by the user space, the bpf prog can directly set
> > the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
> > also. The bpf prog can have its own id generator.
> >
> > If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
> > SCM_TS_OPT_ID), the bpf prog can just use the user space one.
> >
> > If there is a weird application that flips flops between OPT_ID on/off, the bpf
> > prog will get confused which is fine. The bpf prog can detect this and choose to
> > ignore measuring this sk/skb.

That will skew measurement and is under control of the process.

I don't immediately foresee this being used to measure untrusted
processes that would have an incentive to game this.

But the caveat should be stated explicitly.

> > The bpf prog can also choose to be on the very
> > safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
> > OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
> > makes the most sense for its deployment.
> >
> > I don't know whether it makes more sense to call the bpf prog to decide the
> > shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
> > __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
> > I admittedly less familiar with this code path than the tcp one.

Probably the current spot, mainly because no skb exists yet in
ip_setup_cork.
 
> Now I feel it could be complicated for a software engineer to consider
> how they will handle the key if they don't read the kernel code very
> carefully. They are facing different situations. Being user-friendly
> lets this feature have more chances to get widely used. As I insisted
> before, I still would like to know if it is possible that we can try
> to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
> exclusive tskey for bpf use? Only exporting one key. It will be really
> simple and easy-to-use :)

That has complications of its own. It also has to deal with the user
enabling/disabling/resetting its key, and with OPT_ID passed by cmsg.
Multiple skbs may be in flight, derived from each of these sources.
A single sk flag can only offset against one of them.

I think Martin's approach is more workable. Use the tskey that is set,
if any. Else, set one.
Jason Xing Nov. 6, 2024, 2:37 a.m. UTC | #32
On Wed, Nov 6, 2024 at 9:11 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 11/4/24 10:22 PM, Jason Xing wrote:
> > > > On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > > >>
> > > >> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> > > >>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> > > >>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> > > >>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> > > >>> This is not something to rely on. OPT_ID was added relatively recently.
> > > >>> Older applications, or any that just use the most straightforward API,
> > > >>> will not set this.
> > > >>
> > > >> Good point that the OPT_ID per cmsg is very new.
> > > >>
> > > >> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> > > >> been there for quite some time now. Is it a safe assumption that
> > > >> most applications doing udp tx timestamping should have
> > > >> the SOF_TIMESTAMPING_OPT_ID set to be useful?
> > > >>
> > > >>>
> > > >>>> If it is
> > > >>>> unlikely, may be we can just disallow bpf prog from directly setting
> > > >>>> skb_shinfo(skb)->tskey for this particular skb.
> > > >>>>
> > > >>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> > > >>>> pass the kernel decided tskey to the bpf prog.
> > > >>>>
> > > >>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> > > >>>> bpf prog can give one for the kernel to use. The bpf prog can store the
> > > >>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> > > >>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> > > >>>> instead) if it helps.
> > > >>>>
> > > >>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
> > > >>>> (assuming the user space is doing something sane, like the value in
> > > >>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> > > >>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
> > > >>>> to ignore this sk.
> > > >>> If an applications uses OPT_ID, it is unlikely that they will toggle
> > > >>> the feature on and off on a per-packet basis. So in the common case
> > > >>> the program could use the user-set counter or use its own if userspace
> > > >>> does not enable the feature. In the rare case that an application does
> > > >>> intermittently set an OPT_ID, the numbering would be erratic. This
> > > >>> does mean that an actively malicious application could mess with admin
> > > >>> measurements.
> > > >>
> > > >> All make sense. Given it is reasonable to assume the user space should either
> > > >> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> > > >> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> > > >> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> > > >> atomic int in the bpf_sk_storage.
> > > >
> > > > I wonder, how can we correlate the key with each skb in the bpf
> > > > program for non-TCP type without implementing a bpf extension for
> > > > SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> > > > which sendmsg() the skb belongs to for non-TCP cases.
> > >
> > > SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
> > > If the shinfo->tskey is not set by the user space, the bpf prog can directly set
> > > the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
> > > also. The bpf prog can have its own id generator.
> > >
> > > If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
> > > SCM_TS_OPT_ID), the bpf prog can just use the user space one.
> > >
> > > If there is a weird application that flips flops between OPT_ID on/off, the bpf
> > > prog will get confused which is fine. The bpf prog can detect this and choose to
> > > ignore measuring this sk/skb.
>
> That will skew measurement and is under control of the process.
>
> I don't immediately foresee this being used to measure untrusted
> processes that would have an incentive to game this.
>
> But the caveat should be stated explicitly.
>
> > > The bpf prog can also choose to be on the very
> > > safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
> > > OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
> > > makes the most sense for its deployment.
> > >
> > > I don't know whether it makes more sense to call the bpf prog to decide the
> > > shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
> > > __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
> > > I admittedly less familiar with this code path than the tcp one.
>
> Probably the current spot, mainly because no skb exists yet in
> ip_setup_cork.
>
> > Now I feel it could be complicated for a software engineer to consider
> > how they will handle the key if they don't read the kernel code very
> > carefully. They are facing different situations. Being user-friendly
> > lets this feature have more chances to get widely used. As I insisted
> > before, I still would like to know if it is possible that we can try
> > to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
> > exclusive tskey for bpf use? Only exporting one key. It will be really
> > simple and easy-to-use :)
>
> That has complications of its own. It also has to deal with the user
> enabling/disabling/resetting its key, and with OPT_ID passed by cmsg.
> Multiple skbs may be in flight, derived from each of these sources.
> A single sk flag can only offset against one of them.
>
> I think Martin's approach is more workable. Use the tskey that is set,
> if any. Else, set one.

Got it. Thanks!
Jason Xing Nov. 6, 2024, 2:51 a.m. UTC | #33
On Wed, Nov 6, 2024 at 9:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/5/24 4:17 PM, Jason Xing wrote:
> > On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 11/4/24 10:22 PM, Jason Xing wrote:
> >>> On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> >>>>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >>>>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >>>>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> >>>>> This is not something to rely on. OPT_ID was added relatively recently.
> >>>>> Older applications, or any that just use the most straightforward API,
> >>>>> will not set this.
> >>>>
> >>>> Good point that the OPT_ID per cmsg is very new.
> >>>>
> >>>> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> >>>> been there for quite some time now. Is it a safe assumption that
> >>>> most applications doing udp tx timestamping should have
> >>>> the SOF_TIMESTAMPING_OPT_ID set to be useful?
> >>>>
> >>>>>
> >>>>>> If it is
> >>>>>> unlikely, may be we can just disallow bpf prog from directly setting
> >>>>>> skb_shinfo(skb)->tskey for this particular skb.
> >>>>>>
> >>>>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> >>>>>> pass the kernel decided tskey to the bpf prog.
> >>>>>>
> >>>>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> >>>>>> bpf prog can give one for the kernel to use. The bpf prog can store the
> >>>>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> >>>>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> >>>>>> instead) if it helps.
> >>>>>>
> >>>>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
> >>>>>> (assuming the user space is doing something sane, like the value in
> >>>>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> >>>>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
> >>>>>> to ignore this sk.
> >>>>> If an applications uses OPT_ID, it is unlikely that they will toggle
> >>>>> the feature on and off on a per-packet basis. So in the common case
> >>>>> the program could use the user-set counter or use its own if userspace
> >>>>> does not enable the feature. In the rare case that an application does
> >>>>> intermittently set an OPT_ID, the numbering would be erratic. This
> >>>>> does mean that an actively malicious application could mess with admin
> >>>>> measurements.
> >>>>
> >>>> All make sense. Given it is reasonable to assume the user space should either
> >>>> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> >>>> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> >>>> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> >>>> atomic int in the bpf_sk_storage.
> >>>
> >>> I wonder, how can we correlate the key with each skb in the bpf
> >>> program for non-TCP type without implementing a bpf extension for
> >>> SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> >>> which sendmsg() the skb belongs to for non-TCP cases.
> >>
> >> SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
> >> If the shinfo->tskey is not set by the user space, the bpf prog can directly set
> >> the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
> >> also. The bpf prog can have its own id generator.
> >>
> >> If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
> >> SCM_TS_OPT_ID), the bpf prog can just use the user space one.
> >>
> >> If there is a weird application that flips flops between OPT_ID on/off, the bpf
> >> prog will get confused which is fine. The bpf prog can detect this and choose to
> >> ignore measuring this sk/skb. The bpf prog can also choose to be on the very
> >> safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
> >> OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
> >> makes the most sense for its deployment.
> >>
> >> I don't know whether it makes more sense to call the bpf prog to decide the
> >> shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
> >> __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
> >> I admittedly less familiar with this code path than the tcp one.
> >
> > Now I feel it could be complicated for a software engineer to consider
> > how they will handle the key if they don't read the kernel code very
> > carefully. They are facing different situations. Being user-friendly
> > lets this feature have more chances to get widely used. As I insisted
> > before, I still would like to know if it is possible that we can try
> > to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
> > exclusive tskey for bpf use? Only exporting one key. It will be really
> > simple and easy-to-use :)
>
> imo, there is no need for adding sk_tskey_bpf_offset to sk. just allow the bpf
> prog to decide what is the tskey.
>
> There is no usability issue in bpf prog. It is pretty normal for a bpf prog
> author to look at the sk details to make decision.
>
> Abstracting the sk/skb is not helping the bpf prog and not the right direction
> to go. Over time, there has been case over case that the bpf prog wants to know
> more instead of being abstracted away like running in the user space. e.g. The
> "struct bpf_sock" abstraction in the uapi/linux/bpf.h does not scale and we have
> stopped adding more abstraction this way. The btf (and PTR_TO_BTF_ID,
> CO-RE...etc) has been added to allow the bpf prog to learn other details in sk
> and skb.
>
> Instead, design a better bpf kfunc to help the bpf prog to set the bits/tskey in
> the skb. I think this is more important. tcp tskey is easy. just need some care
> on the udp tskey and need to check if the user space has already set one.
> A good designed bpf kfunc is all it needs.

Thanks!

Let me confirm again in case I'm missing something important.
1) For tcp, as you said before, bpf prog can extract the seq from the
exported skb, so I don't need to export any key in this case.
2) For udp, if the skb has skb_shinfo(skb)->tskey set, then export the
key, else, export zero to the bpf program.
3) extend SCM_TS_OPT_ID for the udp/bpf case.

I'm not sure if I should postpone implementing this part after the
basic framework of this series gets merged. Anyway, I will try this :)

Thanks,
Jason
Martin KaFai Lau Nov. 7, 2024, 1:19 a.m. UTC | #34
On 11/5/24 6:51 PM, Jason Xing wrote:
> On Wed, Nov 6, 2024 at 9:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/5/24 4:17 PM, Jason Xing wrote:
>>> On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>
>>>> On 11/4/24 10:22 PM, Jason Xing wrote:
>>>>> On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>>>>
>>>>>> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
>>>>>>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
>>>>>>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
>>>>>>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
>>>>>>> This is not something to rely on. OPT_ID was added relatively recently.
>>>>>>> Older applications, or any that just use the most straightforward API,
>>>>>>> will not set this.
>>>>>>
>>>>>> Good point that the OPT_ID per cmsg is very new.
>>>>>>
>>>>>> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
>>>>>> been there for quite some time now. Is it a safe assumption that
>>>>>> most applications doing udp tx timestamping should have
>>>>>> the SOF_TIMESTAMPING_OPT_ID set to be useful?
>>>>>>
>>>>>>>
>>>>>>>> If it is
>>>>>>>> unlikely, may be we can just disallow bpf prog from directly setting
>>>>>>>> skb_shinfo(skb)->tskey for this particular skb.
>>>>>>>>
>>>>>>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
>>>>>>>> pass the kernel decided tskey to the bpf prog.
>>>>>>>>
>>>>>>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
>>>>>>>> bpf prog can give one for the kernel to use. The bpf prog can store the
>>>>>>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
>>>>>>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
>>>>>>>> instead) if it helps.
>>>>>>>>
>>>>>>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
>>>>>>>> (assuming the user space is doing something sane, like the value in
>>>>>>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
>>>>>>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
>>>>>>>> to ignore this sk.
>>>>>>> If an applications uses OPT_ID, it is unlikely that they will toggle
>>>>>>> the feature on and off on a per-packet basis. So in the common case
>>>>>>> the program could use the user-set counter or use its own if userspace
>>>>>>> does not enable the feature. In the rare case that an application does
>>>>>>> intermittently set an OPT_ID, the numbering would be erratic. This
>>>>>>> does mean that an actively malicious application could mess with admin
>>>>>>> measurements.
>>>>>>
>>>>>> All make sense. Given it is reasonable to assume the user space should either
>>>>>> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
>>>>>> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
>>>>>> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
>>>>>> atomic int in the bpf_sk_storage.
>>>>>
>>>>> I wonder, how can we correlate the key with each skb in the bpf
>>>>> program for non-TCP type without implementing a bpf extension for
>>>>> SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
>>>>> which sendmsg() the skb belongs to for non-TCP cases.
>>>>
>>>> SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
>>>> If the shinfo->tskey is not set by the user space, the bpf prog can directly set
>>>> the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
>>>> also. The bpf prog can have its own id generator.
>>>>
>>>> If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
>>>> SCM_TS_OPT_ID), the bpf prog can just use the user space one.
>>>>
>>>> If there is a weird application that flips flops between OPT_ID on/off, the bpf
>>>> prog will get confused which is fine. The bpf prog can detect this and choose to
>>>> ignore measuring this sk/skb. The bpf prog can also choose to be on the very
>>>> safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
>>>> OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
>>>> makes the most sense for its deployment.
>>>>
>>>> I don't know whether it makes more sense to call the bpf prog to decide the
>>>> shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
>>>> __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
>>>> I admittedly less familiar with this code path than the tcp one.
>>>
>>> Now I feel it could be complicated for a software engineer to consider
>>> how they will handle the key if they don't read the kernel code very
>>> carefully. They are facing different situations. Being user-friendly
>>> lets this feature have more chances to get widely used. As I insisted
>>> before, I still would like to know if it is possible that we can try
>>> to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
>>> exclusive tskey for bpf use? Only exporting one key. It will be really
>>> simple and easy-to-use :)
>>
>> imo, there is no need for adding sk_tskey_bpf_offset to sk. just allow the bpf
>> prog to decide what is the tskey.
>>
>> There is no usability issue in bpf prog. It is pretty normal for a bpf prog
>> author to look at the sk details to make decision.
>>
>> Abstracting the sk/skb is not helping the bpf prog and not the right direction
>> to go. Over time, there has been case over case that the bpf prog wants to know
>> more instead of being abstracted away like running in the user space. e.g. The
>> "struct bpf_sock" abstraction in the uapi/linux/bpf.h does not scale and we have
>> stopped adding more abstraction this way. The btf (and PTR_TO_BTF_ID,
>> CO-RE...etc) has been added to allow the bpf prog to learn other details in sk
>> and skb.
>>
>> Instead, design a better bpf kfunc to help the bpf prog to set the bits/tskey in
>> the skb. I think this is more important. tcp tskey is easy. just need some care
>> on the udp tskey and need to check if the user space has already set one.
>> A good designed bpf kfunc is all it needs.
> 
> Thanks!
> 
> Let me confirm again in case I'm missing something important.
> 1) For tcp, as you said before, bpf prog can extract the seq from the
> exported skb, so I don't need to export any key in this case.
> 2) For udp, if the skb has skb_shinfo(skb)->tskey set, then export the
> key, else, export zero to the bpf program.

A follow up to myself on the earlier bpf kfunc comment. Something like this:

/* ack: request ACK timestamp (tcp only)
  * req_tskey: bpf prog can request to use a particular tskey.
  *            req_tskey should always be 0 for tcp.
  * return: -ve for error. u32 for the tskey that the bpf prog should use.
  *	   may be different from the req_tskey (e.g. the user space has
  *         already set one).
  */
__bpf_kfunc s64 bpf_skops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
					   bool ack, u32 req_tskey);

/* "not sure" if this kfunc is needed. probably no. I think it is easier to pass
  * true/false in the args[0]. It seems tskey can be 0 in udp, so
  * passing tskey can't tell if the skb/cork/sockcm_cookie has the tskey.
  */
__bpf_kfunc bool bpf_skops_has_tskey(struct bpf_sock_ops_kern *skops);

For udp, I don't know whether it will be easier to set the tskey in the 'cork' 
or 'sockcm_cookie' or 'skb'. I guess it depends where the bpf prog is called. If 
skb, it seems the bpf prog may be called repetitively for doing the same thing 
in the while loop in __ip[6]_append_data. If it is better to set the 'cork' or 
'sockcm_cookie', the cork/sockcm_cookie pointer can be added to 'struct 
bpf_sock_ops_kern'. The sizeof(struct bpf_sock_ops_kern) is at 64bytes. Adding 
one pointer is not ideal.... probably it can be union with syn_skb but will need 
some code audit (so please check).


> 3) extend SCM_TS_OPT_ID for the udp/bpf case.

I don't understand. What does it mean to extend SCM_TS_OPT_ID?

> I'm not sure if I should postpone implementing this part after the
> basic framework of this series gets merged. Anyway, I will try this :)
Jason Xing Nov. 7, 2024, 3:31 a.m. UTC | #35
On Thu, Nov 7, 2024 at 9:19 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/5/24 6:51 PM, Jason Xing wrote:
> > On Wed, Nov 6, 2024 at 9:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 11/5/24 4:17 PM, Jason Xing wrote:
> >>> On Wed, Nov 6, 2024 at 3:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>
> >>>> On 11/4/24 10:22 PM, Jason Xing wrote:
> >>>>> On Tue, Nov 5, 2024 at 10:09 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>>>>>
> >>>>>> On 11/1/24 6:32 AM, Willem de Bruijn wrote:
> >>>>>>>> In udp/raw/..., I don't know how likely is the user space having "cork->tx_flags
> >>>>>>>> & SKBTX_ANY_TSTAMP" set but has neither "READ_ONCE(sk->sk_tsflags) &
> >>>>>>>> SOF_TIMESTAMPING_OPT_ID" nor "cork->flags & IPCORK_TS_OPT_ID" set.
> >>>>>>> This is not something to rely on. OPT_ID was added relatively recently.
> >>>>>>> Older applications, or any that just use the most straightforward API,
> >>>>>>> will not set this.
> >>>>>>
> >>>>>> Good point that the OPT_ID per cmsg is very new.
> >>>>>>
> >>>>>> The datagram support on SOF_TIMESTAMPING_OPT_ID in sk->sk_tsflags had
> >>>>>> been there for quite some time now. Is it a safe assumption that
> >>>>>> most applications doing udp tx timestamping should have
> >>>>>> the SOF_TIMESTAMPING_OPT_ID set to be useful?
> >>>>>>
> >>>>>>>
> >>>>>>>> If it is
> >>>>>>>> unlikely, may be we can just disallow bpf prog from directly setting
> >>>>>>>> skb_shinfo(skb)->tskey for this particular skb.
> >>>>>>>>
> >>>>>>>> For all other cases, in __ip[6]_append_data, directly call a bpf prog and also
> >>>>>>>> pass the kernel decided tskey to the bpf prog.
> >>>>>>>>
> >>>>>>>> The kernel passed tskey could be 0 (meaning the user space has not used it). The
> >>>>>>>> bpf prog can give one for the kernel to use. The bpf prog can store the
> >>>>>>>> sk_tskey_bpf in the bpf_sk_storage now. Meaning no need to add one to the struct
> >>>>>>>> sock. The bpf prog does not have to start from 0 (e.g. start from U32_MAX
> >>>>>>>> instead) if it helps.
> >>>>>>>>
> >>>>>>>> If the kernel passed tskey is not 0, the bpf prog can just use that one
> >>>>>>>> (assuming the user space is doing something sane, like the value in
> >>>>>>>> SCM_TS_OPT_ID won't be jumping back and front between 0 to U32_MAX). I hope this
> >>>>>>>> is very unlikely also (?) but the bpf prog can probably detect this and choose
> >>>>>>>> to ignore this sk.
> >>>>>>> If an applications uses OPT_ID, it is unlikely that they will toggle
> >>>>>>> the feature on and off on a per-packet basis. So in the common case
> >>>>>>> the program could use the user-set counter or use its own if userspace
> >>>>>>> does not enable the feature. In the rare case that an application does
> >>>>>>> intermittently set an OPT_ID, the numbering would be erratic. This
> >>>>>>> does mean that an actively malicious application could mess with admin
> >>>>>>> measurements.
> >>>>>>
> >>>>>> All make sense. Given it is reasonable to assume the user space should either
> >>>>>> has SOF_TIMESTAMPING_OPT_ID always on or always off. When it is off, the bpf
> >>>>>> prog can directly provide its own tskey to be used in shinfo->tskey. The bpf
> >>>>>> prog can generate the id itself without using the sk->sk_tskey, e.g. store an
> >>>>>> atomic int in the bpf_sk_storage.
> >>>>>
> >>>>> I wonder, how can we correlate the key with each skb in the bpf
> >>>>> program for non-TCP type without implementing a bpf extension for
> >>>>> SCM_TS_OPT_ID? Every time the timestamp is reported, we cannot know
> >>>>> which sendmsg() the skb belongs to for non-TCP cases.
> >>>>
> >>>> SCM_TS_OPT_ID is eventually setting the shinfo->tskey.
> >>>> If the shinfo->tskey is not set by the user space, the bpf prog can directly set
> >>>> the shinfo->tskey. There is no need to use the sk->sk_tskey as the ID generator
> >>>> also. The bpf prog can have its own id generator.
> >>>>
> >>>> If the user space has already set the shinfo->tskey (either by sk->sk_tskey or
> >>>> SCM_TS_OPT_ID), the bpf prog can just use the user space one.
> >>>>
> >>>> If there is a weird application that flips flops between OPT_ID on/off, the bpf
> >>>> prog will get confused which is fine. The bpf prog can detect this and choose to
> >>>> ignore measuring this sk/skb. The bpf prog can also choose to be on the very
> >>>> safe side and ignore all skb with SKBTX_ANY_TSTAMP set in txflags but with no
> >>>> OPT_ID. The bpf prog can look into the details of the sk and skb to decide what
> >>>> makes the most sense for its deployment.
> >>>>
> >>>> I don't know whether it makes more sense to call the bpf prog to decide the
> >>>> shinfo->{tx_flags,tskey} just before the "while (length > 0)" in
> >>>> __ip[6]_append_data or it is better to call the bpf prog in ip[6]_setup_cork.
> >>>> I admittedly less familiar with this code path than the tcp one.
> >>>
> >>> Now I feel it could be complicated for a software engineer to consider
> >>> how they will handle the key if they don't read the kernel code very
> >>> carefully. They are facing different situations. Being user-friendly
> >>> lets this feature have more chances to get widely used. As I insisted
> >>> before, I still would like to know if it is possible that we can try
> >>> to introduce sk_tskey_bpf_offset (like patch 10-12) to calculate a bpf
> >>> exclusive tskey for bpf use? Only exporting one key. It will be really
> >>> simple and easy-to-use :)
> >>
> >> imo, there is no need for adding sk_tskey_bpf_offset to sk. just allow the bpf
> >> prog to decide what is the tskey.
> >>
> >> There is no usability issue in bpf prog. It is pretty normal for a bpf prog
> >> author to look at the sk details to make decision.
> >>
> >> Abstracting the sk/skb is not helping the bpf prog and not the right direction
> >> to go. Over time, there has been case over case that the bpf prog wants to know
> >> more instead of being abstracted away like running in the user space. e.g. The
> >> "struct bpf_sock" abstraction in the uapi/linux/bpf.h does not scale and we have
> >> stopped adding more abstraction this way. The btf (and PTR_TO_BTF_ID,
> >> CO-RE...etc) has been added to allow the bpf prog to learn other details in sk
> >> and skb.
> >>
> >> Instead, design a better bpf kfunc to help the bpf prog to set the bits/tskey in
> >> the skb. I think this is more important. tcp tskey is easy. just need some care
> >> on the udp tskey and need to check if the user space has already set one.
> >> A good designed bpf kfunc is all it needs.
> >
> > Thanks!
> >
> > Let me confirm again in case I'm missing something important.
> > 1) For tcp, as you said before, bpf prog can extract the seq from the
> > exported skb, so I don't need to export any key in this case.
> > 2) For udp, if the skb has skb_shinfo(skb)->tskey set, then export the
> > key, else, export zero to the bpf program.
>
> A follow up to myself on the earlier bpf kfunc comment. Something like this:

Thank you so much!

>
> /* ack: request ACK timestamp (tcp only)
>   * req_tskey: bpf prog can request to use a particular tskey.
>   *            req_tskey should always be 0 for tcp.
>   * return: -ve for error. u32 for the tskey that the bpf prog should use.
>   *        may be different from the req_tskey (e.g. the user space has
>   *         already set one).
>   */
> __bpf_kfunc s64 bpf_skops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
>                                            bool ack, u32 req_tskey);
>
> /* "not sure" if this kfunc is needed. probably no. I think it is easier to pass
>   * true/false in the args[0]. It seems tskey can be 0 in udp, so

Good idea.

>   * passing tskey can't tell if the skb/cork/sockcm_cookie has the tskey.
>   */
> __bpf_kfunc bool bpf_skops_has_tskey(struct bpf_sock_ops_kern *skops);
>
> For udp, I don't know whether it will be easier to set the tskey in the 'cork'
> or 'sockcm_cookie' or 'skb'. I guess it depends where the bpf prog is called. If
> skb, it seems the bpf prog may be called repetitively for doing the same thing
> in the while loop in __ip[6]_append_data. If it is better to set the 'cork' or
> 'sockcm_cookie', the cork/sockcm_cookie pointer can be added to 'struct
> bpf_sock_ops_kern'. The sizeof(struct bpf_sock_ops_kern) is at 64bytes. Adding
> one pointer is not ideal.... probably it can be union with syn_skb but will need
> some code audit (so please check).

Let me dig into it :)

>
>
> > 3) extend SCM_TS_OPT_ID for the udp/bpf case.
>
> I don't understand. What does it mean to extend SCM_TS_OPT_ID?

Oh, I thought you expect to pass the key from the bpf program through
using the interface of SCM_TS_OPT_ID feature which isn't supported by
bpf. Let me think more about it first.

Thanks,
Jason
Martin KaFai Lau Nov. 7, 2024, 7:05 p.m. UTC | #36
On 11/6/24 7:31 PM, Jason Xing wrote:
>> /* ack: request ACK timestamp (tcp only)
>>    * req_tskey: bpf prog can request to use a particular tskey.
>>    *            req_tskey should always be 0 for tcp.
>>    * return: -ve for error. u32 for the tskey that the bpf prog should use.
>>    *        may be different from the req_tskey (e.g. the user space has
>>    *         already set one).
>>    */
>> __bpf_kfunc s64 bpf_skops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
>>                                             bool ack, u32 req_tskey);
>>

>>
>> For udp, I don't know whether it will be easier to set the tskey in the 'cork'
>> or 'sockcm_cookie' or 'skb'. I guess it depends where the bpf prog is called. If
>> skb, it seems the bpf prog may be called repetitively for doing the same thing
>> in the while loop in __ip[6]_append_data. If it is better to set the 'cork' or
>> 'sockcm_cookie', the cork/sockcm_cookie pointer can be added to 'struct
>> bpf_sock_ops_kern'. The sizeof(struct bpf_sock_ops_kern) is at 64bytes. Adding
>> one pointer is not ideal.... probably it can be union with syn_skb but will need
>> some code audit (so please check).

>>
>>
>>> 3) extend SCM_TS_OPT_ID for the udp/bpf case.
>>
>> I don't understand. What does it mean to extend SCM_TS_OPT_ID?
> 
> Oh, I thought you expect to pass the key from the bpf program through
> using the interface of SCM_TS_OPT_ID feature which isn't supported by
> bpf. Let me think more about it first.

I still don't understand the SCM_TS_OPT_ID part but no I don't mean that.

The bpf prog uses the kfunc to directly set the tskey (and tx_flags) in 
skb/cork/sockcm_cookie. The name here for the tskey and tx_flags may be 
different based on if it is skb/cork/sockcm_cookie..
diff mbox series

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 7464e9f9f47c..5384f1e49f5c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -445,6 +445,7 @@  struct sock {
 	u32			sk_reserved_mem;
 	int			sk_forward_alloc;
 	u32			sk_tsflags;
+	u32			sk_tsflags_bpf;
 	__cacheline_group_end(sock_write_rxtx);
 
 	__cacheline_group_begin(sock_write_tx);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1cf8416f4123..39309f75e105 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5539,6 +5539,32 @@  void skb_complete_tx_timestamp(struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
 
+/* This function is used to test if application SO_TIMESTAMPING feature
+ * or bpf SO_TIMESTAMPING feature is loaded by checking its own socket flags.
+ */
+static bool sk_tstamp_tx_flags(struct sock *sk, u32 tsflags, int tstype)
+{
+	u32 testflag;
+
+	switch (tstype) {
+	case SCM_TSTAMP_SCHED:
+		testflag = SOF_TIMESTAMPING_TX_SCHED;
+		break;
+	case SCM_TSTAMP_SND:
+		testflag = SOF_TIMESTAMPING_TX_SOFTWARE;
+		break;
+	case SCM_TSTAMP_ACK:
+		testflag = SOF_TIMESTAMPING_TX_ACK;
+		break;
+	default:
+		return false;
+	}
+	if (tsflags & testflag)
+		return true;
+
+	return false;
+}
+
 static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
 				 const struct sk_buff *ack_skb,
 				 struct skb_shared_hwtstamps *hwtstamps,
@@ -5549,6 +5575,9 @@  static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
 	u32 tsflags;
 
 	tsflags = READ_ONCE(sk->sk_tsflags);
+	if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
+		return;
+
 	if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
 	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
 		return;
@@ -5592,6 +5621,15 @@  static void skb_tstamp_tx_output(struct sk_buff *orig_skb,
 	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
 }
 
+static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype)
+{
+	u32 tsflags;
+
+	tsflags = READ_ONCE(sk->sk_tsflags_bpf);
+	if (!sk_tstamp_tx_flags(sk, tsflags, tstype))
+		return;
+}
+
 void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		     const struct sk_buff *ack_skb,
 		     struct skb_shared_hwtstamps *hwtstamps,
@@ -5600,6 +5638,7 @@  void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!sk)
 		return;
 
+	skb_tstamp_tx_output_bpf(sk, tstype);
 	skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
 }
 EXPORT_SYMBOL_GPL(__skb_tstamp_tx);