diff mbox series

[net-next,v2,06/12] net-timestamp: introduce TS_SCHED_OPT_CB to generate dev xmit timestamp

Message ID 20241012040651.95616-7-kerneljasonxing@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net-timestamp: bpf extension to equip applications transparently | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 204 this patch: 204
netdev/build_tools success Errors and warnings before: 159 (+0) this patch: 159 (+0)
netdev/cc_maintainers warning 1 maintainers not CCed: horms@kernel.org
netdev/build_clang success Errors and warnings before: 247 this patch: 247
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6942 this patch: 6942
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 54 this patch: 54
netdev/source_inline success Was 0 now: 0

Commit Message

Jason Xing Oct. 12, 2024, 4:06 a.m. UTC
From: Jason Xing <kernelxing@tencent.com>

Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
print timestamps when the skb just passes the dev layer.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/uapi/linux/bpf.h       |  5 +++++
 net/core/skbuff.c              | 17 +++++++++++++++--
 tools/include/uapi/linux/bpf.h |  5 +++++
 3 files changed, 25 insertions(+), 2 deletions(-)

Comments

Martin KaFai Lau Oct. 16, 2024, 1:01 a.m. UTC | #1
On 10/11/24 9:06 PM, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
> 
> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> print timestamps when the skb just passes the dev layer.
> 
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
>   include/uapi/linux/bpf.h       |  5 +++++
>   net/core/skbuff.c              | 17 +++++++++++++++--
>   tools/include/uapi/linux/bpf.h |  5 +++++
>   3 files changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 157e139ed6fc..3cf3c9c896c7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7019,6 +7019,11 @@ enum {
>   					 * by the kernel or the
>   					 * earlier bpf-progs.
>   					 */
> +	BPF_SOCK_OPS_TS_SCHED_OPT_CB,	/* Called when skb is passing through
> +					 * dev layer when SO_TIMESTAMPING
> +					 * feature is on. It indicates the
> +					 * recorded timestamp.
> +					 */
>   };
>   
>   /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 3a4110d0f983..16e7bdc1eacb 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
>   		return;
>   
>   	tp = tcp_sk(sk);
> -	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> -		return;
> +	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> +		struct timespec64 tstamp;
> +		u32 cb_flag;
> +
> +		switch (tstype) {
> +		case SCM_TSTAMP_SCHED:
> +			cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> +			break;
> +		default:
> +			return;
> +		}
> +
> +		tstamp = ktime_to_timespec64(ktime_get_real());
> +		tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);

There is bpf_ktime_get_*() helper. The bpf prog can directly call the 
bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing 
real clock here and doing an extra ktime_to_timespec64. Right now the 
bpf_ktime_get_*() does not have real clock which I think it can be added.

I think overall the tstamp reporting interface does not necessarily have to 
follow the socket API. The bpf prog is running in the kernel. It could pass 
other information to the bpf prog if it sees fit. e.g. the bpf prog could also 
get the original transmitted tcp skb if it is useful.

> +	}
>   }
>   
>   void __skb_tstamp_tx(struct sk_buff *orig_skb,
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 93853d9d4922..d60675e1a5a0 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7018,6 +7018,11 @@ enum {
>   					 * by the kernel or the
>   					 * earlier bpf-progs.
>   					 */
> +	BPF_SOCK_OPS_TS_SCHED_OPT_CB,	/* Called when skb is passing through
> +					 * dev layer when SO_TIMESTAMPING
> +					 * feature is on. It indicates the
> +					 * recorded timestamp.
> +					 */
>   };
>   
>   /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
Jason Xing Oct. 16, 2024, 1:24 a.m. UTC | #2
On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/11/24 9:06 PM, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> > print timestamps when the skb just passes the dev layer.
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> >   include/uapi/linux/bpf.h       |  5 +++++
> >   net/core/skbuff.c              | 17 +++++++++++++++--
> >   tools/include/uapi/linux/bpf.h |  5 +++++
> >   3 files changed, 25 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 157e139ed6fc..3cf3c9c896c7 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -7019,6 +7019,11 @@ enum {
> >                                        * by the kernel or the
> >                                        * earlier bpf-progs.
> >                                        */
> > +     BPF_SOCK_OPS_TS_SCHED_OPT_CB,   /* Called when skb is passing through
> > +                                      * dev layer when SO_TIMESTAMPING
> > +                                      * feature is on. It indicates the
> > +                                      * recorded timestamp.
> > +                                      */
> >   };
> >
> >   /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 3a4110d0f983..16e7bdc1eacb 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> >               return;
> >
> >       tp = tcp_sk(sk);
> > -     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> > -             return;
> > +     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> > +             struct timespec64 tstamp;
> > +             u32 cb_flag;
> > +
> > +             switch (tstype) {
> > +             case SCM_TSTAMP_SCHED:
> > +                     cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> > +                     break;
> > +             default:
> > +                     return;
> > +             }
> > +
> > +             tstamp = ktime_to_timespec64(ktime_get_real());
> > +             tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
>
> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
> real clock here and doing an extra ktime_to_timespec64. Right now the
> bpf_ktime_get_*() does not have real clock which I think it can be added.

In this way, there is no need to add tcp_call_bpf_*arg() to pass
timestamp to userspace, right? Let the bpf program implement it.

Now I wonder what information I should pass? Sorry for the lack of BPF
related knowledge :(

>
> I think overall the tstamp reporting interface does not necessarily have to
> follow the socket API. The bpf prog is running in the kernel. It could pass
> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
> get the original transmitted tcp skb if it is useful.

Good to know that! But how the BPF program parses the skb by using
tcp_call_bpf_2arg() which only passes u32 parameters.

Thanks,
Jason
Martin KaFai Lau Oct. 16, 2024, 5:35 a.m. UTC | #3
On 10/15/24 6:24 PM, Jason Xing wrote:
> On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/11/24 9:06 PM, Jason Xing wrote:
>>> From: Jason Xing <kernelxing@tencent.com>
>>>
>>> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
>>> print timestamps when the skb just passes the dev layer.
>>>
>>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
>>> ---
>>>    include/uapi/linux/bpf.h       |  5 +++++
>>>    net/core/skbuff.c              | 17 +++++++++++++++--
>>>    tools/include/uapi/linux/bpf.h |  5 +++++
>>>    3 files changed, 25 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 157e139ed6fc..3cf3c9c896c7 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -7019,6 +7019,11 @@ enum {
>>>                                         * by the kernel or the
>>>                                         * earlier bpf-progs.
>>>                                         */
>>> +     BPF_SOCK_OPS_TS_SCHED_OPT_CB,   /* Called when skb is passing through
>>> +                                      * dev layer when SO_TIMESTAMPING
>>> +                                      * feature is on. It indicates the
>>> +                                      * recorded timestamp.
>>> +                                      */
>>>    };
>>>
>>>    /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index 3a4110d0f983..16e7bdc1eacb 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
>>>                return;
>>>
>>>        tp = tcp_sk(sk);
>>> -     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
>>> -             return;
>>> +     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
>>> +             struct timespec64 tstamp;
>>> +             u32 cb_flag;
>>> +
>>> +             switch (tstype) {
>>> +             case SCM_TSTAMP_SCHED:
>>> +                     cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
>>> +                     break;
>>> +             default:
>>> +                     return;
>>> +             }
>>> +
>>> +             tstamp = ktime_to_timespec64(ktime_get_real());
>>> +             tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
>>
>> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
>> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
>> real clock here and doing an extra ktime_to_timespec64. Right now the
>> bpf_ktime_get_*() does not have real clock which I think it can be added.
> 
> In this way, there is no need to add tcp_call_bpf_*arg() to pass
> timestamp to userspace, right? Let the bpf program implement it.
> 
> Now I wonder what information I should pass? Sorry for the lack of BPF
> related knowledge :(

Just pass the cb_flag op in this case.

A bpf selftest is missing in this series to show how it is going to be used. 
Yes, there are existing socket API tests on time stamping but I believe this 
discussion has already shown some subtle differences that warrant a closer to 
real world bpf prog example first.

> 
>>
>> I think overall the tstamp reporting interface does not necessarily have to
>> follow the socket API. The bpf prog is running in the kernel. It could pass
>> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
>> get the original transmitted tcp skb if it is useful.
> 
> Good to know that! But how the BPF program parses the skb by using
> tcp_call_bpf_2arg() which only passes u32 parameters.

"struct skbuff *skb" has already been added to "struct bpf_sock_ops_kern". It is 
only assigned during the "BPF_SOCK_OPS_PARSE_*HDR_CB". It is not exposed 
directly to bpf prog but it could be. However, it may need to change some 
convert_ctx code in filter.c which I am not excited about. We haven't added 
convert_ctx changes for a while since it is the old way.

Together with the "u32	bpf_sock_ops_cb_flags;" change in patch 9 which is only 
for tcp_sock and other _CB flags are also tcp specific only. For now, I am not 
sure carrying this sockops to the future UDP support is desired.

Take a look at tcp_call_bpf(). It needs to initialize the whole "struct 
bpf_sock_ops_kern" regardless of what the bpf prog is needed before calling the 
bpf prog. The "u32 args[4]" is one of them. The is the older way of using bpf to 
extend kernel.

bpf has struct_ops support now which can pass only what is needed and without 
the need of doing the convert_ctx in filter.c. The "struct tcp_congestion_ops" 
can already be implemented in bpf. Take a look at 
selftests/bpf/progs/bpf_cubic.c. All the BPF_SOCK_OPS_*_CB (e.g. 
BPF_SOCK_OPS_TS_SCHED_OPT_CB here) could just a "ops" in the struct_ops.

That said, I think the first thing needs to figure out is how to enable bpf time 
stamping without having side effect on the user space. Continue the sockops 
approach first and use it to create a selftest bpf prog example. Then we can decide.
Jason Xing Oct. 16, 2024, 6:08 a.m. UTC | #4
On Wed, Oct 16, 2024 at 1:35 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/15/24 6:24 PM, Jason Xing wrote:
> > On Wed, Oct 16, 2024 at 9:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/11/24 9:06 PM, Jason Xing wrote:
> >>> From: Jason Xing <kernelxing@tencent.com>
> >>>
> >>> Introduce BPF_SOCK_OPS_TS_SCHED_OPT_CB flag so that we can decide to
> >>> print timestamps when the skb just passes the dev layer.
> >>>
> >>> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> >>> ---
> >>>    include/uapi/linux/bpf.h       |  5 +++++
> >>>    net/core/skbuff.c              | 17 +++++++++++++++--
> >>>    tools/include/uapi/linux/bpf.h |  5 +++++
> >>>    3 files changed, 25 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>> index 157e139ed6fc..3cf3c9c896c7 100644
> >>> --- a/include/uapi/linux/bpf.h
> >>> +++ b/include/uapi/linux/bpf.h
> >>> @@ -7019,6 +7019,11 @@ enum {
> >>>                                         * by the kernel or the
> >>>                                         * earlier bpf-progs.
> >>>                                         */
> >>> +     BPF_SOCK_OPS_TS_SCHED_OPT_CB,   /* Called when skb is passing through
> >>> +                                      * dev layer when SO_TIMESTAMPING
> >>> +                                      * feature is on. It indicates the
> >>> +                                      * recorded timestamp.
> >>> +                                      */
> >>>    };
> >>>
> >>>    /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> >>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>> index 3a4110d0f983..16e7bdc1eacb 100644
> >>> --- a/net/core/skbuff.c
> >>> +++ b/net/core/skbuff.c
> >>> @@ -5632,8 +5632,21 @@ static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
> >>>                return;
> >>>
> >>>        tp = tcp_sk(sk);
> >>> -     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
> >>> -             return;
> >>> +     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
> >>> +             struct timespec64 tstamp;
> >>> +             u32 cb_flag;
> >>> +
> >>> +             switch (tstype) {
> >>> +             case SCM_TSTAMP_SCHED:
> >>> +                     cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
> >>> +                     break;
> >>> +             default:
> >>> +                     return;
> >>> +             }
> >>> +
> >>> +             tstamp = ktime_to_timespec64(ktime_get_real());
> >>> +             tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
> >>
> >> There is bpf_ktime_get_*() helper. The bpf prog can directly call the
> >> bpf_ktime_get_* helper and use whatever clock it sees fit instead of enforcing
> >> real clock here and doing an extra ktime_to_timespec64. Right now the
> >> bpf_ktime_get_*() does not have real clock which I think it can be added.
> >
> > In this way, there is no need to add tcp_call_bpf_*arg() to pass
> > timestamp to userspace, right? Let the bpf program implement it.
> >
> > Now I wonder what information I should pass? Sorry for the lack of BPF
> > related knowledge :(
>
> Just pass the cb_flag op in this case.

I see. I saw one example just passing a NULL parameter:
tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);.

>
> A bpf selftest is missing in this series to show how it is going to be used.

Sorry, I didn't implement a standard selftest, but I wrote a full BPF
program in patch[0/12]. I planned to write a selftests after every
expert agrees the current approach.

> Yes, there are existing socket API tests on time stamping but I believe this
> discussion has already shown some subtle differences that warrant a closer to
> real world bpf prog example first.
>
> >
> >>
> >> I think overall the tstamp reporting interface does not necessarily have to
> >> follow the socket API. The bpf prog is running in the kernel. It could pass
> >> other information to the bpf prog if it sees fit. e.g. the bpf prog could also
> >> get the original transmitted tcp skb if it is useful.
> >
> > Good to know that! But how the BPF program parses the skb by using
> > tcp_call_bpf_2arg() which only passes u32 parameters.
>
> "struct skbuff *skb" has already been added to "struct bpf_sock_ops_kern". It is
> only assigned during the "BPF_SOCK_OPS_PARSE_*HDR_CB". It is not exposed
> directly to bpf prog but it could be. However, it may need to change some
> convert_ctx code in filter.c which I am not excited about. We haven't added
> convert_ctx changes for a while since it is the old way.
>
> Together with the "u32  bpf_sock_ops_cb_flags;" change in patch 9 which is only
> for tcp_sock and other _CB flags are also tcp specific only. For now, I am not

Right, the first move I made is to make TCP work.

> sure carrying this sockops to the future UDP support is desired.

I hope so. But it's not an urgent thing that needs to be done recently.

>
> Take a look at tcp_call_bpf(). It needs to initialize the whole "struct
> bpf_sock_ops_kern" regardless of what the bpf prog is needed before calling the
> bpf prog. The "u32 args[4]" is one of them. The is the older way of using bpf to
> extend kernel.

I see.

>
> bpf has struct_ops support now which can pass only what is needed and without
> the need of doing the convert_ctx in filter.c. The "struct tcp_congestion_ops"
> can already be implemented in bpf. Take a look at
> selftests/bpf/progs/bpf_cubic.c. All the BPF_SOCK_OPS_*_CB (e.g.
> BPF_SOCK_OPS_TS_SCHED_OPT_CB here) could just a "ops" in the struct_ops.

Interesting, but it seems this way is much more complex than the
current approach.

>
> That said, I think the first thing needs to figure out is how to enable bpf time
> stamping without having side effect on the user space.

In the next version, I will avoid affecting the cmsg case, so no more
side effects I think.

> Continue the sockops approach first

I'm a little hesitant to do so because it looks like we will introduce
more codes. Please let me investigate more :)

> and use it to create a selftest bpf prog example. Then we can decide.

I copy the BPF program from patch [0/12], please take a look and help
me review this:
---
Here is the test output:
1) receive path
iperf3-987305  [008] ...11 179955.200990: bpf_trace_printk: rx: port:
5201:55192, swtimestamp: 1728167973,670426346, hwtimestamp: 0,0
2) xmit path
iperf3-19765   [013] ...11  2021.329602: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436678584
iperf3-19765   [013] b..11  2021.329611: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436689976
iperf3-19765   [013] ...11  2021.329622: bpf_trace_printk: tx: port:
47528:5201, key: 1036, timestamp: 1728357067,436700739

Here is the full bpf program:
#include <linux/bpf.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <uapi/linux/net_tstamp.h>

int _version SEC("version") = 1;
char _license[] SEC("license") = "GPL";

# define SO_TIMESTAMPING         37

__section("sockops")
int set_initial_rto(struct bpf_sock_ops *skops)
{
        int op = (int) skops->op;
        u32 sport = 0, dport = 0;
        int flags;

        switch (op) {
        //case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
        case BPF_SOCK_OPS_TCP_CONNECT_CB:
        case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                flags = SOF_TIMESTAMPING_RX_SOFTWARE |
                        SOF_TIMESTAMPING_TX_SCHED |
SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_TX_SOFTWARE |
                        SOF_TIMESTAMPING_OPT_ID | SOF_TIMESTAMPING_OPT_ID_TCP;
                bpf_setsockopt(skops, SOL_SOCKET, SO_TIMESTAMPING,
&flags, sizeof(flags));
                bpf_sock_ops_cb_flags_set(skops,
BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG|BPF_SOCK_OPS_RX_TIMESTAMPING_OPT_CB_FLAG);
                break;
        case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
        case BPF_SOCK_OPS_TS_SW_OPT_CB:
        case BPF_SOCK_OPS_TS_ACK_OPT_CB:
                dport = bpf_ntohl(skops->remote_port);
                sport = skops->local_port;
                bpf_printk("tx: port: %u:%u, key: %u, timestamp: %u,%u\n",
                            sport, dport, skops->args[0],
skops->args[1], skops->args[2]);
                break;
        case BPF_SOCK_OPS_TS_RX_OPT_CB:
                dport = bpf_ntohl(skops->remote_port);
                sport = skops->local_port;
                bpf_printk("rx: port: %u:%u, swtimestamp: %u,%u,
hwtimestamp: %u,%u\n",
                           sport, dport, skops->args[0],
skops->args[1], skops->args[2], skops->args[3]);
                break;
        }
        return 1;
}
---

What is your opinion on the above?

Thanks,
Jason
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 157e139ed6fc..3cf3c9c896c7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7019,6 +7019,11 @@  enum {
 					 * by the kernel or the
 					 * earlier bpf-progs.
 					 */
+	BPF_SOCK_OPS_TS_SCHED_OPT_CB,	/* Called when skb is passing through
+					 * dev layer when SO_TIMESTAMPING
+					 * feature is on. It indicates the
+					 * recorded timestamp.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a4110d0f983..16e7bdc1eacb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5632,8 +5632,21 @@  static void bpf_skb_tstamp_tx_output(struct sock *sk, int tstype)
 		return;
 
 	tp = tcp_sk(sk);
-	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG))
-		return;
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_TX_TIMESTAMPING_OPT_CB_FLAG)) {
+		struct timespec64 tstamp;
+		u32 cb_flag;
+
+		switch (tstype) {
+		case SCM_TSTAMP_SCHED:
+			cb_flag = BPF_SOCK_OPS_TS_SCHED_OPT_CB;
+			break;
+		default:
+			return;
+		}
+
+		tstamp = ktime_to_timespec64(ktime_get_real());
+		tcp_call_bpf_2arg(sk, cb_flag, tstamp.tv_sec, tstamp.tv_nsec);
+	}
 }
 
 void __skb_tstamp_tx(struct sk_buff *orig_skb,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 93853d9d4922..d60675e1a5a0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7018,6 +7018,11 @@  enum {
 					 * by the kernel or the
 					 * earlier bpf-progs.
 					 */
+	BPF_SOCK_OPS_TS_SCHED_OPT_CB,	/* Called when skb is passing through
+					 * dev layer when SO_TIMESTAMPING
+					 * feature is on. It indicates the
+					 * recorded timestamp.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect