
[RFC,net-next] trace: tcp: Add tracepoint for tcp_cwnd_reduction()

Message ID 20250120-cwnd_tracepoint-v1-1-36b0e0d643fa@debian.org
State New

Commit Message

Breno Leitao Jan. 20, 2025, 12:02 p.m. UTC
Add a tracepoint to monitor TCP congestion window adjustments through the
tcp_cwnd_reduction() function. This tracepoint helps track:
  - TCP window size fluctuations
  - Active socket behavior
  - Congestion window reduction events

Meta has been using BPF programs to monitor this function for years. By
adding a proper tracepoint, we provide a stable API for all users who
need to monitor TCP congestion window behavior.

The tracepoint captures:
  - Socket source and destination IPs
  - Number of newly acknowledged packets
  - Number of newly lost packets
  - Packets in flight

Here is an example of a tracepoint when viewed in the trace buffer:

tcp_cwnd_reduction: src=[2401:db00:3021:10e1:face:0:32a:0]:45904 dest=[2401:db00:3021:1fb:face:0:23:0]:5201 newly_lost=0 newly_acked_sacked=27 in_flight=34
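
As an illustration (not part of the patch), here is a minimal BPF-side
sketch of how a monitor might attach to the proposed tracepoint,
assuming a libbpf skeleton with vmlinux.h and a BTF-enabled kernel; the
map and handler names are made up for the example:

// SPDX-License-Identifier: GPL-2.0
/* Count cwnd-reduction events per remote port via the new tracepoint. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1024);
        __type(key, __u16);   /* remote (destination) port */
        __type(value, __u64); /* number of reduction events */
} reductions SEC(".maps");

SEC("tp_btf/tcp_cwnd_reduction")
int BPF_PROG(on_cwnd_reduction, const struct sock *sk,
             int newly_acked_sacked, int newly_lost, int flag)
{
        __u16 dport = bpf_ntohs(BPF_CORE_READ(sk, __sk_common.skc_dport));
        __u64 one = 1, *val;

        val = bpf_map_lookup_elem(&reductions, &dport);
        if (val)
                __sync_fetch_and_add(val, 1);
        else
                bpf_map_update_elem(&reductions, &dport, &one, BPF_ANY);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";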

CC: Yonghong Song <yonghong.song@linux.dev>
CC: Song Liu <song@kernel.org>
CC: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
 include/trace/events/tcp.h | 34 ++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       |  2 ++
 2 files changed, 36 insertions(+)


---
base-commit: 96e12defe5a8fa3f3a10e3ef1d20fee503245a10
change-id: 20250120-cwnd_tracepoint-2e11c996a9cb

Best regards,

Comments

Jason Xing Jan. 20, 2025, 12:08 p.m. UTC | #1
On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
>
> Add a tracepoint to monitor TCP congestion window adjustments through the
> tcp_cwnd_reduction() function. This tracepoint helps track:
>   - TCP window size fluctuations
>   - Active socket behavior
>   - Congestion window reduction events
>
> Meta has been using BPF programs to monitor this function for years. By
> adding a proper tracepoint, we provide a stable API for all users who
> need to monitor TCP congestion window behavior.
>
> The tracepoint captures:
>   - Socket source and destination IPs
>   - Number of newly acknowledged packets
>   - Number of newly lost packets
>   - Packets in flight
>
> Here is an example of a tracepoint when viewed in the trace buffer:
>
> tcp_cwnd_reduction: src=[2401:db00:3021:10e1:face:0:32a:0]:45904 dest=[2401:db00:3021:1fb:face:0:23:0]:5201 newly_lost=0 newly_acked_sacked=27 in_flight=34
>
> CC: Yonghong Song <yonghong.song@linux.dev>
> CC: Song Liu <song@kernel.org>
> CC: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  include/trace/events/tcp.h | 34 ++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_input.c       |  2 ++
>  2 files changed, 36 insertions(+)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index a27c4b619dffd7dcc72fffa71bf0fd5e34fe6681..b3a636658b39721cca843c0000eaa573cf4d09d5 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -259,6 +259,40 @@ TRACE_EVENT(tcp_retransmit_synack,
>                   __entry->saddr_v6, __entry->daddr_v6)
>  );
>
> +TRACE_EVENT(tcp_cwnd_reduction,
> +
> +       TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
> +                const int newly_lost, const int flag),
> +
> +       TP_ARGS(sk, newly_acked_sacked, newly_lost, flag),
> +
> +       TP_STRUCT__entry(
> +               __array(__u8, saddr, sizeof(struct sockaddr_in6))
> +               __array(__u8, daddr, sizeof(struct sockaddr_in6))
> +               __field(int, in_flight)
> +
> +               __field(int, newly_acked_sacked)
> +               __field(int, newly_lost)
> +       ),
> +
> +       TP_fast_assign(
> +               const struct inet_sock *inet = inet_sk(sk);
> +               const struct tcp_sock *tp = tcp_sk(sk);
> +
> +               memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
> +               memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
> +
> +               TP_STORE_ADDR_PORTS(__entry, inet, sk);
> +               __entry->in_flight = tcp_packets_in_flight(tp);
> +               __entry->newly_lost = newly_lost;
> +               __entry->newly_acked_sacked = newly_acked_sacked;
> +       ),
> +
> +       TP_printk("src=%pISpc dest=%pISpc newly_lost=%d newly_acked_sacked=%d in_flight=%d",
> +                 __entry->saddr, __entry->daddr, __entry->newly_lost,
> +                 __entry->newly_acked_sacked, __entry->in_flight)
> +);
> +
>  #include <trace/events/net_probe_common.h>
>
>  TRACE_EVENT(tcp_probe,
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
>         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
>                 return;
>
> +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> +

Are there any other reasons why introducing a new tracepoint here?
AFAIK, it can be easily replaced by a bpf related program or script to
monitor in the above position.

Thanks,
Jason
Breno Leitao Jan. 20, 2025, 1:02 p.m. UTC | #2
Hello Jason,

On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> >                 return;
> >
> > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > +
> 
> Are there any other reasons why introducing a new tracepoint here?
> AFAIK, it can be easily replaced by a bpf related program or script to
> monitor in the above position.

In which position exactly?

Thank you,
--breno
Jason Xing Jan. 20, 2025, 1:06 p.m. UTC | #3
On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
>
> Hello Jason,
>
> On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > --- a/net/ipv4/tcp_input.c
> > > +++ b/net/ipv4/tcp_input.c
> > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > >                 return;
> > >
> > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > +
> >
> > Are there any other reasons why introducing a new tracepoint here?
> > AFAIK, it can be easily replaced by a bpf related program or script to
> > monitor in the above position.
>
> In which position exactly?

I meant, in the position where you insert a one-line tracepoint, which
should be easily replaced with a bpf program (kprobe
tcp_cwnd_reduction with two checks like in the earlier if-statement).
It doesn't mean that I object to this new tracepoint, just curious if
you have other motivations.

Thanks,
Jason
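
For context, a rough sketch of the kprobe-based approach Jason
describes (again assuming a libbpf skeleton with vmlinux.h; the handler
name is illustrative). It replicates the two early-return checks, and
unlike the proposed tracepoint it breaks if tcp_cwnd_reduction() is
renamed or inlined:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

SEC("kprobe/tcp_cwnd_reduction")
int BPF_KPROBE(on_cwnd_reduction, struct sock *sk, int newly_acked_sacked,
               int newly_lost, int flag)
{
        /* Same cast tcp_sk() performs in the kernel. */
        struct tcp_sock *tp = (struct tcp_sock *)sk;

        /* Replicate the checks that precede the proposed tracepoint. */
        if (newly_acked_sacked <= 0 || !BPF_CORE_READ(tp, prior_cwnd))
                return 0;

        bpf_printk("cwnd reduction: acked_sacked=%d lost=%d",
                   newly_acked_sacked, newly_lost);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";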
Breno Leitao Jan. 20, 2025, 1:20 p.m. UTC | #4
On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > --- a/net/ipv4/tcp_input.c
> > > > +++ b/net/ipv4/tcp_input.c
> > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > >                 return;
> > > >
> > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > +
> > >
> > > Are there any other reasons why introducing a new tracepoint here?
> > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > monitor in the above position.
> >
> > In which position exactly?
> 
> I meant, in the position where you insert a one-line tracepoint, which
> should be easily replaced with a bpf program (kprobe
> tcp_cwnd_reduction with two checks like in the earlier if-statement).
> It doesn't mean that I object to this new tracepoint, just curious if
> you have other motivations.

This is exactly the current implementation we have at Meta, as it relies on
hooking into this specific function. This approach is unstable, as
compiler optimizations like inlining can break the functionality.

This patch enhances the API's stability by introducing a guaranteed hook
point, allowing the compiler to make changes without disrupting the
BPF program's functionality.
Steven Rostedt Jan. 20, 2025, 3:03 p.m. UTC | #5
On Mon, 20 Jan 2025 05:20:05 -0800
Breno Leitao <leitao@debian.org> wrote:

> This patch enhances the API's stability by introducing a guaranteed hook
> point, allowing the compiler to make changes without disrupting the
> BPF program's functionality.

Instead of using a TRACE_EVENT() macro, you can use DECLARE_TRACE()
which will create the tracepoint in the kernel, but will not create a
trace event that is exported to the tracefs file system. Then BPF could
hook to it and it will still not be exposed as an user space API.

You can see its use in include/trace/events/sched.h

-- Steve
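
For reference, a minimal sketch of what Steve's DECLARE_TRACE()
suggestion could look like for this patch, reusing the arguments from
the posted TRACE_EVENT(); the _tp suffix mirrors the convention in
include/trace/events/sched.h and is an assumption, not something from
this thread:

/* Creates the tracepoint without exporting a tracefs event. */
DECLARE_TRACE(tcp_cwnd_reduction_tp,
        TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
                 const int newly_lost, const int flag),
        TP_ARGS(sk, newly_acked_sacked, newly_lost, flag));

/* The call in tcp_cwnd_reduction() would then become: */
trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);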
Jason Xing Jan. 21, 2025, 1:15 a.m. UTC | #6
On Mon, Jan 20, 2025 at 9:20 PM Breno Leitao <leitao@debian.org> wrote:
>
>
> On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> > On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > > --- a/net/ipv4/tcp_input.c
> > > > > +++ b/net/ipv4/tcp_input.c
> > > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > > >                 return;
> > > > >
> > > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > > +
> > > >
> > > > Are there any other reasons why introducing a new tracepoint here?
> > > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > > monitor in the above position.
> > >
> > > In which position exactly?
> >
> > I meant, in the position where you insert a one-line tracepoint, which
> > should be easily replaced with a bpf program (kprobe
> > tcp_cwnd_reduction with two checks like in the earlier if-statement).
> > It doesn't mean that I object to this new tracepoint, just curious if
> > you have other motivations.
>
> This is exactly the current implementation we have at Meta, as it relies on
> hooking into this specific function. This approach is unstable, as
> compiler optimizations like inlining can break the functionality.
>
> This patch enhances the API's stability by introducing a guaranteed hook
> point, allowing the compiler to make changes without disrupting the
> BPF program's functionality.

Surely it does :) The reason I asked is that perhaps a year ago I was
trying to add many tracepoints so that user-space monitors could take
advantage of these various stable hook points. I believe there are
many other places that might be inlined by gcc (I have received a few
similar reports in recent months), but there is no guarantee that such
functions will not be touched, which obviously breaks some monitoring
applications.

I'm wondering whether, beyond this new tracepoint, we can design a
common approach for monitoring programs, so that people who are
interested can work on this together. Profiling is becoming more and
more important nowadays, in my view.

Thanks,
Jason
Jason Xing Jan. 21, 2025, 1:22 a.m. UTC | #7
On Tue, Jan 21, 2025 at 9:15 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Mon, Jan 20, 2025 at 9:20 PM Breno Leitao <leitao@debian.org> wrote:
> >
> >
> > On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> > > On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > > > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > > > --- a/net/ipv4/tcp_input.c
> > > > > > +++ b/net/ipv4/tcp_input.c
> > > > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > > > >                 return;
> > > > > >
> > > > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > > > +
> > > > >
> > > > > Are there any other reasons why introducing a new tracepoint here?
> > > > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > > > monitor in the above position.
> > > >
> > > > In which position exactly?
> > >
> > > I meant, in the position where you insert a one-line tracepoint, which
> > > should be easily replaced with a bpf program (kprobe
> > > tcp_cwnd_reduction with two checks like in the earlier if-statement).
> > > It doesn't mean that I object to this new tracepoint, just curious if
> > > you have other motivations.
> >
> > This is exactly the current implementation we have at Meta, as it relies on
> > hooking into this specific function. This approach is unstable, as
> > compiler optimizations like inlining can break the functionality.
> >
> > This patch enhances the API's stability by introducing a guaranteed hook
> > point, allowing the compiler to make changes without disrupting the
> > BPF program's functionality.
>
> Surely it does :) The reason I asked is that perhaps a year ago I was
> trying to add many tracepoints so that user-space monitors could take
> advantage of these various stable hook points. I believe there are
> many other places that might be inlined by gcc (I have received a few
> similar reports in recent months), but there is no guarantee that such
> functions will not be touched, which obviously breaks some monitoring
> applications.

Before BPF was widely used, changing function names did not hurt
applications. Things are a little bit different now: renaming or
inlining a function forces monitoring applications to adjust their
code from version to version, because these functions are not part of
a stable kABI.

Thanks,
Jason

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index a27c4b619dffd7dcc72fffa71bf0fd5e34fe6681..b3a636658b39721cca843c0000eaa573cf4d09d5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -259,6 +259,40 @@  TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+TRACE_EVENT(tcp_cwnd_reduction,
+
+	TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
+		 const int newly_lost, const int flag),
+
+	TP_ARGS(sk, newly_acked_sacked, newly_lost, flag),
+
+	TP_STRUCT__entry(
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(int, in_flight)
+
+		__field(int, newly_acked_sacked)
+		__field(int, newly_lost)
+	),
+
+	TP_fast_assign(
+		const struct inet_sock *inet = inet_sk(sk);
+		const struct tcp_sock *tp = tcp_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+		__entry->in_flight = tcp_packets_in_flight(tp);
+		__entry->newly_lost = newly_lost;
+		__entry->newly_acked_sacked = newly_acked_sacked;
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc newly_lost=%d newly_acked_sacked=%d in_flight=%d",
+		  __entry->saddr, __entry->daddr, __entry->newly_lost,
+		  __entry->newly_acked_sacked, __entry->in_flight)
+);
+
 #include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(tcp_probe,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2710,6 +2710,8 @@  void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
 	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
 		return;
 
+	trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
+
 	tp->prr_delivered += newly_acked_sacked;
 	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +