diff mbox series

[net-next,2/2] trace: tcp: fully support trace_tcp_send_reset

Message ID 20240311024104.67522-3-kerneljasonxing@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series tcp: make trace of reset logic complete | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 964 this patch: 964
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 959 this patch: 959
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 980 this patch: 980
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Lines should not end with a '(' WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest warning net-next-2024-03-11--21-00 (tests: 883)

Commit Message

Jason Xing March 11, 2024, 2:41 a.m. UTC
From: Jason Xing <kernelxing@tencent.com>

Prior to this patch, what we can see by enabling trace_tcp_send is
only happening under two circumstances:
1) active rst mode
2) non-active rst mode and based on the full socket

That means the inconsistency occurs if we use tcpdump and trace
simultaneously to see how rst happens.

It's necessary that we should take into other cases into considerations,
say:
1) time-wait socket
2) no socket
...

By parsing the incoming skb and reversing its 4-turple can
we know the exact 'flow' which might not exist.

Samples after applied this patch:
1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
state=TCP_ESTABLISHED
2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
state=UNKNOWN
Note:
1) UNKNOWN means we cannot extract the right information from skb.
2) skbaddr/skaddr could be 0

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c        |  4 ++--
 net/ipv6/tcp_ipv6.c        |  3 ++-
 3 files changed, 41 insertions(+), 5 deletions(-)

Comments

Ratheesh Kannoth March 11, 2024, 3:27 a.m. UTC | #1
On 2024-03-11 at 08:11:04, Jason Xing (kerneljasonxing@gmail.com) wrote:
> From: Jason Xing <kernelxing@tencent.com>
>
> Prior to this patch, what we can see by enabling trace_tcp_send is
> only happening under two circumstances:
> 1) active rst mode
> 2) non-active rst mode and based on the full socket
>
> That means the inconsistency occurs if we use tcpdump and trace
> simultaneously to see how rst happens.
>
> It's necessary that we should take into other cases into considerations,
> say:
> 1) time-wait socket
> 2) no socket
> ...
>
> By parsing the incoming skb and reversing its 4-turple can
> we know the exact 'flow' which might not exist.
>
> Samples after applied this patch:
> 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
> state=TCP_ESTABLISHED
> 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
> state=UNKNOWN
> Note:
> 1) UNKNOWN means we cannot extract the right information from skb.
> 2) skbaddr/skaddr could be 0
>
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
>  include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++--
>  net/ipv4/tcp_ipv4.c        |  4 ++--
>  net/ipv6/tcp_ipv6.c        |  3 ++-
>  3 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 2495a1d579be..6c09d7941583 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -107,11 +107,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
>   * skb of trace_tcp_send_reset is the skb that caused RST. In case of
>   * active reset, skb should be NULL
>   */
> -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
> +TRACE_EVENT(tcp_send_reset,
>
>  	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
>
> -	TP_ARGS(sk, skb)
> +	TP_ARGS(sk, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skbaddr)
> +		__field(const void *, skaddr)
> +		__field(int, state)
> +		__array(__u8, saddr, sizeof(struct sockaddr_in6))
> +		__array(__u8, daddr, sizeof(struct sockaddr_in6))
> +	),
> +
> +	TP_fast_assign(
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +		/* Zero means unknown state. */
> +		__entry->state = sk ? sk->sk_state : 0;
> +
> +		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
> +		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
> +
> +		if (sk && sk_fullsock(sk)) {
> +			const struct inet_sock *inet = inet_sk(sk);
> +
> +			TP_STORE_ADDR_PORTS(__entry, inet, sk);
> +		} else {
> +			/*
> +			 * We should reverse the 4-turple of skb, so later
> +			 * it can print the right flow direction of rst.
> +			 */
> +			TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr);
> +		}
> +	),
> +
> +	TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
Could you consider using %px ? is it permitted ? it will be easy to track skb.

> +		  __entry->skbaddr, __entry->skaddr,
> +		  __entry->saddr, __entry->daddr,
> +		  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN")
>  );
>
>  /*
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a22ee5838751..d5c4a969c066 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	 */
>  	if (sk) {
>  		arg.bound_dev_if = sk->sk_bound_dev_if;
> -		if (sk_fullsock(sk))
> -			trace_tcp_send_reset(sk, skb);
>  	}
>
> +	trace_tcp_send_reset(sk, skb);
> +
>  	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
>  		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
>
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 3f4cba49e9ee..8e9c59b6c00c 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	if (sk) {
>  		oif = sk->sk_bound_dev_if;
>  		if (sk_fullsock(sk)) {
> -			trace_tcp_send_reset(sk, skb);
>  			if (inet6_test_bit(REPFLOW, sk))
>  				label = ip6_flowlabel(ipv6h);
>  			priority = READ_ONCE(sk->sk_priority);
> @@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
>  			label = ip6_flowlabel(ipv6h);
>  	}
>
> +	trace_tcp_send_reset(sk, skb);
> +
>  	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
>  			     ipv6_get_dsfield(ipv6h), label, priority, txhash,
>  			     &key);
> --
> 2.37.3
>
Jason Xing March 11, 2024, 5 a.m. UTC | #2
On Mon, Mar 11, 2024 at 11:27 AM Ratheesh Kannoth <rkannoth@marvell.com> wrote:
>
> On 2024-03-11 at 08:11:04, Jason Xing (kerneljasonxing@gmail.com) wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Prior to this patch, what we can see by enabling trace_tcp_send is
> > only happening under two circumstances:
> > 1) active rst mode
> > 2) non-active rst mode and based on the full socket
> >
> > That means the inconsistency occurs if we use tcpdump and trace
> > simultaneously to see how rst happens.
> >
> > It's necessary that we should take into other cases into considerations,
> > say:
> > 1) time-wait socket
> > 2) no socket
> > ...
> >
> > By parsing the incoming skb and reversing its 4-turple can
> > we know the exact 'flow' which might not exist.
> >
> > Samples after applied this patch:
> > 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
> > state=TCP_ESTABLISHED
> > 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
> > state=UNKNOWN
> > Note:
> > 1) UNKNOWN means we cannot extract the right information from skb.
> > 2) skbaddr/skaddr could be 0
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> >  include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++--
> >  net/ipv4/tcp_ipv4.c        |  4 ++--
> >  net/ipv6/tcp_ipv6.c        |  3 ++-
> >  3 files changed, 41 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> > index 2495a1d579be..6c09d7941583 100644
> > --- a/include/trace/events/tcp.h
> > +++ b/include/trace/events/tcp.h
> > @@ -107,11 +107,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
> >   * skb of trace_tcp_send_reset is the skb that caused RST. In case of
> >   * active reset, skb should be NULL
> >   */
> > -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
> > +TRACE_EVENT(tcp_send_reset,
> >
> >       TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> >
> > -     TP_ARGS(sk, skb)
> > +     TP_ARGS(sk, skb),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(const void *, skbaddr)
> > +             __field(const void *, skaddr)
> > +             __field(int, state)
> > +             __array(__u8, saddr, sizeof(struct sockaddr_in6))
> > +             __array(__u8, daddr, sizeof(struct sockaddr_in6))
> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->skbaddr = skb;
> > +             __entry->skaddr = sk;
> > +             /* Zero means unknown state. */
> > +             __entry->state = sk ? sk->sk_state : 0;
> > +
> > +             memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
> > +             memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
> > +
> > +             if (sk && sk_fullsock(sk)) {
> > +                     const struct inet_sock *inet = inet_sk(sk);
> > +
> > +                     TP_STORE_ADDR_PORTS(__entry, inet, sk);
> > +             } else {
> > +                     /*
> > +                      * We should reverse the 4-turple of skb, so later
> > +                      * it can print the right flow direction of rst.
> > +                      */
> > +                     TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr);
> > +             }
> > +     ),
> > +
> > +     TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
> Could you consider using %px ? is it permitted ? it will be easy to track skb.

I prefer not to use %px because we cannot make use of the real address
of skb. Besides, using %px would leak kernel addresses.

Here is the Documentation (see Documentation/core-api/printk-formats.rst):
"Pointers printed without a specifier extension (i.e unadorned %p) are
hashed to give a unique identifier without leaking kernel addresses to user
space."

Perhaps, that's the reason why all the tracepoints didn't print in %px format:)

Thanks,
Jason

>
> > +               __entry->skbaddr, __entry->skaddr,
> > +               __entry->saddr, __entry->daddr,
> > +               __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN")
> >  );
> >
> >  /*
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index a22ee5838751..d5c4a969c066 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
> >        */
> >       if (sk) {
> >               arg.bound_dev_if = sk->sk_bound_dev_if;
> > -             if (sk_fullsock(sk))
> > -                     trace_tcp_send_reset(sk, skb);
> >       }
> >
> > +     trace_tcp_send_reset(sk, skb);
> > +
> >       BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
> >                    offsetof(struct inet_timewait_sock, tw_bound_dev_if));
> >
> > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> > index 3f4cba49e9ee..8e9c59b6c00c 100644
> > --- a/net/ipv6/tcp_ipv6.c
> > +++ b/net/ipv6/tcp_ipv6.c
> > @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
> >       if (sk) {
> >               oif = sk->sk_bound_dev_if;
> >               if (sk_fullsock(sk)) {
> > -                     trace_tcp_send_reset(sk, skb);
> >                       if (inet6_test_bit(REPFLOW, sk))
> >                               label = ip6_flowlabel(ipv6h);
> >                       priority = READ_ONCE(sk->sk_priority);
> > @@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
> >                       label = ip6_flowlabel(ipv6h);
> >       }
> >
> > +     trace_tcp_send_reset(sk, skb);
> > +
> >       tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
> >                            ipv6_get_dsfield(ipv6h), label, priority, txhash,
> >                            &key);
> > --
> > 2.37.3
> >
Ratheesh Kannoth March 11, 2024, 5:09 a.m. UTC | #3
> From: Jason Xing <kerneljasonxing@gmail.com>
> Sent: Monday, March 11, 2024 10:30 AM
> To: Ratheesh Kannoth <rkannoth@marvell.com>
> Cc: edumazet@google.com; mhiramat@kernel.org;
> mathieu.desnoyers@efficios.com; rostedt@goodmis.org; kuba@kernel.org;
> pabeni@redhat.com; davem@davemloft.net; netdev@vger.kernel.org; linux-
> trace-kernel@vger.kernel.org; Jason Xing <kernelxing@tencent.com>
> Subject: [EXTERNAL] Re: [PATCH net-next 2/2] trace: tcp: fully support
> trace_tcp_send_reset
> 
> > > +     ),
> > > +
> > > +     TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc
> > > + state=%s",
> > Could you consider using %px ? is it permitted ? it will be easy to track skb.
> 
> I prefer not to use %px because we cannot make use of the real address of
> skb. Besides, using %px would leak kernel addresses.
> 
> Here is the Documentation (see Documentation/core-api/printk-formats.rst):
> "Pointers printed without a specifier extension (i.e unadorned %p) are
> hashed to give a unique identifier without leaking kernel addresses to user
> space."
> 
> Perhaps, that's the reason why all the tracepoints didn't print in %px format:)
> 
ACK, I understand that.  In offload cases, or in mostly in IOVA space, we often end up changing this locally to %px to debug and track skb. 
That is why I asked the question. 
 
> Thanks,
> Jason
> 
> >
> > > +               __entry->skbaddr, __entry->skaddr,
> > > +               __entry->saddr, __entry->daddr,
> > > +               __entry->state ? show_tcp_state_name(__entry->state)
> > > + : "UNKNOWN")
> > >  );
> > >
> > >  /*
> > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index
> > > a22ee5838751..d5c4a969c066 100644
> > > --- a/net/ipv4/tcp_ipv4.c
> > > +++ b/net/ipv4/tcp_ipv4.c
> > > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock
> *sk, struct sk_buff *skb)
> > >        */
> > >       if (sk) {
> > >               arg.bound_dev_if = sk->sk_bound_dev_if;
> > > -             if (sk_fullsock(sk))
> > > -                     trace_tcp_send_reset(sk, skb);
> > >       }
> > >
> > > +     trace_tcp_send_reset(sk, skb);
> > > +
> > >       BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
> > >                    offsetof(struct inet_timewait_sock,
> > > tw_bound_dev_if));
> > >
> > > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index
> > > 3f4cba49e9ee..8e9c59b6c00c 100644
> > > --- a/net/ipv6/tcp_ipv6.c
> > > +++ b/net/ipv6/tcp_ipv6.c
> > > @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock
> *sk, struct sk_buff *skb)
> > >       if (sk) {
> > >               oif = sk->sk_bound_dev_if;
> > >               if (sk_fullsock(sk)) {
> > > -                     trace_tcp_send_reset(sk, skb);
> > >                       if (inet6_test_bit(REPFLOW, sk))
> > >                               label = ip6_flowlabel(ipv6h);
> > >                       priority = READ_ONCE(sk->sk_priority); @@
> > > -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk,
> struct sk_buff *skb)
> > >                       label = ip6_flowlabel(ipv6h);
> > >       }
> > >
> > > +     trace_tcp_send_reset(sk, skb);
> > > +
> > >       tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
> > >                            ipv6_get_dsfield(ipv6h), label, priority, txhash,
> > >                            &key);
> > > --
> > > 2.37.3
> > >
diff mbox series

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 2495a1d579be..6c09d7941583 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -107,11 +107,46 @@  DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
-DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
+TRACE_EVENT(tcp_send_reset,
 
 	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
 
-	TP_ARGS(sk, skb)
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		__field(const void *, skbaddr)
+		__field(const void *, skaddr)
+		__field(int, state)
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->skaddr = sk;
+		/* Zero means unknown state. */
+		__entry->state = sk ? sk->sk_state : 0;
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		if (sk && sk_fullsock(sk)) {
+			const struct inet_sock *inet = inet_sk(sk);
+
+			TP_STORE_ADDR_PORTS(__entry, inet, sk);
+		} else {
+			/*
+			 * We should reverse the 4-turple of skb, so later
+			 * it can print the right flow direction of rst.
+			 */
+			TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr);
+		}
+	),
+
+	TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+		  __entry->skbaddr, __entry->skaddr,
+		  __entry->saddr, __entry->daddr,
+		  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN")
 );
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee5838751..d5c4a969c066 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -868,10 +868,10 @@  static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	 */
 	if (sk) {
 		arg.bound_dev_if = sk->sk_bound_dev_if;
-		if (sk_fullsock(sk))
-			trace_tcp_send_reset(sk, skb);
 	}
 
+	trace_tcp_send_reset(sk, skb);
+
 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3f4cba49e9ee..8e9c59b6c00c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1113,7 +1113,6 @@  static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
 		if (sk_fullsock(sk)) {
-			trace_tcp_send_reset(sk, skb);
 			if (inet6_test_bit(REPFLOW, sk))
 				label = ip6_flowlabel(ipv6h);
 			priority = READ_ONCE(sk->sk_priority);
@@ -1129,6 +1128,8 @@  static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 			label = ip6_flowlabel(ipv6h);
 	}
 
+	trace_tcp_send_reset(sk, skb);
+
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 			     ipv6_get_dsfield(ipv6h), label, priority, txhash,
 			     &key);