Message ID | 20240329034243.7929-3-kerneljasonxing@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | tcp: make trace of reset logic complete | expand |
On Fri, Mar 29, 2024 at 4:43 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > From: Jason Xing <kernelxing@tencent.com> > > Prior to this patch, what we can see by enabling trace_tcp_send is > only happening under two circumstances: > 1) active rst mode > 2) non-active rst mode and based on the full socket > > That means the inconsistency occurs if we use tcpdump and trace > simultaneously to see how rst happens. > > It's necessary that we should take into other cases into considerations, > say: > 1) time-wait socket > 2) no socket > ... > > By parsing the incoming skb and reversing its 4-tuple can > we know the exact 'flow' which might not exist. > > Samples after applied this patch: > 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port > state=TCP_ESTABLISHED > 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port > state=UNKNOWN > Note: > 1) UNKNOWN means we cannot extract the right information from skb. > 2) skbaddr/skaddr could be 0 > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > --- > include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++-- > net/ipv4/tcp_ipv4.c | 4 ++-- > net/ipv6/tcp_ipv6.c | 3 ++- > 3 files changed, 41 insertions(+), 5 deletions(-) > > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h > index 194425f69642..289438c54227 100644 > --- a/include/trace/events/tcp.h > +++ b/include/trace/events/tcp.h > @@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, > * skb of trace_tcp_send_reset is the skb that caused RST. In case of > * active reset, skb should be NULL > */ > -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, > +TRACE_EVENT(tcp_send_reset, > > TP_PROTO(const struct sock *sk, const struct sk_buff *skb), > > - TP_ARGS(sk, skb) > + TP_ARGS(sk, skb), > + > + TP_STRUCT__entry( > + __field(const void *, skbaddr) > + __field(const void *, skaddr) > + __field(int, state) > + __array(__u8, saddr, sizeof(struct sockaddr_in6)) > + __array(__u8, daddr, sizeof(struct sockaddr_in6)) > + ), > + > + TP_fast_assign( > + __entry->skbaddr = skb; > + __entry->skaddr = sk; > + /* Zero means unknown state. */ > + __entry->state = sk ? sk->sk_state : 0; > + > + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); > + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); > + > + if (sk && sk_fullsock(sk)) { > + const struct inet_sock *inet = inet_sk(sk); > + > + TP_STORE_ADDR_PORTS(__entry, inet, sk); > + } else { To be on the safe side, I would test if (skb) here. We have one caller with skb == NULL, we might have more in the future. > + /* > + * We should reverse the 4-tuple of skb, so later > + * it can print the right flow direction of rst. > + */ > + TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr); > + } > + ), > + > + TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s", > + __entry->skbaddr, __entry->skaddr, > + __entry->saddr, __entry->daddr, > + __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN") > ); > > /* > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index a22ee5838751..d5c4a969c066 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > */ > if (sk) { > arg.bound_dev_if = sk->sk_bound_dev_if; > - if (sk_fullsock(sk)) > - trace_tcp_send_reset(sk, skb); > } Remove the { } ? > > + trace_tcp_send_reset(sk, skb); > + > BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != > offsetof(struct inet_timewait_sock, tw_bound_dev_if)); > > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 3f4cba49e9ee..8e9c59b6c00c 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) > if (sk) { > oif = sk->sk_bound_dev_if; > if (sk_fullsock(sk)) { > - trace_tcp_send_reset(sk, skb); > if (inet6_test_bit(REPFLOW, sk)) > label = ip6_flowlabel(ipv6h); > priority = READ_ONCE(sk->sk_priority); > @@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) > label = ip6_flowlabel(ipv6h); > } > > + trace_tcp_send_reset(sk, skb); > + > tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, > ipv6_get_dsfield(ipv6h), label, priority, txhash, > &key); > -- > 2.37.3 >
On Fri, Mar 29, 2024 at 5:07 PM Eric Dumazet <edumazet@google.com> wrote: > > On Fri, Mar 29, 2024 at 4:43 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > From: Jason Xing <kernelxing@tencent.com> > > > > Prior to this patch, what we can see by enabling trace_tcp_send is > > only happening under two circumstances: > > 1) active rst mode > > 2) non-active rst mode and based on the full socket > > > > That means the inconsistency occurs if we use tcpdump and trace > > simultaneously to see how rst happens. > > > > It's necessary that we should take into other cases into considerations, > > say: > > 1) time-wait socket > > 2) no socket > > ... > > > > By parsing the incoming skb and reversing its 4-tuple can > > we know the exact 'flow' which might not exist. > > > > Samples after applied this patch: > > 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port > > state=TCP_ESTABLISHED > > 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port > > state=UNKNOWN > > Note: > > 1) UNKNOWN means we cannot extract the right information from skb. > > 2) skbaddr/skaddr could be 0 > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > --- > > include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++-- > > net/ipv4/tcp_ipv4.c | 4 ++-- > > net/ipv6/tcp_ipv6.c | 3 ++- > > 3 files changed, 41 insertions(+), 5 deletions(-) > > > > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h > > index 194425f69642..289438c54227 100644 > > --- a/include/trace/events/tcp.h > > +++ b/include/trace/events/tcp.h > > @@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, > > * skb of trace_tcp_send_reset is the skb that caused RST. In case of > > * active reset, skb should be NULL > > */ > > -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, > > +TRACE_EVENT(tcp_send_reset, > > > > TP_PROTO(const struct sock *sk, const struct sk_buff *skb), > > > > - TP_ARGS(sk, skb) > > + TP_ARGS(sk, skb), > > + > > + TP_STRUCT__entry( > > + __field(const void *, skbaddr) > > + __field(const void *, skaddr) > > + __field(int, state) > > + __array(__u8, saddr, sizeof(struct sockaddr_in6)) > > + __array(__u8, daddr, sizeof(struct sockaddr_in6)) > > + ), > > + > > + TP_fast_assign( > > + __entry->skbaddr = skb; > > + __entry->skaddr = sk; > > + /* Zero means unknown state. */ > > + __entry->state = sk ? sk->sk_state : 0; > > + > > + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); > > + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); > > + > > + if (sk && sk_fullsock(sk)) { > > + const struct inet_sock *inet = inet_sk(sk); > > + > > + TP_STORE_ADDR_PORTS(__entry, inet, sk); > > + } else { > > To be on the safe side, I would test if (skb) here. > We have one caller with skb == NULL, we might have more in the future. Thanks for the review. How about changing '} else {' to '} else if (skb) {', then if we go into this else-if branch, we will print nothing, right? I'll test it in this case. > > > + /* > > + * We should reverse the 4-tuple of skb, so later > > + * it can print the right flow direction of rst. > > + */ > > + TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr); > > + } > > + ), > > + > > + TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s", > > + __entry->skbaddr, __entry->skaddr, > > + __entry->saddr, __entry->daddr, > > + __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN") > > ); > > > > /* > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > > index a22ee5838751..d5c4a969c066 100644 > > --- a/net/ipv4/tcp_ipv4.c > > +++ b/net/ipv4/tcp_ipv4.c > > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > > */ > > if (sk) { > > arg.bound_dev_if = sk->sk_bound_dev_if; > > - if (sk_fullsock(sk)) > > - trace_tcp_send_reset(sk, skb); > > } > > Remove the { } ? Yes, I forgot to remove them. Thanks, Jason > > > > > > + trace_tcp_send_reset(sk, skb); > > + > > BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != > > offsetof(struct inet_timewait_sock, tw_bound_dev_if)); > > > > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > > index 3f4cba49e9ee..8e9c59b6c00c 100644 > > --- a/net/ipv6/tcp_ipv6.c > > +++ b/net/ipv6/tcp_ipv6.c > > @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) > > if (sk) { > > oif = sk->sk_bound_dev_if; > > if (sk_fullsock(sk)) { > > - trace_tcp_send_reset(sk, skb); > > if (inet6_test_bit(REPFLOW, sk)) > > label = ip6_flowlabel(ipv6h); > > priority = READ_ONCE(sk->sk_priority); > > @@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) > > label = ip6_flowlabel(ipv6h); > > } > > > > + trace_tcp_send_reset(sk, skb); > > + > > tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, > > ipv6_get_dsfield(ipv6h), label, priority, txhash, > > &key); > > -- > > 2.37.3 > >
On Fri, Mar 29, 2024 at 11:23 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > On Fri, Mar 29, 2024 at 5:07 PM Eric Dumazet <edumazet@google.com> wrote: > > > > On Fri, Mar 29, 2024 at 4:43 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > > > From: Jason Xing <kernelxing@tencent.com> > > > > > > Prior to this patch, what we can see by enabling trace_tcp_send is > > > only happening under two circumstances: > > > 1) active rst mode > > > 2) non-active rst mode and based on the full socket > > > > > > That means the inconsistency occurs if we use tcpdump and trace > > > simultaneously to see how rst happens. > > > > > > It's necessary that we should take into other cases into considerations, > > > say: > > > 1) time-wait socket > > > 2) no socket > > > ... > > > > > > By parsing the incoming skb and reversing its 4-tuple can > > > we know the exact 'flow' which might not exist. > > > > > > Samples after applied this patch: > > > 1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port > > > state=TCP_ESTABLISHED > > > 2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port > > > state=UNKNOWN > > > Note: > > > 1) UNKNOWN means we cannot extract the right information from skb. > > > 2) skbaddr/skaddr could be 0 > > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > > --- > > > include/trace/events/tcp.h | 39 ++++++++++++++++++++++++++++++++++++-- > > > net/ipv4/tcp_ipv4.c | 4 ++-- > > > net/ipv6/tcp_ipv6.c | 3 ++- > > > 3 files changed, 41 insertions(+), 5 deletions(-) > > > > > > diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h > > > index 194425f69642..289438c54227 100644 > > > --- a/include/trace/events/tcp.h > > > +++ b/include/trace/events/tcp.h > > > @@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, > > > * skb of trace_tcp_send_reset is the skb that caused RST. In case of > > > * active reset, skb should be NULL > > > */ > > > -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, > > > +TRACE_EVENT(tcp_send_reset, > > > > > > TP_PROTO(const struct sock *sk, const struct sk_buff *skb), > > > > > > - TP_ARGS(sk, skb) > > > + TP_ARGS(sk, skb), > > > + > > > + TP_STRUCT__entry( > > > + __field(const void *, skbaddr) > > > + __field(const void *, skaddr) > > > + __field(int, state) > > > + __array(__u8, saddr, sizeof(struct sockaddr_in6)) > > > + __array(__u8, daddr, sizeof(struct sockaddr_in6)) > > > + ), > > > + > > > + TP_fast_assign( > > > + __entry->skbaddr = skb; > > > + __entry->skaddr = sk; > > > + /* Zero means unknown state. */ > > > + __entry->state = sk ? sk->sk_state : 0; > > > + > > > + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); > > > + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); > > > + > > > + if (sk && sk_fullsock(sk)) { > > > + const struct inet_sock *inet = inet_sk(sk); > > > + > > > + TP_STORE_ADDR_PORTS(__entry, inet, sk); > > > + } else { > > > > To be on the safe side, I would test if (skb) here. > > We have one caller with skb == NULL, we might have more in the future. > > Thanks for the review. > > How about changing '} else {' to '} else if (skb) {', then if we go > into this else-if branch, we will print nothing, right? I'll test it > in this case. Right, the fields are cleared before this else + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); > > > > > > + /* > > > + * We should reverse the 4-tuple of skb, so later > > > + * it can print the right flow direction of rst. > > > + */ > > > + TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr); > > > + } > > > + ), > > > + > > > + TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s", > > > + __entry->skbaddr, __entry->skaddr, > > > + __entry->saddr, __entry->daddr, > > > + __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN") > > > ); > > > > > > /* > > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > > > index a22ee5838751..d5c4a969c066 100644 > > > --- a/net/ipv4/tcp_ipv4.c > > > +++ b/net/ipv4/tcp_ipv4.c > > > @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > > > */ > > > if (sk) { > > > arg.bound_dev_if = sk->sk_bound_dev_if; > > > - if (sk_fullsock(sk)) > > > - trace_tcp_send_reset(sk, skb); > > > } > > > > Remove the { } ? > > Yes, I forgot to remove them. No problem.
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 194425f69642..289438c54227 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -78,11 +78,46 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ -DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, +TRACE_EVENT(tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), - TP_ARGS(sk, skb) + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + __field(const void *, skbaddr) + __field(const void *, skaddr) + __field(int, state) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->skaddr = sk; + /* Zero means unknown state. */ + __entry->state = sk ? sk->sk_state : 0; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + if (sk && sk_fullsock(sk)) { + const struct inet_sock *inet = inet_sk(sk); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + } else { + /* + * We should reverse the 4-tuple of skb, so later + * it can print the right flow direction of rst. + */ + TP_STORE_ADDR_PORTS_SKB(skb, entry->daddr, entry->saddr); + } + ), + + TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s", + __entry->skbaddr, __entry->skaddr, + __entry->saddr, __entry->daddr, + __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN") ); /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a22ee5838751..d5c4a969c066 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -868,10 +868,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) */ if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; - if (sk_fullsock(sk)) - trace_tcp_send_reset(sk, skb); } + trace_tcp_send_reset(sk, skb); + BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3f4cba49e9ee..8e9c59b6c00c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - trace_tcp_send_reset(sk, skb); if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); @@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } + trace_tcp_send_reset(sk, skb); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key);