diff mbox series

[RFC,v3,net-next,1/7] net: add rx_sk to trace_kfree_skb

Message ID 983c54f98746bd42d778b99840435d0a93963cb3.1717529533.git.yan@cloudflare.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net: pass receive socket to drop tracepoint | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 914 this patch: 914
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 905 this patch: 905
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 918 this patch: 918
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 133 this patch: 133
netdev/source_inline success Was 0 now: 0

Commit Message

Yan Zhai June 4, 2024, 9:47 p.m. UTC
skb does not include enough information to find out receiving
sockets/services and netns/containers on packet drops. In theory
skb->dev tells about netns, but it can get cleared/reused, e.g. by TCP
stack for OOO packet lookup. Similarly, skb->sk often identifies a local
sender, and tells nothing about a receiver.

Allow passing an extra receiving socket to the tracepoint to improve
the visibility on receiving drops.

Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
v2->v3: fixed drop_monitor function prototype
---
 include/trace/events/skb.h | 11 +++++++----
 net/core/dev.c             |  2 +-
 net/core/drop_monitor.c    |  9 ++++++---
 net/core/skbuff.c          |  2 +-
 4 files changed, 15 insertions(+), 9 deletions(-)

Comments

Steven Rostedt June 5, 2024, 11:57 p.m. UTC | #1
On Tue, 4 Jun 2024 14:47:38 -0700
Yan Zhai <yan@cloudflare.com> wrote:

> skb does not include enough information to find out receiving
> sockets/services and netns/containers on packet drops. In theory
> skb->dev tells about netns, but it can get cleared/reused, e.g. by TCP
> stack for OOO packet lookup. Similarly, skb->sk often identifies a local
> sender, and tells nothing about a receiver.
> 
> Allow passing an extra receiving socket to the tracepoint to improve
> the visibility on receiving drops.
> 
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
> v2->v3: fixed drop_monitor function prototype
> ---
>  include/trace/events/skb.h | 11 +++++++----
>  net/core/dev.c             |  2 +-
>  net/core/drop_monitor.c    |  9 ++++++---
>  net/core/skbuff.c          |  2 +-
>  4 files changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
> index 07e0715628ec..aa6b46b6172c 100644
> --- a/include/trace/events/skb.h
> +++ b/include/trace/events/skb.h
> @@ -24,15 +24,16 @@ DEFINE_DROP_REASON(FN, FN)
>  TRACE_EVENT(kfree_skb,
>  
>  	TP_PROTO(struct sk_buff *skb, void *location,
> -		 enum skb_drop_reason reason),
> +		 enum skb_drop_reason reason, struct sock *rx_sk),
>  
> -	TP_ARGS(skb, location, reason),
> +	TP_ARGS(skb, location, reason, rx_sk),
>  
>  	TP_STRUCT__entry(
>  		__field(void *,		skbaddr)
>  		__field(void *,		location)
>  		__field(unsigned short,	protocol)
>  		__field(enum skb_drop_reason,	reason)
> +		__field(void *,		rx_skaddr)

Please add the pointer after the other pointers:

 		__field(void *,		skbaddr)
 		__field(void *,		location)
+		__field(void *,		rx_skaddr)
 		__field(unsigned short,	protocol)
 		__field(enum skb_drop_reason,	reason)

otherwise you are adding holes in the ring buffer event.

The TP_STRUCT__entry() is a structure that is saved in the ring buffer. We
want to avoid alignment holes. I also question having a short before the
enum, if the emum is 4 bytes. The short should be at the end.

In fact, looking at the format file, there is a 2 byte hole:

 # cat /sys/kernel/tracing/events/skb/kfree_skb/format

name: kfree_skb
ID: 1799
format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;

	field:void * skbaddr;	offset:8;	size:8;	signed:0;
	field:void * location;	offset:16;	size:8;	signed:0;
	field:unsigned short protocol;	offset:24;	size:2;	signed:0;
	field:enum skb_drop_reason reason;	offset:28;	size:4;	signed:0;

Notice that "protocol" is 2 bytes in size at offset 24, but "reason" starts
at offset 28. This means at offset 26, there's a 2 byte hole.

-- Steve



>  	),
>  
>  	TP_fast_assign(
> @@ -40,12 +41,14 @@ TRACE_EVENT(kfree_skb,
>  		__entry->location = location;
>  		__entry->protocol = ntohs(skb->protocol);
>  		__entry->reason = reason;
> +		__entry->rx_skaddr = rx_sk;
>  	),
>
Yan Zhai June 6, 2024, 3:37 p.m. UTC | #2
On Wed, Jun 5, 2024 at 6:57 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 4 Jun 2024 14:47:38 -0700
> Yan Zhai <yan@cloudflare.com> wrote:
>
> > skb does not include enough information to find out receiving
> > sockets/services and netns/containers on packet drops. In theory
> > skb->dev tells about netns, but it can get cleared/reused, e.g. by TCP
> > stack for OOO packet lookup. Similarly, skb->sk often identifies a local
> > sender, and tells nothing about a receiver.
> >
> > Allow passing an extra receiving socket to the tracepoint to improve
> > the visibility on receiving drops.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> > v2->v3: fixed drop_monitor function prototype
> > ---
> >  include/trace/events/skb.h | 11 +++++++----
> >  net/core/dev.c             |  2 +-
> >  net/core/drop_monitor.c    |  9 ++++++---
> >  net/core/skbuff.c          |  2 +-
> >  4 files changed, 15 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
> > index 07e0715628ec..aa6b46b6172c 100644
> > --- a/include/trace/events/skb.h
> > +++ b/include/trace/events/skb.h
> > @@ -24,15 +24,16 @@ DEFINE_DROP_REASON(FN, FN)
> >  TRACE_EVENT(kfree_skb,
> >
> >       TP_PROTO(struct sk_buff *skb, void *location,
> > -              enum skb_drop_reason reason),
> > +              enum skb_drop_reason reason, struct sock *rx_sk),
> >
> > -     TP_ARGS(skb, location, reason),
> > +     TP_ARGS(skb, location, reason, rx_sk),
> >
> >       TP_STRUCT__entry(
> >               __field(void *,         skbaddr)
> >               __field(void *,         location)
> >               __field(unsigned short, protocol)
> >               __field(enum skb_drop_reason,   reason)
> > +             __field(void *,         rx_skaddr)
>
> Please add the pointer after the other pointers:
>
>                 __field(void *,         skbaddr)
>                 __field(void *,         location)
> +               __field(void *,         rx_skaddr)
>                 __field(unsigned short, protocol)
>                 __field(enum skb_drop_reason,   reason)
>
> otherwise you are adding holes in the ring buffer event.
>
> The TP_STRUCT__entry() is a structure that is saved in the ring buffer. We
> want to avoid alignment holes. I also question having a short before the
> enum, if the emum is 4 bytes. The short should be at the end.
>
> In fact, looking at the format file, there is a 2 byte hole:
>
>  # cat /sys/kernel/tracing/events/skb/kfree_skb/format
>
> name: kfree_skb
> ID: 1799
> format:
>         field:unsigned short common_type;       offset:0;       size:2; signed:0;
>         field:unsigned char common_flags;       offset:2;       size:1; signed:0;
>         field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
>         field:int common_pid;   offset:4;       size:4; signed:1;
>
>         field:void * skbaddr;   offset:8;       size:8; signed:0;
>         field:void * location;  offset:16;      size:8; signed:0;
>         field:unsigned short protocol;  offset:24;      size:2; signed:0;
>         field:enum skb_drop_reason reason;      offset:28;      size:4; signed:0;
>
> Notice that "protocol" is 2 bytes in size at offset 24, but "reason" starts
> at offset 28. This means at offset 26, there's a 2 byte hole.
>
The reason I added the pointer as the last argument is trying to
minimize the surprise to existing TP users, because for common ABIs
it's fine to omit later arguments when defining a function, but it
needs change and recompilation if the order of arguments changed.

Looking at the actual format after the change, it does not add a new
hole since protocol and reason are already packed into the same 8-byte
block, so rx_skaddr starts at 8-byte aligned offset:

# cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
name: kfree_skb
ID: 2260
format:
        field:unsigned short common_type;       offset:0;
size:2; signed:0;
        field:unsigned char common_flags;       offset:2;
size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;
 size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:void * skbaddr;   offset:8;       size:8; signed:0;
        field:void * location;  offset:16;      size:8; signed:0;
        field:unsigned short protocol;  offset:24;      size:2; signed:0;
        field:enum skb_drop_reason reason;      offset:28;
size:4; signed:0;
        field:void * rx_skaddr; offset:32;      size:8; signed:0;

Do you think we still need to change the order?

Yan


> -- Steve
>
>
>
> >       ),
> >
> >       TP_fast_assign(
> > @@ -40,12 +41,14 @@ TRACE_EVENT(kfree_skb,
> >               __entry->location = location;
> >               __entry->protocol = ntohs(skb->protocol);
> >               __entry->reason = reason;
> > +             __entry->rx_skaddr = rx_sk;
> >       ),
> >
>
David Ahern June 7, 2024, 2:29 p.m. UTC | #3
On 6/6/24 9:37 AM, Yan Zhai wrote:
> # cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
> name: kfree_skb
> ID: 2260
> format:
>         field:unsigned short common_type;       offset:0;
> size:2; signed:0;
>         field:unsigned char common_flags;       offset:2;
> size:1; signed:0;
>         field:unsigned char common_preempt_count;       offset:3;
>  size:1; signed:0;
>         field:int common_pid;   offset:4;       size:4; signed:1;
> 
>         field:void * skbaddr;   offset:8;       size:8; signed:0;
>         field:void * location;  offset:16;      size:8; signed:0;
>         field:unsigned short protocol;  offset:24;      size:2; signed:0;
>         field:enum skb_drop_reason reason;      offset:28;
> size:4; signed:0;
>         field:void * rx_skaddr; offset:32;      size:8; signed:0;
> 
> Do you think we still need to change the order?
> 

yes. Keeping proper order now ensures no holes creep in with later changes.
Steven Rostedt June 10, 2024, 4:54 p.m. UTC | #4
On Thu, 6 Jun 2024 10:37:46 -0500
Yan Zhai <yan@cloudflare.com> wrote:

> > name: kfree_skb
> > ID: 1799
> > format:
> >         field:unsigned short common_type;       offset:0;       size:2; signed:0;
> >         field:unsigned char common_flags;       offset:2;       size:1; signed:0;
> >         field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
> >         field:int common_pid;   offset:4;       size:4; signed:1;
> >
> >         field:void * skbaddr;   offset:8;       size:8; signed:0;
> >         field:void * location;  offset:16;      size:8; signed:0;
> >         field:unsigned short protocol;  offset:24;      size:2; signed:0;
> >         field:enum skb_drop_reason reason;      offset:28;      size:4; signed:0;
> >
> > Notice that "protocol" is 2 bytes in size at offset 24, but "reason" starts
> > at offset 28. This means at offset 26, there's a 2 byte hole.
> >  
> The reason I added the pointer as the last argument is trying to
> minimize the surprise to existing TP users, because for common ABIs
> it's fine to omit later arguments when defining a function, but it
> needs change and recompilation if the order of arguments changed.

Nothing should be hard coding the offsets of the fields. This is
exported to user space so that tools can see where the fields are.
That's the purpose of libtraceevent. The fields should be movable and
not affect anything. There should be no need to recompile.

> 
> Looking at the actual format after the change, it does not add a new
> hole since protocol and reason are already packed into the same 8-byte
> block, so rx_skaddr starts at 8-byte aligned offset:
> 
> # cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
> name: kfree_skb
> ID: 2260
> format:
>         field:unsigned short common_type;       offset:0;
> size:2; signed:0;
>         field:unsigned char common_flags;       offset:2;
> size:1; signed:0;
>         field:unsigned char common_preempt_count;       offset:3;
>  size:1; signed:0;
>         field:int common_pid;   offset:4;       size:4; signed:1;
> 
>         field:void * skbaddr;   offset:8;       size:8; signed:0;
>         field:void * location;  offset:16;      size:8; signed:0;
>         field:unsigned short protocol;  offset:24;      size:2; signed:0;
>         field:enum skb_drop_reason reason;      offset:28;
> size:4; signed:0;
>         field:void * rx_skaddr; offset:32;      size:8; signed:0;
> 
> Do you think we still need to change the order?

Up to you, just wanted to point it out.

-- Steve
Yan Zhai June 10, 2024, 8:25 p.m. UTC | #5
On Mon, Jun 10, 2024 at 11:54 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 6 Jun 2024 10:37:46 -0500
> Yan Zhai <yan@cloudflare.com> wrote:
>
> > > name: kfree_skb
> > > ID: 1799
> > > format:
> > >         field:unsigned short common_type;       offset:0;       size:2; signed:0;
> > >         field:unsigned char common_flags;       offset:2;       size:1; signed:0;
> > >         field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
> > >         field:int common_pid;   offset:4;       size:4; signed:1;
> > >
> > >         field:void * skbaddr;   offset:8;       size:8; signed:0;
> > >         field:void * location;  offset:16;      size:8; signed:0;
> > >         field:unsigned short protocol;  offset:24;      size:2; signed:0;
> > >         field:enum skb_drop_reason reason;      offset:28;      size:4; signed:0;
> > >
> > > Notice that "protocol" is 2 bytes in size at offset 24, but "reason" starts
> > > at offset 28. This means at offset 26, there's a 2 byte hole.
> > >
> > The reason I added the pointer as the last argument is trying to
> > minimize the surprise to existing TP users, because for common ABIs
> > it's fine to omit later arguments when defining a function, but it
> > needs change and recompilation if the order of arguments changed.
>
> Nothing should be hard coding the offsets of the fields. This is
> exported to user space so that tools can see where the fields are.
> That's the purpose of libtraceevent. The fields should be movable and
> not affect anything. There should be no need to recompile.
>
Oh I misunderstood previously. I was also thinking about the argument
order in TP_PROTO, but what you mentioned is just the order in
TP_STRUCT__entry, correct? I'd prefer to not change the argument order
but the struct field order can definitely be aligned better here.

Yan

> >
> > Looking at the actual format after the change, it does not add a new
> > hole since protocol and reason are already packed into the same 8-byte
> > block, so rx_skaddr starts at 8-byte aligned offset:
> >
> > # cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
> > name: kfree_skb
> > ID: 2260
> > format:
> >         field:unsigned short common_type;       offset:0;
> > size:2; signed:0;
> >         field:unsigned char common_flags;       offset:2;
> > size:1; signed:0;
> >         field:unsigned char common_preempt_count;       offset:3;
> >  size:1; signed:0;
> >         field:int common_pid;   offset:4;       size:4; signed:1;
> >
> >         field:void * skbaddr;   offset:8;       size:8; signed:0;
> >         field:void * location;  offset:16;      size:8; signed:0;
> >         field:unsigned short protocol;  offset:24;      size:2; signed:0;
> >         field:enum skb_drop_reason reason;      offset:28;
> > size:4; signed:0;
> >         field:void * rx_skaddr; offset:32;      size:8; signed:0;
> >
> > Do you think we still need to change the order?
>
> Up to you, just wanted to point it out.
>
> -- Steve
>
diff mbox series

Patch

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 07e0715628ec..aa6b46b6172c 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -24,15 +24,16 @@  DEFINE_DROP_REASON(FN, FN)
 TRACE_EVENT(kfree_skb,
 
 	TP_PROTO(struct sk_buff *skb, void *location,
-		 enum skb_drop_reason reason),
+		 enum skb_drop_reason reason, struct sock *rx_sk),
 
-	TP_ARGS(skb, location, reason),
+	TP_ARGS(skb, location, reason, rx_sk),
 
 	TP_STRUCT__entry(
 		__field(void *,		skbaddr)
 		__field(void *,		location)
 		__field(unsigned short,	protocol)
 		__field(enum skb_drop_reason,	reason)
+		__field(void *,		rx_skaddr)
 	),
 
 	TP_fast_assign(
@@ -40,12 +41,14 @@  TRACE_EVENT(kfree_skb,
 		__entry->location = location;
 		__entry->protocol = ntohs(skb->protocol);
 		__entry->reason = reason;
+		__entry->rx_skaddr = rx_sk;
 	),
 
-	TP_printk("skbaddr=%p protocol=%u location=%pS reason: %s",
+	TP_printk("skbaddr=%p protocol=%u location=%pS reason: %s rx_skaddr=%p",
 		  __entry->skbaddr, __entry->protocol, __entry->location,
 		  __print_symbolic(__entry->reason,
-				   DEFINE_DROP_REASON(FN, FNe)))
+				   DEFINE_DROP_REASON(FN, FNe)),
+		  __entry->rx_skaddr)
 );
 
 #undef FN
diff --git a/net/core/dev.c b/net/core/dev.c
index 85fe8138f3e4..7844227ecbfd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5233,7 +5233,7 @@  static __latent_entropy void net_tx_action(struct softirq_action *h)
 				trace_consume_skb(skb, net_tx_action);
 			else
 				trace_kfree_skb(skb, net_tx_action,
-						get_kfree_skb_cb(skb)->reason);
+						get_kfree_skb_cb(skb)->reason, NULL);
 
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 430ed18f8584..2e0ae3328232 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -109,7 +109,8 @@  static u32 net_dm_queue_len = 1000;
 struct net_dm_alert_ops {
 	void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
 				void *location,
-				enum skb_drop_reason reason);
+				enum skb_drop_reason reason,
+				struct sock *rx_sk);
 	void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
 				int work, int budget);
 	void (*work_item_func)(struct work_struct *work);
@@ -264,7 +265,8 @@  static void trace_drop_common(struct sk_buff *skb, void *location)
 
 static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
 				void *location,
-				enum skb_drop_reason reason)
+				enum skb_drop_reason reason,
+				struct sock *rx_sk)
 {
 	trace_drop_common(skb, location);
 }
@@ -491,7 +493,8 @@  static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
 static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
 					      struct sk_buff *skb,
 					      void *location,
-					      enum skb_drop_reason reason)
+					      enum skb_drop_reason reason,
+					      struct sock *rx_sk)
 {
 	ktime_t tstamp = ktime_get_real();
 	struct per_cpu_dm_data *data;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 466999a7515e..2854afdd713f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1203,7 +1203,7 @@  bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 	if (reason == SKB_CONSUMED)
 		trace_consume_skb(skb, __builtin_return_address(0));
 	else
-		trace_kfree_skb(skb, __builtin_return_address(0), reason);
+		trace_kfree_skb(skb, __builtin_return_address(0), reason, NULL);
 	return true;
 }