Message ID | 20230108025545.338-1-cuiyunhui@bytedance.com
---|---
State | Superseded
Series | [v4] sock: add tracepoint for send recv length
On Sun, Jan 8, 2023 at 3:56 AM Yunhui Cui <cuiyunhui@bytedance.com> wrote: > > Add 2 tracepoints to monitor the tcp/udp traffic > of per process and per cgroup. > > Regarding monitoring the tcp/udp traffic of each process, there are two > existing solutions, the first one is https://www.atoptool.nl/netatop.php. > The second is via kprobe/kretprobe. > > Netatop solution is implemented by registering the hook function at the > hook point provided by the netfilter framework. > > These hook functions may be in the soft interrupt context and cannot > directly obtain the pid. Some data structures are added to bind packets > and processes. For example, struct taskinfobucket, struct taskinfo ... > > Every time the process sends and receives packets it needs multiple > hashmaps,resulting in low performance and it has the problem fo inaccurate > tcp/udp traffic statistics(for example: multiple threads share sockets). > > We can obtain the information with kretprobe, but as we know, kprobe gets > the result by trappig in an exception, which loses performance compared > to tracepoint. > > We compared the performance of tracepoints with the above two methods, and > the results are as follows: > > ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html > without trace: > Time per request: 39.660 [ms] (mean) > Time per request: 0.040 [ms] (mean, across all concurrent requests) > > netatop: > Time per request: 50.717 [ms] (mean) > Time per request: 0.051 [ms] (mean, across all concurrent requests) > > kr: > Time per request: 43.168 [ms] (mean) > Time per request: 0.043 [ms] (mean, across all concurrent requests) > > tracepoint: > Time per request: 41.004 [ms] (mean) > Time per request: 0.041 [ms] (mean, across all concurrent requests > > It can be seen that tracepoint has better performance. > > Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com> > Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com> > --- > include/trace/events/sock.h | 48 +++++++++++++++++++++++++++++++++++++ > net/socket.c | 23 ++++++++++++++---- > 2 files changed, 67 insertions(+), 4 deletions(-) > > diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h > index 777ee6cbe933..d00a5b272404 100644 > --- a/include/trace/events/sock.h > +++ b/include/trace/events/sock.h > @@ -263,6 +263,54 @@ TRACE_EVENT(inet_sk_error_report, > __entry->error) > ); > > +/* > + * sock send/recv msg length > + */ > +DECLARE_EVENT_CLASS(sock_msg_length, > + > + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, > + int flags), > + > + TP_ARGS(sk, family, protocol, ret, flags), > + > + TP_STRUCT__entry( > + __field(void *, sk) > + __field(__u16, family) > + __field(__u16, protocol) > + __field(int, length) > + __field(int, error) > + __field(int, flags) > + ), > + > + TP_fast_assign( > + __entry->sk = sk; > + __entry->family = sk->sk_family; > + __entry->protocol = sk->sk_protocol; > + __entry->length = ret > 0 ? ret : 0; > + __entry->error = ret < 0 ? 
ret : 0; > + __entry->flags = flags; > + ), > + > + TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", > + __entry->sk, show_family_name(__entry->family), > + show_inet_protocol_name(__entry->protocol), > + __entry->length, > + __entry->error, __entry->flags) > +); > + > +DEFINE_EVENT(sock_msg_length, sock_send_length, > + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, > + int flags), > + > + TP_ARGS(sk, family, protocol, ret, flags) > +); > + > +DEFINE_EVENT(sock_msg_length, sock_recv_length, > + TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, > + int flags), > + > + TP_ARGS(sk, family, protocol, ret, flags) > +); > #endif /* _TRACE_SOCK_H */ > > /* This part must be outside protection */ > diff --git a/net/socket.c b/net/socket.c > index 888cd618a968..60a1ff95b4b1 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -106,6 +106,7 @@ > #include <net/busy_poll.h> > #include <linux/errqueue.h> > #include <linux/ptp_clock_kernel.h> > +#include <trace/events/sock.h> > > #ifdef CONFIG_NET_RX_BUSY_POLL > unsigned int sysctl_net_busy_read __read_mostly; > @@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) > inet_sendmsg, sock, msg, > msg_data_left(msg)); > BUG_ON(ret == -EIOCBQUEUED); > + > + trace_sock_send_length(sock->sk, sock->sk->sk_family, > + sock->sk->sk_protocol, ret, 0); Note: At least for CONFIG_RETPOLINE=y and gcc 12.2, compiler adds many additional instructions (and additional memory reads), even when the trace point is not enabled. Contrary to some belief, adding a tracepoint is not always 'free'. tail calls for example are replaced with normal calls. sock_recvmsg_nosec: pushq %r12 # movl %edx, %r12d # tmp123, flags pushq %rbp # # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movl %r12d, %ecx # flags, # net/socket.c:998: { movq %rdi, %rbp # tmp121, sock pushq %rbx # # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movq 32(%rdi), %rax # sock_19(D)->ops, sock_19(D)->ops # ./include/linux/uio.h:270: return i->count; movq 32(%rsi), %rdx # MEM[(const struct iov_iter *)msg_20(D) + 16B].count, pretmp_48 # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movq 144(%rax), %rax # _1->recvmsg, _2 cmpq $inet6_recvmsg, %rax #, _2 jne .L107 #, call inet6_recvmsg # movl %eax, %ebx # tmp124, <retval> .L108: # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, xorl %r8d, %r8d # tmp127 testl %ebx, %ebx # <retval> # net/socket.c:1004: sock->sk->sk_protocol, movq 24(%rbp), %rsi # sock_19(D)->sk, _10 # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, cmovle %ebx, %r8d # <retval>,, tmp119 testb $2, %r12b #, flags # net/socket.c:1004: sock->sk->sk_protocol, movzwl 516(%rsi), %ecx # _10->sk_protocol, # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, movzwl 16(%rsi), %edx # _10->__sk_common.skc_family, # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, cmove %ebx, %r8d # tmp119,, <retval>, iftmp.54_16 # ./arch/x86/include/asm/jump_label.h:27: asm_volatile_goto("1:" #APP # 27 "./arch/x86/include/asm/jump_label.h" 1 1:jmp .L111 # objtool NOPs this # .pushsection __jump_table, "aw" .balign 8 .long 1b - . .long .L111 - . # .quad __tracepoint_sock_recv_length+8 + 2 - . 
#, .popsection # 0 "" 2 #NO_APP .L106: # net/socket.c:1008: } movl %ebx, %eax # <retval>, popq %rbx # popq %rbp # popq %r12 # ret .L111: # ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length, sock_recv_length, > return ret; > } > > @@ -992,9 +996,15 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, > static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, > int flags) > { > - return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, > - inet_recvmsg, sock, msg, msg_data_left(msg), > - flags); > + int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, > + inet_recvmsg, sock, msg, > + msg_data_left(msg), flags); > + > + trace_sock_recv_length(sock->sk, sock->sk->sk_family, > + sock->sk->sk_protocol, > + !(flags & MSG_PEEK) ? ret : > + (ret < 0 ? ret : 0), flags); > + return ret; > } > > /** > @@ -1044,6 +1054,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, > { > struct socket *sock; > int flags; > + int ret; > > sock = file->private_data; > > @@ -1051,7 +1062,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, > /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ > flags |= more; > > - return kernel_sendpage(sock, page, offset, size, flags); > + ret = kernel_sendpage(sock, page, offset, size, flags); > + > + trace_sock_send_length(sock->sk, sock->sk->sk_family, > + sock->sk->sk_protocol, ret, 0); > + return ret; > } > > static ssize_t sock_splice_read(struct file *file, loff_t *ppos, > -- > 2.20.1 >
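For anyone who wants to reproduce this kind of inspection, kbuild can emit the assembler listing for a single translation unit; roughly along these lines (a sketch only, the exact output depends on the local toolchain and .config, CONFIG_RETPOLINE=y and gcc 12.2 in Eric's case):

    # single-target build: emit the assembler listing for net/socket.c only
    make net/socket.s
    # then look at the call sites quoted above
    grep -n -A 40 'sock_recvmsg_nosec' net/socket.s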
On Mon, Jan 9, 2023 at 5:56 PM Eric Dumazet <edumazet@google.com> wrote:
>
> Note: At least for CONFIG_RETPOLINE=y and gcc 12.2, compiler adds many
> additional instructions (and additional memory reads),
> even when the trace point is not enabled.
>
> Contrary to some belief, adding a tracepoint is not always 'free'.
> tail calls for example are replaced with normal calls.
>
> .popsection
>
> # 0 "" 2
> #NO_APP
> .L106:
> # net/socket.c:1008: }
>         movl    %ebx, %eax      # <retval>,
>         popq    %rbx    #
>         popq    %rbp    #
>         popq    %r12    #
>         ret
> .L111:
> # ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length,
> sock_recv_length,
>

Hi Eric, thanks for your reply. In fact, this happens because the tracepoint
function is defined inline, so it is not limited to these two tracepoints,
right?

#define __DECLARE_TRACE(name, proto, args, cond, data_proto)	\
...
	static inline void trace_##name(proto)

Regarding the above issue, I plan to optimize it like this:

static noinline void call_trace_sock_send_length(struct sock *sk, __u16 family,
						 __u16 protocol, int ret, int flags)
{
	trace_sock_send_length(sk, family, protocol, ret, 0);
}

static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
	int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
				     inet_sendmsg, sock, msg,
				     msg_data_left(msg));
	BUG_ON(ret == -EIOCBQUEUED);

	if (trace_sock_send_length_enabled()) {
		call_trace_sock_send_length(sock->sk, sock->sk->sk_family,
					    sock->sk->sk_protocol, ret, 0);
	}
	return ret;
}

What do you think?

Thanks,
Yunhui
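Presumably the same guard would be applied symmetrically on the receive side; a sketch of that shape (not part of the posted patch, only an illustration of the proposal above, keeping the MSG_PEEK handling from v4):

static noinline void call_trace_sock_recv_length(struct sock *sk, __u16 family,
						 __u16 protocol, int ret, int flags)
{
	trace_sock_recv_length(sk, family, protocol, ret, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
				     int flags)
{
	int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
				     inet_recvmsg, sock, msg,
				     msg_data_left(msg), flags);

	/* the noinline helper plus the _enabled() check is meant to keep the
	 * argument evaluation out of the disabled path (see the barrier()
	 * discussion that follows) */
	if (trace_sock_recv_length_enabled())
		call_trace_sock_recv_length(sock->sk, sock->sk->sk_family,
					    sock->sk->sk_protocol,
					    !(flags & MSG_PEEK) ? ret :
					    (ret < 0 ? ret : 0), flags);
	return ret;
}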
On Mon, Jan 9, 2023 at 2:13 PM 运辉崔 <cuiyunhui@bytedance.com> wrote: > > On Mon, Jan 9, 2023 at 5:56 PM Eric Dumazet <edumazet@google.com> wrote: > > > > > Note: At least for CONFIG_RETPOLINE=y and gcc 12.2, compiler adds many > > additional instructions (and additional memory reads), > > even when the trace point is not enabled. > > > > Contrary to some belief, adding a tracepoint is not always 'free'. > > tail calls for example are replaced with normal calls. > > > > > > .popsection > > > > # 0 "" 2 > > #NO_APP > > .L106: > > # net/socket.c:1008: } > > movl %ebx, %eax # <retval>, > > popq %rbx # > > popq %rbp # > > popq %r12 # > > ret > > .L111: > > # ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length, > > sock_recv_length, > > > > Hi Eric, Thanks for your reply, In fact, it is because the > definition of the tracepoint function is inline, > Not just these two tracepoints,right? > > #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ > ... > static inline void trace_##name(proto) > > Regarding the above issue, I plan to optimize it like this: > > static noinline void call_trace_sock_send_length(struct sock *sk, __u16 family, > __u16 protocol, int ret, int flags) > { > trace_sock_send_length(sk, family, protocol, ret, 0); > } > > static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) > { > int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, > inet_sendmsg, sock, msg, > msg_data_left(msg)); > BUG_ON(ret == -EIOCBQUEUED); > > if (trace_sock_send_length_enabled()) { A barrier() is needed here, with the current state of affairs. IMO, ftrace/x86 experts should take care of this generic issue ? > call_trace_sock_send_length(sock->sk, sock->sk->sk_family, > sock->sk->sk_protocol, ret, 0); > } > return ret; > } > > What do you think? > > Thanks, > Yunhui
On Mon, 9 Jan 2023 15:54:38 +0100
Eric Dumazet <edumazet@google.com> wrote:

> > static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
> > {
> >         int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
> >                                      inet_sendmsg, sock, msg,
> >                                      msg_data_left(msg));
> >         BUG_ON(ret == -EIOCBQUEUED);
> >
> >         if (trace_sock_send_length_enabled()) {
>
> A barrier() is needed here, with the current state of affairs.
>
> IMO, ftrace/x86 experts should take care of this generic issue ?

trace_*_enabled() is a static_branch() (aka. jump label).

It's a nop, where the if block is in the out-of-line code and skipped. When
the tracepoint is enabled, it gets turned into a jump to the if block
(which returns back to this location).

That is, when the tracepoint in the block gets enabled so does the above
branch. Sure, there could be a race between the two being enabled, but I
don't see any issue if there is. But the process to modify the jump labels
does a bunch of synchronization between the CPUs.

What barrier are you expecting?

-- Steve

>
> >
> >                 call_trace_sock_send_length(sock->sk, sock->sk->sk_family,
> >                                             sock->sk->sk_protocol, ret, 0);
> >         }
> >         return ret;
> > }
> >
> > What do you think?
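For readers unfamiliar with jump labels: the mechanism Steve describes is the generic static-key facility. A minimal kernel-style example of the same pattern (DEFINE_STATIC_KEY_FALSE, static_branch_unlikely and static_branch_enable are the real <linux/jump_label.h> APIs; do_slow_accounting() is a made-up placeholder):

#include <linux/jump_label.h>

/* placeholder for work that only makes sense when the feature is on */
static void do_slow_accounting(void) { }

/* key starts false: the branch below is compiled as a NOP at the call site */
static DEFINE_STATIC_KEY_FALSE(my_feature_key);

static void hot_path(void)
{
	/* out-of-line block, skipped until the key is flipped */
	if (static_branch_unlikely(&my_feature_key))
		do_slow_accounting();
}

static void enable_feature(void)
{
	/* rewrites the NOP into a jump at runtime, synchronizing all CPUs */
	static_branch_enable(&my_feature_key);
}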
On Mon, Jan 9, 2023 at 4:08 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Mon, 9 Jan 2023 15:54:38 +0100 > Eric Dumazet <edumazet@google.com> wrote: > > > > static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) > > > { > > > int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, > > > inet_sendmsg, sock, msg, > > > msg_data_left(msg)); > > > BUG_ON(ret == -EIOCBQUEUED); > > > > > > if (trace_sock_send_length_enabled()) { > > > > A barrier() is needed here, with the current state of affairs. > > > > IMO, ftrace/x86 experts should take care of this generic issue ? > > trace_*_enabled() is a static_branch() (aka. jump label). > > It's a nop, where the if block is in the out-of-line code and skipped. When > the tracepoint is enabled, it gets turned into a jump to the if block > (which returns back to this location). > This is not a nop, as shown in the generated assembly, I copied in this thread earlier. Compiler does all sorts of things before the point the static branch is looked at. Let's add the extract again with <<*>> tags on added instructions/dereferences. sock_recvmsg_nosec: pushq %r12 # movl %edx, %r12d # tmp123, flags pushq %rbp # # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movl %r12d, %ecx # flags, # net/socket.c:998: { movq %rdi, %rbp # tmp121, sock pushq %rbx # # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movq 32(%rdi), %rax # sock_19(D)->ops, sock_19(D)->ops # ./include/linux/uio.h:270: return i->count; movq 32(%rsi), %rdx # MEM[(const struct iov_iter *)msg_20(D) + 16B].count, pretmp_48 # net/socket.c:999: int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, movq 144(%rax), %rax # _1->recvmsg, _2 cmpq $inet6_recvmsg, %rax #, _2 jne .L107 #, call inet6_recvmsg # <<*>> movl %eax, %ebx # tmp124, <retval> .L108: # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, <<*>> xorl %r8d, %r8d # tmp127 <<*>> testl %ebx, %ebx # <retval> # net/socket.c:1004: sock->sk->sk_protocol, <<*>> movq 24(%rbp), %rsi # sock_19(D)->sk, _10 # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, <<*>> cmovle %ebx, %r8d # <retval>,, tmp119 <<*>> testb $2, %r12b #, flags # net/socket.c:1004: sock->sk->sk_protocol, <<*>> movzwl 516(%rsi), %ecx # _10->sk_protocol, # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, <<*>> movzwl 16(%rsi), %edx # _10->__sk_common.skc_family, # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, <<*>> cmove %ebx, %r8d # tmp119,, <retval>, iftmp.54_16 # ./arch/x86/include/asm/jump_label.h:27: asm_volatile_goto("1:" #APP # 27 "./arch/x86/include/asm/jump_label.h" 1 1:jmp .L111 # objtool NOPs this # .pushsection __jump_table, "aw" .balign 8 .long 1b - . .long .L111 - . # .quad __tracepoint_sock_recv_length+8 + 2 - . #, .popsection # 0 "" 2 #NO_APP .L106: # net/socket.c:1008: } <<*>> movl %ebx, %eax # <retval>, popq %rbx # popq %rbp # popq %r12 # ret .L111: # ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length, sock_recv_length, > That is, when the tracepoint in the block gets enabled so does the above > branch. Sure, there could be a race between the two being enabled, but I > don't see any issue if there is. But the process to modify the jump labels, > does a bunch of synchronization between the CPUs. > > What barrier are you expecting? 
Something preventing the compiler from being 'smart' and forcing expression
evaluations before TP_fast_assign() is eventually called.

>
> -- Steve
>
> >
> > >
> > >                 call_trace_sock_send_length(sock->sk, sock->sk->sk_family,
> > >                                             sock->sk->sk_protocol, ret, 0);
> > >         }
> > >         return ret;
> > > }
> > >
> > > What do you think?
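One literal reading of the barrier() suggestion, applied to the guarded call site (only a sketch; whether a compiler barrier here is sufficient, and whether individual call sites should need one at all, is exactly what the thread is debating):

static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
	int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
				     inet_sendmsg, sock, msg,
				     msg_data_left(msg));
	BUG_ON(ret == -EIOCBQUEUED);

	if (trace_sock_send_length_enabled()) {
		/* compiler barrier: forbid hoisting the sk loads above the branch */
		barrier();
		call_trace_sock_send_length(sock->sk, sock->sk->sk_family,
					    sock->sk->sk_protocol, ret, 0);
	}
	return ret;
}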
On Mon, 9 Jan 2023 16:20:58 +0100 Eric Dumazet <edumazet@google.com> wrote: > On Mon, Jan 9, 2023 at 4:08 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Mon, 9 Jan 2023 15:54:38 +0100 > > Eric Dumazet <edumazet@google.com> wrote: > > > > > > static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) > > > > { > > > > int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, > > > > inet_sendmsg, sock, msg, > > > > msg_data_left(msg)); > > > > BUG_ON(ret == -EIOCBQUEUED); > > > > > > > > if (trace_sock_send_length_enabled()) { > > > > > > A barrier() is needed here, with the current state of affairs. > > > > > > IMO, ftrace/x86 experts should take care of this generic issue ? > > > > trace_*_enabled() is a static_branch() (aka. jump label). > > > > It's a nop, where the if block is in the out-of-line code and skipped. When > > the tracepoint is enabled, it gets turned into a jump to the if block > > (which returns back to this location). > > > > This is not a nop, as shown in the generated assembly, I copied in > this thread earlier. But I thought that was for the patch before the added noinline helper function to force the tracepoint into its own function, and this now just has the static branch. > > Compiler does all sorts of things before the point the static branch > is looked at. > > Let's add the extract again with <<*>> tags on added instructions/dereferences. > > Ug, bad line wrapping > sock_recvmsg_nosec: > pushq %r12 # > movl %edx, %r12d # tmp123, flags > pushq %rbp # > # net/socket.c:999: int ret = > INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, > movl %r12d, %ecx # flags, > # net/socket.c:998: { > movq %rdi, %rbp # tmp121, sock > pushq %rbx # > # net/socket.c:999: int ret = > INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, > movq 32(%rdi), %rax # sock_19(D)->ops, sock_19(D)->ops > # ./include/linux/uio.h:270: return i->count; > movq 32(%rsi), %rdx # MEM[(const struct iov_iter > *)msg_20(D) + 16B].count, pretmp_48 > # net/socket.c:999: int ret = > INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, > movq 144(%rax), %rax # _1->recvmsg, _2 > cmpq $inet6_recvmsg, %rax #, _2 > jne .L107 #, > call inet6_recvmsg # > <<*>> movl %eax, %ebx # tmp124, <retval> > .L108: > # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, > <<*>> xorl %r8d, %r8d # tmp127 > <<*>> testl %ebx, %ebx # <retval> > # net/socket.c:1004: sock->sk->sk_protocol, > <<*>> movq 24(%rbp), %rsi # sock_19(D)->sk, _10 > # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, > <<*>> cmovle %ebx, %r8d # <retval>,, tmp119 > <<*>> testb $2, %r12b #, flags > # net/socket.c:1004: sock->sk->sk_protocol, > <<*>> movzwl 516(%rsi), %ecx # _10->sk_protocol, > # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, > <<*>> movzwl 16(%rsi), %edx # _10->__sk_common.skc_family, > # net/socket.c:1003: trace_sock_recv_length(sock->sk, sock->sk->sk_family, > <<*>> cmove %ebx, %r8d # tmp119,, <retval>, iftmp.54_16 > # ./arch/x86/include/asm/jump_label.h:27: asm_volatile_goto("1:" > #APP > # 27 "./arch/x86/include/asm/jump_label.h" 1 > 1:jmp .L111 # objtool NOPs this # > .pushsection __jump_table, "aw" > .balign 8 > .long 1b - . > .long .L111 - . # > .quad __tracepoint_sock_recv_length+8 + 2 - . 
#, > .popsection > > # 0 "" 2 > #NO_APP > .L106: > # net/socket.c:1008: } > <<*>> movl %ebx, %eax # <retval>, > popq %rbx # > popq %rbp # > popq %r12 # > ret > .L111: > # ./include/trace/events/sock.h:308: DEFINE_EVENT(sock_msg_length, > sock_recv_length, > > > That is, when the tracepoint in the block gets enabled so does the above > > branch. Sure, there could be a race between the two being enabled, but I > > don't see any issue if there is. But the process to modify the jump labels, > > does a bunch of synchronization between the CPUs. > > > > What barrier are you expecting? > > Something preventing the compiler being 'smart', forcing expression evaluations > before TP_fast_assign() is eventually called. There's no good way to generically keep the compiler from being 'smart', that I know of. That's because the tracepoint injection is defined by: #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ static inline void __trace_##name(proto) \ { That (proto) is the prototype being passed in. And because macros can't create other macros, I don't know how to have a way to inject a barrier() before and after that call, or better yet, to have the prototype hidden behind the static_branch. But looking at this tracepoint again, I see a issue that can help with the dereferencing. Why is family and protocol passed in? + trace_sock_send_length(sock->sk, sock->sk->sk_family, + sock->sk->sk_protocol, ret, 0); Where the TP_fast_assign() is: + TP_fast_assign( + __entry->sk = sk; + __entry->family = sk->sk_family; + __entry->protocol = sk->sk_protocol; + __entry->length = ret > 0 ? ret : 0; + __entry->error = ret < 0 ? ret : 0; + __entry->flags = flags; + ), The family and protocol is taken from the sk, and not the parameters. I bet dropping those would help. -- Steve
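To illustrate Steve's point: if only the sock pointer is passed, the sk dereferences move into TP_fast_assign(), which only runs when the event actually fires. A sketch of that shape (an illustration of the suggestion only, not the actual v5 posting):

DECLARE_EVENT_CLASS(sock_msg_length,

	TP_PROTO(struct sock *sk, int ret, int flags),

	TP_ARGS(sk, ret, flags),

	TP_STRUCT__entry(
		__field(void *, sk)
		__field(__u16, family)
		__field(__u16, protocol)
		__field(int, length)
		__field(int, error)
		__field(int, flags)
	),

	TP_fast_assign(
		__entry->sk = sk;
		/* derived here, i.e. only when the tracepoint is enabled and fires */
		__entry->family = sk->sk_family;
		__entry->protocol = sk->sk_protocol;
		__entry->length = ret > 0 ? ret : 0;
		__entry->error = ret < 0 ? ret : 0;
		__entry->flags = flags;
	),

	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
		  __entry->sk, show_family_name(__entry->family),
		  show_inet_protocol_name(__entry->protocol),
		  __entry->length,
		  __entry->error, __entry->flags)
);

/* the call site then only passes what TP_fast_assign() cannot derive */
trace_sock_send_length(sock->sk, ret, 0);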
On Sun, Jan 08, 2023 at 10:55:45AM +0800, Yunhui Cui wrote: >Add 2 tracepoints to monitor the tcp/udp traffic >of per process and per cgroup. > >Regarding monitoring the tcp/udp traffic of each process, there are two >existing solutions, the first one is https://www.atoptool.nl/netatop.php. >The second is via kprobe/kretprobe. > >Netatop solution is implemented by registering the hook function at the >hook point provided by the netfilter framework. > >These hook functions may be in the soft interrupt context and cannot >directly obtain the pid. Some data structures are added to bind packets >and processes. For example, struct taskinfobucket, struct taskinfo ... > >Every time the process sends and receives packets it needs multiple >hashmaps,resulting in low performance and it has the problem fo inaccurate >tcp/udp traffic statistics(for example: multiple threads share sockets). > >We can obtain the information with kretprobe, but as we know, kprobe gets >the result by trappig in an exception, which loses performance compared >to tracepoint. > >We compared the performance of tracepoints with the above two methods, and >the results are as follows: > >ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html AFAIK, ab are relatively slow compared to some network benchmarks since it's a http benchmark. Can you test other benchmarks like sockperf or redis-benchmark with small packets, and check the PPS drop ? Those benchmarks should have larger PPS. Like Eric pointed out, those tracepoints in the datapath are not free, its better to make sure we don't get a noticeable PPS drop after adding those tracepoints. Thanks. >without trace: >Time per request: 39.660 [ms] (mean) >Time per request: 0.040 [ms] (mean, across all concurrent requests) > >netatop: >Time per request: 50.717 [ms] (mean) >Time per request: 0.051 [ms] (mean, across all concurrent requests) > >kr: >Time per request: 43.168 [ms] (mean) >Time per request: 0.043 [ms] (mean, across all concurrent requests) > >tracepoint: >Time per request: 41.004 [ms] (mean) >Time per request: 0.041 [ms] (mean, across all concurrent requests > >It can be seen that tracepoint has better performance. > >Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com> >Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com> >--- > include/trace/events/sock.h | 48 +++++++++++++++++++++++++++++++++++++ > net/socket.c | 23 ++++++++++++++---- > 2 files changed, 67 insertions(+), 4 deletions(-) > >diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h >index 777ee6cbe933..d00a5b272404 100644 >--- a/include/trace/events/sock.h >+++ b/include/trace/events/sock.h >@@ -263,6 +263,54 @@ TRACE_EVENT(inet_sk_error_report, > __entry->error) > ); > >+/* >+ * sock send/recv msg length >+ */ >+DECLARE_EVENT_CLASS(sock_msg_length, >+ >+ TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, >+ int flags), >+ >+ TP_ARGS(sk, family, protocol, ret, flags), >+ >+ TP_STRUCT__entry( >+ __field(void *, sk) >+ __field(__u16, family) >+ __field(__u16, protocol) >+ __field(int, length) >+ __field(int, error) >+ __field(int, flags) >+ ), >+ >+ TP_fast_assign( >+ __entry->sk = sk; >+ __entry->family = sk->sk_family; >+ __entry->protocol = sk->sk_protocol; >+ __entry->length = ret > 0 ? ret : 0; >+ __entry->error = ret < 0 ? 
ret : 0; >+ __entry->flags = flags; >+ ), >+ >+ TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", >+ __entry->sk, show_family_name(__entry->family), >+ show_inet_protocol_name(__entry->protocol), >+ __entry->length, >+ __entry->error, __entry->flags) >+); >+ >+DEFINE_EVENT(sock_msg_length, sock_send_length, >+ TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, >+ int flags), >+ >+ TP_ARGS(sk, family, protocol, ret, flags) >+); >+ >+DEFINE_EVENT(sock_msg_length, sock_recv_length, >+ TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret, >+ int flags), >+ >+ TP_ARGS(sk, family, protocol, ret, flags) >+); > #endif /* _TRACE_SOCK_H */ > > /* This part must be outside protection */ >diff --git a/net/socket.c b/net/socket.c >index 888cd618a968..60a1ff95b4b1 100644 >--- a/net/socket.c >+++ b/net/socket.c >@@ -106,6 +106,7 @@ > #include <net/busy_poll.h> > #include <linux/errqueue.h> > #include <linux/ptp_clock_kernel.h> >+#include <trace/events/sock.h> > > #ifdef CONFIG_NET_RX_BUSY_POLL > unsigned int sysctl_net_busy_read __read_mostly; >@@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) > inet_sendmsg, sock, msg, > msg_data_left(msg)); > BUG_ON(ret == -EIOCBQUEUED); >+ >+ trace_sock_send_length(sock->sk, sock->sk->sk_family, >+ sock->sk->sk_protocol, ret, 0); > return ret; > } > >@@ -992,9 +996,15 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, > static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, > int flags) > { >- return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, >- inet_recvmsg, sock, msg, msg_data_left(msg), >- flags); >+ int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, >+ inet_recvmsg, sock, msg, >+ msg_data_left(msg), flags); >+ >+ trace_sock_recv_length(sock->sk, sock->sk->sk_family, >+ sock->sk->sk_protocol, >+ !(flags & MSG_PEEK) ? ret : >+ (ret < 0 ? ret : 0), flags); >+ return ret; > } > > /** >@@ -1044,6 +1054,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, > { > struct socket *sock; > int flags; >+ int ret; > > sock = file->private_data; > >@@ -1051,7 +1062,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, > /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ > flags |= more; > >- return kernel_sendpage(sock, page, offset, size, flags); >+ ret = kernel_sendpage(sock, page, offset, size, flags); >+ >+ trace_sock_send_length(sock->sk, sock->sk->sk_family, >+ sock->sk->sk_protocol, ret, 0); >+ return ret; > } > > static ssize_t sock_splice_read(struct file *file, loff_t *ppos, >-- >2.20.1
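For reference, one possible way to run the suggested packet-rate comparisons (command lines are illustrative only; ports, message sizes and durations would need tuning, and each run would be repeated with and without the tracepoints enabled):

    # sockperf: small UDP messages over loopback, compare the reported msg/sec
    sockperf server -i 127.0.0.1 -p 11111 &
    sockperf throughput -i 127.0.0.1 -p 11111 -m 14 -t 30

    # redis-benchmark: small pipelined SET/GET, compare requests per second
    redis-benchmark -h 127.0.0.1 -t set,get -d 10 -n 1000000 -P 16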
On Mon, Jan 9, 2023 at 11:39 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> But looking at this tracepoint again, I see a issue that can help with the
> dereferencing.
>
> Why is family and protocol passed in?
>
> +	trace_sock_send_length(sock->sk, sock->sk->sk_family,
> +			       sock->sk->sk_protocol, ret, 0);
>
> Where the TP_fast_assign() is:
>
> +	TP_fast_assign(
> +		__entry->sk = sk;
> +		__entry->family = sk->sk_family;
> +		__entry->protocol = sk->sk_protocol;
> +		__entry->length = ret > 0 ? ret : 0;
> +		__entry->error = ret < 0 ? ret : 0;
> +		__entry->flags = flags;
> +	),
>
> The family and protocol is taken from the sk, and not the parameters. I bet
> dropping those would help.
>
> -- Steve
>

Many thanks to Eric and Steven for the discussion and suggestions; I will
update this in v5.

Thanks,
Yunhui
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 777ee6cbe933..d00a5b272404 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -263,6 +263,54 @@ TRACE_EVENT(inet_sk_error_report,
 		  __entry->error)
 );
 
+/*
+ * sock send/recv msg length
+ */
+DECLARE_EVENT_CLASS(sock_msg_length,
+
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags),
+
+	TP_STRUCT__entry(
+		__field(void *, sk)
+		__field(__u16, family)
+		__field(__u16, protocol)
+		__field(int, length)
+		__field(int, error)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->family = sk->sk_family;
+		__entry->protocol = sk->sk_protocol;
+		__entry->length = ret > 0 ? ret : 0;
+		__entry->error = ret < 0 ? ret : 0;
+		__entry->flags = flags;
+	),
+
+	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
+		  __entry->sk, show_family_name(__entry->family),
+		  show_inet_protocol_name(__entry->protocol),
+		  __entry->length,
+		  __entry->error, __entry->flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_send_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_recv_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags)
+);
 #endif /* _TRACE_SOCK_H */
 
 /* This part must be outside protection */
diff --git a/net/socket.c b/net/socket.c
index 888cd618a968..60a1ff95b4b1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -106,6 +106,7 @@
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
 #include <linux/ptp_clock_kernel.h>
+#include <trace/events/sock.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
@@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
 				     inet_sendmsg, sock, msg,
 				     msg_data_left(msg));
 	BUG_ON(ret == -EIOCBQUEUED);
+
+	trace_sock_send_length(sock->sk, sock->sk->sk_family,
+			       sock->sk->sk_protocol, ret, 0);
 	return ret;
 }
 
@@ -992,9 +996,15 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
-	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
-				  inet_recvmsg, sock, msg, msg_data_left(msg),
-				  flags);
+	int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
+				     inet_recvmsg, sock, msg,
+				     msg_data_left(msg), flags);
+
+	trace_sock_recv_length(sock->sk, sock->sk->sk_family,
+			       sock->sk->sk_protocol,
+			       !(flags & MSG_PEEK) ? ret :
+			       (ret < 0 ? ret : 0), flags);
+	return ret;
 }
 
 /**
@@ -1044,6 +1054,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 {
 	struct socket *sock;
 	int flags;
+	int ret;
 
 	sock = file->private_data;
 
@@ -1051,7 +1062,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
 	flags |= more;
 
-	return kernel_sendpage(sock, page, offset, size, flags);
+	ret = kernel_sendpage(sock, page, offset, size, flags);
+
+	trace_sock_send_length(sock->sk, sock->sk->sk_family,
+			       sock->sk->sk_protocol, ret, 0);
+	return ret;
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
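With a kernel carrying this patch, the new events would presumably be usable from tracefs like any other sock:* tracepoint, which matches the per-process use case in the cover letter. An illustrative session (paths assume tracefs is mounted at /sys/kernel/tracing; the pid is a placeholder):

    cd /sys/kernel/tracing

    # per-process accounting: filter on one pid, then enable both events
    echo 'common_pid == 1234' > events/sock/sock_send_length/filter
    echo 'common_pid == 1234' > events/sock/sock_recv_length/filter
    echo 1 > events/sock/sock_send_length/enable
    echo 1 > events/sock/sock_recv_length/enable
    cat trace_pipe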