diff mbox series

[bpf-next,v2] libbpf: Use dynamically allocated buffer when receiving netlink messages

Message ID 20220211234819.612288-1-toke@redhat.com (mailing list archive)
State Accepted
Commit 9c3de619e13ee6693ec5ac74f50b7aa89056a70e
Delegated to: BPF
Headers show
Series [bpf-next,v2] libbpf: Use dynamically allocated buffer when receiving netlink messages | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 86 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Toke Høiland-Jørgensen Feb. 11, 2022, 11:48 p.m. UTC
When receiving netlink messages, libbpf was using a statically allocated
stack buffer of 4k bytes. This happened to work fine on systems with a 4k
page size, but on systems with larger page sizes it can lead to truncated
messages. The user-visible impact of this was that libbpf would insist no
XDP program was attached to some interfaces because that bit of the netlink
message got chopped off.

Fix this by switching to a dynamically allocated buffer; we borrow the
approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
the actual size of the pending message before receiving it, adjusting the
buffer as necessary. While we're at it, also add retries on interrupted
system calls around the recvmsg() call.

v2:
  - Move peek logic to libbpf_netlink_recv(), don't double free on ENOMEM.

Reported-by: Zhiqian Guan <zhguan@redhat.com>
Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

Comments

Andrii Nakryiko Feb. 12, 2022, 3:59 p.m. UTC | #1
On Fri, Feb 11, 2022 at 3:49 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> When receiving netlink messages, libbpf was using a statically allocated
> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> page size, but on systems with larger page sizes it can lead to truncated
> messages. The user-visible impact of this was that libbpf would insist no
> XDP program was attached to some interfaces because that bit of the netlink
> message got chopped off.
>
> Fix this by switching to a dynamically allocated buffer; we borrow the
> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
> the actual size of the pending message before receiving it, adjusting the
> buffer as necessary. While we're at it, also add retries on interrupted
> system calls around the recvmsg() call.
>
> v2:
>   - Move peek logic to libbpf_netlink_recv(), don't double free on ENOMEM.
>
> Reported-by: Zhiqian Guan <zhguan@redhat.com>
> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---

Applied to bpf-next. One improvement would be to avoid the initial malloc
of 4096 bytes, especially if that size is enough for most cases. You could
keep the on-stack buf as the starting buffer, detect that it is still in
use through iov.iov_base == buf, and not free(iov.iov_base) at the end in
that case. Seems reliable and simple enough. I'll leave it up to you to
follow up, if you think it's a good idea.

>  tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
> index c39c37f99d5c..a598061f6fea 100644
> --- a/tools/lib/bpf/netlink.c
> +++ b/tools/lib/bpf/netlink.c
> @@ -87,29 +87,75 @@ enum {
>         NL_DONE,
>  };
>
> +static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
> +{
> +       int len;
> +
> +       do {
> +               len = recvmsg(sock, mhdr, flags);
> +       } while (len < 0 && (errno == EINTR || errno == EAGAIN));
> +
> +       if (len < 0)
> +               return -errno;
> +       return len;
> +}
> +
> +static int alloc_iov(struct iovec *iov, int len)
> +{
> +       void *nbuf;
> +
> +       nbuf = realloc(iov->iov_base, len);
> +       if (!nbuf)
> +               return -ENOMEM;
> +
> +       iov->iov_base = nbuf;
> +       iov->iov_len = len;
> +       return 0;
> +}
> +
>  static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>                                __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
>                                void *cookie)
>  {
> +       struct iovec iov = {};
> +       struct msghdr mhdr = {
> +               .msg_iov = &iov,
> +               .msg_iovlen = 1,
> +       };
>         bool multipart = true;
>         struct nlmsgerr *err;
>         struct nlmsghdr *nh;
> -       char buf[4096];
>         int len, ret;
>
> +       ret = alloc_iov(&iov, 4096);
> +       if (ret)
> +               goto done;
> +
>         while (multipart) {
>  start:
>                 multipart = false;
> -               len = recv(sock, buf, sizeof(buf), 0);
> +               len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
> +               if (len < 0) {
> +                       ret = len;
> +                       goto done;
> +               }
> +
> +               if (len > iov.iov_len) {
> +                       ret = alloc_iov(&iov, len);
> +                       if (ret)
> +                               goto done;
> +               }
> +
> +               len = netlink_recvmsg(sock, &mhdr, 0);
>                 if (len < 0) {
> -                       ret = -errno;
> +                       ret = len;
>                         goto done;
>                 }
>
>                 if (len == 0)
>                         break;
>
> -               for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
> +               for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len);
>                      nh = NLMSG_NEXT(nh, len)) {
>                         if (nh->nlmsg_pid != nl_pid) {
>                                 ret = -LIBBPF_ERRNO__WRNGPID;
> @@ -151,6 +197,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>         }
>         ret = 0;
>  done:
> +       free(iov.iov_base);
>         return ret;
>  }
>
> --
> 2.35.1
>
patchwork-bot+netdevbpf@kernel.org Feb. 12, 2022, 4:10 p.m. UTC | #2
Hello:

This patch was applied to bpf/bpf-next.git (master)
by Andrii Nakryiko <andrii@kernel.org>:

On Sat, 12 Feb 2022 00:48:19 +0100 you wrote:
> When receiving netlink messages, libbpf was using a statically allocated
> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> page size, but on systems with larger page sizes it can lead to truncated
> messages. The user-visible impact of this was that libbpf would insist no
> XDP program was attached to some interfaces because that bit of the netlink
> message got chopped off.
> 
> [...]

Here is the summary with links:
  - [bpf-next,v2] libbpf: Use dynamically allocated buffer when receiving netlink messages
    https://git.kernel.org/bpf/bpf-next/c/9c3de619e13e

You are awesome, thank you!
Toke Høiland-Jørgensen Feb. 13, 2022, 3:17 p.m. UTC | #3
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Fri, Feb 11, 2022 at 3:49 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> When receiving netlink messages, libbpf was using a statically allocated
>> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
>> page size, but on systems with larger page sizes it can lead to truncated
>> messages. The user-visible impact of this was that libbpf would insist no
>> XDP program was attached to some interfaces because that bit of the netlink
>> message got chopped off.
>>
>> Fix this by switching to a dynamically allocated buffer; we borrow the
>> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
>> the actual size of the pending message before receiving it, adjusting the
>> buffer as necessary. While we're at it, also add retries on interrupted
>> system calls around the recvmsg() call.
>>
>> v2:
>>   - Move peek logic to libbpf_netlink_recv(), don't double free on ENOMEM.
>>
>> Reported-by: Zhiqian Guan <zhguan@redhat.com>
>> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
>> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>
> Applied to bpf-next.

Awesome, thanks!

> One improvement would be to avoid initial malloc of 4096, especially
> if that size is enough for most cases. You could detect this through
> iov.iov_base == buf and not free(iov.iov_base) at the end. Seems
> reliable and simple enough. I'll leave it up to you to follow up, if
> you think it's a good idea.

Hmm, seems distributions tend to default the stack size limit to 8k; so
not sure if blowing half of that on a buffer just to avoid a call to
malloc() in a non-performance-sensitive code path is ideal to begin with?
I think I'd prefer to just keep the dynamic allocation...

-Toke
Andrii Nakryiko Feb. 14, 2022, 5:52 a.m. UTC | #4
On Sun, Feb 13, 2022 at 7:17 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Fri, Feb 11, 2022 at 3:49 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> When receiving netlink messages, libbpf was using a statically allocated
> >> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> >> page size, but on systems with larger page sizes it can lead to truncated
> >> messages. The user-visible impact of this was that libbpf would insist no
> >> XDP program was attached to some interfaces because that bit of the netlink
> >> message got chopped off.
> >>
> >> Fix this by switching to a dynamically allocated buffer; we borrow the
> >> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
> >> the actual size of the pending message before receiving it, adjusting the
> >> buffer as necessary. While we're at it, also add retries on interrupted
> >> system calls around the recvmsg() call.
> >>
> >> v2:
> >>   - Move peek logic to libbpf_netlink_recv(), don't double free on ENOMEM.
> >>
> >> Reported-by: Zhiqian Guan <zhguan@redhat.com>
> >> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
> >> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> >> ---
> >
> > Applied to bpf-next.
>
> Awesome, thanks!
>
> > One improvement would be to avoid initial malloc of 4096, especially
> > if that size is enough for most cases. You could detect this through
> > iov.iov_base == buf and not free(iov.iov_base) at the end. Seems
> > reliable and simple enough. I'll leave it up to you to follow up, if
> > you think it's a good idea.
>
> Hmm, seems distributions tend to default the stack size limit to 8k; so
> not sure if blowing half of that on a buffer just to avoid a call to
> malloc() in a non-performance-sensitive code path is ideal to begin with?
> I think I'd prefer to just keep the dynamic allocation...

8KB for user-space thread stack, really? Not 2MB by default? Are you
sure you are not confusing this with kernel threads?

>
> -Toke
>
Toke Høiland-Jørgensen Feb. 14, 2022, 4:52 p.m. UTC | #5
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Sun, Feb 13, 2022 at 7:17 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> > On Fri, Feb 11, 2022 at 3:49 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> >>
>> >> When receiving netlink messages, libbpf was using a statically allocated
>> >> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
>> >> page size, but on systems with larger page sizes it can lead to truncated
>> >> messages. The user-visible impact of this was that libbpf would insist no
>> >> XDP program was attached to some interfaces because that bit of the netlink
>> >> message got chopped off.
>> >>
>> >> Fix this by switching to a dynamically allocated buffer; we borrow the
>> >> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
>> >> the actual size of the pending message before receiving it, adjusting the
>> >> buffer as necessary. While we're at it, also add retries on interrupted
>> >> system calls around the recvmsg() call.
>> >>
>> >> v2:
>> >>   - Move peek logic to libbpf_netlink_recv(), don't double free on ENOMEM.
>> >>
>> >> Reported-by: Zhiqian Guan <zhguan@redhat.com>
>> >> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
>> >> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> >> ---
>> >
>> > Applied to bpf-next.
>>
>> Awesome, thanks!
>>
>> > One improvement would be to avoid initial malloc of 4096, especially
>> > if that size is enough for most cases. You could detect this through
>> > iov.iov_base == buf and not free(iov.iov_base) at the end. Seems
>> > reliable and simple enough. I'll leave it up to you to follow up, if
>> > you think it's a good idea.
>>
>> Hmm, seems distributions tend to default the stack size limit to 8k; so
>> not sure if blowing half of that on a buffer just to avoid a call to
>> malloc() in a non-performance-sensitive code path is ideal to begin with?
>> I think I'd prefer to just keep the dynamic allocation...
>
> 8KB for user-space thread stack, really? Not 2MB by default? Are you
> sure you are not confusing this with kernel threads?

Ha, oops! I was looking in the right place, just got the units wrong;
those were kbytes not bytes, so 8M stack size. Sorry for the confusion :)

-Toke
diff mbox series

Patch

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index c39c37f99d5c..a598061f6fea 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -87,29 +87,75 @@  enum {
 	NL_DONE,
 };
 
+static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
+{
+	int len;
+
+	do {
+		len = recvmsg(sock, mhdr, flags);
+	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
+
+	if (len < 0)
+		return -errno;
+	return len;
+}
+
+static int alloc_iov(struct iovec *iov, int len)
+{
+	void *nbuf;
+
+	nbuf = realloc(iov->iov_base, len);
+	if (!nbuf)
+		return -ENOMEM;
+
+	iov->iov_base = nbuf;
+	iov->iov_len = len;
+	return 0;
+}
+
 static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 			       __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
 			       void *cookie)
 {
+	struct iovec iov = {};
+	struct msghdr mhdr = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
 	bool multipart = true;
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
-	char buf[4096];
 	int len, ret;
 
+	ret = alloc_iov(&iov, 4096);
+	if (ret)
+		goto done;
+
 	while (multipart) {
 start:
 		multipart = false;
-		len = recv(sock, buf, sizeof(buf), 0);
+		len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
+		if (len < 0) {
+			ret = len;
+			goto done;
+		}
+
+		if (len > iov.iov_len) {
+			ret = alloc_iov(&iov, len);
+			if (ret)
+				goto done;
+		}
+
+		len = netlink_recvmsg(sock, &mhdr, 0);
 		if (len < 0) {
-			ret = -errno;
+			ret = len;
 			goto done;
 		}
 
 		if (len == 0)
 			break;
 
-		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+		for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len);
 		     nh = NLMSG_NEXT(nh, len)) {
 			if (nh->nlmsg_pid != nl_pid) {
 				ret = -LIBBPF_ERRNO__WRNGPID;
@@ -151,6 +197,7 @@  static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 	}
 	ret = 0;
 done:
+	free(iov.iov_base);
 	return ret;
 }