
[bpf-next] libbpf: Use dynamically allocated buffer when receiving netlink messages

Message ID 20220211195101.591642-1-toke@redhat.com (mailing list archive)
State Superseded
Delegated to: BPF
Series [bpf-next] libbpf: Use dynamically allocated buffer when receiving netlink messages

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 80 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next success VM_Test

Commit Message

Toke Høiland-Jørgensen Feb. 11, 2022, 7:51 p.m. UTC
When receiving netlink messages, libbpf was using a statically allocated
stack buffer of 4k bytes. This happened to work fine on systems with a 4k
page size, but on systems with larger page sizes it can lead to truncated
messages. The user-visible impact of this was that libbpf would insist no
XDP program was attached to some interfaces because that bit of the netlink
message got chopped off.

Fix this by switching to a dynamically allocated buffer; we borrow the
approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
the actual size of the pending message before receiving it, adjusting the
buffer as necessary. While we're at it, also add retries on interrupted
system calls around the recvmsg() call.

Reported-by: Zhiqian Guan <zhguan@redhat.com>
Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 3 deletions(-)
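
The core of the fix is the iproute2-style sizing idiom the commit message refers to: netlink is a datagram socket, so a recv() into a too-small buffer silently drops whatever does not fit, and the message has to be peeked first to learn its real size. Below is a minimal standalone sketch of that idiom, not the libbpf code itself; the helper name is made up and error handling is trimmed (it needs <sys/socket.h>, <errno.h> and <stdlib.h>):

/* Hypothetical helper, not part of libbpf: receive one netlink datagram
 * into a freshly sized heap buffer. The caller owns and frees *bufp.
 */
static int recv_dynamic(int sock, char **bufp)
{
	struct iovec iov = {};
	struct msghdr mhdr = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t len;
	void *nbuf;

	/* Peek with a zero-length buffer; MSG_TRUNC makes recvmsg() report
	 * the real size of the queued message, MSG_PEEK keeps it queued.
	 */
	len = recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
	if (len < 0)
		return -errno;

	nbuf = realloc(iov.iov_base, len);
	if (!nbuf)
		return -ENOMEM;
	iov.iov_base = nbuf;
	iov.iov_len = len;

	/* Receive the message for real, into the right-sized buffer. */
	len = recvmsg(sock, &mhdr, 0);
	if (len < 0) {
		free(iov.iov_base);
		return -errno;
	}

	*bufp = iov.iov_base;
	return len;
}

With MSG_TRUNC, recvmsg() returns the full length of the pending datagram even though the zero-length iovec receives none of it, and MSG_PEEK leaves the message queued so the second recvmsg() can pick it up into the resized buffer.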

Comments

Kumar Kartikeya Dwivedi Feb. 11, 2022, 9:12 p.m. UTC | #1
On Sat, Feb 12, 2022 at 01:21:00AM IST, Toke Høiland-Jørgensen wrote:
> When receiving netlink messages, libbpf was using a statically allocated
> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> page size, but on systems with larger page sizes it can lead to truncated
> messages. The user-visible impact of this was that libbpf would insist no
> XDP program was attached to some interfaces because that bit of the netlink
> message got chopped off.
>
> Fix this by switching to a dynamically allocated buffer; we borrow the
> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
> the actual size of the pending message before receiving it, adjusting the
> buffer as necessary. While we're at it, also add retries on interrupted
> system calls around the recvmsg() call.
>
> Reported-by: Zhiqian Guan <zhguan@redhat.com>
> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---

Thanks for the fix!

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>

>  tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 52 insertions(+), 3 deletions(-)
>
> diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
> index c39c37f99d5c..9a6e95206bf0 100644
> --- a/tools/lib/bpf/netlink.c
> +++ b/tools/lib/bpf/netlink.c
> @@ -87,22 +87,70 @@ enum {
>  	NL_DONE,
>  };
>
> +static int __libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
> +{
> +	int len;
> +
> +	do {
> +		len = recvmsg(sock, mhdr, flags);
> +	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
> +
> +	if (len < 0)
> +		return -errno;
> +	return len;
> +}
> +
> +static int libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, char **buf)
> +{
> +	struct iovec *iov = mhdr->msg_iov;
> +	void *nbuf;
> +	int len;
> +
> +	len = __libbpf_netlink_recvmsg(sock, mhdr, MSG_PEEK | MSG_TRUNC);
> +	if (len < 0)
> +		return len;
> +
> +	if (len < 4096)
> +		len = 4096;
> +
> +	if (len > iov->iov_len) {
> +		nbuf = realloc(iov->iov_base, len);
> +		if (!nbuf) {
> +			free(iov->iov_base);
> +			return -ENOMEM;
> +		}
> +		iov->iov_base = nbuf;
> +		iov->iov_len = len;
> +	}
> +
> +	len = __libbpf_netlink_recvmsg(sock, mhdr, 0);
> +	if (len > 0)
> +		*buf = iov->iov_base;
> +	return len;
> +}
> +
>  static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>  			       __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
>  			       void *cookie)
>  {
> +	struct iovec iov = {};
> +	struct msghdr mhdr = {
> +		.msg_iov = &iov,
> +		.msg_iovlen = 1,
> +	};
>  	bool multipart = true;
>  	struct nlmsgerr *err;
>  	struct nlmsghdr *nh;
> -	char buf[4096];
>  	int len, ret;
> +	char *buf;
> +
>
>  	while (multipart) {
>  start:
>  		multipart = false;
> -		len = recv(sock, buf, sizeof(buf), 0);
> +		len = libbpf_netlink_recvmsg(sock, &mhdr, &buf);
>  		if (len < 0) {
> -			ret = -errno;
> +			ret = len;
>  			goto done;
>  		}
>
> @@ -151,6 +199,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>  	}
>  	ret = 0;
>  done:
> +	free(iov.iov_base);
>  	return ret;
>  }
>
> --
> 2.35.1
>

--
Kartikeya
Andrii Nakryiko Feb. 11, 2022, 10:14 p.m. UTC | #2
On Fri, Feb 11, 2022 at 11:51 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> When receiving netlink messages, libbpf was using a statically allocated
> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> page size, but on systems with larger page sizes it can lead to truncated
> messages. The user-visible impact of this was that libbpf would insist no
> XDP program was attached to some interfaces because that bit of the netlink
> message got chopped off.
>
> Fix this by switching to a dynamically allocated buffer; we borrow the
> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
> the actual size of the pending message before receiving it, adjusting the
> buffer as necessary. While we're at it, also add retries on interrupted
> system calls around the recvmsg() call.
>
> Reported-by: Zhiqian Guan <zhguan@redhat.com>
> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>  tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 52 insertions(+), 3 deletions(-)
>
> diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
> index c39c37f99d5c..9a6e95206bf0 100644
> --- a/tools/lib/bpf/netlink.c
> +++ b/tools/lib/bpf/netlink.c
> @@ -87,22 +87,70 @@ enum {
>         NL_DONE,
>  };
>
> +static int __libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)

let's not use names starting with underscores. Just call it
"netlink_recvmsg" or something like that.

> +{
> +       int len;
> +
> +       do {
> +               len = recvmsg(sock, mhdr, flags);

recvmsg returns ssize_t, is it ok to truncate to int?


> +       } while (len < 0 && (errno == EINTR || errno == EAGAIN));
> +
> +       if (len < 0)
> +               return -errno;
> +       return len;
> +}
> +
> +static int libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, char **buf)
> +{
> +       struct iovec *iov = mhdr->msg_iov;
> +       void *nbuf;
> +       int len;
> +
> +       len = __libbpf_netlink_recvmsg(sock, mhdr, MSG_PEEK | MSG_TRUNC);
> +       if (len < 0)
> +               return len;
> +
> +       if (len < 4096)
> +               len = 4096;
> +
> +       if (len > iov->iov_len) {
> +               nbuf = realloc(iov->iov_base, len);
> +               if (!nbuf) {
> +                       free(iov->iov_base);
> +                       return -ENOMEM;
> +               }
> +               iov->iov_base = nbuf;

this function both sets iov->iov_base *and* returns buf. It's quite a
convoluted contract. Seems like buf is not necessary (and also NULL
out iov->iov_base in case of error above?). But it might be cleaner to
do this MSG_PEEK  + realloc + recvmsg  in libbpf_netlink_recv()
explicitly. It's only one place.


> +               iov->iov_len = len;
> +       }
> +
> +       len = __libbpf_netlink_recvmsg(sock, mhdr, 0);
> +       if (len > 0)
> +               *buf = iov->iov_base;
> +       return len;
> +}
> +
>  static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>                                __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
>                                void *cookie)
>  {
> +       struct iovec iov = {};
> +       struct msghdr mhdr = {
> +               .msg_iov = &iov,
> +               .msg_iovlen = 1,
> +       };
>         bool multipart = true;
>         struct nlmsgerr *err;
>         struct nlmsghdr *nh;
> -       char buf[4096];
>         int len, ret;
> +       char *buf;
> +
>
>         while (multipart) {
>  start:
>                 multipart = false;
> -               len = recv(sock, buf, sizeof(buf), 0);
> +               len = libbpf_netlink_recvmsg(sock, &mhdr, &buf);
>                 if (len < 0) {
> -                       ret = -errno;
> +                       ret = len;
>                         goto done;
>                 }
>
> @@ -151,6 +199,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
>         }
>         ret = 0;
>  done:
> +       free(iov.iov_base);

double free on -ENOMEM? And even more confusing why you bother with
buf at all...

>         return ret;
>  }
>
> --
> 2.35.1
>
Toke Høiland-Jørgensen Feb. 11, 2022, 11:37 p.m. UTC | #3
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Fri, Feb 11, 2022 at 11:51 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> When receiving netlink messages, libbpf was using a statically allocated
>> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
>> page size, but on systems with larger page sizes it can lead to truncated
>> messages. The user-visible impact of this was that libbpf would insist no
>> XDP program was attached to some interfaces because that bit of the netlink
>> message got chopped off.
>>
>> Fix this by switching to a dynamically allocated buffer; we borrow the
>> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
>> the actual size of the pending message before receiving it, adjusting the
>> buffer as necessary. While we're at it, also add retries on interrupted
>> system calls around the recvmsg() call.
>>
>> Reported-by: Zhiqian Guan <zhguan@redhat.com>
>> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>>  tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 52 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
>> index c39c37f99d5c..9a6e95206bf0 100644
>> --- a/tools/lib/bpf/netlink.c
>> +++ b/tools/lib/bpf/netlink.c
>> @@ -87,22 +87,70 @@ enum {
>>         NL_DONE,
>>  };
>>
>> +static int __libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
>
> let's not use names starting with underscores. Just call it
> "netlink_recvmsg" or something like that.

Alright, will fix.

>> +{
>> +       int len;
>> +
>> +       do {
>> +               len = recvmsg(sock, mhdr, flags);
>
> recvmsg returns ssize_t, is it ok to truncate to int?

In practice, yeah; the kernel is not going to return a single message
that overflows an int, even on 32bit. And with an int return type it's
more natural to return -errno instead of having the caller deal with
that. So unless you have strong objections I'd prefer to keep it this
way...

>> +       } while (len < 0 && (errno == EINTR || errno == EAGAIN));
>> +
>> +       if (len < 0)
>> +               return -errno;
>> +       return len;
>> +}
>> +
>> +static int libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, char **buf)
>> +{
>> +       struct iovec *iov = mhdr->msg_iov;
>> +       void *nbuf;
>> +       int len;
>> +
>> +       len = __libbpf_netlink_recvmsg(sock, mhdr, MSG_PEEK | MSG_TRUNC);
>> +       if (len < 0)
>> +               return len;
>> +
>> +       if (len < 4096)
>> +               len = 4096;
>> +
>> +       if (len > iov->iov_len) {
>> +               nbuf = realloc(iov->iov_base, len);
>> +               if (!nbuf) {
>> +                       free(iov->iov_base);
>> +                       return -ENOMEM;
>> +               }
>> +               iov->iov_base = nbuf;
>
> this function both sets iov->iov_base *and* returns buf. It's quite a
> convoluted contract. Seems like buf is not necessary (and also NULL
> out iov->iov_base in case of error above?). But it might be cleaner to
> do this MSG_PEEK  + realloc + recvmsg  in libbpf_netlink_recv()
> explicitly. It's only one place.

Hmm, yeah, if I wrap the realloc code in a small helper that works; will
fix.

-Toke
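
For reference, one way the restructuring agreed to above could look; this is only a sketch under the assumptions discussed in this thread (the retry wrapper renamed to netlink_recvmsg, the realloc moved into a small grow helper, and the peek/grow/receive sequence done explicitly in libbpf_netlink_recv()), not the actual follow-up patch:

/* Sketch only: helper names and placement are assumptions, not the v2 patch. */
static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
{
	int len;

	do {
		len = recvmsg(sock, mhdr, flags);
	} while (len < 0 && (errno == EINTR || errno == EAGAIN));

	if (len < 0)
		return -errno;
	return len;
}

/* Grow the receive buffer; on failure the old buffer is left untouched,
 * so libbpf_netlink_recv() stays the single owner of iov.iov_base.
 */
static int alloc_iov(struct iovec *iov, int len)
{
	void *nbuf;

	nbuf = realloc(iov->iov_base, len);
	if (!nbuf)
		return -ENOMEM;

	iov->iov_base = nbuf;
	iov->iov_len = len;
	return 0;
}

	/* ...and in libbpf_netlink_recv(), the peek/grow/receive steps done
	 * explicitly in the receive loop:
	 */
		len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
		if (len < 0) {
			ret = len;
			goto done;
		}
		if (len > iov.iov_len) {
			ret = alloc_iov(&iov, len);
			if (ret < 0)
				goto done;
		}
		len = netlink_recvmsg(sock, &mhdr, 0);
		if (len < 0) {
			ret = len;
			goto done;
		}
		buf = iov.iov_base;
		/* ... message parsing unchanged ... */
done:
	free(iov.iov_base);	/* freed exactly once, on every path */

Because the grow helper leaves the old buffer in place when realloc() fails, the only free() is the one at the done: label, which also avoids the double free noted earlier in the thread.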
Andrii Nakryiko Feb. 11, 2022, 11:40 p.m. UTC | #4
On Fri, Feb 11, 2022 at 3:37 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Fri, Feb 11, 2022 at 11:51 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> When receiving netlink messages, libbpf was using a statically allocated
> >> stack buffer of 4k bytes. This happened to work fine on systems with a 4k
> >> page size, but on systems with larger page sizes it can lead to truncated
> >> messages. The user-visible impact of this was that libbpf would insist no
> >> XDP program was attached to some interfaces because that bit of the netlink
> >> message got chopped off.
> >>
> >> Fix this by switching to a dynamically allocated buffer; we borrow the
> >> approach from iproute2 of using recvmsg() with MSG_PEEK|MSG_TRUNC to get
> >> the actual size of the pending message before receiving it, adjusting the
> >> buffer as necessary. While we're at it, also add retries on interrupted
> >> system calls around the recvmsg() call.
> >>
> >> Reported-by: Zhiqian Guan <zhguan@redhat.com>
> >> Fixes: 8bbb77b7c7a2 ("libbpf: Add various netlink helpers")
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> >> ---
> >>  tools/lib/bpf/netlink.c | 55 ++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 52 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
> >> index c39c37f99d5c..9a6e95206bf0 100644
> >> --- a/tools/lib/bpf/netlink.c
> >> +++ b/tools/lib/bpf/netlink.c
> >> @@ -87,22 +87,70 @@ enum {
> >>         NL_DONE,
> >>  };
> >>
> >> +static int __libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
> >
> > let's not use names starting with underscores. Just call it
> > "netlink_recvmsg" or something like that.
>
> Alright, will fix.
>
> >> +{
> >> +       int len;
> >> +
> >> +       do {
> >> +               len = recvmsg(sock, mhdr, flags);
> >
> > recvmsg returns ssize_t, is it ok to truncate to int?
>
> In practice, yeah; the kernel is not going to return a single message
> that overflows an int, even on 32bit. And with an int return type it's
> more natural to return -errno instead of having the caller deal with
> that. So unless you have strong objections I'd prefer to keep it this
> way...

yep, int is fine

>
> >> +       } while (len < 0 && (errno == EINTR || errno == EAGAIN));
> >> +
> >> +       if (len < 0)
> >> +               return -errno;
> >> +       return len;
> >> +}
> >> +
> >> +static int libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, char **buf)
> >> +{
> >> +       struct iovec *iov = mhdr->msg_iov;
> >> +       void *nbuf;
> >> +       int len;
> >> +
> >> +       len = __libbpf_netlink_recvmsg(sock, mhdr, MSG_PEEK | MSG_TRUNC);
> >> +       if (len < 0)
> >> +               return len;
> >> +
> >> +       if (len < 4096)
> >> +               len = 4096;
> >> +
> >> +       if (len > iov->iov_len) {
> >> +               nbuf = realloc(iov->iov_base, len);
> >> +               if (!nbuf) {
> >> +                       free(iov->iov_base);
> >> +                       return -ENOMEM;
> >> +               }
> >> +               iov->iov_base = nbuf;
> >
> > this function both sets iov->iov_base *and* returns buf. It's quite a
> > convoluted contract. Seems like buf is not necessary (and also NULL
> > out iov->iov_base in case of error above?). But it might be cleaner to
> > do this MSG_PEEK  + realloc + recvmsg  in libbpf_netlink_recv()
> > explicitly. It's only one place.
>
> Hmm, yeah, if I wrap the realloc code in a small helper that works; will
> fix.
>
> -Toke
>

Patch

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index c39c37f99d5c..9a6e95206bf0 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -87,22 +87,70 @@  enum {
 	NL_DONE,
 };
 
+static int __libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
+{
+	int len;
+
+	do {
+		len = recvmsg(sock, mhdr, flags);
+	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
+
+	if (len < 0)
+		return -errno;
+	return len;
+}
+
+static int libbpf_netlink_recvmsg(int sock, struct msghdr *mhdr, char **buf)
+{
+	struct iovec *iov = mhdr->msg_iov;
+	void *nbuf;
+	int len;
+
+	len = __libbpf_netlink_recvmsg(sock, mhdr, MSG_PEEK | MSG_TRUNC);
+	if (len < 0)
+		return len;
+
+	if (len < 4096)
+		len = 4096;
+
+	if (len > iov->iov_len) {
+		nbuf = realloc(iov->iov_base, len);
+		if (!nbuf) {
+			free(iov->iov_base);
+			return -ENOMEM;
+		}
+		iov->iov_base = nbuf;
+		iov->iov_len = len;
+	}
+
+	len = __libbpf_netlink_recvmsg(sock, mhdr, 0);
+	if (len > 0)
+		*buf = iov->iov_base;
+	return len;
+}
+
 static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 			       __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
 			       void *cookie)
 {
+	struct iovec iov = {};
+	struct msghdr mhdr = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
 	bool multipart = true;
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
-	char buf[4096];
 	int len, ret;
+	char *buf;
+
 
 	while (multipart) {
 start:
 		multipart = false;
-		len = recv(sock, buf, sizeof(buf), 0);
+		len = libbpf_netlink_recvmsg(sock, &mhdr, &buf);
 		if (len < 0) {
-			ret = -errno;
+			ret = len;
 			goto done;
 		}
 
@@ -151,6 +199,7 @@  static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 	}
 	ret = 0;
 done:
+	free(iov.iov_base);
 	return ret;
 }