diff mbox series

[v5,bpf-next,01/10] lib/buildid: harden build ID parsing logic

Message ID 20240813002932.3373935-2-andrii@kernel.org (mailing list archive)
State New
Headers show
Series Harden and extend ELF build ID parsing logic | expand

Commit Message

Andrii Nakryiko Aug. 13, 2024, 12:29 a.m. UTC
Harden build ID parsing logic, adding explicit READ_ONCE() where it's
important to have a consistent value read and validated just once.

Also, as pointed out by Andi Kleen, we need to make sure that entire ELF
note is within a page bounds, so move the overflow check up and add an
extra note_size boundaries validation.

Fixes tag below points to the code that moved this code into
lib/buildid.c, and then subsequently was used in perf subsystem, making
this code exposed to perf_event_open() users in v5.12+.

Cc: stable@vger.kernel.org
Cc: Jann Horn <jannh@google.com>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 lib/buildid.c | 74 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

Comments

Andi Kleen Aug. 13, 2024, 12:52 a.m. UTC | #1
> @@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
>  	page = find_get_page(vma->vm_file->f_mapping, 0);
>  	if (!page)
>  		return -EFAULT;	/* page not mapped */
> +	if (!PageUptodate(page)) {
> +		put_page(page);
> +		return -EFAULT;
> +	}

That change is not described. As I understand it might prevent reading
previous data in the page or maybe junk under an IO error? Anyways I guess it's a
good change.

Reviewed-by: Andi Kleen <ak@linux.intel.com>

-Andi
Andrii Nakryiko Aug. 13, 2024, 3:06 a.m. UTC | #2
On Mon, Aug 12, 2024 at 5:52 PM Andi Kleen <ak@linux.intel.com> wrote:
>
> > @@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
> >       page = find_get_page(vma->vm_file->f_mapping, 0);
> >       if (!page)
> >               return -EFAULT; /* page not mapped */
> > +     if (!PageUptodate(page)) {
> > +             put_page(page);
> > +             return -EFAULT;
> > +     }
>
> That change is not described. As I understand it might prevent reading
> previous data in the page or maybe junk under an IO error? Anyways I guess it's a
> good change.

From what I understood, one can get a valid page from the
find_get_page() (same for folio), but it might not be yet completely
filled out. PageUptodate() is supposed to detect this and prevent the
use of incomplete page data.

>
> Reviewed-by: Andi Kleen <ak@linux.intel.com>
>
> -Andi
Jann Horn Aug. 13, 2024, 8:59 p.m. UTC | #3
On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko <andrii@kernel.org> wrote:
> Harden build ID parsing logic, adding explicit READ_ONCE() where it's
> important to have a consistent value read and validated just once.
>
> Also, as pointed out by Andi Kleen, we need to make sure that entire ELF
> note is within a page bounds, so move the overflow check up and add an
> extra note_size boundaries validation.
>
> Fixes tag below points to the code that moved this code into
> lib/buildid.c, and then subsequently was used in perf subsystem, making
> this code exposed to perf_event_open() users in v5.12+.

Sorry, I missed some things in previous review rounds:

[...]
> @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
[...]
>                 if (nhdr->n_type == BUILD_ID &&
> -                   nhdr->n_namesz == sizeof("GNU") &&
> -                   !strcmp((char *)(nhdr + 1), "GNU") &&
> -                   nhdr->n_descsz > 0 &&
> -                   nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
> -                       memcpy(build_id,
> -                              note_start + note_offs +
> -                              ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
> -                              nhdr->n_descsz);
> -                       memset(build_id + nhdr->n_descsz, 0,
> -                              BUILD_ID_SIZE_MAX - nhdr->n_descsz);
> +                   name_sz == note_name_sz &&
> +                   strcmp((char *)(nhdr + 1), note_name) == 0 &&

Please change this to something like "memcmp((char *)(nhdr + 1),
note_name, note_name_sz) == 0" to ensure that we can't run off the end
of the page if there are no null bytes in the rest of the page.

[...]
> @@ -90,8 +97,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id,
>         for (i = 0; i < ehdr->e_phnum; ++i) {

Please change this to "for (i = 0; i < phnum; ++i) {" like in the
64-bit version.

With these two changes applied:

Reviewed-by: Jann Horn <jannh@google.com>
Andrii Nakryiko Aug. 13, 2024, 11:21 p.m. UTC | #4
On Tue, Aug 13, 2024 at 1:59 PM Jann Horn <jannh@google.com> wrote:
>
> On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko <andrii@kernel.org> wrote:
> > Harden build ID parsing logic, adding explicit READ_ONCE() where it's
> > important to have a consistent value read and validated just once.
> >
> > Also, as pointed out by Andi Kleen, we need to make sure that entire ELF
> > note is within a page bounds, so move the overflow check up and add an
> > extra note_size boundaries validation.
> >
> > Fixes tag below points to the code that moved this code into
> > lib/buildid.c, and then subsequently was used in perf subsystem, making
> > this code exposed to perf_event_open() users in v5.12+.
>
> Sorry, I missed some things in previous review rounds:
>
> [...]
> > @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
> [...]
> >                 if (nhdr->n_type == BUILD_ID &&
> > -                   nhdr->n_namesz == sizeof("GNU") &&
> > -                   !strcmp((char *)(nhdr + 1), "GNU") &&
> > -                   nhdr->n_descsz > 0 &&
> > -                   nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
> > -                       memcpy(build_id,
> > -                              note_start + note_offs +
> > -                              ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
> > -                              nhdr->n_descsz);
> > -                       memset(build_id + nhdr->n_descsz, 0,
> > -                              BUILD_ID_SIZE_MAX - nhdr->n_descsz);
> > +                   name_sz == note_name_sz &&
> > +                   strcmp((char *)(nhdr + 1), note_name) == 0 &&
>
> Please change this to something like "memcmp((char *)(nhdr + 1),
> note_name, note_name_sz) == 0" to ensure that we can't run off the end
> of the page if there are no null bytes in the rest of the page.

I did switch this to strncmp() at some earlier point, but then
realized that there is no point because note_name is controlled by us
and will ensure there is a zero at byte (note_name_sz - 1). So I don't
think memcmp() buys us anything.

>
> [...]
> > @@ -90,8 +97,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id,
> >         for (i = 0; i < ehdr->e_phnum; ++i) {
>
> Please change this to "for (i = 0; i < phnum; ++i) {" like in the
> 64-bit version.

This did slip through, yep. I'll check with BPF maintainers if this
can be fixed up while applying or whether I should send another
revision.

>
> With these two changes applied:
>
> Reviewed-by: Jann Horn <jannh@google.com>
Jann Horn Aug. 14, 2024, 4:13 p.m. UTC | #5
On Wed, Aug 14, 2024 at 1:21 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
> On Tue, Aug 13, 2024 at 1:59 PM Jann Horn <jannh@google.com> wrote:
> >
> > On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > Harden build ID parsing logic, adding explicit READ_ONCE() where it's
> > > important to have a consistent value read and validated just once.
> > >
> > > Also, as pointed out by Andi Kleen, we need to make sure that entire ELF
> > > note is within a page bounds, so move the overflow check up and add an
> > > extra note_size boundaries validation.
> > >
> > > Fixes tag below points to the code that moved this code into
> > > lib/buildid.c, and then subsequently was used in perf subsystem, making
> > > this code exposed to perf_event_open() users in v5.12+.
> >
> > Sorry, I missed some things in previous review rounds:
> >
> > [...]
> > > @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
> > [...]
> > >                 if (nhdr->n_type == BUILD_ID &&
> > > -                   nhdr->n_namesz == sizeof("GNU") &&
> > > -                   !strcmp((char *)(nhdr + 1), "GNU") &&
> > > -                   nhdr->n_descsz > 0 &&
> > > -                   nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
> > > -                       memcpy(build_id,
> > > -                              note_start + note_offs +
> > > -                              ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
> > > -                              nhdr->n_descsz);
> > > -                       memset(build_id + nhdr->n_descsz, 0,
> > > -                              BUILD_ID_SIZE_MAX - nhdr->n_descsz);
> > > +                   name_sz == note_name_sz &&
> > > +                   strcmp((char *)(nhdr + 1), note_name) == 0 &&
> >
> > Please change this to something like "memcmp((char *)(nhdr + 1),
> > note_name, note_name_sz) == 0" to ensure that we can't run off the end
> > of the page if there are no null bytes in the rest of the page.
>
> I did switch this to strncmp() at some earlier point, but then
> realized that there is no point because note_name is controlled by us
> and will ensure there is a zero at byte (note_name_sz - 1). So I don't
> think memcmp() buys us anything.

There are two reasons why using strcmp() here makes me uneasy.


First: We're still operating on shared memory that can concurrently change.

Let's say strcmp is implemented like this, this is the generic C
implementation in the kernel (which I think is the implementation
that's used for x86-64):

int strcmp(const char *cs, const char *ct)
{
        unsigned char c1, c2;

        while (1) {
                c1 = *cs++;
                c2 = *ct++;
                if (c1 != c2)
                        return c1 < c2 ? -1 : 1;
                if (!c1)
                        break;
        }
        return 0;
}

No READ_ONCE() or anything like that - it's not designed for being
used on concurrently changing memory.

And let's say you call it like strcmp(<shared memory>, "GNU"), and
we're now in the fourth iteration. If the compiler decides to re-fetch
the value of "c1" from memory for each of the two conditions, then it
could be that the "if (c1 != c2)" sees c1='\0' and c2='\0', so the
condition evaluates as false; but then at the "if (!c1)", the value in
memory changed, and we see c1='A'. So now in the next round, we'll be
accessing out-of-bounds memory behind the 4-byte string constant
"GNU".

So I don't think strcmp() on memory that can concurrently change is allowed.

(It actually seems like the generic memcmp() is also implemented
without READ_ONCE(), maybe we should change that...)


Second: You are assuming that if one side of the strcmp() is at most
four bytes long (including null terminator), then strcmp() also won't
access more than 4 bytes of the other string, even if that string does
not have a null terminator at index 4. I don't think that's part of
the normal strcmp() API contract.
Andrii Nakryiko Aug. 14, 2024, 5:06 p.m. UTC | #6
On Wed, Aug 14, 2024 at 9:14 AM Jann Horn <jannh@google.com> wrote:
>
> On Wed, Aug 14, 2024 at 1:21 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> > On Tue, Aug 13, 2024 at 1:59 PM Jann Horn <jannh@google.com> wrote:
> > >
> > > On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > > Harden build ID parsing logic, adding explicit READ_ONCE() where it's
> > > > important to have a consistent value read and validated just once.
> > > >
> > > > Also, as pointed out by Andi Kleen, we need to make sure that entire ELF
> > > > note is within a page bounds, so move the overflow check up and add an
> > > > extra note_size boundaries validation.
> > > >
> > > > Fixes tag below points to the code that moved this code into
> > > > lib/buildid.c, and then subsequently was used in perf subsystem, making
> > > > this code exposed to perf_event_open() users in v5.12+.
> > >
> > > Sorry, I missed some things in previous review rounds:
> > >
> > > [...]
> > > > @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
> > > [...]
> > > >                 if (nhdr->n_type == BUILD_ID &&
> > > > -                   nhdr->n_namesz == sizeof("GNU") &&
> > > > -                   !strcmp((char *)(nhdr + 1), "GNU") &&
> > > > -                   nhdr->n_descsz > 0 &&
> > > > -                   nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
> > > > -                       memcpy(build_id,
> > > > -                              note_start + note_offs +
> > > > -                              ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
> > > > -                              nhdr->n_descsz);
> > > > -                       memset(build_id + nhdr->n_descsz, 0,
> > > > -                              BUILD_ID_SIZE_MAX - nhdr->n_descsz);
> > > > +                   name_sz == note_name_sz &&
> > > > +                   strcmp((char *)(nhdr + 1), note_name) == 0 &&
> > >
> > > Please change this to something like "memcmp((char *)(nhdr + 1),
> > > note_name, note_name_sz) == 0" to ensure that we can't run off the end
> > > of the page if there are no null bytes in the rest of the page.
> >
> > I did switch this to strncmp() at some earlier point, but then
> > realized that there is no point because note_name is controlled by us
> > and will ensure there is a zero at byte (note_name_sz - 1). So I don't
> > think memcmp() buys us anything.
>
> There are two reasons why using strcmp() here makes me uneasy.
>
>
> First: We're still operating on shared memory that can concurrently change.
>
> Let's say strcmp is implemented like this, this is the generic C
> implementation in the kernel (which I think is the implementation
> that's used for x86-64):
>
> int strcmp(const char *cs, const char *ct)
> {
>         unsigned char c1, c2;
>
>         while (1) {
>                 c1 = *cs++;
>                 c2 = *ct++;
>                 if (c1 != c2)
>                         return c1 < c2 ? -1 : 1;
>                 if (!c1)
>                         break;
>         }
>         return 0;
> }
>
> No READ_ONCE() or anything like that - it's not designed for being
> used on concurrently changing memory.
>
> And let's say you call it like strcmp(<shared memory>, "GNU"), and
> we're now in the fourth iteration. If the compiler decides to re-fetch
> the value of "c1" from memory for each of the two conditions, then it
> could be that the "if (c1 != c2)" sees c1='\0' and c2='\0', so the
> condition evaluates as false; but then at the "if (!c1)", the value in
> memory changed, and we see c1='A'. So now in the next round, we'll be
> accessing out-of-bounds memory behind the 4-byte string constant
> "GNU".
>
> So I don't think strcmp() on memory that can concurrently change is allowed.
>
> (It actually seems like the generic memcmp() is also implemented
> without READ_ONCE(), maybe we should change that...)
>
>
> Second: You are assuming that if one side of the strcmp() is at most
> four bytes long (including null terminator), then strcmp() also won't
> access more than 4 bytes of the other string, even if that string does
> not have a null terminator at index 4. I don't think that's part of
> the normal strcmp() API contract.

Ok, I'm convinced, all fair points. I'll switch to memcmp(), there is
no downside to that anyways.
diff mbox series

Patch

diff --git a/lib/buildid.c b/lib/buildid.c
index e02b5507418b..61bc5b767064 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -18,31 +18,37 @@  static int parse_build_id_buf(unsigned char *build_id,
 			      const void *note_start,
 			      Elf32_Word note_size)
 {
-	Elf32_Word note_offs = 0, new_offs;
-
-	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
-		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+	const char note_name[] = "GNU";
+	const size_t note_name_sz = sizeof(note_name);
+	u64 note_off = 0, new_off, name_sz, desc_sz;
+	const char *data;
+
+	while (note_off + sizeof(Elf32_Nhdr) < note_size &&
+	       note_off + sizeof(Elf32_Nhdr) > note_off /* overflow */) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_off);
+
+		name_sz = READ_ONCE(nhdr->n_namesz);
+		desc_sz = READ_ONCE(nhdr->n_descsz);
+
+		new_off = note_off + sizeof(Elf32_Nhdr);
+		if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) ||
+		    check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) ||
+		    new_off > note_size)
+			break;
 
 		if (nhdr->n_type == BUILD_ID &&
-		    nhdr->n_namesz == sizeof("GNU") &&
-		    !strcmp((char *)(nhdr + 1), "GNU") &&
-		    nhdr->n_descsz > 0 &&
-		    nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
-			memcpy(build_id,
-			       note_start + note_offs +
-			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-			       nhdr->n_descsz);
-			memset(build_id + nhdr->n_descsz, 0,
-			       BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+		    name_sz == note_name_sz &&
+		    strcmp((char *)(nhdr + 1), note_name) == 0 &&
+		    desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) {
+			data = note_start + note_off + ALIGN(note_name_sz, 4);
+			memcpy(build_id, data, desc_sz);
+			memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz);
 			if (size)
-				*size = nhdr->n_descsz;
+				*size = desc_sz;
 			return 0;
 		}
-		new_offs = note_offs + sizeof(Elf32_Nhdr) +
-			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
-		if (new_offs <= note_offs)  /* overflow */
-			break;
-		note_offs = new_offs;
+
+		note_off = new_off;
 	}
 
 	return -EINVAL;
@@ -71,7 +77,7 @@  static int get_build_id_32(const void *page_addr, unsigned char *build_id,
 {
 	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
 	Elf32_Phdr *phdr;
-	int i;
+	__u32 i, phnum;
 
 	/*
 	 * FIXME
@@ -80,9 +86,10 @@  static int get_build_id_32(const void *page_addr, unsigned char *build_id,
 	 */
 	if (ehdr->e_phoff != sizeof(Elf32_Ehdr))
 		return -EINVAL;
+
+	phnum = READ_ONCE(ehdr->e_phnum);
 	/* only supports phdr that fits in one page */
-	if (ehdr->e_phnum >
-	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+	if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
 		return -EINVAL;
 
 	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
@@ -90,8 +97,8 @@  static int get_build_id_32(const void *page_addr, unsigned char *build_id,
 	for (i = 0; i < ehdr->e_phnum; ++i) {
 		if (phdr[i].p_type == PT_NOTE &&
 		    !parse_build_id(page_addr, build_id, size,
-				    page_addr + phdr[i].p_offset,
-				    phdr[i].p_filesz))
+				    page_addr + READ_ONCE(phdr[i].p_offset),
+				    READ_ONCE(phdr[i].p_filesz)))
 			return 0;
 	}
 	return -EINVAL;
@@ -103,7 +110,7 @@  static int get_build_id_64(const void *page_addr, unsigned char *build_id,
 {
 	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
 	Elf64_Phdr *phdr;
-	int i;
+	__u32 i, phnum;
 
 	/*
 	 * FIXME
@@ -112,18 +119,19 @@  static int get_build_id_64(const void *page_addr, unsigned char *build_id,
 	 */
 	if (ehdr->e_phoff != sizeof(Elf64_Ehdr))
 		return -EINVAL;
+
+	phnum = READ_ONCE(ehdr->e_phnum);
 	/* only supports phdr that fits in one page */
-	if (ehdr->e_phnum >
-	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+	if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
 		return -EINVAL;
 
 	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
 
-	for (i = 0; i < ehdr->e_phnum; ++i) {
+	for (i = 0; i < phnum; ++i) {
 		if (phdr[i].p_type == PT_NOTE &&
 		    !parse_build_id(page_addr, build_id, size,
-				    page_addr + phdr[i].p_offset,
-				    phdr[i].p_filesz))
+				    page_addr + READ_ONCE(phdr[i].p_offset),
+				    READ_ONCE(phdr[i].p_filesz)))
 			return 0;
 	}
 	return -EINVAL;
@@ -152,6 +160,10 @@  int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 	page = find_get_page(vma->vm_file->f_mapping, 0);
 	if (!page)
 		return -EFAULT;	/* page not mapped */
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return -EFAULT;
+	}
 
 	ret = -EINVAL;
 	page_addr = kmap_local_page(page);