diff mbox series

[03/10] reftable/record: handle overflows when decoding varints

Message ID 20250116-b4-pks-reftable-sign-compare-v1-3-bd30e2ee96e7@pks.im (mailing list archive)
State Superseded
Headers show
Series reftable: fix -Wsign-compare warnings | expand

Commit Message

Patrick Steinhardt Jan. 16, 2025, 10:08 a.m. UTC
The logic to decode varints isn't able to detect integer overflows: as
long as the buffer still has more data available, and as long as the
current byte has its 0x80 bit set, we'll continue to add up these values
to the result. This will eventually cause the `uint64_t` to overflow, at
which point we'll return an invalid result.

Refactor the function so that it is able to detect such overflows. The
implementation is basically copied from Git's own `decode_varint()`,
which already knows to handle overflows. The only adjustment is that we
also take into account the string view's length in order to not overrun
it.

While at it, refactor `put_var_int()` in the same way by copying over
the implementation of `encode_varint()`. While `put_var_int()` doesn't
have an issue with overflows, it generates warnings with -Wsign-compare.
The implementation of `encode_varint()` doesn't, is battle-tested and at
the same time way simpler than what we currently have.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 reftable/record.c                | 53 +++++++++++++++++-----------------------
 reftable/record.h                |  4 +++
 t/unit-tests/t-reftable-record.c | 17 +++++++++++++
 3 files changed, 44 insertions(+), 30 deletions(-)

Comments

Karthik Nayak Jan. 20, 2025, 9:47 a.m. UTC | #1
Patrick Steinhardt <ps@pks.im> writes:

[snip]

> diff --git a/reftable/record.c b/reftable/record.c
> index 04429d23fe..4e6541c307 100644
> --- a/reftable/record.c
> +++ b/reftable/record.c
> @@ -21,47 +21,40 @@ static void *reftable_record_data(struct reftable_record *rec);
>
>  int get_var_int(uint64_t *dest, struct string_view *in)
>  {
> -	int ptr = 0;
> +	const unsigned char *buf = in->buf;
> +	unsigned char c;
>  	uint64_t val;
>
> -	if (in->len == 0)
> +	if (!in->len)
>  		return -1;
> -	val = in->buf[ptr] & 0x7f;
> -
> -	while (in->buf[ptr] & 0x80) {
> -		ptr++;
> -		if (ptr > in->len) {
> +	c = *buf++;
> +	val = c & 0x7f;
> +
> +	while (c & 0x80) {
> +		val += 1;

I was at first confused, I understand that we add 1 to check if there is
an overflow before adding the next section. But this actually modifies
the value itself, but looking below at `put_var_int()`, we did value--
before storing each continuation byte. So during decoding.

Nit: it would be nice to explain that part a bit here with comments.

> +		if (!val || (val & (uint64_t)(~0ULL << (64 - 7))))
> +			return -1; /* overflow */
> +		if (buf >= in->buf + in->len)
>  			return -1;
> -		}
> -		val = (val + 1) << 7 | (uint64_t)(in->buf[ptr] & 0x7f);
> +		c = *buf++;
> +		val = (val << 7) | (c & 0x7f);
>  	}
>
>  	*dest = val;
> -	return ptr + 1;
> +	return buf - in->buf;
>  }
>
> -int put_var_int(struct string_view *dest, uint64_t val)
> +int put_var_int(struct string_view *dest, uint64_t value)
>  {
> -	uint8_t buf[10] = { 0 };
> -	int i = 9;
> -	int n = 0;
> -	buf[i] = (uint8_t)(val & 0x7f);
> -	i--;
> -	while (1) {
> -		val >>= 7;
> -		if (!val) {
> -			break;
> -		}
> -		val--;
> -		buf[i] = 0x80 | (uint8_t)(val & 0x7f);
> -		i--;
> -	}
> -
> -	n = sizeof(buf) - i - 1;
> -	if (dest->len < n)
> +	unsigned char varint[10];
> +	unsigned pos = sizeof(varint) - 1;
> +	varint[pos] = value & 127;

Nit: While the `get_var_int()` uses hexes, here we use ints. Would be
nicer to use `0x7f` and so on and be consistent.

> +	while (value >>= 7)
> +		varint[--pos] = 128 | (--value & 127);
> +	if (dest->len < sizeof(varint) - pos)
>  		return -1;
> -	memcpy(dest->buf, &buf[i + 1], n);
> -	return n;
> +	memcpy(dest->buf, varint + pos, sizeof(varint) - pos);
> +	return sizeof(varint) - pos;
>  }
>
>  int reftable_is_block_type(uint8_t typ)
> diff --git a/reftable/record.h b/reftable/record.h
> index a24cb23bd4..721d6c949a 100644
> --- a/reftable/record.h
> +++ b/reftable/record.h
> @@ -34,6 +34,10 @@ static inline void string_view_consume(struct string_view *s, int n)
>
>  /* utilities for de/encoding varints */
>

We should remove this, no?

> +/*
> + * Decode and encode a varint. Returns the number of bytes read/written, or a
> + * negative value in case encoding/decoding the varint has failed.
> + */
>  int get_var_int(uint64_t *dest, struct string_view *in);
>  int put_var_int(struct string_view *dest, uint64_t val);
>
> diff --git a/t/unit-tests/t-reftable-record.c b/t/unit-tests/t-reftable-record.c
> index 42bc64cec8..6d912b9c8f 100644
> --- a/t/unit-tests/t-reftable-record.c
> +++ b/t/unit-tests/t-reftable-record.c
> @@ -58,6 +58,22 @@ static void t_varint_roundtrip(void)
>  	}
>  }

[snip]
Patrick Steinhardt Jan. 20, 2025, 3:09 p.m. UTC | #2
On Mon, Jan 20, 2025 at 04:47:47AM -0500, Karthik Nayak wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> > diff --git a/reftable/record.c b/reftable/record.c
> > index 04429d23fe..4e6541c307 100644
> > --- a/reftable/record.c
> > +++ b/reftable/record.c
> > @@ -21,47 +21,40 @@ static void *reftable_record_data(struct reftable_record *rec);
> >
> >  int get_var_int(uint64_t *dest, struct string_view *in)
> >  {
> > -	int ptr = 0;
> > +	const unsigned char *buf = in->buf;
> > +	unsigned char c;
> >  	uint64_t val;
> >
> > -	if (in->len == 0)
> > +	if (!in->len)
> >  		return -1;
> > -	val = in->buf[ptr] & 0x7f;
> > -
> > -	while (in->buf[ptr] & 0x80) {
> > -		ptr++;
> > -		if (ptr > in->len) {
> > +	c = *buf++;
> > +	val = c & 0x7f;
> > +
> > +	while (c & 0x80) {
> > +		val += 1;
> 
> I was at first confused, I understand that we add 1 to check if there is
> an overflow before adding the next section. But this actually modifies
> the value itself, but looking below at `put_var_int()`, we did value--
> before storing each continuation byte. So during decoding.
> 
> Nit: it would be nice to explain that part a bit here with comments.

Yeah, I had to think about it a bit myself. It's quite a clever
optimization: when the 0x80 bit is set, we know that the remaining value
cannot be 0. We thus don't have to represent that value, which is why we
can subtract 1 when encoding and re-add 1 when decoding. This allows us
to save a byte in some edge cases.

[snip]
> > -int put_var_int(struct string_view *dest, uint64_t val)
> > +int put_var_int(struct string_view *dest, uint64_t value)
> >  {
> > -	uint8_t buf[10] = { 0 };
> > -	int i = 9;
> > -	int n = 0;
> > -	buf[i] = (uint8_t)(val & 0x7f);
> > -	i--;
> > -	while (1) {
> > -		val >>= 7;
> > -		if (!val) {
> > -			break;
> > -		}
> > -		val--;
> > -		buf[i] = 0x80 | (uint8_t)(val & 0x7f);
> > -		i--;
> > -	}
> > -
> > -	n = sizeof(buf) - i - 1;
> > -	if (dest->len < n)
> > +	unsigned char varint[10];
> > +	unsigned pos = sizeof(varint) - 1;
> > +	varint[pos] = value & 127;
> 
> Nit: While the `get_var_int()` uses hexes, here we use ints. Would be
> nicer to use `0x7f` and so on and be consistent.

Yup, makes sense.

> > +	while (value >>= 7)
> > +		varint[--pos] = 128 | (--value & 127);
> > +	if (dest->len < sizeof(varint) - pos)
> >  		return -1;
> > -	memcpy(dest->buf, &buf[i + 1], n);
> > -	return n;
> > +	memcpy(dest->buf, varint + pos, sizeof(varint) - pos);
> > +	return sizeof(varint) - pos;
> >  }
> >
> >  int reftable_is_block_type(uint8_t typ)
> > diff --git a/reftable/record.h b/reftable/record.h
> > index a24cb23bd4..721d6c949a 100644
> > --- a/reftable/record.h
> > +++ b/reftable/record.h
> > @@ -34,6 +34,10 @@ static inline void string_view_consume(struct string_view *s, int n)
> >
> >  /* utilities for de/encoding varints */
> >
> 
> We should remove this, no?

Yup, good catch.

Patrick
diff mbox series

Patch

diff --git a/reftable/record.c b/reftable/record.c
index 04429d23fe..4e6541c307 100644
--- a/reftable/record.c
+++ b/reftable/record.c
@@ -21,47 +21,40 @@  static void *reftable_record_data(struct reftable_record *rec);
 
 int get_var_int(uint64_t *dest, struct string_view *in)
 {
-	int ptr = 0;
+	const unsigned char *buf = in->buf;
+	unsigned char c;
 	uint64_t val;
 
-	if (in->len == 0)
+	if (!in->len)
 		return -1;
-	val = in->buf[ptr] & 0x7f;
-
-	while (in->buf[ptr] & 0x80) {
-		ptr++;
-		if (ptr > in->len) {
+	c = *buf++;
+	val = c & 0x7f;
+
+	while (c & 0x80) {
+		val += 1;
+		if (!val || (val & (uint64_t)(~0ULL << (64 - 7))))
+			return -1; /* overflow */
+		if (buf >= in->buf + in->len)
 			return -1;
-		}
-		val = (val + 1) << 7 | (uint64_t)(in->buf[ptr] & 0x7f);
+		c = *buf++;
+		val = (val << 7) | (c & 0x7f);
 	}
 
 	*dest = val;
-	return ptr + 1;
+	return buf - in->buf;
 }
 
-int put_var_int(struct string_view *dest, uint64_t val)
+int put_var_int(struct string_view *dest, uint64_t value)
 {
-	uint8_t buf[10] = { 0 };
-	int i = 9;
-	int n = 0;
-	buf[i] = (uint8_t)(val & 0x7f);
-	i--;
-	while (1) {
-		val >>= 7;
-		if (!val) {
-			break;
-		}
-		val--;
-		buf[i] = 0x80 | (uint8_t)(val & 0x7f);
-		i--;
-	}
-
-	n = sizeof(buf) - i - 1;
-	if (dest->len < n)
+	unsigned char varint[10];
+	unsigned pos = sizeof(varint) - 1;
+	varint[pos] = value & 127;
+	while (value >>= 7)
+		varint[--pos] = 128 | (--value & 127);
+	if (dest->len < sizeof(varint) - pos)
 		return -1;
-	memcpy(dest->buf, &buf[i + 1], n);
-	return n;
+	memcpy(dest->buf, varint + pos, sizeof(varint) - pos);
+	return sizeof(varint) - pos;
 }
 
 int reftable_is_block_type(uint8_t typ)
diff --git a/reftable/record.h b/reftable/record.h
index a24cb23bd4..721d6c949a 100644
--- a/reftable/record.h
+++ b/reftable/record.h
@@ -34,6 +34,10 @@  static inline void string_view_consume(struct string_view *s, int n)
 
 /* utilities for de/encoding varints */
 
+/*
+ * Decode and encode a varint. Returns the number of bytes read/written, or a
+ * negative value in case encoding/decoding the varint has failed.
+ */
 int get_var_int(uint64_t *dest, struct string_view *in);
 int put_var_int(struct string_view *dest, uint64_t val);
 
diff --git a/t/unit-tests/t-reftable-record.c b/t/unit-tests/t-reftable-record.c
index 42bc64cec8..6d912b9c8f 100644
--- a/t/unit-tests/t-reftable-record.c
+++ b/t/unit-tests/t-reftable-record.c
@@ -58,6 +58,22 @@  static void t_varint_roundtrip(void)
 	}
 }
 
+static void t_varint_overflow(void)
+{
+	unsigned char buf[] = {
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0x00,
+	};
+	struct string_view view = {
+		.buf = buf,
+		.len = sizeof(buf),
+	};
+	uint64_t value;
+	int err = get_var_int(&value, &view);
+	check_int(err, ==, -1);
+}
+
 static void set_hash(uint8_t *h, int j)
 {
 	for (int i = 0; i < hash_size(REFTABLE_HASH_SHA1); i++)
@@ -544,6 +560,7 @@  int cmd_main(int argc UNUSED, const char *argv[] UNUSED)
 	TEST(t_reftable_log_record_roundtrip(), "record operations work on log record");
 	TEST(t_reftable_ref_record_roundtrip(), "record operations work on ref record");
 	TEST(t_varint_roundtrip(), "put_var_int and get_var_int work");
+	TEST(t_varint_overflow(), "get_var_int notices an integer overflow");
 	TEST(t_key_roundtrip(), "reftable_encode_key and reftable_decode_key work");
 	TEST(t_reftable_obj_record_roundtrip(), "record operations work on obj record");
 	TEST(t_reftable_index_record_roundtrip(), "record operations work on index record");