diff mbox series

[6/7] reftable/record: use scratch buffer when decoding records

Message ID e0a9057593f761917a3e6fb9a77045400e2b9d1c.1709640322.git.ps@pks.im (mailing list archive)
State Accepted
Commit 7b8abc4d8cd428e4ce044e50f7980baacaccb761
Headers show
Series reftable: memory optimizations for reflog iteration | expand

Commit Message

Patrick Steinhardt March 5, 2024, 12:11 p.m. UTC
When decoding log records we need a temporary buffer to decode the
reflog entry's name, mail address and message. As this buffer is local
to the function we thus have to reallocate it for every single log
record which we're about to decode, which is inefficient.

Refactor the code such that callers need to pass in a scratch buffer,
which allows us to reuse it for multiple decodes. This reduces the
number of allocations when iterating through reflogs. Before:

    HEAP SUMMARY:
        in use at exit: 13,473 bytes in 122 blocks
      total heap usage: 2,068,487 allocs, 2,068,365 frees, 305,122,946 bytes allocated

After:

    HEAP SUMMARY:
        in use at exit: 13,473 bytes in 122 blocks
      total heap usage: 1,068,485 allocs, 1,068,363 frees, 281,122,886 bytes allocated

Note that this commit also drop some redundant calls to `strbuf_reset()`
right before calling `decode_string()`. The latter already knows to
reset the buffer, so there is no need for these.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 reftable/block.c       |  4 ++-
 reftable/block.h       |  2 ++
 reftable/record.c      | 52 ++++++++++++++++++--------------------
 reftable/record.h      |  5 ++--
 reftable/record_test.c | 57 ++++++++++++++++++++++++++----------------
 5 files changed, 68 insertions(+), 52 deletions(-)

Comments

Josh Steadmon March 11, 2024, 7:31 p.m. UTC | #1
On 2024.03.05 13:11, Patrick Steinhardt wrote:
> When decoding log records we need a temporary buffer to decode the
> reflog entry's name, mail address and message. As this buffer is local
> to the function we thus have to reallocate it for every single log
> record which we're about to decode, which is inefficient.
> 
> Refactor the code such that callers need to pass in a scratch buffer,
> which allows us to reuse it for multiple decodes. This reduces the
> number of allocations when iterating through reflogs. Before:
> 
>     HEAP SUMMARY:
>         in use at exit: 13,473 bytes in 122 blocks
>       total heap usage: 2,068,487 allocs, 2,068,365 frees, 305,122,946 bytes allocated
> 
> After:
> 
>     HEAP SUMMARY:
>         in use at exit: 13,473 bytes in 122 blocks
>       total heap usage: 1,068,485 allocs, 1,068,363 frees, 281,122,886 bytes allocated
> 
> Note that this commit also drop some redundant calls to `strbuf_reset()`
> right before calling `decode_string()`. The latter already knows to
> reset the buffer, so there is no need for these.
> 
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  reftable/block.c       |  4 ++-
>  reftable/block.h       |  2 ++
>  reftable/record.c      | 52 ++++++++++++++++++--------------------
>  reftable/record.h      |  5 ++--
>  reftable/record_test.c | 57 ++++++++++++++++++++++++++----------------
>  5 files changed, 68 insertions(+), 52 deletions(-)

[snip]

> diff --git a/reftable/record.c b/reftable/record.c
> index 7c86877586..060244337f 100644
> --- a/reftable/record.c
> +++ b/reftable/record.c
> @@ -374,7 +374,7 @@ static int reftable_ref_record_encode(const void *rec, struct string_view s,
>  
>  static int reftable_ref_record_decode(void *rec, struct strbuf key,
>  				      uint8_t val_type, struct string_view in,
> -				      int hash_size)
> +				      int hash_size, struct strbuf *scratch)
>  {
>  	struct reftable_ref_record *r = rec;
>  	struct string_view start = in;
> @@ -425,13 +425,12 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key,
>  		break;
>  
>  	case REFTABLE_REF_SYMREF: {
> -		struct strbuf dest = STRBUF_INIT;
> -		int n = decode_string(&dest, in);
> +		int n = decode_string(scratch, in);
>  		if (n < 0) {
>  			return -1;
>  		}
>  		string_view_consume(&in, n);
> -		r->value.symref = dest.buf;
> +		r->value.symref = strbuf_detach(scratch, NULL);
>  	} break;

I had to dig into this to convince myself that we aren't leaking memory
here, but IIUC this gets cleaned up eventually by
reftable_ref_record_release(), right?
Josh Steadmon March 11, 2024, 7:40 p.m. UTC | #2
On 2024.03.05 13:11, Patrick Steinhardt wrote:
> When decoding log records we need a temporary buffer to decode the
> reflog entry's name, mail address and message. As this buffer is local
> to the function we thus have to reallocate it for every single log
> record which we're about to decode, which is inefficient.
> 
> Refactor the code such that callers need to pass in a scratch buffer,
> which allows us to reuse it for multiple decodes. This reduces the
> number of allocations when iterating through reflogs. Before:
> 
>     HEAP SUMMARY:
>         in use at exit: 13,473 bytes in 122 blocks
>       total heap usage: 2,068,487 allocs, 2,068,365 frees, 305,122,946 bytes allocated
> 
> After:
> 
>     HEAP SUMMARY:
>         in use at exit: 13,473 bytes in 122 blocks
>       total heap usage: 1,068,485 allocs, 1,068,363 frees, 281,122,886 bytes allocated
> 
> Note that this commit also drop some redundant calls to `strbuf_reset()`
> right before calling `decode_string()`. The latter already knows to
> reset the buffer, so there is no need for these.
> 
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  reftable/block.c       |  4 ++-
>  reftable/block.h       |  2 ++
>  reftable/record.c      | 52 ++++++++++++++++++--------------------
>  reftable/record.h      |  5 ++--
>  reftable/record_test.c | 57 ++++++++++++++++++++++++++----------------
>  5 files changed, 68 insertions(+), 52 deletions(-)

At first glance I was feeling somewhat negatively about this change, as
keeping persistent scratch space buffers means that either the owners
or the users of the scratch space need to be more careful about making
sure it's reset and that they're not accumulating cruft in between
various calls.

However, it appears we already have similar scratch space usage in the
sideband and cat-file code, so I guess we are OK with the pattern in
general, and we can rely on tests to make sure things are good in
practice.
Patrick Steinhardt March 11, 2024, 11:25 p.m. UTC | #3
On Mon, Mar 11, 2024 at 12:31:57PM -0700, Josh Steadmon wrote:
> On 2024.03.05 13:11, Patrick Steinhardt wrote:
[snip]
> > diff --git a/reftable/record.c b/reftable/record.c
> > index 7c86877586..060244337f 100644
> > --- a/reftable/record.c
> > +++ b/reftable/record.c
> > @@ -374,7 +374,7 @@ static int reftable_ref_record_encode(const void *rec, struct string_view s,
> >  
> >  static int reftable_ref_record_decode(void *rec, struct strbuf key,
> >  				      uint8_t val_type, struct string_view in,
> > -				      int hash_size)
> > +				      int hash_size, struct strbuf *scratch)
> >  {
> >  	struct reftable_ref_record *r = rec;
> >  	struct string_view start = in;
> > @@ -425,13 +425,12 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key,
> >  		break;
> >  
> >  	case REFTABLE_REF_SYMREF: {
> > -		struct strbuf dest = STRBUF_INIT;
> > -		int n = decode_string(&dest, in);
> > +		int n = decode_string(scratch, in);
> >  		if (n < 0) {
> >  			return -1;
> >  		}
> >  		string_view_consume(&in, n);
> > -		r->value.symref = dest.buf;
> > +		r->value.symref = strbuf_detach(scratch, NULL);
> >  	} break;
> 
> I had to dig into this to convince myself that we aren't leaking memory
> here, but IIUC this gets cleaned up eventually by
> reftable_ref_record_release(), right?

Yes, exactly.

Patrick
diff mbox series

Patch

diff --git a/reftable/block.c b/reftable/block.c
index ad9074dba6..b67300a734 100644
--- a/reftable/block.c
+++ b/reftable/block.c
@@ -332,7 +332,8 @@  int block_iter_next(struct block_iter *it, struct reftable_record *rec)
 		return REFTABLE_FORMAT_ERROR;
 
 	string_view_consume(&in, n);
-	n = reftable_record_decode(rec, it->last_key, extra, in, it->br->hash_size);
+	n = reftable_record_decode(rec, it->last_key, extra, in, it->br->hash_size,
+				   &it->scratch);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);
@@ -369,6 +370,7 @@  int block_iter_seek(struct block_iter *it, struct strbuf *want)
 void block_iter_close(struct block_iter *it)
 {
 	strbuf_release(&it->last_key);
+	strbuf_release(&it->scratch);
 }
 
 int block_reader_seek(struct block_reader *br, struct block_iter *it,
diff --git a/reftable/block.h b/reftable/block.h
index 51699af233..47acc62c0a 100644
--- a/reftable/block.h
+++ b/reftable/block.h
@@ -84,10 +84,12 @@  struct block_iter {
 
 	/* key for last entry we read. */
 	struct strbuf last_key;
+	struct strbuf scratch;
 };
 
 #define BLOCK_ITER_INIT { \
 	.last_key = STRBUF_INIT, \
+	.scratch = STRBUF_INIT, \
 }
 
 /* initializes a block reader. */
diff --git a/reftable/record.c b/reftable/record.c
index 7c86877586..060244337f 100644
--- a/reftable/record.c
+++ b/reftable/record.c
@@ -374,7 +374,7 @@  static int reftable_ref_record_encode(const void *rec, struct string_view s,
 
 static int reftable_ref_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct reftable_ref_record *r = rec;
 	struct string_view start = in;
@@ -425,13 +425,12 @@  static int reftable_ref_record_decode(void *rec, struct strbuf key,
 		break;
 
 	case REFTABLE_REF_SYMREF: {
-		struct strbuf dest = STRBUF_INIT;
-		int n = decode_string(&dest, in);
+		int n = decode_string(scratch, in);
 		if (n < 0) {
 			return -1;
 		}
 		string_view_consume(&in, n);
-		r->value.symref = dest.buf;
+		r->value.symref = strbuf_detach(scratch, NULL);
 	} break;
 
 	case REFTABLE_REF_DELETION:
@@ -579,7 +578,7 @@  static int reftable_obj_record_encode(const void *rec, struct string_view s,
 
 static int reftable_obj_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_obj_record *r = rec;
@@ -849,13 +848,12 @@  static int reftable_log_record_encode(const void *rec, struct string_view s,
 
 static int reftable_log_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct string_view start = in;
 	struct reftable_log_record *r = rec;
 	uint64_t max = 0;
 	uint64_t ts = 0;
-	struct strbuf dest = STRBUF_INIT;
 	int n;
 
 	if (key.len <= 9 || key.buf[key.len - 9] != 0)
@@ -892,7 +890,7 @@  static int reftable_log_record_decode(void *rec, struct strbuf key,
 
 	string_view_consume(&in, 2 * hash_size);
 
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
@@ -904,26 +902,25 @@  static int reftable_log_record_decode(void *rec, struct strbuf key,
 	 * skip copying over the name in case it's accurate already.
 	 */
 	if (!r->value.update.name ||
-	    strcmp(r->value.update.name, dest.buf)) {
+	    strcmp(r->value.update.name, scratch->buf)) {
 		r->value.update.name =
-			reftable_realloc(r->value.update.name, dest.len + 1);
-		memcpy(r->value.update.name, dest.buf, dest.len);
-		r->value.update.name[dest.len] = 0;
+			reftable_realloc(r->value.update.name, scratch->len + 1);
+		memcpy(r->value.update.name, scratch->buf, scratch->len);
+		r->value.update.name[scratch->len] = 0;
 	}
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
 	/* Same as above, but for the reflog email. */
 	if (!r->value.update.email ||
-	    strcmp(r->value.update.email, dest.buf)) {
+	    strcmp(r->value.update.email, scratch->buf)) {
 		r->value.update.email =
-			reftable_realloc(r->value.update.email, dest.len + 1);
-		memcpy(r->value.update.email, dest.buf, dest.len);
-		r->value.update.email[dest.len] = 0;
+			reftable_realloc(r->value.update.email, scratch->len + 1);
+		memcpy(r->value.update.email, scratch->buf, scratch->len);
+		r->value.update.email[scratch->len] = 0;
 	}
 
 	ts = 0;
@@ -938,22 +935,19 @@  static int reftable_log_record_decode(void *rec, struct strbuf key,
 	r->value.update.tz_offset = get_be16(in.buf);
 	string_view_consume(&in, 2);
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	REFTABLE_ALLOC_GROW(r->value.update.message, dest.len + 1,
+	REFTABLE_ALLOC_GROW(r->value.update.message, scratch->len + 1,
 			    r->value.update.message_cap);
-	memcpy(r->value.update.message, dest.buf, dest.len);
-	r->value.update.message[dest.len] = 0;
+	memcpy(r->value.update.message, scratch->buf, scratch->len);
+	r->value.update.message[scratch->len] = 0;
 
-	strbuf_release(&dest);
 	return start.len - in.len;
 
 done:
-	strbuf_release(&dest);
 	return REFTABLE_FORMAT_ERROR;
 }
 
@@ -1093,7 +1087,7 @@  static int reftable_index_record_encode(const void *rec, struct string_view out,
 
 static int reftable_index_record_decode(void *rec, struct strbuf key,
 					uint8_t val_type, struct string_view in,
-					int hash_size)
+					int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_index_record *r = rec;
@@ -1174,10 +1168,12 @@  uint8_t reftable_record_val_type(struct reftable_record *rec)
 }
 
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
-			   uint8_t extra, struct string_view src, int hash_size)
+			   uint8_t extra, struct string_view src, int hash_size,
+			   struct strbuf *scratch)
 {
 	return reftable_record_vtable(rec)->decode(reftable_record_data(rec),
-						   key, extra, src, hash_size);
+						   key, extra, src, hash_size,
+						   scratch);
 }
 
 void reftable_record_release(struct reftable_record *rec)
diff --git a/reftable/record.h b/reftable/record.h
index 5e8304e052..826ee1c55c 100644
--- a/reftable/record.h
+++ b/reftable/record.h
@@ -55,7 +55,8 @@  struct reftable_record_vtable {
 
 	/* decode data from `src` into the record. */
 	int (*decode)(void *rec, struct strbuf key, uint8_t extra,
-		      struct string_view src, int hash_size);
+		      struct string_view src, int hash_size,
+		      struct strbuf *scratch);
 
 	/* deallocate and null the record. */
 	void (*release)(void *rec);
@@ -138,7 +139,7 @@  int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
 			   int hash_size);
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
 			   uint8_t extra, struct string_view src,
-			   int hash_size);
+			   int hash_size, struct strbuf *scratch);
 int reftable_record_is_deletion(struct reftable_record *rec);
 
 static inline uint8_t reftable_record_type(struct reftable_record *rec)
diff --git a/reftable/record_test.c b/reftable/record_test.c
index 57e56138c0..c158ee79ff 100644
--- a/reftable/record_test.c
+++ b/reftable/record_test.c
@@ -99,6 +99,7 @@  static void set_hash(uint8_t *h, int j)
 
 static void test_reftable_ref_record_roundtrip(void)
 {
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
 
 	for (i = REFTABLE_REF_DELETION; i < REFTABLE_NR_REF_VALUETYPES; i++) {
@@ -140,7 +141,7 @@  static void test_reftable_ref_record_roundtrip(void)
 		EXPECT(n > 0);
 
 		/* decode into a non-zero reftable_record to test for leaks. */
-		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ);
+		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_ref_record_equal(&in.u.ref, &out.u.ref,
@@ -150,6 +151,8 @@  static void test_reftable_ref_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_log_record_equal(void)
@@ -175,7 +178,6 @@  static void test_reftable_log_record_equal(void)
 static void test_reftable_log_record_roundtrip(void)
 {
 	int i;
-
 	struct reftable_log_record in[] = {
 		{
 			.refname = xstrdup("refs/heads/master"),
@@ -202,6 +204,8 @@  static void test_reftable_log_record_roundtrip(void)
 			.value_type = REFTABLE_LOG_UPDATE,
 		}
 	};
+	struct strbuf scratch = STRBUF_INIT;
+
 	set_test_hash(in[0].value.update.new_hash, 1);
 	set_test_hash(in[0].value.update.old_hash, 2);
 	set_test_hash(in[2].value.update.new_hash, 3);
@@ -241,7 +245,7 @@  static void test_reftable_log_record_roundtrip(void)
 		EXPECT(n >= 0);
 		valtype = reftable_record_val_type(&rec);
 		m = reftable_record_decode(&out, key, valtype, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_log_record_equal(&in[i], &out.u.log,
@@ -250,6 +254,8 @@  static void test_reftable_log_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_u24_roundtrip(void)
@@ -299,23 +305,27 @@  static void test_reftable_obj_record_roundtrip(void)
 {
 	uint8_t testHash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 4, 0 };
 	uint64_t till9[] = { 1, 2, 3, 4, 500, 600, 700, 800, 9000 };
-	struct reftable_obj_record recs[3] = { {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 3,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 9,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-					       } };
+	struct reftable_obj_record recs[3] = {
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 3,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 9,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+		},
+	};
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
+
 	for (i = 0; i < ARRAY_SIZE(recs); i++) {
 		uint8_t buffer[1024] = { 0 };
 		struct string_view dest = {
@@ -339,13 +349,15 @@  static void test_reftable_obj_record_roundtrip(void)
 		EXPECT(n > 0);
 		extra = reftable_record_val_type(&in);
 		m = reftable_record_decode(&out, key, extra, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_index_record_roundtrip(void)
@@ -362,6 +374,7 @@  static void test_reftable_index_record_roundtrip(void)
 		.buf = buffer,
 		.len = sizeof(buffer),
 	};
+	struct strbuf scratch = STRBUF_INIT;
 	struct strbuf key = STRBUF_INIT;
 	struct reftable_record out = {
 		.type = BLOCK_TYPE_INDEX,
@@ -379,13 +392,15 @@  static void test_reftable_index_record_roundtrip(void)
 	EXPECT(n > 0);
 
 	extra = reftable_record_val_type(&in);
-	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ);
+	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ,
+				   &scratch);
 	EXPECT(m == n);
 
 	EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 
 	reftable_record_release(&out);
 	strbuf_release(&key);
+	strbuf_release(&scratch);
 	strbuf_release(&in.u.idx.last_key);
 }