diff mbox series

[RFC,1/6] bloom: annotate filters with hash version

Message ID e23a956401c5619bd46e8ec9b0e1df958cbcbfec.1691426160.git.me@ttaylorr.com (mailing list archive)
State New, archived
Headers show
Series bloom: reuse existing Bloom filters when possible during upgrade | expand

Commit Message

Taylor Blau Aug. 7, 2023, 4:37 p.m. UTC
In subsequent commits, we will want to load existing Bloom filters out
of a commit-graph, even when the hash version they were computed with
does not match the value of `commitGraph.changedPathVersion`.

In order to differentiate between the two, add a "filter" field to each
Bloom filter.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bloom.c | 11 ++++++++---
 bloom.h |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

Comments

Jonathan Tan Aug. 11, 2023, 9:46 p.m. UTC | #1
Taylor Blau <me@ttaylorr.com> writes:
> In subsequent commits, we will want to load existing Bloom filters out
> of a commit-graph, even when the hash version they were computed with
> does not match the value of `commitGraph.changedPathVersion`.
> 
> In order to differentiate between the two, add a "filter" field to each
> Bloom filter.

You mean "version", I think.

> @@ -55,6 +55,7 @@ struct bloom_filter_settings {
>  struct bloom_filter {
>  	unsigned char *data;
>  	size_t len;
> +	int version;
>  };

We might want to shrink the sizes of len (we have a changed path limit
so we know exactly how big Bloom filters can get) and version so that
this struct doesn't take up more space. But if other reviewers think
that this is OK, I'm fine with that.

Another thing that we might want to track is whether the Bloom filter is
a reference to an existing buffer (and thus does not need to be freed)
or a reference to a malloc-ed buffer that we must free. But both before
and after this patch set, a malloc-ed buffer is never overridden by a
reference-to-existing-buffer, so we should still be fine for now. (This
patch set does add a scenario in which a reference-to-existing buffer is
overridden by a malloc-ed buffer, but that's the only new scenario.)
Taylor Blau Aug. 17, 2023, 7:55 p.m. UTC | #2
On Fri, Aug 11, 2023 at 02:46:51PM -0700, Jonathan Tan wrote:
> Taylor Blau <me@ttaylorr.com> writes:
> > In subsequent commits, we will want to load existing Bloom filters out
> > of a commit-graph, even when the hash version they were computed with
> > does not match the value of `commitGraph.changedPathVersion`.
> >
> > In order to differentiate between the two, add a "filter" field to each
> > Bloom filter.
>
> You mean "version", I think.

Oops, yes -- I'm not sure how my editor tab-completed "version" there,
but oh, well :-).

> > @@ -55,6 +55,7 @@ struct bloom_filter_settings {
> >  struct bloom_filter {
> >  	unsigned char *data;
> >  	size_t len;
> > +	int version;
> >  };
>
> We might want to shrink the sizes of len (we have a changed path limit
> so we know exactly how big Bloom filters can get) and version so that
> this struct doesn't take up more space. But if other reviewers think
> that this is OK, I'm fine with that.

I think that making len a size_t here is an appropriate choice. Even
though the maximum length of a Bloom filter is well below the 2^64-1
threshold, we are often looking at a memory-mapped region here, so
keeping track of it with a size_t / off_t seems reasonable to me.

> Another thing that we might want to track is whether the Bloom filter is
> a reference to an existing buffer (and thus does not need to be freed)
> or a reference to a malloc-ed buffer that we must free. But both before
> and after this patch set, a malloc-ed buffer is never overridden by a
> reference-to-existing-buffer, so we should still be fine for now. (This
> patch set does add a scenario in which a reference-to-existing buffer is
> overridden by a malloc-ed buffer, but that's the only new scenario.)

Yeah, I think there is some opportunity for clean-up here. I'll take a
look...

Thanks,
Taylor
Taylor Blau Aug. 21, 2023, 8:21 p.m. UTC | #3
On Thu, Aug 17, 2023 at 03:55:06PM -0400, Taylor Blau wrote:
> > Another thing that we might want to track is whether the Bloom filter is
> > a reference to an existing buffer (and thus does not need to be freed)
> > or a reference to a malloc-ed buffer that we must free. But both before
> > and after this patch set, a malloc-ed buffer is never overridden by a
> > reference-to-existing-buffer, so we should still be fine for now. (This
> > patch set does add a scenario in which a reference-to-existing buffer is
> > overridden by a malloc-ed buffer, but that's the only new scenario.)
>
> Yeah, I think there is some opportunity for clean-up here. I'll take a
> look...

This ended up being pretty reasonable. I'm not sure whether I should
include it here or not, since any leaks in the Bloom subsystem are
definitely not new as of this series.

But the patch is relatively straightforward anyway, so I think throwing
it on the end would be OK:

--- 8< ---
diff --git a/bloom.c b/bloom.c
index 24dd874e46..ff131893cd 100644
--- a/bloom.c
+++ b/bloom.c
@@ -59,6 +59,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g,
 					sizeof(unsigned char) * start_index +
 					BLOOMDATA_CHUNK_HEADER_SIZE);
 	filter->version = g->bloom_filter_settings->hash_version;
+	filter->to_free = NULL;

 	return 1;
 }
@@ -231,6 +232,18 @@ void init_bloom_filters(void)
 	init_bloom_filter_slab(&bloom_filters);
 }

+static void free_one_bloom_filter(struct bloom_filter *filter)
+{
+	if (!filter)
+		return;
+	free(filter->to_free);
+}
+
+void deinit_bloom_filters(void)
+{
+	deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
+}
+
 static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
 		       const struct hashmap_entry *eptr,
 		       const struct hashmap_entry *entry_or_key,
@@ -247,7 +260,7 @@ static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
 static void init_truncated_large_filter(struct bloom_filter *filter,
 					int version)
 {
-	filter->data = xmalloc(1);
+	filter->data = filter->to_free = xmalloc(1);
 	filter->data[0] = 0xFF;
 	filter->len = 1;
 	filter->version = version;
@@ -449,6 +462,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 			filter->len = 1;
 		}
 		CALLOC_ARRAY(filter->data, filter->len);
+		filter->to_free = filter->data;

 		hashmap_for_each_entry(&pathmap, &iter, e, entry) {
 			struct bloom_key key;
diff --git a/bloom.h b/bloom.h
index 4462fc3908..c1d74d63e6 100644
--- a/bloom.h
+++ b/bloom.h
@@ -56,6 +56,8 @@ struct bloom_filter {
 	unsigned char *data;
 	size_t len;
 	int version;
+
+	void *to_free;
 };

 /*
@@ -96,6 +98,7 @@ void add_key_to_filter(const struct bloom_key *key,
 		       const struct bloom_filter_settings *settings);

 void init_bloom_filters(void);
+void deinit_bloom_filters(void);

 enum bloom_filter_computed {
 	BLOOM_NOT_COMPUTED = (1 << 0),
diff --git a/commit-graph.c b/commit-graph.c
index 183ed90b6d..f22f2d350d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -2532,6 +2532,9 @@ int write_commit_graph(struct object_directory *odb,

 	res = write_commit_graph_file(ctx);

+	if (ctx->changed_paths)
+		deinit_bloom_filters();
+
 	if (ctx->split)
 		mark_commit_graphs(ctx);

--- >8 ---

Thanks,
Taylor
diff mbox series

Patch

diff --git a/bloom.c b/bloom.c
index ebef5cfd2f..9b6a30f6f6 100644
--- a/bloom.c
+++ b/bloom.c
@@ -55,6 +55,7 @@  int load_bloom_filter_from_graph(struct commit_graph *g,
 	filter->data = (unsigned char *)(g->chunk_bloom_data +
 					sizeof(unsigned char) * start_index +
 					BLOOMDATA_CHUNK_HEADER_SIZE);
+	filter->version = g->bloom_filter_settings->hash_version;
 
 	return 1;
 }
@@ -240,11 +241,13 @@  static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
 	return strcmp(e1->path, e2->path);
 }
 
-static void init_truncated_large_filter(struct bloom_filter *filter)
+static void init_truncated_large_filter(struct bloom_filter *filter,
+					int version)
 {
 	filter->data = xmalloc(1);
 	filter->data[0] = 0xFF;
 	filter->len = 1;
+	filter->version = version;
 }
 
 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
@@ -329,13 +332,15 @@  struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 		}
 
 		if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
-			init_truncated_large_filter(filter);
+			init_truncated_large_filter(filter,
+						    settings->hash_version);
 			if (computed)
 				*computed |= BLOOM_TRUNC_LARGE;
 			goto cleanup;
 		}
 
 		filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+		filter->version = settings->hash_version;
 		if (!filter->len) {
 			if (computed)
 				*computed |= BLOOM_TRUNC_EMPTY;
@@ -355,7 +360,7 @@  struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 	} else {
 		for (i = 0; i < diff_queued_diff.nr; i++)
 			diff_free_filepair(diff_queued_diff.queue[i]);
-		init_truncated_large_filter(filter);
+		init_truncated_large_filter(filter, settings->hash_version);
 
 		if (computed)
 			*computed |= BLOOM_TRUNC_LARGE;
diff --git a/bloom.h b/bloom.h
index 138d57a86b..330a140520 100644
--- a/bloom.h
+++ b/bloom.h
@@ -55,6 +55,7 @@  struct bloom_filter_settings {
 struct bloom_filter {
 	unsigned char *data;
 	size_t len;
+	int version;
 };
 
 /*