diff mbox series

[v3,13/17] commit-graph.c: unconditionally load Bloom filters

Message ID 09d8669c3a074e7a2ace9d650a345244b2362f7e.1696969994.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series bloom: changed-path Bloom filters v2 (& sundries) | expand

Commit Message

Taylor Blau Oct. 10, 2023, 8:33 p.m. UTC
In 9e4df4da07 (commit-graph: new filter ver. that fixes murmur3,
2023-08-01), we began ignoring the Bloom data ("BDAT") chunk for
commit-graphs whose Bloom filters were computed using a hash version
incompatible with the value of `commitGraph.changedPathVersion`.

Now that the Bloom API has been hardened to discard these incompatible
filters (with the exception of low-level APIs), we can safely load these
Bloom filters unconditionally.

We no longer want to return early from `graph_read_bloom_data()`, and
similarly do not want to set the bloom_settings' `hash_version` field as
a side-effect. The latter is because we want to wait until we know which
Bloom settings we're using (either the defaults, from the GIT_TEST
variables, or from the previous commit-graph layer) before deciding what
hash_version to use.

If we detect an existing BDAT chunk, we'll infer the rest of the
settings (e.g., number of hashes, bits per entry, and maximum number of
changed paths) from the earlier graph layer. The hash_version will be
inferred from the previous layer as well, unless one has already been
specified via configuration.

Once all of that is done, we normalize the value of the hash_version to
either "1" or "2".

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 commit-graph.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

Comments

Patrick Steinhardt Oct. 17, 2023, 8:45 a.m. UTC | #1
On Tue, Oct 10, 2023 at 04:33:59PM -0400, Taylor Blau wrote:
> In 9e4df4da07 (commit-graph: new filter ver. that fixes murmur3,

Nit: It's a bit funny to read this reference to a commit ID when the
commit in question is part of the same series. Isn't it likely to grow
stale?

> 2023-08-01), we began ignoring the Bloom data ("BDAT") chunk for
> commit-graphs whose Bloom filters were computed using a hash version
> incompatible with the value of `commitGraph.changedPathVersion`.
> 
> Now that the Bloom API has been hardened to discard these incompatible
> filters (with the exception of low-level APIs), we can safely load these
> Bloom filters unconditionally.
> 
> We no longer want to return early from `graph_read_bloom_data()`, and
> similarly do not want to set the bloom_settings' `hash_version` field as
> a side-effect. The latter is because we want to wait until we know which
> Bloom settings we're using (either the defaults, from the GIT_TEST
> variables, or from the previous commit-graph layer) before deciding what
> hash_version to use.
> 
> If we detect an existing BDAT chunk, we'll infer the rest of the
> settings (e.g., number of hashes, bits per entry, and maximum number of
> changed paths) from the earlier graph layer. The hash_version will be
> inferred from the previous layer as well, unless one has already been
> specified via configuration.
> 
> Once all of that is done, we normalize the value of the hash_version to
> either "1" or "2".
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  commit-graph.c | 19 ++++++++++---------
>  1 file changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/commit-graph.c b/commit-graph.c
> index db623afd09..fa3b58e762 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -327,12 +327,6 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
>  	uint32_t hash_version;
>  	hash_version = get_be32(chunk_start);
>  
> -	if (*c->commit_graph_changed_paths_version == -1) {
> -		*c->commit_graph_changed_paths_version = hash_version;
> -	} else if (hash_version != *c->commit_graph_changed_paths_version) {
> -		return 0;
> -	}
> -
>  	g->chunk_bloom_data = chunk_start;
>  	g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
>  	g->bloom_filter_settings->hash_version = hash_version;
> @@ -2473,8 +2467,7 @@ int write_commit_graph(struct object_directory *odb,
>  	ctx->write_generation_data = (get_configured_generation_version(r) == 2);
>  	ctx->num_generation_data_overflows = 0;
>  
> -	bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version == 2
> -		? 2 : 1;
> +	bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version;
>  	bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
>  						      bloom_settings.bits_per_entry);
>  	bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
> @@ -2506,10 +2499,18 @@ int write_commit_graph(struct object_directory *odb,
>  		/* We have changed-paths already. Keep them in the next graph */
>  		if (g && g->bloom_filter_settings) {
>  			ctx->changed_paths = 1;
> -			ctx->bloom_settings = g->bloom_filter_settings;
> +
> +			/* don't propagate the hash_version unless unspecified */
> +			if (bloom_settings.hash_version == -1)
> +				bloom_settings.hash_version = g->bloom_filter_settings->hash_version;
> +			bloom_settings.bits_per_entry = g->bloom_filter_settings->bits_per_entry;
> +			bloom_settings.num_hashes = g->bloom_filter_settings->num_hashes;
> +			bloom_settings.max_changed_paths = g->bloom_filter_settings->max_changed_paths;
>  		}
>  	}
>  
> +	bloom_settings.hash_version = bloom_settings.hash_version == 2 ? 2 : 1;
> +

What if there is a future version of Git that writes Bloom filters with
hash version 3? Should we really normalize that to `1`?

Patrick

>  	if (ctx->split) {
>  		struct commit_graph *g = ctx->r->objects->commit_graph;
>  
> -- 
> 2.42.0.342.g8bb3a896ee
>
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index db623afd09..fa3b58e762 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -327,12 +327,6 @@  static int graph_read_bloom_data(const unsigned char *chunk_start,
 	uint32_t hash_version;
 	hash_version = get_be32(chunk_start);
 
-	if (*c->commit_graph_changed_paths_version == -1) {
-		*c->commit_graph_changed_paths_version = hash_version;
-	} else if (hash_version != *c->commit_graph_changed_paths_version) {
-		return 0;
-	}
-
 	g->chunk_bloom_data = chunk_start;
 	g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
 	g->bloom_filter_settings->hash_version = hash_version;
@@ -2473,8 +2467,7 @@  int write_commit_graph(struct object_directory *odb,
 	ctx->write_generation_data = (get_configured_generation_version(r) == 2);
 	ctx->num_generation_data_overflows = 0;
 
-	bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version == 2
-		? 2 : 1;
+	bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version;
 	bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
 						      bloom_settings.bits_per_entry);
 	bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
@@ -2506,10 +2499,18 @@  int write_commit_graph(struct object_directory *odb,
 		/* We have changed-paths already. Keep them in the next graph */
 		if (g && g->bloom_filter_settings) {
 			ctx->changed_paths = 1;
-			ctx->bloom_settings = g->bloom_filter_settings;
+
+			/* don't propagate the hash_version unless unspecified */
+			if (bloom_settings.hash_version == -1)
+				bloom_settings.hash_version = g->bloom_filter_settings->hash_version;
+			bloom_settings.bits_per_entry = g->bloom_filter_settings->bits_per_entry;
+			bloom_settings.num_hashes = g->bloom_filter_settings->num_hashes;
+			bloom_settings.max_changed_paths = g->bloom_filter_settings->max_changed_paths;
 		}
 	}
 
+	bloom_settings.hash_version = bloom_settings.hash_version == 2 ? 2 : 1;
+
 	if (ctx->split) {
 		struct commit_graph *g = ctx->r->objects->commit_graph;