[v3,14/14] builtin/commit-graph.c: introduce '--max-new-filters=<n>'
diff mbox series

Message ID 09f6871f66bff838c067a3e0d23cd4622171f3bd.1597178915.git.me@ttaylorr.com
State New
Headers show
Series
  • more miscellaneous Bloom filter improvements
Related show

Commit Message

Taylor Blau Aug. 11, 2020, 8:52 p.m. UTC
Introduce a command-line flag and configuration variable to fill in the
'max_new_filters' variable introduced by the previous patch.

The command-line option '--max-new-filters' takes precedence over
'commitGraph.maxNewFilters', which is the default value.
'--no-max-new-filters' can also be provided, which sets the value back
to '-1', indicating that an unlimited number of new Bloom filters may be
generated. (OPT_INTEGER only allows setting the '--no-' variant back to
'0', hence a custom callback was used instead).

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config/commitgraph.txt |  4 +++
 Documentation/git-commit-graph.txt   |  4 +++
 bloom.c                              | 15 +++++++++++
 builtin/commit-graph.c               | 39 +++++++++++++++++++++++++---
 commit-graph.c                       | 27 ++++++++++++++++---
 commit-graph.h                       |  1 +
 t/t4216-log-bloom.sh                 | 19 ++++++++++++++
 7 files changed, 102 insertions(+), 7 deletions(-)

Comments

SZEDER Gábor Aug. 12, 2020, 11:49 a.m. UTC | #1
On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:

> diff --git a/commit-graph.c b/commit-graph.c
> index 6886f319a5..4aae5471e3 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -954,7 +954,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
>  }
>  
>  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> -					   const struct commit *c)
> +					   const struct commit *c,
> +					   uint32_t max_changed_paths)
>  {
>  	uint32_t graph_pos = commit_graph_position(c);
>  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> @@ -965,6 +966,17 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
>  
>  	if (!(g && g->bloom_large))
>  		return 0;
> +	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
> +		/*
> +		 * Force all commits which are subject to a different
> +		 * 'max_changed_paths' limit to be recomputed from scratch.
> +		 *
> +		 * Note that this could likely be improved, but is ignored since
> +		 * all real-world graphs set the maximum number of changed paths
> +		 * at 512.
> +		 */
> +		return 0;
> +	}
>  	return bitmap_get(g->bloom_large, graph_pos - g->num_commits_in_base);
>  }
>  
> @@ -1470,6 +1482,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  	int i;
>  	struct progress *progress = NULL;
>  	int *sorted_commits;
> +	int max_new_filters;
>  
>  	init_bloom_filters();
>  	ctx->bloom_large = bitmap_word_alloc(ctx->commits.nr / BITS_IN_EWORD + 1);
> @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
>  		&ctx->commits);
>  
> +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> +		ctx->opts->max_new_filters : ctx->commits.nr;

git_test_write_commit_graph_or_die() invokes
write_commit_graph_reachable() with opts=0x0, so 'ctx->opts' is NULL,
and we get segfault.  This breaks a lot of tests when run with
GIT_TEST_COMMIT_GRAPH=1 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1.
Derrick Stolee Aug. 12, 2020, 12:29 p.m. UTC | #2
On 8/11/2020 4:52 PM, Taylor Blau wrote:
> Introduce a command-line flag and configuration variable to fill in the
> 'max_new_filters' variable introduced by the previous patch.
> 
> The command-line option '--max-new-filters' takes precedence over
> 'commitGraph.maxNewFilters', which is the default value.
> '--no-max-new-filters' can also be provided, which sets the value back
> to '-1', indicating that an unlimited number of new Bloom filters may be
> generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> '0', hence a custom callback was used instead).
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>

...

> diff --git a/bloom.c b/bloom.c
> index ed54e96e57..8d07209c6b 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -51,6 +51,21 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
>  	else
>  		start_index = 0;
>  
> +	if ((start_index == end_index) &&
> +	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
> +		/*
> +		 * If the filter is zero-length, either (1) the filter has no
> +		 * changes, (2) the filter has too many changes, or (3) it
> +		 * wasn't computed (eg., due to '--max-new-filters').
> +		 *
> +		 * If either (1) or (2) is the case, the 'large' bit will be set
> +		 * for this Bloom filter. If it is unset, then it wasn't
> +		 * computed. In that case, return nothing, since we don't have
> +		 * that filter in the graph.
> +		 */
> +		return 0;
> +	}
> +

Here, you are creating a distinction between an "empty" filter and
a "too large" filter at a place that I don't think is important.

For instance, this code will be triggered by "git log -- <path>"
but you only care about the filter being too large when writing the
commit-graph. I think this check for a "too large" filter should
instead be inside get_or_compute_bloom_filter(). I include a patch
below that applies on top of tb/bloom-improvements that gets to
my point (and how minor the issue might be).

Thanks,
-Stolee

--- >8 ---

From 92a7a3d0f769fad7617426730cb97584a3d07794 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Wed, 12 Aug 2020 08:20:03 -0400
Subject: [PATCH] commit-graph: lazy-load large Bloom filter bitmap

The bloom_large bitmap in struct commit_graph is currently loaded
immediately upon first parse of the commit-graph file (when the chunk
exists). This is needed because the current implementation of
get_bloom_filter_from_graph() is special-cased to return NULL when the
filter is marked as "too large".

This has a slight drawback: before we can read a single commit out of
the commit-graph file, we need to load this entire chunk into memory.
This happens even for commands that don't need changed-path Bloom
filters, such as "git log -1".

This "too large" information is only used when writing a commit-graph
file, so we can delay the check for a large filter until after we check
compute_if_not_present in get_or_compute_bloom_filter(). Also, place
that lazy-load directly in the get_bloom_filter_large_in_graph() method,
so we ensure it is ready when needed.

This may be overkill. For a repository with one million commits, this
filter size is approximately 125 *kilobytes* of data. My local
measurements found that this took between 1 and 2 milliseconds to load
into memory. Even for repositories with 10 million commits, this
difference would not be noticeable for end-user commands.

On the other hand, these handfuls of milliseconds could add up when
running a hosting service using Git, so this extra effort is probably
worth it.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 bloom.c        | 27 ++++++++-------------------
 commit-graph.c | 30 +++++++++++++++++-------------
 commit-graph.h |  8 ++++++++
 3 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/bloom.c b/bloom.c
index 8d07209c6b..6d0884fa19 100644
--- a/bloom.c
+++ b/bloom.c
@@ -51,21 +51,6 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
 	else
 		start_index = 0;
 
-	if ((start_index == end_index) &&
-	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
-		/*
-		 * If the filter is zero-length, either (1) the filter has no
-		 * changes, (2) the filter has too many changes, or (3) it
-		 * wasn't computed (eg., due to '--max-new-filters').
-		 *
-		 * If either (1) or (2) is the case, the 'large' bit will be set
-		 * for this Bloom filter. If it is unset, then it wasn't
-		 * computed. In that case, return nothing, since we don't have
-		 * that filter in the graph.
-		 */
-		return 0;
-	}
-
 	filter->len = end_index - start_index;
 	filter->data = (unsigned char *)(g->chunk_bloom_data +
 					sizeof(unsigned char) * start_index +
@@ -212,16 +197,20 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 
 	if (!filter->data) {
 		load_commit_graph_info(r, c);
-		if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH &&
-			load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
-				return filter;
+		if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH)
+			load_bloom_filter_from_graph(r->objects->commit_graph, filter, c);
 	}
 
-	if (filter->data)
+	if (filter && filter->len)
 		return filter;
 	if (!compute_if_not_present)
 		return NULL;
 
+	if (filter && !filter->len &&
+	    get_bloom_filter_large_in_graph(r->objects->commit_graph, c,
+	    				    settings->max_changed_paths))
+		return filter;
+
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.detect_rename = 0;
diff --git a/commit-graph.c b/commit-graph.c
index 4aae5471e3..ea89f431cc 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -435,17 +435,9 @@ struct commit_graph *parse_commit_graph(struct repository *r,
 			if (graph->chunk_bloom_large_filters)
 				chunk_repeated = 1;
 			else if (r->settings.commit_graph_read_changed_paths) {
-				size_t alloc = get_be64(chunk_lookup + 4) - chunk_offset - sizeof(uint32_t);
+				graph->bloom_large_alloc = get_be64(chunk_lookup + 4) - chunk_offset - sizeof(uint32_t);
 				graph->chunk_bloom_large_filters = data + chunk_offset + sizeof(uint32_t);
 				graph->bloom_filter_settings->max_changed_paths = get_be32(data + chunk_offset);
-				if (alloc) {
-					size_t j;
-					graph->bloom_large = bitmap_word_alloc(alloc);
-
-					for (j = 0; j < graph->bloom_large->word_alloc; j++)
-						graph->bloom_large->words[j] = get_be64(
-							graph->chunk_bloom_large_filters + j * sizeof(eword_t));
-				}
 			}
 			break;
 		}
@@ -953,9 +945,9 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
 	return get_commit_tree_in_graph_one(r, r->objects->commit_graph, c);
 }
 
-static int get_bloom_filter_large_in_graph(struct commit_graph *g,
-					   const struct commit *c,
-					   uint32_t max_changed_paths)
+int get_bloom_filter_large_in_graph(struct commit_graph *g,
+				    const struct commit *c,
+				    uint32_t max_changed_paths)
 {
 	uint32_t graph_pos = commit_graph_position(c);
 	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
@@ -964,8 +956,20 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
 	while (g && graph_pos < g->num_commits_in_base)
 		g = g->base_graph;
 
-	if (!(g && g->bloom_large))
+	if (!g || !g->bloom_large_alloc)
 		return 0;
+
+	if (!g->bloom_large) {
+		size_t j;
+		g->bloom_large = bitmap_word_alloc(g->bloom_large_alloc);
+
+		for (j = 0; j < g->bloom_large->word_alloc; j++) {
+			const void *data = g->chunk_bloom_large_filters +
+					   j * sizeof(eword_t);
+			g->bloom_large->words[j] = get_be64(data);
+		}
+	}
+
 	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
 		/*
 		 * Force all commits which are subject to a different
diff --git a/commit-graph.h b/commit-graph.h
index 75ef83708b..126fd43380 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -51,6 +51,13 @@ void load_commit_graph_info(struct repository *r, struct commit *item);
 struct tree *get_commit_tree_in_graph(struct repository *r,
 				      const struct commit *c);
 
+/*
+ * Returns 1 if this commit was marked in the BFXL chunk as having more
+ * than max_changed_paths changes.
+ */
+int get_bloom_filter_large_in_graph(struct commit_graph *g,
+				    const struct commit *c,
+				    uint32_t max_changed_paths);
 struct commit_graph {
 	const unsigned char *data;
 	size_t data_len;
@@ -74,6 +81,7 @@ struct commit_graph {
 	const unsigned char *chunk_bloom_data;
 	const unsigned char *chunk_bloom_large_filters;
 
+	size_t bloom_large_alloc;
 	struct bitmap *bloom_large;
 
 	struct bloom_filter_settings *bloom_filter_settings;
Taylor Blau Aug. 14, 2020, 8:10 p.m. UTC | #3
On Wed, Aug 12, 2020 at 08:29:06AM -0400, Derrick Stolee wrote:
> On 8/11/2020 4:52 PM, Taylor Blau wrote:
> > Introduce a command-line flag and configuration variable to fill in the
> > 'max_new_filters' variable introduced by the previous patch.
> >
> > The command-line option '--max-new-filters' takes precedence over
> > 'commitGraph.maxNewFilters', which is the default value.
> > '--no-max-new-filters' can also be provided, which sets the value back
> > to '-1', indicating that an unlimited number of new Bloom filters may be
> > generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> > '0', hence a custom callback was used instead).
> >
> > Signed-off-by: Taylor Blau <me@ttaylorr.com>
>
> ...
>
> > diff --git a/bloom.c b/bloom.c
> > index ed54e96e57..8d07209c6b 100644
> > --- a/bloom.c
> > +++ b/bloom.c
> > @@ -51,6 +51,21 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
> >  	else
> >  		start_index = 0;
> >
> > +	if ((start_index == end_index) &&
> > +	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
> > +		/*
> > +		 * If the filter is zero-length, either (1) the filter has no
> > +		 * changes, (2) the filter has too many changes, or (3) it
> > +		 * wasn't computed (eg., due to '--max-new-filters').
> > +		 *
> > +		 * If either (1) or (2) is the case, the 'large' bit will be set
> > +		 * for this Bloom filter. If it is unset, then it wasn't
> > +		 * computed. In that case, return nothing, since we don't have
> > +		 * that filter in the graph.
> > +		 */
> > +		return 0;
> > +	}
> > +
>
> Here, you are creating a distinction between an "empty" filter and
> a "too large" filter at a place that I don't think is important.
>
> For instance, this code will be triggered by "git log -- <path>"
> but you only care about the filter being too large when writing the
> commit-graph. I think this check for a "too large" filter should
> instead be inside get_or_compute_bloom_filter(). I include a patch
> below that applies on top of tb/bloom-improvements that gets to
> my point (and how minor the issue might be).

That's a good point. I factored the lazy-loading out in basically the
same way that you did below, but didn't move this call out of
'load_bloom_filter_from_graph'. I think that you're right that this is
better at the calling end (replacing the 'start_index == end_index'
check with a '!filter->len' one instead).

I'm going to "ignore" this patch below since I (a) already have it
locally, and (b) would rather just do this correctly the first time than
introduce and subsequently fix a performance regression.

> Thanks,
> -Stolee
>
> --- >8 ---
>
> From 92a7a3d0f769fad7617426730cb97584a3d07794 Mon Sep 17 00:00:00 2001
> From: Derrick Stolee <dstolee@microsoft.com>
> Date: Wed, 12 Aug 2020 08:20:03 -0400
> Subject: [PATCH] commit-graph: lazy-load large Bloom filter bitmap
>
> The bloom_large bitmap in struct commit_graph is currently loaded
> immediately upon first parse of the commit-graph file (when the chunk
> exists). This is needed because the current implementation of
> get_bloom_filter_from_graph() is special-cased to return NULL when the
> filter is marked as "too large".
>
> This has a slight drawback: before we can read a single commit out of
> the commit-graph file, we need to load this entire chunk into memory.
> This happens even for commands that don't need changed-path Bloom
> filters, such as "git log -1".
>
> This "too large" information is only used when writing a commit-graph
> file, so we can delay the check for a large filter until after we check
> compute_if_not_present in get_or_compute_bloom_filter(). Also, place
> that lazy-load directly in the get_bloom_filter_large_in_graph() method,
> so we ensure it is ready when needed.
>
> This may be overkill. For a repository with one million commits, this
> filter size is approximately 125 *kilobytes* of data. My local
> measurements found that this took between 1 and 2 milliseconds to load
> into memory. Even for repositories with 10 million commits, this
> difference would not be noticeable for end-user commands.
>
> On the other hand, these handfuls of milliseconds could add up when
> running a hosting service using Git, so this extra effort is probably
> worth it.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  bloom.c        | 27 ++++++++-------------------
>  commit-graph.c | 30 +++++++++++++++++-------------
>  commit-graph.h |  8 ++++++++
>  3 files changed, 33 insertions(+), 32 deletions(-)
>
> diff --git a/bloom.c b/bloom.c
> index 8d07209c6b..6d0884fa19 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -51,21 +51,6 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
>  	else
>  		start_index = 0;
>
> -	if ((start_index == end_index) &&
> -	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
> -		/*
> -		 * If the filter is zero-length, either (1) the filter has no
> -		 * changes, (2) the filter has too many changes, or (3) it
> -		 * wasn't computed (eg., due to '--max-new-filters').
> -		 *
> -		 * If either (1) or (2) is the case, the 'large' bit will be set
> -		 * for this Bloom filter. If it is unset, then it wasn't
> -		 * computed. In that case, return nothing, since we don't have
> -		 * that filter in the graph.
> -		 */
> -		return 0;
> -	}
> -
>  	filter->len = end_index - start_index;
>  	filter->data = (unsigned char *)(g->chunk_bloom_data +
>  					sizeof(unsigned char) * start_index +
> @@ -212,16 +197,20 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
>
>  	if (!filter->data) {
>  		load_commit_graph_info(r, c);
> -		if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH &&
> -			load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
> -				return filter;
> +		if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH)
> +			load_bloom_filter_from_graph(r->objects->commit_graph, filter, c);
>  	}
>
> -	if (filter->data)
> +	if (filter && filter->len)
>  		return filter;
>  	if (!compute_if_not_present)
>  		return NULL;
>
> +	if (filter && !filter->len &&
> +	    get_bloom_filter_large_in_graph(r->objects->commit_graph, c,
> +	    				    settings->max_changed_paths))
> +		return filter;
> +
>  	repo_diff_setup(r, &diffopt);
>  	diffopt.flags.recursive = 1;
>  	diffopt.detect_rename = 0;
> diff --git a/commit-graph.c b/commit-graph.c
> index 4aae5471e3..ea89f431cc 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -435,17 +435,9 @@ struct commit_graph *parse_commit_graph(struct repository *r,
>  			if (graph->chunk_bloom_large_filters)
>  				chunk_repeated = 1;
>  			else if (r->settings.commit_graph_read_changed_paths) {
> -				size_t alloc = get_be64(chunk_lookup + 4) - chunk_offset - sizeof(uint32_t);
> +				graph->bloom_large_alloc = get_be64(chunk_lookup + 4) - chunk_offset - sizeof(uint32_t);
>  				graph->chunk_bloom_large_filters = data + chunk_offset + sizeof(uint32_t);
>  				graph->bloom_filter_settings->max_changed_paths = get_be32(data + chunk_offset);
> -				if (alloc) {
> -					size_t j;
> -					graph->bloom_large = bitmap_word_alloc(alloc);
> -
> -					for (j = 0; j < graph->bloom_large->word_alloc; j++)
> -						graph->bloom_large->words[j] = get_be64(
> -							graph->chunk_bloom_large_filters + j * sizeof(eword_t));
> -				}
>  			}
>  			break;
>  		}
> @@ -953,9 +945,9 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
>  	return get_commit_tree_in_graph_one(r, r->objects->commit_graph, c);
>  }
>
> -static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> -					   const struct commit *c,
> -					   uint32_t max_changed_paths)
> +int get_bloom_filter_large_in_graph(struct commit_graph *g,
> +				    const struct commit *c,
> +				    uint32_t max_changed_paths)
>  {
>  	uint32_t graph_pos = commit_graph_position(c);
>  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> @@ -964,8 +956,20 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
>  	while (g && graph_pos < g->num_commits_in_base)
>  		g = g->base_graph;
>
> -	if (!(g && g->bloom_large))
> +	if (!g || !g->bloom_large_alloc)
>  		return 0;
> +
> +	if (!g->bloom_large) {
> +		size_t j;
> +		g->bloom_large = bitmap_word_alloc(g->bloom_large_alloc);
> +
> +		for (j = 0; j < g->bloom_large->word_alloc; j++) {
> +			const void *data = g->chunk_bloom_large_filters +
> +					   j * sizeof(eword_t);
> +			g->bloom_large->words[j] = get_be64(data);
> +		}
> +	}
> +
>  	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
>  		/*
>  		 * Force all commits which are subject to a different
> diff --git a/commit-graph.h b/commit-graph.h
> index 75ef83708b..126fd43380 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -51,6 +51,13 @@ void load_commit_graph_info(struct repository *r, struct commit *item);
>  struct tree *get_commit_tree_in_graph(struct repository *r,
>  				      const struct commit *c);
>
> +/*
> + * Returns 1 if this commit was marked in the BFXL chunk as having more
> + * than max_changed_paths changes.
> + */
> +int get_bloom_filter_large_in_graph(struct commit_graph *g,
> +				    const struct commit *c,
> +				    uint32_t max_changed_paths);
>  struct commit_graph {
>  	const unsigned char *data;
>  	size_t data_len;
> @@ -74,6 +81,7 @@ struct commit_graph {
>  	const unsigned char *chunk_bloom_data;
>  	const unsigned char *chunk_bloom_large_filters;
>
> +	size_t bloom_large_alloc;
>  	struct bitmap *bloom_large;
>
>  	struct bloom_filter_settings *bloom_filter_settings;
> --
> 2.28.0.38.gc6f546511c1
>
>

Thanks,
Taylor
Taylor Blau Aug. 14, 2020, 8:20 p.m. UTC | #4
On Wed, Aug 12, 2020 at 01:49:29PM +0200, SZEDER Gábor wrote:
> On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
>
> > diff --git a/commit-graph.c b/commit-graph.c
> > index 6886f319a5..4aae5471e3 100644
> > --- a/commit-graph.c
> > +++ b/commit-graph.c
> > @@ -954,7 +954,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
> >  }
> >
> >  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> > -					   const struct commit *c)
> > +					   const struct commit *c,
> > +					   uint32_t max_changed_paths)
> >  {
> >  	uint32_t graph_pos = commit_graph_position(c);
> >  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> > @@ -965,6 +966,17 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> >
> >  	if (!(g && g->bloom_large))
> >  		return 0;
> > +	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
> > +		/*
> > +		 * Force all commits which are subject to a different
> > +		 * 'max_changed_paths' limit to be recomputed from scratch.
> > +		 *
> > +		 * Note that this could likely be improved, but is ignored since
> > +		 * all real-world graphs set the maximum number of changed paths
> > +		 * at 512.
> > +		 */
> > +		return 0;
> > +	}
> >  	return bitmap_get(g->bloom_large, graph_pos - g->num_commits_in_base);
> >  }
> >
> > @@ -1470,6 +1482,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
> >  	int i;
> >  	struct progress *progress = NULL;
> >  	int *sorted_commits;
> > +	int max_new_filters;
> >
> >  	init_bloom_filters();
> >  	ctx->bloom_large = bitmap_word_alloc(ctx->commits.nr / BITS_IN_EWORD + 1);
> > @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
> >  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
> >  		&ctx->commits);
> >
> > +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> > +		ctx->opts->max_new_filters : ctx->commits.nr;
>
> git_test_write_commit_graph_or_die() invokes
> write_commit_graph_reachable() with opts=0x0, so 'ctx->opts' is NULL,
> and we get segfault.  This breaks a lot of tests when run with
> GIT_TEST_COMMIT_GRAPH=1 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1.

Great catch, thanks. Fixing this is as simple as adding 'ctx->opts &&'
right before we dereference 'ctx->opts', since setting this variable
equal to 'ctx->commits.nr' is the right thing to do in that case.

Unrelated to this comment, I am hoping to send out a final version of
this series sometime next week so that we can keep moving forward with
Bloom filter improvements.

Have you had a chance to review the rest of the patches? I'll happily
wait until you have had a chance to do so before sending v5 so that we
can avoid a v6.

Thanks,
Taylor
SZEDER Gábor Aug. 17, 2020, 10:50 p.m. UTC | #5
On Fri, Aug 14, 2020 at 04:20:21PM -0400, Taylor Blau wrote:
> > > @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
> > >  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
> > >  		&ctx->commits);
> > >
> > > +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> > > +		ctx->opts->max_new_filters : ctx->commits.nr;
> >
> > git_test_write_commit_graph_or_die() invokes
> > write_commit_graph_reachable() with opts=0x0, so 'ctx->opts' is NULL,
> > and we get segfault.  This breaks a lot of tests when run with
> > GIT_TEST_COMMIT_GRAPH=1 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1.
> 
> Great catch, thanks. Fixing this is as simple as adding 'ctx->opts &&'
> right before we dereference 'ctx->opts', since setting this variable
> equal to 'ctx->commits.nr' is the right thing to do in that case.

That would avoid the segfault, sure, but I would rather see all
callers of write_commit_graph{_reachable}() passing a valid opts
instance.  Just like we don't call the diff machinery with a NULL
diff_options, or the revision walking machinery with a NULL rev_info.

> Unrelated to this comment, I am hoping to send out a final version of
> this series sometime next week so that we can keep moving forward with
> Bloom filter improvements.
> 
> Have you had a chance to review the rest of the patches? I'll happily
> wait until you have had a chance to do so before sending v5 so that we

v5?  This is v3, and I'm unable to a find a v4.

> can avoid a v6.
> 
> Thanks,
> Taylor
SZEDER Gábor Aug. 18, 2020, 10:23 p.m. UTC | #6
On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> Introduce a command-line flag and configuration variable to fill in the
> 'max_new_filters' variable introduced by the previous patch.

'max_new_filters' is introduced in this patch.

> The command-line option '--max-new-filters' takes precedence over
> 'commitGraph.maxNewFilters', which is the default value.

What "filters"?  While misnamed, the '--changed-paths' options did a
good job at hiding the implementation detail that we use Bloom filters
to speed up pathspec-limited revision walks; as far as I remember this
was a conscious design decision.  Including "filter" in the name of
the option and corresponding config variable goes against this
decision.  Furthermore, by not being specific we might end up in abad
situation when adding some other filters to the commit-graph format.
Unfortunately, I can't offhand propose a better option name, all my
ideas were horrible.

> '--no-max-new-filters' can also be provided, which sets the value back
> to '-1', indicating that an unlimited number of new Bloom filters may be
> generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> '0', hence a custom callback was used instead).
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  Documentation/config/commitgraph.txt |  4 +++
>  Documentation/git-commit-graph.txt   |  4 +++
>  bloom.c                              | 15 +++++++++++
>  builtin/commit-graph.c               | 39 +++++++++++++++++++++++++---
>  commit-graph.c                       | 27 ++++++++++++++++---
>  commit-graph.h                       |  1 +
>  t/t4216-log-bloom.sh                 | 19 ++++++++++++++
>  7 files changed, 102 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt
> index cff0797b54..4582c39fc4 100644
> --- a/Documentation/config/commitgraph.txt
> +++ b/Documentation/config/commitgraph.txt
> @@ -1,3 +1,7 @@
> +commitGraph.maxNewFilters::
> +	Specifies the default value for the `--max-new-filters` option of `git
> +	commit-graph write` (c.f., linkgit:git-commit-graph[1]).
> +
>  commitGraph.readChangedPaths::
>  	If true, then git will use the changed-path Bloom filters in the
>  	commit-graph file (if it exists, and they are present). Defaults to
> diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
> index 17405c73a9..9c887d5d79 100644
> --- a/Documentation/git-commit-graph.txt
> +++ b/Documentation/git-commit-graph.txt
> @@ -67,6 +67,10 @@ this option is given, future commit-graph writes will automatically assume
>  that this option was intended. Use `--no-changed-paths` to stop storing this
>  data.
>  +
> +With the `--max-new-filters=<n>` option, generate at most `n` new Bloom
> +filters (if `--changed-paths` is specified). If `n` is `-1`, no limit is
> +enforced. Overrides the `commitGraph.maxNewFilters` configuration.

I think this description should also detail what happens with those
commits for which no modified path Bloom filters are calculated, and
which commands will calculate them (even implicitly).

> ++
>  With the `--split[=<strategy>]` option, write the commit-graph as a
>  chain of multiple commit-graph files stored in
>  `<dir>/info/commit-graphs`. Commit-graph layers are merged based on the
> diff --git a/bloom.c b/bloom.c
> index ed54e96e57..8d07209c6b 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -51,6 +51,21 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
>  	else
>  		start_index = 0;
>  
> +	if ((start_index == end_index) &&
> +	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
> +		/*
> +		 * If the filter is zero-length, either (1) the filter has no
> +		 * changes, (2) the filter has too many changes, or (3) it
> +		 * wasn't computed (eg., due to '--max-new-filters').
> +		 *
> +		 * If either (1) or (2) is the case, the 'large' bit will be set
> +		 * for this Bloom filter. If it is unset, then it wasn't
> +		 * computed. In that case, return nothing, since we don't have
> +		 * that filter in the graph.
> +		 */
> +		return 0;
> +	}
> +
>  	filter->len = end_index - start_index;
>  	filter->data = (unsigned char *)(g->chunk_bloom_data +
>  					sizeof(unsigned char) * start_index +
> diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
> index 38f5f57d15..3500a6e1f1 100644
> --- a/builtin/commit-graph.c
> +++ b/builtin/commit-graph.c
> @@ -13,7 +13,8 @@ static char const * const builtin_commit_graph_usage[] = {
>  	N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
>  	N_("git commit-graph write [--object-dir <objdir>] [--append] "
>  	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
> -	   "[--changed-paths] [--[no-]progress] <split options>"),
> +	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
> +	   "<split options>"),
>  	NULL
>  };
>  
> @@ -25,7 +26,8 @@ static const char * const builtin_commit_graph_verify_usage[] = {
>  static const char * const builtin_commit_graph_write_usage[] = {
>  	N_("git commit-graph write [--object-dir <objdir>] [--append] "
>  	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
> -	   "[--changed-paths] [--[no-]progress] <split options>"),
> +	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
> +	   "<split options>"),
>  	NULL
>  };
>  
> @@ -162,6 +164,23 @@ static int read_one_commit(struct oidset *commits, struct progress *progress,
>  	return 0;
>  }
>  
> +static int write_option_max_new_filters(const struct option *opt,
> +					const char *arg,
> +					int unset)
> +{
> +	int *to = opt->value;
> +	if (unset)
> +		*to = -1;
> +	else {
> +		const char *s;
> +		*to = strtol(arg, (char **)&s, 10);
> +		if (*s)
> +			return error(_("%s expects a numerical value"),
> +				     optname(opt, opt->flags));
> +	}
> +	return 0;
> +}
> +
>  static int graph_write(int argc, const char **argv)
>  {
>  	struct string_list pack_indexes = STRING_LIST_INIT_NODUP;
> @@ -197,6 +216,9 @@ static int graph_write(int argc, const char **argv)
>  			N_("maximum ratio between two levels of a split commit-graph")),
>  		OPT_EXPIRY_DATE(0, "expire-time", &write_opts.expire_time,
>  			N_("only expire files older than a given date-time")),
> +		OPT_CALLBACK_F(0, "max-new-filters", &write_opts.max_new_filters,
> +			NULL, N_("maximum number of changed-path Bloom filters to compute"),
> +			0, write_option_max_new_filters),
>  		OPT_END(),
>  	};
>  
> @@ -205,6 +227,7 @@ static int graph_write(int argc, const char **argv)
>  	write_opts.size_multiple = 2;
>  	write_opts.max_commits = 0;
>  	write_opts.expire_time = 0;
> +	write_opts.max_new_filters = -1;
>  
>  	trace2_cmd_mode("write");
>  
> @@ -270,6 +293,16 @@ static int graph_write(int argc, const char **argv)
>  	return result;
>  }
>  
> +static int git_commit_graph_config(const char *var, const char *value, void *cb)
> +{
> +	if (!strcmp(var, "commitgraph.maxnewfilters")) {
> +		write_opts.max_new_filters = git_config_int(var, value);
> +		return 0;
> +	}
> +
> +	return git_default_config(var, value, cb);
> +}
> +
>  int cmd_commit_graph(int argc, const char **argv, const char *prefix)
>  {
>  	static struct option builtin_commit_graph_options[] = {
> @@ -283,7 +316,7 @@ int cmd_commit_graph(int argc, const char **argv, const char *prefix)
>  		usage_with_options(builtin_commit_graph_usage,
>  				   builtin_commit_graph_options);
>  
> -	git_config(git_default_config, NULL);
> +	git_config(git_commit_graph_config, &opts);
>  	argc = parse_options(argc, argv, prefix,
>  			     builtin_commit_graph_options,
>  			     builtin_commit_graph_usage,
> diff --git a/commit-graph.c b/commit-graph.c
> index 6886f319a5..4aae5471e3 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -954,7 +954,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
>  }
>  
>  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> -					   const struct commit *c)
> +					   const struct commit *c,
> +					   uint32_t max_changed_paths)
>  {
>  	uint32_t graph_pos = commit_graph_position(c);
>  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> @@ -965,6 +966,17 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
>  
>  	if (!(g && g->bloom_large))
>  		return 0;
> +	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
> +		/*
> +		 * Force all commits which are subject to a different
> +		 * 'max_changed_paths' limit to be recomputed from scratch.
> +		 *
> +		 * Note that this could likely be improved, but is ignored since
> +		 * all real-world graphs set the maximum number of changed paths
> +		 * at 512.

I don't understand what the second part of this comment is trying to
say; and real-world graphs might very well contain Bloom filters with
more than 512 modified paths, because the applying that limit was
buggy.

> +		 */
> +		return 0;
> +	}
>  	return bitmap_get(g->bloom_large, graph_pos - g->num_commits_in_base);
>  }
>  
> @@ -1470,6 +1482,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  	int i;
>  	struct progress *progress = NULL;
>  	int *sorted_commits;
> +	int max_new_filters;
>  
>  	init_bloom_filters();
>  	ctx->bloom_large = bitmap_word_alloc(ctx->commits.nr / BITS_IN_EWORD + 1);
> @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
>  		&ctx->commits);
>  
> +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> +		ctx->opts->max_new_filters : ctx->commits.nr;
> +
>  	for (i = 0; i < ctx->commits.nr; i++) {
>  		int pos = sorted_commits[i];
>  		struct commit *c = ctx->commits.list[pos];
> -		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph, c)) {
> +		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph,
> +						    c,
> +						    ctx->bloom_settings->max_changed_paths)) {
>  			bitmap_set(ctx->bloom_large, pos);
>  			ctx->count_bloom_filter_known_large++;
>  		} else {
> @@ -1497,7 +1515,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  			struct bloom_filter *filter = get_or_compute_bloom_filter(
>  				ctx->r,
>  				c,
> -				1,
> +				ctx->count_bloom_filter_computed < max_new_filters,
>  				ctx->bloom_settings,
>  				&computed);
>  			if (computed) {
> @@ -1507,7 +1525,8 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  					ctx->count_bloom_filter_found_large++;
>  				}
>  			}
> -			ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
> +			if (filter)
> +				ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
>  		}
>  		display_progress(progress, i + 1);
>  	}
> diff --git a/commit-graph.h b/commit-graph.h
> index af08c4505d..75ef83708b 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -114,6 +114,7 @@ struct commit_graph_opts {
>  	int max_commits;
>  	timestamp_t expire_time;
>  	enum commit_graph_split_flags flags;
> +	int max_new_filters;
>  };
>  
>  /*
> diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
> index 6859d85369..3aab8ffbe3 100755
> --- a/t/t4216-log-bloom.sh
> +++ b/t/t4216-log-bloom.sh
> @@ -286,4 +286,23 @@ test_expect_success 'Bloom generation does not recompute too-large filters' '
>  	)
>  '
>  
> +test_expect_success 'Bloom generation is limited by --max-new-filters' '
> +	(
> +		cd limits &&
> +		test_commit c2 filter &&
> +		test_commit c3 filter &&
> +		test_commit c4 no-filter &&
> +		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=2" \
> +			2 0 2
> +	)
> +'
> +
> +test_expect_success 'Bloom generation backfills previously-skipped filters' '
> +	(
> +		cd limits &&
> +		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=1" \
> +			2 0 1
> +	)
> +'
> +
>  test_done
> -- 
> 2.28.0.rc1.13.ge78abce653
SZEDER Gábor Aug. 19, 2020, 8:20 a.m. UTC | #7
On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> Introduce a command-line flag and configuration variable to fill in the
> 'max_new_filters' variable introduced by the previous patch.
> 
> The command-line option '--max-new-filters' takes precedence over
> 'commitGraph.maxNewFilters', which is the default value.
> '--no-max-new-filters' can also be provided, which sets the value back
> to '-1', indicating that an unlimited number of new Bloom filters may be
> generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> '0', hence a custom callback was used instead).

Forgot the most important thing: Why?  Please explain in the commit
message why this option is necesary, what problems does it solve,
how it is supposed to interact with other options and why so.
SZEDER Gábor Sept. 1, 2020, 2:36 p.m. UTC | #8
On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> Introduce a command-line flag and configuration variable to fill in the
> 'max_new_filters' variable introduced by the previous patch.
> 
> The command-line option '--max-new-filters' takes precedence over
> 'commitGraph.maxNewFilters', which is the default value.
> '--no-max-new-filters' can also be provided, which sets the value back
> to '-1', indicating that an unlimited number of new Bloom filters may be
> generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> '0', hence a custom callback was used instead).
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  Documentation/config/commitgraph.txt |  4 +++
>  Documentation/git-commit-graph.txt   |  4 +++
>  bloom.c                              | 15 +++++++++++
>  builtin/commit-graph.c               | 39 +++++++++++++++++++++++++---
>  commit-graph.c                       | 27 ++++++++++++++++---
>  commit-graph.h                       |  1 +
>  t/t4216-log-bloom.sh                 | 19 ++++++++++++++
>  7 files changed, 102 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt
> index cff0797b54..4582c39fc4 100644
> --- a/Documentation/config/commitgraph.txt
> +++ b/Documentation/config/commitgraph.txt
> @@ -1,3 +1,7 @@
> +commitGraph.maxNewFilters::
> +	Specifies the default value for the `--max-new-filters` option of `git
> +	commit-graph write` (c.f., linkgit:git-commit-graph[1]).
> +
>  commitGraph.readChangedPaths::
>  	If true, then git will use the changed-path Bloom filters in the
>  	commit-graph file (if it exists, and they are present). Defaults to
> diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
> index 17405c73a9..9c887d5d79 100644
> --- a/Documentation/git-commit-graph.txt
> +++ b/Documentation/git-commit-graph.txt
> @@ -67,6 +67,10 @@ this option is given, future commit-graph writes will automatically assume
>  that this option was intended. Use `--no-changed-paths` to stop storing this
>  data.
>  +
> +With the `--max-new-filters=<n>` option, generate at most `n` new Bloom
> +filters (if `--changed-paths` is specified). If `n` is `-1`, no limit is
> +enforced. Overrides the `commitGraph.maxNewFilters` configuration.
> ++
>  With the `--split[=<strategy>]` option, write the commit-graph as a
>  chain of multiple commit-graph files stored in
>  `<dir>/info/commit-graphs`. Commit-graph layers are merged based on the
> diff --git a/bloom.c b/bloom.c
> index ed54e96e57..8d07209c6b 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -51,6 +51,21 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
>  	else
>  		start_index = 0;
>  
> +	if ((start_index == end_index) &&
> +	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
> +		/*
> +		 * If the filter is zero-length, either (1) the filter has no
> +		 * changes, (2) the filter has too many changes, or (3) it
> +		 * wasn't computed (eg., due to '--max-new-filters').
> +		 *
> +		 * If either (1) or (2) is the case, the 'large' bit will be set
> +		 * for this Bloom filter. If it is unset, then it wasn't
> +		 * computed. In that case, return nothing, since we don't have
> +		 * that filter in the graph.
> +		 */
> +		return 0;
> +	}
> +
>  	filter->len = end_index - start_index;
>  	filter->data = (unsigned char *)(g->chunk_bloom_data +
>  					sizeof(unsigned char) * start_index +
> diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
> index 38f5f57d15..3500a6e1f1 100644
> --- a/builtin/commit-graph.c
> +++ b/builtin/commit-graph.c
> @@ -13,7 +13,8 @@ static char const * const builtin_commit_graph_usage[] = {
>  	N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
>  	N_("git commit-graph write [--object-dir <objdir>] [--append] "
>  	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
> -	   "[--changed-paths] [--[no-]progress] <split options>"),
> +	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
> +	   "<split options>"),
>  	NULL
>  };
>  
> @@ -25,7 +26,8 @@ static const char * const builtin_commit_graph_verify_usage[] = {
>  static const char * const builtin_commit_graph_write_usage[] = {
>  	N_("git commit-graph write [--object-dir <objdir>] [--append] "
>  	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
> -	   "[--changed-paths] [--[no-]progress] <split options>"),
> +	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
> +	   "<split options>"),
>  	NULL
>  };
>  
> @@ -162,6 +164,23 @@ static int read_one_commit(struct oidset *commits, struct progress *progress,
>  	return 0;
>  }
>  
> +static int write_option_max_new_filters(const struct option *opt,
> +					const char *arg,
> +					int unset)
> +{
> +	int *to = opt->value;
> +	if (unset)
> +		*to = -1;
> +	else {
> +		const char *s;
> +		*to = strtol(arg, (char **)&s, 10);
> +		if (*s)
> +			return error(_("%s expects a numerical value"),
> +				     optname(opt, opt->flags));
> +	}
> +	return 0;
> +}
> +
>  static int graph_write(int argc, const char **argv)
>  {
>  	struct string_list pack_indexes = STRING_LIST_INIT_NODUP;
> @@ -197,6 +216,9 @@ static int graph_write(int argc, const char **argv)
>  			N_("maximum ratio between two levels of a split commit-graph")),
>  		OPT_EXPIRY_DATE(0, "expire-time", &write_opts.expire_time,
>  			N_("only expire files older than a given date-time")),
> +		OPT_CALLBACK_F(0, "max-new-filters", &write_opts.max_new_filters,
> +			NULL, N_("maximum number of changed-path Bloom filters to compute"),
> +			0, write_option_max_new_filters),
>  		OPT_END(),
>  	};
>  
> @@ -205,6 +227,7 @@ static int graph_write(int argc, const char **argv)
>  	write_opts.size_multiple = 2;
>  	write_opts.max_commits = 0;
>  	write_opts.expire_time = 0;
> +	write_opts.max_new_filters = -1;
>  
>  	trace2_cmd_mode("write");
>  
> @@ -270,6 +293,16 @@ static int graph_write(int argc, const char **argv)
>  	return result;
>  }
>  
> +static int git_commit_graph_config(const char *var, const char *value, void *cb)
> +{
> +	if (!strcmp(var, "commitgraph.maxnewfilters")) {
> +		write_opts.max_new_filters = git_config_int(var, value);
> +		return 0;
> +	}
> +
> +	return git_default_config(var, value, cb);
> +}
> +
>  int cmd_commit_graph(int argc, const char **argv, const char *prefix)
>  {
>  	static struct option builtin_commit_graph_options[] = {
> @@ -283,7 +316,7 @@ int cmd_commit_graph(int argc, const char **argv, const char *prefix)
>  		usage_with_options(builtin_commit_graph_usage,
>  				   builtin_commit_graph_options);
>  
> -	git_config(git_default_config, NULL);
> +	git_config(git_commit_graph_config, &opts);
>  	argc = parse_options(argc, argv, prefix,
>  			     builtin_commit_graph_options,
>  			     builtin_commit_graph_usage,
> diff --git a/commit-graph.c b/commit-graph.c
> index 6886f319a5..4aae5471e3 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -954,7 +954,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
>  }
>  
>  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> -					   const struct commit *c)
> +					   const struct commit *c,
> +					   uint32_t max_changed_paths)
>  {
>  	uint32_t graph_pos = commit_graph_position(c);
>  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> @@ -965,6 +966,17 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
>  
>  	if (!(g && g->bloom_large))
>  		return 0;
> +	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
> +		/*
> +		 * Force all commits which are subject to a different
> +		 * 'max_changed_paths' limit to be recomputed from scratch.
> +		 *
> +		 * Note that this could likely be improved, but is ignored since
> +		 * all real-world graphs set the maximum number of changed paths
> +		 * at 512.
> +		 */
> +		return 0;
> +	}
>  	return bitmap_get(g->bloom_large, graph_pos - g->num_commits_in_base);
>  }
>  
> @@ -1470,6 +1482,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  	int i;
>  	struct progress *progress = NULL;
>  	int *sorted_commits;
> +	int max_new_filters;
>  
>  	init_bloom_filters();
>  	ctx->bloom_large = bitmap_word_alloc(ctx->commits.nr / BITS_IN_EWORD + 1);
> @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
>  		&ctx->commits);
>  
> +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> +		ctx->opts->max_new_filters : ctx->commits.nr;
> +
>  	for (i = 0; i < ctx->commits.nr; i++) {
>  		int pos = sorted_commits[i];
>  		struct commit *c = ctx->commits.list[pos];
> -		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph, c)) {
> +		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph,
> +						    c,
> +						    ctx->bloom_settings->max_changed_paths)) {
>  			bitmap_set(ctx->bloom_large, pos);
>  			ctx->count_bloom_filter_known_large++;
>  		} else {
> @@ -1497,7 +1515,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  			struct bloom_filter *filter = get_or_compute_bloom_filter(
>  				ctx->r,
>  				c,
> -				1,
> +				ctx->count_bloom_filter_computed < max_new_filters,
>  				ctx->bloom_settings,
>  				&computed);
>  			if (computed) {
> @@ -1507,7 +1525,8 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  					ctx->count_bloom_filter_found_large++;
>  				}
>  			}
> -			ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
> +			if (filter)
> +				ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
>  		}
>  		display_progress(progress, i + 1);
>  	}
> diff --git a/commit-graph.h b/commit-graph.h
> index af08c4505d..75ef83708b 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -114,6 +114,7 @@ struct commit_graph_opts {
>  	int max_commits;
>  	timestamp_t expire_time;
>  	enum commit_graph_split_flags flags;
> +	int max_new_filters;
>  };
>  
>  /*
> diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
> index 6859d85369..3aab8ffbe3 100755
> --- a/t/t4216-log-bloom.sh
> +++ b/t/t4216-log-bloom.sh
> @@ -286,4 +286,23 @@ test_expect_success 'Bloom generation does not recompute too-large filters' '
>  	)
>  '
>  
> +test_expect_success 'Bloom generation is limited by --max-new-filters' '
> +	(
> +		cd limits &&
> +		test_commit c2 filter &&
> +		test_commit c3 filter &&
> +		test_commit c4 no-filter &&
> +		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=2" \
> +			2 0 2
> +	)
> +'
> +
> +test_expect_success 'Bloom generation backfills previously-skipped filters' '
> +	(
> +		cd limits &&
> +		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=1" \
> +			2 0 1
> +	)
> +'
> +
>  test_done
> -- 
> 2.28.0.rc1.13.ge78abce653

Something seems to be wrong in this patch, though I haven't looked
closer.  Consider this test with a bit of makeshift tracing:

  ---  >8  ---

diff --git a/bloom.c b/bloom.c
index 8d07209c6b..1a0dec35cd 100644
--- a/bloom.c
+++ b/bloom.c
@@ -222,6 +222,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 	if (!compute_if_not_present)
 		return NULL;
 
+	printf("get_or_compute_bloom_filter(): diff: %s\n", oid_to_hex(&c->object.oid));
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.detect_rename = 0;
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 0000000000..0833e6ff7e
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'test' '
+	for i in 1 2 3 4 5 6
+	do
+		git commit -q --allow-empty -m $i || return 1
+	done &&
+	git log --oneline &&
+
+	# We have 6 commits and compute 2 Bloom filters per execution,
+	# so 3 executions should be enough...  but, alas, it isnt.
+	for i in 1 2 3 4 5
+	do
+		# No --split=replace!
+		git commit-graph write --reachable --changed-paths --max-new-filters=2 || return 1
+	done &&
+
+	git commit-graph write --reachable --changed-paths --max-new-filters=4
+'
+
+test_done

  ---  >8  ---

It's output looks like:

  [...]
  + git log --oneline
  13fcefa (HEAD -> master) 6
  0c71516 5
  a82a61c 4
  54c6b2c 3
  fc99def 2
  a3a8cd3 1
  + git commit-graph write --reachable --changed-paths --max-new-filters=2
  get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
  get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
  + git commit-graph write --reachable --changed-paths --max-new-filters=2
  get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
  get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
  + git commit-graph write --reachable --changed-paths --max-new-filters=2
  get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
  get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
  + git commit-graph write --reachable --changed-paths --max-new-filters=2
  get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
  get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
  + git commit-graph write --reachable --changed-paths --max-new-filters=2
  get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
  get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
  + git commit-graph write --reachable --changed-paths
  get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
  get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
  get_or_compute_bloom_filter(): diff: a82a61c79b2b07c4440e292613e11a69e33ef7a2
  get_or_compute_bloom_filter(): diff: fc99def8b1df27bcab7d1f4b7ced73239f9bd7ec

See how the third write with '--max-new-filters=2' computes the
filters that have already been computed by the first write instead of
those two that have never been computed?  And then how the fourth
write computes filters that have already been computed by the second
write?  And ultimately we'll need a write without '--max-new-filters' (or
with '--max-new-filters=<large-enough>') to compute all remaining
filters.

With '--split=replace' it appears to work as expected.
Taylor Blau Sept. 2, 2020, 9:03 p.m. UTC | #9
On Tue, Aug 18, 2020 at 12:50:04AM +0200, SZEDER Gábor wrote:
> On Fri, Aug 14, 2020 at 04:20:21PM -0400, Taylor Blau wrote:
> > > > @@ -1486,10 +1499,15 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
> > > >  		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
> > > >  		&ctx->commits);
> > > >
> > > > +	max_new_filters = ctx->opts->max_new_filters >= 0 ?
> > > > +		ctx->opts->max_new_filters : ctx->commits.nr;
> > >
> > > git_test_write_commit_graph_or_die() invokes
> > > write_commit_graph_reachable() with opts=0x0, so 'ctx->opts' is NULL,
> > > and we get segfault.  This breaks a lot of tests when run with
> > > GIT_TEST_COMMIT_GRAPH=1 GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1.
> >
> > Great catch, thanks. Fixing this is as simple as adding 'ctx->opts &&'
> > right before we dereference 'ctx->opts', since setting this variable
> > equal to 'ctx->commits.nr' is the right thing to do in that case.
>
> That would avoid the segfault, sure, but I would rather see all
> callers of write_commit_graph{_reachable}() passing a valid opts
> instance.  Just like we don't call the diff machinery with a NULL
> diff_options, or the revision walking machinery with a NULL rev_info.

I wouldn't mind that either, but this is definitely a common pattern
throughout the commit-graph machinery. So, if/when we do get away from
it, I'd rather do so uniformly than in some spots.

> > Unrelated to this comment, I am hoping to send out a final version of
> > this series sometime next week so that we can keep moving forward with
> > Bloom filter improvements.
> >
> > Have you had a chance to review the rest of the patches? I'll happily
> > wait until you have had a chance to do so before sending v5 so that we
>
> v5?  This is v3, and I'm unable to a find a v4.

Sorry, I clearly had too much on my mind when I was writing this ;). I'm
hopeful that with your careful review that v4 will be the last of this
topic.

Thanks,
Taylor
Taylor Blau Sept. 3, 2020, 4:35 p.m. UTC | #10
On Wed, Aug 19, 2020 at 12:23:29AM +0200, SZEDER Gábor wrote:
> On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> > Introduce a command-line flag and configuration variable to fill in the
> > 'max_new_filters' variable introduced by the previous patch.
>
> 'max_new_filters' is introduced in this patch.
>
> > The command-line option '--max-new-filters' takes precedence over
> > 'commitGraph.maxNewFilters', which is the default value.
>
> What "filters"?  While misnamed, the '--changed-paths' options did a
> good job at hiding the implementation detail that we use Bloom filters
> to speed up pathspec-limited revision walks; as far as I remember this
> was a conscious design decision.  Including "filter" in the name of
> the option and corresponding config variable goes against this
> decision.  Furthermore, by not being specific we might end up in abad
> situation when adding some other filters to the commit-graph format.
> Unfortunately, I can't offhand propose a better option name, all my
> ideas were horrible.

I don't disagree, but I can't come up with a better name either.
maxNewChangedPaths? maxNewSomething? I don't know. Usually I think
exposing this sort of detail is smelly, but I'm not so bothered by it
here. Similarly, this thread has been around for a while, and nobody
else has suggested a better name, so, I'm inclined to keep it.

> > +With the `--max-new-filters=<n>` option, generate at most `n` new Bloom
> > +filters (if `--changed-paths` is specified). If `n` is `-1`, no limit is
> > +enforced. Overrides the `commitGraph.maxNewFilters` configuration.
>
> I think this description should also detail what happens with those
> commits for which no modified path Bloom filters are calculated, and
> which commands will calculate them (even implicitly).

OK, sure.

> >  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> > -					   const struct commit *c)
> > +					   const struct commit *c,
> > +					   uint32_t max_changed_paths)
> >  {
> >  	uint32_t graph_pos = commit_graph_position(c);
> >  	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
> > @@ -965,6 +966,17 @@ static int get_bloom_filter_large_in_graph(struct commit_graph *g,
> >
> >  	if (!(g && g->bloom_large))
> >  		return 0;
> > +	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
> > +		/*
> > +		 * Force all commits which are subject to a different
> > +		 * 'max_changed_paths' limit to be recomputed from scratch.
> > +		 *
> > +		 * Note that this could likely be improved, but is ignored since
> > +		 * all real-world graphs set the maximum number of changed paths
> > +		 * at 512.
>
> I don't understand what the second part of this comment is trying to
> say; and real-world graphs might very well contain Bloom filters with
> more than 512 modified paths, because the applying that limit was
> buggy.

I'm trying to say that there is room for us to improve when we do and
don't recompute filters when the limit on the number of changed paths
deviates between layers, but that such deviations don't currently exist
in the wild.

Bugs are another thing, but we can't tell that they exist without
recomputing every filter.

Thanks,
Taylor
Taylor Blau Sept. 3, 2020, 4:42 p.m. UTC | #11
On Wed, Aug 19, 2020 at 10:20:21AM +0200, SZEDER Gábor wrote:
> On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> > Introduce a command-line flag and configuration variable to fill in the
> > 'max_new_filters' variable introduced by the previous patch.
> >
> > The command-line option '--max-new-filters' takes precedence over
> > 'commitGraph.maxNewFilters', which is the default value.
> > '--no-max-new-filters' can also be provided, which sets the value back
> > to '-1', indicating that an unlimited number of new Bloom filters may be
> > generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> > '0', hence a custom callback was used instead).
>
> Forgot the most important thing: Why?  Please explain in the commit
> message why this option is necesary, what problems does it solve,
> how it is supposed to interact with other options and why so.

This is already explained in detail in the patch 'commit-graph: add
large-filters bitmap chunk', although there is an error in the quoted
part of your email (which I wrote) which refers the reader to the
previous patch. The patch I'm actually referring two is the
twice-previous patch.

I'll fix that locally before re-sending.

Thanks,
Taylor
Taylor Blau Sept. 3, 2020, 6:49 p.m. UTC | #12
On Tue, Sep 01, 2020 at 04:36:50PM +0200, SZEDER Gábor wrote:
> Something seems to be wrong in this patch, though I haven't looked
> closer.  Consider this test with a bit of makeshift tracing:
>
>   ---  >8  ---
>
> diff --git a/bloom.c b/bloom.c
> index 8d07209c6b..1a0dec35cd 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -222,6 +222,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
>  	if (!compute_if_not_present)
>  		return NULL;
>
> +	printf("get_or_compute_bloom_filter(): diff: %s\n", oid_to_hex(&c->object.oid));
>  	repo_diff_setup(r, &diffopt);
>  	diffopt.flags.recursive = 1;
>  	diffopt.detect_rename = 0;
> diff --git a/t/t9999-test.sh b/t/t9999-test.sh
> new file mode 100755
> index 0000000000..0833e6ff7e
> --- /dev/null
> +++ b/t/t9999-test.sh
> @@ -0,0 +1,25 @@
> +#!/bin/sh
> +
> +test_description='test'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'test' '
> +	for i in 1 2 3 4 5 6
> +	do
> +		git commit -q --allow-empty -m $i || return 1
> +	done &&
> +	git log --oneline &&
> +
> +	# We have 6 commits and compute 2 Bloom filters per execution,
> +	# so 3 executions should be enough...  but, alas, it isnt.
> +	for i in 1 2 3 4 5
> +	do
> +		# No --split=replace!
> +		git commit-graph write --reachable --changed-paths --max-new-filters=2 || return 1
> +	done &&
> +
> +	git commit-graph write --reachable --changed-paths --max-new-filters=4
> +'
> +
> +test_done
>
>   ---  >8  ---
>
> It's output looks like:
>
>   [...]
>   + git log --oneline
>   13fcefa (HEAD -> master) 6
>   0c71516 5
>   a82a61c 4
>   54c6b2c 3
>   fc99def 2
>   a3a8cd3 1
>   + git commit-graph write --reachable --changed-paths --max-new-filters=2
>   get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
>   get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
>   + git commit-graph write --reachable --changed-paths --max-new-filters=2
>   get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
>   get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
>   + git commit-graph write --reachable --changed-paths --max-new-filters=2
>   get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
>   get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
>   + git commit-graph write --reachable --changed-paths --max-new-filters=2
>   get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
>   get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
>   + git commit-graph write --reachable --changed-paths --max-new-filters=2
>   get_or_compute_bloom_filter(): diff: 0c71516945bf0a23813e80205961d29ebc1020e0
>   get_or_compute_bloom_filter(): diff: 13fcefa4bb859a15c4edc6bb01b8b6c91b4f32b6
>   + git commit-graph write --reachable --changed-paths
>   get_or_compute_bloom_filter(): diff: 54c6b2cd4fb50066683a197cc6d677689618505a
>   get_or_compute_bloom_filter(): diff: a3a8cd3c82028671bf51502d77277baf14a2f528
>   get_or_compute_bloom_filter(): diff: a82a61c79b2b07c4440e292613e11a69e33ef7a2
>   get_or_compute_bloom_filter(): diff: fc99def8b1df27bcab7d1f4b7ced73239f9bd7ec
>
> See how the third write with '--max-new-filters=2' computes the
> filters that have already been computed by the first write instead of
> those two that have never been computed?  And then how the fourth
> write computes filters that have already been computed by the second
> write?  And ultimately we'll need a write without '--max-new-filters' (or
> with '--max-new-filters=<large-enough>') to compute all remaining
> filters.

Ouch. This definitely has to do with the empty commits, since swapping
out your 'git commit ... --allow-empty' for a 'test_commit' produces the
output that you'd expect.

> With '--split=replace' it appears to work as expected.

This is definitely the critical bit. The crux of the issue is that
'copy_oids_to_commits()' handles split and non-split graphs differently.
The critical bits here are:

  * 43d3561805 (commit-graph write: don't die if the existing graph is
    corrupt, 2019-03-25) which forces the relevant data to *not* be
    loaded from an existing commit-graph, and

  * 8a6ac287b2 (builtin/commit-graph.c: introduce split strategy
    'replace', 2020-04-13), which does load data from an existing
    commit-graph with '--split=replace'.

When writing a graph with '--split=replace', commits are loaded from the
graph, which includes setting their '->graph_pos' (or rather setting
this data in a commit slab, which is I guess how it's done these days).
Without '--split=replace', the graph position will never be set.

So, by the time we get to 'get_bloom_filter_large_in_graph', the graph
position is 'COMMIT_NOT_FROM_GRAPH', which in turn forces us to
recompute the filter from scratch, since we assume that being
'NOT_FROM_GRAPH' implies that we won't find it in any 'BFXL' chunk.

Regardless of whether or not we should be trusting the parentage
information on-disk, recomputing the Bloom filters from scratch is
simply too expensive (and the opposite of the point of this series). So,
doing the following to force 'get_bloom_filter_large_in_graph' to lookup
the Bloom and BFXL data in a commit graph by forcibly loading its graph
position is the right thing to do.

This is sufficient to get us unstuck:

diff --git a/commit-graph.c b/commit-graph.c
index bec4e5b725..243c7253ff 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -956,8 +956,8 @@ int get_bloom_filter_large_in_graph(struct commit_graph *g,
                                    const struct commit *c,
                                    uint32_t max_changed_paths)
 {
-       uint32_t graph_pos = commit_graph_position(c);
-       if (graph_pos == COMMIT_NOT_FROM_GRAPH)
+       uint32_t graph_pos;
+       if (!find_commit_in_graph(c, g, &graph_pos))
                return 0;

        while (g && graph_pos < g->num_commits_in_base)

...but adding something like:

diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index a7a3b41919..571676cef2 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -310,4 +310,29 @@ test_expect_success 'Bloom generation backfills previously-skipped filters' '
        )
 '

+test_expect_success 'Bloom generation backfills empty commits' '
+       git init empty &&
+       test_when_finished "rm -fr empty" &&
+       (
+               cd empty &&
+               for i in $(test_seq 1 6)
+               do
+                       git commit --allow-empty -m "$i"
+               done &&
+
+               # Generate Bloom filters for empty commits 1-6, two at a time.
+               test_bloom_filters_computed "--reachable --changed-paths --max-new-filters=2" \
+                       0 2 2 &&
+               test_bloom_filters_computed "--reachable --changed-paths --max-new-filters=2" \
+                       2 2 2 &&
+               test_bloom_filters_computed "--reachable --changed-paths --max-new-filters=2" \
+                       4 2 2 &&
+
+               # Finally, make sure that once all commits have filters, that
+               # none are subsequently recomputed.
+               test_bloom_filters_computed "--reachable --changed-paths --max-new-filters=2" \
+                       6 0 0
+       )
+'
+
 test_done

is a good idea to harden what you wrote in your t9999 into an actual
test to prevent against regression. I'll fold both of those into this
patch.

Thanks for the bug report. It led to an interesting investigation as a
result :).

Thanks,
Taylor
SZEDER Gábor Sept. 4, 2020, 8:50 a.m. UTC | #13
On Thu, Sep 03, 2020 at 12:42:25PM -0400, Taylor Blau wrote:
> On Wed, Aug 19, 2020 at 10:20:21AM +0200, SZEDER Gábor wrote:
> > On Tue, Aug 11, 2020 at 04:52:14PM -0400, Taylor Blau wrote:
> > > Introduce a command-line flag and configuration variable to fill in the
> > > 'max_new_filters' variable introduced by the previous patch.
> > >
> > > The command-line option '--max-new-filters' takes precedence over
> > > 'commitGraph.maxNewFilters', which is the default value.
> > > '--no-max-new-filters' can also be provided, which sets the value back
> > > to '-1', indicating that an unlimited number of new Bloom filters may be
> > > generated. (OPT_INTEGER only allows setting the '--no-' variant back to
> > > '0', hence a custom callback was used instead).
> >
> > Forgot the most important thing: Why?  Please explain in the commit
> > message why this option is necesary, what problems does it solve,
> > how it is supposed to interact with other options and why so.
> 
> This is already explained in detail in the patch 'commit-graph: add
> large-filters bitmap chunk', although there is an error in the quoted
> part of your email (which I wrote) which refers the reader to the
> previous patch. The patch I'm actually referring two is the
> twice-previous patch.

The proposed log message of that patch only briefly mentions what this
option will do, and goes into detail about what problems it _causes_
and how that new chunk is supposed to solve _those_ problems.  It does
not explain what problems this new option is supposed to solve, and
why would anyone want to use this option.

> I'll fix that locally before re-sending.
> 
> Thanks,
> Taylor

Patch
diff mbox series

diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt
index cff0797b54..4582c39fc4 100644
--- a/Documentation/config/commitgraph.txt
+++ b/Documentation/config/commitgraph.txt
@@ -1,3 +1,7 @@ 
+commitGraph.maxNewFilters::
+	Specifies the default value for the `--max-new-filters` option of `git
+	commit-graph write` (c.f., linkgit:git-commit-graph[1]).
+
 commitGraph.readChangedPaths::
 	If true, then git will use the changed-path Bloom filters in the
 	commit-graph file (if it exists, and they are present). Defaults to
diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
index 17405c73a9..9c887d5d79 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -67,6 +67,10 @@  this option is given, future commit-graph writes will automatically assume
 that this option was intended. Use `--no-changed-paths` to stop storing this
 data.
 +
+With the `--max-new-filters=<n>` option, generate at most `n` new Bloom
+filters (if `--changed-paths` is specified). If `n` is `-1`, no limit is
+enforced. Overrides the `commitGraph.maxNewFilters` configuration.
++
 With the `--split[=<strategy>]` option, write the commit-graph as a
 chain of multiple commit-graph files stored in
 `<dir>/info/commit-graphs`. Commit-graph layers are merged based on the
diff --git a/bloom.c b/bloom.c
index ed54e96e57..8d07209c6b 100644
--- a/bloom.c
+++ b/bloom.c
@@ -51,6 +51,21 @@  static int load_bloom_filter_from_graph(struct commit_graph *g,
 	else
 		start_index = 0;
 
+	if ((start_index == end_index) &&
+	    (g->bloom_large && !bitmap_get(g->bloom_large, lex_pos))) {
+		/*
+		 * If the filter is zero-length, either (1) the filter has no
+		 * changes, (2) the filter has too many changes, or (3) it
+		 * wasn't computed (eg., due to '--max-new-filters').
+		 *
+		 * If either (1) or (2) is the case, the 'large' bit will be set
+		 * for this Bloom filter. If it is unset, then it wasn't
+		 * computed. In that case, return nothing, since we don't have
+		 * that filter in the graph.
+		 */
+		return 0;
+	}
+
 	filter->len = end_index - start_index;
 	filter->data = (unsigned char *)(g->chunk_bloom_data +
 					sizeof(unsigned char) * start_index +
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 38f5f57d15..3500a6e1f1 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -13,7 +13,8 @@  static char const * const builtin_commit_graph_usage[] = {
 	N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
 	N_("git commit-graph write [--object-dir <objdir>] [--append] "
 	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
-	   "[--changed-paths] [--[no-]progress] <split options>"),
+	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
+	   "<split options>"),
 	NULL
 };
 
@@ -25,7 +26,8 @@  static const char * const builtin_commit_graph_verify_usage[] = {
 static const char * const builtin_commit_graph_write_usage[] = {
 	N_("git commit-graph write [--object-dir <objdir>] [--append] "
 	   "[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
-	   "[--changed-paths] [--[no-]progress] <split options>"),
+	   "[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
+	   "<split options>"),
 	NULL
 };
 
@@ -162,6 +164,23 @@  static int read_one_commit(struct oidset *commits, struct progress *progress,
 	return 0;
 }
 
+static int write_option_max_new_filters(const struct option *opt,
+					const char *arg,
+					int unset)
+{
+	int *to = opt->value;
+	if (unset)
+		*to = -1;
+	else {
+		const char *s;
+		*to = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return error(_("%s expects a numerical value"),
+				     optname(opt, opt->flags));
+	}
+	return 0;
+}
+
 static int graph_write(int argc, const char **argv)
 {
 	struct string_list pack_indexes = STRING_LIST_INIT_NODUP;
@@ -197,6 +216,9 @@  static int graph_write(int argc, const char **argv)
 			N_("maximum ratio between two levels of a split commit-graph")),
 		OPT_EXPIRY_DATE(0, "expire-time", &write_opts.expire_time,
 			N_("only expire files older than a given date-time")),
+		OPT_CALLBACK_F(0, "max-new-filters", &write_opts.max_new_filters,
+			NULL, N_("maximum number of changed-path Bloom filters to compute"),
+			0, write_option_max_new_filters),
 		OPT_END(),
 	};
 
@@ -205,6 +227,7 @@  static int graph_write(int argc, const char **argv)
 	write_opts.size_multiple = 2;
 	write_opts.max_commits = 0;
 	write_opts.expire_time = 0;
+	write_opts.max_new_filters = -1;
 
 	trace2_cmd_mode("write");
 
@@ -270,6 +293,16 @@  static int graph_write(int argc, const char **argv)
 	return result;
 }
 
+static int git_commit_graph_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "commitgraph.maxnewfilters")) {
+		write_opts.max_new_filters = git_config_int(var, value);
+		return 0;
+	}
+
+	return git_default_config(var, value, cb);
+}
+
 int cmd_commit_graph(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_commit_graph_options[] = {
@@ -283,7 +316,7 @@  int cmd_commit_graph(int argc, const char **argv, const char *prefix)
 		usage_with_options(builtin_commit_graph_usage,
 				   builtin_commit_graph_options);
 
-	git_config(git_default_config, NULL);
+	git_config(git_commit_graph_config, &opts);
 	argc = parse_options(argc, argv, prefix,
 			     builtin_commit_graph_options,
 			     builtin_commit_graph_usage,
diff --git a/commit-graph.c b/commit-graph.c
index 6886f319a5..4aae5471e3 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -954,7 +954,8 @@  struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
 }
 
 static int get_bloom_filter_large_in_graph(struct commit_graph *g,
-					   const struct commit *c)
+					   const struct commit *c,
+					   uint32_t max_changed_paths)
 {
 	uint32_t graph_pos = commit_graph_position(c);
 	if (graph_pos == COMMIT_NOT_FROM_GRAPH)
@@ -965,6 +966,17 @@  static int get_bloom_filter_large_in_graph(struct commit_graph *g,
 
 	if (!(g && g->bloom_large))
 		return 0;
+	if (g->bloom_filter_settings->max_changed_paths != max_changed_paths) {
+		/*
+		 * Force all commits which are subject to a different
+		 * 'max_changed_paths' limit to be recomputed from scratch.
+		 *
+		 * Note that this could likely be improved, but is ignored since
+		 * all real-world graphs set the maximum number of changed paths
+		 * at 512.
+		 */
+		return 0;
+	}
 	return bitmap_get(g->bloom_large, graph_pos - g->num_commits_in_base);
 }
 
@@ -1470,6 +1482,7 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 	int i;
 	struct progress *progress = NULL;
 	int *sorted_commits;
+	int max_new_filters;
 
 	init_bloom_filters();
 	ctx->bloom_large = bitmap_word_alloc(ctx->commits.nr / BITS_IN_EWORD + 1);
@@ -1486,10 +1499,15 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 		ctx->order_by_pack ? commit_pos_cmp : commit_gen_cmp,
 		&ctx->commits);
 
+	max_new_filters = ctx->opts->max_new_filters >= 0 ?
+		ctx->opts->max_new_filters : ctx->commits.nr;
+
 	for (i = 0; i < ctx->commits.nr; i++) {
 		int pos = sorted_commits[i];
 		struct commit *c = ctx->commits.list[pos];
-		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph, c)) {
+		if (get_bloom_filter_large_in_graph(ctx->r->objects->commit_graph,
+						    c,
+						    ctx->bloom_settings->max_changed_paths)) {
 			bitmap_set(ctx->bloom_large, pos);
 			ctx->count_bloom_filter_known_large++;
 		} else {
@@ -1497,7 +1515,7 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 			struct bloom_filter *filter = get_or_compute_bloom_filter(
 				ctx->r,
 				c,
-				1,
+				ctx->count_bloom_filter_computed < max_new_filters,
 				ctx->bloom_settings,
 				&computed);
 			if (computed) {
@@ -1507,7 +1525,8 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 					ctx->count_bloom_filter_found_large++;
 				}
 			}
-			ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
+			if (filter)
+				ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
 		}
 		display_progress(progress, i + 1);
 	}
diff --git a/commit-graph.h b/commit-graph.h
index af08c4505d..75ef83708b 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -114,6 +114,7 @@  struct commit_graph_opts {
 	int max_commits;
 	timestamp_t expire_time;
 	enum commit_graph_split_flags flags;
+	int max_new_filters;
 };
 
 /*
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 6859d85369..3aab8ffbe3 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -286,4 +286,23 @@  test_expect_success 'Bloom generation does not recompute too-large filters' '
 	)
 '
 
+test_expect_success 'Bloom generation is limited by --max-new-filters' '
+	(
+		cd limits &&
+		test_commit c2 filter &&
+		test_commit c3 filter &&
+		test_commit c4 no-filter &&
+		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=2" \
+			2 0 2
+	)
+'
+
+test_expect_success 'Bloom generation backfills previously-skipped filters' '
+	(
+		cd limits &&
+		test_bloom_filters_computed "--reachable --changed-paths --split=replace --max-new-filters=1" \
+			2 0 1
+	)
+'
+
 test_done