diff mbox series

[v3,08/13] bloom: use provided 'struct bloom_filter_settings'

Message ID 3745baf8ef8810fe8e6031d06d2f6b8d967ef13c.1600397826.git.me@ttaylorr.com (mailing list archive)
State New, archived
Headers show
Series more miscellaneous Bloom filter improvements, redux | expand

Commit Message

Taylor Blau Sept. 18, 2020, 2:59 a.m. UTC
When 'get_or_compute_bloom_filter()' needs to compute a Bloom filter
from scratch, it looks to the default 'struct bloom_filter_settings' in
order to determine the maximum number of changed paths, number of bits
per entry, and so on.

All of these values have so far been constant, and so there was no need
to pass in a pointer from the caller (eg., the one that is stored in the
'struct write_commit_graph_context').

Start passing in a 'struct bloom_filter_settings *' instead of using the
default values to respect graph-specific settings (eg., in the case of
setting 'GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS').

In order to have an initialized value for these settings, move its
initialization to earlier in the commit-graph write.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c               | 13 ++++++-------
 bloom.h               |  3 ++-
 commit-graph.c        | 21 ++++++++++-----------
 t/helper/test-bloom.c |  1 +
 4 files changed, 19 insertions(+), 19 deletions(-)

Comments

SZEDER Gábor Sept. 18, 2020, 4:27 p.m. UTC | #1
On Thu, Sep 17, 2020 at 10:59:27PM -0400, Taylor Blau wrote:
> When 'get_or_compute_bloom_filter()' needs to compute a Bloom filter
> from scratch, it looks to the default 'struct bloom_filter_settings' in
> order to determine the maximum number of changed paths, number of bits
> per entry, and so on.
> 
> All of these values have so far been constant, and so there was no need
> to pass in a pointer from the caller (eg., the one that is stored in the
> 'struct write_commit_graph_context').
> 
> Start passing in a 'struct bloom_filter_settings *' instead of using the
> default values to respect graph-specific settings (eg., in the case of
> setting 'GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS').

I think this description misses the most important aspect of this
patch: it fixes, even if only partially, the half-broken fix in
0087a87ba8 (commit-graph: persist existence of changed-paths,
2020-07-01).

That commit, among other things, tried to make sure that we use the
same Bloom filter settings to compute new Bloom filters that have been
used in already existing filters.  However, it only read those
settings from the header of the existing BDAT chunk and wrote them to
the header of the new BDAT chunk (and printed them to trace2 output).
Unfortunately, it didn't actually use those settings to compute Bloom
filters, because it left get_bloom_filter() unchanged to use the
hardcoded default Bloom filter settings.  This can result in bogus
commits-graphs and, in turn, pathspec-limited revision walks omitting
commits that do modify the specified path.
Taylor Blau Sept. 18, 2020, 4:32 p.m. UTC | #2
On Fri, Sep 18, 2020 at 06:27:40PM +0200, SZEDER Gábor wrote:
> On Thu, Sep 17, 2020 at 10:59:27PM -0400, Taylor Blau wrote:
> > When 'get_or_compute_bloom_filter()' needs to compute a Bloom filter
> > from scratch, it looks to the default 'struct bloom_filter_settings' in
> > order to determine the maximum number of changed paths, number of bits
> > per entry, and so on.
> >
> > All of these values have so far been constant, and so there was no need
> > to pass in a pointer from the caller (eg., the one that is stored in the
> > 'struct write_commit_graph_context').
> >
> > Start passing in a 'struct bloom_filter_settings *' instead of using the
> > default values to respect graph-specific settings (eg., in the case of
> > setting 'GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS').
>
> I think this description misses the most important aspect of this
> patch: it fixes, even if only partially, the half-broken fix in
> 0087a87ba8 (commit-graph: persist existence of changed-paths,
> 2020-07-01).

Could you suggest an amendment? I understand what you are saying below,
but I'm not sure how or if you want it incorporated into this patch.

If you feel this is critical to change (I have no opinion either way),
then I'm happy to send a re-roll once review has stabilized. When do you
plan on finishing your read-through of this v3?

Thanks,
Taylor
diff mbox series

Patch

diff --git a/bloom.c b/bloom.c
index 393c61b4bc..2d6aef9098 100644
--- a/bloom.c
+++ b/bloom.c
@@ -180,13 +180,12 @@  static int pathmap_cmp(const void *hashmap_cmp_fn_data,
 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 						 struct commit *c,
 						 int compute_if_not_present,
+						 const struct bloom_filter_settings *settings,
 						 enum bloom_filter_computed *computed)
 {
 	struct bloom_filter *filter;
-	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 	int i;
 	struct diff_options diffopt;
-	int max_changes = 512;
 
 	if (computed)
 		*computed = BLOOM_NOT_COMPUTED;
@@ -211,7 +210,7 @@  struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.detect_rename = 0;
-	diffopt.max_changes = max_changes;
+	diffopt.max_changes = settings->max_changed_paths;
 	diff_setup_done(&diffopt);
 
 	/* ensure commit is parsed so we have parent information */
@@ -223,7 +222,7 @@  struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diffopt.num_changes <= max_changes) {
+	if (diffopt.num_changes <= settings->max_changed_paths) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
@@ -260,13 +259,13 @@  struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 			diff_free_filepair(diff_queued_diff.queue[i]);
 		}
 
-		filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+		filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
 		filter->data = xcalloc(filter->len, sizeof(unsigned char));
 
 		hashmap_for_each_entry(&pathmap, &iter, e, entry) {
 			struct bloom_key key;
-			fill_bloom_key(e->path, strlen(e->path), &key, &settings);
-			add_key_to_filter(&key, filter, &settings);
+			fill_bloom_key(e->path, strlen(e->path), &key, settings);
+			add_key_to_filter(&key, filter, settings);
 		}
 
 		hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
diff --git a/bloom.h b/bloom.h
index e2e035ad14..c6d77e8393 100644
--- a/bloom.h
+++ b/bloom.h
@@ -98,10 +98,11 @@  enum bloom_filter_computed {
 struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
 						 struct commit *c,
 						 int compute_if_not_present,
+						 const struct bloom_filter_settings *settings,
 						 enum bloom_filter_computed *computed);
 
 #define get_bloom_filter(r, c) get_or_compute_bloom_filter( \
-	(r), (c), 0, NULL)
+	(r), (c), 0, NULL, NULL)
 
 int bloom_filter_contains(const struct bloom_filter *filter,
 			  const struct bloom_key *key,
diff --git a/commit-graph.c b/commit-graph.c
index ecdab89e93..50519eb968 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1435,6 +1435,7 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 			ctx->r,
 			c,
 			1,
+			ctx->bloom_settings,
 			&computed);
 		if (computed & BLOOM_COMPUTED) {
 			ctx->count_bloom_filter_computed++;
@@ -1692,17 +1693,6 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	int num_chunks = 3;
 	uint64_t chunk_offset;
 	struct object_id file_hash;
-	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
-
-	if (!ctx->bloom_settings) {
-		bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
-							      bloom_settings.bits_per_entry);
-		bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
-							  bloom_settings.num_hashes);
-		bloom_settings.max_changed_paths = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS",
-							  bloom_settings.max_changed_paths);
-		ctx->bloom_settings = &bloom_settings;
-	}
 
 	if (ctx->split) {
 		struct strbuf tmp_file = STRBUF_INIT;
@@ -2148,6 +2138,7 @@  int write_commit_graph(struct object_directory *odb,
 	uint32_t i, count_distinct = 0;
 	int res = 0;
 	int replace = 0;
+	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
 	if (!commit_graph_compatible(the_repository))
 		return 0;
@@ -2161,6 +2152,14 @@  int write_commit_graph(struct object_directory *odb,
 	ctx->split_opts = split_opts;
 	ctx->total_bloom_filter_data_size = 0;
 
+	bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
+						      bloom_settings.bits_per_entry);
+	bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
+						  bloom_settings.num_hashes);
+	bloom_settings.max_changed_paths = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS",
+							 bloom_settings.max_changed_paths);
+	ctx->bloom_settings = &bloom_settings;
+
 	if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS)
 		ctx->changed_paths = 1;
 	if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 9f7bb729fc..46e97b04eb 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -40,6 +40,7 @@  static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
 	setup_git_directory();
 	c = lookup_commit(the_repository, commit_oid);
 	filter = get_or_compute_bloom_filter(the_repository, c, 1,
+					     &settings,
 					     NULL);
 	print_bloom_filter(filter);
 }