[v3,10/10] commit-graph: check all leading directories in changed path Bloom filters
diff mbox series

Message ID 40061233ca2dee2ecdb9f197fc3e5999793594b3.1593174637.git.gitgitgadget@gmail.com
State New
Headers show
Series
  • More commit-graph/Bloom filter improvements
Related show

Commit Message

Elijah Newren via GitGitGadget June 26, 2020, 12:30 p.m. UTC
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>

The file 'dir/subdir/file' can only be modified if its leading
directories 'dir' and 'dir/subdir' are modified as well.

So when checking modified path Bloom filters looking for commits
modifying a path with multiple path components, then check not only
the full path in the Bloom filters, but all its leading directories as
well.  Take care to check these paths in "deepest first" order,
because it's the full path that is least likely to be modified, and
the Bloom filter queries can short circuit sooner.

This can significantly reduce the average false positive rate, by
about an order of magnitude or three(!), and can further speed up
pathspec-limited revision walks.  The table below compares the average
false positive rate and runtime of

  git rev-list HEAD -- "$path"

before and after this change for 5000+ randomly* selected paths from
each repository:

                    Average false           Average        Average
                    positive rate           runtime        runtime
                  before     after     before     after   difference
  ------------------------------------------------------------------
  git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
  linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
  tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%

*Path selection was done with the following pipeline:

	git ls-tree -r --name-only HEAD | sort -R | head -n 5000

The improvements in runtime are much smaller than the improvements in
average false positive rate, as we are clearly reaching diminishing
returns here.  However, all these timings depend on that accessing
tree objects is reasonably fast (warm caches).  If we had a partial
clone and the tree objects had to be fetched from a promisor remote,
e.g.:

  $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
  $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
        commit-graph write --reachable
  $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
  $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
        rev-list HEAD -- "$path"

then checking all leading path component can reduce the runtime from
over an hour to a few seconds (and this is with the clone and the
promisor on the same machine).

This adjusts the tracing values in t4216-log-bloom.sh, which provides a
concrete way to notice the improvement.

Helped-by: Taylor Blau <me@ttaylorr.com>
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 revision.c           | 46 +++++++++++++++++++++++++++++++++++---------
 revision.h           |  6 ++++--
 t/t4216-log-bloom.sh |  2 +-
 3 files changed, 42 insertions(+), 12 deletions(-)

Patch
diff mbox series

diff --git a/revision.c b/revision.c
index b53377cd52..b40bc5b51b 100644
--- a/revision.c
+++ b/revision.c
@@ -670,9 +670,10 @@  static void prepare_to_use_bloom_filter(struct rev_info *revs)
 {
 	struct pathspec_item *pi;
 	char *path_alloc = NULL;
-	const char *path;
+	const char *path, *p;
 	int last_index;
-	int len;
+	size_t len;
+	int path_component_nr = 1;
 
 	if (!revs->commits)
 		return;
@@ -709,8 +710,33 @@  static void prepare_to_use_bloom_filter(struct rev_info *revs)
 		return;
 	}
 
-	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
-	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+	p = path;
+	while (*p) {
+		/*
+		 * At this point, the path is normalized to use Unix-style
+		 * path separators. This is required due to how the
+		 * changed-path Bloom filters store the paths.
+		 */
+		if (*p == '/')
+			path_component_nr++;
+		p++;
+	}
+
+	revs->bloom_keys_nr = path_component_nr;
+	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
+
+	fill_bloom_key(path, len, &revs->bloom_keys[0],
+		       revs->bloom_filter_settings);
+	path_component_nr = 1;
+
+	p = path + len - 1;
+	while (p > path) {
+		if (*p == '/')
+			fill_bloom_key(path, p - path,
+				       &revs->bloom_keys[path_component_nr++],
+				       revs->bloom_filter_settings);
+		p--;
+	}
 
 	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
 		atexit(trace2_bloom_filter_statistics_atexit);
@@ -724,7 +750,7 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 						 struct commit *commit)
 {
 	struct bloom_filter *filter;
-	int result;
+	int result = 1, j;
 
 	if (!revs->repo->objects->commit_graph)
 		return -1;
@@ -744,9 +770,11 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 		return -1;
 	}
 
-	result = bloom_filter_contains(filter,
-				       revs->bloom_key,
-				       revs->bloom_filter_settings);
+	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
+		result = bloom_filter_contains(filter,
+					       &revs->bloom_keys[j],
+					       revs->bloom_filter_settings);
+	}
 
 	if (result)
 		count_bloom_filter_maybe++;
@@ -786,7 +814,7 @@  static int rev_compare_tree(struct rev_info *revs,
 			return REV_TREE_SAME;
 	}
 
-	if (revs->bloom_key && !nth_parent) {
+	if (revs->bloom_keys_nr && !nth_parent) {
 		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
 
 		if (bloom_ret == 0)
diff --git a/revision.h b/revision.h
index 7c026fe41f..abbfb4ab59 100644
--- a/revision.h
+++ b/revision.h
@@ -295,8 +295,10 @@  struct rev_info {
 	struct topo_walk_info *topo_walk_info;
 
 	/* Commit graph bloom filter fields */
-	/* The bloom filter key for the pathspec */
-	struct bloom_key *bloom_key;
+	/* The bloom filter key(s) for the pathspec */
+	struct bloom_key *bloom_keys;
+	int bloom_keys_nr;
+
 	/*
 	 * The bloom filter settings used to generate the key.
 	 * This is loaded from the commit-graph being used.
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index e3e4badd4c..d7dd717347 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -146,7 +146,7 @@  test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
 
 test_bloom_filters_used_when_some_filters_are_missing () {
 	log_args=$1
-	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
+	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
 	setup "$log_args" &&
 	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
 	test_cmp log_wo_bloom log_w_bloom