diff mbox series

[34/34] commit-graph: use modified path Bloom filters with wildcards, if possible

Message ID 20200529085038.26008-35-szeder.dev@gmail.com (mailing list archive)
State New, archived
Headers show
Series An alternative modified path Bloom filters implementation | expand

Commit Message

SZEDER Gábor May 29, 2020, 8:50 a.m. UTC
Modified path Bloom filter don't store the names of modified paths,
they only set a couple of bits based on those paths' hashes.
Consequently, they can only be used when looking for the history of a
concrete path, so we disabled them when looking for pathspecs with
wildcards.

However, if the pathspec has "wildcard-less" leading directories, then
we can use modified path Bloom filters to skip commits that don't
modify those leading directories.  As a result, something like:

  git -c core.modifiedPathBloomFilters=1 rev-list HEAD -- 'compat/win32/pthread.*'

will take only ~0.045s instead of ~1.24s, achieving over 27x speedup.

For comparison, letting the shell do the wildcard matching, i.e. the
equivalent of using the pathspecs 'compat/win32/pthread.c' and
'compat/win32/pthread.h' takes ~0.311s without using modified path
Bloom filters: apparently tree-diff with wildcards can be considerably
more expensive that without wildcards, even if the wildcard is in the
last path component and that directory only contains a dozen files.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
---
 commit-graph.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 8eb0cbedaf..db43877426 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1023,9 +1023,9 @@  static void compute_modified_path_bloom_hashes_for_path(const char *path,
 void init_pathspec_bloom_fields(struct repository *r,
 				struct pathspec *pathspec)
 {
-	const unsigned bloom_compatible_magic = PATHSPEC_LITERAL;
+	const unsigned bloom_compatible_magic = PATHSPEC_LITERAL | PATHSPEC_GLOB;
 	struct commit_graph *graph = r->objects->commit_graph;
-	int i;
+	int i, can_use_modified_path_bloom_filters;
 
 	if (!graph)
 		return;
@@ -1033,15 +1033,14 @@  void init_pathspec_bloom_fields(struct repository *r,
 		return;
 	if (!pathspec->nr)
 		return;
-	if (pathspec->has_wildcard)
-		return;
 	if (pathspec->magic & ~bloom_compatible_magic)
 		return;
 
+	can_use_modified_path_bloom_filters = 1;
 	for (i = 0; i < pathspec->nr; i++) {
 		struct pathspec_item *pi = &pathspec->items[i];
 		const char *path = pi->match, *p;
-		size_t len = pi->len;
+		size_t nowildcard_len = pi->nowildcard_len;
 		int path_component_nr = 0, j;
 		uint32_t *hashes;
 		struct bloom_filter embedded_bf;
@@ -1051,14 +1050,29 @@  void init_pathspec_bloom_fields(struct repository *r,
 		 * slashes, but a trailing slash might still be present,
 		 * "remove" it.
 		 */
-		if (path[len - 1] == '/')
-			len--;
+		if (path[nowildcard_len - 1] == '/')
+			nowildcard_len--;
 
 		p = path;
 		do {
 			p = strchrnul(p + 1, '/');
-			path_component_nr++;
-		} while (p - path < len);
+			if (p - path <= nowildcard_len)
+				path_component_nr++;
+		} while (p - path < nowildcard_len);
+		/*
+		 * If a pathspec uses wildcards but has wildcard-less
+		 * leading directories, then we can use modified path Bloom
+		 * filters to skip commits that don't modify those leading
+		 * directories.
+		 * However, if there is even one pathspec that has a wilcard
+		 * in its first path component, then we have no choice but
+		 * to run tree-diff anyway, so don't bother with Bloom
+		 * filters at all in that case.
+		 */
+		if (!path_component_nr) {
+			can_use_modified_path_bloom_filters = 0;
+			break;
+		}
 
 		pi->modified_path_bloom_hashes_nr = path_component_nr * graph->num_modified_path_bloom_hashes;
 		ALLOC_ARRAY(pi->modified_path_bloom_hashes,
@@ -1084,7 +1098,17 @@  void init_pathspec_bloom_fields(struct repository *r,
 				      pi->modified_path_bloom_hashes_nr);
 	}
 
-	pathspec->can_use_modified_path_bloom_filters = 1;
+	if (can_use_modified_path_bloom_filters) {
+		pathspec->can_use_modified_path_bloom_filters = 1;
+	} else {
+		int j;
+		for (j = 0; j < i; j++) {
+			struct pathspec_item *pi = &pathspec->items[j];
+			FREE_AND_NULL(pi->modified_path_bloom_hashes);
+			pi->modified_path_bloom_hashes_nr = 0;
+			pi->modified_path_bloom_mask = 0;
+		}
+	}
 }
 
 struct packed_commit_list {