[v2,09/11] sparse-checkout: use hashmaps for cone patterns
diff mbox series

Message ID 95a3285bc6021daa236d98d7e1bbdc5c45fc73b0.1568904188.git.gitgitgadget@gmail.com
State New
Headers show
Series
  • New sparse-checkout builtin and "cone" mode
Related show

Commit Message

Johannes Schindelin via GitGitGadget Sept. 19, 2019, 2:43 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

The parent and recursive patterns allowed by the "cone mode"
option in sparse-checkout are restrictive enough that we
can avoid using the regex parsing. Everything is based on
prefix matches, so we can use hashsets to store the prefixes
from the sparse-checkout file. When checking a path, we can
strip path entries from the path and check the hashset for
an exact match.

As a test, I created a cone-mode sparse-checkout file for the
Linux repository that actually includes every file. This was
constructed by taking every folder in the Linux repo and creating
the pattern pairs here:

	/$folder/
	!/$folder/*/

This resulted in a sparse-checkout file sith 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false: 0.21 s (0.00 s)
	 core.sparseCheckout=true: 3.75 s (3.50 s)
	 core.sparseCheckout=cone: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent
in the first clear_ce_flags() call, according to the trace2
performance traces.

While this example is contrived, it demonstrates how these
patterns can slow the sparse-checkout feature.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c                              | 173 +++++++++++++++++++++++++++--
 dir.h                              |  27 +++++
 t/t1091-sparse-checkout-builtin.sh |  11 +-
 3 files changed, 202 insertions(+), 9 deletions(-)

Comments

Derrick Stolee Sept. 19, 2019, 8:59 p.m. UTC | #1
On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
> @@ -848,6 +953,10 @@ static int add_patterns_from_buffer(char *buf, size_t size,
>  	int i, lineno = 1;
>  	char *entry;
>  
> +	pl->use_cone_patterns = core_sparse_checkout_cone;
> +	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
> +	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
> +

Just a head's-up to anyone looking at this series: this is not the
right place to set use_cone_patterns (without passing a flag or
something). This same path is called from the .gitignore machinery,
so if you have a non-cone pattern in your .gitignore you will start
seeing warnings with core.sparseCheckoutCone=true.

I figured it out only via integration tests with our C# layer. In
v2 I'll fix this and add a test to make sure it stays fixed.

Otherwise, everything is working as expected.

-Stolee
Derrick Stolee Sept. 20, 2019, 2:37 p.m. UTC | #2
On 9/19/2019 4:59 PM, Derrick Stolee wrote:
> On 9/19/2019 10:43 AM, Derrick Stolee via GitGitGadget wrote:
>> @@ -848,6 +953,10 @@ static int add_patterns_from_buffer(char *buf, size_t size,
>>  	int i, lineno = 1;
>>  	char *entry;
>>  
>> +	pl->use_cone_patterns = core_sparse_checkout_cone;
>> +	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
>> +	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
>> +
> 
> Just a head's-up to anyone looking at this series: this is not the
> right place to set use_cone_patterns (without passing a flag or
> something). This same path is called from the .gitignore machinery,
> so if you have a non-cone pattern in your .gitignore you will start
> seeing warnings with core.sparseCheckoutCone=true.
> 
> I figured it out only via integration tests with our C# layer. In
> v2 I'll fix this and add a test to make sure it stays fixed.

Here is the code fix. I will have a test to check this in v3.

-->8--

From 73b100d11d11bf8f045c2e116390120819dcb800 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Fri, 20 Sep 2019 08:55:06 -0400
Subject: [PATCH v2] fixup! sparse-checkout: use hashmaps for cone patterns

---
 dir.c          | 1 -
 unpack-trees.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/dir.c b/dir.c
index 35fd60d487..248a418379 100644
--- a/dir.c
+++ b/dir.c
@@ -953,7 +953,6 @@ static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
-	pl->use_cone_patterns = core_sparse_checkout_cone;
 	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
 	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
 
diff --git a/unpack-trees.c b/unpack-trees.c
index 43acc0ffd6..b5cf591c38 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1487,6 +1487,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
 		o->skip_sparse_checkout = 1;
 	if (!o->skip_sparse_checkout) {
 		char *sparse = git_pathdup("info/sparse-checkout");
+		pl.use_cone_patterns = core_sparse_checkout_cone;
 		if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
 			o->skip_sparse_checkout = 1;
 		else

Patch
diff mbox series

diff --git a/dir.c b/dir.c
index 34972abdaf..4fc57187e9 100644
--- a/dir.c
+++ b/dir.c
@@ -599,6 +599,109 @@  void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int pl_hashmap_cmp(const void *unused_cmp_data,
+			  const void *a, const void *b, const void *key)
+{
+	const struct pattern_entry *ee1 = (const struct pattern_entry *)a;
+	const struct pattern_entry *ee2 = (const struct pattern_entry *)b;
+
+	size_t min_len = ee1->patternlen <= ee2->patternlen
+			 ? ee1->patternlen
+			 : ee2->patternlen;
+
+	return strncmp(ee1->pattern, ee2->pattern, min_len);
+}
+
+static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
+{
+	struct pattern_entry *translated;
+	char *truncated;
+	char *data = NULL;
+
+	if (!pl->use_cone_patterns)
+		return;
+
+	if (!strcmp(given->pattern, "/*"))
+		return;
+
+	if (given->patternlen > 2 &&
+	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
+		if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
+			/* Not a cone pattern. */
+			pl->use_cone_patterns = 0;
+			warning(_("unrecognized pattern: '%s'"), given->pattern);
+			goto clear_hashmaps;
+		}
+
+		truncated = xstrdup(given->pattern);
+		truncated[given->patternlen - 2] = 0;
+
+		translated = xmalloc(sizeof(struct pattern_entry));
+		translated->pattern = truncated;
+		translated->patternlen = given->patternlen - 2;
+		hashmap_entry_init(translated,
+				   memhash(translated->pattern, translated->patternlen));
+
+		if (!hashmap_get(&pl->recursive_hashmap, translated, NULL)) {
+			/* We did not see the "parent" included */
+			warning(_("unrecognized negative pattern: '%s'"),
+				given->pattern);
+			free(truncated);
+			free(translated);
+			goto clear_hashmaps;
+		}
+
+		hashmap_add(&pl->parent_hashmap, translated);
+		hashmap_remove(&pl->recursive_hashmap, translated, &data);
+		free(data);
+		return;
+	}
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE) {
+		warning(_("unrecognized negative pattern: '%s'"),
+			given->pattern);
+		goto clear_hashmaps;
+	}
+
+	translated = xmalloc(sizeof(struct pattern_entry));
+
+	translated->pattern = xstrdup(given->pattern);
+	translated->patternlen = given->patternlen;
+	hashmap_entry_init(translated,
+			   memhash(translated->pattern, translated->patternlen));
+
+	hashmap_add(&pl->recursive_hashmap, translated);
+
+	if (hashmap_get(&pl->parent_hashmap, translated, NULL)) {
+		/* we already included this at the parent level */
+		warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
+			given->pattern);
+		hashmap_remove(&pl->parent_hashmap, translated, &data);
+		free(data);
+		free(translated);
+	}
+
+	return;
+
+clear_hashmaps:
+	warning(_("disabling cone pattern matching"));
+	hashmap_free(&pl->parent_hashmap, 1);
+	hashmap_free(&pl->recursive_hashmap, 1);
+	pl->use_cone_patterns = 0;
+}
+
+static int hashmap_contains_path(struct hashmap *map,
+				 struct strbuf *pattern)
+{
+	struct pattern_entry p;
+
+	/* Check straight mapping */
+	p.pattern = pattern->buf;
+	p.patternlen = pattern->len;
+	hashmap_entry_init(&p, memhash(p.pattern, p.patternlen));
+	return !!hashmap_get(map, &p, NULL);
+}
+
 void add_pattern(const char *string, const char *base,
 		 int baselen, struct pattern_list *pl, int srcpos)
 {
@@ -623,6 +726,8 @@  void add_pattern(const char *string, const char *base,
 	ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc);
 	pl->patterns[pl->nr++] = pattern;
 	pattern->pl = pl;
+
+	add_pattern_to_hashsets(pl, pattern);
 }
 
 static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -848,6 +953,10 @@  static int add_patterns_from_buffer(char *buf, size_t size,
 	int i, lineno = 1;
 	char *entry;
 
+	pl->use_cone_patterns = core_sparse_checkout_cone;
+	hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
+	hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
+
 	pl->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -1084,16 +1193,64 @@  enum pattern_match_result path_matches_pattern_list(
 				struct index_state *istate)
 {
 	struct path_pattern *pattern;
-	pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
-						  dtype, pl, istate);
-	if (pattern) {
-		if (pattern->flags & PATTERN_FLAG_NEGATIVE)
-			return NOT_MATCHED;
-		else
-			return MATCHED;
+	struct strbuf parent_pathname = STRBUF_INIT;
+	int result = NOT_MATCHED;
+	const char *slash_pos;
+
+	if (!pl->use_cone_patterns) {
+		pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
+							dtype, pl, istate);
+		if (pattern) {
+			if (pattern->flags & PATTERN_FLAG_NEGATIVE)
+				return NOT_MATCHED;
+			else
+				return MATCHED;
+		}
+
+		return UNDECIDED;
 	}
 
-	return UNDECIDED;
+	strbuf_addch(&parent_pathname, '/');
+	strbuf_add(&parent_pathname, pathname, pathlen);
+
+	if (hashmap_contains_path(&pl->recursive_hashmap,
+					&parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	slash_pos = strrchr(parent_pathname.buf, '/');
+
+	if (slash_pos == parent_pathname.buf) {
+		/* include every file in root */
+		result = MATCHED;
+		goto done;
+	}
+
+	strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+
+	if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
+		result = MATCHED;
+		goto done;
+	}
+
+	while (parent_pathname.len) {
+		if (hashmap_contains_path(&pl->recursive_hashmap,
+					  &parent_pathname)) {
+			result = UNDECIDED;
+			goto done;
+		}
+
+		slash_pos = strrchr(parent_pathname.buf, '/');
+		if (slash_pos == parent_pathname.buf)
+			break;
+
+		strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
+	}
+
+done:
+	strbuf_release(&parent_pathname);
+	return result;
 }
 
 static struct path_pattern *last_matching_pattern_from_lists(
diff --git a/dir.h b/dir.h
index 608696c958..bbd5bd1cc9 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@ 
 /* See Documentation/technical/api-directory-listing.txt */
 
 #include "cache.h"
+#include "hashmap.h"
 #include "strbuf.h"
 
 struct dir_entry {
@@ -37,6 +38,13 @@  struct path_pattern {
 	int srcpos;
 };
 
+/* used for hashmaps for cone patterns */
+struct pattern_entry {
+	struct hashmap_entry ent;
+	char *pattern;
+	size_t patternlen;
+};
+
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
  * is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,25 @@  struct pattern_list {
 	const char *src;
 
 	struct path_pattern **patterns;
+
+	/*
+	 * While scanning the excludes, we attempt to match the patterns
+	 * with a more restricted set that allows us to use hashsets for
+	 * matching logic, which is faster than the linear lookup in the
+	 * excludes array above. If non-zero, that check succeeded.
+	 */
+	unsigned use_cone_patterns;
+
+	/*
+	 * Stores paths where everything starting with those paths
+	 * is included.
+	 */
+	struct hashmap recursive_hashmap;
+
+	/*
+	 * Used to check single-level parents of blobs.
+	 */
+	struct hashmap parent_hashmap;
 };
 
 /*
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 9b089c98c4..f726205d21 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -143,7 +143,8 @@  test_expect_success 'set sparse-checkout using --stdin' '
 test_expect_success 'cone mode: match patterns' '
 	git -C repo config --worktree core.sparseCheckoutCone true &&
 	rm -rf repo/a repo/folder1 repo/folder2 &&
-	git -C repo read-tree -mu HEAD &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep ! "disabling cone patterns" err &&
 	git -C repo reset --hard &&
 	ls repo >dir  &&
 	cat >expect <<-EOF &&
@@ -154,6 +155,14 @@  test_expect_success 'cone mode: match patterns' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: warn on bad pattern' '
+	test_when_finished mv sparse-checkout repo/.git/info/ &&
+	cp repo/.git/info/sparse-checkout . &&
+	echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
+	git -C repo read-tree -mu HEAD 2>err &&
+	test_i18ngrep "unrecognized negative pattern" err
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&