diff mbox series

[v3,09/12] sparse-checkout: properly match escaped characters

Message ID 9ea69e90694e53842acd68d3ac85c9a00c4bd343.1580236003.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Harden the sparse-checkout builtin | expand

Commit Message

Linus Arver via GitGitGadget Jan. 28, 2020, 6:26 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

In cone mode, the sparse-checkout feature uses hashset containment
queries to match paths. Make this algorithm respect escaped asterisk
(*) and backslash (\) characters.

Create dup_and_filter_pattern() method to convert a pattern by
removing escape characters and dropping an optional "/*" at the end.
This method is available in dir.h as we will use it in
builtin/sparse-checkout.c in a later change.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.c                              | 31 +++++++++++++++++++++++++++---
 t/t1091-sparse-checkout-builtin.sh | 22 +++++++++++++++++----
 2 files changed, 46 insertions(+), 7 deletions(-)

Comments

Jeff King Jan. 29, 2020, 10:03 a.m. UTC | #1
On Tue, Jan 28, 2020 at 06:26:40PM +0000, Derrick Stolee via GitGitGadget wrote:

> From: Derrick Stolee <dstolee@microsoft.com>
> 
> In cone mode, the sparse-checkout feature uses hashset containment
> queries to match paths. Make this algorithm respect escaped asterisk
> (*) and backslash (\) characters.

Do we also need to worry about other glob metacharacters? E.g., "?" or
ranges like "[A-Z]"?

> +static char *dup_and_filter_pattern(const char *pattern)
> +{
> +	char *set, *read;
> +	char *result = xstrdup(pattern);
> +
> +	set = result;
> +	read = result;
> +
> +	while (*read) {
> +		/* skip escape characters (once) */
> +		if (*read == '\\')
> +			read++;
> +
> +		*set = *read;
> +
> +		set++;
> +		read++;
> +	}
> +	*set = 0;
> +
> +	if (*(read - 2) == '/' && *(read - 1) == '*')
> +		*(read - 2) = 0;
> +
> +	return result;
> +}

Do we need to check that the pattern is longer than 1 character here? If
it's a single character, it seems like this "*(read - 2)" will
dereference the byte before the string.

-Peff
Derrick Stolee Jan. 29, 2020, 1:58 p.m. UTC | #2
On 1/29/2020 5:03 AM, Jeff King wrote:
> On Tue, Jan 28, 2020 at 06:26:40PM +0000, Derrick Stolee via GitGitGadget wrote:
> 
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> In cone mode, the sparse-checkout feature uses hashset containment
>> queries to match paths. Make this algorithm respect escaped asterisk
>> (*) and backslash (\) characters.
> 
> Do we also need to worry about other glob metacharacters? E.g., "?" or
> ranges like "[A-Z]"?

These are not part of the .gitignore patterns [1].

[1] https://git-scm.com/docs/gitignore#_pattern_format

>> +static char *dup_and_filter_pattern(const char *pattern)
>> +{
>> +	char *set, *read;
>> +	char *result = xstrdup(pattern);
>> +
>> +	set = result;
>> +	read = result;
>> +
>> +	while (*read) {
>> +		/* skip escape characters (once) */
>> +		if (*read == '\\')
>> +			read++;
>> +
>> +		*set = *read;
>> +
>> +		set++;
>> +		read++;
>> +	}
>> +	*set = 0;
>> +
>> +	if (*(read - 2) == '/' && *(read - 1) == '*')
>> +		*(read - 2) = 0;
>> +
>> +	return result;
>> +}
> 
> Do we need to check that the pattern is longer than 1 character here? If
> it's a single character, it seems like this "*(read - 2)" will
> dereference the byte before the string.

This method is only called by add_pattern_to_hashsets(), which
has a guard against paths of length less than 2, but thats' no
excuse for dangerous pointer arithmetic here.

But you also point out an even more confusing thing: why are we
modifying based on the 'read' pointer, and not the 'set' pointer?
This seems to work _accidentally_ only when the pattern has "<something>/*"
and "<something>" has no escape characters.

I had to recall exactly why we are dropping this "/*", but it's because
the pattern _actually_ ends with "/*/" but the in-memory pattern has
already dropped that last slash and applied PATTERN_FLAG_MUSTBEDIR.

Here is a diff that I can apply to this patch to fix this problem
_and_ demonstrate it in the tests:

diff --git a/dir.c b/dir.c
index 579f274d13..277577c8bf 100644
--- a/dir.c
+++ b/dir.c
@@ -633,6 +633,7 @@ int pl_hashmap_cmp(const void *unused_cmp_data,
 static char *dup_and_filter_pattern(const char *pattern)
 {
        char *set, *read;
+       size_t count  = 0;
        char *result = xstrdup(pattern);
 
        set = result;
@@ -647,11 +648,14 @@ static char *dup_and_filter_pattern(const char *pattern)
 
                set++;
                read++;
+               count++;
        }
        *set = 0;
 
-       if (*(read - 2) == '/' && *(read - 1) == '*')
-               *(read - 2) = 0;
+       if (count > 2 &&
+           *(set - 1) == '*' &&
+           *(set - 2) == '/')
+               *(set - 2) = 0;
 
        return result;
 }
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 0a21a5e15d..20b0465f77 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -383,6 +383,7 @@ test_expect_success BSLASHPSPEC 'pattern-checks: escaped "*"' '
        /*
        !/*/
        /zbad\\dir/
+       !/zbad\\dir/*/
        /zdoes\*not\*exist/
        /zdoes\*exist/
        EOF

With this extra line in the test, but compiling the old version of this patch,
the test fails with:

'err' is not empty, it contains:
+ cat err
warning: unrecognized negative pattern: '/zbad\\dir/*'
warning: disabling cone pattern matching

To ensure this negative pattern exists in the later patch where we set
the patterns using the builtin, I'll add "zbad\\dir/bogus" to the list
of directories to include, which will add another pattern to the set.

Thanks,
-Stolee
Derrick Stolee Jan. 29, 2020, 2:04 p.m. UTC | #3
On 1/29/2020 8:58 AM, Derrick Stolee wrote:
> On 1/29/2020 5:03 AM, Jeff King wrote:
>> On Tue, Jan 28, 2020 at 06:26:40PM +0000, Derrick Stolee via GitGitGadget wrote:
>>
>>> From: Derrick Stolee <dstolee@microsoft.com>
>>>
>>> In cone mode, the sparse-checkout feature uses hashset containment
>>> queries to match paths. Make this algorithm respect escaped asterisk
>>> (*) and backslash (\) characters.
>>
>> Do we also need to worry about other glob metacharacters? E.g., "?" or
>> ranges like "[A-Z]"?
> 
> These are not part of the .gitignore patterns [1].
> 
> [1] https://git-scm.com/docs/gitignore#_pattern_format

I should read things more carefully. There is also this information in
one of the bullets:

	An asterisk "*" matches anything except a slash. The character
	"?" matches any one character except "/". The range notation,
	e.g. [a-zA-Z], can be used to match one of the characters in a range.
	See fnmatch(3) and the FNM_PATHNAME flag for a more detailed
	description.

So this series does not attempt to properly work with globs, and I'll
need to test those a bit. Certainly they shouldn't work in cone mode,
so an extra patch to remove those would be simple. Input sanitizing
would be interesting, and I'll see what `git ls-tree` would output
with paths containing these characters.

-Stolee
diff mbox series

Patch

diff --git a/dir.c b/dir.c
index 7cb78c8b87..579f274d13 100644
--- a/dir.c
+++ b/dir.c
@@ -630,6 +630,32 @@  int pl_hashmap_cmp(const void *unused_cmp_data,
 	return strncmp(ee1->pattern, ee2->pattern, min_len);
 }
 
+static char *dup_and_filter_pattern(const char *pattern)
+{
+	char *set, *read;
+	char *result = xstrdup(pattern);
+
+	set = result;
+	read = result;
+
+	while (*read) {
+		/* skip escape characters (once) */
+		if (*read == '\\')
+			read++;
+
+		*set = *read;
+
+		set++;
+		read++;
+	}
+	*set = 0;
+
+	if (*(read - 2) == '/' && *(read - 1) == '*')
+		*(read - 2) = 0;
+
+	return result;
+}
+
 static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
 {
 	struct pattern_entry *translated;
@@ -695,8 +721,7 @@  static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern
 			goto clear_hashmaps;
 		}
 
-		truncated = xstrdup(given->pattern);
-		truncated[given->patternlen - 2] = 0;
+		truncated = dup_and_filter_pattern(given->pattern);
 
 		translated = xmalloc(sizeof(struct pattern_entry));
 		translated->pattern = truncated;
@@ -730,7 +755,7 @@  static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern
 
 	translated = xmalloc(sizeof(struct pattern_entry));
 
-	translated->pattern = xstrdup(given->pattern);
+	translated->pattern = dup_and_filter_pattern(given->pattern);
 	translated->patternlen = given->patternlen;
 	hashmap_entry_init(&translated->ent,
 			   ignore_case ?
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 470900f6f4..0a21a5e15d 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -366,13 +366,27 @@  test_expect_success 'pattern-checks: starting "*"' '
 	check_read_tree_errors repo "a deep" "disabling cone pattern matching"
 '
 
-test_expect_success 'pattern-checks: escaped "*"' '
-	cat >repo/.git/info/sparse-checkout <<-\EOF &&
+test_expect_success BSLASHPSPEC 'pattern-checks: escaped "*"' '
+	git clone repo escaped &&
+	TREEOID=$(git -C escaped rev-parse HEAD:folder1) &&
+	NEWTREE=$(git -C escaped mktree <<-EOF
+	$(git -C escaped ls-tree HEAD)
+	040000 tree $TREEOID	zbad\\dir
+	040000 tree $TREEOID	zdoes*exist
+	EOF
+	) &&
+	COMMIT=$(git -C escaped commit-tree $NEWTREE -p HEAD) &&
+	git -C escaped reset --hard $COMMIT &&
+	check_files escaped "a deep folder1 folder2 zbad\\dir zdoes*exist" &&
+	git -C escaped sparse-checkout init --cone &&
+	cat >escaped/.git/info/sparse-checkout <<-\EOF &&
 	/*
 	!/*/
-	/does\*not\*exist/
+	/zbad\\dir/
+	/zdoes\*not\*exist/
+	/zdoes\*exist/
 	EOF
-	check_read_tree_errors repo "a" ""
+	check_read_tree_errors escaped "a zbad\\dir zdoes*exist"
 '
 
 test_done