diff mbox series

[1/2] grep/pcre2: use PCRE2_UTF even with ASCII patterns

Message ID 5fa6962e-3c1c-6dbc-f6d7-589151a9baec@web.de (mailing list archive)
State New, archived
Headers show
Series [1/2] grep/pcre2: use PCRE2_UTF even with ASCII patterns | expand

Commit Message

René Scharfe Dec. 18, 2021, 7:50 p.m. UTC
compile_pcre2_pattern() currently uses the option PCRE2_UTF only for
patterns with non-ASCII characters.  Patterns with ASCII wildcards can
match non-ASCII strings, though.  Without that option PCRE2 mishandles
UTF-8 input, though -- it matches parts of multi-byte characters.  Fix
that by using PCRE2_UTF even for ASCII-only patterns.

This is a remake of the reverted ae39ba431a (grep/pcre2: fix an edge
case concerning ascii patterns and UTF-8 data, 2021-10-15).  The change
to the condition and the test are simplified and more targeted.

Original-patch-by: Hamza Mahfooz <someguy@effective-light.com>
Signed-off-by: René Scharfe <l.s.r@web.de>
---
 grep.c                          | 2 +-
 t/t7812-grep-icase-non-ascii.sh | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

--
2.34.0

Comments

SZEDER Gábor Jan. 29, 2022, 5:25 p.m. UTC | #1
On Sat, Dec 18, 2021 at 08:50:02PM +0100, René Scharfe wrote:
> compile_pcre2_pattern() currently uses the option PCRE2_UTF only for
> patterns with non-ASCII characters.  Patterns with ASCII wildcards can
> match non-ASCII strings, though.  Without that option PCRE2 mishandles
> UTF-8 input, though -- it matches parts of multi-byte characters.  Fix
> that by using PCRE2_UTF even for ASCII-only patterns.
> 
> This is a remake of the reverted ae39ba431a (grep/pcre2: fix an edge
> case concerning ascii patterns and UTF-8 data, 2021-10-15).  The change
> to the condition and the test are simplified and more targeted.
> 
> Original-patch-by: Hamza Mahfooz <someguy@effective-light.com>
> Signed-off-by: René Scharfe <l.s.r@web.de>
> ---
>  grep.c                          | 2 +-
>  t/t7812-grep-icase-non-ascii.sh | 6 ++++++
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/grep.c b/grep.c
> index fe847a0111..5badb6d851 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  		}
>  		options |= PCRE2_CASELESS;
>  	}
> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	if (!opt->ignore_locale && is_utf8_locale() &&
>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> 

I tried to use 'git grep -P' for the first time ever, and it hung
right away, spinning all CPUs at 100%.  I could narrow it down, both
the complexity of the pattern and the size of the input, see the test
below, and it bisects to this patch.


  ---   >8   ---

#!/bin/sh

test_description='test'

. ./test-lib.sh

test_expect_success PCRE 'test' '
	# LC_ALL=C works
	LC_ALL=en_US.UTF-8 &&
	cat >ascii <<-\EOF &&
	foo
	 bar
	 baz
	EOF
	cat >utf8 <<-\EOF &&
	foo
	 bar
	 báz
	EOF
	git add ascii utf8 &&

	# These all work as expected:
	git grep --threads=1 -P " " ascii &&
	git grep --threads=1 -P "^ " ascii &&
	git grep --threads=1 -P "\s" ascii &&
	git grep --threads=1 -P "^\s" ascii &&
	git grep --threads=1 -P " " utf8 &&
	git grep --threads=1 -P "^ " utf8 &&
	git grep --threads=1 -P "\s" utf8 &&

	# This hangs (but it does work with basic and extended regexp):
	git grep --threads=1 -P "^\s" utf8
'

test_done
René Scharfe Jan. 30, 2022, 7:55 a.m. UTC | #2
Am 29.01.22 um 18:25 schrieb SZEDER Gábor:
> On Sat, Dec 18, 2021 at 08:50:02PM +0100, René Scharfe wrote:
>> compile_pcre2_pattern() currently uses the option PCRE2_UTF only for
>> patterns with non-ASCII characters.  Patterns with ASCII wildcards can
>> match non-ASCII strings, though.  Without that option PCRE2 mishandles
>> UTF-8 input, though -- it matches parts of multi-byte characters.  Fix
>> that by using PCRE2_UTF even for ASCII-only patterns.
>>
>> This is a remake of the reverted ae39ba431a (grep/pcre2: fix an edge
>> case concerning ascii patterns and UTF-8 data, 2021-10-15).  The change
>> to the condition and the test are simplified and more targeted.
>>
>> Original-patch-by: Hamza Mahfooz <someguy@effective-light.com>
>> Signed-off-by: René Scharfe <l.s.r@web.de>
>> ---
>>  grep.c                          | 2 +-
>>  t/t7812-grep-icase-non-ascii.sh | 6 ++++++
>>  2 files changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/grep.c b/grep.c
>> index fe847a0111..5badb6d851 100644
>> --- a/grep.c
>> +++ b/grep.c
>> @@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>>  		}
>>  		options |= PCRE2_CASELESS;
>>  	}
>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>> +	if (!opt->ignore_locale && is_utf8_locale() &&
>>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>>
>
> I tried to use 'git grep -P' for the first time ever, and it hung
> right away, spinning all CPUs at 100%.  I could narrow it down, both
> the complexity of the pattern and the size of the input, see the test
> below, and it bisects to this patch.
>
>
>   ---   >8   ---
>
> #!/bin/sh
>
> test_description='test'
>
> . ./test-lib.sh
>
> test_expect_success PCRE 'test' '
> 	# LC_ALL=C works
> 	LC_ALL=en_US.UTF-8 &&
> 	cat >ascii <<-\EOF &&
> 	foo
> 	 bar
> 	 baz
> 	EOF
> 	cat >utf8 <<-\EOF &&
> 	foo
> 	 bar
> 	 báz
> 	EOF
> 	git add ascii utf8 &&
>
> 	# These all work as expected:
> 	git grep --threads=1 -P " " ascii &&
> 	git grep --threads=1 -P "^ " ascii &&
> 	git grep --threads=1 -P "\s" ascii &&
> 	git grep --threads=1 -P "^\s" ascii &&
> 	git grep --threads=1 -P " " utf8 &&
> 	git grep --threads=1 -P "^ " utf8 &&
> 	git grep --threads=1 -P "\s" utf8 &&
>
> 	# This hangs (but it does work with basic and extended regexp):
> 	git grep --threads=1 -P "^\s" utf8
> '
>
> test_done

I get the following result and no hang with PCRE2 10.39:

   utf8: bar
   utf8: báz

e0c6029 (Fix inifinite loop when a single byte newline is searched in
JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
version 10.36.

Do you still get the error when you disable JIT, i.e. when you use the
pattern "(*NO_JIT)^\s" instead?

René


[1] https://github.com/PhilipHazel/pcre2/commit/e0c6029a62db9c2161941ecdf459205382d4d379
SZEDER Gábor Jan. 30, 2022, 9:04 a.m. UTC | #3
On Sun, Jan 30, 2022 at 08:55:02AM +0100, René Scharfe wrote:
> Am 29.01.22 um 18:25 schrieb SZEDER Gábor:
> > On Sat, Dec 18, 2021 at 08:50:02PM +0100, René Scharfe wrote:
> >> compile_pcre2_pattern() currently uses the option PCRE2_UTF only for
> >> patterns with non-ASCII characters.  Patterns with ASCII wildcards can
> >> match non-ASCII strings, though.  Without that option PCRE2 mishandles
> >> UTF-8 input, though -- it matches parts of multi-byte characters.  Fix
> >> that by using PCRE2_UTF even for ASCII-only patterns.
> >>
> >> This is a remake of the reverted ae39ba431a (grep/pcre2: fix an edge
> >> case concerning ascii patterns and UTF-8 data, 2021-10-15).  The change
> >> to the condition and the test are simplified and more targeted.
> >>
> >> Original-patch-by: Hamza Mahfooz <someguy@effective-light.com>
> >> Signed-off-by: René Scharfe <l.s.r@web.de>
> >> ---
> >>  grep.c                          | 2 +-
> >>  t/t7812-grep-icase-non-ascii.sh | 6 ++++++
> >>  2 files changed, 7 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/grep.c b/grep.c
> >> index fe847a0111..5badb6d851 100644
> >> --- a/grep.c
> >> +++ b/grep.c
> >> @@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
> >>  		}
> >>  		options |= PCRE2_CASELESS;
> >>  	}
> >> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> >> +	if (!opt->ignore_locale && is_utf8_locale() &&
> >>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> >>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> >>
> >
> > I tried to use 'git grep -P' for the first time ever, and it hung
> > right away, spinning all CPUs at 100%.  I could narrow it down, both
> > the complexity of the pattern and the size of the input, see the test
> > below, and it bisects to this patch.
> >
> >
> >   ---   >8   ---
> >
> > #!/bin/sh
> >
> > test_description='test'
> >
> > . ./test-lib.sh
> >
> > test_expect_success PCRE 'test' '
> > 	# LC_ALL=C works
> > 	LC_ALL=en_US.UTF-8 &&
> > 	cat >ascii <<-\EOF &&
> > 	foo
> > 	 bar
> > 	 baz
> > 	EOF
> > 	cat >utf8 <<-\EOF &&
> > 	foo
> > 	 bar
> > 	 báz
> > 	EOF
> > 	git add ascii utf8 &&
> >
> > 	# These all work as expected:
> > 	git grep --threads=1 -P " " ascii &&
> > 	git grep --threads=1 -P "^ " ascii &&
> > 	git grep --threads=1 -P "\s" ascii &&
> > 	git grep --threads=1 -P "^\s" ascii &&
> > 	git grep --threads=1 -P " " utf8 &&
> > 	git grep --threads=1 -P "^ " utf8 &&
> > 	git grep --threads=1 -P "\s" utf8 &&
> >
> > 	# This hangs (but it does work with basic and extended regexp):
> > 	git grep --threads=1 -P "^\s" utf8
> > '
> >
> > test_done
> 
> I get the following result and no hang with PCRE2 10.39:
> 
>    utf8: bar
>    utf8: báz
> 
> e0c6029 (Fix inifinite loop when a single byte newline is searched in
> JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
> version 10.36.

I saw this hang on two Ubuntu 20.04 based boxes, which predate that
fix you mention only by a month or two, and apparently the almost two
years since then was not enough for this fix to trickle down into
updated 20.04 pcre packages, because:

> Do you still get the error when you disable JIT, i.e. when you use the
> pattern "(*NO_JIT)^\s" instead?

No, with this pattern it works as expected.

So is there a more convenient way to disable PCRE JIT in Git?  FWIW,
(non-git) 'grep -P' works with the same patterns.

> René
> 
> 
> [1] https://github.com/PhilipHazel/pcre2/commit/e0c6029a62db9c2161941ecdf459205382d4d379
René Scharfe Jan. 30, 2022, 1:32 p.m. UTC | #4
Am 30.01.22 um 10:04 schrieb SZEDER Gábor:
> On Sun, Jan 30, 2022 at 08:55:02AM +0100, René Scharfe wrote:
>> e0c6029 (Fix inifinite loop when a single byte newline is searched in
>> JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
>> version 10.36.
>
> I saw this hang on two Ubuntu 20.04 based boxes, which predate that
> fix you mention only by a month or two, and apparently the almost two
> years since then was not enough for this fix to trickle down into
> updated 20.04 pcre packages, because:
>
>> Do you still get the error when you disable JIT, i.e. when you use the
>> pattern "(*NO_JIT)^\s" instead?
>
> No, with this pattern it works as expected.
>
> So is there a more convenient way to disable PCRE JIT in Git?  FWIW,
> (non-git) 'grep -P' works with the same patterns.

I don't know a better way.  We could do it automatically, though:

--- >8 ---
Subject: [PATCH] grep: disable JIT on PCRE2 before 10.36 to avoid endless loop

Commit e0c6029 (Fix inifinite loop when a single byte newline is
searched in JIT., 2020-05-29) of PCRE2 adds the following point to its
ChangeLog for version 10.36:

  2. Fix inifinite loop when a single byte newline is searched in JIT when
  invalid utf8 mode is enabled.

Avoid that bug on older versions (which are still reportedly found in
the wild) by disabling the JIT when handling UTF-8.

Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: René Scharfe <l.s.r@web.de>
---
Not sure how to test it.  Killing git grep after a second or so seems a
bit clumsy.  timeout(1) from GNU coreutils at least allows doing that
from the shell, but it's not a standard tool.  Perhaps we need a new
test helper for that purpose?

 grep.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/grep.c b/grep.c
index 7bb0360869..16629a2301 100644
--- a/grep.c
+++ b/grep.c
@@ -406,6 +406,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 	}

 	pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
+#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
+	/*
+	 * Work around the bug fixed by e0c6029 (Fix inifinite loop when a
+	 * single byte newline is searched in JIT., 2020-05-29).
+	 */
+	if (options & PCRE2_MATCH_INVALID_UTF)
+		p->pcre2_jit_on = 0;
+#endif
 	if (p->pcre2_jit_on) {
 		jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
 		if (jitret)
--
2.35.0
Ævar Arnfjörð Bjarmason Jan. 31, 2022, 9:01 p.m. UTC | #5
On Sun, Jan 30 2022, René Scharfe wrote:

> Am 30.01.22 um 10:04 schrieb SZEDER Gábor:
>> On Sun, Jan 30, 2022 at 08:55:02AM +0100, René Scharfe wrote:
>>> e0c6029 (Fix inifinite loop when a single byte newline is searched in
>>> JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
>>> version 10.36.
>>
>> I saw this hang on two Ubuntu 20.04 based boxes, which predate that
>> fix you mention only by a month or two, and apparently the almost two
>> years since then was not enough for this fix to trickle down into
>> updated 20.04 pcre packages, because:
>>
>>> Do you still get the error when you disable JIT, i.e. when you use the
>>> pattern "(*NO_JIT)^\s" instead?
>>
>> No, with this pattern it works as expected.
>>
>> So is there a more convenient way to disable PCRE JIT in Git?  FWIW,
>> (non-git) 'grep -P' works with the same patterns.
>
> I don't know a better way.  We could do it automatically, though:
>
> --- >8 ---
> Subject: [PATCH] grep: disable JIT on PCRE2 before 10.36 to avoid endless loop
>
> Commit e0c6029 (Fix inifinite loop when a single byte newline is
> searched in JIT., 2020-05-29) of PCRE2 adds the following point to its
> ChangeLog for version 10.36:
>
>   2. Fix inifinite loop when a single byte newline is searched in JIT when
>   invalid utf8 mode is enabled.
>
> Avoid that bug on older versions (which are still reportedly found in
> the wild) by disabling the JIT when handling UTF-8.
>
> Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
> Signed-off-by: René Scharfe <l.s.r@web.de>
> ---
> Not sure how to test it.  Killing git grep after a second or so seems a
> bit clumsy.  timeout(1) from GNU coreutils at least allows doing that
> from the shell, but it's not a standard tool.  Perhaps we need a new
> test helper for that purpose?
>
>  grep.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/grep.c b/grep.c
> index 7bb0360869..16629a2301 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -406,6 +406,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  	}
>
>  	pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
> +#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
> +	/*
> +	 * Work around the bug fixed by e0c6029 (Fix inifinite loop when a

Better to quote this as PhilipHazel/pcre2@e0c6029 or something, i.e. to
indicate that it's not git.git's commit.

> +	 * single byte newline is searched in JIT., 2020-05-29).
> +	 */
> +	if (options & PCRE2_MATCH_INVALID_UTF)
> +		p->pcre2_jit_on = 0;

It seems rather heavy-hande, but I can't think of a better way to deal
with this, i.e. if we selectively use JIT on older versions, surely we
run into the match-bytes-but-want-chars bug you were fixing.

> +#endif
>  	if (p->pcre2_jit_on) {
>  		jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
>  		if (jitret)
René Scharfe Feb. 5, 2022, 5 p.m. UTC | #6
Am 31.01.22 um 22:01 schrieb Ævar Arnfjörð Bjarmason:
>
> On Sun, Jan 30 2022, René Scharfe wrote:
>
>> Am 30.01.22 um 10:04 schrieb SZEDER Gábor:
>>> On Sun, Jan 30, 2022 at 08:55:02AM +0100, René Scharfe wrote:
>>>> e0c6029 (Fix inifinite loop when a single byte newline is searched in
>>>> JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
>>>> version 10.36.
>>>
>>> I saw this hang on two Ubuntu 20.04 based boxes, which predate that
>>> fix you mention only by a month or two, and apparently the almost two
>>> years since then was not enough for this fix to trickle down into
>>> updated 20.04 pcre packages, because:
>>>
>>>> Do you still get the error when you disable JIT, i.e. when you use the
>>>> pattern "(*NO_JIT)^\s" instead?
>>>
>>> No, with this pattern it works as expected.
>>>
>>> So is there a more convenient way to disable PCRE JIT in Git?  FWIW,
>>> (non-git) 'grep -P' works with the same patterns.
>>
>> I don't know a better way.  We could do it automatically, though:
>>
>> --- >8 ---
>> Subject: [PATCH] grep: disable JIT on PCRE2 before 10.36 to avoid endless loop
>>
>> Commit e0c6029 (Fix inifinite loop when a single byte newline is
>> searched in JIT., 2020-05-29) of PCRE2 adds the following point to its
>> ChangeLog for version 10.36:
>>
>>   2. Fix inifinite loop when a single byte newline is searched in JIT when
>>   invalid utf8 mode is enabled.
>>
>> Avoid that bug on older versions (which are still reportedly found in
>> the wild) by disabling the JIT when handling UTF-8.
>>
>> Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
>> Signed-off-by: René Scharfe <l.s.r@web.de>
>> ---
>> Not sure how to test it.  Killing git grep after a second or so seems a
>> bit clumsy.  timeout(1) from GNU coreutils at least allows doing that
>> from the shell, but it's not a standard tool.  Perhaps we need a new
>> test helper for that purpose?

https://mywiki.wooledge.org/BashFAQ/068 offers a Perl-based Shell
function or aborting a program if it takes too long:

   doalarm() { perl -e 'alarm shift; exec @ARGV' -- "$@"; }

It doesn't waste time when the program finishes faster and seems to work
fine with git grep.

I can't actually test the effectiveness of the patch because PCRE2's
JIT doesn't work on my development machine at all (Apple M1), as I just
discovered. :-/  While we know that disabling JIT helps, we didn't
actually determine, yet, if e0c6029 (Fix inifinite loop when a single
byte newline is searched in JIT., 2020-05-29) really fixes the "^\s"
bug.

So I have to abandon this patch, unfortunately.  Any volunteer to pick
it up?

>>
>>  grep.c | 8 ++++++++
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/grep.c b/grep.c
>> index 7bb0360869..16629a2301 100644
>> --- a/grep.c
>> +++ b/grep.c
>> @@ -406,6 +406,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>>  	}
>>
>>  	pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
>> +#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
>> +	/*
>> +	 * Work around the bug fixed by e0c6029 (Fix inifinite loop when a
>
> Better to quote this as PhilipHazel/pcre2@e0c6029 or something, i.e. to
> indicate that it's not git.git's commit.

The context should suffice for a human reader to understand that it's a
PCRE2 about a bug fix, but I can see some program turning hex strings
into repo-local links getting confused.  Is there a standard cross-repo
citation format?  Using a Github link might be the most practical way
for the near term, I suspect.

>
>> +	 * single byte newline is searched in JIT., 2020-05-29).
>> +	 */
>> +	if (options & PCRE2_MATCH_INVALID_UTF)
>> +		p->pcre2_jit_on = 0;
>
> It seems rather heavy-hande, but I can't think of a better way to deal
> with this, i.e. if we selectively use JIT on older versions, surely we
> run into the match-bytes-but-want-chars bug you were fixing.
>
>> +#endif
>>  	if (p->pcre2_jit_on) {
>>  		jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
>>  		if (jitret)
>
SZEDER Gábor Feb. 6, 2022, 10:08 a.m. UTC | #7
On Sat, Feb 05, 2022 at 06:00:02PM +0100, René Scharfe wrote:
> >> --- >8 ---
> >> Subject: [PATCH] grep: disable JIT on PCRE2 before 10.36 to avoid endless loop
> >>
> >> Commit e0c6029 (Fix inifinite loop when a single byte newline is
> >> searched in JIT., 2020-05-29) of PCRE2 adds the following point to its
> >> ChangeLog for version 10.36:
> >>
> >>   2. Fix inifinite loop when a single byte newline is searched in JIT when
> >>   invalid utf8 mode is enabled.
> >>
> >> Avoid that bug on older versions (which are still reportedly found in
> >> the wild) by disabling the JIT when handling UTF-8.
> >>
> >> Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
> >> Signed-off-by: René Scharfe <l.s.r@web.de>
> >> ---
> >> Not sure how to test it.  Killing git grep after a second or so seems a
> >> bit clumsy.  timeout(1) from GNU coreutils at least allows doing that
> >> from the shell, but it's not a standard tool.  Perhaps we need a new
> >> test helper for that purpose?
> 
> https://mywiki.wooledge.org/BashFAQ/068 offers a Perl-based Shell
> function or aborting a program if it takes too long:
> 
>    doalarm() { perl -e 'alarm shift; exec @ARGV' -- "$@"; }
> 
> It doesn't waste time when the program finishes faster and seems to work
> fine with git grep.
> 
> I can't actually test the effectiveness of the patch because PCRE2's
> JIT doesn't work on my development machine at all (Apple M1), as I just
> discovered. :-/  While we know that disabling JIT helps, we didn't
> actually determine, yet, if e0c6029 (Fix inifinite loop when a single
> byte newline is searched in JIT., 2020-05-29) really fixes the "^\s"
> bug.
> 
> So I have to abandon this patch, unfortunately.  Any volunteer to pick
> it up?

FWIW, I built Git with your patch and USE_LIBPCRE2=YesPlease and run
the test suite, and it succeeded.  Though I can't judge how much is
this actually worth.
Ævar Arnfjörð Bjarmason Feb. 12, 2022, 8:46 p.m. UTC | #8
On Sat, Feb 05 2022, René Scharfe wrote:

> Am 31.01.22 um 22:01 schrieb Ævar Arnfjörð Bjarmason:
>>
>> On Sun, Jan 30 2022, René Scharfe wrote:
>>
>>> Am 30.01.22 um 10:04 schrieb SZEDER Gábor:
>>>> On Sun, Jan 30, 2022 at 08:55:02AM +0100, René Scharfe wrote:
>>>>> e0c6029 (Fix inifinite loop when a single byte newline is searched in
>>>>> JIT., 2020-05-29) [1] sounds like it might have fixed it.  It's part of
>>>>> version 10.36.
>>>>
>>>> I saw this hang on two Ubuntu 20.04 based boxes, which predate that
>>>> fix you mention only by a month or two, and apparently the almost two
>>>> years since then was not enough for this fix to trickle down into
>>>> updated 20.04 pcre packages, because:
>>>>
>>>>> Do you still get the error when you disable JIT, i.e. when you use the
>>>>> pattern "(*NO_JIT)^\s" instead?
>>>>
>>>> No, with this pattern it works as expected.
>>>>
>>>> So is there a more convenient way to disable PCRE JIT in Git?  FWIW,
>>>> (non-git) 'grep -P' works with the same patterns.
>>>
>>> I don't know a better way.  We could do it automatically, though:
>>>
>>> --- >8 ---
>>> Subject: [PATCH] grep: disable JIT on PCRE2 before 10.36 to avoid endless loop
>>>
>>> Commit e0c6029 (Fix inifinite loop when a single byte newline is
>>> searched in JIT., 2020-05-29) of PCRE2 adds the following point to its
>>> ChangeLog for version 10.36:
>>>
>>>   2. Fix inifinite loop when a single byte newline is searched in JIT when
>>>   invalid utf8 mode is enabled.
>>>
>>> Avoid that bug on older versions (which are still reportedly found in
>>> the wild) by disabling the JIT when handling UTF-8.
>>>
>>> Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
>>> Signed-off-by: René Scharfe <l.s.r@web.de>
>>> ---
>>> Not sure how to test it.  Killing git grep after a second or so seems a
>>> bit clumsy.  timeout(1) from GNU coreutils at least allows doing that
>>> from the shell, but it's not a standard tool.  Perhaps we need a new
>>> test helper for that purpose?
>
> https://mywiki.wooledge.org/BashFAQ/068 offers a Perl-based Shell
> function or aborting a program if it takes too long:
>
>    doalarm() { perl -e 'alarm shift; exec @ARGV' -- "$@"; }
>
> It doesn't waste time when the program finishes faster and seems to work
> fine with git grep.
>
> I can't actually test the effectiveness of the patch because PCRE2's
> JIT doesn't work on my development machine at all (Apple M1), as I just
> discovered. :-/  While we know that disabling JIT helps, we didn't
> actually determine, yet, if e0c6029 (Fix inifinite loop when a single
> byte newline is searched in JIT., 2020-05-29) really fixes the "^\s"
> bug.
>
> So I have to abandon this patch, unfortunately.  Any volunteer to pick
> it up?

We can test it in CI, and have a proposed patch from Hamza Mahfooz to do
so. See
https://lore.kernel.org/git/211220.865yrjszg4.gmgdl@evledraar.gmail.com/

There's been some minor changes to the main.yml since then, but I think
you should be able to just pick that patch up, adjust it, apply whatever
changes you want to test on top, and push it to github.
René Scharfe Feb. 17, 2022, 9:14 p.m. UTC | #9
Am 12.02.22 um 21:46 schrieb Ævar Arnfjörð Bjarmason:
>
> On Sat, Feb 05 2022, René Scharfe wrote:
>
>>
>> I can't actually test the effectiveness of the patch because PCRE2's
>> JIT doesn't work on my development machine at all (Apple M1), as I just
>> discovered. :-/  While we know that disabling JIT helps, we didn't
>> actually determine, yet, if e0c6029 (Fix inifinite loop when a single
>> byte newline is searched in JIT., 2020-05-29) really fixes the "^\s"
>> bug.
>>
>> So I have to abandon this patch, unfortunately.  Any volunteer to pick
>> it up?
>
> We can test it in CI, and have a proposed patch from Hamza Mahfooz to do
> so. See
> https://lore.kernel.org/git/211220.865yrjszg4.gmgdl@evledraar.gmail.com/
>
> There's been some minor changes to the main.yml since then, but I think
> you should be able to just pick that patch up, adjust it, apply whatever
> changes you want to test on top, and push it to github.

Good idea!  Except the "just" is not justified, I feel.  I learned that

  - t7810 fails with PCRE2 built with --disable-unicode because it uses
    \p{...} unconditionally, and that's not supported without Unicode
    support -- no idea how to detect that and skip those tests except
    by trying and maybe looking for the note that "this version of PCRE2
    does not have support for \P, \p, or \X", which somehow feels iffy,

  - PCRE2 10.35 doesn't build on Ubuntu x64 without adding -mshstk to
    CFLAGS, and that's the version I wanted to test,

  - many of the Unicode related tests require Islandic language support,
    and "sudo apt-get -y install `check-language-support -l is`"
    installs it,

  - the condition for our workaround for bug 2642 is reversed,

  - with that fixed I can't trigger the endless loop.

So perhaps that's the only fix we need here -- or perhaps I got
confused by the multitude of options.

--- >8 ---
Subject: [PATCH] grep: fix triggering PCRE2_NO_START_OPTIMIZE workaround

PCRE2 bug 2642 was fixed in version 10.36.  Our 95ca1f987e (grep/pcre2:
better support invalid UTF-8 haystacks, 2021-01-24) worked around it on
older versions by setting the flag PCRE2_NO_START_OPTIMIZE.  797c359978
(grep/pcre2: use compile-time PCREv2 version test, 2021-02-18) switched
it around to set the flag on 10.36 and higher instead, while it claimed
to use "the same test done at compile-time".

Switch the condition back to apply the workaround on PCRE2 versions
_before_ 10.36.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 grep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grep.c b/grep.c
index 5bec7fd793..ef34d764f9 100644
--- a/grep.c
+++ b/grep.c
@@ -386,7 +386,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 	if (!opt->ignore_locale && is_utf8_locale() && !literal)
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);

-#ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
+#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
 	/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */
 	if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
 		options |= PCRE2_NO_START_OPTIMIZE;
--
2.35.1
Junio C Hamano Feb. 17, 2022, 10:56 p.m. UTC | #10
René Scharfe <l.s.r@web.de> writes:

> Subject: [PATCH] grep: fix triggering PCRE2_NO_START_OPTIMIZE workaround
>
> PCRE2 bug 2642 was fixed in version 10.36.  Our 95ca1f987e (grep/pcre2:
> better support invalid UTF-8 haystacks, 2021-01-24) worked around it on
> older versions by setting the flag PCRE2_NO_START_OPTIMIZE.  797c359978
> (grep/pcre2: use compile-time PCREv2 version test, 2021-02-18) switched
> it around to set the flag on 10.36 and higher instead, while it claimed
> to use "the same test done at compile-time".
>
> Switch the condition back to apply the workaround on PCRE2 versions
> _before_ 10.36.
>
> Signed-off-by: René Scharfe <l.s.r@web.de>
> ---
>  grep.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/grep.c b/grep.c
> index 5bec7fd793..ef34d764f9 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -386,7 +386,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  	if (!opt->ignore_locale && is_utf8_locale() && !literal)
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>
> -#ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
> +#ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER
>  	/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */

Oh, that's embarrassing.  The #ifdef and the comment sit next to
each other and they make contradicting statement.

>  	if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
>  		options |= PCRE2_NO_START_OPTIMIZE;
> --
> 2.35.1
diff mbox series

Patch

diff --git a/grep.c b/grep.c
index fe847a0111..5badb6d851 100644
--- a/grep.c
+++ b/grep.c
@@ -382,7 +382,7 @@  static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	if (!opt->ignore_locale && is_utf8_locale() &&
 	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);

diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea68..ca3f24f807 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -123,4 +123,10 @@  test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: gr
 	test_cmp invalid-0xe5 actual
 '

+test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-literal ASCII from UTF-8' '
+	git grep --perl-regexp -h -o -e ll. file >actual &&
+	echo "lló" >expected &&
+	test_cmp expected actual
+'
+
 test_done