diff mbox series

[v2,8/8] grep: optimistically use PCRE2_MATCH_INVALID_UTF

Message ID 20190726150818.6373-9-avarab@gmail.com (mailing list archive)
State New, archived
Headers show
Series grep: PCRE JIT fixes | expand

Commit Message

Ævar Arnfjörð Bjarmason July 26, 2019, 3:08 p.m. UTC
As discussed in the "grep: stess test PCRE v2 on invalid UTF-8 data"
commit leading up to this one there's a regression in
b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string search",
2019-07-01) when matching UTF-8 data.

This ultimately isn't straightforward to just "fix", because the kwset
backend was so dumb about icase matching that we'd skip it entirely on
non-ASCII. See the code removed in 48de2a768c ("grep: remove the kwset
optimization", 2019-07-01).

Just going back to the C library for those isn't ideal, since it's
likely to be even dumber about these mixed-encoding cases.

So let's support this "properly" using the PCRE2_MATCH_INVALID_UTF
flag. This is new code that's not in any released PCRE v2 version, so
we might need a fix that emulates it somehow. I figure that the case
that with the non-icase bug out of the way this is obscure enough to
tell people "upgrade your PCRE v2 too!'. It'll likely be released by
the time we release the git version this commit is part of.

We can't just use PCRE2_NO_UTF_CHECK instead for the reasons discussed
in [1].

1. https://public-inbox.org/git/87lfwn70nb.fsf@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Makefile                        |  1 +
 grep.c                          |  2 +-
 grep.h                          |  3 +++
 t/helper/test-pcre2-config.c    | 12 ++++++++++++
 t/helper/test-tool.c            |  1 +
 t/helper/test-tool.h            |  1 +
 t/t7812-grep-icase-non-ascii.sh | 13 ++++++++++++-
 7 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 t/helper/test-pcre2-config.c

Comments

Junio C Hamano July 26, 2019, 9:07 p.m. UTC | #1
Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> diff --git a/Makefile b/Makefile
> index bd246f2989..dd38d5e527 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -726,6 +726,7 @@ TEST_BUILTINS_OBJS += test-oidmap.o
>  TEST_BUILTINS_OBJS += test-online-cpus.o
>  TEST_BUILTINS_OBJS += test-parse-options.o
>  TEST_BUILTINS_OBJS += test-path-utils.o
> +TEST_BUILTINS_OBJS += test-pcre2-config.o

This won't even build with any released pcre version; shouldn't we
make it at least conditionally compiled code?  Specifically...

>  TEST_BUILTINS_OBJS += test-pkt-line.o
>  TEST_BUILTINS_OBJS += test-prio-queue.o
>  TEST_BUILTINS_OBJS += test-reach.o
> diff --git a/grep.c b/grep.c
> index c7c06ae08d..8b8b9efe12 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -474,7 +474,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  	}
>  	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> -		options |= PCRE2_UTF;
> +		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>  
>  	p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
>  					 p->patternlen, options, &error, &erroffset,
> diff --git a/grep.h b/grep.h
> index c0c71eb4a9..506f05b97b 100644
> --- a/grep.h
> +++ b/grep.h
> @@ -21,6 +21,9 @@ typedef int pcre_extra;
>  #ifdef USE_LIBPCRE2
>  #define PCRE2_CODE_UNIT_WIDTH 8
>  #include <pcre2.h>
> +#ifndef PCRE2_MATCH_INVALID_UTF
> +#define PCRE2_MATCH_INVALID_UTF 0
> +#endif

... unlike this piece of code ...

>  #else
>  typedef int pcre2_code;
>  typedef int pcre2_match_data;
> diff --git a/t/helper/test-pcre2-config.c b/t/helper/test-pcre2-config.c
> new file mode 100644
> index 0000000000..5258fdddba
> --- /dev/null
> +++ b/t/helper/test-pcre2-config.c
> @@ -0,0 +1,12 @@
> +#include "test-tool.h"
> +#include "cache.h"
> +#include "grep.h"
> +
> +int cmd__pcre2_config(int argc, const char **argv)
> +{
> +	if (argc == 2 && !strcmp(argv[1], "has-PCRE2_MATCH_INVALID_UTF")) {
> +		int value = PCRE2_MATCH_INVALID_UTF;

... this part does not have any fallback definition.
Ævar Arnfjörð Bjarmason July 26, 2019, 9:53 p.m. UTC | #2
On Fri, Jul 26 2019, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:
>
>> diff --git a/Makefile b/Makefile
>> index bd246f2989..dd38d5e527 100644
>> --- a/Makefile
>> +++ b/Makefile
>> @@ -726,6 +726,7 @@ TEST_BUILTINS_OBJS += test-oidmap.o
>>  TEST_BUILTINS_OBJS += test-online-cpus.o
>>  TEST_BUILTINS_OBJS += test-parse-options.o
>>  TEST_BUILTINS_OBJS += test-path-utils.o
>> +TEST_BUILTINS_OBJS += test-pcre2-config.o
>
> This won't even build with any released pcre version; shouldn't we
> make it at least conditionally compiled code?  Specifically...
>
>>  TEST_BUILTINS_OBJS += test-pkt-line.o
>>  TEST_BUILTINS_OBJS += test-prio-queue.o
>>  TEST_BUILTINS_OBJS += test-reach.o
>> diff --git a/grep.c b/grep.c
>> index c7c06ae08d..8b8b9efe12 100644
>> --- a/grep.c
>> +++ b/grep.c
>> @@ -474,7 +474,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>>  	}
>>  	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>> -		options |= PCRE2_UTF;
>> +		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>>
>>  	p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
>>  					 p->patternlen, options, &error, &erroffset,
>> diff --git a/grep.h b/grep.h
>> index c0c71eb4a9..506f05b97b 100644
>> --- a/grep.h
>> +++ b/grep.h
>> @@ -21,6 +21,9 @@ typedef int pcre_extra;
>>  #ifdef USE_LIBPCRE2
>>  #define PCRE2_CODE_UNIT_WIDTH 8
>>  #include <pcre2.h>
>> +#ifndef PCRE2_MATCH_INVALID_UTF
>> +#define PCRE2_MATCH_INVALID_UTF 0
>> +#endif
>
> ... unlike this piece of code ...
>
>>  #else
>>  typedef int pcre2_code;
>>  typedef int pcre2_match_data;
>> diff --git a/t/helper/test-pcre2-config.c b/t/helper/test-pcre2-config.c
>> new file mode 100644
>> index 0000000000..5258fdddba
>> --- /dev/null
>> +++ b/t/helper/test-pcre2-config.c
>> @@ -0,0 +1,12 @@
>> +#include "test-tool.h"
>> +#include "cache.h"
>> +#include "grep.h"
>> +
>> +int cmd__pcre2_config(int argc, const char **argv)
>> +{
>> +	if (argc == 2 && !strcmp(argv[1], "has-PCRE2_MATCH_INVALID_UTF")) {
>> +		int value = PCRE2_MATCH_INVALID_UTF;
>
> ... this part does not have any fallback definition.

It works because we include grep.h, which'll define
PCRE2_MATCH_INVALID_UTF=0 if pcre2.h doesn't give it to us. I've tested
this on PCRE versions with/without PCRE2_MATCH_INVALID_UTF and it works
& runs/skips the appropriate tests.
Ævar Arnfjörð Bjarmason July 26, 2019, 9:57 p.m. UTC | #3
On Fri, Jul 26 2019, Ævar Arnfjörð Bjarmason wrote:

> On Fri, Jul 26 2019, Junio C Hamano wrote:
>
>> Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:
>>
>>> diff --git a/Makefile b/Makefile
>>> index bd246f2989..dd38d5e527 100644
>>> --- a/Makefile
>>> +++ b/Makefile
>>> @@ -726,6 +726,7 @@ TEST_BUILTINS_OBJS += test-oidmap.o
>>>  TEST_BUILTINS_OBJS += test-online-cpus.o
>>>  TEST_BUILTINS_OBJS += test-parse-options.o
>>>  TEST_BUILTINS_OBJS += test-path-utils.o
>>> +TEST_BUILTINS_OBJS += test-pcre2-config.o
>>
>> This won't even build with any released pcre version; shouldn't we
>> make it at least conditionally compiled code?  Specifically...
>>
>>>  TEST_BUILTINS_OBJS += test-pkt-line.o
>>>  TEST_BUILTINS_OBJS += test-prio-queue.o
>>>  TEST_BUILTINS_OBJS += test-reach.o
>>> diff --git a/grep.c b/grep.c
>>> index c7c06ae08d..8b8b9efe12 100644
>>> --- a/grep.c
>>> +++ b/grep.c
>>> @@ -474,7 +474,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>>>  	}
>>>  	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>> -		options |= PCRE2_UTF;
>>> +		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>>>
>>>  	p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
>>>  					 p->patternlen, options, &error, &erroffset,
>>> diff --git a/grep.h b/grep.h
>>> index c0c71eb4a9..506f05b97b 100644
>>> --- a/grep.h
>>> +++ b/grep.h
>>> @@ -21,6 +21,9 @@ typedef int pcre_extra;
>>>  #ifdef USE_LIBPCRE2
>>>  #define PCRE2_CODE_UNIT_WIDTH 8
>>>  #include <pcre2.h>
>>> +#ifndef PCRE2_MATCH_INVALID_UTF
>>> +#define PCRE2_MATCH_INVALID_UTF 0
>>> +#endif
>>
>> ... unlike this piece of code ...
>>
>>>  #else
>>>  typedef int pcre2_code;
>>>  typedef int pcre2_match_data;
>>> diff --git a/t/helper/test-pcre2-config.c b/t/helper/test-pcre2-config.c
>>> new file mode 100644
>>> index 0000000000..5258fdddba
>>> --- /dev/null
>>> +++ b/t/helper/test-pcre2-config.c
>>> @@ -0,0 +1,12 @@
>>> +#include "test-tool.h"
>>> +#include "cache.h"
>>> +#include "grep.h"
>>> +
>>> +int cmd__pcre2_config(int argc, const char **argv)
>>> +{
>>> +	if (argc == 2 && !strcmp(argv[1], "has-PCRE2_MATCH_INVALID_UTF")) {
>>> +		int value = PCRE2_MATCH_INVALID_UTF;
>>
>> ... this part does not have any fallback definition.
>
> It works because we include grep.h, which'll define
> PCRE2_MATCH_INVALID_UTF=0 if pcre2.h doesn't give it to us. I've tested
> this on PCRE versions with/without PCRE2_MATCH_INVALID_UTF and it works
> & runs/skips the appropriate tests.

Ah, I spoke too soon, of course that's all guarded by "are we using PCRE
v2 in general?". I'll fix it...
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index bd246f2989..dd38d5e527 100644
--- a/Makefile
+++ b/Makefile
@@ -726,6 +726,7 @@  TEST_BUILTINS_OBJS += test-oidmap.o
 TEST_BUILTINS_OBJS += test-online-cpus.o
 TEST_BUILTINS_OBJS += test-parse-options.o
 TEST_BUILTINS_OBJS += test-path-utils.o
+TEST_BUILTINS_OBJS += test-pcre2-config.o
 TEST_BUILTINS_OBJS += test-pkt-line.o
 TEST_BUILTINS_OBJS += test-prio-queue.o
 TEST_BUILTINS_OBJS += test-reach.o
diff --git a/grep.c b/grep.c
index c7c06ae08d..8b8b9efe12 100644
--- a/grep.c
+++ b/grep.c
@@ -474,7 +474,7 @@  static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 	}
 	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
 	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
-		options |= PCRE2_UTF;
+		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 	p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
 					 p->patternlen, options, &error, &erroffset,
diff --git a/grep.h b/grep.h
index c0c71eb4a9..506f05b97b 100644
--- a/grep.h
+++ b/grep.h
@@ -21,6 +21,9 @@  typedef int pcre_extra;
 #ifdef USE_LIBPCRE2
 #define PCRE2_CODE_UNIT_WIDTH 8
 #include <pcre2.h>
+#ifndef PCRE2_MATCH_INVALID_UTF
+#define PCRE2_MATCH_INVALID_UTF 0
+#endif
 #else
 typedef int pcre2_code;
 typedef int pcre2_match_data;
diff --git a/t/helper/test-pcre2-config.c b/t/helper/test-pcre2-config.c
new file mode 100644
index 0000000000..5258fdddba
--- /dev/null
+++ b/t/helper/test-pcre2-config.c
@@ -0,0 +1,12 @@ 
+#include "test-tool.h"
+#include "cache.h"
+#include "grep.h"
+
+int cmd__pcre2_config(int argc, const char **argv)
+{
+	if (argc == 2 && !strcmp(argv[1], "has-PCRE2_MATCH_INVALID_UTF")) {
+		int value = PCRE2_MATCH_INVALID_UTF;
+		return !value;
+	}
+	return 1;
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index ce7e89028c..e022ce0e48 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -40,6 +40,7 @@  static struct test_cmd cmds[] = {
 	{ "online-cpus", cmd__online_cpus },
 	{ "parse-options", cmd__parse_options },
 	{ "path-utils", cmd__path_utils },
+	{ "pcre2-config", cmd__pcre2_config },
 	{ "pkt-line", cmd__pkt_line },
 	{ "prio-queue", cmd__prio_queue },
 	{ "reach", cmd__reach },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index f805bb39ae..acd8af2a9d 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -30,6 +30,7 @@  int cmd__oidmap(int argc, const char **argv);
 int cmd__online_cpus(int argc, const char **argv);
 int cmd__parse_options(int argc, const char **argv);
 int cmd__path_utils(int argc, const char **argv);
+int cmd__pcre2_config(int argc, const char **argv);
 int cmd__pkt_line(int argc, const char **argv);
 int cmd__prio_queue(int argc, const char **argv);
 int cmd__reach(int argc, const char **argv);
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 531eb59d57..848d46e4f9 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -74,11 +74,22 @@  test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invali
 	test_cmp expected actual
 '
 
-test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
+test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
+	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
+'
+
+test_expect_success GETTEXT_LOCALE,LIBPCRE2,!PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
 	test_might_fail git grep -hi "Æ" invalid-0x80 >actual &&
 	test_cmp expected actual &&
 	test_must_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 &&
 	test_cmp expected actual
 '
 
+test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
+	git grep -hi "Æ" invalid-0x80 >actual &&
+	test_cmp expected actual &&
+	git grep -hi "(*NO_JIT)Æ" invalid-0x80 &&
+	test_cmp expected actual
+'
+
 test_done