diff mbox series

[RFC/PATCH,7/7] grep: use PCRE v2 for optimized fixed-string search

Message ID 20190626000329.32475-8-avarab@gmail.com (mailing list archive)
State New, archived
Headers show
Series grep: move from kwset to optional PCRE v2 | expand

Commit Message

Ævar Arnfjörð Bjarmason June 26, 2019, 12:03 a.m. UTC
Bring back optimized fixed-string search for "grep", this time with
PCRE v2 as an optional backend. As noted in [1] with kwset we were
slower than PCRE v1 and v2 JIT with the kwset backend, so that
optimization was counterproductive.

This brings back the optimization for "-F", without changing the
semantics of "\0" in patterns. As seen in previous commits in this
series we could support it now, but I'd rather just leave that
edge-case aside so the tests don't need to do one thing or the other
depending on what --fixed-strings backend we're using.

I could also support the v1 backend here, but that would make the code
more complex, and I'd rather aim for simplicity here and in future
changes to the diffcore. We're not going to have someone who
absolutely must have faster search, but for whom building PCRE v2
isn't acceptable.

1. https://public-inbox.org/git/87v9x793qi.fsf@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 grep.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

Comments

Johannes Schindelin June 26, 2019, 2:13 p.m. UTC | #1
Hi Ævar,

On Wed, 26 Jun 2019, Ævar Arnfjörð Bjarmason wrote:

> Bring back optimized fixed-string search for "grep", this time with
> PCRE v2 as an optional backend. As noted in [1] with kwset we were
> slower than PCRE v1 and v2 JIT with the kwset backend, so that
> optimization was counterproductive.
>
> This brings back the optimization for "-F", without changing the
> semantics of "\0" in patterns. As seen in previous commits in this
> series we could support it now, but I'd rather just leave that
> edge-case aside so the tests don't need to do one thing or the other
> depending on what --fixed-strings backend we're using.

Nice. Very, very nice.

> I could also support the v1 backend here, but that would make the code
> more complex, and I'd rather aim for simplicity here and in future
> changes to the diffcore. We're not going to have someone who
> absolutely must have faster search, but for whom building PCRE v2
> isn't acceptable.

I could not agree more.

> diff --git a/grep.c b/grep.c
> index 4716217837..6b75d5be68 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -356,6 +356,18 @@ static NORETURN void compile_regexp_failed(const struct grep_pat *p,
>  	die("%s'%s': %s", where, p->pattern, error);
>  }
>
> +static int is_fixed(const char *s, size_t len)
> +{
> +	size_t i;
> +
> +	for (i = 0; i < len; i++) {
> +		if (is_regex_special(s[i]))
> +			return 0;
> +	}
> +
> +	return 1;
> +}
> +
>  #ifdef USE_LIBPCRE1
>  static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
>  {
> @@ -602,7 +614,6 @@ static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
>  static void free_pcre2_pattern(struct grep_pat *p)
>  {
>  }
> -#endif /* !USE_LIBPCRE2 */

Huh? Removing an `#endif` without removing the corresponding `#if`?

... but...

>  static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
>  {
> @@ -623,11 +634,13 @@ static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
>  		compile_regexp_failed(p, errbuf);
>  	}
>  }
> +#endif /* !USE_LIBPCRE2 */

Ah hah!

If we would not have plenty of exercise for the PCRE2 build options, I
would be worried. But AFAICT the CI build includes this all the time, so
we're fine.

>  static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
>  {
>  	int err;
>  	int regflags = REG_NEWLINE;
> +	int pat_is_fixed;
>
>  	p->word_regexp = opt->word_regexp;
>  	p->ignore_case = opt->ignore_case;
> @@ -636,8 +649,38 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
>  	if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2)
>  		die(_("given pattern contains NULL byte (via -f <file>). This is only supported with -P under PCRE v2"));
>
> -	if (opt->fixed) {
> +	pat_is_fixed = is_fixed(p->pattern, p->patternlen);
> +	if (opt->fixed || pat_is_fixed) {
> +#ifdef USE_LIBPCRE2
> +		opt->pcre2 = 1;
> +		if (pat_is_fixed) {
> +			compile_pcre2_pattern(p, opt);
> +		} else {
> +			/*
> +			 * E.g. t7811-grep-open.sh relies on the
> +			 * pattern being restored, and unfortunately
> +			 * there's no PCRE compile flag for "this is
> +			 * fixed", so we need to munge it to
> +			 * "\Q<pat>\E".
> +			 */
> +			char *old_pattern = p->pattern;
> +			size_t old_patternlen = p->patternlen;
> +			struct strbuf sb = STRBUF_INIT;
> +
> +			strbuf_add(&sb, "\\Q", 2);
> +			strbuf_add(&sb, p->pattern, p->patternlen);
> +			strbuf_add(&sb, "\\E", 2);
> +
> +			p->pattern = sb.buf;
> +			p->patternlen = sb.len;
> +			compile_pcre2_pattern(p, opt);
> +			p->pattern = old_pattern;
> +			p->patternlen = old_patternlen;
> +			strbuf_release(&sb);
> +		}
> +#else
>  		compile_fixed_regexp(p, opt);
> +#endif

It might be a bit easier to read if the shorter clause came first.

Other than that: what a nice read. I should save reviewing all your patch
series for just-before-bed time.

Thanks,
Dscho

>  		return;
>  	}
>
> --
> 2.22.0.455.g172b71a6c5
>
>
Junio C Hamano June 26, 2019, 6:45 p.m. UTC | #2
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> ...
> Ah hah!
>
> If we would not have plenty of exercise for the PCRE2 build options, I
> would be worried. But AFAICT the CI build includes this all the time, so
> we're fine.

Well, I'd feel safer if it were not "all the time", i.e. we know we
are testing both sides of the coin.
Johannes Schindelin June 27, 2019, 9:31 a.m. UTC | #3
Hi Junio,

On Wed, 26 Jun 2019, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> > ...
> > Ah hah!
> >
> > If we would not have plenty of exercise for the PCRE2 build options, I
> > would be worried. But AFAICT the CI build includes this all the time, so
> > we're fine.
>
> Well, I'd feel safer if it were not "all the time", i.e. we know we
> are testing both sides of the coin.

AFAIR at least the Linux32 job is built without PCRE2 by default. I might
be wrong on that, though...

In any case, the upcoming MSVC support for our Azure Pipeline _will_ build
without PCRE2, then we will have that axis covered.

Ciao,
Dscho
Johannes Schindelin June 27, 2019, 6:45 p.m. UTC | #4
Hi,

On Thu, 27 Jun 2019, Johannes Schindelin wrote:

> On Wed, 26 Jun 2019, Junio C Hamano wrote:
>
> > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> >
> > > ...
> > > Ah hah!
> > >
> > > If we would not have plenty of exercise for the PCRE2 build options, I
> > > would be worried. But AFAICT the CI build includes this all the time, so
> > > we're fine.
> >
> > Well, I'd feel safer if it were not "all the time", i.e. we know we
> > are testing both sides of the coin.
>
> AFAIR at least the Linux32 job is built without PCRE2 by default. I might
> be wrong on that, though...

Actually, it seems that _all_ of the Linux builds in our Azure Pipeline
compile without pcre2. It seems you have to pass `USE_LIBPCRE2=1` to
`make`, and we do not do that in `ci/run-build-and-tests.sh` nor in
`azure-pipelines.yml`. I do not even see that for the macOS builds.

So we got PCRE2 covered only in the Windows build, it seems.

Ciao,
Dscho
Junio C Hamano June 27, 2019, 7:06 p.m. UTC | #5
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

>> > > If we would not have plenty of exercise for the PCRE2 build options, I
>> > > would be worried. But AFAICT the CI build includes this all the time, so
>> > > we're fine.
>> >
>> > Well, I'd feel safer if it were not "all the time", i.e. we know we
>> > are testing both sides of the coin.
>>
>> AFAIR at least the Linux32 job is built without PCRE2 by default. I might
>> be wrong on that, though...
>
> Actually, it seems that _all_ of the Linux builds in our Azure Pipeline
> compile without pcre2. It seems you have to pass `USE_LIBPCRE2=1` to
> `make`, and we do not do that in `ci/run-build-and-tests.sh` nor in
> `azure-pipelines.yml`. I do not even see that for the macOS builds.
>
> So we got PCRE2 covered only in the Windows build, it seems.

OK, it sounds like we have sufficient coverage on both fronts.
Good.
Johannes Schindelin June 28, 2019, 10:56 a.m. UTC | #6
Hi Junio,

On Thu, 27 Jun 2019, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> >> > > If we would not have plenty of exercise for the PCRE2 build
> >> > > options, I would be worried. But AFAICT the CI build includes
> >> > > this all the time, so we're fine.
> >> >
> >> > Well, I'd feel safer if it were not "all the time", i.e. we know we
> >> > are testing both sides of the coin.
> >>
> >> AFAIR at least the Linux32 job is built without PCRE2 by default. I
> >> might be wrong on that, though...
> >
> > Actually, it seems that _all_ of the Linux builds in our Azure Pipeline
> > compile without pcre2. It seems you have to pass `USE_LIBPCRE2=1` to
> > `make`, and we do not do that in `ci/run-build-and-tests.sh` nor in
> > `azure-pipelines.yml`. I do not even see that for the macOS builds.
> >
> > So we got PCRE2 covered only in the Windows build, it seems.
>
> OK, it sounds like we have sufficient coverage on both fronts.

Maybe not. With the bug I uncovered that is _only_ triggering an error
message if the PCRE2 in question does not support JIT'ed operations, I am
a bit wary now.

But I cannot really see a reasonable way to add those axes to the CI
builds.

Ciao,
Dscho
diff mbox series

Patch

diff --git a/grep.c b/grep.c
index 4716217837..6b75d5be68 100644
--- a/grep.c
+++ b/grep.c
@@ -356,6 +356,18 @@  static NORETURN void compile_regexp_failed(const struct grep_pat *p,
 	die("%s'%s': %s", where, p->pattern, error);
 }
 
+static int is_fixed(const char *s, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (is_regex_special(s[i]))
+			return 0;
+	}
+
+	return 1;
+}
+
 #ifdef USE_LIBPCRE1
 static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
 {
@@ -602,7 +614,6 @@  static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
 static void free_pcre2_pattern(struct grep_pat *p)
 {
 }
-#endif /* !USE_LIBPCRE2 */
 
 static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
 {
@@ -623,11 +634,13 @@  static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
 		compile_regexp_failed(p, errbuf);
 	}
 }
+#endif /* !USE_LIBPCRE2 */
 
 static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 {
 	int err;
 	int regflags = REG_NEWLINE;
+	int pat_is_fixed;
 
 	p->word_regexp = opt->word_regexp;
 	p->ignore_case = opt->ignore_case;
@@ -636,8 +649,38 @@  static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 	if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2)
 		die(_("given pattern contains NULL byte (via -f <file>). This is only supported with -P under PCRE v2"));
 
-	if (opt->fixed) {
+	pat_is_fixed = is_fixed(p->pattern, p->patternlen);
+	if (opt->fixed || pat_is_fixed) {
+#ifdef USE_LIBPCRE2
+		opt->pcre2 = 1;
+		if (pat_is_fixed) {
+			compile_pcre2_pattern(p, opt);
+		} else {
+			/*
+			 * E.g. t7811-grep-open.sh relies on the
+			 * pattern being restored, and unfortunately
+			 * there's no PCRE compile flag for "this is
+			 * fixed", so we need to munge it to
+			 * "\Q<pat>\E".
+			 */
+			char *old_pattern = p->pattern;
+			size_t old_patternlen = p->patternlen;
+			struct strbuf sb = STRBUF_INIT;
+
+			strbuf_add(&sb, "\\Q", 2);
+			strbuf_add(&sb, p->pattern, p->patternlen);
+			strbuf_add(&sb, "\\E", 2);
+
+			p->pattern = sb.buf;
+			p->patternlen = sb.len;
+			compile_pcre2_pattern(p, opt);
+			p->pattern = old_pattern;
+			p->patternlen = old_patternlen;
+			strbuf_release(&sb);
+		}
+#else
 		compile_fixed_regexp(p, opt);
+#endif
 		return;
 	}