[RFC,v3,2/3] grep: make PCRE2 aware of custom allocator
diff mbox series

Message ID 20190806163658.66932-3-carenas@gmail.com
State New
Headers show
Series
  • grep: no leaks or crashes (windows testing needed)
Related show

Commit Message

Carlo Marcelo Arenas Belón Aug. 6, 2019, 4:36 p.m. UTC
94da9193a6 (grep: add support for PCRE v2, 2017-06-01) didn't include
a way to override the system allocator, and so it is incompatible with
USE_NED_ALLOCATOR.  The problem was made visible when an attempt to
avoid a leak in a data structure that is created by the library was
passed to NED's free for disposal triggering a segfault in Windows.

PCRE2 requires the use of a general context to override the allocator
and therefore, there is a lot more code needed than in PCRE1, including
a couple of wrapper functions.

Extend the grep API with a "destructor" that could be called to cleanup
any objects that were created and used globally.

Update builtin/grep to use that new API, but any other future
users should make sure to have matching grep_init/grep_destroy calls
if they are using the pattern matching functionality (currently only
relevant when using both NED and PCRE2)

Move some of the logic that was before done per thread (in the workers)
into an earlier phase to avoid degrading performance, but as the use
of PCRE2 with NED is better understood it is expected more of its
functions will be instructed to use the custom allocator as well as
was done in the original code[1] this work was based on.

[1] https://public-inbox.org/git/3397e6797f872aedd18c6d795f4976e1c579514b.1565005867.git.gitgitgadget@gmail.com/

Reported-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
 builtin/grep.c |  1 +
 grep.c         | 36 +++++++++++++++++++++++++++++++++++-
 grep.h         |  1 +
 3 files changed, 37 insertions(+), 1 deletion(-)

Comments

René Scharfe Aug. 7, 2019, 5:38 a.m. UTC | #1
Am 06.08.19 um 18:36 schrieb Carlo Marcelo Arenas Belón:
> 94da9193a6 (grep: add support for PCRE v2, 2017-06-01) didn't include
> a way to override the system allocator, and so it is incompatible with
> USE_NED_ALLOCATOR.  The problem was made visible when an attempt to
> avoid a leak in a data structure that is created by the library was
> passed to NED's free for disposal triggering a segfault in Windows.
>
> PCRE2 requires the use of a general context to override the allocator
> and therefore, there is a lot more code needed than in PCRE1, including
> a couple of wrapper functions.
>
> Extend the grep API with a "destructor" that could be called to cleanup
> any objects that were created and used globally.
>
> Update builtin/grep to use that new API, but any other future
> users should make sure to have matching grep_init/grep_destroy calls
> if they are using the pattern matching functionality (currently only
> relevant when using both NED and PCRE2)
>
> Move some of the logic that was before done per thread (in the workers)
> into an earlier phase to avoid degrading performance

Which logic is moved?  In the patch I basically only see additions.

>, but as the use
> of PCRE2 with NED is better understood it is expected more of its
> functions will be instructed to use the custom allocator as well as
> was done in the original code[1] this work was based on.
>
> [1] https://public-inbox.org/git/3397e6797f872aedd18c6d795f4976e1c579514b.1565005867.git.gitgitgadget@gmail.com/

I'm not sure I understand that part.  Do you mean there are gaps of
knowledge about nedmalloc and/or PCRE2 and/or their interaction?  And
someone is working on closing those gaps, and is going to submit more
patches in the process?

Your patch uses a custom global context only if USE_NED_ALLOCATOR is
defined, while [1] does it unconditionally.  The latter is easier to
debug and requires less preprocessor directives.  What's the upside
of your approach?

>
> Reported-by: Johannes Schindelin <johannes.schindelin@gmx.de>
> Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
> ---
>  builtin/grep.c |  1 +
>  grep.c         | 36 +++++++++++++++++++++++++++++++++++-
>  grep.h         |  1 +
>  3 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/builtin/grep.c b/builtin/grep.c
> index 560051784e..e49c20df60 100644
> --- a/builtin/grep.c
> +++ b/builtin/grep.c
> @@ -1145,5 +1145,6 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
>  		run_pager(&opt, prefix);
>  	clear_pathspec(&pathspec);
>  	free_grep_patterns(&opt);
> +	grep_destroy();
>  	return !hit;
>  }
> diff --git a/grep.c b/grep.c
> index 0154998695..3e5bdf94a6 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -16,6 +16,22 @@ static int grep_source_is_binary(struct grep_source *gs,
>
>  static struct grep_opt grep_defaults;
>
> +#ifdef USE_LIBPCRE2
> +static pcre2_general_context *pcre2_global_context;
> +
> +#ifdef USE_NED_ALLOCATOR
> +static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
> +{
> +	return malloc(size);

Should this be xmalloc() to get consistent out-of-memory handling?

> +}
> +
> +static void pcre2_free(void *pointer, MAYBE_UNUSED void *memory_data)
> +{
> +	return free(pointer);
> +}
> +#endif
> +#endif
> +
>  static const char *color_grep_slots[] = {
>  	[GREP_COLOR_CONTEXT]	    = "context",
>  	[GREP_COLOR_FILENAME]	    = "filename",
> @@ -153,6 +169,7 @@ int grep_config(const char *var, const char *value, void *cb)
>   *
>   * If using PCRE make sure that the library is configured
>   * to use the right allocator (ex: NED)
> + * if any object is created it should be cleaned up in grep_destroy()
>   */
>  void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix)
>  {
> @@ -188,6 +205,13 @@ void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix
>  		color_set(opt->colors[i], def->colors[i]);
>  }
>
> +void grep_destroy(void)
> +{
> +#ifdef USE_LIBPCRE2
> +	pcre2_general_context_free(pcre2_global_context);
> +#endif
> +}
> +
>  static void grep_set_pattern_type_option(enum grep_pattern_type pattern_type, struct grep_opt *opt)
>  {
>  	/*
> @@ -319,6 +343,11 @@ void append_header_grep_pattern(struct grep_opt *opt,
>  void append_grep_pattern(struct grep_opt *opt, const char *pat,
>  			 const char *origin, int no, enum grep_pat_token t)
>  {
> +#if defined(USE_LIBPCRE2) && defined(USE_NED_ALLOCATOR)
> +	if (!pcre2_global_context && opt->ignore_case && has_non_ascii(pat))
> +		pcre2_global_context = pcre2_general_context_create(
> +					pcre2_malloc, pcre2_free, NULL);
> +#endif
>  	append_grep_pat(opt, pat, strlen(pat), origin, no, t);
>  }
>
> @@ -507,9 +536,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>
>  	p->pcre2_compile_context = NULL;
>
> +	/* pcre2_global_context is initialized in append_grep_pattern */
>  	if (opt->ignore_case) {
>  		if (has_non_ascii(p->pattern)) {
> -			character_tables = pcre2_maketables(NULL);
> +#ifdef USE_NED_ALLOCATOR
> +			if (!pcre2_global_context)
> +				BUG("pcre2_global_context uninitialized");

[1] initializes on demand.  Why not do that?  To avoid race conditions
that would lead to occasional double-allocation of the global context?

> +#endif
> +			character_tables = pcre2_maketables(pcre2_global_context);
>  			p->pcre2_compile_context = pcre2_compile_context_create(NULL);

Don't you want to pass pcre2_global_context here as well?  And [1] even
uses it in some more places.

Oh, that's the "expected more" when "better understood" part from the
commit message, right?

Basically I'd expect the custom global context to be used for all PCRE2
allocations; I can't think of a reason for mixing allocators (e.g.
system malloc for PCRE2 regexes and nedmalloc for everything else).

>  			pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
>  		}
> diff --git a/grep.h b/grep.h
> index 1875880f37..526c2db9ef 100644
> --- a/grep.h
> +++ b/grep.h
> @@ -189,6 +189,7 @@ struct grep_opt {
>  void init_grep_defaults(struct repository *);
>  int grep_config(const char *var, const char *value, void *);
>  void grep_init(struct grep_opt *, struct repository *repo, const char *prefix);
> +void grep_destroy(void);
>  void grep_commit_pattern_type(enum grep_pattern_type, struct grep_opt *opt);
>
>  void append_grep_pat(struct grep_opt *opt, const char *pat, size_t patlen, const char *origin, int no, enum grep_pat_token t);
>
Carlo Marcelo Arenas Belón Aug. 7, 2019, 9:49 a.m. UTC | #2
On Tue, Aug 6, 2019 at 10:38 PM René Scharfe <l.s.r@web.de> wrote:
>
> Am 06.08.19 um 18:36 schrieb Carlo Marcelo Arenas Belón:
> > Move some of the logic that was before done per thread (in the workers)
> > into an earlier phase to avoid degrading performance
>
> Which logic is moved?  In the patch I basically only see additions.

the decision of if we want to create a global context or not, which is now being
done only once per pattern.  "moving" there was to imply the logic now
was no longer going to always use state in the struct_pat that then
will be evaluated at least once per thread, but could use state
globally and reuse some of the work (as I did for the generation of
the chartables in a later patch, that I had yet to send)

> >, but as the use
> > of PCRE2 with NED is better understood it is expected more of its
> > functions will be instructed to use the custom allocator as well as
> > was done in the original code[1] this work was based on.
> >
> > [1] https://public-inbox.org/git/3397e6797f872aedd18c6d795f4976e1c579514b.1565005867.git.gitgitgadget@gmail.com/
>
> I'm not sure I understand that part.  Do you mean there are gaps of
> knowledge about nedmalloc and/or PCRE2 and/or their interaction?  And
> someone is working on closing those gaps, and is going to submit more
> patches in the process?
>
> Your patch uses a custom global context only if USE_NED_ALLOCATOR is
> defined, while [1] does it unconditionally.  The latter is easier to
> debug and requires less preprocessor directives.  What's the upside
> of your approach?

was hoping will perform better but it seems that testing can be done
only in windows

> > +#ifdef USE_NED_ALLOCATOR
> > +static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
> > +{
> > +     return malloc(size);
>
> Should this be xmalloc() to get consistent out-of-memory handling?

good point, note though that since it is inside a USE_NED_ALLOCATOR
ifdef it is really
nedalloc in disguise
>
> > +}
> > +
> > +static void pcre2_free(void *pointer, MAYBE_UNUSED void *memory_data)
> > +{
> > +     return free(pointer);
> > +}
> > +#endif
> > +#endif
> > +
> >  static const char *color_grep_slots[] = {
> >       [GREP_COLOR_CONTEXT]        = "context",
> >       [GREP_COLOR_FILENAME]       = "filename",
> > @@ -153,6 +169,7 @@ int grep_config(const char *var, const char *value, void *cb)
> >   *
> >   * If using PCRE make sure that the library is configured
> >   * to use the right allocator (ex: NED)
> > + * if any object is created it should be cleaned up in grep_destroy()
> >   */
> >  void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix)
> >  {
> > @@ -188,6 +205,13 @@ void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix
> >               color_set(opt->colors[i], def->colors[i]);
> >  }
> >
> > +void grep_destroy(void)
> > +{
> > +#ifdef USE_LIBPCRE2
> > +     pcre2_general_context_free(pcre2_global_context);
> > +#endif
> > +}
> > +
> >  static void grep_set_pattern_type_option(enum grep_pattern_type pattern_type, struct grep_opt *opt)
> >  {
> >       /*
> > @@ -319,6 +343,11 @@ void append_header_grep_pattern(struct grep_opt *opt,
> >  void append_grep_pattern(struct grep_opt *opt, const char *pat,
> >                        const char *origin, int no, enum grep_pat_token t)
> >  {
> > +#if defined(USE_LIBPCRE2) && defined(USE_NED_ALLOCATOR)
> > +     if (!pcre2_global_context && opt->ignore_case && has_non_ascii(pat))
> > +             pcre2_global_context = pcre2_general_context_create(
> > +                                     pcre2_malloc, pcre2_free, NULL);
> > +#endif
> >       append_grep_pat(opt, pat, strlen(pat), origin, no, t);
> >  }
> >
> > @@ -507,9 +536,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
> >
> >       p->pcre2_compile_context = NULL;
> >
> > +     /* pcre2_global_context is initialized in append_grep_pattern */
> >       if (opt->ignore_case) {
> >               if (has_non_ascii(p->pattern)) {
> > -                     character_tables = pcre2_maketables(NULL);
> > +#ifdef USE_NED_ALLOCATOR
> > +                     if (!pcre2_global_context)
> > +                             BUG("pcre2_global_context uninitialized");
>
> [1] initializes on demand.  Why not do that?  To avoid race conditions

this was just for help migrating the code, could go away even now, but
though it was nice to keep for safety (as someone mentioned recently)

> that would lead to occasional double-allocation of the global context?

and a nicer API that allows for cleaning up any global objects (with
the chartables being an easy one to tackle next) and might
help in the future to make the worker threads less messy (ex: compile vs match)

> > +#endif
> > +                     character_tables = pcre2_maketables(pcre2_global_context);
> >                       p->pcre2_compile_context = pcre2_compile_context_create(NULL);
>
> Don't you want to pass pcre2_global_context here as well?  And [1] even
> uses it in some more places.
>
> Oh, that's the "expected more" when "better understood" part from the
> commit message, right?

correct, was trying to be conservative and minimal (since this code
will conflict as well
with other in the fly changes), but considering that the last Azure
build from pu still failed
and I have no windows box to debug, might need to do it anyway.

> Basically I'd expect the custom global context to be used for all PCRE2
> allocations; I can't think of a reason for mixing allocators (e.g.
> system malloc for PCRE2 regexes and nedmalloc for everything else).

ok

Carlo
René Scharfe Aug. 7, 2019, 1:02 p.m. UTC | #3
Am 07.08.19 um 11:49 schrieb Carlo Arenas:
> was hoping will perform better but it seems that testing can be done
> only in windows

nedmalloc works on other platforms as well.  On Debian Testing with GCC
9.1.0 I need two changes to suppress some compiler warnings, though.
Will post them as replies.

"make USE_NED_ALLOCATOR=1 test" then reports these failures:

t7816-grep-binary-pattern.sh                     (Wstat: 256 Tests: 145 Failed: 5)
  Failed tests:  48, 54, 57, 60, 63
  Non-zero exit status: 1

And the first one when running that test with --verbose and --immediate
is showing:

BUG: grep.c:510: pcre2_global_context uninitialized
Aborted
not ok 48 - LC_ALL='C' git grep -P -f f -i '[æ]<NUL>ð' a
#
#				printf '[æ]Qð' | q_to_nul >f &&
#				LC_ALL='C' git grep -P -f f -i a
#
Junio C Hamano Aug. 7, 2019, 6:15 p.m. UTC | #4
Carlo Arenas <carenas@gmail.com> writes:

>> > +#ifdef USE_NED_ALLOCATOR
>> > +static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
>> > +{
>> > +     return malloc(size);
>>
>> Should this be xmalloc() to get consistent out-of-memory handling?
>
> good point, note though that since it is inside a USE_NED_ALLOCATOR
> ifdef it is really
> nedalloc in disguise

It would be nedalloc() wrapped inside the "die if we cannot
allocate, possibly after releasing resources held in various caches"
xmalloc() wrapper.  So (1) either xmalloc or malloc would end up
eventually calling nedalloc that can be freed with nedfree, so from
that point of view either can be used without upsetting "free()",
and (2) we should use xmalloc() here because we care about
consistent OOM behaviour throughout the system.

Thanks.
Carlo Marcelo Arenas Belón Aug. 8, 2019, 2:35 a.m. UTC | #5
On Wed, Aug 7, 2019 at 6:03 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 07.08.19 um 11:49 schrieb Carlo Arenas:
> > was hoping will perform better but it seems that testing can be done
> > only in windows
>
> nedmalloc works on other platforms as well.

I meant[1] it works reliably enough to be useful for performance testing.

goes without saying that the fact that I am using a virtualbox with 2
CPUs running Debian 10 on top of macOS (a macbook pro with 4 cores)
and the test uses by default 8 threads, doesn't help, but to share my
pain here is the result of running p7820 with my last reroll on top of
pu, comparing a build of the same code without NED (this tree) with
one with it (HEAD)

Test                                               this tree
HEAD
-------------------------------------------------------------------------------------------
7820.1: basic grep -i 'how.to'                     0.89(1.12+0.46)
0.95(1.23+0.49) +6.7%
7820.2: extended grep -i 'how.to'                  0.90(1.12+0.49)
0.92(1.19+0.46) +2.2%
7820.3: perl grep -i 'how.to'                      0.54(0.30+0.52)
0.53(0.39+0.52) -1.9%
7820.5: basic grep -i '^how to'                    0.89(1.13+0.47)
0.91(1.13+0.49) +2.2%
7820.6: extended grep -i '^how to'                 0.84(1.04+0.49)
0.94(1.27+0.47) +11.9%
7820.7: perl grep -i '^how to'                     0.49(0.34+0.47)
0.51(0.36+0.49) +4.1%
7820.9: basic grep -i '[how] to'                   1.51(2.31+0.51)
1.55(2.38+0.51) +2.6%
7820.10: extended grep -i '[how] to'               1.50(2.20+0.59)
1.56(2.30+0.62) +4.0%
7820.11: perl grep -i '[how] to'                   0.67(0.50+0.52)
0.62(0.50+0.55) -7.5%
7820.13: basic grep -i '\(e.t[^ ]*\|v.ry\) rare'   2.58(4.39+0.56)
2.64(4.45+0.60) +2.3%
7820.14: extended grep -i '(e.t[^ ]*|v.ry) rare'   2.60(4.41+0.56)
2.66(4.58+0.56) +2.3%
7820.15: perl grep -i '(e.t[^ ]*|v.ry) rare'       1.17(1.66+0.53)
1.23(1.84+0.45) +5.1%
7820.17: basic grep -i 'm\(ú\|u\)lt.b\(æ\|y\)te'   1.12(1.54+0.51)
1.14(1.70+0.44) +1.8%
7820.18: extended grep -i 'm(ú|u)lt.b(æ|y)te'      1.09(1.54+0.48)
1.14(1.62+0.49) +4.6%
7820.19: perl grep -i 'm(ú|u)lt.b(æ|y)te'          0.87(1.09+0.46)
0.90(1.20+0.43) +3.4%

and here one comparing two builds (both with NED)

Test                                               origin/pu
HEAD
-------------------------------------------------------------------------------------------
7820.1: basic grep -i 'how.to'                     1.00(1.24+0.55)
0.94(1.19+0.52) -6.0%
7820.2: extended grep -i 'how.to'                  0.90(1.15+0.49)
0.93(1.23+0.44) +3.3%
7820.3: perl grep -i 'how.to'                      0.52(0.37+0.51)
0.59(0.34+0.53) +13.5%
7820.5: basic grep -i '^how to'                    0.89(1.16+0.48)
0.90(1.17+0.47) +1.1%
7820.6: extended grep -i '^how to'                 0.92(1.17+0.50)
0.92(1.20+0.45) +0.0%
7820.7: perl grep -i '^how to'                     0.45(0.33+0.42)
0.54(0.29+0.57) +20.0%
7820.9: basic grep -i '[how] to'                   1.60(2.46+0.52)
1.61(2.39+0.62) +0.6%
7820.10: extended grep -i '[how] to'               1.71(2.67+0.56)
1.57(2.41+0.54) -8.2%
7820.11: perl grep -i '[how] to'                   0.66(0.61+0.51)
0.59(0.44+0.51) -10.6%
7820.13: basic grep -i '\(e.t[^ ]*\|v.ry\) rare'   2.69(4.49+0.66)
2.67(4.49+0.60) -0.7%
7820.14: extended grep -i '(e.t[^ ]*|v.ry) rare'   2.67(4.49+0.64)
2.64(4.54+0.54) -1.1%
7820.15: perl grep -i '(e.t[^ ]*|v.ry) rare'       1.23(1.80+0.47)
1.25(1.89+0.46) +1.6%
7820.17: basic grep -i 'm\(ú\|u\)lt.b\(æ\|y\)te'   1.13(1.64+0.47)
1.14(1.64+0.48) +0.9%
7820.18: extended grep -i 'm(ú|u)lt.b(æ|y)te'      1.16(1.68+0.46)
1.20(1.60+0.60) +3.4%
7820.19: perl grep -i 'm(ú|u)lt.b(æ|y)te'          0.90(1.16+0.48)
0.88(1.17+0.45) -2.2%

with the only relevant line (for my code) being 7820.19 where it would
seem it performs almost the same (eventhough just adding NED made it
initially worst)

note though that the fact there are 20% swings in parts of the code
that hasn't changed
or that where explicitly #ifdef out of my code changes doesn't give me
much confidence, but since the windows guys seem to be using NED by
default, I am hoping it works better there.

note also there were no segfaults (which is what was reported
originally) so something
else must be off.

> On Debian Testing with GCC
> 9.1.0 I need two changes to suppress some compiler warnings, though.
> Will post them as replies.
>
> "make USE_NED_ALLOCATOR=1 test" then reports these failures:
>
> t7816-grep-binary-pattern.sh                     (Wstat: 256 Tests: 145 Failed: 5)
>   Failed tests:  48, 54, 57, 60, 63
>   Non-zero exit status: 1

you got me so excited I dusted and old PC and was downloading the debian ISO
when I realized the error was not a segfault[2] but my bug.

sorry about it; I would swear I run the full test and it was clean,
but it was most likely
with PCRE1 or the system malloc, and definitely too late at night.

Thanks for your help so far, and while I know it is VERY ugly v4 at
least addresses that
(and uncovered a couple more bugs), thanks also Junio for rerolling it
into pu so at least
we have a chance for it to build and run, and hopefully eventually pass.

Carlo

[1] https://public-inbox.org/git/20190806085014.47776-3-carenas@gmail.com/
[2] https://dev.azure.com/git/git/_build/results?buildId=832
René Scharfe Aug. 8, 2019, 7:07 a.m. UTC | #6
Am 08.08.19 um 04:35 schrieb Carlo Arenas:
> On Wed, Aug 7, 2019 at 6:03 AM René Scharfe <l.s.r@web.de> wrote:
>>
>> Am 07.08.19 um 11:49 schrieb Carlo Arenas:
>>> was hoping will perform better but it seems that testing can be done
>>> only in windows
>>
>> nedmalloc works on other platforms as well.
>
> I meant[1] it works reliably enough to be useful for performance testing.

You mentioned being concerned about performance several times and I
wondered why each time.  I'd expect no measurable difference between
using a custom global context and the internal one of PCRE2 -- setting
two function pointers surely can't take very long, can it?  But
measuring is better than guessing, of course.

> goes without saying that the fact that I am using a virtualbox with 2
> CPUs running Debian 10 on top of macOS (a macbook pro with 4 cores)
> and the test uses by default 8 threads, doesn't help,

nedmalloc is supposed to run on macOS as well.

> but to share my
> pain here is the result of running p7820 with my last reroll on top of
> pu, comparing a build of the same code without NED (this tree) with
> one with it (HEAD)
>
> Test                                               this tree
> HEAD
> -------------------------------------------------------------------------------------------
> 7820.1: basic grep -i 'how.to'                     0.89(1.12+0.46)
> 0.95(1.23+0.49) +6.7%
> 7820.2: extended grep -i 'how.to'                  0.90(1.12+0.49)
> 0.92(1.19+0.46) +2.2%
> 7820.3: perl grep -i 'how.to'                      0.54(0.30+0.52)
> 0.53(0.39+0.52) -1.9%
> 7820.5: basic grep -i '^how to'                    0.89(1.13+0.47)
> 0.91(1.13+0.49) +2.2%
> 7820.6: extended grep -i '^how to'                 0.84(1.04+0.49)
> 0.94(1.27+0.47) +11.9%
> 7820.7: perl grep -i '^how to'                     0.49(0.34+0.47)
> 0.51(0.36+0.49) +4.1%
> 7820.9: basic grep -i '[how] to'                   1.51(2.31+0.51)
> 1.55(2.38+0.51) +2.6%
> 7820.10: extended grep -i '[how] to'               1.50(2.20+0.59)
> 1.56(2.30+0.62) +4.0%
> 7820.11: perl grep -i '[how] to'                   0.67(0.50+0.52)
> 0.62(0.50+0.55) -7.5%
> 7820.13: basic grep -i '\(e.t[^ ]*\|v.ry\) rare'   2.58(4.39+0.56)
> 2.64(4.45+0.60) +2.3%
> 7820.14: extended grep -i '(e.t[^ ]*|v.ry) rare'   2.60(4.41+0.56)
> 2.66(4.58+0.56) +2.3%
> 7820.15: perl grep -i '(e.t[^ ]*|v.ry) rare'       1.17(1.66+0.53)
> 1.23(1.84+0.45) +5.1%
> 7820.17: basic grep -i 'm\(ú\|u\)lt.b\(æ\|y\)te'   1.12(1.54+0.51)
> 1.14(1.70+0.44) +1.8%
> 7820.18: extended grep -i 'm(ú|u)lt.b(æ|y)te'      1.09(1.54+0.48)
> 1.14(1.62+0.49) +4.6%
> 7820.19: perl grep -i 'm(ú|u)lt.b(æ|y)te'          0.87(1.09+0.46)
> 0.90(1.20+0.43) +3.4%
>
> and here one comparing two builds (both with NED)
>
> Test                                               origin/pu
> HEAD
> -------------------------------------------------------------------------------------------
> 7820.1: basic grep -i 'how.to'                     1.00(1.24+0.55)
> 0.94(1.19+0.52) -6.0%
> 7820.2: extended grep -i 'how.to'                  0.90(1.15+0.49)
> 0.93(1.23+0.44) +3.3%
> 7820.3: perl grep -i 'how.to'                      0.52(0.37+0.51)
> 0.59(0.34+0.53) +13.5%
> 7820.5: basic grep -i '^how to'                    0.89(1.16+0.48)
> 0.90(1.17+0.47) +1.1%
> 7820.6: extended grep -i '^how to'                 0.92(1.17+0.50)
> 0.92(1.20+0.45) +0.0%
> 7820.7: perl grep -i '^how to'                     0.45(0.33+0.42)
> 0.54(0.29+0.57) +20.0%
> 7820.9: basic grep -i '[how] to'                   1.60(2.46+0.52)
> 1.61(2.39+0.62) +0.6%
> 7820.10: extended grep -i '[how] to'               1.71(2.67+0.56)
> 1.57(2.41+0.54) -8.2%
> 7820.11: perl grep -i '[how] to'                   0.66(0.61+0.51)
> 0.59(0.44+0.51) -10.6%
> 7820.13: basic grep -i '\(e.t[^ ]*\|v.ry\) rare'   2.69(4.49+0.66)
> 2.67(4.49+0.60) -0.7%
> 7820.14: extended grep -i '(e.t[^ ]*|v.ry) rare'   2.67(4.49+0.64)
> 2.64(4.54+0.54) -1.1%
> 7820.15: perl grep -i '(e.t[^ ]*|v.ry) rare'       1.23(1.80+0.47)
> 1.25(1.89+0.46) +1.6%
> 7820.17: basic grep -i 'm\(ú\|u\)lt.b\(æ\|y\)te'   1.13(1.64+0.47)
> 1.14(1.64+0.48) +0.9%
> 7820.18: extended grep -i 'm(ú|u)lt.b(æ|y)te'      1.16(1.68+0.46)
> 1.20(1.60+0.60) +3.4%
> 7820.19: perl grep -i 'm(ú|u)lt.b(æ|y)te'          0.90(1.16+0.48)
> 0.88(1.17+0.45) -2.2%
>
> with the only relevant line (for my code) being 7820.19 where it would
> seem it performs almost the same (eventhough just adding NED made it
> initially worst)
>
> note though that the fact there are 20% swings in parts of the code
> that hasn't changed
> or that where explicitly #ifdef out of my code changes doesn't give me
> much confidence, but since the windows guys seem to be using NED by
> default, I am hoping it works better there.

These measurement results are quite noisy, so I wouldn't trust them too
much.  nedmalloc being slower than the one from a recent glibc version
is not very surprising given this statement from its home page,
https://www.nedprod.com/programs/portable/nedmalloc/:

   "Windows 7, Linux 3.x, FreeBSD 8, Mac OS X 10.6 all contain
    state-of-the-art allocators and no third party allocator is
    likely to significantly improve on them in real world results"

In particular I don't think that these results justify coupling the use
of nedmalloc to the choice of using a custom global context for PCRE2.

I'd expect:
- Without USE_NED_ALLOCATOR: xmalloc() should be used for all
  allocations, including for PCRE2.  Some special exceptions use
  malloc(3) directly, but for most uses we want the consistent
  out-of-memory handling that xmalloc() brings.
- With USE_NED_ALLOCATOR: malloc() and xmalloc() use nedmalloc
  behind the scenes and free() is similarly overridden, so all
  allocations are affected.
- If USE_NED_ALLOCATOR performs worse than the system allocator on
  some system then it's the problem of those that turn on that flag.

Makes sense?

René
Carlo Marcelo Arenas Belón Aug. 8, 2019, 12:38 p.m. UTC | #7
On Thu, Aug 8, 2019 at 12:07 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 08.08.19 um 04:35 schrieb Carlo Arenas:
> > On Wed, Aug 7, 2019 at 6:03 AM René Scharfe <l.s.r@web.de> wrote:
> >>
> >> Am 07.08.19 um 11:49 schrieb Carlo Arenas:
> >>> was hoping will perform better but it seems that testing can be done
> >>> only in windows
> >>
> >> nedmalloc works on other platforms as well.
> >
> > I meant[1] it works reliably enough to be useful for performance testing.
>
> You mentioned being concerned about performance several times and I
> wondered why each time.  I'd expect no measurable difference between
> using a custom global context and the internal one of PCRE2 -- setting
> two function pointers surely can't take very long, can it?  But
> measuring is better than guessing, of course.

setting the allocator is not a concern, but using it; it requires an
extra indirect function call which is usually not very friendly to
caches in our speculative execution CPU world.  our implementation
also adds the wrapper call overhead, but in this case it is just the
"cost of doing business" with PCRE2.

compilers had gotten a lot better since (mainly because of C++ and the
need for it with virtual methods) but I would rather measure.

> > goes without saying that the fact that I am using a virtualbox with 2
> > CPUs running Debian 10 on top of macOS (a macbook pro with 4 cores)
> > and the test uses by default 8 threads, doesn't help,
>
> nedmalloc is supposed to run on macOS as well.

the last version has some "fix miscompilations in macOS" fixes that
might be relevant, and the version we have in tree says it works in
the 32-bit version which latest macOS versions are working hard to
deprecate (can't even build for it anymore), eitherway trying to run
with a nedmalloc enabled git in macOS is not fun.

> > with the only relevant line (for my code) being 7820.19 where it would
> > seem it performs almost the same (eventhough just adding NED made it
> > initially worst)
> >
> > note though that the fact there are 20% swings in parts of the code
> > that hasn't changed
> > or that where explicitly #ifdef out of my code changes doesn't give me
> > much confidence, but since the windows guys seem to be using NED by
> > default, I am hoping it works better there.
>
> These measurement results are quite noisy, so I wouldn't trust them too
> much.  nedmalloc being slower than the one from a recent glibc version
> is not very surprising given this statement from its home page,
> https://www.nedprod.com/programs/portable/nedmalloc/:
>
>    "Windows 7, Linux 3.x, FreeBSD 8, Mac OS X 10.6 all contain
>     state-of-the-art allocators and no third party allocator is
>     likely to significantly improve on them in real world results"
>
> In particular I don't think that these results justify coupling the use
> of nedmalloc to the choice of using a custom global context for PCRE2.

neither did I either, the only reason I am holding on fully enabling
NED with PCRE2 in my series is just because I wan't to make sure we
have identified the bug correctly and we are fixing it (specially
since I can't reproduce it, and therefore neither debug it)

sorry for not making that clear enough, and as I said before, if we
keep seeing segfaults even after v4 then we will have to do that or I
might need to do a quick run to the nearest microsoft store hoping for
a distracted rep, instead.

> I'd expect:
> - Without USE_NED_ALLOCATOR: xmalloc() should be used for all
>   allocations, including for PCRE2.  Some special exceptions use
>   malloc(3) directly, but for most uses we want the consistent
>   out-of-memory handling that xmalloc() brings.

that is already in v4 and would expect to carry it forward.  this is
also what I had in mind when I said we will need some fixes on top of
Dscho version if we give up with these.

> - With USE_NED_ALLOCATOR: malloc() and xmalloc() use nedmalloc
>   behind the scenes and free() is similarly overridden, so all
>   allocations are affected.
> - If USE_NED_ALLOCATOR performs worse than the system allocator on
>   some system then it's the problem of those that turn on that flag.
>
> Makes sense?

completely, but note also that Dscho version would make the
performance impacts of using a custom allocator (if any) affect
everyone using PCRE2.

Carlo
René Scharfe Aug. 8, 2019, 2:29 p.m. UTC | #8
Am 08.08.19 um 14:38 schrieb Carlo Arenas:
> On Thu, Aug 8, 2019 at 12:07 AM René Scharfe <l.s.r@web.de> wrote:
>>
>> Am 08.08.19 um 04:35 schrieb Carlo Arenas:
>>> On Wed, Aug 7, 2019 at 6:03 AM René Scharfe <l.s.r@web.de> wrote:
>>>>
>>>> Am 07.08.19 um 11:49 schrieb Carlo Arenas:
>>>>> was hoping will perform better but it seems that testing can be done
>>>>> only in windows
>>>>
>>>> nedmalloc works on other platforms as well.
>>>
>>> I meant[1] it works reliably enough to be useful for performance testing.
>>
>> You mentioned being concerned about performance several times and I
>> wondered why each time.  I'd expect no measurable difference between
>> using a custom global context and the internal one of PCRE2 -- setting
>> two function pointers surely can't take very long, can it?  But
>> measuring is better than guessing, of course.
>
> setting the allocator is not a concern, but using it; it requires an
> extra indirect function call which is usually not very friendly to
> caches in our speculative execution CPU world.  our implementation
> also adds the wrapper call overhead, but in this case it is just the
> "cost of doing business" with PCRE2.

PCRE2 needs to allocate memory once per program run for the character
table and for each pattern compilation.  These are both rare events
compared to matching patterns against lines, and I suspect that
compilation in particular  has much more other work to do, even more so
with JIT enabled; I'd expect function call indirection to not make much
of a difference.

pcre2_compile() always calls the allocation function through a
function pointer in a context struct, by the way (see line 9695 in
https://vcs.pcre.org/pcre2/code/trunk/src/pcre2_compile.c?view=markup
or search for "malloc" in that file).

>>> goes without saying that the fact that I am using a virtualbox with 2
>>> CPUs running Debian 10 on top of macOS (a macbook pro with 4 cores)
>>> and the test uses by default 8 threads, doesn't help,
>>
>> nedmalloc is supposed to run on macOS as well.
>
> the last version has some "fix miscompilations in macOS" fixes that
> might be relevant, and the version we have in tree says it works in
> the 32-bit version which latest macOS versions are working hard to
> deprecate (can't even build for it anymore), eitherway trying to run
> with a nedmalloc enabled git in macOS is not fun.

Importing the latest version of nedmalloc might make sense in general.
The last commit in git://github.com/ned14/nedmalloc.git was done five
years ago; is it finished?  A diffstat with -b looks like this:

 malloc.c.h  | 1193 ++++++++++++++++++++++++++++-------------
 nedmalloc.c | 1720 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 nedmalloc.h | 1580 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 3840 insertions(+), 653 deletions(-)

Any nedmalloc fans interested in bringing the goodies hidden in there
to Git (presumably while retaining our local fixes)?

>> In particular I don't think that these results justify coupling the use
>> of nedmalloc to the choice of using a custom global context for PCRE2.
>
> neither did I either, the only reason I am holding on fully enabling
> NED with PCRE2 in my series is just because I wan't to make sure we
> have identified the bug correctly and we are fixing it (specially
> since I can't reproduce it, and therefore neither debug it)

That's a good reason against #ifdefs in general.  Sometimes they are
unavoidable, but they can make maintenance a lot harder.

> sorry for not making that clear enough, and as I said before, if we
> keep seeing segfaults even after v4 then we will have to do that or I
> might need to do a quick run to the nearest microsoft store hoping for
> a distracted rep, instead.

Asking to buy a license for Windows Vista might cause quite a bit of a
distraction in there -- Microsoft's support for that version ended two
years ago. :)  It still seems to be popular enough to be supported by
Git for Windows, however.

(You could buy Windows 10 and probably get a downgrade right, but
finding legit install media for Vista might be challenging.)

But I'd say do the easy thing: Custom global context for all.

>> I'd expect:
>> - Without USE_NED_ALLOCATOR: xmalloc() should be used for all
>>   allocations, including for PCRE2.  Some special exceptions use
>>   malloc(3) directly, but for most uses we want the consistent
>>   out-of-memory handling that xmalloc() brings.
>
> that is already in v4 and would expect to carry it forward.  this is
> also what I had in mind when I said we will need some fixes on top of
> Dscho version if we give up with these.
>
>> - With USE_NED_ALLOCATOR: malloc() and xmalloc() use nedmalloc
>>   behind the scenes and free() is similarly overridden, so all
>>   allocations are affected.
>> - If USE_NED_ALLOCATOR performs worse than the system allocator on
>>   some system then it's the problem of those that turn on that flag.
>>
>> Makes sense?
>
> completely, but note also that Dscho version would make the
> performance impacts of using a custom allocator (if any) affect
> everyone using PCRE2.

Sounds fine to me, if the performance numbers don't take too much of
a hit.  I'd be surprised if the needle moved at all (ignoring noise).

René
Johannes Schindelin Aug. 8, 2019, 8:18 p.m. UTC | #9
Hi René,

On Thu, 8 Aug 2019, René Scharfe wrote:

> Importing the latest version of nedmalloc might make sense in general.
> The last commit in git://github.com/ned14/nedmalloc.git was done five
> years ago; is it finished?  A diffstat with -b looks like this:
>
>  malloc.c.h  | 1193 ++++++++++++++++++++++++++++-------------
>  nedmalloc.c | 1720 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  nedmalloc.h | 1580 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 3840 insertions(+), 653 deletions(-)
>
> Any nedmalloc fans interested in bringing the goodies hidden in there
> to Git (presumably while retaining our local fixes)?

I had looked into this already over two years ago, and had to stop after
investigating a performance regression for two weeks and not getting
anywhere.

Also, nedmalloc fell unmaintained, so I don't necessarily think that it
would be a good idea to  spend a lot of time on it.

In the meantime, there is a much more viable contender: mi-malloc.
Preliminary tests suggest that its performance on Windows is at least as
good as nedmalloc's, and Windows was the use case for which we
integrated nedmalloc into Git's compat/ in the first place.

I have tentative patches to integrate it into Git for Windows, and
basically got side-tracked with other things. Expect to see something
regarding mi-malloc from me in September.

Ciao,
Dscho

Patch
diff mbox series

diff --git a/builtin/grep.c b/builtin/grep.c
index 560051784e..e49c20df60 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -1145,5 +1145,6 @@  int cmd_grep(int argc, const char **argv, const char *prefix)
 		run_pager(&opt, prefix);
 	clear_pathspec(&pathspec);
 	free_grep_patterns(&opt);
+	grep_destroy();
 	return !hit;
 }
diff --git a/grep.c b/grep.c
index 0154998695..3e5bdf94a6 100644
--- a/grep.c
+++ b/grep.c
@@ -16,6 +16,22 @@  static int grep_source_is_binary(struct grep_source *gs,
 
 static struct grep_opt grep_defaults;
 
+#ifdef USE_LIBPCRE2
+static pcre2_general_context *pcre2_global_context;
+
+#ifdef USE_NED_ALLOCATOR
+static void *pcre2_malloc(PCRE2_SIZE size, MAYBE_UNUSED void *memory_data)
+{
+	return malloc(size);
+}
+
+static void pcre2_free(void *pointer, MAYBE_UNUSED void *memory_data)
+{
+	return free(pointer);
+}
+#endif
+#endif
+
 static const char *color_grep_slots[] = {
 	[GREP_COLOR_CONTEXT]	    = "context",
 	[GREP_COLOR_FILENAME]	    = "filename",
@@ -153,6 +169,7 @@  int grep_config(const char *var, const char *value, void *cb)
  *
  * If using PCRE make sure that the library is configured
  * to use the right allocator (ex: NED)
+ * if any object is created it should be cleaned up in grep_destroy()
  */
 void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix)
 {
@@ -188,6 +205,13 @@  void grep_init(struct grep_opt *opt, struct repository *repo, const char *prefix
 		color_set(opt->colors[i], def->colors[i]);
 }
 
+void grep_destroy(void)
+{
+#ifdef USE_LIBPCRE2
+	pcre2_general_context_free(pcre2_global_context);
+#endif
+}
+
 static void grep_set_pattern_type_option(enum grep_pattern_type pattern_type, struct grep_opt *opt)
 {
 	/*
@@ -319,6 +343,11 @@  void append_header_grep_pattern(struct grep_opt *opt,
 void append_grep_pattern(struct grep_opt *opt, const char *pat,
 			 const char *origin, int no, enum grep_pat_token t)
 {
+#if defined(USE_LIBPCRE2) && defined(USE_NED_ALLOCATOR)
+	if (!pcre2_global_context && opt->ignore_case && has_non_ascii(pat))
+		pcre2_global_context = pcre2_general_context_create(
+					pcre2_malloc, pcre2_free, NULL);
+#endif
 	append_grep_pat(opt, pat, strlen(pat), origin, no, t);
 }
 
@@ -507,9 +536,14 @@  static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 
 	p->pcre2_compile_context = NULL;
 
+	/* pcre2_global_context is initialized in append_grep_pattern */
 	if (opt->ignore_case) {
 		if (has_non_ascii(p->pattern)) {
-			character_tables = pcre2_maketables(NULL);
+#ifdef USE_NED_ALLOCATOR
+			if (!pcre2_global_context)
+				BUG("pcre2_global_context uninitialized");
+#endif
+			character_tables = pcre2_maketables(pcre2_global_context);
 			p->pcre2_compile_context = pcre2_compile_context_create(NULL);
 			pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
 		}
diff --git a/grep.h b/grep.h
index 1875880f37..526c2db9ef 100644
--- a/grep.h
+++ b/grep.h
@@ -189,6 +189,7 @@  struct grep_opt {
 void init_grep_defaults(struct repository *);
 int grep_config(const char *var, const char *value, void *);
 void grep_init(struct grep_opt *, struct repository *repo, const char *prefix);
+void grep_destroy(void);
 void grep_commit_pattern_type(enum grep_pattern_type, struct grep_opt *opt);
 
 void append_grep_pat(struct grep_opt *opt, const char *pat, size_t patlen, const char *origin, int no, enum grep_pat_token t);