diff mbox series

[5/8] commit-graph: check all leading directories in changed path Bloom filters

Message ID 9975fc96f1269b049ebdd54835a63480c3dcbe4e.1592252093.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series More commit-graph/Bloom filter improvements | expand

Commit Message

John Passaro via GitGitGadget June 15, 2020, 8:14 p.m. UTC
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>

The file 'dir/subdir/file' can only be modified if its leading
directories 'dir' and 'dir/subdir' are modified as well.

So when checking modified path Bloom filters looking for commits
modifying a path with multiple path components, then check not only
the full path in the Bloom filters, but all its leading directories as
well.  Take care to check these paths in "deepest first" order,
because it's the full path that is least likely to be modified, and
the Bloom filter queries can short circuit sooner.

This can significantly reduce the average false positive rate, by
about an order of magnitude or three(!), and can further speed up
pathspec-limited revision walks.  The table below compares the average
false positive rate and runtime of

  git rev-list HEAD -- "$path"

before and after this change for 5000+ randomly* selected paths from
each repository:

                    Average false           Average        Average
                    positive rate           runtime        runtime
                  before     after     before     after   difference
  ------------------------------------------------------------------
  git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
  linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
  tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%

*Path selection was done with the following pipeline:

	git ls-tree -r --name-only HEAD | sort -R | head -n 5000

The improvements in runtime are much smaller than the improvements in
average false positive rate, as we are clearly reaching diminishing
returns here.  However, all these timings depend on that accessing
tree objects is reasonably fast (warm caches).  If we had a partial
clone and the tree objects had to be fetched from a promisor remote,
e.g.:

  $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
  $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
        commit-graph write --reachable
  $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
  $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
        rev-list HEAD -- "$path"

then checking all leading path component can reduce the runtime from
over an hour to a few seconds (and this is with the clone and the
promisor on the same machine).

This adjusts the tracing values in t4216-log-bloom.sh, which provides a
concrete way to notice the improvement.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 revision.c           | 35 ++++++++++++++++++++++++++---------
 revision.h           |  6 ++++--
 t/t4216-log-bloom.sh |  2 +-
 3 files changed, 31 insertions(+), 12 deletions(-)

Comments

René Scharfe June 18, 2020, 8:31 p.m. UTC | #1
Am 15.06.20 um 22:14 schrieb SZEDER Gábor via GitGitGadget:
> From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
>
> The file 'dir/subdir/file' can only be modified if its leading
> directories 'dir' and 'dir/subdir' are modified as well.
>
> So when checking modified path Bloom filters looking for commits
> modifying a path with multiple path components, then check not only
> the full path in the Bloom filters, but all its leading directories as
> well.  Take care to check these paths in "deepest first" order,
> because it's the full path that is least likely to be modified, and
> the Bloom filter queries can short circuit sooner.
>
> This can significantly reduce the average false positive rate, by
> about an order of magnitude or three(!), and can further speed up
> pathspec-limited revision walks.  The table below compares the average
> false positive rate and runtime of
>
>   git rev-list HEAD -- "$path"
>
> before and after this change for 5000+ randomly* selected paths from
> each repository:
>
>                     Average false           Average        Average
>                     positive rate           runtime        runtime
>                   before     after     before     after   difference
>   ------------------------------------------------------------------
>   git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
>   linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
>   tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%

Nice!

>
> *Path selection was done with the following pipeline:
>
> 	git ls-tree -r --name-only HEAD | sort -R | head -n 5000
>
> The improvements in runtime are much smaller than the improvements in
> average false positive rate, as we are clearly reaching diminishing
> returns here.  However, all these timings depend on that accessing
> tree objects is reasonably fast (warm caches).  If we had a partial
> clone and the tree objects had to be fetched from a promisor remote,
> e.g.:
>
>   $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
>   $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
>         commit-graph write --reachable
>   $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
>   $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
>         rev-list HEAD -- "$path"
>
> then checking all leading path component can reduce the runtime from
> over an hour to a few seconds (and this is with the clone and the
> promisor on the same machine).
>
> This adjusts the tracing values in t4216-log-bloom.sh, which provides a
> concrete way to notice the improvement.
>
> Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  revision.c           | 35 ++++++++++++++++++++++++++---------
>  revision.h           |  6 ++++--
>  t/t4216-log-bloom.sh |  2 +-
>  3 files changed, 31 insertions(+), 12 deletions(-)
>
> diff --git a/revision.c b/revision.c
> index c644c660917..027ae3982b4 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>  {
>  	struct pathspec_item *pi;
>  	char *path_alloc = NULL;
> -	const char *path;
> +	const char *path, *p;
>  	int last_index;
> -	int len;
> +	size_t len;
> +	int path_component_nr = 0, j;
>
>  	if (!revs->commits)
>  		return;
> @@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>
>  	len = strlen(path);
>
> -	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
> -	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
> +	p = path;
> +	do {
> +		p = strchrnul(p + 1, '/');
> +		path_component_nr++;
> +	} while (p - path < len);

Hmm, that "+ 1" makes me a bit nervous.  Can we be sure that path is not
an empty string?

And shouldn't we use is_dir_sep() or find_last_dir_sep() instead of
hard-coding a slash?

> +
> +	revs->bloom_keys_nr = path_component_nr;
> +	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
> +
> +	p = path;
> +	for (j = 0; j < revs->bloom_keys_nr; j++) {
> +		p = strchrnul(p + 1, '/');

Same here, of course.

Also note that this puts shorter sub-strings first.


> +
> +		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
> +			       revs->bloom_filter_settings);
> +	}
>
>  	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
>  		atexit(trace2_bloom_filter_statistics_atexit);
> @@ -720,7 +735,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
>  						 struct commit *commit)
>  {
>  	struct bloom_filter *filter;
> -	int result;
> +	int result = 1, j;
>
>  	if (!revs->repo->objects->commit_graph)
>  		return -1;
> @@ -740,9 +755,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
>  		return -1;
>  	}
>
> -	result = bloom_filter_contains(filter,
> -				       revs->bloom_key,
> -				       revs->bloom_filter_settings);
> +	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
> +		result = bloom_filter_contains(filter,
> +					       &revs->bloom_keys[j],
> +					       revs->bloom_filter_settings);
> +	}

This checks shorter sub-strings first, contradicting the "deepest first"
strategy mentioned in the commit message.

This can easily be fixed by inverting the traversal of one of the loops,
of course.  Or perhaps do something like this?

	revs->bloom_keys = NULL;
	revs->bloom_keys_nr = 0;
	strbuf_add(&path, pi->match, pi->len);
	strbuf_trim_trailing_dir_sep(&path);
	for (;;) {
		const char *sep;
		ALLOC_GROW(revs->bloom_keys, revs->bloom_keys_nr + 1, alloc);
		fill_bloom_key(path.buf, path.len,
			       &revs->bloom_keys[revs->bloom_keys_nr++],
			       revs->bloom_filter_settings);
		sep = find_last_dir_sep(path.buf);
		if (!sep)
			break;
		strbuf_setlen(&path, sep - path.buf);
	}
	strbuf_release(&path);

The find_last_dir_sep() calls scan the first part of the string over and
over, which is a bit silly.  A strbuf_trim_trailing_path_component()
could start at the end and scan backwards if that turns out to be an
actual problem.

ALLOC_GROW wastes memory on revs->bloom_keys, and reallocating instead
of allocating the right size from the start has a cost as well, but I'd
expect this to be dwarfed by the actual revision walk.

>
>  	if (result)
>  		count_bloom_filter_maybe++;
> @@ -782,7 +799,7 @@ static int rev_compare_tree(struct rev_info *revs,
>  			return REV_TREE_SAME;
>  	}
>
> -	if (revs->bloom_key && !nth_parent) {
> +	if (revs->bloom_keys_nr && !nth_parent) {
>  		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
>
>  		if (bloom_ret == 0)
> diff --git a/revision.h b/revision.h
> index 7c026fe41fc..abbfb4ab59a 100644
> --- a/revision.h
> +++ b/revision.h
> @@ -295,8 +295,10 @@ struct rev_info {
>  	struct topo_walk_info *topo_walk_info;
>
>  	/* Commit graph bloom filter fields */
> -	/* The bloom filter key for the pathspec */
> -	struct bloom_key *bloom_key;
> +	/* The bloom filter key(s) for the pathspec */
> +	struct bloom_key *bloom_keys;
> +	int bloom_keys_nr;
> +
>  	/*
>  	 * The bloom filter settings used to generate the key.
>  	 * This is loaded from the commit-graph being used.
> diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
> index c7011f33e2c..c13b97d3bda 100755
> --- a/t/t4216-log-bloom.sh
> +++ b/t/t4216-log-bloom.sh
> @@ -142,7 +142,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
>
>  test_bloom_filters_used_when_some_filters_are_missing () {
>  	log_args=$1
> -	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
> +	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
>  	setup "$log_args" &&
>  	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
>  	test_cmp log_wo_bloom log_w_bloom
>
René Scharfe June 19, 2020, 9:14 a.m. UTC | #2
Am 18.06.20 um 22:31 schrieb René Scharfe:
> Am 15.06.20 um 22:14 schrieb SZEDER Gábor via GitGitGadget:
>> --- a/revision.c
>> +++ b/revision.c
>> @@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>>  {
>>  	struct pathspec_item *pi;
>>  	char *path_alloc = NULL;
>> -	const char *path;
>> +	const char *path, *p;
>>  	int last_index;
>> -	int len;
>> +	size_t len;
>> +	int path_component_nr = 0, j;
>>
>>  	if (!revs->commits)
>>  		return;
>> @@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>>
>>  	len = strlen(path);
>>
>> -	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
>> -	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
>> +	p = path;
>> +	do {
>> +		p = strchrnul(p + 1, '/');
>> +		path_component_nr++;
>> +	} while (p - path < len);

> And shouldn't we use is_dir_sep() or find_last_dir_sep() instead of
> hard-coding a slash?

Not necessarily.  Paths should be normalized to use one specific
separator, probably slash, both when building and querying the Bloom
filter.  Otherwise a filter that knows e.g. "foo/bar" could confidently
claim that "foo\bar" does not match.  If this is done in a previous
step then using a literal slash here would be correct.

René
Taylor Blau June 19, 2020, 5:17 p.m. UTC | #3
Hi Stolee,

On Mon, Jun 15, 2020 at 08:14:50PM +0000, SZEDER Gábor via GitGitGadget wrote:
> From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
>
> The file 'dir/subdir/file' can only be modified if its leading
> directories 'dir' and 'dir/subdir' are modified as well.
>
> So when checking modified path Bloom filters looking for commits
> modifying a path with multiple path components, then check not only
> the full path in the Bloom filters, but all its leading directories as
> well.  Take care to check these paths in "deepest first" order,
> because it's the full path that is least likely to be modified, and
> the Bloom filter queries can short circuit sooner.
>
> This can significantly reduce the average false positive rate, by
> about an order of magnitude or three(!), and can further speed up
> pathspec-limited revision walks.  The table below compares the average
> false positive rate and runtime of
>
>   git rev-list HEAD -- "$path"
>
> before and after this change for 5000+ randomly* selected paths from
> each repository:
>
>                     Average false           Average        Average
>                     positive rate           runtime        runtime
>                   before     after     before     after   difference
>   ------------------------------------------------------------------
>   git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
>   linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
>   tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%
>
> *Path selection was done with the following pipeline:
>
> 	git ls-tree -r --name-only HEAD | sort -R | head -n 5000
>
> The improvements in runtime are much smaller than the improvements in
> average false positive rate, as we are clearly reaching diminishing
> returns here.  However, all these timings depend on that accessing
> tree objects is reasonably fast (warm caches).  If we had a partial
> clone and the tree objects had to be fetched from a promisor remote,
> e.g.:
>
>   $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
>   $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
>         commit-graph write --reachable
>   $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
>   $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
>         rev-list HEAD -- "$path"
>
> then checking all leading path component can reduce the runtime from
> over an hour to a few seconds (and this is with the clone and the
> promisor on the same machine).
>
> This adjusts the tracing values in t4216-log-bloom.sh, which provides a
> concrete way to notice the improvement.
>
> Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  revision.c           | 35 ++++++++++++++++++++++++++---------
>  revision.h           |  6 ++++--
>  t/t4216-log-bloom.sh |  2 +-
>  3 files changed, 31 insertions(+), 12 deletions(-)
>
> diff --git a/revision.c b/revision.c
> index c644c660917..027ae3982b4 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>  {
>  	struct pathspec_item *pi;
>  	char *path_alloc = NULL;
> -	const char *path;
> +	const char *path, *p;
>  	int last_index;
> -	int len;
> +	size_t len;
> +	int path_component_nr = 0, j;
>
>  	if (!revs->commits)
>  		return;
> @@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>
>  	len = strlen(path);
>
> -	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
> -	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
> +	p = path;
> +	do {
> +		p = strchrnul(p + 1, '/');
> +		path_component_nr++;
> +	} while (p - path < len);
> +
> +	revs->bloom_keys_nr = path_component_nr;
> +	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
> +
> +	p = path;
> +	for (j = 0; j < revs->bloom_keys_nr; j++) {
> +		p = strchrnul(p + 1, '/');
> +
> +		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
> +			       revs->bloom_filter_settings);
> +	}
>

Somewhat related to our off-list discussion yesterday, there is a bug in
both 2.27 and this patch which produces incorrect results when (1)
Bloom filters are enabled, and (2) we are doing a revision walk from
root with the pathspec '.'.

What appears to be going on is that our normalization takes '.' -> '',
and then we form a Bloom key based on the empty string, which will
return 'definitely not' when querying the Bloom filter some of the time,
which should never happen. This is a consequence of never inserting the
empty key into the Bloom filter upon generation.

As a result, I have patched this in GitHub's fork (which is currently
based on 2.27 and doesn't have these patches yet) by doing an early
return when 'strlen(path) == 0'. Since it looks like these patches are
going to land, here is some clean-up and a fix for the bug that you
should feel free to test with and apply on top:

--- >8 ---

diff --git a/revision.c b/revision.c
index 8bd383b1dd..123e72698d 100644
--- a/revision.c
+++ b/revision.c
@@ -670,10 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 {
        struct pathspec_item *pi;
        char *path_alloc = NULL;
-       const char *path, *p;
+       char *path, *p;
        int last_index;
        size_t len;
-       int path_component_nr = 0, j;
+       int path_component_nr = 1, j;

        if (!revs->commits)
                return;
@@ -698,29 +698,33 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)

        /* remove single trailing slash from path, if needed */
        if (pi->match[last_index] == '/') {
-           path_alloc = xstrdup(pi->match);
-           path_alloc[last_index] = '\0';
-           path = path_alloc;
-       } else
-           path = pi->match;
+               path_alloc = xstrdup(pi->match);
+               path_alloc[last_index] = '\0';
+               path = path_alloc;
+       } else {
+               path = pi->match;
+               len = pi->len;
+       }

-       len = strlen(path);
+       if (!len)
+               return;

-       p = path;
        do {
-               p = strchrnul(p + 1, '/');
-               path_component_nr++;
-       } while (p - path < len);
+               if (is_dir_sep(*p)) {
+                       *p = '\0';
+                       path_component_nr++;
+               }
+       } while (*p++);

        revs->bloom_keys_nr = path_component_nr;
        ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);

        p = path;
        for (j = 0; j < revs->bloom_keys_nr; j++) {
-               p = strchrnul(p + 1, '/');
-
-               fill_bloom_key(path, p - path, &revs->bloom_keys[j],
+               size_t plen = strlen(p);
+               fill_bloom_key(p, plen, &revs->bloom_keys[j],
                               revs->bloom_filter_settings);
+               p += plen;
        }

        if (trace2_is_enabled() && !bloom_filter_atexit_registered) {

>  	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
>  		atexit(trace2_bloom_filter_statistics_atexit);
> @@ -720,7 +735,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
>  						 struct commit *commit)
>  {
>  	struct bloom_filter *filter;
> -	int result;
> +	int result = 1, j;
>
>  	if (!revs->repo->objects->commit_graph)
>  		return -1;
> @@ -740,9 +755,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
>  		return -1;
>  	}
>
> -	result = bloom_filter_contains(filter,
> -				       revs->bloom_key,
> -				       revs->bloom_filter_settings);
> +	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
> +		result = bloom_filter_contains(filter,
> +					       &revs->bloom_keys[j],
> +					       revs->bloom_filter_settings);
> +	}
>
>  	if (result)
>  		count_bloom_filter_maybe++;
> @@ -782,7 +799,7 @@ static int rev_compare_tree(struct rev_info *revs,
>  			return REV_TREE_SAME;
>  	}
>
> -	if (revs->bloom_key && !nth_parent) {
> +	if (revs->bloom_keys_nr && !nth_parent) {
>  		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
>
>  		if (bloom_ret == 0)
> diff --git a/revision.h b/revision.h
> index 7c026fe41fc..abbfb4ab59a 100644
> --- a/revision.h
> +++ b/revision.h
> @@ -295,8 +295,10 @@ struct rev_info {
>  	struct topo_walk_info *topo_walk_info;
>
>  	/* Commit graph bloom filter fields */
> -	/* The bloom filter key for the pathspec */
> -	struct bloom_key *bloom_key;
> +	/* The bloom filter key(s) for the pathspec */
> +	struct bloom_key *bloom_keys;
> +	int bloom_keys_nr;
> +
>  	/*
>  	 * The bloom filter settings used to generate the key.
>  	 * This is loaded from the commit-graph being used.
> diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
> index c7011f33e2c..c13b97d3bda 100755
> --- a/t/t4216-log-bloom.sh
> +++ b/t/t4216-log-bloom.sh
> @@ -142,7 +142,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
>
>  test_bloom_filters_used_when_some_filters_are_missing () {
>  	log_args=$1
> -	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
> +	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
>  	setup "$log_args" &&
>  	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
>  	test_cmp log_wo_bloom log_w_bloom
> --
> gitgitgadget
>
Thanks,
Taylor
Taylor Blau June 19, 2020, 5:19 p.m. UTC | #4
On Fri, Jun 19, 2020 at 11:17:17AM -0600, Taylor Blau wrote:
> Hi Stolee,
>
> On Mon, Jun 15, 2020 at 08:14:50PM +0000, SZEDER Gábor via GitGitGadget wrote:
> > From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
> >
> > The file 'dir/subdir/file' can only be modified if its leading
> > directories 'dir' and 'dir/subdir' are modified as well.
> >
> > So when checking modified path Bloom filters looking for commits
> > modifying a path with multiple path components, then check not only
> > the full path in the Bloom filters, but all its leading directories as
> > well.  Take care to check these paths in "deepest first" order,
> > because it's the full path that is least likely to be modified, and
> > the Bloom filter queries can short circuit sooner.
> >
> > This can significantly reduce the average false positive rate, by
> > about an order of magnitude or three(!), and can further speed up
> > pathspec-limited revision walks.  The table below compares the average
> > false positive rate and runtime of
> >
> >   git rev-list HEAD -- "$path"
> >
> > before and after this change for 5000+ randomly* selected paths from
> > each repository:
> >
> >                     Average false           Average        Average
> >                     positive rate           runtime        runtime
> >                   before     after     before     after   difference
> >   ------------------------------------------------------------------
> >   git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
> >   linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
> >   tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%
> >
> > *Path selection was done with the following pipeline:
> >
> > 	git ls-tree -r --name-only HEAD | sort -R | head -n 5000
> >
> > The improvements in runtime are much smaller than the improvements in
> > average false positive rate, as we are clearly reaching diminishing
> > returns here.  However, all these timings depend on that accessing
> > tree objects is reasonably fast (warm caches).  If we had a partial
> > clone and the tree objects had to be fetched from a promisor remote,
> > e.g.:
> >
> >   $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
> >   $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
> >         commit-graph write --reachable
> >   $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
> >   $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
> >         rev-list HEAD -- "$path"
> >
> > then checking all leading path component can reduce the runtime from
> > over an hour to a few seconds (and this is with the clone and the
> > promisor on the same machine).
> >
> > This adjusts the tracing values in t4216-log-bloom.sh, which provides a
> > concrete way to notice the improvement.
> >
> > Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
> > Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> > ---
> >  revision.c           | 35 ++++++++++++++++++++++++++---------
> >  revision.h           |  6 ++++--
> >  t/t4216-log-bloom.sh |  2 +-
> >  3 files changed, 31 insertions(+), 12 deletions(-)
> >
> > diff --git a/revision.c b/revision.c
> > index c644c660917..027ae3982b4 100644
> > --- a/revision.c
> > +++ b/revision.c
> > @@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
> >  {
> >  	struct pathspec_item *pi;
> >  	char *path_alloc = NULL;
> > -	const char *path;
> > +	const char *path, *p;
> >  	int last_index;
> > -	int len;
> > +	size_t len;
> > +	int path_component_nr = 0, j;
> >
> >  	if (!revs->commits)
> >  		return;
> > @@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
> >
> >  	len = strlen(path);
> >
> > -	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
> > -	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
> > +	p = path;
> > +	do {
> > +		p = strchrnul(p + 1, '/');
> > +		path_component_nr++;
> > +	} while (p - path < len);
> > +
> > +	revs->bloom_keys_nr = path_component_nr;
> > +	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
> > +
> > +	p = path;
> > +	for (j = 0; j < revs->bloom_keys_nr; j++) {
> > +		p = strchrnul(p + 1, '/');
> > +
> > +		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
> > +			       revs->bloom_filter_settings);
> > +	}
> >
>
> Somewhat related to our off-list discussion yesterday, there is a bug in
> both 2.27 and this patch which produces incorrect results when (1)
> Bloom filters are enabled, and (2) we are doing a revision walk from
> root with the pathspec '.'.
>
> What appears to be going on is that our normalization takes '.' -> '',
> and then we form a Bloom key based on the empty string, which will
> return 'definitely not' when querying the Bloom filter some of the time,
> which should never happen. This is a consequence of never inserting the
> empty key into the Bloom filter upon generation.
>
> As a result, I have patched this in GitHub's fork (which is currently
> based on 2.27 and doesn't have these patches yet) by doing an early
> return when 'strlen(path) == 0'. Since it looks like these patches are
> going to land, here is some clean-up and a fix for the bug that you
> should feel free to test with and apply on top:
>
> --- >8 ---
>
> diff --git a/revision.c b/revision.c
> index 8bd383b1dd..123e72698d 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -670,10 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>  {
>         struct pathspec_item *pi;
>         char *path_alloc = NULL;
> -       const char *path, *p;
> +       char *path, *p;
>         int last_index;
>         size_t len;
> -       int path_component_nr = 0, j;
> +       int path_component_nr = 1, j;
>
>         if (!revs->commits)
>                 return;
> @@ -698,29 +698,33 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>
>         /* remove single trailing slash from path, if needed */
>         if (pi->match[last_index] == '/') {
> -           path_alloc = xstrdup(pi->match);
> -           path_alloc[last_index] = '\0';
> -           path = path_alloc;
> -       } else
> -           path = pi->match;
> +               path_alloc = xstrdup(pi->match);
> +               path_alloc[last_index] = '\0';
> +               path = path_alloc;
> +       } else {
> +               path = pi->match;
> +               len = pi->len;
> +       }
>
> -       len = strlen(path);
> +       if (!len)
> +               return;

I should note that _this_ is the critical fix, and it should fix the bug
if you only applied just this hunk.

Everything else is purely style clean-ups on top (ranging from the four
spaces used instead of a tab, to some string processing niceties that I
_think_ should address Rene's concern, although I'm not sure if an
actual bug is lurking there or not...)

> -       p = path;
>         do {
> -               p = strchrnul(p + 1, '/');
> -               path_component_nr++;
> -       } while (p - path < len);
> +               if (is_dir_sep(*p)) {
> +                       *p = '\0';
> +                       path_component_nr++;
> +               }
> +       } while (*p++);
>
>         revs->bloom_keys_nr = path_component_nr;
>         ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
>
>         p = path;
>         for (j = 0; j < revs->bloom_keys_nr; j++) {
> -               p = strchrnul(p + 1, '/');
> -
> -               fill_bloom_key(path, p - path, &revs->bloom_keys[j],
> +               size_t plen = strlen(p);
> +               fill_bloom_key(p, plen, &revs->bloom_keys[j],
>                                revs->bloom_filter_settings);
> +               p += plen;
>         }
>
>         if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
>
> >  	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
> >  		atexit(trace2_bloom_filter_statistics_atexit);
> > @@ -720,7 +735,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
> >  						 struct commit *commit)
> >  {
> >  	struct bloom_filter *filter;
> > -	int result;
> > +	int result = 1, j;
> >
> >  	if (!revs->repo->objects->commit_graph)
> >  		return -1;
> > @@ -740,9 +755,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
> >  		return -1;
> >  	}
> >
> > -	result = bloom_filter_contains(filter,
> > -				       revs->bloom_key,
> > -				       revs->bloom_filter_settings);
> > +	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
> > +		result = bloom_filter_contains(filter,
> > +					       &revs->bloom_keys[j],
> > +					       revs->bloom_filter_settings);
> > +	}
> >
> >  	if (result)
> >  		count_bloom_filter_maybe++;
> > @@ -782,7 +799,7 @@ static int rev_compare_tree(struct rev_info *revs,
> >  			return REV_TREE_SAME;
> >  	}
> >
> > -	if (revs->bloom_key && !nth_parent) {
> > +	if (revs->bloom_keys_nr && !nth_parent) {
> >  		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
> >
> >  		if (bloom_ret == 0)
> > diff --git a/revision.h b/revision.h
> > index 7c026fe41fc..abbfb4ab59a 100644
> > --- a/revision.h
> > +++ b/revision.h
> > @@ -295,8 +295,10 @@ struct rev_info {
> >  	struct topo_walk_info *topo_walk_info;
> >
> >  	/* Commit graph bloom filter fields */
> > -	/* The bloom filter key for the pathspec */
> > -	struct bloom_key *bloom_key;
> > +	/* The bloom filter key(s) for the pathspec */
> > +	struct bloom_key *bloom_keys;
> > +	int bloom_keys_nr;
> > +
> >  	/*
> >  	 * The bloom filter settings used to generate the key.
> >  	 * This is loaded from the commit-graph being used.
> > diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
> > index c7011f33e2c..c13b97d3bda 100755
> > --- a/t/t4216-log-bloom.sh
> > +++ b/t/t4216-log-bloom.sh
> > @@ -142,7 +142,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
> >
> >  test_bloom_filters_used_when_some_filters_are_missing () {
> >  	log_args=$1
> > -	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
> > +	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
> >  	setup "$log_args" &&
> >  	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
> >  	test_cmp log_wo_bloom log_w_bloom
> > --
> > gitgitgadget
> >
> Thanks,
> Taylor
Thanks,
Taylor
Derrick Stolee June 23, 2020, 1:47 p.m. UTC | #5
On 6/19/2020 1:17 PM, Taylor Blau wrote:
> Hi Stolee,
> 
> On Mon, Jun 15, 2020 at 08:14:50PM +0000, SZEDER Gábor via GitGitGadget wrote:
>> From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
>>
>> The file 'dir/subdir/file' can only be modified if its leading
>> directories 'dir' and 'dir/subdir' are modified as well.
>>
>> So when checking modified path Bloom filters looking for commits
>> modifying a path with multiple path components, then check not only
>> the full path in the Bloom filters, but all its leading directories as
>> well.  Take care to check these paths in "deepest first" order,
>> because it's the full path that is least likely to be modified, and
>> the Bloom filter queries can short circuit sooner.
>>
>> This can significantly reduce the average false positive rate, by
>> about an order of magnitude or three(!), and can further speed up
>> pathspec-limited revision walks.  The table below compares the average
>> false positive rate and runtime of
>>
>>   git rev-list HEAD -- "$path"
>>
>> before and after this change for 5000+ randomly* selected paths from
>> each repository:
>>
>>                     Average false           Average        Average
>>                     positive rate           runtime        runtime
>>                   before     after     before     after   difference
>>   ------------------------------------------------------------------
>>   git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
>>   linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
>>   tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%
>>
>> *Path selection was done with the following pipeline:
>>
>> 	git ls-tree -r --name-only HEAD | sort -R | head -n 5000
>>
>> The improvements in runtime are much smaller than the improvements in
>> average false positive rate, as we are clearly reaching diminishing
>> returns here.  However, all these timings depend on that accessing
>> tree objects is reasonably fast (warm caches).  If we had a partial
>> clone and the tree objects had to be fetched from a promisor remote,
>> e.g.:
>>
>>   $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
>>   $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
>>         commit-graph write --reachable
>>   $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
>>   $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
>>         rev-list HEAD -- "$path"
>>
>> then checking all leading path component can reduce the runtime from
>> over an hour to a few seconds (and this is with the clone and the
>> promisor on the same machine).
>>
>> This adjusts the tracing values in t4216-log-bloom.sh, which provides a
>> concrete way to notice the improvement.
>>
>> Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  revision.c           | 35 ++++++++++++++++++++++++++---------
>>  revision.h           |  6 ++++--
>>  t/t4216-log-bloom.sh |  2 +-
>>  3 files changed, 31 insertions(+), 12 deletions(-)
>>
>> diff --git a/revision.c b/revision.c
>> index c644c660917..027ae3982b4 100644
>> --- a/revision.c
>> +++ b/revision.c
>> @@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>>  {
>>  	struct pathspec_item *pi;
>>  	char *path_alloc = NULL;
>> -	const char *path;
>> +	const char *path, *p;
>>  	int last_index;
>> -	int len;
>> +	size_t len;
>> +	int path_component_nr = 0, j;
>>
>>  	if (!revs->commits)
>>  		return;
>> @@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>>
>>  	len = strlen(path);
>>
>> -	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
>> -	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
>> +	p = path;
>> +	do {
>> +		p = strchrnul(p + 1, '/');
>> +		path_component_nr++;
>> +	} while (p - path < len);
>> +
>> +	revs->bloom_keys_nr = path_component_nr;
>> +	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
>> +
>> +	p = path;
>> +	for (j = 0; j < revs->bloom_keys_nr; j++) {
>> +		p = strchrnul(p + 1, '/');
>> +
>> +		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
>> +			       revs->bloom_filter_settings);
>> +	}
>>
> 
> Somewhat related to our off-list discussion yesterday, there is a bug in
> both 2.27 and this patch which produces incorrect results when (1)
> Bloom filters are enabled, and (2) we are doing a revision walk from
> root with the pathspec '.'.
> 
> What appears to be going on is that our normalization takes '.' -> '',
> and then we form a Bloom key based on the empty string, which will
> return 'definitely not' when querying the Bloom filter some of the time,
> which should never happen. This is a consequence of never inserting the
> empty key into the Bloom filter upon generation.
> 
> As a result, I have patched this in GitHub's fork (which is currently
> based on 2.27 and doesn't have these patches yet) by doing an early
> return when 'strlen(path) == 0'. Since it looks like these patches are
> going to land, here is some clean-up and a fix for the bug that you
> should feel free to test with and apply on top:
> 
> --- >8 ---
> 
> diff --git a/revision.c b/revision.c
> index 8bd383b1dd..123e72698d 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -670,10 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>  {
>         struct pathspec_item *pi;
>         char *path_alloc = NULL;
> -       const char *path, *p;
> +       char *path, *p;
>         int last_index;
>         size_t len;
> -       int path_component_nr = 0, j;
> +       int path_component_nr = 1, j;
> 
>         if (!revs->commits)
>                 return;
> @@ -698,29 +698,33 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
> 
>         /* remove single trailing slash from path, if needed */
>         if (pi->match[last_index] == '/') {
> -           path_alloc = xstrdup(pi->match);
> -           path_alloc[last_index] = '\0';
> -           path = path_alloc;
> -       } else
> -           path = pi->match;
> +               path_alloc = xstrdup(pi->match);
> +               path_alloc[last_index] = '\0';
> +               path = path_alloc;
> +       } else {
> +               path = pi->match;
> +               len = pi->len;
> +       }
> 
> -       len = strlen(path);
> +       if (!len)
> +               return;
> 
> -       p = path;
>         do {
> -               p = strchrnul(p + 1, '/');
> -               path_component_nr++;
> -       } while (p - path < len);
> +               if (is_dir_sep(*p)) {
> +                       *p = '\0';
> +                       path_component_nr++;
> +               }
> +       } while (*p++);
> 
>         revs->bloom_keys_nr = path_component_nr;
>         ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
> 
>         p = path;
>         for (j = 0; j < revs->bloom_keys_nr; j++) {
> -               p = strchrnul(p + 1, '/');
> -
> -               fill_bloom_key(path, p - path, &revs->bloom_keys[j],
> +               size_t plen = strlen(p);
> +               fill_bloom_key(p, plen, &revs->bloom_keys[j],
>                                revs->bloom_filter_settings);
> +               p += plen;

I don't think this is correct at all. Looking at it, it seems
that it would take a path "A/B/C" and add keys for "A", "B", and
"C" instead of "A", "A/B", and "A/B/C".

Looking more closely, there are a few issues that makes it clear
why you didn't see a failing test:

1. You use "while (*p++)" instead of "while (*++p)" so the scan
   terminates after the first directory split. (So only "A" is
   added, which won't fail, but will be slower than intended.)

Changing that, we see the next problem:

2. You use "p += plen" instead of "p += plen + 1". This causes
   the filters to add "A", "", and "" (because we don't skip the
   terminating character). This _is_ incorrect and would result
   in test failures.

Changing that, we then see "A", "B", and "C" are added as keys.

I'm going to take the style issues that you presented, and
change them in one commit (reported-by you) and then the
if (!len) fix in a separate patch.

I'll update the scan loops to use is_dir_sep() accordingly
in this patch.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/revision.c b/revision.c
index c644c660917..027ae3982b4 100644
--- a/revision.c
+++ b/revision.c
@@ -670,9 +670,10 @@  static void prepare_to_use_bloom_filter(struct rev_info *revs)
 {
 	struct pathspec_item *pi;
 	char *path_alloc = NULL;
-	const char *path;
+	const char *path, *p;
 	int last_index;
-	int len;
+	size_t len;
+	int path_component_nr = 0, j;
 
 	if (!revs->commits)
 		return;
@@ -705,8 +706,22 @@  static void prepare_to_use_bloom_filter(struct rev_info *revs)
 
 	len = strlen(path);
 
-	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
-	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+	p = path;
+	do {
+		p = strchrnul(p + 1, '/');
+		path_component_nr++;
+	} while (p - path < len);
+
+	revs->bloom_keys_nr = path_component_nr;
+	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
+
+	p = path;
+	for (j = 0; j < revs->bloom_keys_nr; j++) {
+		p = strchrnul(p + 1, '/');
+
+		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
+			       revs->bloom_filter_settings);
+	}
 
 	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
 		atexit(trace2_bloom_filter_statistics_atexit);
@@ -720,7 +735,7 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 						 struct commit *commit)
 {
 	struct bloom_filter *filter;
-	int result;
+	int result = 1, j;
 
 	if (!revs->repo->objects->commit_graph)
 		return -1;
@@ -740,9 +755,11 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 		return -1;
 	}
 
-	result = bloom_filter_contains(filter,
-				       revs->bloom_key,
-				       revs->bloom_filter_settings);
+	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
+		result = bloom_filter_contains(filter,
+					       &revs->bloom_keys[j],
+					       revs->bloom_filter_settings);
+	}
 
 	if (result)
 		count_bloom_filter_maybe++;
@@ -782,7 +799,7 @@  static int rev_compare_tree(struct rev_info *revs,
 			return REV_TREE_SAME;
 	}
 
-	if (revs->bloom_key && !nth_parent) {
+	if (revs->bloom_keys_nr && !nth_parent) {
 		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
 
 		if (bloom_ret == 0)
diff --git a/revision.h b/revision.h
index 7c026fe41fc..abbfb4ab59a 100644
--- a/revision.h
+++ b/revision.h
@@ -295,8 +295,10 @@  struct rev_info {
 	struct topo_walk_info *topo_walk_info;
 
 	/* Commit graph bloom filter fields */
-	/* The bloom filter key for the pathspec */
-	struct bloom_key *bloom_key;
+	/* The bloom filter key(s) for the pathspec */
+	struct bloom_key *bloom_keys;
+	int bloom_keys_nr;
+
 	/*
 	 * The bloom filter settings used to generate the key.
 	 * This is loaded from the commit-graph being used.
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index c7011f33e2c..c13b97d3bda 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -142,7 +142,7 @@  test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
 
 test_bloom_filters_used_when_some_filters_are_missing () {
 	log_args=$1
-	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
+	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
 	setup "$log_args" &&
 	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
 	test_cmp log_wo_bloom log_w_bloom