diff mbox series

[v2,03/11] diff: halt tree-diff early after max_changes

Message ID a698c04a78cf2988fb822e0aa532989f925e0a9e.1580943390.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Linus Arver via GitGitGadget Feb. 5, 2020, 10:56 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

When computing the changed-paths bloom filters for the commit-graph,
we limit the size of the filter by restricting the number of paths
in the diff. Instead of computing a large diff and then ignoring the
result, it is better to halt the diff computation early.

Create a new "max_changes" option in struct diff_options. If non-zero,
then halt the diff computation after discovering strictly more changed
paths. This includes paths corresponding to trees that change.

Use this max_changes option in the bloom filter calculations. This
reduces the time taken to compute the filters for the Linux kernel
repo from 2m50s to 2m35s. On a large internal repository with ~500
commits that perform tree-wide changes, the time reduced from
6m15s to 3m48s.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 bloom.c     | 4 +++-
 diff.h      | 5 +++++
 tree-diff.c | 6 ++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

Comments

Jakub Narębski Feb. 17, 2020, midnight UTC | #1
"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Derrick Stolee <dstolee@microsoft.com>
>
> When computing the changed-paths bloom filters for the commit-graph,
> we limit the size of the filter by restricting the number of paths
> in the diff. Instead of computing a large diff and then ignoring the
> result, it is better to halt the diff computation early.

Good idea.

>
> Create a new "max_changes" option in struct diff_options. If non-zero,
> then halt the diff computation after discovering strictly more changed
> paths. This includes paths corresponding to trees that change.

All right; also, it doesn't need to be exact, though it would be good if
it was.

512 changed paths (changed files) usually generate more than 512
elements to be added to the Bloom filter (changed directories and
files), anyway.

>
> Use this max_changes option in the bloom filter calculations. This
> reduces the time taken to compute the filters for the Linux kernel
> repo from 2m50s to 2m35s. On a large internal repository with ~500
> commits that perform tree-wide changes, the time reduced from
> 6m15s to 3m48s.

I wonder if there is some large open-source project with many commits
performing tree-wide changes, that is with many commits with more than
512 changed files with respect to the first parent.

Maybe https://github.com/whosonfirst-data/whosonfirst-data-venue-us-ny
from "Top Ten Worst Repositories to host on GitHub - Git Merge 2017"
could be a good repository to test ;-)

>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>

Looks good to me, but that is from cursory examination.
Don't know the area to say anything more.

> ---
>  bloom.c     | 4 +++-
>  diff.h      | 5 +++++
>  tree-diff.c | 6 ++++++
>  3 files changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/bloom.c b/bloom.c
> index 6082193a75..818382c03b 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -134,6 +134,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
>  	int i;
>  	struct diff_options diffopt;
> +	int max_changes = 512;
>  
>  	if (!bloom_filters.slab_size)
>  		return NULL;
> @@ -142,6 +143,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  
>  	repo_diff_setup(r, &diffopt);
>  	diffopt.flags.recursive = 1;
> +	diffopt.max_changes = max_changes;
>  	diff_setup_done(&diffopt);
>  
>  	if (c->parents)
> @@ -150,7 +152,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
>  	diffcore_std(&diffopt);
>  
> -	if (diff_queued_diff.nr <= 512) {
> +	if (diff_queued_diff.nr <= max_changes) {
>  		struct hashmap pathmap;
>  		struct pathmap_hash_entry* e;
>  		struct hashmap_iter iter;
> diff --git a/diff.h b/diff.h
> index 6febe7e365..9443dc1b00 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -285,6 +285,11 @@ struct diff_options {
>  	/* Number of hexdigits to abbreviate raw format output to. */
>  	int abbrev;
>  
> +	/* If non-zero, then stop computing after this many changes. */
> +	int max_changes;
> +	/* For internal use only. */
> +	int num_changes;
> +
>  	int ita_invisible_in_index;
>  /* white-space error highlighting */
>  #define WSEH_NEW (1<<12)
> diff --git a/tree-diff.c b/tree-diff.c
> index 33ded7f8b3..f3d303c6e5 100644
> --- a/tree-diff.c
> +++ b/tree-diff.c
> @@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		if (diff_can_quit_early(opt))
>  			break;
>  
> +		if (opt->max_changes && opt->num_changes > opt->max_changes)
> +			break;
> +
>  		if (opt->pathspec.nr) {
>  			skip_uninteresting(&t, base, opt);
>  			for (i = 0; i < nparent; i++)
> @@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  
>  			/* t↓ */
>  			update_tree_entry(&t);
> +			opt->num_changes++;
>  		}
>  
>  		/* t > p[imin] */
> @@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		skip_emit_tp:
>  			/* ∀ pi=p[imin]  pi↓ */
>  			update_tp_entries(tp, nparent);
> +			opt->num_changes++;
>  		}
>  	}
>  
> @@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
>  	const struct object_id **parents_oid, int nparent,
>  	struct strbuf *base, struct diff_options *opt)
>  {
> +	opt->num_changes = 0;
>  	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
>  
>  	/*
Garima Singh Feb. 22, 2020, 12:37 a.m. UTC | #2
On 2/16/2020 7:00 PM, Jakub Narebski wrote:
> "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> Use this max_changes option in the bloom filter calculations. This
>> reduces the time taken to compute the filters for the Linux kernel
>> repo from 2m50s to 2m35s. On a large internal repository with ~500
>> commits that perform tree-wide changes, the time reduced from
>> 6m15s to 3m48s.
> 
> I wonder if there is some large open-source project with many commits
> performing tree-wide changes, that is with many commits with more than
> 512 changed files with respect to the first parent.
> 
> Maybe https://github.com/whosonfirst-data/whosonfirst-data-venue-us-ny
> from "Top Ten Worst Repositories to host on GitHub - Git Merge 2017"
> could be a good repository to test ;-)
> 

Thanks for the suggestion! I will see if any of these repos gives us a 
good test bed and add the perf improvement numbers in the appropriate
commit messages in v3. 

Cheers! 
Garima Singh
diff mbox series

Patch

diff --git a/bloom.c b/bloom.c
index 6082193a75..818382c03b 100644
--- a/bloom.c
+++ b/bloom.c
@@ -134,6 +134,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 	int i;
 	struct diff_options diffopt;
+	int max_changes = 512;
 
 	if (!bloom_filters.slab_size)
 		return NULL;
@@ -142,6 +143,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
+	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
 	if (c->parents)
@@ -150,7 +152,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diff_queued_diff.nr <= 512) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry* e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index 6febe7e365..9443dc1b00 100644
--- a/diff.h
+++ b/diff.h
@@ -285,6 +285,11 @@  struct diff_options {
 	/* Number of hexdigits to abbreviate raw format output to. */
 	int abbrev;
 
+	/* If non-zero, then stop computing after this many changes. */
+	int max_changes;
+	/* For internal use only. */
+	int num_changes;
+
 	int ita_invisible_in_index;
 /* white-space error highlighting */
 #define WSEH_NEW (1<<12)
diff --git a/tree-diff.c b/tree-diff.c
index 33ded7f8b3..f3d303c6e5 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,6 +434,9 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
+		if (opt->max_changes && opt->num_changes > opt->max_changes)
+			break;
+
 		if (opt->pathspec.nr) {
 			skip_uninteresting(&t, base, opt);
 			for (i = 0; i < nparent; i++)
@@ -518,6 +521,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
+			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,6 +539,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
+			opt->num_changes++;
 		}
 	}
 
@@ -552,6 +557,7 @@  struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
+	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*