[v4,05/15] diff: halt tree-diff early after max_changes
diff mbox series

Message ID 2d4c0b2da38632424c8bd31ccb2037e0676c3c74.1586192395.git.gitgitgadget@gmail.com
State New
Headers show
Series
  • Changed Paths Bloom Filters
Related show

Commit Message

Elijah Newren via GitGitGadget April 6, 2020, 4:59 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

When computing the changed-paths bloom filters for the commit-graph,
we limit the size of the filter by restricting the number of paths
in the diff. Instead of computing a large diff and then ignoring the
result, it is better to halt the diff computation early.

Create a new "max_changes" option in struct diff_options. If non-zero,
then halt the diff computation after discovering strictly more changed
paths. This includes paths corresponding to trees that change.

Use this max_changes option in the bloom filter calculations. This
reduces the time taken to compute the filters for the Linux kernel
repo from 2m50s to 2m35s. On a large internal repository with ~500
commits that perform tree-wide changes, the time reduced from
6m15s to 3m48s.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 bloom.c     | 4 +++-
 diff.h      | 5 +++++
 tree-diff.c | 6 ++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

Patch
diff mbox series

diff --git a/bloom.c b/bloom.c
index 881a9841ede..a16eee92331 100644
--- a/bloom.c
+++ b/bloom.c
@@ -133,6 +133,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 	int i;
 	struct diff_options diffopt;
+	int max_changes = 512;
 
 	if (bloom_filters.slab_size == 0)
 		return NULL;
@@ -141,6 +142,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
+	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
 	if (c->parents)
@@ -149,7 +151,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diff_queued_diff.nr <= 512) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index 6febe7e3656..9443dc1b003 100644
--- a/diff.h
+++ b/diff.h
@@ -285,6 +285,11 @@  struct diff_options {
 	/* Number of hexdigits to abbreviate raw format output to. */
 	int abbrev;
 
+	/* If non-zero, then stop computing after this many changes. */
+	int max_changes;
+	/* For internal use only. */
+	int num_changes;
+
 	int ita_invisible_in_index;
 /* white-space error highlighting */
 #define WSEH_NEW (1<<12)
diff --git a/tree-diff.c b/tree-diff.c
index 33ded7f8b3e..f3d303c6e54 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,6 +434,9 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
+		if (opt->max_changes && opt->num_changes > opt->max_changes)
+			break;
+
 		if (opt->pathspec.nr) {
 			skip_uninteresting(&t, base, opt);
 			for (i = 0; i < nparent; i++)
@@ -518,6 +521,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
+			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,6 +539,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
+			opt->num_changes++;
 		}
 	}
 
@@ -552,6 +557,7 @@  struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
+	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*