diff mbox series

[v4,05/15] diff: halt tree-diff early after max_changes

Message ID 2d4c0b2da38632424c8bd31ccb2037e0676c3c74.1586192395.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Linus Arver via GitGitGadget April 6, 2020, 4:59 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

When computing the changed-paths bloom filters for the commit-graph,
we limit the size of the filter by restricting the number of paths
in the diff. Instead of computing a large diff and then ignoring the
result, it is better to halt the diff computation early.

Create a new "max_changes" option in struct diff_options. If non-zero,
then halt the diff computation after discovering strictly more changed
paths. This includes paths corresponding to trees that change.

Use this max_changes option in the bloom filter calculations. This
reduces the time taken to compute the filters for the Linux kernel
repo from 2m50s to 2m35s. On a large internal repository with ~500
commits that perform tree-wide changes, the time reduced from
6m15s to 3m48s.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 bloom.c     | 4 +++-
 diff.h      | 5 +++++
 tree-diff.c | 6 ++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

Comments

SZEDER Gábor Aug. 4, 2020, 2:47 p.m. UTC | #1
On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> From: Derrick Stolee <dstolee@microsoft.com>
> 
> When computing the changed-paths bloom filters for the commit-graph,
> we limit the size of the filter by restricting the number of paths
> in the diff. Instead of computing a large diff and then ignoring the
> result, it is better to halt the diff computation early.
> 
> Create a new "max_changes" option in struct diff_options. If non-zero,
> then halt the diff computation after discovering strictly more changed
> paths. This includes paths corresponding to trees that change.
> 
> Use this max_changes option in the bloom filter calculations. This
> reduces the time taken to compute the filters for the Linux kernel
> repo from 2m50s to 2m35s. On a large internal repository with ~500
> commits that perform tree-wide changes, the time reduced from
> 6m15s to 3m48s.
> 
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
> ---
>  bloom.c     | 4 +++-
>  diff.h      | 5 +++++
>  tree-diff.c | 6 ++++++
>  3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/bloom.c b/bloom.c
> index 881a9841ede..a16eee92331 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -133,6 +133,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
>  	int i;
>  	struct diff_options diffopt;
> +	int max_changes = 512;
>  
>  	if (bloom_filters.slab_size == 0)
>  		return NULL;
> @@ -141,6 +142,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  
>  	repo_diff_setup(r, &diffopt);
>  	diffopt.flags.recursive = 1;
> +	diffopt.max_changes = max_changes;
>  	diff_setup_done(&diffopt);
>  
>  	if (c->parents)
> @@ -149,7 +151,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
>  	diffcore_std(&diffopt);
>  
> -	if (diff_queued_diff.nr <= 512) {
> +	if (diff_queued_diff.nr <= max_changes) {
>  		struct hashmap pathmap;
>  		struct pathmap_hash_entry *e;
>  		struct hashmap_iter iter;
> diff --git a/diff.h b/diff.h
> index 6febe7e3656..9443dc1b003 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -285,6 +285,11 @@ struct diff_options {
>  	/* Number of hexdigits to abbreviate raw format output to. */
>  	int abbrev;
>  
> +	/* If non-zero, then stop computing after this many changes. */
> +	int max_changes;
> +	/* For internal use only. */
> +	int num_changes;

"For internal use only", understood.

> +
>  	int ita_invisible_in_index;
>  /* white-space error highlighting */
>  #define WSEH_NEW (1<<12)
> diff --git a/tree-diff.c b/tree-diff.c
> index 33ded7f8b3e..f3d303c6e54 100644
> --- a/tree-diff.c
> +++ b/tree-diff.c
> @@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		if (diff_can_quit_early(opt))
>  			break;
>  
> +		if (opt->max_changes && opt->num_changes > opt->max_changes)
> +			break;
> +
>  		if (opt->pathspec.nr) {
>  			skip_uninteresting(&t, base, opt);
>  			for (i = 0; i < nparent; i++)
> @@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  
>  			/* t↓ */
>  			update_tree_entry(&t);
> +			opt->num_changes++;
>  		}
>  
>  		/* t > p[imin] */
> @@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		skip_emit_tp:
>  			/* ∀ pi=p[imin]  pi↓ */
>  			update_tp_entries(tp, nparent);
> +			opt->num_changes++;
>  		}
>  	}

This counter is basically broken, its value is wrong for over 98% of
commits, and, worse, its value remains 0 for over 85% of commits in
the repositories I usually use to test modified path Bloom filters.
Consequently, a relatively large number of commits modifying more than
512 paths get Bloom filters.

The makeshift tests in the patch below demonstrate these issues as
most of them fail, most notably those two tests that demonstrate that
modifying existing paths are not counted at all.


  ---  >8  ---

diff --git a/bloom.c b/bloom.c
index 9b86aa3f59..3db0fde734 100644
--- a/bloom.c
+++ b/bloom.c
@@ -203,7 +203,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.detect_rename = 0;
-	diffopt.max_changes = max_changes;
+	diffopt.max_changes = 0;
 	diff_setup_done(&diffopt);
 
 	/* ensure commit is parsed so we have parent information */
@@ -214,6 +214,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	else
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
+	printf("%s  %d\n", oid_to_hex(&c->object.oid), diffopt.num_changes);
 
 	if (diffopt.num_changes <= max_changes) {
 		struct hashmap pathmap;
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 0000000000..8d2bd9f03f
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,142 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	test_tick &&
+
+	echo 1 >file &&
+	mkdir -p dir/subdir &&
+	echo 1 >dir/subdir/file1 &&
+	echo 1 >dir/subdir/file2 &&
+	git add file dir &&
+	git commit -m setup &&
+
+	echo 2 >file &&
+	git commit -a -m "modify one path in root" &&
+	mod_one_path=$(git rev-parse HEAD) &&
+
+	echo 2 >dir/subdir/file1 &&
+	echo 2 >dir/subdir/file2 &&
+	git commit -a -m "modify two file two dirs deep" &&
+	mod_four_paths=$(git rev-parse HEAD) &&
+
+	>new-file &&
+	git add new-file &&
+	git commit -m "add new file in root" &&
+	new_file_in_root=$(git rev-parse HEAD) &&
+
+	git rm new-file &&
+	git commit -m "delete file in root" &&
+	delete_file_in_root=$(git rev-parse HEAD) &&
+
+	>dir/new-file &&
+	git add dir/new-file &&
+	git commit -m "add new file in dir" &&
+	new_file_in_dir=$(git rev-parse HEAD) &&
+
+	git rm dir/new-file &&
+	git commit -m "delete file in dir" &&
+	delete_file_in_dir=$(git rev-parse HEAD) &&
+
+	echo 1 >d-f &&
+	git add d-f &&
+	git commit -m foo &&
+	git rm d-f &&
+	mkdir d-f &&
+	echo 2 >d-f/file &&
+	git add d-f &&
+	git commit -m "replace file with dir" &&
+	file_to_dir=$(git rev-parse HEAD) &&
+
+	>d-f.c &&
+	git add d-f.c &&
+	git commit -m "add a file that sorts between d-f and d-f/" &&
+	git rm -r d-f &&
+	echo 3 >d-f &&
+	git add d-f &&
+	git commit -m "replace dir with file" &&
+	dir_to_file=$(git rev-parse HEAD) &&
+
+	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
+	# leading zero in mode: the content of the tree remains the same,
+	# but its oid does change!
+	printf "040000 subdir\0$bin_sha1" >rawtree &&
+	tree1=$(git hash-object -t tree -w rawtree) &&
+	git cat-file -p HEAD^{tree} >out &&
+	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
+	different_but_same_tree=$(git commit-tree \
+		-m "leading zeros in mode" \
+		-p $(git rev-parse HEAD) $tree2) &&
+	git update-ref HEAD $different_but_same_tree &&
+
+	git commit-graph write --reachable --changed-paths >out &&
+	cat out  # debug
+'
+
+test_expect_success 'modify one path in root' '
+	git diff --name-status $mod_one_path^ $mod_one_path &&
+	echo "$mod_one_path  1" >expect &&
+	grep "$mod_one_path" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'modify two file two dirs deep' '
+	git diff --name-status $mod_four_paths^ $mod_four_paths &&
+	echo "$mod_four_paths  4" >expect &&
+	grep "$mod_four_paths" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in root' '
+	git diff --name-status $new_file_in_root^ $new_file_in_root &&
+	echo "$new_file_in_root  1" >expect &&
+	grep "$new_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in root' '
+	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
+	echo "$delete_file_in_root  1" >expect &&
+	grep "$delete_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in dir' '
+	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
+	echo "$new_file_in_dir  2" >expect &&
+	grep "$new_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in dir' '
+	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
+	echo "$delete_file_in_dir  2" >expect &&
+	grep "$delete_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace file with dir' '
+	git diff --name-status $file_to_dir^ $file_to_dir &&
+	echo "$file_to_dir  2" >expect &&
+	grep "$file_to_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace dir with file' '
+	git diff --name-status $dir_to_file^ $dir_to_file &&
+	echo "$dir_to_file  2" >expect &&
+	grep "$dir_to_file" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'leading zeros in mode' '
+	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
+	echo "$different_but_same_tree  0" >expect &&
+	grep "$different_but_same_tree" out >actual &&
+	test_cmp expect actual
+'
+
+test_done

  ---  >8  ---


> @@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
>  	const struct object_id **parents_oid, int nparent,
>  	struct strbuf *base, struct diff_options *opt)
>  {
> +	opt->num_changes = 0;
>  	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
>  
>  	/*
> -- 
> gitgitgadget
>
Derrick Stolee Aug. 4, 2020, 4:25 p.m. UTC | #2
On 8/4/2020 10:47 AM, SZEDER Gábor wrote:
> On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> This counter is basically broken, its value is wrong for over 98% of
> commits, and, worse, its value remains 0 for over 85% of commits in
> the repositories I usually use to test modified path Bloom filters.
> Consequently, a relatively large number of commits modifying more than
> 512 paths get Bloom filters.

Thanks for finding this! The counter is only really tested in one
place, and that test only considers _file adds_, which is a problem.

If I understand this correctly, the bug is a performance-only bug
(since this is a performance-only feature), but it is an important
one to fix.

There is certainly some dark magic happening in this tree-diff logic,
so instead of trying to get an accurate count we should just use the
magic global diff_queued_diff to track the current list of file changes.

Note: diff_queued_diff does not track the directory changes, so it
is an under-count for the total changes to track in the Bloom filter.
This is later corrected by the block that adds these leading directory
changes.

> The makeshift tests in the patch below demonstrate these issues as
> most of them fail, most notably those two tests that demonstrate that
> modifying existing paths are not counted at all.

I adapted your diff along with ripping out 'num_changes' in favor
of diff_queued_diff.nr. This required modifying some of your expected
values in the test script (losing the leading directories in the
count).

I'll work with Taylor to create a fix, and include proper testing
of the logic here. We'll stick it in the v2 of his max-changed-paths
series [1]. He already has some helpful logging that can help create
tests that ensure this logic is performing as expected.

We plan to have that fix available by later today or early tomorrow.
Will you be available to help validate it?

[1] https://lore.kernel.org/git/cover.1596480582.git.me@ttaylorr.com/

Thanks,
-Stolee

  --- >8 ---

diff --git a/bloom.c b/bloom.c
index 1a573226e7..b8d6cb9240 100644
--- a/bloom.c
+++ b/bloom.c
@@ -218,8 +218,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	else
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
+	printf("%s  %d\n", oid_to_hex(&c->object.oid), diff_queued_diff.nr);
 
-	if (diffopt.num_changes <= max_changes) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index e0c0af6286b..1d32b718857 100644
--- a/diff.h
+++ b/diff.h
@@ -287,8 +287,6 @@ struct diff_options {
 
 	/* If non-zero, then stop computing after this many changes. */
 	int max_changes;
-	/* For internal use only. */
-	int num_changes;
 
 	int ita_invisible_in_index;
 /* white-space error highlighting */
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 00000000000..1f35aa8e2c5
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,142 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	test_tick &&
+
+	echo 1 >file &&
+	mkdir -p dir/subdir &&
+	echo 1 >dir/subdir/file1 &&
+	echo 1 >dir/subdir/file2 &&
+	git add file dir &&
+	git commit -m setup &&
+
+	echo 2 >file &&
+	git commit -a -m "modify one path in root" &&
+	mod_one_path=$(git rev-parse HEAD) &&
+
+	echo 2 >dir/subdir/file1 &&
+	echo 2 >dir/subdir/file2 &&
+	git commit -a -m "modify two file two dirs deep" &&
+	mod_four_paths=$(git rev-parse HEAD) &&
+
+	>new-file &&
+	git add new-file &&
+	git commit -m "add new file in root" &&
+	new_file_in_root=$(git rev-parse HEAD) &&
+
+	git rm new-file &&
+	git commit -m "delete file in root" &&
+	delete_file_in_root=$(git rev-parse HEAD) &&
+
+	>dir/new-file &&
+	git add dir/new-file &&
+	git commit -m "add new file in dir" &&
+	new_file_in_dir=$(git rev-parse HEAD) &&
+
+	git rm dir/new-file &&
+	git commit -m "delete file in dir" &&
+	delete_file_in_dir=$(git rev-parse HEAD) &&
+
+	echo 1 >d-f &&
+	git add d-f &&
+	git commit -m foo &&
+	git rm d-f &&
+	mkdir d-f &&
+	echo 2 >d-f/file &&
+	git add d-f &&
+	git commit -m "replace file with dir" &&
+	file_to_dir=$(git rev-parse HEAD) &&
+
+	>d-f.c &&
+	git add d-f.c &&
+	git commit -m "add a file that sorts between d-f and d-f/" &&
+	git rm -r d-f &&
+	echo 3 >d-f &&
+	git add d-f &&
+	git commit -m "replace dir with file" &&
+	dir_to_file=$(git rev-parse HEAD) &&
+
+	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
+	# leading zero in mode: the content of the tree remains the same,
+	# but its oid does change!
+	printf "040000 subdir\0$bin_sha1" >rawtree &&
+	tree1=$(git hash-object -t tree -w rawtree) &&
+	git cat-file -p HEAD^{tree} >out &&
+	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
+	different_but_same_tree=$(git commit-tree \
+		-m "leading zeros in mode" \
+		-p $(git rev-parse HEAD) $tree2) &&
+	git update-ref HEAD $different_but_same_tree &&
+
+	git commit-graph write --reachable --changed-paths >out &&
+	cat out  # debug
+'
+
+test_expect_success 'modify one path in root' '
+	git diff --name-status $mod_one_path^ $mod_one_path &&
+	echo "$mod_one_path  1" >expect &&
+	grep "$mod_one_path" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'modify two file two dirs deep' '
+	git diff --name-status $mod_four_paths^ $mod_four_paths &&
+	echo "$mod_four_paths  2" >expect &&
+	grep "$mod_four_paths" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in root' '
+	git diff --name-status $new_file_in_root^ $new_file_in_root &&
+	echo "$new_file_in_root  1" >expect &&
+	grep "$new_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in root' '
+	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
+	echo "$delete_file_in_root  1" >expect &&
+	grep "$delete_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in dir' '
+	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
+	echo "$new_file_in_dir  1" >expect &&
+	grep "$new_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in dir' '
+	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
+	echo "$delete_file_in_dir  1" >expect &&
+	grep "$delete_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace file with dir' '
+	git diff --name-status $file_to_dir^ $file_to_dir &&
+	echo "$file_to_dir  2" >expect &&
+	grep "$file_to_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace dir with file' '
+	git diff --name-status $dir_to_file^ $dir_to_file &&
+	echo "$dir_to_file  2" >expect &&
+	grep "$dir_to_file" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'leading zeros in mode' '
+	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
+	echo "$different_but_same_tree  0" >expect &&
+	grep "$different_but_same_tree" out >actual &&
+	test_cmp expect actual
+'
+
+test_done
diff --git a/tree-diff.c b/tree-diff.c
index 6ebad1a46f3..7cebbb327e2 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,7 +434,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
-		if (opt->max_changes && opt->num_changes > opt->max_changes)
+		if (opt->max_changes && diff_queued_diff.nr > opt->max_changes)
 			break;
 
 		if (opt->pathspec.nr) {
@@ -521,7 +521,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
-			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -539,7 +538,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
-			opt->num_changes++;
 		}
 	}
 
@@ -557,7 +555,6 @@ struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
-	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*
SZEDER Gábor Aug. 4, 2020, 5 p.m. UTC | #3
On Tue, Aug 04, 2020 at 12:25:45PM -0400, Derrick Stolee wrote:
> On 8/4/2020 10:47 AM, SZEDER Gábor wrote:
> > On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> > This counter is basically broken, its value is wrong for over 98% of
> > commits, and, worse, its value remains 0 for over 85% of commits in
> > the repositories I usually use to test modified path Bloom filters.
> > Consequently, a relatively large number of commits modifying more than
> > 512 paths get Bloom filters.
> 
> Thanks for finding this! The counter is only really tested in one
> place, and that test only considers _file adds_, which is a problem.
> 
> If I understand this correctly, the bug is a performance-only bug
> (since this is a performance-only feature), but it is an important
> one to fix.

Or a performance-only feature in a performance-only feature, because
those additional modified path Bloom filters can improve the runtime
of pathspec-limited revision walks (assuming that the false positive
rate is low enough).

> There is certainly some dark magic happening in this tree-diff logic,
> so instead of trying to get an accurate count we should just use the
> magic global diff_queued_diff to track the current list of file changes.
> 
> Note: diff_queued_diff does not track the directory changes, so it
> is an under-count for the total changes to track in the Bloom filter.
> This is later corrected by the block that adds these leading directory
> changes.
> 
> > The makeshift tests in the patch below demonstrate these issues as
> > most of them fail, most notably those two tests that demonstrate that
> > modifying existing paths are not counted at all.
> 
> I adapted your diff along with ripping out 'num_changes' in favor
> of diff_queued_diff.nr. This required modifying some of your expected
> values in the test script (losing the leading directories in the
> count).
> 
> I'll work with Taylor to create a fix, and include proper testing
> of the logic here. We'll stick it in the v2 of his max-changed-paths
> series [1]. He already has some helpful logging that can help create
> tests that ensure this logic is performing as expected.

Don't forget to include a check of the hashmap's size, to make sure.

FWIW, the patch below does result in the correct count (read: the same
as in my implemenation) for all but 4 commits in those repositories I
use for testing, without adding any memory allocations and extra
strcmp() calls.

  ---  >8  ---

diff --git a/cache.h b/cache.h
index 0f0485ecfe..3fc7e1b427 100644
--- a/cache.h
+++ b/cache.h
@@ -1574,6 +1574,7 @@ int repo_interpret_branch_name(struct repository *r,
 int validate_headref(const char *ref);
 
 int base_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2);
+int base_name_compare_df(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2, int *df);
 int df_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2);
 int name_compare(const char *name1, size_t len1, const char *name2, size_t len2);
 int cache_name_stage_compare(const char *name1, int len1, int stage1, const char *name2, int len2, int stage2);
diff --git a/read-cache.c b/read-cache.c
index aa427c5c17..041af19e60 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -460,13 +460,16 @@ int ie_modified(struct index_state *istate,
 	return 0;
 }
 
-int base_name_compare(const char *name1, int len1, int mode1,
-		      const char *name2, int len2, int mode2)
+int base_name_compare_df(const char *name1, int len1, int mode1,
+			 const char *name2, int len2, int mode2,
+			 int *df)
 {
 	unsigned char c1, c2;
 	int len = len1 < len2 ? len1 : len2;
 	int cmp;
 
+	*df = 0;
+
 	cmp = memcmp(name1, name2, len);
 	if (cmp)
 		return cmp;
@@ -476,7 +479,21 @@ int base_name_compare(const char *name1, int len1, int mode1,
 		c1 = '/';
 	if (!c2 && S_ISDIR(mode2))
 		c2 = '/';
-	return (c1 < c2) ? -1 : (c1 > c2) ? 1 : 0;
+	if (c1 == c2)
+		return 0;	/* TODO: is this even possible? */
+	if ((c1 == '/' && !c2) ||
+	    (!c1 && c2 == '/'))
+		*df = 1;
+	return (c1 < c2) ? -1 : 1;
+}
+
+int base_name_compare(const char *name1, int len1, int mode1,
+		      const char *name2, int len2, int mode2)
+{
+	int unused;
+	return base_name_compare_df(name1, len1, mode1,
+				    name2, len2, mode2,
+				    &unused);
 }
 
 /*
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
index 8d2bd9f03f..4f08590b45 100755
--- a/t/t9999-test.sh
+++ b/t/t9999-test.sh
@@ -125,7 +125,7 @@ test_expect_success 'replace file with dir' '
 	test_cmp expect actual
 '
 
-test_expect_success 'replace dir with file' '
+test_expect_failure 'replace dir with file' '
 	git diff --name-status $dir_to_file^ $dir_to_file &&
 	echo "$dir_to_file  2" >expect &&
 	grep "$dir_to_file" out >actual &&
diff --git a/tree-diff.c b/tree-diff.c
index f3d303c6e5..e27f9c805e 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -46,11 +46,14 @@ static int ll_diff_tree_oid(const struct object_id *old_oid,
  *      Due to this convention, if trees are scanned in sorted order, all
  *      non-empty descriptors will be processed first.
  */
-static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2)
+static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2,
+			      int *df)
 {
 	struct name_entry *e1, *e2;
 	int cmp;
 
+	*df = 0;
+
 	/* empty descriptors sort after valid tree entries */
 	if (!t1->size)
 		return t2->size ? 1 : 0;
@@ -59,8 +62,9 @@ static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2)
 
 	e1 = &t1->entry;
 	e2 = &t2->entry;
-	cmp = base_name_compare(e1->path, tree_entry_len(e1), e1->mode,
-				e2->path, tree_entry_len(e2), e2->mode);
+	cmp = base_name_compare_df(e1->path, tree_entry_len(e1), e1->mode,
+				   e2->path, tree_entry_len(e2), e2->mode,
+				   df);
 	return cmp;
 }
 
@@ -410,7 +414,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 {
 	struct tree_desc t, *tp;
 	void *ttree, **tptree;
-	int i;
+	int i, df;
 
 	FAST_ARRAY_ALLOC(tp, nparent);
 	FAST_ARRAY_ALLOC(tptree, nparent);
@@ -463,7 +467,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		tp[0].entry.mode &= ~S_IFXMIN_NEQ;
 
 		for (i = 1; i < nparent; ++i) {
-			cmp = tree_entry_pathcmp(&tp[i], &tp[imin]);
+			cmp = tree_entry_pathcmp(&tp[i], &tp[imin], &df);
 			if (cmp < 0) {
 				imin = i;
 				tp[i].entry.mode &= ~S_IFXMIN_NEQ;
@@ -483,10 +487,12 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 
 		/* compare t vs p[imin] */
-		cmp = tree_entry_pathcmp(&t, &tp[imin]);
+		cmp = tree_entry_pathcmp(&t, &tp[imin], &df);
 
 		/* t = p[imin] */
 		if (cmp == 0) {
+			int prev_num_changes = opt->num_changes;
+
 			/* are either pi > p[imin] or diff(t,pi) != ø ? */
 			if (!opt->flags.find_copies_harder) {
 				for (i = 0; i < nparent; ++i) {
@@ -506,6 +512,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
 			/* D += {δ(t,pi) if pi=p[imin];  "+a" if pi > p[imin]} */
 			p = emit_path(p, base, opt, nparent,
 					&t, tp, imin);
+			if (!(opt->num_changes == prev_num_changes &&
+			      S_ISDIR(t.entry.mode)))
+				opt->num_changes++;
 
 		skip_emit_t_tp:
 			/* t↓,  ∀ pi=p[imin]  pi↓ */
@@ -518,10 +527,11 @@ static struct combine_diff_path *ll_diff_tree_paths(
 			/* D += "+t" */
 			p = emit_path(p, base, opt, nparent,
 					&t, /*tp=*/NULL, -1);
+			if (!df)
+				opt->num_changes++;
 
 			/* t↓ */
 			update_tree_entry(&t);
-			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,11 +545,12 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 			p = emit_path(p, base, opt, nparent,
 					/*t=*/NULL, tp, imin);
+			if (!df)
+				opt->num_changes++;
 
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
-			opt->num_changes++;
 		}
 	}
 
  ---  >8  ---


Having said that, the best (i.e faster and accurate) solution to this
issue is probably:

  - Update the callchain between diff_tree_oid() and the diff callback
    functions to allow the callbacks to break diffing with a non-zero
    error code.

  - Fill Bloom filters using the approach presented in:

      https://public-inbox.org/git/20200529085038.26008-21-szeder.dev@gmail.com/

    but modify the callbacks to return non-zero when too many paths
    have been processed.

  - Drop this counter entirely, as there are no other users.

> We plan to have that fix available by later today or early tomorrow.
> Will you be available to help validate it?
> 
> [1] https://lore.kernel.org/git/cover.1596480582.git.me@ttaylorr.com/
> 
> Thanks,
> -Stolee
> 
>   --- >8 ---
> 
> diff --git a/bloom.c b/bloom.c
> index 1a573226e7..b8d6cb9240 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -218,8 +218,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  	else
>  		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
>  	diffcore_std(&diffopt);
> +	printf("%s  %d\n", oid_to_hex(&c->object.oid), diff_queued_diff.nr);
>  
> -	if (diffopt.num_changes <= max_changes) {
> +	if (diff_queued_diff.nr <= max_changes) {
>  		struct hashmap pathmap;
>  		struct pathmap_hash_entry *e;
>  		struct hashmap_iter iter;
> diff --git a/diff.h b/diff.h
> index e0c0af6286b..1d32b718857 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -287,8 +287,6 @@ struct diff_options {
>  
>  	/* If non-zero, then stop computing after this many changes. */
>  	int max_changes;
> -	/* For internal use only. */
> -	int num_changes;
>  
>  	int ita_invisible_in_index;
>  /* white-space error highlighting */
> diff --git a/t/t9999-test.sh b/t/t9999-test.sh
> new file mode 100755
> index 00000000000..1f35aa8e2c5
> --- /dev/null
> +++ b/t/t9999-test.sh
> @@ -0,0 +1,142 @@
> +#!/bin/sh
> +
> +test_description='test'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'setup' '
> +	test_tick &&
> +
> +	echo 1 >file &&
> +	mkdir -p dir/subdir &&
> +	echo 1 >dir/subdir/file1 &&
> +	echo 1 >dir/subdir/file2 &&
> +	git add file dir &&
> +	git commit -m setup &&
> +
> +	echo 2 >file &&
> +	git commit -a -m "modify one path in root" &&
> +	mod_one_path=$(git rev-parse HEAD) &&
> +
> +	echo 2 >dir/subdir/file1 &&
> +	echo 2 >dir/subdir/file2 &&
> +	git commit -a -m "modify two file two dirs deep" &&
> +	mod_four_paths=$(git rev-parse HEAD) &&
> +
> +	>new-file &&
> +	git add new-file &&
> +	git commit -m "add new file in root" &&
> +	new_file_in_root=$(git rev-parse HEAD) &&
> +
> +	git rm new-file &&
> +	git commit -m "delete file in root" &&
> +	delete_file_in_root=$(git rev-parse HEAD) &&
> +
> +	>dir/new-file &&
> +	git add dir/new-file &&
> +	git commit -m "add new file in dir" &&
> +	new_file_in_dir=$(git rev-parse HEAD) &&
> +
> +	git rm dir/new-file &&
> +	git commit -m "delete file in dir" &&
> +	delete_file_in_dir=$(git rev-parse HEAD) &&
> +
> +	echo 1 >d-f &&
> +	git add d-f &&
> +	git commit -m foo &&
> +	git rm d-f &&
> +	mkdir d-f &&
> +	echo 2 >d-f/file &&
> +	git add d-f &&
> +	git commit -m "replace file with dir" &&
> +	file_to_dir=$(git rev-parse HEAD) &&
> +
> +	>d-f.c &&
> +	git add d-f.c &&
> +	git commit -m "add a file that sorts between d-f and d-f/" &&
> +	git rm -r d-f &&
> +	echo 3 >d-f &&
> +	git add d-f &&
> +	git commit -m "replace dir with file" &&
> +	dir_to_file=$(git rev-parse HEAD) &&
> +
> +	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
> +	# leading zero in mode: the content of the tree remains the same,
> +	# but its oid does change!
> +	printf "040000 subdir\0$bin_sha1" >rawtree &&
> +	tree1=$(git hash-object -t tree -w rawtree) &&
> +	git cat-file -p HEAD^{tree} >out &&
> +	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
> +	different_but_same_tree=$(git commit-tree \
> +		-m "leading zeros in mode" \
> +		-p $(git rev-parse HEAD) $tree2) &&
> +	git update-ref HEAD $different_but_same_tree &&
> +
> +	git commit-graph write --reachable --changed-paths >out &&
> +	cat out  # debug
> +'
> +
> +test_expect_success 'modify one path in root' '
> +	git diff --name-status $mod_one_path^ $mod_one_path &&
> +	echo "$mod_one_path  1" >expect &&
> +	grep "$mod_one_path" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'modify two file two dirs deep' '
> +	git diff --name-status $mod_four_paths^ $mod_four_paths &&
> +	echo "$mod_four_paths  2" >expect &&
> +	grep "$mod_four_paths" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'add new file in root' '
> +	git diff --name-status $new_file_in_root^ $new_file_in_root &&
> +	echo "$new_file_in_root  1" >expect &&
> +	grep "$new_file_in_root" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'delete file in root' '
> +	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
> +	echo "$delete_file_in_root  1" >expect &&
> +	grep "$delete_file_in_root" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'add new file in dir' '
> +	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
> +	echo "$new_file_in_dir  1" >expect &&
> +	grep "$new_file_in_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'delete file in dir' '
> +	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
> +	echo "$delete_file_in_dir  1" >expect &&
> +	grep "$delete_file_in_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'replace file with dir' '
> +	git diff --name-status $file_to_dir^ $file_to_dir &&
> +	echo "$file_to_dir  2" >expect &&
> +	grep "$file_to_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'replace dir with file' '
> +	git diff --name-status $dir_to_file^ $dir_to_file &&
> +	echo "$dir_to_file  2" >expect &&
> +	grep "$dir_to_file" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'leading zeros in mode' '
> +	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
> +	echo "$different_but_same_tree  0" >expect &&
> +	grep "$different_but_same_tree" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_done
> diff --git a/tree-diff.c b/tree-diff.c
> index 6ebad1a46f3..7cebbb327e2 100644
> --- a/tree-diff.c
> +++ b/tree-diff.c
> @@ -434,7 +434,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		if (diff_can_quit_early(opt))
>  			break;
>  
> -		if (opt->max_changes && opt->num_changes > opt->max_changes)
> +		if (opt->max_changes && diff_queued_diff.nr > opt->max_changes)
>  			break;
>  
>  		if (opt->pathspec.nr) {
> @@ -521,7 +521,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  
>  			/* t↓ */
>  			update_tree_entry(&t);
> -			opt->num_changes++;
>  		}
>  
>  		/* t > p[imin] */
> @@ -539,7 +538,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		skip_emit_tp:
>  			/* ∀ pi=p[imin]  pi↓ */
>  			update_tp_entries(tp, nparent);
> -			opt->num_changes++;
>  		}
>  	}
>  
> @@ -557,7 +555,6 @@ struct combine_diff_path *diff_tree_paths(
>  	const struct object_id **parents_oid, int nparent,
>  	struct strbuf *base, struct diff_options *opt)
>  {
> -	opt->num_changes = 0;
>  	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
>  
>  	/*
Derrick Stolee Aug. 4, 2020, 5:31 p.m. UTC | #4
On 8/4/2020 1:00 PM, SZEDER Gábor wrote:
> On Tue, Aug 04, 2020 at 12:25:45PM -0400, Derrick Stolee wrote:
>> On 8/4/2020 10:47 AM, SZEDER Gábor wrote:
>>> On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
>>> This counter is basically broken, its value is wrong for over 98% of
>>> commits, and, worse, its value remains 0 for over 85% of commits in
>>> the repositories I usually use to test modified path Bloom filters.
>>> Consequently, a relatively large number of commits modifying more than
>>> 512 paths get Bloom filters.
>>
>> Thanks for finding this! The counter is only really tested in one
>> place, and that test only considers _file adds_, which is a problem.
>>
>> If I understand this correctly, the bug is a performance-only bug
>> (since this is a performance-only feature), but it is an important
>> one to fix.
> 
> Or a performance-only feature in a performance-only feature, because
> those additional modified path Bloom filters can improve the runtime
> of pathspec-limited revision walks (assuming that the false positive
> rate is low enough).
> 
>> There is certainly some dark magic happening in this tree-diff logic,
>> so instead of trying to get an accurate count we should just use the
>> magic global diff_queued_diff to track the current list of file changes.
>>
>> Note: diff_queued_diff does not track the directory changes, so it
>> is an under-count for the total changes to track in the Bloom filter.
>> This is later corrected by the block that adds these leading directory
>> changes.
>>
>>> The makeshift tests in the patch below demonstrate these issues as
>>> most of them fail, most notably those two tests that demonstrate that
>>> modifying existing paths are not counted at all.
>>
>> I adapted your diff along with ripping out 'num_changes' in favor
>> of diff_queued_diff.nr. This required modifying some of your expected
>> values in the test script (losing the leading directories in the
>> count).
>>
>> I'll work with Taylor to create a fix, and include proper testing
>> of the logic here. We'll stick it in the v2 of his max-changed-paths
>> series [1]. He already has some helpful logging that can help create
>> tests that ensure this logic is performing as expected.
> 
> Don't forget to include a check of the hashmap's size, to make sure.

Yes, thanks for the pointer. That check is currently not in there,
since the code assumes the hashmap's size will match num_changes.
Hopefully, the tests I intend to write around this would have caught
such an omission.
 
> FWIW, the patch below does result in the correct count (read: the same
> as in my implemenation) for all but 4 commits in those repositories I
> use for testing, without adding any memory allocations and extra
> strcmp() calls.

...

> Having said that, the best (i.e faster and accurate) solution to this
> issue is probably:
> 
>   - Update the callchain between diff_tree_oid() and the diff callback
>     functions to allow the callbacks to break diffing with a non-zero
>     error code.

It looks like this part would not be too difficult. The pathchange
callback is called by emit_path() which returns a struct combine_diff_path
pointer. This could return NULL to signal an early termination, but
we need to update all callers of the following methods to handle NULL
responses:

 * emit_path()
 * ll_diff_tree_paths()
 * diff_tree_paths()

Of some interest: diff_tree_paths() returns a struct combine_diff_path
pointer, but no callers seem to consume it.

>   - Fill Bloom filters using the approach presented in:
> 
>       https://public-inbox.org/git/20200529085038.26008-21-szeder.dev@gmail.com/
> 
>     but modify the callbacks to return non-zero when too many paths
>     have been processed.

Thanks for the pointer to that specific patch. You do a good job of
describing your thought process, including why you used the callback
approach instead of the diff queue approach. The main reason seemed to
be memory overhead from populating the entire diff queue before
checking the limit.

However, if we are using the diff queue as the short-circuit, then
perhaps that memory overhead isn't as much of a problem?

You admit yourself, that

  This patch implements a more efficient, but more complex, approach:

The logic around matching prefixes definitely seems complex and
hard to test, especially around the file/directory changes with the
sort order problems that have plagued similar prefix checks recently.
I'm not doubting your implementation, just saying that the complexity
is worth considering before jumping to that solution too quickly.

To sum up, I intend to start with a fix that uses the diff queue
count as a limit, then try the callback approach to see if there are
measurable improvements in performance.

>   - Drop this counter entirely, as there are no other users.

With the callback approach, "this counter" is both num_changes and
max_changes, since the callback would perform all of the short-circuit
logic.

Thanks,
-Stolee
Derrick Stolee Aug. 5, 2020, 5:08 p.m. UTC | #5
On 8/4/2020 1:31 PM, Derrick Stolee wrote:
> On 8/4/2020 1:00 PM, SZEDER Gábor wrote:

>> Having said that, the best (i.e faster and accurate) solution to this
>> issue is probably:
>>
>>   - Update the callchain between diff_tree_oid() and the diff callback
>>     functions to allow the callbacks to break diffing with a non-zero
>>     error code.
> 
> It looks like this part would not be too difficult. 

Oh, my hubris! I gave this a shot for some time this morning. This
will definitely take some work to do right. Just changing the callbacks
to return 'int' is a wide-sweeping change, but the place where they are
called already has an 'int' return that means something different.

I'm not saying this is impossible. It just takes more attention and care
than I can currently devote, given my other works in progress right now.

>>   - Fill Bloom filters using the approach presented in:
>>
>>       https://public-inbox.org/git/20200529085038.26008-21-szeder.dev@gmail.com/
>>
>>     but modify the callbacks to return non-zero when too many paths
>>     have been processed.
> 
> Thanks for the pointer to that specific patch. You do a good job of
> describing your thought process, including why you used the callback
> approach instead of the diff queue approach. The main reason seemed to
> be memory overhead from populating the entire diff queue before
> checking the limit.
> 
> However, if we are using the diff queue as the short-circuit, then
> perhaps that memory overhead isn't as much of a problem?
> 
> You admit yourself, that
> 
>   This patch implements a more efficient, but more complex, approach:
> 
> The logic around matching prefixes definitely seems complex and
> hard to test, especially around the file/directory changes with the
> sort order problems that have plagued similar prefix checks recently.
> I'm not doubting your implementation, just saying that the complexity
> is worth considering before jumping to that solution too quickly.
> 
> To sum up, I intend to start with a fix that uses the diff queue
> count as a limit, then try the callback approach to see if there are
> measurable improvements in performance.

That fix is now available [1].

[1] https://lore.kernel.org/git/d1c4bbcaa9627068d5d9fbd0e4a2e8c8834a4bd3.1596646576.git.me@ttaylorr.com/

Again, the callback approach seems promising. The complexity is
stopping me from trying to apply it on top of the current
implementation, while I should be focusing on other things. I completely
believe that that approach is faster and more memory-efficient. I would
love to test and review a patch that takes that approach here.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/bloom.c b/bloom.c
index 881a9841ede..a16eee92331 100644
--- a/bloom.c
+++ b/bloom.c
@@ -133,6 +133,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 	int i;
 	struct diff_options diffopt;
+	int max_changes = 512;
 
 	if (bloom_filters.slab_size == 0)
 		return NULL;
@@ -141,6 +142,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
+	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
 	if (c->parents)
@@ -149,7 +151,7 @@  struct bloom_filter *get_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diff_queued_diff.nr <= 512) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index 6febe7e3656..9443dc1b003 100644
--- a/diff.h
+++ b/diff.h
@@ -285,6 +285,11 @@  struct diff_options {
 	/* Number of hexdigits to abbreviate raw format output to. */
 	int abbrev;
 
+	/* If non-zero, then stop computing after this many changes. */
+	int max_changes;
+	/* For internal use only. */
+	int num_changes;
+
 	int ita_invisible_in_index;
 /* white-space error highlighting */
 #define WSEH_NEW (1<<12)
diff --git a/tree-diff.c b/tree-diff.c
index 33ded7f8b3e..f3d303c6e54 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,6 +434,9 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
+		if (opt->max_changes && opt->num_changes > opt->max_changes)
+			break;
+
 		if (opt->pathspec.nr) {
 			skip_uninteresting(&t, base, opt);
 			for (i = 0; i < nparent; i++)
@@ -518,6 +521,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
+			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,6 +539,7 @@  static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
+			opt->num_changes++;
 		}
 	}
 
@@ -552,6 +557,7 @@  struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
+	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*