diff mbox series

[v4,08/15] diff --color-moved-ws=allow-indentation-change: simplify and optimize

Message ID d7bbc0041e076077d3c3299858799cc9907d9831.1637056179.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series diff --color-moved[-ws] speedups | expand

Commit Message

Phillip Wood Nov. 16, 2021, 9:49 a.m. UTC
From: Phillip Wood <phillip.wood@dunelm.org.uk>

If we already have a block of potentially moved lines then as we move
down the diff we need to check if the next line of each potentially
moved line matches the current line of the diff. The implementation of
--color-moved-ws=allow-indentation-change was needlessly performing
this check on all the lines in the diff that matched the current line
rather than just the current line. To exacerbate the problem finding
all the other lines in the diff that match the current line involves a
fuzzy lookup so we were wasting even more time performing a second
comparison to filter out the non-matching lines. Fixing this reduces
time to run
  git diff --color-moved-ws=allow-indentation-change v2.28.0 v2.29.0
by 93% compared to master and simplifies the code.

Test                                                                  HEAD^              HEAD
---------------------------------------------------------------------------------------------------------------
4002.1: diff --no-color-moved --no-color-moved-ws large change        0.38 (0.35+0.03)   0.38(0.35+0.03)  +0.0%
4002.2: diff --color-moved --no-color-moved-ws large change           0.86 (0.80+0.06)   0.87(0.83+0.04)  +1.2%
4002.3: diff --color-moved-ws=allow-indentation-change large change  19.01(18.93+0.06)   0.97(0.92+0.04) -94.9%
4002.4: log --no-color-moved --no-color-moved-ws                      1.16 (1.06+0.09)   1.17(1.06+0.10)  +0.9%
4002.5: log --color-moved --no-color-moved-ws                         1.32 (1.25+0.07)   1.32(1.24+0.08)  +0.0%
4002.6: log --color-moved-ws=allow-indentation-change                 1.71 (1.64+0.06)   1.36(1.25+0.10) -20.5%

Test                                                                  master             HEAD
---------------------------------------------------------------------------------------------------------------
4002.1: diff --no-color-moved --no-color-moved-ws large change        0.38 (0.33+0.05)   0.38(0.35+0.03)  +0.0%
4002.2: diff --color-moved --no-color-moved-ws large change           0.80 (0.75+0.04)   0.87(0.83+0.04)  +8.7%
4002.3: diff --color-moved-ws=allow-indentation-change large change  14.20(14.15+0.05)   0.97(0.92+0.04) -93.2%
4002.4: log --no-color-moved --no-color-moved-ws                      1.15 (1.05+0.09)   1.17(1.06+0.10)  +1.7%
4002.5: log --color-moved --no-color-moved-ws                         1.30 (1.19+0.11)   1.32(1.24+0.08)  +1.5%
4002.6: log --color-moved-ws=allow-indentation-change                 1.70 (1.63+0.06)   1.36(1.25+0.10) -20.0%

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Phillip Wood <phillip.wood@dunelm.org.uk>
---
 diff.c | 70 +++++++++++++++++-----------------------------------------
 1 file changed, 20 insertions(+), 50 deletions(-)

Comments

Johannes Schindelin Nov. 23, 2021, 2:51 p.m. UTC | #1
Hi Phillip,

tl;dr: the patch looks good to me (it is a bit tricky to review, though,
but that is not your fault, it is our code review process' fault).

On Tue, 16 Nov 2021, Phillip Wood via GitGitGadget wrote:

>   git diff --color-moved-ws=allow-indentation-change v2.28.0 v2.29.0
> by 93% compared to master and simplifies the code.

Nice!

> diff --git a/diff.c b/diff.c
> index 9aff167be27..78a486021ab 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -879,37 +879,21 @@ static int compute_ws_delta(const struct emitted_diff_symbol *a,
>  	return 1;
>  }
>
> -static int cmp_in_block_with_wsd(const struct diff_options *o,
> -				 const struct moved_entry *cur,
> -				 const struct moved_entry *match,
> -				 struct moved_block *pmb,
> -				 int n)
> -{
> -	struct emitted_diff_symbol *l = &o->emitted_symbols->buf[n];
> -	int al = cur->es->len, bl = match->es->len, cl = l->len;
> +static int cmp_in_block_with_wsd(const struct moved_entry *cur,
> +				 const struct emitted_diff_symbol *l,
> +				 struct moved_block *pmb)
> +{
> +	int al = cur->es->len, bl = l->len;

Once I realized that the old `b` was removed and the old `c` became the
new `b`, it was a breeze to validate this hunk.

>  	const char *a = cur->es->line,
> -		   *b = match->es->line,
> -		   *c = l->line;
> +		   *b = l->line;
>  	int a_off = cur->es->indent_off,
>  	    a_width = cur->es->indent_width,
> -	    c_off = l->indent_off,
> -	    c_width = l->indent_width;
> +	    b_off = l->indent_off,
> +	    b_width = l->indent_width;
>  	int delta;
>
> -	/*
> -	 * We need to check if 'cur' is equal to 'match'.  As those
> -	 * are from the same (+/-) side, we do not need to adjust for
> -	 * indent changes. However these were found using fuzzy
> -	 * matching so we do have to check if they are equal. Here we
> -	 * just check the lengths. We delay calling memcmp() to check
> -	 * the contents until later as if the length comparison for a
> -	 * and c fails we can avoid the call all together.
> -	 */
> -	if (al != bl)
> -		return 1;

The commit message really helped understanding why this is not needed.
Thank you!

> -
>  	/* If 'l' and 'cur' are both blank then they match. */
> -	if (a_width == INDENT_BLANKLINE && c_width == INDENT_BLANKLINE)
> +	if (a_width == INDENT_BLANKLINE && b_width == INDENT_BLANKLINE)
>  		return 0;
>
>  	/*
> @@ -918,7 +902,7 @@ static int cmp_in_block_with_wsd(const struct diff_options *o,
>  	 * match those of the current block and that the text of 'l' and 'cur'
>  	 * after the indentation match.
>  	 */
> -	delta = c_width - a_width;
> +	delta = b_width - a_width;
>
>  	/*
>  	 * If the previous lines of this block were all blank then set its
> @@ -927,9 +911,8 @@ static int cmp_in_block_with_wsd(const struct diff_options *o,
>  	if (pmb->wsd == INDENT_BLANKLINE)
>  		pmb->wsd = delta;
>
> -	return !(delta == pmb->wsd && al - a_off == cl - c_off &&
> -		 !memcmp(a, b, al) && !
> -		 memcmp(a + a_off, c + c_off, al - a_off));
> +	return !(delta == pmb->wsd && al - a_off == bl - b_off &&
> +		 !memcmp(a + a_off, b + b_off, al - a_off));
>  }

Once again, I am sad that we have no better platform to do our code
contribution and review. Whatever you can say about GitHub's UI, it is
better than static diffs in mails.

But you used GitGitGadget, and I finally broke down and wrote a script
that allows me to magic my way from the mail into the correct commit in
the GitGitGadget PR in the browser. It is still shell script (at some
stage, I will need to extend the script to be much smarter than any shell
script can be, and probably convert it to node.js, but not today).

This helped me verify that there are no left-over references to the old
`b`. So all is good!

>
>  static int moved_entry_cmp(const void *hashmap_cmp_fn_data,
> @@ -1030,36 +1013,23 @@ static void pmb_advance_or_null(struct diff_options *o,
>  }
>
>  static void pmb_advance_or_null_multi_match(struct diff_options *o,
> -					    struct moved_entry *match,
> -					    struct hashmap *hm,
> +					    struct emitted_diff_symbol *l,
>  					    struct moved_block *pmb,
> -					    int pmb_nr, int n)
> +					    int pmb_nr)
>  {
>  	int i;
> -	char *got_match = xcalloc(1, pmb_nr);
> -
> -	hashmap_for_each_entry_from(hm, match, ent) {
> -		for (i = 0; i < pmb_nr; i++) {
> -			struct moved_entry *prev = pmb[i].match;
> -			struct moved_entry *cur = (prev && prev->next_line) ?
> -					prev->next_line : NULL;
> -			if (!cur)
> -				continue;
> -			if (!cmp_in_block_with_wsd(o, cur, match, &pmb[i], n))
> -				got_match[i] |= 1;
> -		}
> -	}
>
>  	for (i = 0; i < pmb_nr; i++) {
> -		if (got_match[i]) {
> +		struct moved_entry *prev = pmb[i].match;
> +		struct moved_entry *cur = (prev && prev->next_line) ?
> +			prev->next_line : NULL;
> +		if (cur && !cmp_in_block_with_wsd(cur, l, &pmb[i])) {
>  			/* Advance to the next line */
> -			pmb[i].match = pmb[i].match->next_line;
> +			pmb[i].match = cur;
>  		} else {
>  			moved_block_clear(&pmb[i]);
>  		}
>  	}
> -
> -	free(got_match);

Even got rid of an allocation. Very nice.

>  }
>
>  static int shrink_potential_moved_blocks(struct moved_block *pmb,
> @@ -1223,7 +1193,7 @@ static void mark_color_as_moved(struct diff_options *o,
>
>  		if (o->color_moved_ws_handling &
>  		    COLOR_MOVED_WS_ALLOW_INDENTATION_CHANGE)
> -			pmb_advance_or_null_multi_match(o, match, hm, pmb, pmb_nr, n);
> +			pmb_advance_or_null_multi_match(o, l, pmb, pmb_nr);

Again, magic button to the rescue! And I can verify that `l` is assigned
to `&o->emitted_symbols->buf[n]`, so: the patch does the correct thing.

Thank you,
Dscho

>  		else
>  			pmb_advance_or_null(o, match, hm, pmb, pmb_nr);
>
> --
> gitgitgadget
>
>
diff mbox series

Patch

diff --git a/diff.c b/diff.c
index 9aff167be27..78a486021ab 100644
--- a/diff.c
+++ b/diff.c
@@ -879,37 +879,21 @@  static int compute_ws_delta(const struct emitted_diff_symbol *a,
 	return 1;
 }
 
-static int cmp_in_block_with_wsd(const struct diff_options *o,
-				 const struct moved_entry *cur,
-				 const struct moved_entry *match,
-				 struct moved_block *pmb,
-				 int n)
-{
-	struct emitted_diff_symbol *l = &o->emitted_symbols->buf[n];
-	int al = cur->es->len, bl = match->es->len, cl = l->len;
+static int cmp_in_block_with_wsd(const struct moved_entry *cur,
+				 const struct emitted_diff_symbol *l,
+				 struct moved_block *pmb)
+{
+	int al = cur->es->len, bl = l->len;
 	const char *a = cur->es->line,
-		   *b = match->es->line,
-		   *c = l->line;
+		   *b = l->line;
 	int a_off = cur->es->indent_off,
 	    a_width = cur->es->indent_width,
-	    c_off = l->indent_off,
-	    c_width = l->indent_width;
+	    b_off = l->indent_off,
+	    b_width = l->indent_width;
 	int delta;
 
-	/*
-	 * We need to check if 'cur' is equal to 'match'.  As those
-	 * are from the same (+/-) side, we do not need to adjust for
-	 * indent changes. However these were found using fuzzy
-	 * matching so we do have to check if they are equal. Here we
-	 * just check the lengths. We delay calling memcmp() to check
-	 * the contents until later as if the length comparison for a
-	 * and c fails we can avoid the call all together.
-	 */
-	if (al != bl)
-		return 1;
-
 	/* If 'l' and 'cur' are both blank then they match. */
-	if (a_width == INDENT_BLANKLINE && c_width == INDENT_BLANKLINE)
+	if (a_width == INDENT_BLANKLINE && b_width == INDENT_BLANKLINE)
 		return 0;
 
 	/*
@@ -918,7 +902,7 @@  static int cmp_in_block_with_wsd(const struct diff_options *o,
 	 * match those of the current block and that the text of 'l' and 'cur'
 	 * after the indentation match.
 	 */
-	delta = c_width - a_width;
+	delta = b_width - a_width;
 
 	/*
 	 * If the previous lines of this block were all blank then set its
@@ -927,9 +911,8 @@  static int cmp_in_block_with_wsd(const struct diff_options *o,
 	if (pmb->wsd == INDENT_BLANKLINE)
 		pmb->wsd = delta;
 
-	return !(delta == pmb->wsd && al - a_off == cl - c_off &&
-		 !memcmp(a, b, al) && !
-		 memcmp(a + a_off, c + c_off, al - a_off));
+	return !(delta == pmb->wsd && al - a_off == bl - b_off &&
+		 !memcmp(a + a_off, b + b_off, al - a_off));
 }
 
 static int moved_entry_cmp(const void *hashmap_cmp_fn_data,
@@ -1030,36 +1013,23 @@  static void pmb_advance_or_null(struct diff_options *o,
 }
 
 static void pmb_advance_or_null_multi_match(struct diff_options *o,
-					    struct moved_entry *match,
-					    struct hashmap *hm,
+					    struct emitted_diff_symbol *l,
 					    struct moved_block *pmb,
-					    int pmb_nr, int n)
+					    int pmb_nr)
 {
 	int i;
-	char *got_match = xcalloc(1, pmb_nr);
-
-	hashmap_for_each_entry_from(hm, match, ent) {
-		for (i = 0; i < pmb_nr; i++) {
-			struct moved_entry *prev = pmb[i].match;
-			struct moved_entry *cur = (prev && prev->next_line) ?
-					prev->next_line : NULL;
-			if (!cur)
-				continue;
-			if (!cmp_in_block_with_wsd(o, cur, match, &pmb[i], n))
-				got_match[i] |= 1;
-		}
-	}
 
 	for (i = 0; i < pmb_nr; i++) {
-		if (got_match[i]) {
+		struct moved_entry *prev = pmb[i].match;
+		struct moved_entry *cur = (prev && prev->next_line) ?
+			prev->next_line : NULL;
+		if (cur && !cmp_in_block_with_wsd(cur, l, &pmb[i])) {
 			/* Advance to the next line */
-			pmb[i].match = pmb[i].match->next_line;
+			pmb[i].match = cur;
 		} else {
 			moved_block_clear(&pmb[i]);
 		}
 	}
-
-	free(got_match);
 }
 
 static int shrink_potential_moved_blocks(struct moved_block *pmb,
@@ -1223,7 +1193,7 @@  static void mark_color_as_moved(struct diff_options *o,
 
 		if (o->color_moved_ws_handling &
 		    COLOR_MOVED_WS_ALLOW_INDENTATION_CHANGE)
-			pmb_advance_or_null_multi_match(o, match, hm, pmb, pmb_nr, n);
+			pmb_advance_or_null_multi_match(o, l, pmb, pmb_nr);
 		else
 			pmb_advance_or_null(o, match, hm, pmb, pmb_nr);