diff mbox series

[10/10] diffcore-rename: use directory rename guided basename comparisons

Message ID 1df498b3a2f00cdcbe93acda1dc637e3fca0ee0b.1613289544.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Optimization batch 8: use file basenames even more | expand

Commit Message

Elijah Newren Feb. 14, 2021, 7:59 a.m. UTC
From: Elijah Newren <newren@gmail.com>

Hook the work from the last several patches together so that when
basenames in the sets of possible remaining rename sources or
destinations aren't unique, we can guess which directory source files
were renamed into.  When that guess gives us a pairing of files, and
those files are sufficiently similar, we record the two files as a
rename and remove them from the large matrix of comparisons for inexact
rename detection.

For the testcases mentioned in commit 557ac0350d ("merge-ort: begin
performance work; instrument with trace2_region_* calls", 2020-10-28),
this change improves the performance as follows:

                            Before                  After
    no-renames:       12.775 s ±  0.062 s    12.596 s ±  0.061 s
    mega-renames:    188.754 s ±  0.284 s   130.465 s ±  0.259 s
    just-one-mega:     5.599 s ±  0.019 s     3.958 s ±  0.010 s

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 Documentation/gitdiffcore.txt |  2 +-
 diffcore-rename.c             | 32 +++++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)
diff mbox series

Patch

diff --git a/Documentation/gitdiffcore.txt b/Documentation/gitdiffcore.txt
index 80fcf9542441..8673a5c5b2f2 100644
--- a/Documentation/gitdiffcore.txt
+++ b/Documentation/gitdiffcore.txt
@@ -186,7 +186,7 @@  mark a file pair as a rename and stop considering other candidates for
 better matches.  At most, one comparison is done per file in this
 preliminary pass; so if there are several remaining ext.txt files
 throughout the directory hierarchy after exact rename detection, this
-preliminary step will be skipped for those files.
+preliminary step may be skipped for those files.
 
 Note.  When the "-C" option is used with `--find-copies-harder`
 option, 'git diff-{asterisk}' commands feed unmodified filepairs to
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 5de4497e04fa..70a484b9b63e 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -667,7 +667,6 @@  static const char *get_basename(const char *filename)
 	return base ? base + 1 : filename;
 }
 
-MAYBE_UNUSED
 static int idx_possible_rename(char *filename, struct dir_rename_info *info)
 {
 	/*
@@ -780,8 +779,6 @@  static int find_basename_matches(struct diff_options *options,
 	int i, renames = 0;
 	struct strintmap sources;
 	struct strintmap dests;
-	struct hashmap_iter iter;
-	struct strmap_entry *entry;
 
 	/*
 	 * The prefeteching stuff wants to know if it can skip prefetching
@@ -831,17 +828,34 @@  static int find_basename_matches(struct diff_options *options,
 	}
 
 	/* Now look for basename matchups and do similarity estimation */
-	strintmap_for_each_entry(&sources, &iter, entry) {
-		const char *base = entry->key;
-		intptr_t src_index = (intptr_t)entry->value;
+	for (i = 0; i < rename_src_nr; ++i) {
+		char *filename = rename_src[i].p->one->path;
+		const char *base = NULL;
+		intptr_t src_index;
 		intptr_t dst_index;
-		if (src_index == -1)
-			continue;
 
-		if (0 <= (dst_index = strintmap_get(&dests, base))) {
+		/* Is this basename unique among remaining sources? */
+		base = get_basename(filename);
+		src_index = strintmap_get(&sources, base);
+		assert(src_index == -1 || src_index == i);
+
+		if (strintmap_contains(&dests, base)) {
 			struct diff_filespec *one, *two;
 			int score;
 
+			/* Find a matching destination, if possible */
+			dst_index = strintmap_get(&dests, base);
+			if (src_index == -1 || dst_index == -1) {
+				src_index = i;
+				dst_index = idx_possible_rename(filename, info);
+			}
+			if (dst_index == -1)
+				continue;
+
+			/* Ignore this dest if already used in a rename */
+			if (rename_dst[dst_index].is_rename)
+				continue; /* already used previously */
+
 			/* Estimate the similarity */
 			one = rename_src[src_index].p->one;
 			two = rename_dst[dst_index].p->two;