diff mbox series

[v3,4/5] diffcore-rename: use a different prefetch for basename comparisons

Message ID c9b55241d83144df0ddc479f67fbf3aabca43bd8.1624349082.git.gitgitgadget@gmail.com (mailing list archive)
State Accepted
Commit 1aedd03afb1592be9e4fbacdc614f45a99a865ea
Headers show
Series Optimization batch 13: partial clone optimizations for merge-ort | expand

Commit Message

Elijah Newren June 22, 2021, 8:04 a.m. UTC
From: Elijah Newren <newren@gmail.com>

merge-ort was designed to minimize the amount of data needed and used,
and several changes were made to diffcore-rename to take advantage of
extra metadata to enable this data minimization (particularly the
relevant_sources variable for skipping "irrelevant" renames).  This
effort obviously succeeded in drastically reducing computation times,
but should also theoretically allow partial clones to download much less
information.  Previously, though, the "prefetch" command used in
diffcore-rename had never been modified and downloaded many blobs that
were unnecessary for merge-ort.  This commit corrects that.

When doing basename comparisons, we want to fetch only the objects that
will be used for basename comparisons.  If after basename fetching this
leaves us with no more relevant sources (or no more destinations), then
we won't need to do the full inexact rename detection and can skip
downloading additional source and destination files.  Even if we have to
do that later full inexact rename detection, irrelevant sources are
culled after basename matching and before the full inexact rename
detection, so we can still avoid downloading the blobs for irrelevant
sources.  Rename prefetch() to inexact_prefetch(), and introduce a
new basename_prefetch() to take advantage of this.

If we modify the testcase from commit 557ac0350d ("merge-ort: begin
performance work; instrument with trace2_region_* calls", 2021-01-23)
to pass
    --sparse --filter=blob:none
to the clone command, and use the new trace2 "fetch_count" output from
a few commits ago to track both the number of fetch subcommands invoked
and the number of objects fetched across all those fetches, then for
the mega-renames testcase we observe the following:

BEFORE this commit, rebasing 35 patches:
    strategy     # of fetches    total # of objects fetched
    ---------    ------------    --------------------------
    recursive    62              11423
    ort          30              11391

AFTER this commit, rebasing the same 35 patches:
    ort          32                 63

This means that the new code only needs to download less than 2 blobs
per patch being rebased.  That is especially interesting given that the
repository at the start only had approximately half a dozen TOTAL blobs
downloaded to start with (because the default sparse-checkout of just
the toplevel directory was in use).

So, for this particular linux kernel testcase that involved ~26,000
renames on the upstream side (drivers/ -> pilots/) across which 35
patches were being rebased, this change reduces the number of blobs that
need to be downloaded by a factor of ~180.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 diffcore-rename.c              | 101 +++++++++++++++++++++++++++------
 t/t6421-merge-partial-clone.sh |   4 +-
 2 files changed, 85 insertions(+), 20 deletions(-)
diff mbox series

Patch

diff --git a/diffcore-rename.c b/diffcore-rename.c
index e13e52046026..4ef0459cfb50 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -87,13 +87,13 @@  struct diff_score {
 	short name_score;
 };
 
-struct prefetch_options {
+struct inexact_prefetch_options {
 	struct repository *repo;
 	int skip_unmodified;
 };
-static void prefetch(void *prefetch_options)
+static void inexact_prefetch(void *prefetch_options)
 {
-	struct prefetch_options *options = prefetch_options;
+	struct inexact_prefetch_options *options = prefetch_options;
 	int i;
 	struct oid_array to_fetch = OID_ARRAY_INIT;
 
@@ -816,6 +816,78 @@  static int idx_possible_rename(char *filename, struct dir_rename_info *info)
 	return idx;
 }
 
+struct basename_prefetch_options {
+	struct repository *repo;
+	struct strintmap *relevant_sources;
+	struct strintmap *sources;
+	struct strintmap *dests;
+	struct dir_rename_info *info;
+};
+static void basename_prefetch(void *prefetch_options)
+{
+	struct basename_prefetch_options *options = prefetch_options;
+	struct strintmap *relevant_sources = options->relevant_sources;
+	struct strintmap *sources = options->sources;
+	struct strintmap *dests = options->dests;
+	struct dir_rename_info *info = options->info;
+	int i;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
+
+	/*
+	 * TODO: The following loops mirror the code/logic from
+	 * find_basename_matches(), though not quite exactly.  Maybe
+	 * abstract the iteration logic out somehow?
+	 */
+	for (i = 0; i < rename_src_nr; ++i) {
+		char *filename = rename_src[i].p->one->path;
+		const char *base = NULL;
+		intptr_t src_index;
+		intptr_t dst_index;
+
+		/* Skip irrelevant sources */
+		if (relevant_sources &&
+		    !strintmap_contains(relevant_sources, filename))
+			continue;
+
+		/*
+		 * If the basename is unique among remaining sources, then
+		 * src_index will equal 'i' and we can attempt to match it
+		 * to a unique basename in the destinations.  Otherwise,
+		 * use directory rename heuristics, if possible.
+		 */
+		base = get_basename(filename);
+		src_index = strintmap_get(sources, base);
+		assert(src_index == -1 || src_index == i);
+
+		if (strintmap_contains(dests, base)) {
+			struct diff_filespec *one, *two;
+
+			/* Find a matching destination, if possible */
+			dst_index = strintmap_get(dests, base);
+			if (src_index == -1 || dst_index == -1) {
+				src_index = i;
+				dst_index = idx_possible_rename(filename, info);
+			}
+			if (dst_index == -1)
+				continue;
+
+			/* Ignore this dest if already used in a rename */
+			if (rename_dst[dst_index].is_rename)
+				continue; /* already used previously */
+
+			one = rename_src[src_index].p->one;
+			two = rename_dst[dst_index].p->two;
+
+			/* Add the pairs */
+			diff_add_if_missing(options->repo, &to_fetch, two);
+			diff_add_if_missing(options->repo, &to_fetch, one);
+		}
+	}
+
+	promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
+	oid_array_clear(&to_fetch);
+}
+
 static int find_basename_matches(struct diff_options *options,
 				 int minimum_score,
 				 struct dir_rename_info *info,
@@ -860,19 +932,12 @@  static int find_basename_matches(struct diff_options *options,
 		.missing_object_cb = NULL,
 		.missing_object_data = NULL
 	};
-	/*
-	 * The prefeteching stuff wants to know if it can skip prefetching
-	 * blobs that are unmodified...and will then do a little extra work
-	 * to verify that the oids are indeed different before prefetching.
-	 * Unmodified blobs are only relevant when doing copy detection;
-	 * when limiting to rename detection, diffcore_rename[_extended]()
-	 * will never be called with unmodified source paths fed to us, so
-	 * the extra work necessary to check if rename_src entries are
-	 * unmodified would be a small waste.
-	 */
-	struct prefetch_options prefetch_options = {
+	struct basename_prefetch_options prefetch_options = {
 		.repo = options->repo,
-		.skip_unmodified = 0
+		.relevant_sources = relevant_sources,
+		.sources = &sources,
+		.dests = &dests,
+		.info = info
 	};
 
 	/*
@@ -911,7 +976,7 @@  static int find_basename_matches(struct diff_options *options,
 	}
 
 	if (options->repo == the_repository && has_promisor_remote()) {
-		dpf_options.missing_object_cb = prefetch;
+		dpf_options.missing_object_cb = basename_prefetch;
 		dpf_options.missing_object_data = &prefetch_options;
 	}
 
@@ -1282,7 +1347,7 @@  void diffcore_rename_extended(struct diff_options *options,
 		.missing_object_cb = NULL,
 		.missing_object_data = NULL
 	};
-	struct prefetch_options prefetch_options = {
+	struct inexact_prefetch_options prefetch_options = {
 		.repo = options->repo
 	};
 
@@ -1449,7 +1514,7 @@  void diffcore_rename_extended(struct diff_options *options,
 	/* Finish setting up dpf_options */
 	prefetch_options.skip_unmodified = skip_unmodified;
 	if (options->repo == the_repository && has_promisor_remote()) {
-		dpf_options.missing_object_cb = prefetch;
+		dpf_options.missing_object_cb = inexact_prefetch;
 		dpf_options.missing_object_data = &prefetch_options;
 	}
 
diff --git a/t/t6421-merge-partial-clone.sh b/t/t6421-merge-partial-clone.sh
index 3dcffc15f801..aad975d24ddb 100755
--- a/t/t6421-merge-partial-clone.sh
+++ b/t/t6421-merge-partial-clone.sh
@@ -207,7 +207,7 @@  test_setup_repo () {
 #
 #   Summary: 2 fetches (1 for 2 objects, 1 for 1 object)
 #
-test_expect_merge_algorithm failure failure 'Objects downloaded for single relevant rename' '
+test_expect_merge_algorithm failure success 'Objects downloaded for single relevant rename' '
 	test_setup_repo &&
 	git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-single &&
 	(
@@ -296,7 +296,7 @@  test_expect_merge_algorithm failure failure 'Objects downloaded for single relev
 #      this are not all that common.)
 #   Summary: 1 fetches for 6 objects
 #
-test_expect_merge_algorithm failure failure 'Objects downloaded when a directory rename triggered' '
+test_expect_merge_algorithm failure success 'Objects downloaded when a directory rename triggered' '
 	test_setup_repo &&
 	git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-dir &&
 	(