[03/30] backfill: basic functionality and tests

Message ID be21c8370eb5143c3f6e91d354d48465d221692a.1725935335.git.gitgitgadget@gmail.com (mailing list archive)
State New
Series Path-walk API and applications

Commit Message

Derrick Stolee Sept. 10, 2024, 2:28 a.m. UTC
From: Derrick Stolee <derrickstolee@github.com>

The default behavior of 'git backfill' is to fetch all missing blobs that
are reachable from HEAD. Document and test this behavior.
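
For illustration only (this example is not part of the patch, and the
repository URL is a placeholder), the intended end-to-end workflow looks
roughly like this:

    git clone --filter=blob:none https://example.com/repo.git
    cd repo
    git backfill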

The implementation is a very simple use of the path-walk API: initialize
the revision walk at HEAD so that the path-walk covers all commits
reachable from HEAD. Ignore the object arrays that correspond to tree
entries, assuming that those trees are all present already.
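
As a sketch of how to observe the effect (not part of this change, but
the same plumbing the tests below rely on): in a fresh blobless clone the
following command prints one "?"-prefixed line per missing blob, and
after running 'git backfill' its output should be empty.

    git rev-list --quiet --objects --missing=print HEAD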

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-backfill.txt |  24 ++++++++
 builtin/backfill.c             | 101 ++++++++++++++++++++++++++++++++-
 t/t5620-backfill.sh            |  97 +++++++++++++++++++++++++++++++
 3 files changed, 219 insertions(+), 3 deletions(-)
 create mode 100755 t/t5620-backfill.sh

Patch

diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index 640144187d3..0e10f066fef 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -14,6 +14,30 @@  SYNOPSIS
 DESCRIPTION
 -----------
 
+Blobless partial clones are created using `git clone --filter=blob:none`,
+which configures the local repository so that the Git client avoids
+downloading blob objects unless they are required for a local operation.
+This initially means that the clone and later fetches download reachable
+commits and trees but no blobs. Later operations that change the `HEAD`
+pointer, such as `git checkout` or `git merge`, may need to download
+missing blobs in order to complete their operation.
+
+In the worst cases, commands that compute blob diffs, such as `git blame`,
+become very slow because they download each missing blob in its own
+request at the moment the command needs it. This leads to many separate
+download requests and gives the Git server no opportunity to provide
+delta compression across those objects.
+
+The `git backfill` command provides a way for the user to request that
+Git download the missing blobs (with optional filters), so that the
+blobs representing historical versions of files are fetched in batches.
+The `backfill` command attempts to optimize the request by grouping
+blobs that appear at the same path, hopefully leading to good delta
+compression in the packfile sent by the server.
+
+By default, `git backfill` downloads all blobs reachable from the `HEAD`
+commit. This set can be restricted or expanded using various options.
+
 SEE ALSO
 --------
 linkgit:git-clone[1].
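
To make the scenario described above concrete (a sketch only, with a
placeholder repository URL and file name), the slow path and the batched
alternative look like this:

    git clone --filter=blob:none https://example.com/repo.git
    cd repo
    git blame README.md   # fetches each missing historical blob separately
    git backfill          # downloads the missing blobs in batches instead
    git blame README.md   # now served entirely from local objects
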
diff --git a/builtin/backfill.c b/builtin/backfill.c
index 77b05a2f838..23d40fc02a2 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -1,16 +1,113 @@ 
 #include "builtin.h"
+#include "git-compat-util.h"
 #include "config.h"
 #include "parse-options.h"
 #include "repository.h"
+#include "commit.h"
+#include "hex.h"
+#include "tree.h"
+#include "tree-walk.h"
 #include "object.h"
+#include "object-store-ll.h"
+#include "oid-array.h"
+#include "oidset.h"
+#include "promisor-remote.h"
+#include "strmap.h"
+#include "string-list.h"
+#include "revision.h"
+#include "trace2.h"
+#include "progress.h"
+#include "packfile.h"
+#include "path-walk.h"
 
 static const char * const builtin_backfill_usage[] = {
 	N_("git backfill [<options>]"),
 	NULL
 };
 
+struct backfill_context {
+	struct repository *repo;
+	struct oid_array current_batch;
+	size_t batch_size;
+};
+
+static void clear_backfill_context(struct backfill_context *ctx)
+{
+	oid_array_clear(&ctx->current_batch);
+}
+
+static void download_batch(struct backfill_context *ctx)
+{
+	promisor_remote_get_direct(ctx->repo,
+				   ctx->current_batch.oid,
+				   ctx->current_batch.nr);
+	oid_array_clear(&ctx->current_batch);
+
+	/*
+	 * We likely have a new packfile. Add it to the packed list to
+	 * avoid possible duplicate downloads of the same objects.
+	 */
+	reprepare_packed_git(ctx->repo);
+}
+
+static int fill_missing_blobs(const char *path,
+			      struct oid_array *list,
+			      enum object_type type,
+			      void *data)
+{
+	struct backfill_context *ctx = data;
+
+	if (type != OBJ_BLOB)
+		return 0;
+
+	for (size_t i = 0; i < list->nr; i++) {
+		off_t size = 0;
+		struct object_info info = OBJECT_INFO_INIT;
+		info.disk_sizep = &size;
+		if (oid_object_info_extended(ctx->repo,
+					     &list->oid[i],
+					     &info,
+					     OBJECT_INFO_FOR_PREFETCH) ||
+		    !size)
+			oid_array_append(&ctx->current_batch, &list->oid[i]);
+	}
+
+	if (ctx->current_batch.nr >= ctx->batch_size)
+		download_batch(ctx);
+
+	return 0;
+}
+
+static int do_backfill(struct backfill_context *ctx)
+{
+	struct rev_info revs;
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	int ret;
+
+	repo_init_revisions(ctx->repo, &revs, "");
+	handle_revision_arg("HEAD", &revs, 0, 0);
+
+	info.revs = &revs;
+	info.path_fn = fill_missing_blobs;
+	info.path_fn_data = ctx;
+
+	ret = walk_objects_by_path(&info);
+
+	/* Download the objects that did not fill a batch. */
+	if (!ret)
+		download_batch(ctx);
+
+	clear_backfill_context(ctx);
+	return ret;
+}
+
 int cmd_backfill(int argc, const char **argv, const char *prefix)
 {
+	struct backfill_context ctx = {
+		.repo = the_repository,
+		.current_batch = OID_ARRAY_INIT,
+		.batch_size = 16000,
+	};
 	struct option options[] = {
 		OPT_END(),
 	};
@@ -23,7 +120,5 @@  int cmd_backfill(int argc, const char **argv, const char *prefix)
 
 	git_config(git_default_config, NULL);
 
-	die(_("not implemented"));
-
-	return 0;
+	return do_backfill(&ctx);
 }
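
The batching behavior above can be checked from the outside (a sketch,
mirroring what the tests below do): record a trace2 event stream and look
for the "promisor" fetch_count data events that the partial clone
machinery emits when it downloads a batch.

    GIT_TRACE2_EVENT="$(pwd)/trace" git backfill
    grep fetch_count trace
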
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
new file mode 100755
index 00000000000..43868a4a75f
--- /dev/null
+++ b/t/t5620-backfill.sh
@@ -0,0 +1,97 @@ 
+#!/bin/sh
+
+test_description='git backfill on partial clones'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+TEST_PASSES_SANITIZE_LEAK=0
+export TEST_PASSES_SANITIZE_LEAK
+
+. ./test-lib.sh
+
+# We create objects in the 'src' repo.
+test_expect_success 'setup repo for object creation' '
+	echo "{print \$1}" >print_1.awk &&
+	echo "{print \$2}" >print_2.awk &&
+
+	git init src &&
+
+	mkdir -p src/a/b/c &&
+	mkdir -p src/d/e &&
+
+	for i in 1 2
+	do
+		for n in 1 2 3 4
+		do
+			echo "Version $i of file $n" > src/file.$n.txt &&
+			echo "Version $i of file a/$n" > src/a/file.$n.txt &&
+			echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt &&
+			echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt &&
+			echo "Version $i of file d/$n" > src/d/file.$n.txt &&
+			echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt &&
+			git -C src add . &&
+			git -C src commit -m "Iteration $n" || return 1
+		done
+	done
+'
+
+# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
+# server for the partial clone.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/src" srv.bare &&
+	git -C srv.bare config --local uploadpack.allowfilter 1 &&
+	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
+# do basic partial clone from "srv.bare"
+test_expect_success 'do partial clone 1, backfill gets all objects' '
+	git clone --no-checkout --filter=blob:none	\
+		--single-branch --branch=main 		\
+		"file://$(pwd)/srv.bare" backfill1 &&
+
+	# Backfill with no options gets everything reachable from HEAD.
+	GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \
+		-C backfill1 backfill &&
+
+	# We should have engaged the partial clone machinery
+	test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
+
+	# No more missing objects!
+	git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
+	test_line_count = 0 revs2
+'
+
+. "$TEST_DIRECTORY"/lib-httpd.sh
+start_httpd
+
+test_expect_success 'create a partial clone over HTTP' '
+	SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
+	rm -rf "$SERVER" repo &&
+	git clone --bare "file://$(pwd)/src" "$SERVER" &&
+	test_config -C "$SERVER" uploadpack.allowfilter 1 &&
+	test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 &&
+
+	git clone --no-checkout --filter=blob:none \
+		"$HTTPD_URL/smart/server" backfill-http
+'
+
+test_expect_success 'backfilling over HTTP succeeds' '
+	GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \
+		-C backfill-http backfill &&
+
+	# We should have engaged the partial clone machinery
+	test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
+
+	# Confirm all objects are present, none missing.
+	git -C backfill-http rev-list --objects --all >rev-list-out &&
+	awk "{print \$1;}" <rev-list-out >oids &&
+	GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
+		cat-file --batch-check <oids >batch-out &&
+	! grep missing batch-out
+'
+
+# DO NOT add non-httpd-specific tests here, because the last part of this
+# test script is only executed when httpd is available and enabled.
+
+test_done