@@ -14,6 +14,37 @@ SYNOPSIS
DESCRIPTION
-----------
+Blobless partial clones are created using `git clone --filter=blob:none`,
+which configures the local repository such that the Git client avoids
+downloading blob objects unless they are required for a local operation.
+This initially means that the clone and later fetches download reachable
+commits and trees but no blobs. Later operations that change the `HEAD`
+pointer, such as `git checkout` or `git merge`, may need to download
+missing blobs in order to complete their operation.
+
+In the worst cases, commands that compute blob diffs, such as `git blame`,
+become very slow as they download each missing blob in its own
+single-blob request at the moment the Git command needs it. This
+leads to multiple download requests and no ability for the Git server to
+provide delta compression across those objects.
+
+The `git backfill` command provides a way for the user to request that
+Git downloads the missing blobs (with optional filters) such that the
+missing blobs representing historical versions of files can be downloaded
+in batches. The `backfill` command attempts to optimize the request by
+grouping blobs that appear at the same path, hopefully leading to good
+delta compression in the packfile sent by the server.
+
+In this way, `git backfill` provides a mechanism to break a large clone
+into smaller chunks. Starting with a blobless partial clone with
+`git clone --filter=blob:none` and then running `git backfill` in the
+local repository provides a way to download all reachable objects in
+several smaller network calls instead of downloading the entire
+repository at clone time.
+
+By default, `git backfill` downloads all blobs reachable from the `HEAD`
+commit. This set can be restricted or expanded using various options.
+
SEE ALSO
--------
linkgit:git-clone[1].
@@ -60,4 +60,5 @@ Examples
--------
See example usages in:
- `t/helper/test-path-walk.c`
+ `t/helper/test-path-walk.c`,
+ `builtin/backfill.c`
@@ -1,16 +1,118 @@
#include "builtin.h"
+#include "git-compat-util.h"
#include "config.h"
#include "parse-options.h"
#include "repository.h"
+#include "commit.h"
+#include "hex.h"
+#include "tree.h"
+#include "tree-walk.h"
#include "object.h"
+#include "object-store-ll.h"
+#include "oid-array.h"
+#include "oidset.h"
+#include "promisor-remote.h"
+#include "strmap.h"
+#include "string-list.h"
+#include "revision.h"
+#include "trace2.h"
+#include "progress.h"
+#include "packfile.h"
+#include "path-walk.h"
static const char * const builtin_backfill_usage[] = { /* NULL-terminated list of usage synopsis lines */
N_("git backfill [<options>]"),
NULL
};
+struct backfill_context { /* state shared by the backfill walk and its callback */
+ struct repository *repo; /* repository handed to the object lookup and promisor fetch */
+ struct oid_array current_batch; /* blob object IDs queued for the next download */
+ size_t batch_size; /* once this many IDs are queued, the batch is downloaded */
+};
+
+/*
+ * Clear the pending batch of object IDs via oid_array_clear().
+ * The context structure itself is not freed here.
+ */
+static void backfill_context_clear(struct backfill_context *ctx)
+{
+	struct oid_array *pending = &ctx->current_batch;
+
+	oid_array_clear(pending);
+}
+
+/*
+ * Request every object ID queued in ctx->current_batch from the promisor
+ * remote in a single call, then empty the batch.
+ */
+static void download_batch(struct backfill_context *ctx)
+{
+	struct oid_array *batch = &ctx->current_batch;
+
+	promisor_remote_get_direct(ctx->repo, batch->oid, batch->nr);
+	oid_array_clear(batch);
+
+	/*
+	 * The download has probably produced a new packfile; add it to
+	 * the packed list so the objects that just arrived are not
+	 * requested a second time.
+	 */
+	reprepare_packed_git(ctx->repo);
+}
+
+/*
+ * Path-walk callback (installed as path_fn by do_backfill()): queue every
+ * blob in 'list' that looks missing, i.e. whose object-info lookup fails
+ * or reports a disk size of zero, and download the queue once it holds
+ * at least ctx->batch_size entries.
+ *
+ * 'data' is the struct backfill_context supplied as path_fn_data.
+ * Lists of any type other than OBJ_BLOB are ignored. Always returns 0.
+ */
+static int fill_missing_blobs(const char *path UNUSED,
+			      struct oid_array *list,
+			      enum object_type type,
+			      void *data)
+{
+	struct backfill_context *ctx = data;
+
+	if (type != OBJ_BLOB)
+		return 0;
+
+	for (size_t idx = 0; idx < list->nr; idx++) {
+		const struct object_id *oid = &list->oid[idx];
+		struct object_info info = OBJECT_INFO_INIT;
+		off_t disk_size = 0;
+		int lookup_failed;
+
+		info.disk_sizep = &disk_size;
+		lookup_failed = oid_object_info_extended(ctx->repo, oid, &info,
+							 OBJECT_INFO_FOR_PREFETCH);
+
+		/* Queue the blob when the lookup fails or reports size zero. */
+		if (lookup_failed || !disk_size)
+			oid_array_append(&ctx->current_batch, oid);
+	}
+
+	/*
+	 * The threshold is checked once per list, after all of its blobs
+	 * were queued, so a batch can grow beyond batch_size.
+	 */
+	if (ctx->current_batch.nr < ctx->batch_size)
+		return 0;
+
+	download_batch(ctx);
+	return 0;
+}
+
+/*
+ * Run a path walk starting at HEAD that requests blobs only, passing each
+ * list to fill_missing_blobs(), and download whatever is still queued
+ * when the walk ends.
+ *
+ * Returns the result of walk_objects_by_path(); the final download is
+ * skipped when that result is non-zero. The batch, the path-walk info
+ * and the revision state are cleaned up in either case.
+ */
+static int do_backfill(struct backfill_context *ctx)
+{
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	struct rev_info revs;
+	int ret;
+
+	/* NOTE(review): the result of handle_revision_arg() is not checked. */
+	repo_init_revisions(ctx->repo, &revs, "");
+	handle_revision_arg("HEAD", &revs, 0, 0);
+
+	/* Only blobs are requested from the walk. */
+	info.commits = 0;
+	info.trees = 0;
+	info.tags = 0;
+	info.blobs = 1;
+
+	info.path_fn = fill_missing_blobs;
+	info.path_fn_data = ctx;
+	info.revs = &revs;
+
+	ret = walk_objects_by_path(&info);
+	if (ret == 0) {
+		/* Download the objects that did not fill a batch. */
+		download_batch(ctx);
+	}
+
+	backfill_context_clear(ctx);
+	path_walk_info_clear(&info);
+	release_revisions(&revs);
+	return ret;
+}
+
int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
{
+ struct backfill_context ctx = {
+ .repo = repo,
+ .current_batch = OID_ARRAY_INIT,
+ .batch_size = 50000,
+ };
struct option options[] = {
OPT_END(),
};
@@ -23,7 +125,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
repo_config(repo, git_default_config, NULL);
- die(_("not implemented"));
-
- return 0;
+ return do_backfill(&ctx);
}
new file mode 100755
@@ -0,0 +1,94 @@
+#!/bin/sh
+
+test_description='git backfill on partial clones'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+# We create objects in the 'src' repo.
+test_expect_success 'setup repo for object creation' '
+	echo "{print \$1}" >print_1.awk &&
+	echo "{print \$2}" >print_2.awk &&
+
+	git init src &&
+
+	mkdir -p src/a/b/c &&
+	mkdir -p src/d/e &&
+
+	# Two rounds of four commits. Each commit writes the same-numbered
+	# file in all six directories with content unique to that round,
+	# so history holds 8 commits x 6 files = 48 distinct blobs.
+	for i in 1 2
+	do
+		for n in 1 2 3 4
+		do
+			echo "Version $i of file $n" >src/file.$n.txt &&
+			echo "Version $i of file a/$n" >src/a/file.$n.txt &&
+			echo "Version $i of file a/b/$n" >src/a/b/file.$n.txt &&
+			echo "Version $i of file a/b/c/$n" >src/a/b/c/file.$n.txt &&
+			echo "Version $i of file d/$n" >src/d/file.$n.txt &&
+			echo "Version $i of file d/e/$n" >src/d/e/file.$n.txt &&
+			git -C src add . &&
+			git -C src commit -m "Iteration $n" || return 1
+		done
+	done
+'
+
+# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
+# server for the partial clone.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/src" srv.bare &&
+
+	# Turn on both upload-pack settings in the bare repository.
+	for key in allowfilter allowanysha1inwant
+	do
+		git -C srv.bare config --local uploadpack.$key 1 || return 1
+	done
+'
+
+# do basic partial clone from "srv.bare"
+test_expect_success 'do partial clone 1, backfill gets all objects' '
+	git clone --no-checkout --filter=blob:none --single-branch \
+		--branch=main "file://$(pwd)/srv.bare" backfill1 &&
+
+	# Run backfill without options and capture its trace2 events.
+	GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" \
+		git -C backfill1 backfill &&
+
+	# The trace must record promisor fetch_count = 48.
+	test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
+
+	# Nothing reachable from HEAD may be printed as missing any more.
+	git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
+	test_line_count = 0 revs2
+'
+
+. "$TEST_DIRECTORY"/lib-httpd.sh
+start_httpd
+
+test_expect_success 'create a partial clone over HTTP' '
+	SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
+	rm -rf "$SERVER" repo &&
+	git clone --bare "file://$(pwd)/src" "$SERVER" &&
+
+	# Use plain "git config" rather than test_config: test_config undoes
+	# the setting when this test finishes, but the following test still
+	# fetches from this server.
+	git -C "$SERVER" config uploadpack.allowfilter 1 &&
+	git -C "$SERVER" config uploadpack.allowanysha1inwant 1 &&
+
+	git clone --no-checkout --filter=blob:none \
+		"$HTTPD_URL/smart/server" backfill-http
+'
+
+test_expect_success 'backfilling over HTTP succeeds' '
+	GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" \
+		git -C backfill-http backfill &&
+
+	# The trace must record promisor fetch_count = 48.
+	test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
+
+	# List every object reachable from any ref and verify that
+	# cat-file reports none of them as missing.
+	git -C backfill-http rev-list --objects --all >rev-list-out &&
+	awk "{print \$1;}" <rev-list-out >oids &&
+	GIT_TRACE2_EVENT="$(pwd)/walk-trace" \
+		git -C backfill-http cat-file --batch-check <oids >batch-out &&
+	! grep missing batch-out
+'
+
+# DO NOT add non-httpd-specific tests here, because the last part of this
+# test script is only executed when httpd is available and enabled.
+
+test_done