diff mbox series

[8/9] repack: implement `--filter-to` for storing filtered out objects

Message ID 20230614192541.1599256-9-christian.couder@gmail.com (mailing list archive)
State Superseded
Headers show
Series Repack objects into separate packfiles based on a filter | expand

Commit Message

Christian Couder June 14, 2023, 7:25 p.m. UTC
A previous commit has implemented `git repack --filter=<filter-spec>` to
allow users to filter out some objects from the main pack and move them
into a new different pack.

It would be nice if this new different pack could be created in a
different directory than the regular pack. This would make it possible
to move large blobs into a pack on a different kind of storage, for
example cheaper storage. Even in a different directory this pack can be
accessible if, for example, the Git alternates mechanism is used to
point to it.

If users want to remove a pack that contains filtered out objects after
checking that they are all already on a promisor remote, creating the
pack in a different directory makes it easier to do so.

Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
---
 Documentation/git-repack.txt |  6 ++++++
 builtin/repack.c             | 17 ++++++++++++-----
 t/t7700-repack.sh            | 27 +++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 5 deletions(-)

Comments

Junio C Hamano June 16, 2023, 2:21 a.m. UTC | #1
Christian Couder <christian.couder@gmail.com> writes:

> A previous commit has implemented `git repack --filter=<filter-spec>` to
> allow users to filter out some objects from the main pack and move them
> into a new different pack.
>
> It would be nice if this new different pack could be created in a
> different directory than the regular pack. This would make it possible
> to move large blobs into a pack on a different kind of storage, for
> example cheaper storage. Even in a different directory this pack can be
> accessible if, for example, the Git alternates mechanism is used to
> point to it.

Makes sense, I guess, for "in other usecases" scenario.  I am not
sure how this would be useful for the originally stated goal of
unbloating a bloated repository with promisor remote(s), though. 

> If users want to remove a pack that contains filtered out objects after
> checking that they are all already on a promisor remote, creating the
> pack in a different directory makes it easier to do so.

Care to elaborate?  I do not see how a separate directory would make
it easier.  After separating the potential cruft into a packfile,
you'd walk its .idx and see if there are any objects that are not
available (yet) at the promisor remotes to check if it is safe to
remove.  That can be done regardless of the location of the packfile
that is suspected to be now removable.

> Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
> ---
>  Documentation/git-repack.txt |  6 ++++++
>  builtin/repack.c             | 17 ++++++++++++-----
>  t/t7700-repack.sh            | 27 +++++++++++++++++++++++++++
>  3 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
> index aa29c7e648..070dd22610 100644
> --- a/Documentation/git-repack.txt
> +++ b/Documentation/git-repack.txt
> @@ -148,6 +148,12 @@ depth is 4095.
>  	resulting packfile and put them into a separate packfile. See
>  	linkgit:git-rev-list[1] for valid `<filter-spec>` forms.
>  
> +--filter-to=<dir>::
> +	Write the pack containing filtered out objects to the
> +	directory `<dir>`. This can be used for putting the pack on a
> +	separate object directory that is accessed through the Git
> +	alternates mechanism. Only useful with `--filter`.
> +
>  -b::
>  --write-bitmap-index::
>  	Write a reachability bitmap index as part of the repack. This
> diff --git a/builtin/repack.c b/builtin/repack.c
> index b13d7196de..8c71e8fd51 100644
> --- a/builtin/repack.c
> +++ b/builtin/repack.c
> @@ -838,7 +838,8 @@ static void prepare_pack_filtered_cmd(struct child_process *cmd,
>  }
>  
>  static void finish_pack_filtered_cmd(struct child_process *cmd,
> -				     struct string_list *names)
> +				     struct string_list *names,
> +				     const char *destination)
>  {
>  	if (cmd->in == -1) {
>  		/* No packed objects; cmd was never started */
> @@ -848,7 +849,7 @@ static void finish_pack_filtered_cmd(struct child_process *cmd,
>  
>  	close(cmd->in);
>  
> -	if (finish_pack_objects_cmd(cmd, names, NULL, NULL))
> +	if (finish_pack_objects_cmd(cmd, names, destination, NULL))
>  		die(_("could not finish pack-objects to pack filtered objects"));
>  }
>  
> @@ -877,6 +878,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  	const char *cruft_expiration = NULL;
>  	const char *expire_to = NULL;
>  	struct child_process pack_filtered_cmd = CHILD_PROCESS_INIT;
> +	const char *filter_to = NULL;
>  
>  	struct option builtin_repack_options[] = {
>  		OPT_BIT('a', NULL, &pack_everything,
> @@ -930,6 +932,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  			   N_("write a multi-pack index of the resulting packs")),
>  		OPT_STRING(0, "expire-to", &expire_to, N_("dir"),
>  			   N_("pack prefix to store a pack containing pruned objects")),
> +		OPT_STRING(0, "filter-to", &filter_to, N_("dir"),
> +			   N_("pack prefix to store a pack containing filtered out objects")),
>  		OPT_END()
>  	};
>  
> @@ -1073,8 +1077,11 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  		strvec_push(&cmd.args, "--incremental");
>  	}
>  
> -	if (po_args.filter)
> -		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
> +	if (po_args.filter) {
> +		if (!filter_to)
> +			filter_to = packtmp;
> +		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
> +	}
>  
>  	if (geometry)
>  		cmd.in = -1;
> @@ -1169,7 +1176,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  	}
>  
>  	if (po_args.filter)
> -		finish_pack_filtered_cmd(&pack_filtered_cmd, &names);
> +		finish_pack_filtered_cmd(&pack_filtered_cmd, &names, filter_to);
>  
>  	string_list_sort(&names);
>  
> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
> index 9e7654090f..898f8a01b4 100755
> --- a/t/t7700-repack.sh
> +++ b/t/t7700-repack.sh
> @@ -286,6 +286,33 @@ test_expect_success 'repacking with a filter works' '
>  	test "$blob_pack2" = "$blob_pack"
>  '
>  
> +test_expect_success '--filter-to stores filtered out objects' '
> +	git -C bare.git repack -a -d &&
> +	test_stdout_line_count = 1 ls bare.git/objects/pack/*.pack &&
> +
> +	git init --bare filtered.git &&
> +	git -C bare.git -c repack.writebitmaps=false repack -a -d \
> +		--filter=blob:none \
> +		--filter-to=../filtered.git/objects/pack/pack &&
> +	test_stdout_line_count = 1 ls bare.git/objects/pack/pack-*.pack &&
> +	test_stdout_line_count = 1 ls filtered.git/objects/pack/pack-*.pack &&
> +
> +	commit_pack=$(test-tool -C bare.git find-pack HEAD) &&
> +	test -n "$commit_pack" &&
> +	blob_pack=$(test-tool -C bare.git find-pack HEAD:file1) &&
> +	test -z "$blob_pack" &&
> +	blob_hash=$(git -C bare.git rev-parse HEAD:file1) &&
> +	test -n "$blob_hash" &&
> +	blob_pack=$(test-tool -C filtered.git find-pack $blob_hash) &&
> +	test -n "$blob_pack" &&
> +
> +	echo $(pwd)/filtered.git/objects >bare.git/objects/info/alternates &&
> +	blob_pack=$(test-tool -C bare.git find-pack HEAD:file1) &&
> +	test -n "$blob_pack" &&
> +	blob_content=$(git -C bare.git show $blob_hash) &&
> +	test "$blob_content" = "content1"
> +'
> +
>  objdir=.git/objects
>  midx=$objdir/pack/multi-pack-index
Taylor Blau June 21, 2023, 11:49 a.m. UTC | #2
On Wed, Jun 14, 2023 at 09:25:40PM +0200, Christian Couder wrote:
> diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
> index aa29c7e648..070dd22610 100644
> --- a/Documentation/git-repack.txt
> +++ b/Documentation/git-repack.txt
> @@ -148,6 +148,12 @@ depth is 4095.
>  	resulting packfile and put them into a separate packfile. See
>  	linkgit:git-rev-list[1] for valid `<filter-spec>` forms.
>
> +--filter-to=<dir>::
> +	Write the pack containing filtered out objects to the
> +	directory `<dir>`. This can be used for putting the pack on a
> +	separate object directory that is accessed through the Git
> +	alternates mechanism. Only useful with `--filter`.

Here you say "only useful with --filter", but...

> @@ -1073,8 +1077,11 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  		strvec_push(&cmd.args, "--incremental");
>  	}
>
> -	if (po_args.filter)
> -		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
> +	if (po_args.filter) {
> +		if (!filter_to)
> +			filter_to = packtmp;
> +		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
> +	}

Would you want an "} else if (filter_to)" here to die and show the usage
message, since --filter-to needs --filter? Or maybe it should imply
--filter-to.

Thanks,
Taylor
Christian Couder June 21, 2023, 12:08 p.m. UTC | #3
On Wed, Jun 21, 2023 at 1:49 PM Taylor Blau <me@ttaylorr.com> wrote:
>
> On Wed, Jun 14, 2023 at 09:25:40PM +0200, Christian Couder wrote:

> > +--filter-to=<dir>::
> > +     Write the pack containing filtered out objects to the
> > +     directory `<dir>`. This can be used for putting the pack on a
> > +     separate object directory that is accessed through the Git
> > +     alternates mechanism. Only useful with `--filter`.
>
> Here you say "only useful with --filter", but...
>
> > @@ -1073,8 +1077,11 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
> >               strvec_push(&cmd.args, "--incremental");
> >       }
> >
> > -     if (po_args.filter)
> > -             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
> > +     if (po_args.filter) {
> > +             if (!filter_to)
> > +                     filter_to = packtmp;
> > +             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
> > +     }
>
> Would you want an "} else if (filter_to)" here to die and show the usage
> message, since --filter-to needs --filter? Or maybe it should imply
> --filter-to.

In the doc for --expire-to=<dir> there is "Only useful with `--cruft
-d`" and I don't think there is a check to see if --cruft and -d have
been passed when --expire-to is passed. So I am not sure if it's
better to be consistent with --expire-to or not.
Taylor Blau June 21, 2023, 12:25 p.m. UTC | #4
On Wed, Jun 21, 2023 at 02:08:38PM +0200, Christian Couder wrote:
> > > @@ -1073,8 +1077,11 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
> > >               strvec_push(&cmd.args, "--incremental");
> > >       }
> > >
> > > -     if (po_args.filter)
> > > -             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
> > > +     if (po_args.filter) {
> > > +             if (!filter_to)
> > > +                     filter_to = packtmp;
> > > +             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
> > > +     }
> >
> > Would you want an "} else if (filter_to)" here to die and show the usage
> > message, since --filter-to needs --filter? Or maybe it should imply
> > --filter-to.
>
> In the doc for --expire-to=<dir> there is "Only useful with `--cruft
> -d`" and I don't think there is a check to see if --cruft and -d have
> been passed when --expire-to is passed. So I am not sure if it's
> better to be consistent with --expire-to or not.

TBH, I don't think that my decision at the time to silently accept
--expire-to without --cruft was the right one. It should at least
require --cruft, or imply it. It doesn't make a ton of sense to use
without -d, but doing so is OK, so I wouldn't consider that a failing
condition.

In other words, I would be fine with something like:

--- 8< ---
diff --git a/builtin/repack.c b/builtin/repack.c
index 0541c3ce15..1890f283ee 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -866,6 +866,10 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	    (unpack_unreachable || (pack_everything & LOOSEN_UNREACHABLE)))
 		die(_("options '%s' and '%s' cannot be used together"), "--keep-unreachable", "-A");

+	/* --expire-to implies cruft */
+	if (expire_to)
+		pack_everything |= PACK_CRUFT;
+
 	if (pack_everything & PACK_CRUFT) {
 		pack_everything |= ALL_INTO_ONE;

--- >8 ---

But that sounds like a good candidate for some #leftoverbits.

In the meantime, I would be absolutely fine with deviating from the
existing behavior of --expire-to w.r.t --cruft.

Thanks,
Taylor
Junio C Hamano June 21, 2023, 4:44 p.m. UTC | #5
Taylor Blau <me@ttaylorr.com> writes:

> In other words, I would be fine with something like:
>
> --- 8< ---
> diff --git a/builtin/repack.c b/builtin/repack.c
> index 0541c3ce15..1890f283ee 100644
> --- a/builtin/repack.c
> +++ b/builtin/repack.c
> @@ -866,6 +866,10 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  	    (unpack_unreachable || (pack_everything & LOOSEN_UNREACHABLE)))
>  		die(_("options '%s' and '%s' cannot be used together"), "--keep-unreachable", "-A");
>
> +	/* --expire-to implies cruft */
> +	if (expire_to)
> +		pack_everything |= PACK_CRUFT;
> +
>  	if (pack_everything & PACK_CRUFT) {
>  		pack_everything |= ALL_INTO_ONE;
>
> --- >8 ---
>
> But that sounds like a good candidate for some #leftoverbits.

It does.  Thanks.
Christian Couder July 5, 2023, 6:19 a.m. UTC | #6
On Wed, Jun 21, 2023 at 1:49 PM Taylor Blau <me@ttaylorr.com> wrote:
>
> On Wed, Jun 14, 2023 at 09:25:40PM +0200, Christian Couder wrote:
> > diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
> > index aa29c7e648..070dd22610 100644
> > --- a/Documentation/git-repack.txt
> > +++ b/Documentation/git-repack.txt
> > @@ -148,6 +148,12 @@ depth is 4095.
> >       resulting packfile and put them into a separate packfile. See
> >       linkgit:git-rev-list[1] for valid `<filter-spec>` forms.
> >
> > +--filter-to=<dir>::
> > +     Write the pack containing filtered out objects to the
> > +     directory `<dir>`. This can be used for putting the pack on a
> > +     separate object directory that is accessed through the Git
> > +     alternates mechanism. Only useful with `--filter`.
>
> Here you say "only useful with --filter", but...
>
> > @@ -1073,8 +1077,11 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
> >               strvec_push(&cmd.args, "--incremental");
> >       }
> >
> > -     if (po_args.filter)
> > -             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
> > +     if (po_args.filter) {
> > +             if (!filter_to)
> > +                     filter_to = packtmp;
> > +             prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
> > +     }
>
> Would you want an "} else if (filter_to)" here to die and show the usage
> message, since --filter-to needs --filter? Or maybe it should imply
> --filter-to.

I have added such a check in the version 2 I just sent. We now die()
if --filter-to is used without --filter.
diff mbox series

Patch

diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index aa29c7e648..070dd22610 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -148,6 +148,12 @@  depth is 4095.
 	resulting packfile and put them into a separate packfile. See
 	linkgit:git-rev-list[1] for valid `<filter-spec>` forms.
 
+--filter-to=<dir>::
+	Write the pack containing filtered out objects to the
+	directory `<dir>`. This can be used for putting the pack on a
+	separate object directory that is accessed through the Git
+	alternates mechanism. Only useful with `--filter`.
+
 -b::
 --write-bitmap-index::
 	Write a reachability bitmap index as part of the repack. This
diff --git a/builtin/repack.c b/builtin/repack.c
index b13d7196de..8c71e8fd51 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -838,7 +838,8 @@  static void prepare_pack_filtered_cmd(struct child_process *cmd,
 }
 
 static void finish_pack_filtered_cmd(struct child_process *cmd,
-				     struct string_list *names)
+				     struct string_list *names,
+				     const char *destination)
 {
 	if (cmd->in == -1) {
 		/* No packed objects; cmd was never started */
@@ -848,7 +849,7 @@  static void finish_pack_filtered_cmd(struct child_process *cmd,
 
 	close(cmd->in);
 
-	if (finish_pack_objects_cmd(cmd, names, NULL, NULL))
+	if (finish_pack_objects_cmd(cmd, names, destination, NULL))
 		die(_("could not finish pack-objects to pack filtered objects"));
 }
 
@@ -877,6 +878,7 @@  int cmd_repack(int argc, const char **argv, const char *prefix)
 	const char *cruft_expiration = NULL;
 	const char *expire_to = NULL;
 	struct child_process pack_filtered_cmd = CHILD_PROCESS_INIT;
+	const char *filter_to = NULL;
 
 	struct option builtin_repack_options[] = {
 		OPT_BIT('a', NULL, &pack_everything,
@@ -930,6 +932,8 @@  int cmd_repack(int argc, const char **argv, const char *prefix)
 			   N_("write a multi-pack index of the resulting packs")),
 		OPT_STRING(0, "expire-to", &expire_to, N_("dir"),
 			   N_("pack prefix to store a pack containing pruned objects")),
+		OPT_STRING(0, "filter-to", &filter_to, N_("dir"),
+			   N_("pack prefix to store a pack containing filtered out objects")),
 		OPT_END()
 	};
 
@@ -1073,8 +1077,11 @@  int cmd_repack(int argc, const char **argv, const char *prefix)
 		strvec_push(&cmd.args, "--incremental");
 	}
 
-	if (po_args.filter)
-		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, packtmp);
+	if (po_args.filter) {
+		if (!filter_to)
+			filter_to = packtmp;
+		prepare_pack_filtered_cmd(&pack_filtered_cmd, &po_args, filter_to);
+	}
 
 	if (geometry)
 		cmd.in = -1;
@@ -1169,7 +1176,7 @@  int cmd_repack(int argc, const char **argv, const char *prefix)
 	}
 
 	if (po_args.filter)
-		finish_pack_filtered_cmd(&pack_filtered_cmd, &names);
+		finish_pack_filtered_cmd(&pack_filtered_cmd, &names, filter_to);
 
 	string_list_sort(&names);
 
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 9e7654090f..898f8a01b4 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -286,6 +286,33 @@  test_expect_success 'repacking with a filter works' '
 	test "$blob_pack2" = "$blob_pack"
 '
 
+test_expect_success '--filter-to stores filtered out objects' '
+	git -C bare.git repack -a -d &&
+	test_stdout_line_count = 1 ls bare.git/objects/pack/*.pack &&
+
+	git init --bare filtered.git &&
+	git -C bare.git -c repack.writebitmaps=false repack -a -d \
+		--filter=blob:none \
+		--filter-to=../filtered.git/objects/pack/pack &&
+	test_stdout_line_count = 1 ls bare.git/objects/pack/pack-*.pack &&
+	test_stdout_line_count = 1 ls filtered.git/objects/pack/pack-*.pack &&
+
+	commit_pack=$(test-tool -C bare.git find-pack HEAD) &&
+	test -n "$commit_pack" &&
+	blob_pack=$(test-tool -C bare.git find-pack HEAD:file1) &&
+	test -z "$blob_pack" &&
+	blob_hash=$(git -C bare.git rev-parse HEAD:file1) &&
+	test -n "$blob_hash" &&
+	blob_pack=$(test-tool -C filtered.git find-pack $blob_hash) &&
+	test -n "$blob_pack" &&
+
+	echo $(pwd)/filtered.git/objects >bare.git/objects/info/alternates &&
+	blob_pack=$(test-tool -C bare.git find-pack HEAD:file1) &&
+	test -n "$blob_pack" &&
+	blob_content=$(git -C bare.git show $blob_hash) &&
+	test "$blob_content" = "content1"
+'
+
 objdir=.git/objects
 midx=$objdir/pack/multi-pack-index