diff mbox series

[v3] revision: add `--ignore-missing-links` user option

Message ID 20230915083415.263187-1-knayak@gitlab.com (mailing list archive)
State Superseded
Headers show
Series [v3] revision: add `--ignore-missing-links` user option | expand

Commit Message

Karthik Nayak Sept. 15, 2023, 8:34 a.m. UTC
From: Karthik Nayak <karthik.188@gmail.com>

The revision backend is used by multiple porcelain commands such as
git-rev-list(1) and git-log(1). The backend currently supports ignoring
missing links by setting the `ignore_missing_links` bit. This allows the
revision walk to skip any object links which are missing. Expose this
bit via an `--ignore-missing-links` user option.

A scenario where this option would be used is to find the boundary
objects between different object directories. Consider a repository with
a main object directory (GIT_OBJECT_DIRECTORY) and one or more alternate
object directories (GIT_ALTERNATE_OBJECT_DIRECTORIES). In such a
repository, enabling this option along with the `--boundary` option
while disabling the alternate object directory allows us to find the
boundary objects between the main and alternate object directory.

Helped-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
---

Changes since v2:
- Refactored the tests thanks to Taylor! 

Range diff against version 2:

 1:  e3f4d85732 ! 1:  a08f3637a0 revision: add `--ignore-missing-links` user option
    @@ Commit message
         while disabling the alternate object directory allows us to find the
         boundary objects between the main and alternate object directory.
     
    +    Helped-by: Taylor Blau <me@ttaylorr.com>
         Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
     
      ## Documentation/rev-list-options.txt ##
    @@ t/t6022-rev-list-alternates.sh (new)
     +# We create 5 commits and move them to the alt directory and
     +# create 5 more commits which will stay in the main odb.
     +test_expect_success 'create repository and alternate directory' '
    -+	git init main &&
    -+	test_commit_bulk -C main 5 &&
    -+	BOUNDARY_COMMIT=$(git -C main rev-parse HEAD) &&
    -+	mkdir alt &&
    -+	mv main/.git/objects/* alt &&
    -+	GIT_ALTERNATE_OBJECT_DIRECTORIES=$PWD/alt test_commit_bulk --start=6 -C main 5
    ++	test_commit_bulk 5 &&
    ++	git clone --reference=. --shared . alt &&
    ++	test_commit_bulk --start=6 -C alt 5
     +'
     +
     +# when the alternate odb is provided, all commits are listed along with the boundary
     +# commit.
     +test_expect_success 'rev-list passes with alternate object directory' '
    -+	GIT_ALTERNATE_OBJECT_DIRECTORIES=$PWD/alt git -C main rev-list HEAD >actual &&
    -+	test_stdout_line_count = 10 cat actual &&
    -+	grep $BOUNDARY_COMMIT actual
    ++	git -C alt rev-list --all --objects --no-object-names >actual.raw &&
    ++	{
    ++		git rev-list --all --objects --no-object-names &&
    ++		git -C alt rev-list --all --objects --no-object-names --not \
    ++			--alternate-refs
    ++	} >expect.raw &&
    ++	sort actual.raw >actual &&
    ++	sort expect.raw >expect &&
    ++	test_cmp expect actual
     +'
     +
    ++alt=alt/.git/objects/info/alternates
    ++
    ++hide_alternates () {
    ++	test -f "$alt.bak" || mv "$alt" "$alt.bak"
    ++}
    ++
    ++show_alternates () {
    ++	test -f "$alt" || mv "$alt.bak" "$alt"
    ++}
    ++
     +# When the alternate odb is not provided, rev-list fails since the 5th commit's
     +# parent is not present in the main odb.
     +test_expect_success 'rev-list fails without alternate object directory' '
    -+	test_must_fail git -C main rev-list HEAD
    ++	hide_alternates &&
    ++	test_must_fail git -C alt rev-list HEAD
     +'
     +
     +# With `--ignore-missing-links`, we stop the traversal when we encounter a
     +# missing link. The boundary commit is not listed as we haven't used the
     +# `--boundary` options.
     +test_expect_success 'rev-list only prints main odb commits with --ignore-missing-links' '
    -+	git -C main rev-list --ignore-missing-links HEAD >actual &&
    -+	test_stdout_line_count = 5 cat actual &&
    -+	! grep -$BOUNDARY_COMMIT actual
    ++	hide_alternates &&
    ++
    ++	git -C alt rev-list --objects --no-object-names \
    ++		--ignore-missing-links HEAD >actual.raw &&
    ++	git -C alt cat-file  --batch-check="%(objectname)" \
    ++		--batch-all-objects >expect.raw &&
    ++
    ++	sort actual.raw >actual &&
    ++	sort expect.raw >expect &&
    ++	test_must_fail git -C alt rev-list HEAD
     +'
     +
     +# With `--ignore-missing-links` and `--boundary`, we can even print those boundary
     +# commits.
     +test_expect_success 'rev-list prints boundary commit with --ignore-missing-links' '
    -+	git -C main rev-list --ignore-missing-links --boundary HEAD >actual &&
    -+	test_stdout_line_count = 6 cat actual &&
    -+	grep -$BOUNDARY_COMMIT actual
    ++	git -C alt rev-list --ignore-missing-links --boundary HEAD >got &&
    ++	grep "^-$(git rev-parse HEAD)" got
     +'
     +
    -+# The `--ignore-missing-links` option should ensure that git-rev-list(1) doesn't
    -+# fail when used alongside `--objects` when a tree is missing.
    -+test_expect_success 'rev-list --ignore-missing-links works with missing tree' '
    -+	echo "foo" >main/file &&
    -+	git -C main add file &&
    -+	GIT_ALTERNATE_OBJECT_DIRECTORIES=$PWD/alt git -C main commit -m"commit 11" &&
    -+	TREE_OID=$(git -C main rev-parse HEAD^{tree}) &&
    -+	mkdir alt/${TREE_OID:0:2} &&
    -+	mv main/.git/objects/${TREE_OID:0:2}/${TREE_OID:2} alt/${TREE_OID:0:2}/ &&
    -+	git -C main rev-list --ignore-missing-links --objects HEAD >actual &&
    -+	! grep $TREE_OID actual
    ++test_expect_success "setup for rev-list --ignore-missing-links with missing objects" '
    ++	show_alternates &&
    ++	test_commit -C alt 11
     +'
     +
    -+# Similar to above, it should also work when a blob is missing.
    -+test_expect_success 'rev-list --ignore-missing-links works with missing blob' '
    -+	echo "bar" >main/file &&
    -+	git -C main add file &&
    -+	GIT_ALTERNATE_OBJECT_DIRECTORIES=$PWD/alt git -C main commit -m"commit 12" &&
    -+	BLOB_OID=$(git -C main rev-parse HEAD:file) &&
    -+	mkdir alt/${BLOB_OID:0:2} &&
    -+	mv main/.git/objects/${BLOB_OID:0:2}/${BLOB_OID:2} alt/${BLOB_OID:0:2}/ &&
    -+	git -C main rev-list --ignore-missing-links --objects HEAD >actual &&
    -+	! grep $BLOB_OID actual
    -+'
    ++for obj in "HEAD^{tree}" "HEAD:11.t"
    ++do
    ++	# The `--ignore-missing-links` option should ensure that git-rev-list(1)
    ++	# doesn't fail when used alongside `--objects` when a tree/blob is
    ++	# missing.
    ++	test_expect_success "rev-list --ignore-missing-links with missing $type" '
    ++		oid="$(git -C alt rev-parse $obj)" &&
    ++		path="alt/.git/objects/$(test_oid_to_path $oid)" &&
    ++
    ++		mv "$path" "$path.hidden" &&
    ++		test_when_finished "mv $path.hidden $path" &&
    ++
    ++		git -C alt rev-list --ignore-missing-links --objects HEAD \
    ++			>actual &&
    ++		! grep $oid actual
    ++       '
    ++done
     +
     +test_done


 Documentation/rev-list-options.txt |  9 +++
 builtin/rev-list.c                 |  3 +-
 revision.c                         |  2 +
 t/t6022-rev-list-alternates.sh     | 93 ++++++++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100755 t/t6022-rev-list-alternates.sh

Comments

Junio C Hamano Sept. 15, 2023, 6:54 p.m. UTC | #1
Karthik Nayak <karthik.188@gmail.com> writes:

> From: Karthik Nayak <karthik.188@gmail.com>
>
> The revision backend is used by multiple porcelain commands such as
> git-rev-list(1) and git-log(1). The backend currently supports ignoring
> missing links by setting the `ignore_missing_links` bit. This allows the
> revision walk to skip any objects links which are missing. Expose this
> bit via an `--ignore-missing-links` user option.

Given the above "we merely surface a feature that already exists and
is supported to be used by the end users from the command line" claim ...

> diff --git a/builtin/rev-list.c b/builtin/rev-list.c
> index ff715d6918..5239d83c76 100644
> --- a/builtin/rev-list.c
> +++ b/builtin/rev-list.c
> @@ -266,7 +266,8 @@ static int finish_object(struct object *obj, const char *name UNUSED,
>  {
>  	struct rev_list_info *info = cb_data;
>  	if (oid_object_info_extended(the_repository, &obj->oid, NULL, 0) < 0) {
> -		finish_object__ma(obj);
> +		if (!info->revs->ignore_missing_links)
> +			finish_object__ma(obj);
>  		return 1;
>  	}

... this hunk is a bit unexpected.  As a low-level plumbing command,
shouldn't it be left to the user who gives --ignore-missing-links
from their command line to specify how the missing "obj" here should
be dealt with by giving the "--missing=<foo>" option?  While giving
"allow-promisor" may not make much sense, "--missing=allow-any" may
of course make sense (it is the same as hardcoding the decision not
to call finish_object__ma() at all), and so may "--missing=print".

Stepping back a bit, with "--missing=print", is this change still
needed?  The missing objects discovered will be shown at the end,
with the setting, no?

Thanks.
Karthik Nayak Sept. 18, 2023, 10:12 a.m. UTC | #2
On Fri, Sep 15, 2023 at 8:54 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Karthik Nayak <karthik.188@gmail.com> writes:
>
> > From: Karthik Nayak <karthik.188@gmail.com>
> >
> > The revision backend is used by multiple porcelain commands such as
> > git-rev-list(1) and git-log(1). The backend currently supports ignoring
> > missing links by setting the `ignore_missing_links` bit. This allows the
> > revision walk to skip any objects links which are missing. Expose this
> > bit via an `--ignore-missing-links` user option.
>
> Given the above "we merely surface a feature that already exists and
> supported to be used by the end users from the command line" claim ...
>
> > diff --git a/builtin/rev-list.c b/builtin/rev-list.c
> > index ff715d6918..5239d83c76 100644
> > --- a/builtin/rev-list.c
> > +++ b/builtin/rev-list.c
> > @@ -266,7 +266,8 @@ static int finish_object(struct object *obj, const char *name UNUSED,
> >  {
> >       struct rev_list_info *info = cb_data;
> >       if (oid_object_info_extended(the_repository, &obj->oid, NULL, 0) < 0) {
> > -             finish_object__ma(obj);
> > +             if (!info->revs->ignore_missing_links)
> > +                     finish_object__ma(obj);
> >               return 1;
> >       }
>
> ... this hunk is a bit unexpected.  As a low-level plumbing command,
> shouldn't it be left to the user who gives --ignore-missing-links
> from their command line to specify how the missing "obj" here should
> be dealt with by giving the "--missing=<foo>" option?  While giving
> "allow-promisor" may not make much sense, "--missing=allow-any" may
> of course make sense (it is the same as hardcoding the decision not
> to call finish_object__ma() at all), and so may "--missing=print".
>

This is to be expected, in my opinion. In terms of revision.c and setting the
`revs->ignore_missing_links` bit, the traversal will go through all
objects (commits
and otherwise) and call `show_commit` or `show_object` on them.

Here there is a difference for commits and non-commit objects.
1. Commit objects: commits are parsed in revision.c and after that the
`show_commit`
function is called only when the object is available.
2. Non-commit objects: while trees are parsed in revision.c, blobs are
never parsed and
hence, ` show_object` can be called on missing blobs. This is left to
the user to handle. In
our case, we error out in `rev-list.c`, which is not what we want when using the
`--ignore-missing-links` option. Hence, this addition.

There is an argument to be made around compatibility between the
`--missing` option
and `--ignore-missing-links` option, but since the former only works
with non-commit objects
I think the latter should be independent, and also the latter is about
ignoring all missing links.
I also don't think the user should again specify what to do with
missing links by adding
`--missing=allow-any` as `--ignore-missing-links` is a superset of it.

> Stepping back a bit, with "--missing=print", is this change still
> needed?  The missing objects discovered will be shown at the end,
> with the setting, no?
>

The main difference is that the `--missing` option works entirely
with non-commit
objects (I'm assuming this was built with promisor notes in mind). So
if a commit is
missing, git-rev-list(1) will still barf an error, but this error
handling is not in
`builtin/rev-list.c` rather is in a layer above in `revision.c`. So it
wouldn't be trivial for
the `--missing` option to support missing commit links. So that's why we expose
`--ignore-missing-links`, which ensures that any kind of missing object
(commits included) is ignored.

Thanks for the review!
Junio C Hamano Sept. 18, 2023, 3:56 p.m. UTC | #3
Karthik Nayak <karthik.188@gmail.com> writes:

>> Given the above "we merely surface a feature that already exists and
>> supported to be used by the end users from the command line" claim ...
>>
>> > diff --git a/builtin/rev-list.c b/builtin/rev-list.c
>> > index ff715d6918..5239d83c76 100644
>> > --- a/builtin/rev-list.c
>> > +++ b/builtin/rev-list.c
>> > @@ -266,7 +266,8 @@ static int finish_object(struct object *obj, const char *name UNUSED,
>> >  {
>> >       struct rev_list_info *info = cb_data;
>> >       if (oid_object_info_extended(the_repository, &obj->oid, NULL, 0) < 0) {
>> > -             finish_object__ma(obj);
>> > +             if (!info->revs->ignore_missing_links)
>> > +                     finish_object__ma(obj);
>> >               return 1;
>> >       }
>>
>> ... this hunk is a bit unexpected.  As a low-level plumbing command,
>> shouldn't it be left to the user who gives --ignore-missing-links
>> from their command line to specify how the missing "obj" here should
>> be dealt with by giving the "--missing=<foo>" option?  While giving
>> "allow-promisor" may not make much sense, "--missing=allow-any" may
>> of course make sense (it is the same as hardcoding the decision not
>> to call finish_object__ma() at all), and so may "--missing=print".
>>
>
> This is to be expected, in my opinion. In terms of revision.c and
> setting the `revs->ignore_missing_links` bit, the traversal will
> go throw all objects (commits and otherwise) and call
> `show_commit` or `show_object` on them.

Yes.  And the user can choose how to handle such an object here by
telling finish_object__ma() with the --missing=<how> option, so
letting them do so, instead of robbing the choice from them, would
be a more flexible design here, right?

> if a commit is
> missing, git-rev-list(1) will still barf an error, but this error

OK, yeah, I do see the need for setting the ignore-missing-links bit
for what you are doing, and --missing and --ignore-missing-links are
orthogonal options.  Getting rid of the hardcoded skipping of
finish_object__ma() would make sense from this angle, too.

Thanks.
Karthik Nayak Sept. 19, 2023, 8:45 a.m. UTC | #4
On Mon, Sep 18, 2023 at 5:56 PM Junio C Hamano <gitster@pobox.com> wrote:
> Karthik Nayak <karthik.188@gmail.com> writes:
> > This is to be expected, in my opinion. In terms of revision.c and
> > setting the `revs->ignore_missing_links` bit, the traversal will
> > go throw all objects (commits and otherwise) and call
> > `show_commit` or `show_object` on them.
>
> Yes.  And the user can choose how to handle such an object here by
> telling finish_object__ma() with the --missing=<how> option, so
> letting them do so, instead of robbing the choice from them, would
> be a more flexible design here, right?
>
> > if a commit is
> > missing, git-rev-list(1) will still barf an error, but this error
>
> OK, yeah, I do see the need for setting the ignore-missing-links bit
> for what you are doing, and --missing and --ignore-missing-links are
> orthogonal options.  Getting rid of the hardcoded skipping of
> finish_object__ma() would make sense from this angle, too.

Well. The only problem is that setting `ignore_missing_links` bit never calls
`show_commit` for missing commits (since commits are pre-parsed in revision.c).
So to keep that behavior consistent for non-commit objects, I hardcoded the
skipping of `finish_object__ma()` in `show_object`.

If I remove the hardcoding, it would mean that `--ignore-missing-links` would
skip missing commits but for non-commit objects, the user would have to pass
`--missing=allow-any` else rev-list would still error out with a
missing object error.

Don't you think this would be confusing for the user?
I'm happy to send a revised version removing this hardcoding if you still think
otherwise :)
Junio C Hamano Sept. 19, 2023, 3:13 p.m. UTC | #5
Karthik Nayak <karthik.188@gmail.com> writes:

> If I remove the hardcoding, it would mean that
> `--ignore-missing-links` would skip missing commits but for
> non-commits objects, the user would have to pass
> `--missing=allow-any` else rev-list would still error out with a
> missing object error.
>
> Don't you think this would be confusing for the user?  I'm happy
> to send a revised version removing this hardcoding if you still
> think otherwise :)

Yes.  This is an example of flexibility and ergonomics at odds, and
for a low-level plumbing like rev-list, I would prefer not to limit
the flexibility unnecessarily.

I do not care about the ability to pass allow-any here.  But when
you traverse a range A..B with the --ignore-missing-links option,
the reporting mechanism based on the --boundary cannot tell which
ones are at the usual "traversal boundaries" and which ones are ones
beyond the broken links, can it?  If you allowed the users to pass
'print', then those reported with '?' prefix would be the missing
ones.  The ones that are reported with '-' prefix may still be
mixture of the two kinds, but you can now subtract one set from the
other set to see which ones are true boundaries and which ones are
missing.  The hardcoded "we do not let __ma() logic to kick in"
makes it impossible, which is what I find disturbing.

Thanks.
diff mbox series

Patch

diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
index a4a0cb93b2..8ee713db3d 100644
--- a/Documentation/rev-list-options.txt
+++ b/Documentation/rev-list-options.txt
@@ -227,6 +227,15 @@  explicitly.
 	Upon seeing an invalid object name in the input, pretend as if
 	the bad input was not given.
 
+--ignore-missing-links::
+	During traversal, if an object that is referenced does not
+	exist, instead of dying of a repository corruption, pretend as
+	if the reference itself does not exist. Running the command
+	with the `--boundary` option makes these missing commits,
+	together with the commits on the edge of revision ranges
+	(i.e. true boundary objects), appear on the output, prefixed
+	with '-'.
+
 ifndef::git-rev-list[]
 --bisect::
 	Pretend as if the bad bisection ref `refs/bisect/bad`
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index ff715d6918..5239d83c76 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -266,7 +266,8 @@  static int finish_object(struct object *obj, const char *name UNUSED,
 {
 	struct rev_list_info *info = cb_data;
 	if (oid_object_info_extended(the_repository, &obj->oid, NULL, 0) < 0) {
-		finish_object__ma(obj);
+		if (!info->revs->ignore_missing_links)
+			finish_object__ma(obj);
 		return 1;
 	}
 	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
diff --git a/revision.c b/revision.c
index 2f4c53ea20..cbfcbf6e28 100644
--- a/revision.c
+++ b/revision.c
@@ -2595,6 +2595,8 @@  static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		revs->limited = 1;
 	} else if (!strcmp(arg, "--ignore-missing")) {
 		revs->ignore_missing = 1;
+	} else if (!strcmp(arg, "--ignore-missing-links")) {
+		revs->ignore_missing_links = 1;
 	} else if (opt && opt->allow_exclude_promisor_objects &&
 		   !strcmp(arg, "--exclude-promisor-objects")) {
 		if (fetch_if_missing)
diff --git a/t/t6022-rev-list-alternates.sh b/t/t6022-rev-list-alternates.sh
new file mode 100755
index 0000000000..567dd21876
--- /dev/null
+++ b/t/t6022-rev-list-alternates.sh
@@ -0,0 +1,93 @@ 
+#!/bin/sh
+
+test_description='handling of alternates in rev-list'
+
+TEST_PASSES_SANITIZE_LEAK=true
+. ./test-lib.sh
+
+# Create 5 commits in the top-level repository, clone it into "alt" so that
+# alt borrows those objects via alternates, then add 5 more commits to alt.
+test_expect_success 'create repository and alternate directory' '
+	test_commit_bulk 5 &&
+	git clone --reference=. --shared . alt &&
+	test_commit_bulk --start=6 -C alt 5
+'
+
+# With the alternate odb available, everything reachable from alt's refs is
+# listed: the objects borrowed from the alternate plus alt's own objects.
+test_expect_success 'rev-list passes with alternate object directory' '
+	git -C alt rev-list --all --objects --no-object-names >actual.raw &&
+	{
+		git rev-list --all --objects --no-object-names &&
+		git -C alt rev-list --all --objects --no-object-names --not \
+			--alternate-refs
+	} >expect.raw &&
+	sort actual.raw >actual &&
+	sort expect.raw >expect &&
+	test_cmp expect actual
+'
+
+alt=alt/.git/objects/info/alternates
+# Rename alt's alternates file to "$alt.bak"; a no-op if already renamed.
+hide_alternates () {
+	test -f "$alt.bak" || mv "$alt" "$alt.bak"
+}
+# Rename "$alt.bak" back to "$alt"; a no-op if "$alt" is already in place.
+show_alternates () {
+	test -f "$alt" || mv "$alt.bak" "$alt"
+}
+
+# When the alternates file is hidden, rev-list fails since the parent of the
+# oldest commit stored in alt's own odb can no longer be found.
+test_expect_success 'rev-list fails without alternate object directory' '
+	hide_alternates &&
+	test_must_fail git -C alt rev-list HEAD
+'
+
+# With `--ignore-missing-links`, traversal stops at a missing link, so the
+# output must be exactly the objects stored in alt's own odb (no boundary
+# commit is listed, as the `--boundary` option is not given).
+test_expect_success 'rev-list only prints main odb commits with --ignore-missing-links' '
+	hide_alternates &&
+
+	git -C alt rev-list --objects --no-object-names \
+		--ignore-missing-links HEAD >actual.raw &&
+	git -C alt cat-file --batch-check="%(objectname)" \
+		--batch-all-objects >expect.raw &&
+
+	sort actual.raw >actual &&
+	sort expect.raw >expect &&
+	test_cmp expect actual
+'
+
+# With `--boundary` added, the missing parent is printed with a "-" prefix.
+test_expect_success 'rev-list prints boundary commit with --ignore-missing-links' '
+	hide_alternates &&
+	git -C alt rev-list --ignore-missing-links --boundary HEAD >got &&
+	grep "^-$(git rev-parse HEAD)" got
+'
+
+test_expect_success "setup for rev-list --ignore-missing-links with missing objects" '
+	show_alternates &&
+	test_commit -C alt 11
+'
+
+for obj in "HEAD^{tree}" "HEAD:11.t"
+do
+	# The `--ignore-missing-links` option should ensure that git-rev-list(1)
+	# doesn't fail when used alongside `--objects` when a tree/blob is
+	# missing.
+	test_expect_success "rev-list --ignore-missing-links with missing $obj" '
+		oid="$(git -C alt rev-parse $obj)" &&
+		path="alt/.git/objects/$(test_oid_to_path $oid)" &&
+
+		mv "$path" "$path.hidden" &&
+		test_when_finished "mv $path.hidden $path" &&
+
+		git -C alt rev-list --ignore-missing-links --objects HEAD \
+			>actual &&
+		! grep $oid actual
+	'
+done
+
+test_done