diff mbox series

[v3] rev-list: exclude promisor objects at walk time

Message ID 9856e7fc74f51b60ae162cbed3f5c0cf8c603222.1554757275.git.steadmon@google.com (mailing list archive)
State New, archived
Headers show
Series [v3] rev-list: exclude promisor objects at walk time | expand

Commit Message

Josh Steadmon April 8, 2019, 9:06 p.m. UTC
For large repositories, enumerating the list of all promisor objects (in
order to exclude them from a rev-list walk) can take a significant
amount of time.

When --exclude-promisor-objects is passed to rev-list, don't enumerate
the promisor objects. Instead, filter them (and any children objects)
during the actual graph walk.

Remove the mark_uninteresting() function as it's not used anywhere else.

When testing against a large repo [1], this patch reduces the
connectivity check runtime from 3 minutes to ~7 seconds.

[1]: https://android.googlesource.com/platform/frameworks/base/

Helped-By: Jonathan Tan <jonathantanmy@google.com>
Helped-By: Jeff King <peff@peff.net>
Helped-By: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Josh Steadmon <steadmon@google.com>

Signed-off-by: Josh Steadmon <steadmon@google.com>
---
Changes since V2:
* Pulled the "OK to skip?" logic into a separate function.

Changes since V1:
* Switched to alternate approach, we now do the regular rev-list walk
  but skip promisor objects at walk time, rather than pre-excluding
  them.

Range-diff against v2:
1:  9f327d6d8d ! 1:  9856e7fc74 rev-list: exclude promisor objects at walk time
    @@ -10,9 +10,15 @@
         the promisor objects. Instead, filter them (and any children objects)
         during the actual graph walk.
     
    +    When testing against a large repo [1], this reduces the connectivity
    +    check runtime from 3 minutes to ~7 seconds.
    +
    +    [1]: https://android.googlesource.com/platform/frameworks/base/
    +
         Helped-By: Jonathan Tan <jonathantanmy@google.com>
         Helped-By: Jeff King <peff@peff.net>
         Helped-By: Jonathan Nieder <jrnieder@gmail.com>
    +    Signed-off-by: Josh Steadmon <steadmon@google.com>
     
     
    @@ -20,78 +26,55 @@
      --- a/list-objects.c
      +++ b/list-objects.c
     @@
    - 	struct object *obj = &blob->object;
    - 	size_t pathlen;
    - 	enum list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_DO_SHOW;
    -+	struct object_info oi = OBJECT_INFO_INIT;
    + 	void *filter_data;
    + };
      
    - 	if (!ctx->revs->blob_objects)
    - 		return;
    ++static int should_skip_promisor_object(const struct rev_info *revs,
    ++				       const struct object_id *oid)
    ++{
    ++	struct object_info oi = OBJECT_INFO_INIT;
    ++	return (revs->exclude_promisor_objects &&
    ++		!oid_object_info_extended(the_repository, oid, &oi, 0) &&
    ++		oi.whence == OI_PACKED &&
    ++		oi.u.packed.pack->pack_promisor);
    ++}
    ++
    + static void process_blob(struct traversal_context *ctx,
    + 			 struct blob *blob,
    + 			 struct strbuf *path,
     @@
      		die("bad blob object");
      	if (obj->flags & (UNINTERESTING | SEEN))
      		return;
    -+	if (ctx->revs->exclude_promisor_objects &&
    -+	    !oid_object_info_extended(the_repository, &obj->oid, &oi, 0) &&
    -+	    oi.whence == OI_PACKED &&
    -+	    oi.u.packed.pack->pack_promisor)
    ++	if (should_skip_promisor_object(ctx->revs, &obj->oid))
     +		return;
      
      	/*
      	 * Pre-filter known-missing objects when explicitly requested.
    -@@
    - 	int baselen = base->len;
    - 	enum list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_DO_SHOW;
    - 	int failed_parse;
    -+	struct object_info oi = OBJECT_INFO_INIT;
    - 
    - 	if (!revs->tree_objects)
    - 		return;
     @@
      		die("bad tree object");
      	if (obj->flags & (UNINTERESTING | SEEN))
      		return;
    -+	if (ctx->revs->exclude_promisor_objects &&
    -+	    !oid_object_info_extended(the_repository, &obj->oid, &oi, 0) &&
    -+	    oi.whence == OI_PACKED &&
    -+	    oi.u.packed.pack->pack_promisor)
    ++	if (should_skip_promisor_object(ctx->revs, &obj->oid))
     +		return;
      
      	failed_parse = parse_tree_gently(tree, 1);
      	if (failed_parse) {
    -@@
    - 				     struct strbuf *base)
    - {
    - 	int i;
    -+	struct object_info oi = OBJECT_INFO_INIT;
    - 
    - 	assert(base->len == 0);
    - 
     @@
      		struct object *obj = pending->item;
      		const char *name = pending->name;
      		const char *path = pending->path;
    -+		if (ctx->revs->exclude_promisor_objects &&
    -+		    !oid_object_info_extended(the_repository, &obj->oid, &oi, 0) &&
    -+		    oi.whence == OI_PACKED &&
    -+		    oi.u.packed.pack->pack_promisor)
    ++		if (should_skip_promisor_object(ctx->revs, &obj->oid))
     +			continue;
     +
      		if (obj->flags & (UNINTERESTING | SEEN))
      			continue;
      		if (obj->type == OBJ_TAG) {
     @@
    - {
    - 	struct commit *commit;
    - 	struct strbuf csp; /* callee's scratch pad */
    -+	struct object_info oi = OBJECT_INFO_INIT;
      	strbuf_init(&csp, PATH_MAX);
      
      	while ((commit = get_revision(ctx->revs)) != NULL) {
    -+		if (ctx->revs->exclude_promisor_objects &&
    -+		    !oid_object_info_extended(the_repository, &commit->object.oid, &oi, 0) &&
    -+		    oi.whence == OI_PACKED &&
    -+		    oi.u.packed.pack->pack_promisor)
    ++		if (should_skip_promisor_object(ctx->revs, &commit->object.oid))
     +			continue;
     +
      		/*

 list-objects.c | 20 ++++++++++++++++++++
 revision.c     | 16 ----------------
 2 files changed, 20 insertions(+), 16 deletions(-)

Comments

Christian Couder April 8, 2019, 10:23 p.m. UTC | #1
On Mon, Apr 8, 2019 at 11:46 PM Josh Steadmon <steadmon@google.com> wrote:
>
> Helped-By: Jonathan Tan <jonathantanmy@google.com>
> Helped-By: Jeff King <peff@peff.net>
> Helped-By: Jonathan Nieder <jrnieder@gmail.com>
> Signed-off-by: Josh Steadmon <steadmon@google.com>
>
> Signed-off-by: Josh Steadmon <steadmon@google.com>

Your S-o-B is duplicated and there is a spurious line between the
duplicated lines. Do you use an automated script/hook to add your
S-o-B?
Josh Steadmon April 8, 2019, 11:12 p.m. UTC | #2
On 2019.04.09 00:23, Christian Couder wrote:
> On Mon, Apr 8, 2019 at 11:46 PM Josh Steadmon <steadmon@google.com> wrote:
> >
> > Helped-By: Jonathan Tan <jonathantanmy@google.com>
> > Helped-By: Jeff King <peff@peff.net>
> > Helped-By: Jonathan Nieder <jrnieder@gmail.com>
> > Signed-off-by: Josh Steadmon <steadmon@google.com>
> >
> > Signed-off-by: Josh Steadmon <steadmon@google.com>
> 
> Your S-o-B is duplicated an there is a spurious line between the
> duplicated lines. Do you use an automated script/hook to add your
> S-o-B?

I only use the --signoff flag to git-format-patch. It looks like the
cause is that I have a hook to add a Gerrit Change-Id to my commit
messages, and that hook added a blank line followed by Change-Id:blah
after my original Signed-off-by line. Then git-format-patch added
another Signed-off-by after the Change-Id. The Change-Id line was then
stripped out by a wrapper around git-send-email.
Junio C Hamano April 9, 2019, 3:14 p.m. UTC | #3
Josh Steadmon <steadmon@google.com> writes:

> For large repositories, enumerating the list of all promisor objects (in
> order to exclude them from a rev-list walk) can take a significant
> amount of time).
>
> When --exclude-promisor-objects is passed to rev-list, don't enumerate
> the promisor objects. Instead, filter them (and any children objects)
> during the actual graph walk.
>
> Remove the mark_uninteresting() function as it's not used anywhere else.
>
> When testing against a large repo [1], this patch reduces the
> connectivity check runtime from 3 minutes to ~7 seconds.
>
> [1]: https://android.googlesource.com/platform/frameworks/base/
>
> Helped-By: Jonathan Tan <jonathantanmy@google.com>
> Helped-By: Jeff King <peff@peff.net>
> Helped-By: Jonathan Nieder <jrnieder@gmail.com>
> Signed-off-by: Josh Steadmon <steadmon@google.com>
>
> Signed-off-by: Josh Steadmon <steadmon@google.com>

I've dealt with the stray double-sign-off locally, but is there
anything else planned for v4 or later?  Is this a performance-only
change, or does it have an externally observable behaviour change
that we can easily add to our test suite?

>  list-objects.c | 20 ++++++++++++++++++++
>  revision.c     | 16 ----------------
>  2 files changed, 20 insertions(+), 16 deletions(-)
Jeff King April 9, 2019, 3:15 p.m. UTC | #4
On Wed, Apr 10, 2019 at 12:14:41AM +0900, Junio C Hamano wrote:

> Josh Steadmon <steadmon@google.com> writes:
> 
> > For large repositories, enumerating the list of all promisor objects (in
> > order to exclude them from a rev-list walk) can take a significant
> > amount of time).
> >
> > When --exclude-promisor-objects is passed to rev-list, don't enumerate
> > the promisor objects. Instead, filter them (and any children objects)
> > during the actual graph walk.
> >
> > Remove the mark_uninteresting() function as it's not used anywhere else.
> >
> > When testing against a large repo [1], this patch reduces the
> > connectivity check runtime from 3 minutes to ~7 seconds.
> >
> > [1]: https://android.googlesource.com/platform/frameworks/base/
> >
> > Helped-By: Jonathan Tan <jonathantanmy@google.com>
> > Helped-By: Jeff King <peff@peff.net>
> > Helped-By: Jonathan Nieder <jrnieder@gmail.com>
> > Signed-off-by: Josh Steadmon <steadmon@google.com>
> >
> > Signed-off-by: Josh Steadmon <steadmon@google.com>
> 
> I've dealt with the stray double-sign-off locally, but is there
> anything else planned for v4 or later?  Is this performance-only
> change, or does it have an externally observable behaviour change
> that we can easily add to our test suite?

I am OK if we do not include it, but even if this is "just" a
performance-only change, we can always add to our perf regression suite.

-Peff
Junio C Hamano April 9, 2019, 3:43 p.m. UTC | #5
Jeff King <peff@peff.net> writes:

> On Wed, Apr 10, 2019 at 12:14:41AM +0900, Junio C Hamano wrote:
>
>> I've dealt with the stray double-sign-off locally, but is there
>> anything else planned for v4 or later?  Is this performance-only
>> change, or does it have an externally observable behaviour change
>> that we can easily add to our test suite?
>
> I am OK if we do not include it, but even if this is "just" a
> performance-only change, we can always add to our perf regression suite.

Hmph, that does not say much about a possible change in behaviour in
corner cases you guys were discussing near the beginning of the
thread when an object can be reached from both a non-promisor and a
promisor object, does it?

Shouldn't we at least tweak the log message to record that we were
aware of the possibility even though we couldn't readily come up
with a case where this optimization breaks things?  I suspect that
it would help the next person who needs to deal with a possible
regression coming from this change to understand the problem better
and hopefully faster.
Josh Steadmon April 9, 2019, 4:35 p.m. UTC | #6
On 2019.04.10 00:43, Junio C Hamano wrote:
> Jeff King <peff@peff.net> writes:
> 
> > On Wed, Apr 10, 2019 at 12:14:41AM +0900, Junio C Hamano wrote:
> >
> >> I've dealt with the stray double-sign-off locally, but is there
> >> anything else planned for v4 or later?  Is this performance-only
> >> change, or does it have an externally observable behaviour change
> >> that we can easily add to our test suite?
> >
> > I am OK if we do not include it, but even if this is "just" a
> > performance-only change, we can always add to our perf regression suite.
> 
> Hmph, that does not say much about a possible change in behaviour in
> corner cases you guys were discuussing near the beginning of the
> thread when an object can be reached from both a non-promisor and a
> promisor object, does it?
> 
> Shouldn't we at least tweak the log message to record that we were
> aware of the possibility even though we couldn't readily come up
> with a case where this optimization breaks things?  I suspect that
> it would help the next person who needs to deal with a possible
> regression coming from this change to understand the problem better
> and hopefully faster.
> 

I'll update the log message and send a v4 in a few minutes.
SZEDER Gábor April 9, 2019, 6:04 p.m. UTC | #7
On Mon, Apr 08, 2019 at 02:06:04PM -0700, Josh Steadmon wrote:
> For large repositories, enumerating the list of all promisor objects (in
> order to exclude them from a rev-list walk) can take a significant
> amount of time).
> 
> When --exclude-promisor-objects is passed to rev-list, don't enumerate
> the promisor objects. Instead, filter them (and any children objects)
> during the actual graph walk.
> 
> Remove the mark_uninteresting() function as it's not used anywhere else.
> 
> When testing against a large repo [1], this patch reduces the
> connectivity check runtime from 3 minutes to ~7 seconds.

This patch breaks test 'repack -d does not irreversibly delete
promisor objects' in 't0410-partial-clone.sh' when run with
GIT_TEST_COMMIT_GRAPH=1.

  +rm -rf repo
  +test_create_repo repo
  +test 1 = 1
  +repo=repo
  +mkdir -p repo
  +cd repo
  +/home/travis/build/git/git/t/../git init --template=/home/travis/build/git/git/t/../templates/blt/
  Initialized empty Git repository in /home/travis/build/git/git/t/trash directory.t0410-partial-clone/repo/.git/
  +mv .git/hooks .git/hooks-disabled
  +git -C repo config core.repositoryformatversion 1
  +git -C repo config extensions.partialclone arbitrary string
  +git -C repo commit --allow-empty -m one
  [master (root-commit) 71905df] one
   Author: A U Thor <author@example.com>
  +git -C repo commit --allow-empty -m two
  [master 202c4a3] two
   Author: A U Thor <author@example.com>
  +git -C repo commit --allow-empty -m three
  [master 4737577] three
   Author: A U Thor <author@example.com>
  +git -C repo commit --allow-empty -m four
  [master d6ba7e0] four
   Author: A U Thor <author@example.com>
  +git -C repo rev-parse HEAD^^^
  +ONE=71905dfcd543b7cbb0b4b66fbd20379e67220557
  +git -C repo rev-parse HEAD^^
  +TWO=202c4a3dd9a2dac927f056abb747cce9ea2eb67b
  +git -C repo rev-parse HEAD^
  +THREE=47375779ebcca4b422e3afdd14aa37a358081297
  +pack_as_from_promisor
  +printf 202c4a3dd9a2dac927f056abb747cce9ea2eb67b\n
  +git -C repo pack-objects .git/objects/pack/pack
  +HASH=2e675cd706e508d6c52a21d28cfcddde5ec02a06
  +
  +echo 2e675cd706e508d6c52a21d28cfcddde5ec02a06
  2e675cd706e508d6c52a21d28cfcddde5ec02a06
  +printf 47375779ebcca4b422e3afdd14aa37a358081297\n
  +pack_as_from_promisor
  +git -C repo pack-objects .git/objects/pack/pack
  +HASH=31f7d2797549ab9b1c425a9e60eb2030481170e5
  +
  +echo 31f7d2797549ab9b1c425a9e60eb2030481170e5
  31f7d2797549ab9b1c425a9e60eb2030481170e5
  +delete_object repo 71905dfcd543b7cbb0b4b66fbd20379e67220557
  +sed -e s|^..|&/|
  +echo 71905dfcd543b7cbb0b4b66fbd20379e67220557
  +rm repo/.git/objects/71/905dfcd543b7cbb0b4b66fbd20379e67220557
  +repack_and_check -a 202c4a3dd9a2dac927f056abb747cce9ea2eb67b 47375779ebcca4b422e3afdd14aa37a358081297
  +rm -rf repo2
  +cp -r repo repo2
  +git -C repo2 repack -a -d
  warning: reflog of 'HEAD' references pruned commits
  warning: reflog of 'refs/heads/master' references pruned commits
  fatal: unable to read 71905dfcd543b7cbb0b4b66fbd20379e67220557
  error: last command exited with $?=128
  not ok 23 - repack -d does not irreversibly delete promisor objects


  https://travis-ci.org/git/git/jobs/517874310#L5822
Josh Steadmon April 9, 2019, 11:42 p.m. UTC | #8
On 2019.04.09 20:04, SZEDER Gábor wrote:
> On Mon, Apr 08, 2019 at 02:06:04PM -0700, Josh Steadmon wrote:
> > For large repositories, enumerating the list of all promisor objects (in
> > order to exclude them from a rev-list walk) can take a significant
> > amount of time).
> > 
> > When --exclude-promisor-objects is passed to rev-list, don't enumerate
> > the promisor objects. Instead, filter them (and any children objects)
> > during the actual graph walk.
> > 
> > Remove the mark_uninteresting() function as it's not used anywhere else.
> > 
> > When testing against a large repo [1], this patch reduces the
> > connectivity check runtime from 3 minutes to ~7 seconds.
> 
> This patch breaks test 'repack -d does not irreversibly delete
> promisor objects' in 't0410-partial-clone.sh' when run with
> GIT_TEST_COMMIT_GRAPH=1.
> 
>   +rm -rf repo
>   +test_create_repo repo
>   +test 1 = 1
>   +repo=repo
>   +mkdir -p repo
>   +cd repo
>   +/home/travis/build/git/git/t/../git init --template=/home/travis/build/git/git/t/../templates/blt/
>   Initialized empty Git repository in /home/travis/build/git/git/t/trash directory.t0410-partial-clone/repo/.git/
>   +mv .git/hooks .git/hooks-disabled
>   +git -C repo config core.repositoryformatversion 1
>   +git -C repo config extensions.partialclone arbitrary string
>   +git -C repo commit --allow-empty -m one
>   [master (root-commit) 71905df] one
>    Author: A U Thor <author@example.com>
>   +git -C repo commit --allow-empty -m two
>   [master 202c4a3] two
>    Author: A U Thor <author@example.com>
>   +git -C repo commit --allow-empty -m three
>   [master 4737577] three
>    Author: A U Thor <author@example.com>
>   +git -C repo commit --allow-empty -m four
>   [master d6ba7e0] four
>    Author: A U Thor <author@example.com>
>   +git -C repo rev-parse HEAD^^^
>   +ONE=71905dfcd543b7cbb0b4b66fbd20379e67220557
>   +git -C repo rev-parse HEAD^^
>   +TWO=202c4a3dd9a2dac927f056abb747cce9ea2eb67b
>   +git -C repo rev-parse HEAD^
>   +THREE=47375779ebcca4b422e3afdd14aa37a358081297
>   +pack_as_from_promisor
>   +printf 202c4a3dd9a2dac927f056abb747cce9ea2eb67b\n
>   +git -C repo pack-objects .git/objects/pack/pack
>   +HASH=2e675cd706e508d6c52a21d28cfcddde5ec02a06
>   +
>   +echo 2e675cd706e508d6c52a21d28cfcddde5ec02a06
>   2e675cd706e508d6c52a21d28cfcddde5ec02a06
>   +printf 47375779ebcca4b422e3afdd14aa37a358081297\n
>   +pack_as_from_promisor
>   +git -C repo pack-objects .git/objects/pack/pack
>   +HASH=31f7d2797549ab9b1c425a9e60eb2030481170e5
>   +
>   +echo 31f7d2797549ab9b1c425a9e60eb2030481170e5
>   31f7d2797549ab9b1c425a9e60eb2030481170e5
>   +delete_object repo 71905dfcd543b7cbb0b4b66fbd20379e67220557
>   +sed -e s|^..|&/|
>   +echo 71905dfcd543b7cbb0b4b66fbd20379e67220557
>   +rm repo/.git/objects/71/905dfcd543b7cbb0b4b66fbd20379e67220557
>   +repack_and_check -a 202c4a3dd9a2dac927f056abb747cce9ea2eb67b 47375779ebcca4b422e3afdd14aa37a358081297
>   +rm -rf repo2
>   +cp -r repo repo2
>   +git -C repo2 repack -a -d
>   warning: reflog of 'HEAD' references pruned commits
>   warning: reflog of 'refs/heads/master' references pruned commits
>   fatal: unable to read 71905dfcd543b7cbb0b4b66fbd20379e67220557
>   error: last command exited with $?=128
>   not ok 23 - repack -d does not irreversibly delete promisor objects
> 
> 
>   https://travis-ci.org/git/git/jobs/517874310#L5822
> 

Thank you for catching this. I haven't yet figured out the cause. I will
look into this more tomorrow and then send a V4 once I've fixed it.
Jeff King April 11, 2019, 4:06 a.m. UTC | #9
On Tue, Apr 09, 2019 at 04:42:55PM -0700, Josh Steadmon wrote:

> >   warning: reflog of 'HEAD' references pruned commits
> >   warning: reflog of 'refs/heads/master' references pruned commits
> >   fatal: unable to read 71905dfcd543b7cbb0b4b66fbd20379e67220557
> >   error: last command exited with $?=128
> >   not ok 23 - repack -d does not irreversibly delete promisor objects
> > 
> 
> Thank you for catching this. I haven't yet figured out the cause. I will
> look into this more tomorrow and then send a V4 once I've fixed it.

I'm concerned that this is a sign that the approach I suggested does not
actually work everywhere. I.e., could this be a case where we have some
non-promisor object that points to a sub-object that is reachable from
the promisor pack, but not a direct tip? Before your patch we'd consider
that sub-object a promisor (because we enumerate all of the graph that
we do have and mark each such object), but afterwards we would not.

And I wonder if that confuses pack-objects. Though I think it would
confuse it in the _opposite_ direction. I.e., using
--exclude-promisor-objects would count such an object as not-a-promisor
and would be more inclined to include it in the new pack.

It is curious that this only turns up with GIT_TEST_COMMIT_GRAPH=1, too.
It seems like any such problem ought to be independent of that.

Puzzling...

-Peff
Josh Steadmon April 12, 2019, 10:38 p.m. UTC | #10
On 2019.04.11 00:06, Jeff King wrote:
> On Tue, Apr 09, 2019 at 04:42:55PM -0700, Josh Steadmon wrote:
> 
> > >   warning: reflog of 'HEAD' references pruned commits
> > >   warning: reflog of 'refs/heads/master' references pruned commits
> > >   fatal: unable to read 71905dfcd543b7cbb0b4b66fbd20379e67220557
> > >   error: last command exited with $?=128
> > >   not ok 23 - repack -d does not irreversibly delete promisor objects
> > > 
> > 
> > Thank you for catching this. I haven't yet figured out the cause. I will
> > look into this more tomorrow and then send a V4 once I've fixed it.
> 
> I'm concerned that this is a sign that the approach I suggested does not
> actually work everywhere. I.e., could this be a case where we have some
> non-promisor object that points to a sub-object that is reachable from
> the promisor pack, but not a direct tip? Before your patch we'd consider
> that sub-object a promisor (because we enumerate all of the graph that
> we do have and mark each such object), but afterwards we would not.
> 
> And I wonder if that confuses pack-objects. Though I think it would
> confuse it in the _opposite_ direction. I.e., using
> --exclude-promisor-objects would count such an object as not-a-promisor
> and would be more inclined to include it in the new pack.
> 
> It is curious that this only turns up with GIT_TEST_COMMIT_GRAPH=1, too.
> It seems like any such problem ought to be independent of that.
> 
> Puzzling...
> 
> -Peff

Do you think this justifies going back to the V1 approach (only checking
presence of objects pointed to by refs when doing a partial clone)?
Jeff King April 13, 2019, 5:34 a.m. UTC | #11
On Fri, Apr 12, 2019 at 03:38:47PM -0700, Josh Steadmon wrote:

> > > Thank you for catching this. I haven't yet figured out the cause. I will
> > > look into this more tomorrow and then send a V4 once I've fixed it.
> > 
> > I'm concerned that this is a sign that the approach I suggested does not
> > actually work everywhere. I.e., could this be a case where we have some
> > non-promisor object that points to a sub-object that is reachable from
> > the promisor pack, but not a direct tip? Before your patch we'd consider
> > that sub-object a promisor (because we enumerate all of the graph that
> > we do have and mark each such object), but afterwards we would not.
> > 
> > And I wonder if that confuses pack-objects. Though I think it would
> > confuse it in the _opposite_ direction. I.e., using
> > --exclude-promisor-objects would count such an object as not-a-promisor
> > and would be more inclined to include it in the new pack.
> > 
> > It is curious that this only turns up with GIT_TEST_COMMIT_GRAPH=1, too.
> > It seems like any such problem ought to be independent of that.
> > 
> > Puzzling...
> 
> Do you think this justifies going back to the V1 approach (only checking
> presence of objects pointed to by refs when doing a partial clone)?

Yes, I think it might. Especially coupled with your other report that
the V1 approach is 500ms compared to several seconds for this one. Which
I'd guess is probably because we actually parse the ref tip objects in
rev-list, whereas your V1 just skipped that step entirely (which is
perfectly fine for a clone, as we'd have just hashed the objects via
index-pack anyway).

It might be interesting to know if the problem is indeed insurmountable
with the V3 approach here, or if it's simply another bug. But diving
into it is going to be rather tricky, and I am not volunteering to do
it. :) So if you want to punt and go back to the more clearly correct V1
approach, I can live with that. We can always revisit this approach
later (it wouldn't be necessary for the clone case after your V1, but in
theory it could be helping other cases, too).

-Peff
Josh Steadmon April 19, 2019, 8:26 p.m. UTC | #12
On 2019.04.13 01:34, Jeff King wrote:
> On Fri, Apr 12, 2019 at 03:38:47PM -0700, Josh Steadmon wrote:
> 
> > > > Thank you for catching this. I haven't yet figured out the cause. I will
> > > > look into this more tomorrow and then send a V4 once I've fixed it.
> > > 
> > > I'm concerned that this is a sign that the approach I suggested does not
> > > actually work everywhere. I.e., could this be a case where we have some
> > > non-promisor object that points to a sub-object that is reachable from
> > > the promisor pack, but not a direct tip? Before your patch we'd consider
> > > that sub-object a promisor (because we enumerate all of the graph that
> > > we do have and mark each such object), but afterwards we would not.
> > > 
> > > And I wonder if that confuses pack-objects. Though I think it would
> > > confuse it in the _opposite_ direction. I.e., using
> > > --exclude-promisor-objects would count such an object as not-a-promisor
> > > and would be more inclined to include it in the new pack.
> > > 
> > > It is curious that this only turns up with GIT_TEST_COMMIT_GRAPH=1, too.
> > > It seems like any such problem ought to be independent of that.
> > > 
> > > Puzzling...
> > 
> > Do you think this justifies going back to the V1 approach (only checking
> > presence of objects pointed to by refs when doing a partial clone)?
> 
> Yes, I think it might. Especially coupled with your other report that
> the V1 approach is 500ms compared to several seconds for this one. Which
> I'd guess is probably because we actually parse the ref tip objects in
> rev-list, whereas your V1 just skipped that step entirely (which is
> perfectly fine for a clone, as we'd have just hashed the objects via
> index-pack anyway).
> 
> It might be interesting to know if the problem is indeed insurmountable
> with the V3 approach here, or if it's simply another bug. But diving
> into it is going to be rather tricky, and I am not volunteering to do
> it. :) So if you want to punt and go back to the more clearly correct V1
> approach, I can live with that. We can always revisit this approach
> later (it wouldn't be necessary for the clone case after your V1, but in
> theory it could be helping other cases, too).
> 
> -Peff

I have not made any progress in figuring out the repack + commit-graph
failure, so I will resend V1.
diff mbox series

Patch

diff --git a/list-objects.c b/list-objects.c
index dc77361e11..c153ee5dfb 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -22,6 +22,16 @@  struct traversal_context {
 	void *filter_data;
 };
 
+static int should_skip_promisor_object(const struct rev_info *revs,
+				       const struct object_id *oid)
+{
+	struct object_info oi = OBJECT_INFO_INIT;
+	return (revs->exclude_promisor_objects &&
+		!oid_object_info_extended(the_repository, oid, &oi, 0) &&
+		oi.whence == OI_PACKED &&
+		oi.u.packed.pack->pack_promisor);
+}
+
 static void process_blob(struct traversal_context *ctx,
 			 struct blob *blob,
 			 struct strbuf *path,
@@ -37,6 +47,8 @@  static void process_blob(struct traversal_context *ctx,
 		die("bad blob object");
 	if (obj->flags & (UNINTERESTING | SEEN))
 		return;
+	if (should_skip_promisor_object(ctx->revs, &obj->oid))
+		return;
 
 	/*
 	 * Pre-filter known-missing objects when explicitly requested.
@@ -156,6 +168,8 @@  static void process_tree(struct traversal_context *ctx,
 		die("bad tree object");
 	if (obj->flags & (UNINTERESTING | SEEN))
 		return;
+	if (should_skip_promisor_object(ctx->revs, &obj->oid))
+		return;
 
 	failed_parse = parse_tree_gently(tree, 1);
 	if (failed_parse) {
@@ -326,6 +340,9 @@  static void traverse_trees_and_blobs(struct traversal_context *ctx,
 		struct object *obj = pending->item;
 		const char *name = pending->name;
 		const char *path = pending->path;
+		if (should_skip_promisor_object(ctx->revs, &obj->oid))
+			continue;
+
 		if (obj->flags & (UNINTERESTING | SEEN))
 			continue;
 		if (obj->type == OBJ_TAG) {
@@ -356,6 +373,9 @@  static void do_traverse(struct traversal_context *ctx)
 	strbuf_init(&csp, PATH_MAX);
 
 	while ((commit = get_revision(ctx->revs)) != NULL) {
+		if (should_skip_promisor_object(ctx->revs, &commit->object.oid))
+			continue;
+
 		/*
 		 * an uninteresting boundary commit may not have its tree
 		 * parsed yet, but we are not going to show them anyway
diff --git a/revision.c b/revision.c
index eb8e51bc63..85974e941d 100644
--- a/revision.c
+++ b/revision.c
@@ -3067,17 +3067,6 @@  void reset_revision_walk(void)
 	clear_object_flags(SEEN | ADDED | SHOWN);
 }
 
-static int mark_uninteresting(const struct object_id *oid,
-			      struct packed_git *pack,
-			      uint32_t pos,
-			      void *cb)
-{
-	struct rev_info *revs = cb;
-	struct object *o = parse_object(revs->repo, oid);
-	o->flags |= UNINTERESTING | SEEN;
-	return 0;
-}
-
 define_commit_slab(indegree_slab, int);
 define_commit_slab(author_date_slab, timestamp_t);
 
@@ -3316,11 +3305,6 @@  int prepare_revision_walk(struct rev_info *revs)
 	    (revs->limited && limiting_can_increase_treesame(revs)))
 		revs->treesame.name = "treesame";
 
-	if (revs->exclude_promisor_objects) {
-		for_each_packed_object(mark_uninteresting, revs,
-				       FOR_EACH_OBJECT_PROMISOR_ONLY);
-	}
-
 	if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED)
 		commit_list_sort_by_date(&revs->commits);
 	if (revs->no_walk)