diff mbox series

[v4,2/7] fetch-pack: add refetch

Message ID 03f0de3d28c00cbfda73f31a21e0fa8f8fe51742.1648476132.git.gitgitgadget@gmail.com (mailing list archive)
State Accepted
Commit 4dfd0925cbba78cc737e3af29faa5774bbc7b6a3
Headers show
Series fetch: add repair: full refetch without negotiation (was: "refiltering") | expand

Commit Message

Robert Coup March 28, 2022, 2:02 p.m. UTC
From: Robert Coup <robert@coup.net.nz>

Allow a "refetch" where the contents of the local object store are
ignored and a full fetch is performed, not attempting to find or
negotiate common commits with the remote.

A key use case is to apply a new partial clone blob/tree filter and
refetch all the associated matching content, which would otherwise not
be transferred when the commit objects are already present locally.

Signed-off-by: Robert Coup <robert@coup.net.nz>
---
 fetch-pack.c | 46 +++++++++++++++++++++++++++++-----------------
 fetch-pack.h |  1 +
 2 files changed, 30 insertions(+), 17 deletions(-)

Comments

Ævar Arnfjörð Bjarmason March 31, 2022, 3:09 p.m. UTC | #1
On Mon, Mar 28 2022, Robert Coup via GitGitGadget wrote:

> From: Robert Coup <robert@coup.net.nz>
>
> Allow a "refetch" where the contents of the local object store are
> ignored and a full fetch is performed, not attempting to find or
> negotiate common commits with the remote.
>
> A key use case is to apply a new partial clone blob/tree filter and
> refetch all the associated matching content, which would otherwise not
> be transferred when the commit objects are already present locally.

FWIW it's not clear to me re earlier comments on earlier iterations
whether the "noop fetch" is per-se wanted for this feature for some
reason, or if it's just much easier to implement than doing what I
suggested in:
https://lore.kernel.org/git/220228.86ee3m39jf.gmgdl@evledraar.gmail.com/

I don't think such a thing should hold this series up, but as it would
be a bit kinder to servers I think it's worth at least noting in the
commit message what's desired per-se here, v.s. what's just needed for
the convenience of implementation.

I.e. when this series was in an earlier iteration the scope was to
repair repository corruption, which I pointed out to you it really
couldn't do without more wider changes to the object store management,
and at that point having it be NOOP definitely makes sense. The object
lookups etc. take shortcuts that "fsck" wouldn't do, so we could be
negotiating on the basis of corrupt content.

But now that it's a "fetch what's missing" wouldn't it make more sense
to descend from our otherwise-negotiated tips, and find the OIDs that
are "complete", if any, and negotiate with those?

Which again, I think it's fine to say "yeah, that would be ideal, but
this is easier". I'm just checking if I'm missing some subtlety here...

> Signed-off-by: Robert Coup <robert@coup.net.nz>
> ---
>  fetch-pack.c | 46 +++++++++++++++++++++++++++++-----------------
>  fetch-pack.h |  1 +
>  2 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/fetch-pack.c b/fetch-pack.c
> index 87657907e78..4e1e88eea09 100644
> --- a/fetch-pack.c
> +++ b/fetch-pack.c
> @@ -312,19 +312,21 @@ static int find_common(struct fetch_negotiator *negotiator,
>  		const char *remote_hex;
>  		struct object *o;
>  
> -		/*
> -		 * If that object is complete (i.e. it is an ancestor of a
> -		 * local ref), we tell them we have it but do not have to
> -		 * tell them about its ancestors, which they already know
> -		 * about.
> -		 *
> -		 * We use lookup_object here because we are only
> -		 * interested in the case we *know* the object is
> -		 * reachable and we have already scanned it.
> -		 */
> -		if (((o = lookup_object(the_repository, remote)) != NULL) &&
> -				(o->flags & COMPLETE)) {
> -			continue;
> +		if (!args->refetch) {
> +			/*
> +			* If that object is complete (i.e. it is an ancestor of a
> +			* local ref), we tell them we have it but do not have to
> +			* tell them about its ancestors, which they already know
> +			* about.
> +			*
> +			* We use lookup_object here because we are only
> +			* interested in the case we *know* the object is
> +			* reachable and we have already scanned it.
> +			*/
> +			if (((o = lookup_object(the_repository, remote)) != NULL) &&
> +					(o->flags & COMPLETE)) {
> +				continue;
> +			}

nit: remove the {} here
nit: this double-indented if can just be one if-statement
nit: don't compare against NULL, use ! instead.
>  		}
>  
>  		remote_hex = oid_to_hex(remote);
> @@ -692,6 +694,9 @@ static void mark_complete_and_common_ref(struct fetch_negotiator *negotiator,
>  	int old_save_commit_buffer = save_commit_buffer;
>  	timestamp_t cutoff = 0;
>  
> +	if (args->refetch)
> +		return;
> +
>  	save_commit_buffer = 0;

nit: This function has only two callers, perhaps it's clearer to do do
this "early abort" in those calls?

>  	trace2_region_enter("fetch-pack", "parse_remote_refs_and_find_cutoff", NULL);
> @@ -1028,7 +1033,11 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args,
>  	struct fetch_negotiator *negotiator;
>  
>  	negotiator = &negotiator_alloc;
> -	fetch_negotiator_init(r, negotiator);
> +	if (args->refetch) {
> +		fetch_negotiator_init_noop(negotiator);
> +	} else {
> +		fetch_negotiator_init(r, negotiator);
> +	}

More needless {}

>  	sort_ref_list(&ref, ref_compare_name);
>  	QSORT(sought, nr_sought, cmp_ref_by_name);
> @@ -1121,7 +1130,7 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args,
>  
>  	mark_complete_and_common_ref(negotiator, args, &ref);
>  	filter_refs(args, &ref, sought, nr_sought);
> -	if (everything_local(args, &ref)) {
> +	if (!args->refetch && everything_local(args, &ref)) {
>  		packet_flush(fd[1]);
>  		goto all_done;
>  	}

Here everything_local() is doing what I suggested for
mark_complete_and_common_ref() above, i.e. we check args->refetch first.

> @@ -1587,7 +1596,10 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args,
>  	struct strvec index_pack_args = STRVEC_INIT;
>  
>  	negotiator = &negotiator_alloc;
> -	fetch_negotiator_init(r, negotiator);
> +	if (args->refetch)
> +		fetch_negotiator_init_noop(negotiator);
> +	else
> +		fetch_negotiator_init(r, negotiator);

This one doesn't have braces (good), unlike do_fetch_pack()
Robert Coup April 1, 2022, 10:26 a.m. UTC | #2
Hi Ævar,

< just when I was thinking I'm done... ;-) >

On Thu, 31 Mar 2022 at 16:17, Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
> FWIW it's not clear to me re earlier comments on earlier iterations
> whether the "noop fetch" is per-se wanted for this feature for some
> reason, or if it's just much easier to implement than doing what I
> suggested in:
> https://lore.kernel.org/git/220228.86ee3m39jf.gmgdl@evledraar.gmail.com/

The noop-fetch is an implementation detail IMO, and can be improved
if/when someone is motivated later.

> I don't think such a thing should hold this series up, but as it would
> be a bit kinder to servers I think it's worth at least noting in the
> commit message what's desired per-se here, v.s. what's just needed for
> the convenience of implementation.

Yes, doing some per-commit negotiation is conceptually nicer. The
user-facing alternative at this point is basically running "clone" in
some form; or temporarily moving aside the object DB and running
fetch. Both those (and the new approach) all end up putting the same
load on the server.

I can note it in the commit message.

> I.e. when this series was in an earlier iteration the scope was to
> repair repository corruption

That was never _my_ motivation, I was a bit eager in reflecting some
comments I received suggesting it would be useful for that as well.

> But now that it's a "fetch what's missing" wouldn't it make more sense
> to descend from our otherwise-negotiated tips, and find the OIDs that
> are "complete", if any, and negotiate with those?
>
> Which again, I think it's fine to say "yeah, that would be ideal, but
> this is easier". I'm just checking if I'm missing some subtlety here...

Yeah, that would be ideal, but this is easier. AFAICS I'm not putting
in place any barriers to making it smarter later.

> nit: remove the {} here
> nit: this double-indented if can just be one if-statement
> nit: don't compare against NULL, use ! instead.

Is the common interpretation for nits "Do another re-roll"; "If you're
doing another re-roll"; or "For future reference"?

> nit: This function has only two callers, perhaps it's clearer to do do
> this "early abort" in those calls?

I discussed similar in
https://lore.kernel.org/git/CACf-nVePhtm_HAzAKzcap0E8kiyyEJPY_+N+bbPcYPVUkjweFg@mail.gmail.com/
but yes, I think you're right about mark_complete_and_common_ref().
Will see what it looks like.

Thanks, Rob.
diff mbox series

Patch

diff --git a/fetch-pack.c b/fetch-pack.c
index 87657907e78..4e1e88eea09 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -312,19 +312,21 @@  static int find_common(struct fetch_negotiator *negotiator,
 		const char *remote_hex;
 		struct object *o;
 
-		/*
-		 * If that object is complete (i.e. it is an ancestor of a
-		 * local ref), we tell them we have it but do not have to
-		 * tell them about its ancestors, which they already know
-		 * about.
-		 *
-		 * We use lookup_object here because we are only
-		 * interested in the case we *know* the object is
-		 * reachable and we have already scanned it.
-		 */
-		if (((o = lookup_object(the_repository, remote)) != NULL) &&
-				(o->flags & COMPLETE)) {
-			continue;
+		if (!args->refetch) {
+			/*
+			* If that object is complete (i.e. it is an ancestor of a
+			* local ref), we tell them we have it but do not have to
+			* tell them about its ancestors, which they already know
+			* about.
+			*
+			* We use lookup_object here because we are only
+			* interested in the case we *know* the object is
+			* reachable and we have already scanned it.
+			*/
+			if (((o = lookup_object(the_repository, remote)) != NULL) &&
+					(o->flags & COMPLETE)) {
+				continue;
+			}
 		}
 
 		remote_hex = oid_to_hex(remote);
@@ -692,6 +694,9 @@  static void mark_complete_and_common_ref(struct fetch_negotiator *negotiator,
 	int old_save_commit_buffer = save_commit_buffer;
 	timestamp_t cutoff = 0;
 
+	if (args->refetch)
+		return;
+
 	save_commit_buffer = 0;
 
 	trace2_region_enter("fetch-pack", "parse_remote_refs_and_find_cutoff", NULL);
@@ -1028,7 +1033,11 @@  static struct ref *do_fetch_pack(struct fetch_pack_args *args,
 	struct fetch_negotiator *negotiator;
 
 	negotiator = &negotiator_alloc;
-	fetch_negotiator_init(r, negotiator);
+	if (args->refetch) {
+		fetch_negotiator_init_noop(negotiator);
+	} else {
+		fetch_negotiator_init(r, negotiator);
+	}
 
 	sort_ref_list(&ref, ref_compare_name);
 	QSORT(sought, nr_sought, cmp_ref_by_name);
@@ -1121,7 +1130,7 @@  static struct ref *do_fetch_pack(struct fetch_pack_args *args,
 
 	mark_complete_and_common_ref(negotiator, args, &ref);
 	filter_refs(args, &ref, sought, nr_sought);
-	if (everything_local(args, &ref)) {
+	if (!args->refetch && everything_local(args, &ref)) {
 		packet_flush(fd[1]);
 		goto all_done;
 	}
@@ -1587,7 +1596,10 @@  static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args,
 	struct strvec index_pack_args = STRVEC_INIT;
 
 	negotiator = &negotiator_alloc;
-	fetch_negotiator_init(r, negotiator);
+	if (args->refetch)
+		fetch_negotiator_init_noop(negotiator);
+	else
+		fetch_negotiator_init(r, negotiator);
 
 	packet_reader_init(&reader, fd[0], NULL, 0,
 			   PACKET_READ_CHOMP_NEWLINE |
@@ -1613,7 +1625,7 @@  static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args,
 			/* Filter 'ref' by 'sought' and those that aren't local */
 			mark_complete_and_common_ref(negotiator, args, &ref);
 			filter_refs(args, &ref, sought, nr_sought);
-			if (everything_local(args, &ref))
+			if (!args->refetch && everything_local(args, &ref))
 				state = FETCH_DONE;
 			else
 				state = FETCH_SEND_REQUEST;
diff --git a/fetch-pack.h b/fetch-pack.h
index 7f94a2a5831..8c7752fc821 100644
--- a/fetch-pack.h
+++ b/fetch-pack.h
@@ -42,6 +42,7 @@  struct fetch_pack_args {
 	unsigned update_shallow:1;
 	unsigned reject_shallow_remote:1;
 	unsigned deepen:1;
+	unsigned refetch:1;
 
 	/*
 	 * Indicate that the remote of this request is a promisor remote. The