diff mbox series

[3/6] connected: refactor iterator to return next object ID directly

Message ID 3bdad7bc8b0debd44138a4d3df5744d5a245475d.1629452412.git.ps@pks.im (mailing list archive)
State Superseded
Headers show
Series Speed up mirror-fetches with many refs | expand

Commit Message

Patrick Steinhardt Aug. 20, 2021, 10:08 a.m. UTC
The object ID iterator used by the connectivity checks returns the next
object ID via an out-parameter and then uses a return code to indicate
whether an item was found. This is a bit roundabout: instead of a
separate error code, we can just retrun the next object ID directly and
use `NULL` pointers as indicator that the iterator got no items left.
Furthermore, this avoids a copy of the object ID.

Refactor the iterator and all its implementations to return object IDs
directly. While I was honestly hoping for a small speedup given that we
can now avoid a copy, both versions perform the same. Still, the end
result is easier to understand and thus it makes sense to keep this
refactoring regardless.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/clone.c        |  8 +++-----
 builtin/fetch.c        |  7 +++----
 builtin/receive-pack.c | 17 +++++++----------
 connected.c            | 15 ++++++++-------
 connected.h            |  2 +-
 fetch-pack.c           |  7 +++----
 6 files changed, 25 insertions(+), 31 deletions(-)

Comments

Derrick Stolee Aug. 20, 2021, 2:32 p.m. UTC | #1
On 8/20/2021 6:08 AM, Patrick Steinhardt wrote:
> The object ID iterator used by the connectivity checks returns the next
> object ID via an out-parameter and then uses a return code to indicate
> whether an item was found. This is a bit roundabout: instead of a
> separate error code, we can just retrun the next object ID directly and
> use `NULL` pointers as indicator that the iterator got no items left.
> Furthermore, this avoids a copy of the object ID.
> 
> Refactor the iterator and all its implementations to return object IDs
> directly. While I was honestly hoping for a small speedup given that we
> can now avoid a copy, both versions perform the same. Still, the end
> result is easier to understand and thus it makes sense to keep this
> refactoring regardless.

It's too bad about the lack of measurable performance gains, but the
new code _is_ doing less, it's just not enough.

I agree that the new code organization is better.

Thanks,
-Stolee
René Scharfe Aug. 20, 2021, 5:43 p.m. UTC | #2
Am 20.08.21 um 12:08 schrieb Patrick Steinhardt:
> The object ID iterator used by the connectivity checks returns the next
> object ID via an out-parameter and then uses a return code to indicate
> whether an item was found. This is a bit roundabout: instead of a
> separate error code, we can just retrun the next object ID directly and

s/retrun/return/

> use `NULL` pointers as indicator that the iterator got no items left.
> Furthermore, this avoids a copy of the object ID.
>
> Refactor the iterator and all its implementations to return object IDs
> directly. While I was honestly hoping for a small speedup given that we
> can now avoid a copy, both versions perform the same. Still, the end
> result is easier to understand and thus it makes sense to keep this
> refactoring regardless.

check_connected() calls find_pack_entry_one() on the object ID hash,
which copies it anyway.  Perhaps that and caching prevent the expected
speedup?

The private copy made sure check_connected() could not modify the
object IDs.  It still only reads them with this patch, but the compiler
no longer prevents writes.  The iterators could return const pointers
to restore that guarantee.

>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  builtin/clone.c        |  8 +++-----
>  builtin/fetch.c        |  7 +++----
>  builtin/receive-pack.c | 17 +++++++----------
>  connected.c            | 15 ++++++++-------
>  connected.h            |  2 +-
>  fetch-pack.c           |  7 +++----
>  6 files changed, 25 insertions(+), 31 deletions(-)
>
> diff --git a/builtin/clone.c b/builtin/clone.c
> index a74558f30c..817a651936 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -544,7 +544,7 @@ static void write_followtags(const struct ref *refs, const char *msg)
>  	}
>  }
>
> -static int iterate_ref_map(void *cb_data, struct object_id *oid)
> +static struct object_id *iterate_ref_map(void *cb_data)
>  {
>  	struct ref **rm = cb_data;
>  	struct ref *ref = *rm;
> @@ -555,13 +555,11 @@ static int iterate_ref_map(void *cb_data, struct object_id *oid)
>  	 */
>  	while (ref && !ref->peer_ref)
>  		ref = ref->next;
> -	/* Returning -1 notes "end of list" to the caller. */
>  	if (!ref)
> -		return -1;
> +		return NULL;
>
> -	oidcpy(oid, &ref->old_oid);
>  	*rm = ref->next;
> -	return 0;
> +	return &ref->old_oid;
>  }
>
>  static void update_remote_refs(const struct ref *refs,
> diff --git a/builtin/fetch.c b/builtin/fetch.c
> index 5fd0f7c791..76ce73ccf5 100644
> --- a/builtin/fetch.c
> +++ b/builtin/fetch.c
> @@ -962,7 +962,7 @@ static int update_local_ref(struct ref *ref,
>  	}
>  }
>
> -static int iterate_ref_map(void *cb_data, struct object_id *oid)
> +static struct object_id *iterate_ref_map(void *cb_data)
>  {
>  	struct ref **rm = cb_data;
>  	struct ref *ref = *rm;
> @@ -970,10 +970,9 @@ static int iterate_ref_map(void *cb_data, struct object_id *oid)
>  	while (ref && ref->status == REF_STATUS_REJECT_SHALLOW)
>  		ref = ref->next;
>  	if (!ref)
> -		return -1; /* end of the list */
> +		return NULL;
>  	*rm = ref->next;
> -	oidcpy(oid, &ref->old_oid);
> -	return 0;
> +	return &ref->old_oid;
>  }
>
>  struct fetch_head {
> diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
> index a419de5b38..0abda033bc 100644
> --- a/builtin/receive-pack.c
> +++ b/builtin/receive-pack.c
> @@ -1307,7 +1307,7 @@ static void refuse_unconfigured_deny_delete_current(void)
>  	rp_error("%s", _(refuse_unconfigured_deny_delete_current_msg));
>  }
>
> -static int command_singleton_iterator(void *cb_data, struct object_id *oid);
> +static struct object_id *command_singleton_iterator(void *cb_data);
>  static int update_shallow_ref(struct command *cmd, struct shallow_info *si)
>  {
>  	struct shallow_lock shallow_lock = SHALLOW_LOCK_INIT;
> @@ -1725,16 +1725,15 @@ static void check_aliased_updates(struct command *commands)
>  	string_list_clear(&ref_list, 0);
>  }
>
> -static int command_singleton_iterator(void *cb_data, struct object_id *oid)
> +static struct object_id *command_singleton_iterator(void *cb_data)
>  {
>  	struct command **cmd_list = cb_data;
>  	struct command *cmd = *cmd_list;
>
>  	if (!cmd || is_null_oid(&cmd->new_oid))
> -		return -1; /* end of list */
> +		return NULL;
>  	*cmd_list = NULL; /* this returns only one */
> -	oidcpy(oid, &cmd->new_oid);
> -	return 0;
> +	return &cmd->new_oid;
>  }
>
>  static void set_connectivity_errors(struct command *commands,
> @@ -1764,7 +1763,7 @@ struct iterate_data {
>  	struct shallow_info *si;
>  };
>
> -static int iterate_receive_command_list(void *cb_data, struct object_id *oid)
> +static struct object_id *iterate_receive_command_list(void *cb_data)
>  {
>  	struct iterate_data *data = cb_data;
>  	struct command **cmd_list = &data->cmds;
> @@ -1775,13 +1774,11 @@ static int iterate_receive_command_list(void *cb_data, struct object_id *oid)
>  			/* to be checked in update_shallow_ref() */
>  			continue;
>  		if (!is_null_oid(&cmd->new_oid) && !cmd->skip_update) {
> -			oidcpy(oid, &cmd->new_oid);
>  			*cmd_list = cmd->next;
> -			return 0;
> +			return &cmd->new_oid;
>  		}
>  	}
> -	*cmd_list = NULL;
> -	return -1; /* end of list */
> +	return NULL;
>  }
>
>  static void reject_updates_to_hidden(struct command *commands)
> diff --git a/connected.c b/connected.c
> index b5f9523a5f..374b145355 100644
> --- a/connected.c
> +++ b/connected.c
> @@ -24,7 +24,7 @@ int check_connected(oid_iterate_fn fn, void *cb_data,
>  	struct child_process rev_list = CHILD_PROCESS_INIT;
>  	FILE *rev_list_in;
>  	struct check_connected_options defaults = CHECK_CONNECTED_INIT;
> -	struct object_id oid;
> +	struct object_id *oid;
>  	int err = 0;
>  	struct packed_git *new_pack = NULL;
>  	struct transport *transport;
> @@ -34,7 +34,8 @@ int check_connected(oid_iterate_fn fn, void *cb_data,
>  		opt = &defaults;
>  	transport = opt->transport;
>
> -	if (fn(cb_data, &oid)) {
> +	oid = fn(cb_data);
> +	if (!oid) {
>  		if (opt->err_fd)
>  			close(opt->err_fd);
>  		return err;
> @@ -73,7 +74,7 @@ int check_connected(oid_iterate_fn fn, void *cb_data,
>  			for (p = get_all_packs(the_repository); p; p = p->next) {
>  				if (!p->pack_promisor)
>  					continue;
> -				if (find_pack_entry_one(oid.hash, p))
> +				if (find_pack_entry_one(oid->hash, p))
>  					goto promisor_pack_found;
>  			}
>  			/*
> @@ -83,7 +84,7 @@ int check_connected(oid_iterate_fn fn, void *cb_data,
>  			goto no_promisor_pack_found;
>  promisor_pack_found:
>  			;
> -		} while (!fn(cb_data, &oid));
> +		} while ((oid = fn(cb_data)) != NULL);
>  		return 0;
>  	}
>
> @@ -133,12 +134,12 @@ int check_connected(oid_iterate_fn fn, void *cb_data,
>  		 * are sure the ref is good and not sending it to
>  		 * rev-list for verification.
>  		 */
> -		if (new_pack && find_pack_entry_one(oid.hash, new_pack))
> +		if (new_pack && find_pack_entry_one(oid->hash, new_pack))
>  			continue;
>
> -		if (fprintf(rev_list_in, "%s\n", oid_to_hex(&oid)) < 0)
> +		if (fprintf(rev_list_in, "%s\n", oid_to_hex(oid)) < 0)
>  			break;
> -	} while (!fn(cb_data, &oid));
> +	} while ((oid = fn(cb_data)) != NULL);
>
>  	if (ferror(rev_list_in) || fflush(rev_list_in)) {
>  		if (errno != EPIPE && errno != EINVAL)
> diff --git a/connected.h b/connected.h
> index 8d5a6b3ad6..56cc95be2d 100644
> --- a/connected.h
> +++ b/connected.h
> @@ -9,7 +9,7 @@ struct transport;
>   * When called after returning the name for the last object, return -1
>   * to signal EOF, otherwise return 0.
>   */
> -typedef int (*oid_iterate_fn)(void *, struct object_id *oid);
> +typedef struct object_id *(*oid_iterate_fn)(void *);

I.e. this would be safer (requires some more const modifiers throughout
the code):

typedef const struct object_id * (*oid_iterate_fn)(void *);

>
>  /*
>   * Named-arguments struct for check_connected. All arguments are
> diff --git a/fetch-pack.c b/fetch-pack.c
> index 0bf7ed7e47..1a6242cd71 100644
> --- a/fetch-pack.c
> +++ b/fetch-pack.c
> @@ -1912,16 +1912,15 @@ static void update_shallow(struct fetch_pack_args *args,
>  	oid_array_clear(&ref);
>  }
>
> -static int iterate_ref_map(void *cb_data, struct object_id *oid)
> +static struct object_id *iterate_ref_map(void *cb_data)
>  {
>  	struct ref **rm = cb_data;
>  	struct ref *ref = *rm;
>
>  	if (!ref)
> -		return -1; /* end of the list */
> +		return NULL;
>  	*rm = ref->next;
> -	oidcpy(oid, &ref->old_oid);
> -	return 0;
> +	return &ref->old_oid;
>  }
>
>  struct ref *fetch_pack(struct fetch_pack_args *args,
>
Junio C Hamano Aug. 20, 2021, 5:43 p.m. UTC | #3
Derrick Stolee <stolee@gmail.com> writes:

> On 8/20/2021 6:08 AM, Patrick Steinhardt wrote:
>> The object ID iterator used by the connectivity checks returns the next
>> object ID via an out-parameter and then uses a return code to indicate
>> whether an item was found. This is a bit roundabout: instead of a
>> separate error code, we can just retrun the next object ID directly and
>> use `NULL` pointers as indicator that the iterator got no items left.
>> Furthermore, this avoids a copy of the object ID.
>> 
>> Refactor the iterator and all its implementations to return object IDs
>> directly. While I was honestly hoping for a small speedup given that we
>> can now avoid a copy, both versions perform the same. Still, the end
>> result is easier to understand and thus it makes sense to keep this
>> refactoring regardless.
>
> It's too bad about the lack of measurable performance gains, but the
> new code _is_ doing less, it's just not enough.
>
> I agree that the new code organization is better.

The only potential downside I can think of is the loss of ability to
convey the reason why it failed to return one by adding new return
codes from the function, which I do not immediately see all that
useful future extension anyway, so I agree that "we find what we
found, or NULL if we don't find" is much straight-forward and easier
to understand.

Nicely done.

Thanks.
Patrick Steinhardt Aug. 23, 2021, 6:47 a.m. UTC | #4
On Fri, Aug 20, 2021 at 07:43:06PM +0200, René Scharfe wrote:
> Am 20.08.21 um 12:08 schrieb Patrick Steinhardt:
> > The object ID iterator used by the connectivity checks returns the next
> > object ID via an out-parameter and then uses a return code to indicate
> > whether an item was found. This is a bit roundabout: instead of a
> > separate error code, we can just retrun the next object ID directly and
> 
> s/retrun/return/
> 
> > use `NULL` pointers as indicator that the iterator got no items left.
> > Furthermore, this avoids a copy of the object ID.
> >
> > Refactor the iterator and all its implementations to return object IDs
> > directly. While I was honestly hoping for a small speedup given that we
> > can now avoid a copy, both versions perform the same. Still, the end
> > result is easier to understand and thus it makes sense to keep this
> > refactoring regardless.
> 
> check_connected() calls find_pack_entry_one() on the object ID hash,
> which copies it anyway.  Perhaps that and caching prevent the expected
> speedup?
> 
> The private copy made sure check_connected() could not modify the
> object IDs.  It still only reads them with this patch, but the compiler
> no longer prevents writes.  The iterators could return const pointers
> to restore that guarantee.

Right, will change the signature and re-benchmark. I'd be surprised if
it significantly changed the picture, but let's see.

Patrick
diff mbox series

Patch

diff --git a/builtin/clone.c b/builtin/clone.c
index a74558f30c..817a651936 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -544,7 +544,7 @@  static void write_followtags(const struct ref *refs, const char *msg)
 	}
 }
 
-static int iterate_ref_map(void *cb_data, struct object_id *oid)
+static struct object_id *iterate_ref_map(void *cb_data)
 {
 	struct ref **rm = cb_data;
 	struct ref *ref = *rm;
@@ -555,13 +555,11 @@  static int iterate_ref_map(void *cb_data, struct object_id *oid)
 	 */
 	while (ref && !ref->peer_ref)
 		ref = ref->next;
-	/* Returning -1 notes "end of list" to the caller. */
 	if (!ref)
-		return -1;
+		return NULL;
 
-	oidcpy(oid, &ref->old_oid);
 	*rm = ref->next;
-	return 0;
+	return &ref->old_oid;
 }
 
 static void update_remote_refs(const struct ref *refs,
diff --git a/builtin/fetch.c b/builtin/fetch.c
index 5fd0f7c791..76ce73ccf5 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -962,7 +962,7 @@  static int update_local_ref(struct ref *ref,
 	}
 }
 
-static int iterate_ref_map(void *cb_data, struct object_id *oid)
+static struct object_id *iterate_ref_map(void *cb_data)
 {
 	struct ref **rm = cb_data;
 	struct ref *ref = *rm;
@@ -970,10 +970,9 @@  static int iterate_ref_map(void *cb_data, struct object_id *oid)
 	while (ref && ref->status == REF_STATUS_REJECT_SHALLOW)
 		ref = ref->next;
 	if (!ref)
-		return -1; /* end of the list */
+		return NULL;
 	*rm = ref->next;
-	oidcpy(oid, &ref->old_oid);
-	return 0;
+	return &ref->old_oid;
 }
 
 struct fetch_head {
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index a419de5b38..0abda033bc 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -1307,7 +1307,7 @@  static void refuse_unconfigured_deny_delete_current(void)
 	rp_error("%s", _(refuse_unconfigured_deny_delete_current_msg));
 }
 
-static int command_singleton_iterator(void *cb_data, struct object_id *oid);
+static struct object_id *command_singleton_iterator(void *cb_data);
 static int update_shallow_ref(struct command *cmd, struct shallow_info *si)
 {
 	struct shallow_lock shallow_lock = SHALLOW_LOCK_INIT;
@@ -1725,16 +1725,15 @@  static void check_aliased_updates(struct command *commands)
 	string_list_clear(&ref_list, 0);
 }
 
-static int command_singleton_iterator(void *cb_data, struct object_id *oid)
+static struct object_id *command_singleton_iterator(void *cb_data)
 {
 	struct command **cmd_list = cb_data;
 	struct command *cmd = *cmd_list;
 
 	if (!cmd || is_null_oid(&cmd->new_oid))
-		return -1; /* end of list */
+		return NULL;
 	*cmd_list = NULL; /* this returns only one */
-	oidcpy(oid, &cmd->new_oid);
-	return 0;
+	return &cmd->new_oid;
 }
 
 static void set_connectivity_errors(struct command *commands,
@@ -1764,7 +1763,7 @@  struct iterate_data {
 	struct shallow_info *si;
 };
 
-static int iterate_receive_command_list(void *cb_data, struct object_id *oid)
+static struct object_id *iterate_receive_command_list(void *cb_data)
 {
 	struct iterate_data *data = cb_data;
 	struct command **cmd_list = &data->cmds;
@@ -1775,13 +1774,11 @@  static int iterate_receive_command_list(void *cb_data, struct object_id *oid)
 			/* to be checked in update_shallow_ref() */
 			continue;
 		if (!is_null_oid(&cmd->new_oid) && !cmd->skip_update) {
-			oidcpy(oid, &cmd->new_oid);
 			*cmd_list = cmd->next;
-			return 0;
+			return &cmd->new_oid;
 		}
 	}
-	*cmd_list = NULL;
-	return -1; /* end of list */
+	return NULL;
 }
 
 static void reject_updates_to_hidden(struct command *commands)
diff --git a/connected.c b/connected.c
index b5f9523a5f..374b145355 100644
--- a/connected.c
+++ b/connected.c
@@ -24,7 +24,7 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 	struct child_process rev_list = CHILD_PROCESS_INIT;
 	FILE *rev_list_in;
 	struct check_connected_options defaults = CHECK_CONNECTED_INIT;
-	struct object_id oid;
+	struct object_id *oid;
 	int err = 0;
 	struct packed_git *new_pack = NULL;
 	struct transport *transport;
@@ -34,7 +34,8 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 		opt = &defaults;
 	transport = opt->transport;
 
-	if (fn(cb_data, &oid)) {
+	oid = fn(cb_data);
+	if (!oid) {
 		if (opt->err_fd)
 			close(opt->err_fd);
 		return err;
@@ -73,7 +74,7 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 			for (p = get_all_packs(the_repository); p; p = p->next) {
 				if (!p->pack_promisor)
 					continue;
-				if (find_pack_entry_one(oid.hash, p))
+				if (find_pack_entry_one(oid->hash, p))
 					goto promisor_pack_found;
 			}
 			/*
@@ -83,7 +84,7 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 			goto no_promisor_pack_found;
 promisor_pack_found:
 			;
-		} while (!fn(cb_data, &oid));
+		} while ((oid = fn(cb_data)) != NULL);
 		return 0;
 	}
 
@@ -133,12 +134,12 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 		 * are sure the ref is good and not sending it to
 		 * rev-list for verification.
 		 */
-		if (new_pack && find_pack_entry_one(oid.hash, new_pack))
+		if (new_pack && find_pack_entry_one(oid->hash, new_pack))
 			continue;
 
-		if (fprintf(rev_list_in, "%s\n", oid_to_hex(&oid)) < 0)
+		if (fprintf(rev_list_in, "%s\n", oid_to_hex(oid)) < 0)
 			break;
-	} while (!fn(cb_data, &oid));
+	} while ((oid = fn(cb_data)) != NULL);
 
 	if (ferror(rev_list_in) || fflush(rev_list_in)) {
 		if (errno != EPIPE && errno != EINVAL)
diff --git a/connected.h b/connected.h
index 8d5a6b3ad6..56cc95be2d 100644
--- a/connected.h
+++ b/connected.h
@@ -9,7 +9,7 @@  struct transport;
  * When called after returning the name for the last object, return -1
  * to signal EOF, otherwise return 0.
  */
-typedef int (*oid_iterate_fn)(void *, struct object_id *oid);
+typedef struct object_id *(*oid_iterate_fn)(void *);
 
 /*
  * Named-arguments struct for check_connected. All arguments are
diff --git a/fetch-pack.c b/fetch-pack.c
index 0bf7ed7e47..1a6242cd71 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -1912,16 +1912,15 @@  static void update_shallow(struct fetch_pack_args *args,
 	oid_array_clear(&ref);
 }
 
-static int iterate_ref_map(void *cb_data, struct object_id *oid)
+static struct object_id *iterate_ref_map(void *cb_data)
 {
 	struct ref **rm = cb_data;
 	struct ref *ref = *rm;
 
 	if (!ref)
-		return -1; /* end of the list */
+		return NULL;
 	*rm = ref->next;
-	oidcpy(oid, &ref->old_oid);
-	return 0;
+	return &ref->old_oid;
 }
 
 struct ref *fetch_pack(struct fetch_pack_args *args,