diff mbox series

[1/1] fetch: Cache the want OIDs for faster lookup

Message ID 20190915211802.207715-2-masayasuzuki@google.com (mailing list archive)
State New, archived
Headers show
Series fetch: Cache the want OIDs for faster lookup | expand

Commit Message

Masaya Suzuki Sept. 15, 2019, 9:18 p.m. UTC
During git-fetch, the client checks if the advertised tags' OIDs are
already in the fetch request's want OID set. This check is done in a
linear scan. For a repository that has a lot of refs, repeating this
scan takes 15+ minutes. In order to speed this up, create a oid_set for
other refs' OIDs.

Signed-off-by: Masaya Suzuki <masayasuzuki@google.com>
---
 builtin/fetch.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

Comments

Derrick Stolee Sept. 16, 2019, 2:35 a.m. UTC | #1
On 9/15/2019 5:18 PM, Masaya Suzuki wrote:
> During git-fetch, the client checks if the advertised tags' OIDs are
> already in the fetch request's want OID set. This check is done in a
> linear scan. For a repository that has a lot of refs, repeating this
> scan takes 15+ minutes. In order to speed this up, create a oid_set for
> other refs' OIDs.

Good catch! Quadratic performance is never good.

The patch below looks like it works, but could you also share your
performance timings for the 15+ minute case after your patch is
applied?

Thanks,
-Stolee

> 
> Signed-off-by: Masaya Suzuki <masayasuzuki@google.com>
> ---
>  builtin/fetch.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/builtin/fetch.c b/builtin/fetch.c
> index 54d6b01892..51a276dfaa 100644
> --- a/builtin/fetch.c
> +++ b/builtin/fetch.c
> @@ -7,6 +7,7 @@
>  #include "refs.h"
>  #include "refspec.h"
>  #include "object-store.h"
> +#include "oidset.h"
>  #include "commit.h"
>  #include "builtin.h"
>  #include "string-list.h"
> @@ -243,15 +244,13 @@ static void add_merge_config(struct ref **head,
>  	}
>  }
>  
> -static int will_fetch(struct ref **head, const unsigned char *sha1)
> +static void create_fetch_oidset(struct ref **head, struct oidset *out)
>  {
>  	struct ref *rm = *head;
>  	while (rm) {
> -		if (hasheq(rm->old_oid.hash, sha1))
> -			return 1;
> +		oidset_insert(out, &rm->old_oid);
>  		rm = rm->next;
>  	}
> -	return 0;
>  }
>  
>  struct refname_hash_entry {
> @@ -317,6 +316,7 @@ static void find_non_local_tags(const struct ref *refs,
>  {
>  	struct hashmap existing_refs;
>  	struct hashmap remote_refs;
> +	struct oidset fetch_oids = OIDSET_INIT;
>  	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
>  	struct string_list_item *remote_ref_item;
>  	const struct ref *ref;
> @@ -324,6 +324,7 @@ static void find_non_local_tags(const struct ref *refs,
>  
>  	refname_hash_init(&existing_refs);
>  	refname_hash_init(&remote_refs);
> +	create_fetch_oidset(head, &fetch_oids);
>  
>  	for_each_ref(add_one_refname, &existing_refs);
>  	for (ref = refs; ref; ref = ref->next) {
> @@ -340,9 +341,9 @@ static void find_non_local_tags(const struct ref *refs,
>  			if (item &&
>  			    !has_object_file_with_flags(&ref->old_oid,
>  							OBJECT_INFO_QUICK) &&
> -			    !will_fetch(head, ref->old_oid.hash) &&
> +			    !oidset_contains(&fetch_oids, &ref->old_oid) &&
>  			    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
> -			    !will_fetch(head, item->oid.hash))
> +			    !oidset_contains(&fetch_oids, &item->oid))
>  				clear_item(item);
>  			item = NULL;
>  			continue;
> @@ -356,7 +357,7 @@ static void find_non_local_tags(const struct ref *refs,
>  		 */
>  		if (item &&
>  		    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
> -		    !will_fetch(head, item->oid.hash))
> +		    !oidset_contains(&fetch_oids, &item->oid))
>  			clear_item(item);
>  
>  		item = NULL;
> @@ -377,7 +378,7 @@ static void find_non_local_tags(const struct ref *refs,
>  	 */
>  	if (item &&
>  	    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
> -	    !will_fetch(head, item->oid.hash))
> +	    !oidset_contains(&fetch_oids, &item->oid))
>  		clear_item(item);
>  
>  	/*
> @@ -404,6 +405,7 @@ static void find_non_local_tags(const struct ref *refs,
>  	}
>  	hashmap_free(&remote_refs, 1);
>  	string_list_clear(&remote_refs_list, 0);
> +	oidset_clear(&fetch_oids);
>  }
>  
>  static struct ref *get_ref_map(struct remote *remote,
>
Masaya Suzuki Sept. 16, 2019, 4:28 a.m. UTC | #2
On Sun, Sep 15, 2019 at 7:35 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 9/15/2019 5:18 PM, Masaya Suzuki wrote:
> > During git-fetch, the client checks if the advertised tags' OIDs are
> > already in the fetch request's want OID set. This check is done in a
> > linear scan. For a repository that has a lot of refs, repeating this
> > scan takes 15+ minutes. In order to speed this up, create a oid_set for
> > other refs' OIDs.
>
> Good catch! Quadratic performance is never good.
>
> The patch below looks like it works, but could you also share your
> performance timings for the 15+ minute case after your patch is
> applied?

With the following code change, I measured the time for
find_non_local_tags. It shows 215 msec with the example commands. (I
didn't measure entire fetch time as good portion of the time is spent
on the server side.)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 51a276dfaa..d3b06c733d 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -25,6 +25,7 @@
 #include "list-objects-filter-options.h"
 #include "commit-reach.h"
 #include "branch.h"
+#include <time.h>

 #define FORCED_UPDATES_DELAY_WARNING_IN_MS (10 * 1000)

@@ -322,8 +323,11 @@ static void find_non_local_tags(const struct ref *refs,
        const struct ref *ref;
        struct refname_hash_entry *item = NULL;

+       struct timespec start, end;
+
        refname_hash_init(&existing_refs);
        refname_hash_init(&remote_refs);
+       clock_gettime(CLOCK_MONOTONIC, &start);
        create_fetch_oidset(head, &fetch_oids);

        for_each_ref(add_one_refname, &existing_refs);
@@ -405,6 +409,12 @@ static void find_non_local_tags(const struct ref *refs,
        }
        hashmap_free(&remote_refs, 1);
        string_list_clear(&remote_refs_list, 0);
+       clock_gettime(CLOCK_MONOTONIC, &end);
+       {
+               uint64_t millisec = (end.tv_sec - start.tv_sec) * 1000
+ (end.tv_nsec - start.tv_nsec) / 1000000;
+               fprintf(stderr, "find_non_local_tags: %ld msec\n", millisec);
+       }
+
        oidset_clear(&fetch_oids);
 }
Jeff King Sept. 16, 2019, 4:34 a.m. UTC | #3
On Sun, Sep 15, 2019 at 02:18:02PM -0700, Masaya Suzuki wrote:

> During git-fetch, the client checks if the advertised tags' OIDs are
> already in the fetch request's want OID set. This check is done in a
> linear scan. For a repository that has a lot of refs, repeating this
> scan takes 15+ minutes. In order to speed this up, create a oid_set for
> other refs' OIDs.

Good find. I was curious why nobody noticed it sooner, but I think a key
element in your chromium example is the use of "--mirror", which brings
in a bunch of refs which would not normally be grabbed. There are only
about 17k heads and tags, but over 1.5M entries in refs/changes.
Quadratically speaking, that's almost 8000 times worse, so your
15-minute delay is only 1/10th of a second in the non-mirror case.

Mostly I was wondering if this might have been a recent regression. But
it looks rather old:

> -static int will_fetch(struct ref **head, const unsigned char *sha1)
> +static void create_fetch_oidset(struct ref **head, struct oidset *out)
>  {
>  	struct ref *rm = *head;
>  	while (rm) {
> -		if (hasheq(rm->old_oid.hash, sha1))
> -			return 1;
> +		oidset_insert(out, &rm->old_oid);
>  		rm = rm->next;
>  	}
> -	return 0;
>  }

This function comes from cf7f929a10 (Teach git-fetch to grab a tag at
the same time as a commit, 2008-03-02). As explained in that commit
message, this technique can't actually get all the tags we're looking
for. It was around the same time, in 348e390b17 (Teach
fetch-pack/upload-pack about --include-tag, 2008-03-03), that we added
an extension to do this reliably.

So it _might_ even be possible to get rid of this code entirely, if we
don't care about pre-2008 versions of Git (we'd still get the right
answer from them; it would just take an extra round-trip).

That said, it seems like a good idea to do this obvious and safe
conversion in the near term, and we can look at removing it entirely as
a separate topic. The cost of the oidset isn't too bad (~30MB, I guess
even for that pathological chromium case).

> @@ -324,6 +324,7 @@ static void find_non_local_tags(const struct ref *refs,
>  
>  	refname_hash_init(&existing_refs);
>  	refname_hash_init(&remote_refs);
> +	create_fetch_oidset(head, &fetch_oids);
>  
>  	for_each_ref(add_one_refname, &existing_refs);
>  	for (ref = refs; ref; ref = ref->next) {
> @@ -340,9 +341,9 @@ static void find_non_local_tags(const struct ref *refs,
>  			if (item &&
>  			    !has_object_file_with_flags(&ref->old_oid,
>  							OBJECT_INFO_QUICK) &&
> -			    !will_fetch(head, ref->old_oid.hash) &&
> +			    !oidset_contains(&fetch_oids, &ref->old_oid) &&
>  			    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
> -			    !will_fetch(head, item->oid.hash))
> +			    !oidset_contains(&fetch_oids, &item->oid))
>  				clear_item(item);
>  			item = NULL;
>  			continue;

One thing to check is whether the list we feed to will_fetch ever gets
updated in the loop that checks it (i.e., do we need to be adding or
clearing elements in the oidset).

I _think_ the answer is no, because we modify only the "refs" list in
that loop (via clear_item()), but the will_fetch() check is on the
head/tail we receive (which isn't updated until later). Both of those
lists are passed into the function, but I didn't trace it all the way up
to make sure they are not aliases.

-Peff
diff mbox series

Patch

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 54d6b01892..51a276dfaa 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -7,6 +7,7 @@ 
 #include "refs.h"
 #include "refspec.h"
 #include "object-store.h"
+#include "oidset.h"
 #include "commit.h"
 #include "builtin.h"
 #include "string-list.h"
@@ -243,15 +244,13 @@  static void add_merge_config(struct ref **head,
 	}
 }
 
-static int will_fetch(struct ref **head, const unsigned char *sha1)
+static void create_fetch_oidset(struct ref **head, struct oidset *out)
 {
 	struct ref *rm = *head;
 	while (rm) {
-		if (hasheq(rm->old_oid.hash, sha1))
-			return 1;
+		oidset_insert(out, &rm->old_oid);
 		rm = rm->next;
 	}
-	return 0;
 }
 
 struct refname_hash_entry {
@@ -317,6 +316,7 @@  static void find_non_local_tags(const struct ref *refs,
 {
 	struct hashmap existing_refs;
 	struct hashmap remote_refs;
+	struct oidset fetch_oids = OIDSET_INIT;
 	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
 	struct string_list_item *remote_ref_item;
 	const struct ref *ref;
@@ -324,6 +324,7 @@  static void find_non_local_tags(const struct ref *refs,
 
 	refname_hash_init(&existing_refs);
 	refname_hash_init(&remote_refs);
+	create_fetch_oidset(head, &fetch_oids);
 
 	for_each_ref(add_one_refname, &existing_refs);
 	for (ref = refs; ref; ref = ref->next) {
@@ -340,9 +341,9 @@  static void find_non_local_tags(const struct ref *refs,
 			if (item &&
 			    !has_object_file_with_flags(&ref->old_oid,
 							OBJECT_INFO_QUICK) &&
-			    !will_fetch(head, ref->old_oid.hash) &&
+			    !oidset_contains(&fetch_oids, &ref->old_oid) &&
 			    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
-			    !will_fetch(head, item->oid.hash))
+			    !oidset_contains(&fetch_oids, &item->oid))
 				clear_item(item);
 			item = NULL;
 			continue;
@@ -356,7 +357,7 @@  static void find_non_local_tags(const struct ref *refs,
 		 */
 		if (item &&
 		    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
-		    !will_fetch(head, item->oid.hash))
+		    !oidset_contains(&fetch_oids, &item->oid))
 			clear_item(item);
 
 		item = NULL;
@@ -377,7 +378,7 @@  static void find_non_local_tags(const struct ref *refs,
 	 */
 	if (item &&
 	    !has_object_file_with_flags(&item->oid, OBJECT_INFO_QUICK) &&
-	    !will_fetch(head, item->oid.hash))
+	    !oidset_contains(&fetch_oids, &item->oid))
 		clear_item(item);
 
 	/*
@@ -404,6 +405,7 @@  static void find_non_local_tags(const struct ref *refs,
 	}
 	hashmap_free(&remote_refs, 1);
 	string_list_clear(&remote_refs_list, 0);
+	oidset_clear(&fetch_oids);
 }
 
 static struct ref *get_ref_map(struct remote *remote,