diff mbox series

[v2,05/11] commit-graph: examine changed-path objects in pack order

Message ID 78e8e49c3a1131ffacf660603de60729b3dbadc9.1580943390.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Linus Arver via GitGitGadget Feb. 5, 2020, 10:56 p.m. UTC
From: Jeff King <peff@peff.net>

Looking at the diff of commit objects in pack order is much faster than
in sha1 order, as it gives locality to the access of tree deltas
(whereas sha1 order is effectively random). Unfortunately the
commit-graph code sorts the commits (several times, sometimes as an oid
and sometimes a pointer-to-commit), and we ultimately traverse in sha1
order.

Instead, let's remember the position at which we see each commit, and
traverse in that order when looking at bloom filters. This drops my time
for "git commit-graph write --changed-paths" in linux.git from ~4
minutes to ~1.5 minutes.

Probably the "--reachable" code path would want something similar.

Or alternatively, we could use a different data structure (either a
hash, or maybe even just a bit in "struct commit") to keep track of
which oids we've seen, etc instead of sorting. And then we could keep
the original order.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 commit-graph.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

Comments

Jakub Narębski Feb. 18, 2020, 5:59 p.m. UTC | #1
"Jeff King via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Jeff King <peff@peff.net>
>
> Looking at the diff of commit objects in pack order is much faster than
> in sha1 order, as it gives locality to the access of tree deltas

Nitpick: should we still say sha1 order?  Git is still using SHA-1 as an
*oid*, but hopefully soon it will be transitioning to NewHash = SHA-256.
(No need to change anything.)

> (whereas sha1 order is effectively random). Unfortunately the
> commit-graph code sorts the commits (several times, sometimes as an oid
> and sometimes a pointer-to-commit), and we ultimately traverse in sha1
> order.

Actually, commit-graph code needs write_commit_graph_context.commits.list
to be in lexicographical order to be able to turn position in graph into
reference to a commit.  The information about the parents of the commit
are stored using positional references within the graph file.

>
> Instead, let's remember the position at which we see each commit, and
> traverse in that order when looking at bloom filters. This drops my time
> for "git commit-graph write --changed-paths" in linux.git from ~4
> minutes to ~1.5 minutes.

Nitpick: with reordering of patches (which I think is otherwise a good
thing) this patch actually comes before the one adding "--changed-paths"
option to "git commit-graph write".  So it 'This would drop my time'
rather than 'This drops my time...' ;-)

>
> Probably the "--reachable" code path would want something similar.

Has anyone tried doing this?

>
> Or alternatively, we could use a different data structure (either a
> hash, or maybe even just a bit in "struct commit") to keep track of
> which oids we've seen, etc instead of sorting. And then we could keep
> the original order.

I think it is nice to keep those "what ifs?" thoughts in the commit
message.  They add some color.

>
> Signed-off-by: Jeff King <peff@peff.net>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
> ---
>  commit-graph.c | 34 +++++++++++++++++++++++++++++++++-
>  1 file changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/commit-graph.c b/commit-graph.c
> index 724bfcffc4..e125511a1c 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -17,6 +17,7 @@
>  #include "replace-object.h"
>  #include "progress.h"
>  #include "bloom.h"
> +#include "commit-slab.h"
>  
>  #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
>  #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
> @@ -46,6 +47,29 @@
>  /* Remember to update object flag allocation in object.h */
>  #define REACHABLE       (1u<<15)
>  
> +/* Keep track of the order in which commits are added to our list. */
> +define_commit_slab(commit_pos, int);
> +static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
> +
> +static void set_commit_pos(struct repository *r, const struct object_id *oid)
> +{
> +	static int32_t max_pos;
> +	struct commit *commit = lookup_commit(r, oid);
> +
> +	if (!commit)
> +		return; /* should never happen, but be lenient */
> +
> +	*commit_pos_at(&commit_pos, commit) = max_pos++;
> +}

All right, that is nice and universal function.

> +
> +static int commit_pos_cmp(const void *va, const void *vb)
> +{
> +	const struct commit *a = *(const struct commit **)va;
> +	const struct commit *b = *(const struct commit **)vb;
> +	return commit_pos_at(&commit_pos, a) -
> +	       commit_pos_at(&commit_pos, b);
> +}

Hmmm... I wonder what would happen in commit_pos was not set (like
e.g. commit-graph commits not coming from the packfile).  Let's look up
the documenation...

commit_pos_at() returns a pointer to an int... why are we comparing
pointers and not values?  Shouldn't it be

  +	return *commit_pos_at(&commit_pos, a) -
  +	       *commit_pos_at(&commit_pos, b);


With commit_pos_at() the location to store the data is allocated as
necessary (if data for commit doesn't exists), and because we are using
xalloc() the *commit_pos_at() is 0-initialized.  This means that if
commits didn't come from the packfile, we sort all commits as being
equal.  Luckily we fix that in next patch.

> +
>  char *get_commit_graph_filename(const char *obj_dir)
>  {
>  	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
> @@ -1027,6 +1051,8 @@ static int add_packed_commits(const struct object_id *oid,
>  	oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
>  	ctx->oids.nr++;
>  
> +	set_commit_pos(ctx->r, oid);
> +
>  	return 0;
>  }
>  
> @@ -1147,6 +1173,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  {
>  	int i;
>  	struct progress *progress = NULL;
> +	struct commit **sorted_by_pos;

In the next patch in series we would sort commits by generation number
and creation data; shouldn't this variable name be more generic to
reflect this, for example just `sorted_commits` or `commits_sorted`?

>  
>  	load_bloom_filters();
>  
> @@ -1155,13 +1182,18 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  			_("Computing commit diff Bloom filters"),
>  			ctx->commits.nr);
>  
> +	ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
> +	COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
> +	QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
> +

All right: allocate array, copy data, sort it.

We need to copy data because (what I think) we need commits in
lexicographical order to be able to turn the position in graph that
parents of a commit are stored as into the reference to this commit.

>  	for (i = 0; i < ctx->commits.nr; i++) {
> -		struct commit *c = ctx->commits.list[i];
> +		struct commit *c = sorted_by_pos[i];

All right: use sorted data.

>  		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
>  		ctx->total_bloom_filter_data_size += sizeof(uint64_t) * filter->len;
>  		display_progress(progress, i + 1);
>  	}
>  
> +	free(sorted_by_pos);

Can we free the slab data, i.e. call `clear_commit_pos(&commit_pos);`
here?  Otherwise we are leaking memory (well, except that finishing
command makes the operating system to free memory for us).

>  	stop_progress(&progress);
>  }

Best,
Garima Singh Feb. 24, 2020, 6:29 p.m. UTC | #2
On 2/18/2020 12:59 PM, Jakub Narebski wrote:
> "Jeff King via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Jeff King <peff@peff.net>
>>
>> Looking at the diff of commit objects in pack order is much faster than
>> in sha1 order, as it gives locality to the access of tree deltas
> 
> Nitpick: should we still say sha1 order?  Git is still using SHA-1 as an
> *oid*, but hopefully soon it will be transitioning to NewHash = SHA-256.
> (No need to change anything.)
> 
>> (whereas sha1 order is effectively random). Unfortunately the
>> commit-graph code sorts the commits (several times, sometimes as an oid
>> and sometimes a pointer-to-commit), and we ultimately traverse in sha1
>> order.
> 
> Actually, commit-graph code needs write_commit_graph_context.commits.list
> to be in lexicographical order to be able to turn position in graph into
> reference to a commit.  The information about the parents of the commit
> are stored using positional references within the graph file.
> 

You are right. Fixing the commit message in v3. 

>>
>> Instead, let's remember the position at which we see each commit, and
>> traverse in that order when looking at bloom filters. This drops my time
>> for "git commit-graph write --changed-paths" in linux.git from ~4
>> minutes to ~1.5 minutes.
> 
> Nitpick: with reordering of patches (which I think is otherwise a good
> thing) this patch actually comes before the one adding "--changed-paths"
> option to "git commit-graph write".  So it 'This would drop my time'
> rather than 'This drops my time...' ;-)
> 

:) I will fix that up. 

>>
>> Probably the "--reachable" code path would want something similar.
> 
> Has anyone tried doing this?
> 

I will and I will include the perf numbers in the appropriately in v3. 


>> +
>>  char *get_commit_graph_filename(const char *obj_dir)
>>  {
>>  	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
>> @@ -1027,6 +1051,8 @@ static int add_packed_commits(const struct object_id *oid,
>>  	oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
>>  	ctx->oids.nr++;
>>  
>> +	set_commit_pos(ctx->r, oid);
>> +
>>  	return 0;
>>  }
>>  
>> @@ -1147,6 +1173,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>>  {
>>  	int i;
>>  	struct progress *progress = NULL;
>> +	struct commit **sorted_by_pos;
> 
> In the next patch in series we would sort commits by generation number
> and creation data; shouldn't this variable name be more generic to
> reflect this, for example just `sorted_commits` or `commits_sorted`?
> 

Good call. I will clean this up in both commits. 

Thanks for the review! 
Cheers! 
Garima Singh
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 724bfcffc4..e125511a1c 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -17,6 +17,7 @@ 
 #include "replace-object.h"
 #include "progress.h"
 #include "bloom.h"
+#include "commit-slab.h"
 
 #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
 #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
@@ -46,6 +47,29 @@ 
 /* Remember to update object flag allocation in object.h */
 #define REACHABLE       (1u<<15)
 
+/* Keep track of the order in which commits are added to our list. */
+define_commit_slab(commit_pos, int);
+static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
+
+static void set_commit_pos(struct repository *r, const struct object_id *oid)
+{
+	static int32_t max_pos;
+	struct commit *commit = lookup_commit(r, oid);
+
+	if (!commit)
+		return; /* should never happen, but be lenient */
+
+	*commit_pos_at(&commit_pos, commit) = max_pos++;
+}
+
+static int commit_pos_cmp(const void *va, const void *vb)
+{
+	const struct commit *a = *(const struct commit **)va;
+	const struct commit *b = *(const struct commit **)vb;
+	return commit_pos_at(&commit_pos, a) -
+	       commit_pos_at(&commit_pos, b);
+}
+
 char *get_commit_graph_filename(const char *obj_dir)
 {
 	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
@@ -1027,6 +1051,8 @@  static int add_packed_commits(const struct object_id *oid,
 	oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
 	ctx->oids.nr++;
 
+	set_commit_pos(ctx->r, oid);
+
 	return 0;
 }
 
@@ -1147,6 +1173,7 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 {
 	int i;
 	struct progress *progress = NULL;
+	struct commit **sorted_by_pos;
 
 	load_bloom_filters();
 
@@ -1155,13 +1182,18 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 			_("Computing commit diff Bloom filters"),
 			ctx->commits.nr);
 
+	ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
+	COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
+	QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
+
 	for (i = 0; i < ctx->commits.nr; i++) {
-		struct commit *c = ctx->commits.list[i];
+		struct commit *c = sorted_by_pos[i];
 		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
 		ctx->total_bloom_filter_data_size += sizeof(uint64_t) * filter->len;
 		display_progress(progress, i + 1);
 	}
 
+	free(sorted_by_pos);
 	stop_progress(&progress);
 }