diff mbox series

[v2,06/11] commit-graph: examine commits by generation number

Message ID 58704d81b6b4fbc54715457246aeed783eb32a99.1580943390.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Linus Arver via GitGitGadget Feb. 5, 2020, 10:56 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

When running 'git commit-graph write --changed-paths', we sort the
commits by pack-order to save time when computing the changed-paths
bloom filters. This does not help when finding the commits via the
--reachable flag.

If not using pack-order, then sort by generation number before
examining the diff. Commits with similar generation are more likely
to have many trees in common, making the diff faster.

On the Linux kernel repository, this change reduced the computation
time for 'git commit-graph write --reachable --changed-paths' from
3m00s to 1m37s.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 commit-graph.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

Comments

Jakub Narębski Feb. 19, 2020, 12:32 a.m. UTC | #1
"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Derrick Stolee <dstolee@microsoft.com>
>
> When running 'git commit-graph write --changed-paths', we sort the
> commits by pack-order to save time when computing the changed-paths
> bloom filters. This does not help when finding the commits via the
> --reachable flag.

Minor improvement suggestion: s/--reachable flag/'--reachable' flag/.

>
> If not using pack-order, then sort by generation number before
> examining the diff.

All right, that is good description of what the patch does.

>                     Commits with similar generation are more likely
> to have many trees in common, making the diff faster.

Is this what causes the performance improvement, that subsequently
examined commits are more likely to have more trees in common, which
means that those trees would be hot in cache, making generating diff
faster?  Is it what profiling shows?

>
> On the Linux kernel repository, this change reduced the computation
> time for 'git commit-graph write --reachable --changed-paths' from
> 3m00s to 1m37s.

Would using the trick used for packfiles also for '--reachable', which
would mean commits examined in recency / reachability order, give
similar, worse or better performance improvements?

We would want this sorting order as one of possibilities anyway, because
'--stdin-commits' we could get commits in random order.

>
> Helped-by: Jeff King <peff@peff.net>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
> ---
>  commit-graph.c | 33 ++++++++++++++++++++++++++++++---
>  1 file changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/commit-graph.c b/commit-graph.c
> index e125511a1c..32a315058f 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -70,6 +70,25 @@ static int commit_pos_cmp(const void *va, const void *vb)
>  	       commit_pos_at(&commit_pos, b);
>  }
>  
> +static int commit_gen_cmp(const void *va, const void *vb)
> +{
> +	const struct commit *a = *(const struct commit **)va;
> +	const struct commit *b = *(const struct commit **)vb;
> +
> +	/* lower generation commits first */

Shouldn't higher generation commits come first, in recency-like order?
Or it doesn't matter if it is sorted in ascending or descending order,
as long as commits with close generation numbers are examined close
together?

> +	if (a->generation < b->generation)
> +		return -1;
> +	else if (a->generation > b->generation)
> +		return 1;
> +
> +	/* use date as a heuristic when generations are equal */
> +	if (a->date < b->date)
> +		return -1;
> +	else if (a->date > b->date)
> +		return 1;
> +	return 0;
> +}

I thought we have had such comparison function defined somewhere in Git
already, but I think I'm wrong here.

> +
>  char *get_commit_graph_filename(const char *obj_dir)
>  {
>  	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
> @@ -821,7 +840,8 @@ struct write_commit_graph_context {
>  		 report_progress:1,
>  		 split:1,
>  		 check_oids:1,
> -		 changed_paths:1;
> +		 changed_paths:1,
> +		 order_by_pack:1;
>  
>  	const struct split_commit_graph_opts *split_opts;
>  	uint32_t total_bloom_filter_data_size;
> @@ -1184,7 +1204,11 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>  
>  	ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
>  	COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
> -	QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
> +
> +	if (ctx->order_by_pack)
> +		QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
> +	else
> +		QSORT(sorted_by_pos, ctx->commits.nr, commit_gen_cmp);

Here 'sorted_b_pos' variable name no longer reflects reality...
(see comment to the previous patch in the series).

>  
>  	for (i = 0; i < ctx->commits.nr; i++) {
>  		struct commit *c = sorted_by_pos[i];
> @@ -1902,6 +1926,7 @@ int write_commit_graph(const char *obj_dir,
>  	}
>  
>  	if (pack_indexes) {
> +		ctx->order_by_pack = 1;
>  		if ((res = fill_oids_from_packs(ctx, pack_indexes)))
>  			goto cleanup;
>  	}
> @@ -1911,8 +1936,10 @@ int write_commit_graph(const char *obj_dir,
>  			goto cleanup;
>  	}
>  
> -	if (!pack_indexes && !commit_hex)
> +	if (!pack_indexes && !commit_hex) {
> +		ctx->order_by_pack = 1;
>  		fill_oids_from_all_packs(ctx);
> +	}
>  
>  	close_reachable(ctx);

All right, that covers all cases where 'git commit-graph write' writes
serialized commit-graph based on the commits found in packfiles:
'--stdin-packs' and default no option case, in that order.

Looks good.

Best,
Garima Singh Feb. 24, 2020, 8:45 p.m. UTC | #2
On 2/18/2020 7:32 PM, Jakub Narebski wrote:
> "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> When running 'git commit-graph write --changed-paths', we sort the
>> commits by pack-order to save time when computing the changed-paths
>> bloom filters. This does not help when finding the commits via the
>> --reachable flag.
> 
> Minor improvement suggestion: s/--reachable flag/'--reachable' flag/.
> 

Sure. 


>>                     Commits with similar generation are more likely
>> to have many trees in common, making the diff faster.
> 
> Is this what causes the performance improvement, that subsequently
> examined commits are more likely to have more trees in common, which
> means that those trees would be hot in cache, making generating diff
> faster?  Is it what profiling shows?
> 

Yes. 

>>
>> Helped-by: Jeff King <peff@peff.net>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
>> ---
>>  commit-graph.c | 33 ++++++++++++++++++++++++++++++---
>>  1 file changed, 30 insertions(+), 3 deletions(-)
>>
>> diff --git a/commit-graph.c b/commit-graph.c
>> index e125511a1c..32a315058f 100644
>> --- a/commit-graph.c
>> +++ b/commit-graph.c
>> @@ -70,6 +70,25 @@ static int commit_pos_cmp(const void *va, const void *vb)
>>  	       commit_pos_at(&commit_pos, b);
>>  }
>>  
>> +static int commit_gen_cmp(const void *va, const void *vb)
>> +{
>> +	const struct commit *a = *(const struct commit **)va;
>> +	const struct commit *b = *(const struct commit **)vb;
>> +
>> +	/* lower generation commits first */
> 
> Shouldn't higher generation commits come first, in recency-like order?
> Or it doesn't matter if it is sorted in ascending or descending order,
> as long as commits with close generation numbers are examined close
> together?
> 

The direction does not matter. Locality is important. 

>> +	if (a->generation < b->generation)
>> +		return -1;
>> +	else if (a->generation > b->generation)
>> +		return 1;
>> +
>> +	/* use date as a heuristic when generations are equal */
>> +	if (a->date < b->date)
>> +		return -1;
>> +	else if (a->date > b->date)
>> +		return 1;
>> +	return 0;
>> +}
> 
> I thought we have had such comparison function defined somewhere in Git
> already, but I think I'm wrong here.
> 

It actually exists in commit.h
I will just use it here. 
Thanks for pointing it out! 

>> +
>>  char *get_commit_graph_filename(const char *obj_dir)
>>  {
>>  	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
>> @@ -821,7 +840,8 @@ struct write_commit_graph_context {
>>  		 report_progress:1,
>>  		 split:1,
>>  		 check_oids:1,
>> -		 changed_paths:1;
>> +		 changed_paths:1,
>> +		 order_by_pack:1;
>>  
>>  	const struct split_commit_graph_opts *split_opts;
>>  	uint32_t total_bloom_filter_data_size;
>> @@ -1184,7 +1204,11 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>>  
>>  	ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
>>  	COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
>> -	QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
>> +
>> +	if (ctx->order_by_pack)
>> +		QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
>> +	else
>> +		QSORT(sorted_by_pos, ctx->commits.nr, commit_gen_cmp);
> 
> Here 'sorted_b_pos' variable name no longer reflects reality...
> (see comment to the previous patch in the series).
> 

Yup. Fixing. 

Thanks!
Garima Singh
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index e125511a1c..32a315058f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -70,6 +70,25 @@  static int commit_pos_cmp(const void *va, const void *vb)
 	       commit_pos_at(&commit_pos, b);
 }
 
+static int commit_gen_cmp(const void *va, const void *vb)
+{
+	const struct commit *a = *(const struct commit **)va;
+	const struct commit *b = *(const struct commit **)vb;
+
+	/* lower generation commits first */
+	if (a->generation < b->generation)
+		return -1;
+	else if (a->generation > b->generation)
+		return 1;
+
+	/* use date as a heuristic when generations are equal */
+	if (a->date < b->date)
+		return -1;
+	else if (a->date > b->date)
+		return 1;
+	return 0;
+}
+
 char *get_commit_graph_filename(const char *obj_dir)
 {
 	char *filename = xstrfmt("%s/info/commit-graph", obj_dir);
@@ -821,7 +840,8 @@  struct write_commit_graph_context {
 		 report_progress:1,
 		 split:1,
 		 check_oids:1,
-		 changed_paths:1;
+		 changed_paths:1,
+		 order_by_pack:1;
 
 	const struct split_commit_graph_opts *split_opts;
 	uint32_t total_bloom_filter_data_size;
@@ -1184,7 +1204,11 @@  static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 
 	ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
 	COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
-	QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
+
+	if (ctx->order_by_pack)
+		QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
+	else
+		QSORT(sorted_by_pos, ctx->commits.nr, commit_gen_cmp);
 
 	for (i = 0; i < ctx->commits.nr; i++) {
 		struct commit *c = sorted_by_pos[i];
@@ -1902,6 +1926,7 @@  int write_commit_graph(const char *obj_dir,
 	}
 
 	if (pack_indexes) {
+		ctx->order_by_pack = 1;
 		if ((res = fill_oids_from_packs(ctx, pack_indexes)))
 			goto cleanup;
 	}
@@ -1911,8 +1936,10 @@  int write_commit_graph(const char *obj_dir,
 			goto cleanup;
 	}
 
-	if (!pack_indexes && !commit_hex)
+	if (!pack_indexes && !commit_hex) {
+		ctx->order_by_pack = 1;
 		fill_oids_from_all_packs(ctx);
+	}
 
 	close_reachable(ctx);