diff mbox series

[v2,04/11] commit-graph: compute Bloom filters for changed paths

Message ID c17bbcbc66ea77bb480391804d1f2db66ffa0926.1580943390.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Linus Arver via GitGitGadget Feb. 5, 2020, 10:56 p.m. UTC
From: Garima Singh <garima.singh@microsoft.com>

Compute Bloom filters for the paths that changed between a commit and its
first parent using the implementation in bloom.c, when the
COMMIT_GRAPH_WRITE_CHANGED_PATHS flag is set. This computation is done on a
commit-by-commit basis. We will write these Bloom filters to the commit graph
file in the next change.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 commit-graph.c | 32 +++++++++++++++++++++++++++++++-
 commit-graph.h |  3 ++-
 2 files changed, 33 insertions(+), 2 deletions(-)

Comments

Jakub Narębski Feb. 17, 2020, 9:56 p.m. UTC | #1
"Garima Singh via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Garima Singh <garima.singh@microsoft.com>
> Subject: [PATCH v2 04/11] commit-graph: compute Bloom filters for changed paths
>
> Compute Bloom filters for the paths that changed between a commit and its
> first parent using the implementation in bloom.c, when the
> COMMIT_GRAPH_WRITE_CHANGED_PATHS flag is set. This computation is done on a
> commit-by-commit basis. We will write these Bloom filters to the commit graph
> file in the next change.

I have no major complaints about the contents of this patch (except lack
of test, and type of total_bloom_filter_data_size), but the commit
message could have been worded better.

I would write something like this instead:

  Add new COMMIT_GRAPH_WRITE_CHANGED_PATHS flag that makes Git compute
  Bloom filters that store the information about changed paths (that
  changed between a commit and its first parent) for each commit in the
  commit-graph.  This computation is done on a commit-by-commit basis.

  We will write these Bloom filters to the commit-graph file, to store
  this data on disk, in the next change in this series.

In my opinion the fact that we compute Bloom filters for each and every
commit in the commit-graph file is more important than quite obvious
fact that we use implementation from bloom.c.

>
> Helped-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
> ---
>  commit-graph.c | 32 +++++++++++++++++++++++++++++++-
>  commit-graph.h |  3 ++-
>  2 files changed, 33 insertions(+), 2 deletions(-)

It would be good to have at least sanity check of this feature, perhaps
one that would check that the number of per-commit Bloom filters on slab
matches the number of commits in the commit-graph.

It could look something like this:

  test_expect_success 'create Bloom filters for all commit-graph commits' '
  	# create commit-graph with 2 commits
  	git rev-parse HEAD HEAD^ | git commit-graph write --stdin-commits &&
  	# generate Bloom filters for commit-graph commits
  	cat >commands <<\-EOF &&
  	add-graph-commits
  	filters-count
  	EOF
  	NUM_FILTERS=$(git test-tool bloom <commands) %%
  	test "$NUM_FILTERS" -eq 2
  '

>
> diff --git a/commit-graph.c b/commit-graph.c
> index 3c4d411326..724bfcffc4 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -16,6 +16,7 @@
>  #include "hashmap.h"
>  #include "replace-object.h"
>  #include "progress.h"
> +#include "bloom.h"
>  
>  #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
>  #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
> @@ -795,9 +796,11 @@ struct write_commit_graph_context {
>  	unsigned append:1,
>  		 report_progress:1,
>  		 split:1,
> -		 check_oids:1;
> +		 check_oids:1,
> +		 changed_paths:1;

All right, this flag will be used for handling future `--changed-paths`
option to the `git commit-graph write`.

>  
>  	const struct split_commit_graph_opts *split_opts;
> +	uint32_t total_bloom_filter_data_size;

This is total size of Bloom filters data, in bytes, that will later be
used for BDAT chunk size.  However the commit-graph format uses 8 bytes
for byte-offset, not 4 bytes.  Why it is uint32_t and not uint64_t then?

>  };
>  
>  static void write_graph_chunk_fanout(struct hashfile *f,
> @@ -1140,6 +1143,28 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
>  	stop_progress(&ctx->progress);
>  }
>  
> +static void compute_bloom_filters(struct write_commit_graph_context *ctx)
> +{
> +	int i;
> +	struct progress *progress = NULL;
> +
> +	load_bloom_filters();
> +
> +	if (ctx->report_progress)
> +		progress = start_progress(
> +			_("Computing commit diff Bloom filters"),
> +			ctx->commits.nr);
> +

Shouldn't we initialize ctx->total_bloom_filter_data_size to 0 here?  We
cannot use compute_bloom_filters() to _update_ Bloom filters data, I
think -- we don't distinguish here between new and existing data (where
existing data size is already included in total Bloom filters size).  At
least I don't think so.


> +	for (i = 0; i < ctx->commits.nr; i++) {
> +		struct commit *c = ctx->commits.list[i];

Here we process commit in whatever order commits are in the
commits.list, which probably means lexicographical order, in practice
random order.

> +		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
> +		ctx->total_bloom_filter_data_size += sizeof(uint64_t) * filter->len;
> +		display_progress(progress, i + 1);
> +	}
> +
> +	stop_progress(&progress);
> +}
> +
>  static int add_ref_to_list(const char *refname,
>  			   const struct object_id *oid,
>  			   int flags, void *cb_data)

> @@ -1794,6 +1819,8 @@ int write_commit_graph(const char *obj_dir,
>  	ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
>  	ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
>  	ctx->split_opts = split_opts;
> +	ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
> +	ctx->total_bloom_filter_data_size = 0;
>  
>  	if (ctx->split) {
>  		struct commit_graph *g;
> @@ -1888,6 +1915,9 @@ int write_commit_graph(const char *obj_dir,
>  
>  	compute_generation_numbers(ctx);
>  
> +	if (ctx->changed_paths)
> +		compute_bloom_filters(ctx);
> +

All right.

>  	res = write_commit_graph_file(ctx);
>  
>  	if (ctx->split)
> diff --git a/commit-graph.h b/commit-graph.h
> index 7f5c933fa2..952a4b83be 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -76,7 +76,8 @@ enum commit_graph_write_flags {
>  	COMMIT_GRAPH_WRITE_PROGRESS   = (1 << 1),
>  	COMMIT_GRAPH_WRITE_SPLIT      = (1 << 2),
>  	/* Make sure that each OID in the input is a valid commit OID. */
> -	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3)
> +	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
> +	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4)

All right.


Side note: perhaps we could add trailing comma after new enum entry,
that is

  +	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4),

following new CodingGuidelines recommendation

 - We try to support a wide range of C compilers to compile Git with,
   including old ones.  You should not use features from newer C
   standard, even if your compiler groks them.

   There are a few exceptions to this guideline:

   . since early 2012 with e1327023ea, we have been using an enum
     definition whose last element is followed by a comma.  This, like
     an array initializer that ends with a trailing comma, can be used
     to reduce the patch noise when adding a new identifier at the end.

https://github.com/git/git/blob/master/Documentation/CodingGuidelines#L197

>  };
>  
>  struct split_commit_graph_opts {
Garima Singh Feb. 22, 2020, 12:55 a.m. UTC | #2
On 2/17/2020 4:56 PM, Jakub Narebski wrote:
> "Garima Singh via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Garima Singh <garima.singh@microsoft.com>
>> Subject: [PATCH v2 04/11] commit-graph: compute Bloom filters for changed paths
>>
>> Compute Bloom filters for the paths that changed between a commit and its
>> first parent using the implementation in bloom.c, when the
>> COMMIT_GRAPH_WRITE_CHANGED_PATHS flag is set. This computation is done on a
>> commit-by-commit basis. We will write these Bloom filters to the commit graph
>> file in the next change.
> 
> I have no major complaints about the contents of this patch (except lack
> of test, and type of total_bloom_filter_data_size), but the commit
> message could have been worded better.
> 
> I would write something like this instead:
> 
>   Add new COMMIT_GRAPH_WRITE_CHANGED_PATHS flag that makes Git compute
>   Bloom filters that store the information about changed paths (that
>   changed between a commit and its first parent) for each commit in the
>   commit-graph.  This computation is done on a commit-by-commit basis.
> 
>   We will write these Bloom filters to the commit-graph file, to store
>   this data on disk, in the next change in this series.
> 
> In my opinion the fact that we compute Bloom filters for each and every
> commit in the commit-graph file is more important than quite obvious
> fact that we use implementation from bloom.c.
> 

Nice! Incorporated in v3. Thanks!

>>
>> Helped-by: Derrick Stolee <dstolee@microsoft.com>
>> Signed-off-by: Garima Singh <garima.singh@microsoft.com>
>> ---
>>  commit-graph.c | 32 +++++++++++++++++++++++++++++++-
>>  commit-graph.h |  3 ++-
>>  2 files changed, 33 insertions(+), 2 deletions(-)
> 
> It would be good to have at least sanity check of this feature, perhaps
> one that would check that the number of per-commit Bloom filters on slab
> matches the number of commits in the commit-graph.
> 

The combination of all the e2e tests in this series with the test
flag being turned on in the CI, and the performance gains we are seeing
confirm that this is happening correctly.

>>  
>>  	const struct split_commit_graph_opts *split_opts;
>> +	uint32_t total_bloom_filter_data_size;
> 
> This is total size of Bloom filters data, in bytes, that will later be
> used for BDAT chunk size.  However the commit-graph format uses 8 bytes
> for byte-offset, not 4 bytes.  Why it is uint32_t and not uint64_t then?
>

Changed to size_t. Thanks for noticing! 
 
>>  };
>>  
>>  static void write_graph_chunk_fanout(struct hashfile *f,
>> @@ -1140,6 +1143,28 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
>>  	stop_progress(&ctx->progress);
>>  }
>>  
>> +static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>> +{
>> +	int i;
>> +	struct progress *progress = NULL;
>> +
>> +	load_bloom_filters();
>> +
>> +	if (ctx->report_progress)
>> +		progress = start_progress(
>> +			_("Computing commit diff Bloom filters"),
>> +			ctx->commits.nr);
>> +
> 
> Shouldn't we initialize ctx->total_bloom_filter_data_size to 0 here?  We
> cannot use compute_bloom_filters() to _update_ Bloom filters data, I
> think -- we don't distinguish here between new and existing data (where
> existing data size is already included in total Bloom filters size).  At
> least I don't think so.
> 

This line in commit-graph.c takes care of reinitializing the graph context and
by consequence the bloom filter data size. 
  ctx = xcalloc(1, sizeof(struct write_commit_graph_context));
  
So the total size gets recalculated every time, which is correct. 

> 
> Side note: perhaps we could add trailing comma after new enum entry,
> that is
> 
>   +	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4),
> 
> following new CodingGuidelines recommendation
> 

Thanks! Fixed in v3.

Cheers! 
Garima Singh
Jakub Narębski Feb. 23, 2020, 5:34 p.m. UTC | #3
Garima Singh <garimasigit@gmail.com> writes:
> On 2/17/2020 4:56 PM, Jakub Narebski wrote:
>> "Garima Singh via GitGitGadget" <gitgitgadget@gmail.com> writes:

[...]
>>> ---
>>>  commit-graph.c | 32 +++++++++++++++++++++++++++++++-
>>>  commit-graph.h |  3 ++-
>>>  2 files changed, 33 insertions(+), 2 deletions(-)
>> 
>> It would be good to have at least sanity check of this feature, perhaps
>> one that would check that the number of per-commit Bloom filters on slab
>> matches the number of commits in the commit-graph.
>
> The combination of all the e2e tests in this series with the test
> flag being turned on in the CI, and the performance gains we are seeing
> confirm that this is happening correctly.

Well, the advantage of unit tests over e2e functional tests is that they
can pinpoint the source of bug much more easily.

That said, I don't think there is absolute need for unit tests here,
though it would be nice to have them.

>>>  
>>>  	const struct split_commit_graph_opts *split_opts;
>>> +	uint32_t total_bloom_filter_data_size;
>> 
>> This is total size of Bloom filters data, in bytes, that will later be
>> used for BDAT chunk size.  However the commit-graph format uses 8 bytes
>> for byte-offset, not 4 bytes.  Why it is uint32_t and not uint64_t then?
>
> Changed to size_t. Thanks for noticing! 

Right, this is a local value (size_t may be different size on different
architectures), even though it will be stored indirectly in chunk lookup
table as pair of uint64_t offsets.

>>>  };
>>>  
>>>  static void write_graph_chunk_fanout(struct hashfile *f,
>>> @@ -1140,6 +1143,28 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
>>>  	stop_progress(&ctx->progress);
>>>  }
>>>  
>>> +static void compute_bloom_filters(struct write_commit_graph_context *ctx)
>>> +{
>>> +	int i;
>>> +	struct progress *progress = NULL;
>>> +
>>> +	load_bloom_filters();
>>> +
>>> +	if (ctx->report_progress)
>>> +		progress = start_progress(
>>> +			_("Computing commit diff Bloom filters"),
>>> +			ctx->commits.nr);
>>> +
>> 
>> Shouldn't we initialize ctx->total_bloom_filter_data_size to 0 here?  We
>> cannot use compute_bloom_filters() to _update_ Bloom filters data, I
>> think -- we don't distinguish here between new and existing data (where
>> existing data size is already included in total Bloom filters size).  At
>> least I don't think so.
>> 
>
> This line in commit-graph.c takes care of reinitializing the graph context and
> by consequence the bloom filter data size.
>
>   ctx = xcalloc(1, sizeof(struct write_commit_graph_context));
>   
> So the total size gets recalculated every time, which is correct. 

True, I have missed this.

Best,
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 3c4d411326..724bfcffc4 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -16,6 +16,7 @@ 
 #include "hashmap.h"
 #include "replace-object.h"
 #include "progress.h"
+#include "bloom.h"
 
 #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
 #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
@@ -795,9 +796,11 @@  struct write_commit_graph_context {
 	unsigned append:1,
 		 report_progress:1,
 		 split:1,
-		 check_oids:1;
+		 check_oids:1,
+		 changed_paths:1;
 
 	const struct split_commit_graph_opts *split_opts;
+	uint32_t total_bloom_filter_data_size;
 };
 
 static void write_graph_chunk_fanout(struct hashfile *f,
@@ -1140,6 +1143,28 @@  static void compute_generation_numbers(struct write_commit_graph_context *ctx)
 	stop_progress(&ctx->progress);
 }
 
+static void compute_bloom_filters(struct write_commit_graph_context *ctx)
+{
+	int i;
+	struct progress *progress = NULL;
+
+	load_bloom_filters();
+
+	if (ctx->report_progress)
+		progress = start_progress(
+			_("Computing commit diff Bloom filters"),
+			ctx->commits.nr);
+
+	for (i = 0; i < ctx->commits.nr; i++) {
+		struct commit *c = ctx->commits.list[i];
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
+		ctx->total_bloom_filter_data_size += sizeof(uint64_t) * filter->len;
+		display_progress(progress, i + 1);
+	}
+
+	stop_progress(&progress);
+}
+
 static int add_ref_to_list(const char *refname,
 			   const struct object_id *oid,
 			   int flags, void *cb_data)
@@ -1794,6 +1819,8 @@  int write_commit_graph(const char *obj_dir,
 	ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
 	ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
 	ctx->split_opts = split_opts;
+	ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
+	ctx->total_bloom_filter_data_size = 0;
 
 	if (ctx->split) {
 		struct commit_graph *g;
@@ -1888,6 +1915,9 @@  int write_commit_graph(const char *obj_dir,
 
 	compute_generation_numbers(ctx);
 
+	if (ctx->changed_paths)
+		compute_bloom_filters(ctx);
+
 	res = write_commit_graph_file(ctx);
 
 	if (ctx->split)
diff --git a/commit-graph.h b/commit-graph.h
index 7f5c933fa2..952a4b83be 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -76,7 +76,8 @@  enum commit_graph_write_flags {
 	COMMIT_GRAPH_WRITE_PROGRESS   = (1 << 1),
 	COMMIT_GRAPH_WRITE_SPLIT      = (1 << 2),
 	/* Make sure that each OID in the input is a valid commit OID. */
-	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3)
+	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
+	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4)
 };
 
 struct split_commit_graph_opts {