diff mbox series

[5/9] commit-graph: write changed path bloom filters to commit-graph file.

Message ID 7648021072ca11153ac65c90f0ebed5973f20e1a.1576879520.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changed Paths Bloom Filters | expand

Commit Message

Derrick Stolee via GitGitGadget Dec. 20, 2019, 10:05 p.m. UTC
From: Garima Singh <garima.singh@microsoft.com>

Write bloom filters to the commit-graph using the format described in
Documentation/technical/commit-graph-format.txt

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 commit-graph.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-
 commit-graph.h |  5 ++++
 2 files changed, 85 insertions(+), 1 deletion(-)

Comments

Jakub Narębski Jan. 7, 2020, 4:01 p.m. UTC | #1
"Garima Singh via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Garima Singh <garima.singh@microsoft.com>
>
> Write bloom filters to the commit-graph using the format described in
> Documentation/technical/commit-graph-format.txt
>
> Helped-by: Derrick Stolee <dstolee@microsoft.com>
> Signed-off-by: Garima Singh <garima.singh@microsoft.com>

Looks good to me.

> ---
>  commit-graph.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  commit-graph.h |  5 ++++
>  2 files changed, 85 insertions(+), 1 deletion(-)
>
> diff --git a/commit-graph.c b/commit-graph.c
> index 8c4941eeaa..def2ade166 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -24,7 +24,9 @@
>  #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
>  #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
>  #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
> -#define MAX_NUM_CHUNKS 5
> +#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
> +#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
> +#define MAX_NUM_CHUNKS 7

Very minor nitpick: shouldn't we follow the order in the
commit-graph-format.txt document (i.e. "BASE" as last chunk and last
preprocessor constant)?

>  
>  #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
>  
> @@ -282,6 +284,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
>  				chunk_repeated = 1;
>  			else
>  				graph->chunk_base_graphs = data + chunk_offset;
> +			break;
> +
> +		case GRAPH_CHUNKID_BLOOMINDEXES:
> +			if (graph->chunk_bloom_indexes)
> +				chunk_repeated = 1;
> +			else
> +				graph->chunk_bloom_indexes = data + chunk_offset;
> +			break;

All right.

> +
> +		case GRAPH_CHUNKID_BLOOMDATA:
> +			if (graph->chunk_bloom_data)
> +				chunk_repeated = 1;
> +			else {
> +				uint32_t hash_version;
> +				graph->chunk_bloom_data = data + chunk_offset;
> +				hash_version = get_be32(data + chunk_offset);

All right, now I see why all those header values for BDAT chunk are
defined to be 32-bit integers.  For code simplicity.

> +
> +				if (hash_version != 1)
> +					break;

What does it mean for Git?  Behave as if there were no Bloom filter
data?

> +
> +				graph->settings = xmalloc(sizeof(struct bloom_filter_settings));
> +				graph->settings->hash_version = hash_version;
> +				graph->settings->num_hashes = get_be32(data + chunk_offset + 4);
> +				graph->settings->bits_per_entry = get_be32(data + chunk_offset + 8);

All right, looks O.K.

> +			}
> +			break;
>  		}
>  
>  		if (chunk_repeated) {
> @@ -996,6 +1024,39 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
>  	}
>  }
>  
> +static void write_graph_chunk_bloom_indexes(struct hashfile *f,
> +					    struct write_commit_graph_context *ctx)
> +{
> +	struct commit **list = ctx->commits.list;
> +	struct commit **last = ctx->commits.list + ctx->commits.nr;
> +	uint32_t cur_pos = 0;
> +
> +	while (list < last) {
> +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
> +		cur_pos += filter->len;
> +		hashwrite_be32(f, cur_pos);
> +		list++;
> +	}

Why not follow the write_graph_chunk_oids() example, instead of
write_graph_chunk_data(), that is use simply:

  +	struct commit **list = ctx->commits.list;
  +	uint32_t cur_pos = 0;
  +
  +	for (count = 0; count < ctx->commits.nr; count++, list++) {
  +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
  +		cur_pos += filter->len;
  +		hashwrite_be32(f, cur_pos);
  +	}

I guess using here

  +		cur_pos += get_bloom_filter(ctx->r, *list)->len;

would be too cryptic, and hard to debug?

Also, wouldn't we need

  +		display_progress(ctx->progress, ++ctx->progress_cnt);

before hashwrite_be32()?

> +}
> +
> +static void write_graph_chunk_bloom_data(struct hashfile *f,
> +					 struct write_commit_graph_context *ctx,
> +					 struct bloom_filter_settings *settings)
> +{
> +	struct commit **first = ctx->commits.list;

Even if we decide to use `while` loop, like write_graph_chunk_data(),
and not `for` loop, like write_graph_chunk_oids(), why the change from
`struct commit **list = ...` to `struct commit **first = ...`?

> +	struct commit **last = ctx->commits.list + ctx->commits.nr;
> +
> +	hashwrite_be32(f, settings->hash_version);
> +	hashwrite_be32(f, settings->num_hashes);
> +	hashwrite_be32(f, settings->bits_per_entry);

All right, simple.

> +
> +	while (first < last) {
> +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *first);

Hmmm... wouldn't this compute Bloom filter second time?
get_bloom_filter() does work unconditionally.

Wouldn't

  +		struct bloom_filter *filter = bloom_filter_slab_at(&bloom_filters, *first);

be enough?

Or make get_bloom_filter() use *_peek() to check if Bloom filter for
given commit was already computed, and only if it returns NULL do the
work.

> +		hashwrite(f, filter->data, filter->len * sizeof(uint64_t));

Might need display_progress() before hashwrote().

> +		first++;
> +	}
> +}
> +
>  static int oid_compare(const void *_a, const void *_b)
>  {
>  	const struct object_id *a = (const struct object_id *)_a;
> @@ -1388,6 +1449,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  	struct strbuf progress_title = STRBUF_INIT;
>  	int num_chunks = 3;
>  	struct object_id file_hash;
> +	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
>  
>  	if (ctx->split) {
>  		struct strbuf tmp_file = STRBUF_INIT;
> @@ -1432,6 +1494,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
>  		num_chunks++;
>  	}
> +	if (ctx->bloom) {
> +		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
> +		num_chunks++;
> +		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
> +		num_chunks++;
> +	}

Looks all right.

>  	if (ctx->num_commit_graphs_after > 1) {
>  		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
>  		num_chunks++;
> @@ -1450,6 +1518,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  						4 * ctx->num_extra_edges;
>  		num_chunks++;
>  	}
> +	if (ctx->bloom) {
> +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * ctx->commits.nr;
> +		num_chunks++;
> +
> +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size;
> +		num_chunks++;
> +	}

Better wrap those long lines, like above:

  +	if (ctx->bloom) {
  +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
  +						sizeof(uint32_t) * ctx->commits.nr;
  +		num_chunks++;
  +
  +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
  +						sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size;
  +		num_chunks++;
  +	}

>  	if (ctx->num_commit_graphs_after > 1) {
>  		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
>  						hashsz * (ctx->num_commit_graphs_after - 1);
> @@ -1487,6 +1562,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  	write_graph_chunk_data(f, hashsz, ctx);
>  	if (ctx->num_extra_edges)
>  		write_graph_chunk_extra_edges(f, ctx);
> +	if (ctx->bloom) {
> +		write_graph_chunk_bloom_indexes(f, ctx);
> +		write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
> +	}

All right.

>  	if (ctx->num_commit_graphs_after > 1 &&
>  	    write_graph_chunk_base(f, ctx)) {
>  		return -1;
> diff --git a/commit-graph.h b/commit-graph.h
> index 952a4b83be..2202ad91ae 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -10,6 +10,7 @@
>  #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
>  
>  struct commit;
> +struct bloom_filter_settings;
>  
>  char *get_commit_graph_filename(const char *obj_dir);
>  int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
> @@ -58,6 +59,10 @@ struct commit_graph {
>  	const unsigned char *chunk_commit_data;
>  	const unsigned char *chunk_extra_edges;
>  	const unsigned char *chunk_base_graphs;
> +	const unsigned char *chunk_bloom_indexes;
> +	const unsigned char *chunk_bloom_data;
> +
> +	struct bloom_filter_settings *settings;

Should this be part of `struct commit_graph`?  Shouldn't we free() this
data, or is it a pointer into xmmap-ped file... no it isn't -- we
xalloc() it, so we should free() it.

I think it should be done in 'cleanup:' section of write_commit_graph(),
but I am not entirely sure.

>  };
>  
>  struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st);

Best,
Garima Singh Jan. 14, 2020, 3:14 p.m. UTC | #2
On 1/7/2020 11:01 AM, Jakub Narebski wrote:
> "Garima Singh via GitGitGadget" <gitgitgadget@gmail.com> writes:
>> +
>> +				if (hash_version != 1)
>> +					break;
> 
> What does it mean for Git?  Behave as if there were no Bloom filter
> data?
>

Yes. We choose to use Bloom filters best effort, and in such cases, we will 
just fall back to the original code path. 

>>  	if (ctx->num_commit_graphs_after > 1 &&
>>  	    write_graph_chunk_base(f, ctx)) {
>>  		return -1;
>> diff --git a/commit-graph.h b/commit-graph.h
>> index 952a4b83be..2202ad91ae 100644
>> --- a/commit-graph.h
>> +++ b/commit-graph.h
>> @@ -10,6 +10,7 @@
>>  #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
>>  
>>  struct commit;
>> +struct bloom_filter_settings;
>>  
>>  char *get_commit_graph_filename(const char *obj_dir);
>>  int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
>> @@ -58,6 +59,10 @@ struct commit_graph {
>>  	const unsigned char *chunk_commit_data;
>>  	const unsigned char *chunk_extra_edges;
>>  	const unsigned char *chunk_base_graphs;
>> +	const unsigned char *chunk_bloom_indexes;
>> +	const unsigned char *chunk_bloom_data;
>> +
>> +	struct bloom_filter_settings *settings;
> 
> Should this be part of `struct commit_graph`?  Shouldn't we free() this
> data, or is it a pointer into xmmap-ped file... no it isn't -- we
> xalloc() it, so we should free() it.
> 
> I think it should be done in 'cleanup:' section of write_commit_graph(),
> but I am not entirely sure.
> 

Thanks for calling this out! This is definitely a bug in how commit-graph.c
frees up the graph. The right way to free the graph would be to call
free_commit_graph() instead of free(graph) like many places in that file. 

Cleaning up this entire pattern would be orthogonal to this series, so 
I will follow up with a separate series that cleans it up overall. 

For now, I will free up `bloom_filter_settings` in free_commit_graph(). 

Cheers! 
Garima Singh
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 8c4941eeaa..def2ade166 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -24,7 +24,9 @@ 
 #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
 #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
 #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
-#define MAX_NUM_CHUNKS 5
+#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
+#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
+#define MAX_NUM_CHUNKS 7
 
 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
 
@@ -282,6 +284,32 @@  struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 				chunk_repeated = 1;
 			else
 				graph->chunk_base_graphs = data + chunk_offset;
+			break;
+
+		case GRAPH_CHUNKID_BLOOMINDEXES:
+			if (graph->chunk_bloom_indexes)
+				chunk_repeated = 1;
+			else
+				graph->chunk_bloom_indexes = data + chunk_offset;
+			break;
+
+		case GRAPH_CHUNKID_BLOOMDATA:
+			if (graph->chunk_bloom_data)
+				chunk_repeated = 1;
+			else {
+				uint32_t hash_version;
+				graph->chunk_bloom_data = data + chunk_offset;
+				hash_version = get_be32(data + chunk_offset);
+
+				if (hash_version != 1)
+					break;
+
+				graph->settings = xmalloc(sizeof(struct bloom_filter_settings));
+				graph->settings->hash_version = hash_version;
+				graph->settings->num_hashes = get_be32(data + chunk_offset + 4);
+				graph->settings->bits_per_entry = get_be32(data + chunk_offset + 8);
+			}
+			break;
 		}
 
 		if (chunk_repeated) {
@@ -996,6 +1024,39 @@  static void write_graph_chunk_extra_edges(struct hashfile *f,
 	}
 }
 
+static void write_graph_chunk_bloom_indexes(struct hashfile *f,
+					    struct write_commit_graph_context *ctx)
+{
+	struct commit **list = ctx->commits.list;
+	struct commit **last = ctx->commits.list + ctx->commits.nr;
+	uint32_t cur_pos = 0;
+
+	while (list < last) {
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+		cur_pos += filter->len;
+		hashwrite_be32(f, cur_pos);
+		list++;
+	}
+}
+
+static void write_graph_chunk_bloom_data(struct hashfile *f,
+					 struct write_commit_graph_context *ctx,
+					 struct bloom_filter_settings *settings)
+{
+	struct commit **first = ctx->commits.list;
+	struct commit **last = ctx->commits.list + ctx->commits.nr;
+
+	hashwrite_be32(f, settings->hash_version);
+	hashwrite_be32(f, settings->num_hashes);
+	hashwrite_be32(f, settings->bits_per_entry);
+
+	while (first < last) {
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *first);
+		hashwrite(f, filter->data, filter->len * sizeof(uint64_t));
+		first++;
+	}
+}
+
 static int oid_compare(const void *_a, const void *_b)
 {
 	const struct object_id *a = (const struct object_id *)_a;
@@ -1388,6 +1449,7 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
 	struct object_id file_hash;
+	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
 	if (ctx->split) {
 		struct strbuf tmp_file = STRBUF_INIT;
@@ -1432,6 +1494,12 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
 		num_chunks++;
 	}
+	if (ctx->bloom) {
+		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+		num_chunks++;
+		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+		num_chunks++;
+	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
 		num_chunks++;
@@ -1450,6 +1518,13 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 						4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
+	if (ctx->bloom) {
+		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * ctx->commits.nr;
+		num_chunks++;
+
+		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size;
+		num_chunks++;
+	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
 						hashsz * (ctx->num_commit_graphs_after - 1);
@@ -1487,6 +1562,10 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	write_graph_chunk_data(f, hashsz, ctx);
 	if (ctx->num_extra_edges)
 		write_graph_chunk_extra_edges(f, ctx);
+	if (ctx->bloom) {
+		write_graph_chunk_bloom_indexes(f, ctx);
+		write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+	}
 	if (ctx->num_commit_graphs_after > 1 &&
 	    write_graph_chunk_base(f, ctx)) {
 		return -1;
diff --git a/commit-graph.h b/commit-graph.h
index 952a4b83be..2202ad91ae 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -10,6 +10,7 @@ 
 #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
 
 struct commit;
+struct bloom_filter_settings;
 
 char *get_commit_graph_filename(const char *obj_dir);
 int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
@@ -58,6 +59,10 @@  struct commit_graph {
 	const unsigned char *chunk_commit_data;
 	const unsigned char *chunk_extra_edges;
 	const unsigned char *chunk_base_graphs;
+	const unsigned char *chunk_bloom_indexes;
+	const unsigned char *chunk_bloom_data;
+
+	struct bloom_filter_settings *settings;
 };
 
 struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st);