diff mbox series

[v4,05/10] commit-graph: add a slab to store topological levels

Message ID e067f653ad5d474eee5f40c13bb02fde26ebdb9b.1602079786.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Implement Corrected Commit Date | expand

Commit Message

Koji Nakamaru via GitGitGadget Oct. 7, 2020, 2:09 p.m. UTC
From: Abhishek Kumar <abhishekkumar8222@gmail.com>

In a later commit we will introduce corrected commit date as the
generation number v2. This value will be stored in the new seperate
Generation Data chunk. However, to ensure backwards compatibility with
"Old" Git we need to continue to write generation number v1, which is
topological level, to the commit data chunk. This means that we need to
compute both versions of generation numbers when writing the
commit-graph file. Therefore, let's introduce a commit-slab to store
topological levels; corrected commit date will be stored in the member
`generation` of struct commit_graph_data.

When Git creates a split commit-graph, it takes advantage of the
generation values that have been computed already and present in
existing commit-graph files.

So, let's add a pointer to struct commit_graph as well as struct
write_commit_graph_context to the topological level commit-slab
and populate it with topological levels while writing a commit-graph
file.

Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
---
 commit-graph.c | 47 ++++++++++++++++++++++++++++++++---------------
 commit-graph.h |  1 +
 2 files changed, 33 insertions(+), 15 deletions(-)

Comments

Jakub Narębski Oct. 25, 2020, 10:17 p.m. UTC | #1
"Abhishek Kumar via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Abhishek Kumar <abhishekkumar8222@gmail.com>
>
> In a later commit we will introduce corrected commit date as the
> generation number v2. This value will be stored in the new seperate
> Generation Data chunk. However, to ensure backwards compatibility with
> "Old" Git we need to continue to write generation number v1, which is
> topological level, to the commit data chunk. This means that we need to
> compute both versions of generation numbers when writing the
> commit-graph file. Therefore, let's introduce a commit-slab to store
> topological levels; corrected commit date will be stored in the member
> `generation` of struct commit_graph_data.
>
> When Git creates a split commit-graph, it takes advantage of the
> generation values that have been computed already and present in
> existing commit-graph files.
>
> So, let's add a pointer to struct commit_graph as well as struct
> write_commit_graph_context to the topological level commit-slab
> and populate it with topological levels while writing a commit-graph
> file.

I think you meant here "add a pointer in `struct commit_graph` as well
as in `struct write_commit_graph_context`...".

Perhaps we should add the information that it is done that way to be
able to allocate topo_level_slab only when needed, in the
write_commit_graph(), and adding new member to those struct is required
to pass it through the call chain (modifying `struct commit_graph` is
needed for fill_commit_graph_info()).  But that might be too much detail
to put in the commit message.

>
> Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
> ---
>  commit-graph.c | 47 ++++++++++++++++++++++++++++++++---------------
>  commit-graph.h |  1 +
>  2 files changed, 33 insertions(+), 15 deletions(-)
>

Let me reorder those files for easier review.

> diff --git a/commit-graph.h b/commit-graph.h
> index 8be247fa35..2e9aa7824e 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -73,6 +73,7 @@ struct commit_graph {
>  	const unsigned char *chunk_bloom_indexes;
>  	const unsigned char *chunk_bloom_data;
>  
> +	struct topo_level_slab *topo_levels;
>  	struct bloom_filter_settings *bloom_filter_settings;
>  };

All right, here we add new member to `struct commit_graph` type.

> diff --git a/commit-graph.c b/commit-graph.c
> index bfc532de6f..cedd311024 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -962,6 +967,7 @@ struct write_commit_graph_context {
>  		 changed_paths:1,
>  		 order_by_pack:1;
>  
> +	struct topo_level_slab *topo_levels;
>  	const struct commit_graph_opts *opts;
>  	size_t total_bloom_filter_data_size;
>  	const struct bloom_filter_settings *bloom_settings;

All right, here we add new member to `struct write_commit_graph_context`
type, which is local to commit-graph.c.

> @@ -64,6 +64,8 @@ void git_test_write_commit_graph_or_die(void)
>  /* Remember to update object flag allocation in object.h */
>  #define REACHABLE       (1u<<15)
>  
> +define_commit_slab(topo_level_slab, uint32_t);
> +

All right, here we define new slab for storing topological levels; this
just defines new type. Note that we do not define any setters and
getters to handle non-zero initialization, like we have for
commit_graph_data_slab.

>  /* Keep track of the order in which commits are added to our list. */
>  define_commit_slab(commit_pos, int);
>  static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
> @@ -768,6 +770,9 @@ static void fill_commit_graph_info(struct commit *item, struct commit_graph *g,
>  	item->date = (timestamp_t)((date_high << 32) | date_low);
>  
>  	graph_data->generation = get_be32(commit_data + g->hash_len + 8) >> 2;
> +
> +	if (g->topo_levels)
> +		*topo_level_slab_at(g->topo_levels, item) = get_be32(commit_data + g->hash_len + 8) >> 2;

I guess using get_be32() is repeated in this newly added part of code
because previous part would be changed to read in generation number v2,
if available, and we won't be then able to use

		*topo_level_slab_at(g->topo_levels, item) = graph_data->generation;

All right, that's smart.


I guess that in fill_commit_graph_info() we don't know if we are reading
commit-graph, when topo levels slab is not present, or whether we are
extending and writing the commit-graph file, when we need to fill it
with current commit-graph data.

The fact that fill_commit_graph_info() takes 'struct commit_graph' also
explains why we need to add pointer to a topo_levels slab to both
structs.

>  }
>  
>  static inline void set_commit_tree(struct commit *c, struct tree *t)
[...]
> @@ -2142,6 +2146,7 @@ int write_commit_graph(struct object_directory *odb,
>  	int res = 0;
>  	int replace = 0;
>  	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
> +	struct topo_level_slab topo_levels;
>  
>  	if (!commit_graph_compatible(the_repository))
>  		return 0;
> @@ -2163,6 +2168,18 @@ int write_commit_graph(struct object_directory *odb,
>  							 bloom_settings.max_changed_paths);
>  	ctx->bloom_settings = &bloom_settings;
>  
> +	init_topo_level_slab(&topo_levels);
> +	ctx->topo_levels = &topo_levels;
> +
> +	if (ctx->r->objects->commit_graph) {
> +		struct commit_graph *g = ctx->r->objects->commit_graph;
> +
> +		while (g) {
> +			g->topo_levels = &topo_levels;
> +			g = g->base_graph;
> +		}
> +	}
> +
>  	if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS)
>  		ctx->changed_paths = 1;
>  	if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {

All right, we need topo_level_slab only for writing the commit-graph, so
we allocate it with init_*_slab() in write_commit_graph(), and set
pointers to it in `struct write_commit_graph_context *ctx` and in
`struct commit_graph` for each layer in the commit graph.  This is
needed to pass it down the call-chain.

Looks good to me.

> @@ -1108,7 +1114,7 @@ static int write_graph_chunk_data(struct hashfile *f,
>  		else
>  			packedDate[0] = 0;
>  
> -		packedDate[0] |= htonl(commit_graph_data_at(*list)->generation << 2);
> +		packedDate[0] |= htonl(*topo_level_slab_at(ctx->topo_levels, *list) << 2);
>

All right, write_graph_chunk_data() is called from write_commit_graph(),
so we know that cxt->topo_levels is not NULL.

>  		packedDate[1] = htonl((*list)->date);
>  		hashwrite(f, packedDate, 8);
> @@ -1350,11 +1356,11 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
>  					_("Computing commit graph generation numbers"),
>  					ctx->commits.nr);
>  	for (i = 0; i < ctx->commits.nr; i++) {
> -		timestamp_t generation = commit_graph_data_at(ctx->commits.list[i])->generation;
> +		timestamp_t level = *topo_level_slab_at(ctx->topo_levels, ctx->commits.list[i]);
>

All right, we know that compute_generation_numbers() is called by the
write_commit_graph(), so we know that cxt->topo_levels is not NULL.

Also, we rename 'generation' to 'level' in preparation for the time when
we would be computing *both* topological level (for backward
compatibility) and corrected committer date (to be used as generation
number v2).  All right.

>  		display_progress(ctx->progress, i + 1);
> -		if (generation != GENERATION_NUMBER_INFINITY &&
> -		    generation != GENERATION_NUMBER_ZERO)
> +		if (level != GENERATION_NUMBER_INFINITY &&
> +		    level != GENERATION_NUMBER_ZERO)
>  			continue;

Same here, the results of renaming of 'generation' local variable to
'level'.

>  
>  		commit_list_insert(ctx->commits.list[i], &list);
> @@ -1362,29 +1368,27 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
>  			struct commit *current = list->item;
>  			struct commit_list *parent;
>  			int all_parents_computed = 1;
> -			uint32_t max_generation = 0;
> +			uint32_t max_level = 0;

Similarly, we rename 'max_generation' to 'max_level'.

>  
>  			for (parent = current->parents; parent; parent = parent->next) {
> -				generation = commit_graph_data_at(parent->item)->generation;
> +				level = *topo_level_slab_at(ctx->topo_levels, parent->item);
>  
> -				if (generation == GENERATION_NUMBER_INFINITY ||
> -				    generation == GENERATION_NUMBER_ZERO) {
> +				if (level == GENERATION_NUMBER_INFINITY ||
> +				    level == GENERATION_NUMBER_ZERO) {
>  					all_parents_computed = 0;
>  					commit_list_insert(parent->item, &list);
>  					break;
> -				} else if (generation > max_generation) {
> -					max_generation = generation;
> +				} else if (level > max_level) {
> +					max_level = level;
>  				}
>  			}

Continuation of those renames.

>  
>  			if (all_parents_computed) {
> -				struct commit_graph_data *data = commit_graph_data_at(current);
> -
> -				data->generation = max_generation + 1;
>  				pop_commit(&list);
>  
> -				if (data->generation > GENERATION_NUMBER_V1_MAX)
> -					data->generation = GENERATION_NUMBER_V1_MAX;
> +				if (max_level > GENERATION_NUMBER_V1_MAX - 1)
> +					max_level = GENERATION_NUMBER_V1_MAX - 1;
> +				*topo_level_slab_at(ctx->topo_levels, current) = max_level + 1;

This is a bit safer way to handle possible overflow: instead of

  final = max_found + 1;            /* set to maximum plus 1 */
  if (final > MAX_POSSIBLE_VALUE)   /* handle overflow */
      final = MAX_POSSIBLE_VALUE;

where we can have problems if MAX_POSSIBLE_VALUE overflows, we use the
following pattern:

  if (max_found > MAX_POSSIBLE_VALUE - 1)  /* handle overflow */
      max_found > MAX_POSSIBLE_VALUE - 1;
  final = max_found + 1;                   /* set to maximum plus 1 */

It is just a bit obscured by renaming variable and switch to using
commit slab.

It is not that important for topological level, where
GENERATION_NUMBER_V1_MAX is smaller than maximum possible value, but it
would be important for generation number v2.

>  			}
>  		}
>  	}

Best,
--
Jakub Narębski
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index bfc532de6f..cedd311024 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -64,6 +64,8 @@  void git_test_write_commit_graph_or_die(void)
 /* Remember to update object flag allocation in object.h */
 #define REACHABLE       (1u<<15)
 
+define_commit_slab(topo_level_slab, uint32_t);
+
 /* Keep track of the order in which commits are added to our list. */
 define_commit_slab(commit_pos, int);
 static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
@@ -768,6 +770,9 @@  static void fill_commit_graph_info(struct commit *item, struct commit_graph *g,
 	item->date = (timestamp_t)((date_high << 32) | date_low);
 
 	graph_data->generation = get_be32(commit_data + g->hash_len + 8) >> 2;
+
+	if (g->topo_levels)
+		*topo_level_slab_at(g->topo_levels, item) = get_be32(commit_data + g->hash_len + 8) >> 2;
 }
 
 static inline void set_commit_tree(struct commit *c, struct tree *t)
@@ -962,6 +967,7 @@  struct write_commit_graph_context {
 		 changed_paths:1,
 		 order_by_pack:1;
 
+	struct topo_level_slab *topo_levels;
 	const struct commit_graph_opts *opts;
 	size_t total_bloom_filter_data_size;
 	const struct bloom_filter_settings *bloom_settings;
@@ -1108,7 +1114,7 @@  static int write_graph_chunk_data(struct hashfile *f,
 		else
 			packedDate[0] = 0;
 
-		packedDate[0] |= htonl(commit_graph_data_at(*list)->generation << 2);
+		packedDate[0] |= htonl(*topo_level_slab_at(ctx->topo_levels, *list) << 2);
 
 		packedDate[1] = htonl((*list)->date);
 		hashwrite(f, packedDate, 8);
@@ -1350,11 +1356,11 @@  static void compute_generation_numbers(struct write_commit_graph_context *ctx)
 					_("Computing commit graph generation numbers"),
 					ctx->commits.nr);
 	for (i = 0; i < ctx->commits.nr; i++) {
-		timestamp_t generation = commit_graph_data_at(ctx->commits.list[i])->generation;
+		timestamp_t level = *topo_level_slab_at(ctx->topo_levels, ctx->commits.list[i]);
 
 		display_progress(ctx->progress, i + 1);
-		if (generation != GENERATION_NUMBER_INFINITY &&
-		    generation != GENERATION_NUMBER_ZERO)
+		if (level != GENERATION_NUMBER_INFINITY &&
+		    level != GENERATION_NUMBER_ZERO)
 			continue;
 
 		commit_list_insert(ctx->commits.list[i], &list);
@@ -1362,29 +1368,27 @@  static void compute_generation_numbers(struct write_commit_graph_context *ctx)
 			struct commit *current = list->item;
 			struct commit_list *parent;
 			int all_parents_computed = 1;
-			uint32_t max_generation = 0;
+			uint32_t max_level = 0;
 
 			for (parent = current->parents; parent; parent = parent->next) {
-				generation = commit_graph_data_at(parent->item)->generation;
+				level = *topo_level_slab_at(ctx->topo_levels, parent->item);
 
-				if (generation == GENERATION_NUMBER_INFINITY ||
-				    generation == GENERATION_NUMBER_ZERO) {
+				if (level == GENERATION_NUMBER_INFINITY ||
+				    level == GENERATION_NUMBER_ZERO) {
 					all_parents_computed = 0;
 					commit_list_insert(parent->item, &list);
 					break;
-				} else if (generation > max_generation) {
-					max_generation = generation;
+				} else if (level > max_level) {
+					max_level = level;
 				}
 			}
 
 			if (all_parents_computed) {
-				struct commit_graph_data *data = commit_graph_data_at(current);
-
-				data->generation = max_generation + 1;
 				pop_commit(&list);
 
-				if (data->generation > GENERATION_NUMBER_V1_MAX)
-					data->generation = GENERATION_NUMBER_V1_MAX;
+				if (max_level > GENERATION_NUMBER_V1_MAX - 1)
+					max_level = GENERATION_NUMBER_V1_MAX - 1;
+				*topo_level_slab_at(ctx->topo_levels, current) = max_level + 1;
 			}
 		}
 	}
@@ -2142,6 +2146,7 @@  int write_commit_graph(struct object_directory *odb,
 	int res = 0;
 	int replace = 0;
 	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+	struct topo_level_slab topo_levels;
 
 	if (!commit_graph_compatible(the_repository))
 		return 0;
@@ -2163,6 +2168,18 @@  int write_commit_graph(struct object_directory *odb,
 							 bloom_settings.max_changed_paths);
 	ctx->bloom_settings = &bloom_settings;
 
+	init_topo_level_slab(&topo_levels);
+	ctx->topo_levels = &topo_levels;
+
+	if (ctx->r->objects->commit_graph) {
+		struct commit_graph *g = ctx->r->objects->commit_graph;
+
+		while (g) {
+			g->topo_levels = &topo_levels;
+			g = g->base_graph;
+		}
+	}
+
 	if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS)
 		ctx->changed_paths = 1;
 	if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {
diff --git a/commit-graph.h b/commit-graph.h
index 8be247fa35..2e9aa7824e 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -73,6 +73,7 @@  struct commit_graph {
 	const unsigned char *chunk_bloom_indexes;
 	const unsigned char *chunk_bloom_data;
 
+	struct topo_level_slab *topo_levels;
 	struct bloom_filter_settings *bloom_filter_settings;
 };