diff mbox series

[v4,11/13] pack-bitmap.c: keep track of each layer's type bitmaps

Message ID a29f4ee60d519318d36a8d3c812b4bad039b891e.1741983492.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series midx: incremental multi-pack indexes, part two | expand

Commit Message

Taylor Blau March 14, 2025, 8:18 p.m. UTC
Prepare for reading the type-level bitmaps from previous bitmap layers
by maintaining an array for each type, where each element in that type's
array corresponds to one layer's bitmap for that type.

These fields will be used in a later commit to instantiate the 'struct
ewah_or_iterator' for each type.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 pack-bitmap.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

Comments

Jeff King March 18, 2025, 2:01 a.m. UTC | #1
On Fri, Mar 14, 2025 at 04:18:53PM -0400, Taylor Blau wrote:

> @@ -81,6 +81,24 @@ struct bitmap_index {
>  	struct ewah_bitmap *blobs;
>  	struct ewah_bitmap *tags;
>  
> +	/*
> +	 * Type index arrays when this bitmap is associated with an
> +	 * incremental multi-pack index chain.
> +	 *
> +	 * If n is the number of unique layers in the MIDX chain, then
> +	 * commits_all[n-1] is this structs 'commits' field,
> +	 * commits_all[n-2] is the commits field of this bitmap's
> +	 * 'base', and so on.
> +	 *
> +	 * When either associated either with a non-incremental MIDX, or
> +	 * a single packfile, these arrays each contain a single
> +	 * element.
> +	 */
> +	struct ewah_bitmap **commits_all;
> +	struct ewah_bitmap **trees_all;
> +	struct ewah_bitmap **blobs_all;
> +	struct ewah_bitmap **tags_all;

OK, so these are valid only for the top-level of the chain? I guess
there would not be much point in having the lower levels know about
their incremental versions.

> -static int load_bitmap(struct repository *r, struct bitmap_index *bitmap_git)
> +static void load_all_type_bitmaps(struct bitmap_index *bitmap_git)
> +{
> +	struct bitmap_index *curr = bitmap_git;
> +	size_t i = bitmap_git->base_nr;
> +
> +	ALLOC_ARRAY(bitmap_git->commits_all, bitmap_git->base_nr + 1);
> +	ALLOC_ARRAY(bitmap_git->trees_all, bitmap_git->base_nr + 1);
> +	ALLOC_ARRAY(bitmap_git->blobs_all, bitmap_git->base_nr + 1);
> +	ALLOC_ARRAY(bitmap_git->tags_all, bitmap_git->base_nr + 1);
> +
> +	while (curr) {
> +		bitmap_git->commits_all[i] = curr->commits;
> +		bitmap_git->trees_all[i] = curr->trees;
> +		bitmap_git->blobs_all[i] = curr->blobs;
> +		bitmap_git->tags_all[i] = curr->tags;
> +
> +		curr = curr->base;
> +		if (curr && !i)
> +			BUG("unexpected number of bitmap layers, expected %"PRIu32,
> +			    bitmap_git->base_nr + 1);
> +		i -= 1;
> +	}
> +}

It looks like we always allocate these. For the non-incremental case, I
think you could just do:

  bitmap_git->commits_all = &bitmap_git->commits;

and so forth. But I doubt that micro-optimization really matters, and it
introduces complications when you have to decide whether to free them or
not.

(And if you really cared about micro-optimizing, probably trying to
prevent the extra pointer-chase in the first place would be a more
productive path).

-Peff
Elijah Newren March 18, 2025, 6:43 a.m. UTC | #2
On Fri, Mar 14, 2025 at 1:18 PM Taylor Blau <me@ttaylorr.com> wrote:
>
> Prepare for reading the type-level bitmaps from previous bitmap layers
> by maintaining an array for each type, where each element in that type's
> array corresponds to one layer's bitmap for that type.
>
> These fields will be used in a later commit to instantiate the 'struct
> ewah_or_iterator' for each type.
>

All I spotted was some possible wording fixups...

> +        *
> +        * When either associated either with a non-incremental MIDX, or
> +        * a single packfile, these arrays each contain a single
> +        * element.
> +        */

Drop the first "either", and the first comma?
Taylor Blau March 19, 2025, 12:38 a.m. UTC | #3
On Mon, Mar 17, 2025 at 10:01:13PM -0400, Jeff King wrote:
> On Fri, Mar 14, 2025 at 04:18:53PM -0400, Taylor Blau wrote:
>
> > @@ -81,6 +81,24 @@ struct bitmap_index {
> >  	struct ewah_bitmap *blobs;
> >  	struct ewah_bitmap *tags;
> >
> > +	/*
> > +	 * Type index arrays when this bitmap is associated with an
> > +	 * incremental multi-pack index chain.
> > +	 *
> > +	 * If n is the number of unique layers in the MIDX chain, then
> > +	 * commits_all[n-1] is this structs 'commits' field,
> > +	 * commits_all[n-2] is the commits field of this bitmap's
> > +	 * 'base', and so on.
> > +	 *
> > +	 * When either associated either with a non-incremental MIDX, or
> > +	 * a single packfile, these arrays each contain a single
> > +	 * element.
> > +	 */
> > +	struct ewah_bitmap **commits_all;
> > +	struct ewah_bitmap **trees_all;
> > +	struct ewah_bitmap **blobs_all;
> > +	struct ewah_bitmap **tags_all;
>
> OK, so these are valid only for the top-level of the chain? I guess
> there would not be much point in having the lower levels know about
> their incremental versions.

Right; all of the "useful" computation like counting, traversing,
filtering, etc. is all done at the top-most layer in any chain, so these
have no meaning/purpose for lower layers.

> > -static int load_bitmap(struct repository *r, struct bitmap_index *bitmap_git)
> > +static void load_all_type_bitmaps(struct bitmap_index *bitmap_git)
> > +{
> > +	struct bitmap_index *curr = bitmap_git;
> > +	size_t i = bitmap_git->base_nr;
> > +
> > +	ALLOC_ARRAY(bitmap_git->commits_all, bitmap_git->base_nr + 1);
> > +	ALLOC_ARRAY(bitmap_git->trees_all, bitmap_git->base_nr + 1);
> > +	ALLOC_ARRAY(bitmap_git->blobs_all, bitmap_git->base_nr + 1);
> > +	ALLOC_ARRAY(bitmap_git->tags_all, bitmap_git->base_nr + 1);
> > +
> > +	while (curr) {
> > +		bitmap_git->commits_all[i] = curr->commits;
> > +		bitmap_git->trees_all[i] = curr->trees;
> > +		bitmap_git->blobs_all[i] = curr->blobs;
> > +		bitmap_git->tags_all[i] = curr->tags;
> > +
> > +		curr = curr->base;
> > +		if (curr && !i)
> > +			BUG("unexpected number of bitmap layers, expected %"PRIu32,
> > +			    bitmap_git->base_nr + 1);
> > +		i -= 1;
> > +	}
> > +}
>
> It looks like we always allocate these. For the non-incremental case, I
> think you could just do:
>
>   bitmap_git->commits_all = &bitmap_git->commits;
>
> and so forth. But I doubt that micro-optimization really matters, and it
> introduces complications when you have to decide whether to free them or
> not.

The complications aren't actually too bad... I think you just have to
avoid free()-ing them when you have a non-NULL 'base' pointer. I think
it would look something like:

--- 8< ---
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 2270a646f6..8a530fa7d8 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -604,6 +604,15 @@ static void load_all_type_bitmaps(struct bitmap_index *bitmap_git)
 	struct bitmap_index *curr = bitmap_git;
 	size_t i = bitmap_git->base_nr;

+	if (!bitmap_git->base) {
+		bitmap_git->commits_all = &bitmap_git->commits;
+		bitmap_git->trees_all = &bitmap_git->trees;
+		bitmap_git->blobs_all = &bitmap_git->blobs;
+		bitmap_git->tags_all = &bitmap_git->tags;
+
+		return;
+	}
+
 	ALLOC_ARRAY(bitmap_git->commits_all, bitmap_git->base_nr + 1);
 	ALLOC_ARRAY(bitmap_git->trees_all, bitmap_git->base_nr + 1);
 	ALLOC_ARRAY(bitmap_git->blobs_all, bitmap_git->base_nr + 1);
@@ -3031,10 +3040,12 @@ void free_bitmap_index(struct bitmap_index *b)
 	ewah_pool_free(b->trees);
 	ewah_pool_free(b->blobs);
 	ewah_pool_free(b->tags);
-	free(b->commits_all);
-	free(b->trees_all);
-	free(b->blobs_all);
-	free(b->tags_all);
+	if (b->base) {
+		free(b->commits_all);
+		free(b->trees_all);
+		free(b->blobs_all);
+		free(b->tags_all);
+	}
 	if (b->bitmaps) {
 		struct stored_bitmap *sb;
 		kh_foreach_value(b->bitmaps, sb, {
--- >8 ---

, but it leaves a bad taste in my mouth tying the NULL-ness of
'bitmap_git->base' to the allocation/freeing of these arrays. Maybe I'm
being paranoid, but it feels like a potential landmine.

> (And if you really cared about micro-optimizing, probably trying to
> prevent the extra pointer-chase in the first place would be a more
> productive path).

Yup.

Thanks,
Taylor
Taylor Blau March 19, 2025, 12:39 a.m. UTC | #4
On Mon, Mar 17, 2025 at 11:43:47PM -0700, Elijah Newren wrote:
> > +        *
> > +        * When either associated either with a non-incremental MIDX, or
> > +        * a single packfile, these arrays each contain a single
> > +        * element.
> > +        */
>
> Drop the first "either", and the first comma?

Good catch, thanks!

Thanks,
Taylor
diff mbox series

Patch

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 00acf5ec73..3517972892 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -81,6 +81,24 @@  struct bitmap_index {
 	struct ewah_bitmap *blobs;
 	struct ewah_bitmap *tags;
 
+	/*
+	 * Type index arrays when this bitmap is associated with an
+	 * incremental multi-pack index chain.
+	 *
+	 * If n is the number of unique layers in the MIDX chain, then
+	 * commits_all[n-1] is this structs 'commits' field,
+	 * commits_all[n-2] is the commits field of this bitmap's
+	 * 'base', and so on.
+	 *
+	 * When either associated either with a non-incremental MIDX, or
+	 * a single packfile, these arrays each contain a single
+	 * element.
+	 */
+	struct ewah_bitmap **commits_all;
+	struct ewah_bitmap **trees_all;
+	struct ewah_bitmap **blobs_all;
+	struct ewah_bitmap **tags_all;
+
 	/* Map from object ID -> `stored_bitmap` for all the bitmapped commits */
 	kh_oid_map_t *bitmaps;
 
@@ -581,7 +599,32 @@  static int load_reverse_index(struct repository *r, struct bitmap_index *bitmap_
 	return load_pack_revindex(r, bitmap_git->pack);
 }
 
-static int load_bitmap(struct repository *r, struct bitmap_index *bitmap_git)
+static void load_all_type_bitmaps(struct bitmap_index *bitmap_git)
+{
+	struct bitmap_index *curr = bitmap_git;
+	size_t i = bitmap_git->base_nr;
+
+	ALLOC_ARRAY(bitmap_git->commits_all, bitmap_git->base_nr + 1);
+	ALLOC_ARRAY(bitmap_git->trees_all, bitmap_git->base_nr + 1);
+	ALLOC_ARRAY(bitmap_git->blobs_all, bitmap_git->base_nr + 1);
+	ALLOC_ARRAY(bitmap_git->tags_all, bitmap_git->base_nr + 1);
+
+	while (curr) {
+		bitmap_git->commits_all[i] = curr->commits;
+		bitmap_git->trees_all[i] = curr->trees;
+		bitmap_git->blobs_all[i] = curr->blobs;
+		bitmap_git->tags_all[i] = curr->tags;
+
+		curr = curr->base;
+		if (curr && !i)
+			BUG("unexpected number of bitmap layers, expected %"PRIu32,
+			    bitmap_git->base_nr + 1);
+		i -= 1;
+	}
+}
+
+static int load_bitmap(struct repository *r, struct bitmap_index *bitmap_git,
+		       int recursing)
 {
 	assert(bitmap_git->map);
 
@@ -603,10 +646,13 @@  static int load_bitmap(struct repository *r, struct bitmap_index *bitmap_git)
 	if (bitmap_git->base) {
 		if (!bitmap_is_midx(bitmap_git))
 			BUG("non-MIDX bitmap has non-NULL base bitmap index");
-		if (load_bitmap(r, bitmap_git->base) < 0)
+		if (load_bitmap(r, bitmap_git->base, 1) < 0)
 			goto failed;
 	}
 
+	if (!recursing)
+		load_all_type_bitmaps(bitmap_git);
+
 	return 0;
 
 failed:
@@ -682,7 +728,7 @@  struct bitmap_index *prepare_bitmap_git(struct repository *r)
 {
 	struct bitmap_index *bitmap_git = xcalloc(1, sizeof(*bitmap_git));
 
-	if (!open_bitmap(r, bitmap_git) && !load_bitmap(r, bitmap_git))
+	if (!open_bitmap(r, bitmap_git) && !load_bitmap(r, bitmap_git, 0))
 		return bitmap_git;
 
 	free_bitmap_index(bitmap_git);
@@ -2050,7 +2096,7 @@  struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 	 * from disk. this is the point of no return; after this the rev_list
 	 * becomes invalidated and we must perform the revwalk through bitmaps
 	 */
-	if (load_bitmap(revs->repo, bitmap_git) < 0)
+	if (load_bitmap(revs->repo, bitmap_git, 0) < 0)
 		goto cleanup;
 
 	if (!use_boundary_traversal)
@@ -2983,6 +3029,10 @@  void free_bitmap_index(struct bitmap_index *b)
 	ewah_pool_free(b->trees);
 	ewah_pool_free(b->blobs);
 	ewah_pool_free(b->tags);
+	free(b->commits_all);
+	free(b->trees_all);
+	free(b->blobs_all);
+	free(b->tags_all);
 	if (b->bitmaps) {
 		struct stored_bitmap *sb;
 		kh_foreach_value(b->bitmaps, sb, {