diff mbox series

[3/6] pack-bitmap-write.c: write lookup table extension

Message ID ed91ebf69a84405f16a4390e6fd208251b8ec53e.1655728395.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series bitmap: integrate a lookup table extension to the bitmap format | expand

Commit Message

Abhradeep Chakraborty June 20, 2022, 12:33 p.m. UTC
From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>

Teach git to write bitmap lookup table extension. The table has the
following information:

    - `N` no of Object ids of each bitmapped commits

    - A list of offset, xor-offset pair; the i'th pair denotes the
      offsets and xor-offsets of i'th commit in the previous list.

    - 4-byte integer denoting the flags

Co-authored-by: Taylor Blau <ttaylorr@github.com>
Mentored-by: Taylor Blau <ttaylorr@github.com>
Co-mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
---
 pack-bitmap-write.c | 59 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

Comments

Taylor Blau June 20, 2022, 10:16 p.m. UTC | #1
On Mon, Jun 20, 2022 at 12:33:11PM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
>
> Teach git to write bitmap lookup table extension. The table has the
> following information:
>
>     - `N` no of Object ids of each bitmapped commits

s/no/number, s/Object/object, s/ids/IDs, and s/commits/commit

>     - A list of offset, xor-offset pair; the i'th pair denotes the
>       offsets and xor-offsets of i'th commit in the previous list.

s/pair/pairs

>     - 4-byte integer denoting the flags
>
> Co-authored-by: Taylor Blau <ttaylorr@github.com>
> Mentored-by: Taylor Blau <ttaylorr@github.com>
> Co-mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
> Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
> ---
>  pack-bitmap-write.c | 59 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 57 insertions(+), 2 deletions(-)
>
> diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
> index c43375bd344..9e88a64dd65 100644
> --- a/pack-bitmap-write.c
> +++ b/pack-bitmap-write.c
> @@ -650,7 +650,8 @@ static const struct object_id *oid_access(size_t pos, const void *table)
>
>  static void write_selected_commits_v1(struct hashfile *f,
>  				      struct pack_idx_entry **index,
> -				      uint32_t index_nr)
> +				      uint32_t index_nr,
> +				      off_t *offsets)
>  {
>  	int i;
>
> @@ -663,6 +664,9 @@ static void write_selected_commits_v1(struct hashfile *f,
>  		if (commit_pos < 0)
>  			BUG("trying to write commit not in index");
>
> +		if (offsets)
> +			offsets[i] = hashfile_total(f);
> +

Makes sense; we record the offset for the ith commit as however many
bytes we've already written into the hashfile up to this point, since
the subsequent byte will begin the bitmap (well, the preceding few
bytes of it, anyways) itself.

>  		hashwrite_be32(f, commit_pos);
>  		hashwrite_u8(f, stored->xor_offset);
>  		hashwrite_u8(f, stored->flags);
> @@ -671,6 +675,49 @@ static void write_selected_commits_v1(struct hashfile *f,
>  	}
>  }
>
> +static int table_cmp(const void *_va, const void *_vb)
> +{
> +	return oidcmp(&writer.selected[*(uint32_t*)_va].commit->object.oid,
> +		      &writer.selected[*(uint32_t*)_vb].commit->object.oid);

This implementation looks right to me, but perhaps we should expand it
out from the one-liner here to make it more readable. Perhaps something
like:

    static int table_cmp(const void *_va, const void *_vb)
    {
      struct commit *c1 = &writer.selected[*(uint32_t*)_va];
      struct commit *c2 = &writer.selected[*(uint32_t*)_vb];

      return oidcmp(&c1->object.oid, &c2->object.oid);
    }

which is arguably slightly more readable than the one-liner (but I don't
feel that strongly about it.)

> +static void write_lookup_table(struct hashfile *f,
> +			       off_t *offsets)
> +{
> +	uint32_t i;
> +	uint32_t flags = 0;
> +	uint32_t *table, *table_inv;
> +
> +	ALLOC_ARRAY(table, writer.selected_nr);
> +	ALLOC_ARRAY(table_inv, writer.selected_nr);
> +
> +	for (i = 0; i < writer.selected_nr; i++)
> +		table[i] = i;
> +	QSORT(table, writer.selected_nr, table_cmp);
> +	for (i = 0; i < writer.selected_nr; i++)
> +		table_inv[table[i]] = i;

Right... so table[0] will give us the index into writer.selected of the
commit with the earliest OID in lexicographic order. And table_inv goes
the other way around: table_inv[i] will tell us the lexicographic
position of the commit at writer.selected[i].

> +	for (i = 0; i < writer.selected_nr; i++) {
> +		struct bitmapped_commit *selected = &writer.selected[table[i]];
> +		struct object_id *oid = &selected->commit->object.oid;
> +
> +		hashwrite(f, oid->hash, the_hash_algo->rawsz);
> +	}
> +	for (i = 0; i < writer.selected_nr; i++) {
> +		struct bitmapped_commit *selected = &writer.selected[table[i]];
> +
> +		hashwrite_be32(f, offsets[table[i]]);
> +		hashwrite_be32(f, selected->xor_offset
> +			       ? table_inv[table[i] - selected->xor_offset]

...which we need to discover the position of the XOR'd bitmap. Though
I'm not sure if I remember why `table[i] - selected->xor_offset` is
right and not `i - selected->xor_offset`.

Thanks,
Taylor
Abhradeep Chakraborty June 21, 2022, 12:50 p.m. UTC | #2
Taylor Blau <me@ttaylorr.com> wrote:

> I'm not sure if I remember why `table[i] - selected->xor_offset` is
> right and not `i - selected->xor_offset`.

Even I myself got confused! Before sending the patch to the mailing
list, I was clear about that. That's why I didn't catch the so called
mistake I have been notifying till now. Thanks Taylor for asking
the question!

I should add a comment before the line so that people can understand it.
Let us parse `table_inv[table[i] - selected->xor_offset]` -

Suppose bitmap entries be like - 

Bitmap 0 (for commit 0)
Bitmap 1 (for commit 1)
Bitmap 2 (for commit 2)
Bitmap 3 (for commit 3)
.
.
.
Bitmap 20 (for commit 20)

These bitmaps are ordered by the date of their corresponding commit.
`table` array maps commit's lexicographic order to its bitmap order.
`table_inv` stores the reverse (i.e. it maps bitmap order to lexicographic
order). Say for example, if commit 4 is lexicographically first among all the
Commits then `table[0]` is 4. Similarly `table[1]`=2, table[2]=1 etc.
`table_inv[4]` is 0, table_inv[2]=1 etc.

Now suppose commit 4's bitmap has xor-relation with commit 2's bitmap.
So, xor-offset for bitmap 4 is 2. And `table[0] - selected->xor_offset`
is equal to 4-2 = 2. It is pointing to the commit 2. Now, 2 is in bitmap
Order. We need to convert it into lexicographic order. So, table_inv[2]
gives us the lexicographic order position of commit 2 I.e. 1.

Long story short, there is no issue regarding xor_offset. This xor_offset
is not relative to the current commit. It is absolute.

Sorry for the initial claim :)
Taylor Blau June 22, 2022, 4:51 p.m. UTC | #3
On Tue, Jun 21, 2022 at 06:20:54PM +0530, Abhradeep Chakraborty wrote:
> Taylor Blau <me@ttaylorr.com> wrote:
>
> > I'm not sure if I remember why `table[i] - selected->xor_offset` is
> > right and not `i - selected->xor_offset`.
>
> Even I myself got confused! Before sending the patch to the mailing
> list, I was clear about that. That's why I didn't catch the so called
> mistake I have been notifying till now. Thanks Taylor for asking
> the question!
>
> I should add a comment before the line so that people can understand it.
> Let us parse `table_inv[table[i] - selected->xor_offset]` -
>
> Suppose bitmap entries be like -
>
> Bitmap 0 (for commit 0)
> Bitmap 1 (for commit 1)
> Bitmap 2 (for commit 2)
> Bitmap 3 (for commit 3)
> .
> .
> .
> Bitmap 20 (for commit 20)
>
> These bitmaps are ordered by the date of their corresponding commit.
> `table` array maps commit's lexicographic order to its bitmap order.
> `table_inv` stores the reverse (i.e. it maps bitmap order to lexicographic
> order). Say for example, if commit 4 is lexicographically first among all the
> Commits then `table[0]` is 4. Similarly `table[1]`=2, table[2]=1 etc.
> `table_inv[4]` is 0, table_inv[2]=1 etc.
>
> Now suppose commit 4's bitmap has xor-relation with commit 2's bitmap.
> So, xor-offset for bitmap 4 is 2. And `table[0] - selected->xor_offset`
> is equal to 4-2 = 2. It is pointing to the commit 2. Now, 2 is in bitmap
> Order. We need to convert it into lexicographic order. So, table_inv[2]
> gives us the lexicographic order position of commit 2 I.e. 1.
>
> Long story short, there is no issue regarding xor_offset. This xor_offset
> is not relative to the current commit. It is absolute.
>
> Sorry for the initial claim :)

Ahhhhh. Makes perfect sense. Thanks!

Thanks,
Taylor
diff mbox series

Patch

diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index c43375bd344..9e88a64dd65 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -650,7 +650,8 @@  static const struct object_id *oid_access(size_t pos, const void *table)
 
 static void write_selected_commits_v1(struct hashfile *f,
 				      struct pack_idx_entry **index,
-				      uint32_t index_nr)
+				      uint32_t index_nr,
+				      off_t *offsets)
 {
 	int i;
 
@@ -663,6 +664,9 @@  static void write_selected_commits_v1(struct hashfile *f,
 		if (commit_pos < 0)
 			BUG("trying to write commit not in index");
 
+		if (offsets)
+			offsets[i] = hashfile_total(f);
+
 		hashwrite_be32(f, commit_pos);
 		hashwrite_u8(f, stored->xor_offset);
 		hashwrite_u8(f, stored->flags);
@@ -671,6 +675,49 @@  static void write_selected_commits_v1(struct hashfile *f,
 	}
 }
 
+static int table_cmp(const void *_va, const void *_vb)
+{
+	return oidcmp(&writer.selected[*(uint32_t*)_va].commit->object.oid,
+		      &writer.selected[*(uint32_t*)_vb].commit->object.oid);
+}
+
+static void write_lookup_table(struct hashfile *f,
+			       off_t *offsets)
+{
+	uint32_t i;
+	uint32_t flags = 0;
+	uint32_t *table, *table_inv;
+
+	ALLOC_ARRAY(table, writer.selected_nr);
+	ALLOC_ARRAY(table_inv, writer.selected_nr);
+
+	for (i = 0; i < writer.selected_nr; i++)
+		table[i] = i;
+	QSORT(table, writer.selected_nr, table_cmp);
+	for (i = 0; i < writer.selected_nr; i++)
+		table_inv[table[i]] = i;
+
+	for (i = 0; i < writer.selected_nr; i++) {
+		struct bitmapped_commit *selected = &writer.selected[table[i]];
+		struct object_id *oid = &selected->commit->object.oid;
+
+		hashwrite(f, oid->hash, the_hash_algo->rawsz);
+	}
+	for (i = 0; i < writer.selected_nr; i++) {
+		struct bitmapped_commit *selected = &writer.selected[table[i]];
+
+		hashwrite_be32(f, offsets[table[i]]);
+		hashwrite_be32(f, selected->xor_offset
+			       ? table_inv[table[i] - selected->xor_offset]
+			       : 0xffffffff);
+	}
+
+	hashwrite_be32(f, flags);
+
+	free(table);
+	free(table_inv);
+}
+
 static void write_hash_cache(struct hashfile *f,
 			     struct pack_idx_entry **index,
 			     uint32_t index_nr)
@@ -695,6 +742,7 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 {
 	static uint16_t default_version = 1;
 	static uint16_t flags = BITMAP_OPT_FULL_DAG;
+	off_t *offsets = NULL;
 	struct strbuf tmp_file = STRBUF_INIT;
 	struct hashfile *f;
 
@@ -715,8 +763,14 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 	dump_bitmap(f, writer.trees);
 	dump_bitmap(f, writer.blobs);
 	dump_bitmap(f, writer.tags);
-	write_selected_commits_v1(f, index, index_nr);
 
+	if (options & BITMAP_OPT_LOOKUP_TABLE)
+		CALLOC_ARRAY(offsets, index_nr);
+
+	write_selected_commits_v1(f, index, index_nr, offsets);
+
+	if (options & BITMAP_OPT_LOOKUP_TABLE)
+		write_lookup_table(f, offsets);
 	if (options & BITMAP_OPT_HASH_CACHE)
 		write_hash_cache(f, index, index_nr);
 
@@ -730,4 +784,5 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 		die_errno("unable to rename temporary bitmap file to '%s'", filename);
 
 	strbuf_release(&tmp_file);
+	free(offsets);
 }