diff mbox series

[v2,4/6] pack-bitmap: prepare to read lookup table extension

Message ID 4fbfcff8a208798146cd561b0185e094a116cf0e.1656249017.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series bitmap: integrate a lookup table extension to the bitmap format | expand

Commit Message

Abhradeep Chakraborty June 26, 2022, 1:10 p.m. UTC
From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>

Earlier change teaches Git to write bitmap lookup table. But Git
does not know how to parse them.

Teach Git to parse the existing bitmap lookup table. The older
versions of git are not affected by it. Those versions ignore the
lookup table.

Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
Mentored-by: Taylor Blau <me@ttaylorr.com>
Co-Mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
---
 pack-bitmap.c                 | 193 ++++++++++++++++++++++++++++++++--
 t/t5310-pack-bitmaps.sh       |   7 ++
 t/t5326-multi-pack-bitmaps.sh |   1 +
 3 files changed, 191 insertions(+), 10 deletions(-)

Comments

Derrick Stolee June 27, 2022, 3:12 p.m. UTC | #1
On 6/26/2022 9:10 AM, Abhradeep Chakraborty via GitGitGadget wrote:
> From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
> 
> Earlier change teaches Git to write bitmap lookup table. But Git
> does not know how to parse them.
> 
> Teach Git to parse the existing bitmap lookup table. The older
> versions of git are not affected by it. Those versions ignore the
> lookup table.
> 
> Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
> Mentored-by: Taylor Blau <me@ttaylorr.com>
> Co-Mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>

I didn't check the previous patches, but your sign-off should be the
last line of the message. (You are singing off on all previous content,
and any later content is not covered by your sign-off.)

> +
> +		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
> +			git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {

nit: This alignment should use four spaces at the end so the second phrase
matches the start of the previous phrase. Like this:

		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
		    git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {

Perhaps it looked right in your editor because it renders tabs as 4 spaces
instead of 8 spaces.

> +			size_t table_size = 0;
> +			size_t triplet_sz = st_add3(sizeof(uint32_t),    /* commit position */
> +							sizeof(uint64_t),    /* offset */
> +							sizeof(uint32_t));    /* xor offset */

The 4- vs 8-space tab view would also explain the alignment here:

			size_t triplet_sz = st_add3(sizeof(uint32_t),  /* commit position */
						    sizeof(uint64_t),  /* offset */
						    sizeof(uint32_t)); /* xor offset */

(I also modified the comment alignment.)

Of course, since these values are constants and have no risk of overflowing,
perhaps we can drop st_add3() here:


			size_t triplet_sz = sizeof(uint32_t) + /* commit position */
					    sizeof(uint64_t) +  /* offset */
					    sizeof(uint32_t); /* xor offset */

> +			table_size = st_add(table_size,
> +					st_mult(ntohl(header->entry_count),
> +						triplet_sz));

Here, we _do_ want to keep the st_mult(). Is the st_add() still necessary? It
seems this is a leftover from the previous version that had the 4-byte flag
data.

We set table_size to zero above. We could drop that initialization and instead
have this after the "size_t triplet_sz" definition:

			size_t table_size = st_mult(ntohl(header->entry_count),
						    triplet_sz));

> +			if (table_size > index_end - index->map - header_size)
> +				return error("corrupted bitmap index file (too short to fit lookup table)");

Please add "_(...)" around the error message so it can be translated.

> +			index->table_lookup = (void *)(index_end - table_size);
> +			index_end -= table_size;
> +		}

> -	/* a 0 return code means the insertion succeeded with no changes,
> -	 * because the SHA1 already existed on the map. this is bad, there
> -	 * shouldn't be duplicated commits in the index */
> +	/* A 0 return code means the insertion succeeded with no changes,
> +	 * because the SHA1 already existed on the map. If lookup table
> +	 * is NULL, this is bad, there shouldn't be duplicated commits
> +	 * in the index.
> +	 *
> +	 * If table_lookup exists, that means the desired bitmap is already
> +	 * loaded. Either this bitmap has been stored directly or another
> +	 * bitmap has a direct or indirect xor relation with it. */

If we are modifying this multi-line comment, then we should reformat it to
match convention:

	/*
	 * The first sentence starts after the comment start
	 * so it has symmetry with the comment end which is on
	 * its own line.
	 */

>  	if (ret == 0) {
> -		error("Duplicate entry in bitmap index: %s", oid_to_hex(oid));
> -		return NULL;
> +		if (!index->table_lookup) {
> +			error("Duplicate entry in bitmap index: %s", oid_to_hex(oid));

Errors start with lowercase letters. Please add translation markers "_(...)"

> +static uint32_t triplet_get_xor_pos(const void *triplet)
> +{
> +	const void *p = (unsigned char*) triplet + st_add(sizeof(uint32_t), sizeof(uint64_t));

This st_add() is not necessary since the constants will not overflow.

> +	return get_be32(p);
> +}
> +
> +static int triplet_cmp(const void *va, const void *vb)
> +{
> +	int result = 0;
> +	uint32_t *a = (uint32_t *) va;
> +	uint32_t b = get_be32(vb);
> +	if (*a > b)
> +		result = 1;
> +	else if (*a < b)
> +		result = -1;
> +	else
> +		result = 0;
> +
> +	return result;
> +}
> +
> +static uint32_t bsearch_pos(struct bitmap_index *bitmap_git, struct object_id *oid,
> +						uint32_t *result)

Strange wrapping. Perhaps

static uint32_t bsearch_pos(struct bitmap_index *bitmap_git,
			    struct object_id *oid,
			    uint32_t *result)

> +{
> +	int found;
> +
> +	if (bitmap_git->midx)
> +		found = bsearch_midx(oid, bitmap_git->midx, result);
> +	else
> +		found = bsearch_pack(oid, bitmap_git->pack, result);
> +
> +	return found;

Here, we are doing a binary search on the entire list of packed objects, which could
use quite a few more hops than a binary search on the bitmapped commits.

> +static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
> +					  struct commit *commit)
...
> +	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
> +
> +	if (!found)
> +		return NULL;
> +
> +	triplet = bsearch(&commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
> +						triplet_sz, triplet_cmp);

But I see, you are searching the pack-index for the position in the index, and _then_
searching the bitmap lookup table based on that position value.

I expected something different: binary search on the triplets where the comparison is
made by looking up the OID from the [multi-]pack-index and comparing that OID to the
commit OID we are looking for.

I'm not convinced that the binary search I had in mind is meaningfully faster than
what you've implemented here, so I'm happy to leave it as you have it. We can investigate
if that full search on the pack-index matters at all (it probably doesn't).

> +	if (!triplet)
> +		return NULL;
> +
> +	offset = triplet_get_offset(triplet);
> +	xor_pos = triplet_get_xor_pos(triplet);
> +
> +	if (xor_pos != 0xffffffff) {
> +		int xor_flags;
> +		uint64_t offset_xor;
> +		uint32_t *xor_positions;
> +		struct object_id xor_oid;
> +		size_t size = 0;
> +
> +		ALLOC_ARRAY(xor_positions, bitmap_git->entry_count);

While there is potential that this is wasteful, it's probably not that huge,
so we can start with the "maximum XOR depth" and then reconsider a smaller
allocation in the future.

> +		while (xor_pos != 0xffffffff) {

We should consider ensuring that also "size < bitmap_git->entry_count".
Better yet, create an xor_positions_alloc variable that is initialized
to the entry_count value.

"size" should probably be xor_positions_nr.

> +			xor_positions[size++] = xor_pos;
> +			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
> +			xor_pos = triplet_get_xor_pos(triplet);
> +		}

(at this point, "if (xor_positions_nr >= xor_positions_alloc)", then error
out since the file must be malformed with an XOR loop.)

> +		while (size){

nit: ") {"

> +			xor_pos = xor_positions[size - 1];
> +			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
> +			commit_pos = get_be32(triplet);
> +			offset_xor = triplet_get_offset(triplet);
> +
> +			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, commit_pos) < 0) {
> +				free(xor_positions);
> +				return NULL;
> +			}
> +
> +			bitmap_git->map_pos = offset_xor + sizeof(uint32_t) + sizeof(uint8_t);
> +			xor_flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
> +			bitmap = read_bitmap_1(bitmap_git);
> +
> +			if (!bitmap){

nit: ") {"

> +				free(xor_positions);
> +				return NULL;
> +			}
> +
> +			xor_bitmap = store_bitmap(bitmap_git, bitmap, &xor_oid, xor_bitmap, xor_flags);

Since we are storing the bitmap here as we "pop" the stack, should we be
looking for a stored bitmap while pushing to the stack in the previous loop?
That would save time when using multiple bitmaps with common XOR bases.

(Of course, we want to be careful that we do not create a recursive loop,
but instead _only_ look at the in-memory bitmaps that already exist.)

> +			size--;
> +		}
> +
> +		free(xor_positions);
> +	}
> +
> +	bitmap_git->map_pos = offset + sizeof(uint32_t) + sizeof(uint8_t);
> +	flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
> +	bitmap = read_bitmap_1(bitmap_git);
> +
> +	if (!bitmap)
> +		return NULL;
> +
> +	return store_bitmap(bitmap_git, bitmap, oid, xor_bitmap, flags);
> +}
> +

I'm happy with the structure of this iterative algorithm!

I'll pause my review here for now.

Thanks,
-Stolee
Abhradeep Chakraborty June 27, 2022, 6:06 p.m. UTC | #2
Derrick Stolee <derrickstolee@github.com> wrote:

> I didn't check the previous patches, but your sign-off should be the
> last line of the message. (You are singing off on all previous content,
> and any later content is not covered by your sign-off.)

Ohhh, got it. I didn't know about it before.

> nit: This alignment should use four spaces at the end so the second phrase
> matches the start of the previous phrase. Like this:
>
>		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
>		    git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {
>
> Perhaps it looked right in your editor because it renders tabs as 4 spaces
> instead of 8 spaces.

I don't know why but my editor sometimes do some weird things for alignments.
I generally use VS Code. But for alignment related problems, sometimes I have
to use vi editor.

> Here, we _do_ want to keep the st_mult(). Is the st_add() still necessary? It
> seems this is a leftover from the previous version that had the 4-byte flag
> data.
>
> We set table_size to zero above. We could drop that initialization and instead
> have this after the "size_t triplet_sz" definition:
>
>			size_t table_size = st_mult(ntohl(header->entry_count),
>						    triplet_sz));

Yes, you're right. Will update.

> I expected something different: binary search on the triplets where the comparison is
> made by looking up the OID from the [multi-]pack-index and comparing that OID to the
> commit OID we are looking for.
>
> I'm not convinced that the binary search I had in mind is meaningfully faster than
> what you've implemented here, so I'm happy to leave it as you have it. We can investigate
> if that full search on the pack-index matters at all (it probably doesn't).

Good idea! Thanks!

> While there is potential that this is wasteful, it's probably not that huge,
> so we can start with the "maximum XOR depth" and then reconsider a smaller
> allocation in the future.

Ok.

> We should consider ensuring that also "size < bitmap_git->entry_count".
> Better yet, create an xor_positions_alloc variable that is initialized
> to the entry_count value.
>
> "size" should probably be xor_positions_nr.
> 
> > +			xor_positions[size++] = xor_pos;
> > +			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
> > +			xor_pos = triplet_get_xor_pos(triplet);
> > +		}
> 
> (at this point, "if (xor_positions_nr >= xor_positions_alloc)", then error
> out since the file must be malformed with an XOR loop.)

Got it.

> Since we are storing the bitmap here as we "pop" the stack, should we be
> looking for a stored bitmap while pushing to the stack in the previous loop?
> That would save time when using multiple bitmaps with common XOR bases.

Yeah, I also am thinking about it. Will make a try.

Thanks :)
Derrick Stolee June 27, 2022, 6:32 p.m. UTC | #3
On 6/27/2022 2:06 PM, Abhradeep Chakraborty wrote:
> Derrick Stolee <derrickstolee@github.com> wrote:
>> nit: This alignment should use four spaces at the end so the second phrase
>> matches the start of the previous phrase. Like this:
>>
>> 		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
>> 		    git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {
>>
>> Perhaps it looked right in your editor because it renders tabs as 4 spaces
>> instead of 8 spaces.
> 
> I don't know why but my editor sometimes do some weird things for alignments.
> I generally use VS Code. But for alignment related problems, sometimes I have
> to use vi editor.

I also use VS Code, and I noticed a few spacing issues recently, especially
in .txt files.

I submitted a patch [1] to improve the contrib/vscode/init.sh script, which
adds some helpful config settings to your Git workspace. Please take a look
and see how it works for you.

Thanks,
-Stolee

[1] https://lore.kernel.org/git/pull.1271.git.1656354587496.gitgitgadget@gmail.com
Taylor Blau June 27, 2022, 9:38 p.m. UTC | #4
On Sun, Jun 26, 2022 at 01:10:15PM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
>
> Earlier change teaches Git to write bitmap lookup table. But Git
> does not know how to parse them.
>
> Teach Git to parse the existing bitmap lookup table. The older
> versions of git are not affected by it. Those versions ignore the

s/git/Git

> lookup table.
>
> Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
> Mentored-by: Taylor Blau <me@ttaylorr.com>
> Co-Mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
> ---
>  pack-bitmap.c                 | 193 ++++++++++++++++++++++++++++++++--
>  t/t5310-pack-bitmaps.sh       |   7 ++
>  t/t5326-multi-pack-bitmaps.sh |   1 +
>  3 files changed, 191 insertions(+), 10 deletions(-)
>
> diff --git a/pack-bitmap.c b/pack-bitmap.c
> index 36134222d7a..9e09c5824fc 100644
> --- a/pack-bitmap.c
> +++ b/pack-bitmap.c
> @@ -82,6 +82,12 @@ struct bitmap_index {
>  	/* The checksum of the packfile or MIDX; points into map. */
>  	const unsigned char *checksum;
>
> +	/*
> +	 * If not NULL, this point into the commit table extension
> +	 * (within map).

It may be worth replacing "within map" to "within the memory mapped
region `map`" to make clear that this points somewhere within the mmap.

> +	 */
> +	unsigned char *table_lookup;
> +


> @@ -185,6 +191,22 @@ static int load_bitmap_header(struct bitmap_index *index)
>  			index->hashes = (void *)(index_end - cache_size);
>  			index_end -= cache_size;
>  		}
> +
> +		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
> +			git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {

I should have commented on this in an earlier round, but I wonder what
the behavior should be when we have BITMAP_OPT_LOOKUP_TABLE in our
flags, but GIT_TEST_READ_COMMIT_TABLE is disabled.

Right now, it doesn't matter, since there aren't any flags in bits above
BITMAP_OPT_LOOKUP_TABLE. But in the future, if there was some
BITMAP_OPT_FOO that was newer than BITMAP_OPT_LOOKUP_TABLE, we would
want to be able to read it without needing to read the lookup table.

At least, I think that should be true, though I would be interested to
hear if anybody has a differing opinion there.

> +			size_t table_size = 0;
> +			size_t triplet_sz = st_add3(sizeof(uint32_t),    /* commit position */
> +							sizeof(uint64_t),    /* offset */
> +							sizeof(uint32_t));    /* xor offset */

I don't think we need a st_add3() call here, since the size of these
three types is known to be small and thus won't overflow the available
range of size_t.

> +			table_size = st_add(table_size,
> +					st_mult(ntohl(header->entry_count),
> +						triplet_sz));

And table_size here is going to start off at zero, so the outer st_add()
call isn't necessary, either. This should instead be:

    size_t table_size = st_mult(ntohl(header->entry_count),
                                sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uint32_t));

It might be nice to have triplet_sz #define'd somewhere else, since
there are a handful of declarations in this patch that are all
identical. Probably something like:

    #define BITMAP_LOOKUP_TABLE_RECORD_WIDTH (sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uin32_t))

or even:

    /*
     * The width in bytes of a single record in the lookup table
     * extension:
     *
     *   (commit_pos, offset, xor_pos)
     *
     * whose fields are 32-, 64-, and 32-bits wide, respectively.
     */
    #define BITMAP_LOOKUP_TABLE_RECORD_WIDTH (16)

> +			if (table_size > index_end - index->map - header_size)
> +				return error("corrupted bitmap index file (too short to fit lookup table)");

if we decide to still recognize the lookup table extension without
*reading* from it when GIT_TEST_READ_COMMIT_TABLE is unset, I think we
should do something like:

    if (git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1))
        index->table_lookup = (void *)(index_end - table_size);
    index_end -= table_size;

...where the subtraction on index_end happens unconditionally.

> +static inline const void *bitmap_get_triplet(struct bitmap_index *bitmap_git, uint32_t xor_pos)
> +{
> +	size_t triplet_sz = st_add3(sizeof(uint32_t), sizeof(uint64_t), sizeof(uint32_t));

Same note about the #define constant here.

> +	const void *p = bitmap_git->table_lookup + st_mult(xor_pos, triplet_sz);

And this can be returned directly. Just:

    return bitmap_git->table_lookup + st_mult(xor_pos, BITMAP_LOOKUP_TABLE_RECORD_WIDTH);

although I wonder: why "xor_pos" and not just "pos" here?

> +static uint64_t triplet_get_offset(const void *triplet)
> +{
> +	const void *p = (unsigned char*) triplet + sizeof(uint32_t);
> +	return get_be64(p);
> +}
> +
> +static uint32_t triplet_get_xor_pos(const void *triplet)
> +{
> +	const void *p = (unsigned char*) triplet + st_add(sizeof(uint32_t), sizeof(uint64_t));
> +	return get_be32(p);
> +}

I wonder if we could get rid of these functions altogether and return a
small structure like:

    struct bitmap_lookup_table_record {
        uint32_t commit_pos;
        uint64_t offset;
        uint32_t xor_pos;
    };

or similar.

> +static int triplet_cmp(const void *va, const void *vb)
> +{
> +	int result = 0;
> +	uint32_t *a = (uint32_t *) va;
> +	uint32_t b = get_be32(vb);

Hmm. This is a little tricky to read. Here we're expecting "va" to hold
commit_pos from below, and "vb" to be a pointer at a lookup record.
Everything here is right, though I wonder if a comment or two might
clarify why one is "*(uint32_t *)va" and the other is "get_be32(vb)".

> +	if (*a > b)
> +		result = 1;
> +	else if (*a < b)
> +		result = -1;
> +	else
> +		result = 0;

Let's just return the result of the comparison directly here. And while
I'm looking at it, I think we can avoid dereferencing "a" on each use,
and instead just dereference va on assignment after casting, e.g.:

    uint32_t a = *(uint32_t*)va;

> +static uint32_t bsearch_pos(struct bitmap_index *bitmap_git, struct object_id *oid,
> +						uint32_t *result)
> +{
> +	int found;
> +
> +	if (bitmap_git->midx)

Nit: let's use the bitmap_is_midx() helper here instead of looking at
bitamp_git->midx directly.

> +		found = bsearch_midx(oid, bitmap_git->midx, result);
> +	else
> +		found = bsearch_pack(oid, bitmap_git->pack, result);
> +
> +	return found;
> +}

Makes sense.

> +static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
> +					  struct commit *commit)
> +{
> +	uint32_t commit_pos, xor_pos;
> +	uint64_t offset;
> +	int flags;
> +	const void *triplet = NULL;
> +	struct object_id *oid = &commit->object.oid;
> +	struct ewah_bitmap *bitmap;
> +	struct stored_bitmap *xor_bitmap = NULL;
> +	size_t triplet_sz = st_add3(sizeof(uint32_t), sizeof(uint64_t), sizeof(uint32_t));
> +
> +	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
> +
> +	if (!found)
> +		return NULL;
> +
> +	triplet = bsearch(&commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
> +						triplet_sz, triplet_cmp);
> +	if (!triplet)
> +		return NULL;

OK. If you don't mind, I'm going to "think aloud" while I read through
this function to make sure that we're on the same page.

First thing is to convert the commit OID we're looking for into its
position within the corresponding pack index or MIDX file so that we can
use it as a search key to locate in the lookup table. If we didn't find
anything, or the commit doesn't exist in our pack / MIDX, nothing to do.

> +
> +	offset = triplet_get_offset(triplet);
> +	xor_pos = triplet_get_xor_pos(triplet);

Otherwise, record its offset and XOR "offset".

> +
> +	if (xor_pos != 0xffffffff) {
> +		int xor_flags;
> +		uint64_t offset_xor;
> +		uint32_t *xor_positions;
> +		struct object_id xor_oid;
> +		size_t size = 0;
> +
> +		ALLOC_ARRAY(xor_positions, bitmap_git->entry_count);

If we are XOR'd with another bitmap, make a stack of those bitmaps so
that we can decompress ourself.

I'm a little surprised that we're allocating an array as large as
bitmap_git->entry_count. It's not wrong, but it does waste some bytes
since we likely don't often have these long chains of XOR'd bitmaps.

We should instead allocate a smaller array and grow it over time (search
for examples of ALLOC_GROW() to see the canonical way to do this in
Git's codebase).

> +		while (xor_pos != 0xffffffff) {
> +			xor_positions[size++] = xor_pos;
> +			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
> +			xor_pos = triplet_get_xor_pos(triplet);
> +		}
> +
> +		while (size){

Nit: missing space after ")" and before "{".

> +			xor_pos = xor_positions[size - 1];
> +			triplet = bitmap_get_triplet(bitmap_git, xor_pos);

We already have to get the triplets in the loop above, and then we dig
them back out here. Would it be easier to keep track of a list of
pointers into the mmaped region instead of looking up these triplets
each time?

> +			commit_pos = get_be32(triplet);
> +			offset_xor = triplet_get_offset(triplet);
> +
> +			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, commit_pos) < 0) {

Should it be an error if we can't look up the object's ID here? I'd
think so.

> +				free(xor_positions);
> +				return NULL;
> +			}
> +
> +			bitmap_git->map_pos = offset_xor + sizeof(uint32_t) + sizeof(uint8_t);
> +			xor_flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
> +			bitmap = read_bitmap_1(bitmap_git);
> +
> +			if (!bitmap){

Nit: missing space between ")" and "{".

> +				free(xor_positions);
> +				return NULL;
> +			}
> +
> +			xor_bitmap = store_bitmap(bitmap_git, bitmap, &xor_oid, xor_bitmap, xor_flags);
> +			size--;

Makes sense. Nicely done!

> +		}
> +
> +		free(xor_positions);
> +	}
> +
> +	bitmap_git->map_pos = offset + sizeof(uint32_t) + sizeof(uint8_t);
> +	flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
> +	bitmap = read_bitmap_1(bitmap_git);

Great, and now we can finally read the original bitmap that we wanted
to...

> +	if (!bitmap)
> +		return NULL;
> +
> +	return store_bitmap(bitmap_git, bitmap, oid, xor_bitmap, flags);

...and XOR it with the thing we built up in the loop. Very nicely done.
Do we have a good way to make sure that we're testing this code in CI?
It *seems* correct to me, but of course, we should have a computer check
that this produces OK results, not a human ;).

> +}
> +
>  struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git,
>  				      struct commit *commit)
>  {
>  	khiter_t hash_pos = kh_get_oid_map(bitmap_git->bitmaps,
>  					   commit->object.oid);
> -	if (hash_pos >= kh_end(bitmap_git->bitmaps))
> -		return NULL;
> +	if (hash_pos >= kh_end(bitmap_git->bitmaps)) {
> +		struct stored_bitmap *bitmap = NULL;
> +		if (!bitmap_git->table_lookup)
> +			return NULL;
> +
> +		/* NEEDSWORK: cache misses aren't recorded */

For what it's worth, I think that it's completely fine to leave this as
a NEEDSWORK for the purposes of this series. I think we plausibly could
improve this in certain scenarios by finding some threshold on cache
misses when we should just fault in all bitmaps, but that can easily be
done on top.

> +		bitmap = lazy_bitmap_for_commit(bitmap_git, commit);
> +		if(!bitmap)

Nit: missing space between "if" and "(".

> +			return NULL;
> +		return lookup_stored_bitmap(bitmap);
> +	}
>  	return lookup_stored_bitmap(kh_value(bitmap_git->bitmaps, hash_pos));
>  }
>
> @@ -1699,9 +1861,13 @@ void test_bitmap_walk(struct rev_info *revs)
>  	if (revs->pending.nr != 1)
>  		die("you must specify exactly one commit to test");
>
> -	fprintf(stderr, "Bitmap v%d test (%d entries loaded)\n",
> +	fprintf(stderr, "Bitmap v%d test (%d entries)\n",
>  		bitmap_git->version, bitmap_git->entry_count);
>
> +	if (!bitmap_git->table_lookup)
> +		fprintf(stderr, "Bitmap v%d test (%d entries loaded)\n",
> +			bitmap_git->version, bitmap_git->entry_count);
> +

I think we should probably print just one or the other here, perhaps
like:

    fprintf(stderr, "Bitmap v%d test (%d entries%s)",
            bitmap_git->version,
            bitmap_git->entry_count,
            bitmap_git->table_lookup ? "" : " loaded");

>  	root = revs->pending.objects[0].item;
>  	bm = bitmap_for_commit(bitmap_git, (struct commit *)root);
>
> @@ -1753,10 +1919,16 @@ void test_bitmap_walk(struct rev_info *revs)
>
>  int test_bitmap_commits(struct repository *r)
>  {
> -	struct bitmap_index *bitmap_git = prepare_bitmap_git(r);
> +	struct bitmap_index *bitmap_git = NULL;
>  	struct object_id oid;
>  	MAYBE_UNUSED void *value;
>
> +	/* As this function is only used to print bitmap selected
> +	 * commits, we don't have to read the commit table.
> +	 */
> +	setenv("GIT_TEST_READ_COMMIT_TABLE", "0", 1);
> +
> +	bitmap_git = prepare_bitmap_git(r);
>  	if (!bitmap_git)
>  		die("failed to load bitmap indexes");
>
> @@ -1764,6 +1936,7 @@ int test_bitmap_commits(struct repository *r)
>  		printf("%s\n", oid_to_hex(&oid));
>  	});
>
> +	setenv("GIT_TEST_READ_COMMIT_TABLE", "1", 1);
>  	free_bitmap_index(bitmap_git);

Hmm. I'm not sure I follow the purpose of tweaking
GIT_TEST_READ_COMMIT_TABLE like this with setenv(). Are we trying to
avoid reading the lookup table? If so, why? I'd rather avoid
manipulating the environment directly like this, and instead have a
function we could call to fault in all of the bitmaps (when a lookup
table exists, otherwise do nothing).

Thanks,
Taylor
Taylor Blau June 27, 2022, 9:49 p.m. UTC | #5
On Mon, Jun 27, 2022 at 11:12:09AM -0400, Derrick Stolee wrote:
> On 6/26/2022 9:10 AM, Abhradeep Chakraborty via GitGitGadget wrote:
> > +			table_size = st_add(table_size,
> > +					st_mult(ntohl(header->entry_count),
> > +						triplet_sz));
>
> Here, we _do_ want to keep the st_mult(). Is the st_add() still necessary? It
> seems this is a leftover from the previous version that had the 4-byte flag
> data.
>
> We set table_size to zero above. We could drop that initialization and instead
> have this after the "size_t triplet_sz" definition:
>
> 			size_t table_size = st_mult(ntohl(header->entry_count),
> 						    triplet_sz));

Well put, thank you.

> > +			if (table_size > index_end - index->map - header_size)
> > +				return error("corrupted bitmap index file (too short to fit lookup table)");
>
> Please add "_(...)" around the error message so it can be translated.

I missed this in my own review, but yes: this is a good practice.

> > +	if (bitmap_git->midx)
> > +		found = bsearch_midx(oid, bitmap_git->midx, result);
> > +	else
> > +		found = bsearch_pack(oid, bitmap_git->pack, result);
> > +
> > +	return found;
>
> Here, we are doing a binary search on the entire list of packed objects, which could
> use quite a few more hops than a binary search on the bitmapped commits.

I think this is the best we can do if we make the key to our bsearch
through the lookup table be an index into the pack index / MIDX. But...

> > +static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
> > +					  struct commit *commit)
> ...
> > +	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
> > +
> > +	if (!found)
> > +		return NULL;
> > +
> > +	triplet = bsearch(&commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
> > +						triplet_sz, triplet_cmp);
>
> But I see, you are searching the pack-index for the position in the index, and _then_
> searching the bitmap lookup table based on that position value.
>
> I expected something different: binary search on the triplets where the comparison is
> made by looking up the OID from the [multi-]pack-index and comparing that OID to the
> commit OID we are looking for.
>
> I'm not convinced that the binary search I had in mind is meaningfully faster than
> what you've implemented here, so I'm happy to leave it as you have it. We can investigate
> if that full search on the pack-index matters at all (it probably doesn't).

...exactly my thoughts, too. It's possible that it would be faster to
key this search on the object_id "oid" above, and then convert each of
the entries in the lookup table from a uint32_t into an object_id by
calling nth_bitmap_object_oid() repeatedly.

I *think* that what Abhradeep wrote here is going to be faster more
often than not since it makes more efficient use of the page cache
rather than switching between reads across different memory mapped
regions at each point in the binary search.

But of course that depends on a number of factors. Abhradeep: if you're
up for it, I think it would be worth trying it both ways and seeing if
one produces a meaningful speed-up or slow-down over the other. Like I
said: my guess is that what you have now will be faster, but I don't
have a clear sense that that is true without trying it both ways ;-).

> > +	if (!triplet)
> > +		return NULL;
> > +
> > +	offset = triplet_get_offset(triplet);
> > +	xor_pos = triplet_get_xor_pos(triplet);
> > +
> > +	if (xor_pos != 0xffffffff) {
> > +		int xor_flags;
> > +		uint64_t offset_xor;
> > +		uint32_t *xor_positions;
> > +		struct object_id xor_oid;
> > +		size_t size = 0;
> > +
> > +		ALLOC_ARRAY(xor_positions, bitmap_git->entry_count);
>
> While there is potential that this is wasteful, it's probably not that huge,
> so we can start with the "maximum XOR depth" and then reconsider a smaller
> allocation in the future.

There is no maximum XOR depth, to my knowledge. We do have a maximum XOR
*offset*, which says we cannot XOR-compress a bitmap with an entry more
than 160 entries away from the current one. But in theory every commit
could be XOR compressed with the one immediately proceeding it, so the
maximum depth could be as long as the entry_count itself.

I think starting off with a small array and then letting it grow
according to alloc_nr() would be fine here, since it will grow more and
more each time, so the amount of times we have to reallocate the buffer
will tail off over time.

If we were really concerned about it, we could treat the buffer as a
static pointer and reuse it over time (making sure to clear out the
portions of it we're going to reuse, or otherwise ensuring that we don't
read old data). But I doubt it matters much either way in practice: the
individual records are small (at just 4 bytes each) and entry_count is
often less than 1,000, so I think this probably has a vanishingly small
impact.

Thanks,
Taylor
Abhradeep Chakraborty June 28, 2022, 8:59 a.m. UTC | #6
Taylor Blau <me@ttaylorr.com> wrote:

> ...exactly my thoughts, too. It's possible that it would be faster to
> key this search on the object_id "oid" above, and then convert each of
> the entries in the lookup table from a uint32_t into an object_id by
> calling nth_bitmap_object_oid() repeatedly.
>
> I *think* that what Abhradeep wrote here is going to be faster more
> often than not since it makes more efficient use of the page cache
> rather than switching between reads across different memory mapped
> regions at each point in the binary search.
>
> But of course that depends on a number of factors. Abhradeep: if you're
> up for it, I think it would be worth trying it both ways and seeing if
> one produces a meaningful speed-up or slow-down over the other. Like I
> said: my guess is that what you have now will be faster, but I don't
> have a clear sense that that is true without trying it both ways ;-).

Ok. Let me try both the ways. In my opinion, I think my version has
less searching and less computation. So, I want to stick with this
version. But I also like to try the other one once so that we can
get the best out of these two.

> I think starting off with a small array and then letting it grow
> according to alloc_nr() would be fine here, since it will grow more and
> more each time, so the amount of times we have to reallocate the buffer
> will tail off over time.

What should be the size of that array?

> If we were really concerned about it, we could treat the buffer as a
> static pointer and reuse it over time (making sure to clear out the
> portions of it we're going to reuse, or otherwise ensuring that we don't
> read old data). But I doubt it matters much either way in practice: the
> individual records are small (at just 4 bytes each) and entry_count is
> often less than 1,000, so I think this probably has a vanishingly small
> impact.

Before submitting it to the mailing list, I did use the ALLOC_GROW macro
function. But my version was worse than yours. For every iteration I was
reallocating the array to support `size+1` positions. But later I drop
the code as this might be very much expensive.

Then I wrote this code. As `table` array and `table_inv` array allocate
this size of arrays (though all the indices are used), I thought it
would not be a problem if I use an array of this size for a small amount
of time.

Honestly, I don't like to realloc arrays. Because as far as I can remember,
realloc allocates a new array internally and copies the items from the old
array to the new array. This irritates me.

But at the same time, it is also true that in most cases we might not need
this amount of space.

Thanks :)
Abhradeep Chakraborty June 28, 2022, 7:25 p.m. UTC | #7
Ohh, sorry! Looks like I missed this comment!

Taylor Blau <me@ttaylorr.com> wrote:

> It may be worth replacing "within map" to "within the memory mapped
> region `map`" to make clear that this points somewhere within the mmap.

Ok.

> I should have commented on this in an earlier round, but I wonder what
> the behavior should be when we have BITMAP_OPT_LOOKUP_TABLE in our
> flags, but GIT_TEST_READ_COMMIT_TABLE is disabled.
>
> Right now, it doesn't matter, since there aren't any flags in bits above
> BITMAP_OPT_LOOKUP_TABLE. But in the future, if there was some
> BITMAP_OPT_FOO that was newer than BITMAP_OPT_LOOKUP_TABLE, we would
> want to be able to read it without needing to read the lookup table.
>
> At least, I think that should be true, though I would be interested to
> hear if anybody has a differing opinion there.

Oh right! I didn't think about it. In that case, we should still subtract
The table size from the last index_size. In that way, These sections will
Not be overlapped.

> And table_size here is going to start off at zero, so the outer st_add()
> call isn't necessary, either. This should instead be:
>
>     size_t table_size = st_mult(ntohl(header->entry_count),
>                                 sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uint32_t));
>
> It might be nice to have triplet_sz #define'd somewhere else, since
> there are a handful of declarations in this patch that are all
> identical. Probably something like:
>
>     #define BITMAP_LOOKUP_TABLE_RECORD_WIDTH (sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uin32_t))
>
> or even:
>
>     /*
>      * The width in bytes of a single record in the lookup table
>      * extension:
>      *
>      *   (commit_pos, offset, xor_pos)
>      *
>      * whose fields are 32-, 64-, and 32-bits wide, respectively.
>      */
>      #define BITMAP_LOOKUP_TABLE_RECORD_WIDTH (16)

Seems perfect to me.

> if we decide to still recognize the lookup table extension without
> *reading* from it when GIT_TEST_READ_COMMIT_TABLE is unset, I think we
> should do something like:
>
>     if (git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1))
>         index->table_lookup = (void *)(index_end - table_size);
>     index_end -= table_size;
>
> ...where the subtraction on index_end happens unconditionally.

Right. Thanks!

> I wonder if we could get rid of these functions altogether and return a
> small structure like:
>
>     struct bitmap_lookup_table_record {
>         uint32_t commit_pos;
>         uint64_t offset;
>         uint32_t xor_pos;
>     };
>
> or similar.

Ok.

> Hmm. This is a little tricky to read. Here we're expecting "va" to hold
> commit_pos from below, and "vb" to be a pointer at a lookup record.
> Everything here is right, though I wonder if a comment or two might
> clarify why one is "*(uint32_t *)va" and the other is "get_be32(vb)".

Sure. Will add comments.

> Nit: let's use the bitmap_is_midx() helper here instead of looking at
> bitamp_git->midx directly.

Ok.

> First thing is to convert the commit OID we're looking for into its
> position within the corresponding pack index or MIDX file so that we can
> use it as a search key to locate in the lookup table. If we didn't find
> anything, or the commit doesn't exist in our pack / MIDX, nothing to do.
>
> > +
> > +	offset = triplet_get_offset(triplet);
> > +	xor_pos = triplet_get_xor_pos(triplet);
>
> Otherwise, record its offset and XOR "offset".

Exactly!

> We already have to get the triplets in the loop above, and then we dig
> them back out here. Would it be easier to keep track of a list of
> pointers into the mmaped region instead of looking up these triplets
> each time?

Sure. It might be a good idea. Thanks.

> > +			commit_pos = get_be32(triplet);
> > +			offset_xor = triplet_get_offset(triplet);
> > +
> > +			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, commit_pos) < 0) {
>
> Should it be an error if we can't look up the object's ID here? I'd
> think so.

I also am not sure about it. Morally, I think it is better to throw
An error here.

> Do we have a good way to make sure that we're testing this code in CI?
> It *seems* correct to me, but of course, we should have a computer check
> that this produces OK results, not a human ;).

My current test file changes should test this code. As for now, the lookup
Table is enabled by default, all the existing tests that include write and
read bitmaps uses this lookup table. So, all the test case scenarios should
Pass. So, I think it is being tested in CI. Do you have a good idea to test
It better?

> Hmm. I'm not sure I follow the purpose of tweaking
> GIT_TEST_READ_COMMIT_TABLE like this with setenv(). Are we trying to
> avoid reading the lookup table? If so, why? I'd rather avoid
> manipulating the environment directly like this, and instead have a
> function we could call to fault in all of the bitmaps (when a lookup
> table exists, otherwise do nothing).

The problem was that the `test-tool bitmap list-commit` command was
Not printing any commits (the error that I notified you before). It
is because of this function. As lookup table is enabled by default,
`prepare_bitmap_git` function doesn't load each bitmap entries and
thus the below code in this function doesn't provide the bitmapped
commit list (because Hashtable didn't generated).

        kh_foreach(bitmap_git->bitmaps, oid, value, {
		printf("%s\n", oid_to_hex(&oid));
	});

So, the simplest fix I found was this. Should I make a function then
(Which you suggested here)?

Thanks :)
Taylor Blau June 29, 2022, 8:22 p.m. UTC | #8
On Tue, Jun 28, 2022 at 02:29:50PM +0530, Abhradeep Chakraborty wrote:
> Taylor Blau <me@ttaylorr.com> wrote:
>
> > ...exactly my thoughts, too. It's possible that it would be faster to
> > key this search on the object_id "oid" above, and then convert each of
> > the entries in the lookup table from a uint32_t into an object_id by
> > calling nth_bitmap_object_oid() repeatedly.
> >
> > I *think* that what Abhradeep wrote here is going to be faster more
> > often than not since it makes more efficient use of the page cache
> > rather than switching between reads across different memory mapped
> > regions at each point in the binary search.
> >
> > But of course that depends on a number of factors. Abhradeep: if you're
> > up for it, I think it would be worth trying it both ways and seeing if
> > one produces a meaningful speed-up or slow-down over the other. Like I
> > said: my guess is that what you have now will be faster, but I don't
> > have a clear sense that that is true without trying it both ways ;-).
>
> Ok. Let me try both the ways. In my opinion, I think my version has
> less searching and less computation. So, I want to stick with this
> version. But I also like to try the other one once so that we can
> get the best out of these two.

Yeah, I agree with your general sense that the version as written is
going to be faster. We're comparing a smaller datatype (IOW, a 4-byte
integer that can be checked for equality in a single instruction,
instead of comparing two 20-byte OIDs), and likely flushing the cache
far less often.

But having two concrete implementations to compare will help us know for
a fact that our intuition is correct.

I'll be curious to see what you find here!

> > I think starting off with a small array and then letting it grow
> > according to alloc_nr() would be fine here, since it will grow more and
> > more each time, so the amount of times we have to reallocate the buffer
> > will tail off over time.
>
> What should be the size of that array?

I think some small, power of 2 would be a reasonable choice here.

> > If we were really concerned about it, we could treat the buffer as a
> > static pointer and reuse it over time (making sure to clear out the
> > portions of it we're going to reuse, or otherwise ensuring that we don't
> > read old data). But I doubt it matters much either way in practice: the
> > individual records are small (at just 4 bytes each) and entry_count is
> > often less than 1,000, so I think this probably has a vanishingly small
> > impact.
>
> Before submitting it to the mailing list, I did use the ALLOC_GROW macro
> function. But my version was worse than yours. For every iteration I was
> reallocating the array to support `size+1` positions. But later I drop
> the code as this might be very much expensive.

That shouldn't be the case. When you have a chance, take a look at the
alloc_nr macro, which shows how much memory we allocate at each
step:

    #define alloc_nr(x) (((x)+16)*3/2)

Suppose we allocated 16 slots initially, so nr (the number of entries
stored in the list) is 0 and alloc (the number of entries allocated) is
16. Then when we try to add the 17th item, we'll pass 16 to alloc_nr
which will allocate 48 slots. Then 96, then 168, and so on.

We only have to reallocate and copy the array when nr > alloc, which
should be fairly infrequently, and happens less and less often the
larger the array grows.

Thanks,
Taylor
Taylor Blau June 29, 2022, 8:37 p.m. UTC | #9
On Wed, Jun 29, 2022 at 12:55:55AM +0530, Abhradeep Chakraborty wrote:
> > > +			commit_pos = get_be32(triplet);
> > > +			offset_xor = triplet_get_offset(triplet);
> > > +
> > > +			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, commit_pos) < 0) {
> >
> > Should it be an error if we can't look up the object's ID here? I'd
> > think so.
>
> I also am not sure about it. Morally, I think it is better to throw
> An error here.

Yeah.

> > Do we have a good way to make sure that we're testing this code in CI?
> > It *seems* correct to me, but of course, we should have a computer check
> > that this produces OK results, not a human ;).
>
> My current test file changes should test this code. As for now, the lookup
> Table is enabled by default, all the existing tests that include write and
> read bitmaps uses this lookup table. So, all the test case scenarios should
> Pass. So, I think it is being tested in CI. Do you have a good idea to test
> It better?

I think having some indication (maybe via a trace2 region?) that we're
actually executing this code would be good. Although it's going to be
*really* noisy, so probably not a good idea to do that in general.

Stolee runs some coverage tests that show lines that we aren't
exercising via tests. So making sure that this doesn't show up in that
report when you run it locally would be good.

See some information from him about how to run those tests locally here:

    https://lore.kernel.org/git/00a57a1d-0566-8f54-26b2-0f3558bde88d@github.com/

(TL;DR: run `make coverage-test` and make sure that these lines don't
show up ;-)).

> > Hmm. I'm not sure I follow the purpose of tweaking
> > GIT_TEST_READ_COMMIT_TABLE like this with setenv(). Are we trying to
> > avoid reading the lookup table? If so, why? I'd rather avoid
> > manipulating the environment directly like this, and instead have a
> > function we could call to fault in all of the bitmaps (when a lookup
> > table exists, otherwise do nothing).
>
> The problem was that the `test-tool bitmap list-commit` command was
> Not printing any commits (the error that I notified you before). It
> is because of this function. As lookup table is enabled by default,
> `prepare_bitmap_git` function doesn't load each bitmap entries and
> thus the below code in this function doesn't provide the bitmapped
> commit list (because Hashtable didn't generated).
>
>         kh_foreach(bitmap_git->bitmaps, oid, value, {
> 		printf("%s\n", oid_to_hex(&oid));
> 	});
>
> So, the simplest fix I found was this. Should I make a function then
> (Which you suggested here)?

I see. I remember that issue, but I think we should go about fixing it
in a different way. Instead of tricking the code into loading all
bitmaps by pretending the lookup table doesn't exist, we should have a
function that forces loading in all bitmaps from the lookup table, if
one exists. If the lookup table doesn't exist, or we have already loaded
its entries, then that function can be noop.

If we had something like that, we could call that function from within
`test_bitmap_commits()` before reading the keys and values out of
`bitmap_git->bitmaps`.

An alternative approach would be to read the table directly when it
exists, perhaps something like this:

--- 8< ---

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9e09c5824f..3bda059b9f 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1921,22 +1921,28 @@ int test_bitmap_commits(struct repository *r)
 {
 	struct bitmap_index *bitmap_git = NULL;
 	struct object_id oid;
-	MAYBE_UNUSED void *value;
-
-	/* As this function is only used to print bitmap selected
-	 * commits, we don't have to read the commit table.
-	 */
-	setenv("GIT_TEST_READ_COMMIT_TABLE", "0", 1);

 	bitmap_git = prepare_bitmap_git(r);
 	if (!bitmap_git)
 		die("failed to load bitmap indexes");

-	kh_foreach(bitmap_git->bitmaps, oid, value, {
-		printf("%s\n", oid_to_hex(&oid));
-	});
+	if (bitmap_git->table_lookup) {
+		uint32_t i, commit_pos;
+		for (i = 0; i < bitmap_git->entry_count; i++) {
+			commit_pos = get_be32(bitmap_get_triplet(bitmap_git, i));
+			if (nth_bitmap_object_oid(bitmap_git, &oid,
+						  commit_pos) < 0)
+				return error("could not find commit at "
+					     "position %"PRIu32, commit_pos);
+			printf("%s\n", oid_to_hex(&oid));
+		}
+	} else {
+		MAYBE_UNUSED void *value;
+		kh_foreach(bitmap_git->bitmaps, oid, value, {
+			printf("%s\n", oid_to_hex(&oid));
+		});
+	}

-	setenv("GIT_TEST_READ_COMMIT_TABLE", "1", 1);
 	free_bitmap_index(bitmap_git);

 	return 0;

--- >8 ---

Thanks,
Taylor
Taylor Blau June 29, 2022, 8:41 p.m. UTC | #10
On Wed, Jun 29, 2022 at 04:37:38PM -0400, Taylor Blau wrote:
> +				return error("could not find commit at "
> +					     "position %"PRIu32, commit_pos);

Oops. Pretend that I marked this string for translation ;-).

Thanks,
Taylor
Abhradeep Chakraborty June 30, 2022, 6:58 a.m. UTC | #11
Taylor Blau <me@ttaylorr.com> wrote:

> That shouldn't be the case. When you have a chance, take a look at the
> alloc_nr macro, which shows how much memory we allocate at each
> step:
>
>     #define alloc_nr(x) (((x)+16)*3/2)
>
> Suppose we allocated 16 slots initially, so nr (the number of entries
> stored in the list) is 0 and alloc (the number of entries allocated) is
> 16. Then when we try to add the 17th item, we'll pass 16 to alloc_nr
> which will allocate 48 slots. Then 96, then 168, and so on.
>
> We only have to reallocate and copy the array when nr > alloc, which
> should be fairly infrequently, and happens less and less often the
> larger the array grows.

Ohh, I misunderstood the ALLOC_GROW function. Thanks!
Abhradeep Chakraborty June 30, 2022, 8:35 a.m. UTC | #12
Taylor Blau <me@ttaylorr.com> wrote:

> I think having some indication (maybe via a trace2 region?) that we're
> actually executing this code would be good. Although it's going to be
> *really* noisy, so probably not a good idea to do that in general.
>
> Stolee runs some coverage tests that show lines that we aren't
> exercising via tests. So making sure that this doesn't show up in that
> report when you run it locally would be good.
>
> See some information from him about how to run those tests locally here:
>
>     https://lore.kernel.org/git/00a57a1d-0566-8f54-26b2-0f3558bde88d@github.com/
>
> (TL;DR: run `make coverage-test` and make sure that these lines don't
> show up ;-)).

Got it. Thanks.

> If we had something like that, we could call that function from within
> `test_bitmap_commits()` before reading the keys and values out of
> `bitmap_git->bitmaps`.
>
> An alternative approach would be to read the table directly when it
> exists, perhaps something like this:

I think we have a simpler fix than what you suggested here. What if
We do it like this way -

    if (bitmap_git->table_lookup) {
	if (load_bitmap_entries_v1(bitmap_git) < 0)
	    die(_("failed to load bitmap indexes"));
    }

Is this okay for you?
diff mbox series

Patch

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 36134222d7a..9e09c5824fc 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -82,6 +82,12 @@  struct bitmap_index {
 	/* The checksum of the packfile or MIDX; points into map. */
 	const unsigned char *checksum;
 
+	/*
+	 * If not NULL, this point into the commit table extension
+	 * (within map).
+	 */
+	unsigned char *table_lookup;
+
 	/*
 	 * Extended index.
 	 *
@@ -185,6 +191,22 @@  static int load_bitmap_header(struct bitmap_index *index)
 			index->hashes = (void *)(index_end - cache_size);
 			index_end -= cache_size;
 		}
+
+		if (flags & BITMAP_OPT_LOOKUP_TABLE &&
+			git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1)) {
+			size_t table_size = 0;
+			size_t triplet_sz = st_add3(sizeof(uint32_t),    /* commit position */
+							sizeof(uint64_t),    /* offset */
+							sizeof(uint32_t));    /* xor offset */
+
+			table_size = st_add(table_size,
+					st_mult(ntohl(header->entry_count),
+						triplet_sz));
+			if (table_size > index_end - index->map - header_size)
+				return error("corrupted bitmap index file (too short to fit lookup table)");
+			index->table_lookup = (void *)(index_end - table_size);
+			index_end -= table_size;
+		}
 	}
 
 	index->entry_count = ntohl(header->entry_count);
@@ -211,12 +233,20 @@  static struct stored_bitmap *store_bitmap(struct bitmap_index *index,
 
 	hash_pos = kh_put_oid_map(index->bitmaps, stored->oid, &ret);
 
-	/* a 0 return code means the insertion succeeded with no changes,
-	 * because the SHA1 already existed on the map. this is bad, there
-	 * shouldn't be duplicated commits in the index */
+	/* A 0 return code means the insertion succeeded with no changes,
+	 * because the SHA1 already existed on the map. If lookup table
+	 * is NULL, this is bad, there shouldn't be duplicated commits
+	 * in the index.
+	 *
+	 * If table_lookup exists, that means the desired bitmap is already
+	 * loaded. Either this bitmap has been stored directly or another
+	 * bitmap has a direct or indirect xor relation with it. */
 	if (ret == 0) {
-		error("Duplicate entry in bitmap index: %s", oid_to_hex(oid));
-		return NULL;
+		if (!index->table_lookup) {
+			error("Duplicate entry in bitmap index: %s", oid_to_hex(oid));
+			return NULL;
+		}
+		return kh_value(index->bitmaps, hash_pos);
 	}
 
 	kh_value(index->bitmaps, hash_pos) = stored;
@@ -470,7 +500,7 @@  static int load_bitmap(struct bitmap_index *bitmap_git)
 		!(bitmap_git->tags = read_bitmap_1(bitmap_git)))
 		goto failed;
 
-	if (load_bitmap_entries_v1(bitmap_git) < 0)
+	if (!bitmap_git->table_lookup && load_bitmap_entries_v1(bitmap_git) < 0)
 		goto failed;
 
 	return 0;
@@ -557,13 +587,145 @@  struct include_data {
 	struct bitmap *seen;
 };
 
+static inline const void *bitmap_get_triplet(struct bitmap_index *bitmap_git, uint32_t xor_pos)
+{
+	size_t triplet_sz = st_add3(sizeof(uint32_t), sizeof(uint64_t), sizeof(uint32_t));
+	const void *p = bitmap_git->table_lookup + st_mult(xor_pos, triplet_sz);
+	return p;
+}
+
+static uint64_t triplet_get_offset(const void *triplet)
+{
+	const void *p = (unsigned char*) triplet + sizeof(uint32_t);
+	return get_be64(p);
+}
+
+static uint32_t triplet_get_xor_pos(const void *triplet)
+{
+	const void *p = (unsigned char*) triplet + st_add(sizeof(uint32_t), sizeof(uint64_t));
+	return get_be32(p);
+}
+
+static int triplet_cmp(const void *va, const void *vb)
+{
+	int result = 0;
+	uint32_t *a = (uint32_t *) va;
+	uint32_t b = get_be32(vb);
+	if (*a > b)
+		result = 1;
+	else if (*a < b)
+		result = -1;
+	else
+		result = 0;
+
+	return result;
+}
+
+static uint32_t bsearch_pos(struct bitmap_index *bitmap_git, struct object_id *oid,
+						uint32_t *result)
+{
+	int found;
+
+	if (bitmap_git->midx)
+		found = bsearch_midx(oid, bitmap_git->midx, result);
+	else
+		found = bsearch_pack(oid, bitmap_git->pack, result);
+
+	return found;
+}
+
+static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
+					  struct commit *commit)
+{
+	uint32_t commit_pos, xor_pos;
+	uint64_t offset;
+	int flags;
+	const void *triplet = NULL;
+	struct object_id *oid = &commit->object.oid;
+	struct ewah_bitmap *bitmap;
+	struct stored_bitmap *xor_bitmap = NULL;
+	size_t triplet_sz = st_add3(sizeof(uint32_t), sizeof(uint64_t), sizeof(uint32_t));
+
+	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
+
+	if (!found)
+		return NULL;
+
+	triplet = bsearch(&commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
+						triplet_sz, triplet_cmp);
+	if (!triplet)
+		return NULL;
+
+	offset = triplet_get_offset(triplet);
+	xor_pos = triplet_get_xor_pos(triplet);
+
+	if (xor_pos != 0xffffffff) {
+		int xor_flags;
+		uint64_t offset_xor;
+		uint32_t *xor_positions;
+		struct object_id xor_oid;
+		size_t size = 0;
+
+		ALLOC_ARRAY(xor_positions, bitmap_git->entry_count);
+		while (xor_pos != 0xffffffff) {
+			xor_positions[size++] = xor_pos;
+			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
+			xor_pos = triplet_get_xor_pos(triplet);
+		}
+
+		while (size){
+			xor_pos = xor_positions[size - 1];
+			triplet = bitmap_get_triplet(bitmap_git, xor_pos);
+			commit_pos = get_be32(triplet);
+			offset_xor = triplet_get_offset(triplet);
+
+			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, commit_pos) < 0) {
+				free(xor_positions);
+				return NULL;
+			}
+
+			bitmap_git->map_pos = offset_xor + sizeof(uint32_t) + sizeof(uint8_t);
+			xor_flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
+			bitmap = read_bitmap_1(bitmap_git);
+
+			if (!bitmap){
+				free(xor_positions);
+				return NULL;
+			}
+
+			xor_bitmap = store_bitmap(bitmap_git, bitmap, &xor_oid, xor_bitmap, xor_flags);
+			size--;
+		}
+
+		free(xor_positions);
+	}
+
+	bitmap_git->map_pos = offset + sizeof(uint32_t) + sizeof(uint8_t);
+	flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
+	bitmap = read_bitmap_1(bitmap_git);
+
+	if (!bitmap)
+		return NULL;
+
+	return store_bitmap(bitmap_git, bitmap, oid, xor_bitmap, flags);
+}
+
 struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git,
 				      struct commit *commit)
 {
 	khiter_t hash_pos = kh_get_oid_map(bitmap_git->bitmaps,
 					   commit->object.oid);
-	if (hash_pos >= kh_end(bitmap_git->bitmaps))
-		return NULL;
+	if (hash_pos >= kh_end(bitmap_git->bitmaps)) {
+		struct stored_bitmap *bitmap = NULL;
+		if (!bitmap_git->table_lookup)
+			return NULL;
+
+		/* NEEDSWORK: cache misses aren't recorded */
+		bitmap = lazy_bitmap_for_commit(bitmap_git, commit);
+		if(!bitmap)
+			return NULL;
+		return lookup_stored_bitmap(bitmap);
+	}
 	return lookup_stored_bitmap(kh_value(bitmap_git->bitmaps, hash_pos));
 }
 
@@ -1699,9 +1861,13 @@  void test_bitmap_walk(struct rev_info *revs)
 	if (revs->pending.nr != 1)
 		die("you must specify exactly one commit to test");
 
-	fprintf(stderr, "Bitmap v%d test (%d entries loaded)\n",
+	fprintf(stderr, "Bitmap v%d test (%d entries)\n",
 		bitmap_git->version, bitmap_git->entry_count);
 
+	if (!bitmap_git->table_lookup)
+		fprintf(stderr, "Bitmap v%d test (%d entries loaded)\n",
+			bitmap_git->version, bitmap_git->entry_count);
+
 	root = revs->pending.objects[0].item;
 	bm = bitmap_for_commit(bitmap_git, (struct commit *)root);
 
@@ -1753,10 +1919,16 @@  void test_bitmap_walk(struct rev_info *revs)
 
 int test_bitmap_commits(struct repository *r)
 {
-	struct bitmap_index *bitmap_git = prepare_bitmap_git(r);
+	struct bitmap_index *bitmap_git = NULL;
 	struct object_id oid;
 	MAYBE_UNUSED void *value;
 
+	/* As this function is only used to print bitmap selected
+	 * commits, we don't have to read the commit table.
+	 */
+	setenv("GIT_TEST_READ_COMMIT_TABLE", "0", 1);
+
+	bitmap_git = prepare_bitmap_git(r);
 	if (!bitmap_git)
 		die("failed to load bitmap indexes");
 
@@ -1764,6 +1936,7 @@  int test_bitmap_commits(struct repository *r)
 		printf("%s\n", oid_to_hex(&oid));
 	});
 
+	setenv("GIT_TEST_READ_COMMIT_TABLE", "1", 1);
 	free_bitmap_index(bitmap_git);
 
 	return 0;
diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh
index c669ed959e9..10d7691d973 100755
--- a/t/t5310-pack-bitmaps.sh
+++ b/t/t5310-pack-bitmaps.sh
@@ -42,6 +42,12 @@  test_expect_success 'full repack creates bitmaps' '
 	grep "\"label\":\"writing_lookup_table\"" trace
 '
 
+test_expect_success 'using lookup table loads only necessary bitmaps' '
+	git rev-list --test-bitmap HEAD 2>out &&
+	! grep "Bitmap v1 test (106 entries loaded)" out &&
+	grep "Found bitmap for" out
+'
+
 basic_bitmap_tests
 
 test_expect_success 'incremental repack fails when bitmaps are requested' '
@@ -255,6 +261,7 @@  test_expect_success 'pack reuse respects --incremental' '
 
 test_expect_success 'truncated bitmap fails gracefully (ewah)' '
 	test_config pack.writebitmaphashcache false &&
+	test_config pack.writebitmaplookuptable false &&
 	git repack -ad &&
 	git rev-list --use-bitmap-index --count --all >expect &&
 	bitmap=$(ls .git/objects/pack/*.bitmap) &&
diff --git a/t/t5326-multi-pack-bitmaps.sh b/t/t5326-multi-pack-bitmaps.sh
index 43be49617b8..7d36dbcf722 100755
--- a/t/t5326-multi-pack-bitmaps.sh
+++ b/t/t5326-multi-pack-bitmaps.sh
@@ -320,4 +320,5 @@  test_expect_success 'multi-pack-index write writes lookup table if enabled' '
 		grep "\"label\":\"writing_lookup_table\"" trace
 	)
 '
+
 test_done