diff mbox series

[v3,4/6] pack-bitmap: prepare to read lookup table extension

Message ID e64362621d235f2c79f52e984de7a2a2794e2842.1656924376.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series bitmap: integrate a lookup table extension to the bitmap format | expand

Commit Message

Abhradeep Chakraborty July 4, 2022, 8:46 a.m. UTC
From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>

Earlier change teaches Git to write bitmap lookup table. But Git
does not know how to parse them.

Teach Git to parse the existing bitmap lookup table. The older
versions of Git are not affected by it. Those versions ignore the
lookup table.

Mentored-by: Taylor Blau <me@ttaylorr.com>
Co-Mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
---
 pack-bitmap.c           | 266 ++++++++++++++++++++++++++++++++++++++--
 pack-bitmap.h           |   9 ++
 t/t5310-pack-bitmaps.sh |  22 ++++
 3 files changed, 287 insertions(+), 10 deletions(-)

Comments

Taylor Blau July 15, 2022, 2:46 a.m. UTC | #1
On Mon, Jul 04, 2022 at 08:46:14AM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> +/*
> + * Searches for a matching triplet. `va` is a pointer
> + * to the wanted commit position value. `vb` points to
> + * a triplet in lookup table. The first 4 bytes of each
> + * triplet (pointed by `vb`) are compared with `*va`.
> + */
> +static int triplet_cmp(const void *va, const void *vb)
> +{
> +
> +	uint32_t a = *(uint32_t *)va;

The comment you added is definitely helpful, but I still think that this
line is a little magical. `*va` isn't really a pointer to a `uint32_t`,
but a pointer to the start of a triplet, which just *happens* to have a
4-byte integer at the beginning of it.

I don't think there's a way to improve this much more than we already
have, though. Populating a triplet struct to just dereference the first
field feels wasteful and slow. So I think what you have here makes sense
to me.

> +static uint32_t bsearch_pos(struct bitmap_index *bitmap_git,
> +			    struct object_id *oid,
> +			    uint32_t *result)
> +{
> +	int found;
> +
> +	if (bitmap_is_midx(bitmap_git))
> +		found = bsearch_midx(oid, bitmap_git->midx, result);
> +	else
> +		found = bsearch_pack(oid, bitmap_git->pack, result);
> +
> +	return found;
> +}
> +
> +/*
> + * `bsearch_triplet` function searches for the raw triplet having
> + * commit position same as `commit_pos` and fills `triplet`
> + * object from the raw triplet. Returns 1 on success and 0
> + * on failure.
> + */
> +static int bsearch_triplet(uint32_t *commit_pos,
> +			   struct bitmap_index *bitmap_git,
> +			   struct bitmap_lookup_table_triplet *triplet)
> +{
> +	unsigned char *p = bsearch(commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
> +				   BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH, triplet_cmp);
> +
> +	if (!p)
> +		return 0;
> +	triplet->commit_pos = get_be32(p);
> +	p += sizeof(uint32_t);
> +	triplet->offset = get_be64(p);
> +	p += sizeof(uint64_t);
> +	triplet->xor_row = get_be32(p);
> +	return 1;
> +}

This implementation jumped out as being quite similar to
`lookup_table_get_triplet()`. Ultimately they both end up filling a
triplet struct based on some position `p` within the bitmap. The main
difference being that in `lookup_table_get_triplet()`, `p` comes from a
numeric position which indexes into the table, while in
`bsearch_triplet()` the position `p` is given to us by a call to
`bsearch()`.

I wonder if it would be worth extracting the common part of: given a
pointer `p` and a triplet struct, read the triplet beginning at `p` into
the struct.

`lookup_table_get_triplet()` could compute `p` and then return the
result of calling the new auxiliary function with that `p`. Similarly
for `bsearch_triplet()`, it would call that auxiliary function with the
pointer it got from calling `bsearch()`, or return `0` if no match was
found.

It's a minor point, but I think it would help us clean up the
implementation a little bit.

> +
> +static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
> +					  struct commit *commit)
> +{
> +	uint32_t commit_pos, xor_row;
> +	uint64_t offset;
> +	int flags;
> +	struct bitmap_lookup_table_triplet triplet;
> +	struct object_id *oid = &commit->object.oid;
> +	struct ewah_bitmap *bitmap;
> +	struct stored_bitmap *xor_bitmap = NULL;
> +
> +	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
> +
> +	if (!found)
> +		return NULL;
> +
> +	if (!bsearch_triplet(&commit_pos, bitmap_git, &triplet))
> +		return NULL;
> +
> +	offset = triplet.offset;
> +	xor_row = triplet.xor_row;
> +
> +	if (xor_row != 0xffffffff) {
> +		int xor_flags;
> +		khiter_t hash_pos;
> +		uint64_t offset_xor;
> +		struct bitmap_lookup_table_xor_item *xor_items;
> +		struct bitmap_lookup_table_xor_item xor_item;
> +		size_t xor_items_nr = 0, xor_items_alloc = 64;
> +
> +		ALLOC_ARRAY(xor_items, xor_items_alloc);

This ALLOC_ARRAY() looks great to me. I wonder if we could amortize the
cost of allocating in this (somewhat) hot function by treating the
`xor_items` array as a reusable static buffer where we reset
xor_items_nr to 0 when entering this function.

> +		while (xor_row != 0xffffffff) {
> +			struct object_id xor_oid;
> +
> +			if (xor_items_nr + 1 >= bitmap_git->entry_count) {
> +				free(xor_items);
> +				error(_("corrupt bitmap lookup table: xor chain exceed entry count"));

I think we can probably `die()` here, we're pretty much out of luck in
this case.

> +				return NULL;
> +			}
> +
> +			if (lookup_table_get_triplet(bitmap_git, xor_row, &triplet) < 0)
> +				return NULL;
> +
> +			offset_xor = triplet.offset;
> +
> +			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, triplet.commit_pos) < 0) {
> +				free(xor_items);
> +				error(_("corrupt bitmap lookup table: commit index %u out of range"),
> +					triplet.commit_pos);

Same here.

> +				return NULL;
> +			}
> +
> +			hash_pos = kh_get_oid_map(bitmap_git->bitmaps, xor_oid);
> +
> +			/*
> +			 * If desired bitmap is already stored, we don't need
> +			 * to iterate further. Because we know that bitmaps
> +			 * that are needed to be parsed to parse this bitmap
> +			 * has already been stored. So, assign this stored bitmap
> +			 * to the xor_bitmap.
> +			 */
> +			if (hash_pos < kh_end(bitmap_git->bitmaps) &&
> +			    (xor_bitmap = kh_value(bitmap_git->bitmaps, hash_pos)))
> +				break;
> +
> +			ALLOC_GROW(xor_items, xor_items_nr + 1, xor_items_alloc);
> +			xor_items[xor_items_nr++] = (struct bitmap_lookup_table_xor_item) {.oid = xor_oid,
> +											   .offset = offset_xor};

This style of initialization is somewhat uncommon for Git's codebase. It
might be a little more natural to write something like:

    xor_items[xor_items_nr].oid = xor_oid;
    xor_items[xor_items_nr].offset = offset_xor;
    xor_items_nr++;

But the struct-copying for `xor_oid` is definitely uncommon for us. We
should use the `oidcpy()` helper there instead. Or better yet, pass a
pointer to `&xor_items[xor_items_nr].oid` as the second argument to
`nth_bitmap_object_oid()` to avoid the copy altogether.

> +			xor_row = triplet.xor_row;
> +		}
> +
> +		while (xor_items_nr) {
> +			xor_item = xor_items[xor_items_nr - 1];
> +			offset_xor = xor_item.offset;
> +
> +			bitmap_git->map_pos = offset_xor;
> +			if (bitmap_git->map_size - bitmap_git->map_pos < 6) {

Should we extract `6` out to a named constant?

Thanks,
Taylor
Abhradeep Chakraborty July 15, 2022, 4:38 p.m. UTC | #2
On Fri, Jul 15, 2022 at 8:16 AM Taylor Blau <me@ttaylorr.com> wrote:
>
> On Mon, Jul 04, 2022 at 08:46:14AM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> > +/*
> > + * Searches for a matching triplet. `va` is a pointer
> > + * to the wanted commit position value. `vb` points to
> > + * a triplet in lookup table. The first 4 bytes of each
> > + * triplet (pointed by `vb`) are compared with `*va`.
> > + */
> > +static int triplet_cmp(const void *va, const void *vb)
> > +{
> > +
> > +     uint32_t a = *(uint32_t *)va;
>
> The comment you added is definitely helpful, but I still think that this
> line is a little magical. `*va` isn't really a pointer to a `uint32_t`,
> but a pointer to the start of a triplet, which just *happens* to have a
> 4-byte integer at the beginning of it.

Are you sure about this? As far as I know, the first parameter of such
comparing functions is always a pointer to the given key that we need
to search for and the second parameter points to each element of an
array.

I think "`va is a pointer to the wanted commit position value" is not
that descriptive. Maybe "`va` is a pointer to the given key" is
better. What do you think?

> > + * `bsearch_triplet` function searches for the raw triplet having
> > + * commit position same as `commit_pos` and fills `triplet`
> > + * object from the raw triplet. Returns 1 on success and 0
> > + * on failure.
> > + */
> > +static int bsearch_triplet(uint32_t *commit_pos,
> > +                        struct bitmap_index *bitmap_git,
> > +                        struct bitmap_lookup_table_triplet *triplet)
> > +{
> > +     unsigned char *p = bsearch(commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
> > +                                BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH, triplet_cmp);
> > +
> > +     if (!p)
> > +             return 0;
> > +     triplet->commit_pos = get_be32(p);
> > +     p += sizeof(uint32_t);
> > +     triplet->offset = get_be64(p);
> > +     p += sizeof(uint64_t);
> > +     triplet->xor_row = get_be32(p);
> > +     return 1;
> > +}
>
> This implementation jumped out as being quite similar to
> `lookup_table_get_triplet()`. Ultimately they both end up filling a
> triplet struct based on some position `p` within the bitmap. The main
> difference being that in `lookup_table_get_triplet()`, `p` comes from a
> numeric position which indexes into the table, while in
> `bsearch_triplet()` the position `p` is given to us by a call to
> `bsearch()`.
>
> I wonder if it would be worth extracting the common part of: given a
> pointer `p` and a triplet struct, read the triplet beginning at `p` into
> the struct.
>
> `lookup_table_get_triplet()` could compute `p` and then return the
> result of calling the new auxiliary function with that `p`. Similarly
> for `bsearch_triplet()`, it would call that auxiliary function with the
> pointer it got from calling `bsearch()`, or return `0` if no match was
> found.
>
> It's a minor point, but I think it would help us clean up the
> implementation a little bit.

Sure! That would be a great idea!

> > +             ALLOC_ARRAY(xor_items, xor_items_alloc);
>
> This ALLOC_ARRAY() looks great to me. I wonder if we could amortize the
> cost of allocating in this (somewhat) hot function by treating the
> `xor_items` array as a reusable static buffer where we reset
> xor_items_nr to 0 when entering this function.
>
> > +             while (xor_row != 0xffffffff) {
> > +                     struct object_id xor_oid;
> > +
> > +                     if (xor_items_nr + 1 >= bitmap_git->entry_count) {
> > +                             free(xor_items);
> > +                             error(_("corrupt bitmap lookup table: xor chain exceed entry count"));
>
> I think we can probably `die()` here, we're pretty much out of luck in
> this case.
> ...
> > +                             error(_("corrupt bitmap lookup table: commit index %u out of range"),
> > +                                     triplet.commit_pos);
>
> Same here.

I didn't use `die()` here because I thought returning NULL would be a
better idea. In that case, Git can still do its job by using the
traditional approach  - traversing  between objects.
`load_bitmap_entries_v1` also returns NULL if an error occurs. What do
you think?

> > +                     ALLOC_GROW(xor_items, xor_items_nr + 1, xor_items_alloc);
> > +                     xor_items[xor_items_nr++] = (struct bitmap_lookup_table_xor_item) {.oid = xor_oid,
> > +                                                                                        .offset = offset_xor};
>
> This style of initialization is somewhat uncommon for Git's codebase. It
> might be a little more natural to write something like:
>
>     xor_items[xor_items_nr].oid = xor_oid;
>     xor_items[xor_items_nr].offset = offset_xor;
>     xor_items_nr++;
>
> But the struct-copying for `xor_oid` is definitely uncommon for us. We
> should use the `oidcpy()` helper there instead. Or better yet, pass a
> pointer to `&xor_items[xor_items_nr].oid` as the second argument to
> `nth_bitmap_object_oid()` to avoid the copy altogether.

Ok, got it.

> > +                     bitmap_git->map_pos = offset_xor;
> > +                     if (bitmap_git->map_size - bitmap_git->map_pos < 6) {
>
> Should we extract `6` out to a named constant?

Ok, sure!

Thanks :)

>
> Thanks,
> Taylor
Taylor Blau July 15, 2022, 10:20 p.m. UTC | #3
On Fri, Jul 15, 2022 at 10:08:17PM +0530, Abhradeep Chakraborty wrote:
> On Fri, Jul 15, 2022 at 8:16 AM Taylor Blau <me@ttaylorr.com> wrote:
> >
> > On Mon, Jul 04, 2022 at 08:46:14AM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> > > +/*
> > > + * Searches for a matching triplet. `va` is a pointer
> > > + * to the wanted commit position value. `vb` points to
> > > + * a triplet in lookup table. The first 4 bytes of each
> > > + * triplet (pointed by `vb`) are compared with `*va`.
> > > + */
> > > +static int triplet_cmp(const void *va, const void *vb)
> > > +{
> > > +
> > > +     uint32_t a = *(uint32_t *)va;
> >
> > The comment you added is definitely helpful, but I still think that this
> > line is a little magical. `*va` isn't really a pointer to a `uint32_t`,
> > but a pointer to the start of a triplet, which just *happens* to have a
> > 4-byte integer at the beginning of it.
>
> Are you sure about this? As far as I know, the first parameter of such
> comparing functions is always a pointer to the given key that we need
> to search for and the second parameter points to each element of an
> array.
>
> I think "`va is a pointer to the wanted commit position value" is not
> that descriptive. Maybe "`va` is a pointer to the given key" is
> better. What do you think?

Yes, the first argument to the comparison function used in bsearch() is
a pointer to some element in the array. I just meant that that array is
the bitmap_git->table_lookup region, so each element isn't actually a
uint32_t array, but the whole thing is an array of (uint32_t, uint64_t,
uint32_t) triplets.

What you wrote here is fine, and I don't even think that the comment
needs updating. If you did want to clarify, I think you could say
something along the lines of what you wrote above ("`va` is a pointer to
an array element") and add something along the lines of "where the array
is the lookup table region of the .bitmap".

> > > +             ALLOC_ARRAY(xor_items, xor_items_alloc);
> >
> > This ALLOC_ARRAY() looks great to me. I wonder if we could amortize the
> > cost of allocating in this (somewhat) hot function by treating the
> > `xor_items` array as a reusable static buffer where we reset
> > xor_items_nr to 0 when entering this function.
> >
> > > +             while (xor_row != 0xffffffff) {
> > > +                     struct object_id xor_oid;
> > > +
> > > +                     if (xor_items_nr + 1 >= bitmap_git->entry_count) {
> > > +                             free(xor_items);
> > > +                             error(_("corrupt bitmap lookup table: xor chain exceed entry count"));
> >
> > I think we can probably `die()` here, we're pretty much out of luck in
> > this case.
> > ...
> > > +                             error(_("corrupt bitmap lookup table: commit index %u out of range"),
> > > +                                     triplet.commit_pos);
> >
> > Same here.
>
> I didn't use `die()` here because I thought returning NULL would be a
> better idea. In that case, Git can still do its job by using the
> traditional approach  - traversing  between objects.
> `load_bitmap_entries_v1` also returns NULL if an error occurs. What do
> you think?

Ah, I wasn't aware that our callers are graceful enough to handle this
like that. Yes, if we can fallback gracefully, we should, so I think
just error()-ing here (and above) is the right choice. Thanks for saying
so.

Thanks,
Taylor
Martin Ågren July 18, 2022, 9:06 a.m. UTC | #4
Hi Abhradeep and Taylor,

I very much enjoy following from a distance Abhradeep's work on this
series and all the reviewing and mentoring. I don't grasp anywhere near
all the details, but I've looked into this a bit:

On Sat, 16 Jul 2022 at 00:37, Taylor Blau <me@ttaylorr.com> wrote:
>
> On Fri, Jul 15, 2022 at 10:08:17PM +0530, Abhradeep Chakraborty wrote:
> > On Fri, Jul 15, 2022 at 8:16 AM Taylor Blau <me@ttaylorr.com> wrote:
> > >
> > > On Mon, Jul 04, 2022 at 08:46:14AM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> > > > +/*
> > > > + * Searches for a matching triplet. `va` is a pointer
> > > > + * to the wanted commit position value. `vb` points to
> > > > + * a triplet in lookup table. The first 4 bytes of each
> > > > + * triplet (pointed by `vb`) are compared with `*va`.
> > > > + */
> > > > +static int triplet_cmp(const void *va, const void *vb)
> > > > +{
> > > > +
> > > > +     uint32_t a = *(uint32_t *)va;
> > >
> > > The comment you added is definitely helpful, but I still think that this
> > > line is a little magical. `*va` isn't really a pointer to a `uint32_t`,
> > > but a pointer to the start of a triplet, which just *happens* to have a
> > > 4-byte integer at the beginning of it.

Yeah, this all looks quite magical with the casting, and with the
asymmetric handling of `va` and `vb`.

> > Are you sure about this? As far as I know, the first parameter of such
> > comparing functions is always a pointer to the given key that we need
> > to search for and the second parameter points to each element of an
> > array.

Yes, that matches my understanding and the man-page for bsearch(3):

  "The compar routine is expected to have two arguments which point to
  the key object and to an array member, in that order, [...]"

I think it would help to make this something like

  static int triplet_cmp(const void *key, const void *array_item)

to really highlight this asymmetric nature of this function, or to make
clear how the values flow through our call-chain through something like

  static int triplet_cmp(const void *commit_pos, const void *table_entry)

Because we really do rely on this promise of bsearch(3) -- if we would
instantiate a 'dummy' triplet carrying the key, we wouldn't need to (but
we would instead need to have our `cmp` function constantly re-read the
same value, including doing the byteswap).

Would it make sense to let the `const void *key` directly carry the
32-bit value and hope that `sizeof(key) >= sizeof(uint32_t)`? That's
probably too magical, "just" to save on dereferencing.

One thing that could perhaps make things clearer is if
`bsearch_triplet()` did take the position directly, rather than as a
pointer:

-static int bsearch_triplet(uint32_t *commit_pos,
+static int bsearch_triplet(uint32_t commit_pos,
                           struct bitmap_index *bitmap_git,
                           struct bitmap_lookup_table_triplet *triplet)
 {
-       unsigned char *p = bsearch(commit_pos,
bitmap_git->table_lookup, bitmap_git->entry_count,
+       unsigned char *p = bsearch(&commit_pos,
bitmap_git->table_lookup, bitmap_git->entry_count,
                                   BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH,
triplet_cmp);


Also, maybe s/bsearch_triplet/&_by_pos/ could clarify the intent of this
function?

> > I think "`va is a pointer to the wanted commit position value" is not
> > that descriptive. Maybe "`va` is a pointer to the given key" is
> > better. What do you think?
>
> Yes, the first argument to the comparison function used in bsearch() is

s/first/second/

> a pointer to some element in the array. I just meant that that array is
> the bitmap_git->table_lookup region, so each element isn't actually a
> uint32_t array, but the whole thing is an array of (uint32_t, uint64_t,
> uint32_t) triplets.
>
> What you wrote here is fine, and I don't even think that the comment
> needs updating. If you did want to clarify, I think you could say
> something along the lines of what you wrote above ("`va` is a pointer to
> an array element") and add something along the lines of "where the array
> is the lookup table region of the .bitmap".

I mentioned a few ideas for clarifying things above. I do think it would
be a good idea to differentiate the names of `va` and `vb` to make the
fundamental asymmetry between them clearer. The rest of my comments are
really just musings.

I originally started looking at this because I wanted to see why the
casting to a `uint32_t *` and dereferencing it was safe. The reason is,
we're always handling the same pointer to a `uint32_t` on the stack, so
alignment is guaranteed.


Martin
Abhradeep Chakraborty July 18, 2022, 7:25 p.m. UTC | #5
On Mon, Jul 18, 2022 at 2:37 PM Martin Ågren <martin.agren@gmail.com> wrote:
>
> Hi Abhradeep and Taylor,
>
> I very much enjoy following from a distance Abhradeep's work on this
> series and all the reviewing and mentoring. I don't grasp anywhere near
> all the details, but I've looked into this a bit:

Thanks!

>   "The compar routine is expected to have two arguments which point to
>   the key object and to an array member, in that order, [...]"
>
> I think it would help to make this something like
>
>   static int triplet_cmp(const void *key, const void *array_item)
>
> to really highlight this asymmetric nature of this function, or to make
> clear how the values flow through our call-chain through something like
>
>   static int triplet_cmp(const void *commit_pos, const void *table_entry)

Nice. Will update it.

> Would it make sense to let the `const void *key` directly carry the
> 32-bit value and hope that `sizeof(key) >= sizeof(uint32_t)`? That's
> probably too magical, "just" to save on dereferencing.

I do not have any particular opinion here. I will do whatever you think is best.

> One thing that could perhaps make things clearer is if
> `bsearch_triplet()` did take the position directly, rather than as a
> pointer:
>
> -static int bsearch_triplet(uint32_t *commit_pos,
> +static int bsearch_triplet(uint32_t commit_pos,
>                            struct bitmap_index *bitmap_git,
>                            struct bitmap_lookup_table_triplet *triplet)
>  {
> -       unsigned char *p = bsearch(commit_pos,
> bitmap_git->table_lookup, bitmap_git->entry_count,
> +       unsigned char *p = bsearch(&commit_pos,
> bitmap_git->table_lookup, bitmap_git->entry_count,
>                                    BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH,
> triplet_cmp);
>
>
> Also, maybe s/bsearch_triplet/&_by_pos/ could clarify the intent of this
> function?

Ok, sure!

Thanks :)
Martin Ågren July 18, 2022, 11:26 p.m. UTC | #6
On Mon, 18 Jul 2022 at 21:26, Abhradeep Chakraborty
<chakrabortyabhradeep79@gmail.com> wrote:
>
> On Mon, Jul 18, 2022 at 2:37 PM Martin Ågren <martin.agren@gmail.com> wrote:
> >
> > Would it make sense to let the `const void *key` directly carry the
> > 32-bit value and hope that `sizeof(key) >= sizeof(uint32_t)`? That's
> > probably too magical, "just" to save on dereferencing.
>
> I do not have any particular opinion here. I will do whatever you think is best.

To be honest, I think it would be better not to do that. I floated it as
a random idea, but it's somewhere in the vicinity of undefined behavior,
and in any case, it might be a bit too tricky. If we're doing a byteswap
anyway (on virtually all platforms) and doing a bunch of comparisons,
trying to save on a dereference doesn't seem worth the increased "huh"
factor.

Martin
Taylor Blau July 26, 2022, 12:45 a.m. UTC | #7
Hi Martin,

> > > > The comment you added is definitely helpful, but I still think that this
> > > > line is a little magical. `*va` isn't really a pointer to a `uint32_t`,
> > > > but a pointer to the start of a triplet, which just *happens* to have a
> > > > 4-byte integer at the beginning of it.
>
> Yeah, this all looks quite magical with the casting, and with the
> asymmetric handling of `va` and `vb`.

Yeah, this was my main point (which I didn't intend to create as much of
a digression with as I appear to have!).

The handling here is all correct, but what I was saying was that even
though we're treating `*vb` as a pointer to a `uint32_t`, reading vb[1]
is bogus, since there isn't another 32-bit value there.

So I was saying that you *could* initialize a triplet struct, assign its
fields appropriately, and then compare `*va` to `triplet->foo`. But I
think setting up a struct to only bother reading the first field is
probably wasteful, hence my suggestion for a clarifying comment.

> > > Are you sure about this? As far as I know, the first parameter of such
> > > comparing functions is always a pointer to the given key that we need
> > > to search for and the second parameter points to each element of an
> > > array.
>
> Yes, that matches my understanding and the man-page for bsearch(3):
>
>   "The compar routine is expected to have two arguments which point to
>   the key object and to an array member, in that order, [...]"
>
> I think it would help to make this something like
>
>   static int triplet_cmp(const void *key, const void *array_item)
>
> to really highlight this asymmetric nature of this function, or to make
> clear how the values flow through our call-chain through something like
>
>   static int triplet_cmp(const void *commit_pos, const void *table_entry)

Yeah, that makes sense to me. I'm not too attached to either name, both
seem OK to me.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 36134222d7a..e22bbbdc60e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -82,6 +82,12 @@  struct bitmap_index {
 	/* The checksum of the packfile or MIDX; points into map. */
 	const unsigned char *checksum;
 
+	/*
+	 * If not NULL, this point into the commit table extension
+	 * (within the memory mapped region `map`).
+	 */
+	unsigned char *table_lookup;
+
 	/*
 	 * Extended index.
 	 *
@@ -185,6 +191,16 @@  static int load_bitmap_header(struct bitmap_index *index)
 			index->hashes = (void *)(index_end - cache_size);
 			index_end -= cache_size;
 		}
+
+		if (flags & BITMAP_OPT_LOOKUP_TABLE) {
+			size_t table_size = st_mult(ntohl(header->entry_count),
+						    BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH);
+			if (table_size > index_end - index->map - header_size)
+				return error(_("corrupted bitmap index file (too short to fit lookup table)"));
+			if (git_env_bool("GIT_TEST_READ_COMMIT_TABLE", 1))
+				index->table_lookup = (void *)(index_end - table_size);
+			index_end -= table_size;
+		}
 	}
 
 	index->entry_count = ntohl(header->entry_count);
@@ -211,11 +227,13 @@  static struct stored_bitmap *store_bitmap(struct bitmap_index *index,
 
 	hash_pos = kh_put_oid_map(index->bitmaps, stored->oid, &ret);
 
-	/* a 0 return code means the insertion succeeded with no changes,
-	 * because the SHA1 already existed on the map. this is bad, there
-	 * shouldn't be duplicated commits in the index */
+	/*
+	 * A 0 return code means the insertion succeeded with no changes,
+	 * because the SHA1 already existed on the map. This is bad, there
+	 * shouldn't be duplicated commits in the index.
+	 */
 	if (ret == 0) {
-		error("Duplicate entry in bitmap index: %s", oid_to_hex(oid));
+		error(_("duplicate entry in bitmap index: %s"), oid_to_hex(oid));
 		return NULL;
 	}
 
@@ -470,7 +488,7 @@  static int load_bitmap(struct bitmap_index *bitmap_git)
 		!(bitmap_git->tags = read_bitmap_1(bitmap_git)))
 		goto failed;
 
-	if (load_bitmap_entries_v1(bitmap_git) < 0)
+	if (!bitmap_git->table_lookup && load_bitmap_entries_v1(bitmap_git) < 0)
 		goto failed;
 
 	return 0;
@@ -557,13 +575,229 @@  struct include_data {
 	struct bitmap *seen;
 };
 
+struct bitmap_lookup_table_triplet {
+	uint32_t commit_pos;
+	uint64_t offset;
+	uint32_t xor_row;
+};
+
+struct bitmap_lookup_table_xor_item {
+	struct object_id oid;
+	uint64_t offset;
+};
+
+/*
+ * This function gets the raw triplet from `row`'th row in the
+ * lookup table and fills that data to the `triplet`.
+ */
+static int lookup_table_get_triplet(struct bitmap_index *bitmap_git,
+				    uint32_t pos,
+				    struct bitmap_lookup_table_triplet *triplet)
+{
+	unsigned char *p = NULL;
+	if (pos >= bitmap_git->entry_count)
+		return error(_("corrupt bitmap lookup table: triplet position out of index"));
+
+	p = bitmap_git->table_lookup + st_mult(pos, BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH);
+
+	triplet->commit_pos = get_be32(p);
+	p += sizeof(uint32_t);
+	triplet->offset = get_be64(p);
+	p += sizeof(uint64_t);
+	triplet->xor_row = get_be32(p);
+	return 0;
+}
+
+/*
+ * Searches for a matching triplet. `va` is a pointer
+ * to the wanted commit position value. `vb` points to
+ * a triplet in lookup table. The first 4 bytes of each
+ * triplet (pointed by `vb`) are compared with `*va`.
+ */
+static int triplet_cmp(const void *va, const void *vb)
+{
+
+	uint32_t a = *(uint32_t *)va;
+	uint32_t b = get_be32(vb);
+	if (a > b)
+		return 1;
+	else if (a < b)
+		return -1;
+
+	return 0;
+}
+
+static uint32_t bsearch_pos(struct bitmap_index *bitmap_git,
+			    struct object_id *oid,
+			    uint32_t *result)
+{
+	int found;
+
+	if (bitmap_is_midx(bitmap_git))
+		found = bsearch_midx(oid, bitmap_git->midx, result);
+	else
+		found = bsearch_pack(oid, bitmap_git->pack, result);
+
+	return found;
+}
+
+/*
+ * `bsearch_triplet` function searches for the raw triplet having
+ * commit position same as `commit_pos` and fills `triplet`
+ * object from the raw triplet. Returns 1 on success and 0
+ * on failure.
+ */
+static int bsearch_triplet(uint32_t *commit_pos,
+			   struct bitmap_index *bitmap_git,
+			   struct bitmap_lookup_table_triplet *triplet)
+{
+	unsigned char *p = bsearch(commit_pos, bitmap_git->table_lookup, bitmap_git->entry_count,
+				   BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH, triplet_cmp);
+
+	if (!p)
+		return 0;
+	triplet->commit_pos = get_be32(p);
+	p += sizeof(uint32_t);
+	triplet->offset = get_be64(p);
+	p += sizeof(uint64_t);
+	triplet->xor_row = get_be32(p);
+	return 1;
+}
+
+static struct stored_bitmap *lazy_bitmap_for_commit(struct bitmap_index *bitmap_git,
+					  struct commit *commit)
+{
+	uint32_t commit_pos, xor_row;
+	uint64_t offset;
+	int flags;
+	struct bitmap_lookup_table_triplet triplet;
+	struct object_id *oid = &commit->object.oid;
+	struct ewah_bitmap *bitmap;
+	struct stored_bitmap *xor_bitmap = NULL;
+
+	int found = bsearch_pos(bitmap_git, oid, &commit_pos);
+
+	if (!found)
+		return NULL;
+
+	if (!bsearch_triplet(&commit_pos, bitmap_git, &triplet))
+		return NULL;
+
+	offset = triplet.offset;
+	xor_row = triplet.xor_row;
+
+	if (xor_row != 0xffffffff) {
+		int xor_flags;
+		khiter_t hash_pos;
+		uint64_t offset_xor;
+		struct bitmap_lookup_table_xor_item *xor_items;
+		struct bitmap_lookup_table_xor_item xor_item;
+		size_t xor_items_nr = 0, xor_items_alloc = 64;
+
+		ALLOC_ARRAY(xor_items, xor_items_alloc);
+		while (xor_row != 0xffffffff) {
+			struct object_id xor_oid;
+
+			if (xor_items_nr + 1 >= bitmap_git->entry_count) {
+				free(xor_items);
+				error(_("corrupt bitmap lookup table: xor chain exceed entry count"));
+				return NULL;
+			}
+
+			if (lookup_table_get_triplet(bitmap_git, xor_row, &triplet) < 0)
+				return NULL;
+
+			offset_xor = triplet.offset;
+
+			if (nth_bitmap_object_oid(bitmap_git, &xor_oid, triplet.commit_pos) < 0) {
+				free(xor_items);
+				error(_("corrupt bitmap lookup table: commit index %u out of range"),
+					triplet.commit_pos);
+				return NULL;
+			}
+
+			hash_pos = kh_get_oid_map(bitmap_git->bitmaps, xor_oid);
+
+			/*
+			 * If desired bitmap is already stored, we don't need
+			 * to iterate further. Because we know that bitmaps
+			 * that are needed to be parsed to parse this bitmap
+			 * has already been stored. So, assign this stored bitmap
+			 * to the xor_bitmap.
+			 */
+			if (hash_pos < kh_end(bitmap_git->bitmaps) &&
+			    (xor_bitmap = kh_value(bitmap_git->bitmaps, hash_pos)))
+				break;
+
+			ALLOC_GROW(xor_items, xor_items_nr + 1, xor_items_alloc);
+			xor_items[xor_items_nr++] = (struct bitmap_lookup_table_xor_item) {.oid = xor_oid,
+											   .offset = offset_xor};
+			xor_row = triplet.xor_row;
+		}
+
+		while (xor_items_nr) {
+			xor_item = xor_items[xor_items_nr - 1];
+			offset_xor = xor_item.offset;
+
+			bitmap_git->map_pos = offset_xor;
+			if (bitmap_git->map_size - bitmap_git->map_pos < 6) {
+				error(_("corrupt ewah bitmap: truncated header for bitmap of commit \"%s\""),
+					oid_to_hex(&xor_item.oid));
+				free(xor_items);
+				return NULL;
+			}
+
+			bitmap_git->map_pos = bitmap_git->map_pos + sizeof(uint32_t) + sizeof(uint8_t);
+			xor_flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
+			bitmap = read_bitmap_1(bitmap_git);
+
+			if (!bitmap) {
+				free(xor_items);
+				return NULL;
+			}
+
+			xor_bitmap = store_bitmap(bitmap_git, bitmap, &xor_item.oid, xor_bitmap, xor_flags);
+			xor_items_nr--;
+		}
+
+		free(xor_items);
+	}
+
+	bitmap_git->map_pos = offset;
+	if (bitmap_git->map_size - bitmap_git->map_pos < 6) {
+		error(_("corrupt ewah bitmap: truncated header for bitmap of commit \"%s\""),
+			oid_to_hex(oid));
+		return NULL;
+	}
+
+	bitmap_git->map_pos = bitmap_git->map_pos + sizeof(uint32_t) + sizeof(uint8_t);
+	flags = read_u8(bitmap_git->map, &bitmap_git->map_pos);
+	bitmap = read_bitmap_1(bitmap_git);
+
+	if (!bitmap)
+		return NULL;
+
+	return store_bitmap(bitmap_git, bitmap, oid, xor_bitmap, flags);
+}
+
 struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git,
 				      struct commit *commit)
 {
 	khiter_t hash_pos = kh_get_oid_map(bitmap_git->bitmaps,
 					   commit->object.oid);
-	if (hash_pos >= kh_end(bitmap_git->bitmaps))
-		return NULL;
+	if (hash_pos >= kh_end(bitmap_git->bitmaps)) {
+		struct stored_bitmap *bitmap = NULL;
+		if (!bitmap_git->table_lookup)
+			return NULL;
+
+		trace2_region_enter("pack-bitmap", "reading_lookup_table", the_repository);
+		/* NEEDSWORK: cache misses aren't recorded */
+		bitmap = lazy_bitmap_for_commit(bitmap_git, commit);
+		trace2_region_leave("pack-bitmap", "reading_lookup_table", the_repository);
+		if (!bitmap)
+			return NULL;
+		return lookup_stored_bitmap(bitmap);
+	}
 	return lookup_stored_bitmap(kh_value(bitmap_git->bitmaps, hash_pos));
 }
 
@@ -1699,8 +1933,10 @@  void test_bitmap_walk(struct rev_info *revs)
 	if (revs->pending.nr != 1)
 		die("you must specify exactly one commit to test");
 
-	fprintf(stderr, "Bitmap v%d test (%d entries loaded)\n",
-		bitmap_git->version, bitmap_git->entry_count);
+	fprintf(stderr, "Bitmap v%d test (%d entries%s)",
+		bitmap_git->version,
+		bitmap_git->entry_count,
+		bitmap_git->table_lookup ? "" : " loaded");
 
 	root = revs->pending.objects[0].item;
 	bm = bitmap_for_commit(bitmap_git, (struct commit *)root);
@@ -1753,13 +1989,23 @@  void test_bitmap_walk(struct rev_info *revs)
 
 int test_bitmap_commits(struct repository *r)
 {
-	struct bitmap_index *bitmap_git = prepare_bitmap_git(r);
 	struct object_id oid;
 	MAYBE_UNUSED void *value;
+	struct bitmap_index *bitmap_git = prepare_bitmap_git(r);
+
+	/*
+	 * As this function is only used to print bitmap selected
+	 * commits, we don't have to read the commit table.
+	 */
 
 	if (!bitmap_git)
 		die("failed to load bitmap indexes");
 
+	if (bitmap_git->table_lookup) {
+		if (load_bitmap_entries_v1(bitmap_git) < 0)
+			die(_("failed to load bitmap indexes"));
+	}
+
 	kh_foreach(bitmap_git->bitmaps, oid, value, {
 		printf("%s\n", oid_to_hex(&oid));
 	});
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 67a9d0fc303..9278f71ac91 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -23,6 +23,15 @@  struct bitmap_disk_header {
 
 #define NEEDS_BITMAP (1u<<22)
 
+/*
+ * The width in bytes of a single triplet in the lookup table
+ * extension:
+ *     (commit_pos, offset, xor_row)
+ *
+ * whose fields ar 32-, 64-, 32- bits wide, respectively.
+ */
+#define BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH (16)
+
 enum pack_bitmap_opts {
 	BITMAP_OPT_FULL_DAG = 0x1,
 	BITMAP_OPT_HASH_CACHE = 0x4,
diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh
index c0607172827..7e50f8e7653 100755
--- a/t/t5310-pack-bitmaps.sh
+++ b/t/t5310-pack-bitmaps.sh
@@ -258,6 +258,7 @@  test_bitmap_cases () {
 
 	test_expect_success 'truncated bitmap fails gracefully (ewah)' '
 		test_config pack.writebitmaphashcache false &&
+		test_config pack.writebitmaplookuptable false &&
 		git repack -ad &&
 		git rev-list --use-bitmap-index --count --all >expect &&
 		bitmap=$(ls .git/objects/pack/*.bitmap) &&
@@ -270,6 +271,7 @@  test_bitmap_cases () {
 	'
 
 	test_expect_success 'truncated bitmap fails gracefully (cache)' '
+		git config pack.writeBitmapLookupTable '"$writeLookupTable"' &&
 		git repack -ad &&
 		git rev-list --use-bitmap-index --count --all >expect &&
 		bitmap=$(ls .git/objects/pack/*.bitmap) &&
@@ -453,4 +455,24 @@  test_expect_success 'verify writing bitmap lookup table when enabled' '
 	grep "\"label\":\"writing_lookup_table\"" trace2
 '
 
+test_expect_success 'lookup table is actually used to traverse objects' '
+	git repack -adb &&
+	GIT_TRACE2_EVENT="$(pwd)/trace3" \
+		git rev-list --use-bitmap-index --count --all &&
+	grep "\"label\":\"reading_lookup_table\"" trace3
+'
+
+test_expect_success 'truncated bitmap fails gracefully (lookup table)' '
+	test_config pack.writebitmaphashcache false &&
+	git repack -adb &&
+	git rev-list --use-bitmap-index --count --all >expect &&
+	bitmap=$(ls .git/objects/pack/*.bitmap) &&
+	test_when_finished "rm -f $bitmap" &&
+	test_copy_bytes 512 <$bitmap >$bitmap.tmp &&
+	mv -f $bitmap.tmp $bitmap &&
+	git rev-list --use-bitmap-index --count --all >actual 2>stderr &&
+	test_cmp expect actual &&
+	test_i18ngrep corrupted.bitmap.index stderr
+'
+
 test_done