diff mbox series

[v2,13/24] pack-bitmap: read multi-pack bitmaps

Message ID 7d44ba6299c06c956d5ac8ba01a0288d109c3cae.1624314293.git.me@ttaylorr.com (mailing list archive)
State New, archived
Headers show
Series multi-pack reachability bitmaps | expand

Commit Message

Taylor Blau June 21, 2021, 10:25 p.m. UTC
This prepares the code in pack-bitmap to interpret the new multi-pack
bitmaps described in Documentation/technical/bitmap-format.txt, which
mostly involves converting bit positions to accommodate looking them up
in a MIDX.

Note that there are currently no writers who write multi-pack bitmaps,
and that this will be implemented in the subsequent commit.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c |   5 +
 pack-bitmap-write.c    |   2 +-
 pack-bitmap.c          | 362 +++++++++++++++++++++++++++++++++++++----
 pack-bitmap.h          |   5 +
 packfile.c             |   2 +-
 5 files changed, 340 insertions(+), 36 deletions(-)

Comments

Jeff King July 21, 2021, 11:32 a.m. UTC | #1
On Mon, Jun 21, 2021 at 06:25:31PM -0400, Taylor Blau wrote:

> This prepares the code in pack-bitmap to interpret the new multi-pack
> bitmaps described in Documentation/technical/bitmap-format.txt, which
> mostly involves converting bit positions to accommodate looking them up
> in a MIDX.
> 
> Note that there are currently no writers who write multi-pack bitmaps,
> and that this will be implemented in the subsequent commit.

There's quite a lot going on in this one, of course, but most of it
looks right. A few hunks did puzzle me:

> @@ -302,12 +377,18 @@ static int open_pack_bitmap_1(struct bitmap_index *bitmap_git, struct packed_git
>  		return -1;
>  	}
>  
> -	if (bitmap_git->pack) {
> +	if (bitmap_git->pack || bitmap_git->midx) {
> +		/* ignore extra bitmap file; we can only handle one */
>  		warning("ignoring extra bitmap file: %s", packfile->pack_name);
>  		close(fd);
>  		return -1;
>  	}
>  
> +	if (!is_pack_valid(packfile)) {
> +		close(fd);
> +		return -1;
> +	}
> +

What's this extra is_pack_valid() doing? I wouldn't expect many changes
at all to this non-midx code path (aside from the "did we already load a
midx bitmap" in the earlier part of the hunk, which makes sense).

> -static int load_pack_bitmap(struct bitmap_index *bitmap_git)
> +static int load_reverse_index(struct bitmap_index *bitmap_git)
> +{
> +	if (bitmap_is_midx(bitmap_git)) {
> +		uint32_t i;
> +		int ret;
> +
> +		ret = load_midx_revindex(bitmap_git->midx);
> +		if (ret)
> +			return ret;
> +
> +		for (i = 0; i < bitmap_git->midx->num_packs; i++) {
> +			if (prepare_midx_pack(the_repository, bitmap_git->midx, i))
> +				die(_("load_reverse_index: could not open pack"));
> +			ret = load_pack_revindex(bitmap_git->midx->packs[i]);
> +			if (ret)
> +				return ret;
> +		}
> +		return 0;
> +	}
> +	return load_pack_revindex(bitmap_git->pack);
> +}

OK, this new function is used in load_bitmap(), which is used for both
pack and midx bitmaps. So if we have a midx bitmap, we'll
unconditionally load the revindex here. But:

  - why do we then load individual pack revindexes? I can believe it may
    be necessary to meet the assumptions of some other part of the code,
    but it would be nice to have a comment giving us some clue.

  - in open_midx_bitmap_1(), we also unconditionally load the midx
    reverse index. I think that will always happen before us here (we
    cannot load_bitmap() a bitmap that has not been opened). So is this
    load_midx_revindex() call always a noop?

> +static int open_bitmap(struct repository *r,
> +		       struct bitmap_index *bitmap_git)
> +{
> +	assert(!bitmap_git->map);
> +
> +	if (!open_midx_bitmap(r, bitmap_git))
> +		return 0;
> +	return open_pack_bitmap(r, bitmap_git);
> +}

We always prefer a midx bitmap over a pack one. That makes sense, since
that means we can leave old pack bitmaps in place when generating midx
bitmaps, if we choose to.

>  static int bitmap_position(struct bitmap_index *bitmap_git,
>  			   const struct object_id *oid)
>  {
> -	int pos = bitmap_position_packfile(bitmap_git, oid);
> +	int pos;
> +	if (bitmap_is_midx(bitmap_git))
> +		pos = bitmap_position_midx(bitmap_git, oid);
> +	else
> +		pos = bitmap_position_packfile(bitmap_git, oid);
>  	return (pos >= 0) ? pos : bitmap_position_extended(bitmap_git, oid);
>  }

Makes sense. Not new in your patch, but this "int" return is fudging the
same 32-bit space we were talking about elsewhere (i.e., "pos" really
could be 2^32, or even more due to extended objects).

In practice I think even 2^31 objects is pretty out-of-reach, but it may
be worth changing the return type (and the callers), or even just
catching the overflow with an assertion.

> @@ -752,8 +911,13 @@ static int in_bitmapped_pack(struct bitmap_index *bitmap_git,
>  		struct object *object = roots->item;
>  		roots = roots->next;
>  
> -		if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
> -			return 1;
> +		if (bitmap_is_midx(bitmap_git)) {
> +			if (bsearch_midx(&object->oid, bitmap_git->midx, NULL))
> +				return 1;
> +		} else {
> +			if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
> +				return 1;
> +		}
>  	}

Makes sense. TBH, I am not sure this in_bitmapped_pack() function is all
that useful. It is used only as part of a heuristic to avoid bitmaps
when we don't have coverage of any "have" commits. But I'm not sure that
heuristic is actually useful.

Anyway, we should definitely not get into ripping it out here. This
series is complicated enough. :) Just a note for possible future work.

>  	if (pos < bitmap_num_objects(bitmap_git)) {
> -		off_t ofs = pack_pos_to_offset(pack, pos);
> +		struct packed_git *pack;
> +		off_t ofs;
> +
> +		if (bitmap_is_midx(bitmap_git)) {
> +			uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
> +			uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> +
> +			pack = bitmap_git->midx->packs[pack_id];
> +			ofs = nth_midxed_offset(bitmap_git->midx, midx_pos);
> +		} else {
> +			pack = bitmap_git->pack;
> +			ofs = pack_pos_to_offset(pack, pos);
> +		}
> +

All of the hunks like this make perfect sense. The big problem would be
if we _missed_ a place that needed conversion to handle midx. But the
nice thing is that it would segfault quickly in such an instance. So
there I'm mostly relying on test coverage, plus our experience running
with this code at scale.

>  static void try_partial_reuse(struct bitmap_index *bitmap_git,
> +			      struct packed_git *pack,
>  			      size_t pos,
>  			      struct bitmap *reuse,
>  			      struct pack_window **w_curs)
>  {
> -	off_t offset, header;
> +	off_t offset, delta_obj_offset;

I'm OK with all of this in one big patch. But I suspect you _could_
just put:

  if (bitmap_git->midx)
	return; /* partial reuse not implemented for midx yet */

to start with, and then actually implement it later. I call out this
code in particular just because it's got a lot of subtleties (the
"reuse" bits are much more intimate with the assumptions of packs and
bitmaps than most other code).

I'm not sure if it's worth the trouble at this point or not.

>  	enum object_type type;
>  	unsigned long size;
>  
> -	if (pos >= bitmap_num_objects(bitmap_git))
> -		return; /* not actually in the pack or MIDX */
> +	/*
> +	 * try_partial_reuse() is called either on (a) objects in the
> +	 * bitmapped pack (in the case of a single-pack bitmap) or (b)
> +	 * objects in the preferred pack of a multi-pack bitmap.
> +	 * Importantly, the latter can pretend as if only a single pack
> +	 * exists because:
> +	 *
> +	 *   - The first pack->num_objects bits of a MIDX bitmap are
> +	 *     reserved for the preferred pack, and
> +	 *
> +	 *   - Ties due to duplicate objects are always resolved in
> +	 *     favor of the preferred pack.
> +	 *
> +	 * Therefore we do not need to ever ask the MIDX for its copy of
> +	 * an object by OID, since it will always select it from the
> +	 * preferred pack. Likewise, the selected copy of the base
> +	 * object for any deltas will reside in the same pack.
> +	 *
> +	 * This means that we can reuse pos when looking up the bit in
> +	 * the reuse bitmap, too, since bits corresponding to the
> +	 * preferred pack precede all bits from other packs.
> +	 */
>  
> -	offset = header = pack_pos_to_offset(bitmap_git->pack, pos);
> -	type = unpack_object_header(bitmap_git->pack, w_curs, &offset, &size);
> +	if (pos >= pack->num_objects)
> +		return; /* not actually in the pack or MIDX preferred pack */

It feels weird to go from bitmap_num_objects() back to
pack->num_objects. But I agree it's the right thing for the "pretend as
if only a single pack exists" reasons given above.

> +static uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git)
> +{
> +	struct multi_pack_index *m = bitmap_git->midx;
> +	if (!m)
> +		BUG("midx_preferred_pack: requires non-empty MIDX");
> +	return nth_midxed_pack_int_id(m, pack_pos_to_midx(bitmap_git->midx, 0));
> +}

This part is really subtle. We infer the preferred pack by looking at
the pack of the 0th bit position. In general that works, since that's
part of the definition of the preferred pack.

Could this ever be fooled if we had a preferred pack with 0 objects in
it? I don't know why we would have such a thing, but just trying to
think of cases where our assumptions might not hold (and what bad things
could happen).

> +	if (bitmap_is_midx(bitmap_git))
> +		pack = bitmap_git->midx->packs[midx_preferred_pack(bitmap_git)];
> +	else
> +		pack = bitmap_git->pack;
> +	objects_nr = pack->num_objects;
> +
>  	while (i < result->word_alloc && result->words[i] == (eword_t)~0)
>  		i++;
>  
> -	/* Don't mark objects not in the packfile */
> +	/*
> +	 * Don't mark objects not in the packfile or preferred pack. This bitmap
> +	 * marks objects eligible for reuse, but the pack-reuse code only
> +	 * understands how to reuse a single pack. Since the preferred pack is
> +	 * guaranteed to have all bases for its deltas (in a multi-pack bitmap),
> +	 * we use it instead of another pack. In single-pack bitmaps, the choice
> +	 * is made for us.
> +	 */
>  	if (i > objects_nr / BITS_IN_EWORD)
>  		i = objects_nr / BITS_IN_EWORD;

OK, so this clamps our "quick" contiguous set of bits to the number of
objects in the preferred pack. Makes sense. And then we hit the
object-by-object loop below...

> @@ -1213,7 +1437,15 @@ int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
>  				break;
>  
>  			offset += ewah_bit_ctz64(word >> offset);
> -			try_partial_reuse(bitmap_git, pos + offset, reuse, &w_curs);
> +			if (bitmap_is_midx(bitmap_git)) {
> +				/*
> +				 * Can't reuse from a non-preferred pack (see
> +				 * above).
> +				 */
> +				if (pos + offset >= objects_nr)
> +					continue;
> +			}
> +			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);

...and this likewise makes sure we never go past that first pack. Good.

I think this "continue" could actually be a "break", as the loop is
iterating over "offset" (and "pos + offset" always gets larger). In
fact, it could break out of the outer loop as well (which is
incrementing "pos"). It's probably a pretty small efficiency in
practice, though.

> @@ -1511,8 +1749,13 @@ uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
>  		struct object_id oid;
>  		struct object_entry *oe;
>  
> -		nth_packed_object_id(&oid, bitmap_git->pack,
> -				     pack_pos_to_index(bitmap_git->pack, i));
> +		if (bitmap_is_midx(bitmap_git))
> +			nth_midxed_object_oid(&oid,
> +					      bitmap_git->midx,
> +					      pack_pos_to_midx(bitmap_git->midx, i));
> +		else
> +			nth_packed_object_id(&oid, bitmap_git->pack,
> +					     pack_pos_to_index(bitmap_git->pack, i));
>  		oe = packlist_find(mapping, &oid);

Could this be using nth_bitmap_object_oid()? I guess not, because we are
feeding from pack_pos_to_*. I'm not sure if another helper function is
worth it (pack_pos_to_bitmap_index() or something?).

> @@ -1575,7 +1831,31 @@ static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
>  				break;
>  
>  			offset += ewah_bit_ctz64(word >> offset);
> -			pos = base + offset;
> +
> +			if (bitmap_is_midx(bitmap_git)) {
> +				uint32_t pack_pos;
> +				uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, base + offset);
> +				uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> +				off_t offset = nth_midxed_offset(bitmap_git->midx, midx_pos);
> +
> +				pack = bitmap_git->midx->packs[pack_id];
> +
> +				if (offset_to_pack_pos(pack, offset, &pack_pos) < 0) {
> +					struct object_id oid;
> +					nth_midxed_object_oid(&oid, bitmap_git->midx, midx_pos);
> +
> +					die(_("could not find %s in pack #%"PRIu32" at offset %"PRIuMAX),
> +					    oid_to_hex(&oid),
> +					    pack_id,
> +					    (uintmax_t)offset);
> +				}
> +
> +				pos = pack_pos;
> +			} else {
> +				pack = bitmap_git->pack;
> +				pos = base + offset;
> +			}
> +
>  			total += pack_pos_to_offset(pack, pos + 1) -
>  				 pack_pos_to_offset(pack, pos);
>  		}

In the midx case, we have to go from midx-bitmap-pos to midx-index-pos,
to then get the pack/ofs combo, which then gives us a real "pos" in the
pack. I don't think there's a faster way to do it (and this is still
much faster than looking up objects in the pack only to check their
revindex).

But then with the result, we compare the offset of "pos" and "pos + 1".
We need to know "pos" to find "pos + 1". But in the midx case, don't we
already have the offset of "pos" (it is "offset" in the bitmap_is_midx()
conditional, which is shadowing the completely unrelated "offset" in the
outer loop).

We could reuse it, saving ourselves an extra round-trip of pack_pos to
index_pos to offset. It would just mean stuffing the "total +=" line
into the two sides of the conditional.

> +off_t bitmap_pack_offset(struct bitmap_index *bitmap_git, uint32_t pos)
> +{
> +	if (bitmap_is_midx(bitmap_git))
> +		return nth_midxed_offset(bitmap_git->midx,
> +					 pack_pos_to_midx(bitmap_git->midx, pos));
> +	return nth_packed_object_offset(bitmap_git->pack,
> +					pack_pos_to_index(bitmap_git->pack, pos));
> +}

Does anybody call this function? I don't see any users by the end of the
series.

-Peff
Taylor Blau July 21, 2021, 11:01 p.m. UTC | #2
On Wed, Jul 21, 2021 at 07:32:49AM -0400, Jeff King wrote:
> On Mon, Jun 21, 2021 at 06:25:31PM -0400, Taylor Blau wrote:
> > +	if (!is_pack_valid(packfile)) {
> > +		close(fd);
> > +		return -1;
> > +	}
> > +
>
> What's this extra is_pack_valid() doing? I wouldn't expect many changes
> at all to this non-midx code path (aside from the "did we already load a
> midx bitmap" in the earlier part of the hunk, which makes sense).

That looks like a mistake to me. I did a little digging and tried to
remember if it could have ever been useful, but I think that it's just a
stray change that has no value. Removed.

> > -static int load_pack_bitmap(struct bitmap_index *bitmap_git)
> > +static int load_reverse_index(struct bitmap_index *bitmap_git)
> > +{
> > +	if (bitmap_is_midx(bitmap_git)) {
> > +		uint32_t i;
> > +		int ret;
> > +
> > +		ret = load_midx_revindex(bitmap_git->midx);
> > +		if (ret)
> > +			return ret;
> > +
> > +		for (i = 0; i < bitmap_git->midx->num_packs; i++) {
> > +			if (prepare_midx_pack(the_repository, bitmap_git->midx, i))
> > +				die(_("load_reverse_index: could not open pack"));
> > +			ret = load_pack_revindex(bitmap_git->midx->packs[i]);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +		return 0;
> > +	}
> > +	return load_pack_revindex(bitmap_git->pack);
> > +}
>
> OK, this new function is used in load_bitmap(), which is used for both
> pack and midx bitmaps. So if we have a midx bitmap, we'll
> unconditionally load the revindex here. But:
>
>   - why do we then load individual pack revindexes? I can believe it may
>     be necessary to meet the assumptions of some other part of the code,
>     but it would be nice to have a comment giving us some clue.

Good suggestion. We will need to reference the reverse index belonging
to individual packs in a few locations in pack-objects (for e.g.,
write_reuse_object() calls offset_to_pack_pos(), and
pack_pos_to_offset(), both with arbitrary packs, not just the preferred
one).

I left the comment vague; something along the lines of "lots of routines
in pack-objects will need these structures to be ready to use".

I think there's room for improvement there, since for e.g., `git
rev-list --count --objects --use-bitmap-index` doesn't need to load the
reverse indexes. But that's already the case with classic bitmaps, too,
which eagerly call load_pack_revindex().

>   - in open_midx_bitmap_1(), we also unconditionally load the midx
>     reverse index. I think that will always happen before us here (we
>     cannot load_bitmap() a bitmap that has not been opened). So is this
>     load_midx_revindex() call always a noop?

Great catch. I removed the call to load_midx_revindex(), and replaced it
with a comment explaining why we don't need to call it (because we
already did).

> >  static int bitmap_position(struct bitmap_index *bitmap_git,
> >  			   const struct object_id *oid)
> >  {
> > -	int pos = bitmap_position_packfile(bitmap_git, oid);
> > +	int pos;
> > +	if (bitmap_is_midx(bitmap_git))
> > +		pos = bitmap_position_midx(bitmap_git, oid);
> > +	else
> > +		pos = bitmap_position_packfile(bitmap_git, oid);
> >  	return (pos >= 0) ? pos : bitmap_position_extended(bitmap_git, oid);
> >  }
>
> Makes sense. Not new in your patch, but this "int" return is fudging the
> same 32-bit space we were talking about elsewhere (i.e., "pos" really
> could be 2^32, or even more due to extended objects).

:-). It bothers me to no end, too, because of all of the recent effort
to improve the reverse-index APIs to avoid exactly this issue. But I
tend to agree that the concern is more theoretical than anything,
because we're only using the MSB, so the remaining 2^31 possible objects
still seems pretty generous.

> In practice I think even 2^31 objects is pretty out-of-reach, but it may
> be worth changing the return type (and the callers), or even just
> catching the overflow with an assertion.

Possibly, but keep in mind that the former is basically the same
refactor as we did with the "tell me whether this object was found via
this extra pointer". But bitmap_position() has a lot more callers than
that, so the plumbing required would be a little more prevalent.

So I'd be content to just punt on it for now, if you'd be OK with it.

> >  	if (pos < bitmap_num_objects(bitmap_git)) {
> > -		off_t ofs = pack_pos_to_offset(pack, pos);
> > +		struct packed_git *pack;
> > +		off_t ofs;
> > +
> > +		if (bitmap_is_midx(bitmap_git)) {
> > +			uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
> > +			uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> > +
> > +			pack = bitmap_git->midx->packs[pack_id];
> > +			ofs = nth_midxed_offset(bitmap_git->midx, midx_pos);
> > +		} else {
> > +			pack = bitmap_git->pack;
> > +			ofs = pack_pos_to_offset(pack, pos);
> > +		}
> > +
>
> All of the hunks like this make perfect sense. The big problem would be
> if we _missed_ a place that needed conversion to handle midx. But the
> nice thing is that it would segfault quickly in such an instance. So
> there I'm mostly relying on test coverage, plus our experience running
> with this code at scale.

Yeah; I'm definitely happy to rely on our experience running this and
related patches at GitHub for several months to give us confidence that
we didn't miss anything here.

> >  static void try_partial_reuse(struct bitmap_index *bitmap_git,
> > +			      struct packed_git *pack,
> >  			      size_t pos,
> >  			      struct bitmap *reuse,
> >  			      struct pack_window **w_curs)
> >  {
> > -	off_t offset, header;
> > +	off_t offset, delta_obj_offset;
>
> I'm OK with all of this in one big patch. But I suspect you _could_
> just put:
>
>   if (bitmap_git->midx)
> 	return; /* partial reuse not implemented for midx yet */
>
> to start with, and then actually implement it later. I call out this
> code in particular just because it's got a lot of subtleties (the
> "reuse" bits are much more intimate with the assumptions of packs and
> bitmaps than most other code).
>
> I'm not sure if it's worth the trouble at this point or not.

Yeah, I'd definitely err on the side of not splitting this up now,
especially since you've already gone through the whole patch and
reviewed it. (Of course, if your response was "this patch is way too
big, please split it up so I can more easily review it", that would be a
different story).

But I appreciate the advice, since I have felt that a lot of these
format-level changes require a 3-patch arc where:

  - The first patch describes the new format in Documentation/technical.
  - The second patch implements support for reading files that are
    written in the new format.
  - And finally, the third patch implements support for writing such
    files.

...and it's usually the second of those three patches that is the most
complicated one by far. So this is a good way to split that patch up
into many pieces.

Of course, that only works if you delay adding tests until after support
is added for all parts of the new format, but that's more-or-less what
did here.

> > +static uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git)
> > +{
> > +	struct multi_pack_index *m = bitmap_git->midx;
> > +	if (!m)
> > +		BUG("midx_preferred_pack: requires non-empty MIDX");
> > +	return nth_midxed_pack_int_id(m, pack_pos_to_midx(bitmap_git->midx, 0));
> > +}
>
> This part is really subtle. We infer the preferred pack by looking at
> the pack of the 0th bit position. In general that works, since that's
> part of the definition of the preferred pack.
>
> Could this ever be fooled if we had a preferred pack with 0 objects in
> it? I don't know why we would have such a thing, but just trying to
> think of cases where our assumptions might not hold (and what bad things
> could happen).

An empty preferred pack would cause a problem, yes. The solution is
two-fold (and incorporated into the reroll that I plan on sending
shortly):

  - When the user specifies --preferred-pack, the MIDX code must make
    sure that the given pack is non-empty. That's a new patch, and
    basically adds a new conditional (to check the pack itself) and a
    test (to make sure that we catch the case we are trying to prevent).

  - When the user doesn't specify --preferred-pack (and instead asks us
    to infer one for them) we want to select not just the oldest pack,
    but the oldest *non-empty* pack. That is folded into the "midx:
    infer preferred pack when not given one" patch.

In that patch, I made a note, but I think that it's subtle enough to
merit sharing here again. In the loop over all packs, the conditional for swapping out the oldest pack for the current one was something like:

    if (p->mtime < oldest->mtime)
      oldest = p;

but now we want it to be:

    if (!oldest->num_objects || p->mtime < oldest->mtime)
      oldest = p;

to reject packs that have no objects. And we want to be extra careful in
the case where the only pack fed to the MIDX writer was empty. But we
don't have to do anything there, since there are no objects to write
anyway, so any "preferred_idx" would be fine.

> > +	if (bitmap_is_midx(bitmap_git))
> > +		pack = bitmap_git->midx->packs[midx_preferred_pack(bitmap_git)];
> > +	else
> > +		pack = bitmap_git->pack;
> > +	objects_nr = pack->num_objects;
> > +
> >  	while (i < result->word_alloc && result->words[i] == (eword_t)~0)
> >  		i++;
> >
> > -	/* Don't mark objects not in the packfile */
> > +	/*
> > +	 * Don't mark objects not in the packfile or preferred pack. This bitmap
> > +	 * marks objects eligible for reuse, but the pack-reuse code only
> > +	 * understands how to reuse a single pack. Since the preferred pack is
> > +	 * guaranteed to have all bases for its deltas (in a multi-pack bitmap),
> > +	 * we use it instead of another pack. In single-pack bitmaps, the choice
> > +	 * is made for us.
> > +	 */
> >  	if (i > objects_nr / BITS_IN_EWORD)
> >  		i = objects_nr / BITS_IN_EWORD;
>
> OK, so this clamps our "quick" contiguous set of bits to the number of
> objects in the preferred pack. Makes sense. And then we hit the
> object-by-object loop below...
>
> > @@ -1213,7 +1437,15 @@ int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
> >  				break;
> >
> >  			offset += ewah_bit_ctz64(word >> offset);
> > -			try_partial_reuse(bitmap_git, pos + offset, reuse, &w_curs);
> > +			if (bitmap_is_midx(bitmap_git)) {
> > +				/*
> > +				 * Can't reuse from a non-preferred pack (see
> > +				 * above).
> > +				 */
> > +				if (pos + offset >= objects_nr)
> > +					continue;
> > +			}
> > +			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);
>
> ...and this likewise makes sure we never go past that first pack. Good.
>
> I think this "continue" could actually be a "break", as the loop is
> iterating over "offset" (and "pos + offset" always gets larger). In
> fact, it could break out of the outer loop as well (which is
> incrementing "pos"). It's probably a pretty small efficiency in
> practice, though.

Yeah; you're right. And we'll save up to BITS_IN_EWORD cycles of this
loop. (I wonder if smart-enough compilers will realize the same
optimization that you did and turn that `continue` into a `break`
automatically, but that's neither here nor there).

> > @@ -1511,8 +1749,13 @@ uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
> >  		struct object_id oid;
> >  		struct object_entry *oe;
> >
> > -		nth_packed_object_id(&oid, bitmap_git->pack,
> > -				     pack_pos_to_index(bitmap_git->pack, i));
> > +		if (bitmap_is_midx(bitmap_git))
> > +			nth_midxed_object_oid(&oid,
> > +					      bitmap_git->midx,
> > +					      pack_pos_to_midx(bitmap_git->midx, i));
> > +		else
> > +			nth_packed_object_id(&oid, bitmap_git->pack,
> > +					     pack_pos_to_index(bitmap_git->pack, i));
> >  		oe = packlist_find(mapping, &oid);
>
> Could this be using nth_bitmap_object_oid()? I guess not, because we are
> feeding from pack_pos_to_*. I'm not sure if another helper function is
> worth it (pack_pos_to_bitmap_index() or something?).

You're right that we can't call nth_bitmap_object_oid here directly,
sadly. But I think your suggestion for pack_pos_to_bitmap_index() (or
similar) would only benefit this caller, since most places that dispatch
conditionally to either pack_pos_to_{midx,index} want to pass the result
to a different function depending on which branch they took.

Definitely possible that I missed another case that would help, but that
was what I came up with after just a quick glance.

> > @@ -1575,7 +1831,31 @@ static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
> >  				break;
> >
> >  			offset += ewah_bit_ctz64(word >> offset);
> > -			pos = base + offset;
> > +
> > +			if (bitmap_is_midx(bitmap_git)) {
> > +				uint32_t pack_pos;
> > +				uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, base + offset);
> > +				uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
> > +				off_t offset = nth_midxed_offset(bitmap_git->midx, midx_pos);
> > +
> > +				pack = bitmap_git->midx->packs[pack_id];
> > +
> > +				if (offset_to_pack_pos(pack, offset, &pack_pos) < 0) {
> > +					struct object_id oid;
> > +					nth_midxed_object_oid(&oid, bitmap_git->midx, midx_pos);
> > +
> > +					die(_("could not find %s in pack #%"PRIu32" at offset %"PRIuMAX),
> > +					    oid_to_hex(&oid),
> > +					    pack_id,
> > +					    (uintmax_t)offset);
> > +				}
> > +
> > +				pos = pack_pos;
> > +			} else {
> > +				pack = bitmap_git->pack;
> > +				pos = base + offset;
> > +			}
> > +
> >  			total += pack_pos_to_offset(pack, pos + 1) -
> >  				 pack_pos_to_offset(pack, pos);
> >  		}
>
> In the midx case, we have to go from midx-bitmap-pos to midx-index-pos,
> to then get the pack/ofs combo, which then gives us a real "pos" in the
> pack. I don't think there's a faster way to do it (and this is still
> much faster than looking up objects in the pack only to check their
> revindex).
>
> But then with the result, we compare the offset of "pos" and "pos + 1".
> We need to know "pos" to find "pos + 1". But in the midx case, don't we
> already have the offset of "pos" (it is "offset" in the bitmap_is_midx()
> conditional, which is shadowing the completely unrelated "offset" in the
> outer loop).
>
> We could reuse it, saving ourselves an extra round-trip of pack_pos to
> index_pos to offset. It would just mean stuffing the "total +=" line
> into the two sides of the conditional.

Yep; agreed. And it allows us to clean up a few little other things, so
I squashed it in. Thanks for the suggestion!

> > +off_t bitmap_pack_offset(struct bitmap_index *bitmap_git, uint32_t pos)
> > +{
> > +	if (bitmap_is_midx(bitmap_git))
> > +		return nth_midxed_offset(bitmap_git->midx,
> > +					 pack_pos_to_midx(bitmap_git->midx, pos));
> > +	return nth_packed_object_offset(bitmap_git->pack,
> > +					pack_pos_to_index(bitmap_git->pack, pos));
> > +}
>
> Does anybody call this function? I don't see any users by the end of the
> series.

Nope, great catch. I looked at callers of nth_midxed_offset and
nth_packed_object_offset to see if they could use this function and
weren't, but the spots I looked at didn't appear to be able that they
would be helped by the existence of this helper, so I just removed it.

Phew! This was quite the email to respond to, but I suppose that's my
fault for writing such a monstrous patch. Thank you for taking the time
to read through it all so carefully. I think you got the short end of
the stick between writing this email and responding to it, so thank you
:-).

Thanks,
Taylor
Jeff King July 23, 2021, 9:40 a.m. UTC | #3
On Wed, Jul 21, 2021 at 07:01:16PM -0400, Taylor Blau wrote:

> On Wed, Jul 21, 2021 at 07:32:49AM -0400, Jeff King wrote:
> > On Mon, Jun 21, 2021 at 06:25:31PM -0400, Taylor Blau wrote:
> > > +	if (!is_pack_valid(packfile)) {
> > > +		close(fd);
> > > +		return -1;
> > > +	}
> > > +
> >
> > What's this extra is_pack_valid() doing? I wouldn't expect many changes
> > at all to this non-midx code path (aside from the "did we already load a
> > midx bitmap" in the earlier part of the hunk, which makes sense).
> 
> That looks like a mistake to me. I did a little digging and tried to
> remember if it could have ever been useful, but I think that it's just a
> stray change that has no value. Removed.

This turned out to be quite interesting. It _is_ a mistake to include it
in this series. But it turns out to be quite valuable on its own. :)

I just cleaned it up and sent it as its own separate patch:

  https://lore.kernel.org/git/YPqL%2FpZt6hNYN4hB@coredump.intra.peff.net/

So it's a happy accident that your series called attention to it. :)

-Peff
Jeff King July 23, 2021, 10 a.m. UTC | #4
On Wed, Jul 21, 2021 at 07:01:16PM -0400, Taylor Blau wrote:

> > OK, this new function is used in load_bitmap(), which is used for both
> > pack and midx bitmaps. So if we have a midx bitmap, we'll
> > unconditionally load the revindex here. But:
> >
> >   - why do we then load individual pack revindexes? I can believe it may
> >     be necessary to meet the assumptions of some other part of the code,
> >     but it would be nice to have a comment giving us some clue.
> 
> Good suggestion. We will need to reference the reverse index belonging
> to individual packs in a few locations in pack-objects (for e.g.,
> write_reuse_object() calls offset_to_pack_pos(), and
> pack_pos_to_offset(), both with arbitrary packs, not just the preferred
> one).
> 
> I left the comment vague; something along the lines of "lots of routines
> in pack-objects will need these structures to be ready to use".

Makes sense. I think we _could_ be lazy-loading them, but IIRC only some
of the revindex functions are happy to lazy-load. It's definitely fine
to punt on that for now with a comment.

> I think there's room for improvement there, since for e.g., `git
> rev-list --count --objects --use-bitmap-index` doesn't need to load the
> reverse indexes. But that's already the case with classic bitmaps, too,
> which eagerly call load_pack_revindex().

Right. I think our solution there was to make loading the revindex
really cheap (open+mmap, rather than the in-core generation). I'm
definitely happy to call that fast enough for now, and if somebody wants
to benchmark and micro-optimize cases where we can avoid loading them,
we can do that later.

> > In practice I think even 2^31 objects is pretty out-of-reach, but it may
> > be worth changing the return type (and the callers), or even just
> > catching the overflow with an assertion.
> 
> Possibly, but keep in mind that the former is basically the same
> refactor as we did with the "tell me whether this object was found via
> this extra pointer". But bitmap_position() has a lot more callers than
> that, so the plumbing required would be a little more prevalent.
> 
> So I'd be content to just punt on it for now, if you'd be OK with it.

Yeah, I think it's fine to leave it out of this series. It's not new,
and we can revisit it later.

> > Could this ever be fooled if we had a preferred pack with 0 objects in
> > it? I don't know why we would have such a thing, but just trying to
> > think of cases where our assumptions might not hold (and what bad things
> > could happen).
> 
> An empty preferred pack would cause a problem, yes. The solution is
> two-fold (and incorporated into the reroll that I plan on sending
> shortly):
> 
>   - When the user specifies --preferred-pack, the MIDX code must make
>     sure that the given pack is non-empty. That's a new patch, and
>     basically adds a new conditional (to check the pack itself) and a
>     test (to make sure that we catch the case we are trying to prevent).
> 
>   - When the user doesn't specify --preferred-pack (and instead asks us
>     to infer one for them) we want to select not just the oldest pack,
>     but the oldest *non-empty* pack. That is folded into the "midx:
>     infer preferred pack when not given one" patch.

Oh good, I said something useful. ;) The fix you outlined sounds
sensible.

> > > +			if (bitmap_is_midx(bitmap_git)) {
> > > +				/*
> > > +				 * Can't reuse from a non-preferred pack (see
> > > +				 * above).
> > > +				 */
> > > +				if (pos + offset >= objects_nr)
> > > +					continue;
> > > +			}
> > > +			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);
> >
> > ...and this likewise makes sure we never go past that first pack. Good.
> >
> > I think this "continue" could actually be a "break", as the loop is
> > iterating over "offset" (and "pos + offset" always gets larger). In
> > fact, it could break out of the outer loop as well (which is
> > incrementing "pos"). It's probably a pretty small efficiency in
> > practice, though.
> 
> Yeah; you're right. And we'll save up to BITS_IN_EWORD cycles of this
> loop. (I wonder if smart-enough compilers will realize the same
> optimization that you did and turn that `continue` into a `break`
> automatically, but that's neither here nor there).

If you break all the way out, then it saves iterating over all of those
other words that are not in the first pack, too. I.e., if your bitmap
has 10 million bits (for a 10-million object clone), but your first pack
only has a million objects in it, we'll call try_partial_reuse() 9
million extra times.

Fortunately, each call is super cheap, because the first thing it does
is check if the requested bit is past the end of the pack. Which kind of
makes me wonder if we could simplify this further by just letting
try_partial_reuse() tell us when there's no point going further:

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 02948e8c78..b84b55c4f3 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1308,7 +1308,11 @@ struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 	return NULL;
 }
 
-static void try_partial_reuse(struct bitmap_index *bitmap_git,
+/*
+ * -1 means "stop trying further objects"; 0 means we may or may not have
+ * reused, but you can keep feeding bits.
+ */
+static int try_partial_reuse(struct bitmap_index *bitmap_git,
 			      struct packed_git *pack,
 			      size_t pos,
 			      struct bitmap *reuse,
@@ -1342,12 +1346,12 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
 	 */
 
 	if (pos >= pack->num_objects)
-		return; /* not actually in the pack or MIDX preferred pack */
+		return -1; /* not actually in the pack or MIDX preferred pack */
 
 	offset = delta_obj_offset = pack_pos_to_offset(pack, pos);
 	type = unpack_object_header(pack, w_curs, &offset, &size);
 	if (type < 0)
-		return; /* broken packfile, punt */
+		return -1; /* broken packfile, punt */
 
 	if (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA) {
 		off_t base_offset;
@@ -1364,9 +1368,9 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
 		base_offset = get_delta_base(pack, w_curs, &offset, type,
 					     delta_obj_offset);
 		if (!base_offset)
-			return;
+			return 0;
 		if (offset_to_pack_pos(pack, base_offset, &base_pos) < 0)
-			return;
+			return 0;
 
 		/*
 		 * We assume delta dependencies always point backwards. This
@@ -1378,7 +1382,7 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
 		 * odd parameters.
 		 */
 		if (base_pos >= pos)
-			return;
+			return 0;
 
 		/*
 		 * And finally, if we're not sending the base as part of our
@@ -1389,13 +1393,14 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
 		 * object_entry code path handle it.
 		 */
 		if (!bitmap_get(reuse, base_pos))
-			return;
+			return 0;
 	}
 
 	/*
 	 * If we got here, then the object is OK to reuse. Mark it.
 	 */
 	bitmap_set(reuse, pos);
+	return 0;
 }
 
 static uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git)
@@ -1449,22 +1454,20 @@ int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 	for (; i < result->word_alloc; ++i) {
 		eword_t word = result->words[i];
 		size_t pos = (i * BITS_IN_EWORD);
+		int ret;
 
 		for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
 			if ((word >> offset) == 0)
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
-			if (bitmap_is_midx(bitmap_git)) {
-				/*
-				 * Can't reuse from a non-preferred pack (see
-				 * above).
-				 */
-				if (pos + offset >= objects_nr)
-					continue;
-			}
-			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);
+			ret = try_partial_reuse(bitmap_git, pack, pos + offset,
+						reuse, &w_curs);
+			if (ret < 0)
+				break;
 		}
+		if (ret < 0)
+			break;
 	}
 
 	unuse_pack(&w_curs);

The double-ret check is kind of ugly, though I suspect compilers
optimize it pretty well. The alternative is a "goto" to a label just
past the loop (also ugly, but easily explained with a comment).

> > > @@ -1511,8 +1749,13 @@ uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
> > >  		struct object_id oid;
> > >  		struct object_entry *oe;
> > >
> > > -		nth_packed_object_id(&oid, bitmap_git->pack,
> > > -				     pack_pos_to_index(bitmap_git->pack, i));
> > > +		if (bitmap_is_midx(bitmap_git))
> > > +			nth_midxed_object_oid(&oid,
> > > +					      bitmap_git->midx,
> > > +					      pack_pos_to_midx(bitmap_git->midx, i));
> > > +		else
> > > +			nth_packed_object_id(&oid, bitmap_git->pack,
> > > +					     pack_pos_to_index(bitmap_git->pack, i));
> > >  		oe = packlist_find(mapping, &oid);
> >
> > Could this be using nth_bitmap_object_oid()? I guess not, because we are
> > feeding from pack_pos_to_*. I'm not sure if another helper function is
> > worth it (pack_pos_to_bitmap_index() or something?).
> 
> You're right that we can't call nth_bitmap_object_oid here directly,
> sadly. But I think your suggestion for pack_pos_to_bitmap_index() (or
> similar) would only benefit this caller, since most places that dispatch
> conditionally to either pack_pos_to_{midx,index} want to pass the result
> to a different function depending on which branch they took.
> 
> Definitely possible that I missed another case that would help, but that
> was what I came up with after just a quick glance.

Yeah, looking around, I don't see another opportunity. So the benefits
are pretty minimal. We could do:

  index_pos = bitmap_is_midx(bitmap_git) ?
              pack_pos_to_midx(bitmap_git->midx, i) :
	      pack_pos_to_index(bitmap_git->pack, i);
  nth_bitmap_object_oid(&oid, bitmap_git, index_pos);

but that is not buying much. I'm content to leave it.

-Peff
Taylor Blau July 26, 2021, 8:36 p.m. UTC | #5
On Fri, Jul 23, 2021 at 06:00:47AM -0400, Jeff King wrote:
> On Wed, Jul 21, 2021 at 07:01:16PM -0400, Taylor Blau wrote:
> > > > +			if (bitmap_is_midx(bitmap_git)) {
> > > > +				/*
> > > > +				 * Can't reuse from a non-preferred pack (see
> > > > +				 * above).
> > > > +				 */
> > > > +				if (pos + offset >= objects_nr)
> > > > +					continue;
> > > > +			}
> > > > +			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);
> > >
> > > ...and this likewise makes sure we never go past that first pack. Good.
> > >
> > > I think this "continue" could actually be a "break", as the loop is
> > > iterating over "offset" (and "pos + offset" always gets larger). In
> > > fact, it could break out of the outer loop as well (which is
> > > incrementing "pos"). It's probably a pretty small efficiency in
> > > practice, though.
> >
> > Yeah; you're right. And we'll save up to BITS_IN_EWORD cycles of this
> > loop. (I wonder if smart-enough compilers will realize the same
> > optimization that you did and turn that `continue` into a `break`
> > automatically, but that's neither here nor there).
>
> If you break all the way out, then it saves iterating over all of those
> other words that are not in the first pack, too. I.e., if your bitmap
> has 10 million bits (for a 10-million object clone), but your first pack
> only has a million objects in it, we'll call try_partial_reuse() 9
> million extra times.
>
> Fortunately, each call is super cheap, because the first thing it does
> is check if the requested bit is past the end of the pack. Which kind of
> makes me wonder if we could simplify this further by just letting
> try_partial_reuse() tell us when there's no point going further:
>
> [snip suggested diff]

All looks pretty good to me. I think that a goto is a little easier to
read than two identical "if (ret < 0)" checks. And having a comment
makes it clearer to me than the double if statements. So I'm content do
to that instead.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 8a523624a1..e11d3ac2e5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1124,6 +1124,11 @@  static void write_reused_pack(struct hashfile *f)
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
+			/*
+			 * Can use bit positions directly, even for MIDX
+			 * bitmaps. See comment in try_partial_reuse()
+			 * for why.
+			 */
 			write_reused_pack_one(pos + offset, f, &w_curs);
 			display_progress(progress_state, ++written);
 		}
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index 142fd0adb8..9c55c1531e 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,7 @@  void bitmap_writer_show_progress(int show)
 }
 
 /**
- * Build the initial type index for the packfile
+ * Build the initial type index for the packfile or multi-pack-index
  */
 void bitmap_writer_build_type_index(struct packing_data *to_pack,
 				    struct pack_idx_entry **index,
diff --git a/pack-bitmap.c b/pack-bitmap.c
index d882bf7ce1..4110d23ca1 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -13,6 +13,7 @@ 
 #include "repository.h"
 #include "object-store.h"
 #include "list-objects-filter-options.h"
+#include "midx.h"
 #include "config.h"
 
 /*
@@ -35,8 +36,15 @@  struct stored_bitmap {
  * the active bitmap index is the largest one.
  */
 struct bitmap_index {
-	/* Packfile to which this bitmap index belongs to */
+	/*
+	 * The pack or multi-pack index (MIDX) that this bitmap index belongs
+	 * to.
+	 *
+	 * Exactly one of these must be non-NULL; this specifies the object
+	 * order used to interpret this bitmap.
+	 */
 	struct packed_git *pack;
+	struct multi_pack_index *midx;
 
 	/*
 	 * Mark the first `reuse_objects` in the packfile as reused:
@@ -71,6 +79,9 @@  struct bitmap_index {
 	/* If not NULL, this is a name-hash cache pointing into map. */
 	uint32_t *hashes;
 
+	/* The checksum of the packfile or MIDX; points into map. */
+	const unsigned char *checksum;
+
 	/*
 	 * Extended index.
 	 *
@@ -138,6 +149,8 @@  static struct ewah_bitmap *read_bitmap_1(struct bitmap_index *index)
 
 static uint32_t bitmap_num_objects(struct bitmap_index *index)
 {
+	if (index->midx)
+		return index->midx->num_objects;
 	return index->pack->num_objects;
 }
 
@@ -175,6 +188,7 @@  static int load_bitmap_header(struct bitmap_index *index)
 	}
 
 	index->entry_count = ntohl(header->entry_count);
+	index->checksum = header->checksum;
 	index->map_pos += header_size;
 	return 0;
 }
@@ -227,7 +241,10 @@  static void nth_bitmap_object_oid(struct bitmap_index *index,
 				  struct object_id *oid,
 				  uint32_t n)
 {
-	nth_packed_object_id(oid, index->pack, n);
+	if (index->midx)
+		nth_midxed_object_oid(oid, index->midx, n);
+	else
+		nth_packed_object_id(oid, index->pack, n);
 }
 
 static int load_bitmap_entries_v1(struct bitmap_index *index)
@@ -272,7 +289,14 @@  static int load_bitmap_entries_v1(struct bitmap_index *index)
 	return 0;
 }
 
-static char *pack_bitmap_filename(struct packed_git *p)
+char *midx_bitmap_filename(struct multi_pack_index *midx)
+{
+	return xstrfmt("%s-%s.bitmap",
+		       get_midx_filename(midx->object_dir),
+		       hash_to_hex(get_midx_checksum(midx)));
+}
+
+char *pack_bitmap_filename(struct packed_git *p)
 {
 	size_t len;
 
@@ -281,6 +305,57 @@  static char *pack_bitmap_filename(struct packed_git *p)
 	return xstrfmt("%.*s.bitmap", (int)len, p->pack_name);
 }
 
+static int open_midx_bitmap_1(struct bitmap_index *bitmap_git,
+			      struct multi_pack_index *midx)
+{
+	struct stat st;
+	char *idx_name = midx_bitmap_filename(midx);
+	int fd = git_open(idx_name);
+
+	free(idx_name);
+
+	if (fd < 0)
+		return -1;
+
+	if (fstat(fd, &st)) {
+		close(fd);
+		return -1;
+	}
+
+	if (bitmap_git->pack || bitmap_git->midx) {
+		/* ignore extra bitmap file; we can only handle one */
+		warning("ignoring extra bitmap file: %s",
+			get_midx_filename(midx->object_dir));
+		close(fd);
+		return -1;
+	}
+
+	bitmap_git->midx = midx;
+	bitmap_git->map_size = xsize_t(st.st_size);
+	bitmap_git->map_pos = 0;
+	bitmap_git->map = xmmap(NULL, bitmap_git->map_size, PROT_READ,
+				MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	if (load_bitmap_header(bitmap_git) < 0)
+		goto cleanup;
+
+	if (!hasheq(get_midx_checksum(bitmap_git->midx), bitmap_git->checksum))
+		goto cleanup;
+
+	if (load_midx_revindex(bitmap_git->midx) < 0) {
+		warning(_("multi-pack bitmap is missing required reverse index"));
+		goto cleanup;
+	}
+	return 0;
+
+cleanup:
+	munmap(bitmap_git->map, bitmap_git->map_size);
+	bitmap_git->map_size = 0;
+	bitmap_git->map = NULL;
+	return -1;
+}
+
 static int open_pack_bitmap_1(struct bitmap_index *bitmap_git, struct packed_git *packfile)
 {
 	int fd;
@@ -302,12 +377,18 @@  static int open_pack_bitmap_1(struct bitmap_index *bitmap_git, struct packed_git
 		return -1;
 	}
 
-	if (bitmap_git->pack) {
+	if (bitmap_git->pack || bitmap_git->midx) {
+		/* ignore extra bitmap file; we can only handle one */
 		warning("ignoring extra bitmap file: %s", packfile->pack_name);
 		close(fd);
 		return -1;
 	}
 
+	if (!is_pack_valid(packfile)) {
+		close(fd);
+		return -1;
+	}
+
 	bitmap_git->pack = packfile;
 	bitmap_git->map_size = xsize_t(st.st_size);
 	bitmap_git->map = xmmap(NULL, bitmap_git->map_size, PROT_READ, MAP_PRIVATE, fd, 0);
@@ -324,13 +405,36 @@  static int open_pack_bitmap_1(struct bitmap_index *bitmap_git, struct packed_git
 	return 0;
 }
 
-static int load_pack_bitmap(struct bitmap_index *bitmap_git)
+static int load_reverse_index(struct bitmap_index *bitmap_git)
+{
+	if (bitmap_is_midx(bitmap_git)) {
+		uint32_t i;
+		int ret;
+
+		ret = load_midx_revindex(bitmap_git->midx);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < bitmap_git->midx->num_packs; i++) {
+			if (prepare_midx_pack(the_repository, bitmap_git->midx, i))
+				die(_("load_reverse_index: could not open pack"));
+			ret = load_pack_revindex(bitmap_git->midx->packs[i]);
+			if (ret)
+				return ret;
+		}
+		return 0;
+	}
+	return load_pack_revindex(bitmap_git->pack);
+}
+
+static int load_bitmap(struct bitmap_index *bitmap_git)
 {
 	assert(bitmap_git->map);
 
 	bitmap_git->bitmaps = kh_init_oid_map();
 	bitmap_git->ext_index.positions = kh_init_oid_pos();
-	if (load_pack_revindex(bitmap_git->pack))
+
+	if (load_reverse_index(bitmap_git))
 		goto failed;
 
 	if (!(bitmap_git->commits = read_bitmap_1(bitmap_git)) ||
@@ -374,11 +478,35 @@  static int open_pack_bitmap(struct repository *r,
 	return ret;
 }
 
+static int open_midx_bitmap(struct repository *r,
+			    struct bitmap_index *bitmap_git)
+{
+	struct multi_pack_index *midx;
+
+	assert(!bitmap_git->map);
+
+	for (midx = get_multi_pack_index(r); midx; midx = midx->next) {
+		if (!open_midx_bitmap_1(bitmap_git, midx))
+			return 0;
+	}
+	return -1;
+}
+
+static int open_bitmap(struct repository *r,
+		       struct bitmap_index *bitmap_git)
+{
+	assert(!bitmap_git->map);
+
+	if (!open_midx_bitmap(r, bitmap_git))
+		return 0;
+	return open_pack_bitmap(r, bitmap_git);
+}
+
 struct bitmap_index *prepare_bitmap_git(struct repository *r)
 {
 	struct bitmap_index *bitmap_git = xcalloc(1, sizeof(*bitmap_git));
 
-	if (!open_pack_bitmap(r, bitmap_git) && !load_pack_bitmap(bitmap_git))
+	if (!open_bitmap(r, bitmap_git) && !load_bitmap(bitmap_git))
 		return bitmap_git;
 
 	free_bitmap_index(bitmap_git);
@@ -428,10 +556,26 @@  static inline int bitmap_position_packfile(struct bitmap_index *bitmap_git,
 	return pos;
 }
 
+static int bitmap_position_midx(struct bitmap_index *bitmap_git,
+				const struct object_id *oid)
+{
+	uint32_t want, got;
+	if (!bsearch_midx(oid, bitmap_git->midx, &want))
+		return -1;
+
+	if (midx_to_pack_pos(bitmap_git->midx, want, &got) < 0)
+		return -1;
+	return got;
+}
+
 static int bitmap_position(struct bitmap_index *bitmap_git,
 			   const struct object_id *oid)
 {
-	int pos = bitmap_position_packfile(bitmap_git, oid);
+	int pos;
+	if (bitmap_is_midx(bitmap_git))
+		pos = bitmap_position_midx(bitmap_git, oid);
+	else
+		pos = bitmap_position_packfile(bitmap_git, oid);
 	return (pos >= 0) ? pos : bitmap_position_extended(bitmap_git, oid);
 }
 
@@ -724,6 +868,7 @@  static void show_objects_for_type(
 			continue;
 
 		for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
+			struct packed_git *pack;
 			struct object_id oid;
 			uint32_t hash = 0, index_pos;
 			off_t ofs;
@@ -733,14 +878,28 @@  static void show_objects_for_type(
 
 			offset += ewah_bit_ctz64(word >> offset);
 
-			index_pos = pack_pos_to_index(bitmap_git->pack, pos + offset);
-			ofs = pack_pos_to_offset(bitmap_git->pack, pos + offset);
-			nth_packed_object_id(&oid, bitmap_git->pack, index_pos);
+			if (bitmap_is_midx(bitmap_git)) {
+				struct multi_pack_index *m = bitmap_git->midx;
+				uint32_t pack_id;
+
+				index_pos = pack_pos_to_midx(m, pos + offset);
+				ofs = nth_midxed_offset(m, index_pos);
+				nth_midxed_object_oid(&oid, m, index_pos);
+
+				pack_id = nth_midxed_pack_int_id(m, index_pos);
+				pack = bitmap_git->midx->packs[pack_id];
+			} else {
+				index_pos = pack_pos_to_index(bitmap_git->pack, pos + offset);
+				ofs = pack_pos_to_offset(bitmap_git->pack, pos + offset);
+				nth_bitmap_object_oid(bitmap_git, &oid, index_pos);
+
+				pack = bitmap_git->pack;
+			}
 
 			if (bitmap_git->hashes)
 				hash = get_be32(bitmap_git->hashes + index_pos);
 
-			show_reach(&oid, object_type, 0, hash, bitmap_git->pack, ofs);
+			show_reach(&oid, object_type, 0, hash, pack, ofs);
 		}
 	}
 }
@@ -752,8 +911,13 @@  static int in_bitmapped_pack(struct bitmap_index *bitmap_git,
 		struct object *object = roots->item;
 		roots = roots->next;
 
-		if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
-			return 1;
+		if (bitmap_is_midx(bitmap_git)) {
+			if (bsearch_midx(&object->oid, bitmap_git->midx, NULL))
+				return 1;
+		} else {
+			if (find_pack_entry_one(object->oid.hash, bitmap_git->pack) > 0)
+				return 1;
+		}
 	}
 
 	return 0;
@@ -839,14 +1003,26 @@  static void filter_bitmap_blob_none(struct bitmap_index *bitmap_git,
 static unsigned long get_size_by_pos(struct bitmap_index *bitmap_git,
 				     uint32_t pos)
 {
-	struct packed_git *pack = bitmap_git->pack;
 	unsigned long size;
 	struct object_info oi = OBJECT_INFO_INIT;
 
 	oi.sizep = &size;
 
 	if (pos < bitmap_num_objects(bitmap_git)) {
-		off_t ofs = pack_pos_to_offset(pack, pos);
+		struct packed_git *pack;
+		off_t ofs;
+
+		if (bitmap_is_midx(bitmap_git)) {
+			uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, pos);
+			uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
+
+			pack = bitmap_git->midx->packs[pack_id];
+			ofs = nth_midxed_offset(bitmap_git->midx, midx_pos);
+		} else {
+			pack = bitmap_git->pack;
+			ofs = pack_pos_to_offset(pack, pos);
+		}
+
 		if (packed_object_info(the_repository, pack, ofs, &oi) < 0) {
 			struct object_id oid;
 			nth_bitmap_object_oid(bitmap_git, &oid,
@@ -1027,7 +1203,7 @@  struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 	/* try to open a bitmapped pack, but don't parse it yet
 	 * because we may not need to use it */
 	CALLOC_ARRAY(bitmap_git, 1);
-	if (open_pack_bitmap(revs->repo, bitmap_git) < 0)
+	if (open_bitmap(revs->repo, bitmap_git) < 0)
 		goto cleanup;
 
 	for (i = 0; i < revs->pending.nr; ++i) {
@@ -1071,7 +1247,7 @@  struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 	 * from disk. this is the point of no return; after this the rev_list
 	 * becomes invalidated and we must perform the revwalk through bitmaps
 	 */
-	if (load_pack_bitmap(bitmap_git) < 0)
+	if (load_bitmap(bitmap_git) < 0)
 		goto cleanup;
 
 	object_array_clear(&revs->pending);
@@ -1115,19 +1291,43 @@  struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 }
 
 static void try_partial_reuse(struct bitmap_index *bitmap_git,
+			      struct packed_git *pack,
 			      size_t pos,
 			      struct bitmap *reuse,
 			      struct pack_window **w_curs)
 {
-	off_t offset, header;
+	off_t offset, delta_obj_offset;
 	enum object_type type;
 	unsigned long size;
 
-	if (pos >= bitmap_num_objects(bitmap_git))
-		return; /* not actually in the pack or MIDX */
+	/*
+	 * try_partial_reuse() is called either on (a) objects in the
+	 * bitmapped pack (in the case of a single-pack bitmap) or (b)
+	 * objects in the preferred pack of a multi-pack bitmap.
+	 * Importantly, the latter can pretend as if only a single pack
+	 * exists because:
+	 *
+	 *   - The first pack->num_objects bits of a MIDX bitmap are
+	 *     reserved for the preferred pack, and
+	 *
+	 *   - Ties due to duplicate objects are always resolved in
+	 *     favor of the preferred pack.
+	 *
+	 * Therefore we do not need to ever ask the MIDX for its copy of
+	 * an object by OID, since it will always select it from the
+	 * preferred pack. Likewise, the selected copy of the base
+	 * object for any deltas will reside in the same pack.
+	 *
+	 * This means that we can reuse pos when looking up the bit in
+	 * the reuse bitmap, too, since bits corresponding to the
+	 * preferred pack precede all bits from other packs.
+	 */
 
-	offset = header = pack_pos_to_offset(bitmap_git->pack, pos);
-	type = unpack_object_header(bitmap_git->pack, w_curs, &offset, &size);
+	if (pos >= pack->num_objects)
+		return; /* not actually in the pack or MIDX preferred pack */
+
+	offset = delta_obj_offset = pack_pos_to_offset(pack, pos);
+	type = unpack_object_header(pack, w_curs, &offset, &size);
 	if (type < 0)
 		return; /* broken packfile, punt */
 
@@ -1143,11 +1343,11 @@  static void try_partial_reuse(struct bitmap_index *bitmap_git,
 		 * and the normal slow path will complain about it in
 		 * more detail.
 		 */
-		base_offset = get_delta_base(bitmap_git->pack, w_curs,
-					     &offset, type, header);
+		base_offset = get_delta_base(pack, w_curs, &offset, type,
+					     delta_obj_offset);
 		if (!base_offset)
 			return;
-		if (offset_to_pack_pos(bitmap_git->pack, base_offset, &base_pos) < 0)
+		if (offset_to_pack_pos(pack, base_offset, &base_pos) < 0)
 			return;
 
 		/*
@@ -1180,24 +1380,48 @@  static void try_partial_reuse(struct bitmap_index *bitmap_git,
 	bitmap_set(reuse, pos);
 }
 
+static uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git)
+{
+	struct multi_pack_index *m = bitmap_git->midx;
+	if (!m)
+		BUG("midx_preferred_pack: requires non-empty MIDX");
+	return nth_midxed_pack_int_id(m, pack_pos_to_midx(bitmap_git->midx, 0));
+}
+
 int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 				       struct packed_git **packfile_out,
 				       uint32_t *entries,
 				       struct bitmap **reuse_out)
 {
+	struct packed_git *pack;
 	struct bitmap *result = bitmap_git->result;
 	struct bitmap *reuse;
 	struct pack_window *w_curs = NULL;
 	size_t i = 0;
 	uint32_t offset;
-	uint32_t objects_nr = bitmap_num_objects(bitmap_git);
+	uint32_t objects_nr;
 
 	assert(result);
 
+	load_reverse_index(bitmap_git);
+
+	if (bitmap_is_midx(bitmap_git))
+		pack = bitmap_git->midx->packs[midx_preferred_pack(bitmap_git)];
+	else
+		pack = bitmap_git->pack;
+	objects_nr = pack->num_objects;
+
 	while (i < result->word_alloc && result->words[i] == (eword_t)~0)
 		i++;
 
-	/* Don't mark objects not in the packfile */
+	/*
+	 * Don't mark objects not in the packfile or preferred pack. This bitmap
+	 * marks objects eligible for reuse, but the pack-reuse code only
+	 * understands how to reuse a single pack. Since the preferred pack is
+	 * guaranteed to have all bases for its deltas (in a multi-pack bitmap),
+	 * we use it instead of another pack. In single-pack bitmaps, the choice
+	 * is made for us.
+	 */
 	if (i > objects_nr / BITS_IN_EWORD)
 		i = objects_nr / BITS_IN_EWORD;
 
@@ -1213,7 +1437,15 @@  int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
-			try_partial_reuse(bitmap_git, pos + offset, reuse, &w_curs);
+			if (bitmap_is_midx(bitmap_git)) {
+				/*
+				 * Can't reuse from a non-preferred pack (see
+				 * above).
+				 */
+				if (pos + offset >= objects_nr)
+					continue;
+			}
+			try_partial_reuse(bitmap_git, pack, pos + offset, reuse, &w_curs);
 		}
 	}
 
@@ -1230,7 +1462,7 @@  int reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 	 * need to be handled separately.
 	 */
 	bitmap_and_not(result, reuse);
-	*packfile_out = bitmap_git->pack;
+	*packfile_out = pack;
 	*reuse_out = reuse;
 	return 0;
 }
@@ -1504,6 +1736,12 @@  uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
 	uint32_t i, num_objects;
 	uint32_t *reposition;
 
+	if (!bitmap_is_midx(bitmap_git))
+		load_reverse_index(bitmap_git);
+	else if (load_midx_revindex(bitmap_git->midx) < 0)
+		BUG("rebuild_existing_bitmaps: missing required rev-cache "
+		    "extension");
+
 	num_objects = bitmap_num_objects(bitmap_git);
 	CALLOC_ARRAY(reposition, num_objects);
 
@@ -1511,8 +1749,13 @@  uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
 		struct object_id oid;
 		struct object_entry *oe;
 
-		nth_packed_object_id(&oid, bitmap_git->pack,
-				     pack_pos_to_index(bitmap_git->pack, i));
+		if (bitmap_is_midx(bitmap_git))
+			nth_midxed_object_oid(&oid,
+					      bitmap_git->midx,
+					      pack_pos_to_midx(bitmap_git->midx, i));
+		else
+			nth_packed_object_id(&oid, bitmap_git->pack,
+					     pack_pos_to_index(bitmap_git->pack, i));
 		oe = packlist_find(mapping, &oid);
 
 		if (oe)
@@ -1538,6 +1781,19 @@  void free_bitmap_index(struct bitmap_index *b)
 	free(b->ext_index.hashes);
 	bitmap_free(b->result);
 	bitmap_free(b->haves);
+	if (bitmap_is_midx(b)) {
+		/*
+		 * Multi-pack bitmaps need to have resources associated with
+		 * their on-disk reverse indexes unmapped so that stale .rev and
+		 * .bitmap files can be removed.
+		 *
+		 * Unlike pack-based bitmaps, multi-pack bitmaps can be read and
+		 * written in the same 'git multi-pack-index write --bitmap'
+		 * process. Close resources so they can be removed safely on
+		 * platforms like Windows.
+		 */
+		close_midx_revindex(b->midx);
+	}
 	free(b);
 }
 
@@ -1552,7 +1808,7 @@  static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
 				     enum object_type object_type)
 {
 	struct bitmap *result = bitmap_git->result;
-	struct packed_git *pack = bitmap_git->pack;
+	struct packed_git *pack;
 	off_t total = 0;
 	struct ewah_iterator it;
 	eword_t filter;
@@ -1575,7 +1831,31 @@  static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
-			pos = base + offset;
+
+			if (bitmap_is_midx(bitmap_git)) {
+				uint32_t pack_pos;
+				uint32_t midx_pos = pack_pos_to_midx(bitmap_git->midx, base + offset);
+				uint32_t pack_id = nth_midxed_pack_int_id(bitmap_git->midx, midx_pos);
+				off_t offset = nth_midxed_offset(bitmap_git->midx, midx_pos);
+
+				pack = bitmap_git->midx->packs[pack_id];
+
+				if (offset_to_pack_pos(pack, offset, &pack_pos) < 0) {
+					struct object_id oid;
+					nth_midxed_object_oid(&oid, bitmap_git->midx, midx_pos);
+
+					die(_("could not find %s in pack #%"PRIu32" at offset %"PRIuMAX),
+					    oid_to_hex(&oid),
+					    pack_id,
+					    (uintmax_t)offset);
+				}
+
+				pos = pack_pos;
+			} else {
+				pack = bitmap_git->pack;
+				pos = base + offset;
+			}
+
 			total += pack_pos_to_offset(pack, pos + 1) -
 				 pack_pos_to_offset(pack, pos);
 		}
@@ -1628,6 +1908,20 @@  off_t get_disk_usage_from_bitmap(struct bitmap_index *bitmap_git,
 	return total;
 }
 
+int bitmap_is_midx(struct bitmap_index *bitmap_git)
+{
+	return !!bitmap_git->midx;
+}
+
+off_t bitmap_pack_offset(struct bitmap_index *bitmap_git, uint32_t pos)
+{
+	if (bitmap_is_midx(bitmap_git))
+		return nth_midxed_offset(bitmap_git->midx,
+					 pack_pos_to_midx(bitmap_git->midx, pos));
+	return nth_packed_object_offset(bitmap_git->pack,
+					pack_pos_to_index(bitmap_git->pack, pos));
+}
+
 const struct string_list *bitmap_preferred_tips(struct repository *r)
 {
 	return repo_config_get_value_multi(r, "pack.preferbitmaptips");
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 52ea10de51..30396a7a4a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -92,6 +92,11 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 			  uint32_t index_nr,
 			  const char *filename,
 			  uint16_t options);
+char *midx_bitmap_filename(struct multi_pack_index *midx);
+char *pack_bitmap_filename(struct packed_git *p);
+
+int bitmap_is_midx(struct bitmap_index *bitmap_git);
+off_t bitmap_pack_offset(struct bitmap_index *bitmap_git, uint32_t pos);
 
 const struct string_list *bitmap_preferred_tips(struct repository *r);
 int bitmap_is_preferred_refname(struct repository *r, const char *refname);
diff --git a/packfile.c b/packfile.c
index 755aa7aec5..e855b93208 100644
--- a/packfile.c
+++ b/packfile.c
@@ -860,7 +860,7 @@  static void prepare_pack(const char *full_name, size_t full_name_len,
 	if (!strcmp(file_name, "multi-pack-index"))
 		return;
 	if (starts_with(file_name, "multi-pack-index") &&
-	    ends_with(file_name, ".rev"))
+	    (ends_with(file_name, ".bitmap") || ends_with(file_name, ".rev")))
 		return;
 	if (ends_with(file_name, ".idx") ||
 	    ends_with(file_name, ".rev") ||