diff mbox series

[v2,14/24] pack-bitmap: write multi-pack bitmaps

Message ID a8cec2463d0993b1118abdd31cb6c9e88a32e0c4.1624314293.git.me@ttaylorr.com (mailing list archive)
State New, archived
Headers show
Series multi-pack reachability bitmaps | expand

Commit Message

Taylor Blau June 21, 2021, 10:25 p.m. UTC
Write multi-pack bitmaps in the format described by
Documentation/technical/bitmap-format.txt, inferring their presence with
the absence of '--bitmap'.

To write a multi-pack bitmap, this patch attempts to reuse as much of
the existing machinery from pack-objects as possible. Specifically, the
MIDX code prepares a packing_data struct that pretends as if a single
packfile has been generated containing all of the objects contained
within the MIDX.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-multi-pack-index.txt |  12 +-
 builtin/multi-pack-index.c             |   2 +
 midx.c                                 | 230 ++++++++++++++++++++++++-
 midx.h                                 |   1 +
 4 files changed, 236 insertions(+), 9 deletions(-)

Comments

Ævar Arnfjörð Bjarmason June 24, 2021, 11:45 p.m. UTC | #1
On Mon, Jun 21 2021, Taylor Blau wrote:

> Write multi-pack bitmaps in the format described by
> Documentation/technical/bitmap-format.txt, inferring their presence with
> the absence of '--bitmap'.
>
> To write a multi-pack bitmap, this patch attempts to reuse as much of
> the existing machinery from pack-objects as possible. Specifically, the
> MIDX code prepares a packing_data struct that pretends as if a single
> packfile has been generated containing all of the objects contained
> within the MIDX.
>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  Documentation/git-multi-pack-index.txt |  12 +-
>  builtin/multi-pack-index.c             |   2 +
>  midx.c                                 | 230 ++++++++++++++++++++++++-
>  midx.h                                 |   1 +
>  4 files changed, 236 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt
> index ffd601bc17..ada14deb2c 100644
> --- a/Documentation/git-multi-pack-index.txt
> +++ b/Documentation/git-multi-pack-index.txt
> @@ -10,7 +10,7 @@ SYNOPSIS
>  --------
>  [verse]
>  'git multi-pack-index' [--object-dir=<dir>] [--[no-]progress]
> -	[--preferred-pack=<pack>] <subcommand>
> +	[--preferred-pack=<pack>] [--[no-]bitmap] <subcommand>
>  
>  DESCRIPTION
>  -----------
> @@ -40,6 +40,9 @@ write::
>  		multiple packs contain the same object. If not given,
>  		ties are broken in favor of the pack with the lowest
>  		mtime.
> +
> +	--[no-]bitmap::
> +		Control whether or not a multi-pack bitmap is written.
>  --
>  
>  verify::
> @@ -81,6 +84,13 @@ EXAMPLES
>  $ git multi-pack-index write
>  -----------------------------------------------
>  
> +* Write a MIDX file for the packfiles in the current .git folder with a
> +corresponding bitmap.
> ++
> +-------------------------------------------------------------
> +$ git multi-pack-index write --preferred-pack <pack> --bitmap
> +-------------------------------------------------------------
> +

I wondered if this was a <pack> positional argument, but it's just the
argument for --preferred-pack, even though the synopsis uses the "="
style for it. Even if parse-options.c is loose about it, let's use one
or the other in examples consistently.

>  * Write a MIDX file for the packfiles in an alternate object store.
>  +
>  -----------------------------------------------
> diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
> index 5d3ea445fd..bf6fa982e3 100644
> --- a/builtin/multi-pack-index.c
> +++ b/builtin/multi-pack-index.c
> @@ -68,6 +68,8 @@ static int cmd_multi_pack_index_write(int argc, const char **argv)
>  		OPT_STRING(0, "preferred-pack", &opts.preferred_pack,
>  			   N_("preferred-pack"),
>  			   N_("pack for reuse when computing a multi-pack bitmap")),
> +		OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"),
> +			MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX),
>  		OPT_END(),
>  	};
>  
> diff --git a/midx.c b/midx.c
> index 752d36c57f..a58cca707b 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -13,6 +13,10 @@
>  #include "repository.h"
>  #include "chunk-format.h"
>  #include "pack.h"
> +#include "pack-bitmap.h"
> +#include "refs.h"
> +#include "revision.h"
> +#include "list-objects.h"
>  
>  #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
>  #define MIDX_VERSION 1
> @@ -885,6 +889,172 @@ static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
>  static void clear_midx_files_ext(struct repository *r, const char *ext,
>  				 unsigned char *keep_hash);
>  
> +static void prepare_midx_packing_data(struct packing_data *pdata,
> +				      struct write_midx_context *ctx)
> +{
> +	uint32_t i;
> +
> +	memset(pdata, 0, sizeof(struct packing_data));

We initialize this on the stack in write_midx_bitmap(), shouldn't we
just there do:

    struct packing_data pdata = {0}

Instead of:

    struct packing_data pdata;

And then doing this memset() here?

> +	prepare_packing_data(the_repository, pdata);
> +
> +	for (i = 0; i < ctx->entries_nr; i++) {
> +		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
> +		struct object_entry *to = packlist_alloc(pdata, &from->oid);
> +
> +		oe_set_in_pack(pdata, to,
> +			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
> +	}
> +}
> +
> +static int add_ref_to_pending(const char *refname,
> +			      const struct object_id *oid,
> +			      int flag, void *cb_data)
> +{
> +	struct rev_info *revs = (struct rev_info*)cb_data;
> +	struct object *object;
> +
> +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {

Just since I'd mentioned HAS_MULTI_BITS() offhand on another patch of
yours, it's for cases like this, so:

-    if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
+    if (HAS_MULTI_BITS(flag & (REF_ISSYMREF|REF_ISBROKEN)) {

Saves you 3 bytes of code:) Anyway, you don't need to use it, just an
intresting function... :)

> +{
> +	struct rev_info revs;
> +	struct bitmap_commit_cb cb;
> +
> +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));

Another case of s/memset/"= {0}"/g ?

> +	cb.ctx = ctx;
> +
> +	repo_init_revisions(the_repository, &revs, NULL);
> +	for_each_ref(add_ref_to_pending, &revs);
> +
> +	/*
> +	 * Skipping promisor objects here is intentional, since it only excludes
> +	 * them from the list of reachable commits that we want to select from
> +	 * when computing the selection of MIDX'd commits to receive bitmaps.
> +	 *
> +	 * Reachability bitmaps do require that their objects be closed under
> +	 * reachability, but fetching any objects missing from promisors at this
> +	 * point is too late. But, if one of those objects can be reached from
> +	 * an another object that is included in the bitmap, then we will
> +	 * complain later that we don't have reachability closure (and fail
> +	 * appropriately).
> +	 */
> +	fetch_if_missing = 0;
> +	revs.exclude_promisor_objects = 1;
> +
> +	/*
> +	 * Pass selected commits in topo order to match the behavior of
> +	 * pack-bitmaps when configured with delta islands.
> +	 */
> +	revs.topo_order = 1;
> +	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;
> +
> +	if (prepare_revision_walk(&revs))
> +		die(_("revision walk setup failed"));
> +
> +	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
> +	if (indexed_commits_nr_p)
> +		*indexed_commits_nr_p = cb.commits_nr;
> +
> +	return cb.commits;
> +}
> +
> +static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
> +			     struct write_midx_context *ctx,
> +			     unsigned flags)
> +{
> +	struct packing_data pdata;
> +	struct pack_idx_entry **index;
> +	struct commit **commits = NULL;
> +	uint32_t i, commits_nr;
> +	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
> +	int ret;
> +
> +	prepare_midx_packing_data(&pdata, ctx);
> +
> +	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
> +
> +	/*
> +	 * Build the MIDX-order index based on pdata.objects (which is already
> +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> +	 * this order).
> +	 */
> +	ALLOC_ARRAY(index, pdata.nr_objects);
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
> +	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
> +
> +	/*
> +	 * bitmap_writer_finish expects objects in lex order, but pack_order
> +	 * gives us exactly that. use it directly instead of re-sorting the
> +	 * array.
> +	 *
> +	 * This changes the order of objects in 'index' between
> +	 * bitmap_writer_build_type_index and bitmap_writer_finish.
> +	 *
> +	 * The same re-ordering takes place in the single-pack bitmap code via
> +	 * write_idx_file(), which is called by finish_tmp_packfile(), which
> +	 * happens between bitmap_writer_build_type_index() and
> +	 * bitmap_writer_finish().
> +	 */
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_select_commits(commits, commits_nr, -1);
> +	ret = bitmap_writer_build(&pdata);
> +	if (ret < 0)
> +		goto cleanup;
> +
> +	bitmap_writer_set_checksum(midx_hash);
> +	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
> +
> +cleanup:
> +	free(index);
> +	free(bitmap_name);
> +	return ret;
> +}
> +
>  static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
>  			       struct string_list *packs_to_drop,
>  			       const char *preferred_pack_name,
> @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
>  
> +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +				error(_("could not load pack %s"),
> +				      ctx.m->pack_names[i]);

Isn't the prepare_midx_pack() tasked with populating that pack_names[i]
that you can't load (the strbuf_addf() it does), but it can also exit
before that, do we get an empty string here then? Maybe I'm misreading
it (I haven't run this, just skimmed the code).

> @@ -1132,6 +1342,9 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	free(ctx.pack_perm);
>  	free(ctx.pack_order);
>  	free(midx_name);
> +	if (ctx.m)
> +		close_midx(ctx.m);
> +

I see Stolee made close_midx() just return silently if !ctx.m in
1dcd9f2043a (midx: close multi-pack-index on repack, 2018-10-12), but
grepping the uses of it it seems calls to it are similarly guarded by
"if"'s.

Just a nit, weird to have a free-like function not invoked like
free. Perhaps (and maybe better for an unrelated cleanup) to either drop
the conditionals, or make it BUG() if it's called with NULL, but at
least we should pick one :)
Taylor Blau July 15, 2021, 2:33 p.m. UTC | #2
On Fri, Jun 25, 2021 at 01:45:46AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
> On Mon, Jun 21 2021, Taylor Blau wrote:
> > +* Write a MIDX file for the packfiles in the current .git folder with a
> > +corresponding bitmap.
> > ++
> > +-------------------------------------------------------------
> > +$ git multi-pack-index write --preferred-pack <pack> --bitmap
> > +-------------------------------------------------------------
> > +
>
> I wondered if this was a <pack> positional argument, but it's just the
> argument for --preferred-pack, even though the synopsis uses the "="
> style for it. Even if parse-options.c is loose about it, let's use one
> or the other in examples consistently.

The example below (for writing a MIDX in an alternate object store)
doesn't include the '=', but probably would be clearer if it did. I
think it's a good suggestion, though, so I'll fix up my addition here.

> > +	memset(pdata, 0, sizeof(struct packing_data));
>
> We initialize this on the stack in write_midx_bitmap(), shouldn't we
> just there do:
>
>     struct packing_data pdata = {0}
>
> Instead of:
>
>     struct packing_data pdata;
>
> And then doing this memset() here?

I could go either way. Part of me prefers the memset() since it lets
callers of prepare_midx_packing_data() pass in anything they want,
including a pointer to uninitialized memory. Of course, there is only
one such caller, so it probably doesn't really matter.

And the other caller of prepare_packing_data() which is in
builtin/pack-objects.c operates on a pointer to a statically allocated
variable, so its bytes are already zero'd.

I don't feel strongly about it, though, so I'd just as soon err on the
side of flexibility than changing the declaration.

> > +{
> > +	struct rev_info revs;
> > +	struct bitmap_commit_cb cb;
> > +
> > +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
>
> Another case of s/memset/"= {0}"/g ?

Ah, in this case I'd prefer the aggregate-style initialization, since
we're zero-ing it out in the same function.

> >  static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
> >  			       struct string_list *packs_to_drop,
> >  			       const char *preferred_pack_name,
> > @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  		for (i = 0; i < ctx.m->num_packs; i++) {
> >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> >
> > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > +				error(_("could not load pack %s"),
> > +				      ctx.m->pack_names[i]);
>
> Isn't the prepare_midx_pack() tasked with populating that pack_names[i]
> that you can't load (the strbuf_addf() it does), but it can also exit
> before that, do we get an empty string here then? Maybe I'm misreading
> it (I haven't run this, just skimmed the code).

Nice catch, we can't rely on ctx->m.pack_names[i] being safe to read
(and at the same time know that we're going to get a non-empty string).

Since prepare_midx_pack() can fail because the pack itself couldn't be
loaded, I think the easiest thing to do here is to just opaquely say
"could not load pack" without adding any pack name that we may or may
not have.

> > @@ -1132,6 +1342,9 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  	free(ctx.pack_perm);
> >  	free(ctx.pack_order);
> >  	free(midx_name);
> > +	if (ctx.m)
> > +		close_midx(ctx.m);
> > +
>
> I see Stolee made close_midx() just return silently if !ctx.m in
> 1dcd9f2043a (midx: close multi-pack-index on repack, 2018-10-12), but
> grepping the uses of it it seems calls to it are similarly guarded by
> "if"'s.
>
> Just a nit, weird to have a free-like function not invoked like
> free. Perhaps (and maybe better for an unrelated cleanup) to either drop
> the conditionals, or make it BUG() if it's called with NULL, but at
> least we should pick one :)

I agree with the direction of 1dcd9f2043a, so I'm happy to just drop the
conditional and call close_midx() with an argument that may or may not
be NULL.

Thanks,
Taylor
Jeff King July 21, 2021, 12:09 p.m. UTC | #3
On Mon, Jun 21, 2021 at 06:25:34PM -0400, Taylor Blau wrote:

> +static int add_ref_to_pending(const char *refname,
> +			      const struct object_id *oid,
> +			      int flag, void *cb_data)
> +{
> +	struct rev_info *revs = (struct rev_info*)cb_data;
> +	struct object *object;
> +
> +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
> +		warning("symbolic ref is dangling: %s", refname);
> +		return 0;
> +	}
> +
> +	object = parse_object_or_die(oid, refname);
> +	if (object->type != OBJ_COMMIT)
> +		return 0;
> +
> +	add_pending_object(revs, object, "");
> +	if (bitmap_is_preferred_refname(revs->repo, refname))
> +		object->flags |= NEEDS_BITMAP;
> +	return 0;
> +}

OK, so we'll look at each ref to get the set of commits that we want to
traverse to put into the bitmap. Which is roughly the same as what the
pack bitmap does. We only generate bitmaps for all-into-one repacks, so
it is traversing all of the reachable objects. It is a little different
in that the pack version is probably hitting reflogs, but IMHO we are
better off to ignore reflogs for the purposes of bitmaps (I would
suggest to do so in the pack-bitmap case, too, except that it is
combined with the "what to pack" traversal there, and by the time we see
each commit we don't know how we got there).

> +struct bitmap_commit_cb {
> +	struct commit **commits;
> +	size_t commits_nr, commits_alloc;
> +
> +	struct write_midx_context *ctx;
> +};
> +
> +static const struct object_id *bitmap_oid_access(size_t index,
> +						 const void *_entries)
> +{
> +	const struct pack_midx_entry *entries = _entries;
> +	return &entries[index].oid;
> +}
> +
> +static void bitmap_show_commit(struct commit *commit, void *_data)
> +{
> +	struct bitmap_commit_cb *data = _data;
> +	if (oid_pos(&commit->object.oid, data->ctx->entries,
> +		    data->ctx->entries_nr,
> +		    bitmap_oid_access) > -1) {

This "> -1" struck me as a little bit funny. Perhaps ">= 0" would be a
more obvious way of saying "we found it"?

> +	/*
> +	 * Skipping promisor objects here is intentional, since it only excludes
> +	 * them from the list of reachable commits that we want to select from
> +	 * when computing the selection of MIDX'd commits to receive bitmaps.
> +	 *
> +	 * Reachability bitmaps do require that their objects be closed under
> +	 * reachability, but fetching any objects missing from promisors at this
> +	 * point is too late. But, if one of those objects can be reached from
> +	 * an another object that is included in the bitmap, then we will
> +	 * complain later that we don't have reachability closure (and fail
> +	 * appropriately).
> +	 */
> +	fetch_if_missing = 0;
> +	revs.exclude_promisor_objects = 1;

Makes sense.

> +	/*
> +	 * Pass selected commits in topo order to match the behavior of
> +	 * pack-bitmaps when configured with delta islands.
> +	 */
> +	revs.topo_order = 1;
> +	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;

Hmm. Why do we want to match this side effect of delta islands here?

The only impact this has is on the order of commits we feed for bitmap
selection (and during the actual generation phase, it may impact
visitation order).

Now I'm of the opinion that topo order is probably the best thing for
bitmap generation (since the bitmaps themselves are connected to the
graph structure). But if it is the best thing, shouldn't we perhaps be
turning on topo-order for single-pack bitmaps, too?

And if it isn't the best thing, then why would we want it here?

> +	if (prepare_revision_walk(&revs))
> +		die(_("revision walk setup failed"));

We call init_revisions(), and then go straight to
prepare_revision_walk() with no call to setup_revisions() between. It
doesn't seem to be clearly documented, but I think you're supposed to,
as it finalizes some bits like diff_setup_done().

I suspect it works OK in practice, and I did find a few other spots that
do not call it (e.g., builtin/am.c:write_commit_patch). But most spots
do at least an empty setup_revisions(0, NULL, &rev, NULL).

> +	/*
> +	 * Build the MIDX-order index based on pdata.objects (which is already
> +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> +	 * this order).
> +	 */
> +	ALLOC_ARRAY(index, pdata.nr_objects);
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];

This cast is correct because the pack_idx_entry is at the start of each
object_entry. But maybe:

  index[i] = &pdata.objects[i].idx;

would be less scary looking?

> +	/*
> +	 * bitmap_writer_finish expects objects in lex order, but pack_order
> +	 * gives us exactly that. use it directly instead of re-sorting the
> +	 * array.
> +	 *
> +	 * This changes the order of objects in 'index' between
> +	 * bitmap_writer_build_type_index and bitmap_writer_finish.
> +	 *
> +	 * The same re-ordering takes place in the single-pack bitmap code via
> +	 * write_idx_file(), which is called by finish_tmp_packfile(), which
> +	 * happens between bitmap_writer_build_type_index() and
> +	 * bitmap_writer_finish().
> +	 */
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];

Ditto here.

> +	bitmap_writer_select_commits(commits, commits_nr, -1);

Not related to your patch, but I had to refresh my memory on what this
"-1" was for. It's "max_bitmaps", and is ignored if it's negative. But
the only callers pass "-1"! So we could get rid of it entirely.

It probably makes sense to leave that cleanup out of this
already-complicated series. But maybe worth doing later on top.

> @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
>  
> +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +				error(_("could not load pack %s"),
> +				      ctx.m->pack_names[i]);
> +				result = 1;
> +				goto cleanup;
> +			}

It might be worth a comment here. I can easily believe that there is
some later part of the bitmap generation code that assumes the packs are
loaded. But somebody reading this is not likely to understand why it's
here.

Should this be done conditionally only if we're writing a bitmap? (That
might also make it obvious why we are doing it).

> @@ -947,8 +1124,26 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
>  	stop_progress(&ctx.progress);
>  
> -	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
> -		goto cleanup;
> +	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) {
> +		struct bitmap_index *bitmap_git;
> +		int bitmap_exists;
> +		int want_bitmap = flags & MIDX_WRITE_BITMAP;
> +
> +		bitmap_git = prepare_bitmap_git(the_repository);
> +		bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git);
> +		free_bitmap_index(bitmap_git);
> +
> +		if (bitmap_exists || !want_bitmap) {
> +			/*
> +			 * The correct MIDX already exists, and so does a
> +			 * corresponding bitmap (or one wasn't requested).
> +			 */
> +			if (!want_bitmap)
> +				clear_midx_files_ext(the_repository, ".bitmap",
> +						     NULL);
> +			goto cleanup;
> +		}
> +	}

So this makes "git multi-pack-index write --write-bitmap" actually write
a bitmap, even if the midx itself didn't need updating? Sounds good.
Likewise, we'll delete a bitmap if one exists but we were not requested
to write one. Makes sense.

I do think nice-to-have bits like this could have come in a separate
patch with their own explanation and tests. It may not be worth trying
to extract it at this point, though.

> @@ -1075,9 +1271,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
>  	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
>  
> -	if (ctx.m)
> -		close_midx(ctx.m);
> -
>  	if (ctx.nr - dropped_packs == 0) {
>  		error(_("no pack files to index."));
>  		result = 1;

I'm not sure what this hunk is doing. We do pick up the close_midx()
call at the end of the function, amidst the other cleanup.

I expect the answer is something like "we need it open when we generate
the bitmaps". But it makes me wonder if we could hit any cases where we
try to overwrite it while it's still open, which would cause problems on
Windows.

-Peff
Taylor Blau July 26, 2021, 6:12 p.m. UTC | #4
On Wed, Jul 21, 2021 at 08:09:19AM -0400, Jeff King wrote:
> On Mon, Jun 21, 2021 at 06:25:34PM -0400, Taylor Blau wrote:
>
> > +static int add_ref_to_pending(const char *refname,
> > +			      const struct object_id *oid,
> > +			      int flag, void *cb_data)
> > +{
> > +	struct rev_info *revs = (struct rev_info*)cb_data;
> > +	struct object *object;
> > +
> > +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
> > +		warning("symbolic ref is dangling: %s", refname);
> > +		return 0;
> > +	}
> > +
> > +	object = parse_object_or_die(oid, refname);
> > +	if (object->type != OBJ_COMMIT)
> > +		return 0;
> > +
> > +	add_pending_object(revs, object, "");
> > +	if (bitmap_is_preferred_refname(revs->repo, refname))
> > +		object->flags |= NEEDS_BITMAP;
> > +	return 0;
> > +}
>
> OK, so we'll look at each ref to get the set of commits that we want to
> traverse to put into the bitmap. Which is roughly the same as what the
> pack bitmap does. We only generate bitmaps for all-into-one repacks, so
> it is traversing all of the reachable objects. It is a little different
> in that the pack version is probably hitting reflogs, but IMHO we are
> better off to ignore reflogs for the purposes of bitmaps (I would
> suggest to do so in the pack-bitmap case, too, except that it is
> combined with the "what to pack" traversal there, and by the time we see
> each commit we don't know how we got there).

Right. And we might end up ignoring a lot of these commits, too: the
for-each-ref is just a starting point to enumerate everything, but we
only care about parts of the object graph that are contained in a pack
which is included in the MIDX we are writing (hence the bare "return"
you're commenting around below).

> > +static void bitmap_show_commit(struct commit *commit, void *_data)
> > +{
> > +	struct bitmap_commit_cb *data = _data;
> > +	if (oid_pos(&commit->object.oid, data->ctx->entries,
> > +		    data->ctx->entries_nr,
> > +		    bitmap_oid_access) > -1) {
>
> This "> -1" struck me as a little bit funny. Perhaps ">= 0" would be a
> more obvious way of saying "we found it"?

Sure. (I looked for other uses of oid_pos() to see what is more
common, but there really are vanishingly few uses.) Easier to read might
even be:

    int pos = oid_pos(...);
    if (pos < 0)
      return;
    ALLOC_GROW(...);

which is what I ended up going for.

> > +	/*
> > +	 * Pass selected commits in topo order to match the behavior of
> > +	 * pack-bitmaps when configured with delta islands.
> > +	 */
> > +	revs.topo_order = 1;
> > +	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;
>
> Hmm. Why do we want to match this side effect of delta islands here?
>
> The only impact this has is on the order of commits we feed for bitmap
> selection (and during the actual generation phase, it may impact
> visitation order).
>
> Now I'm of the opinion that topo order is probably the best thing for
> bitmap generation (since the bitmaps themselves are connected to the
> graph structure). But if it is the best thing, shouldn't we perhaps be
> turning on topo-order for single-pack bitmaps, too?
>
> And if it isn't the best thing, then why would we want it here?

Heh, you were the one that suggested I bring this over to MIDX-based
bitmaps in the first place ;).

This comes from an investigation into why bitmap coverage had worsened
for some repositories using MIDX bitmaps at GitHub. The real reason was
resolved and unrelated to this, but trying to match the behavior of MIDX
bitmaps to our existing pack bitmap setup (which uses delta-islands) was
one strategy we tried while debugging.

I actually suspect that it doesn't really matter what order we feed this
list to bitmap_writer_select_commits() in, because the first thing that
it does is QSORT() the incoming list of commits in date order.

But it does mirror the behavior of our previous bitmap generation
settings, which has been running for years.

So... we could probably drop this hunk? I'd probably rather err on the
safe side and leave this alone since it matches a system that we know to
work well in practice.

> > +	if (prepare_revision_walk(&revs))
> > +		die(_("revision walk setup failed"));
>
> We call init_revisions(), and then go straight to
> prepare_revision_walk() with no call to setup_revisions() between. It
> doesn't seem to be clearly documented, but I think you're supposed to,
> as it finalizes some bits like diff_setup_done().
>
> I suspect it works OK in practice, and I did find a few other spots that
> do not call it (e.g., builtin/am.c:write_commit_patch). But most spots
> do at least an empty setup_revisions(0, NULL, &rev, NULL).

Sure, thanks.

> > +	/*
> > +	 * Build the MIDX-order index based on pdata.objects (which is already
> > +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> > +	 * this order).
> > +	 */
> > +	ALLOC_ARRAY(index, pdata.nr_objects);
> > +	for (i = 0; i < pdata.nr_objects; i++)
> > +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
>
> This cast is correct because the pack_idx_entry is at the start of each
> object_entry. But maybe:
>
>   index[i] = &pdata.objects[i].idx;
>
> would be less scary looking?

Definitely, and thanks (for this spot and the other one you mentioned).

> > +	bitmap_writer_select_commits(commits, commits_nr, -1);
>
> Not related to your patch, but I had to refresh my memory on what this
> "-1" was for. It's "max_bitmaps", and is ignored if it's negative. But
> the only callers pass "-1"! So we could get rid of it entirely.
>
> It probably makes sense to leave that cleanup out of this
> already-complicated series. But maybe worth doing later on top.

Yeah, seems like an easy topic for somebody interested in any
#leftoverbits could pick up. Once this lands, I'll be happy to take care
of it myself, too.

> > @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  		for (i = 0; i < ctx.m->num_packs; i++) {
> >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> >
> > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > +				error(_("could not load pack %s"),
> > +				      ctx.m->pack_names[i]);
> > +				result = 1;
> > +				goto cleanup;
> > +			}
>
> It might be worth a comment here. I can easily believe that there is
> some later part of the bitmap generation code that assumes the packs are
> loaded. But somebody reading this is not likely to understand why it's
> here.
>
> Should this be done conditionally only if we're writing a bitmap? (That
> might also make it obvious why we are doing it).

Ah. Actually, I don't think this was necessary before, but we *do* need
it now because we want to compare the pack mtime's for inferring a
preferred pack when one wasn't given. And we also need to open the pack
indexes, too, because we care about the object counts (to make sure that
we don't infer a preferred pack which has no objects).

Luckily, any new packs will be loaded (and likewise have their indexes
open, too), via the the add_pack_to_midx() callback that we pass as an
argument to for_each_file_in_pack_dir().

But we could do something like this instead:

--- 8< ---

diff --git a/midx.c b/midx.c
index 8426e1a0b1..a70a6bca81 100644
--- a/midx.c
+++ b/midx.c
@@ -1111,16 +1111,29 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		for (i = 0; i < ctx.m->num_packs; i++) {
 			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);

-			if (prepare_midx_pack(the_repository, ctx.m, i)) {
-				error(_("could not load pack"));
-				result = 1;
-				goto cleanup;
-			}
-
 			ctx.info[ctx.nr].orig_pack_int_id = i;
 			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
-			ctx.info[ctx.nr].p = ctx.m->packs[i];
+			ctx.info[ctx.nr].p = NULL;
 			ctx.info[ctx.nr].expired = 0;
+
+			if (flags & MIDX_WRITE_REV_INDEX) {
+				/*
+				 * If generating a reverse index, need to have
+				 * packed_git's loaded to compare their
+				 * mtimes and object count.
+				 */
+				if (prepare_midx_pack(the_repository, ctx.m, i)) {
+					error(_("could not load pack"));
+					result = 1;
+					goto cleanup;
+				}
+
+				if (open_pack_index(ctx.m->packs[i]))
+					die(_("could not open index for %s"),
+					    ctx.m->packs[i]->pack_name);
+				ctx.info[ctx.nr].p = ctx.m->packs[i];
+			}
+
 			ctx.nr++;
 		}
 	}

--- >8 ---

> > @@ -1075,9 +1271,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
> >  	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
> >
> > -	if (ctx.m)
> > -		close_midx(ctx.m);
> > -
> >  	if (ctx.nr - dropped_packs == 0) {
> >  		error(_("no pack files to index."));
> >  		result = 1;
>
> I'm not sure what this hunk is doing. We do pick up the close_midx()
> call at the end of the function, amidst the other cleanup.
>
> I expect the answer is something like "we need it open when we generate
> the bitmaps". But it makes me wonder if we could hit any cases where we
> try to overwrite it while it's still open, which would cause problems on
> Windows.

The reason is kind of annoying. If we're building a MIDX bitmap
in-process (e.g., `git multi-pack-index write --bitmap`) then we'll call
prepare_packed_git() to build our pseudo-packing list which we pass to
the bitmap generation machinery.

But prepare_packed_git() calls prepare_packed_git_one() ->
for_each_file_in_pack_dir() with the prepare_pack() callback -> which
the wants to see if the MIDX we have open already knows about a given
pack so we avoid opening it twice.

But even though the MIDX would have gone away by this point (with the
previous close_midx() call that is removed above), we still hold onto
a pointer to it via the object_store's `multi_pack_index` pointer. And
then all the way down in packfile.c:prepare_pack() we try to pass a
now-defunct pointer as the first argument to midx_contains_pack(), and
crash.

And clearing out that `multi_pack_index` pointer is tricky, because the
MIDX would have to compare the odb's `object_dir` with its own (which is
brittle in its own right), but also would have to see if that object
store is pointing at *it*, and not some other MIDX.

So we do have to keep it open there. Which makes me wonder how this
could possibly work on Windows, because holding the MIDX open will make
the commit_lock_file() definitely fail. But it seems OK in the
Windows-based CI runs?

Puzzled.

Thanks,
Taylor
Taylor Blau July 26, 2021, 6:23 p.m. UTC | #5
On Mon, Jul 26, 2021 at 02:12:30PM -0400, Taylor Blau wrote:
> So we do have to keep it open there. Which makes me wonder how this
> could possibly work on Windows, because holding the MIDX open will make
> the commit_lock_file() definitely fail. But it seems OK in the
> Windows-based CI runs?
>
> Puzzled.

The below should do the trick; it'll keep the MIDX open just long enough
to generate a bitmap (if one was requested), but will close any
handle(s) on an existing MIDX right before we move the temporary file
into place.

It has the added benefit of making that hunk about destroying stale
references to packs be unnecessary.

Watching the Actions run here to see how this runs on Windows:

    https://github.com/ttaylorr/git/actions/runs/1068457013

Below is the patch.

--- >8 ---

commit c7b7ce0ebc793e311072929772a2d352600f3d54
Author: Taylor Blau <me@ttaylorr.com>
Date:   Mon Jul 26 14:17:27 2021 -0400

    fixup! pack-bitmap: write multi-pack bitmaps

diff --git a/midx.c b/midx.c
index 76c94a0df2..297627f992 100644
--- a/midx.c
+++ b/midx.c
@@ -1358,6 +1358,8 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		}
 	}

+	close_midx(ctx.m);
+
 	commit_lock_file(&lk);

 	clear_midx_files_ext(the_repository, ".bitmap", midx_hash);
@@ -1368,15 +1370,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (ctx.info[i].p) {
 			close_pack(ctx.info[i].p);
 			free(ctx.info[i].p);
-			if (ctx.m) {
-				/*
-				 * Destroy a stale reference to the pack in
-				 * 'ctx.m'.
-				 */
-				uint32_t orig = ctx.info[i].orig_pack_int_id;
-				if (orig < ctx.m->num_packs)
-					ctx.m->packs[orig] = NULL;
-			}
 		}
 		free(ctx.info[i].pack_name);
 	}
@@ -1386,7 +1379,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	free(ctx.pack_perm);
 	free(ctx.pack_order);
 	free(midx_name);
-	close_midx(ctx.m);

 	return result;
 }
Jeff King July 27, 2021, 5:11 p.m. UTC | #6
On Mon, Jul 26, 2021 at 02:12:30PM -0400, Taylor Blau wrote:

> > This "> -1" struck me as a little bit funny. Perhaps ">= 0" would be a
> > more obvious way of saying "we found it"?
> 
> Sure. (I looked for other uses of oid_pos() to see what is more
> common, but there really are vanishingly few uses.) Easier to read might
> even be:
> 
>     int pos = oid_pos(...);
>     if (pos < 0)
>       return;
>     ALLOC_GROW(...);
> 
> which is what I ended up going for.

Sure, that's better still.

> [topo-sorting commits fed to bitmap writer]
>
> This comes from an investigation into why bitmap coverage had worsened
> for some repositories using MIDX bitmaps at GitHub. The real reason was
> resolved and unrelated to this, but trying to match the behavior of MIDX
> bitmaps to our existing pack bitmap setup (which uses delta-islands) was
> one strategy we tried while debugging.
>
> I actually suspect that it doesn't really matter what order we feed this
> list to bitmap_writer_select_commits() in, because the first thing that
> it does is QSORT() the incoming list of commits in date order.

Hmm, yes, I agree that it shouldn't matter for that reason (though
arguably topo order would still be better than a strict date order, it
does nothing now).

I remember looking into reasons why a single-pack bitmap versus a
midx-of-a-single-pack bitmap might not be identical, but the interesting
things turned out to be elsewhere. Did this actually change anything at
all? If so, then perhaps the "it doesn't matter" is not as true as we
are thinking. I could believe that it has a tiny impact when breaking
times between identical committer dates, though.

> But it does mirror the behavior of our previous bitmap generation
> settings, which has been running for years.
> 
> So... we could probably drop this hunk? I'd probably rather err on the
> safe side and leave this alone since it matches a system that we know to
> work well in practice.

I'd rather drop it, if we think it's doing nothing. While I do value
history in production as a sign of stability, upstream review is a good
time to make sure we understand all of the "why", and to clean things up
(e.g., another example is the questionable close_midx() stuff discussed
elsewhere).

And if we do suspect it is doing something, then IMHO the right thing is
probably still to drop it, and to introduce the feature identically to
both the midx and pack bitmap generation code paths. But that should be
a separate topic (and may actually involve fixing the QSORT to put
things in topo order rather than just date order).

> > > @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> > >  		for (i = 0; i < ctx.m->num_packs; i++) {
> > >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> > >
> > > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > > +				error(_("could not load pack %s"),
> > > +				      ctx.m->pack_names[i]);
> > > +				result = 1;
> > > +				goto cleanup;
> > > +			}
> >
> > It might be worth a comment here. I can easily believe that there is
> > some later part of the bitmap generation code that assumes the packs are
> > loaded. But somebody reading this is not likely to understand why it's
> > here.
> >
> > Should this be done conditionally only if we're writing a bitmap? (That
> > might also make it obvious why we are doing it).
> 
> Ah. Actually, I don't think this was necessary before, but we *do* need
> it now because we want to compare the pack mtime's for inferring a
> preferred pack when one wasn't given. And we also need to open the pack
> indexes, too, because we care about the object counts (to make sure that
> we don't infer a preferred pack which has no objects).
> 
> Luckily, any new packs will be loaded (and likewise have their indexes
> open, too), via the the add_pack_to_midx() callback that we pass as an
> argument to for_each_file_in_pack_dir().

Hmm, OK. Your second paragraph makes it sound like we _don't_ need to do
this. But the key is "new packs". In add_pack_to_midx() we skip any
packs that are already in the existing midx, assuming they've already
been added. And we probably must do that, otherwise we end up with
duplicate structs that are not actually shared by ctx->m.

> But we could do something like this instead:
> 
> --- 8< ---
> 
> diff --git a/midx.c b/midx.c
> index 8426e1a0b1..a70a6bca81 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -1111,16 +1111,29 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> 
> -			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> -				error(_("could not load pack"));
> -				result = 1;
> -				goto cleanup;
> -			}
> -
>  			ctx.info[ctx.nr].orig_pack_int_id = i;
>  			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
> -			ctx.info[ctx.nr].p = ctx.m->packs[i];
> +			ctx.info[ctx.nr].p = NULL;
>  			ctx.info[ctx.nr].expired = 0;
> +
> +			if (flags & MIDX_WRITE_REV_INDEX) {
> +				/*
> +				 * If generating a reverse index, need to have
> +				 * packed_git's loaded to compare their
> +				 * mtimes and object count.
> +				 */
> +				if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +					error(_("could not load pack"));
> +					result = 1;
> +					goto cleanup;
> +				}
> +
> +				if (open_pack_index(ctx.m->packs[i]))
> +					die(_("could not open index for %s"),
> +					    ctx.m->packs[i]->pack_name);
> +				ctx.info[ctx.nr].p = ctx.m->packs[i];
> +			}
> +

Yeah, was what I was suggesting: make it conditional on bitmaps (well, a
.rev index, which is more precise), and put in comments. :)

It's interesting that your earlier iteration didn't call
open_pack_index(). Is it necessary, or not? From your description, it
seems like it should be. But maybe some later step lazy-loads it? Even
if so, I can see how prepare_midx_pack() would still be required
(because we want to make sure we are using the same struct).

> [closing the midx]
> 
> The reason is kind of annoying. If we're building a MIDX bitmap
> in-process (e.g., `git multi-pack-index write --bitmap`) then we'll call
> prepare_packed_git() to build our pseudo-packing list which we pass to
> the bitmap generation machinery.
> 
> But prepare_packed_git() calls prepare_packed_git_one() ->
> for_each_file_in_pack_dir() with the prepare_pack() callback -> which
> the wants to see if the MIDX we have open already knows about a given
> pack so we avoid opening it twice.
> 
> But even though the MIDX would have gone away by this point (with the
> previous close_midx() call that is removed above), we still hold onto
> a pointer to it via the object_store's `multi_pack_index` pointer. And
> then all the way down in packfile.c:prepare_pack() we try to pass a
> now-defunct pointer as the first argument to midx_contains_pack(), and
> crash.
> 
> And clearing out that `multi_pack_index` pointer is tricky, because the
> MIDX would have to compare the odb's `object_dir` with its own (which is
> brittle in its own right), but also would have to see if that object
> store is pointing at *it*, and not some other MIDX.
> 
> So we do have to keep it open there. Which makes me wonder how this
> could possibly work on Windows, because holding the MIDX open will make
> the commit_lock_file() definitely fail. But it seems OK in the
> Windows-based CI runs?

Forgetting Windows for a moment, it seems like the same "the pointer is
bogus and we crash" is now just pushed down to the end of the function,
rather than the middle. I.e., it is safe to close the midx we got from
load_multi_pack_index(), but not one we got from
prepare_multi_pack_index_one(). So it's hard to reason about this at all
until that problem from patch 08 is resolved.

-Peff
Taylor Blau July 27, 2021, 8:33 p.m. UTC | #7
On Tue, Jul 27, 2021 at 01:11:25PM -0400, Jeff King wrote:
> On Mon, Jul 26, 2021 at 02:12:30PM -0400, Taylor Blau wrote:
> > But it does mirror the behavior of our previous bitmap generation
> > settings, which has been running for years.
> >
> > So... we could probably drop this hunk? I'd probably rather err on the
> > safe side and leave this alone since it matches a system that we know to
> > work well in practice.
>
> I'd rather drop it, if we think it's doing nothing. While I do value
> history in production as a sign of stability, upstream review is a good
> time to make sure we understand all of the "why", and to clean things up
> (e.g., another example is the questionable close_midx() stuff discussed
> elsewhere).

OK, I think that's a very reasonable way of thinking about it, so I'd
rather just get rid of it (not to mention that I really doubt it's doing
much of anything in the first place).

> > Luckily, any new packs will be loaded (and likewise have their indexes
> > open, too), via the the add_pack_to_midx() callback that we pass as an
> > argument to for_each_file_in_pack_dir().
>
> Hmm, OK. Your second paragraph makes it sound like we _don't_ need to do
> this. But the key is "new packs". In add_pack_to_midx() we skip any
> packs that are already in the existing midx, assuming they've already
> been added. And we probably must do that, otherwise we end up with
> duplicate structs that are not actually shared by ctx->m.

Exactly.

> It's interesting that your earlier iteration didn't call
> open_pack_index(). Is it necessary, or not? From your description, it
> seems like it should be. But maybe some later step lazy-loads it? Even
> if so, I can see how prepare_midx_pack() would still be required
> (because we want to make sure we are using the same struct).

It's only necessary now (at least for determining a preferred pack if
the caller didn't specify one with `--preferred-pack`) because we care
about reading the `num_objects` field, which the index must be loaded
for.

Thanks,
Taylor
Jeff King July 28, 2021, 5:52 p.m. UTC | #8
On Tue, Jul 27, 2021 at 04:33:01PM -0400, Taylor Blau wrote:

> > It's interesting that your earlier iteration didn't call
> > open_pack_index(). Is it necessary, or not? From your description, it
> > seems like it should be. But maybe some later step lazy-loads it? Even
> > if so, I can see how prepare_midx_pack() would still be required
> > (because we want to make sure we are using the same struct).
> 
> It's only necessary now (at least for determining a preferred pack if
> the caller didn't specify one with `--preferred-pack`) because we care
> about reading the `num_objects` field, which the index must be loaded
> for.

I guess I'm a little confused about "now" in your sentence. I understand
that it's not necessary before your series to have loaded all of the
index files ahead of time. But didn't we need to do so in v2 of your
series, which has the preferred-pack logic?

If so, then was the v2 version buggy, since it only called
prepare_midx_pack() and not open_pack_index()? And then v3 is fixing
that? Or is something else opening the pack index for us?

-Peff
Taylor Blau July 29, 2021, 7:33 p.m. UTC | #9
On Wed, Jul 28, 2021 at 01:52:37PM -0400, Jeff King wrote:
> On Tue, Jul 27, 2021 at 04:33:01PM -0400, Taylor Blau wrote:
>
> > > It's interesting that your earlier iteration didn't call
> > > open_pack_index(). Is it necessary, or not? From your description, it
> > > seems like it should be. But maybe some later step lazy-loads it? Even
> > > if so, I can see how prepare_midx_pack() would still be required
> > > (because we want to make sure we are using the same struct).
> >
> > It's only necessary now (at least for determining a preferred pack if
> > the caller didn't specify one with `--preferred-pack`) because we care
> > about reading the `num_objects` field, which the index must be loaded
> > for.
>
> I guess I'm a little confused about "now" in your sentence. I understand
> that it's not necessary before your series to have loaded all of the
> index files ahead of time. But didn't we need to do so in v2 of your
> series, which has the preferred-pack logic?
>
> If so, then was the v2 version buggy, since it only called
> prepare_midx_pack() and not open_pack_index()? And then v3 is fixing
> that? Or is something else opening the pack index for us?

In earlier versions of this series, I don't think we needed to have the
indexes loaded by this point, since (before v3) we didn't care about
ignoring the empty packs when finding a default preferred-pack.

But now we do, and so we need to call open_pack_index() ourselves.
Confusingly, we only need to do that on packs that *are* included in the
MIDX, since prepare_midx_pack() doesn't do it for us, but
add_pack_to_midx() does.

Thanks,
Taylor
Jeff King Aug. 12, 2021, 8 p.m. UTC | #10
On Thu, Jul 29, 2021 at 03:33:18PM -0400, Taylor Blau wrote:

> > > It's only necessary now (at least for determining a preferred pack if
> > > the caller didn't specify one with `--preferred-pack`) because we care
> > > about reading the `num_objects` field, which the index must be loaded
> > > for.
> >
> > I guess I'm a little confused about "now" in your sentence. I understand
> > that it's not necessary before your series to have loaded all of the
> > index files ahead of time. But didn't we need to do so in v2 of your
> > series, which has the preferred-pack logic?
> >
> > If so, then was the v2 version buggy, since it only called
> > prepare_midx_pack() and not open_pack_index()? And then v3 is fixing
> > that? Or is something else opening the pack index for us?
> 
> In earlier versions of this series, I don't think we needed to have the
> indexes loaded by this point, since (before v3) we didn't care about
> ignoring the empty packs when finding a default preferred-pack.
> 
> But now we do, and so we need to call open_pack_index() ourselves.
> Confusingly, we only need to do that on packs that *are* included in the
> MIDX, since prepare_midx_pack() doesn't do it for us, but
> add_pack_to_midx() does.

Ah, that was the part I was missing: the default preferred-pack stuff is
only in v3. That makes sense.

-Peff
diff mbox series

Patch

diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt
index ffd601bc17..ada14deb2c 100644
--- a/Documentation/git-multi-pack-index.txt
+++ b/Documentation/git-multi-pack-index.txt
@@ -10,7 +10,7 @@  SYNOPSIS
 --------
 [verse]
 'git multi-pack-index' [--object-dir=<dir>] [--[no-]progress]
-	[--preferred-pack=<pack>] <subcommand>
+	[--preferred-pack=<pack>] [--[no-]bitmap] <subcommand>
 
 DESCRIPTION
 -----------
@@ -40,6 +40,9 @@  write::
 		multiple packs contain the same object. If not given,
 		ties are broken in favor of the pack with the lowest
 		mtime.
+
+	--[no-]bitmap::
+		Control whether or not a multi-pack bitmap is written.
 --
 
 verify::
@@ -81,6 +84,13 @@  EXAMPLES
 $ git multi-pack-index write
 -----------------------------------------------
 
+* Write a MIDX file for the packfiles in the current .git folder with a
+corresponding bitmap.
++
+-------------------------------------------------------------
+$ git multi-pack-index write --preferred-pack <pack> --bitmap
+-------------------------------------------------------------
+
 * Write a MIDX file for the packfiles in an alternate object store.
 +
 -----------------------------------------------
diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 5d3ea445fd..bf6fa982e3 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -68,6 +68,8 @@  static int cmd_multi_pack_index_write(int argc, const char **argv)
 		OPT_STRING(0, "preferred-pack", &opts.preferred_pack,
 			   N_("preferred-pack"),
 			   N_("pack for reuse when computing a multi-pack bitmap")),
+		OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"),
+			MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX),
 		OPT_END(),
 	};
 
diff --git a/midx.c b/midx.c
index 752d36c57f..a58cca707b 100644
--- a/midx.c
+++ b/midx.c
@@ -13,6 +13,10 @@ 
 #include "repository.h"
 #include "chunk-format.h"
 #include "pack.h"
+#include "pack-bitmap.h"
+#include "refs.h"
+#include "revision.h"
+#include "list-objects.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
@@ -885,6 +889,172 @@  static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
 static void clear_midx_files_ext(struct repository *r, const char *ext,
 				 unsigned char *keep_hash);
 
+static void prepare_midx_packing_data(struct packing_data *pdata,
+				      struct write_midx_context *ctx)
+{
+	uint32_t i;
+
+	memset(pdata, 0, sizeof(struct packing_data));
+	prepare_packing_data(the_repository, pdata);
+
+	for (i = 0; i < ctx->entries_nr; i++) {
+		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
+		struct object_entry *to = packlist_alloc(pdata, &from->oid);
+
+		oe_set_in_pack(pdata, to,
+			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
+	}
+}
+
+static int add_ref_to_pending(const char *refname,
+			      const struct object_id *oid,
+			      int flag, void *cb_data)
+{
+	struct rev_info *revs = (struct rev_info*)cb_data;
+	struct object *object;
+
+	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
+		warning("symbolic ref is dangling: %s", refname);
+		return 0;
+	}
+
+	object = parse_object_or_die(oid, refname);
+	if (object->type != OBJ_COMMIT)
+		return 0;
+
+	add_pending_object(revs, object, "");
+	if (bitmap_is_preferred_refname(revs->repo, refname))
+		object->flags |= NEEDS_BITMAP;
+	return 0;
+}
+
+struct bitmap_commit_cb {
+	struct commit **commits;
+	size_t commits_nr, commits_alloc;
+
+	struct write_midx_context *ctx;
+};
+
+static const struct object_id *bitmap_oid_access(size_t index,
+						 const void *_entries)
+{
+	const struct pack_midx_entry *entries = _entries;
+	return &entries[index].oid;
+}
+
+static void bitmap_show_commit(struct commit *commit, void *_data)
+{
+	struct bitmap_commit_cb *data = _data;
+	if (oid_pos(&commit->object.oid, data->ctx->entries,
+		    data->ctx->entries_nr,
+		    bitmap_oid_access) > -1) {
+		ALLOC_GROW(data->commits, data->commits_nr + 1,
+			   data->commits_alloc);
+		data->commits[data->commits_nr++] = commit;
+	}
+}
+
+static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
+						    struct write_midx_context *ctx)
+{
+	struct rev_info revs;
+	struct bitmap_commit_cb cb;
+
+	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
+	cb.ctx = ctx;
+
+	repo_init_revisions(the_repository, &revs, NULL);
+	for_each_ref(add_ref_to_pending, &revs);
+
+	/*
+	 * Skipping promisor objects here is intentional, since it only excludes
+	 * them from the list of reachable commits that we want to select from
+	 * when computing the selection of MIDX'd commits to receive bitmaps.
+	 *
+	 * Reachability bitmaps do require that their objects be closed under
+	 * reachability, but fetching any objects missing from promisors at this
+	 * point is too late. But, if one of those objects can be reached from
+	 * an another object that is included in the bitmap, then we will
+	 * complain later that we don't have reachability closure (and fail
+	 * appropriately).
+	 */
+	fetch_if_missing = 0;
+	revs.exclude_promisor_objects = 1;
+
+	/*
+	 * Pass selected commits in topo order to match the behavior of
+	 * pack-bitmaps when configured with delta islands.
+	 */
+	revs.topo_order = 1;
+	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;
+
+	if (prepare_revision_walk(&revs))
+		die(_("revision walk setup failed"));
+
+	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
+	if (indexed_commits_nr_p)
+		*indexed_commits_nr_p = cb.commits_nr;
+
+	return cb.commits;
+}
+
+static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
+			     struct write_midx_context *ctx,
+			     unsigned flags)
+{
+	struct packing_data pdata;
+	struct pack_idx_entry **index;
+	struct commit **commits = NULL;
+	uint32_t i, commits_nr;
+	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
+	int ret;
+
+	prepare_midx_packing_data(&pdata, ctx);
+
+	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
+
+	/*
+	 * Build the MIDX-order index based on pdata.objects (which is already
+	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
+	 * this order).
+	 */
+	ALLOC_ARRAY(index, pdata.nr_objects);
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
+	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
+
+	/*
+	 * bitmap_writer_finish expects objects in lex order, but pack_order
+	 * gives us exactly that. use it directly instead of re-sorting the
+	 * array.
+	 *
+	 * This changes the order of objects in 'index' between
+	 * bitmap_writer_build_type_index and bitmap_writer_finish.
+	 *
+	 * The same re-ordering takes place in the single-pack bitmap code via
+	 * write_idx_file(), which is called by finish_tmp_packfile(), which
+	 * happens between bitmap_writer_build_type_index() and
+	 * bitmap_writer_finish().
+	 */
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_select_commits(commits, commits_nr, -1);
+	ret = bitmap_writer_build(&pdata);
+	if (ret < 0)
+		goto cleanup;
+
+	bitmap_writer_set_checksum(midx_hash);
+	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
+
+cleanup:
+	free(index);
+	free(bitmap_name);
+	return ret;
+}
+
 static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
 			       struct string_list *packs_to_drop,
 			       const char *preferred_pack_name,
@@ -930,9 +1100,16 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		for (i = 0; i < ctx.m->num_packs; i++) {
 			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
 
+			if (prepare_midx_pack(the_repository, ctx.m, i)) {
+				error(_("could not load pack %s"),
+				      ctx.m->pack_names[i]);
+				result = 1;
+				goto cleanup;
+			}
+
 			ctx.info[ctx.nr].orig_pack_int_id = i;
 			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
-			ctx.info[ctx.nr].p = NULL;
+			ctx.info[ctx.nr].p = ctx.m->packs[i];
 			ctx.info[ctx.nr].expired = 0;
 			ctx.nr++;
 		}
@@ -947,8 +1124,26 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
 	stop_progress(&ctx.progress);
 
-	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
-		goto cleanup;
+	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) {
+		struct bitmap_index *bitmap_git;
+		int bitmap_exists;
+		int want_bitmap = flags & MIDX_WRITE_BITMAP;
+
+		bitmap_git = prepare_bitmap_git(the_repository);
+		bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git);
+		free_bitmap_index(bitmap_git);
+
+		if (bitmap_exists || !want_bitmap) {
+			/*
+			 * The correct MIDX already exists, and so does a
+			 * corresponding bitmap (or one wasn't requested).
+			 */
+			if (!want_bitmap)
+				clear_midx_files_ext(the_repository, ".bitmap",
+						     NULL);
+			goto cleanup;
+		}
+	}
 
 	if (preferred_pack_name) {
 		int found = 0;
@@ -964,7 +1159,8 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (!found)
 			warning(_("unknown preferred pack: '%s'"),
 				preferred_pack_name);
-	} else if (ctx.nr && (flags & MIDX_WRITE_REV_INDEX)) {
+	} else if (ctx.nr &&
+		   (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))) {
 		time_t oldest = ctx.info[0].p->mtime;
 		ctx.preferred_pack_idx = 0;
 
@@ -1075,9 +1271,6 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
 	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
 
-	if (ctx.m)
-		close_midx(ctx.m);
-
 	if (ctx.nr - dropped_packs == 0) {
 		error(_("no pack files to index."));
 		result = 1;
@@ -1108,14 +1301,22 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 	free_chunkfile(cf);
 
-	if (flags & MIDX_WRITE_REV_INDEX)
+	if (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))
 		ctx.pack_order = midx_pack_order(&ctx);
 
 	if (flags & MIDX_WRITE_REV_INDEX)
 		write_midx_reverse_index(midx_name, midx_hash, &ctx);
+	if (flags & MIDX_WRITE_BITMAP) {
+		if (write_midx_bitmap(midx_name, midx_hash, &ctx, flags) < 0) {
+			error(_("could not write multi-pack bitmap"));
+			result = 1;
+			goto cleanup;
+		}
+	}
 
 	commit_lock_file(&lk);
 
+	clear_midx_files_ext(the_repository, ".bitmap", midx_hash);
 	clear_midx_files_ext(the_repository, ".rev", midx_hash);
 
 cleanup:
@@ -1123,6 +1324,15 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (ctx.info[i].p) {
 			close_pack(ctx.info[i].p);
 			free(ctx.info[i].p);
+			if (ctx.m) {
+				/*
+				 * Destroy a stale reference to the pack in
+				 * 'ctx.m'.
+				 */
+				uint32_t orig = ctx.info[i].orig_pack_int_id;
+				if (orig < ctx.m->num_packs)
+					ctx.m->packs[orig] = NULL;
+			}
 		}
 		free(ctx.info[i].pack_name);
 	}
@@ -1132,6 +1342,9 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	free(ctx.pack_perm);
 	free(ctx.pack_order);
 	free(midx_name);
+	if (ctx.m)
+		close_midx(ctx.m);
+
 	return result;
 }
 
@@ -1193,6 +1406,7 @@  void clear_midx_file(struct repository *r)
 	if (remove_path(midx))
 		die(_("failed to clear multi-pack-index at %s"), midx);
 
+	clear_midx_files_ext(r, ".bitmap", NULL);
 	clear_midx_files_ext(r, ".rev", NULL);
 
 	free(midx);
diff --git a/midx.h b/midx.h
index 1172df1a71..350f4d0a7b 100644
--- a/midx.h
+++ b/midx.h
@@ -41,6 +41,7 @@  struct multi_pack_index {
 
 #define MIDX_PROGRESS     (1 << 0)
 #define MIDX_WRITE_REV_INDEX (1 << 1)
+#define MIDX_WRITE_BITMAP (1 << 2)
 
 const unsigned char *get_midx_checksum(struct multi_pack_index *m);
 char *get_midx_filename(const char *object_dir);