diff mbox series

[v2,14/24] pack-bitmap: write multi-pack bitmaps

Message ID a8cec2463d0993b1118abdd31cb6c9e88a32e0c4.1624314293.git.me@ttaylorr.com (mailing list archive)
State New
Headers show
Series multi-pack reachability bitmaps | expand

Commit Message

Taylor Blau June 21, 2021, 10:25 p.m. UTC
Write multi-pack bitmaps in the format described by
Documentation/technical/bitmap-format.txt, inferring their presence with
the absence of '--bitmap'.

To write a multi-pack bitmap, this patch attempts to reuse as much of
the existing machinery from pack-objects as possible. Specifically, the
MIDX code prepares a packing_data struct that pretends as if a single
packfile has been generated containing all of the objects contained
within the MIDX.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-multi-pack-index.txt |  12 +-
 builtin/multi-pack-index.c             |   2 +
 midx.c                                 | 230 ++++++++++++++++++++++++-
 midx.h                                 |   1 +
 4 files changed, 236 insertions(+), 9 deletions(-)

Comments

Ævar Arnfjörð Bjarmason June 24, 2021, 11:45 p.m. UTC | #1
On Mon, Jun 21 2021, Taylor Blau wrote:

> Write multi-pack bitmaps in the format described by
> Documentation/technical/bitmap-format.txt, inferring their presence with
> the absence of '--bitmap'.
>
> To write a multi-pack bitmap, this patch attempts to reuse as much of
> the existing machinery from pack-objects as possible. Specifically, the
> MIDX code prepares a packing_data struct that pretends as if a single
> packfile has been generated containing all of the objects contained
> within the MIDX.
>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  Documentation/git-multi-pack-index.txt |  12 +-
>  builtin/multi-pack-index.c             |   2 +
>  midx.c                                 | 230 ++++++++++++++++++++++++-
>  midx.h                                 |   1 +
>  4 files changed, 236 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt
> index ffd601bc17..ada14deb2c 100644
> --- a/Documentation/git-multi-pack-index.txt
> +++ b/Documentation/git-multi-pack-index.txt
> @@ -10,7 +10,7 @@ SYNOPSIS
>  --------
>  [verse]
>  'git multi-pack-index' [--object-dir=<dir>] [--[no-]progress]
> -	[--preferred-pack=<pack>] <subcommand>
> +	[--preferred-pack=<pack>] [--[no-]bitmap] <subcommand>
>  
>  DESCRIPTION
>  -----------
> @@ -40,6 +40,9 @@ write::
>  		multiple packs contain the same object. If not given,
>  		ties are broken in favor of the pack with the lowest
>  		mtime.
> +
> +	--[no-]bitmap::
> +		Control whether or not a multi-pack bitmap is written.
>  --
>  
>  verify::
> @@ -81,6 +84,13 @@ EXAMPLES
>  $ git multi-pack-index write
>  -----------------------------------------------
>  
> +* Write a MIDX file for the packfiles in the current .git folder with a
> +corresponding bitmap.
> ++
> +-------------------------------------------------------------
> +$ git multi-pack-index write --preferred-pack <pack> --bitmap
> +-------------------------------------------------------------
> +

I wondered if this was a <pack> positional argument, but it's just the
argument for --preferred-pack, even though the synopsis uses the "="
style for it. Even if parse-options.c is loose about it, let's use one
or the other in examples consistently.

>  * Write a MIDX file for the packfiles in an alternate object store.
>  +
>  -----------------------------------------------
> diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
> index 5d3ea445fd..bf6fa982e3 100644
> --- a/builtin/multi-pack-index.c
> +++ b/builtin/multi-pack-index.c
> @@ -68,6 +68,8 @@ static int cmd_multi_pack_index_write(int argc, const char **argv)
>  		OPT_STRING(0, "preferred-pack", &opts.preferred_pack,
>  			   N_("preferred-pack"),
>  			   N_("pack for reuse when computing a multi-pack bitmap")),
> +		OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"),
> +			MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX),
>  		OPT_END(),
>  	};
>  
> diff --git a/midx.c b/midx.c
> index 752d36c57f..a58cca707b 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -13,6 +13,10 @@
>  #include "repository.h"
>  #include "chunk-format.h"
>  #include "pack.h"
> +#include "pack-bitmap.h"
> +#include "refs.h"
> +#include "revision.h"
> +#include "list-objects.h"
>  
>  #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
>  #define MIDX_VERSION 1
> @@ -885,6 +889,172 @@ static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
>  static void clear_midx_files_ext(struct repository *r, const char *ext,
>  				 unsigned char *keep_hash);
>  
> +static void prepare_midx_packing_data(struct packing_data *pdata,
> +				      struct write_midx_context *ctx)
> +{
> +	uint32_t i;
> +
> +	memset(pdata, 0, sizeof(struct packing_data));

We initialize this on the stack in write_midx_bitmap(), shouldn't we
just there do:

    struct packing_data pdata = {0}

Instead of:

    struct packing_data pdata;

And then doing this memset() here?

> +	prepare_packing_data(the_repository, pdata);
> +
> +	for (i = 0; i < ctx->entries_nr; i++) {
> +		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
> +		struct object_entry *to = packlist_alloc(pdata, &from->oid);
> +
> +		oe_set_in_pack(pdata, to,
> +			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
> +	}
> +}
> +
> +static int add_ref_to_pending(const char *refname,
> +			      const struct object_id *oid,
> +			      int flag, void *cb_data)
> +{
> +	struct rev_info *revs = (struct rev_info*)cb_data;
> +	struct object *object;
> +
> +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {

Just since I'd mentioned HAS_MULTI_BITS() offhand on another patch of
yours, it's for cases like this, so:

-    if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
+    if (HAS_MULTI_BITS(flag & (REF_ISSYMREF|REF_ISBROKEN)) {

Saves you 3 bytes of code:) Anyway, you don't need to use it, just an
intresting function... :)

> +{
> +	struct rev_info revs;
> +	struct bitmap_commit_cb cb;
> +
> +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));

Another case of s/memset/"= {0}"/g ?

> +	cb.ctx = ctx;
> +
> +	repo_init_revisions(the_repository, &revs, NULL);
> +	for_each_ref(add_ref_to_pending, &revs);
> +
> +	/*
> +	 * Skipping promisor objects here is intentional, since it only excludes
> +	 * them from the list of reachable commits that we want to select from
> +	 * when computing the selection of MIDX'd commits to receive bitmaps.
> +	 *
> +	 * Reachability bitmaps do require that their objects be closed under
> +	 * reachability, but fetching any objects missing from promisors at this
> +	 * point is too late. But, if one of those objects can be reached from
> +	 * an another object that is included in the bitmap, then we will
> +	 * complain later that we don't have reachability closure (and fail
> +	 * appropriately).
> +	 */
> +	fetch_if_missing = 0;
> +	revs.exclude_promisor_objects = 1;
> +
> +	/*
> +	 * Pass selected commits in topo order to match the behavior of
> +	 * pack-bitmaps when configured with delta islands.
> +	 */
> +	revs.topo_order = 1;
> +	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;
> +
> +	if (prepare_revision_walk(&revs))
> +		die(_("revision walk setup failed"));
> +
> +	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
> +	if (indexed_commits_nr_p)
> +		*indexed_commits_nr_p = cb.commits_nr;
> +
> +	return cb.commits;
> +}
> +
> +static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
> +			     struct write_midx_context *ctx,
> +			     unsigned flags)
> +{
> +	struct packing_data pdata;
> +	struct pack_idx_entry **index;
> +	struct commit **commits = NULL;
> +	uint32_t i, commits_nr;
> +	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
> +	int ret;
> +
> +	prepare_midx_packing_data(&pdata, ctx);
> +
> +	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
> +
> +	/*
> +	 * Build the MIDX-order index based on pdata.objects (which is already
> +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> +	 * this order).
> +	 */
> +	ALLOC_ARRAY(index, pdata.nr_objects);
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
> +	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
> +
> +	/*
> +	 * bitmap_writer_finish expects objects in lex order, but pack_order
> +	 * gives us exactly that. use it directly instead of re-sorting the
> +	 * array.
> +	 *
> +	 * This changes the order of objects in 'index' between
> +	 * bitmap_writer_build_type_index and bitmap_writer_finish.
> +	 *
> +	 * The same re-ordering takes place in the single-pack bitmap code via
> +	 * write_idx_file(), which is called by finish_tmp_packfile(), which
> +	 * happens between bitmap_writer_build_type_index() and
> +	 * bitmap_writer_finish().
> +	 */
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_select_commits(commits, commits_nr, -1);
> +	ret = bitmap_writer_build(&pdata);
> +	if (ret < 0)
> +		goto cleanup;
> +
> +	bitmap_writer_set_checksum(midx_hash);
> +	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
> +
> +cleanup:
> +	free(index);
> +	free(bitmap_name);
> +	return ret;
> +}
> +
>  static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
>  			       struct string_list *packs_to_drop,
>  			       const char *preferred_pack_name,
> @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
>  
> +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +				error(_("could not load pack %s"),
> +				      ctx.m->pack_names[i]);

Isn't the prepare_midx_pack() tasked with populating that pack_names[i]
that you can't load (the strbuf_addf() it does), but it can also exit
before that, do we get an empty string here then? Maybe I'm misreading
it (I haven't run this, just skimmed the code).

> @@ -1132,6 +1342,9 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	free(ctx.pack_perm);
>  	free(ctx.pack_order);
>  	free(midx_name);
> +	if (ctx.m)
> +		close_midx(ctx.m);
> +

I see Stolee made close_midx() just return silently if !ctx.m in
1dcd9f2043a (midx: close multi-pack-index on repack, 2018-10-12), but
grepping the uses of it it seems calls to it are similarly guarded by
"if"'s.

Just a nit, weird to have a free-like function not invoked like
free. Perhaps (and maybe better for an unrelated cleanup) to either drop
the conditionals, or make it BUG() if it's called with NULL, but at
least we should pick one :)
Taylor Blau July 15, 2021, 2:33 p.m. UTC | #2
On Fri, Jun 25, 2021 at 01:45:46AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
> On Mon, Jun 21 2021, Taylor Blau wrote:
> > +* Write a MIDX file for the packfiles in the current .git folder with a
> > +corresponding bitmap.
> > ++
> > +-------------------------------------------------------------
> > +$ git multi-pack-index write --preferred-pack <pack> --bitmap
> > +-------------------------------------------------------------
> > +
>
> I wondered if this was a <pack> positional argument, but it's just the
> argument for --preferred-pack, even though the synopsis uses the "="
> style for it. Even if parse-options.c is loose about it, let's use one
> or the other in examples consistently.

The example below (for writing a MIDX in an alternate object store)
doesn't include the '=', but probably would be clearer if it did. I
think it's a good suggestion, though, so I'll fix up my addition here.

> > +	memset(pdata, 0, sizeof(struct packing_data));
>
> We initialize this on the stack in write_midx_bitmap(), shouldn't we
> just there do:
>
>     struct packing_data pdata = {0}
>
> Instead of:
>
>     struct packing_data pdata;
>
> And then doing this memset() here?

I could go either way. Part of me prefers the memset() since it lets
callers of prepare_midx_packing_data() pass in anything they want,
including a pointer to uninitialized memory. Of course, there is only
one such caller, so it probably doesn't really matter.

And the other caller of prepare_packing_data() which is in
builtin/pack-objects.c operates on a pointer to a statically allocated
variable, so its bytes are already zero'd.

I don't feel strongly about it, though, so I'd just as soon err on the
side of flexibility than changing the declaration.

> > +{
> > +	struct rev_info revs;
> > +	struct bitmap_commit_cb cb;
> > +
> > +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
>
> Another case of s/memset/"= {0}"/g ?

Ah, in this case I'd prefer the aggregate-style initialization, since
we're zero-ing it out in the same function.

> >  static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
> >  			       struct string_list *packs_to_drop,
> >  			       const char *preferred_pack_name,
> > @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  		for (i = 0; i < ctx.m->num_packs; i++) {
> >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> >
> > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > +				error(_("could not load pack %s"),
> > +				      ctx.m->pack_names[i]);
>
> Isn't the prepare_midx_pack() tasked with populating that pack_names[i]
> that you can't load (the strbuf_addf() it does), but it can also exit
> before that, do we get an empty string here then? Maybe I'm misreading
> it (I haven't run this, just skimmed the code).

Nice catch, we can't rely on ctx->m.pack_names[i] being safe to read
(and at the same time know that we're going to get a non-empty string).

Since prepare_midx_pack() can fail because the pack itself couldn't be
loaded, I think the easiest thing to do here is to just opaquely say
"could not load pack" without adding any pack name that we may or may
not have.

> > @@ -1132,6 +1342,9 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  	free(ctx.pack_perm);
> >  	free(ctx.pack_order);
> >  	free(midx_name);
> > +	if (ctx.m)
> > +		close_midx(ctx.m);
> > +
>
> I see Stolee made close_midx() just return silently if !ctx.m in
> 1dcd9f2043a (midx: close multi-pack-index on repack, 2018-10-12), but
> grepping the uses of it it seems calls to it are similarly guarded by
> "if"'s.
>
> Just a nit, weird to have a free-like function not invoked like
> free. Perhaps (and maybe better for an unrelated cleanup) to either drop
> the conditionals, or make it BUG() if it's called with NULL, but at
> least we should pick one :)

I agree with the direction of 1dcd9f2043a, so I'm happy to just drop the
conditional and call close_midx() with an argument that may or may not
be NULL.

Thanks,
Taylor
Jeff King July 21, 2021, 12:09 p.m. UTC | #3
On Mon, Jun 21, 2021 at 06:25:34PM -0400, Taylor Blau wrote:

> +static int add_ref_to_pending(const char *refname,
> +			      const struct object_id *oid,
> +			      int flag, void *cb_data)
> +{
> +	struct rev_info *revs = (struct rev_info*)cb_data;
> +	struct object *object;
> +
> +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
> +		warning("symbolic ref is dangling: %s", refname);
> +		return 0;
> +	}
> +
> +	object = parse_object_or_die(oid, refname);
> +	if (object->type != OBJ_COMMIT)
> +		return 0;
> +
> +	add_pending_object(revs, object, "");
> +	if (bitmap_is_preferred_refname(revs->repo, refname))
> +		object->flags |= NEEDS_BITMAP;
> +	return 0;
> +}

OK, so we'll look at each ref to get the set of commits that we want to
traverse to put into the bitmap. Which is roughly the same as what the
pack bitmap does. We only generate bitmaps for all-into-one repacks, so
it is traversing all of the reachable objects. It is a little different
in that the pack version is probably hitting reflogs, but IMHO we are
better off to ignore reflogs for the purposes of bitmaps (I would
suggest to do so in the pack-bitmap case, too, except that it is
combined with the "what to pack" traversal there, and by the time we see
each commit we don't know how we got there).

> +struct bitmap_commit_cb {
> +	struct commit **commits;
> +	size_t commits_nr, commits_alloc;
> +
> +	struct write_midx_context *ctx;
> +};
> +
> +static const struct object_id *bitmap_oid_access(size_t index,
> +						 const void *_entries)
> +{
> +	const struct pack_midx_entry *entries = _entries;
> +	return &entries[index].oid;
> +}
> +
> +static void bitmap_show_commit(struct commit *commit, void *_data)
> +{
> +	struct bitmap_commit_cb *data = _data;
> +	if (oid_pos(&commit->object.oid, data->ctx->entries,
> +		    data->ctx->entries_nr,
> +		    bitmap_oid_access) > -1) {

This "> -1" struck me as a little bit funny. Perhaps ">= 0" would be a
more obvious way of saying "we found it"?

> +	/*
> +	 * Skipping promisor objects here is intentional, since it only excludes
> +	 * them from the list of reachable commits that we want to select from
> +	 * when computing the selection of MIDX'd commits to receive bitmaps.
> +	 *
> +	 * Reachability bitmaps do require that their objects be closed under
> +	 * reachability, but fetching any objects missing from promisors at this
> +	 * point is too late. But, if one of those objects can be reached from
> +	 * an another object that is included in the bitmap, then we will
> +	 * complain later that we don't have reachability closure (and fail
> +	 * appropriately).
> +	 */
> +	fetch_if_missing = 0;
> +	revs.exclude_promisor_objects = 1;

Makes sense.

> +	/*
> +	 * Pass selected commits in topo order to match the behavior of
> +	 * pack-bitmaps when configured with delta islands.
> +	 */
> +	revs.topo_order = 1;
> +	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;

Hmm. Why do we want to match this side effect of delta islands here?

The only impact this has is on the order of commits we feed for bitmap
selection (and during the actual generation phase, it may impact
visitation order).

Now I'm of the opinion that topo order is probably the best thing for
bitmap generation (since the bitmaps themselves are connected to the
graph structure). But if it is the best thing, shouldn't we perhaps be
turning on topo-order for single-pack bitmaps, too?

And if it isn't the best thing, then why would we want it here?

> +	if (prepare_revision_walk(&revs))
> +		die(_("revision walk setup failed"));

We call init_revisions(), and then go straight to
prepare_revision_walk() with no call to setup_revisions() between. It
doesn't seem to be clearly documented, but I think you're supposed to,
as it finalizes some bits like diff_setup_done().

I suspect it works OK in practice, and I did find a few other spots that
do not call it (e.g., builtin/am.c:write_commit_patch). But most spots
do at least an empty setup_revisions(0, NULL, &rev, NULL).

> +	/*
> +	 * Build the MIDX-order index based on pdata.objects (which is already
> +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> +	 * this order).
> +	 */
> +	ALLOC_ARRAY(index, pdata.nr_objects);
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];

This cast is correct because the pack_idx_entry is at the start of each
object_entry. But maybe:

  index[i] = &pdata.objects[i].idx;

would be less scary looking?

> +	/*
> +	 * bitmap_writer_finish expects objects in lex order, but pack_order
> +	 * gives us exactly that. use it directly instead of re-sorting the
> +	 * array.
> +	 *
> +	 * This changes the order of objects in 'index' between
> +	 * bitmap_writer_build_type_index and bitmap_writer_finish.
> +	 *
> +	 * The same re-ordering takes place in the single-pack bitmap code via
> +	 * write_idx_file(), which is called by finish_tmp_packfile(), which
> +	 * happens between bitmap_writer_build_type_index() and
> +	 * bitmap_writer_finish().
> +	 */
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];

Ditto here.

> +	bitmap_writer_select_commits(commits, commits_nr, -1);

Not related to your patch, but I had to refresh my memory on what this
"-1" was for. It's "max_bitmaps", and is ignored if it's negative. But
the only callers pass "-1"! So we could get rid of it entirely.

It probably makes sense to leave that cleanup out of this
already-complicated series. But maybe worth doing later on top.

> @@ -930,9 +1100,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
>  
> +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +				error(_("could not load pack %s"),
> +				      ctx.m->pack_names[i]);
> +				result = 1;
> +				goto cleanup;
> +			}

It might be worth a comment here. I can easily believe that there is
some later part of the bitmap generation code that assumes the packs are
loaded. But somebody reading this is not likely to understand why it's
here.

Should this be done conditionally only if we're writing a bitmap? (That
might also make it obvious why we are doing it).

> @@ -947,8 +1124,26 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
>  	stop_progress(&ctx.progress);
>  
> -	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
> -		goto cleanup;
> +	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) {
> +		struct bitmap_index *bitmap_git;
> +		int bitmap_exists;
> +		int want_bitmap = flags & MIDX_WRITE_BITMAP;
> +
> +		bitmap_git = prepare_bitmap_git(the_repository);
> +		bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git);
> +		free_bitmap_index(bitmap_git);
> +
> +		if (bitmap_exists || !want_bitmap) {
> +			/*
> +			 * The correct MIDX already exists, and so does a
> +			 * corresponding bitmap (or one wasn't requested).
> +			 */
> +			if (!want_bitmap)
> +				clear_midx_files_ext(the_repository, ".bitmap",
> +						     NULL);
> +			goto cleanup;
> +		}
> +	}

So this makes "git multi-pack-index write --write-bitmap" actually write
a bitmap, even if the midx itself didn't need updating? Sounds good.
Likewise, we'll delete a bitmap if one exists but we were not requested
to write one. Makes sense.

I do think nice-to-have bits like this could have come in a separate
patch with their own explanation and tests. It may not be worth trying
to extract it at this point, though.

> @@ -1075,9 +1271,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
>  	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
>  
> -	if (ctx.m)
> -		close_midx(ctx.m);
> -
>  	if (ctx.nr - dropped_packs == 0) {
>  		error(_("no pack files to index."));
>  		result = 1;

I'm not sure what this hunk is doing. We do pick up the close_midx()
call at the end of the function, amidst the other cleanup.

I expect the answer is something like "we need it open when we generate
the bitmaps". But it makes me wonder if we could hit any cases where we
try to overwrite it while it's still open, which would cause problems on
Windows.

-Peff
diff mbox series

Patch

diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt
index ffd601bc17..ada14deb2c 100644
--- a/Documentation/git-multi-pack-index.txt
+++ b/Documentation/git-multi-pack-index.txt
@@ -10,7 +10,7 @@  SYNOPSIS
 --------
 [verse]
 'git multi-pack-index' [--object-dir=<dir>] [--[no-]progress]
-	[--preferred-pack=<pack>] <subcommand>
+	[--preferred-pack=<pack>] [--[no-]bitmap] <subcommand>
 
 DESCRIPTION
 -----------
@@ -40,6 +40,9 @@  write::
 		multiple packs contain the same object. If not given,
 		ties are broken in favor of the pack with the lowest
 		mtime.
+
+	--[no-]bitmap::
+		Control whether or not a multi-pack bitmap is written.
 --
 
 verify::
@@ -81,6 +84,13 @@  EXAMPLES
 $ git multi-pack-index write
 -----------------------------------------------
 
+* Write a MIDX file for the packfiles in the current .git folder with a
+corresponding bitmap.
++
+-------------------------------------------------------------
+$ git multi-pack-index write --preferred-pack <pack> --bitmap
+-------------------------------------------------------------
+
 * Write a MIDX file for the packfiles in an alternate object store.
 +
 -----------------------------------------------
diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 5d3ea445fd..bf6fa982e3 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -68,6 +68,8 @@  static int cmd_multi_pack_index_write(int argc, const char **argv)
 		OPT_STRING(0, "preferred-pack", &opts.preferred_pack,
 			   N_("preferred-pack"),
 			   N_("pack for reuse when computing a multi-pack bitmap")),
+		OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"),
+			MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX),
 		OPT_END(),
 	};
 
diff --git a/midx.c b/midx.c
index 752d36c57f..a58cca707b 100644
--- a/midx.c
+++ b/midx.c
@@ -13,6 +13,10 @@ 
 #include "repository.h"
 #include "chunk-format.h"
 #include "pack.h"
+#include "pack-bitmap.h"
+#include "refs.h"
+#include "revision.h"
+#include "list-objects.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
@@ -885,6 +889,172 @@  static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
 static void clear_midx_files_ext(struct repository *r, const char *ext,
 				 unsigned char *keep_hash);
 
+static void prepare_midx_packing_data(struct packing_data *pdata,
+				      struct write_midx_context *ctx)
+{
+	uint32_t i;
+
+	memset(pdata, 0, sizeof(struct packing_data));
+	prepare_packing_data(the_repository, pdata);
+
+	for (i = 0; i < ctx->entries_nr; i++) {
+		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
+		struct object_entry *to = packlist_alloc(pdata, &from->oid);
+
+		oe_set_in_pack(pdata, to,
+			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
+	}
+}
+
+static int add_ref_to_pending(const char *refname,
+			      const struct object_id *oid,
+			      int flag, void *cb_data)
+{
+	struct rev_info *revs = (struct rev_info*)cb_data;
+	struct object *object;
+
+	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
+		warning("symbolic ref is dangling: %s", refname);
+		return 0;
+	}
+
+	object = parse_object_or_die(oid, refname);
+	if (object->type != OBJ_COMMIT)
+		return 0;
+
+	add_pending_object(revs, object, "");
+	if (bitmap_is_preferred_refname(revs->repo, refname))
+		object->flags |= NEEDS_BITMAP;
+	return 0;
+}
+
+struct bitmap_commit_cb {
+	struct commit **commits;
+	size_t commits_nr, commits_alloc;
+
+	struct write_midx_context *ctx;
+};
+
+static const struct object_id *bitmap_oid_access(size_t index,
+						 const void *_entries)
+{
+	const struct pack_midx_entry *entries = _entries;
+	return &entries[index].oid;
+}
+
+static void bitmap_show_commit(struct commit *commit, void *_data)
+{
+	struct bitmap_commit_cb *data = _data;
+	if (oid_pos(&commit->object.oid, data->ctx->entries,
+		    data->ctx->entries_nr,
+		    bitmap_oid_access) > -1) {
+		ALLOC_GROW(data->commits, data->commits_nr + 1,
+			   data->commits_alloc);
+		data->commits[data->commits_nr++] = commit;
+	}
+}
+
+static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
+						    struct write_midx_context *ctx)
+{
+	struct rev_info revs;
+	struct bitmap_commit_cb cb;
+
+	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
+	cb.ctx = ctx;
+
+	repo_init_revisions(the_repository, &revs, NULL);
+	for_each_ref(add_ref_to_pending, &revs);
+
+	/*
+	 * Skipping promisor objects here is intentional, since it only excludes
+	 * them from the list of reachable commits that we want to select from
+	 * when computing the selection of MIDX'd commits to receive bitmaps.
+	 *
+	 * Reachability bitmaps do require that their objects be closed under
+	 * reachability, but fetching any objects missing from promisors at this
+	 * point is too late. But, if one of those objects can be reached from
+	 * an another object that is included in the bitmap, then we will
+	 * complain later that we don't have reachability closure (and fail
+	 * appropriately).
+	 */
+	fetch_if_missing = 0;
+	revs.exclude_promisor_objects = 1;
+
+	/*
+	 * Pass selected commits in topo order to match the behavior of
+	 * pack-bitmaps when configured with delta islands.
+	 */
+	revs.topo_order = 1;
+	revs.sort_order = REV_SORT_IN_GRAPH_ORDER;
+
+	if (prepare_revision_walk(&revs))
+		die(_("revision walk setup failed"));
+
+	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
+	if (indexed_commits_nr_p)
+		*indexed_commits_nr_p = cb.commits_nr;
+
+	return cb.commits;
+}
+
+static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
+			     struct write_midx_context *ctx,
+			     unsigned flags)
+{
+	struct packing_data pdata;
+	struct pack_idx_entry **index;
+	struct commit **commits = NULL;
+	uint32_t i, commits_nr;
+	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
+	int ret;
+
+	prepare_midx_packing_data(&pdata, ctx);
+
+	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
+
+	/*
+	 * Build the MIDX-order index based on pdata.objects (which is already
+	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
+	 * this order).
+	 */
+	ALLOC_ARRAY(index, pdata.nr_objects);
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
+	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
+
+	/*
+	 * bitmap_writer_finish expects objects in lex order, but pack_order
+	 * gives us exactly that. use it directly instead of re-sorting the
+	 * array.
+	 *
+	 * This changes the order of objects in 'index' between
+	 * bitmap_writer_build_type_index and bitmap_writer_finish.
+	 *
+	 * The same re-ordering takes place in the single-pack bitmap code via
+	 * write_idx_file(), which is called by finish_tmp_packfile(), which
+	 * happens between bitmap_writer_build_type_index() and
+	 * bitmap_writer_finish().
+	 */
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_select_commits(commits, commits_nr, -1);
+	ret = bitmap_writer_build(&pdata);
+	if (ret < 0)
+		goto cleanup;
+
+	bitmap_writer_set_checksum(midx_hash);
+	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
+
+cleanup:
+	free(index);
+	free(bitmap_name);
+	return ret;
+}
+
 static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
 			       struct string_list *packs_to_drop,
 			       const char *preferred_pack_name,
@@ -930,9 +1100,16 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		for (i = 0; i < ctx.m->num_packs; i++) {
 			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
 
+			if (prepare_midx_pack(the_repository, ctx.m, i)) {
+				error(_("could not load pack %s"),
+				      ctx.m->pack_names[i]);
+				result = 1;
+				goto cleanup;
+			}
+
 			ctx.info[ctx.nr].orig_pack_int_id = i;
 			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
-			ctx.info[ctx.nr].p = NULL;
+			ctx.info[ctx.nr].p = ctx.m->packs[i];
 			ctx.info[ctx.nr].expired = 0;
 			ctx.nr++;
 		}
@@ -947,8 +1124,26 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
 	stop_progress(&ctx.progress);
 
-	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
-		goto cleanup;
+	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) {
+		struct bitmap_index *bitmap_git;
+		int bitmap_exists;
+		int want_bitmap = flags & MIDX_WRITE_BITMAP;
+
+		bitmap_git = prepare_bitmap_git(the_repository);
+		bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git);
+		free_bitmap_index(bitmap_git);
+
+		if (bitmap_exists || !want_bitmap) {
+			/*
+			 * The correct MIDX already exists, and so does a
+			 * corresponding bitmap (or one wasn't requested).
+			 */
+			if (!want_bitmap)
+				clear_midx_files_ext(the_repository, ".bitmap",
+						     NULL);
+			goto cleanup;
+		}
+	}
 
 	if (preferred_pack_name) {
 		int found = 0;
@@ -964,7 +1159,8 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (!found)
 			warning(_("unknown preferred pack: '%s'"),
 				preferred_pack_name);
-	} else if (ctx.nr && (flags & MIDX_WRITE_REV_INDEX)) {
+	} else if (ctx.nr &&
+		   (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))) {
 		time_t oldest = ctx.info[0].p->mtime;
 		ctx.preferred_pack_idx = 0;
 
@@ -1075,9 +1271,6 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
 	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
 
-	if (ctx.m)
-		close_midx(ctx.m);
-
 	if (ctx.nr - dropped_packs == 0) {
 		error(_("no pack files to index."));
 		result = 1;
@@ -1108,14 +1301,22 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 	free_chunkfile(cf);
 
-	if (flags & MIDX_WRITE_REV_INDEX)
+	if (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))
 		ctx.pack_order = midx_pack_order(&ctx);
 
 	if (flags & MIDX_WRITE_REV_INDEX)
 		write_midx_reverse_index(midx_name, midx_hash, &ctx);
+	if (flags & MIDX_WRITE_BITMAP) {
+		if (write_midx_bitmap(midx_name, midx_hash, &ctx, flags) < 0) {
+			error(_("could not write multi-pack bitmap"));
+			result = 1;
+			goto cleanup;
+		}
+	}
 
 	commit_lock_file(&lk);
 
+	clear_midx_files_ext(the_repository, ".bitmap", midx_hash);
 	clear_midx_files_ext(the_repository, ".rev", midx_hash);
 
 cleanup:
@@ -1123,6 +1324,15 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (ctx.info[i].p) {
 			close_pack(ctx.info[i].p);
 			free(ctx.info[i].p);
+			if (ctx.m) {
+				/*
+				 * Destroy a stale reference to the pack in
+				 * 'ctx.m'.
+				 */
+				uint32_t orig = ctx.info[i].orig_pack_int_id;
+				if (orig < ctx.m->num_packs)
+					ctx.m->packs[orig] = NULL;
+			}
 		}
 		free(ctx.info[i].pack_name);
 	}
@@ -1132,6 +1342,9 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	free(ctx.pack_perm);
 	free(ctx.pack_order);
 	free(midx_name);
+	if (ctx.m)
+		close_midx(ctx.m);
+
 	return result;
 }
 
@@ -1193,6 +1406,7 @@  void clear_midx_file(struct repository *r)
 	if (remove_path(midx))
 		die(_("failed to clear multi-pack-index at %s"), midx);
 
+	clear_midx_files_ext(r, ".bitmap", NULL);
 	clear_midx_files_ext(r, ".rev", NULL);
 
 	free(midx);
diff --git a/midx.h b/midx.h
index 1172df1a71..350f4d0a7b 100644
--- a/midx.h
+++ b/midx.h
@@ -41,6 +41,7 @@  struct multi_pack_index {
 
 #define MIDX_PROGRESS     (1 << 0)
 #define MIDX_WRITE_REV_INDEX (1 << 1)
+#define MIDX_WRITE_BITMAP (1 << 2)
 
 const unsigned char *get_midx_checksum(struct multi_pack_index *m);
 char *get_midx_filename(const char *object_dir);