diff mbox series

[13/22] pack-bitmap: write multi-pack bitmaps

Message ID fd320c5ed48c7431b64b898f49101b0f53655a95.1617991824.git.me@ttaylorr.com (mailing list archive)
State New
Headers show
Series multi-pack reachability bitmaps | expand

Commit Message

Taylor Blau April 9, 2021, 6:11 p.m. UTC
Write multi-pack bitmaps in the format described by
Documentation/technical/bitmap-format.txt, inferring their presence with
the absence of '--bitmap'.

To write a multi-pack bitmap, this patch attempts to reuse as much of
the existing machinery from pack-objects as possible. Specifically, the
MIDX code prepares a packing_data struct that pretends as if a single
packfile has been generated containing all of the objects contained
within the MIDX.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-multi-pack-index.txt |  12 +-
 builtin/multi-pack-index.c             |   2 +
 midx.c                                 | 195 ++++++++++++++++++++++++-
 midx.h                                 |   1 +
 4 files changed, 202 insertions(+), 8 deletions(-)

Comments

Jonathan Tan May 4, 2021, 5:02 a.m. UTC | #1
> Write multi-pack bitmaps in the format described by
> Documentation/technical/bitmap-format.txt, inferring their presence with
> the absence of '--bitmap'.
> 
> To write a multi-pack bitmap, this patch attempts to reuse as much of
> the existing machinery from pack-objects as possible. Specifically, the
> MIDX code prepares a packing_data struct that pretends as if a single
> packfile has been generated containing all of the objects contained
> within the MIDX.

Sounds good, and makes sense. Conceptually, the MIDX bitmap is the same
as a regular packfile bitmap, just that the order of objects in the
bitmap is defined differently.

> +static void prepare_midx_packing_data(struct packing_data *pdata,
> +				      struct write_midx_context *ctx)
> +{
> +	uint32_t i;
> +
> +	memset(pdata, 0, sizeof(struct packing_data));
> +	prepare_packing_data(the_repository, pdata);
> +
> +	for (i = 0; i < ctx->entries_nr; i++) {
> +		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
> +		struct object_entry *to = packlist_alloc(pdata, &from->oid);
> +
> +		oe_set_in_pack(pdata, to,
> +			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
> +	}
> +}

It is surprising to see this right at the top. Scrolling down, I guess
that there is more information needed than just the packing_data struct.

> +static int add_ref_to_pending(const char *refname,
> +			      const struct object_id *oid,
> +			      int flag, void *cb_data)
> +{
> +	struct rev_info *revs = (struct rev_info*)cb_data;
> +	struct object *object;
> +
> +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
> +		warning("symbolic ref is dangling: %s", refname);
> +		return 0;
> +	}
> +
> +	object = parse_object_or_die(oid, refname);
> +	if (object->type != OBJ_COMMIT)
> +		return 0;
> +
> +	add_pending_object(revs, object, "");
> +	if (bitmap_is_preferred_refname(revs->repo, refname))
> +		object->flags |= NEEDS_BITMAP;
> +	return 0;
> +}

Makes sense. We need to flag certain commits as NEEDS_BITMAP because
bitmaps are not made for all commits but only certain ones.

> +struct bitmap_commit_cb {
> +	struct commit **commits;
> +	size_t commits_nr, commits_alloc;
> +
> +	struct write_midx_context *ctx;
> +};
> +
> +static const struct object_id *bitmap_oid_access(size_t index,
> +						 const void *_entries)
> +{
> +	const struct pack_midx_entry *entries = _entries;
> +	return &entries[index].oid;
> +}
> +
> +static void bitmap_show_commit(struct commit *commit, void *_data)
> +{
> +	struct bitmap_commit_cb *data = _data;
> +	if (oid_pos(&commit->object.oid, data->ctx->entries,
> +		    data->ctx->entries_nr,
> +		    bitmap_oid_access) > -1) {
> +		ALLOC_GROW(data->commits, data->commits_nr + 1,
> +			   data->commits_alloc);
> +		data->commits[data->commits_nr++] = commit;
> +	}
> +}
> +
> +static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
> +						    struct write_midx_context *ctx)
> +{
> +	struct rev_info revs;
> +	struct bitmap_commit_cb cb;
> +
> +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
> +	cb.ctx = ctx;
> +
> +	repo_init_revisions(the_repository, &revs, NULL);
> +	for_each_ref(add_ref_to_pending, &revs);
> +
> +	fetch_if_missing = 0;
> +	revs.exclude_promisor_objects = 1;

I think that the MIDX bitmap requires all objects be present? If yes, we
should omit these 2 lines.

> +
> +	if (prepare_revision_walk(&revs))
> +		die(_("revision walk setup failed"));
> +
> +	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
> +	if (indexed_commits_nr_p)
> +		*indexed_commits_nr_p = cb.commits_nr;
> +
> +	return cb.commits;
> +}

Hmm...I might be missing something obvious, but this function and its
callbacks seem to be written like this in order to put the returned
commits in a certain order. But later on in write_midx_bitmap(), the
return value of this function is passed to
bitmap_writer_select_commits(), which re-sorts the list anyway?

> +static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
> +			     struct write_midx_context *ctx,
> +			     unsigned flags)
> +{
> +	struct packing_data pdata;
> +	struct pack_idx_entry **index;
> +	struct commit **commits = NULL;
> +	uint32_t i, commits_nr;
> +	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
> +	int ret;
> +
> +	prepare_midx_packing_data(&pdata, ctx);
> +
> +	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
> +
> +	/*
> +	 * Build the MIDX-order index based on pdata.objects (which is already
> +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> +	 * this order).
> +	 */
> +	ALLOC_ARRAY(index, pdata.nr_objects);
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
> +	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
> +
> +	/*
> +	 * bitmap_writer_select_commits expects objects in lex order, but
> +	 * pack_order gives us exactly that. use it directly instead of
> +	 * re-sorting the array
> +	 */
> +	for (i = 0; i < pdata.nr_objects; i++)
> +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
> +
> +	bitmap_writer_select_commits(commits, commits_nr, -1);

The comment above says bitmap_writer_select_commits() expects objects in
lex order, but (1) you're putting "index" in lex order, not "commits",
and (2) the first thing in bitmap_writer_select_commits() is a QSORT.
Did you mean another function?

> +	ret = bitmap_writer_build(&pdata);
> +	if (!ret)
> +		goto cleanup;
> +
> +	bitmap_writer_set_checksum(midx_hash);
> +	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);

So bitmap_writer_build_type_index() and bitmap_writer_finish() are
called with 2 different orders of commits. Is this expected? If yes,
maybe this is worth a comment.

> +
> +cleanup:
> +	free(index);
> +	free(bitmap_name);
> +	return ret;
> +}

[snip]

> @@ -930,9 +1073,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		for (i = 0; i < ctx.m->num_packs; i++) {
>  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
>  
> +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> +				error(_("could not load pack %s"),
> +				      ctx.m->pack_names[i]);
> +				result = 1;
> +				goto cleanup;
> +			}
> +
>  			ctx.info[ctx.nr].orig_pack_int_id = i;
>  			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
> -			ctx.info[ctx.nr].p = NULL;
> +			ctx.info[ctx.nr].p = ctx.m->packs[i];
>  			ctx.info[ctx.nr].expired = 0;
>  			ctx.nr++;
>  		}

Why is this needed now and not before? From what I see in this function,
nothing seems to happen to this .p pack except that they are closed
later.

> @@ -1096,6 +1264,15 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  		if (ctx.info[i].p) {
>  			close_pack(ctx.info[i].p);
>  			free(ctx.info[i].p);
> +			if (ctx.m) {
> +				/*
> +				 * Destroy a stale reference to the pack in
> +				 * 'ctx.m'.
> +				 */
> +				uint32_t orig = ctx.info[i].orig_pack_int_id;
> +				if (orig < ctx.m->num_packs)
> +					ctx.m->packs[orig] = NULL;
> +			}
>  		}
>  		free(ctx.info[i].pack_name);
>  	}

Is this hunk needed? "ctx" is a local variable and will not outlast this
function.

I'll review the rest tomorrow. It seems like I've gotten over the most
difficult patches.
Taylor Blau May 6, 2021, 8:18 p.m. UTC | #2
On Mon, May 03, 2021 at 10:02:30PM -0700, Jonathan Tan wrote:
> > +static void prepare_midx_packing_data(struct packing_data *pdata,
> > +				      struct write_midx_context *ctx)
> > +{
> > +	uint32_t i;
> > +
> > +	memset(pdata, 0, sizeof(struct packing_data));
> > +	prepare_packing_data(the_repository, pdata);
> > +
> > +	for (i = 0; i < ctx->entries_nr; i++) {
> > +		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
> > +		struct object_entry *to = packlist_alloc(pdata, &from->oid);
> > +
> > +		oe_set_in_pack(pdata, to,
> > +			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
> > +	}
> > +}
>
> It is surprising to see this right at the top. Scrolling down, I guess
> that there is more information needed than just the packing_data struct.

Hmm, which part is surprising to you? This function is setting up the
packing_data structure that I mentioned in the commit message, which
happens in two steps. First, we allocate and call
prepare_packing_data(). And then we call packlist_alloc() for each
object in the MIDX, setting up some information about each object
(like its OID and which physical pack it came from).

But if any of this is unclear, let me know which part and I'd be happy
to add a clarifying comment.

> > +static int add_ref_to_pending(const char *refname,
> > +			      const struct object_id *oid,
> > +			      int flag, void *cb_data)
> > +{
> > +	struct rev_info *revs = (struct rev_info*)cb_data;
> > +	struct object *object;
> > +
> > +	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
> > +		warning("symbolic ref is dangling: %s", refname);
> > +		return 0;
> > +	}
> > +
> > +	object = parse_object_or_die(oid, refname);
> > +	if (object->type != OBJ_COMMIT)
> > +		return 0;
> > +
> > +	add_pending_object(revs, object, "");
> > +	if (bitmap_is_preferred_refname(revs->repo, refname))
> > +		object->flags |= NEEDS_BITMAP;
> > +	return 0;
> > +}
>
> Makes sense. We need to flag certain commits as NEEDS_BITMAP because
> bitmaps are not made for all commits but only certain ones.

Right, and the NEEDS_BITMAP is a bit of a misnomer. Its true meaning is
more like BITMAPPING_THIS_WOULD_BE_A_GOOD_IDEA, since it roughly
translates to "bitmap this commit before any others in its window". More
details are in bitmap_writer_select_commits(), but in all honesty I find
the implementation there somewhat confusing.

> > +static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
> > +						    struct write_midx_context *ctx)
> > +{
> > +	struct rev_info revs;
> > +	struct bitmap_commit_cb cb;
> > +
> > +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
> > +	cb.ctx = ctx;
> > +
> > +	repo_init_revisions(the_repository, &revs, NULL);
> > +	for_each_ref(add_ref_to_pending, &revs);
> > +
> > +	fetch_if_missing = 0;
> > +	revs.exclude_promisor_objects = 1;
>
> I think that the MIDX bitmap requires all objects be present? If yes, we
> should omit these 2 lines.

It does require that all objects are present, but if we fetched any
promisor objects at this stage it would be too late. That's because by
the time we're in this function, all of the packs that are to be
included in the MIDX should already exist on disk.

Skipping promisor objects here is intentional, since it only excludes
them from the list of reachable commits that we want to select from when
computing the selection of MIDX'd commits to receive bitmaps.

But, if one of those promisor objects is reachable from another object
that is included in the bitmap, then we will complain later on that we
couldn't find a reachability closure (and fail appropriately).

That said, I'm not sure any of that is obvious from reading this code,
so I'll add a comment to that effect around these lines.

> > +
> > +	if (prepare_revision_walk(&revs))
> > +		die(_("revision walk setup failed"));
> > +
> > +	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
> > +	if (indexed_commits_nr_p)
> > +		*indexed_commits_nr_p = cb.commits_nr;
> > +
> > +	return cb.commits;
> > +}
>
> Hmm...I might be missing something obvious, but this function and its
> callbacks seem to be written like this in order to put the returned
> commits in a certain order. But later on in write_midx_bitmap(), the
> return value of this function is passed to
> bitmap_writer_select_commits(), which re-sorts the list anyway?

It isn't intentional, but rather just to build up the list in topo
order. In fact, the order we build it up in isn't quite the same as how
the pack bitmap code generates it (it is in true topo order, at least on
GitHub's servers, as a side effect of using delta islands).

The fact that we re-sort according to date_compare makes me wonder why
changing that seemed to make such a difference for us. The whole
selection code is a mystery to me.

But no, the order shouldn't matter since we QSORT it later. Any code
here that looks like it's putting it in a certain order has much more to
do with convenience than anything else.

>
> > +static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
> > +			     struct write_midx_context *ctx,
> > +			     unsigned flags)
> > +{
> > +	struct packing_data pdata;
> > +	struct pack_idx_entry **index;
> > +	struct commit **commits = NULL;
> > +	uint32_t i, commits_nr;
> > +	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
> > +	int ret;
> > +
> > +	prepare_midx_packing_data(&pdata, ctx);
> > +
> > +	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
> > +
> > +	/*
> > +	 * Build the MIDX-order index based on pdata.objects (which is already
> > +	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
> > +	 * this order).
> > +	 */
> > +	ALLOC_ARRAY(index, pdata.nr_objects);
> > +	for (i = 0; i < pdata.nr_objects; i++)
> > +		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
> > +
> > +	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
> > +	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
> > +
> > +	/*
> > +	 * bitmap_writer_select_commits expects objects in lex order, but
> > +	 * pack_order gives us exactly that. use it directly instead of
> > +	 * re-sorting the array
> > +	 */
> > +	for (i = 0; i < pdata.nr_objects; i++)
> > +		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
> > +
> > +	bitmap_writer_select_commits(commits, commits_nr, -1);
>
> The comment above says bitmap_writer_select_commits() expects objects in
> lex order, but (1) you're putting "index" in lex order, not "commits",
> and (2) the first thing in bitmap_writer_select_commits() is a QSORT.
> Did you mean another function?

Ack, I definitely meant bitmap_writer_build(). Thanks for catching.

> > +	ret = bitmap_writer_build(&pdata);
> > +	if (!ret)
> > +		goto cleanup;
> > +
> > +	bitmap_writer_set_checksum(midx_hash);
> > +	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
>
> So bitmap_writer_build_type_index() and bitmap_writer_finish() are
> called with 2 different orders of commits. Is this expected? If yes,
> maybe this is worth a comment.

Confusingly so, but yes, these two do expect different orders. You can
see the same re-sorting going on much more subtly in
pack-write.c:write_idx_file(), which is called by
builtin/pack-objects.c:finish_tmp_packfile(), which happens between
bitmap_writer_build_type_index() and bitmap_writer_finish().

Definitely worth adding a comment.

> > @@ -930,9 +1073,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  		for (i = 0; i < ctx.m->num_packs; i++) {
> >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> >
> > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > +				error(_("could not load pack %s"),
> > +				      ctx.m->pack_names[i]);
> > +				result = 1;
> > +				goto cleanup;
> > +			}
> > +
> >  			ctx.info[ctx.nr].orig_pack_int_id = i;
> >  			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
> > -			ctx.info[ctx.nr].p = NULL;
> > +			ctx.info[ctx.nr].p = ctx.m->packs[i];
> >  			ctx.info[ctx.nr].expired = 0;
> >  			ctx.nr++;
> >  		}
>
> Why is this needed now and not before? From what I see in this function,
> nothing seems to happen to this .p pack except that they are closed
> later.

These are used by prepare_midx_packing_data().

> > @@ -1096,6 +1264,15 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> >  		if (ctx.info[i].p) {
> >  			close_pack(ctx.info[i].p);
> >  			free(ctx.info[i].p);
> > +			if (ctx.m) {
> > +				/*
> > +				 * Destroy a stale reference to the pack in
> > +				 * 'ctx.m'.
> > +				 */
> > +				uint32_t orig = ctx.info[i].orig_pack_int_id;
> > +				if (orig < ctx.m->num_packs)
> > +					ctx.m->packs[orig] = NULL;
> > +			}
> >  		}
> >  		free(ctx.info[i].pack_name);
> >  	}
>
> Is this hunk needed? "ctx" is a local variable and will not outlast this
> function.

I can't remember exactly why I added this. I'll play around with it and
either remove it or add a comment why it's necessary before the next
reroll.

> I'll review the rest tomorrow. It seems like I've gotten over the most
> difficult patches.

Thanks, and sorry that this took me a few days to get back to. I
appreciate your review immensely.

Thanks,
Taylor
Jonathan Tan May 6, 2021, 10 p.m. UTC | #3
> On Mon, May 03, 2021 at 10:02:30PM -0700, Jonathan Tan wrote:
> > > +static void prepare_midx_packing_data(struct packing_data *pdata,
> > > +				      struct write_midx_context *ctx)
> > > +{
> > > +	uint32_t i;
> > > +
> > > +	memset(pdata, 0, sizeof(struct packing_data));
> > > +	prepare_packing_data(the_repository, pdata);
> > > +
> > > +	for (i = 0; i < ctx->entries_nr; i++) {
> > > +		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
> > > +		struct object_entry *to = packlist_alloc(pdata, &from->oid);
> > > +
> > > +		oe_set_in_pack(pdata, to,
> > > +			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
> > > +	}
> > > +}
> >
> > It is surprising to see this right at the top. Scrolling down, I guess
> > that there is more information needed than just the packing_data struct.
> 
> Hmm, which part is surprising to you? This function is setting up the
> packing_data structure that I mentioned in the commit message, which
> happens in two steps. First, we allocate and call
> prepare_packing_data(). And then we call packlist_alloc() for each
> object in the MIDX, setting up some information about each object
> (like its OID and which physical pack it came from).
> 
> But if any of this is unclear, let me know which part and I'd be happy
> to add a clarifying comment.

Ah, I think I was unclear. I was thinking that the commit message led me
to believe that all information needed for creating a bitmap lies in the
packing_data struct, so I would have expected several helper functions
followed by a function that actually writes the packing_data struct.
Maybe the commit message could be rewritten to avoid that confusion, but
it's probably not a big deal.

> > > +static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
> > > +						    struct write_midx_context *ctx)
> > > +{
> > > +	struct rev_info revs;
> > > +	struct bitmap_commit_cb cb;
> > > +
> > > +	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
> > > +	cb.ctx = ctx;
> > > +
> > > +	repo_init_revisions(the_repository, &revs, NULL);
> > > +	for_each_ref(add_ref_to_pending, &revs);
> > > +
> > > +	fetch_if_missing = 0;
> > > +	revs.exclude_promisor_objects = 1;
> >
> > I think that the MIDX bitmap requires all objects be present? If yes, we
> > should omit these 2 lines.
> 
> It does require that all objects are present, but if we fetched any
> promisor objects at this stage it would be too late. That's because by
> the time we're in this function, all of the packs that are to be
> included in the MIDX should already exist on disk.
> 
> Skipping promisor objects here is intentional, since it only excludes
> them from the list of reachable commits that we want to select from when
> computing the selection of MIDX'd commits to receive bitmaps.
> 
> But, if one of those promisor objects is reachable from another object
> that is included in the bitmap, then we will complain later on that we
> couldn't find a reachability closure (and fail appropriately).
> 
> That said, I'm not sure any of that is obvious from reading this code,
> so I'll add a comment to that effect around these lines.

So you're saying that if we have missing promisor commits as in the
following graph:

   A
  / \
 B   C
 |   |
 .   .
 .   .
 .   .

where B is missing but promised, and only C is NEEDS_BITMAP, then the
MIDX bitmap write will still work? (So the rev walk is intended to walk
through A and C but not B, and because we are only building bitmaps for
C and potentially its ancestors, we only need the objects in C's
transitive closure.) Even if this is true, "exclude_promisor_objects" is
the wrong option here, because it will exclude all commits that came
from a promisor remote (regardless of whether they are present locally).
(That's how "promisor object" is defined in partial-clone.txt.) What we
need would be an option that permits missing links.

And even if we go with that option that permits missing links, it still
remains that we have very little support for missing promisor commits in
Git right now.

It might be better to just assume that MIDX will only be used for full
clones. If you want, you can add a NEEDSWORK explaining the above case.

> > > +
> > > +	if (prepare_revision_walk(&revs))
> > > +		die(_("revision walk setup failed"));
> > > +
> > > +	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
> > > +	if (indexed_commits_nr_p)
> > > +		*indexed_commits_nr_p = cb.commits_nr;
> > > +
> > > +	return cb.commits;
> > > +}
> >
> > Hmm...I might be missing something obvious, but this function and its
> > callbacks seem to be written like this in order to put the returned
> > commits in a certain order. But later on in write_midx_bitmap(), the
> > return value of this function is passed to
> > bitmap_writer_select_commits(), which re-sorts the list anyway?
> 
> It isn't intentional, but rather just to build up the list in topo
> order. In fact, the order we build it up in isn't quite the same as how
> the pack bitmap code generates it (it is in true topo order, at least on
> GitHub's servers, as a side effect of using delta islands).
> 
> The fact that we re-sort according to date_compare makes me wonder why
> changing that seemed to make such a difference for us. The whole
> selection code is a mystery to me.
> 
> But no, the order shouldn't matter since we QSORT it later. Any code
> here that looks like it's putting it in a certain order has much more to
> do with convenience than anything else.

If the order doesn't matter, why don't you just copy one-by-one from
data->ctx->entries into data->commits? (Unless data->ctx->entries has
extra commits?)

> > > +	ret = bitmap_writer_build(&pdata);
> > > +	if (!ret)
> > > +		goto cleanup;
> > > +
> > > +	bitmap_writer_set_checksum(midx_hash);
> > > +	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
> >
> > So bitmap_writer_build_type_index() and bitmap_writer_finish() are
> > called with 2 different orders of commits. Is this expected? If yes,
> > maybe this is worth a comment.
> 
> Confusingly so, but yes, these two do expect different orders. You can
> see the same re-sorting going on much more subtly in
> pack-write.c:write_idx_file(), which is called by
> builtin/pack-objects.c:finish_tmp_packfile(), which happens between
> bitmap_writer_build_type_index() and bitmap_writer_finish().
> 
> Definitely worth adding a comment.

Ah, I see. Thanks for your explanation.

> > > @@ -930,9 +1073,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> > >  		for (i = 0; i < ctx.m->num_packs; i++) {
> > >  			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
> > >
> > > +			if (prepare_midx_pack(the_repository, ctx.m, i)) {
> > > +				error(_("could not load pack %s"),
> > > +				      ctx.m->pack_names[i]);
> > > +				result = 1;
> > > +				goto cleanup;
> > > +			}
> > > +
> > >  			ctx.info[ctx.nr].orig_pack_int_id = i;
> > >  			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
> > > -			ctx.info[ctx.nr].p = NULL;
> > > +			ctx.info[ctx.nr].p = ctx.m->packs[i];
> > >  			ctx.info[ctx.nr].expired = 0;
> > >  			ctx.nr++;
> > >  		}
> >
> > Why is this needed now and not before? From what I see in this function,
> > nothing seems to happen to this .p pack except that they are closed
> > later.
> 
> These are used by prepare_midx_packing_data().

Ah, thanks.

> > > @@ -1096,6 +1264,15 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
> > >  		if (ctx.info[i].p) {
> > >  			close_pack(ctx.info[i].p);
> > >  			free(ctx.info[i].p);
> > > +			if (ctx.m) {
> > > +				/*
> > > +				 * Destroy a stale reference to the pack in
> > > +				 * 'ctx.m'.
> > > +				 */
> > > +				uint32_t orig = ctx.info[i].orig_pack_int_id;
> > > +				if (orig < ctx.m->num_packs)
> > > +					ctx.m->packs[orig] = NULL;
> > > +			}
> > >  		}
> > >  		free(ctx.info[i].pack_name);
> > >  	}
> >
> > Is this hunk needed? "ctx" is a local variable and will not outlast this
> > function.
> 
> I can't remember exactly why I added this. I'll play around with it and
> either remove it or add a comment why it's necessary before the next
> reroll.

OK.

> 
> > I'll review the rest tomorrow. It seems like I've gotten over the most
> > difficult patches.
> 
> Thanks, and sorry that this took me a few days to get back to. I
> appreciate your review immensely.

No worries, and thanks for these patches.
diff mbox series

Patch

diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt
index ffd601bc17..ada14deb2c 100644
--- a/Documentation/git-multi-pack-index.txt
+++ b/Documentation/git-multi-pack-index.txt
@@ -10,7 +10,7 @@  SYNOPSIS
 --------
 [verse]
 'git multi-pack-index' [--object-dir=<dir>] [--[no-]progress]
-	[--preferred-pack=<pack>] <subcommand>
+	[--preferred-pack=<pack>] [--[no-]bitmap] <subcommand>
 
 DESCRIPTION
 -----------
@@ -40,6 +40,9 @@  write::
 		multiple packs contain the same object. If not given,
 		ties are broken in favor of the pack with the lowest
 		mtime.
+
+	--[no-]bitmap::
+		Control whether or not a multi-pack bitmap is written.
 --
 
 verify::
@@ -81,6 +84,13 @@  EXAMPLES
 $ git multi-pack-index write
 -----------------------------------------------
 
+* Write a MIDX file for the packfiles in the current .git folder with a
+corresponding bitmap.
++
+-------------------------------------------------------------
+$ git multi-pack-index write --preferred-pack <pack> --bitmap
+-------------------------------------------------------------
+
 * Write a MIDX file for the packfiles in an alternate object store.
 +
 -----------------------------------------------
diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 5d3ea445fd..bf6fa982e3 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -68,6 +68,8 @@  static int cmd_multi_pack_index_write(int argc, const char **argv)
 		OPT_STRING(0, "preferred-pack", &opts.preferred_pack,
 			   N_("preferred-pack"),
 			   N_("pack for reuse when computing a multi-pack bitmap")),
+		OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"),
+			MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX),
 		OPT_END(),
 	};
 
diff --git a/midx.c b/midx.c
index 567cdf0fcf..32d7d184c0 100644
--- a/midx.c
+++ b/midx.c
@@ -13,6 +13,10 @@ 
 #include "repository.h"
 #include "chunk-format.h"
 #include "pack.h"
+#include "pack-bitmap.h"
+#include "refs.h"
+#include "revision.h"
+#include "list-objects.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
@@ -885,6 +889,145 @@  static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
 static void clear_midx_files_ext(struct repository *r, const char *ext,
 				 unsigned char *keep_hash);
 
+static void prepare_midx_packing_data(struct packing_data *pdata,
+				      struct write_midx_context *ctx)
+{
+	uint32_t i;
+
+	memset(pdata, 0, sizeof(struct packing_data));
+	prepare_packing_data(the_repository, pdata);
+
+	for (i = 0; i < ctx->entries_nr; i++) {
+		struct pack_midx_entry *from = &ctx->entries[ctx->pack_order[i]];
+		struct object_entry *to = packlist_alloc(pdata, &from->oid);
+
+		oe_set_in_pack(pdata, to,
+			       ctx->info[ctx->pack_perm[from->pack_int_id]].p);
+	}
+}
+
+static int add_ref_to_pending(const char *refname,
+			      const struct object_id *oid,
+			      int flag, void *cb_data)
+{
+	struct rev_info *revs = (struct rev_info*)cb_data;
+	struct object *object;
+
+	if ((flag & REF_ISSYMREF) && (flag & REF_ISBROKEN)) {
+		warning("symbolic ref is dangling: %s", refname);
+		return 0;
+	}
+
+	object = parse_object_or_die(oid, refname);
+	if (object->type != OBJ_COMMIT)
+		return 0;
+
+	add_pending_object(revs, object, "");
+	if (bitmap_is_preferred_refname(revs->repo, refname))
+		object->flags |= NEEDS_BITMAP;
+	return 0;
+}
+
+struct bitmap_commit_cb {
+	struct commit **commits;
+	size_t commits_nr, commits_alloc;
+
+	struct write_midx_context *ctx;
+};
+
+static const struct object_id *bitmap_oid_access(size_t index,
+						 const void *_entries)
+{
+	const struct pack_midx_entry *entries = _entries;
+	return &entries[index].oid;
+}
+
+static void bitmap_show_commit(struct commit *commit, void *_data)
+{
+	struct bitmap_commit_cb *data = _data;
+	if (oid_pos(&commit->object.oid, data->ctx->entries,
+		    data->ctx->entries_nr,
+		    bitmap_oid_access) > -1) {
+		ALLOC_GROW(data->commits, data->commits_nr + 1,
+			   data->commits_alloc);
+		data->commits[data->commits_nr++] = commit;
+	}
+}
+
+static struct commit **find_commits_for_midx_bitmap(uint32_t *indexed_commits_nr_p,
+						    struct write_midx_context *ctx)
+{
+	struct rev_info revs;
+	struct bitmap_commit_cb cb;
+
+	memset(&cb, 0, sizeof(struct bitmap_commit_cb));
+	cb.ctx = ctx;
+
+	repo_init_revisions(the_repository, &revs, NULL);
+	for_each_ref(add_ref_to_pending, &revs);
+
+	fetch_if_missing = 0;
+	revs.exclude_promisor_objects = 1;
+
+	if (prepare_revision_walk(&revs))
+		die(_("revision walk setup failed"));
+
+	traverse_commit_list(&revs, bitmap_show_commit, NULL, &cb);
+	if (indexed_commits_nr_p)
+		*indexed_commits_nr_p = cb.commits_nr;
+
+	return cb.commits;
+}
+
+static int write_midx_bitmap(char *midx_name, unsigned char *midx_hash,
+			     struct write_midx_context *ctx,
+			     unsigned flags)
+{
+	struct packing_data pdata;
+	struct pack_idx_entry **index;
+	struct commit **commits = NULL;
+	uint32_t i, commits_nr;
+	char *bitmap_name = xstrfmt("%s-%s.bitmap", midx_name, hash_to_hex(midx_hash));
+	int ret;
+
+	prepare_midx_packing_data(&pdata, ctx);
+
+	commits = find_commits_for_midx_bitmap(&commits_nr, ctx);
+
+	/*
+	 * Build the MIDX-order index based on pdata.objects (which is already
+	 * in MIDX order; c.f., 'midx_pack_order_cmp()' for the definition of
+	 * this order).
+	 */
+	ALLOC_ARRAY(index, pdata.nr_objects);
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[i] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_show_progress(flags & MIDX_PROGRESS);
+	bitmap_writer_build_type_index(&pdata, index, pdata.nr_objects);
+
+	/*
+	 * bitmap_writer_select_commits expects objects in lex order, but
+	 * pack_order gives us exactly that. use it directly instead of
+	 * re-sorting the array
+	 */
+	for (i = 0; i < pdata.nr_objects; i++)
+		index[ctx->pack_order[i]] = (struct pack_idx_entry *)&pdata.objects[i];
+
+	bitmap_writer_select_commits(commits, commits_nr, -1);
+	ret = bitmap_writer_build(&pdata);
+	if (!ret)
+		goto cleanup;
+
+	bitmap_writer_set_checksum(midx_hash);
+	bitmap_writer_finish(index, pdata.nr_objects, bitmap_name, 0);
+
+cleanup:
+	free(index);
+	free(bitmap_name);
+	return ret;
+}
+
 static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
 			       struct string_list *packs_to_drop,
 			       const char *preferred_pack_name,
@@ -930,9 +1073,16 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		for (i = 0; i < ctx.m->num_packs; i++) {
 			ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
 
+			if (prepare_midx_pack(the_repository, ctx.m, i)) {
+				error(_("could not load pack %s"),
+				      ctx.m->pack_names[i]);
+				result = 1;
+				goto cleanup;
+			}
+
 			ctx.info[ctx.nr].orig_pack_int_id = i;
 			ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
-			ctx.info[ctx.nr].p = NULL;
+			ctx.info[ctx.nr].p = ctx.m->packs[i];
 			ctx.info[ctx.nr].expired = 0;
 			ctx.nr++;
 		}
@@ -947,8 +1097,26 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
 	stop_progress(&ctx.progress);
 
-	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
-		goto cleanup;
+	if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) {
+		struct bitmap_index *bitmap_git;
+		int bitmap_exists;
+		int want_bitmap = flags & MIDX_WRITE_BITMAP;
+
+		bitmap_git = prepare_bitmap_git(the_repository);
+		bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git);
+		free_bitmap_index(bitmap_git);
+
+		if (bitmap_exists || !want_bitmap) {
+			/*
+			 * The correct MIDX already exists, and so does a
+			 * corresponding bitmap (or one wasn't requested).
+			 */
+			if (!want_bitmap)
+				clear_midx_files_ext(the_repository, ".bitmap",
+						     NULL);
+			goto cleanup;
+		}
+	}
 
 	ctx.preferred_pack_idx = -1;
 	if (preferred_pack_name) {
@@ -1048,9 +1216,6 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
 	f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
 
-	if (ctx.m)
-		close_midx(ctx.m);
-
 	if (ctx.nr - dropped_packs == 0) {
 		error(_("no pack files to index."));
 		result = 1;
@@ -1081,14 +1246,17 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 	free_chunkfile(cf);
 
-	if (flags & MIDX_WRITE_REV_INDEX)
+	if (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))
 		ctx.pack_order = midx_pack_order(&ctx);
 
 	if (flags & MIDX_WRITE_REV_INDEX)
 		write_midx_reverse_index(midx_name, midx_hash, &ctx);
+	if (flags & MIDX_WRITE_BITMAP)
+		write_midx_bitmap(midx_name, midx_hash, &ctx, flags);
 
 	commit_lock_file(&lk);
 
+	clear_midx_files_ext(the_repository, ".bitmap", midx_hash);
 	clear_midx_files_ext(the_repository, ".rev", midx_hash);
 
 cleanup:
@@ -1096,6 +1264,15 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 		if (ctx.info[i].p) {
 			close_pack(ctx.info[i].p);
 			free(ctx.info[i].p);
+			if (ctx.m) {
+				/*
+				 * Destroy a stale reference to the pack in
+				 * 'ctx.m'.
+				 */
+				uint32_t orig = ctx.info[i].orig_pack_int_id;
+				if (orig < ctx.m->num_packs)
+					ctx.m->packs[orig] = NULL;
+			}
 		}
 		free(ctx.info[i].pack_name);
 	}
@@ -1105,6 +1282,9 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	free(ctx.pack_perm);
 	free(ctx.pack_order);
 	free(midx_name);
+	if (ctx.m)
+		close_midx(ctx.m);
+
 	return result;
 }
 
@@ -1166,6 +1346,7 @@  void clear_midx_file(struct repository *r)
 	if (remove_path(midx))
 		die(_("failed to clear multi-pack-index at %s"), midx);
 
+	clear_midx_files_ext(r, ".bitmap", NULL);
 	clear_midx_files_ext(r, ".rev", NULL);
 
 	free(midx);
diff --git a/midx.h b/midx.h
index 1172df1a71..350f4d0a7b 100644
--- a/midx.h
+++ b/midx.h
@@ -41,6 +41,7 @@  struct multi_pack_index {
 
 #define MIDX_PROGRESS     (1 << 0)
 #define MIDX_WRITE_REV_INDEX (1 << 1)
+#define MIDX_WRITE_BITMAP (1 << 2)
 
 const unsigned char *get_midx_checksum(struct multi_pack_index *m);
 char *get_midx_filename(const char *object_dir);