diff mbox series

[v3,15/16] pack-revindex: write multi-pack reverse indexes

Message ID fa3acb5d5af3aab46dd2a9703e4da03928625346.1615482270.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series midx: implement a multi-pack reverse index | expand

Commit Message

Taylor Blau March 11, 2021, 5:05 p.m. UTC
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.

Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.

Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.

(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 midx.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 midx.h |   1 +
 2 files changed, 116 insertions(+)

Comments

Jeff King March 29, 2021, 12:53 p.m. UTC | #1
On Thu, Mar 11, 2021 at 12:05:38PM -0500, Taylor Blau wrote:

> Implement the writing half of multi-pack reverse indexes. This is
> nothing more than the format describe a few patches ago, with a new set
> of helper functions that will be used to clear out stale .rev files
> corresponding to old MIDXs.

Looks good.

> +struct clear_midx_data {
> +	char *keep;
> +	const char *ext;
> +};
> +
> +static void clear_midx_file_ext(const char *full_path, size_t full_path_len,
> +				const char *file_name, void *_data)

This will clean up _any_ stale midx .rev file. So even if we miss one
when writing a new midx (due to a bug, race, power loss, etc), we'll
catch it later.

We _might_ want to also teach various tempfile-cleanup code run by gc to
likewise look for unattached midx .rev files, but I don't think we
necessarily have to do it now.

>  void clear_midx_file(struct repository *r)
>  {
>  	char *midx = get_midx_filename(r->objects->odb->path);
> @@ -1049,6 +1162,8 @@ void clear_midx_file(struct repository *r)
>  	if (remove_path(midx))
>  		die(_("failed to clear multi-pack-index at %s"), midx);
>  
> +	clear_midx_files_ext(r, ".rev", NULL);
> +
>  	free(midx);

The sole caller now doesn't pass the "keep" hash, so we'd always delete
all of them. I guess we'll see that change once somebody starts actually
writing them.

-Peff
Taylor Blau March 29, 2021, 9:30 p.m. UTC | #2
On Mon, Mar 29, 2021 at 08:53:22AM -0400, Jeff King wrote:
> On Thu, Mar 11, 2021 at 12:05:38PM -0500, Taylor Blau wrote:
>
> > Implement the writing half of multi-pack reverse indexes. This is
> > nothing more than the format describe a few patches ago, with a new set
> > of helper functions that will be used to clear out stale .rev files
> > corresponding to old MIDXs.
>
> Looks good.
>
> > +struct clear_midx_data {
> > +	char *keep;
> > +	const char *ext;
> > +};
> > +
> > +static void clear_midx_file_ext(const char *full_path, size_t full_path_len,
> > +				const char *file_name, void *_data)
>
> This will clean up _any_ stale midx .rev file. So even if we miss one
> when writing a new midx (due to a bug, race, power loss, etc), we'll
> catch it later.
>
> We _might_ want to also teach various tempfile-cleanup code run by gc to
> likewise look for unattached midx .rev files, but I don't think we
> necessarily have to do it now.

Agreed there on both counts.

> >  void clear_midx_file(struct repository *r)
> >  {
> >  	char *midx = get_midx_filename(r->objects->odb->path);
> > @@ -1049,6 +1162,8 @@ void clear_midx_file(struct repository *r)
> >  	if (remove_path(midx))
> >  		die(_("failed to clear multi-pack-index at %s"), midx);
> >
> > +	clear_midx_files_ext(r, ".rev", NULL);
> > +
> >  	free(midx);
>
> The sole caller now doesn't pass the "keep" hash, so we'd always delete
> all of them. I guess we'll see that change once somebody starts actually
> writing them.

That's right. I hope that the benefits of splitting the MIDX bitmaps
topic into two series has generally outweighed the drawbacks, but in
instances like these it can be kind of annoying.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/midx.c b/midx.c
index 55f4567fca..eea9574d92 100644
--- a/midx.c
+++ b/midx.c
@@ -12,6 +12,7 @@ 
 #include "run-command.h"
 #include "repository.h"
 #include "chunk-format.h"
+#include "pack.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
@@ -472,6 +473,7 @@  struct write_midx_context {
 	uint32_t entries_nr;
 
 	uint32_t *pack_perm;
+	uint32_t *pack_order;
 	unsigned large_offsets_needed:1;
 	uint32_t num_large_offsets;
 
@@ -826,6 +828,70 @@  static int write_midx_large_offsets(struct hashfile *f,
 	return 0;
 }
 
+static int midx_pack_order_cmp(const void *va, const void *vb, void *_ctx)
+{
+	struct write_midx_context *ctx = _ctx;
+
+	struct pack_midx_entry *a = &ctx->entries[*(const uint32_t *)va];
+	struct pack_midx_entry *b = &ctx->entries[*(const uint32_t *)vb];
+
+	uint32_t perm_a = ctx->pack_perm[a->pack_int_id];
+	uint32_t perm_b = ctx->pack_perm[b->pack_int_id];
+
+	/* Sort objects in the preferred pack ahead of any others. */
+	if (a->preferred > b->preferred)
+		return -1;
+	if (a->preferred < b->preferred)
+		return 1;
+
+	/* Then, order objects by which packs they appear in. */
+	if (perm_a < perm_b)
+		return -1;
+	if (perm_a > perm_b)
+		return 1;
+
+	/* Then, disambiguate by their offset within each pack. */
+	if (a->offset < b->offset)
+		return -1;
+	if (a->offset > b->offset)
+		return 1;
+
+	return 0;
+}
+
+static uint32_t *midx_pack_order(struct write_midx_context *ctx)
+{
+	uint32_t *pack_order;
+	uint32_t i;
+
+	ALLOC_ARRAY(pack_order, ctx->entries_nr);
+	for (i = 0; i < ctx->entries_nr; i++)
+		pack_order[i] = i;
+	QSORT_S(pack_order, ctx->entries_nr, midx_pack_order_cmp, ctx);
+
+	return pack_order;
+}
+
+static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
+				     struct write_midx_context *ctx)
+{
+	struct strbuf buf = STRBUF_INIT;
+	const char *tmp_file;
+
+	strbuf_addf(&buf, "%s-%s.rev", midx_name, hash_to_hex(midx_hash));
+
+	tmp_file = write_rev_file_order(NULL, ctx->pack_order, ctx->entries_nr,
+					midx_hash, WRITE_REV);
+
+	if (finalize_object_file(tmp_file, buf.buf))
+		die(_("cannot store reverse index file"));
+
+	strbuf_release(&buf);
+}
+
+static void clear_midx_files_ext(struct repository *r, const char *ext,
+				 unsigned char *keep_hash);
+
 static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
 			       struct string_list *packs_to_drop,
 			       const char *preferred_pack_name,
@@ -1011,6 +1077,14 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 
 	finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 	free_chunkfile(cf);
+
+	if (flags & MIDX_WRITE_REV_INDEX)
+		ctx.pack_order = midx_pack_order(&ctx);
+
+	if (flags & MIDX_WRITE_REV_INDEX)
+		write_midx_reverse_index(midx_name, midx_hash, &ctx);
+	clear_midx_files_ext(the_repository, ".rev", midx_hash);
+
 	commit_lock_file(&lk);
 
 cleanup:
@@ -1025,6 +1099,7 @@  static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	free(ctx.info);
 	free(ctx.entries);
 	free(ctx.pack_perm);
+	free(ctx.pack_order);
 	free(midx_name);
 	return result;
 }
@@ -1037,6 +1112,44 @@  int write_midx_file(const char *object_dir,
 				   flags);
 }
 
+struct clear_midx_data {
+	char *keep;
+	const char *ext;
+};
+
+static void clear_midx_file_ext(const char *full_path, size_t full_path_len,
+				const char *file_name, void *_data)
+{
+	struct clear_midx_data *data = _data;
+
+	if (!(starts_with(file_name, "multi-pack-index-") &&
+	      ends_with(file_name, data->ext)))
+		return;
+	if (data->keep && !strcmp(data->keep, file_name))
+		return;
+
+	if (unlink(full_path))
+		die_errno(_("failed to remove %s"), full_path);
+}
+
+static void clear_midx_files_ext(struct repository *r, const char *ext,
+				 unsigned char *keep_hash)
+{
+	struct clear_midx_data data;
+	memset(&data, 0, sizeof(struct clear_midx_data));
+
+	if (keep_hash)
+		data.keep = xstrfmt("multi-pack-index-%s%s",
+				    hash_to_hex(keep_hash), ext);
+	data.ext = ext;
+
+	for_each_file_in_pack_dir(r->objects->odb->path,
+				  clear_midx_file_ext,
+				  &data);
+
+	free(data.keep);
+}
+
 void clear_midx_file(struct repository *r)
 {
 	char *midx = get_midx_filename(r->objects->odb->path);
@@ -1049,6 +1162,8 @@  void clear_midx_file(struct repository *r)
 	if (remove_path(midx))
 		die(_("failed to clear multi-pack-index at %s"), midx);
 
+	clear_midx_files_ext(r, ".rev", NULL);
+
 	free(midx);
 }
 
diff --git a/midx.h b/midx.h
index 0a8294d2ee..8684cf0fef 100644
--- a/midx.h
+++ b/midx.h
@@ -40,6 +40,7 @@  struct multi_pack_index {
 };
 
 #define MIDX_PROGRESS     (1 << 0)
+#define MIDX_WRITE_REV_INDEX (1 << 1)
 
 char *get_midx_rev_filename(struct multi_pack_index *m);