diff mbox series

[v3,8/9] midx: implement midx_repack()

Message ID a4c2d5a8e16f888f9a3d1090fa92d986919f0162.1547047269.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Create 'expire' and 'repack' verbs for git-multi-pack-index | expand

Commit Message

Linus Arver via GitGitGadget Jan. 9, 2019, 3:21 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

To repack using a multi-pack-index, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, adding the packs to a list if they are smaller than the
given pack-size. Finally, collect the objects from the multi-pack-
index that are in those packs and send them to 'git pack-objects'.

While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the size of the objects
instead of the size of the pack-files. This allows repacking a
large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. This object-size idea could be a direction for
future expansion in this area.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 midx.c                      | 109 +++++++++++++++++++++++++++++++++++-
 t/t5319-multi-pack-index.sh |  25 +++++++++
 2 files changed, 133 insertions(+), 1 deletion(-)

Comments

Jonathan Tan Jan. 23, 2019, 10:33 p.m. UTC | #1
> From: Derrick Stolee <dstolee@microsoft.com>
> 
> To repack using a multi-pack-index, first sort all pack-files by
> their modified time. Second, walk those pack-files from oldest
> to newest, adding the packs to a list if they are smaller than the
> given pack-size. Finally, collect the objects from the multi-pack-
> index that are in those packs and send them to 'git pack-objects'.

Also mention that we stop once the total is at least the batch size.

> +test_expect_success 'repack creates a new pack' '
> +	(
> +		cd dup &&
> +		SECOND_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 2 | tail -n 1) &&
> +		BATCH_SIZE=$(($SECOND_SMALLEST_SIZE + 1)) &&
> +		git multi-pack-index repack --batch-size=$BATCH_SIZE &&
> +		ls .git/objects/pack/*idx >idx-list &&
> +		test_line_count = 5 idx-list &&
> +		test-tool read-midx .git/objects | grep idx >midx-list &&
> +		test_line_count = 5 midx-list
> +	)
> +'

Can there be a test_line_count of the output of ls at the beginning, so
that we do not have to look at the previous test to indeed see that the
5 is greater than before?

Also add a test to check what happens when we have 3 packs below the
batch size, but taking 2 of them together is sufficient to exceed the
batch size.

> +test_expect_success 'expire removes repacked packs' '
> +	(
> +		cd dup &&
> +		ls -S .git/objects/pack/*pack | head -n 3 >expect &&
> +		git multi-pack-index expire &&
> +		ls -S .git/objects/pack/*pack >actual &&
> +		test_cmp expect actual &&
> +		test-tool read-midx .git/objects | grep idx >midx-list &&
> +		test_line_count = 3 midx-list
> +	)
> +'

Same comment as above.
diff mbox series

Patch

diff --git a/midx.c b/midx.c
index 30ff4430ab..f0f16690e6 100644
--- a/midx.c
+++ b/midx.c
@@ -8,6 +8,7 @@ 
 #include "sha1-lookup.h"
 #include "midx.h"
 #include "progress.h"
+#include "run-command.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
@@ -1107,7 +1108,113 @@  int expire_midx_packs(const char *object_dir)
 	return result;
 }
 
-int midx_repack(const char *object_dir, size_t batch_size)
+struct time_and_id {
+	timestamp_t mtime;
+	uint32_t pack_int_id;
+};
+
+static int compare_by_mtime(const void *a_, const void *b_)
 {
+	const struct time_and_id *a, *b;
+
+	a = (const struct time_and_id *)a_;
+	b = (const struct time_and_id *)b_;
+
+	if (a->mtime < b->mtime)
+		return -1;
+	if (a->mtime > b->mtime)
+		return 1;
 	return 0;
 }
+
+int midx_repack(const char *object_dir, size_t batch_size)
+{
+	int result = 0;
+	uint32_t i, packs_to_repack;
+	size_t total_size;
+	struct time_and_id *pack_ti;
+	unsigned char *include_pack;
+	struct child_process cmd = CHILD_PROCESS_INIT;
+	struct strbuf base_name = STRBUF_INIT;
+	struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
+
+	if (!m)
+		return 0;
+
+	include_pack = xcalloc(m->num_packs, sizeof(unsigned char));
+	pack_ti = xcalloc(m->num_packs, sizeof(struct time_and_id));
+
+	for (i = 0; i < m->num_packs; i++) {
+		pack_ti[i].pack_int_id = i;
+
+		if (prepare_midx_pack(m, i))
+			continue;
+
+		pack_ti[i].mtime = m->packs[i]->mtime;
+	}
+	QSORT(pack_ti, m->num_packs, compare_by_mtime);
+
+	total_size = 0;
+	packs_to_repack = 0;
+	for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
+		int pack_int_id = pack_ti[i].pack_int_id;
+		struct packed_git *p = m->packs[pack_int_id];
+
+		if (!p)
+			continue;
+		if (p->pack_size >= batch_size)
+			continue;
+
+		packs_to_repack++;
+		total_size += p->pack_size;
+		include_pack[pack_int_id] = 1;
+	}
+
+	if (total_size < batch_size || packs_to_repack < 2)
+		goto cleanup;
+
+	argv_array_push(&cmd.args, "pack-objects");
+
+	strbuf_addstr(&base_name, object_dir);
+	strbuf_addstr(&base_name, "/pack/pack");
+	argv_array_push(&cmd.args, base_name.buf);
+	strbuf_release(&base_name);
+
+	cmd.git_cmd = 1;
+	cmd.in = cmd.out = -1;
+
+	if (start_command(&cmd)) {
+		error(_("could not start pack-objects"));
+		result = 1;
+		goto cleanup;
+	}
+
+	for (i = 0; i < m->num_objects; i++) {
+		struct object_id oid;
+		uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
+
+		if (!include_pack[pack_int_id])
+			continue;
+
+		nth_midxed_object_oid(&oid, m, i);
+		xwrite(cmd.in, oid_to_hex(&oid), the_hash_algo->hexsz);
+		xwrite(cmd.in, "\n", 1);
+	}
+	close(cmd.in);
+
+	if (finish_command(&cmd)) {
+		error(_("could not finish pack-objects"));
+		result = 1;
+		goto cleanup;
+	}
+
+	result = write_midx_internal(object_dir, m, NULL);
+	m = NULL;
+
+cleanup:
+	if (m)
+		close_midx(m);
+	free(include_pack);
+	free(pack_ti);
+	return result;
+}
diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index d64767600a..99afb5ec51 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -424,4 +424,29 @@  test_expect_success 'repack with minimum size does not alter existing packs' '
 	)
 '
 
+test_expect_success 'repack creates a new pack' '
+	(
+		cd dup &&
+		SECOND_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 2 | tail -n 1) &&
+		BATCH_SIZE=$(($SECOND_SMALLEST_SIZE + 1)) &&
+		git multi-pack-index repack --batch-size=$BATCH_SIZE &&
+		ls .git/objects/pack/*idx >idx-list &&
+		test_line_count = 5 idx-list &&
+		test-tool read-midx .git/objects | grep idx >midx-list &&
+		test_line_count = 5 midx-list
+	)
+'
+
+test_expect_success 'expire removes repacked packs' '
+	(
+		cd dup &&
+		ls -S .git/objects/pack/*pack | head -n 3 >expect &&
+		git multi-pack-index expire &&
+		ls -S .git/objects/pack/*pack >actual &&
+		test_cmp expect actual &&
+		test-tool read-midx .git/objects | grep idx >midx-list &&
+		test_line_count = 3 midx-list
+	)
+'
+
 test_done