diff mbox series

[23/24] pack-bitmap: reuse objects from all disjoint packs

Message ID 7345e3946743cd098a57c384436a3f44592e5fc6.1701198172.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series pack-objects: multi-pack verbatim reuse | expand

Commit Message

Taylor Blau Nov. 28, 2023, 7:08 p.m. UTC
Now that both the pack-bitmap and pack-objects code are prepared to
handle marking and using objects from multiple disjoint packs for
verbatim reuse, allow marking objects from all disjoint packs as
eligible for reuse.

Within the `reuse_partial_packfile_from_bitmap()` function, we no longer
only mark the pack whose first object is at bit position zero for reuse,
and instead mark any pack which is flagged as disjoint by the MIDX as a
reuse candidate. If no such packs exist (i.e because we are reading a
MIDX written before the "DISP" chunk was introduced), then treat the
preferred pack as disjoint for the purposes of reuse. This is a safe
assumption to make since all duplicate objects are resolved in favor of
the preferred pack.

Provide a handful of test cases in a new script (t5332) exercising
interesting behavior for multi-pack reuse to ensure that we performed
all of the previous steps correctly.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config/pack.txt |   6 +-
 builtin/pack-objects.c        |   6 +-
 pack-bitmap.c                 |  73 +++++++++---
 pack-bitmap.h                 |   3 +-
 t/t5332-multi-pack-reuse.sh   | 219 ++++++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+), 17 deletions(-)
 create mode 100755 t/t5332-multi-pack-reuse.sh
diff mbox series

Patch

diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt
index fe100d0fb7..9fe48d41c9 100644
--- a/Documentation/config/pack.txt
+++ b/Documentation/config/pack.txt
@@ -30,7 +30,11 @@  to linkgit:git-repack[1].
 pack.allowPackReuse::
 	When true or "single", and when reachability bitmaps are enabled,
 	pack-objects will try to send parts of the bitmapped packfile
-	verbatim. This can reduce memory and CPU usage to serve fetches,
+	verbatim. When "multi", and when a multi-pack reachability bitmap is
+	available, pack-objects will try to send parts of all packs marked as
+	disjoint by the MIDX. If only a single pack bitmap is available, and
+	`pack.allowPackReuse` is set to "multi", reuse parts of just the
+	bitmapped packfile. This can reduce memory and CPU usage to serve fetches,
 	but might result in sending a slightly larger pack. Defaults to
 	true.
 
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4853e91251..43b77bff7c 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -233,6 +233,7 @@  static int use_bitmap_index = -1;
 static enum {
 	NO_PACK_REUSE = 0,
 	SINGLE_PACK_REUSE,
+	MULTI_PACK_REUSE,
 } allow_pack_reuse = SINGLE_PACK_REUSE;
 static enum {
 	WRITE_BITMAP_FALSE = 0,
@@ -3253,6 +3254,8 @@  static int git_pack_config(const char *k, const char *v,
 		if (res < 0) {
 			if (!strcasecmp(v, "single"))
 				allow_pack_reuse = SINGLE_PACK_REUSE;
+			else if (!strcasecmp(v, "multi"))
+				allow_pack_reuse = MULTI_PACK_REUSE;
 			else
 				die(_("invalid pack.allowPackReuse value: '%s'"), v);
 		} else if (res) {
@@ -4032,7 +4035,8 @@  static int get_object_list_from_bitmap(struct rev_info *revs)
 		reuse_partial_packfile_from_bitmap(bitmap_git,
 						   &reuse_packfiles,
 						   &reuse_packfiles_nr,
-						   &reuse_packfile_bitmap);
+						   &reuse_packfile_bitmap,
+						   allow_pack_reuse == MULTI_PACK_REUSE);
 
 	if (reuse_packfiles) {
 		reuse_packfile_objects = bitmap_popcount(reuse_packfile_bitmap);
diff --git a/pack-bitmap.c b/pack-bitmap.c
index be53fc6da5..561690c679 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -2061,10 +2061,19 @@  static void reuse_partial_packfile_from_bitmap_1(struct bitmap_index *bitmap_git
 	unuse_pack(&w_curs);
 }
 
+static void make_disjoint_pack(struct bitmapped_pack *out, struct packed_git *p)
+{
+	out->p = p;
+	out->bitmap_pos = 0;
+	out->bitmap_nr = p->num_objects;
+	out->disjoint = 1;
+}
+
 void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 					struct bitmapped_pack **packs_out,
 					size_t *packs_nr_out,
-					struct bitmap **reuse_out)
+					struct bitmap **reuse_out,
+					int multi_pack_reuse)
 {
 	struct repository *r = the_repository;
 	struct bitmapped_pack *packs = NULL;
@@ -2088,24 +2097,62 @@  void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 				free(packs);
 				return;
 			}
-			if (!pack.bitmap_nr)
-				continue; /* no objects from this pack */
-			if (pack.bitmap_pos)
-				continue; /* not preferred pack */
+
+			if (!pack.disjoint)
+				continue;
+
+			if (!multi_pack_reuse && pack.bitmap_pos) {
+				/*
+				 * If we're only reusing a single pack, skip
+				 * over any packs which are not positioned at
+				 * the beginning of the MIDX bitmap.
+				 *
+				 * This is consistent with the existing
+				 * single-pack reuse behavior, which only reuses
+				 * parts of the MIDX's preferred pack.
+				 */
+				continue;
+			}
 
 			ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
 			memcpy(&packs[packs_nr++], &pack, sizeof(pack));
 
 			objects_nr += pack.p->num_objects;
+
+			if (!multi_pack_reuse)
+				break;
+		}
+
+		if (!packs_nr) {
+			/*
+			 * Old MIDXs (i.e. those written before the "DISP" chunk
+			 * existed) will not have any packs marked as disjoint.
+			 *
+			 * But we still want to perform pack reuse with the
+			 * special "preferred pack" as before. To do this, form
+			 * the singleton set containing just the preferred pack,
+			 * which is trivially disjoint with itself.
+			 *
+			 * Moreover, the MIDX is guaranteed to resolve duplicate
+			 * objects in favor of the copy in the preferred pack
+			 * (if one exists). Thus, we can safely perform pack
+			 * reuse on this pack.
+			 */
+			uint32_t preferred_pack_pos;
+			struct packed_git *preferred_pack;
+
+			preferred_pack_pos = midx_preferred_pack(bitmap_git);
+			preferred_pack = bitmap_git->midx->packs[preferred_pack_pos];
+
+			ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
+
+			make_disjoint_pack(&packs[packs_nr], preferred_pack);
+			objects_nr = packs[packs_nr++].p->num_objects;
 		}
 	} else {
 		ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
 
-		packs[packs_nr].p = bitmap_git->pack;
-		packs[packs_nr].bitmap_pos = 0;
-		packs[packs_nr].bitmap_nr = bitmap_git->pack->num_objects;
-		packs[packs_nr].disjoint = 1;
-
+		make_disjoint_pack(&packs[packs_nr], bitmap_git->pack);
 		objects_nr = packs[packs_nr++].p->num_objects;
 	}
 
@@ -2114,10 +2161,8 @@  void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 		word_alloc++;
 	reuse = bitmap_word_alloc(word_alloc);
 
-	if (packs_nr != 1)
-		BUG("pack reuse not yet implemented for multiple packs");
-
-	reuse_partial_packfile_from_bitmap_1(bitmap_git, packs, reuse);
+	for (i = 0; i < packs_nr; i++)
+		reuse_partial_packfile_from_bitmap_1(bitmap_git, &packs[i], reuse);
 
 	if (!bitmap_popcount(reuse)) {
 		free(packs);
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 901a3b86ed..8bb316ce52 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -81,7 +81,8 @@  uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git);
 void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 					struct bitmapped_pack **packs_out,
 					size_t *packs_nr_out,
-					struct bitmap **reuse_out);
+					struct bitmap **reuse_out,
+					int multi_pack_reuse);
 int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
 			     kh_oid_map_t *reused_bitmaps, int show_progress);
 void free_bitmap_index(struct bitmap_index *);
diff --git a/t/t5332-multi-pack-reuse.sh b/t/t5332-multi-pack-reuse.sh
new file mode 100755
index 0000000000..a9bd3870e6
--- /dev/null
+++ b/t/t5332-multi-pack-reuse.sh
@@ -0,0 +1,219 @@ 
+#!/bin/sh
+
+test_description='pack-objects multi-pack reuse'
+
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-bitmap.sh
+. "$TEST_DIRECTORY"/lib-disjoint.sh
+
+objdir=.git/objects
+packdir=$objdir/pack
+
+all_packs () {
+	find $packdir -type f -name "*.idx" | sed -e 's/^.*\/\([^\/]\)/\1/g'
+}
+
+all_disjoint () {
+	all_packs | sed -e 's/^/+/g'
+}
+
+test_pack_reused () {
+	test_trace2_data pack-objects pack-reused "$1"
+}
+
+test_packs_reused () {
+	test_trace2_data pack-objects packs-reused "$1"
+}
+
+
+# pack_position <object> </path/to/pack.idx
+pack_position () {
+	git show-index >objects &&
+	grep "$1" objects | cut -d" " -f1
+}
+
+test_expect_success 'setup' '
+	git config pack.allowPackReuse multi
+'
+
+test_expect_success 'preferred pack is reused without packs marked disjoint' '
+	test_commit A &&
+	test_commit B &&
+
+	A="$(echo A | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+	B="$(echo B | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+
+	git prune-packed &&
+
+	git multi-pack-index write --bitmap &&
+
+	test_must_not_be_disjoint "pack-$A.pack" &&
+	test_must_not_be_disjoint "pack-$B.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'reuse all objects from subset of disjoint packs' '
+	test_commit C &&
+
+	C="$(echo C | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+
+	git prune-packed &&
+
+	cat >in <<-EOF &&
+	pack-$A.idx
+	+pack-$B.idx
+	+pack-$C.idx
+	EOF
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	test_must_not_be_disjoint "pack-$A.pack" &&
+	test_must_be_disjoint "pack-$B.pack" &&
+	test_must_be_disjoint "pack-$C.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 6 <trace2.txt &&
+	test_packs_reused 2 <trace2.txt
+'
+
+test_expect_success 'reuse all objects from all disjoint packs' '
+	rm -fr $packdir/multi-pack-index* &&
+
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	test_must_be_disjoint "pack-$A.pack" &&
+	test_must_be_disjoint "pack-$B.pack" &&
+	test_must_be_disjoint "pack-$C.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 9 <trace2.txt &&
+	test_packs_reused 3 <trace2.txt
+'
+
+test_expect_success 'reuse objects from first disjoint pack with middle gap' '
+	test_commit D &&
+	test_commit E &&
+	test_commit F &&
+
+	# Set "pack.window" to zero to ensure that we do not create any
+	# deltas, which could alter the amount of pack reuse we perform
+	# (if, for e.g., we are not sending one or more bases).
+	D="$(git -c pack.window=0 pack-objects --all --unpacked $packdir/pack)" &&
+
+	d_pos="$(pack_position $(git rev-parse D) <$packdir/pack-$D.idx)" &&
+	e_pos="$(pack_position $(git rev-parse E) <$packdir/pack-$D.idx)" &&
+	f_pos="$(pack_position $(git rev-parse F) <$packdir/pack-$D.idx)" &&
+
+	# commits F, E, and D, should appear in that order at the
+	# beginning of the pack
+	test $f_pos -lt $e_pos &&
+	test $e_pos -lt $d_pos &&
+
+	# Ensure that the pack we are constructing sorts ahead of any
+	# other packs in lexical/bitmap order by choosing it as the
+	# preferred pack.
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --preferred-pack="pack-$D.idx" \
+		--stdin-packs <in &&
+
+	test_must_be_disjoint pack-$A.pack &&
+	test_must_be_disjoint pack-$B.pack &&
+	test_must_be_disjoint pack-$C.pack &&
+	test_must_be_disjoint pack-$D.pack &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse E)
+	^$(git rev-parse D)
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'reuse objects from middle disjoint pack with middle gap' '
+	rm -fr $packdir/multi-pack-index* &&
+
+	# Ensure that the pack we are constructing sort into any
+	# position *but* the first one, by choosing a different pack as
+	# the preferred one.
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --preferred-pack="pack-$A.idx" \
+		--stdin-packs <in &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse E)
+	^$(git rev-parse D)
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'omit delta with uninteresting base' '
+	git repack -adk &&
+
+	test_seq 32 >f &&
+	git add f &&
+	test_tick &&
+	git commit -m "delta" &&
+	delta="$(git rev-parse HEAD)" &&
+
+	test_seq 64 >f &&
+	test_tick &&
+	git commit -a -m "base" &&
+	base="$(git rev-parse HEAD)" &&
+
+	test_commit other &&
+
+	git repack -d &&
+
+	have_delta "$(git rev-parse $delta:f)" "$(git rev-parse $base:f)" &&
+
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse other)
+	^$base
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	# Even though all packs are marked disjoint, we can only reuse
+	# the 3 objects corresponding to "other" from the latest pack.
+	#
+	# This is because even though we want "delta", we do not want
+	# "base", meaning that we have to inflate the delta/base-pair
+	# corresponding to the blob in commit "delta", which bypasses
+	# the pack-reuse mechanism.
+	#
+	# The remaining objects from the other pack are similarly not
+	# reused because their objects are on the uninteresting side of
+	# the query.
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_done