diff mbox series

[07/10] packfile: add kept-pack cache for find_kept_pack_entry()

Message ID 182664e1a9c107fd830f9eb02bfa25fe9679e9a7.1611098616.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series repack: support repacking into a geometric sequence | expand

Commit Message

Taylor Blau Jan. 19, 2021, 11:24 p.m. UTC
From: Jeff King <peff@peff.net>

In a recent patch we added a function 'find_kept_pack_entry()' to look
for an object only among kept packs.

While this function avoids doing any lookup work in non-kept packs, it
is still linear in the number of packs, since we have to traverse the
linked list of packs once per object. Let's cache a reduced version of
that list to save us time.

Note that this cache will last the lifetime of the program. We could
invalidate it on reprepare_packed_git(), but there's not much point in
being rigorous here:

  - we might already fail to notice new .keep packs showing up after the
    program starts. We only reprepare_packed_git() when we fail to find
    an object. But adding a new pack won't cause that to happen.
    Somebody repacking could add a new pack and delete an old one, but
    most of the time we'd have a descriptor or mmap open to the old
    pack anyway, so we might not even notice.

  - in pack-objects we already cache the .keep state at startup, since
    56dfeb6263 (pack-objects: compute local/ignore_pack_keep early,
    2016-07-29). So this is just extending that concept further.

  - we don't have to worry about any packed_git being removed; we always
    keep the old structs around, even after reprepare_packed_git()

Here are p5303 results (as always, measured against the kernel):

  Test                               HEAD^                  HEAD
  ------------------------------------------------------------------------------------
  5303.5: repack (1)                 56.87(54.63+10.48)     56.63(54.41+10.36) -0.4%
  5303.6: repack with keep (1)       1.26(1.19+0.06)        1.25(1.19+0.05) -0.8%
  5303.10: repack (50)               89.35(132.42+6.25)     89.49(132.31+6.31) +0.2%
  5303.11: repack with keep (50)     6.73(26.61+0.59)       6.72(26.70+0.53) -0.1%
  5303.15: repack (1000)             217.25(494.38+15.24)   218.69(495.62+14.99) +0.7%
  5303.16: repack with keep (1000)   133.12(311.80+8.44)    128.79(306.96+8.55) -3.3%

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c |   4 +-
 object-store.h         |  10 ++++
 packfile.c             | 103 +++++++++++++++++++++++------------------
 packfile.h             |   4 --
 revision.c             |   8 ++--
 5 files changed, 75 insertions(+), 54 deletions(-)
diff mbox series

Patch

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index c84642df98..f2c7a1e35b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1215,9 +1215,9 @@  static int want_found_object(const struct object_id *oid, int exclude,
 		 */
 		unsigned flags = 0;
 		if (ignore_packed_keep_on_disk)
-			flags |= ON_DISK_KEEP_PACKS;
+			flags |= CACHE_ON_DISK_KEEP_PACKS;
 		if (ignore_packed_keep_in_core)
-			flags |= IN_CORE_KEEP_PACKS;
+			flags |= CACHE_IN_CORE_KEEP_PACKS;
 
 		if (ignore_packed_keep_on_disk && p->pack_keep)
 			return 0;
diff --git a/object-store.h b/object-store.h
index c4fc9dd74e..4cbe8eae3c 100644
--- a/object-store.h
+++ b/object-store.h
@@ -105,6 +105,14 @@  static inline int pack_map_entry_cmp(const void *unused_cmp_data,
 	return strcmp(pg1->pack_name, key ? key : pg2->pack_name);
 }
 
+#define CACHE_ON_DISK_KEEP_PACKS 1
+#define CACHE_IN_CORE_KEEP_PACKS 2
+
+struct kept_pack_cache {
+	struct packed_git **packs;
+	unsigned flags;
+};
+
 struct raw_object_store {
 	/*
 	 * Set of all object directories; the main directory is first (and
@@ -150,6 +158,8 @@  struct raw_object_store {
 	/* A most-recently-used ordered version of the packed_git list. */
 	struct list_head packed_git_mru;
 
+	struct kept_pack_cache *kept_pack_cache;
+
 	/*
 	 * A map of packfiles to packed_git structs for tracking which
 	 * packs have been loaded already.
diff --git a/packfile.c b/packfile.c
index 30f43a1a35..25f5407ed0 100644
--- a/packfile.c
+++ b/packfile.c
@@ -2015,10 +2015,7 @@  static int fill_pack_entry(const struct object_id *oid,
 	return 1;
 }
 
-static int find_one_pack_entry(struct repository *r,
-			       const struct object_id *oid,
-			       struct pack_entry *e,
-			       int kept_only)
+int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e)
 {
 	struct list_head *pos;
 	struct multi_pack_index *m;
@@ -2028,49 +2025,64 @@  static int find_one_pack_entry(struct repository *r,
 		return 0;
 
 	for (m = r->objects->multi_pack_index; m; m = m->next) {
-		if (!(fill_midx_entry(r, oid, e, m)))
-			continue;
-
-		if (!kept_only)
-			return 1;
-
-		if (((kept_only & ON_DISK_KEEP_PACKS) && e->p->pack_keep) ||
-		    ((kept_only & IN_CORE_KEEP_PACKS) && e->p->pack_keep_in_core))
+		if (fill_midx_entry(r, oid, e, m))
 			return 1;
 	}
 
 	list_for_each(pos, &r->objects->packed_git_mru) {
 		struct packed_git *p = list_entry(pos, struct packed_git, mru);
-		if (p->multi_pack_index && !kept_only) {
-			/*
-			 * If this pack is covered by the MIDX, we'd have found
-			 * the object already in the loop above if it was here,
-			 * so don't bother looking.
-			 *
-			 * The exception is if we are looking only at kept
-			 * packs. An object can be present in two packs covered
-			 * by the MIDX, one kept and one not-kept. And as the
-			 * MIDX points to only one copy of each object, it might
-			 * have returned only the non-kept version above. We
-			 * have to check again to be thorough.
-			 */
-			continue;
-		}
-		if (!kept_only ||
-		    (((kept_only & ON_DISK_KEEP_PACKS) && p->pack_keep) ||
-		     ((kept_only & IN_CORE_KEEP_PACKS) && p->pack_keep_in_core))) {
-			if (fill_pack_entry(oid, e, p)) {
-				list_move(&p->mru, &r->objects->packed_git_mru);
-				return 1;
-			}
+		if (!p->multi_pack_index && fill_pack_entry(oid, e, p)) {
+			list_move(&p->mru, &r->objects->packed_git_mru);
+			return 1;
 		}
 	}
 	return 0;
 }
 
-int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e)
+static void maybe_invalidate_kept_pack_cache(struct repository *r,
+					     unsigned flags)
 {
-	return find_one_pack_entry(r, oid, e, 0);
+	if (!r->objects->kept_pack_cache)
+		return;
+	if (r->objects->kept_pack_cache->flags == flags)
+		return;
+	free(r->objects->kept_pack_cache->packs);
+	FREE_AND_NULL(r->objects->kept_pack_cache);
+}
+
+static struct packed_git **kept_pack_cache(struct repository *r, unsigned flags)
+{
+	maybe_invalidate_kept_pack_cache(r, flags);
+
+	if (!r->objects->kept_pack_cache) {
+		struct packed_git **packs = NULL;
+		size_t nr = 0, alloc = 0;
+		struct packed_git *p;
+
+		/*
+		 * We want "all" packs here, because we need to cover ones that
+		 * are used by a midx, as well. We need to look in every one of
+		 * them (instead of the midx itself) to cover duplicates. It's
+		 * possible that an object is found in two packs that the midx
+		 * covers, one kept and one not kept, but the midx returns only
+		 * the non-kept version.
+		 */
+		for (p = get_all_packs(r); p; p = p->next) {
+			if ((p->pack_keep && (flags & CACHE_ON_DISK_KEEP_PACKS)) ||
+			    (p->pack_keep_in_core && (flags & CACHE_IN_CORE_KEEP_PACKS))) {
+				ALLOC_GROW(packs, nr + 1, alloc);
+				packs[nr++] = p;
+			}
+		}
+		ALLOC_GROW(packs, nr + 1, alloc);
+		packs[nr] = NULL;
+
+		r->objects->kept_pack_cache = xmalloc(sizeof(*r->objects->kept_pack_cache));
+		r->objects->kept_pack_cache->packs = packs;
+		r->objects->kept_pack_cache->flags = flags;
+	}
+
+	return r->objects->kept_pack_cache->packs;
 }
 
 int find_kept_pack_entry(struct repository *r,
@@ -2078,13 +2090,15 @@  int find_kept_pack_entry(struct repository *r,
 			 unsigned flags,
 			 struct pack_entry *e)
 {
-	/*
-	 * Load all packs, including midx packs, since our "kept" strategy
-	 * relies on that. We're relying on the side effect of it setting up
-	 * r->objects->packed_git, which is a little ugly.
-	 */
-	get_all_packs(r);
-	return find_one_pack_entry(r, oid, e, flags);
+	struct packed_git **cache;
+
+	for (cache = kept_pack_cache(r, flags); *cache; cache++) {
+		struct packed_git *p = *cache;
+		if (fill_pack_entry(oid, e, p))
+			return 1;
+	}
+
+	return 0;
 }
 
 int has_object_pack(const struct object_id *oid)
@@ -2093,7 +2107,8 @@  int has_object_pack(const struct object_id *oid)
 	return find_pack_entry(the_repository, oid, &e);
 }
 
-int has_object_kept_pack(const struct object_id *oid, unsigned flags)
+int has_object_kept_pack(const struct object_id *oid,
+			 unsigned flags)
 {
 	struct pack_entry e;
 	return find_kept_pack_entry(the_repository, oid, flags, &e);
diff --git a/packfile.h b/packfile.h
index 624327f64d..eb56db2a7b 100644
--- a/packfile.h
+++ b/packfile.h
@@ -161,10 +161,6 @@  int packed_object_info(struct repository *r,
 void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1);
 const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);
 
-#define ON_DISK_KEEP_PACKS 1
-#define IN_CORE_KEEP_PACKS 2
-#define ALL_KEEP_PACKS (ON_DISK_KEEP_PACKS | IN_CORE_KEEP_PACKS)
-
 /*
  * Iff a pack file in the given repository contains the object named by sha1,
  * return true and store its location to e.
diff --git a/revision.c b/revision.c
index ff1ea77224..ce87081b8e 100644
--- a/revision.c
+++ b/revision.c
@@ -2336,14 +2336,14 @@  static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		die(_("--unpacked=<packfile> no longer supported"));
 	} else if (!strcmp(arg, "--no-kept-objects")) {
 		revs->no_kept_objects = 1;
-		revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
-		revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS;
+		revs->keep_pack_cache_flags |= CACHE_IN_CORE_KEEP_PACKS;
+		revs->keep_pack_cache_flags |= CACHE_ON_DISK_KEEP_PACKS;
 	} else if (skip_prefix(arg, "--no-kept-objects=", &optarg)) {
 		revs->no_kept_objects = 1;
 		if (!strcmp(optarg, "in-core"))
-			revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
+			revs->keep_pack_cache_flags |= CACHE_IN_CORE_KEEP_PACKS;
 		if (!strcmp(optarg, "on-disk"))
-			revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS;
+			revs->keep_pack_cache_flags |= CACHE_ON_DISK_KEEP_PACKS;
 	} else if (!strcmp(arg, "-r")) {
 		revs->diff = 1;
 		revs->diffopt.flags.recursive = 1;