diff mbox series

[v1,02/10] packfile: allow content-limit for cat-file

Message ID 20240715003519.2671385-3-e@80x24.org (mailing list archive)
State New, archived
Headers show
Series cat-file speedups | expand

Commit Message

Eric Wong July 15, 2024, 12:35 a.m. UTC
From: Jeff King <peff@peff.net>

This avoids unnecessary round trips to the object store to speed
up cat-file contents retrievals.  The majority of packed objects
don't benefit from the streaming interface at all and we end up
having to load them in core anyways to satisfy our streaming
API.

This drops the runtime of
`git cat-file --batch-all-objects --unordered --batch' from
~7.1s to ~6.1s on Jeff's machine.

[ew: commit message]

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/cat-file.c | 17 +++++++++++++++--
 object-file.c      |  6 ++++++
 object-store-ll.h  |  1 +
 packfile.c         | 13 ++++++++++++-
 4 files changed, 34 insertions(+), 3 deletions(-)

Comments

Patrick Steinhardt July 24, 2024, 8:35 a.m. UTC | #1
On Mon, Jul 15, 2024 at 12:35:11AM +0000, Eric Wong wrote:
> From: Jeff King <peff@peff.net>
> 
> This avoids unnecessary round trips to the object store to speed

Same comment regarding "this". Despite not being self-contained, I also
think that the commit message could do a better job of explaining what
the problem is that you're fixing in the first place. Right now, I'm
left second-guessing what the idea is that this patch has to make
git-cat-file(1) faster.

> up cat-file contents retrievals.  The majority of packed objects
> don't benefit from the streaming interface at all and we end up
> having to load them in core anyways to satisfy our streaming
> API.
> 
> This drops the runtime of
> `git cat-file --batch-all-objects --unordered --batch' from
> ~7.1s to ~6.1s on Jeff's machine.

It would be nice to get some more context here for the benchmark. Most
importantly, what kind of repository did this run in? Otherwise it is
going to be next to impossible to get remotely comparable results.

Patrick
Eric Wong July 26, 2024, 7:30 a.m. UTC | #2
Patrick Steinhardt <ps@pks.im> wrote:
> On Mon, Jul 15, 2024 at 12:35:11AM +0000, Eric Wong wrote:
> > From: Jeff King <peff@peff.net>
> > 
> > This avoids unnecessary round trips to the object store to speed
> 
> Same comment regarding "this". Despite not being self-contained, I also
> think that the commit message could do a better job of explaining what
> the problem is that you're fixing in the first place. Right now, I'm
> left second-guessing what the idea is that this patch has to make
> git-cat-file(1) faster.

I was hoping Jeff would flesh out the commit messages for the
changes he authored.  I'll take a closer look and update the
messages if he's too busy.

> > up cat-file contents retrievals.  The majority of packed objects
> > don't benefit from the streaming interface at all and we end up
> > having to load them in core anyways to satisfy our streaming
> > API.
> > 
> > This drops the runtime of
> > `git cat-file --batch-all-objects --unordered --batch' from
> > ~7.1s to ~6.1s on Jeff's machine.
> 
> It would be nice to get some more context here for the benchmark. Most
> importantly, what kind of repository did this run in? Otherwise it is
> going to be next to impossible to get remotely comparable results.

Oops, that was for git.git
<https://lore.kernel.org/git/20240621062915.GA2105230@coredump.intra.peff.net/>
diff mbox series

Patch

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 18fe58d6b8..bc4bb89610 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -280,6 +280,7 @@  struct expand_data {
 	off_t disk_size;
 	const char *rest;
 	struct object_id delta_base_oid;
+	void *content;
 
 	/*
 	 * If mark_query is true, we do not expand anything, but rather
@@ -383,7 +384,10 @@  static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 
 	assert(data->info.typep);
 
-	if (data->type == OBJ_BLOB) {
+	if (data->content) {
+		batch_write(opt, data->content, data->size);
+		FREE_AND_NULL(data->content);
+	} else if (data->type == OBJ_BLOB) {
 		if (opt->buffer_output)
 			fflush(stdout);
 		if (opt->transform_mode) {
@@ -801,9 +805,18 @@  static int batch_objects(struct batch_options *opt)
 	/*
 	 * If we are printing out the object, then always fill in the type,
 	 * since we will want to decide whether or not to stream.
+	 *
+	 * Likewise, grab the content in the initial request if it's small
+	 * and we're not planning to filter it.
 	 */
-	if (opt->batch_mode == BATCH_MODE_CONTENTS)
+	if (opt->batch_mode == BATCH_MODE_CONTENTS) {
 		data.info.typep = &data.type;
+		if (!opt->transform_mode) {
+			data.info.sizep = &data.size;
+			data.info.contentp = &data.content;
+			data.info.content_limit = big_file_threshold;
+		}
+	}
 
 	if (opt->all_objects) {
 		struct object_cb_data cb;
diff --git a/object-file.c b/object-file.c
index 065103be3e..1cc29c3c58 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1492,6 +1492,12 @@  static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
+		if (oi->content_limit && *oi->sizep > oi->content_limit) {
+			git_inflate_end(&stream);
+			oi->contentp = NULL;
+			goto cleanup;
+		}
+
 		*oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid);
 		if (*oi->contentp)
 			goto cleanup;
diff --git a/object-store-ll.h b/object-store-ll.h
index c5f2bb2fc2..b71a15f590 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -289,6 +289,7 @@  struct object_info {
 	struct object_id *delta_base_oid;
 	struct strbuf *type_name;
 	void **contentp;
+	size_t content_limit;
 
 	/* Response */
 	enum {
diff --git a/packfile.c b/packfile.c
index e547522e3d..54b9d46928 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1530,7 +1530,7 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 	 * a "real" type later if the caller is interested. Likewise...
 	 * tbd.
 	 */
-	if (oi->contentp) {
+	if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
 						      &type);
 		if (!*oi->contentp)
@@ -1556,6 +1556,17 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 				*oi->sizep = size;
 			}
 		}
+
+		if (oi->contentp) {
+			if (oi->sizep && *oi->sizep < oi->content_limit) {
+				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
+								      oi->sizep, &type);
+				if (!*oi->contentp)
+					type = OBJ_BAD;
+			} else {
+				*oi->contentp = NULL;
+			}
+		}
 	}
 
 	if (oi->disk_sizep) {