diff mbox series

[v2,02/10] packfile: allow content-limit for cat-file

Message ID 20240823224630.1180772-3-e@80x24.org (mailing list archive)
State New
Headers show
Series cat-file speedups | expand

Commit Message

Eric Wong Aug. 23, 2024, 10:46 p.m. UTC
From: Jeff King <peff@peff.net>

Avoid unnecessary round trips to the object store to speed
up cat-file contents retrievals.  The majority of packed objects
don't benefit from the streaming interface at all and we end up
having to load them in core anyways to satisfy our streaming
API.

This drops the runtime of
`git cat-file --batch-all-objects --unordered --batch' on
git.git from ~7.1s to ~6.1s on Jeff's machine.

[ew: commit message]

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/cat-file.c | 17 +++++++++++++++--
 object-file.c      |  6 ++++++
 object-store-ll.h  |  1 +
 packfile.c         | 13 ++++++++++++-
 4 files changed, 34 insertions(+), 3 deletions(-)

Comments

Junio C Hamano Aug. 26, 2024, 5:10 p.m. UTC | #1
Eric Wong <e@80x24.org> writes:

> From: Jeff King <peff@peff.net>
>
> Avoid unnecessary round trips to the object store to speed
> up cat-file contents retrievals.  The majority of packed objects
> don't benefit from the streaming interface at all and we end up
> having to load them in core anyways to satisfy our streaming
> API.

What I found missing from the description is something like ...

    The new trick used is to teach oid_object_info_extended() that a
    non-NULL oi->contentp that means "grab the contents of the objects
    here" can be told to refrain from grabbing an object that is too
    large.

> diff --git a/object-file.c b/object-file.c
> index 065103be3e..1cc29c3c58 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1492,6 +1492,12 @@ static int loose_object_info(struct repository *r,
>  
>  		if (!oi->contentp)
>  			break;
> +		if (oi->content_limit && *oi->sizep > oi->content_limit) {

I cannot convince myself enough to say "content limit" is a great
name.  It invites "limited by what?  text files are allowed but
images are not?".

> diff --git a/object-store-ll.h b/object-store-ll.h
> index c5f2bb2fc2..b71a15f590 100644
> --- a/object-store-ll.h
> +++ b/object-store-ll.h
> @@ -289,6 +289,7 @@ struct object_info {
>  	struct object_id *delta_base_oid;
>  	struct strbuf *type_name;
>  	void **contentp;
> +	size_t content_limit;
>  
>  	/* Response */
>  	enum {
> diff --git a/packfile.c b/packfile.c
> index 4028763947..c12a0515b3 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1529,7 +1529,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
>  	 * We always get the representation type, but only convert it to
>  	 * a "real" type later if the caller is interested.
>  	 */
> -	if (oi->contentp) {
> +	if (oi->contentp && !oi->content_limit) {
>  		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
>  						      &type);
>  		if (!*oi->contentp)
> @@ -1555,6 +1555,17 @@ int packed_object_info(struct repository *r, struct packed_git *p,
>  				*oi->sizep = size;
>  			}
>  		}
> +
> +		if (oi->contentp) {
> +			if (oi->sizep && *oi->sizep < oi->content_limit) {

It happens that with the current code structure, at this point,
oi->content_limit is _always_ non-zero.  But it felt somewhat
fragile to rely on it, and I would have appreciated if this was
written with an explicit check for oi->content_limit, just like how
it is done in loose_object_info() function.

Other than that, looking very good.

Thanks.


> +				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
> +								      oi->sizep, &type);
> +				if (!*oi->contentp)
> +					type = OBJ_BAD;
> +			} else {
> +				*oi->contentp = NULL;
> +			}
> +		}
>  	}
>  
>  	if (oi->disk_sizep) {
Eric Wong Aug. 27, 2024, 8:23 p.m. UTC | #2
Junio C Hamano <gitster@pobox.com> wrote:
> Eric Wong <e@80x24.org> writes:
> > From: Jeff King <peff@peff.net>
> >
> > Avoid unnecessary round trips to the object store to speed
> > up cat-file contents retrievals.  The majority of packed objects
> > don't benefit from the streaming interface at all and we end up
> > having to load them in core anyways to satisfy our streaming
> > API.
> 
> What I found missing from the description is something like ...
> 
>     The new trick used is to teach oid_object_info_extended() that a
>     non-NULL oi->contentp that means "grab the contents of the objects
>     here" can be told to refrain from grabbing an object that is too
>     large.

OK.

> > diff --git a/object-file.c b/object-file.c
> > index 065103be3e..1cc29c3c58 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1492,6 +1492,12 @@ static int loose_object_info(struct repository *r,
> >  
> >  		if (!oi->contentp)
> >  			break;
> > +		if (oi->content_limit && *oi->sizep > oi->content_limit) {
> 
> I cannot convince myself enough to say "content limit" is a great
> name.  It invites "limited by what?  text files are allowed but
> images are not?".

Hmm... naming is a most difficult problem :<

->slurp_max?  It could be ->content_slurp_max, but I think
that's too long...

Would welcome other suggestions...

> > diff --git a/object-store-ll.h b/object-store-ll.h
> > index c5f2bb2fc2..b71a15f590 100644
> > --- a/object-store-ll.h
> > +++ b/object-store-ll.h
> > @@ -289,6 +289,7 @@ struct object_info {
> >  	struct object_id *delta_base_oid;
> >  	struct strbuf *type_name;
> >  	void **contentp;
> > +	size_t content_limit;
> >  
> >  	/* Response */
> >  	enum {
> > diff --git a/packfile.c b/packfile.c
> > index 4028763947..c12a0515b3 100644
> > --- a/packfile.c
> > +++ b/packfile.c
> > @@ -1529,7 +1529,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
> >  	 * We always get the representation type, but only convert it to
> >  	 * a "real" type later if the caller is interested.
> >  	 */
> > -	if (oi->contentp) {
> > +	if (oi->contentp && !oi->content_limit) {
> >  		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
> >  						      &type);
> >  		if (!*oi->contentp)
> > @@ -1555,6 +1555,17 @@ int packed_object_info(struct repository *r, struct packed_git *p,
> >  				*oi->sizep = size;
> >  			}
> >  		}
> > +
> > +		if (oi->contentp) {
> > +			if (oi->sizep && *oi->sizep < oi->content_limit) {
> 
> It happens that with the current code structure, at this point,
> oi->content_limit is _always_ non-zero.  But it felt somewhat
> fragile to rely on it, and I would have appreciated if this was
> written with an explicit check for oi->content_limit, just like how
> it is done in loose_object_info() function.

Right.  I actually think something like:

		assert(oi->content_limit); /* see `if' above */
		if (oi->sizep && *oi->sizep < oi->content_limit) {

is good for documentation purposes since this is in the `else'
branch of the `if (oi->contentp && !oi->content_limit) {' condition.
Taylor Blau Sept. 17, 2024, 10:10 a.m. UTC | #3
On Tue, Aug 27, 2024 at 08:23:59PM +0000, Eric Wong wrote:
> > > diff --git a/object-file.c b/object-file.c
> > > index 065103be3e..1cc29c3c58 100644
> > > --- a/object-file.c
> > > +++ b/object-file.c
> > > @@ -1492,6 +1492,12 @@ static int loose_object_info(struct repository *r,
> > >
> > >  		if (!oi->contentp)
> > >  			break;
> > > +		if (oi->content_limit && *oi->sizep > oi->content_limit) {
> >
> > I cannot convince myself enough to say "content limit" is a great
> > name.  It invites "limited by what?  text files are allowed but
> > images are not?".
>
> Hmm... naming is a most difficult problem :<
>
> ->slurp_max?  It could be ->content_slurp_max, but I think
> that's too long...
>
> Would welcome other suggestions...

I don't have a huge problem with "content_limit" as a name, but perhaps
"content_size_limit", "streaming_limit", or "streaming_threshold" (with
a vague preference towards the latter) might be more descriptive? I
dunno.

Thanks,
Taylor
Junio C Hamano Sept. 17, 2024, 9:15 p.m. UTC | #4
Taylor Blau <me@ttaylorr.com> writes:

> I don't have a huge problem with "content_limit" as a name, but perhaps
> "content_size_limit", "streaming_limit", or "streaming_threshold" (with
> a vague preference towards the latter) might be more descriptive? I
> dunno.

Your statement does highlight the second point that should have been
pointed out, but I failed to.  The "limit" is about "maximum content
size" (as opposed to content type or color or smell), but also it is
important to avoid sounding like we are forbidding a blob to exist
if it is larger than that limit.  The limit is only about whether
the contents are prefetched through the contentp pointer (and
anything larger than the limit must be obtained via a separate API
call to obtain the contents).

We can flip the polarity to say "minimum content size to stream"
(i.e., anything larger than this threshold will be streamed out
instead of held in core at once), and it may certainly be a better
way to explain what is going on than "maximum content size to be
held in-core via contentp".

Thanks.
diff mbox series

Patch

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 18fe58d6b8..bc4bb89610 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -280,6 +280,7 @@  struct expand_data {
 	off_t disk_size;
 	const char *rest;
 	struct object_id delta_base_oid;
+	void *content;
 
 	/*
 	 * If mark_query is true, we do not expand anything, but rather
@@ -383,7 +384,10 @@  static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 
 	assert(data->info.typep);
 
-	if (data->type == OBJ_BLOB) {
+	if (data->content) {
+		batch_write(opt, data->content, data->size);
+		FREE_AND_NULL(data->content);
+	} else if (data->type == OBJ_BLOB) {
 		if (opt->buffer_output)
 			fflush(stdout);
 		if (opt->transform_mode) {
@@ -801,9 +805,18 @@  static int batch_objects(struct batch_options *opt)
 	/*
 	 * If we are printing out the object, then always fill in the type,
 	 * since we will want to decide whether or not to stream.
+	 *
+	 * Likewise, grab the content in the initial request if it's small
+	 * and we're not planning to filter it.
 	 */
-	if (opt->batch_mode == BATCH_MODE_CONTENTS)
+	if (opt->batch_mode == BATCH_MODE_CONTENTS) {
 		data.info.typep = &data.type;
+		if (!opt->transform_mode) {
+			data.info.sizep = &data.size;
+			data.info.contentp = &data.content;
+			data.info.content_limit = big_file_threshold;
+		}
+	}
 
 	if (opt->all_objects) {
 		struct object_cb_data cb;
diff --git a/object-file.c b/object-file.c
index 065103be3e..1cc29c3c58 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1492,6 +1492,12 @@  static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
+		if (oi->content_limit && *oi->sizep > oi->content_limit) {
+			git_inflate_end(&stream);
+			oi->contentp = NULL;
+			goto cleanup;
+		}
+
 		*oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid);
 		if (*oi->contentp)
 			goto cleanup;
diff --git a/object-store-ll.h b/object-store-ll.h
index c5f2bb2fc2..b71a15f590 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -289,6 +289,7 @@  struct object_info {
 	struct object_id *delta_base_oid;
 	struct strbuf *type_name;
 	void **contentp;
+	size_t content_limit;
 
 	/* Response */
 	enum {
diff --git a/packfile.c b/packfile.c
index 4028763947..c12a0515b3 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1529,7 +1529,7 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 	 * We always get the representation type, but only convert it to
 	 * a "real" type later if the caller is interested.
 	 */
-	if (oi->contentp) {
+	if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
 						      &type);
 		if (!*oi->contentp)
@@ -1555,6 +1555,17 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 				*oi->sizep = size;
 			}
 		}
+
+		if (oi->contentp) {
+			if (oi->sizep && *oi->sizep < oi->content_limit) {
+				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
+								      oi->sizep, &type);
+				if (!*oi->contentp)
+					type = OBJ_BAD;
+			} else {
+				*oi->contentp = NULL;
+			}
+		}
 	}
 
 	if (oi->disk_sizep) {