diff mbox series

[v2,07/10] object_info: content_limit only applies to blobs

Message ID 20240823224630.1180772-8-e@80x24.org (mailing list archive)
State New
Headers show
Series cat-file speedups | expand

Commit Message

Eric Wong Aug. 23, 2024, 10:46 p.m. UTC
Streaming is only supported for blobs, so we'd end up having to
slurp all the other object types into memory regardless.  So
slurp all the non-blob types up front when requesting content
since we always handle them in-core, anyways.

Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/cat-file.c  | 21 +++++++++++++++++++--
 object-file.c       |  3 ++-
 packfile.c          |  8 +++++---
 t/t1006-cat-file.sh | 19 ++++++++++++++++---
 4 files changed, 42 insertions(+), 9 deletions(-)

Comments

Junio C Hamano Aug. 26, 2024, 10:02 p.m. UTC | #1
Eric Wong <e@80x24.org> writes:

> Streaming is only supported for blobs, so we'd end up having to
> slurp all the other object types into memory regardless.  So
> slurp all the non-blob types up front when requesting content
> since we always handle them in-core, anyways.
>
> Signed-off-by: Eric Wong <e@80x24.org>
> ---
>  builtin/cat-file.c  | 21 +++++++++++++++++++--
>  object-file.c       |  3 ++-
>  packfile.c          |  8 +++++---
>  t/t1006-cat-file.sh | 19 ++++++++++++++++---
>  4 files changed, 42 insertions(+), 9 deletions(-)
>
> diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> index 8debcdca3e..2aedd62324 100644
> --- a/builtin/cat-file.c
> +++ b/builtin/cat-file.c
> @@ -385,7 +385,24 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
>  	assert(data->info.typep);
>  
>  	if (data->content) {
> -		batch_write(opt, data->content, data->size);
> +		void *content = data->content;
> +		unsigned long size = data->size;
> +
> +		data->content = NULL;
> +		if (use_mailmap && (data->type == OBJ_COMMIT ||
> +					data->type == OBJ_TAG)) {

Line-wrap at higher level in the parse tree, i.e.

		if (use_mailmap &&
		    (data->type == OBJ_COMMIT || data->type == OBJ_TAG)) {

> +			size_t s = size;
> +
> +			if (data->info.whence == OI_DBCACHED) {
> +				content = xmemdupz(content, s);
> +				data->info.whence = OI_PACKED;
> +			}
> +
> +			content = replace_idents_using_mailmap(content, &s);
> +			size = cast_size_t_to_ulong(s);

This piece of code does look good, but it makes me wonder where the
need to duplicate the code comes from.  Before this patch, if
somebody asked use_mailmap, we must have been making the
replace_idents_using_mailmap() call and showing the result elsewhere
in the existing code, and it is curious why that code path is not
removed, or if we can share more common code with that code paths
here.

> +		}
> +
> +		batch_write(opt, content, size);
>  		switch (data->info.whence) {
>  		case OI_CACHED:
>  			/*
> @@ -395,7 +412,7 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
>  			BUG("TODO OI_CACHED support not done");
>  		case OI_LOOSE:
>  		case OI_PACKED:
> -			FREE_AND_NULL(data->content);
> +			free(content);

data->content has been nuked earlier, and content that holds the new
copy with replaced one is freed (and the original data->content was
freed by replace_idents_using_mailmap(), so there is no leak or
double-free here.  Good.

> diff --git a/object-file.c b/object-file.c
> index 19100e823d..59842cfe1b 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1492,7 +1492,8 @@ static int loose_object_info(struct repository *r,
>  
>  		if (!oi->contentp)
>  			break;
> -		if (oi->content_limit && *oi->sizep > oi->content_limit) {
> +		if (oi->content_limit && *oi->typep == OBJ_BLOB &&
> +				*oi->sizep > oi->content_limit) {
>  			git_inflate_end(&stream);
>  			oi->contentp = NULL;
>  			goto cleanup;

OK, so non-BLOB objects we would fall through this if/else cascade
and inflate them fully.
diff mbox series

Patch

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 8debcdca3e..2aedd62324 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -385,7 +385,24 @@  static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 	assert(data->info.typep);
 
 	if (data->content) {
-		batch_write(opt, data->content, data->size);
+		void *content = data->content;
+		unsigned long size = data->size;
+
+		data->content = NULL;
+		if (use_mailmap && (data->type == OBJ_COMMIT ||
+					data->type == OBJ_TAG)) {
+			size_t s = size;
+
+			if (data->info.whence == OI_DBCACHED) {
+				content = xmemdupz(content, s);
+				data->info.whence = OI_PACKED;
+			}
+
+			content = replace_idents_using_mailmap(content, &s);
+			size = cast_size_t_to_ulong(s);
+		}
+
+		batch_write(opt, content, size);
 		switch (data->info.whence) {
 		case OI_CACHED:
 			/*
@@ -395,7 +412,7 @@  static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 			BUG("TODO OI_CACHED support not done");
 		case OI_LOOSE:
 		case OI_PACKED:
-			FREE_AND_NULL(data->content);
+			free(content);
 			break;
 		case OI_DBCACHED:
 			unlock_delta_base_cache();
diff --git a/object-file.c b/object-file.c
index 19100e823d..59842cfe1b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1492,7 +1492,8 @@  static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
-		if (oi->content_limit && *oi->sizep > oi->content_limit) {
+		if (oi->content_limit && *oi->typep == OBJ_BLOB &&
+				*oi->sizep > oi->content_limit) {
 			git_inflate_end(&stream);
 			oi->contentp = NULL;
 			goto cleanup;
diff --git a/packfile.c b/packfile.c
index 94d20034e4..a592e0b32c 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1546,7 +1546,7 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 			if (oi->direct_cache) {
 				lock_delta_base_cache();
 				*oi->contentp = ent->data;
-			} else if (!oi->content_limit ||
+			} else if (type != OBJ_BLOB || !oi->content_limit ||
 					ent->size <= oi->content_limit) {
 				*oi->contentp = xmemdupz(ent->data, ent->size);
 			} else {
@@ -1583,10 +1583,12 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 		}
 
 		if (oi->contentp) {
-			if (oi->sizep && *oi->sizep <= oi->content_limit) {
+			final_type = packed_to_object_type(r, p, obj_offset,
+						     type, &w_curs, curpos);
+			if (final_type != OBJ_BLOB || (oi->sizep &&
+					*oi->sizep <= oi->content_limit)) {
 				*oi->contentp = unpack_entry(r, p, obj_offset,
 							&type, oi->sizep);
-				final_type = type;
 				if (!*oi->contentp)
 					type = OBJ_BAD;
 			} else {
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index ff9bf213aa..841e8567e9 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -622,20 +622,33 @@  test_expect_success 'confirm that neither loose blob is a delta' '
 	test_cmp expect actual
 '
 
+test_expect_success 'setup delta base tests' '
+	foo="$(git rev-parse HEAD:foo)" &&
+	foo_plus="$(git rev-parse HEAD:foo-plus)" &&
+	git repack -ad
+'
+
 # To avoid relying too much on the current delta heuristics,
 # we will check only that one of the two objects is a delta
 # against the other, but not the order. We can do so by just
 # asking for the base of both, and checking whether either
 # oid appears in the output.
 test_expect_success '%(deltabase) reports packed delta bases' '
-	git repack -ad &&
 	git cat-file --batch-check="%(deltabase)" <blobs >actual &&
 	{
-		grep "$(git rev-parse HEAD:foo)" actual ||
-		grep "$(git rev-parse HEAD:foo-plus)" actual
+		grep "$foo" actual || grep "$foo_plus" actual
 	}
 '
 
+test_expect_success 'delta base direct cache use succeeds w/o asserting' '
+	commands="info $foo
+info $foo_plus
+contents $foo_plus
+contents $foo" &&
+	echo "$commands" >in &&
+	git cat-file --batch-command <in >out
+'
+
 test_expect_success 'setup bogus data' '
 	bogus_short_type="bogus" &&
 	bogus_short_content="bogus" &&