diff mbox series

[v3,08/10] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`

Message ID 8667b763652ffa71b52b7bd78821e46a6e5fe5a9.1697648864.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series merge-ort: implement support for packing objects together | expand

Commit Message

Taylor Blau Oct. 18, 2023, 5:08 p.m. UTC
Now that we have factored out many of the common routines necessary to
index a new object into a pack created by the bulk-checkin machinery, we
can introduce a variant of `index_blob_bulk_checkin()` that acts on
blobs whose contents we can fit in memory.

This will be useful in a couple of subsequent commits in order to provide the
`merge-tree` builtin with a mechanism to create a new pack containing
any objects it created during the merge, instead of storing those
objects individually as loose.

Similar to the existing `index_blob_bulk_checkin()` function, the
entrypoint delegates to `deflate_blob_to_pack_incore()`, which is
responsible for formatting the pack header and then deflating the
contents into the pack. The latter is accomplished by calling
deflate_obj_contents_to_pack_incore(), which takes advantage of the
earlier refactorings and is responsible for writing the object to the
pack and handling any overage from pack.packSizeLimit.

The bulk of the new functionality is implemented in the function
`stream_obj_to_pack()`, which can handle streaming objects from memory
to the bulk-checkin pack as a result of the earlier refactoring.

Consistent with the rest of the bulk-checkin mechanism, there are no
direct tests here. In future commits when we expose this new
functionality via the `merge-tree` builtin, we will test it indirectly
there.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 68 insertions(+)

Comments

Junio C Hamano Oct. 18, 2023, 11:18 p.m. UTC | #1
Taylor Blau <me@ttaylorr.com> writes:

> Now that we have factored out many of the common routines necessary to
> index a new object into a pack created by the bulk-checkin machinery, we
> can introduce a variant of `index_blob_bulk_checkin()` that acts on
> blobs whose contents we can fit in memory.

Hmph.

Doesn't the duplication between the main loop of the new
deflate_obj_contents_to_pack() with existing deflate_blob_to_pack()
bother you?

A similar duplication in the previous round resulted in a nice
refactoring of patches 5 and 6 in this round.  Compared to that,
the differences in the set-up code between the two functions may be
much larger, but subtle and unnecessary differences between the code
that was copied and pasted (e.g., we do not check the errors from
the seek_to method here, but the original does) are a sign that over
time a fix to one will need to be carried over to the other, adding
unnecessary maintenance burden, isn't it?

> +static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
> +					       git_hash_ctx *ctx,
> +					       struct hashfile_checkpoint *checkpoint,
> +					       struct object_id *result_oid,
> +					       const void *buf, size_t size,
> +					       enum object_type type,
> +					       const char *path, unsigned flags)
> +{
> +	struct pack_idx_entry *idx = NULL;
> +	off_t already_hashed_to = 0;
> +	struct bulk_checkin_source source = {
> +		.type = SOURCE_INCORE,
> +		.buf = buf,
> +		.size = size,
> +		.read = 0,
> +		.path = path,
> +	};
> +
> +	/* Note: idx is non-NULL when we are writing */
> +	if (flags & HASH_WRITE_OBJECT)
> +		CALLOC_ARRAY(idx, 1);
> +
> +	while (1) {
> +		prepare_checkpoint(state, checkpoint, idx, flags);
> +
> +		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
> +					type, flags))
> +			break;
> +		truncate_checkpoint(state, checkpoint, idx);
> +		bulk_checkin_source_seek_to(&source, 0);
> +	}
> +
> +	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
> +
> +	return 0;
> +}
> +
> +static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
> +				       struct object_id *result_oid,
> +				       const void *buf, size_t size,
> +				       const char *path, unsigned flags)
> +{
> +	git_hash_ctx ctx;
> +	struct hashfile_checkpoint checkpoint = {0};
> +
> +	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
> +				  size);
> +
> +	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
> +						   result_oid, buf, size,
> +						   OBJ_BLOB, path, flags);
> +}
> +
>  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>  				struct object_id *result_oid,
>  				int fd, size_t size,
> @@ -456,6 +509,17 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  	return status;
>  }
>  
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags)
> +{
> +	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
> +						 buf, size, path, flags);
> +	if (!odb_transaction_nesting)
> +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
> +	return status;
> +}
> +
>  void begin_odb_transaction(void)
>  {
>  	odb_transaction_nesting += 1;
> diff --git a/bulk-checkin.h b/bulk-checkin.h
> index aa7286a7b3..1b91daeaee 100644
> --- a/bulk-checkin.h
> +++ b/bulk-checkin.h
> @@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  			    int fd, size_t size,
>  			    const char *path, unsigned flags);
>  
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags);
> +
>  /*
>   * Tell the object database to optimize for adding
>   * multiple objects. end_odb_transaction must be called
Taylor Blau Oct. 19, 2023, 3:30 p.m. UTC | #2
On Wed, Oct 18, 2023 at 04:18:23PM -0700, Junio C Hamano wrote:
> Taylor Blau <me@ttaylorr.com> writes:
>
> > Now that we have factored out many of the common routines necessary to
> > index a new object into a pack created by the bulk-checkin machinery, we
> > can introduce a variant of `index_blob_bulk_checkin()` that acts on
> > blobs whose contents we can fit in memory.
>
> Hmph.
>
> Doesn't the duplication between the main loop of the new
> deflate_obj_contents_to_pack() with existing deflate_blob_to_pack()
> bother you?

Yeah, I am not sure how I missed seeing the opportunity to clean that
up. Another (hopefully final...) reroll incoming.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/bulk-checkin.c b/bulk-checkin.c
index f0115efb2e..9ae43648ba 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -370,6 +370,59 @@  static void finalize_checkpoint(struct bulk_checkin_packfile *state,
 	}
 }
 
+static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
+					       git_hash_ctx *ctx,
+					       struct hashfile_checkpoint *checkpoint,
+					       struct object_id *result_oid,
+					       const void *buf, size_t size,
+					       enum object_type type,
+					       const char *path, unsigned flags)
+{
+	struct pack_idx_entry *idx = NULL;
+	off_t already_hashed_to = 0;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_INCORE,
+		.buf = buf,
+		.size = size,
+		.read = 0,
+		.path = path,
+	};
+
+	/* Note: idx is non-NULL when we are writing */
+	if (flags & HASH_WRITE_OBJECT)
+		CALLOC_ARRAY(idx, 1);
+
+	while (1) {
+		prepare_checkpoint(state, checkpoint, idx, flags);
+
+		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
+					type, flags))
+			break;
+		truncate_checkpoint(state, checkpoint, idx);
+		bulk_checkin_source_seek_to(&source, 0);
+	}
+
+	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
+
+	return 0;
+}
+
+static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
+				       struct object_id *result_oid,
+				       const void *buf, size_t size,
+				       const char *path, unsigned flags)
+{
+	git_hash_ctx ctx;
+	struct hashfile_checkpoint checkpoint = {0};
+
+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
+				  size);
+
+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
+						   result_oid, buf, size,
+						   OBJ_BLOB, path, flags);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -456,6 +509,17 @@  int index_blob_bulk_checkin(struct object_id *oid,
 	return status;
 }
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
+						 buf, size, path, flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index aa7286a7b3..1b91daeaee 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -13,6 +13,10 @@  int index_blob_bulk_checkin(struct object_id *oid,
 			    int fd, size_t size,
 			    const char *path, unsigned flags);
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called