diff mbox series

[v4,1/7] bulk-checkin: extract abstract `bulk_checkin_source`

Message ID 97bb6e9f59e5092f0519c7d1141d0673313fdc33.1697736516.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series merge-ort: implement support for packing objects together | expand

Commit Message

Taylor Blau Oct. 19, 2023, 5:28 p.m. UTC
A future commit will want to implement a very similar routine as in
`stream_blob_to_pack()` with two notable changes:

  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open
    file-descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()`, extract an abstract `bulk_checkin_source`. This
concept currently is a thin layer of `lseek()` and `read_in_full()`, but
will grow to understand how to perform analogous operations when writing
out an object's contents from memory.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 8 deletions(-)

Comments

Jeff King Oct. 20, 2023, 7:35 a.m. UTC | #1
On Thu, Oct 19, 2023 at 01:28:42PM -0400, Taylor Blau wrote:

> +struct bulk_checkin_source {
> +	enum { SOURCE_FILE } type;
> +
> +	/* SOURCE_FILE fields */
> +	int fd;
> +
> +	/* common fields */
> +	size_t size;
> +	const char *path;
> +};

This is a pretty minor nit, but we may find that "SOURCE_FILE" is not
sufficiently name-spaced. Enum tags are in the global namespace, so
the compiler will barf if there are any conflicts.

It might be OK here, since this is local to a single C file (so we at
least are not hurting other code), but we may be in trouble if code in a
header file is less careful. There is already a near-miss here with
GREP_SOURCE_FILE, but fortunately grep.h is indeed careful. :)

(I notice that ref-filter.c similarly uses SOURCE_* for an internal
enum).

-Peff
Junio C Hamano Oct. 20, 2023, 4:55 p.m. UTC | #2
Jeff King <peff@peff.net> writes:

> On Thu, Oct 19, 2023 at 01:28:42PM -0400, Taylor Blau wrote:
>
>> +struct bulk_checkin_source {
>> +	enum { SOURCE_FILE } type;
>> +
>> +	/* SOURCE_FILE fields */
>> +	int fd;
>> +
>> +	/* common fields */
>> +	size_t size;
>> +	const char *path;
>> +};
>
> This is a pretty minor nit, but we may find that "SOURCE_FILE" is not
> sufficiently name-spaced. Enum tags are in the global namespace, so
> the compiler will barf if there are any conflicts.

A very good point.  This was one of the reasons why I suggested
avoiding the switch() based on a new enum altogether and use a
vtable based approach instead.  But it may do while this one is
private to the file and never exposed to other subsystems.

> It might be OK here, since this is local to a single C file (so we at
> least are not hurting other code), but we may be in trouble if code in a
> header file is less careful. There is already a near-miss here with
> GREP_SOURCE_FILE, but fortunately grep.h is indeed careful. :)
>
> (I notice that ref-filter.c similarly uses SOURCE_* for an internal
> enum).

Thanks.
diff mbox series

Patch

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6ce62999e5..c05d06e1e1 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,41 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	enum { SOURCE_FILE } type;
+
+	/* SOURCE_FILE fields */
+	int fd;
+
+	/* common fields */
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
+					 off_t offset)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return lseek(source->fd, offset, SEEK_SET);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
+					void *buf, size_t nr)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return read_in_full(source->fd, buf, nr);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +190,7 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +200,28 @@  static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = bulk_checkin_source_read(source, ibuf,
+							       rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -258,6 +297,12 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_FILE,
+		.fd = fd,
+		.size = size,
+		.path = path,
+	};
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -283,7 +328,7 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			crc32_begin(state->f);
 		}
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -295,7 +340,7 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);