diff mbox series

[v5,1/5] bulk-checkin: extract abstract `bulk_checkin_source`

Message ID 696aa027e46ddec310812fad2d4b12082447d925.1698101088.git.me@ttaylorr.com (mailing list archive)
State New, archived
Headers show
Series merge-ort: implement support for packing objects together | expand

Commit Message

Taylor Blau Oct. 23, 2023, 10:44 p.m. UTC
A future commit will want to implement a routine very similar to
`stream_blob_to_pack()`, with two notable changes:

  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open
    file-descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()` and that new function, extract an abstract
`bulk_checkin_source`. This concept is currently a thin layer over
`lseek()` and `read_in_full()`, but will grow to understand how to
perform analogous operations when writing out an object's contents from
memory.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 65 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 8 deletions(-)

Comments

Jeff King Oct. 25, 2023, 7:37 a.m. UTC | #1
On Mon, Oct 23, 2023 at 06:44:56PM -0400, Taylor Blau wrote:

> +struct bulk_checkin_source {
> +	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
> +	off_t (*seek)(struct bulk_checkin_source *, off_t);
> +
> +	union {
> +		struct {
> +			int fd;
> +		} from_fd;
> +	} data;
> +
> +	size_t size;
> +	const char *path;
> +};

The virtual functions combined with the union are a funny mix of
object-oriented and procedural code. The bulk_checkin_source has
totally virtualized functions, but knows about all of the ancillary data
each set of virtualized functions might want. ;)

I think the more pure OO version would embed the parent, and have each
concrete type define its own struct type, like:

  struct bulk_checkin_source_fd {
	struct bulk_checkin_source src;
	int fd;
  };

That works great if the code which constructs it knows which concrete
type it wants, and can just do:

  struct bulk_checkin_source_fd src;
  init_bulk_checkin_source_from_fd(&src, ...);

If even the construction is somewhat virtualized, then you are stuck
with heap constructors like:

  struct bulk_checkin_source *bulk_checkin_source_from_fd(...);

Not too bad, but you have to remember to free now.

Alternatively, I think some of our other OO code just leaves room for
a type-specific void pointer, like:

  struct bulk_checkin_source {
	...the usual stuff...

	void *magic_type_data;
  };

and then the init_bulk_checkin_source_from_fd() function allocates its
own heap struct for the magic_type_data field and sticks the int in
there.

That said, both of those are a lot more annoying to use in C (more
boilerplate, more casting, and more opportunities to get something
wrong, including leaks). So I don't mind this in-between state. It is a
funny layering violation from an OO standpoint, but it's not like we
expect an unbounded set of concrete types to "inherit" from the source
struct.

-Peff
Taylor Blau Oct. 25, 2023, 3:39 p.m. UTC | #2
On Wed, Oct 25, 2023 at 03:37:36AM -0400, Jeff King wrote:
> I don't mind this in-between state. It is a funny layering violation
> from an OO standpoint, but it's not like we expect an unbounded set of
> concrete types to "inherit" from the source struct.

Yeah, this was exactly my thinking when writing up the changes for this
round. Since all of the "sub-classes" are local to the bulk-checkin.o
compilation unit, I don't have grave concerns about one implementation
peering into the details of another's.

Gotta stop somewhere ;-).

Thanks,
Taylor
Junio C Hamano Oct. 27, 2023, 11:12 p.m. UTC | #3
Jeff King <peff@peff.net> writes:

> Alternatively, I think some of our other OO code just leaves room for
> a type-specific void pointer, like:
>
>   struct bulk_checkin_source {
> 	...the usual stuff...
>
> 	void *magic_type_data;
>   };
>
> and then the init_bulk_checkin_source_from_fd() function allocates its
> own heap struct for the magic_type_data field and sticks the int in
> there.

Yup.  All the pros and cons make sense.  I earlier said I found
this a good place to stop, exactly because the full OO with

    struct specific_subclass {
	struct vtbl *vtbl;
	struct base_class common_data_specific_to_instance;
	... other instance specific data members here ...;
    }

would require us to add too many supporting infrastructure struct
types; with only a few subclasses in use, it is a bit too much to
justify.
diff mbox series

Patch

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6ce62999e5..174a6c24e4 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,49 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
+	off_t (*seek)(struct bulk_checkin_source *, off_t);
+
+	union {
+		struct {
+			int fd;
+		} from_fd;
+	} data;
+
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
+					      void *buf, size_t nr)
+{
+	return read_in_full(source->data.from_fd.fd, buf, nr);
+}
+
+static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
+					      off_t offset)
+{
+	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
+}
+
+static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
+					     int fd, size_t size,
+					     const char *path)
+{
+	memset(source, 0, sizeof(struct bulk_checkin_source));
+
+	source->read = bulk_checkin_source_read_from_fd;
+	source->seek = bulk_checkin_source_seek_from_fd;
+
+	source->data.from_fd.fd = fd;
+
+	source->size = size;
+	source->path = path;
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +198,7 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +208,27 @@  static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = source->read(source, ibuf, rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -258,6 +304,9 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source;
+
+	init_bulk_checkin_source_from_fd(&source, fd, size, path);
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -283,7 +332,7 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			crc32_begin(state->f);
 		}
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -295,7 +344,7 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (source.seek(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);