diff mbox series

[v12,3/8] object-file.c: refactor write_loose_object() to several steps

Message ID patch-v12-3.8-3dcaa5d6589-20220329T135446Z-avarab@gmail.com (mailing list archive)
State New, archived
Headers show
Series unpack-objects: support streaming blobs to disk | expand

Commit Message

Ævar Arnfjörð Bjarmason March 29, 2022, 1:56 p.m. UTC
From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in later commit to resolve
this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 102 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 26 deletions(-)

Comments

Han Xin March 30, 2022, 7:13 a.m. UTC | #1
On Tue, Mar 29, 2022 at 3:56 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> 
> +/**
> + * Common steps for loose object writers to end writing loose objects:
> + *
> + * - End the compression of zlib stream.
> + * - Get the calculated oid to "oid".
> + * - fsync() and close() the "fd"

Since we removed close_loose_object() from end_loose_object_common() , I
think this comment should also be removed.

Thanks.
-Han Xin

> + */
> +static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
> +				   struct object_id *oid)
> +{
> +	int ret;
> +
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		return ret;
> +	the_hash_algo->final_oid_fn(oid, c);
> +
> +	return Z_OK;
> +}
> +
Ævar Arnfjörð Bjarmason March 30, 2022, 5:34 p.m. UTC | #2
On Wed, Mar 30 2022, Han Xin wrote:

> On Tue, Mar 29, 2022 at 3:56 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>> 
>> +/**
>> + * Common steps for loose object writers to end writing loose objects:
>> + *
>> + * - End the compression of zlib stream.
>> + * - Get the calculated oid to "oid".
>> + * - fsync() and close() the "fd"
>
> Since we removed close_loose_object() from end_loose_object_common() , I
> think this comment should also be removed.

You're right. I adjusted it for the "parano_oid" in this v12, but
managed to miss that somehow.

Will submit a re-roll with those changes, but will wait a bit more to
see if there's any other comments on this v12 first. Thanks!
diff mbox series

Patch

diff --git a/object-file.c b/object-file.c
index 5da458eccbf..7f160929e00 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1955,6 +1955,75 @@  static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ * - fsync() and close() the "fd"
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1969,28 +2038,11 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2008,11 +2060,9 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
 	close_loose_object(fd);
 
 	if (!oideq(oid, &parano_oid))