diff mbox series

[02/17] chunk-format: create chunk format write API

Message ID 9bd273f8c94fdb0c3adf8aedef3480ff5f4232b8.1611676886.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Refactor chunk-format into an API | expand

Commit Message

Derrick Stolee Jan. 26, 2021, 4:01 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

In anticipation of combining the logic from the commit-graph and
multi-pack-index file formats, create a new chunk-format API. Use a
'struct chunkfile' pointer to keep track of data that has been
registered for writes. This struct is anonymous outside of
chunk-format.c to ensure no user attempts to interfere with the data.

The next change will use this API in commit-graph.c, but the general
approach is:

 1. initialize the chunkfile with init_chunkfile(f).
 2. add chunks in the intended writing order with add_chunk().
 3. write any header information to the hashfile f.
 4. write the chunkfile data using write_chunkfile().
 5. free the chunkfile struct using free_chunkfile().

Helped-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Makefile       |  1 +
 chunk-format.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 chunk-format.h | 20 +++++++++++
 3 files changed, 112 insertions(+)
 create mode 100644 chunk-format.c
 create mode 100644 chunk-format.h

Comments

Taylor Blau Jan. 27, 2021, 2:42 a.m. UTC | #1
On Tue, Jan 26, 2021 at 04:01:11PM +0000, Derrick Stolee via GitGitGadget wrote:
> +/*
> + * When writing a chunk-based file format, collect the chunks in
> + * an array of chunk_info structs. The size stores the _expected_
> + * amount of data that will be written by write_fn.
> + */
> +struct chunk_info {
> +	uint32_t id;
> +	uint64_t size;

Hmm. Would we not want an off_t to indicate the size here?

I wondered briefly if we even needed a size field at all, since calling
write_fn would tell us the number of bytes written. But I suppose you
want to know ahead of time so that you can write the file in one pass
(beginning with the table of contents, which certainly needs to know the
size).

> +	/* Trailing entry marks the end of the chunks */
> +	hashwrite_be32(cf->f, 0);
> +	hashwrite_be64(cf->f, cur_offset);
> +
> +	for (i = 0; i < cf->chunks_nr; i++) {
> +		uint64_t start_offset = cf->f->total + cf->f->offset;
> +		int result = cf->chunks[i].write_fn(cf->f, data);
> +
> +		if (result)
> +			return result;
> +
> +		if (cf->f->total + cf->f->offset != start_offset + cf->chunks[i].size)

I don't think this is a practical concern, but a malicious caller could
overflow this by passing a bogus "size" parameter. Maybe:

    uint64_t end_offset = ...;

    if (end_offset - start_offset != cf->chunks[i].size)
      BUG(...)

?

> diff --git a/chunk-format.h b/chunk-format.h
> new file mode 100644
> index 00000000000..bfaed672813
> --- /dev/null
> +++ b/chunk-format.h
> @@ -0,0 +1,20 @@
> +#ifndef CHUNK_FORMAT_H
> +#define CHUNK_FORMAT_H
> +
> +#include "git-compat-util.h"
> +
> +struct hashfile;
> +struct chunkfile;
> +
> +struct chunkfile *init_chunkfile(struct hashfile *f);
> +void free_chunkfile(struct chunkfile *cf);
> +int get_num_chunks(struct chunkfile *cf);
> +typedef int (*chunk_write_fn)(struct hashfile *f,
> +			      void *data);
> +void add_chunk(struct chunkfile *cf,
> +	       uint64_t id,
> +	       chunk_write_fn fn,
> +	       size_t size);
> +int write_chunkfile(struct chunkfile *cf, void *data);

Very clean API.

Thanks,
Taylor
Derrick Stolee Jan. 27, 2021, 1:49 p.m. UTC | #2
On 1/26/2021 9:42 PM, Taylor Blau wrote:
> On Tue, Jan 26, 2021 at 04:01:11PM +0000, Derrick Stolee via GitGitGadget wrote:
>> +/*
>> + * When writing a chunk-based file format, collect the chunks in
>> + * an array of chunk_info structs. The size stores the _expected_
>> + * amount of data that will be written by write_fn.
>> + */
>> +struct chunk_info {
>> +	uint32_t id;
>> +	uint64_t size;
> 
> Hmm. Would we not want an off_t to indicate the size here?
> 
> I wondered briefly if we even needed a size field at all, since calling
> write_fn would tell us the number of bytes written. But I suppose you
> want to know ahead of time so that you can write the file in one pass
> (beginning with the table of contents, which certainly needs to know the
> size).

Is off_t 64-bits on a 32-bit machine? This is intentionally typed
to be "64 bits no matter what" because it correlates with the file
format's size for the chunk offsets.

>> +		if (cf->f->total + cf->f->offset != start_offset + cf->chunks[i].size)
> 
> I don't think this is a practical concern, but a malicious caller could
> overflow this by passing a bogus "size" parameter. Maybe:
> 
>     uint64_t end_offset = ...;
> 
>     if (end_offset - start_offset != cf->chunks[i].size)
>       BUG(...)

Sure.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index 7b64106930a..50a7663841e 100644
--- a/Makefile
+++ b/Makefile
@@ -854,6 +854,7 @@  LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
 LIB_OBJS += chdir-notify.o
 LIB_OBJS += checkout.o
+LIB_OBJS += chunk-format.o
 LIB_OBJS += color.o
 LIB_OBJS += column.o
 LIB_OBJS += combine-diff.o
diff --git a/chunk-format.c b/chunk-format.c
new file mode 100644
index 00000000000..2ce37ecc6bb
--- /dev/null
+++ b/chunk-format.c
@@ -0,0 +1,91 @@ 
+#include "cache.h"
+#include "chunk-format.h"
+#include "csum-file.h"
+#define CHUNK_LOOKUP_WIDTH 12
+
+/*
+ * When writing a chunk-based file format, collect the chunks in
+ * an array of chunk_info structs. The size stores the _expected_
+ * amount of data that will be written by write_fn.
+ */
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+	chunk_write_fn write_fn;
+};
+
+struct chunkfile {
+	struct hashfile *f;
+
+	struct chunk_info *chunks;
+	size_t chunks_nr;
+	size_t chunks_alloc;
+};
+
+struct chunkfile *init_chunkfile(struct hashfile *f)
+{
+	struct chunkfile *cf = xcalloc(1, sizeof(*cf));
+	cf->f = f;
+	return cf;
+}
+
+void free_chunkfile(struct chunkfile *cf)
+{
+	if (!cf)
+		return;
+	free(cf->chunks);
+	free(cf);
+}
+
+int get_num_chunks(struct chunkfile *cf)
+{
+	return cf->chunks_nr;
+}
+
+void add_chunk(struct chunkfile *cf,
+	       uint64_t id,
+	       chunk_write_fn fn,
+	       size_t size)
+{
+	ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc);
+
+	cf->chunks[cf->chunks_nr].id = id;
+	cf->chunks[cf->chunks_nr].write_fn = fn;
+	cf->chunks[cf->chunks_nr].size = size;
+	cf->chunks_nr++;
+}
+
+int write_chunkfile(struct chunkfile *cf, void *data)
+{
+	int i;
+	size_t cur_offset = cf->f->offset + cf->f->total;
+
+	/* Add the table of contents to the current offset */
+	cur_offset += (cf->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH;
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		hashwrite_be32(cf->f, cf->chunks[i].id);
+		hashwrite_be64(cf->f, cur_offset);
+
+		cur_offset += cf->chunks[i].size;
+	}
+
+	/* Trailing entry marks the end of the chunks */
+	hashwrite_be32(cf->f, 0);
+	hashwrite_be64(cf->f, cur_offset);
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		uint64_t start_offset = cf->f->total + cf->f->offset;
+		int result = cf->chunks[i].write_fn(cf->f, data);
+
+		if (result)
+			return result;
+
+		if (cf->f->total + cf->f->offset != start_offset + cf->chunks[i].size)
+			BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
+			    cf->chunks[i].size, cf->chunks[i].id,
+			    cf->f->total + cf->f->offset - start_offset);
+	}
+
+	return 0;
+}
diff --git a/chunk-format.h b/chunk-format.h
new file mode 100644
index 00000000000..bfaed672813
--- /dev/null
+++ b/chunk-format.h
@@ -0,0 +1,20 @@ 
+#ifndef CHUNK_FORMAT_H
+#define CHUNK_FORMAT_H
+
+#include "git-compat-util.h"
+
+struct hashfile;
+struct chunkfile;
+
+struct chunkfile *init_chunkfile(struct hashfile *f);
+void free_chunkfile(struct chunkfile *cf);
+int get_num_chunks(struct chunkfile *cf);
+typedef int (*chunk_write_fn)(struct hashfile *f,
+			      void *data);
+void add_chunk(struct chunkfile *cf,
+	       uint64_t id,
+	       chunk_write_fn fn,
+	       size_t size);
+int write_chunkfile(struct chunkfile *cf, void *data);
+
+#endif